diff --git a/.cargo/config.in b/.cargo/config.in index 50bf1f063287..36032c98b8a7 100644 --- a/.cargo/config.in +++ b/.cargo/config.in @@ -17,6 +17,11 @@ git = "https://github.com/mozilla/neqo" replace-with = "vendored-sources" rev = "a17c1e83" +[source."https://github.com/jfkthame/mapped_hyph.git"] +git = "https://github.com/jfkthame/mapped_hyph.git" +replace-with = "vendored-sources" +tag = "v0.3.0" + [source."https://github.com/hsivonen/packed_simd"] branch = "rust_1_32" git = "https://github.com/hsivonen/packed_simd" diff --git a/Cargo.lock b/Cargo.lock index 712445f36e35..185437caaf55 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1261,6 +1261,7 @@ dependencies = [ "kvstore 0.1.0", "lmdb-rkv-sys 0.9.5 (registry+https://github.com/rust-lang/crates.io-index)", "log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)", + "mapped_hyph 0.3.0 (git+https://github.com/jfkthame/mapped_hyph.git?tag=v0.3.0)", "mdns_service 0.1.0", "mozurl 0.0.1", "mp4parse_capi 0.11.2", @@ -1800,6 +1801,15 @@ dependencies = [ "synstructure 0.10.1 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "mapped_hyph" +version = "0.3.0" +source = "git+https://github.com/jfkthame/mapped_hyph.git?tag=v0.3.0#3b5fffbe17e8cdcc6814886a9b9170fde3db13bd" +dependencies = [ + "arrayref 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)", + "memmap 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "marionette" version = "0.1.0" @@ -2889,7 +2899,7 @@ dependencies = [ "byteorder 1.3.1 (registry+https://github.com/rust-lang/crates.io-index)", "digest 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)", "murmurhash3 0.0.5 (registry+https://github.com/rust-lang/crates.io-index)", - "rand 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)", + "rand 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -4283,6 +4293,7 @@ dependencies = [ "checksum lzw 0.10.0 
(registry+https://github.com/rust-lang/crates.io-index)" = "7d947cbb889ed21c2a84be6ffbaebf5b4e0f4340638cba0444907e38b56be084" "checksum mach 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "b823e83b2affd8f40a9ee8c29dbc56404c1e34cd2710921f2801e2cf29527afa" "checksum malloc_size_of_derive 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "35adee9ed962cf7d07d62cb58bc45029f3227f5b5b86246caa8632f06c187bc3" +"checksum mapped_hyph 0.3.0 (git+https://github.com/jfkthame/mapped_hyph.git?tag=v0.3.0)" = "" "checksum matches 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "100aabe6b8ff4e4a7e32c1c13523379802df0772b82466207ac25b013f193376" "checksum memchr 2.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "2efc7bc57c883d4a4d6e3246905283d8dae951bb3bd32f49d6ef297f546e1c39" "checksum memmap 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "6585fd95e7bb50d6cc31e20d4cf9afb4e2ba16c5846fc76793f11218da9c475b" diff --git a/intl/hyphenation/glue/hnjalloc.h b/intl/hyphenation/glue/hnjalloc.h deleted file mode 100644 index 6c88158f1c22..000000000000 --- a/intl/hyphenation/glue/hnjalloc.h +++ /dev/null @@ -1,46 +0,0 @@ -/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ -/* This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this - * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ - -/* - * To enable us to load hyphenation dictionaries from arbitrary resource URIs, - * not just through file paths using stdio, we override the (few) stdio APIs - * that hyphen.c uses and provide our own reimplementation that calls Gecko - * i/o methods. 
- */ - -#include /* ensure stdio.h is loaded before our macros */ - -#undef FILE -#define FILE hnjFile - -#define fopen(path, mode) hnjFopen(path, mode) -#define fclose(file) hnjFclose(file) -#define fgets(buf, count, file) hnjFgets(buf, count, file) -#define feof(file) hnjFeof(file) -#define fgetc(file) hnjFgetc(file) - -typedef struct hnjFile_ hnjFile; - -#ifdef __cplusplus -extern "C" { -#endif - -void* hnj_malloc(size_t size); -void* hnj_realloc(void* ptr, size_t size); -void hnj_free(void* ptr); - -hnjFile* hnjFopen(const char* aURISpec, const char* aMode); - -int hnjFclose(hnjFile* f); - -char* hnjFgets(char* s, int n, hnjFile* f); - -int hnjFeof(hnjFile* f); - -int hnjFgetc(hnjFile* f); - -#ifdef __cplusplus -} -#endif diff --git a/intl/hyphenation/glue/hnjstdio.cpp b/intl/hyphenation/glue/hnjstdio.cpp deleted file mode 100644 index 40e4faa8ca04..000000000000 --- a/intl/hyphenation/glue/hnjstdio.cpp +++ /dev/null @@ -1,133 +0,0 @@ -/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ -/* This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this - * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ - -// This file provides substitutes for the basic stdio routines used by hyphen.c -// to read its dictionary files. We #define the stdio names to these versions -// in hnjalloc.h, so that we can use nsIURI and nsIInputStream to specify and -// access the dictionary resources. 
- -#include "hnjalloc.h" - -#undef FILE // Undo #defines from hnjalloc.h before #including other headers -#undef fopen -#undef fclose -#undef fgets -#undef feof -#undef fgetc - -#include "nsNetUtil.h" -#include "nsIInputStream.h" -#include "nsIURI.h" -#include "nsContentUtils.h" - -#define BUFSIZE 1024 - -struct hnjFile_ { - nsCOMPtr mStream; - char mBuffer[BUFSIZE]; - uint32_t mCurPos; - uint32_t mLimit; - bool mEOF; -}; - -// replacement for fopen() -// (not a full substitute: only supports read access) -hnjFile* hnjFopen(const char* aURISpec, const char* aMode) { - // this override only needs to support "r" - NS_ASSERTION(!strcmp(aMode, "r"), "unsupported fopen() mode in hnjFopen"); - - nsCOMPtr uri; - nsresult rv = NS_NewURI(getter_AddRefs(uri), aURISpec); - if (NS_FAILED(rv)) { - return nullptr; - } - - nsCOMPtr channel; - rv = NS_NewChannel(getter_AddRefs(channel), uri, - nsContentUtils::GetSystemPrincipal(), - nsILoadInfo::SEC_ALLOW_CROSS_ORIGIN_DATA_IS_NULL, - nsIContentPolicy::TYPE_OTHER); - if (NS_FAILED(rv)) { - return nullptr; - } - - nsCOMPtr instream; - rv = channel->Open(getter_AddRefs(instream)); - if (NS_FAILED(rv)) { - return nullptr; - } - - hnjFile* f = new hnjFile; - f->mStream = instream; - f->mCurPos = 0; - f->mLimit = 0; - f->mEOF = false; - - return f; -} - -// replacement for fclose() -int hnjFclose(hnjFile* f) { - NS_ASSERTION(f && f->mStream, "bad argument to hnjFclose"); - - int result = 0; - nsresult rv = f->mStream->Close(); - if (NS_FAILED(rv)) { - result = EOF; - } - f->mStream = nullptr; - - delete f; - return result; -} - -// replacement for fgetc() -int hnjFgetc(hnjFile* f) { - if (f->mCurPos >= f->mLimit) { - f->mCurPos = 0; - - nsresult rv = f->mStream->Read(f->mBuffer, BUFSIZE, &f->mLimit); - if (NS_FAILED(rv)) { - f->mLimit = 0; - } - - if (f->mLimit == 0) { - f->mEOF = true; - return EOF; - } - } - - return f->mBuffer[f->mCurPos++]; -} - -// replacement for fgets() -// (not a full reimplementation, but sufficient for 
libhyphen's needs) -char* hnjFgets(char* s, int n, hnjFile* f) { - NS_ASSERTION(s && f, "bad argument to hnjFgets"); - - int i = 0; - while (i < n - 1) { - int c = hnjFgetc(f); - - if (c == EOF) { - break; - } - - s[i++] = c; - - if (c == '\n' || c == '\r') { - break; - } - } - - if (i == 0) { - return nullptr; // end of file - } - - s[i] = '\0'; // null-terminate the returned string - return s; -} - -int hnjFeof(hnjFile* f) { return f->mEOF ? EOF : 0; } diff --git a/intl/hyphenation/glue/moz.build b/intl/hyphenation/glue/moz.build index 92e8ad11aeca..a66d08c26662 100644 --- a/intl/hyphenation/glue/moz.build +++ b/intl/hyphenation/glue/moz.build @@ -14,16 +14,18 @@ UNIFIED_SOURCES += [ 'nsHyphenator.cpp', ] -# These files cannot be built in unified mode because they include hnjalloc.h. -SOURCES += [ - 'hnjstdio.cpp', -] - -LOCAL_INCLUDES += [ - '../hyphen', -] - FINAL_LIBRARY = 'xul' if CONFIG['CC_TYPE'] in ('clang', 'gcc'): CXXFLAGS += ['-Wno-error=shadow'] + +if CONFIG['COMPILE_ENVIRONMENT']: + GENERATED_FILES += [ + 'mapped_hyph.h' + ] + + generated = GENERATED_FILES['mapped_hyph.h'] + generated.script = '/layout/style/RunCbindgen.py:generate' + generated.inputs = [ + '/third_party/rust/mapped_hyph' + ] diff --git a/intl/hyphenation/glue/nsHyphenationManager.cpp b/intl/hyphenation/glue/nsHyphenationManager.cpp index b20e6b754579..68953a23f12c 100644 --- a/intl/hyphenation/glue/nsHyphenationManager.cpp +++ b/intl/hyphenation/glue/nsHyphenationManager.cpp @@ -37,8 +37,7 @@ static const char kMemoryPressureNotification[] = "memory-pressure"; static const char kParentShuttingDownNotification[] = "profile-before-change"; static const char kChildShuttingDownNotification[] = "content-child-shutdown"; -class HyphenReporter final : public nsIMemoryReporter, - public CountingAllocatorBase { +class HyphenReporter final : public nsIMemoryReporter { private: ~HyphenReporter() = default; @@ -47,14 +46,19 @@ class HyphenReporter final : public nsIMemoryReporter, // For 
telemetry, we report the memory rounded up to the nearest KB. static uint32_t MemoryAllocatedInKB() { - return (MemoryAllocated() + 1023) / 1024; + size_t total = 0; + if (nsHyphenationManager::Instance()) { + total = nsHyphenationManager::Instance()->SizeOfIncludingThis( + moz_malloc_size_of); + } + return (total + 1023) / 1024; } NS_IMETHOD CollectReports(nsIHandleReportCallback* aHandleReport, nsISupports* aData, bool aAnonymize) override { - size_t total = MemoryAllocated(); + size_t total = 0; if (nsHyphenationManager::Instance()) { - total += nsHyphenationManager::Instance()->SizeOfIncludingThis( + total = nsHyphenationManager::Instance()->SizeOfIncludingThis( moz_malloc_size_of); } MOZ_COLLECT_REPORT("explicit/hyphenation", KIND_HEAP, UNITS_BYTES, total, @@ -65,30 +69,6 @@ class HyphenReporter final : public nsIMemoryReporter, NS_IMPL_ISUPPORTS(HyphenReporter, nsIMemoryReporter) -template <> -CountingAllocatorBase::AmountType - CountingAllocatorBase::sAmount(0); - -/** - * Allocation wrappers to track the amount of memory allocated by libhyphen. - * Note that libhyphen assumes its malloc/realloc functions are infallible! 
- */ -extern "C" { -void* hnj_malloc(size_t aSize); -void* hnj_realloc(void* aPtr, size_t aSize); -void hnj_free(void* aPtr); -}; - -void* hnj_malloc(size_t aSize) { - return HyphenReporter::InfallibleCountingMalloc(aSize); -} - -void* hnj_realloc(void* aPtr, size_t aSize) { - return HyphenReporter::InfallibleCountingRealloc(aPtr, aSize); -} - -void hnj_free(void* aPtr) { HyphenReporter::CountingFree(aPtr); } - nsHyphenationManager* nsHyphenationManager::sInstance = nullptr; NS_IMPL_ISUPPORTS(nsHyphenationManager, nsIObserver) @@ -257,7 +237,7 @@ void nsHyphenationManager::LoadPatternListFromOmnijar(Omnijar::Type aType) { } nsZipFind* find; - zip->FindInit("hyphenation/hyph_*.dic", &find); + zip->FindInit("hyphenation/hyph_*.hyf", &find); if (!find) { return; } @@ -278,7 +258,7 @@ void nsHyphenationManager::LoadPatternListFromOmnijar(Omnijar::Type aType) { continue; } ToLowerCase(locale); - locale.SetLength(locale.Length() - 4); // strip ".dic" + locale.SetLength(locale.Length() - 4); // strip ".hyf" locale.Cut(0, locale.RFindChar('/') + 1); // strip directory if (StringBeginsWith(locale, NS_LITERAL_CSTRING("hyph_"))) { locale.Cut(0, 5); @@ -323,13 +303,13 @@ void nsHyphenationManager::LoadPatternListFromDir(nsIFile* aDir) { file->GetLeafName(dictName); NS_ConvertUTF16toUTF8 locale(dictName); ToLowerCase(locale); - if (!StringEndsWith(locale, NS_LITERAL_CSTRING(".dic"))) { + if (!StringEndsWith(locale, NS_LITERAL_CSTRING(".hyf"))) { continue; } if (StringBeginsWith(locale, NS_LITERAL_CSTRING("hyph_"))) { locale.Cut(0, 5); } - locale.SetLength(locale.Length() - 4); // strip ".dic" + locale.SetLength(locale.Length() - 4); // strip ".hyf" for (uint32_t i = 0; i < locale.Length(); ++i) { if (locale[i] == '_') { locale.Replace(i, 1, '-'); @@ -383,9 +363,6 @@ size_t nsHyphenationManager::SizeOfIncludingThis(MallocSizeOf aMallocSizeOf) { // finds it is worthwhile. 
result += mHyphenators.ShallowSizeOfExcludingThis(aMallocSizeOf); - for (auto i = mHyphenators.ConstIter(); !i.Done(); i.Next()) { - result += aMallocSizeOf(i.Data().get()); - } return result; } diff --git a/intl/hyphenation/glue/nsHyphenator.cpp b/intl/hyphenation/glue/nsHyphenator.cpp index 2d116516683e..bff3e8265eca 100644 --- a/intl/hyphenation/glue/nsHyphenator.cpp +++ b/intl/hyphenation/glue/nsHyphenator.cpp @@ -4,34 +4,147 @@ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ #include "nsHyphenator.h" -#include "nsIFile.h" -#include "nsUTF8Utils.h" -#include "nsUnicodeProperties.h" -#include "nsIURI.h" -#include "mozilla/Telemetry.h" -#include "hyphen.h" +#include "mozilla/Telemetry.h" +#include "nsContentUtils.h" +#include "nsIChannel.h" +#include "nsIFile.h" +#include "nsIFileURL.h" +#include "nsIInputStream.h" +#include "nsIJARURI.h" +#include "nsIURI.h" +#include "nsNetUtil.h" +#include "nsUnicodeProperties.h" +#include "nsUTF8Utils.h" + +#include "mapped_hyph.h" + +static const void* GetItemPtrFromJarURI(nsIJARURI* aJAR, uint32_t* aLength) { + // Try to get the jarfile's nsZipArchive, find the relevant item, and return + // a pointer to its data provided it is stored uncompressed. + nsCOMPtr jarFile; + if (NS_FAILED(aJAR->GetJARFile(getter_AddRefs(jarFile)))) { + return nullptr; + } + nsCOMPtr fileUrl = do_QueryInterface(jarFile); + if (!fileUrl) { + return nullptr; + } + nsCOMPtr file; + fileUrl->GetFile(getter_AddRefs(file)); + if (!file) { + return nullptr; + } + RefPtr archive = mozilla::Omnijar::GetReader(file); + if (archive) { + nsCString path; + aJAR->GetJAREntry(path); + nsZipItem* item = archive->GetItem(path.get()); + if (item && item->Compression() == 0 && item->Size() > 0) { + // We do NOT own this data, but it won't go away until the omnijar + // file is closed during shutdown. 
+ const uint8_t* data = archive->GetData(item); + if (data) { + *aLength = item->Size(); + return data; + } + } + } + return nullptr; +} + +static const void* LoadResourceFromURI(nsIURI* aURI, uint32_t* aLength) { + nsCOMPtr channel; + if (NS_FAILED(NS_NewChannel(getter_AddRefs(channel), aURI, + nsContentUtils::GetSystemPrincipal(), + nsILoadInfo::SEC_ALLOW_CROSS_ORIGIN_DATA_IS_NULL, + nsIContentPolicy::TYPE_OTHER))) { + return nullptr; + } + nsCOMPtr instream; + if (NS_FAILED(channel->Open(getter_AddRefs(instream)))) { + return nullptr; + } + // Check size, bail out if it is excessively large (the largest of the + // hyphenation files currently shipped with Firefox is around 1MB + // uncompressed). + uint64_t available; + if (NS_FAILED(instream->Available(&available)) || !available || + available > 16 * 1024 * 1024) { + return nullptr; + } + char* buffer = static_cast(malloc(available)); + if (!buffer) { + return nullptr; + } + uint32_t bytesRead = 0; + if (NS_FAILED(instream->Read(buffer, available, &bytesRead)) || + bytesRead != available) { + free(buffer); + return nullptr; + } + *aLength = bytesRead; + return buffer; +} nsHyphenator::nsHyphenator(nsIURI* aURI, bool aHyphenateCapitalized) - : mDict(nullptr), mHyphenateCapitalized(aHyphenateCapitalized) { - nsCString uriSpec; - nsresult rv = aURI->GetSpec(uriSpec); - if (NS_FAILED(rv)) { - return; - } + : mDict(nullptr), + mDictSize(0), + mOwnsDict(false), + mHyphenateCapitalized(aHyphenateCapitalized) { Telemetry::AutoTimer telemetry; - mDict = hnj_hyphen_load(uriSpec.get()); -#ifdef DEBUG - if (mDict) { - printf("loaded hyphenation patterns from %s\n", uriSpec.get()); + + nsCOMPtr jar = do_QueryInterface(aURI); + if (jar) { + // This gives us a raw pointer into the omnijar's data (if uncompressed); + // we do not own it and must not attempt to free it! + mDict = GetItemPtrFromJarURI(jar, &mDictSize); + if (!mDict) { + // Omnijar must be compressed: we need to decompress the item into our + // own buffer. 
(Currently this is the case on Android.) + // TODO: Allocate in shared memory for all content processes to use. + mDict = LoadResourceFromURI(aURI, &mDictSize); + mOwnsDict = true; + } + if (mDict) { + // Reject the resource from omnijar if it fails to validate. (If this + // happens, we will hit the MOZ_ASSERT_UNREACHABLE at the end of the + // constructor, indicating the build is broken in some way.) + if (!mapped_hyph_is_valid_hyphenator(static_cast(mDict), + mDictSize)) { + if (mOwnsDict) { + free(const_cast(mDict)); + } + mDict = nullptr; + mDictSize = 0; + } + } + } else if (mozilla::net::SchemeIsFile(aURI)) { + // Ask the Rust lib to mmap the file. In this case our mDictSize field + // remains zero; mDict is not a pointer to the raw data but an opaque + // reference to a Rust object, and can only be freed by passing it to + // mapped_hyph_free_dictionary(). + nsAutoCString path; + aURI->GetFilePath(path); + mDict = mapped_hyph_load_dictionary(path.get()); + } + + if (!mDict) { + // This should never happen, unless someone has included an invalid + // hyphenation file that fails to load. + MOZ_ASSERT_UNREACHABLE("invalid hyphenation resource?"); } -#endif } nsHyphenator::~nsHyphenator() { - if (mDict != nullptr) { - hnj_hyphen_free((HyphenDict*)mDict); - mDict = nullptr; + if (mDict) { + if (mDictSize) { + if (mOwnsDict) { + free(const_cast(mDict)); + } + } else { + mapped_hyph_free_dictionary((HyphDic*)mDict); + } } } @@ -83,13 +196,12 @@ nsresult nsHyphenator::Hyphenate(const nsAString& aString, void nsHyphenator::HyphenateWord(const nsAString& aString, uint32_t aStart, uint32_t aLimit, nsTArray& aHyphens) { - // Convert word from aStart and aLimit in aString to utf-8 for libhyphen, + // Convert word from aStart and aLimit in aString to utf-8 for mapped_hyph, // lowercasing it as we go so that it will match the (lowercased) patterns // (bug 1105644). 
nsAutoCString utf8; - const char16_t* const begin = aString.BeginReading(); - const char16_t* cur = begin + aStart; - const char16_t* end = begin + aLimit; + const char16_t* cur = aString.BeginReading() + aStart; + const char16_t* end = aString.BeginReading() + aLimit; bool firstLetter = true; while (cur < end) { uint32_t ch = *cur++; @@ -98,10 +210,10 @@ void nsHyphenator::HyphenateWord(const nsAString& aString, uint32_t aStart, if (cur < end && NS_IS_LOW_SURROGATE(*cur)) { ch = SURROGATE_TO_UCS4(ch, *cur++); } else { - ch = 0xfffd; // unpaired surrogate, treat as REPLACEMENT CHAR + return; // unpaired surrogate: bail out, don't hyphenate broken text } } else if (NS_IS_LOW_SURROGATE(ch)) { - ch = 0xfffd; // unpaired surrogate + return; // unpaired surrogate } // XXX What about language-specific casing? Consider Turkish I/i... @@ -111,15 +223,11 @@ void nsHyphenator::HyphenateWord(const nsAString& aString, uint32_t aStart, ch = ToLowerCase(ch); if (ch != origCh) { - if (firstLetter) { - // Avoid hyphenating capitalized words (bug 1550532) unless explicitly - // allowed by prefs for the language in use. - if (!mHyphenateCapitalized) { - return; - } - } else { - // Also never auto-hyphenate a word that has internal caps, as it may - // well be an all-caps acronym or a quirky name like iTunes. + // Avoid hyphenating capitalized words (bug 1550532) unless explicitly + // allowed by prefs for the language in use. + // Also never auto-hyphenate a word that has internal caps, as it may + // well be an all-caps acronym or a quirky name like iTunes. 
+ if (!mHyphenateCapitalized || !firstLetter) { return; } } @@ -142,31 +250,43 @@ void nsHyphenator::HyphenateWord(const nsAString& aString, uint32_t aStart, } } - AutoTArray utf8hyphens; - utf8hyphens.SetLength(utf8.Length() + 5); - char** rep = nullptr; - int* pos = nullptr; - int* cut = nullptr; - int err = hnj_hyphen_hyphenate2((HyphenDict*)mDict, utf8.BeginReading(), - utf8.Length(), utf8hyphens.Elements(), - nullptr, &rep, &pos, &cut); - if (!err) { - // Surprisingly, hnj_hyphen_hyphenate2 converts the 'hyphens' buffer - // from utf8 code unit indexing (which would match the utf8 input - // string directly) to Unicode character indexing. - // We then need to convert this to utf16 code unit offsets for Gecko. - const char* hyphPtr = utf8hyphens.Elements(); - const char16_t* cur = begin + aStart; - const char16_t* end = begin + aLimit; - while (cur < end) { - if (*hyphPtr & 0x01) { - aHyphens[cur - begin] = true; + AutoTArray hyphenValues; + hyphenValues.SetLength(utf8.Length()); + int32_t result; + if (mDictSize > 0) { + result = mapped_hyph_find_hyphen_values_raw( + static_cast(mDict), mDictSize, utf8.BeginReading(), + utf8.Length(), hyphenValues.Elements(), hyphenValues.Length()); + } else { + result = mapped_hyph_find_hyphen_values_dic( + static_cast(mDict), utf8.BeginReading(), utf8.Length(), + hyphenValues.Elements(), hyphenValues.Length()); + } + if (result > 0) { + // We need to convert UTF-8 indexing as used by the hyphenation lib into + // UTF-16 indexing of the aHyphens[] array for Gecko. + uint32_t utf16index = 0; + for (uint32_t utf8index = 0; utf8index < utf8.Length();) { + // We know utf8 is valid, so we only need to look at the first byte of + // each character to determine its length and the corresponding UTF-16 + // length to add to utf16index. 
+ const uint8_t leadByte = utf8[utf8index]; + if (leadByte < 0x80) { + utf8index += 1; + } else if (leadByte < 0xE0) { + utf8index += 2; + } else if (leadByte < 0xF0) { + utf8index += 3; + } else { + utf8index += 4; } - cur++; - if (cur < end && NS_IS_SURROGATE_PAIR(*(cur - 1), *cur)) { - cur++; + // The hyphenation value of interest is the one for the last code unit + // of the utf-8 character, and is recorded on the last code unit of the + // utf-16 character (in the case of a surrogate pair). + utf16index += leadByte >= 0xF0 ? 2 : 1; + if (utf16index > 0 && (hyphenValues[utf8index - 1] & 0x01)) { + aHyphens[aStart + utf16index - 1] = true; } - hyphPtr++; } } } diff --git a/intl/hyphenation/glue/nsHyphenator.h b/intl/hyphenation/glue/nsHyphenator.h index 790e3b32e509..0eb8a2468ff4 100644 --- a/intl/hyphenation/glue/nsHyphenator.h +++ b/intl/hyphenation/glue/nsHyphenator.h @@ -28,7 +28,14 @@ class nsHyphenator { void HyphenateWord(const nsAString& aString, uint32_t aStart, uint32_t aLimit, nsTArray& aHyphens); - void* mDict; + const void* mDict; // If mDictSize > 0, this points to a raw byte buffer + // containing the hyphenation dictionary data (in the + // memory-mapped omnijar, or owned by us if mOwnsDict); + // if mDictSize == 0, it's a HyphDic reference created + // by mapped_hyph_load_dictionary() and must be released + // by calling mapped_hyph_free_dictionary(). + uint32_t mDictSize; + bool mOwnsDict; bool mHyphenateCapitalized; }; diff --git a/intl/hyphenation/hyphen/AUTHORS b/intl/hyphenation/hyphen/AUTHORS deleted file mode 100644 index e1e0f3c844ec..000000000000 --- a/intl/hyphenation/hyphen/AUTHORS +++ /dev/null @@ -1,17 +0,0 @@ -Libhnj was written by Raph Levien . 
- -Original Libhnj source with OOo's patches are managed by Rene Engelhard and -Chris Halls at Debian: http://packages.debian.org/stable/libdevel/libhnj-dev -and http://packages.debian.org/unstable/source/libhnj - -This subset of Libhnj was extended by -Peter Novodvorsky (OOo integration), -László Németh (non-standard and compound -hyphenation with Unicode support), -Nanning Buitenhuis (substrings.c) - -Write bug reports to László Németh or in the bug tracker of hunspell.sf.net. - ---- -Please contact Raph Levien for information about licensing for -proprietary applications. diff --git a/intl/hyphenation/hyphen/COPYING b/intl/hyphenation/hyphen/COPYING deleted file mode 100644 index 4c278cb77352..000000000000 --- a/intl/hyphenation/hyphen/COPYING +++ /dev/null @@ -1,17 +0,0 @@ -GPL 2.0/LGPL 2.1/MPL 1.1 tri-license - -The contents of this software may be used under the terms of -the GNU General Public License Version 2 or later (the "GPL"), or -the GNU Lesser General Public License Version 2.1 or later (the "LGPL", -see COPYING.LGPL) or the Mozilla Public License Version 1.1 or later -(the "MPL", see COPYING.MPL). - -The Plain TeX hyphenation tables "hyphen.tex" by Donald E. Knuth -has a non MPL/LGPL compatible license, but freely redistributable: -"Unlimited copying and redistribution of this file are permitted as long -as this file is not modified. Modifications are permitted, but only if -the resulting file is not named hyphen.tex." - -Software distributed under these licenses is distributed on an "AS IS" basis, -WITHOUT WARRANTY OF ANY KIND, either express or implied. See the licences -for the specific language governing rights and limitations under the licenses. 
diff --git a/intl/hyphenation/hyphen/COPYING.LGPL b/intl/hyphenation/hyphen/COPYING.LGPL deleted file mode 100644 index c4792dd27a32..000000000000 --- a/intl/hyphenation/hyphen/COPYING.LGPL +++ /dev/null @@ -1,515 +0,0 @@ - - GNU LESSER GENERAL PUBLIC LICENSE - Version 2.1, February 1999 - - Copyright (C) 1991, 1999 Free Software Foundation, Inc. - 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - -[This is the first released version of the Lesser GPL. It also counts - as the successor of the GNU Library Public License, version 2, hence - the version number 2.1.] - - Preamble - - The licenses for most software are designed to take away your -freedom to share and change it. By contrast, the GNU General Public -Licenses are intended to guarantee your freedom to share and change -free software--to make sure the software is free for all its users. - - This license, the Lesser General Public License, applies to some -specially designated software packages--typically libraries--of the -Free Software Foundation and other authors who decide to use it. You -can use it too, but we suggest you first think carefully about whether -this license or the ordinary General Public License is the better -strategy to use in any particular case, based on the explanations -below. - - When we speak of free software, we are referring to freedom of use, -not price. Our General Public Licenses are designed to make sure that -you have the freedom to distribute copies of free software (and charge -for this service if you wish); that you receive source code or can get -it if you want it; that you can change the software and use pieces of -it in new free programs; and that you are informed that you can do -these things. - - To protect your rights, we need to make restrictions that forbid -distributors to deny you these rights or to ask you to surrender these -rights. 
These restrictions translate to certain responsibilities for -you if you distribute copies of the library or if you modify it. - - For example, if you distribute copies of the library, whether gratis -or for a fee, you must give the recipients all the rights that we gave -you. You must make sure that they, too, receive or can get the source -code. If you link other code with the library, you must provide -complete object files to the recipients, so that they can relink them -with the library after making changes to the library and recompiling -it. And you must show them these terms so they know their rights. - - We protect your rights with a two-step method: (1) we copyright the -library, and (2) we offer you this license, which gives you legal -permission to copy, distribute and/or modify the library. - - To protect each distributor, we want to make it very clear that -there is no warranty for the free library. Also, if the library is -modified by someone else and passed on, the recipients should know -that what they have is not the original version, so that the original -author's reputation will not be affected by problems that might be -introduced by others. -^L - Finally, software patents pose a constant threat to the existence of -any free program. We wish to make sure that a company cannot -effectively restrict the users of a free program by obtaining a -restrictive license from a patent holder. Therefore, we insist that -any patent license obtained for a version of the library must be -consistent with the full freedom of use specified in this license. - - Most GNU software, including some libraries, is covered by the -ordinary GNU General Public License. This license, the GNU Lesser -General Public License, applies to certain designated libraries, and -is quite different from the ordinary General Public License. We use -this license for certain libraries in order to permit linking those -libraries into non-free programs. 
- - When a program is linked with a library, whether statically or using -a shared library, the combination of the two is legally speaking a -combined work, a derivative of the original library. The ordinary -General Public License therefore permits such linking only if the -entire combination fits its criteria of freedom. The Lesser General -Public License permits more lax criteria for linking other code with -the library. - - We call this license the "Lesser" General Public License because it -does Less to protect the user's freedom than the ordinary General -Public License. It also provides other free software developers Less -of an advantage over competing non-free programs. These disadvantages -are the reason we use the ordinary General Public License for many -libraries. However, the Lesser license provides advantages in certain -special circumstances. - - For example, on rare occasions, there may be a special need to -encourage the widest possible use of a certain library, so that it -becomes -a de-facto standard. To achieve this, non-free programs must be -allowed to use the library. A more frequent case is that a free -library does the same job as widely used non-free libraries. In this -case, there is little to gain by limiting the free library to free -software only, so we use the Lesser General Public License. - - In other cases, permission to use a particular library in non-free -programs enables a greater number of people to use a large body of -free software. For example, permission to use the GNU C Library in -non-free programs enables many more people to use the whole GNU -operating system, as well as its variant, the GNU/Linux operating -system. - - Although the Lesser General Public License is Less protective of the -users' freedom, it does ensure that the user of a program that is -linked with the Library has the freedom and the wherewithal to run -that program using a modified version of the Library. 
- - The precise terms and conditions for copying, distribution and -modification follow. Pay close attention to the difference between a -"work based on the library" and a "work that uses the library". The -former contains code derived from the library, whereas the latter must -be combined with the library in order to run. -^L - GNU LESSER GENERAL PUBLIC LICENSE - TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION - - 0. This License Agreement applies to any software library or other -program which contains a notice placed by the copyright holder or -other authorized party saying it may be distributed under the terms of -this Lesser General Public License (also called "this License"). -Each licensee is addressed as "you". - - A "library" means a collection of software functions and/or data -prepared so as to be conveniently linked with application programs -(which use some of those functions and data) to form executables. - - The "Library", below, refers to any such software library or work -which has been distributed under these terms. A "work based on the -Library" means either the Library or any derivative work under -copyright law: that is to say, a work containing the Library or a -portion of it, either verbatim or with modifications and/or translated -straightforwardly into another language. (Hereinafter, translation is -included without limitation in the term "modification".) - - "Source code" for a work means the preferred form of the work for -making modifications to it. For a library, complete source code means -all the source code for all modules it contains, plus any associated -interface definition files, plus the scripts used to control -compilation -and installation of the library. - - Activities other than copying, distribution and modification are not -covered by this License; they are outside its scope. 
The act of -running a program using the Library is not restricted, and output from -such a program is covered only if its contents constitute a work based -on the Library (independent of the use of the Library in a tool for -writing it). Whether that is true depends on what the Library does -and what the program that uses the Library does. - - 1. You may copy and distribute verbatim copies of the Library's -complete source code as you receive it, in any medium, provided that -you conspicuously and appropriately publish on each copy an -appropriate copyright notice and disclaimer of warranty; keep intact -all the notices that refer to this License and to the absence of any -warranty; and distribute a copy of this License along with the -Library. - - You may charge a fee for the physical act of transferring a copy, -and you may at your option offer warranty protection in exchange for a -fee. - - 2. You may modify your copy or copies of the Library or any portion -of it, thus forming a work based on the Library, and copy and -distribute such modifications or work under the terms of Section 1 -above, provided that you also meet all of these conditions: - - a) The modified work must itself be a software library. - - b) You must cause the files modified to carry prominent notices - stating that you changed the files and the date of any change. - - c) You must cause the whole of the work to be licensed at no - charge to all third parties under the terms of this License. - - d) If a facility in the modified Library refers to a function or a - table of data to be supplied by an application program that uses - the facility, other than as an argument passed when the facility - is invoked, then you must make a good faith effort to ensure that, - in the event an application does not supply such function or - table, the facility still operates, and performs whatever part of - its purpose remains meaningful. 
- - (For example, a function in a library to compute square roots has - a purpose that is entirely well-defined independent of the - application. Therefore, Subsection 2d requires that any - application-supplied function or table used by this function must - be optional: if the application does not supply it, the square - root function must still compute square roots.) - -These requirements apply to the modified work as a whole. If -identifiable sections of that work are not derived from the Library, -and can be reasonably considered independent and separate works in -themselves, then this License, and its terms, do not apply to those -sections when you distribute them as separate works. But when you -distribute the same sections as part of a whole which is a work based -on the Library, the distribution of the whole must be on the terms of -this License, whose permissions for other licensees extend to the -entire whole, and thus to each and every part regardless of who wrote -it. - -Thus, it is not the intent of this section to claim rights or contest -your rights to work written entirely by you; rather, the intent is to -exercise the right to control the distribution of derivative or -collective works based on the Library. - -In addition, mere aggregation of another work not based on the Library -with the Library (or with a work based on the Library) on a volume of -a storage or distribution medium does not bring the other work under -the scope of this License. - - 3. You may opt to apply the terms of the ordinary GNU General Public -License instead of this License to a given copy of the Library. To do -this, you must alter all the notices that refer to this License, so -that they refer to the ordinary GNU General Public License, version 2, -instead of to this License. (If a newer version than version 2 of the -ordinary GNU General Public License has appeared, then you can specify -that version instead if you wish.) Do not make any other change in -these notices. 
-^L - Once this change is made in a given copy, it is irreversible for -that copy, so the ordinary GNU General Public License applies to all -subsequent copies and derivative works made from that copy. - - This option is useful when you wish to copy part of the code of -the Library into a program that is not a library. - - 4. You may copy and distribute the Library (or a portion or -derivative of it, under Section 2) in object code or executable form -under the terms of Sections 1 and 2 above provided that you accompany -it with the complete corresponding machine-readable source code, which -must be distributed under the terms of Sections 1 and 2 above on a -medium customarily used for software interchange. - - If distribution of object code is made by offering access to copy -from a designated place, then offering equivalent access to copy the -source code from the same place satisfies the requirement to -distribute the source code, even though third parties are not -compelled to copy the source along with the object code. - - 5. A program that contains no derivative of any portion of the -Library, but is designed to work with the Library by being compiled or -linked with it, is called a "work that uses the Library". Such a -work, in isolation, is not a derivative work of the Library, and -therefore falls outside the scope of this License. - - However, linking a "work that uses the Library" with the Library -creates an executable that is a derivative of the Library (because it -contains portions of the Library), rather than a "work that uses the -library". The executable is therefore covered by this License. -Section 6 states terms for distribution of such executables. - - When a "work that uses the Library" uses material from a header file -that is part of the Library, the object code for the work may be a -derivative work of the Library even though the source code is not. 
-Whether this is true is especially significant if the work can be -linked without the Library, or if the work is itself a library. The -threshold for this to be true is not precisely defined by law. - - If such an object file uses only numerical parameters, data -structure layouts and accessors, and small macros and small inline -functions (ten lines or less in length), then the use of the object -file is unrestricted, regardless of whether it is legally a derivative -work. (Executables containing this object code plus portions of the -Library will still fall under Section 6.) - - Otherwise, if the work is a derivative of the Library, you may -distribute the object code for the work under the terms of Section 6. -Any executables containing that work also fall under Section 6, -whether or not they are linked directly with the Library itself. -^L - 6. As an exception to the Sections above, you may also combine or -link a "work that uses the Library" with the Library to produce a -work containing portions of the Library, and distribute that work -under terms of your choice, provided that the terms permit -modification of the work for the customer's own use and reverse -engineering for debugging such modifications. - - You must give prominent notice with each copy of the work that the -Library is used in it and that the Library and its use are covered by -this License. You must supply a copy of this License. If the work -during execution displays copyright notices, you must include the -copyright notice for the Library among them, as well as a reference -directing the user to the copy of this License. 
Also, you must do one -of these things: - - a) Accompany the work with the complete corresponding - machine-readable source code for the Library including whatever - changes were used in the work (which must be distributed under - Sections 1 and 2 above); and, if the work is an executable linked - with the Library, with the complete machine-readable "work that - uses the Library", as object code and/or source code, so that the - user can modify the Library and then relink to produce a modified - executable containing the modified Library. (It is understood - that the user who changes the contents of definitions files in the - Library will not necessarily be able to recompile the application - to use the modified definitions.) - - b) Use a suitable shared library mechanism for linking with the - Library. A suitable mechanism is one that (1) uses at run time a - copy of the library already present on the user's computer system, - rather than copying library functions into the executable, and (2) - will operate properly with a modified version of the library, if - the user installs one, as long as the modified version is - interface-compatible with the version that the work was made with. - - c) Accompany the work with a written offer, valid for at - least three years, to give the same user the materials - specified in Subsection 6a, above, for a charge no more - than the cost of performing this distribution. - - d) If distribution of the work is made by offering access to copy - from a designated place, offer equivalent access to copy the above - specified materials from the same place. - - e) Verify that the user has already received a copy of these - materials or that you have already sent this user a copy. - - For an executable, the required form of the "work that uses the -Library" must include any data and utility programs needed for -reproducing the executable from it. 
However, as a special exception, -the materials to be distributed need not include anything that is -normally distributed (in either source or binary form) with the major -components (compiler, kernel, and so on) of the operating system on -which the executable runs, unless that component itself accompanies -the executable. - - It may happen that this requirement contradicts the license -restrictions of other proprietary libraries that do not normally -accompany the operating system. Such a contradiction means you cannot -use both them and the Library together in an executable that you -distribute. -^L - 7. You may place library facilities that are a work based on the -Library side-by-side in a single library together with other library -facilities not covered by this License, and distribute such a combined -library, provided that the separate distribution of the work based on -the Library and of the other library facilities is otherwise -permitted, and provided that you do these two things: - - a) Accompany the combined library with a copy of the same work - based on the Library, uncombined with any other library - facilities. This must be distributed under the terms of the - Sections above. - - b) Give prominent notice with the combined library of the fact - that part of it is a work based on the Library, and explaining - where to find the accompanying uncombined form of the same work. - - 8. You may not copy, modify, sublicense, link with, or distribute -the Library except as expressly provided under this License. Any -attempt otherwise to copy, modify, sublicense, link with, or -distribute the Library is void, and will automatically terminate your -rights under this License. However, parties who have received copies, -or rights, from you under this License will not have their licenses -terminated so long as such parties remain in full compliance. - - 9. You are not required to accept this License, since you have not -signed it. 
However, nothing else grants you permission to modify or -distribute the Library or its derivative works. These actions are -prohibited by law if you do not accept this License. Therefore, by -modifying or distributing the Library (or any work based on the -Library), you indicate your acceptance of this License to do so, and -all its terms and conditions for copying, distributing or modifying -the Library or works based on it. - - 10. Each time you redistribute the Library (or any work based on the -Library), the recipient automatically receives a license from the -original licensor to copy, distribute, link with or modify the Library -subject to these terms and conditions. You may not impose any further -restrictions on the recipients' exercise of the rights granted herein. -You are not responsible for enforcing compliance by third parties with -this License. -^L - 11. If, as a consequence of a court judgment or allegation of patent -infringement or for any other reason (not limited to patent issues), -conditions are imposed on you (whether by court order, agreement or -otherwise) that contradict the conditions of this License, they do not -excuse you from the conditions of this License. If you cannot -distribute so as to satisfy simultaneously your obligations under this -License and any other pertinent obligations, then as a consequence you -may not distribute the Library at all. For example, if a patent -license would not permit royalty-free redistribution of the Library by -all those who receive copies directly or indirectly through you, then -the only way you could satisfy both it and this License would be to -refrain entirely from distribution of the Library. - -If any portion of this section is held invalid or unenforceable under -any particular circumstance, the balance of the section is intended to -apply, and the section as a whole is intended to apply in other -circumstances. 
- -It is not the purpose of this section to induce you to infringe any -patents or other property right claims or to contest validity of any -such claims; this section has the sole purpose of protecting the -integrity of the free software distribution system which is -implemented by public license practices. Many people have made -generous contributions to the wide range of software distributed -through that system in reliance on consistent application of that -system; it is up to the author/donor to decide if he or she is willing -to distribute software through any other system and a licensee cannot -impose that choice. - -This section is intended to make thoroughly clear what is believed to -be a consequence of the rest of this License. - - 12. If the distribution and/or use of the Library is restricted in -certain countries either by patents or by copyrighted interfaces, the -original copyright holder who places the Library under this License -may add an explicit geographical distribution limitation excluding those -countries, so that distribution is permitted only in or among -countries not thus excluded. In such case, this License incorporates -the limitation as if written in the body of this License. - - 13. The Free Software Foundation may publish revised and/or new -versions of the Lesser General Public License from time to time. -Such new versions will be similar in spirit to the present version, -but may differ in detail to address new problems or concerns. - -Each version is given a distinguishing version number. If the Library -specifies a version number of this License which applies to it and -"any later version", you have the option of following the terms and -conditions either of that version or of any later version published by -the Free Software Foundation. If the Library does not specify a -license version number, you may choose any version ever published by -the Free Software Foundation. -^L - 14. 
If you wish to incorporate parts of the Library into other free -programs whose distribution conditions are incompatible with these, -write to the author to ask for permission. For software which is -copyrighted by the Free Software Foundation, write to the Free -Software Foundation; we sometimes make exceptions for this. Our -decision will be guided by the two goals of preserving the free status -of all derivatives of our free software and of promoting the sharing -and reuse of software generally. - - NO WARRANTY - - 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO -WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. -EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR -OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY -KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE -LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME -THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. - - 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN -WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY -AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU -FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR -CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE -LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING -RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A -FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF -SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH -DAMAGES. 
- - END OF TERMS AND CONDITIONS -^L - How to Apply These Terms to Your New Libraries - - If you develop a new library, and you want it to be of the greatest -possible use to the public, we recommend making it free software that -everyone can redistribute and change. You can do so by permitting -redistribution under these terms (or, alternatively, under the terms -of the ordinary General Public License). - - To apply these terms, attach the following notices to the library. -It is safest to attach them to the start of each source file to most -effectively convey the exclusion of warranty; and each file should -have at least the "copyright" line and a pointer to where the full -notice is found. - - - - Copyright (C) - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - -Also add information on how to contact you by electronic and paper -mail. - -You should also get your employer (if you work as a programmer) or -your -school, if any, to sign a "copyright disclaimer" for the library, if -necessary. Here is a sample; alter the names: - - Yoyodyne, Inc., hereby disclaims all copyright interest in the - library `Frob' (a library for tweaking knobs) written by James -Random Hacker. - - , 1 April 1990 - Ty Coon, President of Vice - -That's all there is to it! 
- - diff --git a/intl/hyphenation/hyphen/COPYING.MPL b/intl/hyphenation/hyphen/COPYING.MPL deleted file mode 100644 index 7714141d1542..000000000000 --- a/intl/hyphenation/hyphen/COPYING.MPL +++ /dev/null @@ -1,470 +0,0 @@ - MOZILLA PUBLIC LICENSE - Version 1.1 - - --------------- - -1. Definitions. - - 1.0.1. "Commercial Use" means distribution or otherwise making the - Covered Code available to a third party. - - 1.1. "Contributor" means each entity that creates or contributes to - the creation of Modifications. - - 1.2. "Contributor Version" means the combination of the Original - Code, prior Modifications used by a Contributor, and the Modifications - made by that particular Contributor. - - 1.3. "Covered Code" means the Original Code or Modifications or the - combination of the Original Code and Modifications, in each case - including portions thereof. - - 1.4. "Electronic Distribution Mechanism" means a mechanism generally - accepted in the software development community for the electronic - transfer of data. - - 1.5. "Executable" means Covered Code in any form other than Source - Code. - - 1.6. "Initial Developer" means the individual or entity identified - as the Initial Developer in the Source Code notice required by Exhibit - A. - - 1.7. "Larger Work" means a work which combines Covered Code or - portions thereof with code not governed by the terms of this License. - - 1.8. "License" means this document. - - 1.8.1. "Licensable" means having the right to grant, to the maximum - extent possible, whether at the time of the initial grant or - subsequently acquired, any and all of the rights conveyed herein. - - 1.9. "Modifications" means any addition to or deletion from the - substance or structure of either the Original Code or any previous - Modifications. When Covered Code is released as a series of files, a - Modification is: - A. Any addition to or deletion from the contents of a file - containing Original Code or previous Modifications. - - B. 
Any new file that contains any part of the Original Code or - previous Modifications. - - 1.10. "Original Code" means Source Code of computer software code - which is described in the Source Code notice required by Exhibit A as - Original Code, and which, at the time of its release under this - License is not already Covered Code governed by this License. - - 1.10.1. "Patent Claims" means any patent claim(s), now owned or - hereafter acquired, including without limitation, method, process, - and apparatus claims, in any patent Licensable by grantor. - - 1.11. "Source Code" means the preferred form of the Covered Code for - making modifications to it, including all modules it contains, plus - any associated interface definition files, scripts used to control - compilation and installation of an Executable, or source code - differential comparisons against either the Original Code or another - well known, available Covered Code of the Contributor's choice. The - Source Code can be in a compressed or archival form, provided the - appropriate decompression or de-archiving software is widely available - for no charge. - - 1.12. "You" (or "Your") means an individual or a legal entity - exercising rights under, and complying with all of the terms of, this - License or a future version of this License issued under Section 6.1. - For legal entities, "You" includes any entity which controls, is - controlled by, or is under common control with You. For purposes of - this definition, "control" means (a) the power, direct or indirect, - to cause the direction or management of such entity, whether by - contract or otherwise, or (b) ownership of more than fifty percent - (50%) of the outstanding shares or beneficial ownership of such - entity. - -2. Source Code License. - - 2.1. The Initial Developer Grant. 
- The Initial Developer hereby grants You a world-wide, royalty-free, - non-exclusive license, subject to third party intellectual property - claims: - (a) under intellectual property rights (other than patent or - trademark) Licensable by Initial Developer to use, reproduce, - modify, display, perform, sublicense and distribute the Original - Code (or portions thereof) with or without Modifications, and/or - as part of a Larger Work; and - - (b) under Patents Claims infringed by the making, using or - selling of Original Code, to make, have made, use, practice, - sell, and offer for sale, and/or otherwise dispose of the - Original Code (or portions thereof). - - (c) the licenses granted in this Section 2.1(a) and (b) are - effective on the date Initial Developer first distributes - Original Code under the terms of this License. - - (d) Notwithstanding Section 2.1(b) above, no patent license is - granted: 1) for code that You delete from the Original Code; 2) - separate from the Original Code; or 3) for infringements caused - by: i) the modification of the Original Code or ii) the - combination of the Original Code with other software or devices. - - 2.2. Contributor Grant. 
- Subject to third party intellectual property claims, each Contributor - hereby grants You a world-wide, royalty-free, non-exclusive license - - (a) under intellectual property rights (other than patent or - trademark) Licensable by Contributor, to use, reproduce, modify, - display, perform, sublicense and distribute the Modifications - created by such Contributor (or portions thereof) either on an - unmodified basis, with other Modifications, as Covered Code - and/or as part of a Larger Work; and - - (b) under Patent Claims infringed by the making, using, or - selling of Modifications made by that Contributor either alone - and/or in combination with its Contributor Version (or portions - of such combination), to make, use, sell, offer for sale, have - made, and/or otherwise dispose of: 1) Modifications made by that - Contributor (or portions thereof); and 2) the combination of - Modifications made by that Contributor with its Contributor - Version (or portions of such combination). - - (c) the licenses granted in Sections 2.2(a) and 2.2(b) are - effective on the date Contributor first makes Commercial Use of - the Covered Code. - - (d) Notwithstanding Section 2.2(b) above, no patent license is - granted: 1) for any code that Contributor has deleted from the - Contributor Version; 2) separate from the Contributor Version; - 3) for infringements caused by: i) third party modifications of - Contributor Version or ii) the combination of Modifications made - by that Contributor with other software (except as part of the - Contributor Version) or other devices; or 4) under Patent Claims - infringed by Covered Code in the absence of Modifications made by - that Contributor. - -3. Distribution Obligations. - - 3.1. Application of License. - The Modifications which You create or to which You contribute are - governed by the terms of this License, including without limitation - Section 2.2. 
The Source Code version of Covered Code may be - distributed only under the terms of this License or a future version - of this License released under Section 6.1, and You must include a - copy of this License with every copy of the Source Code You - distribute. You may not offer or impose any terms on any Source Code - version that alters or restricts the applicable version of this - License or the recipients' rights hereunder. However, You may include - an additional document offering the additional rights described in - Section 3.5. - - 3.2. Availability of Source Code. - Any Modification which You create or to which You contribute must be - made available in Source Code form under the terms of this License - either on the same media as an Executable version or via an accepted - Electronic Distribution Mechanism to anyone to whom you made an - Executable version available; and if made available via Electronic - Distribution Mechanism, must remain available for at least twelve (12) - months after the date it initially became available, or at least six - (6) months after a subsequent version of that particular Modification - has been made available to such recipients. You are responsible for - ensuring that the Source Code version remains available even if the - Electronic Distribution Mechanism is maintained by a third party. - - 3.3. Description of Modifications. - You must cause all Covered Code to which You contribute to contain a - file documenting the changes You made to create that Covered Code and - the date of any change. You must include a prominent statement that - the Modification is derived, directly or indirectly, from Original - Code provided by the Initial Developer and including the name of the - Initial Developer in (a) the Source Code, and (b) in any notice in an - Executable version or related documentation in which You describe the - origin or ownership of the Covered Code. - - 3.4. Intellectual Property Matters - (a) Third Party Claims. 
- If Contributor has knowledge that a license under a third party's - intellectual property rights is required to exercise the rights - granted by such Contributor under Sections 2.1 or 2.2, - Contributor must include a text file with the Source Code - distribution titled "LEGAL" which describes the claim and the - party making the claim in sufficient detail that a recipient will - know whom to contact. If Contributor obtains such knowledge after - the Modification is made available as described in Section 3.2, - Contributor shall promptly modify the LEGAL file in all copies - Contributor makes available thereafter and shall take other steps - (such as notifying appropriate mailing lists or newsgroups) - reasonably calculated to inform those who received the Covered - Code that new knowledge has been obtained. - - (b) Contributor APIs. - If Contributor's Modifications include an application programming - interface and Contributor has knowledge of patent licenses which - are reasonably necessary to implement that API, Contributor must - also include this information in the LEGAL file. - - (c) Representations. - Contributor represents that, except as disclosed pursuant to - Section 3.4(a) above, Contributor believes that Contributor's - Modifications are Contributor's original creation(s) and/or - Contributor has sufficient rights to grant the rights conveyed by - this License. - - 3.5. Required Notices. - You must duplicate the notice in Exhibit A in each file of the Source - Code. If it is not possible to put such notice in a particular Source - Code file due to its structure, then You must include such notice in a - location (such as a relevant directory) where a user would be likely - to look for such a notice. If You created one or more Modification(s) - You may add your name as a Contributor to the notice described in - Exhibit A. 
You must also duplicate this License in any documentation - for the Source Code where You describe recipients' rights or ownership - rights relating to Covered Code. You may choose to offer, and to - charge a fee for, warranty, support, indemnity or liability - obligations to one or more recipients of Covered Code. However, You - may do so only on Your own behalf, and not on behalf of the Initial - Developer or any Contributor. You must make it absolutely clear than - any such warranty, support, indemnity or liability obligation is - offered by You alone, and You hereby agree to indemnify the Initial - Developer and every Contributor for any liability incurred by the - Initial Developer or such Contributor as a result of warranty, - support, indemnity or liability terms You offer. - - 3.6. Distribution of Executable Versions. - You may distribute Covered Code in Executable form only if the - requirements of Section 3.1-3.5 have been met for that Covered Code, - and if You include a notice stating that the Source Code version of - the Covered Code is available under the terms of this License, - including a description of how and where You have fulfilled the - obligations of Section 3.2. The notice must be conspicuously included - in any notice in an Executable version, related documentation or - collateral in which You describe recipients' rights relating to the - Covered Code. You may distribute the Executable version of Covered - Code or ownership rights under a license of Your choice, which may - contain terms different from this License, provided that You are in - compliance with the terms of this License and that the license for the - Executable version does not attempt to limit or alter the recipient's - rights in the Source Code version from the rights set forth in this - License. 
If You distribute the Executable version under a different - license You must make it absolutely clear that any terms which differ - from this License are offered by You alone, not by the Initial - Developer or any Contributor. You hereby agree to indemnify the - Initial Developer and every Contributor for any liability incurred by - the Initial Developer or such Contributor as a result of any such - terms You offer. - - 3.7. Larger Works. - You may create a Larger Work by combining Covered Code with other code - not governed by the terms of this License and distribute the Larger - Work as a single product. In such a case, You must make sure the - requirements of this License are fulfilled for the Covered Code. - -4. Inability to Comply Due to Statute or Regulation. - - If it is impossible for You to comply with any of the terms of this - License with respect to some or all of the Covered Code due to - statute, judicial order, or regulation then You must: (a) comply with - the terms of this License to the maximum extent possible; and (b) - describe the limitations and the code they affect. Such description - must be included in the LEGAL file described in Section 3.4 and must - be included with all distributions of the Source Code. Except to the - extent prohibited by statute or regulation, such description must be - sufficiently detailed for a recipient of ordinary skill to be able to - understand it. - -5. Application of this License. - - This License applies to code to which the Initial Developer has - attached the notice in Exhibit A and to related Covered Code. - -6. Versions of the License. - - 6.1. New Versions. - Netscape Communications Corporation ("Netscape") may publish revised - and/or new versions of the License from time to time. Each version - will be given a distinguishing version number. - - 6.2. Effect of New Versions. 
- Once Covered Code has been published under a particular version of the - License, You may always continue to use it under the terms of that - version. You may also choose to use such Covered Code under the terms - of any subsequent version of the License published by Netscape. No one - other than Netscape has the right to modify the terms applicable to - Covered Code created under this License. - - 6.3. Derivative Works. - If You create or use a modified version of this License (which you may - only do in order to apply it to code which is not already Covered Code - governed by this License), You must (a) rename Your license so that - the phrases "Mozilla", "MOZILLAPL", "MOZPL", "Netscape", - "MPL", "NPL" or any confusingly similar phrase do not appear in your - license (except to note that your license differs from this License) - and (b) otherwise make it clear that Your version of the license - contains terms which differ from the Mozilla Public License and - Netscape Public License. (Filling in the name of the Initial - Developer, Original Code or Contributor in the notice described in - Exhibit A shall not of themselves be deemed to be modifications of - this License.) - -7. DISCLAIMER OF WARRANTY. - - COVERED CODE IS PROVIDED UNDER THIS LICENSE ON AN "AS IS" BASIS, - WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, - WITHOUT LIMITATION, WARRANTIES THAT THE COVERED CODE IS FREE OF - DEFECTS, MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE OR NON-INFRINGING. - THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE COVERED CODE - IS WITH YOU. SHOULD ANY COVERED CODE PROVE DEFECTIVE IN ANY RESPECT, - YOU (NOT THE INITIAL DEVELOPER OR ANY OTHER CONTRIBUTOR) ASSUME THE - COST OF ANY NECESSARY SERVICING, REPAIR OR CORRECTION. THIS DISCLAIMER - OF WARRANTY CONSTITUTES AN ESSENTIAL PART OF THIS LICENSE. NO USE OF - ANY COVERED CODE IS AUTHORIZED HEREUNDER EXCEPT UNDER THIS DISCLAIMER. - -8. TERMINATION. - - 8.1. 
This License and the rights granted hereunder will terminate - automatically if You fail to comply with terms herein and fail to cure - such breach within 30 days of becoming aware of the breach. All - sublicenses to the Covered Code which are properly granted shall - survive any termination of this License. Provisions which, by their - nature, must remain in effect beyond the termination of this License - shall survive. - - 8.2. If You initiate litigation by asserting a patent infringement - claim (excluding declatory judgment actions) against Initial Developer - or a Contributor (the Initial Developer or Contributor against whom - You file such action is referred to as "Participant") alleging that: - - (a) such Participant's Contributor Version directly or indirectly - infringes any patent, then any and all rights granted by such - Participant to You under Sections 2.1 and/or 2.2 of this License - shall, upon 60 days notice from Participant terminate prospectively, - unless if within 60 days after receipt of notice You either: (i) - agree in writing to pay Participant a mutually agreeable reasonable - royalty for Your past and future use of Modifications made by such - Participant, or (ii) withdraw Your litigation claim with respect to - the Contributor Version against such Participant. If within 60 days - of notice, a reasonable royalty and payment arrangement are not - mutually agreed upon in writing by the parties or the litigation claim - is not withdrawn, the rights granted by Participant to You under - Sections 2.1 and/or 2.2 automatically terminate at the expiration of - the 60 day notice period specified above. 
- - (b) any software, hardware, or device, other than such Participant's - Contributor Version, directly or indirectly infringes any patent, then - any rights granted to You by such Participant under Sections 2.1(b) - and 2.2(b) are revoked effective as of the date You first made, used, - sold, distributed, or had made, Modifications made by that - Participant. - - 8.3. If You assert a patent infringement claim against Participant - alleging that such Participant's Contributor Version directly or - indirectly infringes any patent where such claim is resolved (such as - by license or settlement) prior to the initiation of patent - infringement litigation, then the reasonable value of the licenses - granted by such Participant under Sections 2.1 or 2.2 shall be taken - into account in determining the amount or value of any payment or - license. - - 8.4. In the event of termination under Sections 8.1 or 8.2 above, - all end user license agreements (excluding distributors and resellers) - which have been validly granted by You or any distributor hereunder - prior to termination shall survive termination. - -9. LIMITATION OF LIABILITY. - - UNDER NO CIRCUMSTANCES AND UNDER NO LEGAL THEORY, WHETHER TORT - (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE, SHALL YOU, THE INITIAL - DEVELOPER, ANY OTHER CONTRIBUTOR, OR ANY DISTRIBUTOR OF COVERED CODE, - OR ANY SUPPLIER OF ANY OF SUCH PARTIES, BE LIABLE TO ANY PERSON FOR - ANY INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES OF ANY - CHARACTER INCLUDING, WITHOUT LIMITATION, DAMAGES FOR LOSS OF GOODWILL, - WORK STOPPAGE, COMPUTER FAILURE OR MALFUNCTION, OR ANY AND ALL OTHER - COMMERCIAL DAMAGES OR LOSSES, EVEN IF SUCH PARTY SHALL HAVE BEEN - INFORMED OF THE POSSIBILITY OF SUCH DAMAGES. THIS LIMITATION OF - LIABILITY SHALL NOT APPLY TO LIABILITY FOR DEATH OR PERSONAL INJURY - RESULTING FROM SUCH PARTY'S NEGLIGENCE TO THE EXTENT APPLICABLE LAW - PROHIBITS SUCH LIMITATION. 
SOME JURISDICTIONS DO NOT ALLOW THE - EXCLUSION OR LIMITATION OF INCIDENTAL OR CONSEQUENTIAL DAMAGES, SO - THIS EXCLUSION AND LIMITATION MAY NOT APPLY TO YOU. - -10. U.S. GOVERNMENT END USERS. - - The Covered Code is a "commercial item," as that term is defined in - 48 C.F.R. 2.101 (Oct. 1995), consisting of "commercial computer - software" and "commercial computer software documentation," as such - terms are used in 48 C.F.R. 12.212 (Sept. 1995). Consistent with 48 - C.F.R. 12.212 and 48 C.F.R. 227.7202-1 through 227.7202-4 (June 1995), - all U.S. Government End Users acquire Covered Code with only those - rights set forth herein. - -11. MISCELLANEOUS. - - This License represents the complete agreement concerning subject - matter hereof. If any provision of this License is held to be - unenforceable, such provision shall be reformed only to the extent - necessary to make it enforceable. This License shall be governed by - California law provisions (except to the extent applicable law, if - any, provides otherwise), excluding its conflict-of-law provisions. - With respect to disputes in which at least one party is a citizen of, - or an entity chartered or registered to do business in the United - States of America, any litigation relating to this License shall be - subject to the jurisdiction of the Federal Courts of the Northern - District of California, with venue lying in Santa Clara County, - California, with the losing party responsible for costs, including - without limitation, court costs and reasonable attorneys' fees and - expenses. The application of the United Nations Convention on - Contracts for the International Sale of Goods is expressly excluded. - Any law or regulation which provides that the language of a contract - shall be construed against the drafter shall not apply to this - License. - -12. RESPONSIBILITY FOR CLAIMS. 
- - As between Initial Developer and the Contributors, each party is - responsible for claims and damages arising, directly or indirectly, - out of its utilization of rights under this License and You agree to - work with Initial Developer and Contributors to distribute such - responsibility on an equitable basis. Nothing herein is intended or - shall be deemed to constitute any admission of liability. - -13. MULTIPLE-LICENSED CODE. - - Initial Developer may designate portions of the Covered Code as - "Multiple-Licensed". "Multiple-Licensed" means that the Initial - Developer permits you to utilize portions of the Covered Code under - Your choice of the NPL or the alternative licenses, if any, specified - by the Initial Developer in the file described in Exhibit A. - -EXHIBIT A -Mozilla Public License. - - ``The contents of this file are subject to the Mozilla Public License - Version 1.1 (the "License"); you may not use this file except in - compliance with the License. You may obtain a copy of the License at - http://www.mozilla.org/MPL/ - - Software distributed under the License is distributed on an "AS IS" - basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the - License for the specific language governing rights and limitations - under the License. - - The Original Code is ______________________________________. - - The Initial Developer of the Original Code is ________________________. - Portions created by ______________________ are Copyright (C) ______ - _______________________. All Rights Reserved. - - Contributor(s): ______________________________________. - - Alternatively, the contents of this file may be used under the terms - of the _____ license (the "[___] License"), in which case the - provisions of [______] License are applicable instead of those - above. 
If you wish to allow use of your version of this file only - under the terms of the [____] License and not to allow others to use - your version of this file under the MPL, indicate your decision by - deleting the provisions above and replace them with the notice and - other provisions required by the [___] License. If you do not delete - the provisions above, a recipient may use your version of this file - under either the MPL or the [___] License." - - [NOTE: The text of this Exhibit A may differ slightly from the text of - the notices in the Source Code files of the Original Code. You should - use the text of this Exhibit A rather than the text found in the - Original Code Source Code for Your Modifications.] - diff --git a/intl/hyphenation/hyphen/NEWS b/intl/hyphenation/hyphen/NEWS deleted file mode 100755 index efaa78b24994..000000000000 --- a/intl/hyphenation/hyphen/NEWS +++ /dev/null @@ -1,106 +0,0 @@ -2014-09-18 Hyphen 2.8.8: - - remove last coverity warning, 0 remaining - -2014-06-27 Hyphen 2.8.7: - - various clang scan-build warning fixes - -2012-09-13 Hyphen 2.8.6: - - righthyphenmin fix for 3-byte or more UTF-8 - multibyte characters by Steven Dickson - - fix for fdo#43931 (removing hard hyphen hyphenation for LibreOffice) - -2012-07-12 Hyphen 2.8.5: - - fix short alloc - -2012-06-29 Hyphen 2.8.4: - - coverity warnings - -2011-10-10 Hyphen 2.8.3: - - fix NOHYPHEN - - fix unbalanced hyphenation of LibreOffice/OOo - - set default COMPOUNDHYPHENMIN=3 at hyphens and apostrophes - - fix VERBOSE in hyphen.c - - new ./example option: -n to print hyphenation vector - -2011-10-07 Hyphen 2.8.2: - - fix for explicite COMPOUNDHYPHENMIN values - -2011-10-06 Hyphen 2.8.1: - - force minimal lefthyphenmin and righthyphenmin values of the dictionary - (eg. righthyphenmin=3 of English dictionaries in LibreOffice/OOo, - also the original TeX hyphenation patterns are correct only with this - righthyphenmin value). 
- -2011-10-04 Hyphen 2.8: - - Ignore leading and ending numbers (eg. manual/field based indexes - in LibreOffice/OOo) - - - Fix LibreOffice/OpenOffice.org hyphenation errors at apostrophes and - hyphens, n-dashes with default NOHYPHEN separators. - Eg. *o'c=lock -> o'clock. - -2010-12-01 Hyphen 2.7.1 bug fix release - -2010-11-27 Hyphen 2.7 release: - - The new hyphenation problem of OpenOffice.org 3.2, related to its - modified word breaking of words with hyphen characters, can be fixed - with the new NOHYPHEN feature. Also it's possible to solve the similar old - problem with apostrophes. More information: README.compound. - - - improved English dictionaries - -2010-08-10 Hyphen 2.6 release: - - maintainance release, fix all warnings, tidy up - make check with VALGRIND=memcheck, etc. - -2010-02-23 Hyphen 2.5 release: - - add Unicode ligature support for correct hyphenmin calculation - (ff, fi, fl, St, st are 1-character, ffi and ffl are 2-character length for - hyphenation) - - fix lefthyphenmin calculation for UTF-8 encoded input - - - en_US hyphenation dictionary: - - add OpenOffice.org patch to fix apostrophe handling - - add correct hyphenation for words with Unicode f-ligatures - (NOTE: hyphenation within ligatures is not supported yet - because of an implementation problem of OpenOffice.org, - see OOo issue 71608.) - - - small patches from OpenOffice.org - -2008-05-01 Hyphen 2.4 release: - - compound word hyphenation support by recursive pattern matching - based on two hyphenation pattern sets, see README.compound. - Especially useful for languages with arbitrary number of compounds (Danish, - Dutch, Finnish, German, Hungarian, Icelandic, Norwegian, Swedish etc.). - - - new dictionary parameters (minimal character numbers for hyph. distances): - LEFTHYPHENMIN: minimal hyphenation distance from the left end of the word - RIGHTHYPHENMIN: minimal hyphenation distance from the right end of the word - COMPOUNDLEFTHYPHENMIN: min. hyph. dist. 
from the left compound word boundary - COMPOUNDRIGHTHYPHENMIN: min. hyph. dist. from the right comp. word boundary - - - new API function: hnj_hyphen_hyphenate3() (like hyphenate2(), but - with hyphenmin options) - -en_US hyphenation patterns: - - - extended hyph_en_US.dic with TugBoat hyphenation log (fix thousand - incompletely or badly hyphenated words, for example acad-e-my, acro-nym, - acryl-amide, adren-a-line, aero-space, am-phet-a-mine, anom-aly etc.) - - - fixed hyph_en_US.dic: set the right default hyphenation distance of - the original TeX hyphenation patterns: - LEFTHYPHENMIN 2 - RIGHTHYPHENMIN 3 (not 2!) - It is not only a typographical issue. It seems, TeX hyphenation - patterns are right only with these settings, for example, - the bad "anoma-ly" is restricted in TeX only by the default - \righthyphenmin=3 (but not restricted in OpenOffice.org, until now). - - - documentation (README_hyph_en_US.dic) - - - fixes for automake configuration, compiling and checking, see ChangeLog - -2008-02-19: Hyphen 2.3.1 release: - - fix obsolete API function hnj_hyphen_hyphenate() diff --git a/intl/hyphenation/hyphen/README b/intl/hyphenation/hyphen/README deleted file mode 100644 index 82c612724fe7..000000000000 --- a/intl/hyphenation/hyphen/README +++ /dev/null @@ -1,134 +0,0 @@ -Hyphen - hyphenation library to use converted TeX hyphenation patterns - -(C) 1998 Raph Levien -(C) 2001 ALTLinux, Moscow -(C) 2006, 2007, 2008, 2010, 2011 László Németh - -This was part of libHnj library by Raph Levien. - -Peter Novodvorsky from ALTLinux cut hyphenation part from libHnj -to use it in OpenOffice.org. - -Compound word and non-standard hyphenation support by László Németh. - -License is the original LibHnj license: -LibHnj is dual licensed under LGPL and MPL (see also README.libhnj). - -Because LGPL allows GPL relicensing, COPYING contains now -LGPL/GPL/MPL tri-license for explicit Mozilla source compatibility. 
- -Original Libhnj source with OOo's patches are managed by Rene Engelhard -and Chris Halls at Debian: - -http://packages.debian.org/stable/libdevel/libhnj-dev -and http://packages.debian.org/unstable/source/libhnj - - -OTHER FILES - -This distribution is the source of the en_US hyphenation patterns -"hyph_en_US.dic", too. See README_hyph_en_US.txt. - -Source files of hyph_en_US.dic in the distribution: - -hyphen.tex (en_US hyphenation patterns from plain TeX) - - Source: http://tug.ctan.org/text-archive/macros/plain/base/hyphen.tex - -tbhyphext.tex: hyphenation exception log from TugBoat archive - - Source of the hyphenation exception list: - http://www.ctan.org/tex-archive/info/digests/tugboat/tb0hyf.tex - - Generated with the hyphenex script - (http://www.ctan.org/tex-archive/info/digests/tugboat/hyphenex.sh) - - sh hyphenex.sh tbhyphext.tex - - -INSTALLATION - -autoreconf -fvi -./configure -make -make install - -UNIT TESTS (WITH VALGRIND DEBUGGER) - -make check -VALGRIND=memcheck make check - -USAGE - -./example hyph_en_US.dic mywords.txt - -or (under Linux) - -echo example | ./example hyph_en_US.dic /dev/stdin - -NOTE: In the case of Unicode encoded input, convert your words -to lowercase before hyphenation (under UTF-8 console environment): - -cat mywords.txt | awk '{print tolower($0)}' >mywordslow.txt - -BUILD DLL USING CROSS-COMPILATION - -./configure --host i586-mingw32 --prefix=/tmp/hyphen-dll -make -make install - -DEVELOPMENT - -See README.hyphen for hyphenation algorithm, README.nonstandard -and doc/tb87nemeth.pdf for non-standard hyphenation, -README.compound for compound word hyphenation, and tests/*. - -Description of the dictionary format: - -First line contains the character encoding (ISO8859-x, UTF-8). - -Possible options in the following lines: - -LEFTHYPHENMIN num minimal hyphenation distance from the left word end -RIGHTHYPHENMIN num minimal hyphation distance from the right word end -COMPOUNDLEFTHYPHENMIN num min. hyph. dist. 
from the left compound word boundary -COMPOUNDRIGHTHYPHENMIN num min. hyph. dist. from the right comp. word boundary - -hyphenation patterns see README.* files - -NEXTWORD separate the two compound sets (see README.compound) - -Default values: -Without explicite declarations, hyphenmin fields of dict struct -are zeroes, but in this case the lefthyphenmin and righthyphenmin -will be the default 2 under the hyphenation (for backward compatibility). - -Comments - -Use percent sign at the beginning of the lines to add comments to your -hpyhenation patterns (after the character encoding in the first line): - -% comment - -***************************************************************************** -* Warning! Correct working of Libhnj *needs* prepared hyphenation patterns. * - -For example, generating hyph_en_US.dic from "hyphen.us" TeX patterns: - -perl substrings.pl hyphen.us hyph_en_US.dic ISO8859-1 - -or with default LEFTHYPHENMIN and RIGHTHYPHENMIN values: - -perl substrings.pl hyphen.us hyph_en_US.dic ISO8859-1 2 3 -perl substrings.pl hyphen.gb hyph_en_GB.dic ISO8859-1 3 3 -**************************************************************************** - -OTHERS - -Java hyphenation: Peter B. West (Folio project) implements a hyphenator with -non standard hyphenation facilities based on extended Libhnj. The HyFo module -is released in binary form as jar files and in source form as zip files. -See http://sourceforge.net/project/showfiles.php?group_id=119136 - -László Németh - diff --git a/intl/hyphenation/hyphen/README.compound b/intl/hyphenation/hyphen/README.compound deleted file mode 100644 index bcb265853df0..000000000000 --- a/intl/hyphenation/hyphen/README.compound +++ /dev/null @@ -1,87 +0,0 @@ -New option of Libhyphen 2.7: NOHYPHEN - -Hyphen, apostrophe and other characters may be word boundary characters, -but they don't need (extra) hyphenation. With NOHYPHEN option -it's possible to hyphenate the words parts correctly. 
- -Example: - -ISO8859-1 -NOHYPHEN -,' -1-1 -1'1 -NEXTLEVEL - -Description: - -1-1 and 1'1 declare hyphen and apostrophe as word boundary characters -and NOHYPHEN with the comma separated character (or character sequence) -list forbid the (extra) hyphens at the hyphen and apostrophe characters. - -Implicite NOHYPHEN declaration - -Without explicite NEXTLEVEL declaration, Hyphen 2.8 uses the -previous settings, plus in UTF-8 encoding, endash (U+2013) and -typographical apostrophe (U+2019) are NOHYPHEN characters, too. - -It's possible to enlarge the hyphenation distance from these -NOHYPHEN characters by using COMPOUNDLEFTHYPHENMIN and -COMPOUNDRIGHTHYPHENMIN attributes. - -Compound word hyphenation - -Hyphen library supports better compound word hyphenation and special -rules of compound word hyphenation of German languages and other -languages with arbitrary number of compound words. The new options, -COMPOUNDLEFTHYPHENMIN and COMPOUNDRIGHTHYPHENMIN help to set the right -style for the hyphenation of compound words. - -Algorithm - -The algorithm is an extension of the original pattern based hyphenation -algorithm. It uses two hyphenation pattern sets, defined in the same -pattern file and separated by the NEXTLEVEL keyword. First pattern -set is for hyphenation only at compound word boundaries, the second one -is for hyphenation within words or word parts. - -Recursive compound level hyphenation - -The algorithm is recursive: every word parts of a successful -first (compound) level hyphenation will be rehyphenated -by the same (first) pattern set. - -Finally, when first level hyphenation is not possible, Hyphen uses -the second level hyphenation for the word or the word parts. - -Word endings and word parts - -Patterns for word endings (patterns with ellipses) match the -word parts, too. - -Options - -COMPOUNDLEFTHYPHENMIN: min. hyph. dist. from the left compound word boundary -COMPOUNDRIGHTHYPHENMIN: min. hyph. dist. from the right comp. 
word boundary -NEXTLEVEL: sign second level hyphenation patterns - -Default hyphenmin values - -Default values of COMPOUNDLEFTHYPHENMIN and COMPOUNDRIGHTHYPHENMIN are 0, -and 0 under the hyphenation, too. ("0" values of -LEFTHYPHENMIN and RIGHTHYPHENMIN mean the default "2" under the hyphenation.) - -Examples - -See tests/compound* test files. - -Preparation of hyphenation patterns - -It hasn't been special pattern generator tool for compound hyphenation -patterns, yet. It is possible to use PATGEN to generate both of -pattern sets, concatenate it manually and set the requested HYPHENMIN values. -(But don't forget the preprocessing steps by substrings.pl before -concatenation.) One of the disadvantage of this method, that PATGEN -doesn't know recursive compound hyphenation of Hyphen. - -László Németh - diff --git a/intl/hyphenation/hyphen/README.hyphen b/intl/hyphenation/hyphen/README.hyphen deleted file mode 100644 index 8aa8c8767922..000000000000 --- a/intl/hyphenation/hyphen/README.hyphen +++ /dev/null @@ -1,108 +0,0 @@ -Brief explanation of the hyphenation algorithm herein.[1] - -Raph Levien -4 Aug 1998 - - The hyphenation algorithm is basically the same as Knuth's TeX -algorithm. However, the implementation is quite a bit faster. - - The hyphenation files from TeX can almost be used directly. There -is a preprocessing step, however. If you don't do the preprocessing -step, you'll get bad hyphenations (i.e. a silent failure). - - Start with a file such as hyphen.us. This is the TeX ushyph1.tex -file, with the exception dictionary encoded using the same rules as -the main portion of the file. Any line beginning with % is a comment. -Each other line should contain exactly one rule. - - Then, do the preprocessing - "perl substrings.pl hyphen.us". The -resulting file is hyphen.mashed. It's in Perl, and it's fairly slow -(it uses brute force algorithms; about 17 seconds on a P100), but it -could probably be redone in C with clever algorithms. 
This would be -valuable, for example, if it was handle user-supplied exception -dictionaries by integrating them into the rule table.[2] - - Once the rules are preprocessed, loading them is quite quick - -about 200ms on a P100. It then hyphenates at about 40,000 words per -second on a P100. I haven't benchmarked it against other -implementations (both TeX and groff contain essentially the same -algorithm), but expect that it runs quite a bit faster than any of -them. - -Knuth's algorithm - - This section contains a brief explanation of Knuth's algorithm, in -case you missed it from the TeX books. We'll use the semi-word -"example" as our running example. - - Since the beginning and end of a word are special, the algorithm is -actually run over the prepared word (prep_word in the source) -".example.". Knuths algorithm basically just does pattern matches from -the rule set, then applies the matches. The patterns in this case that -match are "xa", "xam", "mp", and "pl". These are actually stored as -"x1a", "xam3", "4m1p", and "1p2l2". Whenever numbers appear between -the letters, they are added in. If two (or more) patterns have numbers -in the same place, the highest number wins. Here's the example: - - . e x a m p l e . - x1a - x a m3 - 4m1p - 1p2l2 - ----------------- - . e x1a4m3p2l2e . - - Finally, hyphens are placed wherever odd numbers appear. They are, -however, suppressed after the first letter and before the last letter -of the word (TeX actually suppresses them before the next-to-last, as -well). So, it's "ex-am-ple", which is correct. - - Knuth uses a trie to implement this. I.e. he stores each rule in a -trie structure. For each position in the word, he searches the trie, -searching for a match. Most patterns are short, so efficiency should -be quite good. - -Theory of the algorithm - - The algorithm works as a slightly modified finite state machine. 
-There are two kinds of transitions: those that consume one letter of -input (which work just like your regular finite state machine), and -"fallback" transitions, which don't consume any input. If no -transition matching the next letter is found, the fallback is used. -One way of looking at this is a form of compression of the transition -tables - i.e. it behaves the same as a completely vanilla state -machine in which the actual transition table of a node is made up of -the union of transition tables of the node itself, plus its fallbacks. - - Each state is represented by a string. Thus, if the current state -is "am" and the next letter is "p", then the next state is "amp". -Fallback transitions go to states which chop off one or (sometimes) -more letters from the beginning. For example, if none of the -transitions from "amp" match the next letter, then it will fall back -to "mp". Similarly, if none of the transitions from "mp" match the -next letter, it will fall back to "m". - - Each state is also associated with a (possibly null) "match" -string. This represents the union of all patterns which are -right-justified substrings of the match string. I.e. the pattern "mp" -is a right-justified substring of the state "amp", so it's numbers get -added in. The actual calculation of this union is done by the -Perl preprocessing script, but could probably be done in C just about -as easily. - - Because each state transition either consumes one input character -or shortens the state string by one character, the total number of -state transitions is linear in the length of the word. - -[1] Documentations: - -Franklin M. Liang: Word Hy-phen-a-tion by Com-put-er. -Stanford University, 1983. http://www.tug.org/docs/liang. - -László Németh: Automatic non-standard hyphenation in OpenOffice.org, -TUGboat (27), 2006. No. 
2., http://hunspell.sourceforge.net/tb87nemeth.pdf - -[2] There is the C version of pattern converter "substrings.c" -in the distribution written by Nanning Buitenhuis. Unfortunatelly, -this version hasn't handled the non standard extension of the -algorithm, yet. diff --git a/intl/hyphenation/hyphen/README.nonstandard b/intl/hyphenation/hyphen/README.nonstandard deleted file mode 100644 index fd80d12c689f..000000000000 --- a/intl/hyphenation/hyphen/README.nonstandard +++ /dev/null @@ -1,122 +0,0 @@ -Non-standard hyphenation ------------------------- - -Some languages use non-standard hyphenation; `discretionary' -character changes at hyphenation points. For example, -Catalan: paral·lel -> paral-lel, -Dutch: omaatje -> oma-tje, -German (before the new orthography): Schiffahrt -> Schiff-fahrt, -Hungarian: asszonnyal -> asz-szony-nyal (multiple occurance!) -Swedish: tillata -> till-lata. - -Using this extended library, you can define -non-standard hyphenation patterns. For example: - -l·1l/l=l -a1atje./a=t,1,3 -.schif1fahrt/ff=f,5,2 -.as3szon/sz=sz,2,3 -n1nyal./ny=ny,1,3 -.til1lata./ll=l,3,2 - -or with narrow boundaries: - -l·1l/l=,1,2 -a1atje./a=,1,1 -.schif1fahrt/ff=,5,1 -.as3szon/sz=,2,1 -n1nyal./ny=,1,1 -.til1lata./ll=,3,1 - -Note: Libhnj uses modified patterns by preparing substrings.pl. -Unfortunatelly, now the conversion step can generate bad non-standard -patterns (non-standard -> standard pattern conversion), so using -narrow boundaries may be better for recent Libhnj. For example, -substrings.pl generates a few bad patterns for Hungarian hyphenation -patterns resulting bad non-standard hyphenation in a few cases. Using narrow -boundaries solves this problem. Java HyFo module can check this problem. 
- -Syntax of the non-standard hyphenation patterns ------------------------------------------------- - -pat1tern/change[,start,cut] - -If this pattern matches the word, and this pattern win (see README.hyphen) -in the change region of the pattern, then pattern[start, start + cut - 1] -substring will be replaced with the "change". - -For example, a German ff -> ff-f hyphenation: - -f1f/ff=f - -or with expansion - -f1f/ff=f,1,2 - -will change every "ff" with "ff=f" at hyphenation. - -A more real example: - -% simple ff -> f-f hyphenation -f1f -% Schiffahrt -> Schiff-fahrt hyphenation -% -schif3fahrt/ff=f,5,2 - -Specification - -- Pattern: matching patterns of the original Liang's algorithm - - patterns must contain only one hyphenation point at change region - signed with an one-digit odd number (1, 3, 5, 7 or 9). - These point may be at subregion boundaries: schif3fahrt/ff=,5,1 - - only the greater value guarantees the win (don't mix non-standard and - non-standard patterns with the same value, for example - instead of f3f and schif3fahrt/ff=f,5,2 use f3f and schif5fahrt/ff=f,5,2) - -- Change: new characters. - Arbitrary character sequence. Equal sign (=) signs hyphenation points - for OpenOffice.org (like in the example). (In a possible German LaTeX - preprocessor, ff could be replaced with "ff, for a Hungarian one, ssz - with `ssz, according to the German and Hungarian Babel settings.) - -- Start: starting position of the change region. - - begins with 1 (not 0): schif3fahrt/ff=f,5,2 - - start dot doesn't matter: .schif3fahrt/ff=f,5,2 - - numbers don't matter: .s2c2h2i2f3f2ahrt/ff=f,5,2 - - In UTF-8 encoding, use Unicode character positions: össze/sz=sz,2,3 - ("össze" looks "össze" in an ISO 8859-1 8-bit editor). - -- Cut: length of the removed character sequence in the original word. - - In UTF-8 encoding, use Unicode character length: paral·1lel/l=l,5,3 - ("paral·lel" looks "paral·1lel" in an ISO 8859-1 8-bit editor). 
- -Dictionary developing ---------------------- - -There hasn't been extended PatGen pattern generator for non-standard -hyphenation patterns, yet. - -Fortunatelly, non-standard hyphenation points are forbidden in the PatGen -generated hyphenation patterns, so with a little patch can be develop -non-standard hyphenation patterns also in this case. - -Warning: If you use UTF-8 Unicode encoding in your patterns, call -substrings.pl with UTF-8 parameter to calculate right -character positions for non-standard hyphenation: - -./substrings.pl input output UTF-8 - -Programming ------------ - -Use hyphenate2() or hyphenate3() to handle non-standard hyphenation. -See hyphen.h for the documentation of the hyphenate*() functions. -See example.c for processing the output of the hyphenate*() functions. - -Warning: change characters are lower cased in the source, so you may need -case conversion of the change characters based on input word case detection. -For example, see OpenOffice.org source -(lingucomponent/source/hyphenator/altlinuxhyph/hyphen/hyphenimp.cxx). - -László Németh - diff --git a/intl/hyphenation/hyphen/hyphen.c b/intl/hyphenation/hyphen/hyphen.c deleted file mode 100644 index bd7e9a790cbc..000000000000 --- a/intl/hyphenation/hyphen/hyphen.c +++ /dev/null @@ -1,1201 +0,0 @@ -/* Libhnj is dual licensed under LGPL and MPL. Boilerplate for both - * licenses follows. - */ - -/* LibHnj - a library for high quality hyphenation and justification - * Copyright (C) 1998 Raph Levien, - * (C) 2001 ALTLinux, Moscow (http://www.alt-linux.org), - * (C) 2001 Peter Novodvorsky (nidd@cs.msu.su) - * (C) 2006, 2007, 2008, 2010 László Németh (nemeth at OOo) - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Library General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. 
- * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Library General Public License for more details. - * - * You should have received a copy of the GNU Library General Public - * License along with this library; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 02111-1307 USA. -*/ - -/* - * The contents of this file are subject to the Mozilla Public License - * Version 1.0 (the "MPL"); you may not use this file except in - * compliance with the MPL. You may obtain a copy of the MPL at - * http://www.mozilla.org/MPL/ - * - * Software distributed under the MPL is distributed on an "AS IS" basis, - * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the MPL - * for the specific language governing rights and limitations under the - * MPL. - * - */ -#include /* for NULL, malloc */ -#include /* for fprintf */ -#include /* for strdup */ -#include /* for INT_MAX */ - -#ifdef UNX -#include /* for exit */ -#endif - -#define noVERBOSE - -/* calculate hyphenmin values with long ligature length (2 or 3 characters - * instead of 1 or 2) for comparison with hyphenation without ligatures */ -#define noLONG_LIGATURE - -#ifdef LONG_LIGATURE -#define LIG_xx 1 -#define LIG_xxx 2 -#else -#define LIG_xx 0 -#define LIG_xxx 1 -#endif - -#include "hnjalloc.h" -#include "hyphen.h" - -static char * -hnj_strdup (const char *s) -{ - char *newstr; - int l; - - l = strlen (s); - newstr = (char *) hnj_malloc (l + 1); - memcpy (newstr, s, l); - newstr[l] = 0; - return newstr; -} - -/* remove cross-platform text line end characters */ -void hnj_strchomp(char * s) -{ - int k = strlen(s); - if ((k > 0) && ((*(s+k-1)=='\r') || (*(s+k-1)=='\n'))) *(s+k-1) = '\0'; - if ((k > 1) && (*(s+k-2) == '\r')) *(s+k-2) = '\0'; -} - -/* a little bit of a hash table implementation. 
This simply maps strings - to state numbers */ - -typedef struct _HashTab HashTab; -typedef struct _HashEntry HashEntry; - -/* A cheap, but effective, hack. */ -#define HASH_SIZE 31627 - -struct _HashTab { - HashEntry *entries[HASH_SIZE]; -}; - -struct _HashEntry { - HashEntry *next; - char *key; - int val; -}; - -/* a char* hash function from ASU - adapted from Gtk+ */ -static unsigned int -hnj_string_hash (const char *s) -{ - const char *p; - unsigned int h=0, g; - for(p = s; *p != '\0'; p += 1) { - h = ( h << 4 ) + *p; - if ( ( g = h & 0xf0000000 ) ) { - h = h ^ (g >> 24); - h = h ^ g; - } - } - return h /* % M */; -} - -static HashTab * -hnj_hash_new (void) -{ - HashTab *hashtab; - int i; - - hashtab = (HashTab *) hnj_malloc (sizeof(HashTab)); - for (i = 0; i < HASH_SIZE; i++) - hashtab->entries[i] = NULL; - - return hashtab; -} - -static void -hnj_hash_free (HashTab *hashtab) -{ - int i; - HashEntry *e, *next; - - for (i = 0; i < HASH_SIZE; i++) - for (e = hashtab->entries[i]; e; e = next) - { - next = e->next; - hnj_free (e->key); - hnj_free (e); - } - - hnj_free (hashtab); -} - -/* assumes that key is not already present! */ -static void -hnj_hash_insert (HashTab *hashtab, const char *key, int val) -{ - int i; - HashEntry *e; - - i = hnj_string_hash (key) % HASH_SIZE; - e = (HashEntry *) hnj_malloc (sizeof(HashEntry)); - e->next = hashtab->entries[i]; - e->key = hnj_strdup (key); - e->val = val; - hashtab->entries[i] = e; -} - -/* return val if found, otherwise -1 */ -static int -hnj_hash_lookup (HashTab *hashtab, const char *key) -{ - int i; - HashEntry *e; - i = hnj_string_hash (key) % HASH_SIZE; - for (e = hashtab->entries[i]; e; e = e->next) - if (!strcmp (key, e->key)) - return e->val; - return -1; -} - -/* Get the state number, allocating a new state if necessary. 
*/ -static int -hnj_get_state (HyphenDict *dict, HashTab *hashtab, const char *string) -{ - int state_num; - - state_num = hnj_hash_lookup (hashtab, string); - - if (state_num >= 0) - return state_num; - - hnj_hash_insert (hashtab, string, dict->num_states); - /* predicate is true if dict->num_states is a power of two */ - if (!(dict->num_states & (dict->num_states - 1))) - { - dict->states = (HyphenState *) hnj_realloc (dict->states, - (dict->num_states << 1) * - sizeof(HyphenState)); - } - dict->states[dict->num_states].match = NULL; - dict->states[dict->num_states].repl = NULL; - dict->states[dict->num_states].fallback_state = -1; - dict->states[dict->num_states].num_trans = 0; - dict->states[dict->num_states].trans = NULL; - return dict->num_states++; -} - -/* add a transition from state1 to state2 through ch - assumes that the - transition does not already exist */ -static void -hnj_add_trans (HyphenDict *dict, int state1, int state2, char ch) -{ - int num_trans; - - num_trans = dict->states[state1].num_trans; - if (num_trans == 0) - { - dict->states[state1].trans = (HyphenTrans *) hnj_malloc (sizeof(HyphenTrans)); - } - else if (!(num_trans & (num_trans - 1))) - { - dict->states[state1].trans = (HyphenTrans *) hnj_realloc (dict->states[state1].trans, - (num_trans << 1) * - sizeof(HyphenTrans)); - } - dict->states[state1].trans[num_trans].ch = ch; - dict->states[state1].trans[num_trans].new_state = state2; - dict->states[state1].num_trans++; -} - -#ifdef VERBOSE -HashTab *global[1]; - -static char * -get_state_str (int state, int level) -{ - int i; - HashEntry *e; - - for (i = 0; i < HASH_SIZE; i++) - for (e = global[level]->entries[i]; e; e = e->next) - if (e->val == state) - return e->key; - return NULL; -} -#endif - -void hnj_hyphen_load_line(char * buf, HyphenDict * dict, HashTab * hashtab) { - int i, j; - char word[MAX_CHARS]; - char pattern[MAX_CHARS]; - char * repl; - signed char replindex; - signed char replcut; - int state_num = 0; - int last_state; - 
char ch; - int found; - - if (strncmp(buf, "LEFTHYPHENMIN", 13) == 0) { - dict->lhmin = atoi(buf + 13); - return; - } else if (strncmp(buf, "RIGHTHYPHENMIN", 14) == 0) { - dict->rhmin = atoi(buf + 14); - return; - } else if (strncmp(buf, "COMPOUNDLEFTHYPHENMIN", 21) == 0) { - dict->clhmin = atoi(buf + 21); - return; - } else if (strncmp(buf, "COMPOUNDRIGHTHYPHENMIN", 22) == 0) { - dict->crhmin = atoi(buf + 22); - return; - } else if (strncmp(buf, "NOHYPHEN", 8) == 0) { - char * space = buf + 8; - while (*space != '\0' && (*space == ' ' || *space == '\t')) space++; - if (*buf != '\0') dict->nohyphen = hnj_strdup(space); - if (dict->nohyphen) { - char * nhe = dict->nohyphen + strlen(dict->nohyphen) - 1; - *nhe = 0; - for (nhe = nhe - 1; nhe > dict->nohyphen; nhe--) { - if (*nhe == ',') { - dict->nohyphenl++; - *nhe = 0; - } - } - } - return; - } - j = 0; - pattern[j] = '0'; - repl = strchr(buf, '/'); - replindex = 0; - replcut = 0; - if (repl) { - char * index = strchr(repl + 1, ','); - *repl = '\0'; - if (index) { - char * index2 = strchr(index + 1, ','); - *index = '\0'; - if (index2) { - *index2 = '\0'; - replindex = (signed char) atoi(index + 1) - 1; - replcut = (signed char) atoi(index2 + 1); - } - } else { - hnj_strchomp(repl + 1); - replindex = 0; - replcut = (signed char) strlen(buf); - } - repl = hnj_strdup(repl + 1); - } - for (i = 0; (unsigned char)buf[i] > (unsigned char)' '; i++) - { - if (buf[i] >= '0' && buf[i] <= '9') - pattern[j] = buf[i]; - else - { - word[j] = buf[i]; - pattern[++j] = '0'; - } - } - word[j] = '\0'; - pattern[j + 1] = '\0'; - - i = 0; - if (!repl) { - /* Optimize away leading zeroes */ - for (; pattern[i] == '0'; i++); - } else { - if (*word == '.') i++; - /* convert UTF-8 char. positions of discretionary hyph. replacements to 8-bit */ - if (dict->utf8) { - int pu = -1; /* unicode character position */ - int ps = -1; /* unicode start position (original replindex) */ - size_t pc = (*word == '.') ? 
1: 0; /* 8-bit character position */ - for (; pc < (strlen(word) + 1); pc++) { - /* beginning of an UTF-8 character (not '10' start bits) */ - if ((((unsigned char) word[pc]) >> 6) != 2) pu++; - if ((ps < 0) && (replindex == pu)) { - ps = replindex; - replindex = (signed char) pc; - } - if ((ps >= 0) && ((pu - ps) == replcut)) { - replcut = (signed char) (pc - replindex); - break; - } - } - if (*word == '.') replindex--; - } - } - -#ifdef VERBOSE - printf ("word %s pattern %s, j = %d repl: %s\n", word, pattern + i, j, repl); -#endif - found = hnj_hash_lookup (hashtab, word); - state_num = hnj_get_state (dict, hashtab, word); - dict->states[state_num].match = hnj_strdup (pattern + i); - dict->states[state_num].repl = repl; - dict->states[state_num].replindex = replindex; - if (!replcut) { - dict->states[state_num].replcut = (signed char) strlen(word); - } else { - dict->states[state_num].replcut = replcut; - } - - /* now, put in the prefix transitions */ - for (; found < 0 && j > 0; --j) - { - last_state = state_num; - ch = word[j - 1]; - word[j - 1] = '\0'; - found = hnj_hash_lookup (hashtab, word); - state_num = hnj_get_state (dict, hashtab, word); - hnj_add_trans (dict, state_num, last_state, ch); - } -} - -HyphenDict * -hnj_hyphen_load (const char *fn) -{ - HyphenDict *result; - FILE *f; - f = fopen (fn, "r"); - if (f == NULL) - return NULL; - - result = hnj_hyphen_load_file(f); - - fclose(f); - return result; -} - -HyphenDict * -hnj_hyphen_load_file (FILE *f) -{ - HyphenDict *dict[2]; - HashTab *hashtab; - char buf[MAX_CHARS]; - int nextlevel = 0; - int i, j, k; - HashEntry *e; - int state_num = 0; -/* loading one or two dictionaries (separated by NEXTLEVEL keyword) */ -for (k = 0; k < 2; k++) { - hashtab = hnj_hash_new (); -#ifdef VERBOSE - global[k] = hashtab; -#endif - hnj_hash_insert (hashtab, "", 0); - dict[k] = (HyphenDict *) hnj_malloc (sizeof(HyphenDict)); - dict[k]->num_states = 1; - dict[k]->states = (HyphenState *) hnj_malloc (sizeof(HyphenState)); - 
dict[k]->states[0].match = NULL; - dict[k]->states[0].repl = NULL; - dict[k]->states[0].fallback_state = -1; - dict[k]->states[0].num_trans = 0; - dict[k]->states[0].trans = NULL; - dict[k]->nextlevel = NULL; - dict[k]->lhmin = 0; - dict[k]->rhmin = 0; - dict[k]->clhmin = 0; - dict[k]->crhmin = 0; - dict[k]->nohyphen = NULL; - dict[k]->nohyphenl = 0; - - /* read in character set info */ - if (k == 0) { - for (i=0;icset[i]= 0; - if (fgets(dict[k]->cset, sizeof(dict[k]->cset),f) != NULL) { - for (i=0;icset[i] == '\r') || (dict[k]->cset[i] == '\n')) - dict[k]->cset[i] = 0; - } else { - dict[k]->cset[0] = 0; - } - dict[k]->utf8 = (strcmp(dict[k]->cset, "UTF-8") == 0); - } else { - strncpy(dict[k]->cset, dict[0]->cset, sizeof(dict[k]->cset)-1); - dict[k]->cset[sizeof(dict[k]->cset)-1] = '\0'; - dict[k]->utf8 = dict[0]->utf8; - } - - if (k == 0 || nextlevel) { - while (fgets(buf, sizeof(buf), f) != NULL) { - - /* discard lines that don't fit in buffer */ - if (!feof(f) && strchr(buf, '\n') == NULL) { - int c; - while ((c = fgetc(f)) != '\n' && c != EOF); - /* issue warning if not a comment */ - if (buf[0] != '%') { - fprintf(stderr, "Warning: skipping too long pattern (more than %zu chars)\n", sizeof(buf)); - } - continue; - } - - if (strncmp(buf, "NEXTLEVEL", 9) == 0) { - nextlevel = 1; - break; - } else if (buf[0] != '%') { - hnj_hyphen_load_line(buf, dict[k], hashtab); - } - } - } else if (k == 1) { - /* default first level: hyphen and ASCII apostrophe */ - if (!dict[0]->utf8) hnj_hyphen_load_line("NOHYPHEN ',-\n", dict[k], hashtab); - else hnj_hyphen_load_line("NOHYPHEN ',\xe2\x80\x93,\xe2\x80\x99,-\n", dict[k], hashtab); - strncpy(buf, "1-1\n", MAX_CHARS-1); /* buf rewritten by hnj_hyphen_load here */ - buf[MAX_CHARS-1] = '\0'; - hnj_hyphen_load_line(buf, dict[k], hashtab); /* remove hyphen */ - hnj_hyphen_load_line("1'1\n", dict[k], hashtab); /* ASCII apostrophe */ - if (dict[0]->utf8) { - hnj_hyphen_load_line("1\xe2\x80\x93" "1\n", dict[k], hashtab); /* endash */ 
- hnj_hyphen_load_line("1\xe2\x80\x99" "1\n", dict[k], hashtab); /* apostrophe */ - } - } - - /* Could do unioning of matches here (instead of the preprocessor script). - If we did, the pseudocode would look something like this: - - foreach state in the hash table - foreach i = [1..length(state) - 1] - state to check is substr (state, i) - look it up - if found, and if there is a match, union the match in. - - It's also possible to avoid the quadratic blowup by doing the - search in order of increasing state string sizes - then you - can break the loop after finding the first match. - - This step should be optional in any case - if there is a - preprocessed rule table, it's always faster to use that. - -*/ - - /* put in the fallback states */ - for (i = 0; i < HASH_SIZE; i++) - for (e = hashtab->entries[i]; e; e = e->next) - { - if (*(e->key)) for (j = 1; 1; j++) - { - state_num = hnj_hash_lookup (hashtab, e->key + j); - if (state_num >= 0) - break; - } - /* KBH: FIXME state 0 fallback_state should always be -1? */ - if (e->val) - dict[k]->states[e->val].fallback_state = state_num; - } -#ifdef VERBOSE - for (i = 0; i < HASH_SIZE; i++) - for (e = hashtab->entries[i]; e; e = e->next) - { - printf ("%d string %s state %d, fallback=%d\n", i, e->key, e->val, - dict[k]->states[e->val].fallback_state); - for (j = 0; j < dict[k]->states[e->val].num_trans; j++) - printf (" %c->%d\n", dict[k]->states[e->val].trans[j].ch, - dict[k]->states[e->val].trans[j].new_state); - } -#endif - -#ifndef VERBOSE - hnj_hash_free (hashtab); -#endif - state_num = 0; -} - if (nextlevel) dict[0]->nextlevel = dict[1]; - else { - dict[1] -> nextlevel = dict[0]; - dict[1]->lhmin = dict[0]->lhmin; - dict[1]->rhmin = dict[0]->rhmin; - dict[1]->clhmin = (dict[0]->clhmin) ? dict[0]->clhmin : ((dict[0]->lhmin) ? dict[0]->lhmin : 3); - dict[1]->crhmin = (dict[0]->crhmin) ? dict[0]->crhmin : ((dict[0]->rhmin) ? 
dict[0]->rhmin : 3); -#ifdef VERBOSE - HashTab *r = global[0]; - global[0] = global[1]; - global[1] = r; -#endif - return dict[1]; - } - return dict[0]; -} - -void hnj_hyphen_free (HyphenDict *dict) -{ - int state_num; - HyphenState *hstate; - - for (state_num = 0; state_num < dict->num_states; state_num++) - { - hstate = &dict->states[state_num]; - if (hstate->match) - hnj_free (hstate->match); - if (hstate->repl) - hnj_free (hstate->repl); - if (hstate->trans) - hnj_free (hstate->trans); - } - if (dict->nextlevel) hnj_hyphen_free(dict->nextlevel); - - if (dict->nohyphen) hnj_free(dict->nohyphen); - - hnj_free (dict->states); - - hnj_free (dict); -} - -#define MAX_WORD 256 - -int hnj_hyphen_hyphenate (HyphenDict *dict, - const char *word, int word_size, - char *hyphens) -{ - char *prep_word; - int i, j, k; - int state; - char ch; - HyphenState *hstate; - char *match; - int offset; - - prep_word = (char*) hnj_malloc (word_size + 3); - - j = 0; - prep_word[j++] = '.'; - - for (i = 0; i < word_size; i++) { - if (word[i] <= '9' && word[i] >= '0') { - prep_word[j++] = '.'; - } else { - prep_word[j++] = word[i]; - } - } - - prep_word[j++] = '.'; - prep_word[j] = '\0'; - - for (i = 0; i < word_size + 5; i++) - hyphens[i] = '0'; - -#ifdef VERBOSE - printf ("prep_word = %s\n", prep_word); -#endif - - /* now, run the finite state machine */ - state = 0; - for (i = 0; i < j; i++) - { - ch = prep_word[i]; - for (;;) - { - - if (state == -1) { - /* return 1; */ - /* KBH: FIXME shouldn't this be as follows? 
*/ - state = 0; - goto try_next_letter; - } - -#ifdef VERBOSE - char *state_str; - state_str = get_state_str (state, 0); - - for (k = 0; k < i - strlen (state_str); k++) - putchar (' '); - printf ("%s", state_str); -#endif - - hstate = &dict->states[state]; - for (k = 0; k < hstate->num_trans; k++) - if (hstate->trans[k].ch == ch) - { - state = hstate->trans[k].new_state; - goto found_state; - } - state = hstate->fallback_state; -#ifdef VERBOSE - printf (" falling back, fallback_state %d\n", state); -#endif - } - found_state: -#ifdef VERBOSE - printf ("found state %d\n",state); -#endif - /* Additional optimization is possible here - especially, - elimination of trailing zeroes from the match. Leading zeroes - have already been optimized. */ - match = dict->states[state].match; - /* replacing rules not handled by hyphen_hyphenate() */ - if (match && !dict->states[state].repl) - { - offset = i + 1 - strlen (match); -#ifdef VERBOSE - for (k = 0; k < offset; k++) - putchar (' '); - printf ("%s\n", match); -#endif - /* This is a linear search because I tried a binary search and - found it to be just a teeny bit slower. 
*/ - for (k = 0; match[k]; k++) - if (hyphens[offset + k] < match[k]) - hyphens[offset + k] = match[k]; - } - - /* KBH: we need this to make sure we keep looking in a word */ - /* for patterns even if the current character is not known in state 0 */ - /* since patterns for hyphenation may occur anywhere in the word */ - try_next_letter: ; - - } -#ifdef VERBOSE - for (i = 0; i < j; i++) - putchar (hyphens[i]); - putchar ('\n'); -#endif - - for (i = 0; i < j - 4; i++) -#if 0 - if (hyphens[i + 1] & 1) - hyphens[i] = '-'; -#else - hyphens[i] = hyphens[i + 1]; -#endif - hyphens[0] = '0'; - for (; i < word_size; i++) - hyphens[i] = '0'; - hyphens[word_size] = '\0'; - - hnj_free (prep_word); - - return 0; -} - -/* Unicode ligature length */ -int hnj_ligature(unsigned char c) { - switch (c) { - case 0x80: /* ff */ - case 0x81: /* fi */ - case 0x82: return LIG_xx; /* fl */ - case 0x83: /* ffi */ - case 0x84: return LIG_xxx; /* ffl */ - case 0x85: /* long st */ - case 0x86: return LIG_xx; /* st */ - } - return 0; -} - -/* character length of the first n byte of the input word */ -int hnj_hyphen_strnlen(const char * word, int n, int utf8) -{ - int i = 0; - int j = 0; - while (j < n && word[j] != '\0') { - i++; - /* Unicode ligature support */ - if (utf8 && ((unsigned char) word[j] == 0xEF) && ((unsigned char) word[j + 1] == 0xAC)) { - i += hnj_ligature(word[j + 2]); - } - for (j++; utf8 && (word[j] & 0xc0) == 0x80; j++); - } - return i; -} - -int hnj_hyphen_lhmin(int utf8, const char *word, int word_size, char * hyphens, - char *** rep, int ** pos, int ** cut, int lhmin) -{ - int i = 1, j; - - /* Unicode ligature support */ - if (utf8 && ((unsigned char) word[0] == 0xEF) && ((unsigned char) word[1] == 0xAC)) { - i += hnj_ligature(word[2]); - } - - /* ignore numbers */ - for (j = 0; word[j] <= '9' && word[j] >= '0'; j++) i--; - - for (j = 0; i < lhmin && word[j] != '\0'; i++) do { - /* check length of the non-standard part */ - if (*rep && *pos && *cut && (*rep)[j]) { - char * 
rh = strchr((*rep)[j], '='); - if (rh && (hnj_hyphen_strnlen(word, j - (*pos)[j] + 1, utf8) + - hnj_hyphen_strnlen((*rep)[j], rh - (*rep)[j], utf8)) < lhmin) { - free((*rep)[j]); - (*rep)[j] = NULL; - hyphens[j] = '0'; - } - } else { - hyphens[j] = '0'; - } - j++; - - /* Unicode ligature support */ - if (utf8 && ((unsigned char) word[j] == 0xEF) && ((unsigned char) word[j + 1] == 0xAC)) { - i += hnj_ligature(word[j + 2]); - } - } while (utf8 && (word[j] & 0xc0) == 0x80); - return 0; -} - -int hnj_hyphen_rhmin(int utf8, const char *word, int word_size, char * hyphens, - char *** rep, int ** pos, int ** cut, int rhmin) -{ - int i = 0; - int j; - - /* ignore numbers */ - for (j = word_size - 1; j > 0 && word[j] <= '9' && word[j] >= '0'; j--) i--; - - for (j = word_size - 1; i < rhmin && j > 0; j--) { - /* check length of the non-standard part */ - if (*rep && *pos && *cut && (*rep)[j]) { - char * rh = strchr((*rep)[j], '='); - if (rh && (hnj_hyphen_strnlen(word + j - (*pos)[j] + (*cut)[j] + 1, 100, utf8) + - hnj_hyphen_strnlen(rh + 1, strlen(rh + 1), utf8)) < rhmin) { - free((*rep)[j]); - (*rep)[j] = NULL; - hyphens[j] = '0'; - } - } else { - hyphens[j] = '0'; - } - if (!utf8 || (word[j] & 0xc0) == 0xc0 || (word[j] & 0x80) != 0x80) i++; - } - return 0; -} - -/* recursive function for compound level hyphenation */ -int hnj_hyphen_hyph_(HyphenDict *dict, const char *word, int word_size, - char * hyphens, char *** rep, int ** pos, int ** cut, - int clhmin, int crhmin, int lend, int rend) -{ - char *prep_word; - int i, j, k; - int state; - char ch; - HyphenState *hstate; - char *match; - char *repl; - signed char replindex; - signed char replcut; - int offset; - int * matchlen; - int * matchindex; - char ** matchrepl; - int isrepl = 0; - int nHyphCount; - - size_t prep_word_size = word_size + 3; - prep_word = (char*) hnj_malloc (prep_word_size); - matchlen = (int*) hnj_malloc ((word_size + 3) * sizeof(int)); - matchindex = (int*) hnj_malloc ((word_size + 3) * 
sizeof(int)); - matchrepl = (char**) hnj_malloc ((word_size + 3) * sizeof(char *)); - - j = 0; - prep_word[j++] = '.'; - - for (i = 0; i < word_size; i++) { - if (word[i] <= '9' && word[i] >= '0') { - prep_word[j++] = '.'; - } else { - prep_word[j++] = word[i]; - } - } - - - - prep_word[j++] = '.'; - prep_word[j] = '\0'; - - for (i = 0; i < j; i++) - hyphens[i] = '0'; - -#ifdef VERBOSE - printf ("prep_word = %s\n", prep_word); -#endif - - /* now, run the finite state machine */ - state = 0; - for (i = 0; i < j; i++) - { - ch = prep_word[i]; - for (;;) - { - - if (state == -1) { - /* return 1; */ - /* KBH: FIXME shouldn't this be as follows? */ - state = 0; - goto try_next_letter; - } - -#ifdef VERBOSE - char *state_str; - state_str = get_state_str (state, 1); - - for (k = 0; k < i - strlen (state_str); k++) - putchar (' '); - printf ("%s", state_str); -#endif - - hstate = &dict->states[state]; - for (k = 0; k < hstate->num_trans; k++) - if (hstate->trans[k].ch == ch) - { - state = hstate->trans[k].new_state; - goto found_state; - } - state = hstate->fallback_state; -#ifdef VERBOSE - printf (" falling back, fallback_state %d\n", state); -#endif - } - found_state: -#ifdef VERBOSE - printf ("found state %d\n",state); -#endif - /* Additional optimization is possible here - especially, - elimination of trailing zeroes from the match. Leading zeroes - have already been optimized. 
*/ - match = dict->states[state].match; - repl = dict->states[state].repl; - replindex = dict->states[state].replindex; - replcut = dict->states[state].replcut; - /* replacing rules not handled by hyphen_hyphenate() */ - if (match) - { - offset = i + 1 - strlen (match); -#ifdef VERBOSE - for (k = 0; k < offset; k++) - putchar (' '); - printf ("%s (%s)\n", match, repl); -#endif - if (repl) { - if (!isrepl) for(; isrepl < word_size; isrepl++) { - matchrepl[isrepl] = NULL; - matchindex[isrepl] = -1; - } - matchlen[offset + replindex] = replcut; - } - /* This is a linear search because I tried a binary search and - found it to be just a teeny bit slower. */ - for (k = 0; match[k]; k++) { - if ((hyphens[offset + k] < match[k])) { - hyphens[offset + k] = match[k]; - if (match[k]&1) { - matchrepl[offset + k] = repl; - if (repl && (k >= replindex) && (k <= replindex + replcut)) { - matchindex[offset + replindex] = offset + k; - } - } - } - } - - } - - /* KBH: we need this to make sure we keep looking in a word */ - /* for patterns even if the current character is not known in state 0 */ - /* since patterns for hyphenation may occur anywhere in the word */ - try_next_letter: ; - - } -#ifdef VERBOSE - for (i = 0; i < j; i++) - putchar (hyphens[i]); - putchar ('\n'); -#endif - - for (i = 0; i < j - 3; i++) -#if 0 - if (hyphens[i + 1] & 1) - hyphens[i] = '-'; -#else - hyphens[i] = hyphens[i + 1]; -#endif - for (; i < word_size; i++) - hyphens[i] = '0'; - hyphens[word_size] = '\0'; - - /* now create a new char string showing hyphenation positions */ - /* count the hyphens and allocate space for the new hyphenated string */ - nHyphCount = 0; - for (i = 0; i < word_size; i++) - if (hyphens[i]&1) - nHyphCount++; - j = 0; - for (i = 0; i < word_size; i++) { - if (isrepl && (matchindex[i] >= 0) && matchrepl[matchindex[i]]) { - if (rep && pos && cut) { - if (!*rep) - *rep = (char **) calloc(word_size, sizeof(char *)); - if (!*pos) - *pos = (int *) calloc(word_size, sizeof(int)); - if 
(!*cut) { - *cut = (int *) calloc(word_size, sizeof(int)); - } - (*rep)[matchindex[i] - 1] = hnj_strdup(matchrepl[matchindex[i]]); - (*pos)[matchindex[i] - 1] = matchindex[i] - i; - (*cut)[matchindex[i] - 1] = matchlen[i]; - } - j += strlen(matchrepl[matchindex[i]]); - i += matchlen[i] - 1; - } - } - - hnj_free (matchrepl); - hnj_free (matchlen); - hnj_free (matchindex); - - /* recursive hyphenation of the first (compound) level segments */ - if (dict->nextlevel) { - char ** rep2; - int * pos2; - int * cut2; - char * hyphens2; - int begin = 0; - - rep2 = (char**) hnj_malloc (word_size * sizeof(char *)); - pos2 = (int*) hnj_malloc (word_size * sizeof(int)); - cut2 = (int*) hnj_malloc (word_size * sizeof(int)); - hyphens2 = (char*) hnj_malloc (word_size + 3); - for (i = 0; i < word_size; i++) rep2[i] = NULL; - for (i = 0; i < word_size; i++) if - (hyphens[i]&1 || (begin > 0 && i + 1 == word_size)) { - if (i - begin > 0) { - int hyph = 0; - prep_word[i + 2] = '\0'; - /* non-standard hyphenation at compound boundary (Schiffahrt) */ - if (rep && *rep && *pos && *cut && (*rep)[i]) { - char * l = strchr((*rep)[i], '='); - size_t offset = 2 + i - (*pos)[i]; - strncpy(prep_word + offset, (*rep)[i], prep_word_size - offset - 1); - prep_word[prep_word_size - 1] = '\0'; - if (l) { - hyph = (l - (*rep)[i]) - (*pos)[i]; - prep_word[2 + i + hyph] = '\0'; - } - } - hnj_hyphen_hyph_(dict, prep_word + begin + 1, i - begin + 1 + hyph, - hyphens2, &rep2, &pos2, &cut2, clhmin, - crhmin, (begin > 0 ? 0 : lend), (hyphens[i]&1 ? 
0 : rend)); - for (j = 0; j < i - begin; j++) { - hyphens[begin + j] = hyphens2[j]; - if (rep2[j] && rep && pos && cut) { - if (!*rep && !*pos && !*cut) { - int k; - *rep = (char **) malloc(sizeof(char *) * word_size); - *pos = (int *) malloc(sizeof(int) * word_size); - *cut = (int *) malloc(sizeof(int) * word_size); - for (k = 0; k < word_size; k++) { - (*rep)[k] = NULL; - (*pos)[k] = 0; - (*cut)[k] = 0; - } - } - (*rep)[begin + j] = rep2[j]; - (*pos)[begin + j] = pos2[j]; - (*cut)[begin + j] = cut2[j]; - } - } - prep_word[i + 2] = word[i + 1]; - if (*rep && *pos && *cut && (*rep)[i]) { - size_t offset = 1; - strncpy(prep_word + offset, word, prep_word_size - offset - 1); - prep_word[prep_word_size - 1] = '\0'; - } - } - begin = i + 1; - for (j = 0; j < word_size; j++) rep2[j] = NULL; - } - - /* non-compound */ - if (begin == 0) { - hnj_hyphen_hyph_(dict->nextlevel, word, word_size, - hyphens, rep, pos, cut, clhmin, crhmin, lend, rend); - if (!lend) hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens, - rep, pos, cut, clhmin); - if (!rend) hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens, - rep, pos, cut, crhmin); - } - - free(rep2); - free(cut2); - free(pos2); - free(hyphens2); - } - - hnj_free (prep_word); - return 0; -} - -/* UTF-8 normalization of hyphen and non-standard positions */ -int hnj_hyphen_norm(const char *word, int word_size, char * hyphens, - char *** rep, int ** pos, int ** cut) -{ - int i, j, k; - if ((((unsigned char) word[0]) >> 6) == 2) { - fprintf(stderr, "error - bad, non UTF-8 input: %s\n", word); - return 1; - } - - /* calculate UTF-8 character positions */ - for (i = 0, j = -1; i < word_size; i++) { - /* beginning of an UTF-8 character (not '10' start bits) */ - if ((((unsigned char) word[i]) >> 6) != 2) j++; - hyphens[j] = hyphens[i]; - if (rep && pos && cut && *rep && *pos && *cut) { - int l = (*pos)[i]; - (*pos)[j] = 0; - for (k = 0; k < l; k++) { - if ((((unsigned char) word[i - k]) >> 6) != 2) (*pos)[j]++; - } - k = i - l + 1; 
- l = k + (*cut)[i]; - (*cut)[j] = 0; - for (; k < l; k++) { - if ((((unsigned char) word[k]) >> 6) != 2) (*cut)[j]++; - } - (*rep)[j] = (*rep)[i]; - if (j < i) { - (*rep)[i] = NULL; - (*pos)[i] = 0; - (*cut)[i] = 0; - } - } - } - hyphens[j + 1] = '\0'; -#ifdef VERBOSE - printf ("nums: %s\n", hyphens); -#endif - return 0; -} - -/* get the word with all possible hyphenations (output: hyphword) */ -void hnj_hyphen_hyphword(const char * word, int word_size, const char * hyphens, - char * hyphword, char *** rep, int ** pos, int ** cut) -{ - - if (word_size <= 0 || word_size > INT_MAX / 2) { - hyphword[0] = '\0'; - return; - } - - /* hyphword buffer size must be at least 2 * l */ - int hyphword_size = 2 * word_size - 1; - - int nonstandard = 0; - if (*rep && *pos && *cut) { - nonstandard = 1; - } - - int i; - int j = 0; - for (i = 0; i < word_size && j < hyphword_size; i++) { - hyphword[j++] = word[i]; - if (hyphens[i]&1 && j < hyphword_size) { - if (nonstandard && (*rep)[i] && j >= (*pos)[i]) { - /* non-standard */ - j -= (*pos)[i]; - char *s = (*rep)[i]; - while (*s && j < hyphword_size) { - hyphword[j++] = *s++; - } - i += (*cut)[i] - (*pos)[i]; - } else { - /* standard */ - hyphword[j++] = '='; - } - } - } - hyphword[j] = '\0'; -} - - -/* main api function with default hyphenmin parameters */ -int hnj_hyphen_hyphenate2 (HyphenDict *dict, - const char *word, int word_size, char * hyphens, - char *hyphword, char *** rep, int ** pos, int ** cut) -{ - hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut, - dict->clhmin, dict->crhmin, 1, 1); - hnj_hyphen_lhmin(dict->utf8, word, word_size, - hyphens, rep, pos, cut, (dict->lhmin > 0 ? dict->lhmin : 2)); - hnj_hyphen_rhmin(dict->utf8, word, word_size, - hyphens, rep, pos, cut, (dict->rhmin > 0 ? 
dict->rhmin : 2)); - - /* nohyphen */ - if (dict->nohyphen) { - char * nh = dict->nohyphen; - int nhi; - for (nhi = 0; nhi <= dict->nohyphenl; nhi++) { - char * nhy = (char *) strstr(word, nh); - while (nhy) { - hyphens[nhy - word + strlen(nh) - 1] = '0'; - if (nhy - word - 1 >= 0) hyphens[nhy - word - 1] = '0'; - nhy = (char *) strstr(nhy + 1, nh); - } - nh = nh + strlen(nh) + 1; - } - } - - if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut); - if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut); -#ifdef VERBOSE - printf ("nums: %s\n", hyphens); -#endif - return 0; -} - -/* previous main api function with hyphenmin parameters */ -int hnj_hyphen_hyphenate3 (HyphenDict *dict, - const char *word, int word_size, char * hyphens, - char *hyphword, char *** rep, int ** pos, int ** cut, - int lhmin, int rhmin, int clhmin, int crhmin) -{ - lhmin = (lhmin > dict->lhmin) ? lhmin : dict->lhmin; - rhmin = (rhmin > dict->rhmin) ? rhmin : dict->rhmin; - clhmin = (clhmin > dict->clhmin) ? clhmin : dict->clhmin; - crhmin = (crhmin > dict->crhmin) ? crhmin : dict->crhmin; - hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut, - clhmin, crhmin, 1, 1); - hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens, - rep, pos, cut, (lhmin > 0 ? lhmin : 2)); - hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens, - rep, pos, cut, (rhmin > 0 ? 
rhmin : 2)); - if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut); - - /* nohyphen */ - if (dict->nohyphen) { - char * nh = dict->nohyphen; - int nhi; - for (nhi = 0; nhi <= dict->nohyphenl; nhi++) { - char * nhy = (char *) strstr(word, nh); - while (nhy) { - hyphens[nhy - word + strlen(nh) - 1] = 0; - if (nhy - word - 1 >= 0) hyphens[nhy - word - 1] = 0; - nhy = (char *) strstr(nhy + 1, nh); - } - nh = nh + strlen(nh) + 1; - } - } - - if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut); - return 0; -} diff --git a/intl/hyphenation/hyphen/hyphen.h b/intl/hyphenation/hyphen/hyphen.h deleted file mode 100644 index 2b4e146421ff..000000000000 --- a/intl/hyphenation/hyphen/hyphen.h +++ /dev/null @@ -1,175 +0,0 @@ -/* Hyphen - hyphenation library using converted TeX hyphenation patterns - * - * (C) 1998 Raph Levien - * (C) 2001 ALTLinux, Moscow - * (C) 2006, 2007, 2008 László Németh - * - * This was part of libHnj library by Raph Levien. - * - * Peter Novodvorsky from ALTLinux cut hyphenation part from libHnj - * to use it in OpenOffice.org. - * - * Non-standard and compound word hyphenation support by László Németh. - * - * License is the original LibHnj license: - * - * LibHnj is dual licensed under LGPL and MPL. Boilerplate for both - * licenses follows. - */ - -/* LibHnj - a library for high quality hyphenation and justification - * Copyright (C) 1998 Raph Levien - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Library General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Library General Public License for more details. 
- * - * You should have received a copy of the GNU Library General Public - * License along with this library; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 02111-1307 USA. -*/ - -/* - * The contents of this file are subject to the Mozilla Public License - * Version 1.0 (the "MPL"); you may not use this file except in - * compliance with the MPL. You may obtain a copy of the MPL at - * http://www.mozilla.org/MPL/ - * - * Software distributed under the MPL is distributed on an "AS IS" basis, - * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the MPL - * for the specific language governing rights and limitations under the - * MPL. - * - */ -#ifndef __HYPHEN_H__ -#define __HYPHEN_H__ - -#ifdef __cplusplus -extern "C" { -#endif /* __cplusplus */ - -#include <stdio.h> - -typedef struct _HyphenDict HyphenDict; -typedef struct _HyphenState HyphenState; -typedef struct _HyphenTrans HyphenTrans; -#define MAX_CHARS 100 -#define MAX_NAME 20 - -struct _HyphenDict { - /* user options */ - char lhmin; /* lefthyphenmin: min. hyph. distance from the left side */ - char rhmin; /* righthyphenmin: min. hyph. distance from the right side */ - char clhmin; /* min. hyph. distance from the left compound boundary */ - char crhmin; /* min. hyph. 
distance from the right compound boundary */ - char * nohyphen; /* comma separated list of characters or character - sequences with forbidden hyphenation */ - int nohyphenl; /* count of elements in nohyphen */ - /* system variables */ - int num_states; - char cset[MAX_NAME]; - int utf8; - HyphenState *states; - HyphenDict *nextlevel; -}; - -struct _HyphenState { - char *match; - char *repl; - signed char replindex; - signed char replcut; - int fallback_state; - int num_trans; - HyphenTrans *trans; -}; - -struct _HyphenTrans { - char ch; - int new_state; -}; - -HyphenDict *hnj_hyphen_load (const char *fn); -HyphenDict *hnj_hyphen_load_file (FILE *f); -void hnj_hyphen_free (HyphenDict *dict); - -/* obsolete, use hnj_hyphen_hyphenate2() or *hyphenate3() functions) */ -int hnj_hyphen_hyphenate (HyphenDict *dict, - const char *word, int word_size, - char *hyphens); - -/* - - int hnj_hyphen_hyphenate2(): non-standard hyphenation. - - (It supports Catalan, Dutch, German, Hungarian, Norwegian, Swedish - etc. orthography, see documentation.) - - input data: - word: input word - word_size: byte length of the input word - - hyphens: allocated character buffer (size = word_size + 5) - hyphenated_word: allocated character buffer (size ~ word_size * 2) or NULL - rep, pos, cut: pointers (point to the allocated and _zeroed_ buffers - (size=word_size) or with NULL value) or NULL - - output data: - hyphens: hyphenation vector (hyphenation points signed with odd numbers) - hyphenated_word: hyphenated input word (hyphens signed with `='), - optional (NULL input) - rep: NULL (only standard hyph.), or replacements (hyphenation points - signed with `=' in replacements); - pos: NULL, or difference of the actual position and the beginning - positions of the change in input words; - cut: NULL, or counts of the removed characters of the original words - at hyphenation, - - Note: rep, pos, cut are complementary arrays to the hyphens, indexed with the - character positions of the input word. 
- - For example: - Schiffahrt -> Schiff=fahrt, - pattern: f1f/ff=f,1,2 - output: rep[5]="ff=f", pos[5] = 1, cut[5] = 2 - - Note: hnj_hyphen_hyphenate2() can allocate rep, pos, cut (word_size - length arrays): - - char ** rep = NULL; - int * pos = NULL; - int * cut = NULL; - char hyphens[MAXWORDLEN]; - hnj_hyphen_hyphenate2(dict, "example", 7, hyphens, NULL, &rep, &pos, &cut); - - See example in the source distribution. - -*/ - -int hnj_hyphen_hyphenate2 (HyphenDict *dict, - const char *word, int word_size, char * hyphens, - char *hyphenated_word, char *** rep, int ** pos, int ** cut); - -/* like hnj_hyphen_hyphenate2, but with hyphenmin parameters */ -/* lhmin: lefthyphenmin - * rhmin: righthyphenmin - * clhmin: compoundlefthyphemin - * crhmin: compoundrighthyphenmin - * (see documentation) */ - -int hnj_hyphen_hyphenate3 (HyphenDict *dict, - const char *word, int word_size, char * hyphens, - char *hyphword, char *** rep, int ** pos, int ** cut, - int lhmin, int rhmin, int clhmin, int crhmin); - -#ifdef __cplusplus -} -#endif /* __cplusplus */ - -#endif /* __HYPHEN_H__ */ diff --git a/intl/hyphenation/hyphen/moz.build b/intl/hyphenation/hyphen/moz.build deleted file mode 100644 index a93ab6835e2b..000000000000 --- a/intl/hyphenation/hyphen/moz.build +++ /dev/null @@ -1,19 +0,0 @@ -# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*- -# vim: set filetype=python: -# This Source Code Form is subject to the terms of the Mozilla Public -# License, v. 2.0. If a copy of the MPL was not distributed with this -# file, You can obtain one at http://mozilla.org/MPL/2.0/. - -# These files cannot be built in unified mode because they include hnjalloc.h. -SOURCES += [ - 'hyphen.c', -] - -FINAL_LIBRARY = 'xul' - -LOCAL_INCLUDES += [ - '../glue', -] - -# We allow warnings for third-party code that can be updated from upstream. 
-AllowCompilerWarnings() diff --git a/intl/locales/af/hyphenation/hyph_af.hyf b/intl/locales/af/hyphenation/hyph_af.hyf new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/intl/locales/bg/hyphenation/hyph_bg.hyf b/intl/locales/bg/hyphenation/hyph_bg.hyf new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/intl/locales/ca/hyphenation/hyph_ca.hyf b/intl/locales/ca/hyphenation/hyph_ca.hyf new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/intl/locales/cy/hyphenation/hyph_cy.hyf b/intl/locales/cy/hyphenation/hyph_cy.hyf new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/intl/locales/da/hyphenation/hyph_da.hyf b/intl/locales/da/hyphenation/hyph_da.hyf new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/intl/locales/de-1901/hyphenation/hyph_de-1901.hyf b/intl/locales/de-1901/hyphenation/hyph_de-1901.hyf new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/intl/locales/de-1996/hyphenation/hyph_de-1996.hyf b/intl/locales/de-1996/hyphenation/hyph_de-1996.hyf new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/intl/locales/de-CH/hyphenation/hyph_de-CH.hyf b/intl/locales/de-CH/hyphenation/hyph_de-CH.hyf new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/intl/locales/en-US/hyphenation/hyph_en_US.hyf b/intl/locales/en-US/hyphenation/hyph_en_US.hyf new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/intl/locales/eo/hyphenation/hyph_eo.hyf b/intl/locales/eo/hyphenation/hyph_eo.hyf new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/intl/locales/es/hyphenation/hyph_es.hyf b/intl/locales/es/hyphenation/hyph_es.hyf new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/intl/locales/et/hyphenation/hyph_et.hyf b/intl/locales/et/hyphenation/hyph_et.hyf new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/intl/locales/fi/hyphenation/hyph_fi.hyf b/intl/locales/fi/hyphenation/hyph_fi.hyf new file mode 100644 index 
000000000000..e69de29bb2d1 diff --git a/intl/locales/fr/hyphenation/hyph_fr.hyf b/intl/locales/fr/hyphenation/hyph_fr.hyf new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/intl/locales/gl/hyphenation/hyph_gl.hyf b/intl/locales/gl/hyphenation/hyph_gl.hyf new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/intl/locales/hr/hyphenation/hyph_hr.hyf b/intl/locales/hr/hyphenation/hyph_hr.hyf new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/intl/locales/hsb/hyphenation/hyph_hsb.dic b/intl/locales/hsb/hyphenation/hyph_hsb.dic index db038cc7a3d3..98f40b4bb4b4 100644 --- a/intl/locales/hsb/hyphenation/hyph_hsb.dic +++ b/intl/locales/hsb/hyphenation/hyph_hsb.dic @@ -1589,8 +1589,8 @@ izn4j iz1no 2z1p 2z1s -.W8a8r9s8z8a9w8a. -.Warsza3w2a +.w8a8r9s8z8a9w8a. +.warsza3w2a .d8o9z8n8a. .do1z2na1 .n8j8e8j9s8y8m. diff --git a/intl/locales/hsb/hyphenation/hyph_hsb.hyf b/intl/locales/hsb/hyphenation/hyph_hsb.hyf new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/intl/locales/hu/hyphenation/hyph_hu.hyf b/intl/locales/hu/hyphenation/hyph_hu.hyf new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/intl/locales/ia/hyphenation/hyph_ia.hyf b/intl/locales/ia/hyphenation/hyph_ia.hyf new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/intl/locales/is/hyphenation/hyph_is.hyf b/intl/locales/is/hyphenation/hyph_is.hyf new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/intl/locales/it/hyphenation/hyph_it.hyf b/intl/locales/it/hyphenation/hyph_it.hyf new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/intl/locales/kmr/hyphenation/hyph_kmr.hyf b/intl/locales/kmr/hyphenation/hyph_kmr.hyf new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/intl/locales/la/hyphenation/hyph_la.hyf b/intl/locales/la/hyphenation/hyph_la.hyf new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/intl/locales/lt/hyphenation/hyph_lt.hyf 
b/intl/locales/lt/hyphenation/hyph_lt.hyf new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/intl/locales/mn/hyphenation/hyph_mn.hyf b/intl/locales/mn/hyphenation/hyph_mn.hyf new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/intl/locales/moz.build b/intl/locales/moz.build index a952f6ebb491..958911c0a2d5 100644 --- a/intl/locales/moz.build +++ b/intl/locales/moz.build @@ -42,7 +42,7 @@ locales = [ 'tr', 'uk', ] -filename = '{locale}/hyphenation/hyph_{locale}.dic' +filename = '{locale}/hyphenation/hyph_{locale}.hyf' FINAL_TARGET_FILES.hyphenation += [filename.format(locale=locale) for locale in locales] # en-US is a special case: the dic file is named like en_US. -FINAL_TARGET_FILES.hyphenation += ['en-US/hyphenation/hyph_en_US.dic'] +FINAL_TARGET_FILES.hyphenation += ['en-US/hyphenation/hyph_en_US.hyf'] diff --git a/intl/locales/nb/hyphenation/hyph_nb.hyf b/intl/locales/nb/hyphenation/hyph_nb.hyf new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/intl/locales/nl/hyphenation/hyph_nl.hyf b/intl/locales/nl/hyphenation/hyph_nl.hyf new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/intl/locales/nn/hyphenation/hyph_nn.hyf b/intl/locales/nn/hyphenation/hyph_nn.hyf new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/intl/locales/pl/hyphenation/hyph_pl.hyf b/intl/locales/pl/hyphenation/hyph_pl.hyf new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/intl/locales/pt/hyphenation/hyph_pt.hyf b/intl/locales/pt/hyphenation/hyph_pt.hyf new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/intl/locales/ru/hyphenation/hyph_ru.hyf b/intl/locales/ru/hyphenation/hyph_ru.hyf new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/intl/locales/sh/hyphenation/hyph_sh.hyf b/intl/locales/sh/hyphenation/hyph_sh.hyf new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/intl/locales/sl/hyphenation/hyph_sl.hyf b/intl/locales/sl/hyphenation/hyph_sl.hyf new file mode 
100644 index 000000000000..e69de29bb2d1 diff --git a/intl/locales/sv/hyphenation/hyph_sv.hyf b/intl/locales/sv/hyphenation/hyph_sv.hyf new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/intl/locales/tr/hyphenation/hyph_tr.hyf b/intl/locales/tr/hyphenation/hyph_tr.hyf new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/intl/locales/uk/hyphenation/hyph_uk.hyf b/intl/locales/uk/hyphenation/hyph_uk.hyf new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/intl/moz.build b/intl/moz.build index 05d94899a111..4d1906f3a45c 100644 --- a/intl/moz.build +++ b/intl/moz.build @@ -9,7 +9,6 @@ TEST_DIRS += [ ] DIRS += [ - 'hyphenation/hyphen', 'hyphenation/glue', 'locale', 'locales', diff --git a/layout/style/RunCbindgen.py b/layout/style/RunCbindgen.py index d49943a36e34..9a88fae37bb7 100644 --- a/layout/style/RunCbindgen.py +++ b/layout/style/RunCbindgen.py @@ -29,7 +29,8 @@ def generate(output, cbindgen_crate_path, *in_tree_dependencies): "--lockfile", CARGO_LOCK, "--crate", - _get_crate_name(cbindgen_crate_path) + _get_crate_name(cbindgen_crate_path), + "--cpp-compat" ], env=env, stdout=subprocess.PIPE, stderr=subprocess.PIPE) stdout, stderr = p.communicate() diff --git a/testing/testsuite-targets.mk b/testing/testsuite-targets.mk index 739301972cea..5bb9a998e14e 100644 --- a/testing/testsuite-targets.mk +++ b/testing/testsuite-targets.mk @@ -224,7 +224,7 @@ stage-android: make-stage-dir $(NSINSTALL) $(topsrcdir)/mobile/android/fonts $(DEPTH)/_tests/reftest $(NSINSTALL) $(topsrcdir)/mobile/android/fonts $(DEPTH)/_tests/testing/mochitest $(NSINSTALL) -D $(DEPTH)/_tests/reftest/hyphenation - $(NSINSTALL) $(wildcard $(topsrcdir)/intl/locales/*/hyphenation/*.dic) $(DEPTH)/_tests/reftest/hyphenation + $(NSINSTALL) $(wildcard $(topsrcdir)/intl/locales/*/hyphenation/*.hyf) $(DEPTH)/_tests/reftest/hyphenation ifdef MOZ_COPY_PDBS CPP_UNIT_TEST_BINS=$(filter-out $(wildcard $(DIST)/cppunittests/*.pdb), $(wildcard $(DIST)/cppunittests/*)) diff 
--git a/third_party/rust/mapped_hyph/.cargo-checksum.json b/third_party/rust/mapped_hyph/.cargo-checksum.json new file mode 100644 index 000000000000..9c7e639fa3c5 --- /dev/null +++ b/third_party/rust/mapped_hyph/.cargo-checksum.json @@ -0,0 +1 @@ +{"files":{"COPYRIGHT":"4df931055b82b96e13ad475c4cee3de5afa69a54a4c611c9d7dc6252d858d9c8","Cargo.toml":"ed3016de5a5dbfb0904cd3a442fa98cb66f8b4d8c1b801bcdcba777b57abe69d","LICENSE-APACHE":"cfc7749b96f63bd31c3c42b5c471bf756814053e847c10f3eb003417bc523d30","LICENSE-MIT":"4ad721b5b6a3d39ca3e2202f403d897c4a1d42896486dd58963a81f8e64ef61d","README.md":"14cbfed88443a2e7ffb5beb788cae17e19d7329e9ef6c7ebdbd45c67751f4a06","benches/bench.rs":"ed7143e66ecf8bfb12c87d1f9344157d97696b8194de9132d061129bc80d8d52","cbindgen.toml":"07d22767e85ed64cf190038205e189a8fffea8910bbe923d04f425b36b9e9e93","doc/mapped_hyph_format.md":"2f2487cf536fe4b03db6e4b384be06744ec30b3f299519492288306a93127fbb","hyph_en_US.hyf":"6262b4c5118fe277ab4add8689d9524ca72097564652baec67a8fcd5029ec9b0","src/bin/hyf_compile.rs":"8dfcad9c6e6f27bda9eb6ac6493114fdec0187fef144d86e097ffe488d00a49c","src/builder.rs":"7d4bb46ab2e00bb1cad1de8365781102a44817f23518ca617db17c07d44f5f7e","src/ffi.rs":"bdcff084276418788f4c8a1c525d7a6fd0bce900ca1561ff0353029e1171d9f1","src/lib.rs":"0126ba46f1c30a2dea2f72dec9e9639635aaba85f4b0da7b1a6e2f52624243ed","src/main.rs":"666befeb39cb1a7dfb66c6b9218d5f7b6c4ed09dbbbc8cfff6b749a33a99ebcf","tests/base.hyf":"d8bf57c6280cfa1d357d3fdba156ce64afbd9df58e28eeb084dfe3f80972b73f","tests/base.hyph":"a3f1fab24c101701fdf21e8359685d80611ab970304e2bd89ef024768b3700c8","tests/base.word":"1136c9a421b242262661b9a65723f87a5ecf77ae38eabcea057832d036d567fd","tests/compound.hyf":"929c1ba6676e4c43bc649d0abf4275ea9e8b02bffaa5acdf704a710813a7a13c","tests/compound4.hyf":"2093287bc41ee30ff9bdbf278f1f8209cb1d1a78236b46e9060af2a881572b8e","tests/compound5.hyf":"0942a5dfbb8d0ef3a937ab9da0418abb41300357cde49f4c477a59a11b2cb6bd","tests/compound6.hyf":"ebad958c2692a5b439b31e324020ed
27c42dc05bd5b8c6a6dea4669e6ccf76b4","tests/hyphen.hyf":"92b8a5c86aac6a0b9f0eb7330a057065d6985fd047e851cae47039995c682d4d","tests/lhmin.hyf":"23c886704fafee7d9c54b2478029cf69a5fa946c2f2442bd86697bca5933c88d","tests/num.hyf":"4834fabe78b5c81815434d4562ce3322541649e1ea1edc555a498574bc8b237e","tests/rhmin.hyf":"239cb3d4d7f904abb43b57241e12cc1396e636220c3806e64666aca7ca46cc42","tests/settings2.hyf":"9fc4855e0b952a3593db1efef080b93ce7f1c6fe6798db0440e2bf0cc986ffa2","tests/settings3.hyf":"867db207b485a06e7d60ad10735c9111f10516ee3a5afd6306c683ace3454491","tests/test.rs":"5c81ae59b9384b70d9461407999dac1fde9214398876c4433fbbde9571cc1d94"},"package":null} \ No newline at end of file diff --git a/third_party/rust/mapped_hyph/COPYRIGHT b/third_party/rust/mapped_hyph/COPYRIGHT new file mode 100644 index 000000000000..a1254361b371 --- /dev/null +++ b/third_party/rust/mapped_hyph/COPYRIGHT @@ -0,0 +1,12 @@ +mapped_hyph is copyright 2019 Mozilla Foundation. + +Licensed under the Apache License, Version 2.0 +<LICENSE-APACHE or http://www.apache.org/licenses/LICENSE-2.0> or the MIT +license <LICENSE-MIT or http://opensource.org/licenses/MIT>, +at your option. All files in the project carrying such +notice may not be copied, modified, or distributed except +according to those terms. + +Code in the subdirectories /test/ and /bench/ is dedicated +to the Public Domain. 
diff --git a/third_party/rust/mapped_hyph/Cargo.toml b/third_party/rust/mapped_hyph/Cargo.toml new file mode 100644 index 000000000000..76380c3cd978 --- /dev/null +++ b/third_party/rust/mapped_hyph/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "mapped_hyph" +description = "Hyphenation using precompiled memory-mapped tables" +version = "0.3.0" +authors = ["Jonathan Kew "] +license = "MIT/Apache-2.0" +edition = "2018" + +[dependencies] +memmap = "0.7.0" +arrayref = "0.3.5" + +[dev-dependencies] +criterion = "0.3" + +[[bench]] +name = "bench" +harness = false diff --git a/third_party/rust/mapped_hyph/LICENSE-APACHE b/third_party/rust/mapped_hyph/LICENSE-APACHE new file mode 100644 index 000000000000..d64569567334 --- /dev/null +++ b/third_party/rust/mapped_hyph/LICENSE-APACHE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/third_party/rust/mapped_hyph/LICENSE-MIT b/third_party/rust/mapped_hyph/LICENSE-MIT new file mode 100644 index 000000000000..b4850c952004 --- /dev/null +++ b/third_party/rust/mapped_hyph/LICENSE-MIT @@ -0,0 +1,25 @@ +Copyright (c) 2019 Mozilla Foundation + +Permission is hereby granted, free of charge, to any +person obtaining a copy of this software and associated +documentation files (the "Software"), to deal in the +Software without restriction, including without +limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software +is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice +shall be included in all copies or substantial portions +of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT +SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
diff --git a/third_party/rust/mapped_hyph/README.md b/third_party/rust/mapped_hyph/README.md new file mode 100644 index 000000000000..6ea145d18c40 --- /dev/null +++ b/third_party/rust/mapped_hyph/README.md @@ -0,0 +1,75 @@ +# mapped_hyph + +mapped_hyph is a reimplementation of the hyphenation algorithm from the +[libhyphen](https://github.com/hunspell/hyphen) library +that is intended to reduce the in-memory footprint of loaded +hyphenation dictionaries, especially when the same dictionary +may be in use by multiple processes. + +To reduce memory footprint, mapped_hyph uses hyphenation dictionaries that are +"precompiled" into a flat, position-independent binary format that is used +directly by the runtime hyphenation functions. +Therefore, dictionaries do not have to be parsed into a dynamic structure in memory; +the files can simply be mmap'd into the address space and immediately used. +In addition, a compiled dictionary mapped into a shared-memory block +can be made available to multiple processes for no added physical memory cost. + +One deliberate simplification compared to libhyphen +is that mapped_hyph only accepts UTF-8 text and hyphenation dictionaries; +legacy non-Unicode encodings are not supported. + +mapped_hyph has been created primarily for use by Gecko, replacing the use of libhyphen, +and so its features (and limitations) are based on this use case. +However, it is hoped that it will also be more generally useful. + +## Functionality + +Currently, mapped_hyph supports only "standard" hyphenation, where spelling does not +change around the hyphenation position. At present this is the only kind of +hyphenation supported in Gecko. + +The compiled hyphenation dictionary format includes provision for replacement +strings and indexes, as used by libhyphen to support non-standard hyphenations +(e.g. German "Schiffahrt" -> "Schiff-fahrt"), but the `find_hyphen_values` function +will ignore any such hyphenation positions it finds. 
+(None of the hyphenation dictionaries shipping with Firefox includes such rules.) + +## Licensing + +mapped_hyph is dual licensed under the Apache-2.0 and MIT licenses; +see the file COPYRIGHT. + +## Documentation + +Use `cargo doc --open` to view (admittedly brief) documentation generated from +comments in the source. + +## C and C++ bindings + +See the `mapped_hyph.h` header for C/C++ APIs that can be used to load hyphenation files +and to locate valid hyphenation positions in a word. + +## Sample programs + +See main.rs for a simple example program. + +## Compiled dictionaries + +The `hyf_compile` tool is used to generate `.hyf` files for mapped_hyph +from standard `.dic` (or `.pat`) files as used by libhyphen, LibreOffice, etc. + +(A compiled version of the `hyph_en_US` dictionary from libhyphen is currently +included here, as it is handy for testing purposes.) + +## Release Notes + +### 0.2.0 + +* Implemented a hyphenation table compiler in the `builder` submodule, + and `hyf_compile` command-line tool. + +* Moved C-callable API functions into an `ffi` submodule. + +### 0.1.0 + +* Initial release. diff --git a/third_party/rust/mapped_hyph/benches/bench.rs b/third_party/rust/mapped_hyph/benches/bench.rs new file mode 100644 index 000000000000..cf4ad6cb2fb6 --- /dev/null +++ b/third_party/rust/mapped_hyph/benches/bench.rs @@ -0,0 +1,50 @@ +// Any copyright to the test code below is dedicated to the Public Domain. 
+// http://creativecommons.org/publicdomain/zero/1.0/ + +use criterion::black_box; +use criterion::criterion_group; +use criterion::criterion_main; +use criterion::BenchmarkId; +use criterion::Criterion; + +use mapped_hyph::Hyphenator; +use std::fs; + +const SAMPLE_SIZE: usize = 300; +const DIC_PATH: &str = "hyph_en_US.hyf"; + +fn bench_construct(c: &mut Criterion) { + c.bench_function("construct", |b| { + b.iter(|| { + let dic = unsafe { mapped_hyph::load_file(DIC_PATH) } + .expect(&format!("failed to load dictionary {}", DIC_PATH)); + let _ = Hyphenator::new(black_box(&*dic)); + }) + }); +} + +fn bench_find_hyphen_values(c: &mut Criterion) { + // XXX: Should we copy this file to the crate to ensure reproducibility? + let data = fs::read_to_string("/usr/share/dict/words").expect("File reading failed."); + let words: Vec<&str> = data.lines().take(SAMPLE_SIZE).collect(); + + let dic = unsafe { mapped_hyph::load_file(DIC_PATH) } + .expect(&format!("failed to load dictionary {}", DIC_PATH)); + let hyph = Hyphenator::new(&*dic); + + c.bench_with_input( + BenchmarkId::new("bench_word", SAMPLE_SIZE), + &words, + |b, words| { + b.iter(|| { + let mut values: Vec = vec![0; 1000]; + for w in words { + hyph.find_hyphen_values(&w, &mut values); + } + }); + }, + ); +} + +criterion_group!(benches, bench_construct, bench_find_hyphen_values,); +criterion_main!(benches); diff --git a/third_party/rust/mapped_hyph/cbindgen.toml b/third_party/rust/mapped_hyph/cbindgen.toml new file mode 100644 index 000000000000..9ad425292c11 --- /dev/null +++ b/third_party/rust/mapped_hyph/cbindgen.toml @@ -0,0 +1,114 @@ +# This is a template cbindgen.toml file with all of the default values. +# Some values are commented out because their absence is the real default. +# +# See https://github.com/eqrion/cbindgen/blob/master/docs.md#cbindgentoml +# for detailed documentation of every option here.
+ +language = "C" + +############## Options for Wrapping the Contents of the Header ################# + +header = """/* + * Copyright 2019 Mozilla Foundation. See the COPYRIGHT + * file at the top-level directory of this distribution. + * + * Licensed under the Apache License, Version 2.0 or the MIT license + * , at your + * option. This file may not be copied, modified, or distributed + * except according to those terms. +**/ + +/* clang-format off */ +""" +trailer = "/* clang-format on */" +include_guard = "mapped_hyph_h" +autogen_warning = """/* + * Warning, this file is autogenerated by cbindgen. Don't modify this manually. + */ +""" +include_version = false +# namespace = "my_namespace" +namespaces = [] +# using_namespaces = [] +sys_includes = ["stdbool.h","stdint.h"] +includes = [] +no_includes = true + +############################ Code Style Options ################################ + +braces = "SameLine" +line_length = 100 +tab_width = 2 +documentation_style = "auto" + +############################# Codegen Options ################################## + +style = "both" + +[defines] +# "target_os = freebsd" = "DEFINE_FREEBSD" +# "feature = serde" = "DEFINE_SERDE" + +[export] +include = [] +exclude = [] +# prefix = "CAPI_" +item_types = [] +renaming_overrides_prefixing = false + +[export.rename] + +[export.body] + +[fn] +rename_args = "None" +# must_use = "MUST_USE_FUNC" +# prefix = "START_FUNC" +# postfix = "END_FUNC" +args = "auto" + +[struct] +rename_fields = "None" +# must_use = "MUST_USE_STRUCT" +derive_constructor = false +derive_eq = false +derive_neq = false +derive_lt = false +derive_lte = false +derive_gt = false +derive_gte = false + +[enum] +rename_variants = "None" +# must_use = "MUST_USE_ENUM" +add_sentinel = false +prefix_with_name = false +derive_helper_methods = false +derive_const_casts = false +derive_mut_casts = false +# cast_assert_name = "ASSERT" +derive_tagged_enum_destructor = false +derive_tagged_enum_copy_constructor = false 
+private_default_tagged_enum_constructor = false + +[const] +allow_static_const = true + +[macro_expansion] +bitflags = false + +############## Options for How Your Rust library Should Be Parsed ############## + +[parse] +parse_deps = false +# include = [] +exclude = [] +clean = false +extra_bindings = [] + +[parse.expand] +crates = [] +all_features = false +default_features = true +features = [] diff --git a/third_party/rust/mapped_hyph/doc/mapped_hyph_format.md b/third_party/rust/mapped_hyph/doc/mapped_hyph_format.md new file mode 100644 index 000000000000..d98162d7ea11 --- /dev/null +++ b/third_party/rust/mapped_hyph/doc/mapped_hyph_format.md @@ -0,0 +1,98 @@ +# Compiled hyphenation table format for mapped_hyph + +The file is a "flattened" representation of the list of `HyphenDict` structs +and descendant objects used by libhyphen +(see [hyphen.h](https://github.com/hunspell/hyphen/blob/master/hyphen.h)). + +Note that multi-byte integer types in the file are stored in _little-endian_ byte order. + +## Overall file header + +The file begins with a 4-byte "signature", followed by a count of the number +of hyphenation levels, and an array of offsets to each hyphenation level. +A "level" is essentially equivalent to libhyphen's `HyphenDict`. + +### Header (size: 8 bytes + 4 * numLevels) +Type | Name | Description +-----|------|------------ +uint8[4] | magicNumber | 4-byte file identification code: ['H', 'y', 'f', '0'] +uint32 | numLevels | number of hyphenation levels present +uint32[numLevels] | levelOffset | offset from start of file to each Level + +Currently, there are normally 2 hyphenation levels, as the parser/compiler will +generate a default first level if no NEXTLEVEL keyword is present in the pattern file. + +## Hyphenation Level + +Each level of the hyphenation pattern begins with a Level header, followed by +the data for its states and the strings they refer to. 
+When the hyphenation machine is executed, we always begin at state offset 0 +(from the level's stateDataBase); each transition to a new state represents the +target directly by its offset from stateDataBase. +A state offset of 0xFFFFFF is considered invalid. + +Strings are represented as offsets from the level's stringDataBase; each string +is encoded as a one-byte length followed by `length` bytes of utf-8 data. +(So the maximum string length is 255 utf-8 code units; this is far more than any actual +hyphenation dictionary uses). +A string offset of 0xFFFF is considered invalid and represents an absent string. + +The minimum number of characters that must be kept together at the start/end of a word, +or of a component of a compound (i.e. the `...Min` values) is a count of _Unicode characters_, +not UTF-8 code units. (Note that the presentation-form ligature characters U+FB00 'ff' through U+FB06 'st' +are counted as 2 or 3 characters for this purpose.) + +### Level (size: 16 bytes + state data + string data, padded to a 4-byte boundary) +Type | Name | Description +-----|------|------------ +uint32 | stateDataBase | offset from beginning of Level to start of level's State data +uint32 | stringDataBase | offset from beginning of Level to start of level's packed String data +uint16 | noHyphenStringOffset | from level's stringDataBase +uint16 | noHyphenCount | number of (NUL-separated) strings in the nohyphen string +uint8 | leftHyphenMin | minimum number of characters kept together at start of word +uint8 | rightHyphenMin | minimum number of characters kept together at end of word +uint8 | compoundLeftHyphenMin | minimum number of characters kept together at start of second component of a compound +uint8 | compoundRightHyphenMin | minimum number of characters kept together at end of first component of a compound + +## State + +Each state, referred to by its offset from the level's stateDataBase, consists of a header +followed by an array of transitions for input 
bytes that need to be matched in this state. +The state also records a fallback state offset, which is the transition to be taken +if the next input byte does not match any of the transition records. + +If a match string is present (i.e. `matchStringOffset` is not 0xFFFF), it is a string of hyphenation values +(encoded as ASCII digits '0'..'9') to be applied at the current position in the word. + +### StateHeader (size: 8 bytes) +Type | Name | Description +-----|------|------------ +uint32 | fallbackStateOffset | (from level's stateDataBase) +uint16 | matchStringOffset | (from level's stringDataBase) +uint8 | numTransitions | count of Transitions that follow the StateHeader and optional StateHeaderExtension +uint8 | isExtended | if non-zero, the StateHeader is immediately followed by a StateHeaderExtension + +If the `isExtended` flag in the state header is set, this state includes a potential spelling change +and there is an extended form of the header present before the array of transitions. +(Note that extended states with spelling-change rules are not yet supported by the mapped_hyph engine; +none of the hyphenation dictionaries shipped with Firefox includes such rules.) + +### StateHeaderExtension (size: 4 bytes) +Type | Name | Description +-----|------|------------ +uint16 | replacementStringOffset | (from level's stringDataBase) the replacement string +int8 | replacementIndex | index of the byte position (relative to current position in the word) at which the spelling replacement should happen +int8 | replacementCut | number of bytes to cut from the original word when making the replacement + +## Transitions + +The state's transitions are encoded as an array of Transition records, each corresponding to an input byte +and providing the offset of the new state. The transitions for each state are sorted by ascending value of input byte +(although in practice there are usually only a few valid transitions, and so a binary search does not seem to be +worthwhile). 
+ +### Transition (size: 4 bytes) +Type | Name | Description +-----|------|------------ +uint24 | newStateOffset | (from level's stateDataBase) +uint8 | inputByte | the input byte (utf-8 code unit) for this transition diff --git a/third_party/rust/mapped_hyph/hyph_en_US.hyf b/third_party/rust/mapped_hyph/hyph_en_US.hyf new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/third_party/rust/mapped_hyph/src/bin/hyf_compile.rs b/third_party/rust/mapped_hyph/src/bin/hyf_compile.rs new file mode 100644 index 000000000000..4e1671102c0b --- /dev/null +++ b/third_party/rust/mapped_hyph/src/bin/hyf_compile.rs @@ -0,0 +1,25 @@ +// Copyright 2019 Mozilla Foundation. See the COPYRIGHT +// file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +extern crate mapped_hyph; + +use std::env; +use std::fs::File; + +fn main() -> std::io::Result<()> { + let args: Vec = env::args().collect(); + if args.len() == 3 { + let in_file = File::open(&args[1])?; + let mut out_file = File::create(&args[2])?; + mapped_hyph::builder::write_hyf_file(&mut out_file, mapped_hyph::builder::read_dic_file(&in_file))?; + } else { + println!("usage: hyf_compile "); + } + Ok(()) +} diff --git a/third_party/rust/mapped_hyph/src/builder.rs b/third_party/rust/mapped_hyph/src/builder.rs new file mode 100644 index 000000000000..7a13947aa44b --- /dev/null +++ b/third_party/rust/mapped_hyph/src/builder.rs @@ -0,0 +1,473 @@ +// Copyright 2019 Mozilla Foundation. See the COPYRIGHT +// file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. 
+ +/// Functions to compile human-readable patterns into a mapped_hyph +/// flattened representation of the hyphenation state machine. + +use std::io::{Read,BufRead,BufReader,Write}; +use std::collections::HashMap; +use std::convert::TryInto; +use std::hash::{Hash,Hasher}; + +// Wrap a HashMap so that we can implement the Hash trait. +#[derive(PartialEq, Eq, Clone)] +struct TransitionMap (HashMap); + +impl TransitionMap { + fn new() -> TransitionMap { + TransitionMap(HashMap::::new()) + } +} + +impl Hash for TransitionMap { + fn hash(&self, state: &mut H) { + // We only look at the values here; that's likely to be enough + // for a reasonable hash. + let mut transitions: Vec<&i32> = self.0.values().collect(); + transitions.sort(); + for t in transitions { + t.hash(state); + } + } +} + +#[derive(PartialEq, Eq, Hash, Clone)] +struct State { + match_string: Option>, + #[allow(dead_code)] + repl_string: Option>, + #[allow(dead_code)] + repl_index: i32, + #[allow(dead_code)] + repl_cut: i32, + fallback_state: i32, + transitions: TransitionMap, +} + +impl State { + fn new() -> State { + State { + match_string: None, + repl_string: None, + repl_index: -1, + repl_cut: -1, + fallback_state: -1, + transitions: TransitionMap::new(), + } + } +} + +/// This is only public because the read_dic_file() function returns a Vec +/// of LevelBuilder structs, which can then be passed to write_hyf_file() +/// to create the flattened output. +pub struct LevelBuilder { + states: Vec, + str_to_state: HashMap,i32>, + encoding: Option, + nohyphen: Option, + lh_min: u8, + rh_min: u8, + clh_min: u8, + crh_min: u8, +} + +impl LevelBuilder { + fn new() -> LevelBuilder { + let mut result = LevelBuilder { + states: Vec::::new(), + str_to_state: HashMap::,i32>::new(), + encoding: None, + nohyphen: None, + lh_min: 0, + rh_min: 0, + clh_min: 0, + crh_min: 0, + }; + // Initialize the builder with an empty start state. 
+ result.str_to_state.insert(vec![], 0); + result.states.push(State::new()); + result + } + + fn find_state_number_for(&mut self, text: &[u8]) -> i32 { + let count = self.states.len() as i32; + let index = *self.str_to_state.entry(text.to_vec()).or_insert(count); + if index == count { + self.states.push(State::new()); + } + index + } + + fn add_pattern(&mut self, pattern: &str) { + let mut bytes = pattern.as_bytes(); + let mut text = Vec::::with_capacity(bytes.len()); + let mut digits = Vec::::with_capacity(bytes.len() + 1); + let mut repl_str = None; + let mut repl_index = 0; + let mut repl_cut = 0; + + // Check for replacement rule (non-standard hyphenation spelling change). + if let Some(slash) = bytes.iter().position(|x| *x == b'/') { + let parts = bytes.split_at(slash); + bytes = parts.0; + let mut it = parts.1[1 ..].split(|x| *x == b','); + if let Some(repl) = it.next() { + repl_str = Some(repl.to_vec()); + } + if let Some(num) = it.next() { + repl_index = std::str::from_utf8(num).unwrap().parse::().unwrap() - 1; + } + if let Some(num) = it.next() { + repl_cut = std::str::from_utf8(num).unwrap().parse::().unwrap(); + } + } + + // Separate the input pattern into parallel arrays of text (bytes) and digits. + let mut got_digit = false; + for byte in bytes { + if *byte <= b'9' && *byte >= b'0' { + assert!(!got_digit, "invalid pattern \"{}\": consecutive digits", pattern); + digits.push(*byte); + got_digit = true; + } else { + text.push(*byte); + if got_digit { + got_digit = false; + } else { + digits.push(b'0'); + } + } + } + if !got_digit { + digits.push(b'0'); + } + + if repl_str.is_none() { + // Optimize away leading zeroes from the digits array. + while !digits.is_empty() && digits[0] == b'0' { + digits.remove(0); + } + } else { + // Convert repl_index and repl_cut from Unicode char to byte indexing. + let start = if text[0] == b'.' 
{ 1 } else { 0 }; + if start == 1 { + assert_eq!(digits[0], b'0', "unexpected digit before start of word"); + digits.remove(0); + } + let word = std::str::from_utf8(&text[start..]).unwrap(); + let mut chars: Vec<_> = word.char_indices().collect(); + chars.push((word.len(), '.')); + repl_cut = chars[(repl_index + repl_cut) as usize].0 as i32 - chars[repl_index as usize].0 as i32; + repl_index = chars[repl_index as usize].0 as i32; + } + + // Create the new state, or add pattern into an existing state + // (which should not already have a match_string). + let mut state_num = self.find_state_number_for(&text); + let mut state = &mut self.states[state_num as usize]; + assert!(state.match_string.is_none(), "duplicate pattern?"); + if !digits.is_empty() { + state.match_string = Some(digits); + } + if repl_str.is_some() { + state.repl_string = repl_str; + state.repl_index = repl_index; + state.repl_cut = repl_cut; + } + + // Set up prefix transitions, inserting additional states as needed. + while !text.is_empty() { + let last_state = state_num; + let ch = *text.last().unwrap(); + text.truncate(text.len() - 1); + state_num = self.find_state_number_for(&text); + if let Some(exists) = self.states[state_num as usize].transitions.0.insert(ch, last_state) { + assert_eq!(exists, last_state, "overwriting existing transition?"); + break; + } + } + } + + fn merge_duplicate_states(&mut self) { + // We loop here because when we eliminate a duplicate, and update the transitions + // that referenced it, we may thereby create new duplicates that another pass + // will find and compress further. + loop { + let orig_len = self.states.len(); + // Used to map State records to the (first) index at which they occur. + let mut state_to_index = HashMap::<&State,i32>::new(); + // Mapping of old->new state indexes, and whether each old state is + // a duplicate that should be dropped.
+ let mut mappings = Vec::<(i32,bool)>::with_capacity(orig_len); + let mut next_new_index: i32 = 0; + for index in 0 .. self.states.len() { + // Find existing index for this state, or allocate the next new index to it. + let new_index = *state_to_index.entry(&self.states[index]).or_insert(next_new_index); + // Record the mapping, and whether the state was a duplicate. + mappings.push((new_index, new_index != next_new_index)); + // If we used next_new_index for this state, increment it. + if new_index == next_new_index { + next_new_index += 1; + } + } + // If we didn't find any duplicates, next_new_index will have kept pace with + // index, so we know we're finished. + if next_new_index as usize == self.states.len() { + break; + } + // Iterate over all the states, either deleting them or updating indexes + // according to the mapping we created; then repeat the search. + for index in (0 .. self.states.len()).rev() { + if mappings[index].1 { + self.states.remove(index); + } else { + let state = &mut self.states[index]; + if state.fallback_state != -1 { + state.fallback_state = mappings[state.fallback_state as usize].0; + } + for t in state.transitions.0.iter_mut() { + *t.1 = mappings[*t.1 as usize].0; + } + } + } + } + } + + fn flatten(&self) -> Vec { + // Calculate total space needed for state data, and build the state_to_offset table. + let mut state_data_size = 0; + let mut state_to_offset = Vec::::with_capacity(self.states.len()); + for state in &self.states { + state_to_offset.push(state_data_size); + state_data_size += if state.repl_string.is_some() { 12 } else { 8 }; + state_data_size += state.transitions.0.len() * 4; + } + + // Helper to map a state index to its offset in the final data block. 
+ let get_state_offset_for = |state_index: i32| -> u32 { + if state_index < 0 { + return super::INVALID_STATE_OFFSET; + } + state_to_offset[state_index as usize] as u32 + }; + + // Helper to map a byte string to its offset in the final data block, and + // store the bytes into string_data unless using an already-existing string. + let mut string_to_offset = HashMap::,usize>::new(); + let mut string_data = Vec::::new(); + let mut get_string_offset_for = |bytes: &Option>| -> u16 { + if bytes.is_none() { + return super::INVALID_STRING_OFFSET; + } + assert!(bytes.as_ref().unwrap().len() < 256); + let new_offset = string_data.len(); + let offset = *string_to_offset.entry(bytes.as_ref().unwrap().clone()).or_insert(new_offset); + if offset == new_offset { + string_data.push(bytes.as_ref().unwrap().len() as u8); + string_data.extend_from_slice(bytes.as_ref().unwrap().as_ref()); + } + offset.try_into().unwrap() + }; + + // Handle nohyphen string list if present, converting comma separators to NULs + // and trimming any surplus whitespace. + let mut nohyphen_string_offset: u16 = super::INVALID_STRING_OFFSET; + let mut nohyphen_count: u16 = 0; + if self.nohyphen.is_some() { + let nohyphen_strings: Vec<_> = self.nohyphen.as_ref().unwrap().split(',').map(|x| x.trim()).collect(); + nohyphen_count = nohyphen_strings.len().try_into().unwrap(); + nohyphen_string_offset = get_string_offset_for(&Some(nohyphen_strings.join("\0").as_bytes().to_vec())); + } + + let mut state_data = Vec::::with_capacity(state_data_size); + for state in &self.states { + state_data.extend(&get_state_offset_for(state.fallback_state).to_le_bytes()); + state_data.extend(&get_string_offset_for(&state.match_string).to_le_bytes()); + state_data.push(state.transitions.0.len() as u8); + // Determine whether to use an extended state record, and if so add the + // replacement string and index fields. 
+ if state.repl_string.is_none() { + state_data.push(0); + } else { + state_data.push(1); + state_data.extend(&get_string_offset_for(&state.repl_string).to_le_bytes()); + state_data.push(state.repl_index as u8); + state_data.push(state.repl_cut as u8); + } + // Collect transitions into an array so we can sort them. + let mut transitions = vec![]; + for (key, value) in state.transitions.0.iter() { + transitions.push((*key, get_state_offset_for(*value))) + } + transitions.sort(); + for t in transitions { + // New state offset is stored as a 24-bit value, so we do this manually. + state_data.push((t.1 & 0xff) as u8); + state_data.push(((t.1 >> 8) & 0xff) as u8); + state_data.push(((t.1 >> 16) & 0xff) as u8); + state_data.push(t.0); + } + } + assert_eq!(state_data.len(), state_data_size); + + // Pad string data to a 4-byte boundary + while string_data.len() & 3 != 0 { + string_data.push(0); + } + + let total_size = super::LEVEL_HEADER_SIZE as usize + state_data_size + string_data.len(); + let mut result = Vec::::with_capacity(total_size); + + let state_data_base: u32 = super::LEVEL_HEADER_SIZE as u32; + let string_data_base: u32 = state_data_base + state_data_size as u32; + + result.extend(&state_data_base.to_le_bytes()); + result.extend(&string_data_base.to_le_bytes()); + result.extend(&nohyphen_string_offset.to_le_bytes()); + result.extend(&nohyphen_count.to_le_bytes()); + result.push(self.lh_min); + result.push(self.rh_min); + result.push(self.clh_min); + result.push(self.crh_min); + + result.extend(state_data.iter()); + result.extend(string_data.iter()); + + assert_eq!(result.len(), total_size); + + result + } +} + +/// Read a libhyphen-style pattern file and create the corresponding state +/// machine transitions, etc. +/// The returned Vec can be passed to write_hyf_file() to generate a flattened +/// representation of the state machine in mapped_hyph's binary format. 
+pub fn read_dic_file(dic_file: T) -> Vec { + let reader = BufReader::new(dic_file); + + let mut builders = Vec::::new(); + builders.push(LevelBuilder::new()); + let mut builder = &mut builders[0]; + + for (index, line) in reader.lines().enumerate() { + let mut trimmed = line.unwrap().trim().to_string(); + // Strip comments. + if let Some(i) = trimmed.find('%') { + trimmed = trimmed[..i].trim().to_string(); + } + // Ignore empty lines. + if trimmed.is_empty() { + continue; + } + // Uppercase indicates keyword rather than pattern. + if trimmed.as_bytes()[0] >= b'A' && trimmed.as_bytes()[0] <= b'Z' { + // First line is encoding; we only support UTF-8. + if builder.encoding.is_none() { + assert_eq!(trimmed, "UTF-8", "Only UTF-8 patterns are accepted!"); + builder.encoding = Some(trimmed); + continue; + } + // Check for valid keyword-value pairs. + if trimmed.contains(' ') { + let parts: Vec<&str> = trimmed.split(' ').collect(); + assert!(parts.len() == 2); + let keyword = parts[0]; + let value = parts[1]; + match keyword { + "LEFTHYPHENMIN" => builder.lh_min = value.parse::().unwrap(), + "RIGHTHYPHENMIN" => builder.rh_min = value.parse::().unwrap(), + "COMPOUNDLEFTHYPHENMIN" => builder.clh_min = value.parse::().unwrap(), + "COMPOUNDRIGHTHYPHENMIN" => builder.crh_min = value.parse::().unwrap(), + // Keep only the comma-separated value list: flatten() splits this string on + // commas, so storing the whole line would turn the keyword into part of the + // first nohyphen string. + "NOHYPHEN" => builder.nohyphen = Some(value.to_string()), + _ => println!("unknown keyword: {}", trimmed), + } + continue; + } + // Start a new hyphenation level? + if trimmed == "NEXTLEVEL" { + builders.push(LevelBuilder::new()); + builder = builders.last_mut().unwrap(); + continue; + } + println!("unknown keyword: {}", trimmed); + continue; + } + // Patterns should always be provided in lowercase; complain if not. + // (enumerate() counts from zero, so add one to report a 1-based line number.) + assert_eq!(trimmed, trimmed.to_lowercase(), "pattern \"{}\" not lowercased at line {}", trimmed, index + 1); + builder.add_pattern(&trimmed); + } + + // Create default first (compound-word) level if only one level was provided. + // (Maybe this should be optional?
Currently just copying libhyphen behavior.) + if builders.len() == 1 { + let (lh_min, rh_min, clh_min, crh_min) = + (builders[0].lh_min, builders[0].rh_min, builders[0].clh_min, builders[0].crh_min); + builders.insert(0, LevelBuilder::new()); + builder = builders.first_mut().unwrap(); + builder.add_pattern("1-1"); + builder.add_pattern("1'1"); + builder.add_pattern("1\u{2013}1"); // en-dash + builder.add_pattern("1\u{2019}1"); // curly apostrophe + builder.nohyphen = Some("',\u{2013},\u{2019},-".to_string()); + builder.lh_min = lh_min; + builder.rh_min = rh_min; + builder.clh_min = if clh_min > 0 { clh_min } else if lh_min > 0 { lh_min } else { 3 }; + builder.crh_min = if crh_min > 0 { crh_min } else if rh_min > 0 { rh_min } else { 3 }; + } + + // Put in fallback states in each builder. + for builder in &mut builders { + for (key, state_index) in builder.str_to_state.iter() { + if key.is_empty() { + continue; + } + let mut fallback_key = key.clone(); + while !fallback_key.is_empty() { + fallback_key.remove(0); + if builder.str_to_state.contains_key(&fallback_key) { + break; + } + } + builder.states[*state_index as usize].fallback_state = builder.str_to_state[&fallback_key]; + } + } + + // Merge duplicate states to reduce size. + for builder in &mut builders { + builder.merge_duplicate_states(); + } + + builders +} + +/// Write out the state machines representing a set of hyphenation rules +/// to the given output stream. +pub fn write_hyf_file(hyf_file: &mut T, levels: Vec) -> std::io::Result<()> { + let mut flattened = vec![]; + for level in levels { + flattened.push(level.flatten()); + } + // Write file header: magic number, count of levels. + hyf_file.write_all(&[b'H', b'y', b'f', b'0'])?; + let level_count: u32 = flattened.len() as u32; + hyf_file.write_all(&level_count.to_le_bytes())?; + // Write array of offsets to each level. First level will begin immediately + // after the array of offsets. 
+ let mut offset: u32 = super::FILE_HEADER_SIZE as u32 + 4 * level_count; + for flat in &flattened { + hyf_file.write_all(&offset.to_le_bytes())?; + offset += flat.len() as u32; + } + // Write the flattened data for each level. + for flat in &flattened { + hyf_file.write_all(&flat)?; + } + Ok(()) +} diff --git a/third_party/rust/mapped_hyph/src/ffi.rs b/third_party/rust/mapped_hyph/src/ffi.rs new file mode 100644 index 000000000000..6e37596699cd --- /dev/null +++ b/third_party/rust/mapped_hyph/src/ffi.rs @@ -0,0 +1,165 @@ +// Copyright 2019 Mozilla Foundation. See the COPYRIGHT +// file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use std::slice; +use std::str; +use std::ffi::CStr; +use std::os::raw::c_char; +use std::str::Utf8Error; + +use memmap::Mmap; + +use super::Hyphenator; + +/// Opaque type representing a hyphenation dictionary loaded from a file, +/// for use in FFI function signatures. +pub struct HyphDic; + +// Helper to convert word and hyphen buffer parameters from raw C pointer/length +// pairs to the Rust types expected by mapped_hyph. +unsafe fn params_from_c<'a>(word: *const c_char, word_len: u32, + hyphens: *mut u8, hyphens_len: u32) -> + (Result<&'a str, Utf8Error>, &'a mut [u8]) { + (str::from_utf8(slice::from_raw_parts(word as *const u8, word_len as usize)), + slice::from_raw_parts_mut(hyphens, hyphens_len as usize)) +} + +/// C-callable function to load a hyphenation dictionary from a file at `path`. +/// +/// Returns null on failure. +/// +/// This does not fully validate that the file contains usable hyphenation +/// data, it only opens the file (read-only) and mmap's it into memory, and +/// does some minimal sanity-checking that it *might* be valid. +/// +/// The returned `HyphDic` must be released with `mapped_hyph_free_dictionary`. 
+/// +/// # Safety +/// The given `path` must be a valid pointer to a NUL-terminated (C-style) +/// string. +#[no_mangle] +pub unsafe extern "C" fn mapped_hyph_load_dictionary(path: *const c_char) -> *const HyphDic { + let path_str = match CStr::from_ptr(path).to_str() { + Ok(str) => str, + Err(_) => return std::ptr::null(), + }; + let hyph = Box::new(match super::load_file(path_str) { + Some(dic) => dic, + _ => return std::ptr::null(), + }); + Box::into_raw(hyph) as *const HyphDic +} + +/// C-callable function to free a hyphenation dictionary +/// that was loaded by `mapped_hyph_load_dictionary`. +/// +/// A null `dic` is accepted and ignored. +/// +/// # Safety +/// The `dic` parameter must be null or a `HyphDic` pointer obtained from +/// `mapped_hyph_load_dictionary`, and not previously freed. +#[no_mangle] +pub unsafe extern "C" fn mapped_hyph_free_dictionary(dic: *mut HyphDic) { + // The loader returns null on failure; tolerate that value being handed back. + if dic.is_null() { + return; + } + // The pointer was produced from a boxed `Mmap`, so rebuild the box with that type. + // Reconstructing it as a box of the zero-sized `HyphDic` would neither drop the + // mapping nor release the allocation. + drop(Box::from_raw(dic as *mut Mmap)); +} + +/// C-callable function to find hyphenation values for a given `word`, +/// using a dictionary loaded via `mapped_hyph_load_dictionary`. +/// +/// The `word` must be UTF-8-encoded, and is `word_len` bytes (not characters) +/// long. +/// +/// Caller must supply the `hyphens` output buffer for results; its size is +/// given in `hyphens_len`. +/// It should be at least `word_len` elements long. +/// +/// Returns -1 if `word` is not valid UTF-8, or the output `hyphens` buffer is +/// too small. +/// Otherwise returns the number of potential hyphenation positions found. +/// +/// # Panics +/// This function may panic if the given dictionary is not valid. +/// +/// # Safety +/// The `dic` parameter must be a `HyphDic` pointer obtained from +/// `mapped_hyph_load_dictionary`. +/// +/// The `word` and `hyphens` parameters must be valid pointers to memory buffers +/// of at least the respective sizes `word_len` and `hyphens_len`.
+#[no_mangle] +pub unsafe extern "C" fn mapped_hyph_find_hyphen_values_dic(dic: *const HyphDic, + word: *const c_char, word_len: u32, + hyphens: *mut u8, hyphens_len: u32) -> i32 { + if word_len > hyphens_len { + return -1; + } + let (word_str, hyphen_buf) = params_from_c(word, word_len, hyphens, hyphens_len); + if word_str.is_err() { + return -1; + } + Hyphenator::new(&*(dic as *const Mmap)) + .find_hyphen_values(word_str.unwrap(), hyphen_buf) as i32 +} + +/// C-callable function to find hyphenation values for a given `word`, +/// using a dictionary loaded and owned by the caller. +/// +/// The dictionary is supplied as a raw memory buffer `dic_buf` of size +/// `dic_len`. +/// +/// The `word` must be UTF-8-encoded, and is `word_len` bytes (not characters) +/// long. +/// +/// Caller must supply the `hyphens` output buffer for results; its size is +/// given in `hyphens_len`. +/// It should be at least `word_len` elements long. +/// +/// Returns -1 if `word` is not valid UTF-8, or the output `hyphens` buffer is +/// too small. +/// Otherwise returns the number of potential hyphenation positions found. +/// +/// # Panics +/// This function may panic if the given dictionary is not valid. +/// +/// # Safety +/// The `dic_buf` parameter must be a valid pointer to a memory block of size +/// at least `dic_len`. +/// +/// The `word` and `hyphens` parameter must be valid pointers to memory buffers +/// of at least the respective sizes `word_len` and `hyphens_len`. 
+#[no_mangle] +pub unsafe extern "C" fn mapped_hyph_find_hyphen_values_raw(dic_buf: *const u8, dic_len: u32, + word: *const c_char, word_len: u32, + hyphens: *mut u8, hyphens_len: u32) -> i32 { + if word_len > hyphens_len { + return -1; + } + let (word_str, hyphen_buf) = params_from_c(word, word_len, hyphens, hyphens_len); + if word_str.is_err() { + return -1; + } + Hyphenator::new(slice::from_raw_parts(dic_buf, dic_len as usize)) + .find_hyphen_values(word_str.unwrap(), hyphen_buf) as i32 +} + +/// C-callable function to check if a given memory buffer `dic_buf` of size +/// `dic_len` is potentially usable as a hyphenation dictionary. +/// +/// Returns `true` if the given memory buffer looks like it may be a valid +/// hyphenation dictionary, `false` if it is clearly not usable. +/// +/// # Safety +/// The `dic_buf` parameter must be a valid pointer to a memory block of size +/// at least `dic_len`. +#[no_mangle] +pub unsafe extern "C" fn mapped_hyph_is_valid_hyphenator(dic_buf: *const u8, dic_len: u32) -> bool { + if dic_buf.is_null() { + return false; + } + let dic = Hyphenator::new(slice::from_raw_parts(dic_buf, dic_len as usize)); + dic.is_valid_hyphenator() +} diff --git a/third_party/rust/mapped_hyph/src/lib.rs b/third_party/rust/mapped_hyph/src/lib.rs new file mode 100644 index 000000000000..6f68da8a64a3 --- /dev/null +++ b/third_party/rust/mapped_hyph/src/lib.rs @@ -0,0 +1,640 @@ +// Copyright 2019 Mozilla Foundation. See the COPYRIGHT +// file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +#[macro_use] +extern crate arrayref; +extern crate memmap; + +use std::slice; +use std::str; +use std::cmp::max; +use std::fs::File; +use std::mem; + +use memmap::Mmap; + +// Make submodules available publicly. 
+pub mod builder; +pub mod ffi; + +// 4-byte identification expected at beginning of a compiled dictionary file. +// (This will be updated if an incompatible change to the format is made in +// some future revision.) +const MAGIC_NUMBER: [u8; 4] = [b'H', b'y', b'f', b'0']; + +const INVALID_STRING_OFFSET: u16 = 0xffff; +const INVALID_STATE_OFFSET: u32 = 0x00ff_ffff; + +const FILE_HEADER_SIZE: usize = 8; // 4-byte magic number, 4-byte count of levels +const LEVEL_HEADER_SIZE: usize = 16; + +// Transition actually holds a 24-bit new state offset and an 8-bit input byte +// to match. We will be interpreting byte ranges as Transition arrays (in the +// State::transitions() method below), so use repr(C) to ensure we have the +// memory layout we expect. +// Transition records do not depend on any specific alignment. +#[repr(C)] +#[derive(Debug,Copy,Clone)] +struct Transition(u8, u8, u8, u8); + +impl Transition { + fn new_state_offset(&self) -> usize { + // Read a 24-bit little-endian number from three bytes. + self.0 as usize + ((self.1 as usize) << 8) + ((self.2 as usize) << 16) + } + fn match_byte(&self) -> u8 { + self.3 + } +} + +// State is an area of the Level's data block that begins with a fixed header, +// followed by an array of transitions. The total size of each State's data +// depends on the number of transitions in the state. Only the basic header +// is defined by the struct here; the rest of the state is accessed via +// pointer magic. +// There are two versions of State, a basic version that supports only simple +// hyphenation (no associated spelling change), and an extended version that +// adds the replacement-string fields to support spelling changes at the +// hyphenation point. Check is_extended() to know which version is present. +// State records are NOT necessarily 4-byte aligned, so multi-byte fields +// should be read with care. 
+#[derive(Debug,Copy,Clone)] +#[repr(C)] +struct State { + fallback_state: [u8; 4], + match_string_offset: [u8; 2], + num_transitions: u8, + is_extended: u8, +} + +#[repr(C)] +struct StateExtended { + state: State, + repl_string_offset: [u8; 2], + repl_index: i8, + repl_cut: i8, +} + +impl State { + // Accessors for the various State header fields; see file format description. + fn fallback_state(&self) -> usize { + u32::from_le_bytes(self.fallback_state) as usize + } + fn match_string_offset(&self) -> usize { + u16::from_le_bytes(self.match_string_offset) as usize + } + fn num_transitions(&self) -> u8 { + self.num_transitions + } + fn is_extended(&self) -> bool { + self.is_extended != 0 + } + // Accessors that are only valid if is_extended() is true. + // These use `unsafe` to dereference a pointer to the relevant field; + // this is OK because Level::get_state always validates the total state size + // before returning a state reference, so these pointers will be valid for + // any extended state it returns. + #[allow(dead_code)] + fn as_extended(&self) -> &StateExtended { + debug_assert!(self.is_extended()); + unsafe { mem::transmute(self) } + } + #[allow(dead_code)] + fn repl_string_offset(&self) -> usize { + u16::from_le_bytes(self.as_extended().repl_string_offset) as usize + } + #[allow(dead_code)] + fn repl_index(&self) -> i8 { + self.as_extended().repl_index + } + #[allow(dead_code)] + fn repl_cut(&self) -> i8 { + self.as_extended().repl_cut + } + // Return the state's Transitions as a slice reference. + fn transitions(&self) -> &[Transition] { + let count = self.num_transitions() as usize; + if count == 0 { + return &[]; + } + let transition_offset = if self.is_extended() { mem::size_of::() } else { mem::size_of::() } as isize; + // We know the `offset` here will not look beyond the valid range of memory + // because Level::get_state() checks the state length (accounting for the + // number of transitions) before returning a State reference. 
+ let trans_ptr = unsafe { (self as *const State as *const u8).offset(transition_offset) as *const Transition }; + // Again, because Level::get_state() already checked the state length, we know + // this slice address and count will be valid. + unsafe { slice::from_raw_parts(trans_ptr, count) } + } + // Look up the Transition for a given input byte, or None. + fn transition_for(&self, b: u8) -> Option { + // The transitions array is sorted by match_byte() value, but there are + // usually very few entries; benchmarking showed that using binary_search_by + // here gave no benefit (possibly slightly slower). + self.transitions().iter().copied().find(|t| t.match_byte() == b) + } + // Just for debugging use... + #[allow(dead_code)] + fn deep_show(&self, prefix: &str, dic: &Level) { + if self.match_string_offset() != INVALID_STRING_OFFSET as usize { + let match_string = dic.string_at_offset(self.match_string_offset()); + println!("{}match: {}", prefix, str::from_utf8(match_string).unwrap()); + } + for t in self.transitions() { + println!("{}{} ->", prefix, t.match_byte() as char); + let next_prefix = format!("{} ", prefix); + dic.get_state(t.new_state_offset()).unwrap().deep_show(&next_prefix, &dic); + } + } +} + +// We count the presentation-form ligature characters U+FB00..FB06 as multiple +// chars for the purposes of lefthyphenmin/righthyphenmin. In UTF-8, all these +// ligature characters are 3-byte sequences beginning with <0xEF, 0xAC>; this +// helper returns the "decomposed length" of the ligature given its trailing +// byte. +fn lig_length(trail_byte: u8) -> usize { + // This is only called on valid UTF-8 where we already know trail_byte + // must be >= 0x80. 
+ // Ligature lengths: ff fi fl ffi ffl long-st st + const LENGTHS: [u8; 7] = [ 2u8, 2u8, 2u8, 3u8, 3u8, 2u8, 2u8 ]; + if trail_byte > 0x86 { + return 1; + } + LENGTHS[trail_byte as usize - 0x80] as usize +} + +fn is_utf8_trail_byte(byte: u8) -> bool { + (byte & 0xC0) == 0x80 +} + +fn is_ascii_digit(byte: u8) -> bool { + byte <= b'9' && byte >= b'0' +} + +fn is_odd(byte: u8) -> bool { + (byte & 0x01) == 0x01 +} + +// A hyphenation Level has a header followed by State records and packed string +// data. The total size of the slice depends on the number and size of the +// States and Strings it contains. +// Note that the data of the Level may not have any specific alignment! +#[derive(Debug,Copy,Clone)] +struct Level<'a> { + data: &'a [u8], + // Header fields cached by the constructor for faster access: + state_data_base_: usize, + string_data_base_: usize, +} + +impl Level<'_> { + // Constructor that initializes our cache variables. + fn new(data: &[u8]) -> Level { + Level { + data, + state_data_base_: u32::from_le_bytes(*array_ref!(data, 0, 4)) as usize, + string_data_base_: u32::from_le_bytes(*array_ref!(data, 4, 4)) as usize, + } + } + + // Accessors for Level header fields; see file format description. 
+ fn state_data_base(&self) -> usize { + self.state_data_base_ // cached by constructor + } + fn string_data_base(&self) -> usize { + self.string_data_base_ // cached by constructor + } + fn nohyphen_string_offset(&self) -> usize { + u16::from_le_bytes(*array_ref!(self.data, 8, 2)) as usize + } + #[allow(dead_code)] + fn nohyphen_count(&self) -> u16 { + u16::from_le_bytes(*array_ref!(self.data, 10, 2)) + } + fn lh_min(&self) -> usize { + max(1, self.data[12] as usize) + } + fn rh_min(&self) -> usize { + max(1, self.data[13] as usize) + } + fn clh_min(&self) -> usize { + max(1, self.data[14] as usize) + } + fn crh_min(&self) -> usize { + max(1, self.data[15] as usize) + } + fn word_boundary_mins(&self) -> (usize, usize, usize, usize) { + (self.lh_min(), self.rh_min(), self.clh_min(), self.crh_min()) + } + // Strings are represented as offsets from the Level's string_data_base. + // This returns a byte slice referencing the string at a given offset, + // or an empty slice if invalid. + fn string_at_offset(&self, offset: usize) -> &'_ [u8] { + if offset == INVALID_STRING_OFFSET as usize { + return &[]; + } + let string_base = self.string_data_base() as usize + offset; + // TODO: move this to the validation function. + debug_assert!(string_base < self.data.len()); + if string_base + 1 > self.data.len() { + return &[]; + } + let len = self.data[string_base] as usize; + // TODO: move this to the validation function. + debug_assert!(string_base + 1 + len <= self.data.len()); + if string_base + 1 + len > self.data.len() { + return &[]; + } + self.data.get(string_base + 1 .. string_base + 1 + len).unwrap() + } + // The nohyphen field actually contains multiple NUL-separated substrings; + // return them as a vector of individual byte slices. 
+ fn nohyphen(&self) -> Vec<&[u8]> { + let string_offset = self.nohyphen_string_offset(); + let nohyph_str = self.string_at_offset(string_offset as usize); + if nohyph_str.is_empty() { + return vec![]; + } + nohyph_str.split(|&b| b == 0).collect() + } + // States are represented as an offset from the Level's state_data_base. + // This returns a reference to the State at a given offset, or None if invalid. + fn get_state(&self, offset: usize) -> Option<&State> { + if offset == INVALID_STATE_OFFSET as usize { + return None; + } + debug_assert_eq!(offset & 3, 0); + let state_base = self.state_data_base() + offset; + // TODO: move this to the validation function. + debug_assert!(state_base + mem::size_of::() <= self.string_data_base()); + if state_base + mem::size_of::() > self.string_data_base() { + return None; + } + let state_ptr = &self.data[state_base] as *const u8 as *const State; + // This is safe because we just checked against self.string_data_base() above. + let state = unsafe { state_ptr.as_ref().unwrap() }; + let length = if state.is_extended() { mem::size_of::() } else { mem::size_of::() } + + mem::size_of::() * state.num_transitions() as usize; + // TODO: move this to the validation function. + debug_assert!(state_base + length <= self.string_data_base()); + if state_base + length > self.string_data_base() { + return None; + } + // This is safe because we checked the full state length against self.string_data_base(). + unsafe { state_ptr.as_ref() } + } + // Sets hyphenation values (odd = potential break, even = no break) in values[], + // and returns the change in the number of odd values present, so the caller can + // keep track of the total number of potential breaks in the word. + fn find_hyphen_values(&self, word: &str, values: &mut [u8], lh_min: usize, rh_min: usize) -> isize { + // Bail out immediately if the word is too short to hyphenate. 
+ if word.len() < lh_min + rh_min { + return 0; + } + let start_state = self.get_state(0); + let mut st = start_state; + let mut hyph_count = 0; + for i in 0 .. word.len() + 2 { + // Loop over the word by bytes, with a virtual '.' added at each end + // to match word-boundary patterns. + let b = if i == 0 || i == word.len() + 1 { b'.' } else { word.as_bytes()[i - 1] }; + loop { + // Loop to repeatedly fall back if we don't find a matching transition. + // Note that this could infinite-loop if there is a state whose fallback + // points to itself (or a cycle of fallbacks), but this would represent + // a table compilation error. + // (A potential validation function could check for fallback cycles.) + if st.is_none() { + st = start_state; + break; + } + let state = st.unwrap(); + if let Some(tr) = state.transition_for(b) { + // Found a transition for the current byte. Look up the new state; + // if it has a match_string, merge its weights into `values`. + st = self.get_state(tr.new_state_offset()); + if let Some(state) = st { + let match_offset = state.match_string_offset(); + if match_offset != INVALID_STRING_OFFSET as usize { + if state.is_extended() { + debug_assert!(false, "extended hyphenation not supported by this function"); + } else { + let match_str = self.string_at_offset(match_offset); + let offset = i + 1 - match_str.len(); + assert!(offset + match_str.len() <= word.len() + 2); + for (j, ch) in match_str.iter().enumerate() { + let index = offset + j; + if index >= lh_min && index <= word.len() - rh_min { + // lh_min and rh_min are guaranteed to be >= 1, + // so this will not try to access outside values[]. 
+ let old_value = values[index - 1]; + let value = ch - b'0'; + if value > old_value { + if is_odd(old_value) != is_odd(value) { + // Adjust hyph_count for the change we're making + hyph_count += if is_odd(value) { 1 } else { -1 }; + } + values[index - 1] = value; + } + } + } + } + } + } + // We have handled the current input byte; leave the fallback loop + // and get next input. + break; + } + // No transition for the current byte; go to fallback state and try again. + st = self.get_state(state.fallback_state()); + } + } + + // If the word was not purely ASCII, or if the word begins/ends with + // digits, the use of lh_min and rh_min above may not have correctly + // excluded enough positions, so we need to fix things up here. + let mut index = 0; + let mut count = 0; + let word_bytes = word.as_bytes(); + let mut clear_hyphen_at = |i| { if is_odd(values[i]) { hyph_count -= 1; } values[i] = 0; }; + // Handle lh_min. + while count < lh_min - 1 && index < word_bytes.len() { + let byte = word_bytes[index]; + clear_hyphen_at(index); + if byte < 0x80 { + index += 1; + if is_ascii_digit(byte) { + continue; // ASCII digits don't count + } + } else if byte == 0xEF && word_bytes[index + 1] == 0xAC { + // Unicode presentation-form ligature characters, which we count as + // multiple chars for the purpose of lh_min/rh_min, all begin with + // 0xEF, 0xAC in UTF-8. + count += lig_length(word_bytes[index + 2]); + clear_hyphen_at(index + 1); + clear_hyphen_at(index + 2); + index += 3; + continue; + } else { + index += 1; + while index < word_bytes.len() && is_utf8_trail_byte(word_bytes[index]) { + clear_hyphen_at(index); + index += 1; + } + } + count += 1; + } + + // Handle rh_min. 
+ count = 0; + index = word.len(); + while count < rh_min && index > 0 { + index -= 1; + let byte = word_bytes[index]; + if index < word.len() - 1 { + clear_hyphen_at(index); + } + if byte < 0x80 { + // Only count if not an ASCII digit + if !is_ascii_digit(byte) { + count += 1; + } + continue; + } + if is_utf8_trail_byte(byte) { + continue; + } + if byte == 0xEF && word_bytes[index + 1] == 0xAC { + // Presentation-form ligatures count as multiple chars. + count += lig_length(word_bytes[index + 2]); + continue; + } + count += 1; + } + + hyph_count + } +} + +/// Hyphenation engine encapsulating a language-specific set of patterns (rules) +/// that identify possible break positions within a word. +pub struct Hyphenator<'a>(&'a [u8]); + +impl Hyphenator<'_> { + /// Return a Hyphenator that wraps the given buffer. + /// This does *not* check that the given buffer is in fact a valid hyphenation table. + /// Use is_valid_hyphenator() to determine whether it is usable. + /// (Calling hyphenation methods on a Hyphenator that wraps arbitrary, + /// unvalidated data is not unsafe, but may panic.) + pub fn new(buffer: &[u8]) -> Hyphenator { + Hyphenator(buffer) + } + + // Internal implementation details + fn magic_number(&self) -> &[u8] { + &self.0[0 .. 4] + } + fn num_levels(&self) -> usize { + u32::from_le_bytes(*array_ref!(self.0, 4, 4)) as usize + } + fn level(&self, i: usize) -> Level { + let offset = u32::from_le_bytes(*array_ref!(self.0, FILE_HEADER_SIZE + 4 * i, 4)) as usize; + let limit = if i == self.num_levels() - 1 { + self.0.len() + } else { + u32::from_le_bytes(*array_ref!(self.0, FILE_HEADER_SIZE + 4 * i + 4, 4)) as usize + }; + debug_assert!(offset + LEVEL_HEADER_SIZE <= limit && limit <= self.0.len()); + debug_assert_eq!(offset & 3, 0); + debug_assert_eq!(limit & 3, 0); + Level::new(&self.0[offset .. limit]) + } + + /// Identify acceptable hyphenation positions in the given `word`. 
+ /// + /// The caller-supplied `values` must be at least as long as the `word`. + /// + /// On return, any elements with an odd value indicate positions in the word + /// after which a hyphen could be inserted. + /// + /// Returns the number of possible hyphenation positions that were found. + /// + /// # Panics + /// If the given `values` slice is too small to hold the results. + /// + /// If the block of memory represented by `self.0` is not in fact a valid + /// hyphenation dictionary, this function may panic with an overflow or + /// array bounds violation. + pub fn find_hyphen_values(&self, word: &str, values: &mut [u8]) -> isize { + assert!(values.len() >= word.len()); + values.iter_mut().for_each(|x| *x = 0); + let top_level = self.level(0); + let (lh_min, rh_min, clh_min, crh_min) = top_level.word_boundary_mins(); + if word.len() < lh_min + rh_min { + return 0; + } + let mut hyph_count = top_level.find_hyphen_values(word, values, lh_min, rh_min); + let compound = hyph_count > 0; + // Subsequent levels are applied to fragments between potential breaks + // already found: + for l in 1 .. self.num_levels() { + let level = self.level(l); + if hyph_count > 0 { + let mut begin = 0; + let mut lh = lh_min; + // lh_min and rh_min are both guaranteed to be greater than zero, + // so this loop will not reach fully to the end of the word. + for i in lh_min - 1 .. word.len() - rh_min { + if is_odd(values[i]) { + if i > begin { + // We've found a component of a compound; + // clear the corresponding values and apply the new level. + // (These values must be even, so hyph_count is unchanged.) + values[begin .. i].iter_mut().for_each(|x| { + *x = 0; + }); + hyph_count += level.find_hyphen_values(&word[begin ..= i], + &mut values[begin ..= i], + lh, crh_min); + } + begin = i + 1; + lh = clh_min; + } + } + if begin == 0 { + // No compound-word breaks were found, just apply level to the whole word. 
+ hyph_count += level.find_hyphen_values(word, values, lh_min, rh_min); + } else if begin < word.len() { + // Handle trailing component of compound. + hyph_count += level.find_hyphen_values(&word[begin .. word.len()], + &mut values[begin .. word.len()], + clh_min, rh_min); + } + } else { + hyph_count += level.find_hyphen_values(word, values, lh_min, rh_min); + } + } + + // Only need to check nohyphen strings if top-level (compound) breaks were found. + if compound && hyph_count > 0 { + let nohyph = top_level.nohyphen(); + if !nohyph.is_empty() { + for i in lh_min ..= word.len() - rh_min { + if is_odd(values[i - 1]) { + for nh in &nohyph { + if i + nh.len() <= word.len() && *nh == &word.as_bytes()[i .. i + nh.len()] { + values[i - 1] = 0; + hyph_count -= 1; + break; + } + if nh.len() <= i && *nh == &word.as_bytes()[i - nh.len() .. i] { + values[i - 1] = 0; + hyph_count -= 1; + break; + } + } + } + } + } + } + + hyph_count + } + + /// Generate the hyphenated form of a `word` by inserting the given `hyphen_char` + /// at each valid break position. + /// + /// # Panics + /// If the block of memory represented by `self` is not in fact a valid + /// hyphenation dictionary, this function may panic with an overflow or + /// array bounds violation. + /// + /// Also panics if the length of the hyphenated word would overflow `usize`. + pub fn hyphenate_word(&self, word: &str, hyphchar: char) -> String { + let mut values = vec![0u8; word.len()]; + let hyph_count = self.find_hyphen_values(word, &mut values); + if hyph_count <= 0 { + return word.to_string(); + } + // We know how long the result will be, so we can preallocate here. 
+ let result_len = word.len() + hyph_count as usize * hyphchar.len_utf8(); + let mut result = String::with_capacity(result_len); + let mut n = 0; + for ch in word.char_indices() { + if ch.0 > 0 && is_odd(values[ch.0 - 1]) { + result.push(hyphchar); + n += 1; + } + result.push(ch.1); + } + debug_assert_eq!(n, hyph_count); + debug_assert_eq!(result_len, result.len()); + result + } + + /// Check if the block of memory looks like it could be a valid hyphenation + /// table. + pub fn is_valid_hyphenator(&self) -> bool { + // Size must be at least 4 bytes for magic_number + 4 bytes num_levels; + // smaller than this cannot be safely inspected. + if self.0.len() < FILE_HEADER_SIZE { + return false; + } + if self.magic_number() != MAGIC_NUMBER { + return false; + } + // For each level, there's a 4-byte offset in the header, and the level + // has its own 16-byte header, so we can check a minimum size again here. + let num_levels = self.num_levels(); + if self.0.len() < FILE_HEADER_SIZE + LEVEL_HEADER_SIZE * num_levels { + return false; + } + // Check that state_data_base and string_data_base for each hyphenation + // level are within range. + for l in 0 .. num_levels { + let level = self.level(l); + if level.state_data_base() < LEVEL_HEADER_SIZE || + level.state_data_base() > level.string_data_base() || + level.string_data_base() > level.data.len() { + return false; + } + // TODO: consider doing more extensive validation of states and + // strings within the level? + } + // It's still possible the dic is internally broken, but at least it's + // worth trying to use it! + true + } +} + +/// Load the compiled hyphenation file at `dic_path`, if present. +/// +/// Returns `None` if the specified file cannot be opened or mapped, +/// otherwise returns a `memmap::Mmap` mapping the file. +/// +/// # Safety +/// +/// This is unsafe for the same reason Mmap::map() is unsafe: +/// mapped_hyph does not guarantee safety if the mapped file is modified +/// (e.g. 
by another process) while we're using it. +/// +/// This verifies that the file looks superficially like it may be a +/// compiled hyphenation table, but does *not* fully check the validity +/// of the file contents! Calling hyphenation functions with the returned +/// data is not unsafe, but may panic if the data is invalid. +pub unsafe fn load_file(dic_path: &str) -> Option { + let file = File::open(dic_path).ok()?; + let dic = Mmap::map(&file).ok()?; + let hyph = Hyphenator(&*dic); + if hyph.is_valid_hyphenator() { + return Some(dic); + } + None +} diff --git a/third_party/rust/mapped_hyph/src/main.rs b/third_party/rust/mapped_hyph/src/main.rs new file mode 100644 index 000000000000..acc24babee2e --- /dev/null +++ b/third_party/rust/mapped_hyph/src/main.rs @@ -0,0 +1,67 @@ +// Copyright 2019 Mozilla Foundation. See the COPYRIGHT +// file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. 
+ +extern crate mapped_hyph; + +use mapped_hyph::Hyphenator; + +fn main() { + let dic_path = "hyph_en_US.hyf"; + + let dic = match unsafe { mapped_hyph::load_file(dic_path) } { + Some(dic) => dic, + _ => panic!("failed to load dictionary {}", dic_path), + }; + let hyph = Hyphenator::new(&*dic); + + println!("{}", hyph.hyphenate_word("haha", '-')); + println!("{}", hyph.hyphenate_word("hahaha", '-')); + println!("{}", hyph.hyphenate_word("photo", '-')); + println!("{}", hyph.hyphenate_word("photograph", '-')); + println!("{}", hyph.hyphenate_word("photographer", '-')); + println!("{}", hyph.hyphenate_word("photographic", '-')); + println!("{}", hyph.hyphenate_word("photographical", '-')); + println!("{}", hyph.hyphenate_word("photographically", '-')); + println!("{}", hyph.hyphenate_word("supercalifragilisticexpialidocious", '-')); + println!("{}", hyph.hyphenate_word("o'dwyer", '=')); + println!("{}", hyph.hyphenate_word("o'callahan", '=')); + println!("{}", hyph.hyphenate_word("o’dwyer", '=')); + println!("{}", hyph.hyphenate_word("o’callahan", '=')); + println!("{}", hyph.hyphenate_word("petti-fogging", '=')); + println!("{}", hyph.hyphenate_word("e-mailing", '=')); + println!("{}", hyph.hyphenate_word("-x-mailing", '=')); + println!("{}", hyph.hyphenate_word("-strikeout-", '=')); + + let dic2 = match unsafe { mapped_hyph::load_file("tests/compound.hyf") } { + Some(dic) => dic, + _ => panic!("failed to load dictionary {}", "tests/compound.hyf"), + }; + + let h2 = Hyphenator::new(&*dic2); + println!("{}", h2.hyphenate_word("motorcycle", '=')); + + let dic3 = match unsafe { mapped_hyph::load_file("tests/rhmin.hyf") } { + Some(dic) => dic, + _ => panic!("failed to load dictionary {}", dic_path), + }; + let h3 = Hyphenator::new(&*dic3); + println!("{}", h3.hyphenate_word("övéit", '=')); + println!("{}", h3.hyphenate_word("అంగడిధర", '=')); + + let dic4 = match unsafe { mapped_hyph::load_file("tests/num.hyf") } { + Some(dic) => dic, + _ => panic!("failed to load 
dictionary {}", "tests/num.hyf"), + }; + let h4 = Hyphenator::new(&*dic4); + + println!("{}", h4.hyphenate_word("123foobar123", '=')); + println!("{}", h4.hyphenate_word("123foobarfoobar", '=')); + println!("{}", h4.hyphenate_word("foobarfoobar123", '=')); + println!("{}", h4.hyphenate_word("123foobarfoobar123", '=')); +} diff --git a/third_party/rust/mapped_hyph/tests/base.hyf b/third_party/rust/mapped_hyph/tests/base.hyf new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/third_party/rust/mapped_hyph/tests/base.hyph b/third_party/rust/mapped_hyph/tests/base.hyph new file mode 100644 index 000000000000..550c57c9ad89 --- /dev/null +++ b/third_party/rust/mapped_hyph/tests/base.hyph @@ -0,0 +1,4543 @@ +aarhus +abase +abate +abbeys +abby +abducts +aber=ra=tions +ab=hor=rer +abil=i=ties +ab=jur=ing +ablest +abodes +abo=li=tion=ist +abor=tion +about +abram +abridged +abruptly +ab=sconds +ab=sently +ab=solved +ab=sorp=tion +ab=sti=nence +ab=strac=tor +abun=dance +abuts +abyssinian +aca=pulco +ac=cel=er=a=tor +ac=cen=tu=ated +ac=cepted +ac=ces=si=bil=ity +ac=ci=den=tal +ac=cli=mated +ac=com=mo=dat=ing +ac=com=pa=ny=ing +ac=com=plish=ments +ac=cords +ac=coun=tant +ac=cre=tion +ac=cul=tur=a=tion +ac=cu=racy +ac=cused +aces +achieve +acid +ac=knowl=edge=able +acme +acous=tics +ac=qui=es=cent +ac=quis=i=tive +acres +acrop=o=lis +acti=nome=ters +ac=ti=va=tors +ac=tors +ac=tu=ar=ial +acute +ada=gios +adap=ta=tion +adapts +ad=dict=ing +ad=di=tions +ad=dresser +ad=duc=ing +aden +ad=her=ents +adi=a=bat=i=cally +ad=join=ing +ad=judg=ing +ad=jured +ad=just=ment +ad=min=is=ter +ad=min=is=tra=tively +ad=mire +ad=mis=sions +ad=mixed +ad=mo=ni=tions +adopted +adore +adrian +ad=sorbs +adul=terer +ad=um=brat=ing +ad=van=ta=geous +ad=ven=tur=ers +ad=versely +ad=ver=tises +ad=visees +ad=vo=cacy +aer=ate +aer=obac=ter +aerosols +af=fairs +af=fec=tions +af=fil=i=at=ing +af=firmed +af=flic=tion +af=fords +afghans +afore=thought +african=izes +af=ter=im=age +af=ter=ward +age 
+ager +ag=glu=ti=nated +ag=gra=va=tion +ag=gres=sive +ag=ile +ag=i=ta=tor +ag=o=nies +agree=ably +agri=cul=tur=ally +aide +ail=ing +aims +air=drops +air=foil +air=line +air=planes +air=tight +akin +alamo +alas +al=ba=tross +al=bums +al=co=holism +aldrich +alert=ing +alexan=dria +alga +al=ge=rian +al=go=rithms +ali=cia +aligned +al=is=tair +al=lan +al=leges +al=le=gory +al=ler=gic +al=ley=way +al=lit=er=a=tion +al=lo=ca=tor +al=lots +al=low=ing +al=lure=ment +al=maden +al=nico +aloof=ness +al=pha=bet=ized +al=sa=tian +al=ter=ations +al=ter=nates +al=thaea +al=tru=is=ti=cally +alve=o=lar +amal=ga=mate +amass +amaze +ama=zons +am=bigu=ous +am=bled +am=bushed +amend +amer=ica +amer=i=cans +amide +am=mo=nia +among +amor=tized +amour +am=phib=ians +am=pli=fiers +am=pu=tated +amuse=ment +an=abap=tist +ana=gram +ana=logue +an=a=lyt=ic=i=ties +anaphoric +anas=to=moses +anatomy +an=chorite +an=dalu=sia +an=dover +anec=dote +anes=thetic +an=ge=leno +an=gered +an=glia +an=gola +an=gu=lar +an=i=mated +an=i=mism +anita +an=napo=lis +an=ni=hi=lated +an=no=ta=tion +an=noy +an=nu=ally +an=nuls +an=odes +anoma=lously +anselm +ant +an=tag=o=nizes +an=te=date +an=tholo=gies +an=thro=po=mor=phi=cally +an=tic=i=pates +an=ti=dotes +an=ti=mony +an=ti=quate +an=ti=semitism +an=ti=thet=i=cal +an=to=nio +anx=ious +any=way +ap=a=thy +apha=sia +api=ary +apoc=ryphal +apol=o=gist +apos=tolic +ap=pall +ap=par=ently +ap=pear +ap=pease=ment +ap=pended +ap=per=tains +ap=plauds +ap=pli=ca=ble +ap=plier +ap=pointer +ap=por=tion=ing +ap=prais=ers +ap=pre=ci=a=tion +ap=pre=hen=sively +ap=proach +ap=pro=pri=ate +ap=proval +ap=prox=i=mated +april +aptly +aquifer +ara=bi=ans +aramco +ar=bi=trat=ing +ar=cades +ar=chaism +arche=ol=o=gist +archimedes +ar=chi=tec=tures +arc=ing +ar=dently +are=quipa +ar=gos +ar=gu=ment +arid=ity +aris=to=crat +arith=me=tize +arm +arm=chairs +arm=ing +armpits +arousal +ar=rack +ar=range=ment +ar=rears +ar=rhe=nius +ar=ro=gate +ar=royo +ar=te=rial +arthri=tis +ar=tic=u=lately 
+ar=ti=fact +ar=tillerist +arts +as=cen=dant +as=cent +as=cot +ashamedly +ash=tray +asi=at=ics +ask=ing +as=per=sions +as=pi=ra=tion +ass +as=sas=si=nated +as=say +as=sem=blies +as=serter +as=sess +as=siduity +as=sign=ing +as=sist +as=so=ciate +as=so=ci=a=tor +as=suaged +as=sure +as=syr=i=an=ize +as=ter=oid +as=ton=ish=ingly +astride +as=tro=nom=i=cally +asym=met=ric +asyn=chronously +athe=ism +ath=letes +at=las +at=om=iza=tion +atone=ment +at=ro=phies +at=tach=ing +at=tain +at=tempt +at=ten=dants +at=ten=tion=al=ity +at=ten=u=a=tor +at=tired +at=tracted +at=tributable +at=tune +auburn +au=di=bly +au=diome=ter +au=di=tions +auger +au=gust +au=ral +aus=cul=tated +aus=terely +aus=tri=an=ize +au=then=ti=ca=tor +au=thor=i=ties +au=thors +au=to=cor=re=late +au=todecre=ments +au=toin=dex +au=toma=ton +au=topi=lot +au=tum=nal +availer +avari=cious +av=enues +avers +avian +avionic +avoid=able +avow +awak=ened +awards +aw=ful=ness +awry +ax=i=o=log=i=cal +ax=ioms +ayes +azure +ba=belizes +baby=ing +bac=chus +back=bend +back=fill +back=o=rder +backscat=ters +back=stitch +back=tracks +back=yard +bad=ger +baf=fle +bag=gage +bagro=dia +bailiff +baits +bakes +bal=ancers +bald=win +balka=niza=tion +balks +baller +bal=loon +ballplayer +bal=sam +bam=boo +ban=dage +band=pass +bane +ban=gui +bank +bankrupts +bans +bap=tism +bap=tized +bar=barism +bar=bells +bards +barest +barhop +barks +barn=hard +barom=e=ters +barr +bar=ren +bar=ron +barter +basalt +base=less +bash +ba=sics +bas=ket=ball +bassinets +batavia +bather +bath=tub +bat=ted +bat=ting +bat=tle=ments +baude=laire +bawl=ing +bay=o=net +be +beaded +beaker +bean=bag +bearded +beast +be=at=i=fi=ca=tion +beau +beau=ti=fied +beavers +becker +be=com=ingly +bed=der +bed=post +bed=spread +beecham +beefy +beethoven +be=fell +be=foul +be=fud=dles +beg=gary +be=got=ten +be=guil=ing +be=hav=ior=ism +be=hold +be=ing +be=lay +bel=fry +be=liev=able +be=lit=tles +belles +bel=liger=ents +bells +be=long +belt=ing +be=moans +bend=able 
+bene=dic=tions +ben=e=fi=ciary +ben=gal +bent +be=queath +be=rat=ing +beres=ford +berib=boned +berlin=ers +bernar=dine +bernoulli +bertie +be=sets +be=smirched +be=spoke +best=ing +bet +be=trayed +bette +be=tween +be=wail +be=wil=der=ment +bianco +bibles +bi=car=bon=ate +bi=con=vex +bid=der +bi=en=nial +big=ger +bi=har=monic +bi=l=abial +bilk +bil=let +billings +bimet=allism +bind +bing=ham=ton +bio=chem=istry +bi=o=log=i=cally +biopsy +bipeds +bird=baths +birm=ing=hamize +births +bi=sec=tors +bisques +bites +bit=terly +bi=valves +blab=ber=mouths +black=burn +black=foots +black=mailed +blacks +blaine +blamers +bland +blan=keters +blares +blas=phe=mous=ness +blatz +bleach=ers +bleat=ing +blem=ishes +bless=ings +blind=fold +blinked +bliss=fully +bliz=zard +bloch +block=ers +blond +blood=i=est +bloom +blos=soms +blow=fish +blud=geons +blueprint +bluish +blunted +blur=ring +blush=ing +boarded +boaster +boathouse +boatswain +bobb=sey +bo=den=heim +body=build=ing +bog=art +bo=gus +boil=ers +bold=face +bol=she=vist +bolton +bom=bas=tic +bo=nan=zas +bonds=man +bon=ham +bon=tempo +book=cases +book=keep=ers +book=store +booms +booster +boo=tle +boot=strap=ping +bor=den +bo=re=alis +born +bor=row=ers +bosses +botanist +bother +bot=tler +bo=tulism +bounce +bounden +bou=quet +bou=tique +bowd=ler=iz=ing +bowl +bow=string +box=ing +boyfriend +braced +brad=bury +brag=ger +braille +brain=storm +brakes +branch=ings +bran=dish=ing +brashly +braun +brav=ing +bray=ing +brazil +bread +bread=win=ners +break=fast +break=through +breast=works +breath=lessly +breed=ing +bren=nan +brevet +brew=ery +bribers +brick=lay=ers +bridge +bridge=work +briefed +brig +brighten +brighton +brim=ming +brings +bris=tle +britisher +broaches +broad=casts +broadly +broglie +bro=ken=ness +bronchial +brooch +brook=field +broth +brow=beat +brow=n=ian +bruce +brunette +brush=ing +bru=tal=ized +bryce +buch=wald +buck=ler +bucky +bud=dies +bud=geters +buff +buf=fet=ings +bug=ger +bugs +built +bulging +bull=doze 
+bull=frog +bul=ly=ing +bum=bling +bump=tious +bun=dle +bun=gler +bunkhouse +bunted +buoys +bu=reau=cracy +burgher +bur=glarproof=ing +burke +burn +burn=ings +burnt=ness +bur=row=ing +bursty +busch +bush=whacked +busi=nesslike +bus=tards +butchered +but=ter=cup +but=ter=nut +but=ton=holes +bu=tyrate +buz=zard +bye +by=pass=ing +by=stander +byzan=tinizes +cab=i=net +cache +cac=tus +cae=sarize +cager +ca=jole +calais +cal=cu=late +cal=cu=lus +cal=gary +cal=ico +callaghan +cal=loused +calm=ingly +cal=tech +ca=lypso +camem=bert +camino +cam=paign=ing +camps +cana=di=an=ize +can=celed +can=di=dacy +can=dler +ca=nine +can=nery +can=non +canon=i=cal +canopy +canto +can=vassed +ca=pa=ble +ca=pac=i=tors +capita +cap=i=tal=iz=ers +cap=ping +cap=stone +cap=ti=vates +cap=turer +car=a=vans +car=bon=dale +car=boniz=ing +card=board +car=di=ol=ogy +care=fully +ca=ress=ing +car=i=ca=ture +carls=bad +car=na=tion +car=o=line +car=pen=ters +car=riages +car=ruthers +carter +car=ton +carve +cas=cades +cashed +cas=ings +cas=sette +castes +casts +catalina +cat=a=pult +catches +cat=e=go=rizes +cathe=dral +catholi=cisms +cat=tle +caul=drons +causer +cau=tioner +cav=a=lier=ness +cav=ernous +caw=ing +ce=cil +celanese +celer=ity +cel=list +celti=cizes +cen=sor=ing +cen=taur +cen=time=ter +cen=tral=ized +cen=troid +cere=bral +cer=tain=ties +cer=ti=fies +cezanne +chaf=fey +chair=ing +chal=ices +chal=leng=ing +cham=paign +chan=cel=lor +change=abil=ity +chan=neled +chanter +chapel +chap=ter +char=ac=ter=ize +charge=able +char=i=ta=ble +char=lotte +chars +chart=ings +chas=ing +chas=tis=ers +chat=tel +chauf=feured +cheaply +check=book +check=out +cheek=bone +cheer=i=ness +cheeses +chemise +cher=ishes +cheryl +chests +cheyennes +chi=canos +chides +child=hood +chill +chime +chi=nas +chin=ning +chi=ro=prac=tor +chit +chloro=plasts +choir +choose +chop=ping +chore=o=graph +chou +chris=ten=son +chris=tian=iz=ing +christoph +chron=i=cle +chronol=ogy +chuck=les +church=go=ing +churn +ci=cero=ni=an=ize 
+cin=derella +ci=pher=texts +cir=cuitously +cir=cu=lat=ing +cir=cum=nav=i=gates +cir=cum=stanced +cir=cuses +cities +civet +civ=i=lized +claimed +clam=bers +clams +clap=board +clar=i=fi=ca=tions +clash +classes +clas=si=fiers +clat=tered +claus=tro=pho=bia +cleaned +cleansed +clearer +cleaved +clemente +clerked +cliches +cliffs +climb +clincher +clink +clip=pers +cloaks +clock=ings +clogs +close=ness +clos=ing +cloth=ing +cloud=ing +clowns +clucks +clumsy +clutch=ing +coaches +coali=tion +coastal +coat=ing +coax=ing +cob=web +cock=pit +co=coon +codes +cod=i=fies +co=ef=fi=cient +co=ex=ist +cof=fer +cog=i=tated +cogs +co=her=ing +coils +co=in=cid=ing +colder +col=icky +col=lab=o=ra=tor +col=lared +col=lect=ing +col=leges +collins +colom=bia +colonies +colons +col=or=less +colum=nize +com=bated +com=bi=na=tor +comb=ings +comedic +cometary +com=fort=ing +comma +com=mand=ment +com=mem=o=ra=tive +com=mended +com=ment=ing +com=mis=sion=ers +com=mit=teemen +com=mon=al=i=ties +com=mon=wealth +com=mu=ni=cated +com=mu=nists +com=mut=ing +com=pactors +com=pa=ra=bly +com=par=i=son +com=pas=sion +com=pelling +com=pen=satory +com=pe=ti=tions +com=pil=ers +com=plaint +com=pleted +com=plex=i=ties +com=pli=ca=tions +com=pli=ment=ing +com=pos=edly +com=post +com=pre=hen=si=bil=ity +com=pres=sion +com=pro=mis=ers +com=pul=sory +com=puted +com=radely +con=cate=na=tion +con=cede +con=ceived +con=cen=tra=tors +con=cep=tu=al=ized +con=certed +con=cise=ness +con=coct +con=cretes +con=cur=ring +con=demns +con=di=tional +con=doned +con=duc=tion +con=fec=tionery +con=ferred +con=fes=sions +con=fi=den=tial +con=fig=ure +con=fin=ing +con=fis=cates +con=fo=cal +con=found=ing +con=fu=cian +con=fu=sion +congo +con=gre=gat=ing +con=gress=women +con=joined +con=junc=ture +con=nected +con=nec=tor +con=nors +con=quered +con=rail +con=se=crate +con=sent=ing +con=ser=va=tion +con=served +con=sid=ered +con=sis=tent +con=sol=ers +con=so=nants +con=spir=a=tor +con=stant +con=stituent +con=sti=tu=tions 
+con=structed +con=structs +con=sul=tant +con=sumed +con=sump=tions +con=tain +con=tam=i=nated +con=tem=pla=tive +con=tender +con=tent=ment +con=text +con=ti=nents +con=tin=u=a=tions +con=tor=tions +con=tract=ing +con=tra=dict=ing +con=trap=tions +con=tribute +con=trite +con=trol=la=bil=ity +con=tro=versy +con=vened +con=ven=tion=ally +con=ver=santly +con=ver=sion +con=vex +con=vict +con=vinces +con=voys +cooked +cool=ers +coon +co=op=er=a=tions +co=or=di=nates +copeland +cop=ings +co=pro=ces=sor +co=quette +cords +corinthian +corks +cor=nered +corns +coro=nary +cor=po=rately +cor=rect +cor=rect=ness +cor=re=spond +cor=ri=dors +cor=rob=o=ra=tive +cor=rup=tion +cor=val=lis +cos=mopoli=tan +costs +cots +cotyle=don +coughs +coun=cil=woman +coun=selors +coun=ter=act=ing +coun=ter=feited +coun=ter=part +coun=ter=sunk +coun=try=wide +cou=plings +courser +cour=te=sies +court=rooms +covenant +cov=er=let +cov=etous=ness +cower +cowl +coypu +crack=ers +cra=dles +craftsper=son +cramps +crank +cranny +crater +craw=ford +craze +creaked +creams +cre=ation +cre=dence +cred=i=tor +creeks +cre=mates +cres=cents +cretin +cricket +crim=i=nal +crip=ple +criss=cross +crit=i=cizes +croaks +cro=cus +crop=per +crosser +crosstalk +crowd +crown=ing +cru=ci=fix=ion +cruel +cruis=ing +crum=pled +cru=sade +crushes +crux +cryp=tic +crys=tal=lize +cubans +cu=cum=bers +cuf=flink +cul=mi=nate +cul=tivable +cul=tural +cum=mings +cup=board +curb +cur=ing +curlers +cur=rent +cur=ry=ing +cur=sory +curtly +curv=ing +custer +cus=tomiz=able +cut +cuts +cyanamid +cycli=cally +cygnus +cy=press +cy=to=plasm +dab=bles +dadais=tic +dahl +dairy +dali +dam=ages +damns +damsel +danc=ing +dan=gle +danize +dare +darken +darn +darted +dar=winizes +database +dates +daunted +davy +day=dreams +daz=zled +deaden +deaf +deal=ings +deanna +death +de=bater +de=bil=i=tates +debtor +debu=tante +de=cay +de=ceit +de=cel=er=ate +de=cent +de=cid=abil=ity +dec=i=mate +de=ci=sion +decks +de=clarer +de=clin=ers +de=cod=ings 
+de=com=po=si=tion +dec=o=ra=tive +de=creases +decre=ments +ded=i=cated +deduct +deed=ing +deep +deere +de=feats +de=fen=dant +de=fen=es=trat=ing +de=fer=ments +de=fi=cien=cies +de=fine +def=i=ni=tions +de=for=ma=tion +defy +degra=da=tion +de=ify +de=jected +de=lay=ing +deleter +de=lib=er=ated +del=i=ca=cies +de=light=ful +de=lim=it=ing +deliri=ous +de=liv=er=ies +del=phic +del=uged +de=mand +deme=ter +de=mod=u=late +demons +demon=stra=tor +de=mul=ti=plex +denebola +den=i=grates +de=nom=i=na=tors +de=not=ing +dens=est +den=tists +deny=ing +de=par=ture +de=pen=dent +de=pleted +de=ploy +de=pose +de=pos=i=tors +de=pre=ci=ated +de=pri=va=tions +de=queued +dereg=u=late +de=rive +de=scend +de=scents +de=scrip=tively +de=sert=ers +de=serv=ings +des=ig=na=tor +de=sire +des=o=late +despatched +de=spite +desta=bi=lize +de=stroyed +de=struc=tive=ness +de=tacher +de=tained +de=tec=tive +de=te=ri=o=rated +de=ter=mi=na=tion +de=ter=min=is=tic +de=trac=tor +dev=as=tate +de=vel=op=ment +de=vi=a=tion +de=vised +de=vot=edly +de=vours +dexedrine +di=ag=nose +di=ag=o=nals +dial +di=a=logue +di=a=mond +di=ar=rhea +dick=in=son +dic=ta=to=rial +did=dle +dies +di=et=rich +dif=fer=en=tials +dif=fer=ers +dif=fusely +di=gest +dig=gings +dig=its +di=gress=ing +di=lap=i=date +dili=gence +di=lu=tion +di=men=sions +dimmed +dine +din=ing +dio=genes +diph=thong +dip=per +di=rec=tion +di=rec=torate +dirt +dis=able +dis=af=fec=tion +dis=al=low=ing +dis=ap=pear=ances +dis=ap=prove +dis=as=sem=bles +dis=bands +dis=card=ing +dis=cerns +dis=ci=plines +dis=clo=sure +dis=con=nects +dis=cord +dis=cour=ag=ing +dis=cov=ery +dis=cre=tion +dis=cuss +dis=ease +dis=fig=ure +dis=grun=tle +dis=gust=ingly +dis=hon=estly +dish=wa=ter +dis=joint +disk +dis=lo=cates +dis=may=ing +dis=mis=sers +dis=obe=di=ent +dis=own +dis=patched +dis=pen=sary +dis=persed +dis=plac=ing +dis=pleas=ing +dis=po=si=tion +dis=puter +dis=qui=et=ing +dis=rup=tion +dis=sem=ble +dis=senter +dis=sim=i=lar=i=ties +dis=so=ci=at=ing +distaff 
+dis=tastes +dis=till=ing +dis=tin=guish +dis=torts +dis=tresses +dis=tribu=tiv=ity +dis=turbed +ditty +di=ver=gence +di=ver=si=fies +di=vert=ing +div=i=dend +di=vin=ing +di=vi=sors +dix=ieland +dober=man +doc=toral +doc=u=men=taries +do=dec=a=he=dra +doe +dog=house +dolan +dol=lies +domenico +domi=cile +dom=i=neer=ing +don=ahue +don=key +doo=ley +door=man +dop=ers +doric +dort=mund +doted +dou=ble=header +doubt +doubts +doves +downey +down=load=ing +down=stairs +doyle +dra=co=nian +drafty +dra=gooned +dram +drape +draughts +draw=ings +dreaded +dream=ers +dregs +dress=ing +dries +driller +drip +drive=way +droop +drop=pers +droves +drudgery +drum=mers +drunkly +du=al=ity +dubuque +ducts +dug +dull=ness +dumbly +dun=bar +dun=geons +du=pli=ca=ble +dupont +du=ra=tion +dur=ward +duster +dutch=man +dwarfed +dwelt +dye=ing +dy=namism +dysen=tery +ear +ear=marked +earnest=ness +earth +earth=quakes +eases +east=erner +easy +eaves +eben +echoed +ecol=ogy +econ=o=mize +ecuador +ed=enizes +edict +edi=tion +ed=mon=ton +ed=u=cat=ing +ed=wards +ef=fect=ing +ef=fi=cacy +ef=fort=less=ness +eggshell +egyp=tian=ize +eigen=state +eighthes +eis=ner +eject=ing +elab=o=rately +elapses +el=derly +elec=tions +elec=tri=cally +elec=tro=cute +elec=troen=cephalog=ra=phy +elec=tron=ics +el=e=men=tal +el=e=va=tion +elicited +elim=i=nat=ing +elite +ella +el=lip=soids +elmhurst +else +elu=ci=da=tion +ely +eman=ci=pate +em=bar=rass +em=beds +em=bod=ied +em=brac=ing +emer=ald +emer=i=tus +emil +emits +emo=tion=ally +em=pha=siz=ing +em=ploy=able +em=po=rium +emp=tily +em=u=la=tor +en=acted +en=camp=ing +en=chanter +en=cir=cled +en=coder +en=counter +en=cour=ag=ingly +en=cum=bered +en=dan=gers +en=demic +en=dorse +en=dows +en=dur=ingly +en=fee=ble +en=fran=chise +en=gels +en=gines +en=glish=men +en=gulf +en=join +en=joys +en=light=ened +en=livens +enor=mity +en=quirer +en=riches +en=sem=bles +en=snar=ing +en=sures +en=ter=prise +en=ter=tain=ment +en=ticed +en=ti=tle +en=treat +en=trepreneurs 
+enu=mer=ated +en=veloped +en=v=i=ron +en=vi=sioned +ephemeral +epi=cur=izes +epis=co=palian +epi=taphs +epochs +equal=ize +equates +equi=li=brate +equips +equiv=o=cally +erased +ere +ergo +er=lang +erode +er=ra=tum +errs +es=ca=lates +es=capes +es=corts +es=pe=cially +es=quires +es=sen=tially +es=tates +es=ti=mated +eter=nal +eth=er=nets +etruria +eu=le=rian +eura=sia +eu=ro=peanized +evade +eval=u=a=tive +evap=o=ra=tion +even=hand=ed=ness +events +ev=er=glades +ev=ery=thing +ev=i=dences +evinces +evolve +ex=ac=er=bated +ex=ac=tions +ex=ag=ger=a=tions +ex=am=ined +ex=as=per=ates +ex=ceeded +ex=cel=lently +ex=cep=tions +ex=change=able +ex=ci=sion +ex=cit=ingly +ex=clam=a=tory +ex=clu=sive=ness +ex=cret=ing +ex=cused +ex=e=cu=tional +ex=em=pli=fied +ex=empts +ex=er=tion +ex=haust=edly +ex=hi=bi=tions +ex=ile +ex=is=ten=tial=ist +ex=or=bi=tant +ex=panders +ex=pect +ex=pects +ex=pe=di=tious +ex=pen=di=ture +ex=pe=ri=enc=ing +ex=per=i=ments +ex=pires +ex=pla=na=tions +ex=ploit +ex=plo=rations +ex=plo=sive +ex=po=nen=ti=at=ing +ex=ports +ex=po=sure +ex=press=ibil=ity +ex=pul=sion +ex=tem=po=ra=ne=ous +ex=ten=sive +ex=ter=mi=nate +ex=tin=guished +ex=tract +ex=tra=ne=ous +ex=trap=o=la=tion +ex=tremely +ex=ult +eye=glasses +eye=sight +fa=bles +fa=cade +facile +fac=sim=ile +fac=to=ries +fac=ulty +fa=gin +fail=soft +faint=ness +fair=ing +faith=ful +fakes +fal=la=cious +fal=mouth +fal=si=fy=ing +fa=mil=iar +fam=i=lies +fa=nati=cism +fanci=ness +fan=ning +farad +farewells +farm=ers +far=rell +fas=ci=na=tion +fasted +fas=tid=i=ous +fate +fath=omed +fat=ten +faulkner +fauna +fa=vor=ing +fayette +fear=lessly +feat +feath=er=weight +fed +fee=ble=ness +feeds +feet +fe=line +fel=low=ships +fem=i=nism +fenc=ing +fer=men=ta=tion +fe=ro=ciously +fer=tile +fer=vent +fes=tiv=ity +fet=tered +fever=ish +fiat +fi=brously +fid=dled +fief +fiendish +fif=teenth +fight=ing +fiji +files +filled +film=ing +filthy +fi=nals +finder +fines +fin=ger=print +fin=ishes +finnish +fire=boat +fire=men 
+fire=wall +firm=ing +fis=cally +fishes +fis=sured +fitly +fitz=patrick +fix=a=tion +fix=ture +flagged +flak +flamer +flank=ing +flash +flask +flat=tered +flaunt=ing +flaw=lessly +fledglings +fleetly +flem=ish=ing +flew +flick=ing +flinches +flirt +floated +flood +floors +flo=ren=tine +floss=ing +flour=ished +flow=er=i=ness +fluc=tu=ate +fluffier +flu=o=resce +flut=ing +fly=ing +fo=cal +foes +fogy +fold=ers +folksy +fol=som +font +fooled +foot=ball +foot=ing +for=age +forbes +forcer +fore=arms +fore=fa=thers +for=eign +fore=see=able +fore=stalls +for=ever +forge +for=get=table +for=giv=ing +for=lornly +for=mal=ized +for=ma=tively +formi=cas +for=mu=lated +for=saken +forth=with +for=tiori +for=tu=itously +for=warder +fought +foun=da=tion +founds +four=some +foxes +frag=ile +fra=grantly +fram=ing +fran=cie +fran=coise +frank=ing +fraser +fray +freckle +fred=erico +free=ing +frees +freez=ing +frenchizes +fre=quented +fresh=ened +fresh=ness +freudi=an=ism +fric=tion +friendlier +friezes +fright=ful +frisia +frivolity +from +fronts +froth=ing +frue=hauf +fruits +fuch=sia +fu=jitsu +full +fum=bling +func=tion=ally +fun=da=men=tally +fun=gal +fun=nier +fur=long +fur=ni=ture +fur=ther=more +fuses +fu=tur=is=tic +gabled +gad=getry +gag=ing +gaines +galac=tic +galaxy +gal=lantly +gal=lon +gall=stone +gam=bled +games +gang=plank +gaped +garbed +gard=ner +gar=landed +gar=risoned +gaseous +gaspee +gas=tric +gath=ered +gauche +gaunt +gawky +gaze +gear=ing +gelatin +gemma +gen=er=al=ity +gen=er=als +generic +ge=netic +genre +gen=tler +geodesic +ge=o=log=i=cal +geo=phys=i=cal +geral=dine +ger=mane +ger=mi=nates +gestapo +get=ting +ghosted +gibral=tar +gig +gig=gle +gilds +gilt +ging=hams +gipsy +girl=ish +giver +glad=dest +glance +glar=ing +glazed +gleaner +glenda +glim=mer +glints +gloat +glo=ria +glo=ry=ing +glove +glow=ing +glynn +gnu +goats +gob=lins +god=mother +goethe +gold=enly +gold=s=tine +gon=dola +goode +goodyear +goren +gor=ton +got +goth=i=ciz=ing +goug=ing 
+gov=ern=ment +grab +grace=fully +gra=da=tions +grad=ual +graft +grained +grams +grand=fa=ther +grandpa +grant +gran=u=lates +graph=i=cal +gras=pable +grassi=est +grat=i=fi=ca=tion +gra=tu=itously +graves +grayed +grease +gre=cian=ize +greeks +green=feld +greens +greeter +grenades +greyest +grievances +grif=fith +grimes +grinds +gripped +gritty +gro=cers +grooved +gross=est +gro=ton +group +grov=els +growl=ing +grubs +grum=bling +guano +guard=edly +gu=ber=na=to=rial +guest +guide=line +guiltier +guises +gul=lah +gum=ming +gun=ner +gur=gle +gustafson +guts +guyer +gym=nas=tics +haas +ha=bit=ual +hacks +hag +hail +hairier +hale +hall=mark +halpern +halve +ham=burg=ers +ham=mer=ing +hamp=shire +hand=books +hand=i=cap +hand=ker=chiefs +hand=shake +handy +hang=man +han=nah +hansel +hap=lessly +hap=pily +harbinger +harder +hard=ships +harken +harm=ful=ness +har=mo=niously +har=ness=ing +har=ri=man +harry +har=vardize +har=veys +has=sle +hat +hate=fully +hat=tie +hauler +hausa +havoc +hawthorne +hay=wood +head +head=lands +head=room +heals +healy +hear=ings +heartily +heater +heaved +heav=i=ness +he=brides +hedge=hog +heeds +hegelian=izes +heights +heiresses +he=li=copter +hel=l=enized +hel=met +help=fully +hem +hemp +hen=drick +hen=ri=etta +her=alds +herder +here=ford +here=un=der +her=mit +hero=ically +her=ring +hert=zog +hes=pe=rus +het=eroge=nous +heuser +hexagon +hi=ber=nate +hid=den +hi=er=ar=chic +high=field +high=nesses +hikes +hill=crest +hilt +hin=dered +hin=dus=tan +hint=ing +hired +his +his=tograms +hitch +hither +hit=ting +hoarse=ness +hobby +hoe +hoists +holds +hol=landaise +hol=low=ness +holo=caust +homage +home=o=mor=phism +home=spun +hom=ing +ho=mo=sex=ual +hon=esty +hon=ey=moon=ing +hon=o=raries +hood=lum +hooker +hoosier=ize +hooves +hope=less=ness +ho=race +horn +hor=ri=ble +hor=rors +horse=shoer +hos=pi=tal=ize +hostesses +hotly +hound=ing +house=flies +house=top +hover +howled +hu=bert +huey +hugo +hu=man=i=ties +hum=bling +hu=mid=i=fiers 
+hu=mil=i=a=tion +hu=mor=ers +humpty +hung +hun=gry +hunt=ley +hurl=ing +hur=ry=ing +hus=bands +husks +hutchins +hyde +hy=giene +hy=phen=ate +hy=pothe=ses +hys=ter=i=cal +ib=sen +ici=cle +icosa=he=dron +ide=al=ize +iden=ti=cal +iden=tify +id=iosyn=crasy +idles +ig=nite +ig=nores +il=le=gal=ity +il=log=i=cal +il=lu=sions +il=lus=tra=tive +im=a=gen +imag=ine +im=brium +im=ma=te=rial +im=mensely +im=mi=grat=ing +im=mov=abil=ity +im=pacted +im=pale +im=pa=tiently +im=pedes +im=pen=e=tra=ble +im=per=fectly +im=per=ma=nent +im=per=son=ations +im=pinges +im=ple=mentable +im=pli=cants +im=plied +im=por=tant +im=poses +im=po=tence +im=prac=ti=cally +im=press=ible +im=press=ment +im=pris=on=ments +im=prove=ment +im=pro=vis=ers +im=pul=sion +in=ac=ces=si=ble +in=ad=e=quate +inane +in=audi=ble +inca +in=cas +in=ces=santly +in=ci=den=tally +in=cit=ing +in=closes +in=clu=sive=ness +in=com=pa=ra=ble +in=com=pletely +in=con=gruity +in=con=sis=tent +in=con=ve=nient +in=cor=rect=ness +in=cred=u=lous +in=cu=bate +in=cur=able +in=de=ci=sive +in=dent +in=de=scrib=able +in=dex=ing +in=di=ca=tion +in=dif=fer=ence +in=dig=na=tion +in=di=rectly +in=dis=tinct +in=di=vid=u=ally +in=doc=tri=nat=ing +in=du=bitable +in=duc=tances +in=ducts +in=dus=tri=al=ist +in=dus=try +in=el=e=gant +inertly +in=ex=act +in=ex=pli=ca=ble +in=fantry +in=fec=tion +in=fe=rior +in=fer=tile +in=fi=nite +in=fir=mary +in=flated +in=flict=ing +in=form +in=for=ma=tively +in=fre=quently +in=fu=ri=at=ing +in=ge=nious=ness +in=gra=ti=ate +in=hab=ited +in=her=ently +in=her=itress +in=hibitor +in=im=i=cal +ini=tial=ized +ini=ti=at=ing +in=jec=tion +in=jured +inker +in=let +in=ner +in=nocu=ous=ness +in=oc=u=late +in=quire +in=quis=i=tive +in=scribed +in=se=curely +in=ser=tion +in=sid=i=ous=ness +in=sin=u=ated +in=sis=tently +in=som=nia +in=spi=ra=tion +in=stal=la=tion +in=stances +in=stan=ti=a=tions +in=still +in=sti=tutes +in=struct +in=structs +in=stru=ments +in=su=la=tion +in=sur=ance +in=sur=rec=tion +in=te=grand 
+in=tel=lect +in=tel=li=gi=ble +in=ten=si=fi=ca=tion +in=ten=sively +in=ter +in=ter=cept +in=ter=changed +in=ter=com=mu=ni=cates +in=ter=course +in=ter=ested +in=ter=fered +in=ter=group +in=ter=leaved +in=ter=minable +in=ter=mod=ule +in=ter=na=tion=al=ity +in=ter=per=sonal +in=ter=posed +in=ter=pret=ing +in=ter=re=la=tions +in=ter=rupt +in=ter=sect=ing +in=ter=state +in=ter=ven=ing +in=ter=wo=ven +in=ti=ma=tion +in=tol=er=ance +in=tractabil=ity +in=traof=fice +in=trigued +in=tro=duc=tions +in=truder +in=tu=ba=tion +in=vaders +in=va=lidi=ties +in=vari=ants +in=ven=tively +in=verses +in=vert=ing +in=ves=tiga=tive +in=vet=er=ate +in=vites +in=voked +in=volves +io=ni=ans +ira +irately +irish=man +ironic +ir=ra=tional +ir=reg=u=lar +ir=re=press=ible +ir=re=versibil=ity +ir=ri=ta=ble +irv=ing +is=fa=han +is=land +iso=lated +iso=mor=phisms +is=suance +it +ital=i=cize +item=iza=tions +it=er=a=tion +ito +izves=tia +jack=ets +jacky +ja=cobus +jailer +ja=maican +janet +janus +jar=gon +jaun=ti=ness +jay +jeanne +jef=fer=so=nian +jen=nifer +jeremy +jer=oboam +jest +je=suit=iz=ing +jew=eled +jews +jin=gled +joaquin +joes +john +joiner +jok=ers +jolts +jor=dan +jose=phus +jot=ting +jour=nals +joust=ing +joy=ous +ju=daica +judge +ju=dith +ju=goslavia +julie +jump +junc=tures +ju=niper +juras +jury +jus=ti=fiers +jut=land +kad=dish +kamikazes +kant +karp +ka=tow=ice +keel=ing +keep=ers +kemp +ken=ney +ke=pler +ker=ouac +key +key=pad +khrushchevs +kidde +kid=ney +kil=i=man=jaro +kills +kilo=joule +ki=mono +kin=dling +king=pin +kin=nick=in=nic +kir=choff +kisses +kit=ing +klein +knap=sacks +kneel +knicker=bock=ers +knights +knocked +knots +knowl=edge +knuck=les +ko=dachrome +ko=rea +kraka=toa +kro=necker +kurd +la=bel=ing +la=borer +labyrinths +lac=erta +lacks +ladies +la=goon +laid=law +lamarck +lament +lamp +lanced +land=ings +lands +lange +lan=guish +laos +lapse +largely +lar=son +lash=ing +las=zlo +later +la=tin=ity +lat=i=tudes +laud=able +laugh=lin +laun=dered +lau=rels 
+laven=der +law=fully +law=suit +lay=ers +lazarus +leaded +leafed +lea=guers +le=an=der +leap=ing +leary +leath=ern +leav=ing +lec=tures +leeds +left=ists +le=gal=iza=tion +leger +leg=is=lated +le=git=i=mate +leila +lemon +lends +le=niency +lens +leonardo +les=bian +les=son +let=ter +levee +lev=elly +levin +lewdly +lex=ing=ton +li=belous +lib=er=ated +li=bido +li=cense +lick +lied +lifeboat +life=time +ligget +light=hearted +like +like=ness +lil=ian +li=man +limit +lim=its +lind +lindy +lin=early +lin=gerie +lin=ing +lin=naeus +li=oness +liq=uid +lise +lis=tened +list=ings +lit=er=al=ness +lithua=nia +lit=ter=ing +live +liv=ers +lizzie +loaf +loathing +lob=ster +lo=cally +lo=ca=tor +lock=ian +lock=wood +lodges +log=a=rithm +log=i=cally +logs +loi=ters +lon=doniza=tion +lon=ers +long=ings +look=ers +looms +loose=leaf +loos=ing +lords +lorry +lossi=est +lo=tus +louisa +lour=des +lovelace +loves +low=est +loy=ally +lucerne +luck=ier +lu=di=crous +luke +lu=mi=nously +lunch +lunged +lur=ing +lust +luther +lux=u=ri=antly +lyle +lynx +mac +mac=don=ald +maces +ma=chin=ery +mackey +macro=molecule +mad=den +mad=hya +mad=sen +mag=el=lanic +mag=ill +mag=ne=ti=z=able +mag=nify +maguire +maids +mail=man +main=frames +main=tained +majesty +maker +mal=ady +mal=colm +mal=formed +ma=li=cious=ness +mal=one +mal=ton +man=age +man=ag=ing +manda=tory +manger +man=hole +man=i=cur=ing +manila +ma=nip=u=la=tive +mann +manors +man=tissa +man=u=fac=tured +mao +maps +marched +mardis +margo +mari=nade +mar=itime +mar=ketabil=ity +mark=ings +mar=malade +mar=riott +mar=shal=ing +mar=tial +mar=tyr +mar=vels +mas=cara +mask=able +ma=sonite +mas=sa=cred +mast +mas=ter=piece +mas=tur=ba=tion +match=less +ma=te=ri=al=iz=ing +math=e=mat=i=cally +mat=ings +ma=trix +mat=tered +ma=tured +mauri=cio +max=ima +max=ims +maybe +may=oral +mc=cabe +mc=cluskey +mc=don=nell +mc=gov=ern +mc=kee +mclean +mcpher=son +meal=time +mean=ing=ful +meant +mea=sure=ments +me=chan=i=cally +medal +med=field +me=di=a=tions 
+medicine +med=i=tat=ing +medi=ums +meet=ing +mega=hertz +meis=ter +melcher +melodies +melpomene +mem=ber=ship +mem=o=randa +mem=o=rizes +menagerie +mendelizes +men=non=ite +men=tal=i=ties +men=tor +mer=ce=nar=i=ness +mer=ci=lessly +merged +mer=i=to=ri=ous +mer=rill +mesh +mes=sen=ger +messy +met=al=liza=tion +meta=phys=i=cal +me=te=oritic +me=thod=i=cally +meth=ods +metro +mews +mica +mick +mi=cro=bi=cide +mi=croe=co=nomics +mi=cron +mi=cro=pro=cess=ing +mi=cro=scope +mi=crovaxes +mid=dle=man +mid=night +mid=stream +mid=win=ter +mi=grate +mikoyan +mileage +milk +mill +mil=likan +mil=lionth +mill=stones +mil=tonized +minaret +mind=fully +min=eral +mini +min=ima +min=i=mizes +min=istries +mi=nor +min=strels +minute +mir=a=cle +miriam +mis=car=riage +mis=con=cep=tion +mis=er=ably +mis=giv=ings +mis=led +mis=plac=ing +miss=ing +mis=soula +mis=take +mistle=toe +mis=un=der=stand +mitch +mitres +mix=tures +moats +mocked +modally +mod=er=ated +mod=ern=izer +mod=icum +mod=i=fy=ing +mod=u=lar=iz=ing +mod=ule +moghul +moines +mol=davia +moles +mol=lusk +mo=men=tar=ily +monaco +mon=day +mon=go=lian +mon=keyed +mono=cotyle=don +mono=lithic +monos=table +mon=roe +mon=tague +mont=gomery +mon=u=ment +mooned +moor +moped +morass +more=house +morn +mor=pho=log=i=cal +morsels +mort=gage +mo=saic +mosque +mo=tels +moth=er=land +mo=tion=less=ness +mot=ley +mo=tor=ized +mound +moun=tain=ously +mourn=ers +mousy +mov=able +mov=ing +muck +mud=dled +muf=fin +mugs +mul=lah +mul=ti=com=puter +mul=ti=ple +mul=ti=pli=cand +mul=ti=plies +mul=ti=stage +mum=bles +mun=dane +mu=ni=tions +mur=der=ing +mur=murs +mus=covy +mush=roomed +mu=si=cians +muskrat +mus=sorgsky +mu=ta=bil=ity +mu=ta=tions +mu=ti=lat=ing +mut=ters +myce=naean +mys=te=ri=ous +mytholo=gies +na=gasaki +nair +naked=ness +names +nanook +nap=kin +nar=cotic +nar=row=est +nash +na=tal +na=tion=al=i=ties +na=tions +nat=u=ral=ist +naugh=ti=ness +navel +navona +ne=an=derthal +nears +neb=ula +ne=ces=si=ta=tion +neck=ties +nee=dled +needy 
+neg=a=tives +neg=li=gi=ble +ne=groid +neigh=bor=ing +neo=clas=sic +nero +nest=ing +nets +neu=ral +neu=tral +neva +new=bury=port +new=man +news=man +next +ni=belung +nicholls +nick=name +niel=son +night=fall +ni=hilism +nim=bler +nineties +nip=ponizes +no=bil=ity +noc=tur=nally +noel +nolan +nom=i=nee +non=con=ser=va=tive +non=de=ter=min=ism +non=govern=men=tal +non=lin=ear=ity +nonorthog=o=nal +non=seg=mented +non=ter=mi=nals +nook +nord=hoff +nor=mal=iza=tion +nor=man=iza=tions +north +north=ernly +nor=walk +nos=tradamus +no=ta=rizes +note +no=tice=able +no=ti=fies +not=ting=ham +no=vak +novices +nu=ances +nu=clide +nullary +num=ber +nu=mer=able +nu=mis=matic +nurs=ing +nu=tri=tious +nyquist +oases +obe=di=ent +ob=fus=cate +ob=jec=tively +obliged +oblit=er=at=ing +ob=scene +ob=serv=able +ob=servers +ob=so=letes +ob=struc=tion +ob=vi=ated +oc=ca=sional +oc=ci=den=tal=ize +oc=clu=sions +oc=cu=pied +oc=curs +oc=tag=o=nal +octets +oddly +odi=ous +o'dwyer +of=fended +of=fer +of=fi=cer +of=fi=ciously +oft +oil=cloth +ojibwa +old=en=burg +oleo=mar=garine +olivia +olym=pus +omi=nous=ness +om=nipresent +o'neill +on=looker +onus +opaquely +open=ings +op=er=ate +op=er=a=tor +op=pen=heimer +op=pose +op=pressed +opthalmic +op=ti=mist +op=ti=miz=ing +opts +or=anges +or=bital +or=ches=tral +or=der +or=di=nar=ily +ores +or=ga=ni=za=tion +or=gans +ori=en=tal=ized +ori=fices +orig=i=na=tion +or=leans +or=nate +orr +orville +os=cil=lates +o'shea +os=teopath +oth=ello +otto +ounces +out=burst +out=door +out=grow=ing +out=law=ing +out=live +out=per=forms +out=rages +out=stand=ing +out=vot=ing +out=wit=ting +over=board +over=crowds +over=es=ti=mates +over=hangs +over=joyed +over=load +overnighter +over=pro=duc=tion +over=run=ning +over=shad=ow=ing +over=sized +over=take +overtly +overuse +over=work=ing +owen +own=er=ship +ox=i=dized +ozzie +paci=fi=ca=tion +pack=aged +pack=ers +padding +pageant +pag=i=nat=ing +painful +paint=ing +pa=ja=mas +pale +pales=tine +pal=lia=tive +palo=mar 
+panacea +pan=demic +pan=els +panned +pan=the=ist +panty +pa=per=ers +par +pa=rades +paragon +par=al=lel +par=al=lels +pa=ram=e=ter=ize +para=mus +para=phrases +par=cel +par=doned +paren=the=ses +pares +parisian +park=ers +par=lay +par=ody +par=rots +par=si=fal +par=takes +par=tic=i=pant +par=tic=u=lar +par=ti=tioned +par=tridges +pas=sage=way +pas=sion +pass=port +pas=teur +pas=ture +patchy +patents +patho=gen=e=sis +pa=tients +pa=tri=cians +pa=trolling +pa=trons +pat=tern=ing +paula +paulus +pave=ment +pawn +payer +pay=offs +peace=fully +peaks +pearl +peat +pe=cu=liar +pedant +pe=di=a=tri=cian +peel=ing +peer=ing +peking +pem=broke +pence +pends +pen=e=tra=tion +penin=su=las +penn=syl=va=nia +pen=tagon +peo=pled +pep=pery +per=ceived +per=cents +per=chance +peren=ni=ally +per=fect=ness +per=forms +per=i=he=lion +pe=ri=od=i=cally +per=ish=able +perkins +per=me=at=ing +per=mit +per=ni=cious +per=pe=tra=tion +per=pet=u=a=tion +per=se=cut=ing +per=se=veres +per=sist +per=sonal +per=son=i=fied +per=spi=ra=tion +per=sua=sions +per=turb +pe=ruses +per=va=sive +pester +pe=ters +petri +pet=ting +phae=dra +phaser +phe=nomeno=log=i=cal +philco +philis=tinizes +philoso=phies +phoeni=cia +phon=ing +phos=pho=rus +pho=to=genic +pho=tos +phyla +physi=cist +pi +pick +pick=et=ing +pick=man +pi=co=joule +pic=tur=ing +pied=fort +pies +pig=gy=backed +pig=tail +pil=fer=age +pil=lar +pi=lots +pin=cush=ion +pin=ing +pin=na=cle +pin=scher +pi=o=neers +pipelin=ing +pi=rate +pis=tols +pitch=ing +pithi=ness +piti=less +pi=tu=itary +pix=els +place=ment +pla=gia=rist +plain=field +plain=tive=ness +planeload +plan=ets +planocon=cave +plant=ings +plas=tic=ity +plates +pla=toon +play=boy +play=ing +play=wrights +pleas=ant +pleat +ple=nary +pli=ant +plots +plows +plug=gable +plume +plun=dered +plung=ing +plu=to=nium +poc=a=hon=tas +pod +po=et=i=cal +poincare +pointy +poi=sons +po=laris +po=lice +pol=ish +po=liter +polka +pol=luted +poly=mer +pomera=nia +pompous=ness +ponds +pool +pop +pop=ping 
+pop=u=lar=ized +pop=u=lous +pores +port +por=tend=ing +por=tico +por=tray +posed +po=si=tion +posits +pos=ses=sive +pos=sums +pos=te=ri=ori +post=mas=ters +postscript +pot +po=ten=tates +po=tion +pot=tery +pounces +pourer +poverty +pow=er=ful +prac=ti=ca=ble +prac=ti=tion=ers +praise +prancer +prayer +pre=al=lo=cated +pre=car=i=ously +prece=dents +pre=ciously +pre=cip=i=ta=tion +pre=cludes +pre=con=cep=tion +pre=dat=ing +pre=de=ter=mi=na=tion +pred=i=ca=tion +pre=dic=tive +pre=dom=i=nately +pre=emp=tive +pref=ac=ing +prefers +preini=tial=izes +pre=lim=i=nary +premise +pre=oc=cu=pied +pre=pared +pre=pos=ter=ously +pre=rog=a=tives +pre=scrip=tions +pre=sen=ta=tions +pre=served +pres=i=den=tial +press=ings +pre=ston +pre=sump=tu=ous=ness +pre=tend=ing +pre=texts +pre=vail=ing +pre=vent=ing +pre=vi=ously +pricers +prides +pri=mar=ily +prim=ing +princesses +prin=ci=ples +prior +pris=on=ers +pri=va=tions +prizes +pro=bate +prob=ings +pro=ce=dure +pro=cess=ing +procla=ma=tion +pro=cre=ate +pro=curer +pro=duce +pro=duc=tive +pro=fes=sion +prof=fered +prof=itabil=ity +pro=found +pro=gram +pro=gresses +pro=hi=bi=tions +pro=jec=tions +pro=le=tariat +pro=long +promi=nent +pro=moter +promptest +pro=mul=ga=tion +pro=nounce=ment +proofs +propane +prop=erly +proph=esy +pro=por=tion=ately +pro=poser +pro=pounded +pro=rate +pros=e=cutes +prosodic +prospec=tor +prostate +pro=tect=ing +pro=tege +protes=ta=tions +pro=tons +pro=to=zoan +prouder +prove=nance +prov=i=dence +pro=vi=sion +pro=vokes +prox=i=mal +pruned +prus=sian=ize +pseu=doin=struc=tion +psy=chi=a=trist +psy=cho=log=i=cally +psy=cho=so=matic +pub +pub=licly +puck=ered +puffed +puller +pulls +pulse +pump=kin +punc=tu=ally +pun=ish=able +punt +pup=peteer +pur=chases +purges +pu=rina +pur=pler +pur=posed +purse +pur=su=ing +push=down +put=nam +puz=zle=ment +py=ongyang +pythagore=anizes +quad=ran=gle +qua=dren=nial +quag=mires +quak=er=ess +qual=i=fied +qualm +quan=ti=fiers +quan=tize +quar=reled +quar=ter=ing +quasar 
+qua=ver=ing +queerer +queried +ques=tion=able +ques=tions +quib=ble +quick=lime +qui=et=ing +quince +quit +quiv=ers +quon=set +quo=tient +ra=bin +rach=mani=noff +rack=e=teers +ra=di=ance +ra=di=a=tors +ra=dio=g=ra=phy +rae +rages +raider +rail=roaded +rain=bow +rains +rake +ral=ston +ram=i=fi=ca=tions +rams +rand +randy +rangy +rank=ings +ran=somer +rap +rapids +rap=tur=ous +ras=cally +rasp=ing +rat=for +ra=tion +ra=tio=nal=izes +rat=tler +rav=ager +ravens +rawl=ins +rays +reach +re=acted +re=ac=ti=va=tion +reader +read=justed +re=aligned +re=al=iz=able +realm +reaped +rear +re=ar=rest +rea=son=ings +re=as=signed +reawak=ened +re=bel=lions +re=boot=ing +re=buffed +re=but=ted +re=cal=i=brated +re=ca=pit=u=lates +re=ceded +re=ceives +re=cep=tive +re=cife +re=cip=ro=cat=ing +recita=tions +reck=oned +re=claim=ing +re=clin=ing +rec=og=nize +rec=ol=lect +rec=om=mend +re=com=piles +rec=on=cil=i=a=tion +re=con=nect +re=con=sti=tuted +recorder +re=cover +recre=at=ing +recta +re=cur +re=curs=ing +red +re=de=clared +re=de=fined +re=de=vel=op=ment +re=dis=played +red=ness +re=dress=ing +re=ducibly +reeds +re=elects +reen=force=ment +reestab=lish=ing +re=ex=am=in=ing +ref=er=ences +re=fer=ral +re=fine +re=flect=ing +re=flexes +re=for=ma=tory +re=for=mu=lated +re=frained +re=fresh=ment +refugee +re=futed +re=gally +re=gen=er=at=ing +reg=i=men=ta=tion +regis +re=gressed +re=gret=table +reg=u=larly +reg=u=la=tors +re=hears=ing +re=im=bursable +reined +rein=hold +re=in=stated +rein=tro=duces +re=it=er=a=tion +re=joiced +re=la=beled +re=lat=ing +rel=a=tives +re=laxes +rel=e=gate +re=lents +relic +re=liev=ing +re=lin=quish=ing +reloader +re=luc=tance +re=mains +reme=died +re=mind +rem=i=nis=cently +re=mod=els +re=motely +re=mov=ing +re=names +ren=dezvous +re=new=able +re=nounc=ing +rented +re=open +re=or=ga=nize +re=pair=man +re=pay=ing +re=peat=edly +re=pen=tance +rep=e=ti=tious +re=place=able +re=plays +repli=cate +re=port +repos=ing +rep=re=sentably +rep=re=sent=ing +re=prieved 
+re=proach +re=pro=ducibil=i=ties +re=pro=grams +re=publics +re=pulses +re=puted +re=quired +req=ui=si=tions +re=scind +re=searchers +re=sem=blances +re=sent=ment +reser=voir +res=i=dent +res=ig=na=tion +re=sis=tance +re=sis=tors +re=solver +re=sort=ing +re=spect +re=spec=tive +re=sponded +re=spon=si=ble +restarts +rest=ful +restora=tions +re=strain=ers +re=stric=tive +re=sul=tant +re=sum=ing +res=ur=rec=tors +re=tail=ing +re=tal=ia=tory +re=ten=tive=ness +retina +re=tir=ing +re=tract=ing +re=trans=mis=sion +ret=ri=bu=tion +re=triever +ret=ro=spec=tion +re=type +re=unit=ing +re=vamp=ing +rev=eler +re=vere +rever=i=fies +re=verses +re=viewer +re=viser +re=vival +re=voked +rev=o=lu=tion +re=volvers +rewind=ing +rewrit=ing +rhe=sus +rhode +rhyming +rib=bons +richard +rich=mond +rico +ride +ridiculed +ri=fle +rig=ging +right=ful=ness +rigor +rims +ring=ings +ri=or=dan +ripely +rip=pling +risk +rit=u=ally +river +rivulet +road=sters +roar=ing +rob=beries +roberta +robin=sonville +rochester +rocket +rock=well +rods +roll +ro=mance +ro=man=izes +romper +roof=ing +room=ing +root +rop=ing +rose=bush +rosetta +rot +ro=ta=tions +ro=tund +rough=ness +round=ing +roused +routes +rov=ing +row=ley +roy=alty +rub=bing +rubles +rude=ness +ruf=fian +rugged=ness +rule +ru=ma=ni=ans +rummy +run=away +runoff +rup=tur=ing +rus=sell +rus=tic +rustlers +ruth=less=ness +sab=bathize +sachs +sac=ri=fice +sacro=sanct +sad=dles +sa=fari +safes +sage=brush +said +sails +sal=able +salerno +saline +sally +salters +salu=ta=tions +sal=vages +same +sam=pling +sana=to=rium +sanc=tion=ing +sand=burg +san=dra +san=est +san=skrit +sapling +saran +sari +satchel +satires +sat=isfy +sat=ur=na=lia +saud +sav=aged +saver +sa=vored +saw=fish +sax=onize +say=ings +scala +scal=ing +scam=pers +scan=ners +scape=goat +scared +scat=ter +scenic +schantz +schelling +schemers +schmitt +scholas=tic +school=houses +schroeder +schuylkill +scis=sor +scoffs +scope +score=board +scorner +scotch=gard +scotts=dale +scouted 
+scram=bled +scrapes +scratch=ing +scream=ers +screen=ings +scrib=bled +scripts +scrump=tious +scuf=fle +sculp=tured +scythe +sea=gate +seam +seaquar=ium +search=light +sea=son=able +seat +se=ceded +sec=ondary +sec=re=tar=ial +se=cre=tive +sec=tions +se=cur=ings +sedi=tion +see +seedy +seem=ing +seer +seg=men=ta=tions +se=gundo +seizures +se=lect=man +self=ishly +sells +se=man=tics +semi=con=duc=tor +semiper=ma=nently +sen=ate +seneca +sense +sens=ing +sen=sual +sen=ti=men=tally +sep=a=rately +sept +se=quencers +se=quen=tially +serene +se=ri=al=iz=able +serif +serra +ser=vice +serv=ings +sets +set=tler +sev=en=teens +sev=er=ance +sev=ers +sex +sex=ual +shack=led +shadi=ness +shaf=fer +shak=ers +shale +shame=ful +shang=haied +shape=less +shard +shares +sharp=en=ing +shat=ter=ing +shawano +shear=ing +sheds +sheets +shel=ley +shelves +sheri=dan +shied +shiftier +shilling +shiner +shin=toizes +ship=per +shirk +shiver +shocker +shoe=horn +shooter +shop=pers +short=age +short=ens +shorts +shoul=dered +shoved +showed +shows +shrewd +shrilled +shrink=ing +shrugs +shuf=fled +shut=off +shut=tles +siberia +sicken +side=band +sides +sid=ings +sierra +sighed +sigma +sig=na=ture +sig=ni=fi=ca=tion +sikkim +silent +silken +sills +sil=ver=man +sim=ile +si=mon +sim=plic=i=ties +sim=plis=tic +sim=u=la=tion +sin=bad +sinews +singed +sin=glet +sin=gu=larly +sin=ner +sioux +sirens +sisy=phus +sit=tings +siva +six=ties +skate +skep=ti=cal +sketch=pad +skid=ding +skill=ful=ness +skims +skipped +skir=mishes +skulked +sky +sky=rock=ets +slacks +slang +slash +slaugh=ter +slavic +slavoni=cizes +sledge=ham=mer +sleep=less +sleighs +sliced +slide +slightly +slings +slips +slo=gans +slop=pi=ness +slot=ting +slower +slug=gish=ness +slums +smacked +small=time +smasher +smell +smiles +smith=so=nian +smoked +smol=dered +smooth=ing +smug +smythe +snap +snap=shots +snatched +sneaki=est +sneers +sniffs +snod=grass +snorkel +snow=belt +snows +snuffs +soak +soared +sobers +so=cial=ists 
+so=ci=o=log=i=cal +socks +so=fas +softly +so=journ +sol=dier +solenoid +solid +solids +so=los +sol=vent +somber +som=er=set +son +sonny +soothe +so=phis=ti=ca=tion +sor=did +sor=est +sor=rows +soul +sound=ness +soured +south=bound +south=land +so=vi=ets +spacer +spaded +spaniardiza=tion +spanked +spare +sparked +sparsely +spat +spawned +speak=ers +spe=cial=ists +spe=cialty +spec=i=fied +speckle +spec=ta=tors +spec=trog=ra=phy +spec=u=lates +speech=less +speeds +spellings +spent +spica +spies +spilt +spin=ner +spi=rally +spir=i=tu=als +spit=ing +spleen +splic=ing +splits +spoil=ing +sponged +spon=sor=ship +spool=ers +spores +sportswriter +spot=ter +sprague +spray=ing +sprees +springi=ness +sprint +sprouted +spurn +sput=tered +squadrons +squarer +squat=ting +squeaky +squeez=ing +squirmed +stab +sta=bi=lizes +stacked +staffing +stagers +stags +stair=cases +stale=mate +stalling +stam=mer +stam=ped=ing +stan=dard +stand=ings +stans +star +star=gate +star=ring +star=tles +state +statewide +sta=tion=mas=ter +stat=ues +statu=to=rily +staves +stead=ier +stealer +steamer +steele +steeper +steered +stem +stenog=ra=pher +step=mother +stereo=scopic +ster=il=izer +stetho=scope +stew +stick=ier +stiff=ens +stigma +stillest +stim=u=late +sting=ing +stipends +stir=rer +stitch=ing +stock=holder +stodgy +stom=acher +stood +stop=gap +stor=age +storeyed +stormi=est +stouter +strafe +straight=ened +strained +strand=ing +stran=gler +stratagem +strat=i=fies +straw=berry +streamer +street=car +strengths +stretched +strict +strife +stringed +stringy +striptease +strode +strolling +stron=tium +strug=gle +stu=art +stucco +stu=dious +stuffs +stun +stupid +sturm +styli +styx +sub=com=po=nents +sub=di=rec=tory +sub=dues +sub=graph +sub=jec=tive +sub=lime +sub=merges +sub=mode +sub=or=di=nate +sub=pro=gram +sub=schema +sub=script=ing +sub=se=quent +sub=si=dies +sub=sis=tent +sub=stan=tially +sub=sta=tion +sub=strate +sub=sys=tem +sub=tle=ness +sub=trac=tion +sub=units +sub=vert=ing +suc=cess=ful 
+suc=cinct=ness +suck=ers +sud=den +suf=fer=ance +suf=fi=ciency +suf=fo=cated +sug=ar=ings +sug=gests +suit=ably +suits +sulks +sul=tan +sum=mands +sum=ma=tion +sum=mon +sumter +sun=der +sunken +sun=shine +su=per=com=put=ers +su=per=groups +su=pe=rior +su=per=nat=u=ral +su=per=sede +su=per=vise +sup=pers +sup=ple=ment=ing +sup=port +sup=pose +sup=press=ing +surely +surge +surly +sur=pass +sur=pris=ingly +sur=round +sur=vey=ors +sus +sus=pended +sus=pi=cions +suther=land +swab +swal=low=ing +swan +swaps +swat +sweat +swedes +sweep=stakes +sweet=est +swellings +swifter +swim=suit +swipe +switch=boards +swivel +swords +sykes +sylvia +sym=bol=ize +sym=me=try +sym=pa=thy +syn=a=gogue +syn=chro=nizes +syn=di=ca=tion +syn=ony=mously +syn=the=size +syr=ian +sys=tem=at=i=cally +taber=na=cle +ta=ble=spoon=ful +tab=u=late +tacit +tac=tic +tail +taipei +tale +talker +tallchief +tal=mudiza=tions +tam=ing +tanaka +tan=gle +tan=ta=liz=ing +taos +tapestry +tar +tar=iffs +tasked +taste=fully +tat=tered +taunts +tav=erns +taxi=cabs +tay=lor +teaches +tear=ful +tea=spoon=ful +tech=nique +te=dious +teenaged +tegu=ci=galpa +tele=graph +tele=o=log=i=cally +tele=phony +tele=vise +teller +tem=per=ance +tem=pes=tu=ous +tem=po=raries +tempt=ingly +ten=dency +tenex +tense +tent +tenure +ter=mi=nat=ing +termwise +terre +ter=rify +ter=ror=ize +testable +tes=ti=fiers +tex +tex=tile +thai=land +thank=less +thaw +the=atri=cally +theme +the=ol=ogy +the=o=riza=tion +ther=a=pies +thereof +ther=mome=ter +thes=saly +thickly +thim=bles +think=ing +thirsted +this=tle +thorns +those +thou=sand +thread +threat=ens +thrift +thrived +throne +through=out +thrusters +thumbed +thun=derer +thus +tiburon +tick=les +ti=died +tier +tight=en=ers +tilde +tillich +tim=bered +time=outs +timeta=bles +ti=m=o=nizes +tin=gling +tin=kled +tint +tip=per=ary +tire=lessly +ti=tan +tit=ter +toasts +to=geth=er=ness +toi=lets +tol=er=a=ble +tol=er=a=tion +toma=toes +ton +tonic +tool +tooth=paste +top=most +topsy +tor=ment=ing 
+tor=rent +tor=tur=ing +tossed +to=tallers +touch=able +tough +tourist +tow=el=ing +towns +toys +tracked +trac=tor +trader +traf=ficked +trailed +trainer +tramp +trances +transceivers +tran=scribers +trans=feral +trans=formable +trans=gressed +tran=sis=tor=ized +tran=si=tively +trans=la=tion +trans=mit=tal +trans=par=ent +transpon=der +trans=pose +trape=zoidal +trauma +traver=sal +trays +trea=sure +treat=ing +tree=top +tremor +tres=passed +tri=an=gles +tri=bunals +tricked +tricky +trig=gered +trilled +trim=ming +tripled +tri=umphal +triv=ially +troop=ers +trot=sky +trou=bleshoots +trow=els +truck=ing +truest +trumped +trunk +trust=ingly +try +tubs +tuft +tum=bled +tun=able +tunisia +tur=bu=lent +turk=ize +turn=ing +tur=tle +tu=tankhamen +tut=tle +twenty +twiner +twirling +twitch=ing +tyler +type=writ=ers +typ=ing +tyranny +ugh +ul=cers +um=brage +un=ac=cept=ably +un=aided +unan=i=mous +unattain=abil=ity +un=aware +un=blocked +un=can=celled +un=chang=ing +un=closed +un=con=di=tional +un=con=trol=lable +un=count=able +un=de=cid=able +un=der=brush +un=der=flows +un=der=lies +un=der=mine +un=der=plays +un=der=stand=ings +un=der=tak=ings +un=der=writes +undi=rected +un=done +un=easy +un=equaled +un=event=ful +un=fair=ness +un=fit +un=for=mat=ted +un=grate=fully +un=harmed +uni=di=rec=tion=al=ity +uni=for=mity +unin=dented +un=in=ter=rupted +unions +uni=tar=ian +unity +uni=ver=si=ties +un=kind=ness +un=leashed +un=link=ing +un=lucky +un=mer=ci=ful +un=nec=es=sar=ily +un=ob=tain=able +un=paid +un=prece=dented +un=prov=able +un=rav=el=ing +un=rec=og=nized +un=re=strained +un=safely +un=s=e=lected +un=skilled +un=steady +un=syn=chro=nized +un=tie +un=to=ward +un=used +un=whole=some +un=winds +un=wrap +up=dater +up=holder +up=land +up=rightly +up=sets +up=turns +urge +uri=nates +ur=su=line +us=ages +usenix +usu=ally +uti=liza=tion +utopi=anizes +ut=ters +va=ca=tion +vac=u=umed +va=grantly +va=lence +valiant +valid=ness +valu=ably +valves +van=den=berg +van=ished 
+van=quish=ing +vari=ably +varies +vary=ing +vastly +vau=dois +vax +veer=ing +veg=e=tated +ve=hic=u=lar +ve=lasquez +vene=tian +ven=omous +ven=tri=cles +venus +ver=bal=ized +ver=dure +ver=i=fier +vern +ver=sa=tile +ver=te=brates +vested +vet=eri=nary +via +vi=bra=tions +vi=cious=ness +vic=tim=iz=ers +vic=to=ries +vi=dal +vier +view=ing +vi=gnettes +vil=i=fi=ca=tion +vil=lages +vinci +vine=yard +vi=o=la=tor +vi=o=lins +virgo +virus +vis=i=ble +vis=ited +vi=su=al=ize +vi=tally +vladimir +vo=ca=tions +voided +vo=li=tion +volt=ages +vol=un=teer=ing +voted +vouch=ing +voy=aged +vul=garly +waco +waf=fles +wag=ne=r=ian +wail=ing +waiter +waives +wak=ing +wal=green +wal=len=stein +walls +waltham +wan=dered +wan=ing +wants +ward +ware=hous=ing +warmer +warn=ing +war=ranted +war=saw +wash=burn +wasps +watch +watch=man +wa=ter=ing +wa=tery +wausau +wave=length +wax=ers +we +weak=nesses +wear +weari=somely +weath=er=ford +webs +wed=lock +weekly +wei=d=man +weights +weiss=muller +welder +welles=ley +wenches +wes=leyan +west=hamp=ton +wet +whacked +wharves +wheel +whelp +wher=ever +whims +whip=pany +whirling +whiskers +whis=tled +white=horse +whitens +whit=lock +whit=tling +whole=ness +whoop +wi=chita +widen +wid=owed +wield=ing +wilbur +wile +wilkin=son +william +willis +wilshire +wince +wind=ing +wine=head +win=ing +win=nie +win=sett +wiped +wire=tap=pers +wised +wish=ful +witches +with=drew +with=holds +wit=ness=ing +woe=fully +wom=an=hood +won=der=ful=ness +woo +wooden +wood=stock +woofer +woonsocket +words +work=books +work=man +world=li=ness +wor=rier +wor=shiper +worth=less +wound=ing +wrap=per +wreathes +wrenched +wretch +wring +writ +writ=ing +wrote +wyner +xe=roxed +yamaha +yard +yawner +years +yel=lowed +yelped +yes=ter=days +yok=na=p=ataw=pha +york=shire +young=sters +youth=ful=ness +yukon +zeal +zen +zeus +zion=ism +zoned +zoroaster diff --git a/third_party/rust/mapped_hyph/tests/base.word b/third_party/rust/mapped_hyph/tests/base.word new file mode 100644 index 
000000000000..6d1e60849cfa --- /dev/null +++ b/third_party/rust/mapped_hyph/tests/base.word @@ -0,0 +1,4543 @@ +aarhus +abase +abate +abbeys +abby +abducts +aberrations +abhorrer +abilities +abjuring +ablest +abodes +abolitionist +abortion +about +abram +abridged +abruptly +absconds +absently +absolved +absorption +abstinence +abstractor +abundance +abuts +abyssinian +acapulco +accelerator +accentuated +accepted +accessibility +accidental +acclimated +accommodating +accompanying +accomplishments +accords +accountant +accretion +acculturation +accuracy +accused +aces +achieve +acid +acknowledgeable +acme +acoustics +acquiescent +acquisitive +acres +acropolis +actinometers +activators +actors +actuarial +acute +adagios +adaptation +adapts +addicting +additions +addresser +adducing +aden +adherents +adiabatically +adjoining +adjudging +adjured +adjustment +administer +administratively +admire +admissions +admixed +admonitions +adopted +adore +adrian +adsorbs +adulterer +adumbrating +advantageous +adventurers +adversely +advertises +advisees +advocacy +aerate +aerobacter +aerosols +affairs +affections +affiliating +affirmed +affliction +affords +afghans +aforethought +africanizes +afterimage +afterward +age +ager +agglutinated +aggravation +aggressive +agile +agitator +agonies +agreeably +agriculturally +aide +ailing +aims +airdrops +airfoil +airline +airplanes +airtight +akin +alamo +alas +albatross +albums +alcoholism +aldrich +alerting +alexandria +alga +algerian +algorithms +alicia +aligned +alistair +allan +alleges +allegory +allergic +alleyway +alliteration +allocator +allots +allowing +allurement +almaden +alnico +aloofness +alphabetized +alsatian +alterations +alternates +althaea +altruistically +alveolar +amalgamate +amass +amaze +amazons +ambiguous +ambled +ambushed +amend +america +americans +amide +ammonia +among +amortized +amour +amphibians +amplifiers +amputated +amusement +anabaptist +anagram +analogue +analyticities +anaphoric +anastomoses +anatomy 
+anchorite +andalusia +andover +anecdote +anesthetic +angeleno +angered +anglia +angola +angular +animated +animism +anita +annapolis +annihilated +annotation +annoy +annually +annuls +anodes +anomalously +anselm +ant +antagonizes +antedate +anthologies +anthropomorphically +anticipates +antidotes +antimony +antiquate +antisemitism +antithetical +antonio +anxious +anyway +apathy +aphasia +apiary +apocryphal +apologist +apostolic +appall +apparently +appear +appeasement +appended +appertains +applauds +applicable +applier +appointer +apportioning +appraisers +appreciation +apprehensively +approach +appropriate +approval +approximated +april +aptly +aquifer +arabians +aramco +arbitrating +arcades +archaism +archeologist +archimedes +architectures +arcing +ardently +arequipa +argos +argument +aridity +aristocrat +arithmetize +arm +armchairs +arming +armpits +arousal +arrack +arrangement +arrears +arrhenius +arrogate +arroyo +arterial +arthritis +articulately +artifact +artillerist +arts +ascendant +ascent +ascot +ashamedly +ashtray +asiatics +asking +aspersions +aspiration +ass +assassinated +assay +assemblies +asserter +assess +assiduity +assigning +assist +associate +associator +assuaged +assure +assyrianize +asteroid +astonishingly +astride +astronomically +asymmetric +asynchronously +atheism +athletes +atlas +atomization +atonement +atrophies +attaching +attain +attempt +attendants +attentionality +attenuator +attired +attracted +attributable +attune +auburn +audibly +audiometer +auditions +auger +august +aural +auscultated +austerely +austrianize +authenticator +authorities +authors +autocorrelate +autodecrements +autoindex +automaton +autopilot +autumnal +availer +avaricious +avenues +avers +avian +avionic +avoidable +avow +awakened +awards +awfulness +awry +axiological +axioms +ayes +azure +babelizes +babying +bacchus +backbend +backfill +backorder +backscatters +backstitch +backtracks +backyard +badger +baffle +baggage +bagrodia +bailiff +baits +bakes 
+balancers +baldwin +balkanization +balks +baller +balloon +ballplayer +balsam +bamboo +bandage +bandpass +bane +bangui +bank +bankrupts +bans +baptism +baptized +barbarism +barbells +bards +barest +barhop +barks +barnhard +barometers +barr +barren +barron +barter +basalt +baseless +bash +basics +basketball +bassinets +batavia +bather +bathtub +batted +batting +battlements +baudelaire +bawling +bayonet +be +beaded +beaker +beanbag +bearded +beast +beatification +beau +beautified +beavers +becker +becomingly +bedder +bedpost +bedspread +beecham +beefy +beethoven +befell +befoul +befuddles +beggary +begotten +beguiling +behaviorism +behold +being +belay +belfry +believable +belittles +belles +belligerents +bells +belong +belting +bemoans +bendable +benedictions +beneficiary +bengal +bent +bequeath +berating +beresford +beribboned +berliners +bernardine +bernoulli +bertie +besets +besmirched +bespoke +besting +bet +betrayed +bette +between +bewail +bewilderment +bianco +bibles +bicarbonate +biconvex +bidder +biennial +bigger +biharmonic +bilabial +bilk +billet +billings +bimetallism +bind +binghamton +biochemistry +biologically +biopsy +bipeds +birdbaths +birminghamize +births +bisectors +bisques +bites +bitterly +bivalves +blabbermouths +blackburn +blackfoots +blackmailed +blacks +blaine +blamers +bland +blanketers +blares +blasphemousness +blatz +bleachers +bleating +blemishes +blessings +blindfold +blinked +blissfully +blizzard +bloch +blockers +blond +bloodiest +bloom +blossoms +blowfish +bludgeons +blueprint +bluish +blunted +blurring +blushing +boarded +boaster +boathouse +boatswain +bobbsey +bodenheim +bodybuilding +bogart +bogus +boilers +boldface +bolshevist +bolton +bombastic +bonanzas +bondsman +bonham +bontempo +bookcases +bookkeepers +bookstore +booms +booster +bootle +bootstrapping +borden +borealis +born +borrowers +bosses +botanist +bother +bottler +botulism +bounce +bounden +bouquet +boutique +bowdlerizing +bowl +bowstring +boxing +boyfriend +braced 
+bradbury +bragger +braille +brainstorm +brakes +branchings +brandishing +brashly +braun +braving +braying +brazil +bread +breadwinners +breakfast +breakthrough +breastworks +breathlessly +breeding +brennan +brevet +brewery +bribers +bricklayers +bridge +bridgework +briefed +brig +brighten +brighton +brimming +brings +bristle +britisher +broaches +broadcasts +broadly +broglie +brokenness +bronchial +brooch +brookfield +broth +browbeat +brownian +bruce +brunette +brushing +brutalized +bryce +buchwald +buckler +bucky +buddies +budgeters +buff +buffetings +bugger +bugs +built +bulging +bulldoze +bullfrog +bullying +bumbling +bumptious +bundle +bungler +bunkhouse +bunted +buoys +bureaucracy +burgher +burglarproofing +burke +burn +burnings +burntness +burrowing +bursty +busch +bushwhacked +businesslike +bustards +butchered +buttercup +butternut +buttonholes +butyrate +buzzard +bye +bypassing +bystander +byzantinizes +cabinet +cache +cactus +caesarize +cager +cajole +calais +calculate +calculus +calgary +calico +callaghan +calloused +calmingly +caltech +calypso +camembert +camino +campaigning +camps +canadianize +canceled +candidacy +candler +canine +cannery +cannon +canonical +canopy +canto +canvassed +capable +capacitors +capita +capitalizers +capping +capstone +captivates +capturer +caravans +carbondale +carbonizing +cardboard +cardiology +carefully +caressing +caricature +carlsbad +carnation +caroline +carpenters +carriages +carruthers +carter +carton +carve +cascades +cashed +casings +cassette +castes +casts +catalina +catapult +catches +categorizes +cathedral +catholicisms +cattle +cauldrons +causer +cautioner +cavalierness +cavernous +cawing +cecil +celanese +celerity +cellist +celticizes +censoring +centaur +centimeter +centralized +centroid +cerebral +certainties +certifies +cezanne +chaffey +chairing +chalices +challenging +champaign +chancellor +changeability +channeled +chanter +chapel +chapter +characterize +chargeable +charitable +charlotte +chars 
+chartings +chasing +chastisers +chattel +chauffeured +cheaply +checkbook +checkout +cheekbone +cheeriness +cheeses +chemise +cherishes +cheryl +chests +cheyennes +chicanos +chides +childhood +chill +chime +chinas +chinning +chiropractor +chit +chloroplasts +choir +choose +chopping +choreograph +chou +christenson +christianizing +christoph +chronicle +chronology +chuckles +churchgoing +churn +ciceronianize +cinderella +ciphertexts +circuitously +circulating +circumnavigates +circumstanced +circuses +cities +civet +civilized +claimed +clambers +clams +clapboard +clarifications +clash +classes +classifiers +clattered +claustrophobia +cleaned +cleansed +clearer +cleaved +clemente +clerked +cliches +cliffs +climb +clincher +clink +clippers +cloaks +clockings +clogs +closeness +closing +clothing +clouding +clowns +clucks +clumsy +clutching +coaches +coalition +coastal +coating +coaxing +cobweb +cockpit +cocoon +codes +codifies +coefficient +coexist +coffer +cogitated +cogs +cohering +coils +coinciding +colder +colicky +collaborator +collared +collecting +colleges +collins +colombia +colonies +colons +colorless +columnize +combated +combinator +combings +comedic +cometary +comforting +comma +commandment +commemorative +commended +commenting +commissioners +committeemen +commonalities +commonwealth +communicated +communists +commuting +compactors +comparably +comparison +compassion +compelling +compensatory +competitions +compilers +complaint +completed +complexities +complications +complimenting +composedly +compost +comprehensibility +compression +compromisers +compulsory +computed +comradely +concatenation +concede +conceived +concentrators +conceptualized +concerted +conciseness +concoct +concretes +concurring +condemns +conditional +condoned +conduction +confectionery +conferred +confessions +confidential +configure +confining +confiscates +confocal +confounding +confucian +confusion +congo +congregating +congresswomen +conjoined +conjuncture +connected +connector 
+connors +conquered +conrail +consecrate +consenting +conservation +conserved +considered +consistent +consolers +consonants +conspirator +constant +constituent +constitutions +constructed +constructs +consultant +consumed +consumptions +contain +contaminated +contemplative +contender +contentment +context +continents +continuations +contortions +contracting +contradicting +contraptions +contribute +contrite +controllability +controversy +convened +conventionally +conversantly +conversion +convex +convict +convinces +convoys +cooked +coolers +coon +cooperations +coordinates +copeland +copings +coprocessor +coquette +cords +corinthian +corks +cornered +corns +coronary +corporately +correct +correctness +correspond +corridors +corroborative +corruption +corvallis +cosmopolitan +costs +cots +cotyledon +coughs +councilwoman +counselors +counteracting +counterfeited +counterpart +countersunk +countrywide +couplings +courser +courtesies +courtrooms +covenant +coverlet +covetousness +cower +cowl +coypu +crackers +cradles +craftsperson +cramps +crank +cranny +crater +crawford +craze +creaked +creams +creation +credence +creditor +creeks +cremates +crescents +cretin +cricket +criminal +cripple +crisscross +criticizes +croaks +crocus +cropper +crosser +crosstalk +crowd +crowning +crucifixion +cruel +cruising +crumpled +crusade +crushes +crux +cryptic +crystallize +cubans +cucumbers +cufflink +culminate +cultivable +cultural +cummings +cupboard +curb +curing +curlers +current +currying +cursory +curtly +curving +custer +customizable +cut +cuts +cyanamid +cyclically +cygnus +cypress +cytoplasm +dabbles +dadaistic +dahl +dairy +dali +damages +damns +damsel +dancing +dangle +danize +dare +darken +darn +darted +darwinizes +database +dates +daunted +davy +daydreams +dazzled +deaden +deaf +dealings +deanna +death +debater +debilitates +debtor +debutante +decay +deceit +decelerate +decent +decidability +decimate +decision +decks +declarer +decliners +decodings +decomposition 
+decorative +decreases +decrements +dedicated +deduct +deeding +deep +deere +defeats +defendant +defenestrating +deferments +deficiencies +define +definitions +deformation +defy +degradation +deify +dejected +delaying +deleter +deliberated +delicacies +delightful +delimiting +delirious +deliveries +delphic +deluged +demand +demeter +demodulate +demons +demonstrator +demultiplex +denebola +denigrates +denominators +denoting +densest +dentists +denying +departure +dependent +depleted +deploy +depose +depositors +depreciated +deprivations +dequeued +deregulate +derive +descend +descents +descriptively +deserters +deservings +designator +desire +desolate +despatched +despite +destabilize +destroyed +destructiveness +detacher +detained +detective +deteriorated +determination +deterministic +detractor +devastate +development +deviation +devised +devotedly +devours +dexedrine +diagnose +diagonals +dial +dialogue +diamond +diarrhea +dickinson +dictatorial +diddle +dies +dietrich +differentials +differers +diffusely +digest +diggings +digits +digressing +dilapidate +diligence +dilution +dimensions +dimmed +dine +dining +diogenes +diphthong +dipper +direction +directorate +dirt +disable +disaffection +disallowing +disappearances +disapprove +disassembles +disbands +discarding +discerns +disciplines +disclosure +disconnects +discord +discouraging +discovery +discretion +discuss +disease +disfigure +disgruntle +disgustingly +dishonestly +dishwater +disjoint +disk +dislocates +dismaying +dismissers +disobedient +disown +dispatched +dispensary +dispersed +displacing +displeasing +disposition +disputer +disquieting +disruption +dissemble +dissenter +dissimilarities +dissociating +distaff +distastes +distilling +distinguish +distorts +distresses +distributivity +disturbed +ditty +divergence +diversifies +diverting +dividend +divining +divisors +dixieland +doberman +doctoral +documentaries +dodecahedra +doe +doghouse +dolan +dollies +domenico +domicile +domineering +donahue +donkey 
+dooley +doorman +dopers +doric +dortmund +doted +doubleheader +doubt +doubts +doves +downey +downloading +downstairs +doyle +draconian +drafty +dragooned +dram +drape +draughts +drawings +dreaded +dreamers +dregs +dressing +dries +driller +drip +driveway +droop +droppers +droves +drudgery +drummers +drunkly +duality +dubuque +ducts +dug +dullness +dumbly +dunbar +dungeons +duplicable +dupont +duration +durward +duster +dutchman +dwarfed +dwelt +dyeing +dynamism +dysentery +ear +earmarked +earnestness +earth +earthquakes +eases +easterner +easy +eaves +eben +echoed +ecology +economize +ecuador +edenizes +edict +edition +edmonton +educating +edwards +effecting +efficacy +effortlessness +eggshell +egyptianize +eigenstate +eighthes +eisner +ejecting +elaborately +elapses +elderly +elections +electrically +electrocute +electroencephalography +electronics +elemental +elevation +elicited +eliminating +elite +ella +ellipsoids +elmhurst +else +elucidation +ely +emancipate +embarrass +embeds +embodied +embracing +emerald +emeritus +emil +emits +emotionally +emphasizing +employable +emporium +emptily +emulator +enacted +encamping +enchanter +encircled +encoder +encounter +encouragingly +encumbered +endangers +endemic +endorse +endows +enduringly +enfeeble +enfranchise +engels +engines +englishmen +engulf +enjoin +enjoys +enlightened +enlivens +enormity +enquirer +enriches +ensembles +ensnaring +ensures +enterprise +entertainment +enticed +entitle +entreat +entrepreneurs +enumerated +enveloped +environ +envisioned +ephemeral +epicurizes +episcopalian +epitaphs +epochs +equalize +equates +equilibrate +equips +equivocally +erased +ere +ergo +erlang +erode +erratum +errs +escalates +escapes +escorts +especially +esquires +essentially +estates +estimated +eternal +ethernets +etruria +eulerian +eurasia +europeanized +evade +evaluative +evaporation +evenhandedness +events +everglades +everything +evidences +evinces +evolve +exacerbated +exactions +exaggerations +examined 
+exasperates +exceeded +excellently +exceptions +exchangeable +excision +excitingly +exclamatory +exclusiveness +excreting +excused +executional +exemplified +exempts +exertion +exhaustedly +exhibitions +exile +existentialist +exorbitant +expanders +expect +expects +expeditious +expenditure +experiencing +experiments +expires +explanations +exploit +explorations +explosive +exponentiating +exports +exposure +expressibility +expulsion +extemporaneous +extensive +exterminate +extinguished +extract +extraneous +extrapolation +extremely +exult +eyeglasses +eyesight +fables +facade +facile +facsimile +factories +faculty +fagin +failsoft +faintness +fairing +faithful +fakes +fallacious +falmouth +falsifying +familiar +families +fanaticism +fanciness +fanning +farad +farewells +farmers +farrell +fascination +fasted +fastidious +fate +fathomed +fatten +faulkner +fauna +favoring +fayette +fearlessly +feat +featherweight +fed +feebleness +feeds +feet +feline +fellowships +feminism +fencing +fermentation +ferociously +fertile +fervent +festivity +fettered +feverish +fiat +fibrously +fiddled +fief +fiendish +fifteenth +fighting +fiji +files +filled +filming +filthy +finals +finder +fines +fingerprint +finishes +finnish +fireboat +firemen +firewall +firming +fiscally +fishes +fissured +fitly +fitzpatrick +fixation +fixture +flagged +flak +flamer +flanking +flash +flask +flattered +flaunting +flawlessly +fledglings +fleetly +flemishing +flew +flicking +flinches +flirt +floated +flood +floors +florentine +flossing +flourished +floweriness +fluctuate +fluffier +fluoresce +fluting +flying +focal +foes +fogy +folders +folksy +folsom +font +fooled +football +footing +forage +forbes +forcer +forearms +forefathers +foreign +foreseeable +forestalls +forever +forge +forgettable +forgiving +forlornly +formalized +formatively +formicas +formulated +forsaken +forthwith +fortiori +fortuitously +forwarder +fought +foundation +founds +foursome +foxes +fragile +fragrantly +framing +francie 
+francoise +franking +fraser +fray +freckle +frederico +freeing +frees +freezing +frenchizes +frequented +freshened +freshness +freudianism +friction +friendlier +friezes +frightful +frisia +frivolity +from +fronts +frothing +fruehauf +fruits +fuchsia +fujitsu +full +fumbling +functionally +fundamentally +fungal +funnier +furlong +furniture +furthermore +fuses +futuristic +gabled +gadgetry +gaging +gaines +galactic +galaxy +gallantly +gallon +gallstone +gambled +games +gangplank +gaped +garbed +gardner +garlanded +garrisoned +gaseous +gaspee +gastric +gathered +gauche +gaunt +gawky +gaze +gearing +gelatin +gemma +generality +generals +generic +genetic +genre +gentler +geodesic +geological +geophysical +geraldine +germane +germinates +gestapo +getting +ghosted +gibraltar +gig +giggle +gilds +gilt +ginghams +gipsy +girlish +giver +gladdest +glance +glaring +glazed +gleaner +glenda +glimmer +glints +gloat +gloria +glorying +glove +glowing +glynn +gnu +goats +goblins +godmother +goethe +goldenly +goldstine +gondola +goode +goodyear +goren +gorton +got +gothicizing +gouging +government +grab +gracefully +gradations +gradual +graft +grained +grams +grandfather +grandpa +grant +granulates +graphical +graspable +grassiest +gratification +gratuitously +graves +grayed +grease +grecianize +greeks +greenfeld +greens +greeter +grenades +greyest +grievances +griffith +grimes +grinds +gripped +gritty +grocers +grooved +grossest +groton +group +grovels +growling +grubs +grumbling +guano +guardedly +gubernatorial +guest +guideline +guiltier +guises +gullah +gumming +gunner +gurgle +gustafson +guts +guyer +gymnastics +haas +habitual +hacks +hag +hail +hairier +hale +hallmark +halpern +halve +hamburgers +hammering +hampshire +handbooks +handicap +handkerchiefs +handshake +handy +hangman +hannah +hansel +haplessly +happily +harbinger +harder +hardships +harken +harmfulness +harmoniously +harnessing +harriman +harry +harvardize +harveys +hassle +hat +hatefully +hattie +hauler +hausa 
+havoc +hawthorne +haywood +head +headlands +headroom +heals +healy +hearings +heartily +heater +heaved +heaviness +hebrides +hedgehog +heeds +hegelianizes +heights +heiresses +helicopter +hellenized +helmet +helpfully +hem +hemp +hendrick +henrietta +heralds +herder +hereford +hereunder +hermit +heroically +herring +hertzog +hesperus +heterogenous +heuser +hexagon +hibernate +hidden +hierarchic +highfield +highnesses +hikes +hillcrest +hilt +hindered +hindustan +hinting +hired +his +histograms +hitch +hither +hitting +hoarseness +hobby +hoe +hoists +holds +hollandaise +hollowness +holocaust +homage +homeomorphism +homespun +homing +homosexual +honesty +honeymooning +honoraries +hoodlum +hooker +hoosierize +hooves +hopelessness +horace +horn +horrible +horrors +horseshoer +hospitalize +hostesses +hotly +hounding +houseflies +housetop +hover +howled +hubert +huey +hugo +humanities +humbling +humidifiers +humiliation +humorers +humpty +hung +hungry +huntley +hurling +hurrying +husbands +husks +hutchins +hyde +hygiene +hyphenate +hypotheses +hysterical +ibsen +icicle +icosahedron +idealize +identical +identify +idiosyncrasy +idles +ignite +ignores +illegality +illogical +illusions +illustrative +imagen +imagine +imbrium +immaterial +immensely +immigrating +immovability +impacted +impale +impatiently +impedes +impenetrable +imperfectly +impermanent +impersonations +impinges +implementable +implicants +implied +important +imposes +impotence +impractically +impressible +impressment +imprisonments +improvement +improvisers +impulsion +inaccessible +inadequate +inane +inaudible +inca +incas +incessantly +incidentally +inciting +incloses +inclusiveness +incomparable +incompletely +incongruity +inconsistent +inconvenient +incorrectness +incredulous +incubate +incurable +indecisive +indent +indescribable +indexing +indication +indifference +indignation +indirectly +indistinct +individually +indoctrinating +indubitable +inductances +inducts +industrialist +industry +inelegant 
+inertly +inexact +inexplicable +infantry +infection +inferior +infertile +infinite +infirmary +inflated +inflicting +inform +informatively +infrequently +infuriating +ingeniousness +ingratiate +inhabited +inherently +inheritress +inhibitor +inimical +initialized +initiating +injection +injured +inker +inlet +inner +innocuousness +inoculate +inquire +inquisitive +inscribed +insecurely +insertion +insidiousness +insinuated +insistently +insomnia +inspiration +installation +instances +instantiations +instill +institutes +instruct +instructs +instruments +insulation +insurance +insurrection +integrand +intellect +intelligible +intensification +intensively +inter +intercept +interchanged +intercommunicates +intercourse +interested +interfered +intergroup +interleaved +interminable +intermodule +internationality +interpersonal +interposed +interpreting +interrelations +interrupt +intersecting +interstate +intervening +interwoven +intimation +intolerance +intractability +intraoffice +intrigued +introductions +intruder +intubation +invaders +invalidities +invariants +inventively +inverses +inverting +investigative +inveterate +invites +invoked +involves +ionians +ira +irately +irishman +ironic +irrational +irregular +irrepressible +irreversibility +irritable +irving +isfahan +island +isolated +isomorphisms +issuance +it +italicize +itemizations +iteration +ito +izvestia +jackets +jacky +jacobus +jailer +jamaican +janet +janus +jargon +jauntiness +jay +jeanne +jeffersonian +jennifer +jeremy +jeroboam +jest +jesuitizing +jeweled +jews +jingled +joaquin +joes +john +joiner +jokers +jolts +jordan +josephus +jotting +journals +jousting +joyous +judaica +judge +judith +jugoslavia +julie +jump +junctures +juniper +juras +jury +justifiers +jutland +kaddish +kamikazes +kant +karp +katowice +keeling +keepers +kemp +kenney +kepler +kerouac +key +keypad +khrushchevs +kidde +kidney +kilimanjaro +kills +kilojoule +kimono +kindling +kingpin +kinnickinnic +kirchoff +kisses +kiting +klein 
+knapsacks +kneel +knickerbockers +knights +knocked +knots +knowledge +knuckles +kodachrome +korea +krakatoa +kronecker +kurd +labeling +laborer +labyrinths +lacerta +lacks +ladies +lagoon +laidlaw +lamarck +lament +lamp +lanced +landings +lands +lange +languish +laos +lapse +largely +larson +lashing +laszlo +later +latinity +latitudes +laudable +laughlin +laundered +laurels +lavender +lawfully +lawsuit +layers +lazarus +leaded +leafed +leaguers +leander +leaping +leary +leathern +leaving +lectures +leeds +leftists +legalization +leger +legislated +legitimate +leila +lemon +lends +leniency +lens +leonardo +lesbian +lesson +letter +levee +levelly +levin +lewdly +lexington +libelous +liberated +libido +license +lick +lied +lifeboat +lifetime +ligget +lighthearted +like +likeness +lilian +liman +limit +limits +lind +lindy +linearly +lingerie +lining +linnaeus +lioness +liquid +lise +listened +listings +literalness +lithuania +littering +live +livers +lizzie +loaf +loathing +lobster +locally +locator +lockian +lockwood +lodges +logarithm +logically +logs +loiters +londonization +loners +longings +lookers +looms +looseleaf +loosing +lords +lorry +lossiest +lotus +louisa +lourdes +lovelace +loves +lowest +loyally +lucerne +luckier +ludicrous +luke +luminously +lunch +lunged +luring +lust +luther +luxuriantly +lyle +lynx +mac +macdonald +maces +machinery +mackey +macromolecule +madden +madhya +madsen +magellanic +magill +magnetizable +magnify +maguire +maids +mailman +mainframes +maintained +majesty +maker +malady +malcolm +malformed +maliciousness +malone +malton +manage +managing +mandatory +manger +manhole +manicuring +manila +manipulative +mann +manors +mantissa +manufactured +mao +maps +marched +mardis +margo +marinade +maritime +marketability +markings +marmalade +marriott +marshaling +martial +martyr +marvels +mascara +maskable +masonite +massacred +mast +masterpiece +masturbation +matchless +materializing +mathematically +matings +matrix +mattered +matured 
+mauricio +maxima +maxims +maybe +mayoral +mccabe +mccluskey +mcdonnell +mcgovern +mckee +mclean +mcpherson +mealtime +meaningful +meant +measurements +mechanically +medal +medfield +mediations +medicine +meditating +mediums +meeting +megahertz +meister +melcher +melodies +melpomene +membership +memoranda +memorizes +menagerie +mendelizes +mennonite +mentalities +mentor +mercenariness +mercilessly +merged +meritorious +merrill +mesh +messenger +messy +metallization +metaphysical +meteoritic +methodically +methods +metro +mews +mica +mick +microbicide +microeconomics +micron +microprocessing +microscope +microvaxes +middleman +midnight +midstream +midwinter +migrate +mikoyan +mileage +milk +mill +millikan +millionth +millstones +miltonized +minaret +mindfully +mineral +mini +minima +minimizes +ministries +minor +minstrels +minute +miracle +miriam +miscarriage +misconception +miserably +misgivings +misled +misplacing +missing +missoula +mistake +mistletoe +misunderstand +mitch +mitres +mixtures +moats +mocked +modally +moderated +modernizer +modicum +modifying +modularizing +module +moghul +moines +moldavia +moles +mollusk +momentarily +monaco +monday +mongolian +monkeyed +monocotyledon +monolithic +monostable +monroe +montague +montgomery +monument +mooned +moor +moped +morass +morehouse +morn +morphological +morsels +mortgage +mosaic +mosque +motels +motherland +motionlessness +motley +motorized +mound +mountainously +mourners +mousy +movable +moving +muck +muddled +muffin +mugs +mullah +multicomputer +multiple +multiplicand +multiplies +multistage +mumbles +mundane +munitions +murdering +murmurs +muscovy +mushroomed +musicians +muskrat +mussorgsky +mutability +mutations +mutilating +mutters +mycenaean +mysterious +mythologies +nagasaki +nair +nakedness +names +nanook +napkin +narcotic +narrowest +nash +natal +nationalities +nations +naturalist +naughtiness +navel +navona +neanderthal +nears +nebula +necessitation +neckties +needled +needy +negatives +negligible 
+negroid +neighboring +neoclassic +nero +nesting +nets +neural +neutral +neva +newburyport +newman +newsman +next +nibelung +nicholls +nickname +nielson +nightfall +nihilism +nimbler +nineties +nipponizes +nobility +nocturnally +noel +nolan +nominee +nonconservative +nondeterminism +nongovernmental +nonlinearity +nonorthogonal +nonsegmented +nonterminals +nook +nordhoff +normalization +normanizations +north +northernly +norwalk +nostradamus +notarizes +note +noticeable +notifies +nottingham +novak +novices +nuances +nuclide +nullary +number +numerable +numismatic +nursing +nutritious +nyquist +oases +obedient +obfuscate +objectively +obliged +obliterating +obscene +observable +observers +obsoletes +obstruction +obviated +occasional +occidentalize +occlusions +occupied +occurs +octagonal +octets +oddly +odious +o'dwyer +offended +offer +officer +officiously +oft +oilcloth +ojibwa +oldenburg +oleomargarine +olivia +olympus +ominousness +omnipresent +o'neill +onlooker +onus +opaquely +openings +operate +operator +oppenheimer +oppose +oppressed +opthalmic +optimist +optimizing +opts +oranges +orbital +orchestral +order +ordinarily +ores +organization +organs +orientalized +orifices +origination +orleans +ornate +orr +orville +oscillates +o'shea +osteopath +othello +otto +ounces +outburst +outdoor +outgrowing +outlawing +outlive +outperforms +outrages +outstanding +outvoting +outwitting +overboard +overcrowds +overestimates +overhangs +overjoyed +overload +overnighter +overproduction +overrunning +overshadowing +oversized +overtake +overtly +overuse +overworking +owen +ownership +oxidized +ozzie +pacification +packaged +packers +padding +pageant +paginating +painful +painting +pajamas +pale +palestine +palliative +palomar +panacea +pandemic +panels +panned +pantheist +panty +paperers +par +parades +paragon +parallel +parallels +parameterize +paramus +paraphrases +parcel +pardoned +parentheses +pares +parisian +parkers +parlay +parody +parrots +parsifal +partakes 
+participant +particular +partitioned +partridges +passageway +passion +passport +pasteur +pasture +patchy +patents +pathogenesis +patients +patricians +patrolling +patrons +patterning +paula +paulus +pavement +pawn +payer +payoffs +peacefully +peaks +pearl +peat +peculiar +pedant +pediatrician +peeling +peering +peking +pembroke +pence +pends +penetration +peninsulas +pennsylvania +pentagon +peopled +peppery +perceived +percents +perchance +perennially +perfectness +performs +perihelion +periodically +perishable +perkins +permeating +permit +pernicious +perpetration +perpetuation +persecuting +perseveres +persist +personal +personified +perspiration +persuasions +perturb +peruses +pervasive +pester +peters +petri +petting +phaedra +phaser +phenomenological +philco +philistinizes +philosophies +phoenicia +phoning +phosphorus +photogenic +photos +phyla +physicist +pi +pick +picketing +pickman +picojoule +picturing +piedfort +pies +piggybacked +pigtail +pilferage +pillar +pilots +pincushion +pining +pinnacle +pinscher +pioneers +pipelining +pirate +pistols +pitching +pithiness +pitiless +pituitary +pixels +placement +plagiarist +plainfield +plaintiveness +planeload +planets +planoconcave +plantings +plasticity +plates +platoon +playboy +playing +playwrights +pleasant +pleat +plenary +pliant +plots +plows +pluggable +plume +plundered +plunging +plutonium +pocahontas +pod +poetical +poincare +pointy +poisons +polaris +police +polish +politer +polka +polluted +polymer +pomerania +pompousness +ponds +pool +pop +popping +popularized +populous +pores +port +portending +portico +portray +posed +position +posits +possessive +possums +posteriori +postmasters +postscript +pot +potentates +potion +pottery +pounces +pourer +poverty +powerful +practicable +practitioners +praise +prancer +prayer +preallocated +precariously +precedents +preciously +precipitation +precludes +preconception +predating +predetermination +predication +predictive +predominately +preemptive +prefacing 
+prefers +preinitializes +preliminary +premise +preoccupied +prepared +preposterously +prerogatives +prescriptions +presentations +preserved +presidential +pressings +preston +presumptuousness +pretending +pretexts +prevailing +preventing +previously +pricers +prides +primarily +priming +princesses +principles +prior +prisoners +privations +prizes +probate +probings +procedure +processing +proclamation +procreate +procurer +produce +productive +profession +proffered +profitability +profound +program +progresses +prohibitions +projections +proletariat +prolong +prominent +promoter +promptest +promulgation +pronouncement +proofs +propane +properly +prophesy +proportionately +proposer +propounded +prorate +prosecutes +prosodic +prospector +prostate +protecting +protege +protestations +protons +protozoan +prouder +provenance +providence +provision +provokes +proximal +pruned +prussianize +pseudoinstruction +psychiatrist +psychologically +psychosomatic +pub +publicly +puckered +puffed +puller +pulls +pulse +pumpkin +punctually +punishable +punt +puppeteer +purchases +purges +purina +purpler +purposed +purse +pursuing +pushdown +putnam +puzzlement +pyongyang +pythagoreanizes +quadrangle +quadrennial +quagmires +quakeress +qualified +qualm +quantifiers +quantize +quarreled +quartering +quasar +quavering +queerer +queried +questionable +questions +quibble +quicklime +quieting +quince +quit +quivers +quonset +quotient +rabin +rachmaninoff +racketeers +radiance +radiators +radiography +rae +rages +raider +railroaded +rainbow +rains +rake +ralston +ramifications +rams +rand +randy +rangy +rankings +ransomer +rap +rapids +rapturous +rascally +rasping +ratfor +ration +rationalizes +rattler +ravager +ravens +rawlins +rays +reach +reacted +reactivation +reader +readjusted +realigned +realizable +realm +reaped +rear +rearrest +reasonings +reassigned +reawakened +rebellions +rebooting +rebuffed +rebutted +recalibrated +recapitulates +receded +receives +receptive +recife 
+reciprocating +recitations +reckoned +reclaiming +reclining +recognize +recollect +recommend +recompiles +reconciliation +reconnect +reconstituted +recorder +recover +recreating +recta +recur +recursing +red +redeclared +redefined +redevelopment +redisplayed +redness +redressing +reducibly +reeds +reelects +reenforcement +reestablishing +reexamining +references +referral +refine +reflecting +reflexes +reformatory +reformulated +refrained +refreshment +refugee +refuted +regally +regenerating +regimentation +regis +regressed +regrettable +regularly +regulators +rehearsing +reimbursable +reined +reinhold +reinstated +reintroduces +reiteration +rejoiced +relabeled +relating +relatives +relaxes +relegate +relents +relic +relieving +relinquishing +reloader +reluctance +remains +remedied +remind +reminiscently +remodels +remotely +removing +renames +rendezvous +renewable +renouncing +rented +reopen +reorganize +repairman +repaying +repeatedly +repentance +repetitious +replaceable +replays +replicate +report +reposing +representably +representing +reprieved +reproach +reproducibilities +reprograms +republics +repulses +reputed +required +requisitions +rescind +researchers +resemblances +resentment +reservoir +resident +resignation +resistance +resistors +resolver +resorting +respect +respective +responded +responsible +restarts +restful +restorations +restrainers +restrictive +resultant +resuming +resurrectors +retailing +retaliatory +retentiveness +retina +retiring +retracting +retransmission +retribution +retriever +retrospection +retype +reuniting +revamping +reveler +revere +reverifies +reverses +reviewer +reviser +revival +revoked +revolution +revolvers +rewinding +rewriting +rhesus +rhode +rhyming +ribbons +richard +richmond +rico +ride +ridiculed +rifle +rigging +rightfulness +rigor +rims +ringings +riordan +ripely +rippling +risk +ritually +river +rivulet +roadsters +roaring +robberies +roberta +robinsonville +rochester +rocket +rockwell +rods +roll +romance 
+romanizes +romper +roofing +rooming +root +roping +rosebush +rosetta +rot +rotations +rotund +roughness +rounding +roused +routes +roving +rowley +royalty +rubbing +rubles +rudeness +ruffian +ruggedness +rule +rumanians +rummy +runaway +runoff +rupturing +russell +rustic +rustlers +ruthlessness +sabbathize +sachs +sacrifice +sacrosanct +saddles +safari +safes +sagebrush +said +sails +salable +salerno +saline +sally +salters +salutations +salvages +same +sampling +sanatorium +sanctioning +sandburg +sandra +sanest +sanskrit +sapling +saran +sari +satchel +satires +satisfy +saturnalia +saud +savaged +saver +savored +sawfish +saxonize +sayings +scala +scaling +scampers +scanners +scapegoat +scared +scatter +scenic +schantz +schelling +schemers +schmitt +scholastic +schoolhouses +schroeder +schuylkill +scissor +scoffs +scope +scoreboard +scorner +scotchgard +scottsdale +scouted +scrambled +scrapes +scratching +screamers +screenings +scribbled +scripts +scrumptious +scuffle +sculptured +scythe +seagate +seam +seaquarium +searchlight +seasonable +seat +seceded +secondary +secretarial +secretive +sections +securings +sedition +see +seedy +seeming +seer +segmentations +segundo +seizures +selectman +selfishly +sells +semantics +semiconductor +semipermanently +senate +seneca +sense +sensing +sensual +sentimentally +separately +sept +sequencers +sequentially +serene +serializable +serif +serra +service +servings +sets +settler +seventeens +severance +severs +sex +sexual +shackled +shadiness +shaffer +shakers +shale +shameful +shanghaied +shapeless +shard +shares +sharpening +shattering +shawano +shearing +sheds +sheets +shelley +shelves +sheridan +shied +shiftier +shilling +shiner +shintoizes +shipper +shirk +shiver +shocker +shoehorn +shooter +shoppers +shortage +shortens +shorts +shouldered +shoved +showed +shows +shrewd +shrilled +shrinking +shrugs +shuffled +shutoff +shuttles +siberia +sicken +sideband +sides +sidings +sierra +sighed +sigma +signature +signification 
+sikkim +silent +silken +sills +silverman +simile +simon +simplicities +simplistic +simulation +sinbad +sinews +singed +singlet +singularly +sinner +sioux +sirens +sisyphus +sittings +siva +sixties +skate +skeptical +sketchpad +skidding +skillfulness +skims +skipped +skirmishes +skulked +sky +skyrockets +slacks +slang +slash +slaughter +slavic +slavonicizes +sledgehammer +sleepless +sleighs +sliced +slide +slightly +slings +slips +slogans +sloppiness +slotting +slower +sluggishness +slums +smacked +smalltime +smasher +smell +smiles +smithsonian +smoked +smoldered +smoothing +smug +smythe +snap +snapshots +snatched +sneakiest +sneers +sniffs +snodgrass +snorkel +snowbelt +snows +snuffs +soak +soared +sobers +socialists +sociological +socks +sofas +softly +sojourn +soldier +solenoid +solid +solids +solos +solvent +somber +somerset +son +sonny +soothe +sophistication +sordid +sorest +sorrows +soul +soundness +soured +southbound +southland +soviets +spacer +spaded +spaniardization +spanked +spare +sparked +sparsely +spat +spawned +speakers +specialists +specialty +specified +speckle +spectators +spectrography +speculates +speechless +speeds +spellings +spent +spica +spies +spilt +spinner +spirally +spirituals +spiting +spleen +splicing +splits +spoiling +sponged +sponsorship +spoolers +spores +sportswriter +spotter +sprague +spraying +sprees +springiness +sprint +sprouted +spurn +sputtered +squadrons +squarer +squatting +squeaky +squeezing +squirmed +stab +stabilizes +stacked +staffing +stagers +stags +staircases +stalemate +stalling +stammer +stampeding +standard +standings +stans +star +stargate +starring +startles +state +statewide +stationmaster +statues +statutorily +staves +steadier +stealer +steamer +steele +steeper +steered +stem +stenographer +stepmother +stereoscopic +sterilizer +stethoscope +stew +stickier +stiffens +stigma +stillest +stimulate +stinging +stipends +stirrer +stitching +stockholder +stodgy +stomacher +stood +stopgap +storage +storeyed 
+stormiest +stouter +strafe +straightened +strained +stranding +strangler +stratagem +stratifies +strawberry +streamer +streetcar +strengths +stretched +strict +strife +stringed +stringy +striptease +strode +strolling +strontium +struggle +stuart +stucco +studious +stuffs +stun +stupid +sturm +styli +styx +subcomponents +subdirectory +subdues +subgraph +subjective +sublime +submerges +submode +subordinate +subprogram +subschema +subscripting +subsequent +subsidies +subsistent +substantially +substation +substrate +subsystem +subtleness +subtraction +subunits +subverting +successful +succinctness +suckers +sudden +sufferance +sufficiency +suffocated +sugarings +suggests +suitably +suits +sulks +sultan +summands +summation +summon +sumter +sunder +sunken +sunshine +supercomputers +supergroups +superior +supernatural +supersede +supervise +suppers +supplementing +support +suppose +suppressing +surely +surge +surly +surpass +surprisingly +surround +surveyors +sus +suspended +suspicions +sutherland +swab +swallowing +swan +swaps +swat +sweat +swedes +sweepstakes +sweetest +swellings +swifter +swimsuit +swipe +switchboards +swivel +swords +sykes +sylvia +symbolize +symmetry +sympathy +synagogue +synchronizes +syndication +synonymously +synthesize +syrian +systematically +tabernacle +tablespoonful +tabulate +tacit +tactic +tail +taipei +tale +talker +tallchief +talmudizations +taming +tanaka +tangle +tantalizing +taos +tapestry +tar +tariffs +tasked +tastefully +tattered +taunts +taverns +taxicabs +taylor +teaches +tearful +teaspoonful +technique +tedious +teenaged +tegucigalpa +telegraph +teleologically +telephony +televise +teller +temperance +tempestuous +temporaries +temptingly +tendency +tenex +tense +tent +tenure +terminating +termwise +terre +terrify +terrorize +testable +testifiers +tex +textile +thailand +thankless +thaw +theatrically +theme +theology +theorization +therapies +thereof +thermometer +thessaly +thickly +thimbles +thinking +thirsted +thistle +thorns 
+those +thousand +thread +threatens +thrift +thrived +throne +throughout +thrusters +thumbed +thunderer +thus +tiburon +tickles +tidied +tier +tighteners +tilde +tillich +timbered +timeouts +timetables +timonizes +tingling +tinkled +tint +tipperary +tirelessly +titan +titter +toasts +togetherness +toilets +tolerable +toleration +tomatoes +ton +tonic +tool +toothpaste +topmost +topsy +tormenting +torrent +torturing +tossed +totallers +touchable +tough +tourist +toweling +towns +toys +tracked +tractor +trader +trafficked +trailed +trainer +tramp +trances +transceivers +transcribers +transferal +transformable +transgressed +transistorized +transitively +translation +transmittal +transparent +transponder +transpose +trapezoidal +trauma +traversal +trays +treasure +treating +treetop +tremor +trespassed +triangles +tribunals +tricked +tricky +triggered +trilled +trimming +tripled +triumphal +trivially +troopers +trotsky +troubleshoots +trowels +trucking +truest +trumped +trunk +trustingly +try +tubs +tuft +tumbled +tunable +tunisia +turbulent +turkize +turning +turtle +tutankhamen +tuttle +twenty +twiner +twirling +twitching +tyler +typewriters +typing +tyranny +ugh +ulcers +umbrage +unacceptably +unaided +unanimous +unattainability +unaware +unblocked +uncancelled +unchanging +unclosed +unconditional +uncontrollable +uncountable +undecidable +underbrush +underflows +underlies +undermine +underplays +understandings +undertakings +underwrites +undirected +undone +uneasy +unequaled +uneventful +unfairness +unfit +unformatted +ungratefully +unharmed +unidirectionality +uniformity +unindented +uninterrupted +unions +unitarian +unity +universities +unkindness +unleashed +unlinking +unlucky +unmerciful +unnecessarily +unobtainable +unpaid +unprecedented +unprovable +unraveling +unrecognized +unrestrained +unsafely +unselected +unskilled +unsteady +unsynchronized +untie +untoward +unused +unwholesome +unwinds +unwrap +updater +upholder +upland +uprightly +upsets +upturns +urge 
+urinates +ursuline +usages +usenix +usually +utilization +utopianizes +utters +vacation +vacuumed +vagrantly +valence +valiant +validness +valuably +valves +vandenberg +vanished +vanquishing +variably +varies +varying +vastly +vaudois +vax +veering +vegetated +vehicular +velasquez +venetian +venomous +ventricles +venus +verbalized +verdure +verifier +vern +versatile +vertebrates +vested +veterinary +via +vibrations +viciousness +victimizers +victories +vidal +vier +viewing +vignettes +vilification +villages +vinci +vineyard +violator +violins +virgo +virus +visible +visited +visualize +vitally +vladimir +vocations +voided +volition +voltages +volunteering +voted +vouching +voyaged +vulgarly +waco +waffles +wagnerian +wailing +waiter +waives +waking +walgreen +wallenstein +walls +waltham +wandered +waning +wants +ward +warehousing +warmer +warning +warranted +warsaw +washburn +wasps +watch +watchman +watering +watery +wausau +wavelength +waxers +we +weaknesses +wear +wearisomely +weatherford +webs +wedlock +weekly +weidman +weights +weissmuller +welder +wellesley +wenches +wesleyan +westhampton +wet +whacked +wharves +wheel +whelp +wherever +whims +whippany +whirling +whiskers +whistled +whitehorse +whitens +whitlock +whittling +wholeness +whoop +wichita +widen +widowed +wielding +wilbur +wile +wilkinson +william +willis +wilshire +wince +winding +winehead +wining +winnie +winsett +wiped +wiretappers +wised +wishful +witches +withdrew +withholds +witnessing +woefully +womanhood +wonderfulness +woo +wooden +woodstock +woofer +woonsocket +words +workbooks +workman +worldliness +worrier +worshiper +worthless +wounding +wrapper +wreathes +wrenched +wretch +wring +writ +writing +wrote +wyner +xeroxed +yamaha +yard +yawner +years +yellowed +yelped +yesterdays +yoknapatawpha +yorkshire +youngsters +youthfulness +yukon +zeal +zen +zeus +zionism +zoned +zoroaster diff --git a/third_party/rust/mapped_hyph/tests/compound.hyf b/third_party/rust/mapped_hyph/tests/compound.hyf 
new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/third_party/rust/mapped_hyph/tests/compound4.hyf b/third_party/rust/mapped_hyph/tests/compound4.hyf new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/third_party/rust/mapped_hyph/tests/compound5.hyf b/third_party/rust/mapped_hyph/tests/compound5.hyf new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/third_party/rust/mapped_hyph/tests/compound6.hyf b/third_party/rust/mapped_hyph/tests/compound6.hyf new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/third_party/rust/mapped_hyph/tests/hyphen.hyf b/third_party/rust/mapped_hyph/tests/hyphen.hyf new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/third_party/rust/mapped_hyph/tests/lhmin.hyf b/third_party/rust/mapped_hyph/tests/lhmin.hyf new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/third_party/rust/mapped_hyph/tests/num.hyf b/third_party/rust/mapped_hyph/tests/num.hyf new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/third_party/rust/mapped_hyph/tests/rhmin.hyf b/third_party/rust/mapped_hyph/tests/rhmin.hyf new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/third_party/rust/mapped_hyph/tests/settings2.hyf b/third_party/rust/mapped_hyph/tests/settings2.hyf new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/third_party/rust/mapped_hyph/tests/settings3.hyf b/third_party/rust/mapped_hyph/tests/settings3.hyf new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/third_party/rust/mapped_hyph/tests/test.rs b/third_party/rust/mapped_hyph/tests/test.rs new file mode 100644 index 000000000000..95eae86f67e2 --- /dev/null +++ b/third_party/rust/mapped_hyph/tests/test.rs @@ -0,0 +1,169 @@ +// Any copyright to the test code below is dedicated to the Public Domain. 
+// http://creativecommons.org/publicdomain/zero/1.0/ + +use mapped_hyph::Hyphenator; + +#[test] +fn basic_tests() { + let dic_path = "hyph_en_US.hyf"; + let dic = match unsafe { mapped_hyph::load_file(dic_path) } { + Some(dic) => dic, + _ => panic!("failed to load dictionary {}", dic_path), + }; + let hyph = Hyphenator::new(&*dic); + assert_eq!(hyph.hyphenate_word("haha", '-'), "haha"); + assert_eq!(hyph.hyphenate_word("hahaha", '-'), "ha-haha"); + assert_eq!(hyph.hyphenate_word("photo", '-'), "photo"); + assert_eq!(hyph.hyphenate_word("photograph", '-'), "pho-to-graph"); + assert_eq!(hyph.hyphenate_word("photographer", '-'), "pho-tog-ra-pher"); + assert_eq!(hyph.hyphenate_word("photographic", '-'), "pho-to-graphic"); + assert_eq!(hyph.hyphenate_word("photographical", '-'), "pho-to-graph-i-cal"); + assert_eq!(hyph.hyphenate_word("photographically", '-'), "pho-to-graph-i-cally"); + assert_eq!(hyph.hyphenate_word("supercalifragilisticexpialidocious", '-'), "su-per-cal-ifrag-ilis-tic-ex-pi-ali-do-cious"); +} + +// Testcases adapted from tests included with libhyphen. +// (Using only the UTF-8 dictionaries/tests, and omitting those that require +// the extended hyphenation algorithm.) + +#[test] +fn base() { + let dic_path = "tests/base.hyf"; + let dic = match unsafe { mapped_hyph::load_file(dic_path) } { + Some(dic) => dic, + _ => panic!("failed to load dictionary {}", dic_path), + }; + let hyph = Hyphenator::new(&*dic); + use std::fs::File; + use std::io::{BufRead,BufReader}; + let words: Vec = { + let file = File::open("tests/base.word").unwrap(); + BufReader::new(file).lines().map(|l| l.unwrap()).collect() + }; + let hyphs: Vec = { + let file = File::open("tests/base.hyph").unwrap(); + BufReader::new(file).lines().map(|l| l.unwrap()).collect() + }; + for i in 0 .. 
words.len() { + assert_eq!(hyph.hyphenate_word(&words[i], '='), hyphs[i]); + } +} + +#[test] +fn compound() { + let dic_path = "tests/compound.hyf"; + let dic = match unsafe { mapped_hyph::load_file(dic_path) } { + Some(dic) => dic, + _ => panic!("failed to load dictionary {}", dic_path), + }; + let hyph = Hyphenator::new(&*dic); + assert_eq!(hyph.hyphenate_word("motorcycle", '-'), "mo-tor-cy-cle"); +} + +#[test] +fn compound4() { + let dic_path = "tests/compound4.hyf"; + let dic = match unsafe { mapped_hyph::load_file(dic_path) } { + Some(dic) => dic, + _ => panic!("failed to load dictionary {}", dic_path), + }; + let hyph = Hyphenator::new(&*dic); + assert_eq!(hyph.hyphenate_word("motorcycle", '-'), "motor-cycle"); +} + +#[test] +fn compound5() { + let dic_path = "tests/compound5.hyf"; + let dic = match unsafe { mapped_hyph::load_file(dic_path) } { + Some(dic) => dic, + _ => panic!("failed to load dictionary {}", dic_path), + }; + let hyph = Hyphenator::new(&*dic); + assert_eq!(hyph.hyphenate_word("postea", '-'), "post-e-a"); +} + +#[test] +fn compound6() { + let dic_path = "tests/compound6.hyf"; + let dic = match unsafe { mapped_hyph::load_file(dic_path) } { + Some(dic) => dic, + _ => panic!("failed to load dictionary {}", dic_path), + }; + let hyph = Hyphenator::new(&*dic); + assert_eq!(hyph.hyphenate_word("meaque", '-'), "me-a-que"); +} + +#[test] +fn settings2() { + let dic_path = "tests/settings2.hyf"; + let dic = match unsafe { mapped_hyph::load_file(dic_path) } { + Some(dic) => dic, + _ => panic!("failed to load dictionary {}", dic_path), + }; + let hyph = Hyphenator::new(&*dic); + assert_eq!(hyph.hyphenate_word("őőőőőőő", '='), "ő=ő=ő=ő=ő=ő=ő"); +} + +#[test] +fn settings3() { + let dic_path = "tests/settings3.hyf"; + let dic = match unsafe { mapped_hyph::load_file(dic_path) } { + Some(dic) => dic, + _ => panic!("failed to load dictionary {}", dic_path), + }; + let hyph = Hyphenator::new(&*dic); + assert_eq!(hyph.hyphenate_word("őőőőőőő", '='), 
"őő=ő=ő=ő=őő"); +} + +#[test] +fn hyphen() { + let dic_path = "tests/hyphen.hyf"; + let dic = match unsafe { mapped_hyph::load_file(dic_path) } { + Some(dic) => dic, + _ => panic!("failed to load dictionary {}", dic_path), + }; + let hyph = Hyphenator::new(&*dic); + assert_eq!(hyph.hyphenate_word("foobar'foobar-foobar’foobar", '='), "foobar'foobar-foobar’foobar"); +} + +#[test] +fn lhmin() { + let dic_path = "tests/lhmin.hyf"; + let dic = match unsafe { mapped_hyph::load_file(dic_path) } { + Some(dic) => dic, + _ => panic!("failed to load dictionary {}", dic_path), + }; + let hyph = Hyphenator::new(&*dic); + assert_eq!(hyph.hyphenate_word("miért", '='), "mi=ért"); +} + +#[test] +fn rhmin() { + let dic_path = "tests/rhmin.hyf"; + let dic = match unsafe { mapped_hyph::load_file(dic_path) } { + Some(dic) => dic, + _ => panic!("failed to load dictionary {}", dic_path), + }; + let hyph = Hyphenator::new(&*dic); + assert_eq!(hyph.hyphenate_word("övéit", '='), "övéit"); + assert_eq!(hyph.hyphenate_word("అంగడిధర", '='), "అం=గ=డిధర"); +} + +#[test] +fn num() { + let dic_path = "tests/num.hyf"; + let dic = match unsafe { mapped_hyph::load_file(dic_path) } { + Some(dic) => dic, + _ => panic!("failed to load dictionary {}", dic_path), + }; + let hyph = Hyphenator::new(&*dic); + assert_eq!(hyph.hyphenate_word("foobar", '='), "foobar"); + assert_eq!(hyph.hyphenate_word("foobarfoobar", '='), "foobar=foobar"); + assert_eq!(hyph.hyphenate_word("barfoobarfoo", '='), "barfoo=barfoo"); + assert_eq!(hyph.hyphenate_word("123foobarfoobar", '='), "123foobar=foobar"); + assert_eq!(hyph.hyphenate_word("foobarfoobar123", '='), "foobar=foobar123"); + assert_eq!(hyph.hyphenate_word("123foobarfoobar123", '='), "123foobar=foobar123"); + assert_eq!(hyph.hyphenate_word("123barfoobarfoo", '='), "123barfoo=barfoo"); + assert_eq!(hyph.hyphenate_word("barfoobarfoo123", '='), "barfoo=barfoo123"); + assert_eq!(hyph.hyphenate_word("123barfoobarfoo123", '='), "123barfoo=barfoo123"); +} diff --git 
a/toolkit/library/rust/shared/Cargo.toml b/toolkit/library/rust/shared/Cargo.toml index 49e84653d0a3..709edc20a373 100644 --- a/toolkit/library/rust/shared/Cargo.toml +++ b/toolkit/library/rust/shared/Cargo.toml @@ -44,6 +44,7 @@ audio_thread_priority = "0.20.2" mdns_service = { path="../../../../media/mtransport/mdns_service", optional = true } neqo_glue = { path = "../../../../netwerk/socket/neqo_glue" } rlbox_lucet_sandbox = { version = "0.1.0", optional = true } +mapped_hyph = { git = "https://github.com/jfkthame/mapped_hyph.git", tag = "v0.3.0" } [build-dependencies] rustc_version = "0.2" diff --git a/toolkit/library/rust/shared/lib.rs b/toolkit/library/rust/shared/lib.rs index 72925e957444..f57876296d6b 100644 --- a/toolkit/library/rust/shared/lib.rs +++ b/toolkit/library/rust/shared/lib.rs @@ -6,6 +6,7 @@ extern crate geckoservo; +extern crate mapped_hyph; extern crate kvstore; extern crate mp4parse_capi; extern crate nsstring;