Bug 1590167 - Add Rust implementation of hyphenation (mapped_hyph) and hook up in place of libhyphen. r=heycam

Differential Revision: https://phabricator.services.mozilla.com/D49967

--HG--
extra : moz-landing-system : lando
This commit is contained in:
Jonathan Kew 2019-11-13 22:11:22 +00:00
parent 74f8ebeed2
commit 7bb39d9e94
92 changed files with 11456 additions and 3262 deletions

View File

@ -17,6 +17,11 @@ git = "https://github.com/mozilla/neqo"
replace-with = "vendored-sources"
rev = "a17c1e83"
[source."https://github.com/jfkthame/mapped_hyph.git"]
git = "https://github.com/jfkthame/mapped_hyph.git"
replace-with = "vendored-sources"
tag = "v0.3.0"
[source."https://github.com/hsivonen/packed_simd"]
branch = "rust_1_32"
git = "https://github.com/hsivonen/packed_simd"

13
Cargo.lock generated
View File

@ -1261,6 +1261,7 @@ dependencies = [
"kvstore 0.1.0",
"lmdb-rkv-sys 0.9.5 (registry+https://github.com/rust-lang/crates.io-index)",
"log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)",
"mapped_hyph 0.3.0 (git+https://github.com/jfkthame/mapped_hyph.git?tag=v0.3.0)",
"mdns_service 0.1.0",
"mozurl 0.0.1",
"mp4parse_capi 0.11.2",
@ -1800,6 +1801,15 @@ dependencies = [
"synstructure 0.10.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "mapped_hyph"
version = "0.3.0"
source = "git+https://github.com/jfkthame/mapped_hyph.git?tag=v0.3.0#3b5fffbe17e8cdcc6814886a9b9170fde3db13bd"
dependencies = [
"arrayref 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)",
"memmap 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "marionette"
version = "0.1.0"
@ -2889,7 +2899,7 @@ dependencies = [
"byteorder 1.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
"digest 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)",
"murmurhash3 0.0.5 (registry+https://github.com/rust-lang/crates.io-index)",
"rand 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)",
"rand 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
@ -4283,6 +4293,7 @@ dependencies = [
"checksum lzw 0.10.0 (registry+https://github.com/rust-lang/crates.io-index)" = "7d947cbb889ed21c2a84be6ffbaebf5b4e0f4340638cba0444907e38b56be084"
"checksum mach 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "b823e83b2affd8f40a9ee8c29dbc56404c1e34cd2710921f2801e2cf29527afa"
"checksum malloc_size_of_derive 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "35adee9ed962cf7d07d62cb58bc45029f3227f5b5b86246caa8632f06c187bc3"
"checksum mapped_hyph 0.3.0 (git+https://github.com/jfkthame/mapped_hyph.git?tag=v0.3.0)" = "<none>"
"checksum matches 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "100aabe6b8ff4e4a7e32c1c13523379802df0772b82466207ac25b013f193376"
"checksum memchr 2.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "2efc7bc57c883d4a4d6e3246905283d8dae951bb3bd32f49d6ef297f546e1c39"
"checksum memmap 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "6585fd95e7bb50d6cc31e20d4cf9afb4e2ba16c5846fc76793f11218da9c475b"

View File

@ -1,46 +0,0 @@
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
/*
* To enable us to load hyphenation dictionaries from arbitrary resource URIs,
* not just through file paths using stdio, we override the (few) stdio APIs
* that hyphen.c uses and provide our own reimplementation that calls Gecko
* i/o methods.
*/
#include <stdio.h> /* ensure stdio.h is loaded before our macros */
#undef FILE
#define FILE hnjFile
#define fopen(path, mode) hnjFopen(path, mode)
#define fclose(file) hnjFclose(file)
#define fgets(buf, count, file) hnjFgets(buf, count, file)
#define feof(file) hnjFeof(file)
#define fgetc(file) hnjFgetc(file)
typedef struct hnjFile_ hnjFile;
#ifdef __cplusplus
extern "C" {
#endif
void* hnj_malloc(size_t size);
void* hnj_realloc(void* ptr, size_t size);
void hnj_free(void* ptr);
hnjFile* hnjFopen(const char* aURISpec, const char* aMode);
int hnjFclose(hnjFile* f);
char* hnjFgets(char* s, int n, hnjFile* f);
int hnjFeof(hnjFile* f);
int hnjFgetc(hnjFile* f);
#ifdef __cplusplus
}
#endif

View File

@ -1,133 +0,0 @@
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
// This file provides substitutes for the basic stdio routines used by hyphen.c
// to read its dictionary files. We #define the stdio names to these versions
// in hnjalloc.h, so that we can use nsIURI and nsIInputStream to specify and
// access the dictionary resources.
#include "hnjalloc.h"
#undef FILE // Undo #defines from hnjalloc.h before #including other headers
#undef fopen
#undef fclose
#undef fgets
#undef feof
#undef fgetc
#include "nsNetUtil.h"
#include "nsIInputStream.h"
#include "nsIURI.h"
#include "nsContentUtils.h"
#define BUFSIZE 1024
// State for one open dictionary resource; this is what the FILE macro in
// hnjalloc.h maps to for the code that includes that header.
struct hnjFile_ {
// Stream the dictionary bytes are read from (assigned in hnjFopen).
nsCOMPtr<nsIInputStream> mStream;
// Read buffer, refilled by hnjFgetc up to BUFSIZE bytes at a time.
char mBuffer[BUFSIZE];
// Index in mBuffer of the next byte hnjFgetc will hand out.
uint32_t mCurPos;
// Number of valid bytes currently held in mBuffer.
uint32_t mLimit;
// Set by hnjFgetc once a refill yields no data (end of stream or read error).
bool mEOF;
};
// Substitute for fopen(): opens the resource identified by the URI spec
// |aURISpec| and wraps its input stream in a freshly allocated hnjFile.
// Only read access is supported. Returns nullptr if the URI cannot be
// parsed, or if the channel cannot be created or opened.
hnjFile* hnjFopen(const char* aURISpec, const char* aMode) {
  // The only mode this override has to handle is "r".
  NS_ASSERTION(!strcmp(aMode, "r"), "unsupported fopen() mode in hnjFopen");

  nsCOMPtr<nsIURI> dictURI;
  if (NS_FAILED(NS_NewURI(getter_AddRefs(dictURI), aURISpec))) {
    return nullptr;
  }

  nsCOMPtr<nsIChannel> dictChannel;
  if (NS_FAILED(NS_NewChannel(getter_AddRefs(dictChannel), dictURI,
                              nsContentUtils::GetSystemPrincipal(),
                              nsILoadInfo::SEC_ALLOW_CROSS_ORIGIN_DATA_IS_NULL,
                              nsIContentPolicy::TYPE_OTHER))) {
    return nullptr;
  }

  nsCOMPtr<nsIInputStream> input;
  if (NS_FAILED(dictChannel->Open(getter_AddRefs(input)))) {
    return nullptr;
  }

  // Start with an empty buffer; the first hnjFgetc() call will fill it.
  hnjFile* handle = new hnjFile;
  handle->mStream = input;
  handle->mCurPos = 0;
  handle->mLimit = 0;
  handle->mEOF = false;
  return handle;
}
// Substitute for fclose(): closes the underlying stream and destroys the
// hnjFile. Returns 0 on success, or EOF if closing the stream failed; the
// hnjFile is deleted in either case.
int hnjFclose(hnjFile* f) {
  NS_ASSERTION(f && f->mStream, "bad argument to hnjFclose");

  const bool closeFailed = NS_FAILED(f->mStream->Close());

  // Drop our stream reference before freeing the wrapper itself.
  f->mStream = nullptr;
  delete f;

  return closeFailed ? EOF : 0;
}
// Substitute for fgetc(): returns the next byte of the stream as an
// unsigned char converted to int (0..255), or EOF once no more data can be
// read. A failed read is treated the same as end-of-file.
int hnjFgetc(hnjFile* f) {
  if (f->mCurPos >= f->mLimit) {
    // Buffer exhausted: refill it from the stream.
    f->mCurPos = 0;
    nsresult rv = f->mStream->Read(f->mBuffer, BUFSIZE, &f->mLimit);
    if (NS_FAILED(rv)) {
      f->mLimit = 0;
    }
    if (f->mLimit == 0) {
      f->mEOF = true;
      return EOF;
    }
  }
  // mBuffer holds plain |char|, which may be signed. Go through unsigned
  // char so that bytes >= 0x80 are not returned as negative values; in
  // particular a 0xFF data byte must not be mistaken for EOF by callers.
  return static_cast<unsigned char>(f->mBuffer[f->mCurPos++]);
}
// Substitute for fgets(): copies at most |n| - 1 bytes from |f| into |s|,
// stopping after the first '\n' or '\r' (which is kept in the output), and
// null-terminates the result. Returns |s|, or nullptr if nothing could be
// read. Not a complete fgets() reimplementation, but enough for libhyphen.
char* hnjFgets(char* s, int n, hnjFile* f) {
  NS_ASSERTION(s && f, "bad argument to hnjFgets");

  int count = 0;
  for (int c; count < n - 1 && (c = hnjFgetc(f)) != EOF;) {
    s[count++] = c;
    if (c == '\n' || c == '\r') {
      break;  // end of line reached
    }
  }

  if (!count) {
    return nullptr;  // end of file
  }

  s[count] = '\0';  // terminate the string we hand back
  return s;
}
// Substitute for feof(): returns EOF once hnjFgetc() has flagged the end of
// the stream on |f|, and 0 otherwise.
int hnjFeof(hnjFile* f) {
  if (f->mEOF) {
    return EOF;
  }
  return 0;
}

View File

@ -14,16 +14,18 @@ UNIFIED_SOURCES += [
'nsHyphenator.cpp',
]
# These files cannot be built in unified mode because they include hnjalloc.h.
SOURCES += [
'hnjstdio.cpp',
]
LOCAL_INCLUDES += [
'../hyphen',
]
FINAL_LIBRARY = 'xul'
if CONFIG['CC_TYPE'] in ('clang', 'gcc'):
CXXFLAGS += ['-Wno-error=shadow']
if CONFIG['COMPILE_ENVIRONMENT']:
GENERATED_FILES += [
'mapped_hyph.h'
]
generated = GENERATED_FILES['mapped_hyph.h']
generated.script = '/layout/style/RunCbindgen.py:generate'
generated.inputs = [
'/third_party/rust/mapped_hyph'
]

View File

@ -37,8 +37,7 @@ static const char kMemoryPressureNotification[] = "memory-pressure";
static const char kParentShuttingDownNotification[] = "profile-before-change";
static const char kChildShuttingDownNotification[] = "content-child-shutdown";
class HyphenReporter final : public nsIMemoryReporter,
public CountingAllocatorBase<HyphenReporter> {
class HyphenReporter final : public nsIMemoryReporter {
private:
~HyphenReporter() = default;
@ -47,14 +46,19 @@ class HyphenReporter final : public nsIMemoryReporter,
// For telemetry, we report the memory rounded up to the nearest KB.
static uint32_t MemoryAllocatedInKB() {
return (MemoryAllocated() + 1023) / 1024;
size_t total = 0;
if (nsHyphenationManager::Instance()) {
total = nsHyphenationManager::Instance()->SizeOfIncludingThis(
moz_malloc_size_of);
}
return (total + 1023) / 1024;
}
NS_IMETHOD CollectReports(nsIHandleReportCallback* aHandleReport,
nsISupports* aData, bool aAnonymize) override {
size_t total = MemoryAllocated();
size_t total = 0;
if (nsHyphenationManager::Instance()) {
total += nsHyphenationManager::Instance()->SizeOfIncludingThis(
total = nsHyphenationManager::Instance()->SizeOfIncludingThis(
moz_malloc_size_of);
}
MOZ_COLLECT_REPORT("explicit/hyphenation", KIND_HEAP, UNITS_BYTES, total,
@ -65,30 +69,6 @@ class HyphenReporter final : public nsIMemoryReporter,
NS_IMPL_ISUPPORTS(HyphenReporter, nsIMemoryReporter)
template <>
CountingAllocatorBase<HyphenReporter>::AmountType
CountingAllocatorBase<HyphenReporter>::sAmount(0);
/**
* Allocation wrappers to track the amount of memory allocated by libhyphen.
* Note that libhyphen assumes its malloc/realloc functions are infallible!
*/
extern "C" {
void* hnj_malloc(size_t aSize);
void* hnj_realloc(void* aPtr, size_t aSize);
void hnj_free(void* aPtr);
};
// malloc() wrapper for libhyphen: forwards to HyphenReporter's infallible
// counting allocator.
void* hnj_malloc(size_t aSize) {
return HyphenReporter::InfallibleCountingMalloc(aSize);
}
// realloc() wrapper for libhyphen: forwards to HyphenReporter's infallible
// counting reallocator.
void* hnj_realloc(void* aPtr, size_t aSize) {
return HyphenReporter::InfallibleCountingRealloc(aPtr, aSize);
}
// free() wrapper for libhyphen: releases aPtr through
// HyphenReporter::CountingFree.
void hnj_free(void* aPtr) { HyphenReporter::CountingFree(aPtr); }
nsHyphenationManager* nsHyphenationManager::sInstance = nullptr;
NS_IMPL_ISUPPORTS(nsHyphenationManager, nsIObserver)
@ -257,7 +237,7 @@ void nsHyphenationManager::LoadPatternListFromOmnijar(Omnijar::Type aType) {
}
nsZipFind* find;
zip->FindInit("hyphenation/hyph_*.dic", &find);
zip->FindInit("hyphenation/hyph_*.hyf", &find);
if (!find) {
return;
}
@ -278,7 +258,7 @@ void nsHyphenationManager::LoadPatternListFromOmnijar(Omnijar::Type aType) {
continue;
}
ToLowerCase(locale);
locale.SetLength(locale.Length() - 4); // strip ".dic"
locale.SetLength(locale.Length() - 4); // strip ".hyf"
locale.Cut(0, locale.RFindChar('/') + 1); // strip directory
if (StringBeginsWith(locale, NS_LITERAL_CSTRING("hyph_"))) {
locale.Cut(0, 5);
@ -323,13 +303,13 @@ void nsHyphenationManager::LoadPatternListFromDir(nsIFile* aDir) {
file->GetLeafName(dictName);
NS_ConvertUTF16toUTF8 locale(dictName);
ToLowerCase(locale);
if (!StringEndsWith(locale, NS_LITERAL_CSTRING(".dic"))) {
if (!StringEndsWith(locale, NS_LITERAL_CSTRING(".hyf"))) {
continue;
}
if (StringBeginsWith(locale, NS_LITERAL_CSTRING("hyph_"))) {
locale.Cut(0, 5);
}
locale.SetLength(locale.Length() - 4); // strip ".dic"
locale.SetLength(locale.Length() - 4); // strip ".hyf"
for (uint32_t i = 0; i < locale.Length(); ++i) {
if (locale[i] == '_') {
locale.Replace(i, 1, '-');
@ -383,9 +363,6 @@ size_t nsHyphenationManager::SizeOfIncludingThis(MallocSizeOf aMallocSizeOf) {
// finds it is worthwhile.
result += mHyphenators.ShallowSizeOfExcludingThis(aMallocSizeOf);
for (auto i = mHyphenators.ConstIter(); !i.Done(); i.Next()) {
result += aMallocSizeOf(i.Data().get());
}
return result;
}

View File

@ -4,34 +4,147 @@
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "nsHyphenator.h"
#include "nsIFile.h"
#include "nsUTF8Utils.h"
#include "nsUnicodeProperties.h"
#include "nsIURI.h"
#include "mozilla/Telemetry.h"
#include "hyphen.h"
#include "mozilla/Telemetry.h"
#include "nsContentUtils.h"
#include "nsIChannel.h"
#include "nsIFile.h"
#include "nsIFileURL.h"
#include "nsIInputStream.h"
#include "nsIJARURI.h"
#include "nsIURI.h"
#include "nsNetUtil.h"
#include "nsUnicodeProperties.h"
#include "nsUTF8Utils.h"
#include "mapped_hyph.h"
// Looks up the entry named by |aJAR| in the omnijar archive that backs it
// and, provided the entry is stored uncompressed and is non-empty, returns a
// pointer straight into the archive's data and stores the entry size in
// |*aLength|. Returns nullptr (leaving |*aLength| untouched) in every other
// case.
//
// The caller does NOT own the returned data; it stays valid until the
// omnijar file is closed during shutdown.
static const void* GetItemPtrFromJarURI(nsIJARURI* aJAR, uint32_t* aLength) {
  // Resolve the jar: URI to the local file holding the archive.
  nsCOMPtr<nsIURI> jarFile;
  if (NS_FAILED(aJAR->GetJARFile(getter_AddRefs(jarFile)))) {
    return nullptr;
  }
  nsCOMPtr<nsIFileURL> fileUrl = do_QueryInterface(jarFile);
  if (!fileUrl) {
    return nullptr;
  }
  nsCOMPtr<nsIFile> file;
  fileUrl->GetFile(getter_AddRefs(file));
  if (!file) {
    return nullptr;
  }

  RefPtr<nsZipArchive> archive = mozilla::Omnijar::GetReader(file);
  if (!archive) {
    return nullptr;
  }

  nsCString path;
  aJAR->GetJAREntry(path);
  nsZipItem* item = archive->GetItem(path.get());
  // Only an uncompressed (Compression() == 0), non-empty item can be used
  // in place.
  if (!item || item->Compression() != 0 || !(item->Size() > 0)) {
    return nullptr;
  }

  const uint8_t* data = archive->GetData(item);
  if (!data) {
    return nullptr;
  }

  *aLength = item->Size();
  return data;
}
// Opens |aURI| through a channel and reads the whole resource into a newly
// malloc()ed buffer. On success returns the buffer and stores its size in
// |*aLength|; the caller owns the buffer and must release it with free().
// Returns nullptr (leaving |*aLength| untouched) if the channel cannot be
// created or opened, the reported size is zero or larger than 16MB,
// allocation fails, or the stream ends or fails before the reported number
// of bytes has been read.
static const void* LoadResourceFromURI(nsIURI* aURI, uint32_t* aLength) {
  nsCOMPtr<nsIChannel> channel;
  if (NS_FAILED(NS_NewChannel(getter_AddRefs(channel), aURI,
                              nsContentUtils::GetSystemPrincipal(),
                              nsILoadInfo::SEC_ALLOW_CROSS_ORIGIN_DATA_IS_NULL,
                              nsIContentPolicy::TYPE_OTHER))) {
    return nullptr;
  }
  nsCOMPtr<nsIInputStream> instream;
  if (NS_FAILED(channel->Open(getter_AddRefs(instream)))) {
    return nullptr;
  }
  // Check size, bail out if it is excessively large (the largest of the
  // hyphenation files currently shipped with Firefox is around 1MB
  // uncompressed).
  uint64_t available;
  if (NS_FAILED(instream->Available(&available)) || !available ||
      available > 16 * 1024 * 1024) {
    return nullptr;
  }
  // Safe to narrow: |available| was just bounded to at most 16MB.
  const uint32_t length = static_cast<uint32_t>(available);
  char* buffer = static_cast<char*>(malloc(length));
  if (!buffer) {
    return nullptr;
  }
  // Read() is allowed to deliver fewer bytes than requested, so keep reading
  // until the buffer is full. A failure, or a zero-byte read before we have
  // everything, means the resource is unusable.
  uint32_t totalRead = 0;
  while (totalRead < length) {
    uint32_t bytesRead = 0;
    if (NS_FAILED(instream->Read(buffer + totalRead, length - totalRead,
                                 &bytesRead)) ||
        bytesRead == 0) {
      free(buffer);
      return nullptr;
    }
    totalRead += bytesRead;
  }
  *aLength = totalRead;
  return buffer;
}
nsHyphenator::nsHyphenator(nsIURI* aURI, bool aHyphenateCapitalized)
: mDict(nullptr), mHyphenateCapitalized(aHyphenateCapitalized) {
nsCString uriSpec;
nsresult rv = aURI->GetSpec(uriSpec);
if (NS_FAILED(rv)) {
return;
}
: mDict(nullptr),
mDictSize(0),
mOwnsDict(false),
mHyphenateCapitalized(aHyphenateCapitalized) {
Telemetry::AutoTimer<Telemetry::HYPHENATION_LOAD_TIME> telemetry;
mDict = hnj_hyphen_load(uriSpec.get());
#ifdef DEBUG
if (mDict) {
printf("loaded hyphenation patterns from %s\n", uriSpec.get());
nsCOMPtr<nsIJARURI> jar = do_QueryInterface(aURI);
if (jar) {
// This gives us a raw pointer into the omnijar's data (if uncompressed);
// we do not own it and must not attempt to free it!
mDict = GetItemPtrFromJarURI(jar, &mDictSize);
if (!mDict) {
// Omnijar must be compressed: we need to decompress the item into our
// own buffer. (Currently this is the case on Android.)
// TODO: Allocate in shared memory for all content processes to use.
mDict = LoadResourceFromURI(aURI, &mDictSize);
mOwnsDict = true;
}
if (mDict) {
// Reject the resource from omnijar if it fails to validate. (If this
// happens, we will hit the MOZ_ASSERT_UNREACHABLE at the end of the
// constructor, indicating the build is broken in some way.)
if (!mapped_hyph_is_valid_hyphenator(static_cast<const uint8_t*>(mDict),
mDictSize)) {
if (mOwnsDict) {
free(const_cast<void*>(mDict));
}
mDict = nullptr;
mDictSize = 0;
}
}
} else if (mozilla::net::SchemeIsFile(aURI)) {
// Ask the Rust lib to mmap the file. In this case our mDictSize field
// remains zero; mDict is not a pointer to the raw data but an opaque
// reference to a Rust object, and can only be freed by passing it to
// mapped_hyph_free_dictionary().
nsAutoCString path;
aURI->GetFilePath(path);
mDict = mapped_hyph_load_dictionary(path.get());
}
if (!mDict) {
// This should never happen, unless someone has included an invalid
// hyphenation file that fails to load.
MOZ_ASSERT_UNREACHABLE("invalid hyphenation resource?");
}
#endif
}
nsHyphenator::~nsHyphenator() {
if (mDict != nullptr) {
hnj_hyphen_free((HyphenDict*)mDict);
mDict = nullptr;
if (mDict) {
if (mDictSize) {
if (mOwnsDict) {
free(const_cast<void*>(mDict));
}
} else {
mapped_hyph_free_dictionary((HyphDic*)mDict);
}
}
}
@ -83,13 +196,12 @@ nsresult nsHyphenator::Hyphenate(const nsAString& aString,
void nsHyphenator::HyphenateWord(const nsAString& aString, uint32_t aStart,
uint32_t aLimit, nsTArray<bool>& aHyphens) {
// Convert word from aStart and aLimit in aString to utf-8 for libhyphen,
// Convert word from aStart and aLimit in aString to utf-8 for mapped_hyph,
// lowercasing it as we go so that it will match the (lowercased) patterns
// (bug 1105644).
nsAutoCString utf8;
const char16_t* const begin = aString.BeginReading();
const char16_t* cur = begin + aStart;
const char16_t* end = begin + aLimit;
const char16_t* cur = aString.BeginReading() + aStart;
const char16_t* end = aString.BeginReading() + aLimit;
bool firstLetter = true;
while (cur < end) {
uint32_t ch = *cur++;
@ -98,10 +210,10 @@ void nsHyphenator::HyphenateWord(const nsAString& aString, uint32_t aStart,
if (cur < end && NS_IS_LOW_SURROGATE(*cur)) {
ch = SURROGATE_TO_UCS4(ch, *cur++);
} else {
ch = 0xfffd; // unpaired surrogate, treat as REPLACEMENT CHAR
return; // unpaired surrogate: bail out, don't hyphenate broken text
}
} else if (NS_IS_LOW_SURROGATE(ch)) {
ch = 0xfffd; // unpaired surrogate
return; // unpaired surrogate
}
// XXX What about language-specific casing? Consider Turkish I/i...
@ -111,15 +223,11 @@ void nsHyphenator::HyphenateWord(const nsAString& aString, uint32_t aStart,
ch = ToLowerCase(ch);
if (ch != origCh) {
if (firstLetter) {
// Avoid hyphenating capitalized words (bug 1550532) unless explicitly
// allowed by prefs for the language in use.
if (!mHyphenateCapitalized) {
return;
}
} else {
// Also never auto-hyphenate a word that has internal caps, as it may
// well be an all-caps acronym or a quirky name like iTunes.
// Avoid hyphenating capitalized words (bug 1550532) unless explicitly
// allowed by prefs for the language in use.
// Also never auto-hyphenate a word that has internal caps, as it may
// well be an all-caps acronym or a quirky name like iTunes.
if (!mHyphenateCapitalized || !firstLetter) {
return;
}
}
@ -142,31 +250,43 @@ void nsHyphenator::HyphenateWord(const nsAString& aString, uint32_t aStart,
}
}
AutoTArray<char, 200> utf8hyphens;
utf8hyphens.SetLength(utf8.Length() + 5);
char** rep = nullptr;
int* pos = nullptr;
int* cut = nullptr;
int err = hnj_hyphen_hyphenate2((HyphenDict*)mDict, utf8.BeginReading(),
utf8.Length(), utf8hyphens.Elements(),
nullptr, &rep, &pos, &cut);
if (!err) {
// Surprisingly, hnj_hyphen_hyphenate2 converts the 'hyphens' buffer
// from utf8 code unit indexing (which would match the utf8 input
// string directly) to Unicode character indexing.
// We then need to convert this to utf16 code unit offsets for Gecko.
const char* hyphPtr = utf8hyphens.Elements();
const char16_t* cur = begin + aStart;
const char16_t* end = begin + aLimit;
while (cur < end) {
if (*hyphPtr & 0x01) {
aHyphens[cur - begin] = true;
AutoTArray<uint8_t, 200> hyphenValues;
hyphenValues.SetLength(utf8.Length());
int32_t result;
if (mDictSize > 0) {
result = mapped_hyph_find_hyphen_values_raw(
static_cast<const uint8_t*>(mDict), mDictSize, utf8.BeginReading(),
utf8.Length(), hyphenValues.Elements(), hyphenValues.Length());
} else {
result = mapped_hyph_find_hyphen_values_dic(
static_cast<const HyphDic*>(mDict), utf8.BeginReading(), utf8.Length(),
hyphenValues.Elements(), hyphenValues.Length());
}
if (result > 0) {
// We need to convert UTF-8 indexing as used by the hyphenation lib into
// UTF-16 indexing of the aHyphens[] array for Gecko.
uint32_t utf16index = 0;
for (uint32_t utf8index = 0; utf8index < utf8.Length();) {
// We know utf8 is valid, so we only need to look at the first byte of
// each character to determine its length and the corresponding UTF-16
// length to add to utf16index.
const uint8_t leadByte = utf8[utf8index];
if (leadByte < 0x80) {
utf8index += 1;
} else if (leadByte < 0xE0) {
utf8index += 2;
} else if (leadByte < 0xF0) {
utf8index += 3;
} else {
utf8index += 4;
}
cur++;
if (cur < end && NS_IS_SURROGATE_PAIR(*(cur - 1), *cur)) {
cur++;
// The hyphenation value of interest is the one for the last code unit
// of the utf-8 character, and is recorded on the last code unit of the
// utf-16 character (in the case of a surrogate pair).
utf16index += leadByte >= 0xF0 ? 2 : 1;
if (utf16index > 0 && (hyphenValues[utf8index - 1] & 0x01)) {
aHyphens[aStart + utf16index - 1] = true;
}
hyphPtr++;
}
}
}

View File

@ -28,7 +28,14 @@ class nsHyphenator {
void HyphenateWord(const nsAString& aString, uint32_t aStart, uint32_t aLimit,
nsTArray<bool>& aHyphens);
void* mDict;
const void* mDict; // If mDictSize > 0, this points to a raw byte buffer
// containing the hyphenation dictionary data (in the
// memory-mapped omnijar, or owned by us if mOwnsDict);
// if mDictSize == 0, it's a HyphDic reference created
// by mapped_hyph_load_dictionary() and must be released
// by calling mapped_hyph_free_dictionary().
uint32_t mDictSize;
bool mOwnsDict;
bool mHyphenateCapitalized;
};

View File

@ -1,17 +0,0 @@
Libhnj was written by Raph Levien <raph at acm dot org>.
Original Libhnj source with OOo's patches are managed by Rene Engelhard and
Chris Halls at Debian: http://packages.debian.org/stable/libdevel/libhnj-dev
and http://packages.debian.org/unstable/source/libhnj
This subset of Libhnj was extended by
Peter Novodvorsky <nidd at alt-linux dot org> (OOo integration),
László Németh <nemeth at numbertext dot org> (non-standard and compound
hyphenation with Unicode support),
Nanning Buitenhuis <nanning at elvenkind dot com> (substrings.c)
Write bug reports to László Németh or in the bug tracker of hunspell.sf.net.
---
Please contact Raph Levien for information about licensing for
proprietary applications.

View File

@ -1,17 +0,0 @@
GPL 2.0/LGPL 2.1/MPL 1.1 tri-license
The contents of this software may be used under the terms of
the GNU General Public License Version 2 or later (the "GPL"), or
the GNU Lesser General Public License Version 2.1 or later (the "LGPL",
see COPYING.LGPL) or the Mozilla Public License Version 1.1 or later
(the "MPL", see COPYING.MPL).
The Plain TeX hyphenation tables "hyphen.tex" by Donald E. Knuth
has a non MPL/LGPL compatible license, but freely redistributable:
"Unlimited copying and redistribution of this file are permitted as long
as this file is not modified. Modifications are permitted, but only if
the resulting file is not named hyphen.tex."
Software distributed under these licenses is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the licences
for the specific language governing rights and limitations under the licenses.

View File

@ -1,515 +0,0 @@
GNU LESSER GENERAL PUBLIC LICENSE
Version 2.1, February 1999
Copyright (C) 1991, 1999 Free Software Foundation, Inc.
59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
[This is the first released version of the Lesser GPL. It also counts
as the successor of the GNU Library Public License, version 2, hence
the version number 2.1.]
Preamble
The licenses for most software are designed to take away your
freedom to share and change it. By contrast, the GNU General Public
Licenses are intended to guarantee your freedom to share and change
free software--to make sure the software is free for all its users.
This license, the Lesser General Public License, applies to some
specially designated software packages--typically libraries--of the
Free Software Foundation and other authors who decide to use it. You
can use it too, but we suggest you first think carefully about whether
this license or the ordinary General Public License is the better
strategy to use in any particular case, based on the explanations
below.
When we speak of free software, we are referring to freedom of use,
not price. Our General Public Licenses are designed to make sure that
you have the freedom to distribute copies of free software (and charge
for this service if you wish); that you receive source code or can get
it if you want it; that you can change the software and use pieces of
it in new free programs; and that you are informed that you can do
these things.
To protect your rights, we need to make restrictions that forbid
distributors to deny you these rights or to ask you to surrender these
rights. These restrictions translate to certain responsibilities for
you if you distribute copies of the library or if you modify it.
For example, if you distribute copies of the library, whether gratis
or for a fee, you must give the recipients all the rights that we gave
you. You must make sure that they, too, receive or can get the source
code. If you link other code with the library, you must provide
complete object files to the recipients, so that they can relink them
with the library after making changes to the library and recompiling
it. And you must show them these terms so they know their rights.
We protect your rights with a two-step method: (1) we copyright the
library, and (2) we offer you this license, which gives you legal
permission to copy, distribute and/or modify the library.
To protect each distributor, we want to make it very clear that
there is no warranty for the free library. Also, if the library is
modified by someone else and passed on, the recipients should know
that what they have is not the original version, so that the original
author's reputation will not be affected by problems that might be
introduced by others.
^L
Finally, software patents pose a constant threat to the existence of
any free program. We wish to make sure that a company cannot
effectively restrict the users of a free program by obtaining a
restrictive license from a patent holder. Therefore, we insist that
any patent license obtained for a version of the library must be
consistent with the full freedom of use specified in this license.
Most GNU software, including some libraries, is covered by the
ordinary GNU General Public License. This license, the GNU Lesser
General Public License, applies to certain designated libraries, and
is quite different from the ordinary General Public License. We use
this license for certain libraries in order to permit linking those
libraries into non-free programs.
When a program is linked with a library, whether statically or using
a shared library, the combination of the two is legally speaking a
combined work, a derivative of the original library. The ordinary
General Public License therefore permits such linking only if the
entire combination fits its criteria of freedom. The Lesser General
Public License permits more lax criteria for linking other code with
the library.
We call this license the "Lesser" General Public License because it
does Less to protect the user's freedom than the ordinary General
Public License. It also provides other free software developers Less
of an advantage over competing non-free programs. These disadvantages
are the reason we use the ordinary General Public License for many
libraries. However, the Lesser license provides advantages in certain
special circumstances.
For example, on rare occasions, there may be a special need to
encourage the widest possible use of a certain library, so that it
becomes
a de-facto standard. To achieve this, non-free programs must be
allowed to use the library. A more frequent case is that a free
library does the same job as widely used non-free libraries. In this
case, there is little to gain by limiting the free library to free
software only, so we use the Lesser General Public License.
In other cases, permission to use a particular library in non-free
programs enables a greater number of people to use a large body of
free software. For example, permission to use the GNU C Library in
non-free programs enables many more people to use the whole GNU
operating system, as well as its variant, the GNU/Linux operating
system.
Although the Lesser General Public License is Less protective of the
users' freedom, it does ensure that the user of a program that is
linked with the Library has the freedom and the wherewithal to run
that program using a modified version of the Library.
The precise terms and conditions for copying, distribution and
modification follow. Pay close attention to the difference between a
"work based on the library" and a "work that uses the library". The
former contains code derived from the library, whereas the latter must
be combined with the library in order to run.
^L
GNU LESSER GENERAL PUBLIC LICENSE
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
0. This License Agreement applies to any software library or other
program which contains a notice placed by the copyright holder or
other authorized party saying it may be distributed under the terms of
this Lesser General Public License (also called "this License").
Each licensee is addressed as "you".
A "library" means a collection of software functions and/or data
prepared so as to be conveniently linked with application programs
(which use some of those functions and data) to form executables.
The "Library", below, refers to any such software library or work
which has been distributed under these terms. A "work based on the
Library" means either the Library or any derivative work under
copyright law: that is to say, a work containing the Library or a
portion of it, either verbatim or with modifications and/or translated
straightforwardly into another language. (Hereinafter, translation is
included without limitation in the term "modification".)
"Source code" for a work means the preferred form of the work for
making modifications to it. For a library, complete source code means
all the source code for all modules it contains, plus any associated
interface definition files, plus the scripts used to control
compilation and installation of the library.
Activities other than copying, distribution and modification are not
covered by this License; they are outside its scope. The act of
running a program using the Library is not restricted, and output from
such a program is covered only if its contents constitute a work based
on the Library (independent of the use of the Library in a tool for
writing it). Whether that is true depends on what the Library does
and what the program that uses the Library does.
1. You may copy and distribute verbatim copies of the Library's
complete source code as you receive it, in any medium, provided that
you conspicuously and appropriately publish on each copy an
appropriate copyright notice and disclaimer of warranty; keep intact
all the notices that refer to this License and to the absence of any
warranty; and distribute a copy of this License along with the
Library.
You may charge a fee for the physical act of transferring a copy,
and you may at your option offer warranty protection in exchange for a
fee.
2. You may modify your copy or copies of the Library or any portion
of it, thus forming a work based on the Library, and copy and
distribute such modifications or work under the terms of Section 1
above, provided that you also meet all of these conditions:
a) The modified work must itself be a software library.
b) You must cause the files modified to carry prominent notices
stating that you changed the files and the date of any change.
c) You must cause the whole of the work to be licensed at no
charge to all third parties under the terms of this License.
d) If a facility in the modified Library refers to a function or a
table of data to be supplied by an application program that uses
the facility, other than as an argument passed when the facility
is invoked, then you must make a good faith effort to ensure that,
in the event an application does not supply such function or
table, the facility still operates, and performs whatever part of
its purpose remains meaningful.
(For example, a function in a library to compute square roots has
a purpose that is entirely well-defined independent of the
application. Therefore, Subsection 2d requires that any
application-supplied function or table used by this function must
be optional: if the application does not supply it, the square
root function must still compute square roots.)
These requirements apply to the modified work as a whole. If
identifiable sections of that work are not derived from the Library,
and can be reasonably considered independent and separate works in
themselves, then this License, and its terms, do not apply to those
sections when you distribute them as separate works. But when you
distribute the same sections as part of a whole which is a work based
on the Library, the distribution of the whole must be on the terms of
this License, whose permissions for other licensees extend to the
entire whole, and thus to each and every part regardless of who wrote
it.
Thus, it is not the intent of this section to claim rights or contest
your rights to work written entirely by you; rather, the intent is to
exercise the right to control the distribution of derivative or
collective works based on the Library.
In addition, mere aggregation of another work not based on the Library
with the Library (or with a work based on the Library) on a volume of
a storage or distribution medium does not bring the other work under
the scope of this License.
3. You may opt to apply the terms of the ordinary GNU General Public
License instead of this License to a given copy of the Library. To do
this, you must alter all the notices that refer to this License, so
that they refer to the ordinary GNU General Public License, version 2,
instead of to this License. (If a newer version than version 2 of the
ordinary GNU General Public License has appeared, then you can specify
that version instead if you wish.) Do not make any other change in
these notices.
^L
Once this change is made in a given copy, it is irreversible for
that copy, so the ordinary GNU General Public License applies to all
subsequent copies and derivative works made from that copy.
This option is useful when you wish to copy part of the code of
the Library into a program that is not a library.
4. You may copy and distribute the Library (or a portion or
derivative of it, under Section 2) in object code or executable form
under the terms of Sections 1 and 2 above provided that you accompany
it with the complete corresponding machine-readable source code, which
must be distributed under the terms of Sections 1 and 2 above on a
medium customarily used for software interchange.
If distribution of object code is made by offering access to copy
from a designated place, then offering equivalent access to copy the
source code from the same place satisfies the requirement to
distribute the source code, even though third parties are not
compelled to copy the source along with the object code.
5. A program that contains no derivative of any portion of the
Library, but is designed to work with the Library by being compiled or
linked with it, is called a "work that uses the Library". Such a
work, in isolation, is not a derivative work of the Library, and
therefore falls outside the scope of this License.
However, linking a "work that uses the Library" with the Library
creates an executable that is a derivative of the Library (because it
contains portions of the Library), rather than a "work that uses the
library". The executable is therefore covered by this License.
Section 6 states terms for distribution of such executables.
When a "work that uses the Library" uses material from a header file
that is part of the Library, the object code for the work may be a
derivative work of the Library even though the source code is not.
Whether this is true is especially significant if the work can be
linked without the Library, or if the work is itself a library. The
threshold for this to be true is not precisely defined by law.
If such an object file uses only numerical parameters, data
structure layouts and accessors, and small macros and small inline
functions (ten lines or less in length), then the use of the object
file is unrestricted, regardless of whether it is legally a derivative
work. (Executables containing this object code plus portions of the
Library will still fall under Section 6.)
Otherwise, if the work is a derivative of the Library, you may
distribute the object code for the work under the terms of Section 6.
Any executables containing that work also fall under Section 6,
whether or not they are linked directly with the Library itself.
^L
6. As an exception to the Sections above, you may also combine or
link a "work that uses the Library" with the Library to produce a
work containing portions of the Library, and distribute that work
under terms of your choice, provided that the terms permit
modification of the work for the customer's own use and reverse
engineering for debugging such modifications.
You must give prominent notice with each copy of the work that the
Library is used in it and that the Library and its use are covered by
this License. You must supply a copy of this License. If the work
during execution displays copyright notices, you must include the
copyright notice for the Library among them, as well as a reference
directing the user to the copy of this License. Also, you must do one
of these things:
a) Accompany the work with the complete corresponding
machine-readable source code for the Library including whatever
changes were used in the work (which must be distributed under
Sections 1 and 2 above); and, if the work is an executable linked
with the Library, with the complete machine-readable "work that
uses the Library", as object code and/or source code, so that the
user can modify the Library and then relink to produce a modified
executable containing the modified Library. (It is understood
that the user who changes the contents of definitions files in the
Library will not necessarily be able to recompile the application
to use the modified definitions.)
b) Use a suitable shared library mechanism for linking with the
Library. A suitable mechanism is one that (1) uses at run time a
copy of the library already present on the user's computer system,
rather than copying library functions into the executable, and (2)
will operate properly with a modified version of the library, if
the user installs one, as long as the modified version is
interface-compatible with the version that the work was made with.
c) Accompany the work with a written offer, valid for at
least three years, to give the same user the materials
specified in Subsection 6a, above, for a charge no more
than the cost of performing this distribution.
d) If distribution of the work is made by offering access to copy
from a designated place, offer equivalent access to copy the above
specified materials from the same place.
e) Verify that the user has already received a copy of these
materials or that you have already sent this user a copy.
For an executable, the required form of the "work that uses the
Library" must include any data and utility programs needed for
reproducing the executable from it. However, as a special exception,
the materials to be distributed need not include anything that is
normally distributed (in either source or binary form) with the major
components (compiler, kernel, and so on) of the operating system on
which the executable runs, unless that component itself accompanies
the executable.
It may happen that this requirement contradicts the license
restrictions of other proprietary libraries that do not normally
accompany the operating system. Such a contradiction means you cannot
use both them and the Library together in an executable that you
distribute.
^L
7. You may place library facilities that are a work based on the
Library side-by-side in a single library together with other library
facilities not covered by this License, and distribute such a combined
library, provided that the separate distribution of the work based on
the Library and of the other library facilities is otherwise
permitted, and provided that you do these two things:
a) Accompany the combined library with a copy of the same work
based on the Library, uncombined with any other library
facilities. This must be distributed under the terms of the
Sections above.
b) Give prominent notice with the combined library of the fact
that part of it is a work based on the Library, and explaining
where to find the accompanying uncombined form of the same work.
8. You may not copy, modify, sublicense, link with, or distribute
the Library except as expressly provided under this License. Any
attempt otherwise to copy, modify, sublicense, link with, or
distribute the Library is void, and will automatically terminate your
rights under this License. However, parties who have received copies,
or rights, from you under this License will not have their licenses
terminated so long as such parties remain in full compliance.
9. You are not required to accept this License, since you have not
signed it. However, nothing else grants you permission to modify or
distribute the Library or its derivative works. These actions are
prohibited by law if you do not accept this License. Therefore, by
modifying or distributing the Library (or any work based on the
Library), you indicate your acceptance of this License to do so, and
all its terms and conditions for copying, distributing or modifying
the Library or works based on it.
10. Each time you redistribute the Library (or any work based on the
Library), the recipient automatically receives a license from the
original licensor to copy, distribute, link with or modify the Library
subject to these terms and conditions. You may not impose any further
restrictions on the recipients' exercise of the rights granted herein.
You are not responsible for enforcing compliance by third parties with
this License.
^L
11. If, as a consequence of a court judgment or allegation of patent
infringement or for any other reason (not limited to patent issues),
conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License. If you cannot
distribute so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you
may not distribute the Library at all. For example, if a patent
license would not permit royalty-free redistribution of the Library by
all those who receive copies directly or indirectly through you, then
the only way you could satisfy both it and this License would be to
refrain entirely from distribution of the Library.
If any portion of this section is held invalid or unenforceable under
any particular circumstance, the balance of the section is intended to
apply, and the section as a whole is intended to apply in other
circumstances.
It is not the purpose of this section to induce you to infringe any
patents or other property right claims or to contest validity of any
such claims; this section has the sole purpose of protecting the
integrity of the free software distribution system which is
implemented by public license practices. Many people have made
generous contributions to the wide range of software distributed
through that system in reliance on consistent application of that
system; it is up to the author/donor to decide if he or she is willing
to distribute software through any other system and a licensee cannot
impose that choice.
This section is intended to make thoroughly clear what is believed to
be a consequence of the rest of this License.
12. If the distribution and/or use of the Library is restricted in
certain countries either by patents or by copyrighted interfaces, the
original copyright holder who places the Library under this License
may add an explicit geographical distribution limitation excluding those
countries, so that distribution is permitted only in or among
countries not thus excluded. In such case, this License incorporates
the limitation as if written in the body of this License.
13. The Free Software Foundation may publish revised and/or new
versions of the Lesser General Public License from time to time.
Such new versions will be similar in spirit to the present version,
but may differ in detail to address new problems or concerns.
Each version is given a distinguishing version number. If the Library
specifies a version number of this License which applies to it and
"any later version", you have the option of following the terms and
conditions either of that version or of any later version published by
the Free Software Foundation. If the Library does not specify a
license version number, you may choose any version ever published by
the Free Software Foundation.
^L
14. If you wish to incorporate parts of the Library into other free
programs whose distribution conditions are incompatible with these,
write to the author to ask for permission. For software which is
copyrighted by the Free Software Foundation, write to the Free
Software Foundation; we sometimes make exceptions for this. Our
decision will be guided by the two goals of preserving the free status
of all derivatives of our free software and of promoting the sharing
and reuse of software generally.
NO WARRANTY
15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
DAMAGES.
END OF TERMS AND CONDITIONS
^L
How to Apply These Terms to Your New Libraries
If you develop a new library, and you want it to be of the greatest
possible use to the public, we recommend making it free software that
everyone can redistribute and change. You can do so by permitting
redistribution under these terms (or, alternatively, under the terms
of the ordinary General Public License).
To apply these terms, attach the following notices to the library.
It is safest to attach them to the start of each source file to most
effectively convey the exclusion of warranty; and each file should
have at least the "copyright" line and a pointer to where the full
notice is found.
<one line to give the library's name and a brief idea of what it
does.>
Copyright (C) <year> <name of author>
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
Also add information on how to contact you by electronic and paper
mail.
You should also get your employer (if you work as a programmer) or your
school, if any, to sign a "copyright disclaimer" for the library, if
necessary. Here is a sample; alter the names:
Yoyodyne, Inc., hereby disclaims all copyright interest in the
library `Frob' (a library for tweaking knobs) written by James
Random Hacker.
<signature of Ty Coon>, 1 April 1990
Ty Coon, President of Vice
That's all there is to it!

View File

@ -1,470 +0,0 @@
MOZILLA PUBLIC LICENSE
Version 1.1
---------------
1. Definitions.
1.0.1. "Commercial Use" means distribution or otherwise making the
Covered Code available to a third party.
1.1. "Contributor" means each entity that creates or contributes to
the creation of Modifications.
1.2. "Contributor Version" means the combination of the Original
Code, prior Modifications used by a Contributor, and the Modifications
made by that particular Contributor.
1.3. "Covered Code" means the Original Code or Modifications or the
combination of the Original Code and Modifications, in each case
including portions thereof.
1.4. "Electronic Distribution Mechanism" means a mechanism generally
accepted in the software development community for the electronic
transfer of data.
1.5. "Executable" means Covered Code in any form other than Source
Code.
1.6. "Initial Developer" means the individual or entity identified
as the Initial Developer in the Source Code notice required by Exhibit
A.
1.7. "Larger Work" means a work which combines Covered Code or
portions thereof with code not governed by the terms of this License.
1.8. "License" means this document.
1.8.1. "Licensable" means having the right to grant, to the maximum
extent possible, whether at the time of the initial grant or
subsequently acquired, any and all of the rights conveyed herein.
1.9. "Modifications" means any addition to or deletion from the
substance or structure of either the Original Code or any previous
Modifications. When Covered Code is released as a series of files, a
Modification is:
A. Any addition to or deletion from the contents of a file
containing Original Code or previous Modifications.
B. Any new file that contains any part of the Original Code or
previous Modifications.
1.10. "Original Code" means Source Code of computer software code
which is described in the Source Code notice required by Exhibit A as
Original Code, and which, at the time of its release under this
License is not already Covered Code governed by this License.
1.10.1. "Patent Claims" means any patent claim(s), now owned or
hereafter acquired, including without limitation, method, process,
and apparatus claims, in any patent Licensable by grantor.
1.11. "Source Code" means the preferred form of the Covered Code for
making modifications to it, including all modules it contains, plus
any associated interface definition files, scripts used to control
compilation and installation of an Executable, or source code
differential comparisons against either the Original Code or another
well known, available Covered Code of the Contributor's choice. The
Source Code can be in a compressed or archival form, provided the
appropriate decompression or de-archiving software is widely available
for no charge.
1.12. "You" (or "Your") means an individual or a legal entity
exercising rights under, and complying with all of the terms of, this
License or a future version of this License issued under Section 6.1.
For legal entities, "You" includes any entity which controls, is
controlled by, or is under common control with You. For purposes of
this definition, "control" means (a) the power, direct or indirect,
to cause the direction or management of such entity, whether by
contract or otherwise, or (b) ownership of more than fifty percent
(50%) of the outstanding shares or beneficial ownership of such
entity.
2. Source Code License.
2.1. The Initial Developer Grant.
The Initial Developer hereby grants You a world-wide, royalty-free,
non-exclusive license, subject to third party intellectual property
claims:
(a) under intellectual property rights (other than patent or
trademark) Licensable by Initial Developer to use, reproduce,
modify, display, perform, sublicense and distribute the Original
Code (or portions thereof) with or without Modifications, and/or
as part of a Larger Work; and
(b) under Patents Claims infringed by the making, using or
selling of Original Code, to make, have made, use, practice,
sell, and offer for sale, and/or otherwise dispose of the
Original Code (or portions thereof).
(c) the licenses granted in this Section 2.1(a) and (b) are
effective on the date Initial Developer first distributes
Original Code under the terms of this License.
(d) Notwithstanding Section 2.1(b) above, no patent license is
granted: 1) for code that You delete from the Original Code; 2)
separate from the Original Code; or 3) for infringements caused
by: i) the modification of the Original Code or ii) the
combination of the Original Code with other software or devices.
2.2. Contributor Grant.
Subject to third party intellectual property claims, each Contributor
hereby grants You a world-wide, royalty-free, non-exclusive license
(a) under intellectual property rights (other than patent or
trademark) Licensable by Contributor, to use, reproduce, modify,
display, perform, sublicense and distribute the Modifications
created by such Contributor (or portions thereof) either on an
unmodified basis, with other Modifications, as Covered Code
and/or as part of a Larger Work; and
(b) under Patent Claims infringed by the making, using, or
selling of Modifications made by that Contributor either alone
and/or in combination with its Contributor Version (or portions
of such combination), to make, use, sell, offer for sale, have
made, and/or otherwise dispose of: 1) Modifications made by that
Contributor (or portions thereof); and 2) the combination of
Modifications made by that Contributor with its Contributor
Version (or portions of such combination).
(c) the licenses granted in Sections 2.2(a) and 2.2(b) are
effective on the date Contributor first makes Commercial Use of
the Covered Code.
(d) Notwithstanding Section 2.2(b) above, no patent license is
granted: 1) for any code that Contributor has deleted from the
Contributor Version; 2) separate from the Contributor Version;
3) for infringements caused by: i) third party modifications of
Contributor Version or ii) the combination of Modifications made
by that Contributor with other software (except as part of the
Contributor Version) or other devices; or 4) under Patent Claims
infringed by Covered Code in the absence of Modifications made by
that Contributor.
3. Distribution Obligations.
3.1. Application of License.
The Modifications which You create or to which You contribute are
governed by the terms of this License, including without limitation
Section 2.2. The Source Code version of Covered Code may be
distributed only under the terms of this License or a future version
of this License released under Section 6.1, and You must include a
copy of this License with every copy of the Source Code You
distribute. You may not offer or impose any terms on any Source Code
version that alters or restricts the applicable version of this
License or the recipients' rights hereunder. However, You may include
an additional document offering the additional rights described in
Section 3.5.
3.2. Availability of Source Code.
Any Modification which You create or to which You contribute must be
made available in Source Code form under the terms of this License
either on the same media as an Executable version or via an accepted
Electronic Distribution Mechanism to anyone to whom you made an
Executable version available; and if made available via Electronic
Distribution Mechanism, must remain available for at least twelve (12)
months after the date it initially became available, or at least six
(6) months after a subsequent version of that particular Modification
has been made available to such recipients. You are responsible for
ensuring that the Source Code version remains available even if the
Electronic Distribution Mechanism is maintained by a third party.
3.3. Description of Modifications.
You must cause all Covered Code to which You contribute to contain a
file documenting the changes You made to create that Covered Code and
the date of any change. You must include a prominent statement that
the Modification is derived, directly or indirectly, from Original
Code provided by the Initial Developer and including the name of the
Initial Developer in (a) the Source Code, and (b) in any notice in an
Executable version or related documentation in which You describe the
origin or ownership of the Covered Code.
3.4. Intellectual Property Matters
(a) Third Party Claims.
If Contributor has knowledge that a license under a third party's
intellectual property rights is required to exercise the rights
granted by such Contributor under Sections 2.1 or 2.2,
Contributor must include a text file with the Source Code
distribution titled "LEGAL" which describes the claim and the
party making the claim in sufficient detail that a recipient will
know whom to contact. If Contributor obtains such knowledge after
the Modification is made available as described in Section 3.2,
Contributor shall promptly modify the LEGAL file in all copies
Contributor makes available thereafter and shall take other steps
(such as notifying appropriate mailing lists or newsgroups)
reasonably calculated to inform those who received the Covered
Code that new knowledge has been obtained.
(b) Contributor APIs.
If Contributor's Modifications include an application programming
interface and Contributor has knowledge of patent licenses which
are reasonably necessary to implement that API, Contributor must
also include this information in the LEGAL file.
(c) Representations.
Contributor represents that, except as disclosed pursuant to
Section 3.4(a) above, Contributor believes that Contributor's
Modifications are Contributor's original creation(s) and/or
Contributor has sufficient rights to grant the rights conveyed by
this License.
3.5. Required Notices.
You must duplicate the notice in Exhibit A in each file of the Source
Code. If it is not possible to put such notice in a particular Source
Code file due to its structure, then You must include such notice in a
location (such as a relevant directory) where a user would be likely
to look for such a notice. If You created one or more Modification(s)
You may add your name as a Contributor to the notice described in
Exhibit A. You must also duplicate this License in any documentation
for the Source Code where You describe recipients' rights or ownership
rights relating to Covered Code. You may choose to offer, and to
charge a fee for, warranty, support, indemnity or liability
obligations to one or more recipients of Covered Code. However, You
may do so only on Your own behalf, and not on behalf of the Initial
Developer or any Contributor. You must make it absolutely clear than
any such warranty, support, indemnity or liability obligation is
offered by You alone, and You hereby agree to indemnify the Initial
Developer and every Contributor for any liability incurred by the
Initial Developer or such Contributor as a result of warranty,
support, indemnity or liability terms You offer.
3.6. Distribution of Executable Versions.
You may distribute Covered Code in Executable form only if the
requirements of Section 3.1-3.5 have been met for that Covered Code,
and if You include a notice stating that the Source Code version of
the Covered Code is available under the terms of this License,
including a description of how and where You have fulfilled the
obligations of Section 3.2. The notice must be conspicuously included
in any notice in an Executable version, related documentation or
collateral in which You describe recipients' rights relating to the
Covered Code. You may distribute the Executable version of Covered
Code or ownership rights under a license of Your choice, which may
contain terms different from this License, provided that You are in
compliance with the terms of this License and that the license for the
Executable version does not attempt to limit or alter the recipient's
rights in the Source Code version from the rights set forth in this
License. If You distribute the Executable version under a different
license You must make it absolutely clear that any terms which differ
from this License are offered by You alone, not by the Initial
Developer or any Contributor. You hereby agree to indemnify the
Initial Developer and every Contributor for any liability incurred by
the Initial Developer or such Contributor as a result of any such
terms You offer.
3.7. Larger Works.
You may create a Larger Work by combining Covered Code with other code
not governed by the terms of this License and distribute the Larger
Work as a single product. In such a case, You must make sure the
requirements of this License are fulfilled for the Covered Code.
4. Inability to Comply Due to Statute or Regulation.
If it is impossible for You to comply with any of the terms of this
License with respect to some or all of the Covered Code due to
statute, judicial order, or regulation then You must: (a) comply with
the terms of this License to the maximum extent possible; and (b)
describe the limitations and the code they affect. Such description
must be included in the LEGAL file described in Section 3.4 and must
be included with all distributions of the Source Code. Except to the
extent prohibited by statute or regulation, such description must be
sufficiently detailed for a recipient of ordinary skill to be able to
understand it.
5. Application of this License.
This License applies to code to which the Initial Developer has
attached the notice in Exhibit A and to related Covered Code.
6. Versions of the License.
6.1. New Versions.
Netscape Communications Corporation ("Netscape") may publish revised
and/or new versions of the License from time to time. Each version
will be given a distinguishing version number.
6.2. Effect of New Versions.
Once Covered Code has been published under a particular version of the
License, You may always continue to use it under the terms of that
version. You may also choose to use such Covered Code under the terms
of any subsequent version of the License published by Netscape. No one
other than Netscape has the right to modify the terms applicable to
Covered Code created under this License.
6.3. Derivative Works.
If You create or use a modified version of this License (which you may
only do in order to apply it to code which is not already Covered Code
governed by this License), You must (a) rename Your license so that
the phrases "Mozilla", "MOZILLAPL", "MOZPL", "Netscape",
"MPL", "NPL" or any confusingly similar phrase do not appear in your
license (except to note that your license differs from this License)
and (b) otherwise make it clear that Your version of the license
contains terms which differ from the Mozilla Public License and
Netscape Public License. (Filling in the name of the Initial
Developer, Original Code or Contributor in the notice described in
Exhibit A shall not of themselves be deemed to be modifications of
this License.)
7. DISCLAIMER OF WARRANTY.
COVERED CODE IS PROVIDED UNDER THIS LICENSE ON AN "AS IS" BASIS,
WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING,
WITHOUT LIMITATION, WARRANTIES THAT THE COVERED CODE IS FREE OF
DEFECTS, MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE OR NON-INFRINGING.
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE COVERED CODE
IS WITH YOU. SHOULD ANY COVERED CODE PROVE DEFECTIVE IN ANY RESPECT,
YOU (NOT THE INITIAL DEVELOPER OR ANY OTHER CONTRIBUTOR) ASSUME THE
COST OF ANY NECESSARY SERVICING, REPAIR OR CORRECTION. THIS DISCLAIMER
OF WARRANTY CONSTITUTES AN ESSENTIAL PART OF THIS LICENSE. NO USE OF
ANY COVERED CODE IS AUTHORIZED HEREUNDER EXCEPT UNDER THIS DISCLAIMER.
8. TERMINATION.
8.1. This License and the rights granted hereunder will terminate
automatically if You fail to comply with terms herein and fail to cure
such breach within 30 days of becoming aware of the breach. All
sublicenses to the Covered Code which are properly granted shall
survive any termination of this License. Provisions which, by their
nature, must remain in effect beyond the termination of this License
shall survive.
8.2. If You initiate litigation by asserting a patent infringement
claim (excluding declatory judgment actions) against Initial Developer
or a Contributor (the Initial Developer or Contributor against whom
You file such action is referred to as "Participant") alleging that:
(a) such Participant's Contributor Version directly or indirectly
infringes any patent, then any and all rights granted by such
Participant to You under Sections 2.1 and/or 2.2 of this License
shall, upon 60 days notice from Participant terminate prospectively,
unless if within 60 days after receipt of notice You either: (i)
agree in writing to pay Participant a mutually agreeable reasonable
royalty for Your past and future use of Modifications made by such
Participant, or (ii) withdraw Your litigation claim with respect to
the Contributor Version against such Participant. If within 60 days
of notice, a reasonable royalty and payment arrangement are not
mutually agreed upon in writing by the parties or the litigation claim
is not withdrawn, the rights granted by Participant to You under
Sections 2.1 and/or 2.2 automatically terminate at the expiration of
the 60 day notice period specified above.
(b) any software, hardware, or device, other than such Participant's
Contributor Version, directly or indirectly infringes any patent, then
any rights granted to You by such Participant under Sections 2.1(b)
and 2.2(b) are revoked effective as of the date You first made, used,
sold, distributed, or had made, Modifications made by that
Participant.
8.3. If You assert a patent infringement claim against Participant
alleging that such Participant's Contributor Version directly or
indirectly infringes any patent where such claim is resolved (such as
by license or settlement) prior to the initiation of patent
infringement litigation, then the reasonable value of the licenses
granted by such Participant under Sections 2.1 or 2.2 shall be taken
into account in determining the amount or value of any payment or
license.
8.4. In the event of termination under Sections 8.1 or 8.2 above,
all end user license agreements (excluding distributors and resellers)
which have been validly granted by You or any distributor hereunder
prior to termination shall survive termination.
9. LIMITATION OF LIABILITY.
UNDER NO CIRCUMSTANCES AND UNDER NO LEGAL THEORY, WHETHER TORT
(INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE, SHALL YOU, THE INITIAL
DEVELOPER, ANY OTHER CONTRIBUTOR, OR ANY DISTRIBUTOR OF COVERED CODE,
OR ANY SUPPLIER OF ANY OF SUCH PARTIES, BE LIABLE TO ANY PERSON FOR
ANY INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES OF ANY
CHARACTER INCLUDING, WITHOUT LIMITATION, DAMAGES FOR LOSS OF GOODWILL,
WORK STOPPAGE, COMPUTER FAILURE OR MALFUNCTION, OR ANY AND ALL OTHER
COMMERCIAL DAMAGES OR LOSSES, EVEN IF SUCH PARTY SHALL HAVE BEEN
INFORMED OF THE POSSIBILITY OF SUCH DAMAGES. THIS LIMITATION OF
LIABILITY SHALL NOT APPLY TO LIABILITY FOR DEATH OR PERSONAL INJURY
RESULTING FROM SUCH PARTY'S NEGLIGENCE TO THE EXTENT APPLICABLE LAW
PROHIBITS SUCH LIMITATION. SOME JURISDICTIONS DO NOT ALLOW THE
EXCLUSION OR LIMITATION OF INCIDENTAL OR CONSEQUENTIAL DAMAGES, SO
THIS EXCLUSION AND LIMITATION MAY NOT APPLY TO YOU.
10. U.S. GOVERNMENT END USERS.
The Covered Code is a "commercial item," as that term is defined in
48 C.F.R. 2.101 (Oct. 1995), consisting of "commercial computer
software" and "commercial computer software documentation," as such
terms are used in 48 C.F.R. 12.212 (Sept. 1995). Consistent with 48
C.F.R. 12.212 and 48 C.F.R. 227.7202-1 through 227.7202-4 (June 1995),
all U.S. Government End Users acquire Covered Code with only those
rights set forth herein.
11. MISCELLANEOUS.
This License represents the complete agreement concerning subject
matter hereof. If any provision of this License is held to be
unenforceable, such provision shall be reformed only to the extent
necessary to make it enforceable. This License shall be governed by
California law provisions (except to the extent applicable law, if
any, provides otherwise), excluding its conflict-of-law provisions.
With respect to disputes in which at least one party is a citizen of,
or an entity chartered or registered to do business in the United
States of America, any litigation relating to this License shall be
subject to the jurisdiction of the Federal Courts of the Northern
District of California, with venue lying in Santa Clara County,
California, with the losing party responsible for costs, including
without limitation, court costs and reasonable attorneys' fees and
expenses. The application of the United Nations Convention on
Contracts for the International Sale of Goods is expressly excluded.
Any law or regulation which provides that the language of a contract
shall be construed against the drafter shall not apply to this
License.
12. RESPONSIBILITY FOR CLAIMS.
As between Initial Developer and the Contributors, each party is
responsible for claims and damages arising, directly or indirectly,
out of its utilization of rights under this License and You agree to
work with Initial Developer and Contributors to distribute such
responsibility on an equitable basis. Nothing herein is intended or
shall be deemed to constitute any admission of liability.
13. MULTIPLE-LICENSED CODE.
Initial Developer may designate portions of the Covered Code as
"Multiple-Licensed". "Multiple-Licensed" means that the Initial
Developer permits you to utilize portions of the Covered Code under
Your choice of the NPL or the alternative licenses, if any, specified
by the Initial Developer in the file described in Exhibit A.
EXHIBIT A -Mozilla Public License.
``The contents of this file are subject to the Mozilla Public License
Version 1.1 (the "License"); you may not use this file except in
compliance with the License. You may obtain a copy of the License at
http://www.mozilla.org/MPL/
Software distributed under the License is distributed on an "AS IS"
basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
License for the specific language governing rights and limitations
under the License.
The Original Code is ______________________________________.
The Initial Developer of the Original Code is ________________________.
Portions created by ______________________ are Copyright (C) ______
_______________________. All Rights Reserved.
Contributor(s): ______________________________________.
Alternatively, the contents of this file may be used under the terms
of the _____ license (the "[___] License"), in which case the
provisions of [______] License are applicable instead of those
above. If you wish to allow use of your version of this file only
under the terms of the [____] License and not to allow others to use
your version of this file under the MPL, indicate your decision by
deleting the provisions above and replace them with the notice and
other provisions required by the [___] License. If you do not delete
the provisions above, a recipient may use your version of this file
under either the MPL or the [___] License."
[NOTE: The text of this Exhibit A may differ slightly from the text of
the notices in the Source Code files of the Original Code. You should
use the text of this Exhibit A rather than the text found in the
Original Code Source Code for Your Modifications.]

View File

@ -1,106 +0,0 @@
2014-09-18 Hyphen 2.8.8:
- remove last coverity warning, 0 remaining
2014-06-27 Hyphen 2.8.7:
- various clang scan-build warning fixes
2012-09-13 Hyphen 2.8.6:
- righthyphenmin fix for 3-byte or more UTF-8
multibyte characters by Steven Dickson
- fix for fdo#43931 (removing hard hyphen hyphenation for LibreOffice)
2012-07-12 Hyphen 2.8.5:
- fix short alloc
2012-06-29 Hyphen 2.8.4:
- coverity warnings
2011-10-10 Hyphen 2.8.3:
- fix NOHYPHEN
- fix unbalanced hyphenation of LibreOffice/OOo
- set default COMPOUNDHYPHENMIN=3 at hyphens and apostrophes
- fix VERBOSE in hyphen.c
- new ./example option: -n to print hyphenation vector
2011-10-07 Hyphen 2.8.2:
- fix for explicit COMPOUNDHYPHENMIN values
2011-10-06 Hyphen 2.8.1:
- force minimal lefthyphenmin and righthyphenmin values of the dictionary
(eg. righthyphenmin=3 of English dictionaries in LibreOffice/OOo,
also the original TeX hyphenation patterns are correct only with this
righthyphenmin value).
2011-10-04 Hyphen 2.8:
- Ignore leading and ending numbers (eg. manual/field based indexes
in LibreOffice/OOo)
- Fix LibreOffice/OpenOffice.org hyphenation errors at apostrophes and
hyphens, n-dashes with default NOHYPHEN separators.
Eg. *o'c=lock -> o'clock.
2010-12-01 Hyphen 2.7.1 bug fix release
2010-11-27 Hyphen 2.7 release:
- The new hyphenation problem of OpenOffice.org 3.2, related to its
modified word breaking of words with hyphen characters, can be fixed
with the new NOHYPHEN feature. Also it's possible to solve the similar old
problem with apostrophes. More information: README.compound.
- improved English dictionaries
2010-08-10 Hyphen 2.6 release:
- maintenance release, fix all warnings, tidy up
make check with VALGRIND=memcheck, etc.
2010-02-23 Hyphen 2.5 release:
- add Unicode ligature support for correct hyphenmin calculation
(ff, fi, fl, St, st are 1-character, ffi and ffl are 2-character length for
hyphenation)
- fix lefthyphenmin calculation for UTF-8 encoded input
- en_US hyphenation dictionary:
- add OpenOffice.org patch to fix apostrophe handling
- add correct hyphenation for words with Unicode f-ligatures
(NOTE: hyphenation within ligatures is not supported yet
because of an implementation problem of OpenOffice.org,
see OOo issue 71608.)
- small patches from OpenOffice.org
2008-05-01 Hyphen 2.4 release:
- compound word hyphenation support by recursive pattern matching
based on two hyphenation pattern sets, see README.compound.
Especially useful for languages with arbitrary number of compounds (Danish,
Dutch, Finnish, German, Hungarian, Icelandic, Norwegian, Swedish etc.).
- new dictionary parameters (minimal character numbers for hyph. distances):
LEFTHYPHENMIN: minimal hyphenation distance from the left end of the word
RIGHTHYPHENMIN: minimal hyphenation distance from the right end of the word
COMPOUNDLEFTHYPHENMIN: min. hyph. dist. from the left compound word boundary
COMPOUNDRIGHTHYPHENMIN: min. hyph. dist. from the right comp. word boundary
- new API function: hnj_hyphen_hyphenate3() (like hyphenate2(), but
with hyphenmin options)
en_US hyphenation patterns:
- extended hyph_en_US.dic with TugBoat hyphenation log (fix thousand
incompletely or badly hyphenated words, for example acad-e-my, acro-nym,
acryl-amide, adren-a-line, aero-space, am-phet-a-mine, anom-aly etc.)
- fixed hyph_en_US.dic: set the right default hyphenation distance of
the original TeX hyphenation patterns:
LEFTHYPHENMIN 2
RIGHTHYPHENMIN 3 (not 2!)
It is not only a typographical issue. It seems, TeX hyphenation
patterns are right only with these settings, for example,
the bad "anoma-ly" is restricted in TeX only by the default
\righthyphenmin=3 (but not restricted in OpenOffice.org, until now).
- documentation (README_hyph_en_US.dic)
- fixes for automake configuration, compiling and checking, see ChangeLog
2008-02-19: Hyphen 2.3.1 release:
- fix obsolete API function hnj_hyphen_hyphenate()

View File

@ -1,134 +0,0 @@
Hyphen - hyphenation library to use converted TeX hyphenation patterns
(C) 1998 Raph Levien
(C) 2001 ALTLinux, Moscow
(C) 2006, 2007, 2008, 2010, 2011 László Németh
This was part of libHnj library by Raph Levien.
Peter Novodvorsky from ALTLinux cut hyphenation part from libHnj
to use it in OpenOffice.org.
Compound word and non-standard hyphenation support by László Németh.
License is the original LibHnj license:
LibHnj is dual licensed under LGPL and MPL (see also README.libhnj).
Because LGPL allows GPL relicensing, COPYING contains now
LGPL/GPL/MPL tri-license for explicit Mozilla source compatibility.
Original Libhnj source with OOo's patches are managed by Rene Engelhard
and Chris Halls at Debian:
http://packages.debian.org/stable/libdevel/libhnj-dev
and http://packages.debian.org/unstable/source/libhnj
OTHER FILES
This distribution is the source of the en_US hyphenation patterns
"hyph_en_US.dic", too. See README_hyph_en_US.txt.
Source files of hyph_en_US.dic in the distribution:
hyphen.tex (en_US hyphenation patterns from plain TeX)
Source: http://tug.ctan.org/text-archive/macros/plain/base/hyphen.tex
tbhyphext.tex: hyphenation exception log from TugBoat archive
Source of the hyphenation exception list:
http://www.ctan.org/tex-archive/info/digests/tugboat/tb0hyf.tex
Generated with the hyphenex script
(http://www.ctan.org/tex-archive/info/digests/tugboat/hyphenex.sh)
sh hyphenex.sh <tb0hyf.tex >tbhyphext.tex
INSTALLATION
autoreconf -fvi
./configure
make
make install
UNIT TESTS (WITH VALGRIND DEBUGGER)
make check
VALGRIND=memcheck make check
USAGE
./example hyph_en_US.dic mywords.txt
or (under Linux)
echo example | ./example hyph_en_US.dic /dev/stdin
NOTE: In the case of Unicode encoded input, convert your words
to lowercase before hyphenation (under UTF-8 console environment):
cat mywords.txt | awk '{print tolower($0)}' >mywordslow.txt
BUILD DLL USING CROSS-COMPILATION
./configure --host i586-mingw32 --prefix=/tmp/hyphen-dll
make
make install
DEVELOPMENT
See README.hyphen for hyphenation algorithm, README.nonstandard
and doc/tb87nemeth.pdf for non-standard hyphenation,
README.compound for compound word hyphenation, and tests/*.
Description of the dictionary format:
First line contains the character encoding (ISO8859-x, UTF-8).
Possible options in the following lines:
LEFTHYPHENMIN num minimal hyphenation distance from the left word end
RIGHTHYPHENMIN num minimal hyphenation distance from the right word end
COMPOUNDLEFTHYPHENMIN num min. hyph. dist. from the left compound word boundary
COMPOUNDRIGHTHYPHENMIN num min. hyph. dist. from the right comp. word boundary
hyphenation patterns see README.* files
NEXTLEVEL separates the two compound pattern sets (see README.compound)
Default values:
Without explicit declarations, the hyphenmin fields of the dict struct
are zero, but in this case lefthyphenmin and righthyphenmin
default to 2 during hyphenation (for backward compatibility).
Comments
Use a percent sign at the beginning of a line to add comments to your
hyphenation patterns (after the character encoding in the first line):
% comment
*****************************************************************************
* Warning! Correct working of Libhnj *needs* prepared hyphenation patterns. *
For example, generating hyph_en_US.dic from "hyphen.us" TeX patterns:
perl substrings.pl hyphen.us hyph_en_US.dic ISO8859-1
or with default LEFTHYPHENMIN and RIGHTHYPHENMIN values:
perl substrings.pl hyphen.us hyph_en_US.dic ISO8859-1 2 3
perl substrings.pl hyphen.gb hyph_en_GB.dic ISO8859-1 3 3
****************************************************************************
OTHERS
Java hyphenation: Peter B. West (Folio project) implements a hyphenator with
non standard hyphenation facilities based on extended Libhnj. The HyFo module
is released in binary form as jar files and in source form as zip files.
See http://sourceforge.net/project/showfiles.php?group_id=119136
László Németh
<nemeth (at) numbertext (dot) org>

View File

@ -1,87 +0,0 @@
New option of Libhyphen 2.7: NOHYPHEN
Hyphen, apostrophe and other characters may be word boundary characters,
but they don't need (extra) hyphenation. With the NOHYPHEN option
it's possible to hyphenate the word parts correctly.
Example:
ISO8859-1
NOHYPHEN -,'
1-1
1'1
NEXTLEVEL
Description:
1-1 and 1'1 declare hyphen and apostrophe as word boundary characters,
and NOHYPHEN with the comma-separated character (or character sequence)
list forbids the (extra) hyphens at the hyphen and apostrophe characters.
Implicit NOHYPHEN declaration
Without an explicit NEXTLEVEL declaration, Hyphen 2.8 uses the
previous settings, plus in UTF-8 encoding, endash (U+2013) and
typographical apostrophe (U+2019) are NOHYPHEN characters, too.
It's possible to enlarge the hyphenation distance from these
NOHYPHEN characters by using COMPOUNDLEFTHYPHENMIN and
COMPOUNDRIGHTHYPHENMIN attributes.
Compound word hyphenation
Hyphen library supports better compound word hyphenation and special
rules of compound word hyphenation of German languages and other
languages with arbitrary number of compound words. The new options,
COMPOUNDLEFTHYPHENMIN and COMPOUNDRIGHTHYPHENMIN help to set the right
style for the hyphenation of compound words.
Algorithm
The algorithm is an extension of the original pattern based hyphenation
algorithm. It uses two hyphenation pattern sets, defined in the same
pattern file and separated by the NEXTLEVEL keyword. First pattern
set is for hyphenation only at compound word boundaries, the second one
is for hyphenation within words or word parts.
Recursive compound level hyphenation
The algorithm is recursive: every word part of a successful
first (compound) level hyphenation will be rehyphenated
by the same (first) pattern set.
Finally, when first level hyphenation is not possible, Hyphen uses
the second level hyphenation for the word or the word parts.
Word endings and word parts
Patterns for word endings (patterns with ellipses) match the
word parts, too.
Options
COMPOUNDLEFTHYPHENMIN: min. hyph. dist. from the left compound word boundary
COMPOUNDRIGHTHYPHENMIN: min. hyph. dist. from the right comp. word boundary
NEXTLEVEL: sign second level hyphenation patterns
Default hyphenmin values
Default values of COMPOUNDLEFTHYPHENMIN and COMPOUNDRIGHTHYPHENMIN are 0,
and 0 under the hyphenation, too. ("0" values of
LEFTHYPHENMIN and RIGHTHYPHENMIN mean the default "2" under the hyphenation.)
Examples
See tests/compound* test files.
Preparation of hyphenation patterns
There is no special pattern generator tool for compound hyphenation
patterns yet. It is possible to use PATGEN to generate both
pattern sets, concatenate them manually and set the requested HYPHENMIN values.
(But don't forget the preprocessing steps by substrings.pl before
concatenation.) One disadvantage of this method is that PATGEN
doesn't know about the recursive compound hyphenation of Hyphen.
László Németh
<nemeth (at) openoffice.org>

View File

@ -1,108 +0,0 @@
Brief explanation of the hyphenation algorithm herein.[1]
Raph Levien <raph@acm.org>
4 Aug 1998
The hyphenation algorithm is basically the same as Knuth's TeX
algorithm. However, the implementation is quite a bit faster.
The hyphenation files from TeX can almost be used directly. There
is a preprocessing step, however. If you don't do the preprocessing
step, you'll get bad hyphenations (i.e. a silent failure).
Start with a file such as hyphen.us. This is the TeX ushyph1.tex
file, with the exception dictionary encoded using the same rules as
the main portion of the file. Any line beginning with % is a comment.
Each other line should contain exactly one rule.
Then, do the preprocessing - "perl substrings.pl hyphen.us". The
resulting file is hyphen.mashed. It's in Perl, and it's fairly slow
(it uses brute force algorithms; about 17 seconds on a P100), but it
could probably be redone in C with clever algorithms. This would be
valuable, for example, if it were to handle user-supplied exception
dictionaries by integrating them into the rule table.[2]
Once the rules are preprocessed, loading them is quite quick -
about 200ms on a P100. It then hyphenates at about 40,000 words per
second on a P100. I haven't benchmarked it against other
implementations (both TeX and groff contain essentially the same
algorithm), but expect that it runs quite a bit faster than any of
them.
Knuth's algorithm
This section contains a brief explanation of Knuth's algorithm, in
case you missed it from the TeX books. We'll use the semi-word
"example" as our running example.
Since the beginning and end of a word are special, the algorithm is
actually run over the prepared word (prep_word in the source)
".example.". Knuth's algorithm basically just does pattern matches from
the rule set, then applies the matches. The patterns in this case that
match are "xa", "xam", "mp", and "pl". These are actually stored as
"x1a", "xam3", "4m1p", and "1p2l2". Whenever numbers appear between
the letters, they are added in. If two (or more) patterns have numbers
in the same place, the highest number wins. Here's the example:
. e x a m p l e .
x1a
x a m3
4m1p
1p2l2
-----------------
. e x1a4m3p2l2e .
Finally, hyphens are placed wherever odd numbers appear. They are,
however, suppressed after the first letter and before the last letter
of the word (TeX actually suppresses them before the next-to-last, as
well). So, it's "ex-am-ple", which is correct.
Knuth uses a trie to implement this. I.e. he stores each rule in a
trie structure. For each position in the word, he searches the trie,
searching for a match. Most patterns are short, so efficiency should
be quite good.
Theory of the algorithm
The algorithm works as a slightly modified finite state machine.
There are two kinds of transitions: those that consume one letter of
input (which work just like your regular finite state machine), and
"fallback" transitions, which don't consume any input. If no
transition matching the next letter is found, the fallback is used.
One way of looking at this is a form of compression of the transition
tables - i.e. it behaves the same as a completely vanilla state
machine in which the actual transition table of a node is made up of
the union of transition tables of the node itself, plus its fallbacks.
Each state is represented by a string. Thus, if the current state
is "am" and the next letter is "p", then the next state is "amp".
Fallback transitions go to states which chop off one or (sometimes)
more letters from the beginning. For example, if none of the
transitions from "amp" match the next letter, then it will fall back
to "mp". Similarly, if none of the transitions from "mp" match the
next letter, it will fall back to "m".
Each state is also associated with a (possibly null) "match"
string. This represents the union of all patterns which are
right-justified substrings of the match string. I.e. the pattern "mp"
is a right-justified substring of the state "amp", so its numbers get
added in. The actual calculation of this union is done by the
Perl preprocessing script, but could probably be done in C just about
as easily.
Because each state transition either consumes one input character
or shortens the state string by one character, the total number of
state transitions is linear in the length of the word.
[1] Documentations:
Franklin M. Liang: Word Hy-phen-a-tion by Com-put-er.
Stanford University, 1983. http://www.tug.org/docs/liang.
László Németh: Automatic non-standard hyphenation in OpenOffice.org,
TUGboat (27), 2006. No. 2., http://hunspell.sourceforge.net/tb87nemeth.pdf
[2] There is a C version of the pattern converter, "substrings.c",
in the distribution, written by Nanning Buitenhuis. Unfortunately,
this version does not yet handle the non-standard extension of the
algorithm.

View File

@ -1,122 +0,0 @@
Non-standard hyphenation
------------------------
Some languages use non-standard hyphenation; `discretionary'
character changes at hyphenation points. For example,
Catalan: paral·lel -> paral-lel,
Dutch: omaatje -> oma-tje,
German (before the new orthography): Schiffahrt -> Schiff-fahrt,
Hungarian: asszonnyal -> asz-szony-nyal (multiple occurrence!)
Swedish: tillata -> till-lata.
Using this extended library, you can define
non-standard hyphenation patterns. For example:
l·1l/l=l
a1atje./a=t,1,3
.schif1fahrt/ff=f,5,2
.as3szon/sz=sz,2,3
n1nyal./ny=ny,1,3
.til1lata./ll=l,3,2
or with narrow boundaries:
l·1l/l=,1,2
a1atje./a=,1,1
.schif1fahrt/ff=,5,1
.as3szon/sz=,2,1
n1nyal./ny=,1,1
.til1lata./ll=,3,1
Note: Libhnj uses patterns that have been preprocessed by substrings.pl.
Unfortunately, this conversion step can currently generate bad non-standard
patterns (non-standard -> standard pattern conversion), so using
narrow boundaries may be better for recent Libhnj. For example,
substrings.pl generates a few bad patterns from the Hungarian hyphenation
patterns, resulting in bad non-standard hyphenation in a few cases. Using narrow
boundaries solves this problem. The Java HyFo module can check for this problem.
Syntax of the non-standard hyphenation patterns
------------------------------------------------
pat1tern/change[,start,cut]
If this pattern matches the word, and this pattern wins (see README.hyphen)
in the change region of the pattern, then the pattern[start, start + cut - 1]
substring will be replaced with the "change".
For example, a German ff -> ff-f hyphenation:
f1f/ff=f
or with expansion
f1f/ff=f,1,2
will change every "ff" with "ff=f" at hyphenation.
A more real example:
% simple ff -> f-f hyphenation
f1f
% Schiffahrt -> Schiff-fahrt hyphenation
%
schif3fahrt/ff=f,5,2
Specification
- Pattern: matching patterns of the original Liang's algorithm
- patterns must contain only one hyphenation point in the change region,
marked with a one-digit odd number (1, 3, 5, 7 or 9).
This point may be at a subregion boundary: schif3fahrt/ff=,5,1
- only the greater value guarantees the win (don't mix standard and
non-standard patterns with the same value; for example,
instead of f3f and schif3fahrt/ff=f,5,2 use f3f and schif5fahrt/ff=f,5,2)
- Change: new characters.
Arbitrary character sequence. Equal sign (=) signs hyphenation points
for OpenOffice.org (like in the example). (In a possible German LaTeX
preprocessor, ff could be replaced with "ff, for a Hungarian one, ssz
with `ssz, according to the German and Hungarian Babel settings.)
- Start: starting position of the change region.
- begins with 1 (not 0): schif3fahrt/ff=f,5,2
- start dot doesn't matter: .schif3fahrt/ff=f,5,2
- numbers don't matter: .s2c2h2i2f3f2ahrt/ff=f,5,2
- In UTF-8 encoding, use Unicode character positions: össze/sz=sz,2,3
("össze" looks like "Ã¶ssze" in an ISO 8859-1 8-bit editor).
- Cut: length of the removed character sequence in the original word.
- In UTF-8 encoding, use Unicode character length: paral·1lel/l=l,5,3
("paral·1lel" looks like "paralÂ·1lel" in an ISO 8859-1 8-bit editor).
Dictionary developing
---------------------
There is no extended PatGen pattern generator for non-standard
hyphenation patterns yet.
Fortunately, non-standard hyphenation points are forbidden in the
PatGen-generated hyphenation patterns, so with a little patch,
non-standard hyphenation patterns can be developed in this case, too.
Warning: If you use UTF-8 Unicode encoding in your patterns, call
substrings.pl with UTF-8 parameter to calculate right
character positions for non-standard hyphenation:
./substrings.pl input output UTF-8
Programming
-----------
Use hyphenate2() or hyphenate3() to handle non-standard hyphenation.
See hyphen.h for the documentation of the hyphenate*() functions.
See example.c for processing the output of the hyphenate*() functions.
Warning: change characters are lower cased in the source, so you may need
case conversion of the change characters based on input word case detection.
For example, see OpenOffice.org source
(lingucomponent/source/hyphenator/altlinuxhyph/hyphen/hyphenimp.cxx).
László Németh
<nemeth (at) openoffice.org>

File diff suppressed because it is too large Load Diff

View File

@ -1,175 +0,0 @@
/* Hyphen - hyphenation library using converted TeX hyphenation patterns
*
* (C) 1998 Raph Levien
* (C) 2001 ALTLinux, Moscow
* (C) 2006, 2007, 2008 László Németh
*
* This was part of libHnj library by Raph Levien.
*
* Peter Novodvorsky from ALTLinux cut hyphenation part from libHnj
* to use it in OpenOffice.org.
*
* Non-standard and compound word hyphenation support by László Németh.
*
* License is the original LibHnj license:
*
* LibHnj is dual licensed under LGPL and MPL. Boilerplate for both
* licenses follows.
*/
/* LibHnj - a library for high quality hyphenation and justification
* Copyright (C) 1998 Raph Levien
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public
* License along with this library; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 02111-1307 USA.
*/
/*
* The contents of this file are subject to the Mozilla Public License
* Version 1.0 (the "MPL"); you may not use this file except in
* compliance with the MPL. You may obtain a copy of the MPL at
* http://www.mozilla.org/MPL/
*
* Software distributed under the MPL is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the MPL
* for the specific language governing rights and limitations under the
* MPL.
*
*/
#ifndef __HYPHEN_H__
#define __HYPHEN_H__
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
#include <stdio.h>
typedef struct _HyphenDict HyphenDict;
typedef struct _HyphenState HyphenState;
typedef struct _HyphenTrans HyphenTrans;
#define MAX_CHARS 100
#define MAX_NAME 20
/* A hyphenation dictionary: the user-tunable hyphenation limits followed by
 * the state table. Dictionaries can be chained through `nextlevel`. */
struct _HyphenDict {
/* user options */
char lhmin; /* lefthyphenmin: min. hyph. distance from the left side */
char rhmin; /* righthyphenmin: min. hyph. distance from the right side */
char clhmin; /* min. hyph. distance from the left compound boundary */
char crhmin; /* min. hyph. distance from the right compound boundary */
char * nohyphen; /* comma separated list of characters or character
sequences with forbidden hyphenation */
int nohyphenl; /* count of elements in nohyphen */
/* system variables */
/* NOTE(review): presumably the number of entries in `states` - confirm
   against hyphen.c */
int num_states;
/* NOTE(review): presumably the character-set name of the dictionary;
   the buffer holds at most MAX_NAME bytes */
char cset[MAX_NAME];
/* NOTE(review): presumably non-zero when the dictionary is UTF-8 encoded -
   confirm against hyphen.c */
int utf8;
HyphenState *states; /* array of states */
HyphenDict *nextlevel; /* next dictionary in the chain */
};
/* One state of the hyphenation table, with its outgoing transitions.
 * NOTE(review): the field meanings below are inferred from the names and from
 * the rep/pos/cut description of hnj_hyphen_hyphenate2() - confirm against
 * hyphen.c. */
struct _HyphenState {
char *match; /* presumably the hyphenation values attached to this state */
char *repl; /* presumably the replacement string for non-standard hyphenation */
signed char replindex; /* presumably the position of the replacement */
signed char replcut; /* presumably the number of characters cut by the replacement */
int fallback_state; /* presumably the state used when no transition matches */
int num_trans; /* presumably the number of entries in `trans` */
HyphenTrans *trans; /* outgoing transitions */
};
/* A single transition out of a HyphenState.
 * NOTE(review): presumably `new_state` is an index into the owning
 * dictionary's `states` array - confirm against hyphen.c. */
struct _HyphenTrans {
char ch; /* input character (byte) that selects this transition */
int new_state; /* state entered when `ch` is matched */
};
HyphenDict *hnj_hyphen_load (const char *fn);
HyphenDict *hnj_hyphen_load_file (FILE *f);
void hnj_hyphen_free (HyphenDict *dict);
/* obsolete, use the hnj_hyphen_hyphenate2() or hnj_hyphen_hyphenate3() functions */
int hnj_hyphen_hyphenate (HyphenDict *dict,
const char *word, int word_size,
char *hyphens);
/*
int hnj_hyphen_hyphenate2(): non-standard hyphenation.
(It supports Catalan, Dutch, German, Hungarian, Norwegian, Swedish
etc. orthography, see documentation.)
input data:
word: input word
word_size: byte length of the input word
hyphens: allocated character buffer (size = word_size + 5)
hyphenated_word: allocated character buffer (size ~ word_size * 2) or NULL
rep, pos, cut: pointers (point to the allocated and _zeroed_ buffers
(size=word_size) or with NULL value) or NULL
output data:
hyphens: hyphenation vector (hyphenation points signed with odd numbers)
hyphenated_word: hyphenated input word (hyphens signed with `='),
optional (NULL input)
rep: NULL (only standard hyph.), or replacements (hyphenation points
signed with `=' in replacements);
pos: NULL, or difference of the actual position and the beginning
positions of the change in input words;
cut: NULL, or counts of the removed characters of the original words
at hyphenation,
Note: rep, pos, cut are complementary arrays to the hyphens, indexed with the
character positions of the input word.
For example:
Schiffahrt -> Schiff=fahrt,
pattern: f1f/ff=f,1,2
output: rep[5]="ff=f", pos[5] = 1, cut[5] = 2
Note: hnj_hyphen_hyphenate2() can allocate rep, pos, cut (word_size
length arrays):
char ** rep = NULL;
int * pos = NULL;
int * cut = NULL;
char hyphens[MAXWORDLEN];
hnj_hyphen_hyphenate2(dict, "example", 7, hyphens, NULL, &rep, &pos, &cut);
See example in the source distribution.
*/
int hnj_hyphen_hyphenate2 (HyphenDict *dict,
const char *word, int word_size, char * hyphens,
char *hyphenated_word, char *** rep, int ** pos, int ** cut);
/* like hnj_hyphen_hyphenate2, but with hyphenmin parameters */
/* lhmin: lefthyphenmin
* rhmin: righthyphenmin
* clhmin: compoundlefthyphenmin
* crhmin: compoundrighthyphenmin
* (see documentation) */
int hnj_hyphen_hyphenate3 (HyphenDict *dict,
const char *word, int word_size, char * hyphens,
char *hyphword, char *** rep, int ** pos, int ** cut,
int lhmin, int rhmin, int clhmin, int crhmin);
#ifdef __cplusplus
}
#endif /* __cplusplus */
#endif /* __HYPHEN_H__ */

View File

@ -1,19 +0,0 @@
# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*-
# vim: set filetype=python:
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
# These files cannot be built in unified mode because they include hnjalloc.h.
SOURCES += [
'hyphen.c',
]
FINAL_LIBRARY = 'xul'
LOCAL_INCLUDES += [
'../glue',
]
# We allow warnings for third-party code that can be updated from upstream.
AllowCompilerWarnings()

View File

View File

View File

View File

View File

View File

View File

View File

View File

View File

View File

View File

View File

@ -1589,8 +1589,8 @@ izn4j
iz1no
2z1p
2z1s
.W8a8r9s8z8a9w8a.
.Warsza3w2a
.w8a8r9s8z8a9w8a.
.warsza3w2a
.d8o9z8n8a.
.do1z2na1
.n8j8e8j9s8y8m.

View File

View File

View File

View File

View File

View File

View File

View File

@ -42,7 +42,7 @@ locales = [
'tr',
'uk',
]
filename = '{locale}/hyphenation/hyph_{locale}.dic'
filename = '{locale}/hyphenation/hyph_{locale}.hyf'
FINAL_TARGET_FILES.hyphenation += [filename.format(locale=locale) for locale in locales]
# en-US is a special case: the hyf file is named like en_US.
FINAL_TARGET_FILES.hyphenation += ['en-US/hyphenation/hyph_en_US.dic']
FINAL_TARGET_FILES.hyphenation += ['en-US/hyphenation/hyph_en_US.hyf']

View File

View File

View File

View File

View File

View File

View File

View File

View File

View File

View File

View File

@ -9,7 +9,6 @@ TEST_DIRS += [
]
DIRS += [
'hyphenation/hyphen',
'hyphenation/glue',
'locale',
'locales',

View File

@ -29,7 +29,8 @@ def generate(output, cbindgen_crate_path, *in_tree_dependencies):
"--lockfile",
CARGO_LOCK,
"--crate",
_get_crate_name(cbindgen_crate_path)
_get_crate_name(cbindgen_crate_path),
"--cpp-compat"
], env=env, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout, stderr = p.communicate()

View File

@ -224,7 +224,7 @@ stage-android: make-stage-dir
$(NSINSTALL) $(topsrcdir)/mobile/android/fonts $(DEPTH)/_tests/reftest
$(NSINSTALL) $(topsrcdir)/mobile/android/fonts $(DEPTH)/_tests/testing/mochitest
$(NSINSTALL) -D $(DEPTH)/_tests/reftest/hyphenation
$(NSINSTALL) $(wildcard $(topsrcdir)/intl/locales/*/hyphenation/*.dic) $(DEPTH)/_tests/reftest/hyphenation
$(NSINSTALL) $(wildcard $(topsrcdir)/intl/locales/*/hyphenation/*.hyf) $(DEPTH)/_tests/reftest/hyphenation
ifdef MOZ_COPY_PDBS
CPP_UNIT_TEST_BINS=$(filter-out $(wildcard $(DIST)/cppunittests/*.pdb), $(wildcard $(DIST)/cppunittests/*))

View File

@ -0,0 +1 @@
{"files":{"COPYRIGHT":"4df931055b82b96e13ad475c4cee3de5afa69a54a4c611c9d7dc6252d858d9c8","Cargo.toml":"ed3016de5a5dbfb0904cd3a442fa98cb66f8b4d8c1b801bcdcba777b57abe69d","LICENSE-APACHE":"cfc7749b96f63bd31c3c42b5c471bf756814053e847c10f3eb003417bc523d30","LICENSE-MIT":"4ad721b5b6a3d39ca3e2202f403d897c4a1d42896486dd58963a81f8e64ef61d","README.md":"14cbfed88443a2e7ffb5beb788cae17e19d7329e9ef6c7ebdbd45c67751f4a06","benches/bench.rs":"ed7143e66ecf8bfb12c87d1f9344157d97696b8194de9132d061129bc80d8d52","cbindgen.toml":"07d22767e85ed64cf190038205e189a8fffea8910bbe923d04f425b36b9e9e93","doc/mapped_hyph_format.md":"2f2487cf536fe4b03db6e4b384be06744ec30b3f299519492288306a93127fbb","hyph_en_US.hyf":"6262b4c5118fe277ab4add8689d9524ca72097564652baec67a8fcd5029ec9b0","src/bin/hyf_compile.rs":"8dfcad9c6e6f27bda9eb6ac6493114fdec0187fef144d86e097ffe488d00a49c","src/builder.rs":"7d4bb46ab2e00bb1cad1de8365781102a44817f23518ca617db17c07d44f5f7e","src/ffi.rs":"bdcff084276418788f4c8a1c525d7a6fd0bce900ca1561ff0353029e1171d9f1","src/lib.rs":"0126ba46f1c30a2dea2f72dec9e9639635aaba85f4b0da7b1a6e2f52624243ed","src/main.rs":"666befeb39cb1a7dfb66c6b9218d5f7b6c4ed09dbbbc8cfff6b749a33a99ebcf","tests/base.hyf":"d8bf57c6280cfa1d357d3fdba156ce64afbd9df58e28eeb084dfe3f80972b73f","tests/base.hyph":"a3f1fab24c101701fdf21e8359685d80611ab970304e2bd89ef024768b3700c8","tests/base.word":"1136c9a421b242262661b9a65723f87a5ecf77ae38eabcea057832d036d567fd","tests/compound.hyf":"929c1ba6676e4c43bc649d0abf4275ea9e8b02bffaa5acdf704a710813a7a13c","tests/compound4.hyf":"2093287bc41ee30ff9bdbf278f1f8209cb1d1a78236b46e9060af2a881572b8e","tests/compound5.hyf":"0942a5dfbb8d0ef3a937ab9da0418abb41300357cde49f4c477a59a11b2cb6bd","tests/compound6.hyf":"ebad958c2692a5b439b31e324020ed27c42dc05bd5b8c6a6dea4669e6ccf76b4","tests/hyphen.hyf":"92b8a5c86aac6a0b9f0eb7330a057065d6985fd047e851cae47039995c682d4d","tests/lhmin.hyf":"23c886704fafee7d9c54b2478029cf69a5fa946c2f2442bd86697bca5933c88d","tests/num.hyf":"4834fabe78b5c81815434d456
2ce3322541649e1ea1edc555a498574bc8b237e","tests/rhmin.hyf":"239cb3d4d7f904abb43b57241e12cc1396e636220c3806e64666aca7ca46cc42","tests/settings2.hyf":"9fc4855e0b952a3593db1efef080b93ce7f1c6fe6798db0440e2bf0cc986ffa2","tests/settings3.hyf":"867db207b485a06e7d60ad10735c9111f10516ee3a5afd6306c683ace3454491","tests/test.rs":"5c81ae59b9384b70d9461407999dac1fde9214398876c4433fbbde9571cc1d94"},"package":null}

12
third_party/rust/mapped_hyph/COPYRIGHT vendored Normal file
View File

@ -0,0 +1,12 @@
mapped_hyph is copyright 2019 Mozilla Foundation.
Licensed under the Apache License, Version 2.0
<LICENSE-APACHE or
https://www.apache.org/licenses/LICENSE-2.0> or the MIT
license <LICENSE-MIT or https://opensource.org/licenses/MIT>,
at your option. All files in the project carrying such
notice may not be copied, modified, or distributed except
according to those terms.
Code in the subdirectories /test/ and /bench/ is dedicated
to the Public Domain.

18
third_party/rust/mapped_hyph/Cargo.toml vendored Normal file
View File

@ -0,0 +1,18 @@
[package]
name = "mapped_hyph"
description = "Hyphenation using precompiled memory-mapped tables"
version = "0.3.0"
authors = ["Jonathan Kew <jfkthame@gmail.com>"]
license = "MIT/Apache-2.0"
edition = "2018"
[dependencies]
memmap = "0.7.0"
arrayref = "0.3.5"
[dev-dependencies]
criterion = "0.3"
[[bench]]
name = "bench"
harness = false

View File

@ -0,0 +1,202 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

View File

@ -0,0 +1,25 @@
Copyright (c) 2019 Mozilla Foundation
Permission is hereby granted, free of charge, to any
person obtaining a copy of this software and associated
documentation files (the "Software"), to deal in the
Software without restriction, including without
limitation the rights to use, copy, modify, merge,
publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software
is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice
shall be included in all copies or substantial portions
of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.

75
third_party/rust/mapped_hyph/README.md vendored Normal file
View File

@ -0,0 +1,75 @@
# mapped_hyph
mapped_hyph is a reimplementation of the hyphenation algorithm from the
[libhyphen](https://github.com/hunspell/hyphen) library
that is intended to reduce the in-memory footprint of loaded
hyphenation dictionaries, especially when the same dictionary
may be in use by multiple processes.
To reduce memory footprint, mapped_hyph uses hyphenation dictionaries that are
"precompiled" into a flat, position-independent binary format that is used
directly by the runtime hyphenation functions.
Therefore, dictionaries do not have to be parsed into a dynamic structure in memory;
the files can simply be mmap'd into the address space and immediately used.
In addition, a compiled dictionary mapped into a shared-memory block
can be made available to multiple processes for no added physical memory cost.
One deliberate simplification compared to libhyphen
is that mapped_hyph only accepts UTF-8 text and hyphenation dictionaries;
legacy non-Unicode encodings are not supported.
mapped_hyph has been created primarily for use by Gecko, replacing the use of libhyphen,
and so its features (and limitations) are based on this use case.
However, it is hoped that it will also be more generally useful.
## Functionality
Currently, mapped_hyph supports only "standard" hyphenation, where spelling does not
change around the hyphenation position. At present this is the only kind of
hyphenation supported in Gecko.
The compiled hyphenation dictionary format includes provision for replacement
strings and indexes, as used by libhyphen to support non-standard hyphenations
(e.g. German "Schiffahrt" -> "Schiff-fahrt"), but the `find_hyphen_values` function
will ignore any such hyphenation positions it finds.
(None of the hyphenation dictionaries shipping with Firefox includes such rules.)
## Licensing
mapped_hyph is dual licensed under the Apache-2.0 and MIT licenses;
see the file COPYRIGHT.
## Documentation
Use `cargo doc --open` to view (admittedly brief) documentation generated from
comments in the source.
## C and C++ bindings
See the `mapped_hyph.h` header for C/C++ APIs that can be used to load hyphenation files
and to locate valid hyphenation positions in a word.
## Sample programs
See main.rs for a simple example program.
## Compiled dictionaries
The `hyf_compile` tool is used to generate `.hyf` files for mapped_hyph
from standard `.dic` (or `.pat`) files as used by libhyphen, LibreOffice, etc.
(A compiled version of the `hyph_en_US` dictionary from libhyphen is currently
included here, as it is handy for testing purposes.)
## Release Notes
### 0.2.0
* Implemented a hyphenation table compiler in the `builder` submodule,
and `hyf_compile` command-line tool.
* Moved C-callable API functions into an `ffi` submodule.
### 0.1.0
* Initial release.

View File

@ -0,0 +1,50 @@
// Any copyright to the test code below is dedicated to the Public Domain.
// http://creativecommons.org/publicdomain/zero/1.0/
use criterion::black_box;
use criterion::criterion_group;
use criterion::criterion_main;
use criterion::BenchmarkId;
use criterion::Criterion;
use mapped_hyph::Hyphenator;
use std::fs;
const SAMPLE_SIZE: usize = 300;
const DIC_PATH: &str = "hyph_en_US.hyf";
/// Benchmarks loading the compiled dictionary at `DIC_PATH` and constructing
/// a `Hyphenator` over it.
fn bench_construct(c: &mut Criterion) {
    c.bench_function("construct", |b| {
        // Build the failure message once, outside the timed routine, so the
        // `format!` allocation is not measured on every iteration.
        let load_failure = format!("failed to load dictionary {}", DIC_PATH);
        b.iter(|| {
            // NOTE(review): `load_file` is an `unsafe fn`; its safety contract
            // is not visible from this file - confirm against its docs.
            let dic = unsafe { mapped_hyph::load_file(DIC_PATH) }.expect(&load_failure);
            let _ = Hyphenator::new(black_box(&*dic));
        })
    });
}
/// Benchmarks `Hyphenator::find_hyphen_values` over the first `SAMPLE_SIZE`
/// lines of the system word list, using the dictionary at `DIC_PATH`.
fn bench_find_hyphen_values(c: &mut Criterion) {
    // XXX: Should we copy this file to the crate to ensure reproducibility?
    let word_list = fs::read_to_string("/usr/share/dict/words").expect("File reading failed.");
    let sample = word_list.lines().take(SAMPLE_SIZE).collect::<Vec<&str>>();

    let mapped = unsafe { mapped_hyph::load_file(DIC_PATH) }
        .expect(&format!("failed to load dictionary {}", DIC_PATH));
    let hyphenator = Hyphenator::new(&*mapped);

    let id = BenchmarkId::new("bench_word", SAMPLE_SIZE);
    c.bench_with_input(id, &sample, |bencher, sample| {
        bencher.iter(|| {
            // A fresh output buffer is created for every timed iteration and
            // then reused for each word of the sample.
            let mut hyphen_values = vec![0u8; 1000];
            sample.iter().for_each(|word| {
                hyphenator.find_hyphen_values(word, &mut hyphen_values);
            });
        });
    });
}
criterion_group!(benches, bench_construct, bench_find_hyphen_values,);
criterion_main!(benches);

View File

@ -0,0 +1,114 @@
# This is a template cbindgen.toml file with all of the default values.
# Some values are commented out because their absence is the real default.
#
# See https://github.com/eqrion/cbindgen/blob/master/docs.md#cbindgentoml
# for detailed documentation of every option here.
language = "C"
############## Options for Wrapping the Contents of the Header #################
header = """/*
* Copyright 2019 Mozilla Foundation. See the COPYRIGHT
* file at the top-level directory of this distribution.
*
* Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
* https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
* <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
* option. This file may not be copied, modified, or distributed
* except according to those terms.
**/
/* clang-format off */
"""
trailer = "/* clang-format on */"
include_guard = "mapped_hyph_h"
autogen_warning = """/*
* Warning, this file is autogenerated by cbindgen. Don't modify this manually.
*/
"""
include_version = false
# namespace = "my_namespace"
namespaces = []
# using_namespaces = []
sys_includes = ["stdbool.h","stdint.h"]
includes = []
no_includes = true
############################ Code Style Options ################################
braces = "SameLine"
line_length = 100
tab_width = 2
documentation_style = "auto"
############################# Codegen Options ##################################
style = "both"
[defines]
# "target_os = freebsd" = "DEFINE_FREEBSD"
# "feature = serde" = "DEFINE_SERDE"
[export]
include = []
exclude = []
# prefix = "CAPI_"
item_types = []
renaming_overrides_prefixing = false
[export.rename]
[export.body]
[fn]
rename_args = "None"
# must_use = "MUST_USE_FUNC"
# prefix = "START_FUNC"
# postfix = "END_FUNC"
args = "auto"
[struct]
rename_fields = "None"
# must_use = "MUST_USE_STRUCT"
derive_constructor = false
derive_eq = false
derive_neq = false
derive_lt = false
derive_lte = false
derive_gt = false
derive_gte = false
[enum]
rename_variants = "None"
# must_use = "MUST_USE_ENUM"
add_sentinel = false
prefix_with_name = false
derive_helper_methods = false
derive_const_casts = false
derive_mut_casts = false
# cast_assert_name = "ASSERT"
derive_tagged_enum_destructor = false
derive_tagged_enum_copy_constructor = false
private_default_tagged_enum_constructor = false
[const]
allow_static_const = true
[macro_expansion]
bitflags = false
############## Options for How Your Rust library Should Be Parsed ##############
[parse]
parse_deps = false
# include = []
exclude = []
clean = false
extra_bindings = []
[parse.expand]
crates = []
all_features = false
default_features = true
features = []

View File

@ -0,0 +1,98 @@
# Compiled hyphenation table format for mapped_hyph
The file is a "flattened" representation of the list of `HyphenDict` structs
and descendant objects used by libhyphen
(see [hyphen.h](https://github.com/hunspell/hyphen/blob/master/hyphen.h)).
Note that multi-byte integer types in the file are stored in _little-endian_ byte order.
## Overall file header
The file begins with a 4-byte "signature", followed by a count of the number
of hyphenation levels, and an array of offsets to each hyphenation level.
A "level" is essentially equivalent to libhyphen's `HyphenDict`.
### Header (size: 8 bytes + 4 * numLevels)
Type | Name | Description
-----|------|------------
uint8[4] | magicNumber | 4-byte file identification code: ['H', 'y', 'f', '0']
uint32 | numLevels | number of hyphenation levels present
uint32[numLevels] | levelOffset | offset from start of file to each Level
Currently, there are normally 2 hyphenation levels, as the parser/compiler will
generate a default first level if no NEXTLEVEL keyword is present in the pattern file.
## Hyphenation Level
Each level of the hyphenation pattern begins with a Level header, followed by
the data for its states and the strings they refer to.
When the hyphenation machine is executed, we always begin at state offset 0
(from the level's stateDataBase); each transition to a new state represents the
target directly by its offset from stateDataBase.
A state offset of 0xFFFFFF is considered invalid.
Strings are represented as offsets from the level's stringDataBase; each string
is encoded as a one-byte length followed by `length` bytes of utf-8 data.
(So the maximum string length is 255 utf-8 code units; this is far more than any actual
hyphenation dictionary uses).
A string offset of 0xFFFF is considered invalid and represents an absent string.
The minimum number of characters that must be kept together at the start/end of a word,
or of a component of a compound (i.e. the `...Min` values) is a count of _Unicode characters_,
not UTF-8 code units. (Note that the presentation-form ligature characters U+FB00 'ff' through U+FB06 'st'
are counted as 2 or 3 characters for this purpose.)
### Level (size: 16 bytes + state data + string data, padded to a 4-byte boundary)
Type | Name | Description
-----|------|------------
uint32 | stateDataBase | offset from beginning of Level to start of level's State data
uint32 | stringDataBase | offset from beginning of Level to start of level's packed String data
uint16 | noHyphenStringOffset | from level's stringDataBase
uint16 | noHyphenCount | number of (NUL-separated) strings in the nohyphen string
uint8 | leftHyphenMin | minimum number of characters kept together at start of word
uint8 | rightHyphenMin | minimum number of characters kept together at end of word
uint8 | compoundLeftHyphenMin | minimum number of characters kept together at start of second component of a compound
uint8 | compoundRightHyphenMin | minimum number of characters kept together at end of first component of a compound
## State
Each state, referred to by its offset from the level's stateDataBase, consists of a header
followed by an array of transitions for input bytes that need to be matched in this state.
The state also records a fallback state offset, which is the transition to be taken
if the next input byte does not match any of the transition records.
If a match string is present (i.e. `matchStringOffset` is not 0xFFFF), it is a string of hyphenation values
(encoded as ASCII digits '0'..'9') to be applied at the current position in the word.
### StateHeader (size: 8 bytes)
Type | Name | Description
-----|------|------------
uint32 | fallbackStateOffset | (from level's stateDataBase)
uint16 | matchStringOffset | (from level's stringDataBase)
uint8 | numTransitions | count of Transitions that follow the StateHeader and optional StateHeaderExtension
uint8 | isExtended | if non-zero, the StateHeader is immediately followed by a StateHeaderExtension
If the `isExtended` flag in the state header is set, this state includes a potential spelling change
and there is an extended form of the header present before the array of transitions.
(Note that extended states with spelling-change rules are not yet supported by the mapped_hyph engine;
none of the hyphenation dictionaries shipped with Firefox includes such rules.)
### StateHeaderExtension (size: 4 bytes)
Type | Name | Description
-----|------|------------
uint16 | replacementStringOffset | (from level's stringDataBase) the replacement string
int8 | replacementIndex | index of the byte position (relative to current position in the word) at which the spelling replacement should happen
int8 | replacementCut | number of bytes to cut from the original word when making the replacement
## Transitions
The state's transitions are encoded as an array of Transition records, each corresponding to an input byte
and providing the offset of the new state. The transitions for each state are sorted by ascending value of input byte
(although in practice there are usually only a few valid transitions, and so a binary search does not seem to be
worthwhile).
### Transition (size: 4 bytes)
Type | Name | Description
-----|------|------------
uint24 | newStateOffset | (from level's stateDataBase)
uint8 | inputByte | the input byte (utf-8 code unit) for this transition

View File

View File

@ -0,0 +1,25 @@
// Copyright 2019 Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
extern crate mapped_hyph;
use std::env;
use std::fs::File;
// Command-line driver: read the pattern file named by the first argument and
// write its compiled form to the file named by the second argument.
// With any other argument count, print a usage line and exit successfully.
fn main() -> std::io::Result<()> {
    let argv: Vec<String> = env::args().collect();
    // argv[0] is the program name, so exactly three entries are expected.
    if argv.len() != 3 {
        println!("usage: hyf_compile <pattern-file> <output-file>");
        return Ok(());
    }
    let pattern_file = File::open(&argv[1])?;
    let mut compiled_file = File::create(&argv[2])?;
    let levels = mapped_hyph::builder::read_dic_file(&pattern_file);
    mapped_hyph::builder::write_hyf_file(&mut compiled_file, levels)?;
    Ok(())
}

View File

@ -0,0 +1,473 @@
// Copyright 2019 Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
//! Functions to compile human-readable patterns into a mapped_hyph
//! flattened representation of the hyphenation state machine.
use std::io::{Read,BufRead,BufReader,Write};
use std::collections::HashMap;
use std::convert::TryInto;
use std::hash::{Hash,Hasher};
// HashMap does not implement Hash, so wrap it in a newtype that can.
// Keys are input bytes; values are the numbers of the states they lead to.
#[derive(PartialEq, Eq, Clone)]
struct TransitionMap(HashMap<u8, i32>);

impl TransitionMap {
    // Make a map that holds no transitions.
    fn new() -> Self {
        TransitionMap(HashMap::new())
    }
}

impl Hash for TransitionMap {
    fn hash<H: Hasher>(&self, state: &mut H) {
        // HashMap iteration order is arbitrary, so the target values are
        // sorted before being fed to the hasher. The keys are left out on
        // purpose: equal maps still hash equally, and the values alone are
        // expected to spread well enough.
        let mut targets: Vec<i32> = self.0.values().copied().collect();
        targets.sort_unstable();
        for target in &targets {
            target.hash(state);
        }
    }
}
// Builder-side record for one state of the hyphenation state machine.
// States refer to each other by their index in LevelBuilder::states.
#[derive(PartialEq, Eq, Hash, Clone)]
struct State {
    // Hyphenation digits stored for the pattern that ends at this state.
    match_string: Option<Vec<u8>>,
    // Replacement text of a spelling-change rule, if the pattern had one.
    #[allow(dead_code)]
    repl_string: Option<Vec<u8>>,
    // Byte position of the replacement; -1 until a rule is recorded.
    #[allow(dead_code)]
    repl_index: i32,
    // Number of bytes the replacement cuts; -1 until a rule is recorded.
    #[allow(dead_code)]
    repl_cut: i32,
    // Index of the fallback state, or -1 if none has been assigned.
    fallback_state: i32,
    // Outgoing transitions, keyed by input byte.
    transitions: TransitionMap,
}

impl State {
    // A blank state: no match or replacement data, no fallback, no transitions.
    fn new() -> Self {
        Self {
            transitions: TransitionMap::new(),
            fallback_state: -1,
            repl_cut: -1,
            repl_index: -1,
            repl_string: None,
            match_string: None,
        }
    }
}
/// This is only public because the read_dic_file() function returns a Vec
/// of LevelBuilder structs, which can then be passed to write_hyf_file()
/// to create the flattened output.
pub struct LevelBuilder {
    // Every state created so far; index 0 is the (empty-text) start state.
    states: Vec<State>,
    // Maps the byte string leading to a state onto that state's index in `states`.
    str_to_state: HashMap<Vec<u8>,i32>,
    // Encoding keyword read from the pattern file, once one has been seen.
    encoding: Option<String>,
    // Comma-separated list set by the NOHYPHEN keyword; flatten() splits it.
    nohyphen: Option<String>,
    // Values of the LEFTHYPHENMIN / RIGHTHYPHENMIN keywords (0 if absent).
    lh_min: u8,
    rh_min: u8,
    // Values of the COMPOUNDLEFTHYPHENMIN / COMPOUNDRIGHTHYPHENMIN keywords
    // (0 if absent).
    clh_min: u8,
    crh_min: u8,
}
impl LevelBuilder {
// Create a builder whose only content is the start state: state 0, reached
// by the empty byte string. All keyword-controlled settings start out unset.
fn new() -> LevelBuilder {
    let mut str_to_state = HashMap::new();
    str_to_state.insert(Vec::new(), 0);
    LevelBuilder {
        states: vec![State::new()],
        str_to_state,
        encoding: None,
        nohyphen: None,
        lh_min: 0,
        rh_min: 0,
        clh_min: 0,
        crh_min: 0,
    }
}
// Return the index of the state reached by `text`. If no such state exists
// yet, append a blank one to `states`, register it under `text`, and return
// its index.
fn find_state_number_for(&mut self, text: &[u8]) -> i32 {
    if let Some(&known) = self.str_to_state.get(text) {
        return known;
    }
    let fresh = self.states.len() as i32;
    self.str_to_state.insert(text.to_vec(), fresh);
    self.states.push(State::new());
    fresh
}
// Add one pattern to this level.
//
// `pattern` is pattern text with interleaved hyphenation digits (e.g. "1-1"),
// optionally followed by "/replacement,index,cut" describing a spelling
// change to apply at the hyphenation point.
//
// Panics if the pattern contains consecutive digits, if the replacement
// index/cut are not valid numbers or lie outside the pattern text, or if a
// pattern with the same text has already been added.
fn add_pattern(&mut self, pattern: &str) {
    let mut bytes = pattern.as_bytes();
    let mut text = Vec::<u8>::with_capacity(bytes.len());
    let mut digits = Vec::<u8>::with_capacity(bytes.len() + 1);
    let mut repl_str = None;
    let mut repl_index = 0;
    let mut repl_cut = 0;

    // Check for replacement rule (non-standard hyphenation spelling change).
    if let Some(slash) = bytes.iter().position(|x| *x == b'/') {
        let (pattern_part, rule_part) = bytes.split_at(slash);
        bytes = pattern_part;
        // rule_part still begins with the '/' itself; skip over it.
        let mut fields = rule_part[1..].split(|x| *x == b',');
        if let Some(repl) = fields.next() {
            repl_str = Some(repl.to_vec());
        }
        if let Some(num) = fields.next() {
            // Subtract 1 so the value can be used as a 0-based index below.
            repl_index = std::str::from_utf8(num).unwrap().parse::<i32>().unwrap() - 1;
        }
        if let Some(num) = fields.next() {
            repl_cut = std::str::from_utf8(num).unwrap().parse::<i32>().unwrap();
        }
    }

    // Separate the input pattern into parallel arrays of text (bytes) and
    // digits. `digits` gets one entry per gap between text bytes (including
    // the gaps before the first and after the last byte); gaps with no
    // explicit digit are filled with '0'.
    let mut got_digit = false;
    for &byte in bytes {
        if byte <= b'9' && byte >= b'0' {
            assert!(!got_digit, "invalid pattern \"{}\": consecutive digits", pattern);
            digits.push(byte);
            got_digit = true;
        } else {
            text.push(byte);
            if got_digit {
                got_digit = false;
            } else {
                digits.push(b'0');
            }
        }
    }
    if !got_digit {
        digits.push(b'0');
    }

    if repl_str.is_none() {
        // Optimize away leading zeroes from the digits array, in a single
        // pass rather than by repeatedly removing the first element.
        let leading_zeroes = digits.iter().take_while(|&&d| d == b'0').count();
        drop(digits.drain(..leading_zeroes));
    } else {
        // Convert repl_index and repl_cut from Unicode char to byte indexing.
        let start = if text[0] == b'.' { 1 } else { 0 };
        if start == 1 {
            assert_eq!(digits[0], b'0', "unexpected digit before start of word");
            digits.remove(0);
        }
        let word = std::str::from_utf8(&text[start..]).unwrap();
        let mut chars: Vec<_> = word.char_indices().collect();
        // Sentinel entry so that a position just past the last char maps to
        // the byte length of the word.
        chars.push((word.len(), '.'));
        repl_cut = chars[(repl_index + repl_cut) as usize].0 as i32 - chars[repl_index as usize].0 as i32;
        repl_index = chars[repl_index as usize].0 as i32;
    }

    // Create the new state, or add pattern into an existing state
    // (which should not already have a match_string).
    let mut state_num = self.find_state_number_for(&text);
    let state = &mut self.states[state_num as usize];
    assert!(state.match_string.is_none(), "duplicate pattern?");
    if !digits.is_empty() {
        state.match_string = Some(digits);
    }
    if repl_str.is_some() {
        state.repl_string = repl_str;
        state.repl_index = repl_index;
        state.repl_cut = repl_cut;
    }

    // Set up prefix transitions, inserting additional states as needed:
    // strip the last byte, and make the state for the shorter prefix lead to
    // the state for the longer one on that byte.
    while let Some(ch) = text.pop() {
        let last_state = state_num;
        state_num = self.find_state_number_for(&text);
        if let Some(exists) = self.states[state_num as usize].transitions.0.insert(ch, last_state) {
            // The transition was already present, so all shorter prefixes
            // have been linked up by an earlier pattern.
            assert_eq!(exists, last_state, "overwriting existing transition?");
            break;
        }
    }
}
// Collapse states that compare equal into a single state, renumbering all
// fallback and transition references accordingly.
//
// Note that `str_to_state` is not updated here, so it no longer matches
// `states` once any duplicate has been removed.
fn merge_duplicate_states(&mut self) {
    // We loop here because when we eliminate a duplicate, and update the
    // transitions that referenced it, we may thereby create new duplicates
    // that another pass will find and compress further.
    loop {
        let orig_len = self.states.len();
        // Used to map State records to the (first) index at which they occur.
        let mut state_to_index = HashMap::<&State,i32>::new();
        // Mapping of old->new state indexes, and whether each old state is
        // a duplicate that should be dropped.
        let mut mappings = Vec::<(i32,bool)>::with_capacity(orig_len);
        let mut next_new_index: i32 = 0;
        for state in &self.states {
            // Find existing index for this state, or allocate the next new index to it.
            let new_index = *state_to_index.entry(state).or_insert(next_new_index);
            let is_first_occurrence = new_index == next_new_index;
            // Record the mapping, and whether the state was a duplicate.
            mappings.push((new_index, !is_first_occurrence));
            // If we used next_new_index for this state, increment it.
            if is_first_occurrence {
                next_new_index += 1;
            }
        }

        // If we didn't find any duplicates, next_new_index will have kept pace
        // with the state count, so we know we're finished.
        if next_new_index as usize == orig_len {
            break;
        }

        // Drop all duplicates in a single pass. retain() visits the elements
        // exactly once, in their original order, so a running counter gives
        // each element's old index. (Removing them one at a time would shift
        // the tail of the vector for every duplicate.)
        let mut old_index = 0;
        self.states.retain(|_| {
            let keep = !mappings[old_index].1;
            old_index += 1;
            keep
        });

        // Rewrite the references held by the surviving states according to
        // the mapping we created; then repeat the search.
        for state in &mut self.states {
            if state.fallback_state != -1 {
                state.fallback_state = mappings[state.fallback_state as usize].0;
            }
            for target in state.transitions.0.values_mut() {
                *target = mappings[*target as usize].0;
            }
        }
    }
}
// Serialize this level into its flattened binary form: a level header,
// followed by the state records, followed by the (4-byte padded) string data.
//
// Panics if the level cannot be represented in the format: state data too
// large for 24-bit offsets, string data too large for 16-bit offsets, a
// string of 256 bytes or more, or replacement index/cut values that do not
// fit in a signed byte.
fn flatten(&self) -> Vec<u8> {
    // Calculate total space needed for state data, and build the state_to_offset table.
    let mut state_data_size = 0;
    let mut state_to_offset = Vec::<usize>::with_capacity(self.states.len());
    for state in &self.states {
        state_to_offset.push(state_data_size);
        // 8-byte header, or 12 bytes when the replacement extension is present...
        state_data_size += if state.repl_string.is_some() { 12 } else { 8 };
        // ...followed by one 4-byte record per transition.
        state_data_size += state.transitions.0.len() * 4;
    }
    // Transition records only have room for a 24-bit state offset, and the
    // all-ones value is reserved as the "invalid" marker. Fail loudly rather
    // than writing silently truncated offsets.
    assert!(
        state_data_size <= super::INVALID_STATE_OFFSET as usize,
        "state data ({} bytes) too large for 24-bit state offsets",
        state_data_size
    );

    // Helper to map a state index to its offset in the final data block.
    let get_state_offset_for = |state_index: i32| -> u32 {
        if state_index < 0 {
            return super::INVALID_STATE_OFFSET;
        }
        state_to_offset[state_index as usize] as u32
    };

    // Helper to map a byte string to its offset in the final data block, and
    // store the bytes into string_data unless using an already-existing string.
    // Strings are stored as a length byte followed by the content.
    let mut string_to_offset = HashMap::<Vec<u8>,usize>::new();
    let mut string_data = Vec::<u8>::new();
    let mut get_string_offset_for = |bytes: &Option<Vec<u8>>| -> u16 {
        let bytes = match bytes {
            Some(bytes) => bytes,
            None => return super::INVALID_STRING_OFFSET,
        };
        assert!(bytes.len() < 256);
        let new_offset = string_data.len();
        let offset = *string_to_offset.entry(bytes.clone()).or_insert(new_offset);
        if offset == new_offset {
            string_data.push(bytes.len() as u8);
            string_data.extend_from_slice(bytes);
        }
        let offset: u16 = offset.try_into().expect("string data too large for 16-bit offsets");
        // A real string must never be given the offset that means "no string".
        assert_ne!(offset, super::INVALID_STRING_OFFSET, "string data too large for 16-bit offsets");
        offset
    };

    // Handle nohyphen string list if present, converting comma separators to NULs
    // and trimming any surplus whitespace.
    let mut nohyphen_string_offset: u16 = super::INVALID_STRING_OFFSET;
    let mut nohyphen_count: u16 = 0;
    if let Some(nohyphen) = &self.nohyphen {
        let nohyphen_strings: Vec<_> = nohyphen.split(',').map(|x| x.trim()).collect();
        nohyphen_count = nohyphen_strings.len().try_into().unwrap();
        nohyphen_string_offset = get_string_offset_for(&Some(nohyphen_strings.join("\0").into_bytes()));
    }

    let mut state_data = Vec::<u8>::with_capacity(state_data_size);
    for state in &self.states {
        state_data.extend_from_slice(&get_state_offset_for(state.fallback_state).to_le_bytes());
        state_data.extend_from_slice(&get_string_offset_for(&state.match_string).to_le_bytes());
        let num_transitions: u8 = state.transitions.0.len().try_into()
            .expect("too many transitions in one state for a one-byte count");
        state_data.push(num_transitions);

        // Determine whether to use an extended state record, and if so add the
        // replacement string and index fields.
        if state.repl_string.is_none() {
            state_data.push(0);
        } else {
            state_data.push(1);
            state_data.extend_from_slice(&get_string_offset_for(&state.repl_string).to_le_bytes());
            // Both fields are read back as signed bytes, so reject anything
            // that would not survive the round trip.
            let repl_index: i8 = state.repl_index.try_into()
                .expect("replacement index does not fit in a signed byte");
            let repl_cut: i8 = state.repl_cut.try_into()
                .expect("replacement cut does not fit in a signed byte");
            state_data.push(repl_index as u8);
            state_data.push(repl_cut as u8);
        }

        // Collect transitions into an array so we can sort them by input byte.
        let mut transitions: Vec<(u8, u32)> = state.transitions.0.iter()
            .map(|(&byte, &target)| (byte, get_state_offset_for(target)))
            .collect();
        transitions.sort();
        for (byte, offset) in transitions {
            // New state offset is stored as a 24-bit value: the low three
            // bytes of the little-endian encoding.
            state_data.extend_from_slice(&offset.to_le_bytes()[..3]);
            state_data.push(byte);
        }
    }
    assert_eq!(state_data.len(), state_data_size);

    // Pad string data to a 4-byte boundary
    while string_data.len() & 3 != 0 {
        string_data.push(0);
    }

    let total_size = super::LEVEL_HEADER_SIZE as usize + state_data_size + string_data.len();
    let mut result = Vec::<u8>::with_capacity(total_size);

    // Level header: the two base offsets, the nohyphen fields, then the four
    // minimum-length bytes.
    let state_data_base: u32 = super::LEVEL_HEADER_SIZE as u32;
    let string_data_base: u32 = state_data_base + state_data_size as u32;
    result.extend_from_slice(&state_data_base.to_le_bytes());
    result.extend_from_slice(&string_data_base.to_le_bytes());
    result.extend_from_slice(&nohyphen_string_offset.to_le_bytes());
    result.extend_from_slice(&nohyphen_count.to_le_bytes());
    result.push(self.lh_min);
    result.push(self.rh_min);
    result.push(self.clh_min);
    result.push(self.crh_min);

    result.extend_from_slice(&state_data);
    result.extend_from_slice(&string_data);
    assert_eq!(result.len(), total_size);
    result
}
}
/// Read a libhyphen-style pattern file and create the corresponding state
/// machine transitions, etc.
/// The returned Vec can be passed to write_hyf_file() to generate a flattened
/// representation of the state machine in mapped_hyph's binary format.
///
/// Everything from a `%` to the end of a line is treated as a comment.
/// Lines that begin with an uppercase ASCII letter are keywords; all other
/// non-empty lines are patterns.
///
/// # Panics
/// Panics if a line cannot be read, if the encoding line is not `UTF-8`,
/// if a keyword line does not consist of exactly one keyword and one value,
/// if a numeric keyword value cannot be parsed, or if a pattern is not
/// lowercase or is otherwise rejected when it is added.
pub fn read_dic_file<T: Read>(dic_file: T) -> Vec<LevelBuilder> {
    let reader = BufReader::new(dic_file);
    let mut builders = vec![LevelBuilder::new()];
    let mut builder = &mut builders[0];
    for (index, line) in reader.lines().enumerate() {
        let mut trimmed = line.unwrap().trim().to_string();
        // Strip comments.
        if let Some(i) = trimmed.find('%') {
            trimmed = trimmed[..i].trim().to_string();
        }
        // Ignore empty lines.
        if trimmed.is_empty() {
            continue;
        }
        // Uppercase indicates keyword rather than pattern.
        if trimmed.as_bytes()[0] >= b'A' && trimmed.as_bytes()[0] <= b'Z' {
            // First line is encoding; we only support UTF-8.
            // NOTE(review): a level started by NEXTLEVEL begins with its
            // encoding unset, so the first keyword line that follows is also
            // put through this check -- confirm that this is intended.
            if builder.encoding.is_none() {
                assert_eq!(trimmed, "UTF-8", "Only UTF-8 patterns are accepted!");
                builder.encoding = Some(trimmed);
                continue;
            }
            // Check for valid keyword-value pairs.
            if trimmed.contains(' ') {
                let parts: Vec<&str> = trimmed.split(' ').collect();
                assert!(parts.len() == 2);
                let keyword = parts[0];
                let value = parts[1];
                match keyword {
                    "LEFTHYPHENMIN" => builder.lh_min = value.parse::<u8>().unwrap(),
                    "RIGHTHYPHENMIN" => builder.rh_min = value.parse::<u8>().unwrap(),
                    "COMPOUNDLEFTHYPHENMIN" => builder.clh_min = value.parse::<u8>().unwrap(),
                    "COMPOUNDRIGHTHYPHENMIN" => builder.crh_min = value.parse::<u8>().unwrap(),
                    // Store only the value list. Keeping the whole line would
                    // glue the keyword onto the first entry when the list is
                    // later split at commas.
                    "NOHYPHEN" => builder.nohyphen = Some(value.to_string()),
                    _ => println!("unknown keyword: {}", trimmed),
                }
                continue;
            }
            // Start a new hyphenation level?
            if trimmed == "NEXTLEVEL" {
                builders.push(LevelBuilder::new());
                builder = builders.last_mut().unwrap();
                continue;
            }
            println!("unknown keyword: {}", trimmed);
            continue;
        }
        // Patterns should always be provided in lowercase; complain if not.
        // (`index` counts from zero; report the line number counting from one.)
        assert_eq!(trimmed, trimmed.to_lowercase(), "pattern \"{}\" not lowercased at line {}", trimmed, index + 1);
        builder.add_pattern(&trimmed);
    }

    // Create default first (compound-word) level if only one level was provided.
    // (Maybe this should be optional? Currently just copying libhyphen behavior.)
    if builders.len() == 1 {
        let (lh_min, rh_min, clh_min, crh_min) =
            (builders[0].lh_min, builders[0].rh_min, builders[0].clh_min, builders[0].crh_min);
        builders.insert(0, LevelBuilder::new());
        builder = builders.first_mut().unwrap();
        builder.add_pattern("1-1");
        builder.add_pattern("1'1");
        builder.add_pattern("1\u{2013}1"); // en-dash
        builder.add_pattern("1\u{2019}1"); // curly apostrophe
        builder.nohyphen = Some("',\u{2013},\u{2019},-".to_string());
        builder.lh_min = lh_min;
        builder.rh_min = rh_min;
        builder.clh_min = if clh_min > 0 { clh_min } else if lh_min > 0 { lh_min } else { 3 };
        builder.crh_min = if crh_min > 0 { crh_min } else if rh_min > 0 { rh_min } else { 3 };
    }

    // Put in fallback states in each builder: the fallback of a state is the
    // state for the longest proper suffix of its text that is itself a known
    // state.
    for builder in &mut builders {
        for (key, state_index) in builder.str_to_state.iter() {
            if key.is_empty() {
                continue;
            }
            // Shorten the suffix one byte at a time. The empty string is
            // always registered (it is the start state), so this terminates.
            let mut suffix = &key[1..];
            while !builder.str_to_state.contains_key(suffix) {
                suffix = &suffix[1..];
            }
            builder.states[*state_index as usize].fallback_state = builder.str_to_state[suffix];
        }
    }

    // Merge duplicate states to reduce size.
    for builder in &mut builders {
        builder.merge_duplicate_states();
    }

    builders
}
/// Write out the state machines representing a set of hyphenation rules
/// to the given output stream.
pub fn write_hyf_file<T: Write>(hyf_file: &mut T, levels: Vec<LevelBuilder>) -> std::io::Result<()> {
let mut flattened = vec![];
for level in levels {
flattened.push(level.flatten());
}
// Write file header: magic number, count of levels.
hyf_file.write_all(&[b'H', b'y', b'f', b'0'])?;
let level_count: u32 = flattened.len() as u32;
hyf_file.write_all(&level_count.to_le_bytes())?;
// Write array of offsets to each level. First level will begin immediately
// after the array of offsets.
let mut offset: u32 = super::FILE_HEADER_SIZE as u32 + 4 * level_count;
for flat in &flattened {
hyf_file.write_all(&offset.to_le_bytes())?;
offset += flat.len() as u32;
}
// Write the flattened data for each level.
for flat in &flattened {
hyf_file.write_all(&flat)?;
}
Ok(())
}

165
third_party/rust/mapped_hyph/src/ffi.rs vendored Normal file
View File

@ -0,0 +1,165 @@
// Copyright 2019 Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use std::slice;
use std::str;
use std::ffi::CStr;
use std::os::raw::c_char;
use std::str::Utf8Error;
use memmap::Mmap;
use super::Hyphenator;
/// Opaque type representing a hyphenation dictionary loaded from a file,
/// for use in FFI function signatures.
///
/// The type itself holds no data. `mapped_hyph_load_dictionary` boxes the
/// loaded dictionary and casts the resulting pointer to `*const HyphDic`;
/// `mapped_hyph_find_hyphen_values_dic` casts such a pointer back to
/// `*const Mmap` before using it.
pub struct HyphDic;
// Turn the raw pointer/length pairs received over FFI into Rust values:
// the word as a UTF-8-validated string slice (or the validation error), and
// the hyphen buffer as a mutable byte slice.
//
// Callers must pass pointers that are valid for the given lengths.
unsafe fn params_from_c<'a>(word: *const c_char, word_len: u32,
                            hyphens: *mut u8, hyphens_len: u32) ->
        (Result<&'a str, Utf8Error>, &'a mut [u8]) {
    let word_bytes = slice::from_raw_parts(word as *const u8, word_len as usize);
    let hyphen_buf = slice::from_raw_parts_mut(hyphens, hyphens_len as usize);
    (str::from_utf8(word_bytes), hyphen_buf)
}
/// C-callable function that loads the hyphenation dictionary stored in the
/// file at `path`.
///
/// Returns null if `path` is not valid UTF-8 or the dictionary could not be
/// loaded.
///
/// The file is not fully validated as usable hyphenation data: it is only
/// opened (read-only), mmap'd into memory, and given some minimal
/// sanity-checking that it *might* be valid.
///
/// A non-null result must eventually be released with
/// `mapped_hyph_free_dictionary`.
///
/// # Safety
/// The given `path` must be a valid pointer to a NUL-terminated (C-style)
/// string.
#[no_mangle]
pub unsafe extern "C" fn mapped_hyph_load_dictionary(path: *const c_char) -> *const HyphDic {
    let loaded = CStr::from_ptr(path)
        .to_str()
        .ok()
        .and_then(|path_str| super::load_file(path_str));
    match loaded {
        // Ownership of the boxed dictionary passes to the caller.
        Some(dic) => Box::into_raw(Box::new(dic)) as *const HyphDic,
        None => std::ptr::null(),
    }
}
/// C-callable function to free a hyphenation dictionary
/// that was loaded by `mapped_hyph_load_dictionary`.
///
/// # Safety
/// The `dic` parameter must be a `HyphDic` pointer obtained from
/// `mapped_hyph_load_dictionary`, and not previously freed.
#[no_mangle]
pub unsafe extern "C" fn mapped_hyph_free_dictionary(dic: *mut HyphDic) {
Box::from_raw(dic);
}
/// C-callable function that computes hyphenation values for `word` using a
/// dictionary obtained from `mapped_hyph_load_dictionary`.
///
/// `word` must be UTF-8-encoded; `word_len` is its length in bytes (not
/// characters).
///
/// Results are written to the caller-supplied `hyphens` buffer, whose size
/// is `hyphens_len`. It should be at least `word_len` elements long.
///
/// Returns -1 if `word` is not valid UTF-8, or the output `hyphens` buffer is
/// too small.
/// Otherwise returns the number of potential hyphenation positions found.
///
/// # Panics
/// This function may panic if the given dictionary is not valid.
///
/// # Safety
/// The `dic` parameter must be a `HyphDic` pointer obtained from
/// `mapped_hyph_load_dictionary`.
///
/// The `word` and `hyphens` parameter must be valid pointers to memory buffers
/// of at least the respective sizes `word_len` and `hyphens_len`.
#[no_mangle]
pub unsafe extern "C" fn mapped_hyph_find_hyphen_values_dic(dic: *const HyphDic,
                                                            word: *const c_char, word_len: u32,
                                                            hyphens: *mut u8, hyphens_len: u32) -> i32 {
    // The output buffer needs room for one value per input byte.
    if word_len > hyphens_len {
        return -1;
    }
    let (word_str, hyphen_buf) = params_from_c(word, word_len, hyphens, hyphens_len);
    match word_str {
        // The opaque HyphDic pointer really refers to a boxed Mmap.
        Ok(word) => Hyphenator::new(&*(dic as *const Mmap))
            .find_hyphen_values(word, hyphen_buf) as i32,
        Err(_) => -1,
    }
}
/// C-callable function that computes hyphenation values for `word` using a
/// dictionary that the caller has loaded and still owns.
///
/// The dictionary is passed as a raw memory buffer `dic_buf` of size
/// `dic_len`.
///
/// `word` must be UTF-8-encoded; `word_len` is its length in bytes (not
/// characters).
///
/// Results are written to the caller-supplied `hyphens` buffer, whose size
/// is `hyphens_len`. It should be at least `word_len` elements long.
///
/// Returns -1 if `word` is not valid UTF-8, or the output `hyphens` buffer is
/// too small.
/// Otherwise returns the number of potential hyphenation positions found.
///
/// # Panics
/// This function may panic if the given dictionary is not valid.
///
/// # Safety
/// The `dic_buf` parameter must be a valid pointer to a memory block of size
/// at least `dic_len`.
///
/// The `word` and `hyphens` parameter must be valid pointers to memory buffers
/// of at least the respective sizes `word_len` and `hyphens_len`.
#[no_mangle]
pub unsafe extern "C" fn mapped_hyph_find_hyphen_values_raw(dic_buf: *const u8, dic_len: u32,
                                                            word: *const c_char, word_len: u32,
                                                            hyphens: *mut u8, hyphens_len: u32) -> i32 {
    // The output buffer needs room for one value per input byte.
    if word_len > hyphens_len {
        return -1;
    }
    let (word_str, hyphen_buf) = params_from_c(word, word_len, hyphens, hyphens_len);
    match word_str {
        Ok(word) => Hyphenator::new(slice::from_raw_parts(dic_buf, dic_len as usize))
            .find_hyphen_values(word, hyphen_buf) as i32,
        Err(_) => -1,
    }
}
/// C-callable function that reports whether the memory buffer `dic_buf` of
/// size `dic_len` could be used as a hyphenation dictionary.
///
/// Returns `false` for a null `dic_buf` or for data that is clearly not
/// usable, and `true` if the buffer looks like it may be a valid hyphenation
/// dictionary.
///
/// # Safety
/// The `dic_buf` parameter must be a valid pointer to a memory block of size
/// at least `dic_len`.
#[no_mangle]
pub unsafe extern "C" fn mapped_hyph_is_valid_hyphenator(dic_buf: *const u8, dic_len: u32) -> bool {
    // The null test must come first: the slice is only built for non-null input.
    !dic_buf.is_null()
        && Hyphenator::new(slice::from_raw_parts(dic_buf, dic_len as usize)).is_valid_hyphenator()
}

640
third_party/rust/mapped_hyph/src/lib.rs vendored Normal file
View File

@ -0,0 +1,640 @@
// Copyright 2019 Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
#[macro_use]
extern crate arrayref;
extern crate memmap;
use std::slice;
use std::str;
use std::cmp::max;
use std::fs::File;
use std::mem;
use memmap::Mmap;
// Make submodules available publicly.
pub mod builder;
pub mod ffi;
// 4-byte identification expected at beginning of a compiled dictionary file.
// (This will be updated if an incompatible change to the format is made in
// some future revision.)
const MAGIC_NUMBER: [u8; 4] = [b'H', b'y', b'f', b'0'];
const INVALID_STRING_OFFSET: u16 = 0xffff;
const INVALID_STATE_OFFSET: u32 = 0x00ff_ffff;
const FILE_HEADER_SIZE: usize = 8; // 4-byte magic number, 4-byte count of levels
const LEVEL_HEADER_SIZE: usize = 16;
// A Transition is four bytes: a 24-bit little-endian new-state offset followed
// by the input byte it matches. Byte ranges of the dictionary are
// reinterpreted as Transition arrays (in State::transitions() below), so
// repr(C) is used to pin down the memory layout.
// Being made up of single bytes, Transition records do not depend on any
// specific alignment.
#[repr(C)]
#[derive(Debug,Copy,Clone)]
struct Transition(u8, u8, u8, u8);

impl Transition {
    // Offset of the state this transition leads to.
    fn new_state_offset(&self) -> usize {
        // Widen the three stored bytes to a 32-bit little-endian number by
        // supplying a zero high byte.
        u32::from_le_bytes([self.0, self.1, self.2, 0]) as usize
    }

    // The input byte that selects this transition.
    fn match_byte(&self) -> u8 {
        self.3
    }
}
// State is an area of the Level's data block that begins with a fixed header,
// followed by an array of transitions. The total size of each State's data
// depends on the number of transitions in the state. Only the basic header
// is defined by the struct here; the rest of the state is accessed via
// pointer magic.
// There are two versions of State, a basic version that supports only simple
// hyphenation (no associated spelling change), and an extended version that
// adds the replacement-string fields to support spelling changes at the
// hyphenation point. Check is_extended() to know which version is present.
// State records are NOT necessarily 4-byte aligned, so multi-byte fields
// should be read with care.
// (For that reason the multi-byte fields are declared as byte arrays and
// decoded by the accessor methods.)
#[derive(Debug,Copy,Clone)]
#[repr(C)]
struct State {
    // Little-endian u32: offset of the fallback state.
    fallback_state: [u8; 4],
    // Little-endian u16: offset of the match string.
    match_string_offset: [u8; 2],
    // Number of Transition records that follow the header.
    num_transitions: u8,
    // Non-zero if this is the extended form of the header (StateExtended).
    is_extended: u8,
}
// Extended form of the state header: the basic State header followed by the
// replacement fields used for spelling changes at the hyphenation point.
// repr(C) keeps the fields in declaration order, with the basic header first.
#[repr(C)]
struct StateExtended {
    // The basic header, common to both forms.
    state: State,
    // Little-endian u16: offset of the replacement string.
    repl_string_offset: [u8; 2],
    // Replacement index and cut, stored as signed bytes.
    repl_index: i8,
    repl_cut: i8,
}
impl State {
    // Header field accessors; see the file format description. Multi-byte
    // fields are decoded from their little-endian byte arrays.
    fn fallback_state(&self) -> usize {
        let raw = u32::from_le_bytes(self.fallback_state);
        raw as usize
    }

    fn match_string_offset(&self) -> usize {
        usize::from(u16::from_le_bytes(self.match_string_offset))
    }

    fn num_transitions(&self) -> u8 {
        self.num_transitions
    }

    fn is_extended(&self) -> bool {
        self.is_extended > 0
    }

    // The accessors below are only meaningful when is_extended() is true.
    // They reinterpret `self` as the larger StateExtended record; this is
    // acceptable because Level::get_state checks the full size of a state
    // before handing out a reference to it, so for any extended state it
    // returns, the extra fields are readable.
    #[allow(dead_code)]
    fn as_extended(&self) -> &StateExtended {
        debug_assert!(self.is_extended());
        unsafe { &*(self as *const State as *const StateExtended) }
    }

    #[allow(dead_code)]
    fn repl_string_offset(&self) -> usize {
        let extended = self.as_extended();
        usize::from(u16::from_le_bytes(extended.repl_string_offset))
    }

    #[allow(dead_code)]
    fn repl_index(&self) -> i8 {
        self.as_extended().repl_index
    }

    #[allow(dead_code)]
    fn repl_cut(&self) -> i8 {
        self.as_extended().repl_cut
    }

    // The Transition records that follow this state's header, as a slice.
    fn transitions(&self) -> &[Transition] {
        let count = usize::from(self.num_transitions());
        if count == 0 {
            return &[];
        }
        // The transitions start right after the header, whose size depends
        // on which form of the header is present.
        let header_size = if self.is_extended() {
            mem::size_of::<StateExtended>()
        } else {
            mem::size_of::<State>()
        };
        // Stepping past the header stays inside valid memory because
        // Level::get_state() checked the state's total length (including
        // its transitions) before returning the State reference.
        let first = unsafe { (self as *const State as *const u8).add(header_size) } as *const Transition;
        // For the same reason, `count` records are readable from `first`.
        unsafe { slice::from_raw_parts(first, count) }
    }

    // Find the Transition taken on input byte `b`, if this state has one.
    fn transition_for(&self, b: u8) -> Option<Transition> {
        // The array is sorted by match_byte(), but it is usually very short;
        // benchmarking showed no benefit from binary_search_by (it was
        // possibly slightly slower), so scan linearly.
        self.transitions().iter().find(|t| t.match_byte() == b).copied()
    }

    // Debugging aid: print this state and, recursively, every state
    // reachable from it through transitions.
    #[allow(dead_code)]
    fn deep_show(&self, prefix: &str, dic: &Level) {
        let match_offset = self.match_string_offset();
        if match_offset != INVALID_STRING_OFFSET as usize {
            let digits = str::from_utf8(dic.string_at_offset(match_offset)).unwrap();
            println!("{}match: {}", prefix, digits);
        }
        // Children are shown with a longer prefix.
        let child_prefix = format!("{} ", prefix);
        for t in self.transitions() {
            println!("{}{} ->", prefix, t.match_byte() as char);
            let child = dic.get_state(t.new_state_offset()).unwrap();
            child.deep_show(&child_prefix, dic);
        }
    }
}
// We count the presentation-form ligature characters U+FB00..FB06 as multiple
// chars for the purposes of lefthyphenmin/righthyphenmin. In UTF-8, all these
// ligature characters are 3-byte sequences beginning with <0xEF, 0xAC>; this
// helper returns the "decomposed length" of the ligature given its trailing
// byte.
// Any other byte value yields 1. That includes bytes below 0x80, which
// cannot be trail bytes in valid UTF-8 but must still not cause a panic.
fn lig_length(trail_byte: u8) -> usize {
    // Ligature lengths:       ff   fi   fl   ffi  ffl  long-st  st
    const LENGTHS: [u8; 7] = [ 2u8, 2u8, 2u8, 3u8, 3u8, 2u8,     2u8 ];
    trail_byte
        .checked_sub(0x80)
        .and_then(|index| LENGTHS.get(usize::from(index)))
        .map_or(1, |&length| usize::from(length))
}
// True if `byte` has the bit pattern 10xxxxxx, i.e. it is a UTF-8
// continuation (trail) byte.
fn is_utf8_trail_byte(byte: u8) -> bool {
    byte >> 6 == 0b10
}
// True if `byte` is one of the ASCII digits '0' through '9'.
fn is_ascii_digit(byte: u8) -> bool {
    (b'0'..=b'9').contains(&byte)
}
// True if the lowest bit of `byte` is set.
fn is_odd(byte: u8) -> bool {
    byte % 2 == 1
}
// A hyphenation Level has a header followed by State records and packed string
// data. The total size of the slice depends on the number and size of the
// States and Strings it contains.
// Note that the data of the Level may not have any specific alignment!
#[derive(Debug,Copy,Clone)]
struct Level<'a> {
    // The bytes of this level, starting with its header.
    data: &'a [u8],
    // Header fields cached by the constructor for faster access:
    // offset within `data` of the state records (header bytes 0..4)...
    state_data_base_: usize,
    // ...and of the string data (header bytes 4..8).
    string_data_base_: usize,
}
impl Level<'_> {
// Wrap the bytes of a level, decoding the two little-endian 32-bit base
// offsets at the start of its header once so later lookups can use the
// cached values.
fn new(data: &[u8]) -> Level {
    let state_base = u32::from_le_bytes(*array_ref!(data, 0, 4));
    let string_base = u32::from_le_bytes(*array_ref!(data, 4, 4));
    Level {
        data,
        state_data_base_: state_base as usize,
        string_data_base_: string_base as usize,
    }
}
// Accessors for Level header fields; see file format description.
// Offset within `data` at which the State records begin; state offsets are
// added to this value.
fn state_data_base(&self) -> usize {
    self.state_data_base_ // cached by constructor
}
// Offset within `data` at which the string data begins; string offsets are
// added to this value. get_state() also treats it as the end of the state
// records.
fn string_data_base(&self) -> usize {
    self.string_data_base_ // cached by constructor
}
// String offset of the nohyphen list: the little-endian 16-bit value at
// bytes 8..10 of the level header.
fn nohyphen_string_offset(&self) -> usize {
    let raw = u16::from_le_bytes(*array_ref!(self.data, 8, 2));
    usize::from(raw)
}
// Value of the little-endian 16-bit field at bytes 10..12 of the level
// header. Currently unused, hence the dead_code allowance.
#[allow(dead_code)]
fn nohyphen_count(&self) -> u16 {
    u16::from_le_bytes(*array_ref!(self.data, 10, 2))
}
fn lh_min(&self) -> usize {
max(1, self.data[12] as usize)
}
fn rh_min(&self) -> usize {
max(1, self.data[13] as usize)
}
fn clh_min(&self) -> usize {
max(1, self.data[14] as usize)
}
fn crh_min(&self) -> usize {
max(1, self.data[15] as usize)
}
// All four minimum values at once, in the order
// (lh_min, rh_min, clh_min, crh_min).
fn word_boundary_mins(&self) -> (usize, usize, usize, usize) {
    let left = self.lh_min();
    let right = self.rh_min();
    let compound_left = self.clh_min();
    let compound_right = self.crh_min();
    (left, right, compound_left, compound_right)
}
// Look up the string stored at `offset` from the Level's string_data_base.
// A string is a length byte followed by that many bytes of content; the
// returned slice covers the content only. An invalid offset, or a string
// that would run past the end of the data, yields an empty slice.
fn string_at_offset(&self, offset: usize) -> &'_ [u8] {
    if offset == INVALID_STRING_OFFSET as usize {
        return &[];
    }
    let length_pos = self.string_data_base() as usize + offset;
    // TODO: move this to the validation function.
    debug_assert!(length_pos < self.data.len());
    let len = match self.data.get(length_pos) {
        Some(&len) => len as usize,
        None => return &[],
    };
    let start = length_pos + 1;
    let end = start + len;
    // TODO: move this to the validation function.
    debug_assert!(end <= self.data.len());
    self.data.get(start..end).unwrap_or(&[])
}
// The nohyphen string packs several substrings, separated by NUL bytes.
// Return them as individual byte slices, or an empty vector when the level
// has no nohyphen string (or it is empty).
fn nohyphen(&self) -> Vec<&[u8]> {
    let packed = self.string_at_offset(self.nohyphen_string_offset() as usize);
    if packed.is_empty() {
        Vec::new()
    } else {
        packed.split(|&b| b == 0).collect()
    }
}
// States are represented as an offset from the Level's state_data_base.
// This returns a reference to the State at a given offset, or None if the
// offset is the invalid marker or the state (header plus transitions) would
// not fit inside the state data area.
fn get_state(&self, offset: usize) -> Option<&State> {
    if offset == INVALID_STATE_OFFSET as usize {
        return None;
    }
    debug_assert_eq!(offset & 3, 0);
    let state_base = self.state_data_base() + offset;
    // A state must end before the string data begins. string_data_base comes
    // from the level header, so it is also clamped to the real length of
    // `data`; otherwise a bad header could let the unsafe reads below run
    // past the end of the slice.
    let limit = self.string_data_base().min(self.data.len());
    // TODO: move this to the validation function.
    debug_assert!(state_base + mem::size_of::<State>() <= limit);
    if state_base + mem::size_of::<State>() > limit {
        return None;
    }
    let state_ptr = self.data[state_base..].as_ptr() as *const State;
    // SAFETY: the check above shows that the size_of::<State>() bytes
    // starting at state_base lie inside `data`. State is repr(C) and made up
    // only of u8 fields and u8 arrays, so it has no alignment requirement
    // and every bit pattern is a valid value.
    let state = unsafe { &*state_ptr };
    let header_length = if state.is_extended() {
        mem::size_of::<StateExtended>()
    } else {
        mem::size_of::<State>()
    };
    let length = header_length + mem::size_of::<Transition>() * state.num_transitions() as usize;
    // TODO: move this to the validation function.
    debug_assert!(state_base + length <= limit);
    if state_base + length > limit {
        return None;
    }
    // The full state, including any extension and all of its transitions,
    // is now known to lie inside `data`, which is what the unsafe accessors
    // on State rely on.
    Some(state)
}
// Sets hyphenation values (odd = potential break, even = no break) in values[],
// and returns the change in the number of odd values present, so the caller can
// keep track of the total number of potential breaks in the word.
//
// Parameters:
//   word    - the word (or fragment of a compound) to examine; it is matched
//             byte by byte against this Level's state machine.
//   values  - per-byte weights, updated in place. values[i] describes the
//             position after byte i of `word`; it is indexed up to
//             word.len() - 1, so it must be at least word.len() long.
//   lh_min, rh_min - minimum counts to keep unbroken at the start / end of the
//             word. Both are assumed to be >= 1 (`lh_min - 1` is computed below).
//
// Only the non-extended state format is handled; an extended state with a
// match string trips a debug_assert and is otherwise ignored.
fn find_hyphen_values(&self, word: &str, values: &mut [u8], lh_min: usize, rh_min: usize) -> isize {
// Bail out immediately if the word is too short to hyphenate.
if word.len() < lh_min + rh_min {
return 0;
}
let start_state = self.get_state(0);
let mut st = start_state;
let mut hyph_count = 0;
for i in 0 .. word.len() + 2 {
// Loop over the word by bytes, with a virtual '.' added at each end
// to match word-boundary patterns.
let b = if i == 0 || i == word.len() + 1 { b'.' } else { word.as_bytes()[i - 1] };
loop {
// Loop to repeatedly fall back if we don't find a matching transition.
// Note that this could infinite-loop if there is a state whose fallback
// points to itself (or a cycle of fallbacks), but this would represent
// a table compilation error.
// (A potential validation function could check for fallback cycles.)
if st.is_none() {
// Ran out of fallbacks (or hit an invalid state): restart from the
// start state and move on to the next input byte.
st = start_state;
break;
}
let state = st.unwrap();
if let Some(tr) = state.transition_for(b) {
// Found a transition for the current byte. Look up the new state;
// if it has a match_string, merge its weights into `values`.
st = self.get_state(tr.new_state_offset());
if let Some(state) = st {
let match_offset = state.match_string_offset();
if match_offset != INVALID_STRING_OFFSET as usize {
if state.is_extended() {
debug_assert!(false, "extended hyphenation not supported by this function");
} else {
let match_str = self.string_at_offset(match_offset);
// The match string is aligned so that it ends at the current input
// position; `offset` is where it starts, counted in the dotted word
// (index 0 is the leading virtual '.').
let offset = i + 1 - match_str.len();
assert!(offset + match_str.len() <= word.len() + 2);
for (j, ch) in match_str.iter().enumerate() {
let index = offset + j;
if index >= lh_min && index <= word.len() - rh_min {
// lh_min and rh_min are guaranteed to be >= 1,
// so this will not try to access outside values[].
let old_value = values[index - 1];
// NOTE(review): assumes match strings contain only ASCII digit
// bytes; anything below b'0' would underflow here -- confirm the
// table compiler guarantees this.
let value = ch - b'0';
// Keep the larger of the existing weight and the new one.
if value > old_value {
if is_odd(old_value) != is_odd(value) {
// Adjust hyph_count for the change we're making
hyph_count += if is_odd(value) { 1 } else { -1 };
}
values[index - 1] = value;
}
}
}
}
}
}
// We have handled the current input byte; leave the fallback loop
// and get next input.
break;
}
// No transition for the current byte; go to fallback state and try again.
st = self.get_state(state.fallback_state());
}
}
// If the word was not purely ASCII, or if the word begins/ends with
// digits, the use of lh_min and rh_min above may not have correctly
// excluded enough positions, so we need to fix things up here.
let mut index = 0;
let mut count = 0;
let word_bytes = word.as_bytes();
// Clears the value at byte position i, keeping hyph_count in step if a
// potential break is being removed.
let mut clear_hyphen_at = |i| { if is_odd(values[i]) { hyph_count -= 1; } values[i] = 0; };
// Handle lh_min.
// Walk forward from the start, clearing values until lh_min - 1 counted
// characters have been passed.
while count < lh_min - 1 && index < word_bytes.len() {
let byte = word_bytes[index];
clear_hyphen_at(index);
if byte < 0x80 {
index += 1;
if is_ascii_digit(byte) {
continue; // ASCII digits don't count
}
} else if byte == 0xEF && word_bytes[index + 1] == 0xAC {
// Unicode presentation-form ligature characters, which we count as
// multiple chars for the purpose of lh_min/rh_min, all begin with
// 0xEF, 0xAC in UTF-8.
count += lig_length(word_bytes[index + 2]);
clear_hyphen_at(index + 1);
clear_hyphen_at(index + 2);
index += 3;
continue;
} else {
// Some other multi-byte character: skip (and clear) its trail bytes
// so that it is counted once.
index += 1;
while index < word_bytes.len() && is_utf8_trail_byte(word_bytes[index]) {
clear_hyphen_at(index);
index += 1;
}
}
count += 1;
}
// Handle rh_min.
// Walk backward from the end, clearing values until rh_min counted
// characters have been passed.
count = 0;
index = word.len();
while count < rh_min && index > 0 {
index -= 1;
let byte = word_bytes[index];
// The value for the final byte of the word is left untouched.
if index < word.len() - 1 {
clear_hyphen_at(index);
}
if byte < 0x80 {
// Only count if not an ASCII digit
if !is_ascii_digit(byte) {
count += 1;
}
continue;
}
if is_utf8_trail_byte(byte) {
continue;
}
if byte == 0xEF && word_bytes[index + 1] == 0xAC {
// Presentation-form ligatures count as multiple chars.
count += lig_length(word_bytes[index + 2]);
continue;
}
count += 1;
}
hyph_count
}
}
/// Hyphenation engine encapsulating a language-specific set of patterns (rules)
/// that identify possible break positions within a word.
///
/// The wrapped byte buffer is a compiled hyphenation table: a 4-byte magic
/// number, a little-endian `u32` count of levels, one little-endian `u32`
/// offset per level, and then the level data itself.
pub struct Hyphenator<'a>(&'a [u8]);

impl Hyphenator<'_> {
    /// Return a Hyphenator that wraps the given buffer.
    /// This does *not* check that the given buffer is in fact a valid hyphenation table.
    /// Use is_valid_hyphenator() to determine whether it is usable.
    /// (Calling hyphenation methods on a Hyphenator that wraps arbitrary,
    /// unvalidated data is not unsafe, but may panic.)
    pub fn new(buffer: &[u8]) -> Hyphenator<'_> {
        Hyphenator(buffer)
    }

    // Internal implementation details

    /// The first four bytes of the buffer. Panics if the buffer is shorter than that.
    fn magic_number(&self) -> &[u8] {
        &self.0[0 .. 4]
    }

    /// Number of hyphenation levels recorded in the file header.
    fn num_levels(&self) -> usize {
        u32::from_le_bytes(*array_ref!(self.0, 4, 4)) as usize
    }

    /// Read the `(offset, limit)` byte range of level `i` from the header,
    /// without checking it against the buffer. The limit of a level is the
    /// offset of the following level, or the end of the buffer for the last one.
    fn level_range(&self, i: usize) -> (usize, usize) {
        let offset = u32::from_le_bytes(*array_ref!(self.0, FILE_HEADER_SIZE + 4 * i, 4)) as usize;
        let limit = if i == self.num_levels() - 1 {
            self.0.len()
        } else {
            u32::from_le_bytes(*array_ref!(self.0, FILE_HEADER_SIZE + 4 * i + 4, 4)) as usize
        };
        (offset, limit)
    }

    /// Return the `i`th hyphenation level. The range is only sanity-checked in
    /// debug builds; with a corrupt header the slicing here may panic.
    fn level(&self, i: usize) -> Level {
        let (offset, limit) = self.level_range(i);
        debug_assert!(offset + LEVEL_HEADER_SIZE <= limit && limit <= self.0.len());
        debug_assert_eq!(offset & 3, 0);
        debug_assert_eq!(limit & 3, 0);
        Level::new(&self.0[offset .. limit])
    }

    /// Identify acceptable hyphenation positions in the given `word`.
    ///
    /// The caller-supplied `values` must be at least as long as the `word`.
    ///
    /// On return, any elements with an odd value indicate positions in the word
    /// after which a hyphen could be inserted.
    ///
    /// Returns the number of possible hyphenation positions that were found.
    ///
    /// # Panics
    /// If the given `values` slice is too small to hold the results.
    ///
    /// If the block of memory represented by `self.0` is not in fact a valid
    /// hyphenation dictionary, this function may panic with an overflow or
    /// array bounds violation.
    pub fn find_hyphen_values(&self, word: &str, values: &mut [u8]) -> isize {
        assert!(values.len() >= word.len());
        values.iter_mut().for_each(|x| *x = 0);
        let top_level = self.level(0);
        let (lh_min, rh_min, clh_min, crh_min) = top_level.word_boundary_mins();
        if word.len() < lh_min + rh_min {
            return 0;
        }
        let mut hyph_count = top_level.find_hyphen_values(word, values, lh_min, rh_min);
        let compound = hyph_count > 0;
        // Subsequent levels are applied to fragments between potential breaks
        // already found:
        for l in 1 .. self.num_levels() {
            let level = self.level(l);
            if hyph_count > 0 {
                let mut begin = 0;
                let mut lh = lh_min;
                // lh_min and rh_min are both guaranteed to be greater than zero,
                // so this loop will not reach fully to the end of the word.
                for i in lh_min - 1 .. word.len() - rh_min {
                    if is_odd(values[i]) {
                        if i > begin {
                            // We've found a component of a compound;
                            // clear the corresponding values and apply the new level.
                            // (These values must be even, so hyph_count is unchanged.)
                            values[begin .. i].iter_mut().for_each(|x| {
                                *x = 0;
                            });
                            hyph_count += level.find_hyphen_values(&word[begin ..= i],
                                                                   &mut values[begin ..= i],
                                                                   lh, crh_min);
                        }
                        begin = i + 1;
                        // Components after the first use the compound left-hand minimum.
                        lh = clh_min;
                    }
                }
                if begin == 0 {
                    // No compound-word breaks were found, just apply level to the whole word.
                    hyph_count += level.find_hyphen_values(word, values, lh_min, rh_min);
                } else if begin < word.len() {
                    // Handle trailing component of compound.
                    hyph_count += level.find_hyphen_values(&word[begin .. word.len()],
                                                           &mut values[begin .. word.len()],
                                                           clh_min, rh_min);
                }
            } else {
                hyph_count += level.find_hyphen_values(word, values, lh_min, rh_min);
            }
        }
        // Only need to check nohyphen strings if top-level (compound) breaks were found.
        if compound && hyph_count > 0 {
            let nohyph = top_level.nohyphen();
            if !nohyph.is_empty() {
                for i in lh_min ..= word.len() - rh_min {
                    if is_odd(values[i - 1]) {
                        for nh in &nohyph {
                            // Suppress the break if a nohyphen string starts right after it...
                            if i + nh.len() <= word.len() && *nh == &word.as_bytes()[i .. i + nh.len()] {
                                values[i - 1] = 0;
                                hyph_count -= 1;
                                break;
                            }
                            // ...or ends right before it.
                            if nh.len() <= i && *nh == &word.as_bytes()[i - nh.len() .. i] {
                                values[i - 1] = 0;
                                hyph_count -= 1;
                                break;
                            }
                        }
                    }
                }
            }
        }
        hyph_count
    }

    /// Generate the hyphenated form of a `word` by inserting the given `hyphchar`
    /// at each valid break position.
    ///
    /// # Panics
    /// If the block of memory represented by `self` is not in fact a valid
    /// hyphenation dictionary, this function may panic with an overflow or
    /// array bounds violation.
    ///
    /// Also panics if the length of the hyphenated word would overflow `usize`.
    pub fn hyphenate_word(&self, word: &str, hyphchar: char) -> String {
        let mut values = vec![0u8; word.len()];
        let hyph_count = self.find_hyphen_values(word, &mut values);
        if hyph_count <= 0 {
            return word.to_string();
        }
        // We know how long the result will be, so we can preallocate here.
        let result_len = word.len() + hyph_count as usize * hyphchar.len_utf8();
        let mut result = String::with_capacity(result_len);
        let mut n = 0;
        for (pos, ch) in word.char_indices() {
            // An odd value on the byte just before this character marks a break.
            if pos > 0 && is_odd(values[pos - 1]) {
                result.push(hyphchar);
                n += 1;
            }
            result.push(ch);
        }
        debug_assert_eq!(n, hyph_count);
        debug_assert_eq!(result_len, result.len());
        result
    }

    /// Check if the block of memory looks like it could be a valid hyphenation
    /// table.
    ///
    /// This never panics, whatever the buffer contains: every header field is
    /// range-checked before it is used to index or slice the buffer.
    pub fn is_valid_hyphenator(&self) -> bool {
        // Size must be at least 4 bytes for magic_number + 4 bytes num_levels;
        // smaller than this cannot be safely inspected.
        if self.0.len() < FILE_HEADER_SIZE {
            return false;
        }
        if self.magic_number() != MAGIC_NUMBER {
            return false;
        }
        // A table without any levels cannot be used: the hyphenation methods
        // unconditionally read level 0.
        let num_levels = self.num_levels();
        if num_levels == 0 {
            return false;
        }
        // Each level has its own 16-byte header, so we can check a minimum size
        // again here. num_levels comes straight from the file, so guard the
        // arithmetic against overflow. (This bound also guarantees that the
        // table of 4-byte level offsets read below lies within the buffer.)
        let min_size = LEVEL_HEADER_SIZE
            .checked_mul(num_levels)
            .and_then(|levels_size| levels_size.checked_add(FILE_HEADER_SIZE));
        match min_size {
            Some(min_size) if self.0.len() >= min_size => {}
            _ => return false,
        }
        for l in 0 .. num_levels {
            // Check the level's extent before asking level() to slice it out;
            // level() itself only verifies this in debug builds.
            let (offset, limit) = self.level_range(l);
            let header_end = match offset.checked_add(LEVEL_HEADER_SIZE) {
                Some(end) => end,
                None => return false,
            };
            if header_end > limit || limit > self.0.len() {
                return false;
            }
            // Level offsets are expected to be 4-byte aligned (level() asserts
            // this in debug builds).
            if offset & 3 != 0 {
                return false;
            }
            // Check that state_data_base and string_data_base for each hyphenation
            // level are within range.
            let level = self.level(l);
            if level.state_data_base() < LEVEL_HEADER_SIZE ||
               level.state_data_base() > level.string_data_base() ||
               level.string_data_base() > level.data.len() {
                return false;
            }
            // TODO: consider doing more extensive validation of states and
            // strings within the level?
        }
        // It's still possible the dic is internally broken, but at least it's
        // worth trying to use it!
        true
    }
}
/// Memory-map the compiled hyphenation file at `dic_path`.
///
/// Returns the `memmap::Mmap` for the file on success. Returns `None` if the
/// file cannot be opened or mapped, or if its contents fail the
/// `is_valid_hyphenator()` check.
///
/// # Safety
///
/// This is unsafe for the same reason Mmap::map() is unsafe:
/// mapped_hyph does not guarantee safety if the mapped file is modified
/// (e.g. by another process) while we're using it.
///
/// The check performed here is only superficial; it does *not* fully
/// verify the file contents. Calling hyphenation functions with the
/// returned data is not unsafe, but may panic if the data is invalid.
pub unsafe fn load_file(dic_path: &str) -> Option<Mmap> {
    let file = match File::open(dic_path) {
        Ok(file) => file,
        Err(_) => return None,
    };
    let mapping = match Mmap::map(&file) {
        Ok(mapping) => mapping,
        Err(_) => return None,
    };
    // Only hand the mapping back if it at least looks like a hyphenation table.
    if Hyphenator(&*mapping).is_valid_hyphenator() {
        Some(mapping)
    } else {
        None
    }
}

View File

@ -0,0 +1,67 @@
// Copyright 2019 Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
extern crate mapped_hyph;
use mapped_hyph::Hyphenator;
/// Example driver: loads a handful of compiled dictionaries and prints the
/// hyphenated form of some sample words, one per line.
///
/// Panics if any of the dictionary files cannot be loaded; the paths are
/// relative to the current working directory.
fn main() {
    // Load a dictionary, or abort naming the file that actually failed to load.
    let load = |path: &str| match unsafe { mapped_hyph::load_file(path) } {
        Some(dic) => dic,
        None => panic!("failed to load dictionary {}", path),
    };

    let dic = load("hyph_en_US.hyf");
    let hyph = Hyphenator::new(&*dic);
    for word in &[
        "haha",
        "hahaha",
        "photo",
        "photograph",
        "photographer",
        "photographic",
        "photographical",
        "photographically",
        "supercalifragilisticexpialidocious",
    ] {
        println!("{}", hyph.hyphenate_word(word, '-'));
    }
    for word in &[
        "o'dwyer",
        "o'callahan",
        "odwyer",
        "ocallahan",
        "petti-fogging",
        "e-mailing",
        "-x-mailing",
        "-strikeout-",
    ] {
        println!("{}", hyph.hyphenate_word(word, '='));
    }

    let dic2 = load("tests/compound.hyf");
    let h2 = Hyphenator::new(&*dic2);
    println!("{}", h2.hyphenate_word("motorcycle", '='));

    let dic3 = load("tests/rhmin.hyf");
    let h3 = Hyphenator::new(&*dic3);
    println!("{}", h3.hyphenate_word("övéit", '='));
    println!("{}", h3.hyphenate_word("అంగడిధర", '='));

    let dic4 = load("tests/num.hyf");
    let h4 = Hyphenator::new(&*dic4);
    for word in &[
        "123foobar123",
        "123foobarfoobar",
        "foobarfoobar123",
        "123foobarfoobar123",
    ] {
        println!("{}", h4.hyphenate_word(word, '='));
    }
}

View File

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

View File

View File

View File

View File

View File

View File

View File

View File

View File

View File

@ -0,0 +1,169 @@
// Any copyright to the test code below is dedicated to the Public Domain.
// http://creativecommons.org/publicdomain/zero/1.0/
use mapped_hyph::Hyphenator;
/// Hyphenates a few English words with the hyph_en_US.hyf dictionary and
/// compares each result with its expected form.
#[test]
fn basic_tests() {
    let path = "hyph_en_US.hyf";
    let mapped = unsafe { mapped_hyph::load_file(path) }
        .unwrap_or_else(|| panic!("failed to load dictionary {}", path));
    let hyphenator = Hyphenator::new(&mapped);
    // (input word, expected hyphenation)
    let cases = [
        ("haha", "haha"),
        ("hahaha", "ha-haha"),
        ("photo", "photo"),
        ("photograph", "pho-to-graph"),
        ("photographer", "pho-tog-ra-pher"),
        ("photographic", "pho-to-graphic"),
        ("photographical", "pho-to-graph-i-cal"),
        ("photographically", "pho-to-graph-i-cally"),
        ("supercalifragilisticexpialidocious", "su-per-cal-ifrag-ilis-tic-ex-pi-ali-do-cious"),
    ];
    for &(word, expected) in cases.iter() {
        assert_eq!(hyphenator.hyphenate_word(word, '-'), expected);
    }
}
// Testcases adapted from tests included with libhyphen.
// (Using only the UTF-8 dictionaries/tests, and omitting those that require
// the extended hyphenation algorithm.)
/// Hyphenates every line of tests/base.word with the tests/base.hyf
/// dictionary and compares it with the line at the same index in
/// tests/base.hyph.
#[test]
fn base() {
    use std::fs::File;
    use std::io::{BufRead,BufReader};

    let path = "tests/base.hyf";
    let mapped = unsafe { mapped_hyph::load_file(path) }
        .unwrap_or_else(|| panic!("failed to load dictionary {}", path));
    let hyphenator = Hyphenator::new(&mapped);

    // Read a text file into a vector of its lines, panicking on any I/O error.
    let read_lines = |name: &str| -> Vec<String> {
        BufReader::new(File::open(name).unwrap())
            .lines()
            .map(|line| line.unwrap())
            .collect()
    };
    let words = read_lines("tests/base.word");
    let expected = read_lines("tests/base.hyph");

    for (i, word) in words.iter().enumerate() {
        assert_eq!(hyphenator.hyphenate_word(word, '='), expected[i]);
    }
}
/// Checks the hyphenation of "motorcycle" with tests/compound.hyf.
#[test]
fn compound() {
    let path = "tests/compound.hyf";
    let mapped = unsafe { mapped_hyph::load_file(path) }
        .unwrap_or_else(|| panic!("failed to load dictionary {}", path));
    let hyphenator = Hyphenator::new(&mapped);
    assert_eq!(hyphenator.hyphenate_word("motorcycle", '-'), "mo-tor-cy-cle");
}
/// Checks the hyphenation of "motorcycle" with tests/compound4.hyf.
#[test]
fn compound4() {
    let path = "tests/compound4.hyf";
    let mapped = unsafe { mapped_hyph::load_file(path) }
        .unwrap_or_else(|| panic!("failed to load dictionary {}", path));
    let hyphenator = Hyphenator::new(&mapped);
    assert_eq!(hyphenator.hyphenate_word("motorcycle", '-'), "motor-cycle");
}
/// Checks the hyphenation of "postea" with tests/compound5.hyf.
#[test]
fn compound5() {
    let path = "tests/compound5.hyf";
    let mapped = unsafe { mapped_hyph::load_file(path) }
        .unwrap_or_else(|| panic!("failed to load dictionary {}", path));
    let hyphenator = Hyphenator::new(&mapped);
    assert_eq!(hyphenator.hyphenate_word("postea", '-'), "post-e-a");
}
/// Checks the hyphenation of "meaque" with tests/compound6.hyf.
#[test]
fn compound6() {
    let path = "tests/compound6.hyf";
    let mapped = unsafe { mapped_hyph::load_file(path) }
        .unwrap_or_else(|| panic!("failed to load dictionary {}", path));
    let hyphenator = Hyphenator::new(&mapped);
    assert_eq!(hyphenator.hyphenate_word("meaque", '-'), "me-a-que");
}
/// Checks the hyphenation of a non-ASCII word with tests/settings2.hyf.
#[test]
fn settings2() {
    let path = "tests/settings2.hyf";
    let mapped = unsafe { mapped_hyph::load_file(path) }
        .unwrap_or_else(|| panic!("failed to load dictionary {}", path));
    let hyphenator = Hyphenator::new(&mapped);
    assert_eq!(hyphenator.hyphenate_word("őőőőőőő", '='), "ő=ő=ő=ő=ő=ő=ő");
}
/// Checks the hyphenation of a non-ASCII word with tests/settings3.hyf.
#[test]
fn settings3() {
    let path = "tests/settings3.hyf";
    let mapped = unsafe { mapped_hyph::load_file(path) }
        .unwrap_or_else(|| panic!("failed to load dictionary {}", path));
    let hyphenator = Hyphenator::new(&mapped);
    assert_eq!(hyphenator.hyphenate_word("őőőőőőő", '='), "őő=ő=ő=ő=őő");
}
/// Checks that a word containing an apostrophe and a hyphen comes back
/// unchanged with tests/hyphen.hyf.
#[test]
fn hyphen() {
    let path = "tests/hyphen.hyf";
    let mapped = unsafe { mapped_hyph::load_file(path) }
        .unwrap_or_else(|| panic!("failed to load dictionary {}", path));
    let hyphenator = Hyphenator::new(&mapped);
    let word = "foobar'foobar-foobarfoobar";
    assert_eq!(hyphenator.hyphenate_word(word, '='), "foobar'foobar-foobarfoobar");
}
/// Checks the hyphenation of "miért" with tests/lhmin.hyf.
#[test]
fn lhmin() {
    let path = "tests/lhmin.hyf";
    let mapped = unsafe { mapped_hyph::load_file(path) }
        .unwrap_or_else(|| panic!("failed to load dictionary {}", path));
    let hyphenator = Hyphenator::new(&mapped);
    assert_eq!(hyphenator.hyphenate_word("miért", '='), "mi=ért");
}
/// Checks the hyphenation of two non-ASCII words with tests/rhmin.hyf.
#[test]
fn rhmin() {
    let path = "tests/rhmin.hyf";
    let mapped = unsafe { mapped_hyph::load_file(path) }
        .unwrap_or_else(|| panic!("failed to load dictionary {}", path));
    let hyphenator = Hyphenator::new(&mapped);
    // (input word, expected hyphenation)
    let cases = [
        ("övéit", "övéit"),
        ("అంగడిధర", "అం=గ=డిధర"),
    ];
    for &(word, expected) in cases.iter() {
        assert_eq!(hyphenator.hyphenate_word(word, '='), expected);
    }
}
/// Checks the hyphenation of words with leading and/or trailing ASCII digits
/// with tests/num.hyf.
#[test]
fn num() {
    let path = "tests/num.hyf";
    let mapped = unsafe { mapped_hyph::load_file(path) }
        .unwrap_or_else(|| panic!("failed to load dictionary {}", path));
    let hyphenator = Hyphenator::new(&mapped);
    // (input word, expected hyphenation)
    let cases = [
        ("foobar", "foobar"),
        ("foobarfoobar", "foobar=foobar"),
        ("barfoobarfoo", "barfoo=barfoo"),
        ("123foobarfoobar", "123foobar=foobar"),
        ("foobarfoobar123", "foobar=foobar123"),
        ("123foobarfoobar123", "123foobar=foobar123"),
        ("123barfoobarfoo", "123barfoo=barfoo"),
        ("barfoobarfoo123", "barfoo=barfoo123"),
        ("123barfoobarfoo123", "123barfoo=barfoo123"),
    ];
    for &(word, expected) in cases.iter() {
        assert_eq!(hyphenator.hyphenate_word(word, '='), expected);
    }
}

View File

@ -44,6 +44,7 @@ audio_thread_priority = "0.20.2"
mdns_service = { path="../../../../media/mtransport/mdns_service", optional = true }
neqo_glue = { path = "../../../../netwerk/socket/neqo_glue" }
rlbox_lucet_sandbox = { version = "0.1.0", optional = true }
mapped_hyph = { git = "https://github.com/jfkthame/mapped_hyph.git", tag = "v0.3.0" }
[build-dependencies]
rustc_version = "0.2"

View File

@ -6,6 +6,7 @@
extern crate geckoservo;
extern crate mapped_hyph;
extern crate kvstore;
extern crate mp4parse_capi;
extern crate nsstring;