Bug 971047 - Import the CLD language detection library, r=felipe,azakai,gps.

This commit is contained in:
Florian Quèze 2014-03-13 18:28:53 +01:00
parent 59464c85ad
commit c93fd59725
60 changed files with 93805 additions and 0 deletions

View File

@ -0,0 +1,55 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this file,
* You can obtain one at http://mozilla.org/MPL/2.0/. */
"use strict";
this.EXPORTED_SYMBOLS = ["LanguageDetector"];
Components.utils.import("resource://gre/modules/XPCOMUtils.jsm");
XPCOMUtils.defineLazyModuleGetter(this, "Promise",
"resource://gre/modules/Promise.jsm");
// URL of the script handed to the Worker constructor by the lazy "worker"
// getter below.
const WORKER_URL = "resource:///modules/translation/cld-worker.js";
// Deferred objects for requests that have not been answered yet, oldest first:
// detectLanguage() pushes onto the end, and each non-"ready" worker message
// resolves (and removes) the entry at the front.
let detectionQueue = [];
// Becomes true once the worker has posted its "ready" message.
let workerReady = false;
// Strings passed to detectLanguage() before the worker was ready. They are
// posted to the worker, in order, when "ready" arrives, then the list is reset.
let pendingStrings = [];
/**
 * Lazily-created detection worker, built from WORKER_URL the first time
 * |worker| is read.
 *
 * The worker is expected to post the string "ready" once; every other message
 * is treated as the answer to the oldest outstanding request.
 */
XPCOMUtils.defineLazyGetter(this, "worker", () => {
  let cldWorker = new Worker(WORKER_URL);
  cldWorker.onmessage = function(aMsg) {
    if (aMsg.data != "ready") {
      // A detection result: hand it to the request at the head of the queue.
      detectionQueue.shift().resolve(aMsg.data);
      return;
    }

    // The worker is up: flush everything that was queued while it loaded.
    workerReady = true;
    pendingStrings.forEach(aQueued => cldWorker.postMessage(aQueued));
    pendingStrings = [];
  };
  return cldWorker;
});
this.LanguageDetector = {
  /**
   * Asynchronously detect the language of a string.
   *
   * The request is always queued first; the string is then either posted to
   * the worker straight away or parked until the worker reports "ready".
   *
   * @param aString
   *        The text to hand to the detection worker.
   * @returns {Promise}
   * @resolves When detection is finished, with an object containing
   *           these fields:
   *            - 'language' (string with a language code)
   *            - 'confident' (boolean).
   */
  detectLanguage: function(aString) {
    let request = Promise.defer();
    detectionQueue.push(request);

    // Reading |worker| goes through the lazy getter, so the worker is created
    // on the first call even though it cannot be ready yet.
    let canPostNow = worker && workerReady;
    if (!canPostNow) {
      pendingStrings.push(aString);
      return request.promise;
    }

    worker.postMessage(aString);
    return request.promise;
  }
};

View File

@ -0,0 +1,44 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
# Compiler driver used for both the per-file compile and the final link.
CC=emcc
# C++ sources that make up cld-worker.js. Each entry is turned into a matching
# .o by the pattern rule and all of them are prerequisites of "all".
# $(NULL) is not defined in this file, so it presumably expands to nothing and
# only exists so that every real entry can end with a backslash.
SOURCES= \
internal/cldutil.cc \
internal/cldutil_shared.cc \
internal/compact_lang_det.cc \
internal/compact_lang_det_hint_code.cc \
internal/compact_lang_det_impl.cc \
internal/debug_empty.cc \
internal/fixunicodevalue.cc \
internal/generated_entities.cc \
internal/generated_language.cc \
internal/generated_ulscript.cc \
internal/getonescriptspan.cc \
internal/lang_script.cc \
internal/offsetmap.cc \
internal/scoreonescriptspan.cc \
internal/tote.cc \
internal/utf8statetable.cc \
internal/cld_generated_cjk_uni_prop_80.cc \
internal/cld2_generated_cjk_compatible.cc \
internal/cld_generated_cjk_delta_bi_4.cc \
internal/generated_distinct_bi_0.cc \
internal/cld2_generated_quadchrome0122_16.cc \
internal/cld2_generated_deltaoctachrome0122.cc \
internal/cld2_generated_distinctoctachrome0122.cc \
internal/cld_generated_score_quad_octa_0122_2.cc \
cldapp.cc \
$(NULL)
# Compile each C++ source into an object file next to it.
%.o: %.cc
	$(CC) -Os -I. -o $@ $<

# Options passed to $(CC) for the final link step only.
FLAGS=-s USE_TYPED_ARRAYS=2 -O3 -s INLINING_LIMIT=1 --llvm-lto 1 --memory-init-file 1 --closure 1

# "all" and "clean" name actions rather than files; declaring them phony keeps
# them working even if a file called "all" or "clean" appears in this directory.
.PHONY: all clean

# Link every object into cld-worker.js, appending post.js and exporting the two
# C entry points listed in EXPORTED_FUNCTIONS.
all: $(SOURCES:.cc=.o)
	$(CC) $(FLAGS) -I. -o cld-worker.js $^ --post-js post.js -s EXPORTED_FUNCTIONS="['_detectLangCode', '_lastResultReliable']"

# Remove the intermediate object files; cld-worker.js itself is left in place.
clean:
	rm -rf $(SOURCES:.cc=.o)

File diff suppressed because one or more lines are too long

Binary file not shown.

View File

@ -0,0 +1,23 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this file,
* You can obtain one at http://mozilla.org/MPL/2.0/. */
#include <string.h>  // strlen

#include "public/compact_lang_det.h"

extern "C" {

using namespace CLD2;

// Reliability flag written by the most recent detectLangCode() call and read
// back through lastResultReliable().
bool g_is_reliable;

// Runs CLD2 detection on |src|, treating it as plain text, and returns the
// code string that LanguageCode() produces for the detected language.
//
// |src| must be a non-null, NUL-terminated string: its length is taken with
// strlen(). As a side effect the call overwrites g_is_reliable.
const char* detectLangCode(const char* src) {
  return LanguageCode(DetectLanguage(src, strlen(src),
                                     true /* is_plain_text */,
                                     &g_is_reliable));
}

// Returns the reliability flag stored by the last detectLangCode() call.
bool lastResultReliable(void) {
  return g_is_reliable;
}

}

View File

@ -0,0 +1,202 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

View File

@ -0,0 +1,216 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef CLD2_INTERNAL_CLD2_DYNAMIC_DATA_H_
#define CLD2_INTERNAL_CLD2_DYNAMIC_DATA_H_
#include "integral_types.h"
#include "cld2tablesummary.h"
#include "utf8statetable.h"
#include "scoreonescriptspan.h"
/*
There are two primary parts to a CLD2 dynamic data file:
1. A header, wherein trivial data, block lengths and block offsets are kept
2. A data block, wherein the large binary blocks are kept
By reading the header, an application can determine the offsets and lengths of
all the data blocks for all tables. Offsets in the header are expressed
relative to the first byte of the file, inclusive of the header itself; thus,
any offset whose value is less than the length of the header is invalid.
Any offset whose value is zero indicates a field that is null in the
underlying CLD2 data; a real example of this is the fast_state field of the
UTF8PropObj, which may be null.
The size of the header can be precalculated by calling calculateHeaderSize(),
which will indicate the exact size of the header for a data file that contains
a given number of CLD2TableSummary objects.
Notes on endianness:
The data format is only suitable for little-endian machines. For big-endian
systems, a tedious transformation would need to be made first to reverse the
byte order of significant portions of the binary - not just the lengths, but
also some of the underlying table data.
Note on 32/64 bit:
The data format is agnostic to 32/64 bit pointers. All the offsets within the
data blob itself are 32-bit values relative to the start of the file, and the
file should certainly never be gigabytes in size!
When the file is ultimately read by the loading code and mmap()'d, new
pointers are generated at whatever size the system uses, initialized to the
start of the mmap, and incremented by the 32-bit offset. This should be safe
regardless of 32- or 64-bit architectures.
--------------------------------------------------------------------
FIELD
--------------------------------------------------------------------
DATA_FILE_MARKER (no null terminator)
total file size (sanity check, uint32)
--------------------------------------------------------------------
UTF8PropObj: const uint32 state0
UTF8PropObj: const uint32 state0_size
UTF8PropObj: const uint32 total_size
UTF8PropObj: const int max_expand
UTF8PropObj: const int entry_shift (coerced to 32 bits)
UTF8PropObj: const int bytes_per_entry (coerced to 32 bits)
UTF8PropObj: const uint32 losub
UTF8PropObj: const uint32 hiadd
offset of UTF8PropObj: const uint8* state_table
length of UTF8PropObj: const uint8* state_table
offset of UTF8PropObj: const RemapEntry* remap_base (4-byte struct)
length of UTF8PropObj: const RemapEntry* remap_base (4-byte struct)
offset of UTF8PropObj: const uint8* remap_string
length of UTF8PropObj: const uint8* remap_string
offset of UTF8PropObj: const uint8* fast_state
length of UTF8PropObj: const uint8* fast_state
--------------------------------------------------------------------
start of const short kAvgDeltaOctaScore[]
length of const short kAvgDeltaOctaScore[]
--------------------------------------------------------------------
number of CLD2TableSummary objects encoded (n)
[Table 1]: CLD2TableSummary: uint32 kCLDTableSizeOne
[Table 1]: CLD2TableSummary: uint32 kCLDTableSize
[Table 1]: CLD2TableSummary: uint32 kCLDTableKeyMask
[Table 1]: CLD2TableSummary: uint32 kCLDTableBuildDate
[Table 1]: offset of CLD2TableSummary: const IndirectProbBucket4* kCLDTable
[Table 1]: length of CLD2TableSummary: const IndirectProbBucket4* kCLDTable
[Table 1]: offset of CLD2TableSummary: const uint32* kCLDTableInd
[Table 1]: length of CLD2TableSummary: const uint32* kCLDTableInd
[Table 1]: offset of CLD2TableSummary: const char* kRecognizedLangScripts
[Table 1]: length of CLD2TableSummary: const char* kRecognizedLangScripts + 1
.
.
.
[Table n]: CLD2TableSummary: uint32 kCLDTableSizeOne
[Table n]: CLD2TableSummary: uint32 kCLDTableSize
[Table n]: CLD2TableSummary: uint32 kCLDTableKeyMask
[Table n]: CLD2TableSummary: uint32 kCLDTableBuildDate
[Table n]: offset of CLD2TableSummary: const IndirectProbBucket4* kCLDTable
[Table n]: length of CLD2TableSummary: const IndirectProbBucket4* kCLDTable
[Table n]: offset of CLD2TableSummary: const uint32* kCLDTableInd
[Table n]: length of CLD2TableSummary: const uint32* kCLDTableInd
[Table n]: offset of CLD2TableSummary: const char* kRecognizedLangScripts
[Table n]: length of CLD2TableSummary: const char* kRecognizedLangScripts + 1
--------------------------------------------------------------------
Immediately after the header fields comes the data block. The data block has
the following content, in this order (note that padding is applied in order to
keep lookups word-aligned):
UTF8PropObj: const uint8* state_table
UTF8PropObj: const RemapEntry* remap_base (4-byte struct)
UTF8PropObj: const uint8* remap_string
UTF8PropObj: const uint8* fast_state
const short kAvgDeltaOctaScore[]
[Table 1]: CLD2TableSummary: const IndirectProbBucket4* kCLDTable
[Table 1]: CLD2TableSummary: const uint32* kCLDTableInd
[Table 1]: CLD2TableSummary: const char* kRecognizedLangScripts (with null terminator)
.
.
.
[Table n]: CLD2TableSummary: const IndirectProbBucket4* kCLDTable
[Table n]: CLD2TableSummary: const uint32* kCLDTableInd
[Table n]: CLD2TableSummary: const char* kRecognizedLangScripts (with null terminator)
It is STRONGLY recommended that the chunks within the data block be kept
128-bit aligned for efficiency reasons, although the code will work without
such alignment: the main lookup tables have randomly-accessed groups of four
4-byte entries, and these must be 16-byte aligned to avoid the performance
cost of multiple cache misses per group.
*/
namespace CLD2DynamicData {

// Marker written at the very start of a data file. The pointer itself is
// const so that the per-translation-unit copy created by including this header
// cannot be reseated. The literal is exactly DATA_FILE_MARKER_LENGTH characters
// long; the header stores it without a null terminator.
static const char* const DATA_FILE_MARKER = "cld2_data_file00";
static const int DATA_FILE_MARKER_LENGTH = 16; // Keep aligned to 128 bits

// Nicer version of memcmp that shows the offset at which bytes differ
bool mem_compare(const void* data1, const void* data2, const int length);

// Enable or disable debugging; 0 to disable, 1 to enable
void setDebug(int debug);

// Lower-level structure for individual tables. There are n table headers in
// a given file header.
// Field order follows the per-table entries of the header layout described in
// the format comment at the top of this file; do not reorder.
typedef struct {
  CLD2::uint32 kCLDTableSizeOne;
  CLD2::uint32 kCLDTableSize;
  CLD2::uint32 kCLDTableKeyMask;
  CLD2::uint32 kCLDTableBuildDate;
  CLD2::uint32 startOf_kCLDTable;
  CLD2::uint32 lengthOf_kCLDTable;
  CLD2::uint32 startOf_kCLDTableInd;
  CLD2::uint32 lengthOf_kCLDTableInd;
  CLD2::uint32 startOf_kRecognizedLangScripts;
  CLD2::uint32 lengthOf_kRecognizedLangScripts;
} TableHeader;

// Top-level structure for a CLD2 Data File Header.
// Contains all the primitive fields for the header as well as an array of
// headers for the individual tables.
typedef struct {
  // Marker fields help recognize and verify the data file
  char sanityString[DATA_FILE_MARKER_LENGTH];
  CLD2::uint32 totalFileSizeBytes;

  // UTF8 primitives
  CLD2::uint32 utf8PropObj_state0;
  CLD2::uint32 utf8PropObj_state0_size;
  CLD2::uint32 utf8PropObj_total_size;
  CLD2::uint32 utf8PropObj_max_expand;
  CLD2::uint32 utf8PropObj_entry_shift;
  CLD2::uint32 utf8PropObj_bytes_per_entry;
  CLD2::uint32 utf8PropObj_losub;
  CLD2::uint32 utf8PropObj_hiadd;
  CLD2::uint32 startOf_utf8PropObj_state_table;
  CLD2::uint32 lengthOf_utf8PropObj_state_table;
  CLD2::uint32 startOf_utf8PropObj_remap_base;
  CLD2::uint32 lengthOf_utf8PropObj_remap_base;
  CLD2::uint32 startOf_utf8PropObj_remap_string;
  CLD2::uint32 lengthOf_utf8PropObj_remap_string;
  CLD2::uint32 startOf_utf8PropObj_fast_state;
  CLD2::uint32 lengthOf_utf8PropObj_fast_state;

  // Average delta-octa-score bits
  CLD2::uint32 startOf_kAvgDeltaOctaScore;
  CLD2::uint32 lengthOf_kAvgDeltaOctaScore;

  // Table bits
  CLD2::uint32 numTablesEncoded;
  TableHeader* tableHeaders;
} FileHeader;

// Calculate the exact size of a header that encodes the specified number of
// tables. This can be used to reserve space within the data file,
// calculate offsets, and so on.
CLD2::uint32 calculateHeaderSize(CLD2::uint32 numTables);

// Dump a given header to stdout as a human-readable string.
void dumpHeader(FileHeader* header);

// Verify that a given pair of scoring tables match precisely.
// The result is a bool, not a message.
// NOTE(review): presumably true means the tables match - confirm against the
// implementation.
bool verify(const CLD2::ScoringTables* realData, const CLD2::ScoringTables* loadedData);

// Return true iff the program is running in little-endian mode.
bool isLittleEndian();

// Return true iff the core size assumptions are ok on this platform.
bool coreAssumptionsOk();

} // End namespace CLD2DynamicData
#endif // CLD2_INTERNAL_CLD2_DYNAMIC_DATA_H_

View File

@ -0,0 +1,52 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef CLD2_INTERNAL_CLD2_DYNAMIC_DATA_LOADER_H_
#define CLD2_INTERNAL_CLD2_DYNAMIC_DATA_LOADER_H_
#include "scoreonescriptspan.h"
#include "cld2_dynamic_data.h"
namespace CLD2DynamicDataLoader {
// Read a header from the specified file and return it.
// The header returned is dynamically allocated; you must 'delete' the array
// of TableHeaders as well as the returned FileHeader* when done.
CLD2DynamicData::FileHeader* loadHeader(const char* fileName);
// Load data directly into a ScoringTables structure using a private, read-only
// mmap and return the newly-allocated structure.
// The out-parameter "mmapAddressOut" is a pointer to a void*; the starting
// address of the mmap()'d block will be written here.
// The out-parameter "mmapLengthOut" is a pointer to an int; the length of the
// mmap()'d block will be written here.
// It is up to the caller to release the returned structure and the mapping;
// as noted on unloadData() below, unloadData() is the only safe way to do so.
CLD2::ScoringTables* loadDataFile(const char* fileName,
void** mmapAddressOut, int* mmapLengthOut);
// Given pointers to the data from a previous invocation of loadDataFile,
// unloads the data safely - freeing and deleting any malloc'd/new'd objects.
// When this method returns, the mmap has been deleted, as have all the scoring
// tables; the values behind the pointers passed in are all zeroed, such that:
// *scoringTables == NULL
// *mmapAddress == NULL
// *mmapLength == 0
// This is the only safe way to unload data that was previously loaded, as there
// is an unfortunate mixture of new and malloc involved in building the
// in-memory representation of the data.
void unloadData(CLD2::ScoringTables** scoringTables,
void** mmapAddress, int* mmapLength);
} // End namespace CLD2DynamicDataLoader
#endif // CLD2_INTERNAL_CLD2_DYNAMIC_DATA_LOADER_H_

View File

@ -0,0 +1,298 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// CJK compatible CLD2 scoring lookup table
//
#include "cld2tablesummary.h"
namespace CLD2 {
// Summary constants for the CJK-compatible table defined just below.
static const uint32 kCompatTableBuildDate = 20130128; // yyyymmdd
static const uint32 kCompatTableSize = 1; // Total Bucket count
static const uint32 kCompatTableKeyMask = 0xffffff00; // Mask hash key
// Space-separated list of language-script names; note that the literal ends
// with a trailing space after the last name.
static const char* const kCompatTableRecognizedLangScripts =
"zh-Hans zh-Hant ja-Hani ko-Hani vi-Hani za-Hani ";
// Empty table: kCompatTableSize is 1 and the single bucket holds four zero
// words.
static const IndirectProbBucket4 kCompatTable[kCompatTableSize] = {
// key[4], words[4] in UTF-8
// value[4]
{ {0x00000000,0x00000000,0x00000000,0x00000000}}, // [000]
};
// These are back-derived CTJKVZ probabilities from the table
// kTargetCTJKVZProbs in cldutil.cc
// This is all part of using one-byte mappings for CJK but wanting to
// convert them to normal langprob values to share the scoring code.
static const uint32 kCompatTableSizeOne = 0; // One-langprob count
// Element count of the kCompatTableInd array that follows, written as 239 * 2.
// NOTE(review): the rows of that array carry two uint32 words per numbered
// entry, so this presumably means 239 two-word entries - confirm.
static const uint32 kCompatTableIndSize = 239 * 2; // Largest subscript
static const uint32 kCompatTableInd[kCompatTableIndSize] = {
// [0000]
0x00000000, 0x00000000, // [0] zh.0 zhT.0 ja.0 ko.0 vi.0 za.0
0x00006142, 0x00000000, // [1] zh.0 zhT.0 ja.0 ko.0 vi.0 za.12
0x00002d42, 0x00000000, // [2] zh.0 zhT.0 ja.0 ko.0 vi.12 za.0
0x00000342, 0x00000000, // [3] zh.0 zhT.0 ja.0 ko.12 vi.0 za.0
0x00000242, 0x00000000, // [4] zh.0 zhT.0 ja.12 ko.0 vi.0 za.0
0x00001d42, 0x00000000, // [5] zh.0 zhT.12 ja.0 ko.0 vi.0 za.0
0x00000542, 0x00000000, // [6] zh.12 zhT.0 ja.0 ko.0 vi.0 za.0
0x2d00051f, 0x00000000, // [7] zh.8 zhT.0 ja.0 ko.0 vi.4 za.0
0x0300051f, 0x00000000, // [8] zh.8 zhT.0 ja.0 ko.4 vi.0 za.0
0x0200051f, 0x00000000, // [9] zh.8 zhT.0 ja.4 ko.0 vi.0 za.0
0x1d00051f, 0x00000000, // [10] zh.8 zhT.4 ja.0 ko.0 vi.0 za.0
0x031d05ea, 0x00000000, // [11] zh.8 zhT.2 ja.0 ko.2 vi.0 za.0
0x0000611c, 0x00000000, // [12] zh.0 zhT.0 ja.0 ko.0 vi.0 za.8
0x1d00021f, 0x00000000, // [13] zh.0 zhT.4 ja.8 ko.0 vi.0 za.0
0x0500611f, 0x00000000, // [14] zh.4 zhT.0 ja.0 ko.0 vi.0 za.8
0x0000021c, 0x00000000, // [15] zh.0 zhT.0 ja.8 ko.0 vi.0 za.0
0x021d05ea, 0x00000000, // [16] zh.8 zhT.2 ja.2 ko.0 vi.0 za.0
0x02001d1f, 0x00000000, // [17] zh.0 zhT.8 ja.4 ko.0 vi.0 za.0
0x6100051f, 0x00000000, // [18] zh.8 zhT.0 ja.0 ko.0 vi.0 za.4
0x02001d1d, 0x00000000, // [19] zh.0 zhT.8 ja.2 ko.0 vi.0 za.0
0x05001d1f, 0x00000000, // [20] zh.4 zhT.8 ja.0 ko.0 vi.0 za.0
0x03051dea, 0x00000000, // [21] zh.2 zhT.8 ja.0 ko.2 vi.0 za.0
0x051d02ea, 0x00000000, // [22] zh.2 zhT.2 ja.8 ko.0 vi.0 za.0
0x00001d1c, 0x00000000, // [23] zh.0 zhT.8 ja.0 ko.0 vi.0 za.0
0x1d00021d, 0x00000000, // [24] zh.0 zhT.2 ja.8 ko.0 vi.0 za.0
0x02051dea, 0x00000000, // [25] zh.2 zhT.8 ja.2 ko.0 vi.0 za.0
0x0000051c, 0x00000000, // [26] zh.8 zhT.0 ja.0 ko.0 vi.0 za.0
0x05001d1d, 0x00000000, // [27] zh.2 zhT.8 ja.0 ko.0 vi.0 za.0
0x1d00051d, 0x00000000, // [28] zh.8 zhT.2 ja.0 ko.0 vi.0 za.0
0x2d021ded, 0x00000000, // [29] zh.0 zhT.6 ja.2 ko.0 vi.2 za.0
0x05002d10, 0x00000000, // [30] zh.2 zhT.0 ja.0 ko.0 vi.6 za.0
0x05002d12, 0x00000000, // [31] zh.4 zhT.0 ja.0 ko.0 vi.6 za.0
0x2d051dec, 0x00000000, // [32] zh.4 zhT.6 ja.0 ko.0 vi.4 za.0
0x02051d10, 0x00002d01, // [33] zh.4 zhT.6 ja.2 ko.0 vi.2 za.0
0x02051dec, 0x00002d01, // [34] zh.4 zhT.6 ja.4 ko.0 vi.2 za.0
0x1d050212, 0x00000000, // [35] zh.5 zhT.4 ja.6 ko.0 vi.0 za.0
0x2d000512, 0x00000000, // [36] zh.6 zhT.0 ja.0 ko.0 vi.4 za.0
0x022d0510, 0x00000000, // [37] zh.6 zhT.0 ja.2 ko.0 vi.4 za.0
0x2d0205ec, 0x00000000, // [38] zh.6 zhT.0 ja.4 ko.0 vi.4 za.0
0x1d2d0510, 0x00000000, // [39] zh.6 zhT.2 ja.0 ko.0 vi.4 za.0
0x022d0510, 0x00001d01, // [40] zh.6 zhT.2 ja.2 ko.0 vi.4 za.0
0x1d020510, 0x00002d01, // [41] zh.6 zhT.2 ja.4 ko.0 vi.2 za.0
0x2d1d0510, 0x00000000, // [42] zh.6 zhT.4 ja.0 ko.0 vi.2 za.0
0x021d0510, 0x00002d01, // [43] zh.6 zhT.4 ja.2 ko.0 vi.2 za.0
0x03000210, 0x00000000, // [44] zh.0 zhT.0 ja.6 ko.2 vi.0 za.0
0x61021ded, 0x00000000, // [45] zh.0 zhT.6 ja.2 ko.0 vi.0 za.2
0x021d61ed, 0x00000501, // [46] zh.2 zhT.2 ja.2 ko.0 vi.0 za.6
0x05030210, 0x00001d01, // [47] zh.2 zhT.2 ja.6 ko.4 vi.0 za.0
0x051d6110, 0x00000000, // [48] zh.2 zhT.4 ja.0 ko.0 vi.0 za.6
0x05031d10, 0x00000000, // [49] zh.2 zhT.6 ja.0 ko.4 vi.0 za.0
0x02031d10, 0x00000501, // [50] zh.2 zhT.6 ja.2 ko.4 vi.0 za.0
0x03021dec, 0x00000501, // [51] zh.2 zhT.6 ja.4 ko.4 vi.0 za.0
0x02056110, 0x00000000, // [52] zh.4 zhT.0 ja.2 ko.0 vi.0 za.6
0x1d050210, 0x00000301, // [53] zh.4 zhT.2 ja.6 ko.2 vi.0 za.0
0x051d61ec, 0x00000201, // [54] zh.4 zhT.4 ja.2 ko.0 vi.0 za.6
0x02051dec, 0x00006101, // [55] zh.4 zhT.6 ja.4 ko.0 vi.0 za.2
0x610205ed, 0x00000000, // [56] zh.6 zhT.0 ja.2 ko.0 vi.0 za.2
0x611d05ed, 0x00000000, // [57] zh.6 zhT.2 ja.0 ko.0 vi.0 za.2
0x02610510, 0x00001d01, // [58] zh.6 zhT.2 ja.2 ko.0 vi.0 za.4
0x1d020510, 0x00006101, // [59] zh.6 zhT.2 ja.4 ko.0 vi.0 za.2
0x61051dec, 0x00000201, // [60] zh.4 zhT.6 ja.2 ko.0 vi.0 za.4
0x611d05ec, 0x00000201, // [61] zh.6 zhT.4 ja.2 ko.0 vi.0 za.4
0x05006110, 0x00000000, // [62] zh.2 zhT.0 ja.0 ko.0 vi.0 za.6
0x031d05ed, 0x00000000, // [63] zh.6 zhT.2 ja.0 ko.2 vi.0 za.0
0x051d61ed, 0x00000000, // [64] zh.2 zhT.2 ja.0 ko.0 vi.0 za.6
0x1d0205eb, 0x00000000, // [65] zh.6 zhT.2 ja.6 ko.0 vi.0 za.0
0x021d0510, 0x00006101, // [66] zh.6 zhT.4 ja.2 ko.0 vi.0 za.2
0x021d0510, 0x00000301, // [67] zh.6 zhT.4 ja.2 ko.2 vi.0 za.0
0x02051dec, 0x00000301, // [68] zh.4 zhT.6 ja.4 ko.2 vi.0 za.0
0x02610510, 0x00000000, // [69] zh.6 zhT.0 ja.2 ko.0 vi.0 za.4
0x61020510, 0x00000000, // [70] zh.6 zhT.0 ja.4 ko.0 vi.0 za.2
0x02000514, 0x00000000, // [71] zh.6 zhT.0 ja.6 ko.0 vi.0 za.0
0x021d05ed, 0x00000000, // [72] zh.6 zhT.2 ja.2 ko.0 vi.0 za.0
0x611d0510, 0x00000000, // [73] zh.6 zhT.4 ja.0 ko.0 vi.0 za.2
0x1d020512, 0x00000000, // [74] zh.6 zhT.4 ja.5 ko.0 vi.0 za.0
0x03001d10, 0x00000000, // [75] zh.0 zhT.6 ja.0 ko.2 vi.0 za.0
0x03021ded, 0x00000000, // [76] zh.0 zhT.6 ja.2 ko.2 vi.0 za.0
0x03051ded, 0x00000000, // [77] zh.2 zhT.6 ja.0 ko.2 vi.0 za.0
0x02051ded, 0x00000301, // [78] zh.2 zhT.6 ja.2 ko.2 vi.0 za.0
0x1d056110, 0x00000000, // [79] zh.4 zhT.2 ja.0 ko.0 vi.0 za.6
0x611d05ec, 0x00000000, // [80] zh.6 zhT.4 ja.0 ko.0 vi.0 za.4
0x031d0510, 0x00000000, // [81] zh.6 zhT.4 ja.0 ko.2 vi.0 za.0
0x031d05eb, 0x00000000, // [82] zh.6 zhT.6 ja.0 ko.2 vi.0 za.0
0x610205ec, 0x00000000, // [83] zh.6 zhT.0 ja.4 ko.0 vi.0 za.4
0x1d610510, 0x00000000, // [84] zh.6 zhT.2 ja.0 ko.0 vi.0 za.4
0x021d05eb, 0x00000301, // [85] zh.6 zhT.6 ja.2 ko.2 vi.0 za.0
0x61051d10, 0x00000000, // [86] zh.4 zhT.6 ja.0 ko.0 vi.0 za.2
0x05021deb, 0x00000000, // [87] zh.2 zhT.6 ja.6 ko.0 vi.0 za.0
0x051d0212, 0x00000000, // [88] zh.4 zhT.5 ja.6 ko.0 vi.0 za.0
0x03051d10, 0x00000000, // [89] zh.4 zhT.6 ja.0 ko.2 vi.0 za.0
0x1d6105eb, 0x00000000, // [90] zh.6 zhT.2 ja.0 ko.0 vi.0 za.6
0x03021d10, 0x00000000, // [91] zh.0 zhT.6 ja.4 ko.2 vi.0 za.0
0x05000212, 0x00000000, // [92] zh.4 zhT.0 ja.6 ko.0 vi.0 za.0
0x05021d10, 0x00000301, // [93] zh.2 zhT.6 ja.4 ko.2 vi.0 za.0
0x61051dec, 0x00000000, // [94] zh.4 zhT.6 ja.0 ko.0 vi.0 za.4
0x021d05ed, 0x00000000, // [95] zh.6 zhT.2 ja.2 ko.0 vi.0 za.0
0x02051d10, 0x00000301, // [96] zh.4 zhT.6 ja.2 ko.2 vi.0 za.0
0x05021d12, 0x00000000, // [97] zh.4 zhT.6 ja.5 ko.0 vi.0 za.0
0x02000510, 0x00000000, // [98] zh.6 zhT.0 ja.2 ko.0 vi.0 za.0
0x021d05ec, 0x00000000, // [99] zh.6 zhT.4 ja.4 ko.0 vi.0 za.0
0x1d050210, 0x00000000, // [100] zh.4 zhT.2 ja.6 ko.0 vi.0 za.0
0x05000210, 0x00000000, // [101] zh.2 zhT.0 ja.6 ko.0 vi.0 za.0
0x051d61ec, 0x00000000, // [102] zh.4 zhT.4 ja.0 ko.0 vi.0 za.6
0x051d02ec, 0x00000000, // [103] zh.4 zhT.4 ja.6 ko.0 vi.0 za.0
0x02051d10, 0x00006101, // [104] zh.4 zhT.6 ja.2 ko.0 vi.0 za.2
0x051d02ed, 0x00000000, // [105] zh.2 zhT.2 ja.6 ko.0 vi.0 za.0
0x051d0210, 0x00000000, // [106] zh.2 zhT.4 ja.6 ko.0 vi.0 za.0
0x02001d14, 0x00000000, // [107] zh.0 zhT.6 ja.6 ko.0 vi.0 za.0
0x1d020510, 0x00000000, // [108] zh.6 zhT.2 ja.4 ko.0 vi.0 za.0
0x1d000212, 0x00000000, // [109] zh.0 zhT.4 ja.6 ko.0 vi.0 za.0
0x05006112, 0x00000000, // [110] zh.4 zhT.0 ja.0 ko.0 vi.0 za.6
0x02051dec, 0x00000000, // [111] zh.4 zhT.6 ja.4 ko.0 vi.0 za.0
0x61000514, 0x00000000, // [112] zh.6 zhT.0 ja.0 ko.0 vi.0 za.6
0x61000510, 0x00000000, // [113] zh.6 zhT.0 ja.0 ko.0 vi.0 za.2
0x02000512, 0x00000000, // [114] zh.6 zhT.0 ja.4 ko.0 vi.0 za.0
0x021d0512, 0x00000000, // [115] zh.6 zhT.5 ja.4 ko.0 vi.0 za.0
0x1d000210, 0x00000000, // [116] zh.0 zhT.2 ja.6 ko.0 vi.0 za.0
0x0000020f, 0x00000000, // [117] zh.0 zhT.0 ja.6 ko.0 vi.0 za.0
0x021d05eb, 0x00000000, // [118] zh.6 zhT.6 ja.2 ko.0 vi.0 za.0
0x05021d10, 0x00000000, // [119] zh.2 zhT.6 ja.4 ko.0 vi.0 za.0
0x021d0510, 0x00000000, // [120] zh.6 zhT.4 ja.2 ko.0 vi.0 za.0
0x02051ded, 0x00000000, // [121] zh.2 zhT.6 ja.2 ko.0 vi.0 za.0
0x05001d10, 0x00000000, // [122] zh.2 zhT.6 ja.0 ko.0 vi.0 za.0
0x61000512, 0x00000000, // [123] zh.6 zhT.0 ja.0 ko.0 vi.0 za.4
0x1d000512, 0x00000000, // [124] zh.6 zhT.4 ja.0 ko.0 vi.0 za.0
0x1d000514, 0x00000000, // [125] zh.6 zhT.6 ja.0 ko.0 vi.0 za.0
0x02051d12, 0x00000000, // [126] zh.5 zhT.6 ja.4 ko.0 vi.0 za.0
0x00001d0f, 0x00000000, // [127] zh.0 zhT.6 ja.0 ko.0 vi.0 za.0
0x1d000510, 0x00000000, // [128] zh.6 zhT.2 ja.0 ko.0 vi.0 za.0
0x02001d10, 0x00000000, // [129] zh.0 zhT.6 ja.2 ko.0 vi.0 za.0
0x02051d10, 0x00000000, // [130] zh.4 zhT.6 ja.2 ko.0 vi.0 za.0
0x02001d12, 0x00000000, // [131] zh.0 zhT.6 ja.4 ko.0 vi.0 za.0
0x05001d12, 0x00000000, // [132] zh.4 zhT.6 ja.0 ko.0 vi.0 za.0
0x0000050f, 0x00000000, // [133] zh.6 zhT.0 ja.0 ko.0 vi.0 za.0
0x021d0513, 0x00000000, // [134] zh.6 zhT.6 ja.5 ko.0 vi.0 za.0
0x1d020513, 0x00000000, // [135] zh.6 zhT.5 ja.6 ko.0 vi.0 za.0
0x05021d13, 0x00000000, // [136] zh.5 zhT.6 ja.6 ko.0 vi.0 za.0
0x051d02af, 0x00000000, // [137] zh.5 zhT.5 ja.6 ko.0 vi.0 za.0
0x02051daf, 0x00000000, // [138] zh.5 zhT.6 ja.5 ko.0 vi.0 za.0
0x021d05af, 0x00000000, // [139] zh.6 zhT.5 ja.5 ko.0 vi.0 za.0
0x021d0514, 0x00000000, // [140] zh.6 zhT.6 ja.6 ko.0 vi.0 za.0
0x1d000513, 0x00000000, // [141] zh.6 zhT.5 ja.0 ko.0 vi.0 za.0
0x02000513, 0x00000000, // [142] zh.6 zhT.0 ja.5 ko.0 vi.0 za.0
0x02001d13, 0x00000000, // [143] zh.0 zhT.6 ja.5 ko.0 vi.0 za.0
0x05001d13, 0x00000000, // [144] zh.5 zhT.6 ja.0 ko.0 vi.0 za.0
0x05000213, 0x00000000, // [145] zh.5 zhT.0 ja.6 ko.0 vi.0 za.0
0x1d000213, 0x00000000, // [146] zh.0 zhT.5 ja.6 ko.0 vi.0 za.0
0x00002d06, 0x00000000, // [147] zh.0 zhT.0 ja.0 ko.0 vi.4 za.0
0x00000306, 0x00000000, // [148] zh.0 zhT.0 ja.0 ko.4 vi.0 za.0
0x051d2dee, 0x00000000, // [149] zh.2 zhT.2 ja.0 ko.0 vi.4 za.0
0x021d2dee, 0x00000501, // [150] zh.2 zhT.2 ja.2 ko.0 vi.4 za.0
0x2d051dee, 0x00000000, // [151] zh.2 zhT.4 ja.0 ko.0 vi.2 za.0
0x02051dee, 0x00002d01, // [152] zh.2 zhT.4 ja.2 ko.0 vi.2 za.0
0x05021d55, 0x00002d01, // [153] zh.2 zhT.4 ja.4 ko.0 vi.2 za.0
0x022d0555, 0x00000000, // [154] zh.4 zhT.0 ja.2 ko.0 vi.4 za.0
0x2d020555, 0x00000000, // [155] zh.4 zhT.0 ja.4 ko.0 vi.2 za.0
0x2d1d05ee, 0x00000000, // [156] zh.4 zhT.2 ja.0 ko.0 vi.2 za.0
0x021d05ee, 0x00002d01, // [157] zh.4 zhT.2 ja.2 ko.0 vi.2 za.0
0x2d1d0555, 0x00000000, // [158] zh.4 zhT.4 ja.0 ko.0 vi.2 za.0
0x021d0555, 0x00002d01, // [159] zh.4 zhT.4 ja.2 ko.0 vi.2 za.0
0x021d0509, 0x00002d01, // [160] zh.4 zhT.4 ja.4 ko.0 vi.2 za.0
0x1d0203ee, 0x00000000, // [161] zh.0 zhT.2 ja.2 ko.4 vi.0 za.0
0x051d02ee, 0x00000301, // [162] zh.2 zhT.2 ja.4 ko.2 vi.0 za.0
0x05021d55, 0x00006101, // [163] zh.2 zhT.4 ja.4 ko.0 vi.0 za.2
0x05021d55, 0x00000301, // [164] zh.2 zhT.4 ja.4 ko.2 vi.0 za.0
0x61020555, 0x00000000, // [165] zh.4 zhT.0 ja.4 ko.0 vi.0 za.2
0x61020509, 0x00000000, // [166] zh.4 zhT.0 ja.4 ko.0 vi.0 za.4
0x02030555, 0x00001d01, // [167] zh.4 zhT.2 ja.2 ko.4 vi.0 za.0
0x031d0555, 0x00000000, // [168] zh.4 zhT.4 ja.0 ko.2 vi.0 za.0
0x051d03ee, 0x00000000, // [169] zh.2 zhT.2 ja.0 ko.4 vi.0 za.0
0x02051dee, 0x00000301, // [170] zh.2 zhT.4 ja.2 ko.2 vi.0 za.0
0x021d0555, 0x00000301, // [171] zh.4 zhT.4 ja.2 ko.2 vi.0 za.0
0x02000509, 0x00000000, // [172] zh.4 zhT.0 ja.4 ko.0 vi.0 za.0
0x021d0509, 0x00006106, // [173] zh.4 zhT.4 ja.4 ko.0 vi.0 za.4
0x03001d07, 0x00000000, // [174] zh.0 zhT.4 ja.0 ko.2 vi.0 za.0
0x03021dee, 0x00000000, // [175] zh.0 zhT.4 ja.2 ko.2 vi.0 za.0
0x610205ee, 0x00000000, // [176] zh.4 zhT.0 ja.2 ko.0 vi.0 za.2
0x1d610555, 0x00000000, // [177] zh.4 zhT.2 ja.0 ko.0 vi.0 za.4
0x021d61ee, 0x00000501, // [178] zh.2 zhT.2 ja.2 ko.0 vi.0 za.4
0x03000507, 0x00000000, // [179] zh.4 zhT.0 ja.0 ko.2 vi.0 za.0
0x021d0509, 0x00006101, // [180] zh.4 zhT.4 ja.4 ko.0 vi.0 za.2
0x61000509, 0x00000000, // [181] zh.4 zhT.0 ja.0 ko.0 vi.0 za.4
0x02610555, 0x00000000, // [182] zh.4 zhT.0 ja.2 ko.0 vi.0 za.4
0x611d05ee, 0x00000000, // [183] zh.4 zhT.2 ja.0 ko.0 vi.0 za.2
0x021d05ee, 0x00006101, // [184] zh.4 zhT.2 ja.2 ko.0 vi.0 za.2
0x03051dee, 0x00000000, // [185] zh.2 zhT.4 ja.0 ko.2 vi.0 za.0
0x051d61ee, 0x00000000, // [186] zh.2 zhT.2 ja.0 ko.0 vi.0 za.4
0x05611d55, 0x00000000, // [187] zh.2 zhT.4 ja.0 ko.0 vi.0 za.4
0x02611d55, 0x00000501, // [188] zh.2 zhT.4 ja.2 ko.0 vi.0 za.4
0x1d020555, 0x00000000, // [189] zh.4 zhT.2 ja.4 ko.0 vi.0 za.0
0x05000207, 0x00000000, // [190] zh.2 zhT.0 ja.4 ko.0 vi.0 za.0
0x02000507, 0x00000000, // [191] zh.4 zhT.0 ja.2 ko.0 vi.0 za.0
0x611d0509, 0x00000000, // [192] zh.4 zhT.4 ja.0 ko.0 vi.0 za.4
0x611d0509, 0x00000201, // [193] zh.4 zhT.4 ja.2 ko.0 vi.0 za.4
0x02001d09, 0x00000000, // [194] zh.0 zhT.4 ja.4 ko.0 vi.0 za.0
0x611d0555, 0x00000000, // [195] zh.4 zhT.4 ja.0 ko.0 vi.0 za.2
0x61051dee, 0x00000000, // [196] zh.2 zhT.4 ja.0 ko.0 vi.0 za.2
0x051d02ee, 0x00000000, // [197] zh.2 zhT.2 ja.4 ko.0 vi.0 za.0
0x1d000207, 0x00000000, // [198] zh.0 zhT.2 ja.4 ko.0 vi.0 za.0
0x021d05ee, 0x00000000, // [199] zh.4 zhT.2 ja.2 ko.0 vi.0 za.0
0x02051dee, 0x00006101, // [200] zh.2 zhT.4 ja.2 ko.0 vi.0 za.2
0x021d0509, 0x00000000, // [201] zh.4 zhT.4 ja.4 ko.0 vi.0 za.0
0x05021d55, 0x00000000, // [202] zh.2 zhT.4 ja.4 ko.0 vi.0 za.0
0x00000206, 0x00000000, // [203] zh.0 zhT.0 ja.4 ko.0 vi.0 za.0
0x02001d07, 0x00000000, // [204] zh.0 zhT.4 ja.2 ko.0 vi.0 za.0
0x021d0555, 0x00006101, // [205] zh.4 zhT.4 ja.2 ko.0 vi.0 za.2
0x02051dee, 0x00000000, // [206] zh.2 zhT.4 ja.2 ko.0 vi.0 za.0
0x1d000507, 0x00000000, // [207] zh.4 zhT.2 ja.0 ko.0 vi.0 za.0
0x1d000509, 0x00000000, // [208] zh.4 zhT.4 ja.0 ko.0 vi.0 za.0
0x021d0555, 0x00000000, // [209] zh.4 zhT.4 ja.2 ko.0 vi.0 za.0
0x05001d07, 0x00000000, // [210] zh.2 zhT.4 ja.0 ko.0 vi.0 za.0
0x00001d06, 0x00000000, // [211] zh.0 zhT.4 ja.0 ko.0 vi.0 za.0
0x00000506, 0x00000000, // [212] zh.4 zhT.0 ja.0 ko.0 vi.0 za.0
0x2d000309, 0x00000000, // [213] zh.0 zhT.0 ja.0 ko.4 vi.4 za.0
0x2d000209, 0x00000000, // [214] zh.0 zhT.0 ja.4 ko.0 vi.4 za.0
0x03000209, 0x00000000, // [215] zh.0 zhT.0 ja.4 ko.4 vi.0 za.0
0x2d001d09, 0x00000000, // [216] zh.0 zhT.4 ja.0 ko.0 vi.4 za.0
0x03001d09, 0x00000000, // [217] zh.0 zhT.4 ja.0 ko.4 vi.0 za.0
0x2d000509, 0x00000000, // [218] zh.4 zhT.0 ja.0 ko.0 vi.4 za.0
0x03000509, 0x00000000, // [219] zh.4 zhT.0 ja.0 ko.4 vi.0 za.0
0x00000501, 0x00000000, // [220] zh.2 zhT.0 ja.0 ko.0 vi.0 za.0
0x00001d01, 0x00000000, // [221] zh.0 zhT.2 ja.0 ko.0 vi.0 za.0
0x2d031d02, 0x00000000, // [222] zh.0 zhT.2 ja.0 ko.2 vi.2 za.0
0x2d021d02, 0x00000000, // [223] zh.0 zhT.2 ja.2 ko.0 vi.2 za.0
0x2d030502, 0x00000000, // [224] zh.2 zhT.0 ja.0 ko.2 vi.2 za.0
0x2d020502, 0x00000000, // [225] zh.2 zhT.0 ja.2 ko.0 vi.2 za.0
0x03020502, 0x00000000, // [226] zh.2 zhT.0 ja.2 ko.2 vi.0 za.0
0x2d1d0502, 0x00000000, // [227] zh.2 zhT.2 ja.0 ko.0 vi.2 za.0
0x021d0502, 0x00000301, // [228] zh.2 zhT.2 ja.2 ko.2 vi.0 za.0
0x031d0502, 0x00000000, // [229] zh.2 zhT.2 ja.0 ko.2 vi.0 za.0
0x1d000502, 0x00000000, // [230] zh.2 zhT.2 ja.0 ko.0 vi.0 za.0
0x00000201, 0x00000000, // [231] zh.0 zhT.0 ja.2 ko.0 vi.0 za.0
0x02001d02, 0x00000000, // [232] zh.0 zhT.2 ja.2 ko.0 vi.0 za.0
0x021d0502, 0x00000000, // [233] zh.2 zhT.2 ja.2 ko.0 vi.0 za.0
0x00000301, 0x00000000, // [234] zh.0 zhT.0 ja.0 ko.2 vi.0 za.0
0x02000502, 0x00000000, // [235] zh.2 zhT.0 ja.2 ko.0 vi.0 za.0
0x03001d02, 0x00000000, // [236] zh.0 zhT.2 ja.0 ko.2 vi.0 za.0
0x03000202, 0x00000000, // [237] zh.0 zhT.0 ja.2 ko.2 vi.0 za.0
0x03021d02, 0x00000000, // [238] zh.0 zhT.2 ja.2 ko.2 vi.0 za.0
};
// Table summary that bundles the CJK compatible table and its constants.
// The initializer is positional, so the order below must match the field order
// of CLD2TableSummary (declared in cld2tablesummary.h); the field each entry
// fills is named in the trailing comment.
extern const CLD2TableSummary kCjkCompat_obj = {
kCompatTable, // kCLDTable
kCompatTableInd, // kCLDTableInd
kCompatTableSizeOne, // kCLDTableSizeOne
kCompatTableSize, // kCLDTableSize
kCompatTableKeyMask, // kCLDTableKeyMask
kCompatTableBuildDate, // kCLDTableBuildDate
kCompatTableRecognizedLangScripts, // kRecognizedLangScripts
};
} // End namespace CLD2
// End of generated tables

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,55 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Author: dsites@google.com (Dick Sites)
//
#ifndef I18N_ENCODINGS_CLD2_INTERNAL_CLD2TABLESUMMARY_H_
#define I18N_ENCODINGS_CLD2_INTERNAL_CLD2TABLESUMMARY_H_
#include "integral_types.h"
namespace CLD2 {
// One hash bucket used for four-way associative lookup with indirect
// probabilities: four 4-byte entries, 16 bytes per bucket.
struct IndirectProbBucket4 {
  // In each entry the upper part of the word is the hash and the lower part
  // is the indirect prob.
  uint32 keyvalue[4];
};
// Summary record describing one scoring lookup table.
// Expanded version December 2012:
//  - moves the cutoff for 6-language vs. 3-language indirects
//  - has the list of recognized lang-script combinations
// The field order is significant: summaries are built with positional
// aggregate initializers.
struct CLD2TableSummary {
  // Bucket array; each bucket has four entries, part key and part indirect
  // subscript.
  const IndirectProbBucket4* kCLDTable;
  // Indirect array; each entry is three packed lang/prob.
  const uint32* kCLDTableInd;
  // Indirect subscripts >= this: 2 entries.
  uint32 kCLDTableSizeOne;
  // Bucket count.
  uint32 kCLDTableSize;
  // Mask hash key.
  uint32 kCLDTableKeyMask;
  // Build date, yyyymmdd.
  uint32 kCLDTableBuildDate;
  // Character string of the lang-Scripts recognized: "en-Latn az-Arab ...".
  // Single space delimiter, random order.
  const char* kRecognizedLangScripts;
};
} // End namespace CLD2
#endif // I18N_ENCODINGS_CLD2_INTERNAL_CLD2TABLESUMMARY_H_

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,640 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
namespace CLD2 {
// Average score per 1024 bytes
//
// kAvgDeltaOctaScore holds 614 rows of four shorts (614 * 4 elements). The
// trailing comment on each row gives the row number (0..613) followed by a
// language name and code, or just the number again for unnamed rows. The four
// columns are, in order: Latn, Cyrl, Arab, Other script.
// NOTE(review): a zero presumably means no score is recorded for that
// language/script pair, and consumers presumably index the array as
// [row * 4 + script column] -- confirm against the lookup code.
static const int kAvgDeltaOctaScoreSize = 614 * 4;
extern const short kAvgDeltaOctaScore[kAvgDeltaOctaScoreSize] = {
// Latn Cyrl Arab Other script
// Updated 20140202 for CLD2 Chrome 256K entries
1314, 0, 0, 0, // 0 ENGLISH en
1188, 0, 0, 0, // 1 DANISH da
1133, 0, 0, 0, // 2 DUTCH nl
1306, 0, 0, 0, // 3 FINNISH fi
1033, 0, 0, 0, // 4 FRENCH fr
1306, 0, 0, 0, // 5 GERMAN de
0, 0, 0, 776, // 6 HEBREW iw
960, 0, 0, 0, // 7 ITALIAN it
0, 0, 0, 3100, // 8 Japanese ja
0, 0, 0, 3669, // 9 Korean ko
1274, 0, 0, 0, // 10 NORWEGIAN no
1313, 0, 0, 0, // 11 POLISH pl
1061, 0, 0, 0, // 12 PORTUGUESE pt
0, 776, 0, 0, // 13 RUSSIAN ru
762, 0, 0, 0, // 14 SPANISH es
1040, 0, 0, 0, // 15 SWEDISH sv
0, 0, 0, 1928, // 16 Chinese zh
1286, 0, 0, 0, // 17 CZECH cs
0, 0, 0, 1024, // 18 GREEK el
1235, 0, 0, 0, // 19 ICELANDIC is
1236, 0, 0, 0, // 20 LATVIAN lv
1157, 0, 0, 0, // 21 LITHUANIAN lt
961, 771, 0, 0, // 22 ROMANIAN ro
1284, 0, 0, 0, // 23 HUNGARIAN hu
1172, 0, 0, 0, // 24 ESTONIAN et
0, 0, 0, 0, // 25 Ignore xxx
0, 0, 0, 0, // 26 Unknown un
0, 793, 0, 0, // 27 BULGARIAN bg
563, 0, 0, 0, // 28 CROATIAN hr
564, 930, 0, 0, // 29 SERBIAN sr
1424, 0, 0, 0, // 30 IRISH ga
888, 0, 0, 0, // 31 GALICIAN gl
1381, 0, 0, 0, // 32 TAGALOG tl
1298, 0, 0, 0, // 33 TURKISH tr
0, 803, 0, 0, // 34 UKRAINIAN uk
0, 0, 0, 744, // 35 HINDI hi
0, 854, 0, 0, // 36 MACEDONIAN mk
0, 0, 0, 600, // 37 BENGALI bn
1418, 0, 0, 0, // 38 INDONESIAN id
0, 0, 0, 0, // 39 LATIN la
1521, 0, 0, 0, // 40 MALAY ms
0, 0, 0, 1024, // 41 MALAYALAM ml
1669, 0, 0, 0, // 42 WELSH cy
0, 0, 0, 545, // 43 NEPALI ne
0, 0, 0, 1024, // 44 TELUGU te
1304, 0, 0, 0, // 45 ALBANIAN sq
0, 0, 0, 1024, // 46 TAMIL ta
0, 594, 0, 0, // 47 BELARUSIAN be
1115, 0, 0, 0, // 48 JAVANESE jw
0, 0, 0, 0, // 49 OCCITAN oc
0, 0, 1033, 0, // 50 URDU ur
0, 0, 0, 527, // 51 BIHARI bh
0, 0, 0, 1024, // 52 GUJARATI gu
0, 0, 0, 1024, // 53 THAI th
0, 0, 843, 0, // 54 ARABIC ar
806, 0, 0, 0, // 55 CATALAN ca
0, 0, 0, 0, // 56 ESPERANTO eo
1425, 0, 0, 0, // 57 BASQUE eu
0, 0, 0, 0, // 58 INTERLINGUA ia
0, 0, 0, 1024, // 59 KANNADA kn
0, 0, 0, 1024, // 60 PUNJABI pa
1583, 0, 0, 0, // 61 SCOTS_GAELIC gd
1396, 0, 0, 0, // 62 SWAHILI sw
718, 0, 0, 0, // 63 SLOVENIAN sl
0, 0, 0, 584, // 64 MARATHI mr
1219, 0, 0, 0, // 65 MALTESE mt
1470, 0, 0, 0, // 66 VIETNAMESE vi
0, 0, 0, 0, // 67 FRISIAN fy
1314, 0, 0, 0, // 68 SLOVAK sk
0, 0, 0, 1908, // 69 ChineseT zh-Hant
0, 0, 0, 0, // 70 FAROESE fo
0, 0, 0, 0, // 71 SUNDANESE su
0, 0, 0, 0, // 72 UZBEK uz
0, 0, 0, 0, // 73 AMHARIC am
1425, 0, 0, 0, // 74 AZERBAIJANI az
0, 0, 0, 1024, // 75 GEORGIAN ka
0, 0, 0, 0, // 76 TIGRINYA ti
0, 0, 1013, 0, // 77 PERSIAN fa
596, 0, 0, 0, // 78 BOSNIAN bs
0, 0, 0, 1024, // 79 SINHALESE si
0, 0, 0, 0, // 80 NORWEGIAN_N nn
0, 0, 0, 0, // 81 81
0, 0, 0, 0, // 82 82
0, 0, 0, 0, // 83 XHOSA xh
1962, 0, 0, 0, // 84 ZULU zu
0, 0, 0, 0, // 85 GUARANI gn
0, 0, 0, 0, // 86 SESOTHO st
0, 0, 0, 0, // 87 TURKMEN tk
0, 0, 0, 0, // 88 KYRGYZ ky
0, 0, 0, 0, // 89 BRETON br
0, 0, 0, 0, // 90 TWI tw
0, 0, 0, 978, // 91 YIDDISH yi
0, 0, 0, 0, // 92 92
1544, 0, 0, 0, // 93 SOMALI so
0, 0, 0, 0, // 94 UIGHUR ug
0, 0, 0, 0, // 95 KURDISH ku
0, 0, 0, 1024, // 96 MONGOLIAN mn
0, 0, 0, 1024, // 97 ARMENIAN hy
0, 0, 0, 1024, // 98 LAOTHIAN lo
0, 0, 0, 0, // 99 SINDHI sd
0, 0, 0, 0, // 100 RHAETO_ROMANCE rm
1179, 0, 0, 0, // 101 AFRIKAANS af
0, 0, 0, 0, // 102 LUXEMBOURGISH lb
0, 0, 0, 1024, // 103 BURMESE my
0, 0, 0, 1024, // 104 KHMER km
0, 0, 0, 0, // 105 TIBETAN bo
0, 0, 0, 1024, // 106 DHIVEHI dv
0, 0, 0, 1024, // 107 CHEROKEE chr
0, 0, 0, 1024, // 108 SYRIAC syr
0, 0, 0, 1024, // 109 LIMBU lif
0, 0, 0, 1024, // 110 ORIYA or
0, 0, 0, 0, // 111 ASSAMESE as
0, 0, 0, 0, // 112 CORSICAN co
0, 0, 0, 0, // 113 INTERLINGUE ie
0, 0, 0, 0, // 114 KAZAKH kk
0, 0, 0, 0, // 115 LINGALA ln
0, 0, 0, 0, // 116 116
0, 0, 0, 0, // 117 PASHTO ps
0, 0, 0, 0, // 118 QUECHUA qu
0, 0, 0, 0, // 119 SHONA sn
0, 0, 0, 0, // 120 TAJIK tg
0, 0, 0, 0, // 121 TATAR tt
0, 0, 0, 0, // 122 TONGA to
929, 0, 0, 0, // 123 YORUBA yo
0, 0, 0, 0, // 124 124
0, 0, 0, 0, // 125 125
0, 0, 0, 0, // 126 126
0, 0, 0, 0, // 127 127
0, 0, 0, 0, // 128 MAORI mi
0, 0, 0, 0, // 129 WOLOF wo
0, 0, 0, 0, // 130 ABKHAZIAN ab
0, 0, 0, 0, // 131 AFAR aa
0, 0, 0, 0, // 132 AYMARA ay
0, 0, 0, 0, // 133 BASHKIR ba
0, 0, 0, 0, // 134 BISLAMA bi
0, 0, 0, 0, // 135 DZONGKHA dz
0, 0, 0, 0, // 136 FIJIAN fj
0, 0, 0, 0, // 137 GREENLANDIC kl
1345, 0, 0, 0, // 138 HAUSA ha
1346, 0, 0, 0, // 139 HAITIAN_CREOLE ht
0, 0, 0, 0, // 140 INUPIAK ik
0, 0, 0, 1024, // 141 INUKTITUT iu
0, 0, 0, 0, // 142 KASHMIRI ks
1862, 0, 0, 0, // 143 KINYARWANDA rw
0, 0, 0, 0, // 144 MALAGASY mg
0, 0, 0, 0, // 145 NAURU na
0, 0, 0, 0, // 146 OROMO om
0, 0, 0, 0, // 147 RUNDI rn
0, 0, 0, 0, // 148 SAMOAN sm
0, 0, 0, 0, // 149 SANGO sg
0, 0, 0, 0, // 150 SANSKRIT sa
0, 0, 0, 0, // 151 SISWANT ss
0, 0, 0, 0, // 152 TSONGA ts
0, 0, 0, 0, // 153 TSWANA tn
0, 0, 0, 0, // 154 VOLAPUK vo
0, 0, 0, 0, // 155 ZHUANG za
0, 0, 0, 0, // 156 KHASI kha
0, 0, 0, 0, // 157 SCOTS sco
1385, 0, 0, 0, // 158 GANDA lg
0, 0, 0, 0, // 159 MANX gv
0, 0, 0, 0, // 160 MONTENEGRIN sr-ME
0, 0, 0, 0, // 161 AKAN ak
1245, 0, 0, 0, // 162 IGBO ig
0, 0, 0, 0, // 163 MAURITIAN_CREOLE mfe
0, 0, 0, 0, // 164 HAWAIIAN haw
1357, 0, 0, 0, // 165 CEBUANO ceb
0, 0, 0, 0, // 166 EWE ee
0, 0, 0, 0, // 167 GA gaa
2053, 0, 0, 0, // 168 HMONG hmn
0, 0, 0, 0, // 169 KRIO kri
0, 0, 0, 0, // 170 LOZI loz
0, 0, 0, 0, // 171 LUBA_LULUA lua
0, 0, 0, 0, // 172 LUO_KENYA_AND_TANZANIA luo
0, 0, 0, 0, // 173 NEWARI new
0, 0, 0, 0, // 174 NYANJA ny
0, 0, 0, 0, // 175 OSSETIAN os
0, 0, 0, 0, // 176 PAMPANGA pam
0, 0, 0, 0, // 177 PEDI nso
0, 0, 0, 0, // 178 RAJASTHANI raj
0, 0, 0, 0, // 179 SESELWA crs
0, 0, 0, 0, // 180 TUMBUKA tum
0, 0, 0, 0, // 181 VENDA ve
0, 0, 0, 0, // 182 WARAY_PHILIPPINES war
0, 0, 0, 0, // 183 183
0, 0, 0, 0, // 184 184
0, 0, 0, 0, // 185 185
0, 0, 0, 0, // 186 186
0, 0, 0, 0, // 187 187
0, 0, 0, 0, // 188 188
0, 0, 0, 0, // 189 189
0, 0, 0, 0, // 190 190
0, 0, 0, 0, // 191 191
0, 0, 0, 0, // 192 192
0, 0, 0, 0, // 193 193
0, 0, 0, 0, // 194 194
0, 0, 0, 0, // 195 195
0, 0, 0, 0, // 196 196
0, 0, 0, 0, // 197 197
0, 0, 0, 0, // 198 198
0, 0, 0, 0, // 199 199
0, 0, 0, 0, // 200 200
0, 0, 0, 0, // 201 201
0, 0, 0, 0, // 202 202
0, 0, 0, 0, // 203 203
0, 0, 0, 0, // 204 204
0, 0, 0, 0, // 205 205
0, 0, 0, 0, // 206 206
0, 0, 0, 0, // 207 207
0, 0, 0, 0, // 208 208
0, 0, 0, 0, // 209 209
0, 0, 0, 0, // 210 210
0, 0, 0, 0, // 211 211
0, 0, 0, 0, // 212 212
0, 0, 0, 0, // 213 213
0, 0, 0, 0, // 214 214
0, 0, 0, 0, // 215 215
0, 0, 0, 0, // 216 216
0, 0, 0, 0, // 217 217
0, 0, 0, 0, // 218 218
0, 0, 0, 0, // 219 219
0, 0, 0, 0, // 220 220
0, 0, 0, 0, // 221 221
0, 0, 0, 0, // 222 222
0, 0, 0, 0, // 223 223
0, 0, 0, 0, // 224 224
0, 0, 0, 0, // 225 225
0, 0, 0, 0, // 226 226
0, 0, 0, 0, // 227 227
0, 0, 0, 0, // 228 228
0, 0, 0, 0, // 229 229
0, 0, 0, 0, // 230 230
0, 0, 0, 0, // 231 231
0, 0, 0, 0, // 232 232
0, 0, 0, 0, // 233 233
0, 0, 0, 0, // 234 234
0, 0, 0, 0, // 235 235
0, 0, 0, 0, // 236 236
0, 0, 0, 0, // 237 237
0, 0, 0, 0, // 238 238
0, 0, 0, 0, // 239 239
0, 0, 0, 0, // 240 240
0, 0, 0, 0, // 241 241
0, 0, 0, 0, // 242 242
0, 0, 0, 0, // 243 243
0, 0, 0, 0, // 244 244
0, 0, 0, 0, // 245 245
0, 0, 0, 0, // 246 246
0, 0, 0, 0, // 247 247
0, 0, 0, 0, // 248 248
0, 0, 0, 0, // 249 249
0, 0, 0, 0, // 250 250
0, 0, 0, 0, // 251 251
0, 0, 0, 0, // 252 252
0, 0, 0, 0, // 253 253
0, 0, 0, 0, // 254 254
0, 0, 0, 0, // 255 255
0, 0, 0, 0, // 256 256
0, 0, 0, 0, // 257 257
0, 0, 0, 0, // 258 258
0, 0, 0, 0, // 259 259
0, 0, 0, 0, // 260 260
0, 0, 0, 0, // 261 261
0, 0, 0, 0, // 262 262
0, 0, 0, 0, // 263 263
0, 0, 0, 0, // 264 264
0, 0, 0, 0, // 265 265
0, 0, 0, 0, // 266 266
0, 0, 0, 0, // 267 267
0, 0, 0, 0, // 268 268
0, 0, 0, 0, // 269 269
0, 0, 0, 0, // 270 270
0, 0, 0, 0, // 271 271
0, 0, 0, 0, // 272 272
0, 0, 0, 0, // 273 273
0, 0, 0, 0, // 274 274
0, 0, 0, 0, // 275 275
0, 0, 0, 0, // 276 276
0, 0, 0, 0, // 277 277
0, 0, 0, 0, // 278 278
0, 0, 0, 0, // 279 279
0, 0, 0, 0, // 280 280
0, 0, 0, 0, // 281 281
0, 0, 0, 0, // 282 282
0, 0, 0, 0, // 283 283
0, 0, 0, 0, // 284 284
0, 0, 0, 0, // 285 285
0, 0, 0, 0, // 286 286
0, 0, 0, 0, // 287 287
0, 0, 0, 0, // 288 288
0, 0, 0, 0, // 289 289
0, 0, 0, 0, // 290 290
0, 0, 0, 0, // 291 291
0, 0, 0, 0, // 292 292
0, 0, 0, 0, // 293 293
0, 0, 0, 0, // 294 294
0, 0, 0, 0, // 295 295
0, 0, 0, 0, // 296 296
0, 0, 0, 0, // 297 297
0, 0, 0, 0, // 298 298
0, 0, 0, 0, // 299 299
0, 0, 0, 0, // 300 300
0, 0, 0, 0, // 301 301
0, 0, 0, 0, // 302 302
0, 0, 0, 0, // 303 303
0, 0, 0, 0, // 304 304
0, 0, 0, 0, // 305 305
0, 0, 0, 0, // 306 306
0, 0, 0, 0, // 307 307
0, 0, 0, 0, // 308 308
0, 0, 0, 0, // 309 309
0, 0, 0, 0, // 310 310
0, 0, 0, 0, // 311 311
0, 0, 0, 0, // 312 312
0, 0, 0, 0, // 313 313
0, 0, 0, 0, // 314 314
0, 0, 0, 0, // 315 315
0, 0, 0, 0, // 316 316
0, 0, 0, 0, // 317 317
0, 0, 0, 0, // 318 318
0, 0, 0, 0, // 319 319
0, 0, 0, 0, // 320 320
0, 0, 0, 0, // 321 321
0, 0, 0, 0, // 322 322
0, 0, 0, 0, // 323 323
0, 0, 0, 0, // 324 324
0, 0, 0, 0, // 325 325
0, 0, 0, 0, // 326 326
0, 0, 0, 0, // 327 327
0, 0, 0, 0, // 328 328
0, 0, 0, 0, // 329 329
0, 0, 0, 0, // 330 330
0, 0, 0, 0, // 331 331
0, 0, 0, 0, // 332 332
0, 0, 0, 0, // 333 333
0, 0, 0, 0, // 334 334
0, 0, 0, 0, // 335 335
0, 0, 0, 0, // 336 336
0, 0, 0, 0, // 337 337
0, 0, 0, 0, // 338 338
0, 0, 0, 0, // 339 339
0, 0, 0, 0, // 340 340
0, 0, 0, 0, // 341 341
0, 0, 0, 0, // 342 342
0, 0, 0, 0, // 343 343
0, 0, 0, 0, // 344 344
0, 0, 0, 0, // 345 345
0, 0, 0, 0, // 346 346
0, 0, 0, 0, // 347 347
0, 0, 0, 0, // 348 348
0, 0, 0, 0, // 349 349
0, 0, 0, 0, // 350 350
0, 0, 0, 0, // 351 351
0, 0, 0, 0, // 352 352
0, 0, 0, 0, // 353 353
0, 0, 0, 0, // 354 354
0, 0, 0, 0, // 355 355
0, 0, 0, 0, // 356 356
0, 0, 0, 0, // 357 357
0, 0, 0, 0, // 358 358
0, 0, 0, 0, // 359 359
0, 0, 0, 0, // 360 360
0, 0, 0, 0, // 361 361
0, 0, 0, 0, // 362 362
0, 0, 0, 0, // 363 363
0, 0, 0, 0, // 364 364
0, 0, 0, 0, // 365 365
0, 0, 0, 0, // 366 366
0, 0, 0, 0, // 367 367
0, 0, 0, 0, // 368 368
0, 0, 0, 0, // 369 369
0, 0, 0, 0, // 370 370
0, 0, 0, 0, // 371 371
0, 0, 0, 0, // 372 372
0, 0, 0, 0, // 373 373
0, 0, 0, 0, // 374 374
0, 0, 0, 0, // 375 375
0, 0, 0, 0, // 376 376
0, 0, 0, 0, // 377 377
0, 0, 0, 0, // 378 378
0, 0, 0, 0, // 379 379
0, 0, 0, 0, // 380 380
0, 0, 0, 0, // 381 381
0, 0, 0, 0, // 382 382
0, 0, 0, 0, // 383 383
0, 0, 0, 0, // 384 384
0, 0, 0, 0, // 385 385
0, 0, 0, 0, // 386 386
0, 0, 0, 0, // 387 387
0, 0, 0, 0, // 388 388
0, 0, 0, 0, // 389 389
0, 0, 0, 0, // 390 390
0, 0, 0, 0, // 391 391
0, 0, 0, 0, // 392 392
0, 0, 0, 0, // 393 393
0, 0, 0, 0, // 394 394
0, 0, 0, 0, // 395 395
0, 0, 0, 0, // 396 396
0, 0, 0, 0, // 397 397
0, 0, 0, 0, // 398 398
0, 0, 0, 0, // 399 399
0, 0, 0, 0, // 400 400
0, 0, 0, 0, // 401 401
0, 0, 0, 0, // 402 402
0, 0, 0, 0, // 403 403
0, 0, 0, 0, // 404 404
0, 0, 0, 0, // 405 405
0, 0, 0, 0, // 406 406
0, 0, 0, 0, // 407 407
0, 0, 0, 0, // 408 408
0, 0, 0, 0, // 409 409
0, 0, 0, 0, // 410 410
0, 0, 0, 0, // 411 411
0, 0, 0, 0, // 412 412
0, 0, 0, 0, // 413 413
0, 0, 0, 0, // 414 414
0, 0, 0, 0, // 415 415
0, 0, 0, 0, // 416 416
0, 0, 0, 0, // 417 417
0, 0, 0, 0, // 418 418
0, 0, 0, 0, // 419 419
0, 0, 0, 0, // 420 420
0, 0, 0, 0, // 421 421
0, 0, 0, 0, // 422 422
0, 0, 0, 0, // 423 423
0, 0, 0, 0, // 424 424
0, 0, 0, 0, // 425 425
0, 0, 0, 0, // 426 426
0, 0, 0, 0, // 427 427
0, 0, 0, 0, // 428 428
0, 0, 0, 0, // 429 429
0, 0, 0, 0, // 430 430
0, 0, 0, 0, // 431 431
0, 0, 0, 0, // 432 432
0, 0, 0, 0, // 433 433
0, 0, 0, 0, // 434 434
0, 0, 0, 0, // 435 435
0, 0, 0, 0, // 436 436
0, 0, 0, 0, // 437 437
0, 0, 0, 0, // 438 438
0, 0, 0, 0, // 439 439
0, 0, 0, 0, // 440 440
0, 0, 0, 0, // 441 441
0, 0, 0, 0, // 442 442
0, 0, 0, 0, // 443 443
0, 0, 0, 0, // 444 444
0, 0, 0, 0, // 445 445
0, 0, 0, 0, // 446 446
0, 0, 0, 0, // 447 447
0, 0, 0, 0, // 448 448
0, 0, 0, 0, // 449 449
0, 0, 0, 0, // 450 450
0, 0, 0, 0, // 451 451
0, 0, 0, 0, // 452 452
0, 0, 0, 0, // 453 453
0, 0, 0, 0, // 454 454
0, 0, 0, 0, // 455 455
0, 0, 0, 0, // 456 456
0, 0, 0, 0, // 457 457
0, 0, 0, 0, // 458 458
0, 0, 0, 0, // 459 459
0, 0, 0, 0, // 460 460
0, 0, 0, 0, // 461 461
0, 0, 0, 0, // 462 462
0, 0, 0, 0, // 463 463
0, 0, 0, 0, // 464 464
0, 0, 0, 0, // 465 465
0, 0, 0, 0, // 466 466
0, 0, 0, 0, // 467 467
0, 0, 0, 0, // 468 468
0, 0, 0, 0, // 469 469
0, 0, 0, 0, // 470 470
0, 0, 0, 0, // 471 471
0, 0, 0, 0, // 472 472
0, 0, 0, 0, // 473 473
0, 0, 0, 0, // 474 474
0, 0, 0, 0, // 475 475
0, 0, 0, 0, // 476 476
0, 0, 0, 0, // 477 477
0, 0, 0, 0, // 478 478
0, 0, 0, 0, // 479 479
0, 0, 0, 0, // 480 480
0, 0, 0, 0, // 481 481
0, 0, 0, 0, // 482 482
0, 0, 0, 0, // 483 483
0, 0, 0, 0, // 484 484
0, 0, 0, 0, // 485 485
0, 0, 0, 0, // 486 486
0, 0, 0, 0, // 487 487
0, 0, 0, 0, // 488 488
0, 0, 0, 0, // 489 489
0, 0, 0, 0, // 490 490
0, 0, 0, 0, // 491 491
0, 0, 0, 0, // 492 492
0, 0, 0, 0, // 493 493
0, 0, 0, 0, // 494 494
0, 0, 0, 0, // 495 495
0, 0, 0, 0, // 496 496
0, 0, 0, 0, // 497 497
0, 0, 0, 0, // 498 498
0, 0, 0, 0, // 499 499
0, 0, 0, 0, // 500 500
0, 0, 0, 0, // 501 501
0, 0, 0, 0, // 502 502
0, 0, 0, 0, // 503 503
0, 0, 0, 0, // 504 504
0, 0, 0, 0, // 505 505
0, 0, 0, 0, // 506 NDEBELE nr
0, 0, 0, 0, // 507 X_BORK_BORK_BORK zzb
0, 0, 0, 0, // 508 X_PIG_LATIN zzp
0, 0, 0, 0, // 509 X_HACKER zzh
0, 0, 0, 0, // 510 X_KLINGON tlh
0, 0, 0, 0, // 511 X_ELMER_FUDD zze
0, 0, 0, 0, // 512 X_Common xx-Zyyy
0, 0, 0, 0, // 513 X_Latin xx-Latn
0, 0, 0, 0, // 514 X_Greek xx-Grek
0, 0, 0, 0, // 515 X_Cyrillic xx-Cyrl
0, 0, 0, 0, // 516 X_Armenian xx-Armn
0, 0, 0, 0, // 517 X_Hebrew xx-Hebr
0, 0, 0, 0, // 518 X_Arabic xx-Arab
0, 0, 0, 0, // 519 X_Syriac xx-Syrc
0, 0, 0, 0, // 520 X_Thaana xx-Thaa
0, 0, 0, 0, // 521 X_Devanagari xx-Deva
0, 0, 0, 0, // 522 X_Bengali xx-Beng
0, 0, 0, 0, // 523 X_Gurmukhi xx-Guru
0, 0, 0, 0, // 524 X_Gujarati xx-Gujr
0, 0, 0, 0, // 525 X_Oriya xx-Orya
0, 0, 0, 0, // 526 X_Tamil xx-Taml
0, 0, 0, 0, // 527 X_Telugu xx-Telu
0, 0, 0, 0, // 528 X_Kannada xx-Knda
0, 0, 0, 0, // 529 X_Malayalam xx-Mlym
0, 0, 0, 0, // 530 X_Sinhala xx-Sinh
0, 0, 0, 0, // 531 X_Thai xx-Thai
0, 0, 0, 0, // 532 X_Lao xx-Laoo
0, 0, 0, 0, // 533 X_Tibetan xx-Tibt
0, 0, 0, 0, // 534 X_Myanmar xx-Mymr
0, 0, 0, 0, // 535 X_Georgian xx-Geor
0, 0, 0, 0, // 536 X_Hangul xx-Hang
0, 0, 0, 0, // 537 X_Ethiopic xx-Ethi
0, 0, 0, 0, // 538 X_Cherokee xx-Cher
0, 0, 0, 0, // 539 X_Canadian_Aboriginal xx-Cans
0, 0, 0, 0, // 540 X_Ogham xx-Ogam
0, 0, 0, 0, // 541 X_Runic xx-Runr
0, 0, 0, 0, // 542 X_Khmer xx-Khmr
0, 0, 0, 0, // 543 X_Mongolian xx-Mong
0, 0, 0, 0, // 544 X_Hiragana xx-Hira
0, 0, 0, 0, // 545 X_Katakana xx-Kana
0, 0, 0, 0, // 546 X_Bopomofo xx-Bopo
0, 0, 0, 0, // 547 X_Han xx-Hani
0, 0, 0, 0, // 548 X_Yi xx-Yiii
0, 0, 0, 0, // 549 X_Old_Italic xx-Ital
0, 0, 0, 0, // 550 X_Gothic xx-Goth
0, 0, 0, 0, // 551 X_Deseret xx-Dsrt
0, 0, 0, 0, // 552 X_Inherited xx-Qaai
0, 0, 0, 0, // 553 X_Tagalog xx-Tglg
0, 0, 0, 0, // 554 X_Hanunoo xx-Hano
0, 0, 0, 0, // 555 X_Buhid xx-Buhd
0, 0, 0, 0, // 556 X_Tagbanwa xx-Tagb
0, 0, 0, 0, // 557 X_Limbu xx-Limb
0, 0, 0, 0, // 558 X_Tai_Le xx-Tale
0, 0, 0, 0, // 559 X_Linear_B xx-Linb
0, 0, 0, 0, // 560 X_Ugaritic xx-Ugar
0, 0, 0, 0, // 561 X_Shavian xx-Shaw
0, 0, 0, 0, // 562 X_Osmanya xx-Osma
0, 0, 0, 0, // 563 X_Cypriot xx-Cprt
0, 0, 0, 0, // 564 X_Braille xx-Brai
0, 0, 0, 0, // 565 X_Buginese xx-Bugi
0, 0, 0, 0, // 566 X_Coptic xx-Copt
0, 0, 0, 0, // 567 X_New_Tai_Lue xx-Talu
0, 0, 0, 0, // 568 X_Glagolitic xx-Glag
0, 0, 0, 0, // 569 X_Tifinagh xx-Tfng
0, 0, 0, 0, // 570 X_Syloti_Nagri xx-Sylo
0, 0, 0, 0, // 571 X_Old_Persian xx-Xpeo
0, 0, 0, 0, // 572 X_Kharoshthi xx-Khar
0, 0, 0, 0, // 573 X_Balinese xx-Bali
0, 0, 0, 0, // 574 X_Cuneiform xx-Xsux
0, 0, 0, 0, // 575 X_Phoenician xx-Phnx
0, 0, 0, 0, // 576 X_Phags_Pa xx-Phag
0, 0, 0, 0, // 577 X_Nko xx-Nkoo
0, 0, 0, 0, // 578 X_Sundanese xx-Sund
0, 0, 0, 0, // 579 X_Lepcha xx-Lepc
0, 0, 0, 0, // 580 X_Ol_Chiki xx-Olck
0, 0, 0, 0, // 581 X_Vai xx-Vaii
0, 0, 0, 0, // 582 X_Saurashtra xx-Saur
0, 0, 0, 0, // 583 X_Kayah_Li xx-Kali
0, 0, 0, 0, // 584 X_Rejang xx-Rjng
0, 0, 0, 0, // 585 X_Lycian xx-Lyci
0, 0, 0, 0, // 586 X_Carian xx-Cari
0, 0, 0, 0, // 587 X_Lydian xx-Lydi
0, 0, 0, 0, // 588 X_Cham xx-Cham
0, 0, 0, 0, // 589 X_Tai_Tham xx-Lana
0, 0, 0, 0, // 590 X_Tai_Viet xx-Tavt
0, 0, 0, 0, // 591 X_Avestan xx-Avst
0, 0, 0, 0, // 592 X_Egyptian_Hieroglyphs xx-Egyp
0, 0, 0, 0, // 593 X_Samaritan xx-Samr
0, 0, 0, 0, // 594 X_Lisu xx-Lisu
0, 0, 0, 0, // 595 X_Bamum xx-Bamu
0, 0, 0, 0, // 596 X_Javanese xx-Java
0, 0, 0, 0, // 597 X_Meetei_Mayek xx-Mtei
0, 0, 0, 0, // 598 X_Imperial_Aramaic xx-Armi
0, 0, 0, 0, // 599 X_Old_South_Arabian xx-Sarb
0, 0, 0, 0, // 600 X_Inscriptional_Parthian xx-Prti
0, 0, 0, 0, // 601 X_Inscriptional_Pahlavi xx-Phli
0, 0, 0, 0, // 602 X_Old_Turkic xx-Orkh
0, 0, 0, 0, // 603 X_Kaithi xx-Kthi
0, 0, 0, 0, // 604 X_Batak xx-Batk
0, 0, 0, 0, // 605 X_Brahmi xx-Brah
0, 0, 0, 0, // 606 X_Mandaic xx-Mand
0, 0, 0, 0, // 607 X_Chakma xx-Cakm
0, 0, 0, 0, // 608 X_Meroitic_Cursive xx-Merc
0, 0, 0, 0, // 609 X_Meroitic_Hieroglyphs xx-Mero
0, 0, 0, 0, // 610 X_Miao xx-Plrd
0, 0, 0, 0, // 611 X_Sharada xx-Shrd
0, 0, 0, 0, // 612 X_Sora_Sompeng xx-Sora
0, 0, 0, 0, // 613 X_Takri xx-Takr
};
} // End namespace CLD2

View File

@ -0,0 +1,620 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Author: dsites@google.com (Dick Sites)
// Updated 2014.01 for dual table lookup
//
#include "cldutil.h"
#include <string>
#include "cld2tablesummary.h"
#include "integral_types.h"
#include "port.h"
#include "utf8statetable.h"
namespace CLD2 {
// Caller supplies the right tables in scoringcontext
// Runtime routines for hashing, looking up, and scoring
// unigrams (CJK), bigrams (CJK), quadgrams, and octagrams.
// Unigrams and bigrams are for CJK languages only, including simplified/
// traditional Chinese, Japanese, Korean, Vietnamese Han characters, and
// Zhuang Han characters. Surrounding spaces are not considered.
// Quadgrams and octagrams are for non-CJK languages and include two bits indicating
// preceding and trailing spaces (word boundaries).
// Byte length used to recognize a possible CJK character: a bigram is only
// looked up when two adjacent characters together span at least twice this.
static const int kMinCJKUTF8CharBytes = 3;
// Lower and upper clamp applied to the "fully reliable" score delta in
// ReliabilityDelta().
static const int kMinGramCount = 3;
static const int kMaxGramCount = 16;
static const int UTFmax = 4; // Max number of bytes in a UTF-8 character
// 1 to skip ASCII space, vowels AEIOU aeiou and UTF-8 continuation bytes 80-BF
// Indexed by byte value; each row below covers 32 consecutive byte values
// (0x00-0x1F, 0x20-0x3F, ... 0xE0-0xFF). Entries are 1 for 0x20 (space),
// for A E I O U / a e i o u, and for 0x80-0xBF; every other entry is 0.
static const uint8 kSkipSpaceVowelContinue[256] = {
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0,
0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
};
// 1 to skip ASCII space, and UTF-8 continuation bytes 80-BF
// Indexed by byte value; each row below covers 32 consecutive byte values.
// Entries are 1 for 0x20 (space) and for 0x80-0xBF; every other entry is 0.
static const uint8 kSkipSpaceContinue[256] = {
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
};
// Always advances one UTF-8 character
// Indexed by the first byte at the current position; the entry is the number
// of bytes to step: 1 for 0x00-0xBF (this includes the continuation bytes
// 0x80-0xBF), 2 for 0xC0-0xDF, 3 for 0xE0-0xEF and 4 for 0xF0-0xFF.
// Each row below covers 32 consecutive byte values.
static const uint8 kAdvanceOneChar[256] = {
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4,
};
// Advances *only* on space (or illegal byte)
// Indexed by byte value; the entry is the number of bytes to step: 1 for
// 0x00-0x20 (control bytes and space) and for 0x80-0xBF, 0 for everything
// else (0x21-0x7F and the UTF-8 lead bytes 0xC0-0xFF).
// Each row below covers 32 consecutive byte values.
static const uint8 kAdvanceOneCharSpace[256] = {
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
};
// Routines to access a hash table of <key:wordhash, value:probs> pairs
// Buckets have 4-byte wordhash for sizes < 32K buckets, but only
// 2-byte wordhash for sizes >= 32K buckets, with other wordhash bits used as
// bucket subscript.
// Probs is packed: three languages plus a subscript for the probability table.
// Buckets have all the keys together, then all the values. Key array never
// crosses a cache-line boundary, so no-match case takes exactly one cache miss.
// Match case may sometimes take an additional cache miss on value access.
//
// Other possibilities include 5 or 10 6-byte entries plus pad to make 32 or 64
// byte buckets with single cache miss.
// Or 2-byte key and 6-byte value, allowing 5 languages instead of three.
//------------------------------------------------------------------------------
//----------------------------------------------------------------------------//
// Hashing groups of 1/2/4/8 letters, perhaps with spaces or underscores //
//----------------------------------------------------------------------------//
//----------------------------------------------------------------------------//
// Scoring single groups of letters //
//----------------------------------------------------------------------------//
// BIGRAM, QUADGRAM, OCTAGRAM: score one packed entry into a tote.
// Input:  probs, a 4-byte entry whose low byte is a probability-table
//         subscript and whose three upper bytes are language numbers
//         (language 0 means the slot is unused); tote, the accumulator.
// Output: tote receives one Add() per non-zero language slot, in slot order.
void ProcessProbV2Tote(uint32 probs, Tote* tote) {
  const uint8 table_subscript = (probs >> 0) & 0xff;
  const uint8* entry = LgProb2TblEntry(table_subscript);
  // Slots 0, 1, 2 live in bits 8..15, 16..23 and 24..31 respectively.
  for (int slot = 0; slot < 3; ++slot) {
    const uint8 slot_lang = (probs >> (8 * (slot + 1))) & 0xff;
    if (slot_lang == 0) {continue;}
    tote->Add(slot_lang, LgProb3(entry, slot));
  }
}
// Return the score that a packed probs entry assigns to the per-script
// language pslang, or zero when pslang is not in any of the three slots.
// If pslang occupies more than one slot, the slot scores are summed.
int GetLangScore(uint32 probs, uint8 pslang) {
  const uint8 table_subscript = (probs >> 0) & 0xff;
  const uint8* entry = LgProb2TblEntry(table_subscript);
  int score = 0;
  // Slots 0, 1, 2 live in bits 8..15, 16..23 and 24..31 respectively.
  for (int slot = 0; slot < 3; ++slot) {
    const uint8 slot_lang = (probs >> (8 * (slot + 1))) & 0xff;
    if (slot_lang == pslang) {
      score += LgProb3(entry, slot);
    }
  }
  return score;
}
//----------------------------------------------------------------------------//
// Routines to accumulate probabilities //
//----------------------------------------------------------------------------//
// BIGRAM, using hash table, always advancing by 1 char
// Caller supplies table, such as &kCjkBiTable_obj or &kGibberishTable_obj
// Score all bigrams in isrc, using languages that have bigrams (CJK)
// Return number of bigrams that hit in the hash table
int DoBigramScoreV3(const CLD2TableSummary* bigram_obj,
const char* isrc, int srclen, Tote* chunk_tote) {
int hit_count = 0;
const char* src = isrc;
// Hashtable-based CJK bigram lookup
const uint8* usrc = reinterpret_cast<const uint8*>(src);
const uint8* usrclimit1 = usrc + srclen - UTFmax;
while (usrc < usrclimit1) {
int len = kAdvanceOneChar[usrc[0]];
int len2 = kAdvanceOneChar[usrc[len]] + len;
if ((kMinCJKUTF8CharBytes * 2) <= len2) { // Two CJK chars possible
// Lookup and score this bigram
// Always ignore pre/post spaces
uint32 bihash = BiHashV2(reinterpret_cast<const char*>(usrc), len2);
uint32 probs = QuadHashV3Lookup4(bigram_obj, bihash);
// Now go indirect on the subscript
probs = bigram_obj->kCLDTableInd[probs &
~bigram_obj->kCLDTableKeyMask];
// Process the bigram
if (probs != 0) {
ProcessProbV2Tote(probs, chunk_tote);
++hit_count;
}
}
usrc += len; // Advance by one char
}
return hit_count;
}
// Score up to 64KB of a single script span in one pass
// Make a dummy entry off the end to calc length of last span
// Return offset of first unused input byte
int GetUniHits(const char* text,
int letter_offset, int letter_limit,
ScoringContext* scoringcontext,
ScoringHitBuffer* hitbuffer) {
const char* isrc = &text[letter_offset];
const char* src = isrc;
// Limit is end, which has extra 20 20 20 00 past len
const char* srclimit = &text[letter_limit];
// Local copies
const UTF8PropObj* unigram_obj =
scoringcontext->scoringtables->unigram_obj;
int next_base = hitbuffer->next_base;
int next_base_limit = hitbuffer->maxscoringhits;
// Visit all unigrams
if (src[0] == ' ') {++src;} // skip any initial space
while (src < srclimit) {
const uint8* usrc = reinterpret_cast<const uint8*>(src);
int len = kAdvanceOneChar[usrc[0]];
src += len;
// Look up property of one UTF-8 character and advance over it.
// Updates usrc and len (bad interface design), hence increment above
int propval = UTF8GenericPropertyBigOneByte(unigram_obj, &usrc, &len);
if (propval > 0) {
// Save indirect subscript for later scoring; 1 or 2 langprobs
int indirect_subscr = propval;
hitbuffer->base[next_base].offset = src - text; // Offset in text
hitbuffer->base[next_base].indirect = indirect_subscr;
++next_base;
}
if (next_base >= next_base_limit) {break;}
}
hitbuffer->next_base = next_base;
// Make a dummy entry off the end to calc length of last span
int dummy_offset = src - text;
hitbuffer->base[hitbuffer->next_base].offset = dummy_offset;
hitbuffer->base[hitbuffer->next_base].indirect = 0;
return src - text;
}
// Score up to 64KB of a single script span, doing both delta-bi and
// distinct bis in one pass
void GetBiHits(const char* text,
int letter_offset, int letter_limit,
ScoringContext* scoringcontext,
ScoringHitBuffer* hitbuffer) {
const char* isrc = &text[letter_offset];
const char* src = isrc;
// Limit is end
const char* srclimit1 = &text[letter_limit];
// Local copies
const CLD2TableSummary* deltabi_obj =
scoringcontext->scoringtables->deltabi_obj;
const CLD2TableSummary* distinctbi_obj =
scoringcontext->scoringtables->distinctbi_obj;
int next_delta = hitbuffer->next_delta;
int next_delta_limit = hitbuffer->maxscoringhits;
int next_distinct = hitbuffer->next_distinct;
// We can do 2 inserts per loop, so -1
int next_distinct_limit = hitbuffer->maxscoringhits - 1;
while (src < srclimit1) {
const uint8* usrc = reinterpret_cast<const uint8*>(src);
int len = kAdvanceOneChar[usrc[0]];
int len2 = kAdvanceOneChar[usrc[len]] + len;
if ((kMinCJKUTF8CharBytes * 2) <= len2) { // Two CJK chars possible
// Lookup and this bigram and save <offset, indirect>
uint32 bihash = BiHashV2(src, len2);
uint32 probs = QuadHashV3Lookup4(deltabi_obj, bihash);
// Now go indirect on the subscript
if (probs != 0) {
// Save indirect subscript for later scoring; 1 langprob
int indirect_subscr = probs & ~deltabi_obj->kCLDTableKeyMask;
hitbuffer->delta[next_delta].offset = src - text;
hitbuffer->delta[next_delta].indirect = indirect_subscr;
++next_delta;
}
// Lookup this distinct bigram and save <offset, indirect>
probs = QuadHashV3Lookup4(distinctbi_obj, bihash);
if (probs != 0) {
int indirect_subscr = probs & ~distinctbi_obj->kCLDTableKeyMask;
hitbuffer->distinct[next_distinct].offset = src - text;
hitbuffer->distinct[next_distinct].indirect = indirect_subscr;
++next_distinct;
}
}
src += len; // Advance by one char (not two)
// Almost always srclimit hit first
if (next_delta >= next_delta_limit) {break;}
if (next_distinct >= next_distinct_limit) {break;}
}
hitbuffer->next_delta = next_delta;
hitbuffer->next_distinct = next_distinct;
// Make a dummy entry off the end to calc length of last span
int dummy_offset = src - text;
hitbuffer->delta[hitbuffer->next_delta].offset = dummy_offset;
hitbuffer->delta[hitbuffer->next_delta].indirect = 0;
hitbuffer->distinct[hitbuffer->next_distinct].offset = dummy_offset;
hitbuffer->distinct[hitbuffer->next_distinct].indirect = 0;
}
// Collect quadgram hits for one script span into hitbuffer->base.
//
// Scans text[letter_offset .. letter_limit) and, for each quadgram found in
// scoringtables->quadgram_obj (or, failing that, in quadgram_obj2 when that
// table is non-empty), appends an <offset, indirect> entry starting at
// hitbuffer->next_base. Hits that came from quadgram_obj2 have the high bit
// (0x80000000) set in `indirect`. A quadgram whose hash equals either of the
// two most recent hit hashes is not looked up again.
// Scanning stops at letter_limit or once next_base reaches
// hitbuffer->maxscoringhits.
//
// Score up to 64KB of a single script span in one pass
// Make a dummy entry off the end to calc length of last span
// Return offset of first unused input byte
int GetQuadHits(const char* text,
int letter_offset, int letter_limit,
ScoringContext* scoringcontext,
ScoringHitBuffer* hitbuffer) {
const char* isrc = &text[letter_offset];
const char* src = isrc;
// Limit is end, which has extra 20 20 20 00 past len
const char* srclimit = &text[letter_limit];
// Local copies
const CLD2TableSummary* quadgram_obj =
scoringcontext->scoringtables->quadgram_obj;
const CLD2TableSummary* quadgram_obj2 =
scoringcontext->scoringtables->quadgram_obj2;
int next_base = hitbuffer->next_base;
int next_base_limit = hitbuffer->maxscoringhits;
// Run a little cache of last quad hits to catch overly-repetitive "text"
// We don't care if we miss a couple repetitions at scriptspan boundaries
int next_prior_quadhash = 0;
uint32 prior_quadhash[2] = {0, 0};
// Visit all quadgrams
if (src[0] == ' ') {++src;} // skip any initial space
while (src < srclimit) {
// Find one quadgram
// Take up to four steps from src; src_mid remembers the position after the
// first two steps and is the restart point for a mid-word advance below.
// NOTE(review): kAdvanceOneCharButSpace is defined outside this file;
// presumably it yields 0 at a space so src_end stops there -- confirm.
const char* src_end = src;
src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
const char* src_mid = src_end;
src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
int len = src_end - src;
// Hash the quadgram
uint32 quadhash = QuadHashV2(src, len);
// Filter out recent repeats
if ((quadhash != prior_quadhash[0]) && (quadhash != prior_quadhash[1])) {
// Look up this quadgram and save <offset, indirect>
uint32 indirect_flag = 0; // For dual tables
const CLD2TableSummary* hit_obj = quadgram_obj;
uint32 probs = QuadHashV3Lookup4(quadgram_obj, quadhash);
if ((probs == 0) && (quadgram_obj2->kCLDTableSize != 0)) {
// Try lookup in dual table if not found in first one
// Note: we need to know later which of two indirect tables to use.
indirect_flag = 0x80000000u;
hit_obj = quadgram_obj2;
probs = QuadHashV3Lookup4(quadgram_obj2, quadhash);
}
if (probs != 0) {
// Round-robin two entries of actual hits
prior_quadhash[next_prior_quadhash] = quadhash;
next_prior_quadhash = (next_prior_quadhash + 1) & 1;
// Save indirect subscript for later scoring; 1 or 2 langprobs
// The key mask belongs to whichever table produced the hit.
int indirect_subscr = probs & ~hit_obj->kCLDTableKeyMask;
hitbuffer->base[next_base].offset = src - text; // Offset in text
// Flip the high bit for table2
hitbuffer->base[next_base].indirect = indirect_subscr | indirect_flag;
++next_base;
}
}
// Advance: all the way past word if at end-of-word, else 2 chars
if (src_end[0] == ' ') {
src = src_end;
} else {
src = src_mid;
}
// Skip over space at end of word, or ASCII vowel in middle of word
// Use kAdvanceOneCharSpace instead to get rid of vowel hack
if (src < srclimit) {
src += kAdvanceOneCharSpaceVowel[(uint8)src[0]];
} else {
// Advancing by 4/8/16 can overshoot, but we are about to exit anyway
src = srclimit;
}
// Stop once the hit buffer is full; the dummy entry below still needs a slot.
if (next_base >= next_base_limit) {break;}
}
hitbuffer->next_base = next_base;
// Make a dummy entry off the end to calc length of last span
int dummy_offset = src - text;
hitbuffer->base[hitbuffer->next_base].offset = dummy_offset;
hitbuffer->base[hitbuffer->next_base].indirect = 0;
return src - text;
}
// Collect word (octagram) hits for one script span.
//
// inputs:
// const tables
// const char* isrc, int srclen (in sscriptbuffer)
// intermediates:
// vector of octa <offset, probs> (which need indirect table to decode)
// vector of distinct <offset, probs> (which need indirect table to decode)
// Score up to 64KB of a single script span, doing both delta-octa and
// distinct words in one pass
//
// Scans text[letter_offset .. letter_limit] (one byte past letter_limit is
// included). Each time a space is reached, the word that just ended is hashed
// and, unless its hash matches one of the two most recently hashed words,
// looked up three ways: as a pair with the previously hashed word and as a
// single word in scoringtables->distinctocta_obj, and as a single word in
// scoringtables->deltaocta_obj. Hits are appended as <offset, indirect>
// entries to hitbuffer->distinct and hitbuffer->delta, and a dummy entry
// (indirect == 0) is written after the last hit of each list.
void GetOctaHits(const char* text,
int letter_offset, int letter_limit,
ScoringContext* scoringcontext,
ScoringHitBuffer* hitbuffer) {
const char* isrc = &text[letter_offset];
const char* src = isrc;
// Limit is end+1, to include extra space char (0x20) off the end
const char* srclimit = &text[letter_limit + 1];
// Local copies
const CLD2TableSummary* deltaocta_obj =
scoringcontext->scoringtables->deltaocta_obj;
int next_delta = hitbuffer->next_delta;
int next_delta_limit = hitbuffer->maxscoringhits;
const CLD2TableSummary* distinctocta_obj =
scoringcontext->scoringtables->distinctocta_obj;
int next_distinct = hitbuffer->next_distinct;
// We can do 2 inserts per loop, so -1
int next_distinct_limit = hitbuffer->maxscoringhits - 1;
// Run a little cache of last octa hits to catch overly-repetitive "text"
// We don't care if we miss a couple repetitions at scriptspan boundaries
int next_prior_octahash = 0;
uint64 prior_octahash[2] = {0, 0};
// Score all words truncated to 8 characters
int charcount = 0;
// Skip any initial space
if (src[0] == ' ') {++src;}
// Begin the first word
const char* prior_word_start = src;
const char* word_start = src;
const char* word_end = word_start;
while (src < srclimit) {
// Terminate previous word or continue current word
if (src[0] == ' ') {
int len = word_end - word_start;
// Hash the word
uint64 wordhash40 = OctaHash40(word_start, len);
uint32 probs;
// Filter out recent repeats. Unlike quads, we update even if no hit,
// so we can get hits on same word if separated by non-hit words
if ((wordhash40 != prior_octahash[0]) &&
(wordhash40 != prior_octahash[1])) {
// Round-robin two entries of words
prior_octahash[next_prior_octahash] = wordhash40;
next_prior_octahash = 1 - next_prior_octahash; // Alternates 0,1,0,1
// (1) Lookup distinct word PAIR. For a pair, we want an asymmetrical
// function of the two word hashes. For words A B C, B-A and C-B are good
// enough and fast. We use the same table as distinct single words
// Do not look up a pair of identical words -- all pairs hash to zero
// Both 1- and 2-word distinct lookups are in distinctocta_obj now
// Do this first, because it has the lowest offset
// After the toggle above, the indexed slot is the one NOT just written,
// i.e. the hash stored by the previous non-repeated word (0 if none yet).
uint64 tmp_prior_hash = prior_octahash[next_prior_octahash];
if ((tmp_prior_hash != 0) && (tmp_prior_hash != wordhash40)) {
uint64 pair_hash = PairHash(tmp_prior_hash, wordhash40);
probs = OctaHashV3Lookup4(distinctocta_obj, pair_hash);
if (probs != 0) {
int indirect_subscr = probs & ~distinctocta_obj->kCLDTableKeyMask;
hitbuffer->distinct[next_distinct].offset = prior_word_start - text;
hitbuffer->distinct[next_distinct].indirect = indirect_subscr;
++next_distinct;
}
}
// (2) Lookup this distinct word and save <offset, indirect>
probs = OctaHashV3Lookup4(distinctocta_obj, wordhash40);
if (probs != 0) {
int indirect_subscr = probs & ~distinctocta_obj->kCLDTableKeyMask;
hitbuffer->distinct[next_distinct].offset = word_start - text;
hitbuffer->distinct[next_distinct].indirect = indirect_subscr;
++next_distinct;
}
// (3) Lookup this word and save <offset, indirect>
probs = OctaHashV3Lookup4(deltaocta_obj, wordhash40);
if (probs != 0) {
// Save indirect subscript for later scoring; 1 langprob
int indirect_subscr = probs & ~deltaocta_obj->kCLDTableKeyMask;
hitbuffer->delta[next_delta].offset = word_start - text;
hitbuffer->delta[next_delta].indirect = indirect_subscr;
++next_delta;
}
}
// Begin the next word
charcount = 0;
prior_word_start = word_start;
word_start = src + 1; // Over the space
word_end = word_start;
} else {
++charcount;
}
// Advance to next char
src += UTF8OneCharLen(src);
// word_end follows src only through the first 8 characters of a word, so
// longer words are hashed on their first 8 characters.
if (charcount <= 8) {
word_end = src;
}
// Almost always srclimit hit first
if (next_delta >= next_delta_limit) {break;}
if (next_distinct >= next_distinct_limit) {break;}
}
hitbuffer->next_delta = next_delta;
hitbuffer->next_distinct = next_distinct;
// Make a dummy entry off the end to calc length of last span
int dummy_offset = src - text;
hitbuffer->delta[hitbuffer->next_delta].offset = dummy_offset;
hitbuffer->delta[hitbuffer->next_delta].indirect = 0;
hitbuffer->distinct[hitbuffer->next_distinct].offset = dummy_offset;
hitbuffer->distinct[hitbuffer->next_distinct].indirect = 0;
}
//----------------------------------------------------------------------------//
// Reliability calculations, for single language and between languages //
//----------------------------------------------------------------------------//
// Return reliability of a result, 0..100, from the top two scores.
//   value1, value2: the two scores being compared (delta = value1 - value2)
//   gramcount:      uni/quadgram increment count for the scored text
// delta <= 0 is 0% reliable; delta >= the fully-reliable threshold gives the
// maximum reliability; in between the result scales linearly with delta.
// (Scores are on a scale where +1 is a factor of 2 ** 1.6 = 3.02.)
//
// The threshold is derived from gramcount and clamped to
// [kMinGramCount, kMaxGramCount]. Requiring a factor of 3 improvement
// (e.g. +1 log base 3) for each scored quadgram is too stringent, so this is
// backed off to a factor of 2 (e.g. +5/8 log base 3), hence gramcount * 5 / 8.
//
// The Min/MaxGramCount limits were also somewhat lowered.
//
// Added: if fewer than 8 quads/unis, max reliability is 12*n percent
//
int ReliabilityDelta(int value1, int value2, int gramcount) {
  const int ceiling_percent = (gramcount < 8) ? (12 * gramcount) : 100;

  // 5/8 of the gram count, clamped below and above
  int full_thresh = (gramcount * 5) >> 3;
  if (full_thresh < kMinGramCount) {
    full_thresh = kMinGramCount;
  } else if (full_thresh > kMaxGramCount) {
    full_thresh = kMaxGramCount;
  }

  const int delta = value1 - value2;
  if (delta >= full_thresh) {
    return ceiling_percent;
  }
  if (delta <= 0) {
    return 0;
  }
  const int scaled_percent = (100 * delta) / full_thresh;
  return minint(ceiling_percent, scaled_percent);
}
// Return reliability of a result, 0..100, for the top score vs. the expected
// mainstream score. Values are score per 1024 bytes of input.
//   ratio = larger score / smaller score
//   ratio <= kRatio100 (1.5) is 100% reliable
//   ratio >  kRatio0   (4.0) is 0% reliable
//   in between, reliability declines linearly (ratio 2.0 scores 80%)
// An expected score of 0 means no reliability data is available yet and
// returns 100; otherwise an actual score of 0 returns 0.
//
// Short-text word scoring can give unusually good results, which is why the
// top score is allowed to exceed the mainstream score by a wide margin.
//
// dsites April 2010: These could be tightened up. It would be
// reasonable with newer data and round-robin table allocation to start ramping
// down at mean * 1.5 and mean/1.5, while letting mean*2 and mean/2 pass,
// but just barely.
//
// dsites March 2013: Tightened up a bit.
static const double kRatio100 = 1.5;
static const double kRatio0 = 4.0;
int ReliabilityExpected(int actual_score_1kb, int expected_score_1kb) {
  if (expected_score_1kb == 0) {return 100;}   // No reliability data available yet
  if (actual_score_1kb == 0) {return 0;}       // zero score = unreliable

  // Put the larger score on top so both directions of mismatch are treated
  // the same way.
  const bool expected_is_larger = expected_score_1kb > actual_score_1kb;
  const double numerator =
      1.0 * (expected_is_larger ? expected_score_1kb : actual_score_1kb);
  const double denominator =
      expected_is_larger ? actual_score_1kb : expected_score_1kb;
  const double ratio = numerator / denominator;

  if (ratio <= kRatio100) {return 100;}
  if (ratio > kRatio0) {return 0;}
  // Linear ramp from 100% at kRatio100 down to 0% at kRatio0, truncated
  return static_cast<int>(100.0 * (kRatio0 - ratio) / (kRatio0 - kRatio100));
}
// Build a packed langprob value from its parts: the per-script language
// number goes in bits 8 and up, and the low byte is the entry of
// kLgProbV2TblBackmap selected by qprob.
// qprob is quantized [0..12]
// Latn script is used to represent any RTypeMany language
uint32 MakeLangProb(Language lang, int qprob) {
  const uint32 pslang = PerScriptNumber(ULScript_Latin, lang);
  return (pslang << 8) | kLgProbV2TblBackmap[qprob];
}
} // End namespace CLD2

View File

@ -0,0 +1,80 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Author: dsites@google.com (Dick Sites)
//
// Stuff used only by online detector, not used offline
//
#ifndef I18N_ENCODINGS_CLD2_INTERNAL_NEW_CLDUTIL_H__
#define I18N_ENCODINGS_CLD2_INTERNAL_NEW_CLDUTIL_H__
#include "cldutil_shared.h"
#include "scoreonescriptspan.h"
#include "tote.h"
namespace CLD2 {
// Hit collectors. Each one scans text[letter_offset .. letter_limit] using the
// tables in scoringcontext->scoringtables and appends <offset, indirect>
// entries to hitbuffer, followed by a dummy end entry.

// Unigram hits, stored in hitbuffer->base.
// Score up to 64KB of a single script span in one pass
// Make a dummy entry off the end to calc length of last span
// Return offset of first unused input byte
int GetUniHits(const char* text,
int letter_offset, int letter_limit,
ScoringContext* scoringcontext,
ScoringHitBuffer* hitbuffer);
// Bigram hits, stored in hitbuffer->delta and hitbuffer->distinct.
// Score up to 64KB of a single script span, doing both delta-bi and
// distinct bis in one pass
void GetBiHits(const char* text,
int letter_offset, int letter_limit,
ScoringContext* scoringcontext,
ScoringHitBuffer* hitbuffer);
// Quadgram hits, stored in hitbuffer->base.
// Score up to 64KB of a single script span in one pass
// Make a dummy entry off the end to calc length of last span
// Return offset of first unused input byte
int GetQuadHits(const char* text,
int letter_offset, int letter_limit,
ScoringContext* scoringcontext,
ScoringHitBuffer* hitbuffer);
// Word (octagram) hits, stored in hitbuffer->delta and hitbuffer->distinct.
// Score up to 64KB of a single script span, doing both delta-octa and
// distinct words in one pass
void GetOctaHits(const char* text,
int letter_offset, int letter_limit,
ScoringContext* scoringcontext,
ScoringHitBuffer* hitbuffer);
// Not sure if these belong here or in scoreonescriptspan.cc
// Reliability 0..100 from the difference value1 - value2 of the top two
// scores, given the number of grams scored.
int ReliabilityDelta(int value1, int value2, int gramcount);
// Reliability 0..100 from how far the actual score per 1KB is from the
// expected score per 1KB.
int ReliabilityExpected(int actual_score_1kb, int expected_score_1kb);
// Create a langprob packed value from its parts.
uint32 MakeLangProb(Language lang, int qprob);
// Add the scores of the (up to three) languages packed in probs to tote.
void ProcessProbV2Tote(uint32 probs, Tote* tote);
// Return score for a particular per-script language, or zero
int GetLangScore(uint32 probs, uint8 pslang);
// Return the smaller of a and b (b when they are equal).
static inline int minint(int a, int b) {
  if (a < b) {return a;}
  return b;
}
// Return the larger of a and b (b when they are equal).
static inline int maxint(int a, int b) {
  if (a > b) {return a;}
  return b;
}
} // End namespace CLD2
#endif // I18N_ENCODINGS_CLD2_INTERNAL_NEW_CLDUTIL_H__

View File

@ -0,0 +1,437 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Author: dsites@google.com (Dick Sites)
//
#include "cldutil_shared.h"
#include <string>
#include "cld2tablesummary.h"
#include "integral_types.h"
#include "port.h"
#include "utf8statetable.h"
namespace CLD2 {
// Runtime routines for hashing, looking up, and scoring
// unigrams (CJK), bigrams (CJK), quadgrams, and octagrams.
// Unigrams and bigrams are for CJK languages only, including simplified/
// traditional Chinese, Japanese, Korean, Vietnamese Han characters, and
// Zhuang Han characters. Surrounding spaces are not considered.
// Quadgrams and octagrams are for non-CJK languages and include two bits indicating
// preceding and trailing spaces (word boundaries).
// Indicator bits for leading/trailing space around quad/octagram
// NOTE: 4444 bits are chosen to flip constant bits in hash of four chars of
// 1-, 2-, or 3-bytes each.
// These are combined into a `prepost` value that the hash functions fold
// into the hash.
static const uint32 kPreSpaceIndicator = 0x00004444;
static const uint32 kPostSpaceIndicator = 0x44440000;
// Little-endian masks for 0..24 bytes picked up as uint32's
// Indexed by (bytecount & 3) and applied to the last 32-bit word loaded:
// index 0 keeps all four bytes, indexes 1..3 keep only the low 1..3 bytes.
static const uint32 kWordMask0[4] = {
0xFFFFFFFF, 0x000000FF, 0x0000FFFF, 0x00FFFFFF
};
// NOTE(review): these four constants repeat, with the same values, the ones
// defined at the top of cldutil.cc; their uses in this file are not visible
// in the reviewed portion -- confirm they are still needed here.
static const int kMinCJKUTF8CharBytes = 3;
static const int kMinGramCount = 3;
static const int kMaxGramCount = 16;
static const int UTFmax = 4; // Max number of bytes in a UTF-8 character
// Routines to access a hash table of <key:wordhash, value:probs> pairs
// Buckets have 4-byte wordhash for sizes < 32K buckets, but only
// 2-byte wordhash for sizes >= 32K buckets, with other wordhash bits used as
// bucket subscript.
// Probs is packed: three languages plus a subscript for the probability table.
// Buckets have all the keys together, then all the values. Key array never
// crosses a cache-line boundary, so no-match case takes exactly one cache miss.
// Match case may sometimes take an additional cache miss on value access.
//
// Other possibilities include 5 or 10 6-byte entries plus pad to make 32 or 64
// byte buckets with single cache miss.
// Or 2-byte key and 6-byte value, allowing 5 languages instead of three.
//----------------------------------------------------------------------------//
// Hashing groups of 1/2/4/8 letters, perhaps with spaces or underscores //
//----------------------------------------------------------------------------//
// Design principles for these hash functions
// - Few operations
// - Handle 1-, 2-, and 3-byte UTF-8 scripts, ignoring intermixing except in
// Latin script expect 1- and 2-byte mixtures.
// - Last byte of each character has about 5 bits of information
// - Spread good bits around so they can interact in at least two ways
// with other characters
// - Use add for additional mixing through carries
// CJK Three-byte bigram
// ....dddd..cccccc..bbbbbb....aaaa
// ..................ffffff..eeeeee
// make
// ....dddd..cccccc..bbbbbb....aaaa
// 000....dddd..cccccc..bbbbbb....a
// ..................ffffff..eeeeee
// ffffff..eeeeee000000000000000000
//
// CJK Four-byte bigram
// ..dddddd..cccccc....bbbb....aaaa
// ..hhhhhh..gggggg....ffff....eeee
// make
// ..dddddd..cccccc....bbbb....aaaa
// 000..dddddd..cccccc....bbbb....a
// ..hhhhhh..gggggg....ffff....eeee
// ..ffff....eeee000000000000000000
// BIGRAM
// Hash 1..8 bytes starting at word_ptr via mask/shift/add. No pre/post space
// indicators are mixed in. Returns 0 when bytecount is 0.
// The bytes are read as one or two 32-bit words; the last word read is masked
// down to the bytes that belong to the bigram.
// OVERSHOOTS up to 3 bytes
// For runtime use of tables
// Does X86 unaligned loads
uint32 BiHashV2(const char* word_ptr, int bytecount) {
  if (bytecount == 0) {return 0;}
  const uint32* words = reinterpret_cast<const uint32*>(word_ptr);
  // Mask for the final, possibly partial, 32-bit word
  const uint32 tail_mask = kWordMask0[bytecount & 3];

  if (bytecount <= 4) {
    const uint32 only = UNALIGNED_LOAD32(words) & tail_mask;
    return only ^ (only >> 3);
  }

  // Else do 8 bytes
  const uint32 first = UNALIGNED_LOAD32(words);
  const uint32 second = UNALIGNED_LOAD32(words + 1) & tail_mask;
  return (first ^ (first >> 3)) + (second ^ (second << 18));
}
//
// Ascii-7 One-byte chars
// ...ddddd...ccccc...bbbbb...aaaaa
// make
// ...ddddd...ccccc...bbbbb...aaaaa
// 000...ddddd...ccccc...bbbbb...aa
//
// Latin 1- and 2-byte chars
// ...ddddd...ccccc...bbbbb...aaaaa
// ...................fffff...eeeee
// make
// ...ddddd...ccccc...bbbbb...aaaaa
// 000...ddddd...ccccc...bbbbb...aa
// ...................fffff...eeeee
// ...............fffff...eeeee0000
//
// Non-CJK Two-byte chars
// ...ddddd...........bbbbb........
// ...hhhhh...........fffff........
// make
// ...ddddd...........bbbbb........
// 000...ddddd...........bbbbb.....
// ...hhhhh...........fffff........
// hhhh...........fffff........0000
//
// Non-CJK Three-byte chars
// ...........ccccc................
// ...................fffff........
// ...lllll...................iiiii
// make
// ...........ccccc................
// 000...........ccccc.............
// ...................fffff........
// ...............fffff........0000
// ...lllll...................iiiii
// .lllll...................iiiii00
//
// QUADGRAM
// Hash 1..12 bytes starting at word_ptr via mask/shift/add, mixing in the
// caller-supplied pre/post space indicator bits (prepost).
// The bytes are read as one, two or three 32-bit words; the last word read is
// masked down to the bytes that belong to the quadgram.
// OVERSHOOTS up to 3 bytes
// For runtime use of tables
// Does X86 unaligned loads
uint32 QuadHashV2Mix(const char* word_ptr, int bytecount, uint32 prepost) {
  const uint32* words = reinterpret_cast<const uint32*>(word_ptr);
  // Mask for the final, possibly partial, 32-bit word
  const uint32 tail_mask = kWordMask0[bytecount & 3];

  if (bytecount <= 4) {
    uint32 w0 = UNALIGNED_LOAD32(words) & tail_mask;
    w0 ^= (w0 >> 3);
    return w0 ^ prepost;
  }

  // The first word is complete in both remaining cases
  uint32 w0 = UNALIGNED_LOAD32(words);
  w0 ^= (w0 >> 3);

  if (bytecount <= 8) {
    uint32 w1 = UNALIGNED_LOAD32(words + 1) & tail_mask;
    w1 ^= (w1 << 4);
    return (w0 ^ prepost) + w1;
  }

  // else do 12 bytes
  uint32 w1 = UNALIGNED_LOAD32(words + 1);
  w1 ^= (w1 << 4);
  uint32 w2 = UNALIGNED_LOAD32(words + 2) & tail_mask;
  w2 ^= (w2 << 2);
  return (w0 ^ prepost) + w1 + w2;
}
// QUADGRAM wrapper with surrounding spaces
// Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add
// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
// For runtime use of tables
// Runtime quadgram hash. Returns 0 for an empty gram; otherwise inspects the
// byte just before the gram (word_ptr[-1]) and the byte just after it
// (word_ptr[bytecount]) and records a space at either side as a boundary bit
// before delegating to QuadHashV2Mix.
uint32 QuadHashV2(const char* word_ptr, int bytecount) {
  if (bytecount == 0) {
    return 0;
  }
  const bool space_before = (word_ptr[-1] == ' ');
  const bool space_after = (word_ptr[bytecount] == ' ');

  uint32 boundary_bits = 0;
  if (space_before) {
    boundary_bits |= kPreSpaceIndicator;
  }
  if (space_after) {
    boundary_bits |= kPostSpaceIndicator;
  }
  return QuadHashV2Mix(word_ptr, bytecount, boundary_bits);
}
// QUADGRAM wrapper with surrounding underscores (offline use)
// Pick up 1..12 bytes plus pre/post '_' and hash them via mask/shift/add
// OVERSHOOTS up to 3 bytes
// For offline construction of tables
// Offline quadgram hash. Here the word boundaries are spelled as '_' inside
// the gram itself: a leading and/or trailing underscore is stripped and
// turned into the matching boundary bit, then the remaining bytes are hashed
// by QuadHashV2Mix. Returns 0 for an empty gram.
uint32 QuadHashV2Underscore(const char* word_ptr, int bytecount) {
  if (bytecount == 0) {
    return 0;
  }
  uint32 boundary_bits = 0;
  const char* first = word_ptr;
  int remaining = bytecount;

  // A leading '_' marks a word start; drop it from the hashed bytes.
  if (*first == '_') {
    boundary_bits |= kPreSpaceIndicator;
    first += 1;
    remaining -= 1;
  }
  // A trailing '_' marks a word end; drop it as well.
  if (first[remaining - 1] == '_') {
    boundary_bits |= kPostSpaceIndicator;
    remaining -= 1;
  }
  return QuadHashV2Mix(first, remaining, boundary_bits);
}
// OCTAGRAM
// Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
//
// The low 32 bits follow the pattern from above, tuned to different scripts
// The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each
// For runtime use of tables V3
// Does X86 unaligned loads
// Hash 1..24 bytes starting at word_ptr into a 40-bit value.
//   word_ptr   first byte of the gram; word_ptr[-1] and word_ptr[bytecount]
//              are also read to detect surrounding spaces
//   bytecount  gram length in bytes; bytes beyond 24 are ignored
//   prepost    boundary bits supplied by the caller (space bits are OR-ed in)
// Low 32 bits: each 32-bit word is XORed with a shifted copy of itself and the
// results are added together. Bits 32..39: a folded sum of the raw words.
// The last word read is masked with kWordMask0[bytecount & 3], so the load
// may overshoot the gram by up to 3 bytes.
uint64 OctaHash40Mix(const char* word_ptr, int bytecount, uint64 prepost) {
  // Mixing recipe per word position: shift amount, and whether it is a left
  // shift (otherwise a right shift).
  static const int kMixShift[6] = {3, 4, 2, 8, 4, 6};
  static const bool kMixLeft[6] = {false, true, true, false, false, false};

  const uint32* word_ptr32 = reinterpret_cast<const uint32*>(word_ptr);

  if (word_ptr[-1] == ' ') {
    prepost |= kPreSpaceIndicator;
  }
  if (word_ptr[bytecount] == ' ') {
    prepost |= kPostSpaceIndicator;
  }

  // 1..4 bytes -> 1 word, 5..8 -> 2, ... 17..20 -> 5; everything else
  // (21 and up, and any non-positive count) uses all 6 words.
  const int group = (bytecount - 1) >> 2;
  const int word_count = (0 <= group && group <= 4) ? (group + 1) : 6;
  const int last = word_count - 1;

  uint64 mixed = 0;
  uint64 sum = 0;
  for (int i = 0; i < word_count; ++i) {
    uint64 word;
    if (i == last) {
      // Only the final word is trimmed to the bytes that belong to the gram.
      word = UNALIGNED_LOAD32(word_ptr32 + i) & kWordMask0[bytecount & 3];
    } else {
      word = UNALIGNED_LOAD32(word_ptr32 + i);
    }
    sum += word;
    if (kMixLeft[i]) {
      word ^= (word << kMixShift[i]);
    } else {
      word ^= (word >> kMixShift[i]);
    }
    mixed += word;
  }

  sum += (sum >> 17);             // extra 1-bit shift for bytes 2 & 3
  sum += (sum >> 9);              // extra 1-bit shift for bytes 1 & 3
  sum = (sum & 0xff) << 32;       // keep 8 bits, place them above the low 32
  return (mixed ^ prepost) + sum;
}
// OCTAGRAM wrapper with surrounding spaces
// Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
//
// The low 32 bits follow the pattern from above, tuned to different scripts
// The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each
// For runtime use of tables V3
// Runtime octagram hash. Returns 0 for an empty gram; otherwise records a
// space immediately before (word_ptr[-1]) or after (word_ptr[bytecount]) the
// gram as boundary bits and hands off to OctaHash40Mix.
uint64 OctaHash40(const char* word_ptr, int bytecount) {
  if (bytecount == 0) {
    return 0;
  }
  const bool space_before = (word_ptr[-1] == ' ');
  const bool space_after = (word_ptr[bytecount] == ' ');

  uint64 boundary_bits = 0;
  if (space_before) {
    boundary_bits |= kPreSpaceIndicator;
  }
  if (space_after) {
    boundary_bits |= kPostSpaceIndicator;
  }
  return OctaHash40Mix(word_ptr, bytecount, boundary_bits);
}
// OCTAGRAM wrapper with surrounding underscores (offline use)
// Pick up 1..24 bytes plus pre/post '_' and hash them via mask/shift/add
// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
//
// The low 32 bits follow the pattern from above, tuned to different scripts
// The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each
// For offline construction of tables
// Offline octagram hash. Word boundaries are written as '_' inside the gram:
// a leading and/or trailing underscore is stripped and converted into the
// matching boundary bit, then the remaining bytes go to OctaHash40Mix.
// Returns 0 for an empty gram.
uint64 OctaHash40underscore(const char* word_ptr, int bytecount) {
  if (bytecount == 0) {
    return 0;
  }
  uint64 boundary_bits = 0;
  const char* first = word_ptr;
  int remaining = bytecount;

  // Leading '_' => word start; exclude it from the hashed bytes.
  if (*first == '_') {
    boundary_bits |= kPreSpaceIndicator;
    first += 1;
    remaining -= 1;
  }
  // Trailing '_' => word end; exclude it too.
  if (first[remaining - 1] == '_') {
    boundary_bits |= kPostSpaceIndicator;
    remaining -= 1;
  }
  return OctaHash40Mix(first, remaining, boundary_bits);
}
// Hash a consecutive pair of tokens/words A B
// Old: hash is B - A, which gives too many false hits on one-char diffs
// Now: rotate(A,13) + B
// Combine the hashes of two consecutive words A and B into one pair hash:
// A is rotated right by 13 bits within 64 bits, then B is added.
uint64 PairHash(uint64 worda_hash, uint64 wordb_hash) {
  const uint64 low_part = worda_hash >> 13;
  const uint64 wrapped_part = worda_hash << (64 - 13);
  return (low_part | wrapped_part) + wordb_hash;
}
//----------------------------------------------------------------------------//
// Finding groups of 1/2/4/8 letters //
//----------------------------------------------------------------------------//
// src points to a letter. Find the byte length of a unigram starting there.
int UniLen(const char* src) {
const char* src_end = src;
src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
return src_end - src;
}
// src points to a letter. Find the byte length of a bigram starting there.
int BiLen(const char* src) {
const char* src_end = src;
src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
return src_end - src;
}
// src points to a letter. Find the byte length of a quadgram starting there.
int QuadLen(const char* src) {
const char* src_end = src;
src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
return src_end - src;
}
// src points to a letter. Find the byte length of an octagram starting there.
// Byte length of an octagram starting at src: up to 8 characters, stopping
// early at the first space.
//   src      points at the first byte of a letter
//   returns  number of bytes from src to the end of the octagram
// Fix: each step now advances by the length of the character at the cursor.
// The previous code always used the length of the FIRST character
// (UTF8OneCharLen(src)), which miscounts any word whose characters differ in
// encoded width and can leave the cursor in the middle of a character.
int OctaLen(const char* src) {
  const char* src_end = src;
  for (int charcount = 0; charcount < 8 && src_end[0] != ' '; ++charcount) {
    src_end += UTF8OneCharLen(src_end);
  }
  return src_end - src;
}
} // End namespace CLD2

View File

@ -0,0 +1,509 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Author: dsites@google.com (Dick Sites)
//
// Just the stuff shared between offline table builder and online detector
//
#ifndef I18N_ENCODINGS_CLD2_INTERNAL_NEW_CLDUTIL_SHARED_H__
#define I18N_ENCODINGS_CLD2_INTERNAL_NEW_CLDUTIL_SHARED_H__
#include "integral_types.h"
#include "cld2tablesummary.h"
namespace CLD2 {
// Runtime routines for hashing, looking up, and scoring
// unigrams (CJK), bigrams (CJK), quadgrams, and octagrams.
// Unigrams and bigrams are for CJK languages only, including simplified/
// traditional Chinese, Japanese, Korean, Vietnamese Han characters, and
// Zhuang Han characters. Surrounding spaces are not considered.
// Quadgrams and octagrams are for non-CJK and include two bits indicating
// preceding and trailing spaces (word boundaries).
//----------------------------------------------------------------------------//
// Main quantized probability table //
//----------------------------------------------------------------------------//
// Table has 240 eight-byte entries. Each entry has a five-byte array and
// a three-byte array of log base 2 probabilities in the range 1..12.
// The intended use is to express five or three probabilities in a single-byte
// subscript, then decode via this table. These probabilities are
// intended to go with an array of five or three language numbers.
//
// The corresponding language numbers will have to be sorted by descending
// probability, then the actual probability subscript chosen to match the
// closest available entry in this table.
//
// Pattern of probability values:
// hi 3/4 1/2 1/4 lo hi mid lo
// where "3/4" is (hi*3+lo)/4, "1/2" is (hi+lo)/2, and "1/4" is (hi+lo*3)/4
// and mid is one of 3/4 1/2 or 1/4.
// There are three groups of 78 (=12*13/2) entries, with hi running 1..12 and
// lo running 1..hi. Only the first group is used for five-entry lookups.
// The mid value in the first group is 1/2, the second group 3/4, and the
// third group 1/4. For three-entry lookups, this allows the mid entry to be
// somewhat higher or lower than the midpoint, to allow a better match to the
// original probabilities.
// Number of 8-byte entries in kLgProbV2Tbl: three groups of 78 plus the
// 6 entries appended at the end for CJK compatible mapping.
static const int kLgProbV2TblSize = 240;
// Each row is one entry: five probabilities, then three probabilities.
// The five-value half is the same in all three groups; only the middle of
// the three-value half changes between groups.
static const uint8 kLgProbV2Tbl[kLgProbV2TblSize * 8] = {
// Group 1, entries [0..77]: three-value mid is the 1/2 point
1,1,1,1,1, 1,1,1, // [0]
2,2,2,1,1, 2,2,1, // [1]
2,2,2,2,2, 2,2,2,
3,3,2,2,1, 3,2,1, // [3]
3,3,3,2,2, 3,3,2,
3,3,3,3,3, 3,3,3,
4,3,3,2,1, 4,3,1, // [6]
4,4,3,3,2, 4,3,2,
4,4,4,3,3, 4,4,3,
4,4,4,4,4, 4,4,4,
5,4,3,2,1, 5,3,1, // [10]
5,4,4,3,2, 5,4,2,
5,5,4,4,3, 5,4,3,
5,5,5,4,4, 5,5,4,
5,5,5,5,5, 5,5,5,
6,5,4,2,1, 6,4,1, // [15]
6,5,4,3,2, 6,4,2,
6,5,5,4,3, 6,5,3,
6,6,5,5,4, 6,5,4,
6,6,6,5,5, 6,6,5,
6,6,6,6,6, 6,6,6,
7,6,4,3,1, 7,4,1, // [21]
7,6,5,3,2, 7,5,2,
7,6,5,4,3, 7,5,3,
7,6,6,5,4, 7,6,4,
7,7,6,6,5, 7,6,5,
7,7,7,6,6, 7,7,6,
7,7,7,7,7, 7,7,7,
8,6,5,3,1, 8,5,1, // [28]
8,7,5,4,2, 8,5,2,
8,7,6,4,3, 8,6,3,
8,7,6,5,4, 8,6,4,
8,7,7,6,5, 8,7,5,
8,8,7,7,6, 8,7,6,
8,8,8,7,7, 8,8,7,
8,8,8,8,8, 8,8,8,
9,7,5,3,1, 9,5,1, // [36]
9,7,6,4,2, 9,6,2,
9,8,6,5,3, 9,6,3,
9,8,7,5,4, 9,7,4,
9,8,7,6,5, 9,7,5,
9,8,8,7,6, 9,8,6,
9,9,8,8,7, 9,8,7,
9,9,9,8,8, 9,9,8,
9,9,9,9,9, 9,9,9,
10,8,6,3,1, 10,6,1, // [45]
10,8,6,4,2, 10,6,2,
10,8,7,5,3, 10,7,3,
10,9,7,6,4, 10,7,4,
10,9,8,6,5, 10,8,5,
10,9,8,7,6, 10,8,6,
10,9,9,8,7, 10,9,7,
10,10,9,9,8, 10,9,8,
10,10,10,9,9, 10,10,9,
10,10,10,10,10, 10,10,10,
11,9,6,4,1, 11,6,1, // [55]
11,9,7,4,2, 11,7,2,
11,9,7,5,3, 11,7,3,
11,9,8,6,4, 11,8,4,
11,10,8,7,5, 11,8,5,
11,10,9,7,6, 11,9,6,
11,10,9,8,7, 11,9,7,
11,10,10,9,8, 11,10,8,
11,11,10,10,9, 11,10,9,
11,11,11,10,10, 11,11,10,
11,11,11,11,11, 11,11,11,
12,9,7,4,1, 12,7,1, // [66]
12,10,7,5,2, 12,7,2,
12,10,8,5,3, 12,8,3,
12,10,8,6,4, 12,8,4,
12,10,9,7,5, 12,9,5,
12,11,9,8,6, 12,9,6,
12,11,10,8,7, 12,10,7,
12,11,10,9,8, 12,10,8,
12,11,11,10,9, 12,11,9,
12,12,11,11,10, 12,11,10,
12,12,12,11,11, 12,12,11,
12,12,12,12,12, 12,12,12,
// Group 2, entries [78..155]: three-value mid is the 3/4 point
1,1,1,1,1, 1,1,1,
2,2,2,1,1, 2,2,1,
2,2,2,2,2, 2,2,2,
3,3,2,2,1, 3,3,1,
3,3,3,2,2, 3,3,2,
3,3,3,3,3, 3,3,3,
4,3,3,2,1, 4,3,1,
4,4,3,3,2, 4,4,2,
4,4,4,3,3, 4,4,3,
4,4,4,4,4, 4,4,4,
5,4,3,2,1, 5,4,1,
5,4,4,3,2, 5,4,2,
5,5,4,4,3, 5,5,3,
5,5,5,4,4, 5,5,4,
5,5,5,5,5, 5,5,5,
6,5,4,2,1, 6,5,1,
6,5,4,3,2, 6,5,2,
6,5,5,4,3, 6,5,3,
6,6,5,5,4, 6,6,4,
6,6,6,5,5, 6,6,5,
6,6,6,6,6, 6,6,6,
7,6,4,3,1, 7,6,1,
7,6,5,3,2, 7,6,2,
7,6,5,4,3, 7,6,3,
7,6,6,5,4, 7,6,4,
7,7,6,6,5, 7,7,5,
7,7,7,6,6, 7,7,6,
7,7,7,7,7, 7,7,7,
8,6,5,3,1, 8,6,1,
8,7,5,4,2, 8,7,2,
8,7,6,4,3, 8,7,3,
8,7,6,5,4, 8,7,4,
8,7,7,6,5, 8,7,5,
8,8,7,7,6, 8,8,6,
8,8,8,7,7, 8,8,7,
8,8,8,8,8, 8,8,8,
9,7,5,3,1, 9,7,1,
9,7,6,4,2, 9,7,2,
9,8,6,5,3, 9,8,3,
9,8,7,5,4, 9,8,4,
9,8,7,6,5, 9,8,5,
9,8,8,7,6, 9,8,6,
9,9,8,8,7, 9,9,7,
9,9,9,8,8, 9,9,8,
9,9,9,9,9, 9,9,9,
10,8,6,3,1, 10,8,1,
10,8,6,4,2, 10,8,2,
10,8,7,5,3, 10,8,3,
10,9,7,6,4, 10,9,4,
10,9,8,6,5, 10,9,5,
10,9,8,7,6, 10,9,6,
10,9,9,8,7, 10,9,7,
10,10,9,9,8, 10,10,8,
10,10,10,9,9, 10,10,9,
10,10,10,10,10, 10,10,10,
11,9,6,4,1, 11,9,1,
11,9,7,4,2, 11,9,2,
11,9,7,5,3, 11,9,3,
11,9,8,6,4, 11,9,4,
11,10,8,7,5, 11,10,5,
11,10,9,7,6, 11,10,6,
11,10,9,8,7, 11,10,7,
11,10,10,9,8, 11,10,8,
11,11,10,10,9, 11,11,9,
11,11,11,10,10, 11,11,10,
11,11,11,11,11, 11,11,11,
12,9,7,4,1, 12,9,1,
12,10,7,5,2, 12,10,2,
12,10,8,5,3, 12,10,3,
12,10,8,6,4, 12,10,4,
12,10,9,7,5, 12,10,5,
12,11,9,8,6, 12,11,6,
12,11,10,8,7, 12,11,7,
12,11,10,9,8, 12,11,8,
12,11,11,10,9, 12,11,9,
12,12,11,11,10, 12,12,10,
12,12,12,11,11, 12,12,11,
12,12,12,12,12, 12,12,12,
// Group 3, entries [156..233]: three-value mid is the 1/4 point
1,1,1,1,1, 1,1,1,
2,2,2,1,1, 2,1,1,
2,2,2,2,2, 2,2,2,
3,3,2,2,1, 3,2,1,
3,3,3,2,2, 3,2,2,
3,3,3,3,3, 3,3,3,
4,3,3,2,1, 4,2,1,
4,4,3,3,2, 4,3,2,
4,4,4,3,3, 4,3,3,
4,4,4,4,4, 4,4,4,
5,4,3,2,1, 5,2,1,
5,4,4,3,2, 5,3,2,
5,5,4,4,3, 5,4,3,
5,5,5,4,4, 5,4,4,
5,5,5,5,5, 5,5,5,
6,5,4,2,1, 6,2,1,
6,5,4,3,2, 6,3,2,
6,5,5,4,3, 6,4,3,
6,6,5,5,4, 6,5,4,
6,6,6,5,5, 6,5,5,
6,6,6,6,6, 6,6,6,
7,6,4,3,1, 7,3,1,
7,6,5,3,2, 7,3,2,
7,6,5,4,3, 7,4,3,
7,6,6,5,4, 7,5,4,
7,7,6,6,5, 7,6,5,
7,7,7,6,6, 7,6,6,
7,7,7,7,7, 7,7,7,
8,6,5,3,1, 8,3,1,
8,7,5,4,2, 8,4,2,
8,7,6,4,3, 8,4,3,
8,7,6,5,4, 8,5,4,
8,7,7,6,5, 8,6,5,
8,8,7,7,6, 8,7,6,
8,8,8,7,7, 8,7,7,
8,8,8,8,8, 8,8,8,
9,7,5,3,1, 9,3,1,
9,7,6,4,2, 9,4,2,
9,8,6,5,3, 9,5,3,
9,8,7,5,4, 9,5,4,
9,8,7,6,5, 9,6,5,
9,8,8,7,6, 9,7,6,
9,9,8,8,7, 9,8,7,
9,9,9,8,8, 9,8,8,
9,9,9,9,9, 9,9,9,
10,8,6,3,1, 10,3,1,
10,8,6,4,2, 10,4,2,
10,8,7,5,3, 10,5,3,
10,9,7,6,4, 10,6,4,
10,9,8,6,5, 10,6,5,
10,9,8,7,6, 10,7,6,
10,9,9,8,7, 10,8,7,
10,10,9,9,8, 10,9,8,
10,10,10,9,9, 10,9,9,
10,10,10,10,10, 10,10,10,
11,9,6,4,1, 11,4,1,
11,9,7,4,2, 11,4,2,
11,9,7,5,3, 11,5,3,
11,9,8,6,4, 11,6,4,
11,10,8,7,5, 11,7,5,
11,10,9,7,6, 11,7,6,
11,10,9,8,7, 11,8,7,
11,10,10,9,8, 11,9,8,
11,11,10,10,9, 11,10,9,
11,11,11,10,10, 11,10,10,
11,11,11,11,11, 11,11,11,
12,9,7,4,1, 12,4,1,
12,10,7,5,2, 12,5,2,
12,10,8,5,3, 12,5,3,
12,10,8,6,4, 12,6,4,
12,10,9,7,5, 12,7,5,
12,11,9,8,6, 12,8,6,
12,11,10,8,7, 12,8,7,
12,11,10,9,8, 12,9,8,
12,11,11,10,9, 12,10,9,
12,12,11,11,10, 12,11,10,
12,12,12,11,11, 12,11,11,
12,12,12,12,12, 12,12,12,
// Added 2013.01.28 for CJK compatible mapping
// Entries [234..239]; these do not follow the hi/lo pattern of the groups
8,5,2,2,2, 8,2,2,
6,6,6,4,2, 6,6,2,
6,5,4,4,4, 6,4,4,
6,4,2,2,2, 6,2,2,
4,3,2,2,2, 4,2,2,
2,2,2,2,2, 2,2,2,
};
// Backmap a single desired probability into an entry in kLgProbV2Tbl
// Indexed by a desired probability p in 0..12. For p in 1..12 the value is
// the subscript of the first kLgProbV2Tbl entry whose leading (hi) value is
// p, i.e. the entry with lo == 1: 0, 1, 3, 6, ... = p*(p-1)/2.
// Index 0 is not a valid probability and maps to entry 0.
static const uint8 kLgProbV2TblBackmap[13] = {
0,
0, 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 66,
};
// Return address of 8-byte entry[i]
// Pointer to the first byte of the 8-byte kLgProbV2Tbl entry with subscript i.
// No range check is performed on i.
inline const uint8* LgProb2TblEntry(int i) {
  const int byte_offset = i * 8;
  return kLgProbV2Tbl + byte_offset;
}
// Return one of three probabilities in an entry
// Read probability j from the three-value half of an entry, which follows
// the five-value half (bytes 5..7 of the 8-byte entry).
inline uint8 LgProb3(const uint8* entry, int j) {
  const uint8* three_value_part = entry + 5;
  return three_value_part[j];
}
// Routines to access a hash table of <key:wordhash, value:probs> pairs
// Buckets have 4-byte wordhash for sizes < 32K buckets, but only
// 2-byte wordhash for sizes >= 32K buckets, with other wordhash bits used as
// bucket subscript.
// Probs is packed: three languages plus a subscript for probability table
// Buckets have all the keys together, then all the values. Key array never
// crosses a cache-line boundary, so no-match case takes exactly one cache miss.
// Match case may sometimes take an additional cache miss on value access.
//
// Other possibilities include 5 or 10 6-byte entries plus pad to make 32 or 64
// byte buckets with single cache miss.
// Or 2-byte key and 6-byte value, allowing 5 languages instead of three.
//----------------------------------------------------------------------------//
// Hashing groups of 1/2/4/8 letters, perhaps with spaces or underscores //
//----------------------------------------------------------------------------//
// BIGRAM
// Pick up 1..8 bytes and hash them via mask/shift/add. NO pre/post
// OVERSHOOTS up to 3 bytes
// For runtime use of tables
// Does X86 unaligned loads (UNALIGNED_LOAD32) if !defined(NEED_ALIGNED_LOADS)
uint32 BiHashV2(const char* word_ptr, int bytecount);
// QUADGRAM wrapper with surrounding spaces
// Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add
// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
// For runtime use of tables
// Returns 0 when bytecount is 0
uint32 QuadHashV2(const char* word_ptr, int bytecount);
// QUADGRAM wrapper with surrounding underscores (offline use)
// Pick up 1..12 bytes plus pre/post '_' and hash them via mask/shift/add
// OVERSHOOTS up to 3 bytes
// For offline construction of tables
// Returns 0 when bytecount is 0
uint32 QuadHashV2Underscore(const char* word_ptr, int bytecount);
// OCTAGRAM wrapper with surrounding spaces
// Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
// For runtime use of tables
// Returns 0 when bytecount is 0
uint64 OctaHash40(const char* word_ptr, int bytecount);
// OCTAGRAM wrapper with surrounding underscores (offline use)
// Pick up 1..24 bytes plus pre/post '_' and hash them via mask/shift/add
// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
// For offline construction of tables
// Returns 0 when bytecount is 0
uint64 OctaHash40underscore(const char* word_ptr, int bytecount);
// Hash a consecutive pair of tokens/words A B
// Result is rotate(A, 13) + B
uint64 PairHash(uint64 worda_hash, uint64 wordb_hash);
// From 32-bit gram FP, return hash table subscript and remaining key
// Split a 32-bit gram fingerprint into a bucket subscript and a lookup key.
//   quadhash     fingerprint to split
//   keymask      bits of the fingerprint kept as the key
//   bucketcount  number of buckets; the subscript is formed by AND-ing with
//                bucketcount - 1 (presumably a power of two -- confirm)
//   subscr       out: bucket subscript
//   hashkey      out: fingerprint masked by keymask
inline void QuadFPJustHash(uint32 quadhash,
                           uint32 keymask,
                           int bucketcount,
                           uint32* subscr, uint32* hashkey) {
  // Fold the upper bits down before masking to the bucket range.
  const uint32 folded = quadhash + (quadhash >> 12);
  *subscr = folded & (bucketcount - 1);
  *hashkey = quadhash & keymask;
}
// From 40-bit gram FP, return hash table subscript and remaining key
// Split a 40-bit gram fingerprint into a bucket subscript and a lookup key.
//   longwordhash  fingerprint to split
//   keymask       bits kept as the key
//   bucketcount   number of buckets; the subscript is formed by AND-ing with
//                 bucketcount - 1 (presumably a power of two -- confirm)
//   subscr        out: bucket subscript
//   hashkey       out: (fingerprint >> 4), truncated to 32 bits, masked by
//                 keymask
inline void OctaFPJustHash(uint64 longwordhash,
                           uint32 keymask,
                           int bucketcount,
                           uint32* subscr, uint32* hashkey) {
  // Fold the upper bits down before masking to the bucket range.
  const uint64 folded = longwordhash + (longwordhash >> 12);
  *subscr = static_cast<uint32>(folded & (bucketcount - 1));

  const uint32 shifted = static_cast<uint32>(longwordhash >> 4);
  *hashkey = shifted & keymask;
}
// Look up 32-bit gram FP in caller-passed table
// Typical size 256K entries (1.5MB)
// Two-byte hashkey
// Look up a 32-bit gram fingerprint in the table described by gram_obj.
// The fingerprint selects one bucket; the bucket's four keyvalue slots are
// checked in order and the first slot whose key bits (under the table's key
// mask) match is returned whole. Returns 0 when no slot matches.
inline const uint32 QuadHashV3Lookup4(const CLD2TableSummary* gram_obj,
                                      uint32 quadhash) {
  const uint32 keymask = gram_obj->kCLDTableKeyMask;
  const int bucketcount = gram_obj->kCLDTableSize;

  uint32 subscr, hashkey;
  QuadFPJustHash(quadhash, keymask, bucketcount, &subscr, &hashkey);

  const IndirectProbBucket4* bucket_ptr = &gram_obj->kCLDTable[subscr];
  // Four-way associative: probe each way in turn.
  for (int way = 0; way < 4; ++way) {
    if (((hashkey ^ bucket_ptr->keyvalue[way]) & keymask) == 0) {
      return bucket_ptr->keyvalue[way];
    }
  }
  return 0;
}
// Look up 40-bit gram FP in caller-passed table
// Typical size 256K-4M entries (1-16MB)
// 24-12 bit hashkey packed with 8-20 bit indirect lang/probs
// keymask is 0xfffff000 for 20-bit hashkey and 12-bit indirect
// Look up a 40-bit gram fingerprint in the table described by gram_obj.
// The fingerprint selects one bucket; the bucket's four keyvalue slots are
// checked in order and the first slot whose key bits (under the table's key
// mask) match is returned whole. Returns 0 when no slot matches.
inline const uint32 OctaHashV3Lookup4(const CLD2TableSummary* gram_obj,
                                      uint64 longwordhash) {
  const uint32 keymask = gram_obj->kCLDTableKeyMask;
  const int bucketcount = gram_obj->kCLDTableSize;

  uint32 subscr, hashkey;
  OctaFPJustHash(longwordhash, keymask, bucketcount, &subscr, &hashkey);

  const IndirectProbBucket4* bucket_ptr = &gram_obj->kCLDTable[subscr];
  // Four-way associative: probe each way in turn.
  for (int way = 0; way < 4; ++way) {
    if (((hashkey ^ bucket_ptr->keyvalue[way]) & keymask) == 0) {
      return bucket_ptr->keyvalue[way];
    }
  }
  return 0;
}
//----------------------------------------------------------------------------//
// Finding groups of 1/2/4/8 letters //
//----------------------------------------------------------------------------//
// Does not advance past space or tab/cr/lf/nul
// Number of bytes to advance, indexed by the byte under the cursor.
// An entry of 0 means "do not move": that is the case for every byte from
// 0x00 through 0x20 (all control bytes and space), not only tab/cr/lf/nul.
static const uint8 kAdvanceOneCharButSpace[256] = {
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0x00-0x1F
0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0x20 (space) is 0
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0x80-0x9F
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0xA0-0xBF
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,  // 0xC0-0xDF
3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4,  // 0xE0-0xEF, 0xF0-0xFF
};
// Advances *only* on space or ASCII vowel (or illegal byte)
// Indexed by the byte under the cursor; 1 = advance one byte, 0 = stay.
// Entries are 1 for 0x00-0x20 (control bytes and space), for the ASCII
// vowels A E I O U in both cases, and for 0x80-0xBF; every other byte is 0.
static const uint8 kAdvanceOneCharSpaceVowel[256] = {
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0x00-0x1F
1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0x20 (space) is 1
0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0,  // A E I O U
0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0,  // a e i o u
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0x80-0x9F
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0xA0-0xBF
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0xC0-0xDF
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0xE0-0xFF
};
// The first three routines step with kAdvanceOneCharButSpace, so they stop
// advancing at a space or control byte and may return fewer characters.
// src points to a letter. Find the byte length of a unigram starting there.
int UniLen(const char* src);
// src points to a letter. Find the byte length of a bigram starting there.
int BiLen(const char* src);
// src points to a letter. Find the byte length of a quadgram starting there.
int QuadLen(const char* src);
// src points to a letter. Find the byte length of an octagram starting there.
// Covers at most 8 characters and stops early at a space.
int OctaLen(const char* src);
} // End namespace CLD2
#endif // I18N_ENCODINGS_CLD2_INTERNAL_NEW_CLDUTIL_SHARED_H__

View File

@ -0,0 +1,322 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Author: dsites@google.com (Dick Sites)
//
#include <stdio.h>
#include <stdlib.h>
#include "../public/compact_lang_det.h"
#include "../public/encodings.h"
#include "compact_lang_det_impl.h"
#include "integral_types.h"
#include "lang_script.h"
namespace CLD2 {
// String is "code_version - data_scrape_date"
//static const char* kDetectLanguageVersion = "V2.0 - 20130715";
// Large-table version for all ~160 languages
// Small-table version for all ~60 languages
// Scan interchange-valid UTF-8 bytes and detect most likely language
// Simplest entry point: detect the single most likely language of buffer.
//   buffer, buffer_length  text to scan
//   is_plain_text          forwarded unchanged to DetectLanguageSummaryV2
//   is_reliable            out: reliability flag set by the detector
// Runs with no hints, flags 0 and extended languages disabled. The top-3
// details the detector produces are collected in locals and discarded.
// A result of UNKNOWN_LANGUAGE is reported as ENGLISH.
Language DetectLanguage(
    const char* buffer,
    int buffer_length,
    bool is_plain_text,
    bool* is_reliable) {
  // No content-language, TLD, encoding or language hint.
  CLDHints no_hints = {NULL, "", UNKNOWN_ENCODING, UNKNOWN_LANGUAGE};

  // Scratch outputs required by the detector but not exposed to the caller.
  Language top_languages[3];
  int top_percents[3];
  double top_scores[3];
  int scanned_bytes;

  const Language detected = DetectLanguageSummaryV2(
      buffer,
      buffer_length,
      is_plain_text,
      &no_hints,
      false,             // allow_extended_lang
      0,                 // flags
      UNKNOWN_LANGUAGE,  // plus_one
      top_languages,
      top_percents,
      top_scores,
      NULL,              // no result chunk vector
      &scanned_bytes,
      is_reliable);

  // Default to English
  return (detected == UNKNOWN_LANGUAGE) ? ENGLISH : detected;
}
// Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
// Detect the top three languages of buffer, without hints.
//   language3, percent3  out: arrays filled by the detector for the top 3
//   text_bytes           out: set by the detector
//   is_reliable          out: reliability flag set by the detector
// Runs with flags 0 and extended languages disabled; normalized scores are
// computed into a local array and discarded.
// A result of UNKNOWN_LANGUAGE is reported as ENGLISH.
Language DetectLanguageSummary(
    const char* buffer,
    int buffer_length,
    bool is_plain_text,
    Language* language3,
    int* percent3,
    int* text_bytes,
    bool* is_reliable) {
  // No content-language, TLD, encoding or language hint.
  CLDHints no_hints = {NULL, "", UNKNOWN_ENCODING, UNKNOWN_LANGUAGE};
  double unused_scores[3];

  const Language detected = DetectLanguageSummaryV2(
      buffer,
      buffer_length,
      is_plain_text,
      &no_hints,
      false,             // allow_extended_lang
      0,                 // flags
      UNKNOWN_LANGUAGE,  // plus_one
      language3,
      percent3,
      unused_scores,
      NULL,              // no result chunk vector
      text_bytes,
      is_reliable);

  // Default to English
  return (detected == UNKNOWN_LANGUAGE) ? ENGLISH : detected;
}
// Same as above, with hints supplied
// Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
// Detect the top three languages of buffer, with caller-supplied hints.
//   tld_hint, encoding_hint, language_hint  packed into a CLDHints whose
//                                           first field is left NULL
//   language3, percent3  out: arrays filled by the detector for the top 3
//   text_bytes           out: set by the detector
//   is_reliable          out: reliability flag set by the detector
// Runs with flags 0 and extended languages disabled; normalized scores are
// computed into a local array and discarded.
// A result of UNKNOWN_LANGUAGE is reported as ENGLISH.
Language DetectLanguageSummary(
    const char* buffer,
    int buffer_length,
    bool is_plain_text,
    const char* tld_hint,       // "id" boosts Indonesian
    int encoding_hint,          // SJS boosts Japanese
    Language language_hint,     // ITALIAN boosts it
    Language* language3,
    int* percent3,
    int* text_bytes,
    bool* is_reliable) {
  CLDHints caller_hints = {NULL, tld_hint, encoding_hint, language_hint};
  double unused_scores[3];

  const Language detected = DetectLanguageSummaryV2(
      buffer,
      buffer_length,
      is_plain_text,
      &caller_hints,
      false,             // allow_extended_lang
      0,                 // flags
      UNKNOWN_LANGUAGE,  // plus_one
      language3,
      percent3,
      unused_scores,
      NULL,              // no result chunk vector
      text_bytes,
      is_reliable);

  // Default to English
  return (detected == UNKNOWN_LANGUAGE) ? ENGLISH : detected;
}
// Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
// languages.
// Extended languages are additional Google interface languages and Unicode
// single-language scripts, from ext_lang_enc.h
// Detect the top three languages of buffer with extended languages enabled
// and no hints.
//   language3, percent3  out: arrays filled by the detector for the top 3
//   text_bytes           out: set by the detector
//   is_reliable          out: reliability flag set by the detector
// Runs with flags 0; normalized scores are computed into a local array and
// discarded. The detector's result is returned as-is (no English default).
Language ExtDetectLanguageSummary(
    const char* buffer,
    int buffer_length,
    bool is_plain_text,
    Language* language3,
    int* percent3,
    int* text_bytes,
    bool* is_reliable) {
  // No content-language, TLD, encoding or language hint.
  CLDHints no_hints = {NULL, "", UNKNOWN_ENCODING, UNKNOWN_LANGUAGE};
  double unused_scores[3];

  // Do not default to English
  return DetectLanguageSummaryV2(
      buffer,
      buffer_length,
      is_plain_text,
      &no_hints,
      true,              // allow_extended_lang
      0,                 // flags
      UNKNOWN_LANGUAGE,  // plus_one
      language3,
      percent3,
      unused_scores,
      NULL,              // no result chunk vector
      text_bytes,
      is_reliable);
}
// Same as above, with hints supplied
// Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
// languages.
// Extended languages are additional Google interface languages and Unicode
// single-language scripts, from ext_lang_enc.h
// Detect the top three languages of buffer with extended languages enabled
// and caller-supplied hints.
//   tld_hint, encoding_hint, language_hint  packed into a CLDHints whose
//                                           first field is left NULL
//   language3, percent3  out: arrays filled by the detector for the top 3
//   text_bytes           out: set by the detector
//   is_reliable          out: reliability flag set by the detector
// Runs with flags 0; normalized scores are computed into a local array and
// discarded. The detector's result is returned as-is (no English default).
Language ExtDetectLanguageSummary(
    const char* buffer,
    int buffer_length,
    bool is_plain_text,
    const char* tld_hint,       // "id" boosts Indonesian
    int encoding_hint,          // SJS boosts Japanese
    Language language_hint,     // ITALIAN boosts it
    Language* language3,
    int* percent3,
    int* text_bytes,
    bool* is_reliable) {
  CLDHints caller_hints = {NULL, tld_hint, encoding_hint, language_hint};
  double unused_scores[3];

  // Do not default to English
  return DetectLanguageSummaryV2(
      buffer,
      buffer_length,
      is_plain_text,
      &caller_hints,
      true,              // allow_extended_lang
      0,                 // flags
      UNKNOWN_LANGUAGE,  // plus_one
      language3,
      percent3,
      unused_scores,
      NULL,              // no result chunk vector
      text_bytes,
      is_reliable);
}
// Same as above, and also returns internal language scores as a ratio to
// normal score for real text in that language. Scores close to 1.0 indicate
// normal text, while scores far away from 1.0 indicate badly-skewed text or
// gibberish
//
// Detect the top three languages of buffer with extended languages enabled
// and caller-supplied hints, also exposing the normalized scores.
//   tld_hint, encoding_hint, language_hint  packed into a CLDHints whose
//                                           first field is left NULL
//   language3, percent3, normalized_score3  out: arrays filled by the
//                                           detector for the top 3
//   text_bytes           out: set by the detector
//   is_reliable          out: reliability flag set by the detector
// Runs with flags 0. The detector's result is returned as-is (no English
// default).
Language ExtDetectLanguageSummary(
    const char* buffer,
    int buffer_length,
    bool is_plain_text,
    const char* tld_hint,       // "id" boosts Indonesian
    int encoding_hint,          // SJS boosts Japanese
    Language language_hint,     // ITALIAN boosts it
    Language* language3,
    int* percent3,
    double* normalized_score3,
    int* text_bytes,
    bool* is_reliable) {
  CLDHints caller_hints = {NULL, tld_hint, encoding_hint, language_hint};

  // Do not default to English
  return DetectLanguageSummaryV2(
      buffer,
      buffer_length,
      is_plain_text,
      &caller_hints,
      true,              // allow_extended_lang
      0,                 // flags
      UNKNOWN_LANGUAGE,  // plus_one
      language3,
      percent3,
      normalized_score3,
      NULL,              // no result chunk vector
      text_bytes,
      is_reliable);
}
// Use this one.
// Hints are collected into a struct.
// Flags are passed in (normally zero).
//
// Also returns 3 internal language scores as a ratio to
// normal score for real text in that language. Scores close to 1.0 indicate
// normal text, while scores far away from 1.0 indicate badly-skewed text or
// gibberish
//
// Returns a vector of chunks in different languages, so that caller may
// spell-check, translate, or otherwise process different parts of the input
// buffer in language-dependent ways.
//
// Most general entry point: extended languages enabled, with hints, flags
// and the result chunk vector all supplied by the caller.
//   cld_hints            hints struct, forwarded unchanged
//   flags                forwarded unchanged
//   language3, percent3, normalized_score3  out: arrays filled by the
//                                           detector for the top 3
//   resultchunkvector    forwarded unchanged to the detector
//   text_bytes           out: set by the detector
//   is_reliable          out: reliability flag set by the detector
// The detector's result is returned as-is (no English default).
Language ExtDetectLanguageSummary(
    const char* buffer,
    int buffer_length,
    bool is_plain_text,
    const CLDHints* cld_hints,
    int flags,
    Language* language3,
    int* percent3,
    double* normalized_score3,
    ResultChunkVector* resultchunkvector,
    int* text_bytes,
    bool* is_reliable) {
  // Do not default to English
  return DetectLanguageSummaryV2(
      buffer,
      buffer_length,
      is_plain_text,
      cld_hints,
      true,              // allow_extended_lang
      flags,
      UNKNOWN_LANGUAGE,  // plus_one
      language3,
      percent3,
      normalized_score3,
      resultchunkvector,
      text_bytes,
      is_reliable);
}
} // End namespace CLD2

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,95 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Author: dsites@google.com (Dick Sites)
//
#ifndef I18N_ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_HINT_CODE_H__
#define I18N_ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_HINT_CODE_H__
#include <string>
#include "integral_types.h"
#include "lang_script.h"
#include "../public/encodings.h"
namespace CLD2 {
// Packed <Language, weight>, weight in [-32..31] (powers of 2**1.6 ~=3.03)
// Full language in bottom 10 bits, weight in top 6 bits
// One packed prior: language in bits 0..9, signed weight in bits 10..15.
// Decode with GetCLDPriorLang() and GetCLDPriorWeight().
typedef int16 OneCLDLangPrior;
// Capacity of the prior[] array in CLDLangPriors.
const int kMaxOneCLDLangPrior = 14;
// A small fixed-capacity list of packed priors.
typedef struct {
int32 n;  // entry count; InitCLDLangPriors() sets it to 0
OneCLDLangPrior prior[kMaxOneCLDLangPrior];  // packed <Language, weight> entries
} CLDLangPriors;
// Reading exposed here; setting hidden in .cc
// Extract the weight from a packed prior: the value is shifted right by 10
// bits, dropping the language field. For negative weights this relies on
// right shift of a negative int preserving the sign.
inline int GetCLDPriorWeight(OneCLDLangPrior olp) {
  const int packed = olp;
  return packed >> 10;
}
// Extract the language from a packed prior: the bottom 10 bits, cast to
// Language.
inline Language GetCLDPriorLang(OneCLDLangPrior olp) {
  const int language_bits = olp & 0x3ff;
  return static_cast<Language>(language_bits);
}
// Return the stored entry count of *lps. lps must not be NULL.
inline int32 GetCLDLangPriorCount(CLDLangPriors* lps) {
  const int32 count = lps->n;
  return count;
}
// Reset *lps to an empty list by zeroing its count; the prior[] slots are
// left untouched. lps must not be NULL.
inline void InitCLDLangPriors(CLDLangPriors* lps) {
  (*lps).n = 0;
}
// NOTE(review): the routines below are only declared here; their bodies are
// in the .cc, so the notes added to them are hedged readings of the
// signatures.
// Trim language priors to no more than max_entries, keeping largest abs weights
// lps is presumably modified in place (non-const pointer) -- confirm
void TrimCLDLangPriors(int max_entries, CLDLangPriors* lps);
// Trim language tag string to canonical form for each language
// Input is from GetLangTagsFromHtml(), already lowercased
std::string TrimCLDLangTagsHint(const std::string& langtags);
// Add hints to vector of langpriors
// Input is from GetLangTagsFromHtml(), already lowercased
void SetCLDLangTagsHint(const std::string& langtags, CLDLangPriors* langpriors);
// Add hints to vector of langpriors
// Input is from HTTP content-language
void SetCLDContentLangHint(const char* contentlang, CLDLangPriors* langpriors);
// Add hints to vector of langpriors
// Input is from GetTLD(), already lowercased
void SetCLDTLDHint(const char* tld, CLDLangPriors* langpriors);
// Add hints to vector of langpriors
// Input is from DetectEncoding()
void SetCLDEncodingHint(Encoding enc, CLDLangPriors* langpriors);
// Add hints to vector of langpriors
// Input is from random source
void SetCLDLanguageHint(Language lang, CLDLangPriors* langpriors);
// Make printable string of priors
std::string DumpCLDLangPriors(const CLDLangPriors* langpriors);
// Get language tag hints from HTML body
// Normalize: remove spaces and make lowercase comma list
// max_scan_bytes presumably caps how much of utf8_body is examined -- confirm
std::string GetLangTagsFromHtml(const char* utf8_body, int32 utf8_body_len,
int32 max_scan_bytes);
} // End namespace CLD2
#endif // I18N_ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_HINT_CODE_H__

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,183 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Author: dsites@google.com (Dick Sites)
//
#ifndef I18N_ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_IMPL_H_
#define I18N_ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_IMPL_H_
#include <vector>
#include "../public/compact_lang_det.h" // For CLDHints, ResultChunkVector
#include "integral_types.h"
#include "lang_script.h"
namespace CLD2 {
// Internal use flags: one distinct bit each, filling the low byte
// (0x01..0x80). Values at 0x0100 and above belong to the public flags.
static const int kCLDFlagFinish = 0x01;    // Do not recurse further
static const int kCLDFlagSqueeze = 0x02;   // Cheap-squeeze each text run
static const int kCLDFlagRepeats = 0x04;   // Skip predicted (repeated) grams
static const int kCLDFlagTop40 = 0x08;     // Restrict scoring to "Top 40"
static const int kCLDFlagShort = 0x10;     // Deprecated, unused
static const int kCLDFlagHint = 0x20;      // Experimental plus_one hint
static const int kCLDFlagUseWords = 0x40;  // Also score complete words
static const int kCLDFlagUNUSED = 0x80;
// Public use flags, debug output controls, defined in compact_lang_det.h
// 0x0100 and above
/***
Flag meanings:
Flags are used in the context of a recursive call from Detect to itself,
trying to deal in a more restrictive way with input that was not reliably
identified in the top-level call.
Finish -- Do not further recurse; return whatever result ensues, even if it is
unreliable. Typically set in any recursive call to take a second try
on unreliable text.
Squeeze -- For each text run, do an inplace cheapsqueeze to remove chunks of
highly repetitive text and chunks of text with too many 1- and
2-letter words. This avoids scoring repetitive or useless non-text
crap in large files such as bogus JPEGs within an HTML file.
Repeats -- When scoring a text run, do a cheap prediction of each character
and do not score a unigram/quadgram if the last character of same is
correctly predicted. This is a slower, finer-grained form of
cheapsqueeze, typically used when the first pass got unreliable
results.
Top40 -- Restrict the set of scored languages to the Google "Top 40", which is
actually 38 languages. This gets rid of about 110 languages that
represent about 0.7% of the web. Typically used when the first pass
got unreliable results.
Short -- DEPRECATED, unused
Hint -- EXPERIMENTAL flag for compact_lang_det_test.cc to indicate a language
hint supplied in parameter plus_one.
UseWords -- In addition to scoring quad/uni/nil-grams, score complete words
Tentative decision logic:
In the middle of first pass -- After 4KB of text, look at the front 256 bytes
of every full 4KB buffer. If it compresses very well (say 3:1) or has
lots of spaces (say 1 of every 4 bytes), assume that the input is
large and contains lots of bogus non-text. Recurse, passing the
Squeeze flag to strip out chunks of this non-text.
At the end of the first pass --
If the top language is reliable and >= 70% of the document, return.
Else if the top language is reliable and top+2nd >= say 94%, return.
Else, either the top language is not reliable or there is a lot of
other crap.
***/
// Scan interchange-valid UTF-8 bytes and detect most likely language,
// or set of languages.
//
// Design goals:
// Skip over big stretches of HTML tags
// Able to return ranges of different languages
// Relatively small tables and relatively fast processing
// Thread safe
//
// Pairs an integer count with a pointer to read-only Language values.
// NOTE(review): presumably perscript_count is the number of entries that
// perscript_lang points at -- confirm against the table definitions.
typedef struct {
int perscript_count;
const Language* perscript_lang;
} PerScriptPair;
// Bundles the quadgram hashing parameters together with pointers to the
// tables they are used with. Every member is const and the struct declares no
// constructor, so an instance has to be fully initialized where it is defined
// and cannot be assigned to afterwards.
typedef struct {
// Constants for hashing 4-7 byte quadgram to 32 bits
const int kQuadHashB4Shift;
const int kQuadHashB4bShift;
const int kQuadHashB5Shift;
const int kQuadHashB5bShift;
// Constants for hashing 32 bits to kQuadKeyTable subscript/key
const int kHashvalToSubShift;
const uint32 kHashvalToSubMask;
const int kHashvalToKeyShift;
const uint32 kHashvalToKeyMask;
const int kHashvalAssociativity;
// Pointers to the actual tables (read-only; not owned by this struct as far
// as this header shows -- NOTE(review): confirm lifetime with the definitions)
const PerScriptPair* kPerScriptPair;
const uint16* kQuadKeyTable;
const uint32* kQuadValueTable;
} LangDetObj;
// For HTML documents, tags are skipped, along with <script> ... </script>
// and <style> ... </style> sequences, and entities are expanded.
//
// We distinguish between bytes of the raw input buffer and bytes of non-tag
// text letters. Since tags can be over 50% of the bytes of an HTML Page,
// and are nearly all seven-bit ASCII English, we prefer to distinguish
// language mixture fractions based on just the non-tag text.
//
// Inputs: buffer and buffer_length (the input text and its length)
// is_plain_text if true says to NOT parse/skip HTML tags nor entities
// Outputs:
// language3 is an array of the top 3 languages or UNKNOWN_LANGUAGE
// percent3 is an array of the text percentages 0..100 of the top 3 languages
// normalized_score3 is an array of internal scores, normalized to the
// average score for each language over a body of training text. A
// normalized score significantly away from 1.0 indicates very skewed text
// or gibberish.
//
// text_bytes is the amount of non-tag/letters-only text found
// is_reliable set true if the returned Language is at least 2**30 times more
// probable than the second-best Language
//
// Return value: the most likely Language for the majority of the input text
// Length 0 input and text with no reliable letter sequences returns
// UNKNOWN_LANGUAGE
//
// Subsetting: For fast detection over large documents, these routines will
// only scan up to a fixed limit (currently 160KB of non-tag letters).
//
Language DetectLanguageSummaryV2(
const char* buffer,
int buffer_length,
bool is_plain_text,
const CLDHints* cld_hints,
bool allow_extended_lang,
int flags,
Language plus_one,
Language* language3,
int* percent3,
double* normalized_score3,
ResultChunkVector* resultchunkvector,
int* text_bytes,
bool* is_reliable);
// For unit testing:
// Remove portions of text that have a high density of spaces, or that are
// overly repetitive, squeezing the remaining text in-place to the front
// of the input buffer.
// Return the new, possibly-shorter length
int CheapSqueezeInplace(char* isrc, int srclen, int ichunksize);
} // End namespace CLD2
#endif // I18N_ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_IMPL_H_

View File

@ -0,0 +1,58 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Author: dsites@google.com (Dick Sites)
//
// Produces debugging output for CLD2. See debug_empty.cc for suppressing this.
#ifndef I18N_ENCODINGS_CLD2_INTERNAL_DEBUG_H_
#define I18N_ENCODINGS_CLD2_INTERNAL_DEBUG_H_
#include <string>
#include "scoreonescriptspan.h"
namespace CLD2 {
// For showing one chunk
void CLD2_Debug(const char* text,
int lo_offset,
int hi_offset,
bool more_to_come, bool score_cjk,
const ScoringHitBuffer* hitbuffer,
const ScoringContext* scoringcontext,
const ChunkSpan* cspan,
const ChunkSummary* chunksummary);
// For showing all chunks
void CLD2_Debug2(const char* text,
bool more_to_come, bool score_cjk,
const ScoringHitBuffer* hitbuffer,
const ScoringContext* scoringcontext,
const SummaryBuffer* summarybuffer);
std::string GetPlainEscapedText(const std::string& txt);
std::string GetHtmlEscapedText(const std::string& txt);
std::string GetColorHtmlEscapedText(Language lang, const std::string& txt);
std::string GetLangColorHtmlEscapedText(Language lang, const std::string& txt);
void DumpResultChunkVector(FILE* f, const char* src,
ResultChunkVector* resultchunkvector);
} // End namespace CLD2
#endif // I18N_ENCODINGS_CLD2_INTERNAL_DEBUG_H_

View File

@ -0,0 +1,64 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Author: dsites@google.com (Dick Sites)
// Compile this in instead of debug.cc to remove code for debug output
//
#include "debug.h"
#include <string>
using namespace std;
namespace CLD2 {
// Stub for builds without debug output: ignores txt and returns "".
std::string GetPlainEscapedText(const std::string& /*txt*/) {
  return std::string();
}
// Stub for builds without debug output: ignores txt and returns "".
std::string GetHtmlEscapedText(const std::string& /*txt*/) {
  return std::string();
}
// Stub for builds without debug output: ignores both arguments and
// returns "".
std::string GetColorHtmlEscapedText(Language /*lang*/,
                                    const std::string& /*txt*/) {
  return std::string();
}
// Stub for builds without debug output: ignores both arguments and
// returns "".
std::string GetLangColorHtmlEscapedText(Language /*lang*/,
                                        const std::string& /*txt*/) {
  return std::string();
}
// Per-chunk debug hook (one scored chunk, optionally with per-chunk scoring
// information). In this variant, which is compiled in place of debug.cc, the
// hook accepts its arguments and produces no output.
// In degenerate cases, hitbuffer and cspan can be NULL.
void CLD2_Debug(const char* /*text*/,
                int /*lo_offset*/,
                int /*hi_offset*/,
                bool /*more_to_come*/,
                bool /*score_cjk*/,
                const ScoringHitBuffer* /*hitbuffer*/,
                const ScoringContext* /*scoringcontext*/,
                const ChunkSpan* /*cspan*/,
                const ChunkSummary* /*chunksummary*/) {
  // Intentionally empty.
}
// All-chunks debug hook. In this variant, which is compiled in place of
// debug.cc, the hook accepts its arguments and produces no output.
void CLD2_Debug2(const char* /*text*/,
                 bool /*more_to_come*/,
                 bool /*score_cjk*/,
                 const ScoringHitBuffer* /*hitbuffer*/,
                 const ScoringContext* /*scoringcontext*/,
                 const SummaryBuffer* /*summarybuffer*/) {
  // Intentionally empty.
}
// Result-chunk dump hook. In this variant nothing is written to f; all
// arguments are ignored.
void DumpResultChunkVector(FILE* /*f*/,
                           const char* /*src*/,
                           ResultChunkVector* /*resultchunkvector*/) {
  // Intentionally empty.
}
} // End namespace CLD2

View File

@ -0,0 +1,54 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Routine that maps a Unicode code point to an interchange-valid one
//
#include "fixunicodevalue.h"
#include "integral_types.h"
namespace CLD2 {
// Maps any code point to one that is interchange valid:
//   00-FF           looked up in kMapFullMicrosoft1252OrSpace
//                   (spaces or MS CP1252)
//   D800-DFFF       surrogates                -> U+FFFD
//   FDD0-FDEF       non-characters            -> U+FFFD
//   xxFFFE-xxFFFF   non-characters            -> U+FFFD
//   negative or above 10FFFF                  -> U+FFFD
// Everything else is returned unchanged.
char32 FixUnicodeValue(char32 uv) {
  // Compare on the unsigned bit pattern so that negative inputs look like
  // very large values and fall through to the final replacement.
  const uint32 code = static_cast<uint32>(uv);

  // First 256 values: table lookup.
  if (code < 0x0100) {
    return kMapFullMicrosoft1252OrSpace[code];
  }

  // Everything else below the surrogate range passes through.
  if (code < 0xD800) {
    return uv;
  }

  // Non-characters FDD0-FDEF, tested as two aligned 16-value rows.
  const uint32 row = code & ~0x0F;
  const bool in_fdd0_fdef = (row == 0xFDD0) || (row == 0xFDE0);

  // Non-characters xxFFFE / xxFFFF at the end of every plane.
  const bool ends_plane = ((code & 0x00FFFE) == 0xFFFE);

  if (in_fdd0_fdef || ends_plane) {
    return 0xFFFD;
  }

  // Remaining valid range above the surrogates.
  if ((code >= 0xE000) && (code <= 0x10FFFF)) {
    return uv;
  }

  // surrogates and negative and > 0x10FFFF all land here
  return 0xFFFD;
}
} // End namespace CLD2

View File

@ -0,0 +1,68 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Routine that maps a Unicode code point to an interchange-valid one
//
// Table that maps MS CP1252 bytes 00-FF to their corresponding Unicode
// code points. C0 and C1 control codes that are not interchange-valid
// are mapped to spaces.
#ifndef I18N_ENCODINGS_CLD2_INTERNAL_FIXUNICODEVALUE_H__
#define I18N_ENCODINGS_CLD2_INTERNAL_FIXUNICODEVALUE_H__
#include "integral_types.h" // for char32
#include "port.h"
namespace CLD2 {
// Map byte value 0000-00FF to char32
// Maps C0 control codes (other than CR LF HT FF) to space [29 instances including DEL=0x7F]
// Maps C1 control codes to CP1252 [27 instances] or space [5 instances]
// The table is indexed directly by the input value; the trailing comment on
// each row gives the hex index of that row's first entry. Rows hold 16
// entries, except the 80-9F rows, which hold 8 each.
static const char32 kMapFullMicrosoft1252OrSpace[256] = {
0x20,0x20,0x20,0x20, 0x20,0x20,0x20,0x20, 0x20,0x09,0x0a,0x20, 0x0c,0x0d,0x20,0x20, // 00
0x20,0x20,0x20,0x20, 0x20,0x20,0x20,0x20, 0x20,0x20,0x20,0x20, 0x20,0x20,0x20,0x20, // 10
0x20,0x21,0x22,0x23, 0x24,0x25,0x26,0x27, 0x28,0x29,0x2a,0x2b, 0x2c,0x2d,0x2e,0x2f, // 20
0x30,0x31,0x32,0x33, 0x34,0x35,0x36,0x37, 0x38,0x39,0x3a,0x3b, 0x3c,0x3d,0x3e,0x3f, // 30
0x40,0x41,0x42,0x43, 0x44,0x45,0x46,0x47, 0x48,0x49,0x4a,0x4b, 0x4c,0x4d,0x4e,0x4f, // 40
0x50,0x51,0x52,0x53, 0x54,0x55,0x56,0x57, 0x58,0x59,0x5a,0x5b, 0x5c,0x5d,0x5e,0x5f, // 50
0x60,0x61,0x62,0x63, 0x64,0x65,0x66,0x67, 0x68,0x69,0x6a,0x6b, 0x6c,0x6d,0x6e,0x6f, // 60
0x70,0x71,0x72,0x73, 0x74,0x75,0x76,0x77, 0x78,0x79,0x7a,0x7b, 0x7c,0x7d,0x7e,0x20, // 70 (DEL -> space)
0x20ac,0x20,0x201a,0x0192, 0x201e,0x2026,0x2020,0x2021, // 80
0x02c6,0x2030,0x0160,0x2039, 0x0152,0x20,0x017d,0x20, // 88
0x20,0x2018,0x2019,0x201c, 0x201d,0x2022,0x2013,0x2014, // 90
0x02dc,0x2122,0x0161,0x203a, 0x0153,0x20,0x017e,0x0178, // 98
0xa0,0xa1,0xa2,0xa3, 0xa4,0xa5,0xa6,0xa7, 0xa8,0xa9,0xaa,0xab, 0xac,0xad,0xae,0xaf, // A0
0xb0,0xb1,0xb2,0xb3, 0xb4,0xb5,0xb6,0xb7, 0xb8,0xb9,0xba,0xbb, 0xbc,0xbd,0xbe,0xbf, // B0
0xc0,0xc1,0xc2,0xc3, 0xc4,0xc5,0xc6,0xc7, 0xc8,0xc9,0xca,0xcb, 0xcc,0xcd,0xce,0xcf, // C0
0xd0,0xd1,0xd2,0xd3, 0xd4,0xd5,0xd6,0xd7, 0xd8,0xd9,0xda,0xdb, 0xdc,0xdd,0xde,0xdf, // D0
0xe0,0xe1,0xe2,0xe3, 0xe4,0xe5,0xe6,0xe7, 0xe8,0xe9,0xea,0xeb, 0xec,0xed,0xee,0xef, // E0
0xf0,0xf1,0xf2,0xf3, 0xf4,0xf5,0xf6,0xf7, 0xf8,0xf9,0xfa,0xfb, 0xfc,0xfd,0xfe,0xff, // F0
};
// Guarantees that the resulting output value is interchange valid
// 00-FF; map to spaces or MS CP1252
// D800-DFFF; surrogates
// FDD0-FDEF; non-characters
// xxFFFE-xxFFFF; non-characters
char32 FixUnicodeValue(char32 uv);
} // End namespace CLD2
#endif // I18N_ENCODINGS_CLD2_INTERNAL_FIXUNICODEVALUE_H__

View File

@ -0,0 +1,52 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Degenerate CLD2 scoring lookup table, for use as placeholder
//
#include "cld2tablesummary.h"
namespace CLD2 {
// Placeholder ("degenerate") table set: a single all-zero bucket and a single
// zero indirect entry, packaged into kDistinctBiTable_obj below. The pieces
// are file-local (static); only the summary object has external linkage.
static const uint32 kDistinctBiTableBuildDate = 20130101; // yyyymmdd
static const uint32 kDistinctBiTableSize = 1; // Total Bucket count
static const uint32 kDistinctBiTableKeyMask = 0xffffffff; // Mask hash key
// No language-script pairs are listed for this placeholder.
static const char* const kDistinctBiTableRecognizedLangScripts = "";
// Empty table
static const IndirectProbBucket4 kDistinctBiTable[kDistinctBiTableSize] = {
// key[4], words[4] in UTF-8
// value[4]
{ {0x00000000,0x00000000,0x00000000,0x00000000}}, // [000]
};
static const uint32 kDistinctBiTableSizeOne = 1; // One-langprob count
static const uint32 kDistinctBiTableIndSize = 1; // Largest subscript
static const uint32 kDistinctBiTableInd[kDistinctBiTableIndSize] = {
// [0000]
0x00000000, };
// Summary object tying the pieces above together. Declared extern so that
// this const object is visible to other translation units.
// NOTE(review): the initializer order must match the member order of
// CLD2TableSummary in cld2tablesummary.h -- not visible here, confirm there.
extern const CLD2TableSummary kDistinctBiTable_obj = {
kDistinctBiTable,
kDistinctBiTableInd,
kDistinctBiTableSizeOne,
kDistinctBiTableSize,
kDistinctBiTableKeyMask,
kDistinctBiTableBuildDate,
kDistinctBiTableRecognizedLangScripts,
};
} // End namespace CLD2
// End of generated tables

View File

@ -0,0 +1,294 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// generated_entities.cc
// Machine generated. Do Not Edit.
//
// Declarations for HTML entities recognized by CLD2
//
#include "generated_ulscript.h" // for CharIntPair
namespace CLD2 {
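// Sorted in ascending byte-wise (strcmp) order -- uppercase names precede
// lowercase ones, e.g. "AMP" < "Aacute" and "dArr" < "dagger" -- for binary
// search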
extern const int kNameToEntitySize = 265;
extern const CharIntPair kNameToEntity[kNameToEntitySize] = {
{"AElig", 198},
{"AMP", 38},
{"Aacute", 193},
{"Acirc", 194},
{"Agrave", 192},
{"Alpha", 913},
{"Aring", 197},
{"Atilde", 195},
{"Auml", 196},
{"Beta", 914},
{"Ccaron", 268},
{"Ccedil", 199},
{"Chi", 935},
{"Dagger", 8225},
{"Delta", 916},
{"ETH", 208},
{"Eacute", 201},
{"Ecaron", 282},
{"Ecirc", 202},
{"Egrave", 200},
{"Epsilon", 917},
{"Eta", 919},
{"Euml", 203},
{"GT", 62},
{"Gamma", 915},
{"Iacute", 205},
{"Icirc", 206},
{"Igrave", 204},
{"Iota", 921},
{"Iuml", 207},
{"Kappa", 922},
{"LT", 60},
{"Lambda", 923},
{"Mu", 924},
{"Ntilde", 209},
{"Nu", 925},
{"OElig", 338},
{"Oacute", 211},
{"Ocirc", 212},
{"Ograve", 210},
{"Omega", 937},
{"Omicron", 927},
{"Oslash", 216},
{"Otilde", 213},
{"Ouml", 214},
{"Phi", 934},
{"Pi", 928},
{"Prime", 8243},
{"Psi", 936},
{"QUOT", 34},
{"Rcaron", 344},
{"Rho", 929},
{"Scaron", 352},
{"Sigma", 931},
{"THORN", 222},
{"Tau", 932},
{"Theta", 920},
{"Uacute", 218},
{"Ucirc", 219},
{"Ugrave", 217},
{"Upsilon", 933},
{"Uuml", 220},
{"Xi", 926},
{"Yacute", 221},
{"Yuml", 376},
{"Zeta", 918},
{"aacute", 225},
{"acirc", 226},
{"acute", 180},
{"aelig", 230},
{"agrave", 224},
{"alefsym", 8501},
{"alpha", 945},
{"amp", 38},
{"and", 8743},
{"ang", 8736},
{"apos", 39},
{"aring", 229},
{"asymp", 8776},
{"atilde", 227},
{"auml", 228},
{"bdquo", 8222},
{"beta", 946},
{"brvbar", 166},
{"bull", 8226},
{"cap", 8745},
{"ccaron", 269},
{"ccedil", 231},
{"cedil", 184},
{"cent", 162},
{"chi", 967},
{"circ", 710},
{"clubs", 9827},
{"cong", 8773},
{"copy", 169},
{"crarr", 8629},
{"cup", 8746},
{"curren", 164},
{"dArr", 8659},
{"dagger", 8224},
{"darr", 8595},
{"deg", 176},
{"delta", 948},
{"diams", 9830},
{"divide", 247},
{"eacute", 233},
{"ecaron", 283},
{"ecirc", 234},
{"egrave", 232},
{"emdash", 8212},
{"empty", 8709},
{"emsp", 8195},
{"endash", 8211},
{"ensp", 8194},
{"epsilon", 949},
{"equiv", 8801},
{"eta", 951},
{"eth", 240},
{"euml", 235},
{"euro", 8364},
{"exist", 8707},
{"fnof", 402},
{"forall", 8704},
{"frac12", 189},
{"frac14", 188},
{"frac34", 190},
{"frasl", 8260},
{"gamma", 947},
{"ge", 8805},
{"gt", 62},
{"hArr", 8660},
{"harr", 8596},
{"hearts", 9829},
{"hellip", 8230},
{"iacute", 237},
{"icirc", 238},
{"iexcl", 161},
{"igrave", 236},
{"image", 8465},
{"infin", 8734},
{"int", 8747},
{"iota", 953},
{"iquest", 191},
{"isin", 8712},
{"iuml", 239},
{"kappa", 954},
{"lArr", 8656},
{"lambda", 955},
{"lang", 9001},
{"laquo", 171},
{"larr", 8592},
{"lceil", 8968},
{"ldquo", 8220},
{"le", 8804},
{"lfloor", 8970},
{"lowast", 8727},
{"loz", 9674},
{"lrm", 8206},
{"lsaquo", 8249},
{"lsquo", 8216},
{"lt", 60},
{"macr", 175},
{"mdash", 8212},
{"micro", 181},
{"middot", 183},
{"minus", 8722},
{"mu", 956},
{"nabla", 8711},
{"nbsp", 160},
{"ndash", 8211},
{"ne", 8800},
{"ni", 8715},
{"not", 172},
{"notin", 8713},
{"nsub", 8836},
{"ntilde", 241},
{"nu", 957},
{"oacute", 243},
{"ocirc", 244},
{"oelig", 339},
{"ograve", 242},
{"oline", 8254},
{"omega", 969},
{"omicron", 959},
{"oplus", 8853},
{"or", 8744},
{"ordf", 170},
{"ordm", 186},
{"oslash", 248},
{"otilde", 245},
{"otimes", 8855},
{"ouml", 246},
{"para", 182},
{"part", 8706},
{"permil", 8240},
{"perp", 8869},
{"phi", 966},
{"pi", 960},
{"piv", 982},
{"plusmn", 177},
{"pound", 163},
{"prime", 8242},
{"prod", 8719},
{"prop", 8733},
{"psi", 968},
{"quot", 34},
{"rArr", 8658},
{"radic", 8730},
{"rang", 9002},
{"raquo", 187},
{"rarr", 8594},
{"rcaron", 345},
{"rceil", 8969},
{"rdquo", 8221},
{"real", 8476},
{"reg", 174},
{"rfloor", 8971},
{"rho", 961},
{"rlm", 8207},
{"rsaquo", 8250},
{"rsquo", 8217},
{"sbquo", 8218},
{"scaron", 353},
{"sdot", 8901},
{"sect", 167},
{"shy", 173},
{"sigma", 963},
{"sigmaf", 962},
{"sim", 8764},
{"spades", 9824},
{"sub", 8834},
{"sube", 8838},
{"sum", 8721},
{"sup", 8835},
{"sup1", 185},
{"sup2", 178},
{"sup3", 179},
{"supe", 8839},
{"szlig", 223},
{"tau", 964},
{"there4", 8756},
{"theta", 952},
{"thetasym", 977},
{"thinsp", 8201},
{"thorn", 254},
{"tilde", 732},
{"times", 215},
{"trade", 8482},
{"uArr", 8657},
{"uacute", 250},
{"uarr", 8593},
{"ucirc", 251},
{"ugrave", 249},
{"uml", 168},
{"upsih", 978},
{"upsilon", 965},
{"uuml", 252},
{"weierp", 8472},
{"xi", 958},
{"yacute", 253},
{"yen", 165},
{"yuml", 255},
{"zeta", 950},
{"zwj", 8205},
{"zwnj", 8204},
};
} // namespace CLD2

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,651 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// generated_language.h
// Machine generated. Do Not Edit.
//
// Declarations for languages recognized by CLD2
//
#ifndef I18N_ENCODINGS_CLD2_INTERNAL_GENERATED_LANGUAGE_H__
#define I18N_ENCODINGS_CLD2_INTERNAL_GENERATED_LANGUAGE_H__
#include "generated_ulscript.h"
#include "integral_types.h"
namespace CLD2 {
typedef uint16 FourScripts[4];
typedef enum {
ENGLISH = 0, // en
DANISH = 1, // da
DUTCH = 2, // nl
FINNISH = 3, // fi
FRENCH = 4, // fr
GERMAN = 5, // de
HEBREW = 6, // iw
ITALIAN = 7, // it
JAPANESE = 8, // ja
KOREAN = 9, // ko
NORWEGIAN = 10, // no
POLISH = 11, // pl
PORTUGUESE = 12, // pt
RUSSIAN = 13, // ru
SPANISH = 14, // es
SWEDISH = 15, // sv
CHINESE = 16, // zh
CZECH = 17, // cs
GREEK = 18, // el
ICELANDIC = 19, // is
LATVIAN = 20, // lv
LITHUANIAN = 21, // lt
ROMANIAN = 22, // ro
HUNGARIAN = 23, // hu
ESTONIAN = 24, // et
TG_UNKNOWN_LANGUAGE = 25, // xxx
UNKNOWN_LANGUAGE = 26, // un
BULGARIAN = 27, // bg
CROATIAN = 28, // hr
SERBIAN = 29, // sr
IRISH = 30, // ga
GALICIAN = 31, // gl
TAGALOG = 32, // tl
TURKISH = 33, // tr
UKRAINIAN = 34, // uk
HINDI = 35, // hi
MACEDONIAN = 36, // mk
BENGALI = 37, // bn
INDONESIAN = 38, // id
LATIN = 39, // la
MALAY = 40, // ms
MALAYALAM = 41, // ml
WELSH = 42, // cy
NEPALI = 43, // ne
TELUGU = 44, // te
ALBANIAN = 45, // sq
TAMIL = 46, // ta
BELARUSIAN = 47, // be
JAVANESE = 48, // jw
OCCITAN = 49, // oc
URDU = 50, // ur
BIHARI = 51, // bh
GUJARATI = 52, // gu
THAI = 53, // th
ARABIC = 54, // ar
CATALAN = 55, // ca
ESPERANTO = 56, // eo
BASQUE = 57, // eu
INTERLINGUA = 58, // ia
KANNADA = 59, // kn
PUNJABI = 60, // pa
SCOTS_GAELIC = 61, // gd
SWAHILI = 62, // sw
SLOVENIAN = 63, // sl
MARATHI = 64, // mr
MALTESE = 65, // mt
VIETNAMESE = 66, // vi
FRISIAN = 67, // fy
SLOVAK = 68, // sk
CHINESE_T = 69, // zh-Hant
FAROESE = 70, // fo
SUNDANESE = 71, // su
UZBEK = 72, // uz
AMHARIC = 73, // am
AZERBAIJANI = 74, // az
GEORGIAN = 75, // ka
TIGRINYA = 76, // ti
PERSIAN = 77, // fa
BOSNIAN = 78, // bs
SINHALESE = 79, // si
NORWEGIAN_N = 80, // nn
X_81 = 81, //
X_82 = 82, //
XHOSA = 83, // xh
ZULU = 84, // zu
GUARANI = 85, // gn
SESOTHO = 86, // st
TURKMEN = 87, // tk
KYRGYZ = 88, // ky
BRETON = 89, // br
TWI = 90, // tw
YIDDISH = 91, // yi
X_92 = 92, //
SOMALI = 93, // so
UIGHUR = 94, // ug
KURDISH = 95, // ku
MONGOLIAN = 96, // mn
ARMENIAN = 97, // hy
LAOTHIAN = 98, // lo
SINDHI = 99, // sd
RHAETO_ROMANCE = 100, // rm
AFRIKAANS = 101, // af
LUXEMBOURGISH = 102, // lb
BURMESE = 103, // my
KHMER = 104, // km
TIBETAN = 105, // bo
DHIVEHI = 106, // dv
CHEROKEE = 107, // chr
SYRIAC = 108, // syr
LIMBU = 109, // lif
ORIYA = 110, // or
ASSAMESE = 111, // as
CORSICAN = 112, // co
INTERLINGUE = 113, // ie
KAZAKH = 114, // kk
LINGALA = 115, // ln
X_116 = 116, //
PASHTO = 117, // ps
QUECHUA = 118, // qu
SHONA = 119, // sn
TAJIK = 120, // tg
TATAR = 121, // tt
TONGA = 122, // to
YORUBA = 123, // yo
X_124 = 124, //
X_125 = 125, //
X_126 = 126, //
X_127 = 127, //
MAORI = 128, // mi
WOLOF = 129, // wo
ABKHAZIAN = 130, // ab
AFAR = 131, // aa
AYMARA = 132, // ay
BASHKIR = 133, // ba
BISLAMA = 134, // bi
DZONGKHA = 135, // dz
FIJIAN = 136, // fj
GREENLANDIC = 137, // kl
HAUSA = 138, // ha
HAITIAN_CREOLE = 139, // ht
INUPIAK = 140, // ik
INUKTITUT = 141, // iu
KASHMIRI = 142, // ks
KINYARWANDA = 143, // rw
MALAGASY = 144, // mg
NAURU = 145, // na
OROMO = 146, // om
RUNDI = 147, // rn
SAMOAN = 148, // sm
SANGO = 149, // sg
SANSKRIT = 150, // sa
SISWANT = 151, // ss
TSONGA = 152, // ts
TSWANA = 153, // tn
VOLAPUK = 154, // vo
ZHUANG = 155, // za
KHASI = 156, // kha
SCOTS = 157, // sco
GANDA = 158, // lg
MANX = 159, // gv
MONTENEGRIN = 160, // sr-ME
AKAN = 161, // ak
IGBO = 162, // ig
MAURITIAN_CREOLE = 163, // mfe
HAWAIIAN = 164, // haw
CEBUANO = 165, // ceb
EWE = 166, // ee
GA = 167, // gaa
HMONG = 168, // blu
KRIO = 169, // kri
LOZI = 170, // loz
LUBA_LULUA = 171, // lua
LUO_KENYA_AND_TANZANIA = 172, // luo
NEWARI = 173, // new
NYANJA = 174, // ny
OSSETIAN = 175, // os
PAMPANGA = 176, // pam
PEDI = 177, // nso
RAJASTHANI = 178, // raj
SESELWA = 179, // crs
TUMBUKA = 180, // tum
VENDA = 181, // ve
WARAY_PHILIPPINES = 182, // war
X_183 = 183, //
X_184 = 184, //
X_185 = 185, //
X_186 = 186, //
X_187 = 187, //
X_188 = 188, //
X_189 = 189, //
X_190 = 190, //
X_191 = 191, //
X_192 = 192, //
X_193 = 193, //
X_194 = 194, //
X_195 = 195, //
X_196 = 196, //
X_197 = 197, //
X_198 = 198, //
X_199 = 199, //
X_200 = 200, //
X_201 = 201, //
X_202 = 202, //
X_203 = 203, //
X_204 = 204, //
X_205 = 205, //
X_206 = 206, //
X_207 = 207, //
X_208 = 208, //
X_209 = 209, //
X_210 = 210, //
X_211 = 211, //
X_212 = 212, //
X_213 = 213, //
X_214 = 214, //
X_215 = 215, //
X_216 = 216, //
X_217 = 217, //
X_218 = 218, //
X_219 = 219, //
X_220 = 220, //
X_221 = 221, //
X_222 = 222, //
X_223 = 223, //
X_224 = 224, //
X_225 = 225, //
X_226 = 226, //
X_227 = 227, //
X_228 = 228, //
X_229 = 229, //
X_230 = 230, //
X_231 = 231, //
X_232 = 232, //
X_233 = 233, //
X_234 = 234, //
X_235 = 235, //
X_236 = 236, //
X_237 = 237, //
X_238 = 238, //
X_239 = 239, //
X_240 = 240, //
X_241 = 241, //
X_242 = 242, //
X_243 = 243, //
X_244 = 244, //
X_245 = 245, //
X_246 = 246, //
X_247 = 247, //
X_248 = 248, //
X_249 = 249, //
X_250 = 250, //
X_251 = 251, //
X_252 = 252, //
X_253 = 253, //
X_254 = 254, //
X_255 = 255, //
X_256 = 256, //
X_257 = 257, //
X_258 = 258, //
X_259 = 259, //
X_260 = 260, //
X_261 = 261, //
X_262 = 262, //
X_263 = 263, //
X_264 = 264, //
X_265 = 265, //
X_266 = 266, //
X_267 = 267, //
X_268 = 268, //
X_269 = 269, //
X_270 = 270, //
X_271 = 271, //
X_272 = 272, //
X_273 = 273, //
X_274 = 274, //
X_275 = 275, //
X_276 = 276, //
X_277 = 277, //
X_278 = 278, //
X_279 = 279, //
X_280 = 280, //
X_281 = 281, //
X_282 = 282, //
X_283 = 283, //
X_284 = 284, //
X_285 = 285, //
X_286 = 286, //
X_287 = 287, //
X_288 = 288, //
X_289 = 289, //
X_290 = 290, //
X_291 = 291, //
X_292 = 292, //
X_293 = 293, //
X_294 = 294, //
X_295 = 295, //
X_296 = 296, //
X_297 = 297, //
X_298 = 298, //
X_299 = 299, //
X_300 = 300, //
X_301 = 301, //
X_302 = 302, //
X_303 = 303, //
X_304 = 304, //
X_305 = 305, //
X_306 = 306, //
X_307 = 307, //
X_308 = 308, //
X_309 = 309, //
X_310 = 310, //
X_311 = 311, //
X_312 = 312, //
X_313 = 313, //
X_314 = 314, //
X_315 = 315, //
X_316 = 316, //
X_317 = 317, //
X_318 = 318, //
X_319 = 319, //
X_320 = 320, //
X_321 = 321, //
X_322 = 322, //
X_323 = 323, //
X_324 = 324, //
X_325 = 325, //
X_326 = 326, //
X_327 = 327, //
X_328 = 328, //
X_329 = 329, //
X_330 = 330, //
X_331 = 331, //
X_332 = 332, //
X_333 = 333, //
X_334 = 334, //
X_335 = 335, //
X_336 = 336, //
X_337 = 337, //
X_338 = 338, //
X_339 = 339, //
X_340 = 340, //
X_341 = 341, //
X_342 = 342, //
X_343 = 343, //
X_344 = 344, //
X_345 = 345, //
X_346 = 346, //
X_347 = 347, //
X_348 = 348, //
X_349 = 349, //
X_350 = 350, //
X_351 = 351, //
X_352 = 352, //
X_353 = 353, //
X_354 = 354, //
X_355 = 355, //
X_356 = 356, //
X_357 = 357, //
X_358 = 358, //
X_359 = 359, //
X_360 = 360, //
X_361 = 361, //
X_362 = 362, //
X_363 = 363, //
X_364 = 364, //
X_365 = 365, //
X_366 = 366, //
X_367 = 367, //
X_368 = 368, //
X_369 = 369, //
X_370 = 370, //
X_371 = 371, //
X_372 = 372, //
X_373 = 373, //
X_374 = 374, //
X_375 = 375, //
X_376 = 376, //
X_377 = 377, //
X_378 = 378, //
X_379 = 379, //
X_380 = 380, //
X_381 = 381, //
X_382 = 382, //
X_383 = 383, //
X_384 = 384, //
X_385 = 385, //
X_386 = 386, //
X_387 = 387, //
X_388 = 388, //
X_389 = 389, //
X_390 = 390, //
X_391 = 391, //
X_392 = 392, //
X_393 = 393, //
X_394 = 394, //
X_395 = 395, //
X_396 = 396, //
X_397 = 397, //
X_398 = 398, //
X_399 = 399, //
X_400 = 400, //
X_401 = 401, //
X_402 = 402, //
X_403 = 403, //
X_404 = 404, //
X_405 = 405, //
X_406 = 406, //
X_407 = 407, //
X_408 = 408, //
X_409 = 409, //
X_410 = 410, //
X_411 = 411, //
X_412 = 412, //
X_413 = 413, //
X_414 = 414, //
X_415 = 415, //
X_416 = 416, //
X_417 = 417, //
X_418 = 418, //
X_419 = 419, //
X_420 = 420, //
X_421 = 421, //
X_422 = 422, //
X_423 = 423, //
X_424 = 424, //
X_425 = 425, //
X_426 = 426, //
X_427 = 427, //
X_428 = 428, //
X_429 = 429, //
X_430 = 430, //
X_431 = 431, //
X_432 = 432, //
X_433 = 433, //
X_434 = 434, //
X_435 = 435, //
X_436 = 436, //
X_437 = 437, //
X_438 = 438, //
X_439 = 439, //
X_440 = 440, //
X_441 = 441, //
X_442 = 442, //
X_443 = 443, //
X_444 = 444, //
X_445 = 445, //
X_446 = 446, //
X_447 = 447, //
X_448 = 448, //
X_449 = 449, //
X_450 = 450, //
X_451 = 451, //
X_452 = 452, //
X_453 = 453, //
X_454 = 454, //
X_455 = 455, //
X_456 = 456, //
X_457 = 457, //
X_458 = 458, //
X_459 = 459, //
X_460 = 460, //
X_461 = 461, //
X_462 = 462, //
X_463 = 463, //
X_464 = 464, //
X_465 = 465, //
X_466 = 466, //
X_467 = 467, //
X_468 = 468, //
X_469 = 469, //
X_470 = 470, //
X_471 = 471, //
X_472 = 472, //
X_473 = 473, //
X_474 = 474, //
X_475 = 475, //
X_476 = 476, //
X_477 = 477, //
X_478 = 478, //
X_479 = 479, //
X_480 = 480, //
X_481 = 481, //
X_482 = 482, //
X_483 = 483, //
X_484 = 484, //
X_485 = 485, //
X_486 = 486, //
X_487 = 487, //
X_488 = 488, //
X_489 = 489, //
X_490 = 490, //
X_491 = 491, //
X_492 = 492, //
X_493 = 493, //
X_494 = 494, //
X_495 = 495, //
X_496 = 496, //
X_497 = 497, //
X_498 = 498, //
X_499 = 499, //
X_500 = 500, //
X_501 = 501, //
X_502 = 502, //
X_503 = 503, //
X_504 = 504, //
X_505 = 505, //
NDEBELE = 506, // nr
X_BORK_BORK_BORK = 507, // zzb
X_PIG_LATIN = 508, // zzp
X_HACKER = 509, // zzh
X_KLINGON = 510, // tlh
X_ELMER_FUDD = 511, // zze
X_Common = 512, // xx-Zyyy
X_Latin = 513, // xx-Latn
X_Greek = 514, // xx-Grek
X_Cyrillic = 515, // xx-Cyrl
X_Armenian = 516, // xx-Armn
X_Hebrew = 517, // xx-Hebr
X_Arabic = 518, // xx-Arab
X_Syriac = 519, // xx-Syrc
X_Thaana = 520, // xx-Thaa
X_Devanagari = 521, // xx-Deva
X_Bengali = 522, // xx-Beng
X_Gurmukhi = 523, // xx-Guru
X_Gujarati = 524, // xx-Gujr
X_Oriya = 525, // xx-Orya
X_Tamil = 526, // xx-Taml
X_Telugu = 527, // xx-Telu
X_Kannada = 528, // xx-Knda
X_Malayalam = 529, // xx-Mlym
X_Sinhala = 530, // xx-Sinh
X_Thai = 531, // xx-Thai
X_Lao = 532, // xx-Laoo
X_Tibetan = 533, // xx-Tibt
X_Myanmar = 534, // xx-Mymr
X_Georgian = 535, // xx-Geor
X_Hangul = 536, // xx-Hang
X_Ethiopic = 537, // xx-Ethi
X_Cherokee = 538, // xx-Cher
X_Canadian_Aboriginal = 539, // xx-Cans
X_Ogham = 540, // xx-Ogam
X_Runic = 541, // xx-Runr
X_Khmer = 542, // xx-Khmr
X_Mongolian = 543, // xx-Mong
X_Hiragana = 544, // xx-Hira
X_Katakana = 545, // xx-Kana
X_Bopomofo = 546, // xx-Bopo
X_Han = 547, // xx-Hani
X_Yi = 548, // xx-Yiii
X_Old_Italic = 549, // xx-Ital
X_Gothic = 550, // xx-Goth
X_Deseret = 551, // xx-Dsrt
X_Inherited = 552, // xx-Qaai
X_Tagalog = 553, // xx-Tglg
X_Hanunoo = 554, // xx-Hano
X_Buhid = 555, // xx-Buhd
X_Tagbanwa = 556, // xx-Tagb
X_Limbu = 557, // xx-Limb
X_Tai_Le = 558, // xx-Tale
X_Linear_B = 559, // xx-Linb
X_Ugaritic = 560, // xx-Ugar
X_Shavian = 561, // xx-Shaw
X_Osmanya = 562, // xx-Osma
X_Cypriot = 563, // xx-Cprt
X_Braille = 564, // xx-Brai
X_Buginese = 565, // xx-Bugi
X_Coptic = 566, // xx-Copt
X_New_Tai_Lue = 567, // xx-Talu
X_Glagolitic = 568, // xx-Glag
X_Tifinagh = 569, // xx-Tfng
X_Syloti_Nagri = 570, // xx-Sylo
X_Old_Persian = 571, // xx-Xpeo
X_Kharoshthi = 572, // xx-Khar
X_Balinese = 573, // xx-Bali
X_Cuneiform = 574, // xx-Xsux
X_Phoenician = 575, // xx-Phnx
X_Phags_Pa = 576, // xx-Phag
X_Nko = 577, // xx-Nkoo
X_Sundanese = 578, // xx-Sund
X_Lepcha = 579, // xx-Lepc
X_Ol_Chiki = 580, // xx-Olck
X_Vai = 581, // xx-Vaii
X_Saurashtra = 582, // xx-Saur
X_Kayah_Li = 583, // xx-Kali
X_Rejang = 584, // xx-Rjng
X_Lycian = 585, // xx-Lyci
X_Carian = 586, // xx-Cari
X_Lydian = 587, // xx-Lydi
X_Cham = 588, // xx-Cham
X_Tai_Tham = 589, // xx-Lana
X_Tai_Viet = 590, // xx-Tavt
X_Avestan = 591, // xx-Avst
X_Egyptian_Hieroglyphs = 592, // xx-Egyp
X_Samaritan = 593, // xx-Samr
X_Lisu = 594, // xx-Lisu
X_Bamum = 595, // xx-Bamu
X_Javanese = 596, // xx-Java
X_Meetei_Mayek = 597, // xx-Mtei
X_Imperial_Aramaic = 598, // xx-Armi
X_Old_South_Arabian = 599, // xx-Sarb
X_Inscriptional_Parthian = 600, // xx-Prti
X_Inscriptional_Pahlavi = 601, // xx-Phli
X_Old_Turkic = 602, // xx-Orkh
X_Kaithi = 603, // xx-Kthi
X_Batak = 604, // xx-Batk
X_Brahmi = 605, // xx-Brah
X_Mandaic = 606, // xx-Mand
X_Chakma = 607, // xx-Cakm
X_Meroitic_Cursive = 608, // xx-Merc
X_Meroitic_Hieroglyphs = 609, // xx-Mero
X_Miao = 610, // xx-Plrd
X_Sharada = 611, // xx-Shrd
X_Sora_Sompeng = 612, // xx-Sora
X_Takri = 613, // xx-Takr
NUM_LANGUAGES
} Language;
} // namespace CLD2
#endif // I18N_ENCODINGS_CLD2_INTERNAL_GENERATED_LANGUAGE_H__

View File

@ -0,0 +1,781 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// generated_ulscript.cc
// Machine generated. Do Not Edit.
//
// Declarations for scripts recognized by CLD2
//
#include "generated_ulscript.h"
#include "generated_language.h"
namespace CLD2 {
// Script display names, subscripted by enum ULScript.
// Row i holds the name for ULScript value i; the trailing comment on each row
// repeats that index and gives the script's four-letter code.
// Indices 32, 33 and 35 are unassigned and hold empty strings.
// Index 24 carries the name "Hani" rather than "Han".
// The enclosing file is marked machine generated, so prefer regenerating it
// over hand-editing rows; row order must stay in step with the enum.
extern const int kULScriptToNameSize = 102;
extern const char* const kULScriptToName[kULScriptToNameSize] = {
"Common", // 0 Zyyy
"Latin", // 1 Latn
"Greek", // 2 Grek
"Cyrillic", // 3 Cyrl
"Armenian", // 4 Armn
"Hebrew", // 5 Hebr
"Arabic", // 6 Arab
"Syriac", // 7 Syrc
"Thaana", // 8 Thaa
"Devanagari", // 9 Deva
"Bengali", // 10 Beng
"Gurmukhi", // 11 Guru
"Gujarati", // 12 Gujr
"Oriya", // 13 Orya
"Tamil", // 14 Taml
"Telugu", // 15 Telu
"Kannada", // 16 Knda
"Malayalam", // 17 Mlym
"Sinhala", // 18 Sinh
"Thai", // 19 Thai
"Lao", // 20 Laoo
"Tibetan", // 21 Tibt
"Myanmar", // 22 Mymr
"Georgian", // 23 Geor
"Hani", // 24 Hani
"Ethiopic", // 25 Ethi
"Cherokee", // 26 Cher
"Canadian_Aboriginal", // 27 Cans
"Ogham", // 28 Ogam
"Runic", // 29 Runr
"Khmer", // 30 Khmr
"Mongolian", // 31 Mong
"", // 32
"", // 33
"Bopomofo", // 34 Bopo
"", // 35
"Yi", // 36 Yiii
"Old_Italic", // 37 Ital
"Gothic", // 38 Goth
"Deseret", // 39 Dsrt
"Inherited", // 40 Zinh
"Tagalog", // 41 Tglg
"Hanunoo", // 42 Hano
"Buhid", // 43 Buhd
"Tagbanwa", // 44 Tagb
"Limbu", // 45 Limb
"Tai_Le", // 46 Tale
"Linear_B", // 47 Linb
"Ugaritic", // 48 Ugar
"Shavian", // 49 Shaw
"Osmanya", // 50 Osma
"Cypriot", // 51 Cprt
"Braille", // 52 Brai
"Buginese", // 53 Bugi
"Coptic", // 54 Copt
"New_Tai_Lue", // 55 Talu
"Glagolitic", // 56 Glag
"Tifinagh", // 57 Tfng
"Syloti_Nagri", // 58 Sylo
"Old_Persian", // 59 Xpeo
"Kharoshthi", // 60 Khar
"Balinese", // 61 Bali
"Cuneiform", // 62 Xsux
"Phoenician", // 63 Phnx
"Phags_Pa", // 64 Phag
"Nko", // 65 Nkoo
"Sundanese", // 66 Sund
"Lepcha", // 67 Lepc
"Ol_Chiki", // 68 Olck
"Vai", // 69 Vaii
"Saurashtra", // 70 Saur
"Kayah_Li", // 71 Kali
"Rejang", // 72 Rjng
"Lycian", // 73 Lyci
"Carian", // 74 Cari
"Lydian", // 75 Lydi
"Cham", // 76 Cham
"Tai_Tham", // 77 Lana
"Tai_Viet", // 78 Tavt
"Avestan", // 79 Avst
"Egyptian_Hieroglyphs", // 80 Egyp
"Samaritan", // 81 Samr
"Lisu", // 82 Lisu
"Bamum", // 83 Bamu
"Javanese", // 84 Java
"Meetei_Mayek", // 85 Mtei
"Imperial_Aramaic", // 86 Armi
"Old_South_Arabian", // 87 Sarb
"Inscriptional_Parthian", // 88 Prti
"Inscriptional_Pahlavi", // 89 Phli
"Old_Turkic", // 90 Orkh
"Kaithi", // 91 Kthi
"Batak", // 92 Batk
"Brahmi", // 93 Brah
"Mandaic", // 94 Mand
"Chakma", // 95 Cakm
"Meroitic_Cursive", // 96 Merc
"Meroitic_Hieroglyphs", // 97 Mero
"Miao", // 98 Plrd
"Sharada", // 99 Shrd
"Sora_Sompeng", // 100 Sora
"Takri", // 101 Takr
};
// Four-letter script codes, subscripted by enum ULScript.
// Row i holds the code for ULScript value i; the trailing comment on each row
// repeats that index and gives the script's display name.
// Indices 32, 33 and 35 are unassigned and hold empty strings.
// The enclosing file is marked machine generated, so prefer regenerating it
// over hand-editing rows; row order must stay in step with the enum.
extern const int kULScriptToCodeSize = 102;
extern const char* const kULScriptToCode[kULScriptToCodeSize] = {
"Zyyy", // 0 Common
"Latn", // 1 Latin
"Grek", // 2 Greek
"Cyrl", // 3 Cyrillic
"Armn", // 4 Armenian
"Hebr", // 5 Hebrew
"Arab", // 6 Arabic
"Syrc", // 7 Syriac
"Thaa", // 8 Thaana
"Deva", // 9 Devanagari
"Beng", // 10 Bengali
"Guru", // 11 Gurmukhi
"Gujr", // 12 Gujarati
"Orya", // 13 Oriya
"Taml", // 14 Tamil
"Telu", // 15 Telugu
"Knda", // 16 Kannada
"Mlym", // 17 Malayalam
"Sinh", // 18 Sinhala
"Thai", // 19 Thai
"Laoo", // 20 Lao
"Tibt", // 21 Tibetan
"Mymr", // 22 Myanmar
"Geor", // 23 Georgian
"Hani", // 24 Hani
"Ethi", // 25 Ethiopic
"Cher", // 26 Cherokee
"Cans", // 27 Canadian_Aboriginal
"Ogam", // 28 Ogham
"Runr", // 29 Runic
"Khmr", // 30 Khmer
"Mong", // 31 Mongolian
"", // 32
"", // 33
"Bopo", // 34 Bopomofo
"", // 35
"Yiii", // 36 Yi
"Ital", // 37 Old_Italic
"Goth", // 38 Gothic
"Dsrt", // 39 Deseret
"Zinh", // 40 Inherited
"Tglg", // 41 Tagalog
"Hano", // 42 Hanunoo
"Buhd", // 43 Buhid
"Tagb", // 44 Tagbanwa
"Limb", // 45 Limbu
"Tale", // 46 Tai_Le
"Linb", // 47 Linear_B
"Ugar", // 48 Ugaritic
"Shaw", // 49 Shavian
"Osma", // 50 Osmanya
"Cprt", // 51 Cypriot
"Brai", // 52 Braille
"Bugi", // 53 Buginese
"Copt", // 54 Coptic
"Talu", // 55 New_Tai_Lue
"Glag", // 56 Glagolitic
"Tfng", // 57 Tifinagh
"Sylo", // 58 Syloti_Nagri
"Xpeo", // 59 Old_Persian
"Khar", // 60 Kharoshthi
"Bali", // 61 Balinese
"Xsux", // 62 Cuneiform
"Phnx", // 63 Phoenician
"Phag", // 64 Phags_Pa
"Nkoo", // 65 Nko
"Sund", // 66 Sundanese
"Lepc", // 67 Lepcha
"Olck", // 68 Ol_Chiki
"Vaii", // 69 Vai
"Saur", // 70 Saurashtra
"Kali", // 71 Kayah_Li
"Rjng", // 72 Rejang
"Lyci", // 73 Lycian
"Cari", // 74 Carian
"Lydi", // 75 Lydian
"Cham", // 76 Cham
"Lana", // 77 Tai_Tham
"Tavt", // 78 Tai_Viet
"Avst", // 79 Avestan
"Egyp", // 80 Egyptian_Hieroglyphs
"Samr", // 81 Samaritan
"Lisu", // 82 Lisu
"Bamu", // 83 Bamum
"Java", // 84 Javanese
"Mtei", // 85 Meetei_Mayek
"Armi", // 86 Imperial_Aramaic
"Sarb", // 87 Old_South_Arabian
"Prti", // 88 Inscriptional_Parthian
"Phli", // 89 Inscriptional_Pahlavi
"Orkh", // 90 Old_Turkic
"Kthi", // 91 Kaithi
"Batk", // 92 Batak
"Brah", // 93 Brahmi
"Mand", // 94 Mandaic
"Cakm", // 95 Chakma
"Merc", // 96 Meroitic_Cursive
"Mero", // 97 Meroitic_Hieroglyphs
"Plrd", // 98 Miao
"Shrd", // 99 Sharada
"Sora", // 100 Sora_Sompeng
"Takr", // 101 Takri
};
// Enumerator spellings, subscripted by enum ULScript.
// Row i holds, as a string, the C identifier of the ULScript enumerator whose
// value is i (e.g. row 1 is "ULScript_Latin"); the trailing comment repeats
// the index and gives the four-letter script code.
// Unassigned indices 32, 33 and 35 are spelled "ULScript_32", "ULScript_33"
// and "ULScript_35" here rather than being left empty.
// The enclosing file is marked machine generated, so prefer regenerating it
// over hand-editing rows; row order must stay in step with the enum.
extern const int kULScriptToCNameSize = 102;
extern const char* const kULScriptToCName[kULScriptToCNameSize] = {
"ULScript_Common", // 0 Zyyy
"ULScript_Latin", // 1 Latn
"ULScript_Greek", // 2 Grek
"ULScript_Cyrillic", // 3 Cyrl
"ULScript_Armenian", // 4 Armn
"ULScript_Hebrew", // 5 Hebr
"ULScript_Arabic", // 6 Arab
"ULScript_Syriac", // 7 Syrc
"ULScript_Thaana", // 8 Thaa
"ULScript_Devanagari", // 9 Deva
"ULScript_Bengali", // 10 Beng
"ULScript_Gurmukhi", // 11 Guru
"ULScript_Gujarati", // 12 Gujr
"ULScript_Oriya", // 13 Orya
"ULScript_Tamil", // 14 Taml
"ULScript_Telugu", // 15 Telu
"ULScript_Kannada", // 16 Knda
"ULScript_Malayalam", // 17 Mlym
"ULScript_Sinhala", // 18 Sinh
"ULScript_Thai", // 19 Thai
"ULScript_Lao", // 20 Laoo
"ULScript_Tibetan", // 21 Tibt
"ULScript_Myanmar", // 22 Mymr
"ULScript_Georgian", // 23 Geor
"ULScript_Hani", // 24 Hani
"ULScript_Ethiopic", // 25 Ethi
"ULScript_Cherokee", // 26 Cher
"ULScript_Canadian_Aboriginal", // 27 Cans
"ULScript_Ogham", // 28 Ogam
"ULScript_Runic", // 29 Runr
"ULScript_Khmer", // 30 Khmr
"ULScript_Mongolian", // 31 Mong
"ULScript_32", // 32
"ULScript_33", // 33
"ULScript_Bopomofo", // 34 Bopo
"ULScript_35", // 35
"ULScript_Yi", // 36 Yiii
"ULScript_Old_Italic", // 37 Ital
"ULScript_Gothic", // 38 Goth
"ULScript_Deseret", // 39 Dsrt
"ULScript_Inherited", // 40 Zinh
"ULScript_Tagalog", // 41 Tglg
"ULScript_Hanunoo", // 42 Hano
"ULScript_Buhid", // 43 Buhd
"ULScript_Tagbanwa", // 44 Tagb
"ULScript_Limbu", // 45 Limb
"ULScript_Tai_Le", // 46 Tale
"ULScript_Linear_B", // 47 Linb
"ULScript_Ugaritic", // 48 Ugar
"ULScript_Shavian", // 49 Shaw
"ULScript_Osmanya", // 50 Osma
"ULScript_Cypriot", // 51 Cprt
"ULScript_Braille", // 52 Brai
"ULScript_Buginese", // 53 Bugi
"ULScript_Coptic", // 54 Copt
"ULScript_New_Tai_Lue", // 55 Talu
"ULScript_Glagolitic", // 56 Glag
"ULScript_Tifinagh", // 57 Tfng
"ULScript_Syloti_Nagri", // 58 Sylo
"ULScript_Old_Persian", // 59 Xpeo
"ULScript_Kharoshthi", // 60 Khar
"ULScript_Balinese", // 61 Bali
"ULScript_Cuneiform", // 62 Xsux
"ULScript_Phoenician", // 63 Phnx
"ULScript_Phags_Pa", // 64 Phag
"ULScript_Nko", // 65 Nkoo
"ULScript_Sundanese", // 66 Sund
"ULScript_Lepcha", // 67 Lepc
"ULScript_Ol_Chiki", // 68 Olck
"ULScript_Vai", // 69 Vaii
"ULScript_Saurashtra", // 70 Saur
"ULScript_Kayah_Li", // 71 Kali
"ULScript_Rejang", // 72 Rjng
"ULScript_Lycian", // 73 Lyci
"ULScript_Carian", // 74 Cari
"ULScript_Lydian", // 75 Lydi
"ULScript_Cham", // 76 Cham
"ULScript_Tai_Tham", // 77 Lana
"ULScript_Tai_Viet", // 78 Tavt
"ULScript_Avestan", // 79 Avst
"ULScript_Egyptian_Hieroglyphs", // 80 Egyp
"ULScript_Samaritan", // 81 Samr
"ULScript_Lisu", // 82 Lisu
"ULScript_Bamum", // 83 Bamu
"ULScript_Javanese", // 84 Java
"ULScript_Meetei_Mayek", // 85 Mtei
"ULScript_Imperial_Aramaic", // 86 Armi
"ULScript_Old_South_Arabian", // 87 Sarb
"ULScript_Inscriptional_Parthian", // 88 Prti
"ULScript_Inscriptional_Pahlavi", // 89 Phli
"ULScript_Old_Turkic", // 90 Orkh
"ULScript_Kaithi", // 91 Kthi
"ULScript_Batak", // 92 Batk
"ULScript_Brahmi", // 93 Brah
"ULScript_Mandaic", // 94 Mand
"ULScript_Chakma", // 95 Cakm
"ULScript_Meroitic_Cursive", // 96 Merc
"ULScript_Meroitic_Hieroglyphs", // 97 Mero
"ULScript_Miao", // 98 Plrd
"ULScript_Sharada", // 99 Shrd
"ULScript_Sora_Sompeng", // 100 Sora
"ULScript_Takri", // 101 Takr
};
// ULScriptRType classification of each script, subscripted by enum ULScript.
// Row i holds the ULScriptRType for ULScript value i; the trailing comment
// repeats that index and gives the four-letter script code.
// Only index 24 (Hani) is RTypeCJK; unassigned indices 32, 33 and 35 are
// RTypeNone.
// NOTE(review): RTypeOne / RTypeMany presumably distinguish scripts written
// by a single language from scripts shared by several -- confirm against the
// code that consumes this table.
// The enclosing file is marked machine generated, so prefer regenerating it
// over hand-editing rows; row order must stay in step with the enum.
extern const int kULScriptToRtypeSize = 102;
extern const ULScriptRType kULScriptToRtype[kULScriptToRtypeSize] = {
RTypeNone, // 0 Zyyy
RTypeMany, // 1 Latn
RTypeOne, // 2 Grek
RTypeMany, // 3 Cyrl
RTypeOne, // 4 Armn
RTypeMany, // 5 Hebr
RTypeMany, // 6 Arab
RTypeOne, // 7 Syrc
RTypeOne, // 8 Thaa
RTypeMany, // 9 Deva
RTypeMany, // 10 Beng
RTypeOne, // 11 Guru
RTypeOne, // 12 Gujr
RTypeOne, // 13 Orya
RTypeOne, // 14 Taml
RTypeOne, // 15 Telu
RTypeOne, // 16 Knda
RTypeOne, // 17 Mlym
RTypeOne, // 18 Sinh
RTypeOne, // 19 Thai
RTypeOne, // 20 Laoo
RTypeMany, // 21 Tibt
RTypeOne, // 22 Mymr
RTypeOne, // 23 Geor
RTypeCJK, // 24 Hani
RTypeMany, // 25 Ethi
RTypeOne, // 26 Cher
RTypeOne, // 27 Cans
RTypeNone, // 28 Ogam
RTypeNone, // 29 Runr
RTypeOne, // 30 Khmr
RTypeOne, // 31 Mong
RTypeNone, // 32
RTypeNone, // 33
RTypeNone, // 34 Bopo
RTypeNone, // 35
RTypeNone, // 36 Yiii
RTypeNone, // 37 Ital
RTypeNone, // 38 Goth
RTypeNone, // 39 Dsrt
RTypeNone, // 40 Zinh
RTypeOne, // 41 Tglg
RTypeNone, // 42 Hano
RTypeNone, // 43 Buhd
RTypeNone, // 44 Tagb
RTypeOne, // 45 Limb
RTypeNone, // 46 Tale
RTypeNone, // 47 Linb
RTypeNone, // 48 Ugar
RTypeNone, // 49 Shaw
RTypeNone, // 50 Osma
RTypeNone, // 51 Cprt
RTypeNone, // 52 Brai
RTypeNone, // 53 Bugi
RTypeNone, // 54 Copt
RTypeNone, // 55 Talu
RTypeNone, // 56 Glag
RTypeNone, // 57 Tfng
RTypeNone, // 58 Sylo
RTypeNone, // 59 Xpeo
RTypeNone, // 60 Khar
RTypeNone, // 61 Bali
RTypeNone, // 62 Xsux
RTypeNone, // 63 Phnx
RTypeNone, // 64 Phag
RTypeNone, // 65 Nkoo
RTypeNone, // 66 Sund
RTypeNone, // 67 Lepc
RTypeNone, // 68 Olck
RTypeNone, // 69 Vaii
RTypeNone, // 70 Saur
RTypeNone, // 71 Kali
RTypeNone, // 72 Rjng
RTypeNone, // 73 Lyci
RTypeNone, // 74 Cari
RTypeNone, // 75 Lydi
RTypeNone, // 76 Cham
RTypeNone, // 77 Lana
RTypeNone, // 78 Tavt
RTypeNone, // 79 Avst
RTypeNone, // 80 Egyp
RTypeNone, // 81 Samr
RTypeNone, // 82 Lisu
RTypeNone, // 83 Bamu
RTypeNone, // 84 Java
RTypeNone, // 85 Mtei
RTypeNone, // 86 Armi
RTypeNone, // 87 Sarb
RTypeNone, // 88 Prti
RTypeNone, // 89 Phli
RTypeNone, // 90 Orkh
RTypeNone, // 91 Kthi
RTypeNone, // 92 Batk
RTypeNone, // 93 Brah
RTypeNone, // 94 Mand
RTypeNone, // 95 Cakm
RTypeNone, // 96 Merc
RTypeNone, // 97 Mero
RTypeNone, // 98 Plrd
RTypeNone, // 99 Shrd
RTypeNone, // 100 Sora
RTypeNone, // 101 Takr
};
// Default Language for each script, subscripted by enum ULScript.
// Row i holds the Language associated with ULScript value i; the trailing
// comment repeats the index, the four-letter script code and the script's
// ULScriptRType as annotated by the generator.
// As the rows show, every script annotated RTypeNone maps either to an
// X_<Script> enumerator (e.g. X_Ogham) or, for the unassigned indices 32, 33
// and 35, to UNKNOWN_LANGUAGE; the remaining scripts map to a specific
// language (e.g. Latn -> ENGLISH, Hani -> JAPANESE).
// The enclosing file is marked machine generated, so prefer regenerating it
// over hand-editing rows; row order must stay in step with the enum.
extern const int kULScriptToDefaultLangSize = 102;
extern const Language kULScriptToDefaultLang[kULScriptToDefaultLangSize] = {
X_Common, // 0 Zyyy RTypeNone
ENGLISH, // 1 Latn RTypeMany
GREEK, // 2 Grek RTypeOne
RUSSIAN, // 3 Cyrl RTypeMany
ARMENIAN, // 4 Armn RTypeOne
HEBREW, // 5 Hebr RTypeMany
ARABIC, // 6 Arab RTypeMany
SYRIAC, // 7 Syrc RTypeOne
DHIVEHI, // 8 Thaa RTypeOne
HINDI, // 9 Deva RTypeMany
BENGALI, // 10 Beng RTypeMany
PUNJABI, // 11 Guru RTypeOne
GUJARATI, // 12 Gujr RTypeOne
ORIYA, // 13 Orya RTypeOne
TAMIL, // 14 Taml RTypeOne
TELUGU, // 15 Telu RTypeOne
KANNADA, // 16 Knda RTypeOne
MALAYALAM, // 17 Mlym RTypeOne
SINHALESE, // 18 Sinh RTypeOne
THAI, // 19 Thai RTypeOne
LAOTHIAN, // 20 Laoo RTypeOne
TIBETAN, // 21 Tibt RTypeMany
BURMESE, // 22 Mymr RTypeOne
GEORGIAN, // 23 Geor RTypeOne
JAPANESE, // 24 Hani RTypeCJK
AMHARIC, // 25 Ethi RTypeMany
CHEROKEE, // 26 Cher RTypeOne
INUKTITUT, // 27 Cans RTypeOne
X_Ogham, // 28 Ogam RTypeNone
X_Runic, // 29 Runr RTypeNone
KHMER, // 30 Khmr RTypeOne
MONGOLIAN, // 31 Mong RTypeOne
UNKNOWN_LANGUAGE, // 32 RTypeNone
UNKNOWN_LANGUAGE, // 33 RTypeNone
X_Bopomofo, // 34 Bopo RTypeNone
UNKNOWN_LANGUAGE, // 35 RTypeNone
X_Yi, // 36 Yiii RTypeNone
X_Old_Italic, // 37 Ital RTypeNone
X_Gothic, // 38 Goth RTypeNone
X_Deseret, // 39 Dsrt RTypeNone
X_Inherited, // 40 Zinh RTypeNone
TAGALOG, // 41 Tglg RTypeOne
X_Hanunoo, // 42 Hano RTypeNone
X_Buhid, // 43 Buhd RTypeNone
X_Tagbanwa, // 44 Tagb RTypeNone
LIMBU, // 45 Limb RTypeOne
X_Tai_Le, // 46 Tale RTypeNone
X_Linear_B, // 47 Linb RTypeNone
X_Ugaritic, // 48 Ugar RTypeNone
X_Shavian, // 49 Shaw RTypeNone
X_Osmanya, // 50 Osma RTypeNone
X_Cypriot, // 51 Cprt RTypeNone
X_Braille, // 52 Brai RTypeNone
X_Buginese, // 53 Bugi RTypeNone
X_Coptic, // 54 Copt RTypeNone
X_New_Tai_Lue, // 55 Talu RTypeNone
X_Glagolitic, // 56 Glag RTypeNone
X_Tifinagh, // 57 Tfng RTypeNone
X_Syloti_Nagri, // 58 Sylo RTypeNone
X_Old_Persian, // 59 Xpeo RTypeNone
X_Kharoshthi, // 60 Khar RTypeNone
X_Balinese, // 61 Bali RTypeNone
X_Cuneiform, // 62 Xsux RTypeNone
X_Phoenician, // 63 Phnx RTypeNone
X_Phags_Pa, // 64 Phag RTypeNone
X_Nko, // 65 Nkoo RTypeNone
X_Sundanese, // 66 Sund RTypeNone
X_Lepcha, // 67 Lepc RTypeNone
X_Ol_Chiki, // 68 Olck RTypeNone
X_Vai, // 69 Vaii RTypeNone
X_Saurashtra, // 70 Saur RTypeNone
X_Kayah_Li, // 71 Kali RTypeNone
X_Rejang, // 72 Rjng RTypeNone
X_Lycian, // 73 Lyci RTypeNone
X_Carian, // 74 Cari RTypeNone
X_Lydian, // 75 Lydi RTypeNone
X_Cham, // 76 Cham RTypeNone
X_Tai_Tham, // 77 Lana RTypeNone
X_Tai_Viet, // 78 Tavt RTypeNone
X_Avestan, // 79 Avst RTypeNone
X_Egyptian_Hieroglyphs, // 80 Egyp RTypeNone
X_Samaritan, // 81 Samr RTypeNone
X_Lisu, // 82 Lisu RTypeNone
X_Bamum, // 83 Bamu RTypeNone
X_Javanese, // 84 Java RTypeNone
X_Meetei_Mayek, // 85 Mtei RTypeNone
X_Imperial_Aramaic, // 86 Armi RTypeNone
X_Old_South_Arabian, // 87 Sarb RTypeNone
X_Inscriptional_Parthian, // 88 Prti RTypeNone
X_Inscriptional_Pahlavi, // 89 Phli RTypeNone
X_Old_Turkic, // 90 Orkh RTypeNone
X_Kaithi, // 91 Kthi RTypeNone
X_Batak, // 92 Batk RTypeNone
X_Brahmi, // 93 Brah RTypeNone
X_Mandaic, // 94 Mand RTypeNone
X_Chakma, // 95 Cakm RTypeNone
X_Meroitic_Cursive, // 96 Merc RTypeNone
X_Meroitic_Hieroglyphs, // 97 Mero RTypeNone
X_Miao, // 98 Plrd RTypeNone
X_Sharada, // 99 Shrd RTypeNone
X_Sora_Sompeng, // 100 Sora RTypeNone
X_Takri, // 101 Takr RTypeNone
};
// Script name -> ULScript value lookup table.
// Alphabetical order for binary search: rows are sorted by the name string in
// byte-wise order (note "Ol_Chiki" sorts before "Old_Italic" because '_' is
// below 'd'), so any new row must be inserted at its sorted position.
// Each row is {name, ULScript value}; the trailing comment gives the
// four-letter script code the row was generated from.
// Several keys alias to value 24 (ULScript_Hani): "Han" appears three times
// (generated from Hant, Hans and Hani), and "Hangul", "Hani", "Hiragana" and
// "Katakana" map there too.  The duplicate "Han" rows carry the same value,
// so it does not matter which one a search lands on.
extern const int kNameToULScriptSize = 105;
extern const CharIntPair kNameToULScript[kNameToULScriptSize] = {
{"Arabic", 6}, // Arab
{"Armenian", 4}, // Armn
{"Avestan", 79}, // Avst
{"Balinese", 61}, // Bali
{"Bamum", 83}, // Bamu
{"Batak", 92}, // Batk
{"Bengali", 10}, // Beng
{"Bopomofo", 34}, // Bopo
{"Brahmi", 93}, // Brah
{"Braille", 52}, // Brai
{"Buginese", 53}, // Bugi
{"Buhid", 43}, // Buhd
{"Canadian_Aboriginal", 27}, // Cans
{"Carian", 74}, // Cari
{"Chakma", 95}, // Cakm
{"Cham", 76}, // Cham
{"Cherokee", 26}, // Cher
{"Common", 0}, // Zyyy
{"Coptic", 54}, // Copt
{"Cuneiform", 62}, // Xsux
{"Cypriot", 51}, // Cprt
{"Cyrillic", 3}, // Cyrl
{"Deseret", 39}, // Dsrt
{"Devanagari", 9}, // Deva
{"Egyptian_Hieroglyphs", 80}, // Egyp
{"Ethiopic", 25}, // Ethi
{"Georgian", 23}, // Geor
{"Glagolitic", 56}, // Glag
{"Gothic", 38}, // Goth
{"Greek", 2}, // Grek
{"Gujarati", 12}, // Gujr
{"Gurmukhi", 11}, // Guru
{"Han", 24}, // Hant
{"Han", 24}, // Hans
{"Han", 24}, // Hani
{"Hangul", 24}, // Hang
{"Hani", 24}, // Hani
{"Hanunoo", 42}, // Hano
{"Hebrew", 5}, // Hebr
{"Hiragana", 24}, // Hira
{"Imperial_Aramaic", 86}, // Armi
{"Inherited", 40}, // Zinh
{"Inscriptional_Pahlavi", 89}, // Phli
{"Inscriptional_Parthian", 88}, // Prti
{"Javanese", 84}, // Java
{"Kaithi", 91}, // Kthi
{"Kannada", 16}, // Knda
{"Katakana", 24}, // Kana
{"Kayah_Li", 71}, // Kali
{"Kharoshthi", 60}, // Khar
{"Khmer", 30}, // Khmr
{"Lao", 20}, // Laoo
{"Latin", 1}, // Latn
{"Lepcha", 67}, // Lepc
{"Limbu", 45}, // Limb
{"Linear_B", 47}, // Linb
{"Lisu", 82}, // Lisu
{"Lycian", 73}, // Lyci
{"Lydian", 75}, // Lydi
{"Malayalam", 17}, // Mlym
{"Mandaic", 94}, // Mand
{"Meetei_Mayek", 85}, // Mtei
{"Meroitic_Cursive", 96}, // Merc
{"Meroitic_Hieroglyphs", 97}, // Mero
{"Miao", 98}, // Plrd
{"Mongolian", 31}, // Mong
{"Myanmar", 22}, // Mymr
{"New_Tai_Lue", 55}, // Talu
{"Nko", 65}, // Nkoo
{"Ogham", 28}, // Ogam
{"Ol_Chiki", 68}, // Olck
{"Old_Italic", 37}, // Ital
{"Old_Persian", 59}, // Xpeo
{"Old_South_Arabian", 87}, // Sarb
{"Old_Turkic", 90}, // Orkh
{"Oriya", 13}, // Orya
{"Osmanya", 50}, // Osma
{"Phags_Pa", 64}, // Phag
{"Phoenician", 63}, // Phnx
{"Rejang", 72}, // Rjng
{"Runic", 29}, // Runr
{"Samaritan", 81}, // Samr
{"Saurashtra", 70}, // Saur
{"Sharada", 99}, // Shrd
{"Shavian", 49}, // Shaw
{"Sinhala", 18}, // Sinh
{"Sora_Sompeng", 100}, // Sora
{"Sundanese", 66}, // Sund
{"Syloti_Nagri", 58}, // Sylo
{"Syriac", 7}, // Syrc
{"Tagalog", 41}, // Tglg
{"Tagbanwa", 44}, // Tagb
{"Tai_Le", 46}, // Tale
{"Tai_Tham", 77}, // Lana
{"Tai_Viet", 78}, // Tavt
{"Takri", 101}, // Takr
{"Tamil", 14}, // Taml
{"Telugu", 15}, // Telu
{"Thaana", 8}, // Thaa
{"Thai", 19}, // Thai
{"Tibetan", 21}, // Tibt
{"Tifinagh", 57}, // Tfng
{"Ugaritic", 48}, // Ugar
{"Vai", 69}, // Vaii
{"Yi", 36}, // Yiii
};
// Four-letter script code -> ULScript value lookup table.
// Alphabetical order for binary search: rows are sorted by the code string in
// byte-wise order, so any new row must be inserted at its sorted position.
// Each row is {code, ULScript value}; the trailing comment repeats the code.
// Several codes alias to value 24 (ULScript_Hani): "Hang", "Hani" (listed
// twice), "Hans", "Hant", "Hira" and "Kana".  The duplicate "Hani" rows carry
// the same value, so it does not matter which one a search lands on.
//
// The array is dimensioned by its own size constant, kCodeToULScriptSize.
// (It was previously dimensioned by kNameToULScriptSize, which only worked
// because both constants happen to be 105.)
extern const int kCodeToULScriptSize = 105;
extern const CharIntPair kCodeToULScript[kCodeToULScriptSize] = {
{"Arab", 6}, // Arab
{"Armi", 86}, // Armi
{"Armn", 4}, // Armn
{"Avst", 79}, // Avst
{"Bali", 61}, // Bali
{"Bamu", 83}, // Bamu
{"Batk", 92}, // Batk
{"Beng", 10}, // Beng
{"Bopo", 34}, // Bopo
{"Brah", 93}, // Brah
{"Brai", 52}, // Brai
{"Bugi", 53}, // Bugi
{"Buhd", 43}, // Buhd
{"Cakm", 95}, // Cakm
{"Cans", 27}, // Cans
{"Cari", 74}, // Cari
{"Cham", 76}, // Cham
{"Cher", 26}, // Cher
{"Copt", 54}, // Copt
{"Cprt", 51}, // Cprt
{"Cyrl", 3}, // Cyrl
{"Deva", 9}, // Deva
{"Dsrt", 39}, // Dsrt
{"Egyp", 80}, // Egyp
{"Ethi", 25}, // Ethi
{"Geor", 23}, // Geor
{"Glag", 56}, // Glag
{"Goth", 38}, // Goth
{"Grek", 2}, // Grek
{"Gujr", 12}, // Gujr
{"Guru", 11}, // Guru
{"Hang", 24}, // Hang
{"Hani", 24}, // Hani
{"Hani", 24}, // Hani
{"Hano", 42}, // Hano
{"Hans", 24}, // Hans
{"Hant", 24}, // Hant
{"Hebr", 5}, // Hebr
{"Hira", 24}, // Hira
{"Ital", 37}, // Ital
{"Java", 84}, // Java
{"Kali", 71}, // Kali
{"Kana", 24}, // Kana
{"Khar", 60}, // Khar
{"Khmr", 30}, // Khmr
{"Knda", 16}, // Knda
{"Kthi", 91}, // Kthi
{"Lana", 77}, // Lana
{"Laoo", 20}, // Laoo
{"Latn", 1}, // Latn
{"Lepc", 67}, // Lepc
{"Limb", 45}, // Limb
{"Linb", 47}, // Linb
{"Lisu", 82}, // Lisu
{"Lyci", 73}, // Lyci
{"Lydi", 75}, // Lydi
{"Mand", 94}, // Mand
{"Merc", 96}, // Merc
{"Mero", 97}, // Mero
{"Mlym", 17}, // Mlym
{"Mong", 31}, // Mong
{"Mtei", 85}, // Mtei
{"Mymr", 22}, // Mymr
{"Nkoo", 65}, // Nkoo
{"Ogam", 28}, // Ogam
{"Olck", 68}, // Olck
{"Orkh", 90}, // Orkh
{"Orya", 13}, // Orya
{"Osma", 50}, // Osma
{"Phag", 64}, // Phag
{"Phli", 89}, // Phli
{"Phnx", 63}, // Phnx
{"Plrd", 98}, // Plrd
{"Prti", 88}, // Prti
{"Rjng", 72}, // Rjng
{"Runr", 29}, // Runr
{"Samr", 81}, // Samr
{"Sarb", 87}, // Sarb
{"Saur", 70}, // Saur
{"Shaw", 49}, // Shaw
{"Shrd", 99}, // Shrd
{"Sinh", 18}, // Sinh
{"Sora", 100}, // Sora
{"Sund", 66}, // Sund
{"Sylo", 58}, // Sylo
{"Syrc", 7}, // Syrc
{"Tagb", 44}, // Tagb
{"Takr", 101}, // Takr
{"Tale", 46}, // Tale
{"Talu", 55}, // Talu
{"Taml", 14}, // Taml
{"Tavt", 78}, // Tavt
{"Telu", 15}, // Telu
{"Tfng", 57}, // Tfng
{"Tglg", 41}, // Tglg
{"Thaa", 8}, // Thaa
{"Thai", 19}, // Thai
{"Tibt", 21}, // Tibt
{"Ugar", 48}, // Ugar
{"Vaii", 69}, // Vaii
{"Xpeo", 59}, // Xpeo
{"Xsux", 62}, // Xsux
{"Yiii", 36}, // Yiii
{"Zinh", 40}, // Zinh
{"Zyyy", 0}, // Zyyy
};
} // namespace CLD2

View File

@ -0,0 +1,140 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// generated_ulscript.h
// Machine generated. Do Not Edit.
//
// Declarations for scripts recognized by CLD2
//
#ifndef I18N_ENCODINGS_CLD2_INTERNAL_GENERATED_ULSCRIPT_H__
#define I18N_ENCODINGS_CLD2_INTERNAL_GENERATED_ULSCRIPT_H__
namespace CLD2 {
// Classification attached to each script; used as the element type of the
// per-script Rtype table.  Values are consecutive starting at 0.
enum ULScriptRType {
  RTypeNone = 0,
  RTypeOne = 1,
  RTypeMany = 2,
  RTypeCJK = 3
};
// A C string paired with an integer; element type of the sorted
// name/code -> script lookup tables (s is the key, i the value).
struct CharIntPair {
  const char* s;
  int i;
};
// Scripts recognized by CLD2.  Each enumerator's value is its row index in
// the tables that are "subscripted by enum ULScript"; the trailing comment on
// each line is the script's four-letter code.
// ULScript_32, ULScript_33 and ULScript_35 are placeholders with no code.
// NUM_ULSCRIPTS follows ULScript_Takri (101) and therefore equals 102, the
// number of rows those tables must have.
// Values are explicit and must not be renumbered by hand; the file is marked
// machine generated.
typedef enum {
ULScript_Common = 0, // Zyyy
ULScript_Latin = 1, // Latn
ULScript_Greek = 2, // Grek
ULScript_Cyrillic = 3, // Cyrl
ULScript_Armenian = 4, // Armn
ULScript_Hebrew = 5, // Hebr
ULScript_Arabic = 6, // Arab
ULScript_Syriac = 7, // Syrc
ULScript_Thaana = 8, // Thaa
ULScript_Devanagari = 9, // Deva
ULScript_Bengali = 10, // Beng
ULScript_Gurmukhi = 11, // Guru
ULScript_Gujarati = 12, // Gujr
ULScript_Oriya = 13, // Orya
ULScript_Tamil = 14, // Taml
ULScript_Telugu = 15, // Telu
ULScript_Kannada = 16, // Knda
ULScript_Malayalam = 17, // Mlym
ULScript_Sinhala = 18, // Sinh
ULScript_Thai = 19, // Thai
ULScript_Lao = 20, // Laoo
ULScript_Tibetan = 21, // Tibt
ULScript_Myanmar = 22, // Mymr
ULScript_Georgian = 23, // Geor
ULScript_Hani = 24, // Hani
ULScript_Ethiopic = 25, // Ethi
ULScript_Cherokee = 26, // Cher
ULScript_Canadian_Aboriginal = 27, // Cans
ULScript_Ogham = 28, // Ogam
ULScript_Runic = 29, // Runr
ULScript_Khmer = 30, // Khmr
ULScript_Mongolian = 31, // Mong
ULScript_32 = 32, //
ULScript_33 = 33, //
ULScript_Bopomofo = 34, // Bopo
ULScript_35 = 35, //
ULScript_Yi = 36, // Yiii
ULScript_Old_Italic = 37, // Ital
ULScript_Gothic = 38, // Goth
ULScript_Deseret = 39, // Dsrt
ULScript_Inherited = 40, // Zinh
ULScript_Tagalog = 41, // Tglg
ULScript_Hanunoo = 42, // Hano
ULScript_Buhid = 43, // Buhd
ULScript_Tagbanwa = 44, // Tagb
ULScript_Limbu = 45, // Limb
ULScript_Tai_Le = 46, // Tale
ULScript_Linear_B = 47, // Linb
ULScript_Ugaritic = 48, // Ugar
ULScript_Shavian = 49, // Shaw
ULScript_Osmanya = 50, // Osma
ULScript_Cypriot = 51, // Cprt
ULScript_Braille = 52, // Brai
ULScript_Buginese = 53, // Bugi
ULScript_Coptic = 54, // Copt
ULScript_New_Tai_Lue = 55, // Talu
ULScript_Glagolitic = 56, // Glag
ULScript_Tifinagh = 57, // Tfng
ULScript_Syloti_Nagri = 58, // Sylo
ULScript_Old_Persian = 59, // Xpeo
ULScript_Kharoshthi = 60, // Khar
ULScript_Balinese = 61, // Bali
ULScript_Cuneiform = 62, // Xsux
ULScript_Phoenician = 63, // Phnx
ULScript_Phags_Pa = 64, // Phag
ULScript_Nko = 65, // Nkoo
ULScript_Sundanese = 66, // Sund
ULScript_Lepcha = 67, // Lepc
ULScript_Ol_Chiki = 68, // Olck
ULScript_Vai = 69, // Vaii
ULScript_Saurashtra = 70, // Saur
ULScript_Kayah_Li = 71, // Kali
ULScript_Rejang = 72, // Rjng
ULScript_Lycian = 73, // Lyci
ULScript_Carian = 74, // Cari
ULScript_Lydian = 75, // Lydi
ULScript_Cham = 76, // Cham
ULScript_Tai_Tham = 77, // Lana
ULScript_Tai_Viet = 78, // Tavt
ULScript_Avestan = 79, // Avst
ULScript_Egyptian_Hieroglyphs = 80, // Egyp
ULScript_Samaritan = 81, // Samr
ULScript_Lisu = 82, // Lisu
ULScript_Bamum = 83, // Bamu
ULScript_Javanese = 84, // Java
ULScript_Meetei_Mayek = 85, // Mtei
ULScript_Imperial_Aramaic = 86, // Armi
ULScript_Old_South_Arabian = 87, // Sarb
ULScript_Inscriptional_Parthian = 88, // Prti
ULScript_Inscriptional_Pahlavi = 89, // Phli
ULScript_Old_Turkic = 90, // Orkh
ULScript_Kaithi = 91, // Kthi
ULScript_Batak = 92, // Batk
ULScript_Brahmi = 93, // Brah
ULScript_Mandaic = 94, // Mand
ULScript_Chakma = 95, // Cakm
ULScript_Meroitic_Cursive = 96, // Merc
ULScript_Meroitic_Hieroglyphs = 97, // Mero
ULScript_Miao = 98, // Plrd
ULScript_Sharada = 99, // Shrd
ULScript_Sora_Sompeng = 100, // Sora
ULScript_Takri = 101, // Takr
NUM_ULSCRIPTS
} ULScript;
// "Unknown script" is represented by the Common script (value 0).
// This is a preprocessor macro, so it is not scoped by namespace CLD2: it
// expands to the unqualified name ULScript_Common, which only resolves where
// that enumerator is visible (inside CLD2 or after a using-declaration).
#define UNKNOWN_ULSCRIPT ULScript_Common
} // namespace CLD2
#endif // I18N_ENCODINGS_CLD2_INTERNAL_GENERATED_ULSCRIPT_H__

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,110 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Author: dsites@google.com (Dick Sites)
//
#ifndef I18N_ENCODINGS_CLD2_INTERNAL_GETONESCRIPTSPAN_H_
#define I18N_ENCODINGS_CLD2_INTERNAL_GETONESCRIPTSPAN_H_
#include "integral_types.h"
#include "langspan.h"
#include "offsetmap.h"
namespace CLD2 {
// Sizing constants for the script-span buffers.
// Capacity of the script buffer, in bytes (40 KiB).
static const int kMaxScriptBuffer = 40 * 1024;
// Capacity of the lowercased buffer: one and a half times the script buffer.
static const int kMaxScriptLowerBuffer =
    kMaxScriptBuffer + (kMaxScriptBuffer / 2);
// Stop at a word space within the last kWithinScriptTail bytes of the
// script buffer.
static const int kWithinScriptTail = 32;
// Usable payload of the script buffer; 32 bytes are held back to leave
// some room.
static const int kMaxScriptBytes = kMaxScriptBuffer - 32;
// Returns true when c is a UTF-8 continuation byte, i.e. its top two bits
// are 10 (byte values 0x80..0xBF); returns false for every other byte.
static inline bool IsContinuationByte(char c) {
  const unsigned char byte = static_cast<unsigned char>(c);
  return (byte & 0xC0) == 0x80;
}
// Gets lscript number for the UTF-8 letter starting at src; always returns
// 0 (common script) for non-letters.
// Declaration only -- the definition is not part of this header.
int GetUTF8LetterScriptNum(const char* src);
// Returns a pointer to the next quadgram, which lies +2..+5 bytes past src.
// src is passed by value, so the advanced position is delivered through the
// return value; the caller's pointer is not modified.
// Looks at src[0..4] -- presumably the caller must guarantee those five bytes
// are readable; TODO confirm against the definition.
const char* AdvanceQuad(const char* src);
// Scans an input buffer and hands its text back one LangSpan at a time,
// either as same-script runs of letters or as plain text with tags and
// entities removed (see the individual Get* methods).
//
// NOTE(review): the class holds raw char* buffers and declares a destructor,
// but copying is not disabled.  If the destructor frees script_buffer_ /
// script_buffer_lower_, copying a ScriptScanner would double-free them --
// confirm against the implementation file and pass instances by
// pointer/reference until then.
class ScriptScanner {
 public:
  // buffer / buffer_length describe the text to scan; is_plain_text is true
  // for text and false for HTML.
  ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text);
  // As above, with two extra switches.
  // NOTE(review): any_text / any_script presumably select the
  // letters_marks_only_ / one_script_only_ modes below -- confirm against
  // the constructor definition.
  ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text,
                bool any_text, bool any_script);
  ~ScriptScanner();

  // Copy next run of same-script non-tag letters to buffer [NUL terminated]
  bool GetOneScriptSpan(LangSpan* span);

  // Force Latin and Cyrillic scripts to be lowercase
  void LowerScriptSpan(LangSpan* span);

  // Copy next run of same-script non-tag letters to buffer [NUL terminated]
  // Force Latin and Cyrillic scripts to be lowercase
  bool GetOneScriptSpanLower(LangSpan* span);

  // Copy next run of non-tag characters to buffer [NUL terminated]
  // This just removes tags and removes entities
  // Buffer has leading space
  bool GetOneTextSpan(LangSpan* span);

  // Maps byte offset in most recent GetOneScriptSpan/Lower
  // span->text [0..text_bytes] into an additional byte offset from
  // span->offset, to get back to corresponding text in the original
  // input buffer.
  // text_offset must be the first byte
  // of a UTF-8 character, or just beyond the last character. Normally this
  // routine is called with the first byte of an interesting range and
  // again with the first byte of the following range.
  int MapBack(int text_offset);

  // Returns the starting byte of the buffer being scanned.  Read-only
  // accessor, so it is const-qualified and usable on const scanners.
  const char* GetBufferStart() const {return start_byte_;}

 private:
  // Skip over tags and non-letters
  int SkipToFrontOfSpan(const char* src, int len, int* script);

  const char* start_byte_;        // Starting byte of buffer to scan
  const char* next_byte_;         // First unscanned byte
  const char* next_byte_limit_;   // Last byte + 1
  int byte_length_;               // Bytes left: next_byte_limit_ - next_byte_
  bool is_plain_text_;            // true for text, false for HTML
  char* script_buffer_;           // Holds text with expanded entities
  char* script_buffer_lower_;     // Holds lowercased text
  bool letters_marks_only_;       // To distinguish scriptspan of one
                                  // letters/marks vs. any mixture of text
  bool one_script_only_;          // To distinguish scriptspan of one
                                  // script vs. any mixture of scripts
  int exit_state_;                // For tag parser kTagParseTbl_0, based
                                  // on letters_marks_only_

 public:
  // Expose for debugging
  OffsetMap map2original_;    // map from script_buffer_ to buffer
  OffsetMap map2uplow_;       // map from script_buffer_lower_ to script_buffer_
};
} // namespace CLD2
#endif // I18N_ENCODINGS_CLD2_INTERNAL_GETONESCRIPTSPAN_H_

View File

@ -0,0 +1,31 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Cheap version of the fixed-width integer names: plain typedefs of the
// built-in types rather than <stdint.h>. The widths in the names hold only
// where char/short/int/long long are 8/16/32/64 bits wide.
namespace CLD2 {

// Signed integers
typedef signed char int8;
typedef short int16;
typedef int int32;
typedef long long int64;

// Unsigned integers
typedef unsigned char uint8;
typedef unsigned short uint16;
typedef unsigned uint32;
typedef unsigned long long uint64;

// 32-bit signed character value
typedef int32 char32;

}  // End namespace CLD2

View File

@ -0,0 +1,560 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// File: lang_script.cc
// ================
//
// Author: dsites@google.com (Dick Sites)
//
// This file declares language and script numbers and names for CLD2
//
#include "lang_script.h"
#include <stdlib.h>
#include <string.h>
#include "generated_language.h"
#include "generated_ulscript.h"
namespace CLD2 {
// Lookup tables used by the routines below. They are only declared here;
// the definitions live in another translation unit.
// NOTE(review): each k...Size constant presumably holds the entry count of
// the table with the matching base name -- confirm in the generated source.
//
// Language tables
// Subscripted by enum Language
extern const int kLanguageToNameSize;
extern const char* const kLanguageToName[];
extern const int kLanguageToCodeSize;
extern const char* const kLanguageToCode[];
extern const int kLanguageToCNameSize;
extern const char* const kLanguageToCName[];
extern const int kLanguageToScriptsSize;
extern const FourScripts kLanguageToScripts[];
// Subscripted by Language
extern const int kLanguageToPLangSize;
extern const uint8 kLanguageToPLang[];
// Subscripted by per-script language
// (kPLangToLanguageLatn for the Latin script, kPLangToLanguageOthr for the
// others; see FromPerScriptNumber)
extern const uint16 kPLangToLanguageLatn[];
extern const uint16 kPLangToLanguageOthr[];
// Alphabetical order for binary search
extern const int kNameToLanguageSize;
extern const CharIntPair kNameToLanguage[];
extern const int kCodeToLanguageSize;
extern const CharIntPair kCodeToLanguage[];
// ULScript tables
// Subscripted by enum ULScript
extern const int kULScriptToNameSize;
extern const char* const kULScriptToName[];
extern const int kULScriptToCodeSize;
extern const char* const kULScriptToCode[];
extern const int kULScriptToCNameSize;
extern const char* const kULScriptToCName[];
extern const int kULScriptToRtypeSize;
extern const ULScriptRType kULScriptToRtype[];
extern const int kULScriptToDefaultLangSize;
extern const Language kULScriptToDefaultLang[];
// Alphabetical order for binary search
extern const int kNameToULScriptSize;
extern const CharIntPair kNameToULScript[];
extern const int kCodeToULScriptSize;
extern const CharIntPair kCodeToULScript[];
//
// File: lang_script.h
// ================
//
// Author: dsites@google.com (Dick Sites)
//
// This file declares language and script numbers and names for CLD2
//
// NOTE: The script numbers and language numbers here are not guaranteed to be
// stable. If you want to record a result for posterity, save the ISO codes
// as character strings.
//
//
// The Unicode scripts recognized by CLD2 are numbered almost arbitrarily,
// specified in an enum. Each script has human-readable script name and a
// 4-letter ISO 15924 script code. Each has a C name (largely for use by
// programs that generate declarations in cld2_generated_scripts.h). Each
// also has a recognition type
// r_type: 0 script-only, 1 nilgrams, 2 quadgrams, 3 CJK
//
// The declarations for a particular version of Unicode are machine-generated in
// cld2_generated_scripts.h
//
// This file includes that one and declares the access routines. The type
// involved is called "ULScript" to signify Unicode Letters-Marks Scripts,
// which are not quite Unicode Scripts. In particular, the CJK scripts are
// merged into a single number because CLD2 recognizes the CJK languages from
// four scripts intermixed: Hani (both Hans and Hant), Hangul, Hiragana, and
// Katakana.
// Each script has one of these four recognition types.
// RTypeNone: There is no language associated with this script. In extended
// language recognition calls, return a fake language number that maps to
// xx-Cham, with literally "xx" for the language code, and with the script
// code instead of "Cham". In non-extended calls, return UNKNOWN_LANGUAGE.
// RTypeOne: The script maps 1:1 to a single language. No letters are examined
// during recognition and no lookups done.
// RTypeMany: The usual quadgram + delta-octagram + distinctive-words scoring
// is done to determine the languages involved.
// RTypeCJK: The CJK unigram + delta-bigram scoring is done to determine the
// languages involved.
//
// Note that the choice of recognition type is a function of script, not
// language. In particular, some languages are recognized in multiple scripts
// and those have different recognition types (Mongolian mn-Latn vs. mn-Mong
// for example).
//----------------------------------------------------------------------------//
// Functions of ULScript //
//----------------------------------------------------------------------------//
// Returns the name stored for ulscript in kULScriptToName.
// A value outside [0, NUM_ULSCRIPTS) is looked up as UNKNOWN_ULSCRIPT.
const char* ULScriptName(ULScript ulscript) {
  const int raw = ulscript;
  const bool in_range = (0 <= raw) && (raw < NUM_ULSCRIPTS);
  return kULScriptToName[in_range ? raw : UNKNOWN_ULSCRIPT];
}
// Returns the code stored for ulscript in kULScriptToCode.
// A value outside [0, NUM_ULSCRIPTS) is looked up as UNKNOWN_ULSCRIPT.
const char* ULScriptCode(ULScript ulscript) {
  int index = ulscript;
  if ((index < 0) || (index >= NUM_ULSCRIPTS)) {
    index = UNKNOWN_ULSCRIPT;
  }
  return kULScriptToCode[index];
}
// Returns the C name stored for ulscript in kULScriptToCName.
// A value outside [0, NUM_ULSCRIPTS) is looked up as UNKNOWN_ULSCRIPT.
const char* ULScriptDeclaredName(ULScript ulscript) {
  int index = ulscript;
  const bool out_of_range = (index < 0) || (index >= NUM_ULSCRIPTS);
  if (out_of_range) {index = UNKNOWN_ULSCRIPT;}
  return kULScriptToCName[index];
}
// Returns the recognition type stored for ulscript in kULScriptToRtype.
// A value outside [0, NUM_ULSCRIPTS) is looked up as UNKNOWN_ULSCRIPT.
ULScriptRType ULScriptRecognitionType(ULScript ulscript) {
  const int raw = ulscript;
  if ((0 <= raw) && (raw < NUM_ULSCRIPTS)) {
    return kULScriptToRtype[raw];
  }
  return kULScriptToRtype[UNKNOWN_ULSCRIPT];
}
// The languages recognized by CLD2 are numbered almost arbitrarily,
// specified in an enum. Each language has human-readable language name and a
// 2- or 3-letter ISO 639 language code. Each has a C name (largely for use by
// programs that generate declarations in cld2_generated_languages.h).
// Each has a list of up to four scripts in which it is currently recognized.
//
// The declarations for a particular set of recognized languages are
// machine-generated in
// cld2_generated_languages.h
//
// The Language enum is intended to match the internal Google Language enum
// in i18n/languages/proto/languages.proto up to NUM_LANGUAGES, with additional
// languages assigned above that. Over time, some languages may be renumbered
// if they are moved into the Language enum.
//
// The Language enum includes the fake language numbers for RTypeNone above.
//
// In an open-source environment, the Google-specific Language enum is not
// available. Language decouples the two environments while maintaining
// internal compatibility.
// If the input is out of range or otherwise unrecognized, it is treated
// as UNKNOWN_LANGUAGE
//
// LanguageCode
// ------------
// Given the Language, return the language code, e.g. "ko"
// This is determined by
// the following (in order of preference):
// - ISO-639-1 two-letter language code
// (all except those mentioned below)
// - ISO-639-2 three-letter bibliographic language code
// (Tibetan, Dhivehi, Cherokee, Syriac)
// - Google-specific language code
// (ChineseT ("zh-TW"), Teragram Unknown, Unknown,
// Portuguese-Portugal, Portuguese-Brazil, Limbu)
// - Fake RTypeNone names.
//----------------------------------------------------------------------------//
// Functions of Language //
//----------------------------------------------------------------------------//
// Returns the name stored for lang in kLanguageToName.
// A value outside [0, NUM_LANGUAGES) is looked up as UNKNOWN_LANGUAGE.
const char* LanguageName(Language lang) {
  const int raw = lang;
  const bool in_range = (0 <= raw) && (raw < NUM_LANGUAGES);
  return kLanguageToName[in_range ? raw : UNKNOWN_LANGUAGE];
}
// Returns the language code stored for lang in kLanguageToCode, e.g. "ko".
// A value outside [0, NUM_LANGUAGES) is looked up as UNKNOWN_LANGUAGE.
const char* LanguageCode(Language lang) {
  int index = lang;
  if ((index < 0) || (index >= NUM_LANGUAGES)) {
    index = UNKNOWN_LANGUAGE;
  }
  return kLanguageToCode[index];
}
// Returns the C name stored for lang in kLanguageToCName.
// A value outside [0, NUM_LANGUAGES) is looked up as UNKNOWN_LANGUAGE.
const char* LanguageDeclaredName(Language lang) {
  int index = lang;
  const bool out_of_range = (index < 0) || (index >= NUM_LANGUAGES);
  if (out_of_range) {index = UNKNOWN_LANGUAGE;}
  return kLanguageToCName[index];
}
// Returns the n-th script in which lang is recognized, read from
// kLanguageToScripts[lang][n].
//   lang: a value outside [0, NUM_LANGUAGES) is treated as UNKNOWN_LANGUAGE.
//   n:    slot within the per-language script list (documented as 0..3).
//         A slot outside the list yields UNKNOWN_ULSCRIPT instead of reading
//         past the end of the row.
// NOTE(review): the original comment says unused trailing slots hold an
// "unknown" filler value; the table contents are not visible here.
ULScript LanguageRecognizedScript(Language lang, int n) {
  // Number of script slots in one row of kLanguageToScripts
  const int kSlotsPerLanguage = static_cast<int>(
      sizeof(kLanguageToScripts[0]) / sizeof(kLanguageToScripts[0][0]));
  if ((n < 0) || (n >= kSlotsPerLanguage)) {
    return static_cast<ULScript>(UNKNOWN_ULSCRIPT);
  }
  int i_lang = lang;
  if ((i_lang < 0) || (i_lang >= NUM_LANGUAGES)) {i_lang = UNKNOWN_LANGUAGE;}
  return static_cast<ULScript>(kLanguageToScripts[i_lang][n]);
}
// Given the Language, returns its string name used as the output by
// the lang/enc identifier, e.g. "Korean"
// Forwards directly to LanguageName(), so an out-of-range lang yields the
// name stored for UNKNOWN_LANGUAGE.
// NOTE(review): this comment used to promise "invalid_language" for invalid
// input; whether the UNKNOWN_LANGUAGE table entry is spelled that way is not
// visible here -- confirm.
// TG_UNKNOWN_LANGUAGE is used as a placeholder for the "ignore me" language,
// used to subtract out HTML, link farms, DNA strings, and a little English porn
const char* ExtLanguageName(const Language lang) {
return LanguageName(lang);
}
// Given the Language, return the language code, e.g. "ko"
// Forwards directly to LanguageCode(), so an out-of-range lang yields the
// code stored for UNKNOWN_LANGUAGE.
const char* ExtLanguageCode(const Language lang) {
return LanguageCode(lang);
}
// Given the Language, returns its Language enum spelling, for use by
// programs that create C declarations, e.g. "KOREAN"
// Forwards directly to LanguageDeclaredName(), so an out-of-range lang
// yields the C name stored for UNKNOWN_LANGUAGE (presumably the string
// "UNKNOWN_LANGUAGE" -- confirm against the generated table).
const char* ExtLanguageDeclaredName(const Language lang) {
return LanguageDeclaredName(lang);
}
// NOTE(review): presumably the number of distinct LanguageCloseSet() results
// (0 plus sets 1..9) -- confirm against users of this constant.
extern const int kCloseSetSize = 10;

// Returns the number (1..9) of the statistically-close language set that
// contains lang, or 0 when lang is in none of them.
int LanguageCloseSet(Language lang) {
  // Scaffolding: pairs first (with their measured coefficients), then the
  // larger groups.

  // id ms          coef=0.4698  Problematic w/o extra words
  if ((lang == INDONESIAN) || (lang == MALAY)) {return 1;}
  // bo dz          coef=0.4571
  if ((lang == TIBETAN) || (lang == DZONGKHA)) {return 2;}
  // cs sk          coef=0.4273
  if ((lang == CZECH) || (lang == SLOVAK)) {return 3;}
  // zu xh          coef=0.3716
  if ((lang == ZULU) || (lang == XHOSA)) {return 4;}

  // bs hr sr srm
  if ((lang == BOSNIAN) || (lang == CROATIAN) ||
      (lang == SERBIAN) || (lang == MONTENEGRIN)) {
    return 5;
  }
  // hi mr bh ne
  if ((lang == HINDI) || (lang == MARATHI) ||
      (lang == BIHARI) || (lang == NEPALI)) {
    return 6;
  }
  // no nn da
  if ((lang == NORWEGIAN) || (lang == NORWEGIAN_N) || (lang == DANISH)) {
    return 7;
  }
  // gl es pt
  if ((lang == GALICIAN) || (lang == SPANISH) || (lang == PORTUGUESE)) {
    return 8;
  }
  // rw rn
  if ((lang == KINYARWANDA) || (lang == RUNDI)) {return 9;}

  return 0;
}
//----------------------------------------------------------------------------//
// Functions of ULScript and Language //
//----------------------------------------------------------------------------//
// Returns the entry for ulscript in kULScriptToDefaultLang, or
// UNKNOWN_LANGUAGE when ulscript is outside [0, NUM_ULSCRIPTS).
Language DefaultLanguage(ULScript ulscript) {
  const bool valid = (0 <= ulscript) && (ulscript < NUM_ULSCRIPTS);
  return valid ? kULScriptToDefaultLang[ulscript] : UNKNOWN_LANGUAGE;
}
// Maps lang to its one-byte per-script language number.
//   Returns 0 when ulscript is outside [0, NUM_ULSCRIPTS) or lang is outside
//   [0, kLanguageToPLangSize).
//   Returns 1 for any script whose recognition type is RTypeNone.
//   Otherwise returns kLanguageToPLang[lang].
// The lower bound on lang is checked through an int copy, as LanguageName()
// does, so a negative value cast to Language cannot index before the table.
uint8 PerScriptNumber(ULScript ulscript, Language lang) {
  if ((ulscript < 0) || (ulscript >= NUM_ULSCRIPTS)) {return 0;}
  if (kULScriptToRtype[ulscript] == RTypeNone) {return 1;}
  const int i_lang = lang;
  if ((i_lang < 0) || (i_lang >= kLanguageToPLangSize)) {return 0;}
  return kLanguageToPLang[i_lang];
}
// Inverse of PerScriptNumber: maps a per-script number back to a Language.
//   Out-of-range ulscript            -> UNKNOWN_LANGUAGE
//   RTypeNone / RTypeOne scripts     -> that script's default language
//   Latin script                     -> kPLangToLanguageLatn[perscript_number]
//   every other script               -> kPLangToLanguageOthr[perscript_number]
Language FromPerScriptNumber(ULScript ulscript, uint8 perscript_number) {
  const bool valid = (0 <= ulscript) && (ulscript < NUM_ULSCRIPTS);
  if (!valid) {return UNKNOWN_LANGUAGE;}

  const ULScriptRType rtype = kULScriptToRtype[ulscript];
  if ((rtype == RTypeNone) || (rtype == RTypeOne)) {
    return kULScriptToDefaultLang[ulscript];
  }

  const uint16* table = (ulscript == ULScript_Latin) ? kPLangToLanguageLatn
                                                     : kPLangToLanguageOthr;
  return static_cast<Language>(table[perscript_number]);
}
// Return true if language can be in the Latin script
// Returns true when lang round-trips through the Latin per-script table,
// i.e. kPLangToLanguageLatn[kLanguageToPLang[lang]] == lang.
// Returns false for lang outside [0, kLanguageToPLangSize); the lower bound
// is checked through an int copy so a negative value cast to Language cannot
// index before kLanguageToPLang.
bool IsLatnLanguage(Language lang) {
  const int i_lang = lang;
  if ((i_lang < 0) || (i_lang >= kLanguageToPLangSize)) {return false;}
  return (i_lang == kPLangToLanguageLatn[kLanguageToPLang[i_lang]]);
}
// Return true if language can be in a non-Latin script
// Returns true when lang round-trips through the non-Latin per-script table,
// i.e. kPLangToLanguageOthr[kLanguageToPLang[lang]] == lang.
// Returns false for lang outside [0, kLanguageToPLangSize); the lower bound
// is checked through an int copy so a negative value cast to Language cannot
// index before kLanguageToPLang.
bool IsOthrLanguage(Language lang) {
  const int i_lang = lang;
  if ((i_lang < 0) || (i_lang >= kLanguageToPLangSize)) {return false;}
  return (i_lang == kPLangToLanguageOthr[kLanguageToPLang[i_lang]]);
}
//----------------------------------------------------------------------------//
// Other //
//----------------------------------------------------------------------------//
// Binary search over cipair[lo..hi), which must be sorted by .s in strcmp
// order. Returns the subscript of an entry whose .s equals key, or -1 when
// no entry in the half-open range matches (including when lo >= hi).
int BinarySearch(const char* key, int lo, int hi, const CharIntPair* cipair) {
  while (lo < hi) {
    // Midpoint written so that lo + hi cannot overflow
    const int mid = lo + ((hi - lo) >> 1);
    // Compare once per probe and reuse the result
    const int cmp = strcmp(key, cipair[mid].s);
    if (cmp == 0) {return mid;}
    if (cmp < 0) {
      hi = mid;
    } else {
      lo = mid + 1;
    }
  }
  return -1;
}
// Converts a raw integer (e.g. the .i field of a CharIntPair table entry)
// to a Language. No range checking is done.
Language MakeLang(int i) {return static_cast<Language>(i);}
// Name can be either full name or ISO code, or can be ISO code embedded in
// a language-script combination such as "ABKHAZIAN", "en", "en-Latn-GB"
Language GetLanguageFromName(const char* src) {
const char* hyphen1 = strchr(src, '-');
const char* hyphen2 = NULL;
if (hyphen1 != NULL) {hyphen2 = strchr(hyphen1 + 1, '-');}
int match = -1;
if (hyphen1 == NULL) {
// Bare name. Look at full name, then code
match = BinarySearch(src, 0, kNameToLanguageSize, kNameToLanguage);
if (match >= 0) {return MakeLang(kNameToLanguage[match].i);} // aa
match = BinarySearch(src, 0, kCodeToLanguageSize, kCodeToLanguage);
if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa
return UNKNOWN_LANGUAGE;
}
if (hyphen2 == NULL) {
// aa-bb. Not a full name; must be code-something. Try zh-TW then bare zh
match = BinarySearch(src, 0, kCodeToLanguageSize, kCodeToLanguage);
if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa-bb
int len = strlen(src);
if (len >= 16) {return UNKNOWN_LANGUAGE;} // Real codes are shorter
char temp[16];
int hyphen1_offset = hyphen1 - src;
// Take off part after hyphen1
memcpy(temp, src, len);
temp[hyphen1_offset] = '\0';
match = BinarySearch(temp, 0, kCodeToLanguageSize, kCodeToLanguage);
if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa
return UNKNOWN_LANGUAGE;
}
// aa-bb-cc. Must be code-something. Try en-Latn-US, en-Latn, en-US, en
match = BinarySearch(src, 0, kCodeToLanguageSize, kCodeToLanguage);
if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa-bb-cc
int len = strlen(src);
if (len >= 16) {return UNKNOWN_LANGUAGE;} // Real codes are shorter
char temp[16];
int hyphen1_offset = hyphen1 - src;
int hyphen2_offset = hyphen2 - src;
// Take off part after hyphen2
memcpy(temp, src, len);
temp[hyphen2_offset] = '\0';
match = BinarySearch(temp, 0, kCodeToLanguageSize, kCodeToLanguage);
if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa-bb
// Take off part between hyphen1 and hyphen2
int len2 = len - hyphen2_offset;
memcpy(temp, src, len);
memcpy(&temp[hyphen1_offset], hyphen2, len2);
temp[hyphen1_offset + len2] = '\0';
match = BinarySearch(temp, 0, kCodeToLanguageSize, kCodeToLanguage);
if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa-cc
// Take off everything after hyphen1
memcpy(temp, src, len);
temp[hyphen1_offset] = '\0';
match = BinarySearch(temp, 0, kCodeToLanguageSize, kCodeToLanguage);
if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa
return UNKNOWN_LANGUAGE;
}
// Name can be either full name or ISO code, or can be ISO code embedded in
// a language-script combination such as "en-Latn-GB"
// MORE WORK to do here. also kLanguageToScripts [4] is bogus
// if bare language name, no script, want zh, ja, ko to Hani, pt to Latn, etc.
// Something like map code to Language, then Language to kLanguageToScripts[x][0]
// ADD BIAS: kLanguageToScripts lists default script first
// If total mismatch, return Latn
// if (strcmp(src, "nd") == 0) {return NDEBELE;} // [nd was wrong]
// if (strcmp(src, "sit-NP-Limb") == 0) {return ULScript_Limbu;}
// Converts a raw integer (e.g. the .i field of a CharIntPair table entry)
// to a ULScript. No range checking is done.
ULScript MakeULScr(int i) {return static_cast<ULScript>(i);}
ULScript GetULScriptFromName(const char* src) {
const char* hyphen1 = strchr(src, '-');
const char* hyphen2 = NULL;
if (hyphen1 != NULL) {hyphen2 = strchr(hyphen1 + 1, '-');}
int match = -1;
if (hyphen1 == NULL) {
// Bare name. Look at full name, then code, then try backmapping as Language
match = BinarySearch(src, 0, kNameToULScriptSize, kNameToULScript);
if (match >= 0) {return MakeULScr(kNameToULScript[match].i);} // aa
match = BinarySearch(src, 0, kCodeToULScriptSize, kCodeToULScript);
if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // aa
Language backmap_me = GetLanguageFromName(src);
if (backmap_me != UNKNOWN_LANGUAGE) {
return static_cast<ULScript>(kLanguageToScripts[backmap_me][0]);
}
return ULScript_Latin;
}
if (hyphen2 == NULL) {
// aa-bb. Not a full name; must be code-something. Try en-Latn, bare Latn
if (strcmp(src, "zh-TW") == 0) {return ULScript_Hani;}
if (strcmp(src, "zh-CN") == 0) {return ULScript_Hani;}
if (strcmp(src, "sit-NP") == 0) {return ULScript_Limbu;}
if (strcmp(src, "sit-Limb") == 0) {return ULScript_Limbu;}
if (strcmp(src, "sr-ME") == 0) {return ULScript_Latin;}
match = BinarySearch(src, 0, kCodeToULScriptSize, kCodeToULScript);
if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // aa-bb
int len = strlen(src);
if (len >= 16) {return ULScript_Latin;} // Real codes are shorter
char temp[16];
int hyphen1_offset = hyphen1 - src;
int len1 = len - hyphen1_offset - 1; // Exclude the hyphen
// Take off part before hyphen1
memcpy(temp, hyphen1 + 1, len1);
temp[len1] = '\0';
match = BinarySearch(temp, 0, kCodeToULScriptSize, kCodeToULScript);
if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // bb
// Take off part after hyphen1
memcpy(temp, src, len);
temp[hyphen1_offset] = '\0';
match = BinarySearch(temp, 0, kCodeToULScriptSize, kCodeToULScript);
if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // aa
return ULScript_Latin;
}
// aa-bb-cc. Must be code-something. Try en-Latn-US, en-Latn, en-US, en
if (strcmp(src, "sit-NP-Limb") == 0) {return ULScript_Limbu;}
if (strcmp(src, "sr-ME-Latn") == 0) {return ULScript_Latin;}
if (strcmp(src, "sr-ME-Cyrl") == 0) {return ULScript_Cyrillic;}
match = BinarySearch(src, 0, kCodeToULScriptSize, kCodeToULScript);
if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // aa-bb-cc
int len = strlen(src);
if (len >= 16) {return ULScript_Latin;} // Real codes are shorter
char temp[16];
int hyphen1_offset = hyphen1 - src;
int hyphen2_offset = hyphen2 - src;
int len2 = len - hyphen2_offset - 1; // Exclude the hyphen
int lenmid = hyphen2_offset - hyphen1_offset - 1; // Exclude the hyphen
// Keep part between hyphen1 and hyphen2
memcpy(temp, hyphen1 + 1, lenmid);
temp[lenmid] = '\0';
match = BinarySearch(temp, 0, kCodeToULScriptSize, kCodeToULScript);
if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // bb
// Keep part after hyphen2
memcpy(temp, hyphen2 + 1, len2);
temp[len2] = '\0';
match = BinarySearch(temp, 0, kCodeToULScriptSize, kCodeToULScript);
if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // cc
// Keep part before hyphen1
memcpy(temp, src, len);
temp[hyphen1_offset] = '\0';
match = BinarySearch(temp, 0, kCodeToULScriptSize, kCodeToULScript);
if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // aa
return ULScript_Latin;
}
// Collapses a script into one of four classes:
//   0 Latin, 1 Cyrillic, 2 Arabic, 3 everything else
int LScript4(ULScript ulscript) {
  switch (ulscript) {
    case ULScript_Latin:
      return 0;
    case ULScript_Cyrillic:
      return 1;
    case ULScript_Arabic:
      return 2;
    default:
      return 3;
  }
}
} // namespace CLD2

View File

@ -0,0 +1,187 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// File: lang_script.h
// ================
//
// Author: dsites@google.com (Dick Sites)
//
// This file declares language and script numbers and names for CLD2,
// plus routines that access side tables based on these
//
#ifndef I18N_ENCODINGS_CLD2_LANG_SCRIPT_H__
#define I18N_ENCODINGS_CLD2_LANG_SCRIPT_H__
#include "generated_language.h"
#include "generated_ulscript.h"
#include "integral_types.h"
// NOTE: The script numbers and language numbers here are not guaranteed to be
// stable. If you want to record a result for posterity, save the
// ULScriptCode(ULScript ulscript) result as character strings.
//
// The Unicode scripts recognized by CLD2 are numbered almost arbitrarily,
// specified in an enum. Each script has human-readable script name and a
// 4-letter ISO 15924 script code. Each has a C name (largely for use by
// programs that generate declarations in cld2_generated_scripts.h). Each
// also has a recognition type
// r_type: 0 script-only, 1 nilgrams, 2 quadgrams, 3 CJK
//
// The declarations for a particular version of Unicode are machine-generated in
// generated_scripts.h
//
// This file includes that one and declares the access routines. The type
// involved is called "ULScript" to signify Unicode Letters-Marks Scripts,
// which are not quite Unicode Scripts. In particular, the CJK scripts are
// merged into a single number because CLD2 recognizes the CJK languages from
// four scripts intermixed: Hani (both Hans and Hant), Hangul, Hiragana, and
// Katakana.
// Each script has one of these four recognition types.
// RTypeNone: There is no language associated with this script. In extended
// language recognition calls, return a fake language number that maps to
// xx-Cham, with literally "xx" for the language code, and with the script
// code instead of "Cham". In non-extended calls, return UNKNOWN_LANGUAGE.
// RTypeOne: The script maps 1:1 to a single language. No letters are examined
// during recognition and no lookups done.
// RTypeMany: The usual quadgram + delta-octagram + distinctive-words scoring
// is done to determine the languages involved.
// RTypeCJK: The CJK unigram + delta-bigram scoring is done to determine the
// languages involved.
//
// Note that the choice of recognition type is a function of script, not
// language. In particular, some languages are recognized in multiple scripts
// and those have different recognition types (Mongolian mn-Latn vs. mn-Mong
// for example).
namespace CLD2 {
//----------------------------------------------------------------------------//
// Functions of ULScript //
//----------------------------------------------------------------------------//
// Accessors for the per-script tables: name, code, C (declared) name and
// recognition type.
// If the input is out of range or otherwise unrecognized, it is treated
// as ULScript_Common (which never participates in language recognition)
// NOTE(review): the definitions in lang_script.cc substitute
// UNKNOWN_ULSCRIPT for out-of-range input; presumably the same value as
// ULScript_Common -- confirm in generated_ulscript.h.
const char* ULScriptName(ULScript ulscript);
const char* ULScriptCode(ULScript ulscript);
const char* ULScriptDeclaredName(ULScript ulscript);
ULScriptRType ULScriptRecognitionType(ULScript ulscript);
// Name can be either full name or ISO code, or can be ISO code embedded in
// a language-script combination such as "en-Latn-GB"
// The definition returns ULScript_Latin when nothing matches.
ULScript GetULScriptFromName(const char* src);
// Map script into Latin, Cyrillic, Arabic, Other
// (returned as 0, 1, 2, 3 respectively)
int LScript4(ULScript ulscript);
//----------------------------------------------------------------------------//
// Functions of Language //
//----------------------------------------------------------------------------//
// The languages recognized by CLD2 are numbered almost arbitrarily,
// specified in an enum. Each language has human-readable language name and a
// 2- or 3-letter ISO 639 language code. Each has a C name (largely for use by
// programs that generate declarations in cld2_generated_languages.h).
// Each has a list of up to four scripts in which it is currently recognized.
//
// The declarations for a particular set of recognized languages are
// machine-generated in
// generated_languages.h
//
// The Language enum is intended to match the internal Google Language enum
// in i18n/languages/proto/languages.proto up to NUM_LANGUAGES, with additional
// languages assigned above that. Over time, some languages may be renumbered
// if they are moved into the Language enum.
//
// The Language enum includes the fake language numbers for RTypeNone above.
//
// If the input is out of range or otherwise unrecognized, it is treated
// as UNKNOWN_LANGUAGE
//
// LanguageCode
// ------------
// Given the Language, return the language code, e.g. "ko"
// This is determined by
// the following (in order of preference):
// - ISO-639-1 two-letter language code
// (all except those mentioned below)
// - ISO-639-2 three-letter bibliographic language code
// (Tibetan, Dhivehi, Cherokee, Syriac)
// - Google-specific language code
// (ChineseT ("zh-TW"), Teragram Unknown, Unknown,
// Portuguese-Portugal, Portuguese-Brazil, Limbu)
// - Fake RTypeNone names.
// Accessors for the per-language tables: name, code and C (declared) name.
// Out-of-range input is treated as UNKNOWN_LANGUAGE.
const char* LanguageName(Language lang);
const char* LanguageCode(Language lang);
// NOTE(review): no definition of LanguageShortCode appears in
// lang_script.cc alongside the other accessors; calling it would
// presumably fail to link -- confirm whether it is defined elsewhere.
const char* LanguageShortCode(Language lang);
const char* LanguageDeclaredName(Language lang);
// n is in 0..3. Trailing entries are filled with
// ULScript_Common (which never participates in language recognition)
// The definition does not range-check n.
ULScript LanguageRecognizedScript(Language lang, int n);
// Name can be either full name or ISO code, or can be ISO code embedded in
// a language-script combination such as "en-Latn-GB"
// The definition returns UNKNOWN_LANGUAGE when nothing matches.
Language GetLanguageFromName(const char* src);
// Returns which set of statistically-close languages lang is in. 0 means none.
// The definition returns set numbers 1..9.
int LanguageCloseSet(Language lang);
//----------------------------------------------------------------------------//
// Functions of ULScript and Language //
//----------------------------------------------------------------------------//
// Most common language in each script
// (UNKNOWN_LANGUAGE for an out-of-range ulscript)
Language DefaultLanguage(ULScript ulscript);
// For RTypeMany recognition,
// the CLD2 lookup tables are kept small by encoding a language into one byte.
// To avoid limiting CLD2 to at most 256 languages, a larger range of external
// Language numbers is mapped to a smaller range of per-script numbers. At
// the moment (January 2013) the Latin script has about 90 languages to be
// recognized, while all the other scripts total about 50 more languages. In
// addition, the RTypeNone scripts map to about 100 fake languages.
// So we map all Latin-script languages to one range of 1..255 per-script
// numbers and map all the other RTypeMany languages to an overlapping range
// 1..255 of per-script numbers.
//
// PerScriptNumber returns 0 for out-of-range arguments and 1 for any
// RTypeNone script. FromPerScriptNumber is the reverse mapping; it returns
// the script's default language for RTypeNone and RTypeOne scripts and
// UNKNOWN_LANGUAGE for an out-of-range ulscript.
uint8 PerScriptNumber(ULScript ulscript, Language lang);
Language FromPerScriptNumber(ULScript ulscript, uint8 perscript_number);
// While the speed-sensitive processing deals with per-script language numbers,
// there is a need for low-performance dealing with original language numbers
// and unknown scripts, mostly for processing language hints.
// These routines let one derive a script class from a bare language.
// For languages written in multiple scripts, both of these can return true.
bool IsLatnLanguage(Language lang);
bool IsOthrLanguage(Language lang);
//----------------------------------------------------------------------------//
// Other //
//----------------------------------------------------------------------------//
// Utility routine to search alphabetical tables
// Searches cipair[lo..hi) for an entry whose string equals key (strcmp).
// Returns the subscript of the matching entry, or -1 if there is none.
int BinarySearch(const char* key, int lo, int hi, const CharIntPair* cipair);
} // namespace CLD2
#endif // I18N_ENCODINGS_CLD2_LANG_SCRIPT_H__

View File

@ -0,0 +1,40 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Author: dsites@google.com (Dick Sites)
//
#ifndef I18N_ENCODINGS_CLD2_INTERNAL_LANGSPAN_H_
#define I18N_ENCODINGS_CLD2_INTERNAL_LANGSPAN_H_
#include "generated_language.h"
#include "generated_ulscript.h"
namespace CLD2 {
// Describes one span of text handed out for language identification:
// where its bytes are, where it came from in the original input, and the
// script/language associated with it.
typedef struct {
char* text; // Pointer to the span, somewhere
// (NOTE(review): ownership of this memory is not
// visible here -- confirm with the producer)
int text_bytes; // Number of bytes of text in the span
int offset; // Offset of start of span in original input buffer
ULScript ulscript; // Unicode Letters Script of this span
Language lang; // Language identified for this span
bool truncated; // true if buffer filled up before a
// different script or EOF was found
} LangSpan;
} // namespace CLD2
#endif // I18N_ENCODINGS_CLD2_INTERNAL_LANGSPAN_H_

View File

@ -0,0 +1,569 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Author: dsites@google.com (Dick Sites)
//
//
#include "offsetmap.h"
#include <string.h> // for strcmp
#include <stdio.h> // for fprintf, stderr, fclose, etc
#include <algorithm> // for min
using namespace std;
namespace CLD2 {
// Constructor, destructor
// The constructor sets up an empty map by delegating to Clear().
OffsetMap::OffsetMap() {
Clear();
}
// Destructor: no explicit cleanup is performed here.
OffsetMap::~OffsetMap() {
}
// Empties the map and returns every cursor to the start.
// Afterwards:
//   diffs_ is empty and the pending operation is a zero-length COPY_OP
//   next_diff_sub_ is 0
//   the current a and a' windows are both the zero-width range [0, 0),
//     i.e. a fake range mapping 0=>0, with current_diff_ 0
//   max_aoffset_ and max_aprimeoffset_ are 0
void OffsetMap::Clear() {
  // Recorded diffs and the operation still being accumulated
  diffs_.clear();
  pending_length_ = 0;
  pending_op_ = COPY_OP;

  // Read cursor and its current windows
  next_diff_sub_ = 0;
  current_diff_ = 0;
  current_lo_aoffset_ = 0;
  current_lo_aprimeoffset_ = 0;
  current_hi_aoffset_ = 0;
  current_hi_aprimeoffset_ = 0;

  // Largest offsets seen so far
  max_aoffset_ = 0;
  max_aprimeoffset_ = 0;
}
// Extract the 2-bit operation code held in the top two bits of a diff byte.
// The mask discards any sign-extension bits introduced by the shift.
static inline char OpPart(const char c) {
  const int shifted = c >> 6;
  return shifted & 3;
}
// Extract the 6-bit length field held in the low six bits of a diff byte.
static inline char LenPart(const char c) {
  const int low_six = c & 0x3f;
  return low_six;
}
// Print map to file, for debugging.
// filename: "stdout" or "stderr" select the corresponding standard stream;
//           any other value is opened with fopen(filename, "w") and closed
//           again before returning.
// If the file cannot be opened, a message is written to stderr and nothing
// else happens. Any pending operation is flushed into the map first, so
// calling this changes the internal state.
void OffsetMap::Printmap(const char* filename) {
  FILE* fout;
  bool needs_close = false;
  if (strcmp(filename, "stdout") == 0) {
    fout = stdout;
  } else if (strcmp(filename, "stderr") == 0) {
    fout = stderr;
  } else {
    fout = fopen(filename, "w");
    needs_close = true;
  }
  if (fout == NULL) {
    fprintf(stderr, "%s did not open\n", filename);
    return;
  }

  Flush();    // Make sure any pending entry gets printed
  // size() yields a size_t, which need not have the same width as long;
  // cast so that the argument really matches the %ld conversion.
  fprintf(fout, "Offsetmap: %ld bytes\n", static_cast<long>(diffs_.size()));
  for (int i = 0; i < static_cast<int>(diffs_.size()); ++i) {
    // One entry per raw byte: op character followed by its low six bits
    fprintf(fout, "%c%02d ", "&=+-"[OpPart(diffs_[i])], LenPart(diffs_[i]));
    if ((i % 20) == 19) {fprintf(fout, "\n");}    // 20 entries per line
  }
  fprintf(fout, "\n");
  if (needs_close) {
    fclose(fout);
  }
}
// Reset to offset 0
void OffsetMap::Reset() {
MaybeFlushAll();
next_diff_sub_ = 0;
current_lo_aoffset_ = 0;
current_hi_aoffset_ = 0;
current_lo_aprimeoffset_ = 0;
current_hi_aprimeoffset_ = 0;
current_diff_ = 0;
}
// Add to mapping from A to A', specifying how many next bytes are
// identical in A and A'. A count of zero is ignored.
void OffsetMap::Copy(int bytes) {
  if (bytes == 0) {return;}

  // A copy advances both texts
  max_aoffset_ += bytes;          // Largest seen so far
  max_aprimeoffset_ += bytes;     // Largest seen so far

  if (pending_op_ != COPY_OP) {
    // Different kind of operation pending: emit it and start a new copy
    Flush();
    pending_op_ = COPY_OP;
    pending_length_ = bytes;
    return;
  }
  // Extend the copy that is already pending
  pending_length_ += bytes;
}
// Add to mapping from A to A', specifying how many next bytes are
// inserted in A' while not advancing in A at all. A count of zero is ignored.
void OffsetMap::Insert(int bytes){
  if (bytes == 0) {return;}
  max_aprimeoffset_ += bytes;     // Largest seen so far

  if (pending_op_ == INSERT_OP) {
    // Extend the insert that is already pending
    pending_length_ += bytes;
    return;
  }

  // Special-case exactly delete(1) insert(1) => copy(1);
  // all others backmap inserts to after deletes
  const bool cancels_single_delete =
      (bytes == 1) && (pending_op_ == DELETE_OP) && (pending_length_ == 1);
  if (cancels_single_delete) {
    pending_op_ = COPY_OP;        // pending_length_ stays 1
    return;
  }

  Flush();
  pending_op_ = INSERT_OP;
  pending_length_ = bytes;
}
// Add to mapping from A to A', specifying how many next bytes are
// deleted from A while not advancing in A' at all. A count of zero is ignored.
void OffsetMap::Delete(int bytes){
  if (bytes == 0) {return;}
  max_aoffset_ += bytes;          // Largest seen so far

  if (pending_op_ == DELETE_OP) {
    // Extend the delete that is already pending
    pending_length_ += bytes;
    return;
  }

  // Special-case exactly insert(1) delete(1) => copy(1);
  // all others backmap deletes to after inserts
  const bool cancels_single_insert =
      (bytes == 1) && (pending_op_ == INSERT_OP) && (pending_length_ == 1);
  if (cancels_single_insert) {
    pending_op_ = COPY_OP;        // pending_length_ stays 1
    return;
  }

  Flush();
  pending_op_ = DELETE_OP;
  pending_length_ = bytes;
}
void OffsetMap::Flush() {
if (pending_length_ == 0) {
return;
}
// We may be emitting a copy op just after a copy op because +1 -1 cancelled
// inbetween. If the lengths don't need a prefix byte, combine them
if ((pending_op_ == COPY_OP) && !diffs_.empty()) {
char c = diffs_[diffs_.size() - 1];
MapOp prior_op = static_cast<MapOp>(OpPart(c));
int prior_len = LenPart(c);
if ((prior_op == COPY_OP) && ((prior_len + pending_length_) <= 0x3f)) {
diffs_[diffs_.size() - 1] += pending_length_;
pending_length_ = 0;
return;
}
}
if (pending_length_ > 0x3f) {
bool non_zero_emitted = false;
for (int shift = 30; shift > 0; shift -= 6) {
int prefix = (pending_length_ >> shift) & 0x3f;
if ((prefix > 0) || non_zero_emitted) {
Emit(PREFIX_OP, prefix);
non_zero_emitted = true;
}
}
}
Emit(pending_op_, pending_length_ & 0x3f);
pending_length_ = 0;
}
// Finish the map: account for one extra copied byte just past the end,
// then write out whatever is pending.
void OffsetMap::FlushAll() {
  Copy(1);    // One byte off the end
  Flush();
}
// Flush all if necessary, i.e. when an operation is still pending or when
// nothing has been written to diffs_ yet.
void OffsetMap::MaybeFlushAll() {
  const bool nothing_pending = (pending_length_ == 0);
  if (nothing_pending && !diffs_.empty()) {return;}
  FlushAll();
}
// Append one byte to diffs_: op in the top two bits, the low six bits of len
// below it. Len may be 0, for example as the low piece of length=64
void OffsetMap::Emit(MapOp op, int len) {
  const int op_bits = static_cast<char>(op) << 6;
  const int len_bits = len & 0x3f;
  const char packed = op_bits | len_bits;
  diffs_.push_back(packed);
}
void OffsetMap::DumpString() {
for (int i = 0; i < static_cast<int>(diffs_.size()); ++i) {
fprintf(stderr, "%c%02d ", "&=+-"[OpPart(diffs_[i])], LenPart(diffs_[i]));
}
fprintf(stderr, "\n");
// Print running table of correspondences
fprintf(stderr, " op A => A' (A forward-maps to A')\n");
int aoffset = 0;
int aprimeoffset = 0;
int length = 0;
for (int i = 0; i < static_cast<int>(diffs_.size()); ++i) {
char c = diffs_[i];
MapOp op = static_cast<MapOp>(OpPart(c));
int len = LenPart(c);
length = (length << 6) + len;
if (op == COPY_OP) {
aoffset += length;
aprimeoffset += length;
length = 0;
} else if (op == INSERT_OP) {
aoffset += 0;
aprimeoffset += length;
length = 0;
} else if (op == DELETE_OP) {
aoffset += length;
aprimeoffset += 0;
length = 0;
} else { // (op == PREFIX_OP)
// Do nothing else
}
fprintf(stderr, "[%3d] %c%02d %6d %6d%s\n",
i, "&=+-"[op], len,
aoffset, aprimeoffset,
(next_diff_sub_ == i) ? " <==next_diff_sub_" : "");
}
fprintf(stderr, "\n");
}
// For debugging only; writes to stderr.
// Prints the largest offsets seen, the position in diffs_, the active
// A and A' windows and their difference, then the full map via DumpString().
void OffsetMap::DumpWindow() {
  fprintf(stderr, "DumpWindow(A => A'): max_aoffset_ = %d, "
                  "max_aprimeoffset_ = %d, next_diff_sub_ = %d<br>\n",
          max_aoffset_, max_aprimeoffset_, next_diff_sub_);
  // The window bounds are ints, so print them with %d (as PrintPosition does)
  // rather than %u.
  fprintf(stderr, "A [%d..%d)\n",
          current_lo_aoffset_, current_hi_aoffset_);
  fprintf(stderr, "A' [%d..%d)\n",
          current_lo_aprimeoffset_, current_hi_aprimeoffset_);
  fprintf(stderr, " diff = %d\n", current_diff_);
  DumpString();
}
//----------------------------------------------------------------------------//
// The guts of the 2013 design //
// If there are three ranges a b c in diffs_, we can be in one of five //
// states: LEFT of a, in ranges a b c, or RIGHT of c //
// In each state, there are windows A[Alo..Ahi), A'[A'lo..A'hi) and diffs_ //
// position next_diff_sub_ //
// There also are mapping constants max_aoffset_ and max_aprimeoffset_ //
// If LEFT, Alo=Ahi=0, A'lo=A'hi=0 and next_diff_sub_=0 //
// If RIGHT, Alo=Ahi=max_aoffset_, A'lo=A'hi=max_aprimeoffset_ and //
// next_diff_sub_=diffs_.size() //
// Otherwise, at least one of A[) and A'[) is non-empty and the first bytes //
// correspond to each other. If range i is active, next_diff_sub_ is at //
// the first byte of range i+1. Because of the length-prefix operator, //
// an individual range item in diffs_ may be multiple bytes //
// In all cases aprimeoffset = aoffset + current_diff_ //
// i.e. current_diff_ = aprimeoffset - aoffset //
// //
// In the degenerate case of diffs_.empty(), there are only two states //
// LEFT and RIGHT and the mapping is the identity mapping. //
// The initial state is LEFT. //
// It is an error to move left into LEFT or right into RIGHT, but the code //
// below is robust in these cases. //
//----------------------------------------------------------------------------//
// Enter the LEFT state: empty windows at offset 0 in both A and A', zero
// difference, and the diffs_ position at the very beginning.
void OffsetMap::SetLeft() {
  next_diff_sub_ = 0;
  current_diff_ = 0;
  current_lo_aoffset_ = current_hi_aoffset_ = 0;
  current_lo_aprimeoffset_ = current_hi_aprimeoffset_ = 0;
}
// Enter the RIGHT state: empty windows sitting at the largest offsets seen
// in A and A', with current_diff_ equal to the difference of those maxima.
void OffsetMap::SetRight() {
  current_lo_aoffset_ = current_hi_aoffset_ = max_aoffset_;
  current_lo_aprimeoffset_ = current_hi_aprimeoffset_ = max_aprimeoffset_;
  current_diff_ = max_aprimeoffset_ - max_aoffset_;
  // NOTE(review): the design comment above describes RIGHT as having
  // next_diff_sub_ == diffs_.size(), but 0 is what is stored here -- confirm
  // which is intended before relying on either.
  next_diff_sub_ = 0;
}
// Back up over previous range, 1..5 bytes
// Return subscript at the beginning of that. Pins at 0
//
// sub is a subscript just past the end of a range in diffs_. The range's
// final byte is stepped over unconditionally; after that, stepping continues
// for as long as the byte before the current position is a PREFIX_OP byte.
int OffsetMap::Backup(int sub) {
  if (sub <= 0) {return 0;}
  --sub;
  // Cast the extracted op bits to MapOp *before* comparing; the closing
  // parenthesis used to sit after the comparison, casting its bool result.
  while ((0 < sub) &&
         (static_cast<MapOp>(OpPart(diffs_[sub - 1])) == PREFIX_OP)) {
    --sub;
  }
  return sub;
}
// Parse next range, 1..5 bytes
// Return subscript just off the end of that
int OffsetMap::ParseNext(int sub, MapOp* op, int* length) {
*op = PREFIX_OP;
*length = 0;
char c;
while ((sub < static_cast<int>(diffs_.size())) && (*op == PREFIX_OP)) {
c = diffs_[sub++];
*op = static_cast<MapOp>(OpPart(c));
int len = LenPart(c);
*length = (*length << 6) + len;
}
// If mal-formed or in RIGHT, this will return with op = PREFIX_OP
// Mal-formed can include a trailing prefix byte with no following op
return sub;
}
// Parse previous range, 1..5 bytes
// Return current subscript
//
// Steps back to the start of the range ending at sub, then parses it forward.
int OffsetMap::ParsePrevious(int sub, MapOp* op, int* length) {
  const int range_start = Backup(sub);
  return ParseNext(range_start, op, length);
}
// Quick debugging dump; does not parse multi-byte items, so just length & 0x3f
// Writes the caller's label, the diffs_ position, the byte just before that
// position (if there is one) and both active windows to stderr.
void OffsetMap::PrintPosition(const char* str) {
  MapOp op = PREFIX_OP;
  int length = 0;
  const int count = static_cast<int>(diffs_.size());
  const bool has_prior_byte = (next_diff_sub_ > 0) && (next_diff_sub_ <= count);
  if (has_prior_byte) {
    const char prior = diffs_[next_diff_sub_ - 1];
    op = static_cast<MapOp>(OpPart(prior));
    length = LenPart(prior);
  }
  fprintf(stderr, "%s[%d] %c%02d = A[%d..%d) ==> A'[%d..%d)\n",
          str,
          next_diff_sub_, "&=+-"[op], length,
          current_lo_aoffset_, current_hi_aoffset_,
          current_lo_aprimeoffset_, current_hi_aprimeoffset_);
}
// Move active window one range to the right
// Return true if move was OK
//
// On failure (already at the end of diffs_, or the next range does not parse
// to a copy/insert/delete) the map is put into the RIGHT state.
bool OffsetMap::MoveRight() {
  // If at last range or RIGHT, set to RIGHT, return error
  const int count = static_cast<int>(diffs_.size());
  if (next_diff_sub_ >= count) {
    SetRight();
    return false;
  }

  // Actually OK to move right
  MapOp op;
  int length;
  // If mal-formed or in RIGHT, this will return with op = PREFIX_OP
  next_diff_sub_ = ParseNext(next_diff_sub_, &op, &length);

  // The new window begins where the old one ended
  current_lo_aoffset_ = current_hi_aoffset_;
  current_lo_aprimeoffset_ = current_hi_aprimeoffset_;

  bool moved = true;
  switch (op) {
    case COPY_OP:       // Advances both A and A'
      current_hi_aoffset_ = current_lo_aoffset_ + length;
      current_hi_aprimeoffset_ = current_lo_aprimeoffset_ + length;
      break;
    case INSERT_OP:     // Advances A' only
      current_hi_aoffset_ = current_lo_aoffset_;
      current_hi_aprimeoffset_ = current_lo_aprimeoffset_ + length;
      break;
    case DELETE_OP:     // Advances A only
      current_hi_aoffset_ = current_lo_aoffset_ + length;
      current_hi_aprimeoffset_ = current_lo_aprimeoffset_;
      break;
    default:
      SetRight();
      moved = false;
      break;
  }
  current_diff_ = current_lo_aprimeoffset_ - current_lo_aoffset_;
  return moved;
}
// Move active window one range to the left
// Return true if move was OK
//
// On failure (already at the first range or in LEFT, or the previous range
// does not parse to a copy/insert/delete) the map is put into the LEFT state
// and false is returned.
bool OffsetMap::MoveLeft() {
  // If at first range or LEFT, set to LEFT, return error
  if (next_diff_sub_ <= 0) {
    SetLeft();
    return false;
  }
  // Back up over current active window
  next_diff_sub_ = Backup(next_diff_sub_);
  if (next_diff_sub_ <= 0) {
    SetLeft();
    return false;
  }

  // Actually OK to move left
  MapOp op;
  int length;
  bool retval = true;
  // If mal-formed or in LEFT, this will return with op = PREFIX_OP
  next_diff_sub_ = ParsePrevious(next_diff_sub_, &op, &length);

  // The new window ends where the old one began
  current_hi_aoffset_ = current_lo_aoffset_;
  current_hi_aprimeoffset_ = current_lo_aprimeoffset_;
  if (op == COPY_OP) {
    current_lo_aoffset_ = current_hi_aoffset_ - length;
    current_lo_aprimeoffset_ = current_hi_aprimeoffset_ - length;
  } else if (op == INSERT_OP) {
    current_lo_aoffset_ = current_hi_aoffset_ - 0;
    current_lo_aprimeoffset_ = current_hi_aprimeoffset_ - length;
  } else if (op == DELETE_OP) {
    current_lo_aoffset_ = current_hi_aoffset_ - length;
    current_lo_aprimeoffset_ = current_hi_aprimeoffset_ - 0;
  } else {
    SetLeft();
    retval = false;
  }
  current_diff_ = current_lo_aprimeoffset_ - current_lo_aoffset_;
  // Report the mal-formed case to the caller, exactly as MoveRight() does;
  // previously this returned true unconditionally and retval was never read.
  return retval;
}
// Map an offset in A' to the corresponding offset in A
// Negative offsets map to 0. Offsets at or beyond the largest A' offset seen
// are extended one-for-one from the largest A offset seen.
int OffsetMap::MapBack(int aprimeoffset){
  MaybeFlushAll();
  if (aprimeoffset < 0) {return 0;}
  if (aprimeoffset >= max_aprimeoffset_) {
    const int overshoot = aprimeoffset - max_aprimeoffset_;
    return overshoot + max_aoffset_;
  }

  // If current_lo_aprimeoffset_ <= aprimeoffset < current_hi_aprimeoffset_,
  // use current mapping, else move window left/right
  bool moved = true;
  while (moved && (aprimeoffset < current_lo_aprimeoffset_)) {
    moved = MoveLeft();
  }
  while (moved && (aprimeoffset >= current_hi_aprimeoffset_)) {
    moved = MoveRight();
  }
  // So now current_lo_aprimeoffset_ <= aprimeoffset < current_hi_aprimeoffset_

  // A' is in an insert region when the shifted offset reaches past the A
  // window; all such bytes backmap to A=hi_aoffset_
  return std::min(aprimeoffset - current_diff_, current_hi_aoffset_);
}
// Map an offset in A to the corresponding offset in A'
// Negative offsets map to 0. Offsets at or beyond the largest A offset seen
// are extended one-for-one from the largest A' offset seen.
int OffsetMap::MapForward(int aoffset){
  MaybeFlushAll();
  if (aoffset < 0) {return 0;}
  if (aoffset >= max_aoffset_) {
    const int overshoot = aoffset - max_aoffset_;
    return overshoot + max_aprimeoffset_;
  }

  // If current_lo_aoffset_ <= aoffset < current_hi_aoffset_,
  // use current mapping, else move window left/right
  bool moved = true;
  while (moved && (aoffset < current_lo_aoffset_)) {
    moved = MoveLeft();
  }
  while (moved && (aoffset >= current_hi_aoffset_)) {
    moved = MoveRight();
  }

  // A is in a delete region when the shifted offset reaches past the A'
  // window; all such bytes map to A'=hi_aprimeoffset_
  return std::min(aoffset + current_diff_, current_hi_aprimeoffset_);
}
// static
// Advances source to the right, replaying each insert operation it passes
// into dest. Stops and returns false as soon as the range just moved onto
// advances A (i.e. it is not an insert); source is left positioned on that
// range. Returns true if the end of source is reached first.
bool OffsetMap::CopyInserts(OffsetMap* source, OffsetMap* dest) {
  bool ok = true;
  // Compare as ints, like the rest of this file, to avoid a signed/unsigned
  // comparison between next_diff_sub_ and size().
  while (ok &&
         (source->next_diff_sub_ != static_cast<int>(source->diffs_.size()))) {
    ok = source->MoveRight();
    if (source->current_lo_aoffset_ != source->current_hi_aoffset_) {
      return false;
    }
    dest->Insert(
        source->current_hi_aprimeoffset_ - source->current_lo_aprimeoffset_);
  }
  return true;
}
// static
// Advances source to the right, replaying each delete operation it passes
// into dest. Stops and returns false as soon as the range just moved onto
// advances A' (i.e. it is not a delete); source is left positioned on that
// range. Returns true if the end of source is reached first.
bool OffsetMap::CopyDeletes(OffsetMap* source, OffsetMap* dest) {
  bool ok = true;
  // Compare as ints, like the rest of this file, to avoid a signed/unsigned
  // comparison between next_diff_sub_ and size().
  while (ok &&
         (source->next_diff_sub_ != static_cast<int>(source->diffs_.size()))) {
    ok = source->MoveRight();
    if (source->current_lo_aprimeoffset_ != source->current_hi_aprimeoffset_) {
      return false;
    }
    dest->Delete(source->current_hi_aoffset_ - source->current_lo_aoffset_);
  }
  return true;
}
// static
// Builds h as the composition of f (A to A') followed by g (A' to A''), so
// that h maps A to A''. h is cleared first; f and g are Reset() to offset 0
// (which may flush them) and their active windows are advanced as the two
// maps are walked in step over A'.
void OffsetMap::ComposeOffsetMap(
OffsetMap* g, OffsetMap* f, OffsetMap* h) {
h->Clear();
f->Reset();
g->Reset();
int lo = 0; // A' offset up to which f and g have been composed so far
for (;;) {
// g's window is used up: advance g, consuming its insert operations.
// These move A'' without moving A and A'. CopyInserts() returns true
// only when it reaches the end of g without meeting another kind of
// operation, in which case composition is finished.
if (lo >= g->current_hi_aoffset_ && CopyInserts(g, h)) {
// Drain any remaining delete operations in f. These move A without
// moving A' and A''.
if (lo >= f->current_hi_aprimeoffset_ && CopyDeletes(f, h)) {
// fprintf(stderr,
// "ComposeOffsetMap ERROR, f is longer than g.<br>\n");
}
// FlushAll(), called by Reset(), MapForward() or MapBack(), has
// added an extra COPY_OP to f and g, so this function has
// composed an extra COPY_OP in h from those. To stop FlushAll()
// from adding one more extra COPY_OP to h later, call
// Flush() right now.
h->Flush();
return;
}
// f's window is used up: advance f, consuming its delete operations.
// These move A without moving A' and A''.
if (lo >= f->current_hi_aprimeoffset_) {
if (!CopyDeletes(f, h)) {
// fprintf(stderr,
// "ComposeOffsetMap ERROR, g is longer than f.<br>\n");
}
}
// Compose one operation which moves A' from lo to hi.
int hi = min(f->current_hi_aprimeoffset_, g->current_hi_aoffset_);
// f's window is non-empty in A and g's is non-empty in A'': both ends
// advance, so h copies. Only f non-empty in A: h deletes. Only g non-empty
// in A'': h inserts.
if (f->current_lo_aoffset_ != f->current_hi_aoffset_ &&
g->current_lo_aprimeoffset_ != g->current_hi_aprimeoffset_) {
h->Copy(hi - lo);
} else if (f->current_lo_aoffset_ != f->current_hi_aoffset_) {
h->Delete(hi - lo);
} else if (g->current_lo_aprimeoffset_ != g->current_hi_aprimeoffset_) {
h->Insert(hi - lo);
}
lo = hi;
}
}
// For testing only -- force a mapping
// Resets the map, then installs the given raw diff bytes and maxima verbatim.
// Nothing is validated.
void OffsetMap::StuffIt(const string& diffs,
                        int max_aoffset, int max_aprimeoffset) {
  Clear();
  max_aprimeoffset_ = max_aprimeoffset;
  max_aoffset_ = max_aoffset;
  diffs_ = diffs;
}
} // namespace CLD2

View File

@ -0,0 +1,175 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Author: dsites@google.com (Dick Sites)
//
#ifndef UTIL_UTF8_OFFSETMAP_H_
#define UTIL_UTF8_OFFSETMAP_H_
#include <string> // for string
#include "integral_types.h" // for uint32
// ***************************** OffsetMap **************************
//
// An OffsetMap object is a container for a mapping from offsets in one text
// buffer A' to offsets in another text buffer A. It is most useful when A' is
// built from A via substitutions that occasionally do not preserve byte length.
//
// A series of operators are used to build the correspondence map, then
// calls can be made to map an offset in A' to an offset in A, or vice versa.
// The map starts with offset 0 in A corresponding to offset 0 in A'.
// The mapping is then built sequentially, adding on byte ranges that are
// identical in A and A', byte ranges that are inserted in A', and byte ranges
// that are deleted from A. All bytes beyond those specified when building the
// map are assumed to correspond, i.e. a Copy(infinity) is assumed at the
// end of the map.
//
// The internal data structure records positions at which bytes are added or
// deleted. Using the map is O(1) when increasing the A' or A offset
// monotonically, and O(n) when accessing random offsets, where n is the
// number of differences.
//
namespace CLD2 {
// Records, as a sequence of copy/insert/delete operations, how offsets in a
// text A correspond to offsets in a derived text A', and maps offsets in
// either direction.
class OffsetMap {
public:
// Constructor, destructor
OffsetMap();
~OffsetMap();
// Clear the map
void Clear();
// Add to mapping from A to A', specifying how many next bytes correspond
// in A and A'
void Copy(int bytes);
// Add to mapping from A to A', specifying how many next bytes are
// inserted in A' while not advancing in A at all
void Insert(int bytes);
// Add to mapping from A to A', specifying how many next bytes are
// deleted from A while not advancing in A' at all
void Delete(int bytes);
// Print map to file, for debugging
void Printmap(const char* filename);
// [Finish building map,] Re-position to offset 0
// This call is optional; MapForward and MapBack finish building the map
// if necessary
void Reset();
// Map an offset in A' to the corresponding offset in A
int MapBack(int aprimeoffset);
// Map an offset in A to the corresponding offset in A'
int MapForward(int aoffset);
// h = ComposeOffsetMap(g, f), where f is a map from A to A', g is
// from A' to A'' and h is from A to A''.
//
// Note that g->MapForward(f->MapForward(aoffset)) always equals
// h->MapForward(aoffset), while
// f->MapBack(g->MapBack(aprimeprimeoffset)) doesn't always equal
// h->MapBack(aprimeprimeoffset). This happens when deletion in
// f and insertion in g are at the same place. For example,
//
// A 1 2 3 4
// ^ | ^ ^
// | | / | f
// v vv v
// A' 1' 2' 3'
// ^ ^^ ^
// | | \ | g
// v | v v
// A'' 1'' 2'' 3'' 4''
//
// results in:
//
// A 1 2 3 4
// ^ ^\ ^ ^
// | | \ | | h
// v | vv v
// A'' 1'' 2'' 3'' 4''
//
// 2'' is mapped to 3 in the former figure, while 2'' is mapped to 2 in
// the latter figure.
static void ComposeOffsetMap(OffsetMap* g, OffsetMap* f, OffsetMap* h);
// For debugging only; writes to stderr
void DumpWindow();
// For testing only -- force a mapping
void StuffIt(const std::string& diffs, int max_aoffset, int max_aprimeoffset);
private:
// Operation codes; each is stored in the top two bits of a diffs_ byte.
// PREFIX_OP bytes carry extra high-order length bits for the op that follows.
enum MapOp {PREFIX_OP, COPY_OP, INSERT_OP, DELETE_OP};
// Write the pending operation, if any, into diffs_
void Flush();
// Add a final Copy(1), then Flush()
void FlushAll();
// FlushAll() if an operation is pending or diffs_ is still empty
void MaybeFlushAll();
// Append one byte holding op and the low six bits of len to diffs_
void Emit(MapOp op, int len);
// Put the active window in the LEFT / RIGHT state respectively
void SetLeft();
void SetRight();
// Back up over previous range, 1..5 bytes
// Return subscript at the beginning of that. Pins at 0
int Backup(int sub);
// Parse next range, 1..5 bytes
// Return subscript just off the end of that
int ParseNext(int sub, MapOp* op, int* length);
// Parse previous range, 1..5 bytes
// Return current subscript
int ParsePrevious(int sub, MapOp* op, int* length);
// Debugging dump of the current position and windows; writes to stderr
void PrintPosition(const char* str);
bool MoveRight(); // Returns true if OK
bool MoveLeft(); // Returns true if OK
// Debugging dump of the whole of diffs_; writes to stderr
void DumpString();
// Copies insert operations from source to dest. Returns true if no
// other operations are found.
static bool CopyInserts(OffsetMap* source, OffsetMap* dest);
// Copies delete operations from source to dest. Returns true if no other
// operations are found.
static bool CopyDeletes(OffsetMap* source, OffsetMap* dest);
// Encoded operations, one or more bytes per range
std::string diffs_;
// Operation accumulated by Copy/Insert/Delete but not yet written to diffs_
MapOp pending_op_;
uint32 pending_length_;
// Offsets in the ranges below correspond to each other, with A' = A + diff
int next_diff_sub_;
int current_lo_aoffset_;
int current_hi_aoffset_;
int current_lo_aprimeoffset_;
int current_hi_aprimeoffset_;
int current_diff_;
// Largest offsets seen so far in A and A'
int max_aoffset_;
int max_aprimeoffset_;
};
} // namespace CLD2
#endif // UTIL_UTF8_OFFSETMAP_H_

View File

@ -0,0 +1,128 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// These are weird things we need to do to get this compiling on
// random systems [subset].
#ifndef BASE_PORT_H_
#define BASE_PORT_H_
#include <string.h> // for memcpy()
#include "integral_types.h"
namespace CLD2 {
// Portable handling of unaligned loads, stores, and copies.
// On some platforms, like ARM, the copy functions can be more efficient
// than a load and a store.
//
// Three variants are selected by the preprocessor below: direct pointer
// casts for all widths, direct casts for 16/32 bits with memcpy for 64 bits,
// and memcpy for everything (which also defines NEED_ALIGNED_LOADS).
#if defined(ARCH_PIII) || defined(ARCH_ATHLON) || defined(ARCH_K8) || defined(_ARCH_PPC)
// x86 and x86-64 can perform unaligned loads/stores directly;
// modern PowerPC hardware can also do unaligned integer loads and stores;
// but note: the FPU still sends unaligned loads and stores to a trap handler!
#define UNALIGNED_LOAD16(_p) (*reinterpret_cast<const uint16 *>(_p))
#define UNALIGNED_LOAD32(_p) (*reinterpret_cast<const uint32 *>(_p))
#define UNALIGNED_LOAD64(_p) (*reinterpret_cast<const uint64 *>(_p))
#define UNALIGNED_STORE16(_p, _val) (*reinterpret_cast<uint16 *>(_p) = (_val))
#define UNALIGNED_STORE32(_p, _val) (*reinterpret_cast<uint32 *>(_p) = (_val))
#define UNALIGNED_STORE64(_p, _val) (*reinterpret_cast<uint64 *>(_p) = (_val))
#elif defined(__arm__) && \
!defined(__ARM_ARCH_5__) && \
!defined(__ARM_ARCH_5T__) && \
!defined(__ARM_ARCH_5TE__) && \
!defined(__ARM_ARCH_5TEJ__) && \
!defined(__ARM_ARCH_6__) && \
!defined(__ARM_ARCH_6J__) && \
!defined(__ARM_ARCH_6K__) && \
!defined(__ARM_ARCH_6Z__) && \
!defined(__ARM_ARCH_6ZK__) && \
!defined(__ARM_ARCH_6T2__)
// ARMv7 and newer support native unaligned accesses, but only of 16-bit
// and 32-bit values (not 64-bit); older versions either raise a fatal signal,
// do an unaligned read and rotate the words around a bit, or do the reads very
// slowly (trip through kernel mode). There's no simple #define that says just
// "ARMv7 or higher", so we have to filter away all ARMv5 and ARMv6
// sub-architectures. Newer gcc (>= 4.6) set an __ARM_FEATURE_UNALIGNED
// #define, so in time, maybe we can move on to that.
//
// This is a mess, but there's not much we can do about it.
#define UNALIGNED_LOAD16(_p) (*reinterpret_cast<const uint16 *>(_p))
#define UNALIGNED_LOAD32(_p) (*reinterpret_cast<const uint32 *>(_p))
#define UNALIGNED_STORE16(_p, _val) (*reinterpret_cast<uint16 *>(_p) = (_val))
#define UNALIGNED_STORE32(_p, _val) (*reinterpret_cast<uint32 *>(_p) = (_val))
// TODO(sesse): NEON supports unaligned 64-bit loads and stores.
// See if that would be more efficient on platforms supporting it,
// at least for copies.
// 64-bit accesses go through memcpy on this branch (see note above).
inline uint64 UNALIGNED_LOAD64(const void *p) {
uint64 t;
memcpy(&t, p, sizeof t);
return t;
}
inline void UNALIGNED_STORE64(void *p, uint64 v) {
memcpy(p, &v, sizeof v);
}
#else
#define NEED_ALIGNED_LOADS
// These functions are provided for architectures that don't support
// unaligned loads and stores.
// Each load copies sizeof(t) bytes from p into a local and returns it; each
// store copies the bytes of v to p.
inline uint16 UNALIGNED_LOAD16(const void *p) {
uint16 t;
memcpy(&t, p, sizeof t);
return t;
}
inline uint32 UNALIGNED_LOAD32(const void *p) {
uint32 t;
memcpy(&t, p, sizeof t);
return t;
}
inline uint64 UNALIGNED_LOAD64(const void *p) {
uint64 t;
memcpy(&t, p, sizeof t);
return t;
}
inline void UNALIGNED_STORE16(void *p, uint16 v) {
memcpy(p, &v, sizeof v);
}
inline void UNALIGNED_STORE32(void *p, uint32 v) {
memcpy(p, &v, sizeof v);
}
inline void UNALIGNED_STORE64(void *p, uint64 v) {
memcpy(p, &v, sizeof v);
}
#endif
} // End namespace CLD2
#endif // BASE_PORT_H_

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,297 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Author: dsites@google.com (Dick Sites)
//
//
// Terminology:
// Incoming original text has HTML tags and entities removed, all but letters
// removed, and letters lowercased. Strings of non-letters are mapped to a
// single ASCII space.
//
// One scriptspan has a run of letters/spaces in a single script. This is the
// fundamental text unit that is scored. There is an optional backmap from
// scriptspan text to the original document text, so that the language ranges
// reported in ResultChunkVector refer to byte ranges in the original text.
//
// Scripts come in two forms, the full Unicode scripts described by
// http://www.unicode.org/Public/UNIDATA/Scripts.txt
// and a modified list used exclusively in CLD2. The modified form maps all
// the CJK scripts to one, Hani. The current version description is in
// i18n/encodings/cld2/builddata/script_summary.txt
// In addition, all non-letters are mapped to the Common script.
//
// ULScript describes this Unicode Letter script.
//
// Scoring uses text nil-grams, uni-grams, bi-grams, quad-grams, and octa-grams.
// Nilgrams (no text lookup at all) are for script-based pseudo-languages and
// for languages that are 1:1 with a given script. Unigrams and bigrams are
// used to score the CJK languages, all in the Hani script. Quadgrams and
// octagrams are used to score all other languages.
//
// RType is the Recognition Type per ulscript.
//
// The scoring tables map various grams to language-probability scores.
// A given gram that hits in scoring table maps to an indirect subscript into
// a list of packed languages and log probabilities.
//
// Languages are stored in two forms: 10-bit values in the Language enum, and
// shorter 8-bit per-ulscript values in the scoring tables.
//
// Language refers to the full 10-bit range.
// pslang refers to the per-ulscript shorter values.
//
// Log probabilities also come in two forms. The full range uses values 0..255
// to represent minus log base 10th-root-of-2, covering 1 .. 1/2**25.5 or about
// TODO BOGUS description, 24 vs 12
// 1/47.5M. The second form quantizes these into multiples of 8 that can be
// added together to represent probability products. The quantized form uses
// values 24..0 with 0 now least likely instead of most likely, thus making
// larger sums for more probable results. 24 maps to original 1/2**4.8 (~1/28)
// and 0 maps to original 1/2**24.0 (~1/16M).
//
// qprob refers to quantized log probabilities.
//
// langprob is a uint32 holding three 1-byte pslangs and a 1-byte subscript to
// a list of three qprobs. It always needs a companion ulscript.
//
// A scriptspan is scored via one or more hitbuffers
#ifndef I18N_ENCODINGS_CLD2_INTERNAL_SCOREONESCRIPTSPAN_H__
#define I18N_ENCODINGS_CLD2_INTERNAL_SCOREONESCRIPTSPAN_H__
#include <stdio.h>
#include "integral_types.h" // for uint8 etc.
#include "cld2tablesummary.h"
#include "compact_lang_det_impl.h" // for ResultChunkVector
#include "getonescriptspan.h"
#include "langspan.h"
#include "tote.h"
#include "utf8statetable.h"
namespace CLD2 {
// Sizing constants for the boost lists and the scoring hit buffers.
static const int kMaxBoosts = 4; // For each of PerScriptLangBoosts
// must be power of two for wrap()
static const int kChunksizeQuads = 20; // For non-CJK
static const int kChunksizeUnis = 50; // For CJK
static const int kMaxScoringHits = 1000;
// Number of kChunksizeQuads-sized chunks in a full hit buffer (1000 / 20 = 50)
static const int kMaxSummaries = kMaxScoringHits / kChunksizeQuads;
// Pointers to the read-only lookup tables used for scoring.
// The first four tables are for CJK languages,
// the next four for quadgram/octagram languages, and
// the last for expected scores.
typedef struct {
const UTF8PropObj* unigram_obj; // 80K CJK characters
const CLD2TableSummary* unigram_compat_obj; // 256 CJK lookup probabilities
const CLD2TableSummary* deltabi_obj;
const CLD2TableSummary* distinctbi_obj;
const CLD2TableSummary* quadgram_obj; // Primary quadgram lookup table
const CLD2TableSummary* quadgram_obj2; // Secondary "
const CLD2TableSummary* deltaocta_obj;
const CLD2TableSummary* distinctocta_obj;
const short* kExpectedScore; // Expected base + delta + distinct score
// per 1KB input
// Subscripted by language and script4
} ScoringTables;
// Context for boosting several languages
// Holds up to kMaxBoosts langprob values.
typedef struct {
int32 n; // NOTE(review): presumably the next slot to fill in langprob[] -- confirm
uint32 langprob[kMaxBoosts];
// Reduces n to a valid langprob[] subscript by masking with kMaxBoosts - 1;
// this only works because kMaxBoosts is a power of two.
int wrap(int32 n) {return n & (kMaxBoosts - 1);}
} LangBoosts;
// A pair of boost lists: one named for Latin script, one for everything else.
typedef struct {
LangBoosts latn;
LangBoosts othr;
} PerScriptLangBoosts;
// ScoringContext carries state across scriptspans
// ScoringContext also has read-only scoring tables mapping grams to qprobs
typedef struct {
FILE* debug_file; // Non-NULL if debug output wanted
bool flags_cld2_score_as_quads;
bool flags_cld2_html;
bool flags_cld2_cr;
bool flags_cld2_verbose;
ULScript ulscript; // langprobs below are with respect to this script
Language prior_chunk_lang; // Mostly for debug output
// boost has a packed set of per-script langs and probabilities
// whack has a per-script lang to be suppressed from ever scoring (zeroed)
// When a language in a close set is given as an explicit hint, others in
// that set will be whacked.
PerScriptLangBoosts langprior_boost; // From http content-lang or meta lang=
PerScriptLangBoosts langprior_whack; // From http content-lang or meta lang=
PerScriptLangBoosts distinct_boost; // From distinctive letter groups
int oldest_distinct_boost; // Subscript in hitbuffer of oldest
// distinct score to use
const ScoringTables* scoringtables; // Probability lookup tables
ScriptScanner* scanner; // For ResultChunkVector backmap
// Inits boosts: zero-fills the three PerScriptLangBoosts members.
// No other member is touched.
void init() {
memset(&langprior_boost, 0, sizeof(langprior_boost));
memset(&langprior_whack, 0, sizeof(langprior_whack));
memset(&distinct_boost, 0, sizeof(distinct_boost));
};
} ScoringContext;
// Begin private
// Holds one scoring-table lookup hit. We hold indirect subscript instead of
// langprob to allow a single hit to use a variable number of langprobs.
typedef struct {
int offset; // First byte of quad/octa etc. in scriptspan
int indirect; // subscript of langprobs in scoring table
} ScoringHit;
// Kind of hit recorded in a LangprobHit (stored in its type field).
typedef enum {
UNIHIT = 0,
QUADHIT = 1,
DELTAHIT = 2,
DISTINCTHIT = 3
} LinearHitType;
// Holds one scoring-table lookup hit resolved into a langprob.
// Note that offset is only 16 bits wide here, unlike ScoringHit::offset.
typedef struct {
uint16 offset; // First byte of quad/octa etc. in scriptspan
uint16 type; // LinearHitType
uint32 langprob; // langprob from scoring table
} LangprobHit;
// Holds arrays of scoring-table lookup hits for (part of) a scriptspan
typedef struct {
ULScript ulscript; // langprobs below are with respect to this script
int maxscoringhits; // determines size of arrays below
int next_base; // First unused entry in each array
int next_delta; // "
int next_distinct; // "
int next_linear; // "
int next_chunk_start; // First unused chunk_start entry
int lowest_offset; // First byte of text span used to fill hitbuffer
// Dummy entry at the end of each giving offset of first unused text byte
ScoringHit base[kMaxScoringHits + 1]; // Uni/quad hits
ScoringHit delta[kMaxScoringHits + 1]; // delta-bi/delta-octa hits
ScoringHit distinct[kMaxScoringHits + 1]; // distinct-word hits
LangprobHit linear[4 * kMaxScoringHits + 1]; // Above three merge-sorted
// (4: some bases => 2 linear)
int chunk_start[kMaxSummaries + 1]; // First linear[] subscr of
// each scored chunk
int chunk_offset[kMaxSummaries + 1]; // First text subscr of
// each scored chunk
// Marks the buffer empty: script set to ULScript_Common, every next_*
// counter and lowest_offset set to 0, and entry [0] of each array zeroed.
// Entries past [0] are left as they were, and linear[0].type is not written.
void init() {
ulscript = ULScript_Common;
maxscoringhits = kMaxScoringHits;
next_base = 0;
next_delta = 0;
next_distinct = 0;
next_linear = 0;
next_chunk_start = 0;
lowest_offset = 0;
base[0].offset = 0;
base[0].indirect = 0;
delta[0].offset = 0;
delta[0].indirect = 0;
distinct[0].offset = 0;
distinct[0].indirect = 0;
linear[0].offset = 0;
linear[0].langprob = 0;
chunk_start[0] = 0;
chunk_offset[0] = 0;
};
} ScoringHitBuffer;
// TODO: Explain here why we need both ChunkSpan and ChunkSummary
// Start subscripts and lengths of one chunk within the three per-kind hit
// arrays of a hitbuffer (base, delta, distinct).
struct ChunkSpan {
  // Subscripts of the first entry belonging to this chunk
  int chunk_base;      // ... in hitbuffer.base[]
  int chunk_delta;     // ... in hitbuffer.delta[]
  int chunk_distinct;  // ... in hitbuffer.distinct[]
  // Number of entries belonging to this chunk
  int base_len;        // ... in hitbuffer.base[]
  int delta_len;       // ... in hitbuffer.delta[]
  int distinct_len;    // ... in hitbuffer.distinct[]
};
// Per-chunk scoring result. Packed into 20 bytes for space: nine 16-bit
// fields followed by two 8-bit fields.
struct ChunkSummary {
  uint16 offset;             // Text offset within current scriptspan.text
  uint16 chunk_start;        // Scoring subscr within hitbuffer->linear[]
  uint16 lang1;              // Top lang, mapped to full Language
  uint16 lang2;              // Second lang, mapped to full Language
  uint16 score1;             // Top lang raw score
  uint16 score2;             // Second lang raw score
  uint16 bytes;              // Number of lower letters bytes in chunk
  uint16 grams;              // Number of scored base quad- uni-grams in chunk
  uint16 ulscript;           // ULScript of chunk
  uint8 reliability_delta;   // Reliability 0..100, delta top:second scores
  uint8 reliability_score;   // Reliability 0..100, top:expected score
};
// We buffer up ~50 chunk summaries, corresponding to chunks of 20 quads in a
// 1000-quad hit buffer, so we can do boundary adjustment on them
// when adjacent entries are different languages. After that, we add them
// all into the document score
//
// About 50 * 20 = 1000 bytes. OK for stack alloc
// Fixed-capacity buffer of chunk summaries (kMaxSummaries entries plus one
// extra slot).
struct SummaryBuffer {
  int n;  // NOTE(review): presumably the count of entries in use -- confirm
  ChunkSummary chunksummary[kMaxSummaries + 1];
};
// End private
// The four scoring entry points below share one signature: the scriptspan to
// score is taken by const reference, and scoringcontext, doc_tote and vec are
// passed by pointer.

// Score RTypeNone or RTypeOne scriptspan into doc_tote and vec, updating
// scoringcontext
void ScoreEntireScriptSpan(const LangSpan& scriptspan,
ScoringContext* scoringcontext,
DocTote* doc_tote,
ResultChunkVector* vec);
// Score RTypeCJK scriptspan into doc_tote and vec, updating scoringcontext
void ScoreCJKScriptSpan(const LangSpan& scriptspan,
ScoringContext* scoringcontext,
DocTote* doc_tote,
ResultChunkVector* vec);
// Score RTypeMany scriptspan into doc_tote and vec, updating scoringcontext
void ScoreQuadScriptSpan(const LangSpan& scriptspan,
ScoringContext* scoringcontext,
DocTote* doc_tote,
ResultChunkVector* vec);
// Score one scriptspan into doc_tote and vec, updating scoringcontext.
// NOTE(review): presumably dispatches to one of the three functions above
// based on the span's recognition type -- confirm against the definition.
void ScoreOneScriptSpan(const LangSpan& scriptspan,
ScoringContext* scoringcontext,
DocTote* doc_tote,
ResultChunkVector* vec);
} // End namespace CLD2
#endif // I18N_ENCODINGS_CLD2_INTERNAL_SCOREONESCRIPTSPAN_H__

View File

@ -0,0 +1,78 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// A StringPiece points to part or all of a string, double-quoted string
// literal, or other string-like object. A StringPiece does *not* own the
// string to which it points. A StringPiece is not null-terminated. [subset]
//
#ifndef STRINGS_STRINGPIECE_H_
#define STRINGS_STRINGPIECE_H_
#include <string.h>
#include <string>
// Signed type used for every StringPiece length and count.
typedef int stringpiece_ssize_type;

// Non-owning (pointer, length) view of a run of characters. The viewed
// characters are not copied and need not be NUL-terminated.
class StringPiece {
 public:
  // The single-argument constructors are deliberately non-explicit so that a
  // "const char*" or a "string" can be passed wherever a "StringPiece" is
  // expected.

  // Empty view: NULL data pointer, zero length.
  StringPiece() : ptr_(NULL), length_(0) {}

  // View of a NUL-terminated C string; a NULL pointer yields length zero.
  StringPiece(const char* str)  // NOLINT(runtime/explicit)
      : ptr_(str), length_(str != NULL ? strlen(str) : 0) {}

  // View of the full contents of a std::string.
  StringPiece(const std::string& str)  // NOLINT(runtime/explicit)
      : ptr_(str.data()), length_(str.size()) {}

  // View of exactly len bytes starting at offset.
  StringPiece(const char* offset, stringpiece_ssize_type len)
      : ptr_(offset), length_(len) {}

  // Drop the first n bytes from the view. No bounds check is performed.
  void remove_prefix(stringpiece_ssize_type n) {
    length_ -= n;
    ptr_ += n;
  }

  // Drop the last n bytes from the view. No bounds check is performed.
  void remove_suffix(stringpiece_ssize_type n) {
    length_ -= n;
  }

  // data() may return a pointer to a buffer with embedded NULs, and the
  // returned buffer may or may not be null terminated. Therefore it is
  // typically a mistake to pass data() to a routine that expects a NUL
  // terminated string.
  const char* data() const { return ptr_; }

  // size() and length() are synonyms.
  stringpiece_ssize_type size() const { return length_; }
  stringpiece_ssize_type length() const { return length_; }

  bool empty() const { return 0 == length_; }

 private:
  const char* ptr_;                // First viewed byte (may be NULL)
  stringpiece_ssize_type length_;  // Number of viewed bytes
};
class StringPiece;
#endif // STRINGS_STRINGPIECE_H_

View File

@ -0,0 +1,265 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Author: dsites@google.com (Dick Sites)
//
#include "tote.h"
#include "lang_script.h" // For LanguageCode in Dump
#include <stdio.h>
#include <string.h> // For memset
namespace CLD2 {
// Take a set of <key, value> pairs and tote them up.
// After explicitly sorting, retrieve top key, value pairs
// Normal use is key=per-script language and value = probability score
// Start empty: no score group marked in use, zero bytes and zero scores
// counted. The score array itself is deliberately left uninitialized;
// Add() zeroes a group of scores the first time that group is touched.
Tote::Tote()
    : in_use_mask_(0),
      byte_count_(0),
      score_count_(0) {
}

// Nothing to release.
Tote::~Tote() {
}
// Return to the freshly-constructed state. Clearing the in-use mask is
// enough to invalidate old scores, so the score array is not touched.
void Tote::Reinit() {
  score_count_ = 0;
  byte_count_ = 0;
  in_use_mask_ = 0;
}
// Count one more scored quadgram/trigram/unigram.
void Tote::AddScoreCount() {
  score_count_ += 1;
}
// Add idelta to the running score of ikey. Keys are tracked in groups of
// four (ikey >> 2), one in_use_mask_ bit per group.
void Tote::Add(uint8 ikey, int idelta) {
  const int group = ikey >> 2;
  const uint64 group_bit = 1ULL << group;
  if (!(in_use_mask_ & group_bit)) {
    // First touch of this group: mark it in use and zero its scores through
    // the gscore_ view of the score array.
    in_use_mask_ |= group_bit;
    gscore_[group] = 0;
  }
  score_[ikey] += idelta;
}
// Return current top three keys
void Tote::CurrentTopThreeKeys(int* key3) const {
key3[0] = -1;
key3[1] = -1;
key3[2] = -1;
int score3[3] = {-1, -1, -1};
uint64 tempmask = in_use_mask_;
int base = 0;
while (tempmask != 0) {
if (tempmask & 1) {
// Look at four in-use keys
for (int i = 0; i < 4; ++i) {
int insert_me = score_[base + i];
// Favor lower numbers on ties
if (insert_me > score3[2]) {
// Insert
int insert_at = 2;
if (insert_me > score3[1]) {
score3[2] = score3[1];
key3[2] = key3[1];
insert_at = 1;
if (insert_me > score3[0]) {
score3[1] = score3[0];
key3[1] = key3[0];
insert_at = 0;
}
}
score3[insert_at] = insert_me;
key3[insert_at] = base + i;
}
}
}
tempmask >>= 1;
base += 4;
}
}
// Take a set of <key, value> pairs and tote them up.
// After explicitly sorting, retrieve top key, value pairs
// 0xFFFF in key signifies unused
DocTote::DocTote()
    : incr_count_(0),
      sorted_(0) {
  // Every key becomes 0xFFFF (kUnusedKey); the per-entry value/score arrays
  // are only meaningful for used keys, so they need no initialization.
  memset(key_, 0xFF, sizeof(key_));
  memset(closepair_, 0, sizeof(closepair_));
}

// Nothing to release.
DocTote::~DocTote() {
}
// Return to the freshly-constructed state, including the running score.
void DocTote::Reinit() {
  sorted_ = 0;
  incr_count_ = 0;
  // Mark every key unused (0xFFFF); value/score entries of unused keys are
  // never read, so they are not cleared.
  memset(key_, 0xFF, sizeof(key_));
  memset(closepair_, 0, sizeof(closepair_));
  runningscore_.Reinit();
}
// Accumulate one scored chunk for language ikey: ibytes bytes, score, and
// reliability weighted by ibytes.
// Each key has exactly three candidate slots: two in the first 16 entries
// (ikey & 15, and that subscript with bit 3 flipped) and one in the last 8
// ((ikey & 7) + 16). If none holds ikey and none is free, the candidate
// with the smallest byte count is overwritten.
// Also see three-way associative comments above for Tote
void DocTote::Add(uint16 ikey, int ibytes,
                  int score, int ireliability) {
  ++incr_count_;

  const int candidates[3] = {
    ikey & 15,
    (ikey & 15) ^ 8,
    (ikey & 7) + 16
  };

  // Existing entry for this key: just accumulate
  for (int i = 0; i < 3; ++i) {
    const int slot = candidates[i];
    if (key_[slot] == ikey) {
      value_[slot] += ibytes;
      score_[slot] += score;
      reliability_[slot] += ireliability * ibytes;
      return;
    }
  }

  // New key: prefer the first free candidate slot
  int target = -1;
  for (int i = 0; (i < 3) && (target < 0); ++i) {
    if (key_[candidates[i]] == kUnusedKey) {target = candidates[i];}
  }

  if (target < 0) {
    // All three taken: evict the one with the smallest byte count,
    // earliest candidate winning ties
    target = candidates[0];
    for (int i = 1; i < 3; ++i) {
      if (value_[candidates[i]] < value_[target]) {target = candidates[i];}
    }
  }

  key_[target] = ikey;
  value_[target] = ibytes;
  score_[target] = score;
  reliability_[target] = ireliability * ibytes;
}
// Return the subscript of the entry holding packed language ikey, or -1 if
// there is none.
int DocTote::Find(uint16 ikey) {
  if (!sorted_) {
    // Unsorted: the key can only live in one of its three candidate slots
    const int candidates[3] = {
      ikey & 15,
      (ikey & 15) ^ 8,
      (ikey & 7) + 16
    };
    for (int i = 0; i < 3; ++i) {
      if (key_[candidates[i]] == ikey) {return candidates[i];}
    }
    return -1;
  }

  // Sorted: entries have been moved around, so scan all of them
  int sub = 0;
  while ((sub < kMaxSize_) && (key_[sub] != ikey)) {
    ++sub;
  }
  return (sub < kMaxSize_) ? sub : -1;
}
// Return the key whose entry has the largest byte count, or 0 if every
// entry is unused. On ties the lowest subscript wins.
int DocTote::CurrentTopKey() {
  int best_key = 0;
  int best_value = -1;
  for (int sub = 0; sub < kMaxSize_; ++sub) {
    if ((key_[sub] != kUnusedKey) && (value_[sub] > best_value)) {
      best_value = value_[sub];
      best_key = key_[sub];
    }
  }
  return best_key;
}
// Reorder entries so that the first n hold the n largest byte counts, in
// decreasing order of value. Entries past the first n end up in no
// particular order.
// An entry whose key is kUnusedKey has no valid fields; its value is forced
// to -1 so that it sorts below every used entry.
// n larger than the table size is treated as the table size.
void DocTote::Sort(int n) {
  // Never index past the arrays, whatever the caller asks for
  const int limit = (n < kMaxSize_) ? n : kMaxSize_;

  // This is n**2, but n is small
  for (int sub = 0; sub < limit; ++sub) {
    if (key_[sub] == kUnusedKey) {value_[sub] = -1;}

    // Pull the largest remaining value into position sub by exchanging
    // with every later entry that beats the current occupant
    for (int sub2 = sub + 1; sub2 < kMaxSize_; ++sub2) {
      if (key_[sub2] == kUnusedKey) {value_[sub2] = -1;}
      if (value_[sub] < value_[sub2]) {
        // Swap all four parallel arrays together
        uint16 tmpk = key_[sub];
        key_[sub] = key_[sub2];
        key_[sub2] = tmpk;

        int tmpv = value_[sub];
        value_[sub] = value_[sub2];
        value_[sub2] = tmpv;

        int tmps = score_[sub];     // score_ is int; no detour via double
        score_[sub] = score_[sub2];
        score_[sub2] = tmps;

        int tmpr = reliability_[sub];
        reliability_[sub] = reliability_[sub2];
        reliability_[sub2] = tmpr;
      }
    }
  }
  sorted_ = 1;
}
void DocTote::Dump(FILE* f) {
fprintf(f, "DocTote::Dump\n");
for (int sub = 0; sub < kMaxSize_; ++sub) {
if (key_[sub] != kUnusedKey) {
Language lang = static_cast<Language>(key_[sub]);
fprintf(f, "[%2d] %3s %6dB %5dp %4dR,\n", sub, LanguageCode(lang),
value_[sub], score_[sub], reliability_[sub]);
}
}
fprintf(f, " %d chunks scored<br>\n", incr_count_);
}
} // End namespace CLD2

View File

@ -0,0 +1,112 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Author: dsites@google.com (Dick Sites)
//
#ifndef I18N_ENCODINGS_CLD2_INTERNAL_TOTE_H_
#define I18N_ENCODINGS_CLD2_INTERNAL_TOTE_H_
#include <stdio.h>
#include "integral_types.h" // for uint8 etc
namespace CLD2 {
// Take a set of <key, score> pairs and tote them up.
// Key is an 8-bit per-script language
// After explicitly sorting, retrieve top key, score pairs
// Normal use is key=per-script language
// The main data structure is an array of 256 uint16 counts. We normally
// expect this to be initialized, added-to about 60 times, then the top three
// items found. To reduce the initial and final time, we also keep a bit vector
// of unused (and uninitialized) parts, each of 64 bits covering four keys.
class Tote {
 public:
  Tote();
  ~Tote();

  // Forget everything accumulated so far.
  void Reinit();

  // Accumulation
  void Add(uint8 ikey, int idelta);
  void AddScoreCount();
  void AddBytes(int ibytes) {
    byte_count_ += ibytes;
  }

  // Queries
  void CurrentTopThreeKeys(int* key3) const;
  int GetScore(int i) const {
    return score_[i];
  }
  int GetScoreCount() const {
    return score_count_;
  }
  int GetByteCount() const {
    return byte_count_;
  }

  // Direct overwrites. SetScore() does not touch the in-use mask.
  void SetScore(int i, int v) {
    score_[i] = v;
  }
  void SetScoreCount(uint16 v) {
    score_count_ = v;
  }

 private:
  uint64 in_use_mask_;    // 64 bits, one for each group of 4 scores.
                          //  0 = not initialized,not used
  int byte_count_;        // Bytes of text scored
  int score_count_;       // Number of quadgrams/etc. scored
  // Two views of the same storage: one gscore_ entry overlays one group of
  // four score_ entries.
  union {
    uint64 gscore_[64];   // For alignment and clearing quickly
    uint16 score_[256];   // Probability score sum
  };
};
// Take a set of <key, score, reliability> triples and tote them up.
// Key is a 16-bit full language
// After explicitly sorting, retrieve top key, score, reliability triples
class DocTote {
 public:
  // Key value marking an entry as unused.
  static const uint16 kUnusedKey = 0xFFFF;

  DocTote();
  ~DocTote();

  // Forget everything accumulated so far, including the running score.
  void Reinit();

  // Accumulation
  void Add(uint16 ikey, int ibytes, int score, int ireliability);
  void AddClosePair(int subscr, int val) {
    closepair_[subscr] += val;
  }

  // Lookup and ordering
  int Find(uint16 ikey);
  int CurrentTopKey();
  void Sort(int n);
  void Dump(FILE* f);

  Tote* RunningScore() {
    return &runningscore_;
  }

  // Whole-table accessors
  int MaxSize() const {
    return kMaxSize_;
  }
  int GetIncrCount() const {
    return incr_count_;
  }
  int GetClosePair(int subscr) const {
    return closepair_[subscr];
  }

  // Per-entry accessors; i is a subscript in [0, MaxSize())
  uint16 Key(int i) const {
    return key_[i];
  }
  int Value(int i) const {          // byte count
    return value_[i];
  }
  int Score(int i) const {          // sum lg prob
    return score_[i];
  }
  int Reliability(int i) const {
    return reliability_[i];
  }

  // Per-entry overwrites
  void SetKey(int i, int v) {
    key_[i] = v;
  }
  void SetValue(int i, int v) {
    value_[i] = v;
  }
  void SetScore(int i, int v) {
    score_[i] = v;
  }
  void SetReliability(int i, int v) {
    reliability_[i] = v;
  }

 private:
  static const int kMaxSize_ = 24;
  static const int kMaxClosePairSize_ = 8;

  int incr_count_;              // Number of Add calls
  int sorted_;                  // Contents have been sorted, cannot Add
  Tote runningscore_;           // Top lang scores across entire doc, for
                                // helping resolve close pairs
  // Align at multiple of 8 bytes
  int closepair_[kMaxClosePairSize_];
  uint16 key_[kMaxSize_];       // Lang unassigned = 0xFFFF, valid = 1..1023
  int value_[kMaxSize_];        // Bytecount this lang
  int score_[kMaxSize_];        // Probability score sum
  int reliability_[kMaxSize_];  // Add() accumulates ireliability * ibytes
                                // here, i.e. a byte-weighted sum.
                                // NOTE(review): presumably rescaled to a
                                // 0..100 percentage by a caller -- confirm
};
} // End namespace CLD2
#endif // I18N_ENCODINGS_CLD2_INTERNAL_TOTE_H_

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,756 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Created by utf8tablebuilder version 2.9
//
// Replaces all codes from file:
// lettermarklower_6.2.0.txt
// Accepts all other UTF-8 codes 0000..10FFFF
// Space optimized
//
// ** ASSUMES INPUT IS STRUCTURALLY VALID UTF-8 **
//
// Table entries are absolute statetable subscripts
#ifndef UTF8REPL_LETTERMARKLOWER_H__
#define UTF8REPL_LETTERMARKLOWER_H__
#include "integral_types.h"
#include "utf8statetable.h"
namespace CLD2 {
// Three-character aliases for the kExit* codes, used as entries in the
// state table below so that every table cell has the same width.
// NOTE(review): the kExit* values are presumably declared in
// utf8statetable.h -- confirm.
#define X__ (kExitIllegalStructure)
#define RJ_ (kExitReject)
#define S1_ (kExitReplace1)
#define S2_ (kExitReplace2)
#define S3_ (kExitReplace3)
#define S21 (kExitReplace21)
#define S31 (kExitReplace31)
#define S32 (kExitReplace32)
#define T1_ (kExitReplaceOffset1)
#define T2_ (kExitReplaceOffset2)
#define S11 (kExitReplace1S0)
#define SP_ (kExitSpecial)
#define D__ (kExitDoAgain)
#define RJA (kExitRejectAlt)
// Entire table has 111 state blocks of 64 entries each
// (111 * 64 = 7104 = TOTAL_SIZE; STATE0_SIZE = 320 = 5 * 64)
static const unsigned int utf8repl_lettermarklower_STATE0 = 0; // state[0]
static const unsigned int utf8repl_lettermarklower_STATE0_SIZE = 320; // =[5]
static const unsigned int utf8repl_lettermarklower_TOTAL_SIZE = 7104;
static const unsigned int utf8repl_lettermarklower_MAX_EXPAND_X4 = 12;
// NOTE(review): SHIFT = 6 presumably is log2 of the 64-entry block size --
// confirm against the state-table scanner.
static const unsigned int utf8repl_lettermarklower_SHIFT = 6;
static const unsigned int utf8repl_lettermarklower_BYTES = 1;
static const unsigned int utf8repl_lettermarklower_LOSUB = 0x5b5b5b5b;
static const unsigned int utf8repl_lettermarklower_HIADD = 0x00000000;
static const uint8 utf8repl_lettermarklower[] = {
// state[0] 0x000000 Byte 1
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0,S11,S11,S11,S11,S11,S11,S11, S11,S11,S11,S11,S11,S11,S11,S11,
S11,S11,S11,S11,S11,S11,S11,S11, S11,S11,S11, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__, 6, 11, 13, 16, 19, 22, 25, 28, 6, 6, 6, 31, 33, 36,
39, 42, 44, 46, 48, 51, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
7, 54, 74, 8, 8, 8, 8, 8, 8, 8, 88, 8, 8, 8, 8,100,
104, 9, 9, 9, 10,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x61,0x62,0x63,0x64,0x65,0x66,0x67, 0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f,
0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77, 0x78,0x79,0x7a,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
// state[6 + 2] 0x000080 Byte 2 of 2
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
// state[7 + 2] 0x000000 Byte 2 of 3
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
// state[8 + 2] 0x003000 Byte 2 of 3
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
// state[9 + 2] 0x040000 Byte 2 of 4
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
// state[10 + 2] 0x100000 Byte 2 of 4
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
// state[11 + 2] 0x0000c0 Byte 2 of 2
S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_, S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_,
S1_,S1_,S1_,S1_,S1_,S1_,S1_, 0, S1_,S1_,S1_,S1_,S1_,S1_,S1_, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0xa0,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7, 0xa8,0xa9,0xaa,0xab,0xac,0xad,0xae,0xaf,
0xb0,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0x00, 0xb8,0xb9,0xba,0xbb,0xbc,0xbd,0xbe,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
// state[13 + 2] 0x000100 Byte 2 of 2
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
S21, 0,S1_, 0,S1_, 0,S1_, 0, 0,S1_, 0,S1_, 0,S1_, 0,S2_,
0x81,0x00,0x83,0x00,0x85,0x00,0x87,0x00, 0x89,0x00,0x8b,0x00,0x8d,0x00,0x8f,0x00,
0x91,0x00,0x93,0x00,0x95,0x00,0x97,0x00, 0x99,0x00,0x9b,0x00,0x9d,0x00,0x9f,0x00,
0xa1,0x00,0xa3,0x00,0xa5,0x00,0xa7,0x00, 0xa9,0x00,0xab,0x00,0xad,0x00,0xaf,0x00,
0x69,0x00,0xb3,0x00,0xb5,0x00,0xb7,0x00, 0x00,0xba,0x00,0xbc,0x00,0xbe,0x00,0x80,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xc5,
// state[16 + 2] 0x000140 Byte 2 of 2
0,S1_, 0,S1_, 0,S1_, 0,S1_, 0, 0,S1_, 0,S1_, 0,S1_, 0,
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S2_,S1_, 0,S1_, 0,S1_, 0, 0,
0x00,0x82,0x00,0x84,0x00,0x86,0x00,0x88, 0x00,0x00,0x8b,0x00,0x8d,0x00,0x8f,0x00,
0x91,0x00,0x93,0x00,0x95,0x00,0x97,0x00, 0x99,0x00,0x9b,0x00,0x9d,0x00,0x9f,0x00,
0xa1,0x00,0xa3,0x00,0xa5,0x00,0xa7,0x00, 0xa9,0x00,0xab,0x00,0xad,0x00,0xaf,0x00,
0xb1,0x00,0xb3,0x00,0xb5,0x00,0xb7,0x00, 0xbf,0xba,0x00,0xbc,0x00,0xbe,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0xc3,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
// state[19 + 2] 0x000180 Byte 2 of 2
0,S2_,S1_, 0,S1_, 0,S2_,S1_, 0,S2_,S2_,S1_, 0, 0,S2_,S2_,
S2_,S1_, 0,S2_,S2_, 0,S2_,S2_, S1_, 0, 0, 0,S2_,S2_, 0,S2_,
S1_, 0,S1_, 0,S1_, 0,S2_,S1_, 0,S2_, 0, 0,S1_, 0,S2_,S1_,
0,S2_,S2_,S1_, 0,S1_, 0,S2_, S1_, 0, 0, 0,S1_, 0, 0, 0,
0x00,0x93,0x83,0x00,0x85,0x00,0x94,0x88, 0x00,0x96,0x97,0x8c,0x00,0x00,0x9d,0x99,
0x9b,0x92,0x00,0xa0,0xa3,0x00,0xa9,0xa8, 0x99,0x00,0x00,0x00,0xaf,0xb2,0x00,0xb5,
0xa1,0x00,0xa3,0x00,0xa5,0x00,0x80,0xa8, 0x00,0x83,0x00,0x00,0xad,0x00,0x88,0xb0,
0x00,0x8a,0x8b,0xb4,0x00,0xb6,0x00,0x92, 0xb9,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,
0x00,0xc9,0x00,0x00,0x00,0x00,0xc9,0x00, 0x00,0xc9,0xc9,0x00,0x00,0x00,0xc7,0xc9,
0xc9,0x00,0x00,0xc9,0xc9,0x00,0xc9,0xc9, 0x00,0x00,0x00,0x00,0xc9,0xc9,0x00,0xc9,
0x00,0x00,0x00,0x00,0x00,0x00,0xca,0x00, 0x00,0xca,0x00,0x00,0x00,0x00,0xca,0x00,
0x00,0xca,0xca,0x00,0x00,0x00,0x00,0xca, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
// state[22 + 2] 0x0001c0 Byte 2 of 2
0, 0, 0, 0,S1_,S1_, 0,S1_, S1_, 0,S1_,S1_, 0,S1_, 0,S1_,
0,S1_, 0,S1_, 0,S1_, 0,S1_, 0,S1_, 0,S1_, 0, 0,S1_, 0,
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
0,S1_,S1_, 0,S1_, 0,S2_,S2_, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
0x00,0x00,0x00,0x00,0x86,0x86,0x00,0x89, 0x89,0x00,0x8c,0x8c,0x00,0x8e,0x00,0x90,
0x00,0x92,0x00,0x94,0x00,0x96,0x00,0x98, 0x00,0x9a,0x00,0x9c,0x00,0x00,0x9f,0x00,
0xa1,0x00,0xa3,0x00,0xa5,0x00,0xa7,0x00, 0xa9,0x00,0xab,0x00,0xad,0x00,0xaf,0x00,
0x00,0xb3,0xb3,0x00,0xb5,0x00,0x95,0xbf, 0xb9,0x00,0xbb,0x00,0xbd,0x00,0xbf,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0xc6,0xc6, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
// state[25 + 2] 0x000200 Byte 2 of 2
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
S2_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
S1_, 0,S1_, 0, 0, 0, 0, 0, 0, 0,T1_,S1_, 0,S2_,T1_, 0,
0x81,0x00,0x83,0x00,0x85,0x00,0x87,0x00, 0x89,0x00,0x8b,0x00,0x8d,0x00,0x8f,0x00,
0x91,0x00,0x93,0x00,0x95,0x00,0x97,0x00, 0x99,0x00,0x9b,0x00,0x9d,0x00,0x9f,0x00,
0x9e,0x00,0xa3,0x00,0xa5,0x00,0xa7,0x00, 0xa9,0x00,0xab,0x00,0xad,0x00,0xaf,0x00,
0xb1,0x00,0xb3,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0xbc,0x00,0x9a,0x01,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0xc6,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0xc6,0x00,0x00,
// state[28 + 2] 0x000240 Byte 2 of 2
0,S1_, 0,S2_,S2_,S2_,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0x00,0x82,0x00,0x80,0x89,0x8c,0x87,0x00, 0x89,0x00,0x8b,0x00,0x8d,0x00,0x8f,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0xc6,0xca,0xca,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
// state[31 + 2] 0x000340 Byte 2 of 2
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
S1_, 0,S1_, 0, 0, 0,S1_, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0xb1,0x00,0xb3,0x00,0x00,0x00,0xb7,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
// state[33 + 2] 0x000380 Byte 2 of 2
0, 0, 0, 0, 0, 0,S1_, 0, S1_,S1_,S1_, 0,S2_, 0,S2_,S2_,
0,S1_,S1_,S1_,S1_,S1_,S1_,S1_, S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_,
S2_,S2_, 0,S2_,S2_,S2_,S2_,S2_, S2_,S2_,S2_,S2_, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0x00,0x00,0x00,0x00,0x00,0x00,0xac,0x00, 0xad,0xae,0xaf,0x00,0x8c,0x00,0x8d,0x8e,
0x00,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0xb7, 0xb8,0xb9,0xba,0xbb,0xbc,0xbd,0xbe,0xbf,
0x80,0x81,0x00,0x83,0x84,0x85,0x86,0x87, 0x88,0x89,0x8a,0x8b,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0xcf,0x00,0xcf,0xcf,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0xcf,0xcf,0x00,0xcf,0xcf,0xcf,0xcf,0xcf, 0xcf,0xcf,0xcf,0xcf,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
// state[36 + 2] 0x0003c0 Byte 2 of 2
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,S1_,
0, 0, 0, 0, 0, 0, 0, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
0, 0, 0, 0,S2_, 0, 0,S1_, 0,S1_,S1_, 0, 0,S2_,S2_,S2_,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x97,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x99,0x00,0x9b,0x00,0x9d,0x00,0x9f,0x00,
0xa1,0x00,0xa3,0x00,0xa5,0x00,0xa7,0x00, 0xa9,0x00,0xab,0x00,0xad,0x00,0xaf,0x00,
0x00,0x00,0x00,0x00,0xb8,0x00,0x00,0xb8, 0x00,0xb2,0xbb,0x00,0x00,0xbb,0xbc,0xbd,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0xce,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0xcd,0xcd,0xcd,
// state[39 + 2] 0x000400 Byte 2 of 2
S2_,S2_,S2_,S2_,S2_,S2_,S2_,S2_, S2_,S2_,S2_,S2_,S2_,S2_,S2_,S2_,
S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_, S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_,
S2_,S2_,S2_,S2_,S2_,S2_,S2_,S2_, S2_,S2_,S2_,S2_,S2_,S2_,S2_,S2_,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97, 0x98,0x99,0x9a,0x9b,0x9c,0x9d,0x9e,0x9f,
0xb0,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0xb7, 0xb8,0xb9,0xba,0xbb,0xbc,0xbd,0xbe,0xbf,
0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87, 0x88,0x89,0x8a,0x8b,0x8c,0x8d,0x8e,0x8f,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0xd1,0xd1,0xd1,0xd1,0xd1,0xd1,0xd1,0xd1, 0xd1,0xd1,0xd1,0xd1,0xd1,0xd1,0xd1,0xd1,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0xd1,0xd1,0xd1,0xd1,0xd1,0xd1,0xd1,0xd1, 0xd1,0xd1,0xd1,0xd1,0xd1,0xd1,0xd1,0xd1,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
// state[42 + 2] 0x000440 Byte 2 of 2
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0xa1,0x00,0xa3,0x00,0xa5,0x00,0xa7,0x00, 0xa9,0x00,0xab,0x00,0xad,0x00,0xaf,0x00,
0xb1,0x00,0xb3,0x00,0xb5,0x00,0xb7,0x00, 0xb9,0x00,0xbb,0x00,0xbd,0x00,0xbf,0x00,
// state[44 + 2] 0x000480 Byte 2 of 2
S1_, 0, 0, 0, 0, 0, 0, 0, 0, 0,S1_, 0,S1_, 0,S1_, 0,
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
0x81,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x8b,0x00,0x8d,0x00,0x8f,0x00,
0x91,0x00,0x93,0x00,0x95,0x00,0x97,0x00, 0x99,0x00,0x9b,0x00,0x9d,0x00,0x9f,0x00,
0xa1,0x00,0xa3,0x00,0xa5,0x00,0xa7,0x00, 0xa9,0x00,0xab,0x00,0xad,0x00,0xaf,0x00,
0xb1,0x00,0xb3,0x00,0xb5,0x00,0xb7,0x00, 0xb9,0x00,0xbb,0x00,0xbd,0x00,0xbf,0x00,
// state[46 + 2] 0x0004c0 Byte 2 of 2
S1_,S1_, 0,S1_, 0,S1_, 0,S1_, 0,S1_, 0,S1_, 0,S1_, 0, 0,
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
0x8f,0x82,0x00,0x84,0x00,0x86,0x00,0x88, 0x00,0x8a,0x00,0x8c,0x00,0x8e,0x00,0x00,
0x91,0x00,0x93,0x00,0x95,0x00,0x97,0x00, 0x99,0x00,0x9b,0x00,0x9d,0x00,0x9f,0x00,
0xa1,0x00,0xa3,0x00,0xa5,0x00,0xa7,0x00, 0xa9,0x00,0xab,0x00,0xad,0x00,0xaf,0x00,
0xb1,0x00,0xb3,0x00,0xb5,0x00,0xb7,0x00, 0xb9,0x00,0xbb,0x00,0xbd,0x00,0xbf,0x00,
// state[48 + 2] 0x000500 Byte 2 of 2
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
S1_, 0,S1_, 0,S1_, 0,S1_, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0,S2_,S2_,S2_,S2_,S2_,S2_,S2_, S2_,S2_,S2_,S2_,S2_,S2_,S2_,S2_,
0x81,0x00,0x83,0x00,0x85,0x00,0x87,0x00, 0x89,0x00,0x8b,0x00,0x8d,0x00,0x8f,0x00,
0x91,0x00,0x93,0x00,0x95,0x00,0x97,0x00, 0x99,0x00,0x9b,0x00,0x9d,0x00,0x9f,0x00,
0xa1,0x00,0xa3,0x00,0xa5,0x00,0xa7,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7, 0xa8,0xa9,0xaa,0xab,0xac,0xad,0xae,0xaf,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0xd5,0xd5,0xd5,0xd5,0xd5,0xd5,0xd5, 0xd5,0xd5,0xd5,0xd5,0xd5,0xd5,0xd5,0xd5,
// state[51 + 2] 0x000540 Byte 2 of 2
S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_, S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_,
S2_,S2_,S2_,S2_,S2_,S2_,S2_, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0xb0,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0xb7, 0xb8,0xb9,0xba,0xbb,0xbc,0xbd,0xbe,0xbf,
0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0xd6,0xd6,0xd6,0xd6,0xd6,0xd6,0xd6,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
// state[54 + 2] 0x001000 Byte 2 of 3
6, 6, 55, 57, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 59, 59, 61, 59, 64, 66, 68, 71,
// state[55 + 2] 0x001080 Byte 3 of 3
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
T1_,T1_,T1_,T1_,T1_,T1_,T1_,T1_, T1_,T1_,T1_,T1_,T1_,T1_,T1_,T1_,
T1_,T1_,T1_,T1_,T1_,T1_,T1_,T1_, T1_,T1_,T1_,T1_,T1_,T1_,T1_,T1_,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09, 0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x10,0x11,
0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19, 0x1a,0x1b,0x1c,0x1d,0x1e,0x1f,0x20,0x21,
// state[57 + 2] 0x0010c0 Byte 3 of 3
T1_,T1_,T1_,T1_,T1_,T1_, 0,T1_, 0, 0, 0, 0, 0,T1_, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0x22,0x23,0x24,0x25,0x26,0x27,0x00,0x28, 0x00,0x00,0x00,0x00,0x00,0x29,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
// state[59 + 2] 0x001e00 Byte 3 of 3
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
0x81,0x00,0x83,0x00,0x85,0x00,0x87,0x00, 0x89,0x00,0x8b,0x00,0x8d,0x00,0x8f,0x00,
0x91,0x00,0x93,0x00,0x95,0x00,0x97,0x00, 0x99,0x00,0x9b,0x00,0x9d,0x00,0x9f,0x00,
0xa1,0x00,0xa3,0x00,0xa5,0x00,0xa7,0x00, 0xa9,0x00,0xab,0x00,0xad,0x00,0xaf,0x00,
0xb1,0x00,0xb3,0x00,0xb5,0x00,0xb7,0x00, 0xb9,0x00,0xbb,0x00,0xbd,0x00,0xbf,0x00,
// state[61 + 2] 0x001e80 Byte 3 of 3
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
S1_, 0,S1_, 0,S1_, 0, 0, 0, 0, 0, 0, 0, 0, 0,S32, 0,
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
0x81,0x00,0x83,0x00,0x85,0x00,0x87,0x00, 0x89,0x00,0x8b,0x00,0x8d,0x00,0x8f,0x00,
0x91,0x00,0x93,0x00,0x95,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x9f,0x00,
0xa1,0x00,0xa3,0x00,0xa5,0x00,0xa7,0x00, 0xa9,0x00,0xab,0x00,0xad,0x00,0xaf,0x00,
0xb1,0x00,0xb3,0x00,0xb5,0x00,0xb7,0x00, 0xb9,0x00,0xbb,0x00,0xbd,0x00,0xbf,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0xc3,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
// state[64 + 2] 0x001f00 Byte 3 of 3
0, 0, 0, 0, 0, 0, 0, 0, S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_,
0, 0, 0, 0, 0, 0, 0, 0, S1_,S1_,S1_,S1_,S1_,S1_, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_,
0, 0, 0, 0, 0, 0, 0, 0, S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x90,0x91,0x92,0x93,0x94,0x95,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0xa0,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0xb0,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0xb7,
// state[66 + 2] 0x001f40 Byte 3 of 3
0, 0, 0, 0, 0, 0, 0, 0, S1_,S1_,S1_,S1_,S1_,S1_, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0,S1_, 0,S1_, 0,S1_, 0,S1_,
0, 0, 0, 0, 0, 0, 0, 0, S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x80,0x81,0x82,0x83,0x84,0x85,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x91,0x00,0x93,0x00,0x95,0x00,0x97,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0xa0,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
// state[68 + 2] 0x001f80 Byte 3 of 3
0, 0, 0, 0, 0, 0, 0, 0, S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_,
0, 0, 0, 0, 0, 0, 0, 0, S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_,
0, 0, 0, 0, 0, 0, 0, 0, S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_,
0, 0, 0, 0, 0, 0, 0, 0, S1_,S1_,S2_,S2_,S1_, 0, 0, 0,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0xa0,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0xb0,0xb1,0xb0,0xb1,0xb3,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0xbd,0xbd,0x00,0x00,0x00,0x00,
// state[71 + 2] 0x001fc0 Byte 3 of 3
0, 0, 0, 0, 0, 0, 0, 0, S2_,S2_,S2_,S2_,S1_, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, S1_,S1_,S2_,S2_, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, S1_,S1_,S2_,S2_,S1_, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, S2_,S2_,S2_,S2_,S1_, 0, 0, 0,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0xb2,0xb3,0xb4,0xb5,0x83,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x90,0x91,0xb6,0xb7,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0xa0,0xa1,0xba,0xbb,0xa5,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0xb8,0xb9,0xbc,0xbd,0xb3,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0xbd,0xbd,0xbd,0xbd,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0xbd,0xbd,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0xbd,0xbd,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0xbd,0xbd,0xbd,0xbd,0x00,0x00,0x00,0x00,
// state[74 + 2] 0x002000 Byte 2 of 3
6, 6, 6, 6, 75, 6, 78, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
80, 83, 59, 86, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
// state[75 + 2] 0x002100 Byte 3 of 3
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0,S32, 0, 0, 0,S31,S32, 0, 0, 0, 0,
0, 0,S2_, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x89,0x00, 0x00,0x00,0x6b,0xa5,0x00,0x00,0x00,0x00,
0x00,0x00,0x8e,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0xcf,0x00, 0x00,0x00,0x00,0xc3,0x00,0x00,0x00,0x00,
0x00,0x00,0x85,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
// state[78 + 2] 0x002180 Byte 3 of 3
0, 0, 0,S1_, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
// state[80 + 2] 0x002c00 Byte 3 of 3
S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_, S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_,
S2_,S2_,S2_,S2_,S2_,S2_,S2_,S2_, S2_,S2_,S2_,S2_,S2_,S2_,S2_,S2_,
S2_,S2_,S2_,S2_,S2_,S2_,S2_,S2_, S2_,S2_,S2_,S2_,S2_,S2_,S2_, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0xb0,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0xb7, 0xb8,0xb9,0xba,0xbb,0xbc,0xbd,0xbe,0xbf,
0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87, 0x88,0x89,0x8a,0x8b,0x8c,0x8d,0x8e,0x8f,
0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97, 0x98,0x99,0x9a,0x9b,0x9c,0x9d,0x9e,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0xb1, 0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,
0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0xb1, 0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
// state[83 + 2] 0x002c40 Byte 3 of 3
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
S1_, 0,S32,T1_,S32, 0, 0,S1_, 0,S1_, 0,S1_, 0,S32,S32,S32,
S32, 0,S1_, 0, 0,S1_, 0, 0, 0, 0, 0, 0, 0, 0,S32,S32,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0xa1,0x00,0xab,0x2a,0xbd,0x00,0x00,0xa8, 0x00,0xaa,0x00,0xac,0x00,0x91,0xb1,0x90,
0x92,0x00,0xb3,0x00,0x00,0xb6,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0xbf,0x80,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0xc9,0x00,0xc9,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0xc9,0xc9,0xc9,
0xc9,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0xc8,0xc9,
// state[86 + 2] 0x002cc0 Byte 3 of 3
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
S1_, 0,S1_, 0, 0, 0, 0, 0, 0, 0, 0,S1_, 0,S1_, 0, 0,
0, 0,S1_, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0x81,0x00,0x83,0x00,0x85,0x00,0x87,0x00, 0x89,0x00,0x8b,0x00,0x8d,0x00,0x8f,0x00,
0x91,0x00,0x93,0x00,0x95,0x00,0x97,0x00, 0x99,0x00,0x9b,0x00,0x9d,0x00,0x9f,0x00,
0xa1,0x00,0xa3,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0xac,0x00,0xae,0x00,0x00,
0x00,0x00,0xb3,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
// state[88 + 2] 0x00a000 Byte 2 of 3
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 89, 91, 6, 93, 95, 97, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
// state[89 + 2] 0x00a640 Byte 3 of 3
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0x81,0x00,0x83,0x00,0x85,0x00,0x87,0x00, 0x89,0x00,0x8b,0x00,0x8d,0x00,0x8f,0x00,
0x91,0x00,0x93,0x00,0x95,0x00,0x97,0x00, 0x99,0x00,0x9b,0x00,0x9d,0x00,0x9f,0x00,
0xa1,0x00,0xa3,0x00,0xa5,0x00,0xa7,0x00, 0xa9,0x00,0xab,0x00,0xad,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
// state[91 + 2] 0x00a680 Byte 3 of 3
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
S1_, 0,S1_, 0,S1_, 0,S1_, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0x81,0x00,0x83,0x00,0x85,0x00,0x87,0x00, 0x89,0x00,0x8b,0x00,0x8d,0x00,0x8f,0x00,
0x91,0x00,0x93,0x00,0x95,0x00,0x97,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
// state[93 + 2] 0x00a700 Byte 3 of 3
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
0, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0xa3,0x00,0xa5,0x00,0xa7,0x00, 0xa9,0x00,0xab,0x00,0xad,0x00,0xaf,0x00,
0x00,0x00,0xb3,0x00,0xb5,0x00,0xb7,0x00, 0xb9,0x00,0xbb,0x00,0xbd,0x00,0xbf,0x00,
// state[95 + 2] 0x00a740 Byte 3 of 3
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0,S1_, 0,S1_, 0,T1_,S1_, 0,
0x81,0x00,0x83,0x00,0x85,0x00,0x87,0x00, 0x89,0x00,0x8b,0x00,0x8d,0x00,0x8f,0x00,
0x91,0x00,0x93,0x00,0x95,0x00,0x97,0x00, 0x99,0x00,0x9b,0x00,0x9d,0x00,0x9f,0x00,
0xa1,0x00,0xa3,0x00,0xa5,0x00,0xa7,0x00, 0xa9,0x00,0xab,0x00,0xad,0x00,0xaf,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0xba,0x00,0xbc,0x00,0x2b,0xbf,0x00,
// state[97 + 2] 0x00a780 Byte 3 of 3
S1_, 0,S1_, 0,S1_, 0,S1_, 0, 0, 0, 0,S1_, 0,S32, 0, 0,
S1_, 0,S1_, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S32, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0x81,0x00,0x83,0x00,0x85,0x00,0x87,0x00, 0x00,0x00,0x00,0x8c,0x00,0xa5,0x00,0x00,
0x91,0x00,0x93,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0xa1,0x00,0xa3,0x00,0xa5,0x00,0xa7,0x00, 0xa9,0x00,0xa6,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0xc9,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0xc9,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
// state[100 + 2] 0x00f000 Byte 2 of 3
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,101, 6, 6, 6,
// state[101 + 2] 0x00ff00 Byte 3 of 3
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0,S2_,S2_,S2_,S2_,S2_,S2_,S2_, S2_,S2_,S2_,S2_,S2_,S2_,S2_,S2_,
S2_,S2_,S2_,S2_,S2_,S2_,S2_,S2_, S2_,S2_,S2_, 0, 0, 0, 0, 0,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x81,0x82,0x83,0x84,0x85,0x86,0x87, 0x88,0x89,0x8a,0x8b,0x8c,0x8d,0x8e,0x8f,
0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97, 0x98,0x99,0x9a,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0xbd,0xbd,0xbd,0xbd,0xbd,0xbd,0xbd, 0xbd,0xbd,0xbd,0xbd,0xbd,0xbd,0xbd,0xbd,
0xbd,0xbd,0xbd,0xbd,0xbd,0xbd,0xbd,0xbd, 0xbd,0xbd,0xbd,0x00,0x00,0x00,0x00,0x00,
// state[104 + 2] 0x000000 Byte 2 of 4
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
105, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
// state[105 + 2] 0x010000 Byte 3 of 4
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
106, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
// state[106 + 2] 0x010400 Byte 4 of 4
S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_, S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_,
S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_, S2_,S2_,S2_,S2_,S2_,S2_,S2_,S2_,
S2_,S2_,S2_,S2_,S2_,S2_,S2_,S2_, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0xa8,0xa9,0xaa,0xab,0xac,0xad,0xae,0xaf, 0xb0,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0xb7,
0xb8,0xb9,0xba,0xbb,0xbc,0xbd,0xbe,0xbf, 0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
0x88,0x89,0x8a,0x8b,0x8c,0x8d,0x8e,0x8f, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x91,0x91,0x91,0x91,0x91,0x91,0x91,0x91,
0x91,0x91,0x91,0x91,0x91,0x91,0x91,0x91, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
};
// Remap base[44] = (del, add, string_offset)
// Each RemapEntry initializer is {delete_bytes, add_bytes, bytes_offset}:
// how many input bytes to drop, how many replacement bytes to emit, and where
// those replacement bytes start in utf8repl_lettermarklower_remap_string.
// The first two entries replace a 2-byte input sequence with 3 bytes; the
// remaining 42 replace 3 bytes with 3 bytes. Offsets advance by 3 because
// every replacement is 3 bytes long. The list ends with an all-zero entry
// (not counted in the 44).
static const RemapEntry utf8repl_lettermarklower_remap_base[] = {
{2,3, 0}, {2,3, 3}, {3,3, 6}, {3,3, 9},
{3,3, 12}, {3,3, 15}, {3,3, 18}, {3,3, 21},
{3,3, 24}, {3,3, 27}, {3,3, 30}, {3,3, 33},
{3,3, 36}, {3,3, 39}, {3,3, 42}, {3,3, 45},
{3,3, 48}, {3,3, 51}, {3,3, 54}, {3,3, 57},
{3,3, 60}, {3,3, 63}, {3,3, 66}, {3,3, 69},
{3,3, 72}, {3,3, 75}, {3,3, 78}, {3,3, 81},
{3,3, 84}, {3,3, 87}, {3,3, 90}, {3,3, 93},
{3,3, 96}, {3,3, 99}, {3,3, 102}, {3,3, 105},
{3,3, 108}, {3,3, 111}, {3,3, 114}, {3,3, 117},
{3,3, 120}, {3,3, 123}, {3,3, 126}, {3,3, 129},
{0,0,0} };
// Remap string[132]
// Replacement bytes referenced by the bytes_offset field of the remap_base
// entries: 44 three-byte UTF-8 sequences packed back to back (132 bytes),
// followed by a single trailing 0 byte.
// Decoded, the sequences are U+2C65, U+2C66, U+2D00..U+2D25, U+2D27, U+2D2D,
// U+1D7D and U+1D79, in that order.
static const unsigned char utf8repl_lettermarklower_remap_string[] = {
0xe2,0xb1,0xa5,0xe2,0xb1,0xa6,0xe2,0xb4, 0x80,0xe2,0xb4,0x81,0xe2,0xb4,0x82,0xe2,
0xb4,0x83,0xe2,0xb4,0x84,0xe2,0xb4,0x85, 0xe2,0xb4,0x86,0xe2,0xb4,0x87,0xe2,0xb4,
0x88,0xe2,0xb4,0x89,0xe2,0xb4,0x8a,0xe2, 0xb4,0x8b,0xe2,0xb4,0x8c,0xe2,0xb4,0x8d,
0xe2,0xb4,0x8e,0xe2,0xb4,0x8f,0xe2,0xb4, 0x90,0xe2,0xb4,0x91,0xe2,0xb4,0x92,0xe2,
0xb4,0x93,0xe2,0xb4,0x94,0xe2,0xb4,0x95, 0xe2,0xb4,0x96,0xe2,0xb4,0x97,0xe2,0xb4,
0x98,0xe2,0xb4,0x99,0xe2,0xb4,0x9a,0xe2, 0xb4,0x9b,0xe2,0xb4,0x9c,0xe2,0xb4,0x9d,
0xe2,0xb4,0x9e,0xe2,0xb4,0x9f,0xe2,0xb4, 0xa0,0xe2,0xb4,0xa1,0xe2,0xb4,0xa2,0xe2,
0xb4,0xa3,0xe2,0xb4,0xa4,0xe2,0xb4,0xa5, 0xe2,0xb4,0xa7,0xe2,0xb4,0xad,0xe1,0xb5,
0xbd,0xe1,0xb5,0xb9,0 };
// Per-byte flag table, one entry for each possible byte value 0x00..0xFF
// (16 entries per row). It is handed to the state machine object as its
// fast_state member.
// Entries are 1 for 0x41..0x5A (ASCII 'A'..'Z') and for every byte
// 0x80..0xFF; all other bytes (0x00..0x40 and 0x5B..0x7F) are 0.
// NOTE(review): presumably a 0 marks a byte the replace scanner may pass
// through without consulting the full state table -- confirm against the
// UTF8GenericReplace implementation.
static const unsigned char utf8repl_lettermarklower_fast[256] = {
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,0,0,0,0,0,
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
};
// Bundles the lettermarklower state table and its companion tables into a
// single UTF8ReplaceObj (a typedef of UTF8StateMachineObj). The initializers
// are positional, so their order must match the struct's field order; the
// trailing comment on each line names the field it fills.
static const UTF8ReplaceObj utf8repl_lettermarklower_obj = {
utf8repl_lettermarklower_STATE0,  // state0
utf8repl_lettermarklower_STATE0_SIZE,  // state0_size
utf8repl_lettermarklower_TOTAL_SIZE,  // total_size
utf8repl_lettermarklower_MAX_EXPAND_X4,  // max_expand
utf8repl_lettermarklower_SHIFT,  // entry_shift
utf8repl_lettermarklower_BYTES,  // bytes_per_entry
utf8repl_lettermarklower_LOSUB,  // losub
utf8repl_lettermarklower_HIADD,  // hiadd
utf8repl_lettermarklower,  // state_table
utf8repl_lettermarklower_remap_base,  // remap_base
utf8repl_lettermarklower_remap_string,  // remap_string
utf8repl_lettermarklower_fast  // fast_state
};
// Remove the short macro names used as state-table entry shorthands (X__,
// S1_, S2_, S31, S32, T1_, ...) so they are no longer defined after this
// header has been included.
#undef X__
#undef RJ_
#undef S1_
#undef S2_
#undef S3_
#undef S21
#undef S31
#undef S32
#undef T1_
#undef T2_
#undef S11
#undef SP_
#undef D__
#undef RJA
// Table has 7668 bytes, Hash = 07A2-C4E3
} // End namespace CLD2
#endif // UTF8REPL_LETTERMARKLOWER_H__

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,283 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// State Table follower for scanning UTF-8 strings without converting to
// 32- or 16-bit Unicode values.
//
// Author: dsites@google.com (Dick Sites)
//
#ifndef UTIL_UTF8_UTF8STATETABLE_H_
#define UTIL_UTF8_UTF8STATETABLE_H_
#include <string>
#include "integral_types.h" // for uint8, uint32, uint16
#include "stringpiece.h"
namespace CLD2 {
class OffsetMap;
// These four-byte entries compactly encode how many bytes 0..255 to delete
// in making a string replacement, how many bytes to add 0..255, and the offset
// 0..64k-1 of the replacement string in remap_string.
struct RemapEntry {
uint8 delete_bytes;  // number of input bytes to delete, 0..255
uint8 add_bytes;  // number of replacement bytes to add, 0..255
uint16 bytes_offset;  // offset of the replacement bytes within remap_string
};
// Exit type codes for state tables. All but the first get stuffed into
// signed one-byte entries. The first is only generated by executable code.
// To distinguish from next-state entries, these must be contiguous and
// all <= kExitNone
// NOTE(review): the comment above says "signed", but UTF8StateMachineObj
// declares state_table as uint8, where these codes occupy 240..255 -- confirm
// which wording is intended.
// Only the first enumerator has an explicit value; the rest follow
// consecutively, so inserting or reordering names renumbers every later code.
typedef enum {
kExitDstSpaceFull = 239,
kExitIllegalStructure, // 240
kExitOK, // 241
kExitReject, // 242
kExitReplace1, // 243
kExitReplace2, // 244
kExitReplace3, // 245
kExitReplace21, // 246
kExitReplace31, // 247
kExitReplace32, // 248
kExitReplaceOffset1, // 249
kExitReplaceOffset2, // 250
kExitReplace1S0, // 251
kExitSpecial, // 252
kExitDoAgain, // 253
kExitRejectAlt, // 254
kExitNone // 255
} ExitReason;
// Exit type codes for the two-byte-entry tables: the same names as ExitReason
// with a _2 suffix, in the same order, numbered consecutively from 32767.
// Only the first enumerator has an explicit value.
typedef enum {
kExitDstSpaceFull_2 = 32767, // 0x7fff
kExitIllegalStructure_2, // 32768 0x8000
kExitOK_2, // 32769 0x8001
kExitReject_2, // 32770 0x8002
kExitReplace1_2, // 32771 0x8003
kExitReplace2_2, // 32772 0x8004
kExitReplace3_2, // 32773 0x8005
kExitReplace21_2, // 32774 0x8006
kExitReplace31_2, // 32775 0x8007
kExitReplace32_2, // 32776 0x8008
kExitReplaceOffset1_2, // 32777 0x8009
kExitReplaceOffset2_2, // 32778 0x800a
kExitReplace1S0_2, // 32779 0x800b
kExitSpecial_2, // 32780 0x800c
kExitDoAgain_2, // 32781 0x800d
kExitRejectAlt_2, // 32782 0x800e
kExitNone_2 // 32783 0x800f
} ExitReason_2;
// This struct represents one entire state table. The three initialized byte
// areas are state_table, remap_base, and remap_string. state0 and state0_size
// give the byte offset and length within state_table of the initial state --
// table lookups are expected to start and end in this state, but for
// truncated UTF-8 strings, may end in a different state. These allow a quick
// test for that condition. entry_shift is 8 for tables subscripted by a full
// byte value and 6 for space-optimized tables subscripted by only six
// significant bits in UTF-8 continuation bytes.
// Generated table headers fill this struct with a positional initializer, so
// the field order below must not change.
typedef struct {
const uint32 state0;  // byte offset of the initial state within state_table
const uint32 state0_size;  // length of the initial state
const uint32 total_size;  // NOTE(review): presumably the overall table size -- confirm units
const int max_expand;  // NOTE(review): meaning not visible in this header -- confirm
const int entry_shift;  // 8 = full-byte subscripts, 6 = six-bit subscripts (see above)
const int bytes_per_entry;  // NOTE(review): presumably 1 for this one-byte-entry variant -- confirm
const uint32 losub;  // NOTE(review): meaning not visible in this header -- confirm
const uint32 hiadd;  // NOTE(review): meaning not visible in this header -- confirm
const uint8* state_table;  // the state table itself, one-byte entries
const RemapEntry* remap_base;  // replacement descriptors (delete/add/offset)
const uint8* remap_string;  // replacement bytes indexed by RemapEntry::bytes_offset
const uint8* fast_state;  // NOTE(review): presumably a per-byte fast-path table -- confirm
} UTF8StateMachineObj;
// Near-duplicate declaration for tables with two-byte entries
// The fields have the same names, types and order as UTF8StateMachineObj; the
// only difference is that state_table points to unsigned short entries
// instead of uint8 entries.
typedef struct {
const uint32 state0;
const uint32 state0_size;
const uint32 total_size;
const int max_expand;
const int entry_shift;
const int bytes_per_entry;
const uint32 losub;
const uint32 hiadd;
const unsigned short* state_table;  // two-byte entries
const RemapEntry* remap_base;
const uint8* remap_string;
const uint8* fast_state;
} UTF8StateMachineObj_2;
typedef UTF8StateMachineObj UTF8PropObj;
typedef UTF8StateMachineObj UTF8ScanObj;
typedef UTF8StateMachineObj UTF8ReplaceObj;
typedef UTF8StateMachineObj_2 UTF8PropObj_2;
typedef UTF8StateMachineObj_2 UTF8ReplaceObj_2;
// NOT IMPLEMENTED typedef UTF8StateMachineObj_2 UTF8ScanObj_2;
// Look up property of one UTF-8 character and advance over it
// Return 0 if input length is zero
// Return 0 and advance one byte if input is ill-formed
uint8 UTF8GenericProperty(const UTF8PropObj* st,
const uint8** src,
int* srclen);
// Look up property of one UTF-8 character (assumed to be valid).
// (This is a faster version of UTF8GenericProperty.)
bool UTF8HasGenericProperty(const UTF8PropObj& st, const char* src);
// BigOneByte versions are needed for tables > 240 states, but most
// won't need the TwoByte versions.
// Look up property of one UTF-8 character and advance over it
// Return 0 if input length is zero
// Return 0 and advance one byte if input is ill-formed
uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st,
const uint8** src,
int* srclen);
// Look up property of one UTF-8 character (assumed to be valid).
// (This is a faster version of UTF8GenericProperty.)
bool UTF8HasGenericPropertyBigOneByte(const UTF8PropObj& st, const char* src);
// TwoByte versions are needed for tables > 240 states that don't fit onto
// BigOneByte -- rare ultimate fallback
// They take a UTF8PropObj_2, i.e. a table with two-byte entries.
// Look up property of one UTF-8 character and advance over it
// Return 0 if input length is zero
// Return 0 and advance one byte if input is ill-formed
uint8 UTF8GenericPropertyTwoByte(const UTF8PropObj_2* st,
const uint8** src,
int* srclen);
// Look up property of one UTF-8 character (assumed to be valid).
// (This is a faster version of UTF8GenericProperty.)
bool UTF8HasGenericPropertyTwoByte(const UTF8PropObj_2& st, const char* src);
// Scan a UTF-8 stringpiece based on a state table.
// Always scan complete UTF-8 characters
// Set number of bytes scanned. Return reason for exiting
int UTF8GenericScan(const UTF8ScanObj* st,
const StringPiece& str,
int* bytes_consumed);
// Scan a UTF-8 stringpiece based on state table, copying to output stringpiece
// and doing text replacements.
// Always scan complete UTF-8 characters
// Set number of bytes consumed from input, number filled to output.
// Return reason for exiting
// Also writes an optional OffsetMap. Pass NULL to skip writing one.
//
// st              state table object driving the scan and replacements
// istr            input text
// ostr            output buffer that receives the copied/replaced text
// is_plain_text   NOTE(review): semantics not visible in this header -- confirm
// bytes_consumed  out: number of input bytes consumed
// bytes_filled    out: number of bytes written to the output
// chars_changed   out: NOTE(review): presumably a count of replaced
//                 characters -- confirm
// offsetmap       optional; NULL to skip
// NOTE(review): the int result is presumably one of the ExitReason codes --
// confirm against the implementation.
int UTF8GenericReplace(const UTF8ReplaceObj* st,
const StringPiece& istr,
StringPiece& ostr,
bool is_plain_text,
int* bytes_consumed,
int* bytes_filled,
int* chars_changed,
OffsetMap* offsetmap);
// Older version without offsetmap
int UTF8GenericReplace(const UTF8ReplaceObj* st,
const StringPiece& istr,
StringPiece& ostr,
bool is_plain_text,
int* bytes_consumed,
int* bytes_filled,
int* chars_changed);
// Older version without is_plain_text or offsetmap
int UTF8GenericReplace(const UTF8ReplaceObj* st,
const StringPiece& istr,
StringPiece& ostr,
int* bytes_consumed,
int* bytes_filled,
int* chars_changed);
// TwoByte version is needed for tables > about 256 states, such
// as the table for full Unicode 4.1 canonical + compatibility mapping
// Scan a UTF-8 stringpiece based on state table with two-byte entries,
// copying to output stringpiece
// and doing text replacements.
// Always scan complete UTF-8 characters
// Set number of bytes consumed from input, number filled to output.
// Return reason for exiting
// Also writes an optional OffsetMap. Pass NULL to skip writing one.
//
// The three overloads below take the same parameter lists as the three
// UTF8GenericReplace overloads, except that st is a UTF8ReplaceObj_2.
// bytes_consumed and bytes_filled are the out-parameters for the consumed
// and filled byte counts described above.
int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
const StringPiece& istr,
StringPiece& ostr,
bool is_plain_text,
int* bytes_consumed,
int* bytes_filled,
int* chars_changed,
OffsetMap* offsetmap);
// Older version without offsetmap
int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
const StringPiece& istr,
StringPiece& ostr,
bool is_plain_text,
int* bytes_consumed,
int* bytes_filled,
int* chars_changed);
// Older version without is_plain_text or offsetmap
int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
const StringPiece& istr,
StringPiece& ostr,
int* bytes_consumed,
int* bytes_filled,
int* chars_changed);
// Length in bytes of a UTF-8 character, indexed by the value of its first
// byte (32 entries per row):
//   0x00..0xBF -> 1
//   0xC0..0xDF -> 2
//   0xE0..0xEF -> 3
//   0xF0..0xFF -> 4
// The table does no validation: continuation bytes 0x80..0xBF map to 1 and
// every byte from 0xF0 up, including 0xF8..0xFF, maps to 4.
static const unsigned char kUTF8LenTbl[256] = {
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4
};
inline int UTF8OneCharLen(const char* in) {
return kUTF8LenTbl[*reinterpret_cast<const uint8*>(in)];
}
// Adjust a stringpiece to encompass complete UTF-8 characters.
// The data pointer will be increased by 0..3 bytes to get to a character
// boundary, and the length will then be decreased by 0..3 bytes
// to encompass the last complete character.
// This is useful especially when a UTF-8 string must be put into a fixed-
// maximum-size buffer cleanly, such as a MySQL buffer.
void UTF8TrimToChars(StringPiece* istr);
} // End namespace CLD2
#endif // UTIL_UTF8_UTF8STATETABLE_H_

View File

@ -0,0 +1,32 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this file,
* You can obtain one at http://mozilla.org/MPL/2.0/. */
addOnPreMain(function() {
  /**
   * Worker message handler. Runs language detection on the string received in
   * aMsg.data and posts back one object with two fields:
   *  - 'language': the string read from the pointer returned by
   *    _detectLangCode
   *  - 'confident': boolean form of the "lastResultReliable" call result
   *
   * Property names are spelled with bracket/string syntax as in the rest of
   * this file.
   */
  onmessage = function(aMsg) {
    // Convert the string to an array of UTF8 bytes. TextEncoder produces
    // UTF-8 by default and its 'encoding' attribute is read-only, so there is
    // nothing to configure (assigning to it is a silent no-op in sloppy mode
    // and a TypeError in strict mode).
    var utf8Array = new TextEncoder()['encode'](aMsg.data);

    // Copy the UTF8 byte array to the heap.
    var strLength = utf8Array.length;
    var ptr = Module['_malloc'](strLength + 1);
    try {
      var heap = Module['HEAPU8'];
      new Uint8Array(heap.buffer, ptr, strLength).set(utf8Array);
      // Add a \0 at the end of the C string.
      heap[ptr + strLength] = 0;

      var lang = Pointer_stringify(_detectLangCode(ptr));
      var confident = !!Module['ccall']("lastResultReliable", "number");
      postMessage({'language': lang,
                   'confident': confident});
    } finally {
      // Always release the input buffer, even if detection or posting the
      // result throws; otherwise every failed request leaks heap memory.
      Module['_free'](ptr);
    }
  };

  // Tell the owner of this worker that messages can now be processed.
  postMessage("ready");
});

View File

@ -0,0 +1,320 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Author: dsites@google.com (Dick Sites)
//
// NOTE:
// Baybayin (ancient script of the Philippines) is detected as TAGALOG.
// Chu Nom (Vietnamese ancient Han characters) is detected as VIETNAMESE.
// HAITIAN_CREOLE is detected as such.
// NORWEGIAN and NORWEGIAN_N are detected separately (but not robustly)
// PORTUGUESE, PORTUGUESE_P, and PORTUGUESE_B are all detected as PORTUGUESE.
// ROMANIAN-Latin is detected as ROMANIAN; ROMANIAN-Cyrillic as ROMANIAN.
// BOSNIAN is not detected as such, but likely scores as Croatian or Serbian.
// MONTENEGRIN is not detected as such, but likely scores as Serbian.
// CROATIAN is detected in the Latin script
// SERBIAN is detected in the Cyrillic and Latin scripts
// Zhuang is detected in the Latin script only.
//
// The languages X_PIG_LATIN and X_KLINGON are detected in the
// extended calls ExtDetectLanguageSummary().
//
// UNKNOWN_LANGUAGE is returned if no language's internal reliability measure
// is high enough. This happens with non-text input such as the bytes of a
// JPEG, and also with text in languages outside the training set.
//
// The following languages are to be detected in multiple scripts:
// AZERBAIJANI (Latin, Cyrillic*, Arabic*)
// BURMESE (Latin, Myanmar)
// HAUSA (Latin, Arabic)
// KASHMIRI (Arabic, Devanagari)
// KAZAKH (Latin, Cyrillic, Arabic)
// KURDISH (Latin*, Arabic)
// KYRGYZ (Cyrillic, Arabic)
// LIMBU (Devanagari, Limbu)
// MONGOLIAN (Cyrillic, Mongolian)
// SANSKRIT (Latin, Devanagari)
// SINDHI (Arabic, Devanagari)
// TAGALOG (Latin, Tagalog)
// TAJIK (Cyrillic, Arabic*)
// TATAR (Latin, Cyrillic, Arabic)
// TURKMEN (Latin, Cyrillic, Arabic)
// UIGHUR (Latin, Cyrillic, Arabic)
// UZBEK (Latin, Cyrillic, Arabic)
//
// * Due to a shortage of training text, AZERBAIJANI is not currently detected
// in Arabic or Cyrillic scripts, nor KURDISH in Latin script, nor TAJIK in
// Arabic script.
//
#ifndef I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_
#define I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_
#include <vector>
#include "../internal/lang_script.h" // For Language
namespace CLD2 {
// Scan interchange-valid UTF-8 bytes and detect most likely language,
// or set of languages.
//
// Design goals:
// Skip over big stretches of HTML tags
// Able to return ranges of different languages
// Relatively small tables and relatively fast processing
// Thread safe
//
// For HTML documents, tags are skipped, along with <script> ... </script>
// and <style> ... </style> sequences, and entities are expanded.
//
// We distinguish between bytes of the raw input buffer and bytes of non-tag
// text letters. Since tags can be over 50% of the bytes of an HTML Page,
// and are nearly all seven-bit ASCII English, we prefer to distinguish
// language mixture fractions based on just the non-tag text.
//
// Inputs: text and text_length
// Code skips HTML tags and expands HTML entities, unless
// is_plain_text is true
// Outputs:
// language3 is an array of the top 3 languages or UNKNOWN_LANGUAGE
// percent3 is an array of the text percentages 0..100 of the top 3 languages
// text_bytes is the amount of non-tag/letters-only text found
// is_reliable set true if the returned Language is some amount more
// probable than the second-best Language. Calculation is a complex function
// of the length of the text and the different-script runs of text.
// Return value: the most likely Language for the majority of the input text
// Length 0 input returns UNKNOWN_LANGUAGE. Very short indeterminate text
// defaults to ENGLISH.
//
// The first two versions return ENGLISH instead of UNKNOWN_LANGUAGE, for
// backwards compatibility with a different detector.
//
// The third version may return UNKNOWN_LANGUAGE, and also returns extended
// language codes from lang_script.h
//
// Instead of individual arguments, pass in hints as an initialized struct
// Init to {NULL, NULL, UNKNOWN_ENCODING, UNKNOWN_LANGUAGE} if not known.
//
// Pass in hints whenever possible; doing so improves detection accuracy. The
// set of passed-in hints are all information that is external to the text
// itself.
//
// The content_language_hint is intended to come from an HTTP header
// Content-Language: field, the tld_hint from the hostname of a URL, the
// encoding-hint from an encoding detector applied to the input
// document, and the language hint from any other context you might have.
// The lang= tags inside an HTML document will be picked up as hints
// by code within the compact language detector.
// Bundle of out-of-band hints passed to the detector. When a hint is not
// known, the fields are initialized to NULL, NULL, UNKNOWN_ENCODING and
// UNKNOWN_LANGUAGE respectively.
// NOTE(review): encoding_hint is declared as a plain int; presumably it
// carries an Encoding enum value from encodings.h -- confirm.
typedef struct {
const char* content_language_hint; // "mi,en" boosts Maori and English
const char* tld_hint; // "id" boosts Indonesian
int encoding_hint; // SJS boosts Japanese
Language language_hint; // ITALIAN boosts it
} CLDHints;
// Upper bound on the byte size of one result chunk.
// NOTE(review): 65535 matches the largest value a 16-bit unsigned field can
// hold (assuming uint16 is 16 bits wide), which is the type of
// ResultChunk::bytes below -- confirm that longer runs are split.
static const int kMaxResultChunkBytes = 65535;
// For returning a vector of per-language pieces of the input buffer
// Unreliable and too-short are mapped to UNKNOWN_LANGUAGE
typedef struct {
int offset; // Starting byte offset in original buffer
uint16 bytes; // Number of bytes in chunk
uint16 lang1; // Top lang, as full Language. Apply
// static_cast<Language>() to this short value.
} ResultChunk;
// Sequence of ResultChunk entries, as filled in by the ExtDetectLanguageSummary
// overload that takes a ResultChunkVector*.
typedef std::vector<ResultChunk> ResultChunkVector;
// Scan interchange-valid UTF-8 bytes and detect most likely language
// buffer / buffer_length: the input text and its length
// is_plain_text: if true, HTML tags are not skipped and HTML entities are
// not expanded
// is_reliable: output; set true if the returned Language is some amount
// more probable than the second-best Language
// Returns the most likely Language for the majority of the input text.
Language DetectLanguage(
const char* buffer,
int buffer_length,
bool is_plain_text,
bool* is_reliable);
// Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
// language3[0] is usually also the return value
// Outputs:
// language3: array of the top 3 languages or UNKNOWN_LANGUAGE
// percent3: array of the text percentages 0..100 of the top 3 languages
// text_bytes: amount of non-tag/letters-only text found
// is_reliable: set true if the returned Language is some amount more
// probable than the second-best Language
Language DetectLanguageSummary(
const char* buffer,
int buffer_length,
bool is_plain_text,
Language* language3,
int* percent3,
int* text_bytes,
bool* is_reliable);
// Same as above, with hints supplied
// Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
// language3[0] is usually also the return value
// The hints are passed here as three individual arguments; the CLDHints
// struct declared earlier in this header bundles the same kind of information
// for the struct-based ExtDetectLanguageSummary overload.
Language DetectLanguageSummary(
const char* buffer,
int buffer_length,
bool is_plain_text,
const char* tld_hint, // "id" boosts Indonesian
int encoding_hint, // SJS boosts Japanese
Language language_hint, // ITALIAN boosts it
Language* language3,
int* percent3,
int* text_bytes,
bool* is_reliable);
// Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
// languages.
//
// Extended languages are additional interface languages and Unicode
// single-language scripts, from lang_script.h
//
// language3[0] is usually also the return value
//
// This overload takes no hints. language3, percent3, text_bytes and
// is_reliable are outputs.
Language ExtDetectLanguageSummary(
const char* buffer,
int buffer_length,
bool is_plain_text,
Language* language3,
int* percent3,
int* text_bytes,
bool* is_reliable);
// Same as above, with hints supplied
// Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
// languages.
//
// Extended languages are additional Google interface languages and Unicode
// single-language scripts, from lang_script.h
//
// language3[0] is usually also the return value
//
// tld_hint, encoding_hint and language_hint are inputs; language3, percent3,
// text_bytes and is_reliable are outputs.
Language ExtDetectLanguageSummary(
const char* buffer,
int buffer_length,
bool is_plain_text,
const char* tld_hint, // "id" boosts Indonesian
int encoding_hint, // SJS boosts Japanese
Language language_hint, // ITALIAN boosts it
Language* language3,
int* percent3,
int* text_bytes,
bool* is_reliable);
// Same as above, and also returns 3 internal language scores as a ratio to
// normal score for real text in that language. Scores close to 1.0 indicate
// normal text, while scores far away from 1.0 indicate badly-skewed text or
// gibberish
//
// normalized_score3 is the additional output that receives those 3 scores.
// NOTE(review): presumably indexed in parallel with language3 -- confirm.
Language ExtDetectLanguageSummary(
const char* buffer,
int buffer_length,
bool is_plain_text,
const char* tld_hint, // "id" boosts Indonesian
int encoding_hint, // SJS boosts Japanese
Language language_hint, // ITALIAN boosts it
Language* language3,
int* percent3,
double* normalized_score3,
int* text_bytes,
bool* is_reliable);
// Use this one.
// Hints are collected into a struct.
// Flags are passed in (normally zero).
//
// Also returns 3 internal language scores as a ratio to
// normal score for real text in that language. Scores close to 1.0 indicate
// normal text, while scores far away from 1.0 indicate badly-skewed text or
// gibberish
//
// Returns a vector of chunks in different languages, so that caller may
// spell-check, translate, or otherwise process different parts of the input
// buffer in language-dependent ways.
//
// cld_hints: pointer to a CLDHints struct holding the hints
// flags: normally zero; see the kCLDFlag* constants declared in this header
// resultchunkvector: output that receives the per-language chunks
// NOTE(review): whether cld_hints or resultchunkvector may be NULL is not
// stated in this header -- confirm before relying on it.
//
Language ExtDetectLanguageSummary(
const char* buffer,
int buffer_length,
bool is_plain_text,
const CLDHints* cld_hints,
int flags,
Language* language3,
int* percent3,
double* normalized_score3,
ResultChunkVector* resultchunkvector,
int* text_bytes,
bool* is_reliable);
// Return version text string
// String is "code_version - data_build_date"
// NOTE(review): ownership of the returned pointer is not stated here;
// presumably it refers to static storage and must not be freed -- confirm.
const char* DetectLanguageVersion();
// Public use flags, debug output controls
// Each constant below is a distinct single bit in the range 0x0100..0x2000.
// NOTE(review): presumably several flags are combined with bitwise OR in the
// `flags` argument of ExtDetectLanguageSummary -- confirm.
static const int kCLDFlagScoreAsQuads = 0x0100; // Force Greek, etc. => quads
static const int kCLDFlagHtml = 0x0200; // Debug HTML => stderr
static const int kCLDFlagCr = 0x0400; // <cr> per chunk if HTML
static const int kCLDFlagVerbose = 0x0800; // More debug HTML => stderr
static const int kCLDFlagQuiet = 0x1000; // Less debug HTML => stderr
static const int kCLDFlagEcho = 0x2000; // Echo input => stderr
/***
Flag meanings:
kCLDFlagScoreAsQuads
Normally, several languages are detected solely by their Unicode script.
Combined with appropriate lookup tables, this flag forces them instead
to be detected via quadgrams. This can be a useful refinement when looking
for meaningful text in these languages, instead of just character sets.
The default tables do not support this use.
kCLDFlagHtml
For each detection call, write an HTML file to stderr, showing the text
chunks and their detected languages.
kCLDFlagCr
In that HTML file, force a new line for each chunk.
kCLDFlagVerbose
In that HTML file, show every lookup entry.
kCLDFlagQuiet
In that HTML file, suppress most of the output detail.
kCLDFlagEcho
Echo every input buffer to stderr.
***/
// Debug output: Print the resultchunkvector to file f
// f: destination stream
// src: NOTE(review): presumably the original input buffer that the chunk
// offsets refer to -- confirm against the implementation.
// NOTE(review): FILE is used here, but this header only includes <vector> and
// lang_script.h directly; presumably the stdio declarations arrive through
// lang_script.h -- confirm.
void DumpResultChunkVector(FILE* f, const char* src,
ResultChunkVector* resultchunkvector);
// The declarations below exist only when CLD2_DYNAMIC_MODE is defined.
#ifdef CLD2_DYNAMIC_MODE
// If compiled with dynamic mode, load data from the specified file location.
// If other data has already been loaded, it is discarded and the data is read
// in from the specified file location again (even if the file has not changed).
// WARNING: Before calling this method, language detection will always fail
// and will always return the unknown language.
void loadData(const char* fileName);
// If compiled with dynamic mode, unload the previously-loaded data.
// WARNING: After calling this method, language detection will no longer work
// and will always return the unknown language.
void unloadData();
// Returns true if and only if data has been loaded via a call to loadData(...)
// and has not been subsequently unloaded via a call to unloadData().
bool isDataLoaded();
#endif // #ifdef CLD2_DYNAMIC_MODE
}; // End namespace CLD2
#endif // I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_

View File

@ -0,0 +1,169 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Author: dsites@google.com (Dick Sites)
//
#ifndef I18N_ENCODINGS_CLD2_PUBLIC_ENCODINGS_H__
#define I18N_ENCODINGS_CLD2_PUBLIC_ENCODINGS_H__
namespace CLD2 {
// Identifiers for the character encodings known to CLD2. Every enumerator is
// given an explicit value; the values run consecutively from 0 (ISO_8859_1)
// to 74 (SOFTBANK_ISO_2022_JP), followed by NUM_ENCODINGS (75), which is a
// count sentinel rather than a valid encoding. New entries therefore belong
// immediately before NUM_ENCODINGS.
enum Encoding {
ISO_8859_1 = 0, // ASCII
ISO_8859_2 = 1, // Latin2
ISO_8859_3 = 2, //
ISO_8859_4 = 3, // Latin4
ISO_8859_5 = 4, // ISO-8859-5
ISO_8859_6 = 5, // Arabic
ISO_8859_7 = 6, // Greek
ISO_8859_8 = 7, // Hebrew
ISO_8859_9 = 8, //
ISO_8859_10 = 9, //
JAPANESE_EUC_JP = 10, // EUC_JP
JAPANESE_SHIFT_JIS = 11, // SJS
JAPANESE_JIS = 12, // JIS
CHINESE_BIG5 = 13, // BIG5
CHINESE_GB = 14, // GB
CHINESE_EUC_CN = 15, // Misnamed. Should be EUC_TW. Was Basis Tech
// CNS11643EUC, before that EUC-CN(!)
KOREAN_EUC_KR = 16, // KSC
UNICODE_UNUSED = 17, // Unicode
CHINESE_EUC_DEC = 18, // Misnamed. Should be EUC_TW. Was
// CNS11643EUC, before that EUC.
CHINESE_CNS = 19, // Misnamed. Should be EUC_TW. Was
// CNS11643EUC, before that CNS.
CHINESE_BIG5_CP950 = 20, // BIG5_CP950
JAPANESE_CP932 = 21, // CP932
UTF8 = 22,
UNKNOWN_ENCODING = 23,
ASCII_7BIT = 24, // ISO_8859_1 with all characters <= 127.
RUSSIAN_KOI8_R = 25, // KOI8R
RUSSIAN_CP1251 = 26, // CP1251
//----------------------------------------------------------
MSFT_CP1252 = 27, // 27: CP1252 aka MSFT euro ascii
RUSSIAN_KOI8_RU = 28, // CP21866 aka KOI8-U, used for Ukrainian.
// Misnamed, this is _not_ KOI8-RU but KOI8-U.
// KOI8-U is used much more often than KOI8-RU.
MSFT_CP1250 = 29, // CP1250 aka MSFT eastern european
ISO_8859_15 = 30, // aka ISO_8859_0 aka ISO_8859_1 euroized
//----------------------------------------------------------
//----------------------------------------------------------
MSFT_CP1254 = 31, // used for Turkish
MSFT_CP1257 = 32, // used in Baltic countries
//----------------------------------------------------------
//----------------------------------------------------------
//----------------------------------------------------------
ISO_8859_11 = 33, // aka TIS-620, used for Thai
MSFT_CP874 = 34, // used for Thai
MSFT_CP1256 = 35, // used for Arabic
//----------------------------------------------------------
MSFT_CP1255 = 36, // Logical Hebrew Microsoft
ISO_8859_8_I = 37, // Iso Hebrew Logical
HEBREW_VISUAL = 38, // Iso Hebrew Visual
//----------------------------------------------------------
//----------------------------------------------------------
CZECH_CP852 = 39,
CZECH_CSN_369103 = 40, // aka ISO_IR_139 aka KOI8_CS
MSFT_CP1253 = 41, // used for Greek
RUSSIAN_CP866 = 42,
//----------------------------------------------------------
//----------------------------------------------------------
// Handled by iconv in glibc
ISO_8859_13 = 43,
ISO_2022_KR = 44,
GBK = 45,
GB18030 = 46,
BIG5_HKSCS = 47,
ISO_2022_CN = 48,
//-----------------------------------------------------------
// Following 4 encodings are deprecated (font encodings)
TSCII = 49,
TAMIL_MONO = 50,
TAMIL_BI = 51,
JAGRAN = 52,
MACINTOSH_ROMAN = 53,
UTF7 = 54,
//-----------------------------------------------------------
// Following 2 encodings are deprecated (font encodings)
BHASKAR = 55, // Indic encoding - Devanagari
HTCHANAKYA = 56, // 56 Indic encoding - Devanagari
//-----------------------------------------------------------
UTF16BE = 57, // big-endian UTF-16
UTF16LE = 58, // little-endian UTF-16
UTF32BE = 59, // big-endian UTF-32
UTF32LE = 60, // little-endian UTF-32
//-----------------------------------------------------------
//-----------------------------------------------------------
// An encoding that means "This is not text, but it may have some
// simple ASCII text embedded". Intended input conversion
// is to keep strings of >=4 seven-bit ASCII characters
BINARYENC = 61,
//-----------------------------------------------------------
//-----------------------------------------------------------
// Some Web pages allow a mixture of HZ-GB and GB-2312 by using
// ~{ ... ~} for 2-byte pairs, and the browsers support this.
HZ_GB_2312 = 62,
//-----------------------------------------------------------
//-----------------------------------------------------------
// Some external vendors make the common input error of
// converting MSFT_CP1252 to UTF8 *twice*.
UTF8UTF8 = 63,
//-----------------------------------------------------------
//-----------------------------------------------------------
// Following 6 encodings are deprecated (font encodings)
TAM_ELANGO = 64, // Elango - Tamil
TAM_LTTMBARANI = 65, // Barani - Tamil
TAM_SHREE = 66, // Shree - Tamil
TAM_TBOOMIS = 67, // TBoomis - Tamil
TAM_TMNEWS = 68, // TMNews - Tamil
TAM_WEBTAMIL = 69, // Webtamil - Tamil
//-----------------------------------------------------------
//-----------------------------------------------------------
// Shift_JIS variants used by Japanese cell phone carriers.
KDDI_SHIFT_JIS = 70,
DOCOMO_SHIFT_JIS = 71,
SOFTBANK_SHIFT_JIS = 72,
// ISO-2022-JP variants used by KDDI and SoftBank.
KDDI_ISO_2022_JP = 73,
SOFTBANK_ISO_2022_JP = 74,
//-----------------------------------------------------------
NUM_ENCODINGS = 75, // Always keep this at the end. It is not a
// valid Encoding enum, it is only used to
// indicate the total number of Encodings.
};
} // End namespace CLD2
#endif // I18N_ENCODINGS_CLD2_PUBLIC_ENCODINGS_H__

View File

@ -0,0 +1,15 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
# Destination path for the JS modules listed in EXTRA_JS_MODULES below.
# NOTE(review): LanguageDetector.jsm loads its worker from
# resource:///modules/translation/cld-worker.js, which presumably corresponds
# to this path -- keep the two in sync.
JS_MODULES_PATH = 'modules/translation'
# Files shipped as JS modules: the worker script, its companion .mem file
# (presumably the Emscripten memory initializer -- confirm), and the
# LanguageDetector.jsm module.
EXTRA_JS_MODULES = [
'cld2/cld-worker.js',
'cld2/cld-worker.js.mem',
'LanguageDetector.jsm'
]
# Register this directory's xpcshell test manifest.
XPCSHELL_TESTS_MANIFESTS += [
'test/xpcshell.ini'
]

View File

@ -0,0 +1,395 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Author: dsites@google.com (Dick Sites)
//
// Unit test compact language detector, CLD2
//
// Test strings.
// English sample text, assembled from several concatenated string literals.
// Unlike the kTeststr_<lang>_<Script> constants that follow, this name carries
// no script suffix.
const kTeststr_en =
"confiscation of goods is assigned as the penalty part most of the courts " +
"consist of members and when it is necessary to bring public cases before a " +
"jury of members two courts combine for the purpose the most important cases " +
"of all are brought jurors or";
const kTeststr_aa_Latn = " nagay tanito nagay tanto nagayna naharsi nahrur nake nala nammay nammay haytu nanu narig ne ni num numu o obare obe obe obisse oggole ogli olloyta ongorowe orbise othoga r rabe rade ra e rage rakub rasitte rasu reyta rog ruddi ruga s sa al bada sa ala";
const kTeststr_ab_Cyrl = " а зуа абзиара дақәшәоит ан лыбзиабара ахә амаӡам ауаҩы игәы иҭоу ихы иҿы ианубаалоит аҧҳәыс ҧшӡа ахацәа лышьҭоуп аҿаасҭа лара дрышьҭоуп";
const kTeststr_af_Latn = " aam skukuza die naam beteken hy wat skoonvee of hy wat alles onderstebo keer wysig bosveldkampe boskampe is kleiner afgeleë ruskampe wat oor min fasiliteite beskik daar is geen restaurante of winkels nie en slegs oornagbesoekers word toegelaat bateleur";
const kTeststr_ak_Latn = "Wɔwoo Hilla Limann Mumu-Ɔpɛnimba 12 afe 1934. Wɔwoo no wɔ Gwollu wɔ Sisala Mantaw mu Nna ne maame yɛ Mma Hayawah. Ne papa so nna ɔyɛ Babini Yomu. Ɔwarr Fulera Limann ? Ne mba yɛ esuon-- Lariba Montia [wɔwoo no Limann]; Baba Limann; Sibi Andan [wɔwoo no Limann]; Lida Limann; Danni Limann; Zilla Limann na Salma Limann. Ɔtenaa ase kɔpemm Sanda-Kwakwa da ɛtɔ so 23 wɔ afe 1998 wɔ ?.";
const kTeststr_am_Ethi = " ለመጠይቅ ወደ እስክንድርያ ላኩዋቸውና የእስክንድርያ ጳጳስ አቴናስዮስ ፍሬምንጦስን እራሳቸውን ሾመው ልከዋል ከዚያ እስከ ዓ ም ድረስ የኢትዮጵያ አቡነ";
const kTeststr_ar_Arab = "احتيالية بيع أي حساب";
const kTeststr_as_Beng = "অঞ্চল নতুন সদস্যবৃন্দ সকলোৱে ভৰ্তি হব পাৰে মুল পৃষ্ঠা জন লেখক গুগ ল দল সাৰাংশ ই পত্ৰ টা বাৰ্তা এজন";
const kTeststr_ay_Latn = " aru wijar aru ispañula ukaran aru witanam aru kurti aru kalis aru warani aru malta aru yatiyawi niya jakitanaka isluwiñ aru lmir phuran aru masirunan aru purtukal aru kruwat aru jakira urtu aru inklisa pirsan aru suyku aru malay aru jisk aptayma thaya";
const kTeststr_az_Arab = " آذربایجان دا انسان حاقلاری ائوی آچیلاجاق ب م ت ائلچيسي برمه موخاليفتي نين ليدئري ايله گؤروشه بيليب ترس شوونيسم فارس از آزادي ملتهاي تورکمن";
const kTeststr_az_Latn = " a az qalıb breyn rinq intellektual oyunu üzrə yarışın zona mərhələləri keçirilib miq un qalıqlarının dənizdən çıxarılması davam edir məhəmməd peyğəmbərin karikaturalarını çap edən qəzetin baş redaktoru iş otağında ölüb";
const kTeststr_ba_Cyrl = " арналђан бындай ђилми эш тіркињлњ тњјге тапєыр нњшер ителњ ғинуар бєхет именлектє етешлектє ауыл ўќмерџєре хеџмєт юлын ћайлаѓанда";
const kTeststr_be_Cyrl = " а друкаваць іх не было тэхнічна магчыма бліжэй за вільню тым самым часам нямецкае кіраўніцтва прапаноўвала апроч ўвядзення лацінкі яе";
const kTeststr_bg_Cyrl = " а дума попада в състояние на изпитание ключовите думи с предсказана малко под то изискване на страниците за търсене в";
//const kTeststr_bh_Deva = " विकिपीडिया इंटरनेट आधारित एक मुक्त ज्ञानकोष परियोजना ह ई विकि के रुप मेँ बा यानी एगो अईसन जाल पृष्ठ जे सभन के संपादन करे के छूट देवेला विकिपीडिया शब्द विकि अउर इनसाइक्लोपीडिया ज्ञानकोष शब्दन के मिला के बनल बा विकिपीडिया एक बहुभाषीय प्रकल्प ह अउर स्वयंसेवकन के सहकार से निर्मित बा जेहु के भी इंटरनेट तक पहुँच बा ऊ विकिपीडिया पर लिख सकत बा अउर लेखन के संपादन कर सकत बा";
// From 10% testing part of new lang=bh scrape
const kTeststr_bh_Deva = "काल में उनका हमला से बचे खाती एहिजा भाग के अइले आ भोजपुर नाम से नगर बसवले. एकरा बारे में विस्तार से जानकारी नीचे दीहल गइल बा. बाकिर आश्चर्यजनक रूप से मालवा के राजा भोज के बिहार आवे आ भोजपुर नगर बसावे आ चाहे भोजपुरी के साथे उनकर कवनो संबंध होखे के कवनो जानकारी भोपाल के भोज संस्थान आ चाहे मध्य प्रदेश के इतिहासकार लोगन के तनिको नइखे. हालांकि ऊ सब लोग एह बात के मानत बा कि एकरा बारे में अबहीं तकले मूर्ति बनवइलें. राजा भोज के जवना जगहा पऽ वाग्देवी के दर्शन भइल रहे, ओही स्थान पऽ एह मूर्ति के स्थापना कइल गइल. अब अगर एह मंदिर के एह शिलालेख के तस्वीर (पृष्ठ संख्या 33 पऽ प्रकाशित) रउआ धेयान से देखीं तऽ एकरा पऽ कैथी लिपि में -सीताराम- लिखल साफ लउकत बा. कैथी भोजपुरी के बहुत प्रचलित लिपि रहल बिया. एकरा बारे में कवनो शंका संदेह बिहार-यूपी के जानकार लोगन में नइखे. एल. एस. एस. वो माले के लिखल पढ़ीं ";
const kTeststr_bi_Latn = " king wantaem nomo hem i sakem setan mo ol rabis enjel blong hem oli aot long heven oli kamdaon long wol taswe ol samting oli kam nogud olgeta long wol ya stat long revelesen ol faet kakae i sot ol sik mo fasin blong brekem loa oli kam antap olgeta samting";
const kTeststr_blu_Latn = " Kuv hlub koj txawm lub ntuj yuav si ntshi nphaus los kuv tsis ua siab nkaug txawm ntiab teb yuav si ntshi nphaus los kuv tseem ua lon tsaug vim kuv hlub koj tag lub siab";
const kTeststr_blu_Latn2 = "Kuv hnov Txhiaj Xeeb Vaj, co-owner of Hmong Village Shopping Center, hais ua hnub ua hmo tias kom Hmoob yuav tsum txhawb Hmoob thiab listed cov mini-shops uas nyob rau hauv nws lub MALL txhua txhua kom sawv daws mus txhawb, tiam sis uas cas zaum twg twb pom nws mus kav kiav hauv taj laj qhabmeem (Sun Foods) xwb tiag. Nag hmo kuv pom nws mus shopping nrog nws poj niam hauv Sun Foods. Thaum tawm mus txog nraum parking lot kuv thiaj txhob txwm mus ze ze seb ua li nws mus yuav dab tsi tiag, thiab seb tej uas nws yuav ntawd puas muaj nyob ntawm tej kiab khw Hmoob. Surprised!!! Vuag.... txhua yam nws yuav hauv Sun Foods peb Hmoob cov khw yeej muaj tag nrho. Peb niaj hnub nqua hu kom Hmoob yuav tsum pab Hmoob yog pab li no lod?";
//const kTeststr_bn_Beng = " ংখ্যা নমুনায়ন বিন্যাস পরিসংখ্যানিক মডেল পরিসংখ্যানিক সিদ্ধান্ত ফাংশন পরিসংখ্যানিক";
// From 10% testing part of new lang=bn scrape
const kTeststr_bn_Beng = "গ্যালারির ৩৮ বছর পূর্তিতে মূল্যছাড় অর্থনীতি বিএনপির ওয়াক আউট তপন চৌধুরী হারবাল অ্যাসোসিয়েশনের সভাপতি আন্তর্জাতিক পরামর্শক বোর্ড দিয়ে শরিয়াহ্ ইনন্ডেক্স করবে সিএসই মালিকপক্ষের কান্না, শ্রমিকের অনিশ্চয়তা মতিঝিলে সমাবেশ নিষিদ্ধ: এফবিসিসিআইয়ের ধন্যবাদ বিনোদন বিশেষ প্রতিবেদন বাংলালিংকের গ্র্যান্ডমাস্টার সিজন-৩ ব্রাজিলে বিশ্বকাপ ফুটবল আয়োজনবিরোধী বিক্ষোভ দেশের নিরাপত্তার চেয়ে অনেক বেশি সচেতন । প্রার্থীদের দক্ষতা ও যোগ্যতার পাশাপাশি তারা জাতীয় ইস্যুগুলোতে প্রাধান্য দিয়েছেন । ” পাঁচটি সিটিতে ২০ লাখ ভোটারদের দিয়ে জাতীয় নির্বাচনে ৮ কোটি ভোটারদের সঙ্গে তুলনা করা যাবে কি একজন দর্শকের এমন প্রশ্নে জবাবে আব্দুল্লাহ আল নোমান বলেন , “ এই পাঁচটি সিটি কর্পোরেশন নির্বাচন দেশের পাঁচটি বড় বিভাগের প্রতিনিধিত্ব করছে । এছাড়া এখানকার ভোটার রা সবাই সচেতন । তারা";
//const kTeststr_bo_Tibt = " གང ནི ཀུན ལ སྦྱར པ དང ཅན ལྡན བདག པོའི སྒྲ ག ད བ ས ན མ པ ང འ ར ལ མཐའ མེད པ བདག པོའི སྒྲ ལ པ ཉིད དོ མ མི མིན";
// From 10% testing part of new lang=bo scrape
const kTeststr_bo_Tibt = " ་གྱིས་ཁ་ཆེའི་ཕྱག་འཚལ་ཁང་ཞིག་བཤིག་སྲིད་པ། ཡར་ཀླུང་གཙང་པོར་ཆ ུ་མཛོང་བརྒྱག་རྒྱུའི་ལས་འཆར་ལ་རྒྱ་གར་གྱི་སེམས་ཚབས། རྒྱ་གརགྱི་མཚོ་འོག་དམག་གྲུར་སྦར་གས་བྱུང་བ། པ་ཀི་སི་ཏན་གྱིས་རྒྱ་གར་ལ་མི་སེར་བསད་པའི་སྐྱོན་འཛུགས་བྱས་པ། རྩོམ་ཡིག་མང་བ། འབྲེལ་མཐུད་བརྒྱུད་ལམ། ཐོན་སྐྱེད་དང་སྲི་ཞུ། ་ཐོག་དེབ་བཞི་ དཔར་འགྲེམས་གནང་ཡོད་པ་དང་བོད་ཡིག་དྲ་ཚིགས་ཁག་ནང་ལ་ཡང་རྩོམ་ཡང་ཡང་བྲིས་གནང་མཁན་རེད། ལེ་ཚན་ཁག ལེ་ཚན་ཁག འབྲེལ་ཡོད། འགྲེམ་སྟོན། རྒྱུད་ལམ་སྣ་མང་ཡིག་མཛོད། བཀོལ་སྤྱོད་པའི་འཇོག་ཡུལ་དྲ་ངོས། སྔོན་མ། རྗེས་མ། བསྟན་འཛིན་བདེ་སྐྱིད། ཚེ་རིང་རྣམ་རྒྱལ། བསྟན་འཛིན་ངག་དབང་། ཡོལ་གདོང་ཚེ་རིང་ལྷག་པ། ་དབང་ ཕྱུག་གཉིས་ཀྱིས་བརྗོད་གཞི་བྱེ་བྲག་པ་ཞིག་ལ་བགྲོ་གླེང་གཏིང་ཟབ་བྱེད་པའི་གཟའ་ འཁོར་གཉིས་རེའི་མཚམས་ཀྱི་ལེ་ཚན་ཞིག་ཡིན། དཔྱད་ཞིབ་ཀྱིས་རྒྱ་ནག་ནང་ཁུལ་གྱི་འགྱུར་ལྡོག་དང༌། རྒྱ་ནག་དང་རྒྱལ་སྤྱིའི་འབྲེལ་བར་དམིགས་སུ་བཀར་ནས་བགྲོ་གླེང་བྱེད་ཀྱི་ཡོད།། རྒྱང་སྲིང་དུས་ཚོད།";
const kTeststr_br_Latn = " a chom met leuskel a ra e blas da jack irons dilabour hag aet kuit eus what is this dibab a reont da c houde michael beinhorn evit produiñ an trede pladenn kavet e vez ar ganaouennoù buhan ha buhan ganto setu stummet ar bladenn adkavet e vez enni funk";
const kTeststr_bs_Cyrl = "историја босне књ историја босне књ историја босне књ историја босне књ ";
//const kTeststr_bs_Latn = " a radi bržeg rada pošto rom radi sporije nego ram izvorni rom se isključuje a dio ram a se rezerviše te se u njega ne ploča procesor ram memorija grafička kartica zvučna kartica modem mrežna kartica napojna jedinica uređaji za pohranjivanje";
// From 10% testing part of new lang=bs scrape
const kTeststr_bs_Latn = "Novi predsjednik Mešihata Islamske zajednice u Srbiji (IZuS) i muftija dr. Mevlud ef. Dudić izjavio je u intervjuu za Anadolu Agency (AA) kako je uvjeren da će doći do vraćanja jedinstva među muslimanima i unutar Islamske zajednice na prostoru Sandžaka, te da je njegova ruka pružena za povratak svih u okrilje Islamske zajednice u Srbiji nakon skoro sedam godina podjela u tom dijelu Srbije. Dudić je za predsjednika Mešihata IZ u Srbiji izabran 4. januara, a zvanična inauguracija će biti obavljena u prvoj polovini februara. Kako se očekuje, prisustvovat će joj i reisu-l-ulema Islamske zajednice u Srbiji Husein ef. Kavazović koji će i zvanično promovirati Dudića u novog prvog čovjeka IZ u Srbiji. Dudić će danas boraviti u prvoj zvaničnoj posjeti reisu Kavazoviću, što je njegov privi simbolični potez nakon imenovanja. ";
const kTeststr_ca_Latn = "al final en un únic lloc nhorabona l correu electrònic està concebut com a eina de productivitat aleshores per què perdre el temps arxivant missatges per després intentar recordar on els veu desar i per què heu d eliminar missatges importants per l";
const kTeststr_ceb_Latn = "Ang Sugbo usa sa mga labing ugmad nga lalawigan sa nasod. Kini ang sentro sa komersyo, edukasyon ug industriya sa sentral ug habagatang dapit sa kapupod-an. Ang mipadayag sa Sugbo isip ikapito nga labing nindot nga pulo sa , ang nag-inusarang pulo sa Pilipinas nga napasidunggan sa maong magasin sukad pa sa tuig";
const kTeststr_ceb_Latn2 = "Ang mga komyun sa Pransiya duol-duol sa inkorporadong mga lungsod ug mga dakbayan sa Estados Unidos. Wala kini susamang istruktura sa Hiniusang Gingharian (UK) tungod kay ang estado niini taliwala sa di-metropolitan nga distrito ug sa sibil nga parokya. Wala usab kini susamang istruktura sa Pilipinas.";
const kTeststr_chr_Cher = "ᎠᎢᏍᎩ ᎠᏟᎶᏍᏗ ᏥᏄᏍᏛᎩ ᎦᎫᏍᏛᏅᎯ ᎾᎥᎢ";
const kTeststr_co_Latn = " a prupusitu di risultati for utilizà a scatula per ricercà ind issi risultati servore errore u servore ha incuntratu una errore pruvisoria é ùn ha pussutu compie a vostra dumanda per piacè acimenta dinò ind una minuta tuttu listessu ligami truvà i";
const kTeststr_crs_Latn = "Sesel ou menm nou sel patri. Kot nou viv dan larmoni. Lazwa, lanmour ek lape. Nou remersye Bondye. Preserv labote nou pei. Larises nou losean. En leritaz byen presye. Pour boner nou zanfan. Reste touzour dan linite. Fer monte nou paviyon. Ansanm pou tou leternite. Koste Seselwa!";
const kTeststr_cs_Latn = " a akci opakujte film uložen vykreslit gmail tokio smazat obsah adresáře nelze načíst systémový profil jednotky smoot okud používáte pro určení polokoule značky z západ nebo v východ používejte nezáporné hodnoty zeměpisné délky nelze";
const kTeststr_cy_Latn = " a chofrestru eich cyfrif ymwelwch a unwaith i chi greu eich cyfrif mi fydd yn cael ei hysbysu o ch cyfeiriad ebost newydd fel eich bod yn gallu cadw mewn cysylltiad drwy gmail os nad ydych chi wedi clywed yn barod am gmail mae n gwasanaeth gwebost";
const kTeststr_da_Latn = " a z tallene og punktummer der er tilladte log ud angiv den ønskede adgangskode igen november gem personlige oplysninger kontrolspørgsmål det sidste tegn i dit brugernavn skal være et bogstav a z eller tal skriv de tegn du kan se i billedet nedenfor";
const kTeststr_de_Latn = " abschnitt ordner aktivieren werden die ordnereinstellungen im farbabschnitt deaktiviert öchten sie wirklich fortfahren eldtypen angeben optional n diesem schritt geben sie für jedesfeld aus dem datenset den typ an ieser schritt ist optional eldtypen";
const kTeststr_dv_Thaa = " ހިންދީ ބަހުން ވާހަކަ ދައްކާއިރު ދެވަނަ ބަހެއްގެ ގޮތުގައާއި އެނޫން ގޮތްގޮތުން ހިންދީ ބަހުން ވާހަކަ ދައްކާ މީހުންގެ އަދަދު މިލިއަނަށް";
const kTeststr_dz_Tibt = " རྩིས བརྐྱབ ཚུལ ལྡན དང ངེས བདེན སྦ སྟོན ནིའི དོན ལུ ཁྱོད གུག ཤད ལག ལེན འཐབ དགོ ག དང ཨིན པུཊི གྲལ ཐིག གུ";
const kTeststr_ee_Latn = "Yi (Di tanya sia) tatia akɔ wò ayi axa yeye dzi kple tanya si sɔ kple esi wòŋlɔ ɖe goa me, negbe axaa ɖe li kpakple tanya mawo xoxo ko. Teƒe le axa yeye sia dzi si wòateŋu atia na kpekpeɖeŋu kple nuwoŋlɔŋlɔ ne anɔ hahiãm na wò. Mehiã be na gbugbɔ ava afii na axa yeye gɔmedzedze o. Woateŋu adze wo gɔme kple nuŋɔŋlɔ dzẽwo tatia. Megavɔ̃ na nuyeyewo gɔmedzedze kroa o.";
const kTeststr_el_Grek = " ή αρνητική αναζήτηση λέξης κλειδιού καταστήστε τις μεμονωμένες λέξεις κλειδιά περισσότερο στοχοθετημένες με τη μετατροπή τους σε";
const kTeststr_en_Latn = " a backup credit card by visiting your billing preferences page or visit the adwords help centre for more details https adwords google com support bin answer py answer hl en we were unable to process the payment of for your outstanding google adwords";
const kTeststr_eo_Latn = " a jarcento refoje per enmetado de koncerna pastro tiam de reformita konfesio ekde refoje ekzistis luteranaj komunumanoj tamen tiuj fondis propran komunumon nur en ambaŭ apartenis ekde al la evangela eklezio en prusio resp ties rejnlanda provinceklezio en";
const kTeststr_es_Latn = " a continuación haz clic en el botón obtener ruta también puedes desplazarte hasta el final de la página para cambiar tus opciones de búsqueda gráfico y detalles ésta es una lista de los vídeos que te recomendamos nuestras recomendaciones se basan";
const kTeststr_et_Latn = " a niipea kui sinu maksimaalne igakuine krediidi limiit on meie poolt heaks kiidetud on sinu kohustuseks see krediidilimiit";
const kTeststr_eu_Latn = " a den eraso bat honen kontra hortaz eragiketa bakarrik behar dituen eraso batek aes apurtuko luke nahiz eta oraingoz eraso bideraezina izan gaur egungo teknologiaren mugak direla eta oraingoz kezka hauek alde batera utzi daitezke orain arteko indar";
const kTeststr_fa_Arab = " آب خوردن عجله می کردند به جای باز ی کتک کاری می کردند و همه چيز مثل قبل بود فقط من ماندم و يک دنيا حرف و انتظار تا عاقبت رسيد احضاريه ی ای با";
const kTeststr_fi_Latn = " a joilla olet käynyt tämä kerro meille kuka ä olet ei tunnistettavia käyttötietoja kuten virheraportteja käytetään google desktopin parantamiseen etsi näyttää mukautettuja uutisia google desktop keskivaihto leikkaa voit kaksoisnapsauttaa";
const kTeststr_fj_Latn = " i kina na i iri ka duatani na matana main a meke wesi se meke mada na meke ni yaqona oqo na meke ka dau vakayagataki ena yaqona vakaturaga e dau caka toka ga kina na vucu ka dau lagati tiko kina na ka e yaco tiko na talo ni wai ni yaqona na lewai ni wai";
const kTeststr_fo_Latn = " at verða átaluverdar óhóskandi ella áloypandi vit kunnu ikki garanterða at google leitanin ikki finnur naka sum er áloypandi óhóskandi ella átaluvert og google tekur onga ábyrgd yvir tær síður sum koma við í okkara leitiskipan fá tær ein";
const kTeststr_fr_Latn = " a accès aux collections et aux frontaux qui lui ont été attribués il peut consulter et modifier ses collections et exporter des configurations de collection toutefois il ne peut pas créer ni supprimer des collections enfin il a accès aux fonctions";
const kTeststr_fy_Latn = " adfertinsjes gewoan lytse adfertinsjes mei besibbe siden dy t fan belang binne foar de ynhâld fan jo berjochten wolle jo mear witte fan gmail foardat jo jo oanmelde gean dan nei wy wurkje eltse dei om gmail te ferbetterjen dêrta sille wy jo sa út en";
const kTeststr_ga_Latn = " a bhfuil na focail go léir i do cheist le fáil orthu ní gá ach focail breise a chur leis na cinn a cuardaíodh cheana chun an cuardach a bheachtú nó a chúngú má chuirtear focal breise isteach aimseofar fo aicme ar leith de na torthaí a fuarthas";
const kTeststr_gaa_Latn = "Akε mlawookpeehe kε Maŋhiεnyiεlכ oshikifככ lε eba naagbee ni maŋ lε nitsumכ ni kwεכ oshikifככ nכ lε etsככ mכ ni ye kunim ni akε lε eta esεŋ nכ. Dani nomεi baaba nכ lε, maŋ nכkwεmכ kui wuji enyכ ni yככ wכ maŋ lε mli, NPP kε NDC mli bii fכfכi wiemכi kεmaje majee amεhe. Ekomεi kwraa po yafee hiεkwεmכi ni ha ni gidigidi, pilamכ kε la shishwiemכ aaba yε heikomεi. ";
const kTeststr_gd_Latn = " air son is gum bi casg air a h uile briosgaid no gum faigh thu brath nuair a tha briosgaid a tighinn gad rannsachadh ghoogle gu ceart mura bheil briosgaidean ceadaichte cuiridh google briosgaid dha do neach cleachdaidh fa leth tha google a cleachdadh";
const kTeststr_gl_Latn = " debe ser como mínimo taranto tendas de venda polo miúdo cociñas servizos bordado canadá viaxes parques de vehículos de recreo hotel oriental habitación recibir unha postal no enderezo indicado anteriormente";
const kTeststr_gn_Latn = " aháta añe ë ne mbo ehára ndive ajeruréta chupe oporandujey haĝua peëme mba épa pekaru ha áĝa oporandúvo nde eréta avei re paraguaýpe kachíke he i leúpe ndépa re úma kure tatakuápe ha leu ombohovái héë ha ujepéma kachíke he ijey";
const kTeststr_gu_Gujr = " આના પરિણામ પ્રમાણસર ફોન્ટ અવતરણ ચિન્હવાળા પાઠને છુપાવો બધા સમૂહો શોધાયા હાલનો જ સંદેશ વિષયની";
const kTeststr_gv_Latn = " and not ripe as i thought yn assyl yn shynnagh as yn lion the ass the fox and the lion va assyl as shynnagh ayns commee son nyn vendeilys as sauchys hie ad magh ayns y cheyll dy shelg cha row ad er gholl feer foddey tra veeit ad rish lion yn shynnagh";
const kTeststr_ha_Latn = " a cikin a kan sakamako daga sakwannin a kan sakamako daga sakwannin daga ranar zuwa a kan sakamako daga guda daga ranar zuwa a kan sakamako daga shafukan daga ranar zuwa a kan sakamako daga guda a cikin last hour a kan sakamako daga guda daga kafar";
const kTeststr_haw_Latn = "He puke noiʻi kūʻikena kūnoa ʻo Wikipikia. E ʻoluʻolu nō, e hāʻawi mai i kāu ʻike, kāu manaʻo, a me kou leo no ke kūkulu ʻana a me ke kākoʻo ʻana mai i ka Wikipikia Hawaiʻi. He kahua pūnaewele Hawaiʻi kēia no ka hoʻoulu ʻana i ka ʻike Hawaiʻi. Inā hiki iā ʻoe ke ʻōlelo Hawaiʻi, e ʻoluʻolu nō, e kōkua mai a e hoʻololi i nā ʻatikala ma ʻaneʻi, a pono e haʻi aku i kou mau hoa aloha e pili ana i ka Wikipikia Hawaiʻi. E ola mau nō ka ʻōlelo Hawaiʻi a mau loa aku.";
const kTeststr_hi_Deva = " ं ऐडवर्ड्स विज्ञापनों के अनुभव पर आधारित हैं और इनकी मदद से आपको अपने विज्ञापनों का अधिकतम लाभ";
const kTeststr_hr_Latn = "Posljednja dva vladara su Kijaksar (Κυαξαρης; 625-585 prije Krista), fraortov sin koji će proširiti teritorij Medije i Astijag. Kijaksar je imao kćer ili unuku koja se zvala Amitis a postala je ženom Nabukodonosora II. kojoj je ovaj izgradio Viseće vrtove Babilona. Kijaksar je modernizirao svoju vojsku i uništio Ninivu 612. prije Krista. Naslijedio ga je njegov sin, posljednji medijski kralj, Astijag, kojega je detronizirao (srušio sa vlasti) njegov unuk Kir Veliki. Zemljom su zavladali Perzijanci.";
const kTeststr_ht_Latn = " ak pitit tout sosyete a chita se pou sa leta dwe pwoteje yo nimewo leta fèt pou li pwoteje tout paran ak pitit nan peyi a menm jan kit paran yo marye kit yo pa marye tout manman ki fè pitit leta fèt pou ba yo konkoul menm jan tou pou timoun piti ak pou";
const kTeststr_hu_Latn = " a felhasználóim a google azonosító szöveget ikor látják a felhasználóim a google azonosító szöveget felhasználók a google azonosító szöveget fogják látni minden tranzakció után ha a vásárlását regisztrációját oldalunk";
const kTeststr_hy_Armn = " ա յ եվ նա հիացած աչքերով նայում է հինգհարկանի շենքի տարօրինակ փոքրիկ քառակուսի պատուհաններին դեռ մենք շատ ենք հետամնաց ասում է նա այսպես է";
const kTeststr_ia_Latn = " super le sitos que tu visita isto es necessari pro render disponibile alcun functionalitates del barra de utensiles a fin que nos pote monstrar informationes ulterior super un sito le barra de utensiles debe dicer a nos le";
//const kTeststr_id_Latn = "Geng: Pengembaraan Bermula adalah film animasi 3D CGI pertama yang diproduksi di Malaysia. Film ini dibuat oleh Les' Copaque Production (LCP) dan dirilis di bioskop-bioskop seluruh Malaysia pada 12 Februari 2009. Film Geng pertama kali diluncurkan dalam sebuah acara peluncuran pada 11 September 2007 bersama dengan serial animasi pendek Upin & Ipin yang berhubungan dengan film tersebut. Pembuatan film ini didukung oleh berbagai pihak seperti Kementerian Sains, Teknologi dan Inovasi Malaysia (MOSTI) dengan memberi bantuan berupa dana sebesar RM1 juta.";
// From 10% testing part of new lang=id scrape
const kTeststr_id_Latn = "berdiri setelah pengurusnya yang berusia 83 tahun, Fayzrahman Satarov, mendeklarasikan diri sebagai nabi dan rumahnya sebagai negara Islam Satarov digambarkan sebagai mantan ulama Islam tahun 1970-an. Pengikutnya didorong membaca manuskripnya dan kebanyakan dilarang meninggalkan tempat persembunyian bawah tanah di dasar gedung delapan lantai mereka. Jaksa membuka penyelidikan kasus kriminal pada kelompok itu dan menyatakan akan membubarkan kelompok kalau tetap melakukan kegiatan ilegal seperti mencegah anggotanya mencari bantuan medis atau pendidikan. Sampai sekarang pihak berwajib belum melakukan penangkapan meskipun polisi mencurigai adanya tindak kekerasan pada anak. Pengadilan selanjutnya akan memutuskan apakah anak-anak diizinkan tetap tinggal dengan orang tua mereka. Kazan yang berada sekitar 800 kilometer di timur Moskow merupakan wilayah Tatarstan yang";
const kTeststr_ie_Latn = " abhorre exceptiones in li derivation plu cardinal por un l i es li regularità del flexion conjugation ples comparar latino sine flexione e li antiqui projectes naturalistic queles have quasi null regules de derivation ma si on nu examina li enunciationes";
const kTeststr_ig_Latn = "Chineke bụ aha ọzọ ndï omenala Igbo kpọro Chukwu. Mgbe ndị bekee bịara, ha mee ya nke ndi Christian. N'echiche ndi ekpere chi Omenala Ndi Igbo, Christianity, Judaism, ma Islam, Chineke nwere ọtụtụ utu aha, ma nwee nanị otu aha. Ụzọ abụọ e si akpọ aha ahụ bụ Jehovah ma Ọ bụ Yahweh. Na ọtụtụ Akwụkwọ Nsọ, e wepụla aha Chineke ma jiri utu aha bụ Onyenwe Anyị ma ọ bụ Chineke dochie ya. Ma mgbe e dere akwụkwọ nsọ, aha ahụ bụ Jehova pụtara nime ya, ihe dị ka ugboro pụkụ asaa(7,000).";
//const kTeststr_ik_Latn = " kuubuuraqabniqsuq ataruamik colville mi aasii tavrani siku kilaabman sulukpaukkat makua niksisugrufagivut tavrani sunaimña atifa quaqqat ii quaqqat aasii ukiabmagu utiqhuta tamaufa utqiabvifñun aasiiñ tatpaaffaqapta tuvaaqatinifarufa aasiiñ";
// From 10% testing part of new lang=ik scrape
const kTeststr_ik_Latn = "sabvaqjuktuq sabvaba atiqaqpa atiqaqpa ibiq iebiq ixafich niuqtulgiññatif uvani natural gas tatpikka ufasiksigiruaq maaffa savaannafarufa mi tatkivani navy qanuqjugugguuq taaptuma inna uqsrunik ivaqjiqhutik taktuk allualiuqtuq sigukun nanuq puuvraatuq taktuum amugaa kalumnitigun nanuq agliruq allualiuqtuq";
const kTeststr_is_Latn = " a afköst leitarorða þinna leitarorð neikvæð leitarorð auglýsingahópa byggja upp aðallista yfir ný leitarorð fyrir auglýsingahópana og skoða ítarleg gögn um árangur leitarorða eins og samkeppni auglýsenda og leitarmagn er krafist notkun";
const kTeststr_it_Latn = " a causa di un intervento di manutenzione del sistema fino alle ore circa ora legale costa del pacifico del novembre le campagne esistenti continueranno a essere pubblicate come di consueto anche durante questo breve periodo di inattività ci scusiamo per";
const kTeststr_iu_Cans = "ᐃᑯᒪᒻᒪᑦ ᕿᓈᖏᓐᓇᓲᖑᒻᒪᑦ ᑎᑎᖅᑕᓕᒫᖅᓃᕕᑦ ᑎᑦᕆᐊᑐᓐᖏᑦᑕᑎᑦ ᑎᑎᖅᑕᑉᐱᑦ ᓯᕗᓂᖓᓂ ᑎᑎᖅᖃᖅ ᑎᑎᕆᐊᑐᓐᖏᑕᐃᑦ ᕿᓂᓲᖑᔪᒍᑦ ᑎᑎᖅᑕᓕᒫᖅᓃᕕᑦ";
const kTeststr_iw_Hebr = " או לערוך את העדפות ההפצה אנא עקוב אחרי השלבים הבאים כנס לחשבון האישי שלך ב";
const kTeststr_ja_Hani = " このペ ジでは アカウントに指定された予算の履歴を一覧にしています それぞれの項目には 予算額と特定期間のステ タスが表示されます 現在または今後の予算を設定するには";
const kTeststr_jw_Latn = " account ten server niki kalian username meniko tanpo judul cacahe account nggonanmu wes pol pesen mu wes diguwak pesenan mu wes di simpen sante wae pesenan mu wes ke kirim mbuh tekan ora pesenan e ke kethok pesenan mu wes ke kirim mbuh tekan ora pesenan";
const kTeststr_ka_Geor = " ა ბირთვიდან მიღებული ელემენტი მენდელეევის პერიოდულ სიტემაში გადაინაცვლებს ორი უჯრით";
const kTeststr_kha_Latn = " kaba jem jai sa sngap thuh ia ki bynta ba sharum naka sohbuin jong phi nangta sa pynhiar ia ka kti kadiang jong phi sha ka krung jong phi bad da kaba pyndonkam kumjuh ia ki shympriahti jong phi sa sngap thuh shapoh ka tohtit jong phi pyndonkam ia kajuh ka";
const kTeststr_kk_Arab = " ﻗﻴﺎﻧﺎﺕ ﺑﻮﻟﻤﺎﻳﺪﻯ ﺑﯘﻝ ﭘﺮﻭﺗﺴﻪﺳﯩﻦ ﻳﺎﻋﻨﻲ ﻗﺎﻻ ﻭﻣﯩﺮﯨﻨﺪﻩ ﻗﺎﺯﺍﻕ ء ﺗﯩﻠﯩﻨﯩﯔ ﻗﻮﻟﺪﺍﻧﯩﻠﻤﺎﯞﻯ ﻗﺎﺯﺍﻕ ﺟﻪﺭﯨﻨﺪﻩ";
const kTeststr_kk_Cyrl = " а билердің өзіне рұқсат берілмеген егер халық талап етсе ғана хан келісім берген өздеріңіз білесіздер қр қыл мыс тық кодексінде жазаның";
const kTeststr_kk_Latn = " bolsa da otanyna qaityp keledi al oralmandar basqa elderde diasporasy ote az bolghandyqtan bir birine komektesip bauyrmal bolady birde men poezben oralmandardyng qazaqstangha keluin kordim monghol qazaqtary poezdan tuse sala jerdi suip jylap keletin biraq";
const kTeststr_kl_Latn = " at nittartakkalli uani toqqarsimasatta akornanni nittartakkanut allanut ingerlaqqittoqarsinnaavoq kanukoka tassaavoq kommuneqarfiit kattuffiat nuna tamakkerlugu kommunit nittartagaannut ingerlaqqiffiusinnaasoq kisitsiserpassuit nunatsinnut tunngasut";
const kTeststr_km_Khmr = " ក ខ គ ឃ ង ច ឆ ជ ឈ ញ ដ ឋ ឌ ឍ ណ ត ថ ទ ធ ន ប ផ ព ភ ម យ រ ល វ ស ហ ឡ អ ឥ ឦ ឧ ឪ ឫ ឬ ឯ ឱ ទាំងអស់";
const kTeststr_kn_Knda = " ಂಠಯ್ಯನವರು ತುಮಕೂರು ಜಿಲ್ಲೆಯ ಚಿಕ್ಕನಾಯಕನಹಳ್ಳಿ ತಾಲ್ಲೂಕಿನ ತೀರ್ಥಪುರ ವೆಂಬ ಸಾಧಾರಣ ಹಳ್ಳಿಯ ಶ್ಯಾನುಭೋಗರ";
const kTeststr_ko_Hani = " 개별적으로 리포트 액세스 권한을 부여할 수 있습니다 액세스 권한 부여사용자에게 프로필 리포트에 액세스할 수 있는 권한을 부여하시려면 가용 프로필 상자에서 프로필 이름을 선택한 다음";
//const kTeststr_ks_Arab = "پیٹھ سٮ۪اگت! آکھ آزاد گیانکوشٖٔ ہۄ کٲنٛسِہ تِہ ہٮ۪کُن اٮ۪ڑِٹ۔ تور چھک ٢٢٨ مَضموٗنن منز کٲشُر ویکیپیٖڈیا چھُ آکھ مَنصوٗبہٕ خٲطرٕ بنَاوُن آکھ گیانکوشٖٔ سۭتۍ آزاد منز 280 زَبانَن تٔمِس یۄسہٕ ژٕ سۭتۍ تُہُنٛد گیان ہُرٮ۪ر کَرُن ہٮ۪کُن";
// From 10% testing part of new lang=ks scrape
const kTeststr_ks_Arab = " ژماں سرابن منز گرٲن چھِہ خابٕک کھلونہٕ ؤڈراواں تُلتِھ نِیَس تہٕ گوشہِ گوشہِ مندچھاوى۪س دِلس چھُہ وون٘ت وُچھان از ستم قلم صبوٝرٕ وول مسٲفر لیۆکھُن بێتابن منز ورل سوال چھُہ تراواں جوابن منز کالہٕ پھۯستہٕ پھن٘ب پگَہہ پہ پۆت نظر دِژ نہٕ ژھالہٕ مٔت آرن مٲنز مسول متھان چھےٚ مس والن وۅن چھےٚ غارن تہِ نارٕ ژھٹھ ژاپان رێش تۅرگ تراوٕہن تہٕ ون رٹہٕ ہن ہوشہِ ہێۆچھ نہٕ پوشنوٝلس نِش مۅہرٕ دی دی زٕلاں چھِ زى۪و حرفن لۆدرٕ پھٔل ہى۪تھ ملر عازمؔ سۆدرٕ کھۅنہِ منز منگاں چھُہ ندرى۪ن پن ژے تھى۪کی یہِ مسٲفر پنن وُڈو تہٕ پڑاو گٕتَو گٕتَو چھےٚ یہِ کۅل بُتھ تہٕ بانہٕ سٕہہ گۅردٕ چھہِ سپداں دمہٕ پُھٹ چھِٹہ پونپر پکھہٕ داران سُہ یتى۪ن تۯاوِ کم نظر دۯاکھ تہٕ باسیوے سُہ مۆہ ہیو یێران مےٚ ژى۪تُرمُت چھُہ سُلی تس چھےٚ کتى۪ن تھپھ شاد مس کراں وُچھ مےٚ خون ژٕ خبر کیازِ کراں دۯاکھ تمِس پى۪ٹھ ماتم أز کہِ شبہٕ آو مےٚ بێیہِ پیش سفر زانہِ خدا دارِ پى۪ٹھ ژٲنگ ہنا تھو زِ ژے چھےٚ مێون أنہٕ کپٹاں چھُہ زٕژن سون مظفّر عازمؔ پوشہ برگن چھُہ سُواں چاکھ سُہ الماس قلم لوِ کٔ ڈ نوِ سرٕ سونتس کل پروِ بۆر بێیہ از بانبرِ ہۆت یمبرزلہِ ٹارى۪ن منز نار وزملہِ کۅسہٕ کتھ کٔر اظہار کچھہِ منزٕ ؤن رووُم اچھہِ چشمو ژوپُم کٔنڈ انبار تماشہِ چھہِ تگاں";
const kTeststr_ks_Deva = "नमस्ते शारदे देवि काश्मिरपुर्वासिनि त्वामहम प्रार्थये देवि विद्य दानम च देहि मे कॉशुर लेख॒नुक सारिव॒य खॊत॒ आसान तरीक॒ छु यि देवनागरी टाइपराइटर इस्तिमाल करुन. अथ मंज़ छि कॉशुर लेख॒न॒चि सारॆय मात्रायि. अमि अलाव॒ हॆकिव तॊह्य् यिम॒ यूनिकोड एडिटर ति वरतॉविथ मगर कॉशिरि मात्रायि लेख॒नस गछ़ि हना दिकथ: अक्षरमालाछु अख मुफ़्त त॒ सॅहॅल सोफ्टवेर यॆमि स॒त्य् युनिकोड देवनागरी मंज़ ITRANS scheme स॒त्य् छु यिवान लेख॒न॒. वुछिव: सहायता. अथ स॒त्य् जुडिथ जालपृष्ठ (वेबपेज) (सॉरी अँग्रीज़ी पॉठ्य)";
const kTeststr_ku_Arab = " بۆ به ڕێوه بردنی نامه ی که دێتن ڕاسته وخۆ ڕه وان بکه نامه کانی گ مایل بۆ حسابی پۆستێکی تر هێنانی په یوه ندکاره کان له";
const kTeststr_ku_Latn = " be zmaneki ter le inglis werdegeretewe em srvise heshta le cor beta daye wate hest a taqi dekrete u bashtr dekret tewawwzmanekan wernegrawnetewe u ne hemu laperakn ke eme pshtiwan dekayn be teaweti wergerawete nermwalley wergeran teksti new wene nasnatewe";
const kTeststr_ky_Arab = " جانا انى تانۇۇ ۇلۇتۇن تانۇۇ قىرعىزدى بئلۉۉ دەگەندىك اچىق ايتساق ماناستى تاانىعاندىق ۅزۉڭدۉ تاانىعاندىق بۉگۉن تەما جۉكتۅمۅ ق ى رع ى ز ت ى ل ى";
const kTeststr_ky_Cyrl = " агай эле оболу мен садыбакас аганын өзү менен эмес эмгектери менен тааныштым жылдары ташкенде өзбекстан илимдер академиясынын баяны";
const kTeststr_la_Latn = " a deo qui enim nocendi causa mentiri solet si iam consulendi causa mentiatur multum profecit sed aliud est quod per se ipsum laudabile proponitur aliud quod in deterioris comparatione praeponitur aliter enim gratulamur cum sanus est homo aliter cum melius";
const kTeststr_lb_Latn = " a gewerkschaften och hei gefuerdert dir dammen an dir häre vun de gewerkschaften denkt un déi aarm wann der äer fuerderunge formuléiert d sechst congés woch an aarbechtszäitverkierzung hëllefen hinnen net d unhiewe vun de steigerungssäz bei de";
const kTeststr_lg_Latn = " abaana ba bani lukaaga mu ana mu babiri abaana ba bebayi lukaaga mu abiri mu basatu abaana ba azugaadi lukumi mu ebikumi bibiri mu abiri mu babiri abaana ba adonikamu lukaaga mu nltaaga mu mukaaga abaana ba biguvaayi enkumi bbiri mu ataano mu mukaaga";
const kTeststr_lif_Limb = "ᤁᤡᤖᤠᤳ ᤕᤠᤰᤌᤢᤱ ᤆᤢᤶᤗᤢᤱᤖᤧ ᤛᤥᤎᤢᤱᤃᤧᤴ ᤀᤡᤔᤠᤴᤛᤡᤱ ᤆᤧᤶᤈᤱᤗᤧ ᤁᤢᤔᤡᤱᤅᤥ ᤏᤠᤈᤡᤖᤡ ᤋᤱᤒᤣ ᥈᥆᥆᥉ ᤒᤠ ᤈᤏᤘᤖᤡ ᤗᤠᤏᤢᤀᤠᤱ ᤁ᤹ᤏᤠ ᤋᤱᤒᤣ ᤁᤠᤰ ᤏᤠ᤺ᤳᤋᤢ ᤕᤢᤖᤢᤒᤠ ᤀᤡᤔᤠᤴᤛᤡᤱ ᤋᤱᤃᤡᤵᤛᤡᤱ ᤌᤡᤶᤒᤣᤴ ᤂᤠᤃᤴ ᤛᤡᤛᤣ᤺ᤰᤗᤠ ᥇᥍ ᤂᤧᤴ ᤀᤡᤛᤡᤰ ᥇ ᤈᤏᤘᤖᤡ ᥈᥆᥆᥊ ᤀᤥ ᤏᤠᤛᤢᤵ ᤆᤥ᤺ᤰᤔᤠ ᤌᤡᤶᤒᤣ ᤋᤱᤃᤠᤶᤛᤡᤱᤗ ᤐᤳᤐᤠ ᤀᤡᤱᤄᤱ ᤘᤠ᤹";
const kTeststr_ln_Latn = " abakisamaki ndenge esengeli moyebami abongisamaki solo mpenza kombo ya moyebami elonguamaki kombo ya bayebami elonguamaki kombo eleki molayi po na esika epesameli limbisa esika ya kotia ba kombo esuki boye esengeli olimbola ndako na yo ya mikanda kombo";
const kTeststr_lo_Laoo = " ກຫາທົ່ວທັງເວັບ ແລະໃນເວັບໄຮ້ສາຍ ທຳອິດໃຫ້ທຳການຊອກຫາກ່ອນ ຈາກນັ້ນ ໃຫ້ກົດປຸ່ມເມນູ ໃນໜ້າຜົນໄດ້";
const kTeststr_lt_Latn = " a išsijungia mano idėja dėl geriausio laiko po pastarųjų savo santykių pasimokiau penki dalykai be kurių negaliu gyventi mano miegamajame tu surasi ideali pora išsilavinimas aukštoji mokykla koledžas universitetas pagrindinis laipsnis metai";
const kTeststr_lv_Latn = " a gadskārtējā izpārdošana slēpošana jāņi atlaide izmaiņas trafikā kas saistītas ar sezonas izpārdošanu speciālajām atlaidēm u c ir parastas un atslēgvārdi kas ir populāri noteiktos laika posmos šajā laikā saņems lielāku klikšķu";
const kTeststr_mfe_Latn = "Anz dir mwa, Sa bann delo ki to trouve la, kot fam prostitie asize, samem bann pep, bann lafoul dimoun, bann nasion ek bann langaz. Sa dis korn ki to finn trouve, ansam avek bebet la, zot pou ena laenn pou prostitie la; zot pou pran tou seki li ena e met li touni, zot pou manz so laser e bril seki reste dan dife. Parski Bondie finn met dan zot leker proze pou realiz so plan. Zot pou met zot dakor pou sed zot pouvwar bebet la ziska ki parol Bondie fini realize.";
const kTeststr_mg_Latn = " amporisihin i ianao mba hijery ny dika teksta ranofotsiny an ity lahatsoratra ity tsy ilaina ny opérateur efa karohina daholo ny teny rehetra nosoratanao ampiasao anaovana dokambarotra i google telugu datin ny takelaka fikarohana sary renitakelak i";
const kTeststr_mi_Latn = " haere ki te kainga o o haere ki te kainga o o haere ki te kainga o te rapunga ahua o haere ki te kainga o ka tangohia he ki to rapunga kaore au mohio te tikanga whakatiki o te ra he whakaharuru te pai rapunga a te rapunga ahua a e kainga o nga awhina o te";
const kTeststr_mk_Cyrl = " гласовите коалицијата на вмро дпмне како партија со најмногу освоени гласови ќе добие евра а на сметката на коализијата за македонија";
const kTeststr_ml_Mlym = " അങ്ങനെ ഞങ്ങള് അവരുടെ മുമ്പില് നിന്നു ഔടും ഉടനെ നിങ്ങള് പതിയിരിപ്പില് നിന്നു എഴുന്നേറ്റു";
const kTeststr_mn_Cyrl = " а боловсронгуй болгох орон нутгийн ажил үйлсийг уялдуулж зохицуулах дүрэм журам боловсруулах орон нутгийн өмч хөрөнгө санхүүгийн";
const kTeststr_mn_Mong = "ᠦᠭᠡ ᠵᠢᠨ ᠴᠢᠨᠭ᠎ᠠ ᠬᠦᠨᠳᠡᠢ ᠵᠢ ᠢᠯᠭᠠᠬᠣ";
const kTeststr_mr_Deva = "हैदराबाद उच्चार ऐका (सहाय्य·माहिती)तेलुगू: హైదరాబాదు , उर्दू: حیدر آباد हे भारतातील आंध्र प्रदेश राज्याच्या राजधानीचे शहर आहे. हैदराबादची लोकसंख्या ७७ लाख ४० हजार ३३४ आहे. मोत्यांचे शहर अशी एकेकाळी ओळख असलेल्या या शहराला ऐतिहासिक, सांस्कृतिक आणि स्थापत्यशास्त्रीय वारसा लाभला आहे. १९९० नंतर शिक्षण आणि माहिती तंत्रज्ञान त्याचप्रमाणे औषधनिर्मिती आणि जैवतंत्रज्ञान क्षेत्रातील उद्योगधंद्यांची वाढ शहरात झाली. दक्षिण मध्य भारतातील पर्यटन आणि तेलुगू चित्रपटनिर्मितीचे हैदराबाद हे केंद्र आहे";
//const kTeststr_ms_Latn = "daripada dirinya hirako shinji seorang pemuda merujuk diri mereka sebagai vizard shinji telah cuba untuk menyakinkan ichigo untuk menyertai kumpulan mereka mengatakan bahawa hanya dia sahaja yang mampu mengajar ichigo teknik untuk mengawal hollow";
// From 10% testing part of new lang=ms scrape
const kTeststr_ms_Latn = "pengampunan beramai-ramai supaya mereka pulang ke rumah masing-masing. Orang-orang besarnya enggan mengiktiraf sultan yang dilantik oleh Belanda sebagai Yang DiPertuan Selangor. Orang ramai pula tidak mahu menjalankan perniagaan bijih timah dengan Belanda, selagi raja yang berhak tidak ditabalkan. Perdagang yang lain dibekukan terus kerana untuk membalas jasa beliau yang membantu Belanda menentang Riau, Johor dan Selangor. Di antara tiga orang Sultan juga dipandang oleh rakyat sebagai seorang sultan yang paling gigih. 1 | 2 SULTAN Sebagai ganti Sultan Ibrahim ditabalkan Raja Muhammad iaitu Raja Muda. Walaupun baginda bukan anak isteri pertama bergelar Sultan Muhammad bersemayam di Kuala Selangor juga. Pentadbiran baginda yang lemah itu menyebabkan Kuala Selangor menjadi sarang ioleh Cina di Lukut tidak diambil tindakan, sedangkan baginda sendiri banyak berhutang kepada 1";
const kTeststr_ms_Latn2 = "bilik sebelah berkata julai pada pm ladymariah hmm sume ni terpulang kepada individu mungkin anda bernasib baik selama ini dalam membeli hp yang bagus deli berkata julai pada pm walaupun bukan bahsa baku tp tetap bahasa melayu kan perubahan boleh dibuat";
const kTeststr_mt_Latn = " ata ikteb messaġġ lil indirizzi differenti billi tagħżilhom u tagħfas il buttuna ikteb żid numri tfittxijja tal kotba mur print home kotba minn pagni ghal pagna minn ghall ktieb ta aċċessa stieden habib iehor grazzi it tim tal gruppi google";
const kTeststr_my_Latn = " jyk ef oif gawgodcsifayvdrfhrnf bmawgrsm topf dsvj g mail tamumif avhvm atmif txjwgif yxrqhk avhvm efae m pwifavhvm ef ufkyfwdky help center odkyvmyg drsm ar avh dswjhar cgef rsm udkawdkifygw f tajzawgudk smedkifygw f jyd awmh g mail cool features rsm";
const kTeststr_my_Mymr = " တက္ကသုိလ္ မ္ဟ ပ္ရန္ လာ္ရပီးေနာက္ န္ဟစ္ အရ္ဝယ္ ဦးသန္ ့သည္ ပန္ းတနော္ အမ္ယုိးသား ေက္ယာင္ း";
const kTeststr_na_Latn = " arcol obabakaen riringa itorere ibibokiei ababaro min kuduwa airumena baoin tokin rowiowet itiket keram damadamit eigirow etoreiy row keitsito boney ibingo itsiw dorerin naoerodelaporte s nauruan dictionary a c a c d g h o p s t y aiquen ion eins aiquen";
const kTeststr_ne_Deva = "अरू ठाऊँबाटपनि खुलेको छ यो खाता अर अरू ठाऊँबाटपनि खुलेको छ यो खाता अर ू";
const kTeststr_nl_Latn = " a als volgt te werk om een configuratiebestand te maken sitemap gen py ebruik filters om de s op te geven die moeten worden toegevoegd of uitgesloten op basis van de opmaaktaal elke sitemap mag alleen de s bevatten voor een bepaalde opmaaktaal dit";
const kTeststr_nn_Latn = " a for verktylina til å hjelpa deg å nå oss merk at pagerank syninga ikkje automatisk kjem til å henta inn informasjon frå sider med argument dvs frå sider med eit i en dersom datamaskina di er plassert bak ein mellomtenar for vevsider kan det verka";
const kTeststr_no_Latn = " a er obligatorisk tidsforskyvning plassering av katalogsøk planinformasjon loggfilbane gruppenavn kontoinformasjon passord domene gruppeinformasjon alle kampanjesporing alternativ bruker grupper oppgaveplanlegger oppgavehistorikk kontosammendrag antall";
const kTeststr_nr_Latn = "ikomiti elawulako yegatja emhlanganweni walo ]imithetho mgomo ye anc ibekwa malunga wayo begodu ubudosiphambili kugandelela lokho okutjhiwo yi lokha nayithi abantu ngibo ";
const kTeststr_nso_Latn = "Bophara bja Asia ekaba 8.6% bja lefase goba 29.4% bja naga ya lefase (ntle le mawatle). Asia enale badudu bao bakabago dimillione millione tše nne (4 billion) yeo e bago 60% ya badudi ba lefase ka bophara. A bapolelwa rena sefapanong mehleng ya Pontius Pilatus. A hlokofatšwa, A bolokwa, A tsoga ka letšatši la boraro, ka mo mangwalo a bolelago ka gona, a rotogela magodimong, ";
const kTeststr_ny_Latn = "Boma ndi gawo la dziko lomwe linapangidwa ndi cholinga chothandiza ntchito yolamulira. Kuŵalako kulikuunikabe mandita, Edipo nyima unalephera kugonjetsa kuŵalako.";
const kTeststr_oc_Latn = " Pasmens, la classificacion pus admesa uei (segon Juli Ronjat e Pèire Bèc) agropa lei parlars deis Aups dins l'occitan vivaroaupenc e non dins lo dialècte provençau.";
const kTeststr_om_Latn = " afaan katalaa bork bork bork hiikaa jira hin argamne gareen barbaadame hin argamne gargarsa qube en gar bayee jira garee walitti firooman gareewwan walitti firooman fuula web akka tartiiba qubeetiin agarsiisi akka tartiiba qubeetiin agarsiisaa jira akka";
const kTeststr_or_Orya = "ଅକ୍ଟୋବର ଡିସେମ୍ବର";
const kTeststr_pa_Guru = " ਂ ਦਿਨਾਂ ਵਿਚ ਭਾਈ ਸਾਹਿਬ ਦੀ ਬੁੱਚੜ ਗੋਬਿੰਦ ਰਾਮ ਨਾਲ ਅੜਫਸ ਚੱਲ ਰਹੀ ਸੀ ਗੋਬਿੰਦ ਰਾਮ ਨੇ ਭਾਈ ਸਾਹਿਬ ਦੀਆਂ ਭੈਣਾ";
const kTeststr_pl_Latn = " a australii będzie widział inne reklamy niż użytkownik z kanady kierowanie geograficzne sprawia że reklamy są lepiej dopasowane do użytkownika twojej strony oznacza to także że możesz nie zobaczyć wszystkich reklam które są wyświetlane na";
const kTeststr_ps_Arab = " اتو مستقل رياست جوړ شو او د پخواني ادبي انجمن څانګې ددې رياست جز شوی او ددې انجمن د ژبې مديريت د پښتو ټولنې په لوی مديريت واوښت لوی مدير يې د";
const kTeststr_pt_Latn = " a abit prevê que a entrada desses produtos estrangeiros no mercado têxtil e vestuário do brasil possa reduzir os preços em cerca de a partir de má notícia para os empresários que terão que lutar para garantir suas margens de lucro mas boa notícia";
const kTeststr_qu_Latn = " is t ipanakunatapis rikuchinankupaq qanpa simiykipi noqaykoqpa uya jllanakunamanta kunan jamoq simikunaman qelqan tiyan watukuy qpa uyata qanpa llaqtaykipi llank anakuna simimanta yanapakuna simimanta mayqen llaqtallapis kay simimanta t ijray qpa qelqa";
const kTeststr_rm_Latn = " Cur chil chantun Turitg ha dà il dretg da votar a las dunnas (1970) è ella vegnida elegida en il cussegl da vischnanca da Zumikon per la Partida liberaldemocratica svizra (PLD). Da 1974 enfin 1982 è ella stada presidenta da vischnanca da Zumikon. Lonn 1979 è Elisabeth Kopp vegnida elegida en il Cussegl naziunal e reelegida quatter onns pli tard cun in resultat da sur 100 000 vuschs. Lonn 1984 è ella daventada vicepresidenta da la PLD.";
const kTeststr_rn_Latn = " ishaka mu ndero y abana bawe ganira n abigisha nimba hari ingorane izo ari zo zose ushobora gusaba kubonana n umwigisha canke kuvugana nawe kuri terefone inyuma y uko babarungikira urutonde rw amanota i muhira mu bisanzwe amashure aratumira abavyeyi";
const kTeststr_ro_Latn = " a anunţurilor reţineţi nu plătiţi pentru clicuri sau impresii ci numai atunci când pe site ul dvs survine o acţiune dorită site urile negative nu pot avea uri de destinaţie daţi instrucţiuni societăţii dvs bancare sau constructoare să";
const kTeststr_ro_Cyrl = "оперативэ а органелор ши институциилор екзекутиве ши а органелор жудичиаре але путерий де стат фиекэруй орган ал путерий де стат и се";
const kTeststr_ru_Cyrl = " а неправильный формат идентификатора дн назад";
const kTeststr_rw_Latn = " dore ibyo ukeneye kumenya ukwo watubona ibibazo byinshi abandi babaza ububonero byibibina google onjela ho izina dyikyibina kyawe onjela ho yawe mulugo kulaho ibyandiko byawe shyilaho tegula yawe tulubaka tukongeraho iyanya mishya buliko tulambula";
const kTeststr_sa_Deva = " ं क र्मणस् त स्य य त्कि ङ्चेह करो त्यय ं त स्माल् लोका त्पु नरै ति अस्मै लोका य क र्मण इ ति नु काम";
const kTeststr_sa_Latn = " brahmā tatraivāntaradhīyata tataḥ saśiṣyo vālmīkir munir vismayam āyayau tasya śiṣyās tataḥ sarve jaguḥ ślokam imaṃ punaḥ muhur muhuḥ prīyamāṇāḥ prāhuś ca bhṛśavismitāḥ samākṣaraiś caturbhir yaḥ pādair gīto";
const kTeststr_sco_Latn = " a gless an geordie runciman ower a gless an tamson their man preached a hale hoor aboot the glorious memories o forty three an backsliders an profane persons like esau an aboot jeroboam the son o nebat that gaed stravagin to anither kirk an made aa israel";
const kTeststr_sd_Arab = " اضافو ٿي ٿيو پر اها خبر عثمان کي بعد پيئي ته سگريٽ ڇڪيندڙ مسلمان نه هو بلڪ هندو هو دڪان تي پهچي عثمان ڪسبت کولي گراهڪن جي سيرب لاهڻ شروع ڪئي پر";
const kTeststr_sg_Latn = " atâa na âkotta zo me lâkwê angbâ gï tarrango nî âkotta zo tî koddoro nî âde agbû tenne nî na kate töngana mbênî kotta kpalle tî nzönî dutï tî halëzo pëpe atâa sô âla lü gbâ tî ândya tî mâi na sahngo asâra gbâ tî";
const kTeststr_si_Sinh = " අනුරාධ මිහිඳුකුල නමින් සකුරා ට ලිපියක් තැපෑලෙන් එවා තිබුණා කි ් රස්ටි ෂෙල්ටන් ප ් රනාන්දු ද";
const kTeststr_sit_NP = " dialekten in de roerstreek pierre bakkes oet roerstreek blz bewirk waordebook zónjig oktoeaber is t ieëste mofers waordebook oetgekaome dit waordebook is samegestèldj";
const kTeststr_sk_Latn = " a aktivovať reklamnú kampaň ak chcete kampaň pred spustením ešte prispôsobiť uložte ju ako šablónu a pokračujte v úprave vyberte si jednu z možností nižšie a kliknite na tlačidlo uložiť kampaň nastavenia kampane môžete ľubovoľne";
const kTeststr_sl_Latn = " adsense stanje prijave za google adsense google adsense račun je bil začasno zamrznjen pozdravljeni hvala za vaše zanimanje v google adsense po pregledu vaše prijavnice so naši strokovnjaki ugotovili da spletna stran ki je trenutno povezana z vašim";
const kTeststr_sm_Latn = " autu mea o lo totonu le e le minaomia matou te tuu i totonu i le faamatalaina o le suesuega i taimi uma mea o lo totonu fuafua i mea e tatau fa afoi tala mai le newsgroup mataupu fa afoi mai tala e ai le mataupu e ai totonu tusitala o le itu o faamatalaga";
const kTeststr_sn_Latn = " chete vanyori vanotevera vakabatsira kunyora zvikamu zvino kumba home tinyorere tsamba chikamu chakumbirwa hachina kuwanikwa chikamu ichi cheninge chakayiswa kuimwe nzvimbo mudhairekitori rino chimwe chikamu chopadhuze pane chinhu chatadza kushanda bad";
const kTeststr_so_Latn = " a oo maanta bogga koobaad ugu qoran yahey beesha caalamka laakiin si kata oo beesha caalamku ula guntato soomaaliya waxa aan shaki ku jirin in aakhirataanka dadka soomaalida oo kaliya ay yihiin ku soomaaliya ka saari kara dhibka ay ku jirto";
const kTeststr_sq_Latn = " a do të kërkoni nga beogradi që të njohë pavarësinë e kosovës zoti thaçi prishtina është gati ta njoh pavarësinë e serbisë ndërsa natyrisht se do të kërkohet një gjë e tillë që edhe beogradi ta njoh shtetin e pavarur dhe sovran të";
const kTeststr_sr_Cyrl = "балчак балчак на мапи србије уреди демографија у насељу балчак живи пунолетна становника а просечна старост становништва износи година";
//const kTeststr_sr_Latn = " autonomnih pokrajina saveznim zakonom može se propisati poseban sastav organizacija i delokrug saveta za poslove narodne odbrane članove saveta federacije bira na predlog predsedništva savezna skupština iz reda društveno političkih i drugih javnih";
const kTeststr_sr_Latn = "Društvo | četvrtak 1.08.2013 | 13:43 Krade se i izvorska voda Izvor: Gornji Milanovac -- U gružanskom selu Belo Polje prošle noći ukradeno je više od 10.000 litara kojima je obijen bazen. Bazen je bio zaključan i propisno obezbeđen.";
const kTeststr_sr_ME_Latn = "savjet pobjeda a radi bržeg rada pošto rom radi sporije nego ram izvorni rom se isključuje a dio ram a se rezerviše te se u njega ne ploča procesor ram memorija grafička kartica zvučna kartica modem mrežna kartica napojna jedinica uređaji za pohranjivanje";
const kTeststr_ss_Latn = " bakhokhintsela yesikhashana bafake imininingwane ye akhawunti leliciniso kulelifomu nangabe akukafakwa imininingwane leliciniso imali lekhokhiwe angeke ifakwe kumkhokhintsela lofanele imininingwane ye akhawunti ime ngalendlela lelandzelako inombolo";
const kTeststr_st_Latn = " bang ba nang le thahasello matshwao a sehlooho thuto e thehilweng hodima diphetho ke tsela ya ho ruta le ho ithuta e totobatsang hantle seo baithuti ba lokelang ho se fihlella ntlhatheo eo e sebetsang ka yona ke ya hore titjhere o hlakisa pele seo";
const kTeststr_su_Latn = "Nu ngatur kahirupan warga, keur kapentingan pamarentahan diatur ku RT, RW jeung Kepala Dusun, sedengkeun urusan adat dipupuhuan ku Kuncen jeung kepala adat. Sanajan Kampung Kuta teu pati anggang jeung lembur sejenna nu aya di wewengkon Desa Pasir Angin, tapi boh wangunan imah atawa tradisi kahirupan masarakatna nenggang ti nu lian.";
const kTeststr_sv_Latn = " a bort objekt från google desktop post äldst meny öretag dress etaljer alternativ för vad är inne yaste google skrivbord plugin program för nyheter google visa nyheter som är anpassade efter de artiklar som du läser om du till exempel läser";
const kTeststr_sw_Latn = " a ujumbe mpya jumla unda tafuta na angalia vikundi vya kujadiliana na kushiriki mawazo iliyopangwa kwa tarehe watumiaji wapya futa orodha hizi lugha hoja vishikanisho vilivyo dhaminiwa ujumbe sanaa na tamasha toka udhibitisho wa neno kwa haraka fikia";
const kTeststr_syr_Syrc = "ܐܕܪܝܣ ܓܛܘ ܫܘܪܝܐ ܡܢ ܦܪܢܣܐ ܡܢ ܐܣܦܢܝܐ ܚܐܪܘܬܐ ܒܐܕܪ ܒܢܝܣܢ ܫܛܝܚܘܬܐ ܟܠܢܝܐ ܡܝ̈ܐ ܒܥܠܡܐ";
const kTeststr_ta_Taml = " அங்கு ராஜேந்திர சோழனால் கட்டப்பட்ட பிரம்மாண்டமான சிவன் கோவில் ஒன்றும் உள்ளது தொகு";
const kTeststr_te_Telu = " ఁ దనర జయించిన తత్వ మరసి చూడఁ దాన యగును రాజయోగి యిట్లు తేజరిల్లుచు నుండు విశ్వదాభిరామ వినర వేమ";
const kTeststr_tg_Arab = "رادیو فردا راديوى آزادى";
const kTeststr_tg_Cyrl = " адолат ва инсондӯстиро бар фашизм нажодпарастӣ ва адоват тарҷеҳ додааст чоп кунед ба дигарон фиристед чоп кунед ба дигарон фиристед";
const kTeststr_th_Thai = " กฏในการค้นหา หรือหน้าเนื้อหา หากท่านเลือกลงโฆษณา ท่านอาจจะปรับต้องเพิ่มงบประมาณรายวันตา";
const kTeststr_ti_Ethi = " ሃገር ተረፎም ዘለዉ ኢትዮጵያውያን ኣብቲ ምስ ኢትዮጵያ ዝዳውብ ኣውራጃ ደቡብ ንኽነብሩ ኣይፍቀደሎምን እዩ ካብ ሃገር ንኽትወጽእ ዜጋ ኹን ወጻእተኛ ናይ";
const kTeststr_tk_Cyrl = " айдянларына ынанярмыка эхли боз мейданлары сурулип гутарылан тебигы ота гарып гумлукларда миллиондан да артыкмач ири шахлы малы миллиона";
const kTeststr_tk_Latn = " akyllylyk çyn söýgi üçin böwet däl de tebigylykdyr duýgularyň gödeňsiligi aç açanlygy bahyllygy söýgini betnyşanlyk derejesine düşürýändir söýeni söý söýmedige süýkenme özüni söýmeýändigini görmek ýigit üçin uly";
const kTeststr_tl_Latn = " a na ugma sa google ay nakaka bantog sa gitna nang kliks na nangyayari sa pamamagitan nang ordinaryong paggagamit at sa kliks na likha nang pandaraya o hindi tunay na paggamit bunga nito nasasala namin ang mga kliks na hindi kailangan o hindi gusto nang";
const kTeststr_tl_Tglg = " ᜋᜇ᜔ ᜐᜓᜎᜆ᜔ ᜃ ᜈᜅ᜔ ᜊᜌ᜔ᜊᜌᜒᜈ᜔ ᜂᜉᜅ᜔᜔ ᜋᜐᜈᜌ᜔ ᜎᜅ᜔ ᜁᜐ ᜉᜅ᜔ ᜀᜃ᜔ᜎᜆ᜔ ᜆᜓᜅ᜔ᜃᜓᜎ᜔ ᜐ ᜊᜌ᜔ᜊᜌᜒᜈ᜔ ᜐ ᜆᜒᜅᜒᜈ᜔ ᜃᜓ";
const kTeststr_tlh_Latn = " a ghuv bid soh naq jih lodni yisov chich wo vamvo qeylis lunge pu chah povpu vodleh a dah ghah cho ej dah wo che pujwi bommu tlhegh darinmohlahchu pu majqa horey so lom qa ip quv law may vad suvtahbogh wa sanid utlh quv pus datu pu a vitu chu pu johwi tar";
const kTeststr_tn_Latn = " go etela batla ditsebe tsa web tse di nang le le batla ditsebe tse di golaganya le tswang mo leka go batla web yotlhe batla mo web yotlhe go bona home page ya google batla mo a o ne o batla gore a o ne o batla ditsebe tsa bihari batla mo re maswabi ga go";
const kTeststr_to_Latn = " a ke kumi oku ikai ke ma u vakai ki hono hokohoko faka alafapeti api pe ko e uluaki peesi a ho o fekumi faka malatihi fekumi ki he lea oku fakaha atu pe ko ha fonua fekumi ki he fekumi ki he peesi oku ngaahi me a oku sai imisi alu ki he ki he ulu aki";
const kTeststr_tr_Latn = " a ayarlarınızı görmeniz ve yönetmeniz içindir eğer kampanyanız için günlük bütçenizi gözden geçirebileceğiniz yeri arıyorsanız kampanya yönetimi ne gidin kampanyanızı seçin ve kampanya ayarlarını düzenle yi tıklayın sunumu";
const kTeststr_ts_Latn = " a ku na timhaka leti nga ta vulavuriwa na google google yi hlonipha yi tlhela yi sirheleta vanhu hinkwavo lava tirhisaka google toolbar ku dyondza hi vusireleli eka system ya hina hi kombela u hlaya vusireleli bya hina eka toolbar mbulavulo wu tshikiwile";
const kTeststr_tt_Cyrl = "ачарга да бирмәде чәт чәт килеп тора безнең абыйнымы олы абыйнымы эштән";
const kTeststr_tt_Latn = " alarnı eşkärtü proğramnarın eşläwen däwam itü tatar söylämen buldıru wä sizep alu sistemnarın eşläwen däwat itü häm başqalar yılnıñ mayında tatar internetı ictimağıy oyışması milli ts isemle berençe däräcäle häm tat";
const kTeststr_tw_Latn = " amammui tumidifo no bɛtow ahyɛ atoro som so mpofirim na wɔasɛe no pasaa ma ayɛ nwonwa dɛn na ɛbɛka wɔn ma wɔayɛ saa bible no ma ho mmuae wɔ adiyisɛm nhoma no mu sɛ onyankopɔn na ɔde hyɛɛ wɔn komam sɛ wɔmma ne nsusuwii mmra mu";
const kTeststr_ug_Arab = " ئالەملەرنىڭ پەرۋەردىگارىدىن تىلەيمەن سىلەر بۇ يەرلەردە باغچىلاردىن بۇلاقلاردىن زىرائەتلەردىن يۇمشاق پىشقان خورمىلاردىن بەھرىمەن بولۇپ";
const kTeststr_ug_Cyrl = " а башлиди әмма бу қетимқи канада мәтбуатлириниң хәвәрлиридә илгирикидәк хитай һөкүмәт мәтбуатлиридин нәқил алидиған вә уни көчүрүп";
const kTeststr_ug_Latn = " adawet bolghachqa hazir musherrepmu bu ikki partiyining birleshme hökümet qurushta pikir birliki hasil qilalmasliqini kütüwatqan iken wehalenki pakistan xelq partiyisining rehbiri asif eli zerdari pakistandiki bashqa ushshaq partiyilerning rehberliri";
const kTeststr_uk_Cyrl = " а більший бюджет щоб забезпечити собі максимум прибутків від переходів відстежуйте свої об яви за датою географічним розташуванням";
const kTeststr_ur_Arab = " آپ کو کم سے کم ممکنہ رقم چارج کرتا ہے اس کی مثال کے طور پر فرض کریں اگر آپ کی زیادہ سے زیادہ قیمت فی کلِک امریکی ڈالر اور کلِک کرنے کی شرح ہو تو";
const kTeststr_uz_Arab = " آرقلی بوتون سیاسی حزب و گروه لرفعالیتیگه رخصت بیرگن اخبارات واسطه لری شو ییل مدتیده مثال سیز ترقی تاپکن و اهالی نینگ اقتصادی وضعیتی اوتمیش";
const kTeststr_uz_Cyrl = " а гапирадиган бўлсак бунинг иккита йўли бор биринчиси мана шу қуриган сатҳини қумликларни тўхтатиш учун экотизимни мустаҳкамлаш қумга";
const kTeststr_uz_Latn = " abadiylashtirildi aqsh ayol prezidentga tayyormi markaziy osiyo afg onistonga qanday yordam berishi mumkin ukrainada o zbekistonlik muhojirlar tazyiqdan shikoyat qilmoqda gruziya va ukraina hozircha natoga qabul qilinmaydi afg oniston o zbekistonni g";
const kTeststr_ve_Latn = "Vho ṱanganedzwa kha Wikipedia nga tshiVenḓa. Vhadivhi vha manwalo a TshiVenda vha talusa divhazwakale na vhubvo ha Vhavenda ngau fhambana. Vha tikedza mbuno dzavho uya nga mawanwa a thoduluso dze vha ita. Vhanwe vha vhatodulusi vhari Vhavenda vho tumbuka Afrika vhukati vha tshimbila vha tshiya Tshipembe ha Afrika, Rhodesia hune ha vho vhidzwa Zimbagwe namusi.";
const kTeststr_vi_Latn = " adsense cho nội dung nhà cung cấp dịch vụ di động xác minh tín dụng thay đổi nhãn kg các ô xem chi phí cho từ chối các đơn đặt hàng dạng cấp dữ liệu ác minh trang web của bạn để xem";
const kTeststr_vo_Latn = " brefik se volapükavol nüm balid äpubon ün dü lif lölik okas redakans älaipübons gasedi at nomöfiko äd ai mu kuratiko pläo timü koup nedäna fa ns deutän kü päproibon fa koupanef me gased at ästeifülom ad propagidön volapüki as sam ün";
const kTeststr_war_Latn = "Amo ini an balay han Winaray o Binisaya nga Lineyte-Samarnon nga Wikipedia, an libre ngan gawasnon nga ensayklopedya nga bisan hin-o puyde magliwat o mag-edit. An Wikipedia syahan gintikang ha Iningles nga yinaknan han tuig 2001. Ini nga bersyon Winaray gintikang han ika-25 han Septyembre 2005 ngan ha yana mayda 514,613 nga artikulo. Kon karuyag niyo magsari o magprobar, pakadto ha . An Gastrotheca pulchra[2] in uska species han Anura nga ginhulagway ni Ulisses Caramaschi ngan Rodrigues hadton 2007. An Gastrotheca pulchra in nahilalakip ha genus nga Gastrotheca, ngan familia nga Hemiphractidae.[3][4] Ginklasipika han IUCN an species komo kulang hin datos.[1] Waray hini subspecies nga nakalista.[3]";
const kTeststr_wo_Latn = " am ak dëgg dëggam ak gëm aji bind ji te gëstu ko te jëfandikoo tegtalu xel ci saxal ko sokraat nag jëfandikoo woon na xeltu ngir tas jikko yu rafet ci biir nit ñi ak dëggu ak soppante sokraat nag ñëw na mook aflaton platon sukkandiku ci ñaari";
const kTeststr_xh_Latn = " a naynga zonke futhi libhengezwa kwiwebsite yebond yasemzantsi afrika izinga elisebenzayo xa usenza olu tyalo mali liya kusebenza de liphele ixesha lotyalo mali lwakho inzala ihlawulwa rhoqo emva kweenyanga ezintandathu ngomhla wamashumi amathathu ananye";
const kTeststr_xx_Bugi = "ᨄᨛᨑᨊᨒ ᨑᨗ ᨔᨒᨗᨓᨛ ᨕᨗᨋᨗᨔᨗ ᨒᨛᨄ ᨑᨛᨔᨛᨆᨗᨊ";
const kTeststr_xx_Goth = "𐌰 𐌰𐌱𐍂𐌰𐌷𐌰𐌼 𐌰𐌲𐌲𐌹𐌻𐌹𐍃𐌺𐍃 𐌸𐌹𐌿𐌳𐌹𐍃𐌺𐍃 𐍆𐍂𐌰𐌲𐌺𐌹𐍃𐌺𐍃";
const kTeststr_yi_Hebr = "און פאנטאזיע ער איז באקאנט צים מערסטן פאר זיינע באַלאַדעס ער האָט געוווינט אין ווארשע יעס פאריס ליווערפול און לאנדאן סוף כל סוף איז ער";
const kTeststr_yo_Latn = " abinibi han ikawe alantakun le ni opolopo ede abinibi ti a to lesese bi eniyan to fe lo se fe lati se atunse jowo mo pe awon oju iwe itakunagbaye miran ti ako ni oniruru ede abinibi le faragba nipa atunse ninu se iwadi blogs ni ori itakun agbaye ti e ba";
const kTeststr_za_Hani = " 两个宾语的字数较少时 只带一个动词 否则就带两个动词 三句子类 从句子方面去谈汉 壮语结构格式相异的类型的 叫句子类 汉 壮语中 句子类结构格式有差别的自然不少";
const kTeststr_za_Latn = " dih yinzminz ndaej daengz bujbienq youjyau dih cingzyin caeuq cinhingz diuz daihit boux boux ma daengz lajmbwn couh miz cwyouz cinhyenz caeuq genzli bouxboux bingzdaengj gyoengq vunz miz lijsing caeuq liengzsim wngdang daih gyoengq de lumj beixnuengx";
const kTeststr_zh_Hans = "产品的简报和公告 提交该申请后无法进行更改 请确认您的选择是正确的 对于要提交的图书 我确认 我是版权所有者或已得到版权所有者的授权 要更改您的国家 地区 请在此表的最上端更改您的";
const kTeststr_zh_Hant = " 之前為 帳單交易作業區 已變更 廣告內容 之前為 銷售代表 之前為 張貼日期為 百分比之前為 合約 為 目標對象條件已刪除 結束日期之前為";
const kTeststr_zu_Latn = " ana engu uma inkinga iqhubeka siza ubike kwi isexwayiso ngenxa yephutha lomlekeleli sikwazi ukubuyisela emuva kuphela imiphumela engaqediwe ukuthola imiphumela eqediwe zama ukulayisha kabusha leli khasi emizuzwini engu uma inkinga iqhubeka siza uthumele";
const kTeststr_zzb_Latn = "becoose a ve a leemit qooereees tu vurds um gesh dee bork bork nu peges vere a fuoond cunteeening is a fery cummun vurd und ves nut inclooded in yuoor seerch zee ooperetur is unnecessery ve a incloode a ell seerch terms by deffoolt um de hur de hur de hur";
const kTeststr_zze_Latn = " a diffewent type of seawch send feedback about google wiwewess seawch to wap google com wesuwts found on de entiwe web fow wesuwts found on de mobiwe web fow de functionawity of de toolbar up button has been expanded swightwy it now considews fow exampwe";
const kTeststr_zzh_Latn = " b x z un b e t und rs n a dr ss p as ry an th r a dr ss ry us n a l ss mb gu us c ti n l ke a z p c d n a dr ss nt r d pl as en r n a dr ss y ur s ar h f r n ar d d n t m tch ny l c ti n w th n m l s nd m r r at d p g s th l c ti ns b l w w r ut m t ca y";
const kTeststr_zzp_Latn = " away ackupbay editcray ardcay ybay isitingvay ouryay illingbay eferencespray agepay orway isitvay ethay adwordsway elphay entrecay orfay oremay etailsday adwordsway ooglegay omcay upportsay";
// Two very close Wikipedia page beginnings: the same article opening in
// Malay (ms) and Indonesian (id). kTestPairs uses them as its pair of
// statistically-close languages, and marks the Indonesian sample as one
// whose detection is expected NOT to be confident.
const kTeststr_ms_close = "sukiyaki wikipedia bahasa melayu ensiklopedia bebas sukiyaki dari wikipedia bahasa melayu ensiklopedia bebas lompat ke navigasi gelintar sukiyaki sukiyaki hirisan tipis daging lembu sayur sayuran dan tauhu di dalam periuk besi yang dimasak di atas meja makan dengan cara rebusan sukiyaki dimakan dengan mence";
const kTeststr_id_close = "sukiyaki wikipedia indonesia ensiklopedia bebas berbahasa bebas berbahasa indonesia langsung ke navigasi cari untuk pengertian lain dari sukiyaki lihat sukiyaki irisan tipis daging sapi sayur sayuran dan tahu di dalam panci besi yang dimasak di atas meja makan dengan cara direbus sukiyaki dimakan dengan mence";
// Simple intermixed French/English text: one English sentence, three French
// segments, then two more English sentences. kTestPairs expects the overall
// detection result for this sample to be French.
// NOTE(review): the segment ending in "ci-dessus." has no trailing space, so
// it is joined directly to "Motoring"; left untouched because the sample text
// is test input data.
const kTeststr_fr_en_Latn = "France is the largest country in Western Europe and the third-largest in Europe as a whole. " +
"A accès aux chiens et aux frontaux qui lui ont été il peut consulter et modifier ses collections et exporter " +
"Cet article concerne le pays européen aujourdhui appelé République française. Pour dautres usages du nom France, " +
"Pour une aide rapide et effective, veuiller trouver votre aide dans le menu ci-dessus." +
"Motoring events began soon after the construction of the first successful gasoline-fueled automobiles. The quick brown fox jumped over the lazy dog";
// This can be used to cross-check the build date of the main quadgram table:
// the language detected for this string is tied to a particular table build,
// so the expected language in kTestPairs has to be updated whenever the table
// is rebuilt.
const kTeststr_version = "qpdbmrmxyzptlkuuddlrlrbas las les qpdbmrmxyzptlkuuddlrlrbas el la qpdbmrmxyzptlkuuddlrlrbas";
// Table of detection test cases, consumed by test_pairs below.
// Each entry is an array of the form
//   [expected language code, upper-case language name, sample text, flag]
// - the language code is compared with the `language` field of the result;
// - the upper-case name is not read by the test and only labels the entry;
// - the sample text is the string handed to the detector;
// - the optional flag, when truthy, means the detector is expected to report
//   the result as NOT confident (entries without it must be confident).
const kTestPairs = [
// A simple case to begin
["en", "ENGLISH", kTeststr_en],
// 20 languages recognized via Unicode script
["hy", "ARMENIAN", kTeststr_hy_Armn],
["chr", "CHEROKEE", kTeststr_chr_Cher],
["dv", "DHIVEHI", kTeststr_dv_Thaa],
["ka", "GEORGIAN", kTeststr_ka_Geor],
["el", "GREEK", kTeststr_el_Grek],
["gu", "GUJARATI", kTeststr_gu_Gujr],
["iu", "INUKTITUT", kTeststr_iu_Cans],
["kn", "KANNADA", kTeststr_kn_Knda],
["km", "KHMER", kTeststr_km_Khmr],
["lo", "LAOTHIAN", kTeststr_lo_Laoo],
["lif", "LIMBU", kTeststr_lif_Limb],
["ml", "MALAYALAM", kTeststr_ml_Mlym],
["or", "ORIYA", kTeststr_or_Orya],
["pa", "PUNJABI", kTeststr_pa_Guru],
["si", "SINHALESE", kTeststr_si_Sinh],
["syr", "SYRIAC", kTeststr_syr_Syrc],
["tl", "TAGALOG", kTeststr_tl_Tglg], // Also in quadgram list below
["ta", "TAMIL", kTeststr_ta_Taml],
["te", "TELUGU", kTeststr_te_Telu],
["th", "THAI", kTeststr_th_Thai],
// 4 languages recognized via single letters
["zh", "CHINESE", kTeststr_zh_Hans],
["zh-Hant", "CHINESET", kTeststr_zh_Hant],
["ja", "JAPANESE", kTeststr_ja_Hani],
["ko", "KOREAN", kTeststr_ko_Hani],
// 60 languages recognized via combinations of four letters
// (Romanian and Serbian each appear twice, once per script)
["af", "AFRIKAANS", kTeststr_af_Latn],
["sq", "ALBANIAN", kTeststr_sq_Latn],
["ar", "ARABIC", kTeststr_ar_Arab],
["az", "AZERBAIJANI", kTeststr_az_Latn],
["eu", "BASQUE", kTeststr_eu_Latn],
["be", "BELARUSIAN", kTeststr_be_Cyrl],
["bn", "BENGALI", kTeststr_bn_Beng], // No Assamese in subset
["bh", "BIHARI", kTeststr_bh_Deva],
["bg", "BULGARIAN", kTeststr_bg_Cyrl],
["ca", "CATALAN", kTeststr_ca_Latn],
["ceb", "CEBUANO", kTeststr_ceb_Latn],
["hr", "CROATIAN", kTeststr_hr_Latn],
["cs", "CZECH", kTeststr_cs_Latn],
["da", "DANISH", kTeststr_da_Latn],
["nl", "DUTCH", kTeststr_nl_Latn],
["en", "ENGLISH", kTeststr_en_Latn],
["et", "ESTONIAN", kTeststr_et_Latn],
["fi", "FINNISH", kTeststr_fi_Latn],
["fr", "FRENCH", kTeststr_fr_Latn],
["gl", "GALICIAN", kTeststr_gl_Latn],
["lg", "GANDA", kTeststr_lg_Latn],
["de", "GERMAN", kTeststr_de_Latn],
["ht", "HAITIAN_CREOLE", kTeststr_ht_Latn],
["iw", "HEBREW", kTeststr_iw_Hebr],
["hi", "HINDI", kTeststr_hi_Deva],
["hmn", "HMONG", kTeststr_blu_Latn],
["hu", "HUNGARIAN", kTeststr_hu_Latn],
["is", "ICELANDIC", kTeststr_is_Latn],
["id", "INDONESIAN", kTeststr_id_Latn],
["ga", "IRISH", kTeststr_ga_Latn],
["it", "ITALIAN", kTeststr_it_Latn],
["jw", "JAVANESE", kTeststr_jw_Latn],
["rw", "KINYARWANDA", kTeststr_rw_Latn],
["lv", "LATVIAN", kTeststr_lv_Latn],
["lt", "LITHUANIAN", kTeststr_lt_Latn],
["mk", "MACEDONIAN", kTeststr_mk_Cyrl],
["ms", "MALAY", kTeststr_ms_Latn],
["mt", "MALTESE", kTeststr_mt_Latn],
["mr", "MARATHI", kTeststr_mr_Deva],
["ne", "NEPALI", kTeststr_ne_Deva],
["no", "NORWEGIAN", kTeststr_no_Latn],
["fa", "PERSIAN", kTeststr_fa_Arab],
["pl", "POLISH", kTeststr_pl_Latn],
["pt", "PORTUGUESE", kTeststr_pt_Latn],
["ro", "ROMANIAN", kTeststr_ro_Latn],
["ro", "ROMANIAN", kTeststr_ro_Cyrl],
["ru", "RUSSIAN", kTeststr_ru_Cyrl],
["gd", "SCOTS_GAELIC", kTeststr_gd_Latn],
["sr", "SERBIAN", kTeststr_sr_Cyrl],
["sr", "SERBIAN", kTeststr_sr_Latn],
["sk", "SLOVAK", kTeststr_sk_Latn],
["sl", "SLOVENIAN", kTeststr_sl_Latn],
["es", "SPANISH", kTeststr_es_Latn],
["sw", "SWAHILI", kTeststr_sw_Latn],
["sv", "SWEDISH", kTeststr_sv_Latn],
["tl", "TAGALOG", kTeststr_tl_Latn],
["tr", "TURKISH", kTeststr_tr_Latn],
["uk", "UKRAINIAN", kTeststr_uk_Cyrl],
["ur", "URDU", kTeststr_ur_Arab],
["vi", "VIETNAMESE", kTeststr_vi_Latn],
["cy", "WELSH", kTeststr_cy_Latn],
["yi", "YIDDISH", kTeststr_yi_Hebr],
// Added 2013.08.31 so-Latn ig-Latn ha-Latn yo-Latn zu-Latn
["so", "SOMALI", kTeststr_so_Latn],
["ig", "IGBO", kTeststr_ig_Latn],
["ha", "HAUSA", kTeststr_ha_Latn],
["yo", "YORUBA", kTeststr_yo_Latn],
["zu", "ZULU", kTeststr_zu_Latn],
// Added 2014.01.22 bs-Latn
["bs", "BOSNIAN", kTeststr_bs_Latn],
// 2 statistically-close languages
// (the trailing `true` marks the Indonesian result as expected to be
// reported without confidence)
["id", "INDONESIAN", kTeststr_id_close, true],
["ms", "MALAY", kTeststr_ms_close],
// Simple intermixed French/English text
["fr", "FRENCH", kTeststr_fr_en_Latn],
// Cross-check the main quadgram table build date
// Change the expected language each time it is rebuilt
// (the commented-out entry records the expectation for the previous build)
//["WELSH", kTeststr_version], // 2013.07.15
["az", "AZERBAIJANI", kTeststr_version] // 2014.01.31
];
Components.utils.import("resource:///modules/translation/LanguageDetector.jsm");
/**
 * Runs every sample listed in kTestPairs through
 * LanguageDetector.detectLanguage and checks both the reported language code
 * and the reported confidence.
 *
 * The task is a standard ES6 generator (`function*`). The previous form used
 * `yield` inside a plain `function`, which only parses as a SpiderMonkey
 * legacy generator and is a syntax error in standard strict-mode JavaScript.
 *
 * Samples are processed one at a time: each `yield` hands the value returned
 * by detectLanguage back to the task runner and resumes with the detection
 * result before the next sample is submitted.
 */
add_task(function* test_pairs() {
  // Entry layout: [expected code, label (unused here), sample text, flag].
  for (let [expectedCode, , sampleText, expectNotConfident] of kTestPairs) {
    let result = yield LanguageDetector.detectLanguage(sampleText);
    do_check_eq(result.language, expectedCode);
    // The optional flag is truthy only for samples whose detection is
    // expected to be unsure; every other sample must be confident.
    do_check_eq(result.confident, !expectNotConfident);
  }
});
// Expose run_next_test under the name run_test.
// NOTE(review): presumably run_test is the entry point the xpcshell harness
// calls, so this alias makes the harness start the tasks queued with
// add_task — confirm against the harness head file.
var run_test = run_next_test;

View File

@ -0,0 +1,6 @@
[DEFAULT]
head =
tail =
firefox-appdir = browser
[test_cld2.js]