Files
ZY_encoder 51e346a522 tex-hyphen告警清理
Signed-off-by: ZY_encoder <zhouyong10@huawei.com>
2025-03-24 17:20:26 +08:00

478 lines
16 KiB
C++

/*
* Copyright (c) 2024 Huawei Device Co., Ltd.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "hyphen_pattern.h"
#include <codecvt>
#include <cstdio>
#include <cerrno>
#include <fcntl.h>
#include <fstream>
#include <iostream>
#include <map>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unicode/utf.h>
#include <unicode/utf8.h>
#include <unistd.h>
using namespace std;
namespace OHOS::Hyphenate {
// Converts a UTF-8 encoded string into a sequence of UTF-16 code units.
//
// Code points that fit into one UTF-16 unit are stored directly; all others
// are stored as a lead/trail surrogate pair. U8_NEXT reports a malformed byte
// sequence by setting the code point to a negative value; such sequences are
// replaced by U+FFFD so that no bogus surrogate pair is produced from the
// negative value.
//
// utf8Str: input text, expected to be UTF-8.
// Returns the UTF-16 code units in input order (empty for empty input).
vector<uint16_t> ConvertToUtf16(const string& utf8Str)
{
    constexpr UChar32 replacementChar = 0xFFFD;
    const auto* bytes = reinterpret_cast<const uint8_t*>(utf8Str.c_str());
    const int32_t textLength = static_cast<int32_t>(utf8Str.size());
    vector<uint16_t> target;
    // every UTF-8 sequence yields at most as many UTF-16 units as it has bytes
    target.reserve(utf8Str.size());
    int32_t i = 0;
    while (i < textLength) {
        UChar32 c = 0;
        U8_NEXT(bytes, i, textLength, c);
        if (c < 0) {
            // malformed input: U8_NEXT already advanced i past the bad byte(s)
            c = replacementChar;
        }
        if (U16_LENGTH(c) == 1) {
            target.push_back(static_cast<uint16_t>(c));
        } else {
            target.push_back(U16_LEAD(c));
            target.push_back(U16_TRAIL(c));
        }
    }
    return target;
}
// Overlay for a run of pattern bytes inside the mapped pattern file.
// CodeInfo::ProcessPattern casts a file address to this type and reads
// patterns[i] for every i below a count decoded from the node, so the
// declared length of 8 is only a placeholder for a variable-length array.
struct Pattern {
uint8_t patterns[8]; // dynamic: real length comes from the node that refers to it
};
// Overlay for a counted array of 16-bit values inside the mapped pattern file.
// It is used for the top-level mapping table (CodeInfo::GetHeader), for linear
// code runs (CodeInfo::ProcessLinear) and for code/value pairs
// (CodeInfo::ProcessNextCode). Readers index codes[] up to `count`, so the
// declared length of 3 is only a placeholder for a variable-length array.
struct ArrayOf16bits {
uint16_t count; // number of 16-bit entries that follow in codes[]
uint16_t codes[3]; // dynamic: `count` entries are actually present
};
// Overlay for the fixed-size header at the very beginning of the mapped
// pattern file, plus the helpers that translate a UTF-16 code into a table
// offset.
struct Header {
    uint8_t magic1;
    uint8_t magic2;
    uint8_t minCp;     // lowest code covered by the contiguous code range
    uint8_t maxCp;     // highest code covered by the contiguous code range
    uint32_t toc;      // byte offset from the file start; read as 32-bit entries by CodeInfo::GetCodeInfo
    uint32_t mappings; // byte offset from the file start of the ArrayOf16bits mapping table
    uint32_t version;  // CodeInfo::ProcessPattern reads the top byte and the low 16 bits

    // Translates `code` into a table offset.
    //
    // With `maps` (top-level lookup):
    //   - code inside [minCp, maxCp]: (code - minCp) * HYPHEN_BASE_CODE_SHIFT + 1
    //   - code outside the range: the mapping table is searched from the back
    //     for a (code, value) pair; on a hit the offset is derived from the
    //     mapped value, otherwise MaxCount(maps) is returned as the
    //     "not mappable" marker.
    // Without `maps` (sub-table lookup):
    //   - code inside the range: code - minCp
    //   - otherwise: maxCp + 1
    inline uint16_t CodeOffset(uint16_t code, const ArrayOf16bits* maps = nullptr) const
    {
        const bool outOfRange = code < minCp || code > maxCp;
        if (maps == nullptr) {
            if (outOfRange) {
                return maxCp + 1;
            }
            return (code - minCp);
        }
        if (!outOfRange) {
            // + 1 because previous end is before next start
            // 2x because every second value points to a beginning address
            return (code - minCp) * HYPHEN_BASE_CODE_SHIFT + 1;
        }
        // Walk the (code, value) pairs from the last one to the first. The loop
        // only continues while a full step is still available, so a count that
        // is not a multiple of the step cannot wrap the index around.
        const size_t step = static_cast<size_t>(HYPHEN_BASE_CODE_SHIFT);
        for (size_t i = maps->count; i >= step;) {
            i -= step;
            if (maps->codes[i] == code) {
                auto offset = maps->codes[i + 1];
                return (maxCp - minCp) * HYPHEN_BASE_CODE_SHIFT + (offset - maxCp) * HYPHEN_BASE_CODE_SHIFT + 1;
            }
        }
        return MaxCount(maps);
    }

    // Normalises one code of the input word in place and prints the result.
    // '.', '\'' and '-' are replaced by '`', '^' and '_'; every other code is
    // passed through tolower().
    inline static void ToLower(uint16_t& code)
    {
        if (code == '.') {
            code = '`';
        } else if (code == '\'') {
            code = '^';
        } else if (code == '-') {
            code = '_';
        } else if (code <= 0xff) {
            // tolower() is only defined for values representable as unsigned char
            // (or EOF); larger UTF-16 units are left unchanged instead of being
            // passed in.
            code = static_cast<uint16_t>(tolower(static_cast<int>(code)));
        }
        std::cout << "tolower: " << std::hex << static_cast<int>(code) << std::endl;
    }

    // Offset that lies just past every valid top-level offset; CodeOffset()
    // returns it when a code cannot be mapped.
    inline uint16_t MaxCount(const ArrayOf16bits* maps) const
    {
        // need to write this in binary provider !!
        return (maxCp - minCp) * HYPHEN_BASE_CODE_SHIFT + maps->count;
    }
};
// State of one lookup session over a memory-mapped pattern file.
//
// OpenPatFile() maps the file, GetHeader() locates the header and the
// top-level mapping table, GetCodeInfo() positions the cursor on the entry of
// one code, and the Process*() members walk the tables from there.
// ClearResource() releases the mapping and the file.
struct CodeInfo {
    // Opens and maps `filePath` read-only; returns SUCCEED or FAILED.
    int32_t OpenPatFile(const char* filePath);
    // Resolves fHeader / fMappings / fMaxCount from the mapped file.
    int32_t GetHeader();
    // Positions the cursor on the top-level entry of `code`.
    int32_t GetCodeInfo(uint16_t code);
    // Applies the pattern referenced by the current node (if any) to `result`.
    void ProcessPattern(const size_t& offset, vector<uint8_t>& result, bool direct);
    // Steps to the previous character through a directly indexed table; true means stop.
    bool ProcessDirect(const std::vector<uint16_t>& target, const size_t& offset);
    // Matches a stored run of codes against the preceding characters of the word.
    void ProcessLinear(const std::vector<uint16_t>& target, const size_t& offset, vector<uint8_t>& result);
    // Steps to the previous character through a (code, value) pair list; true means stop.
    bool ProcessNextCode(const std::vector<uint16_t>& target, const size_t& offset);
    // Unmaps the file and closes it.
    void ClearResource();

    Header* fHeader{nullptr};            // start of the mapped file, viewed as Header
    uint8_t* fAddress{nullptr};          // base address of the mapping
    FILE* fFile{nullptr};                // stream backing the mapping
    size_t fFileSize{0};                 // length of the mapping in bytes
    uint16_t fMaxCount{0};               // Header::MaxCount() value, marks "code not mappable"
    PathType fType{PathType::PATTERN};   // type of the node the cursor is on
    uint16_t fOffset{0};                 // last value returned by Header::CodeOffset()
    uint16_t fCode{0};                   // code currently being resolved
    uint32_t fIndex{0};                  // number of characters walked back from the start position
    uint32_t fNextOffset{0};             // offset of the next node, in 16-bit units (zero until GetCodeInfo sets it)
    uint16_t* fStaticOffset{nullptr};    // base of the 16-bit table selected by GetCodeInfo()
    ArrayOf16bits* fMappings{nullptr};   // top-level mapping table
};
// Opens `filePath` and maps its whole content read-only into memory.
//
// On success the stream, the mapping address and the mapping length are stored
// in fFile / fAddress / fFileSize and SUCCEED is returned; the caller releases
// them with ClearResource(). On any failure a message is written to cerr, the
// stream (if it was opened) is closed again and FAILED is returned.
int32_t CodeInfo::OpenPatFile(const char* filePath)
{
    std::cout << "Attempt to mmap " << filePath << std::endl;
    FILE* file = fopen(filePath, "r");
    if (file == nullptr) {
        // capture errno before any stream output gets a chance to overwrite it
        const int openError = errno;
        std::cerr << "FATAL: " << openError << std::endl;
        return FAILED;
    }
    const int fd = fileno(file);
    struct stat st;
    if (fstat(fd, &st) != 0) {
        std::cerr << "FATAL: fstat" << std::endl;
        fclose(file);
        return FAILED;
    }
    const size_t length = st.st_size;
    void* mapping = mmap(nullptr, length, PROT_READ, MAP_PRIVATE, fd, 0u);
    if (mapping == MAP_FAILED) {
        std::cerr << "FATAL: mmap" << std::endl;
        fclose(file);
        return FAILED;
    }
    uint8_t* address = static_cast<uint8_t*>(mapping);
    // dump the first 32-bit word of the file for diagnostics
    std::cout << "Magic: " << std::hex << *reinterpret_cast<uint32_t*>(address) << std::dec << std::endl;
    this->fFile = file;
    this->fFileSize = length;
    this->fAddress = address;
    return SUCCEED;
}
// Builds the lookup form of a word: the input is wrapped in a leading and a
// trailing '.', converted to UTF-16 and every code unit is normalised with
// Header::ToLower().
static std::vector<uint16_t> GetInputWord(const char* input)
{
    std::string decorated(".");
    decorated += input;
    decorated += ".";
    std::vector<uint16_t> codes = ConvertToUtf16(decorated);
    for (size_t pos = 0; pos < codes.size(); ++pos) {
        Header::ToLower(codes[pos]);
    }
    return codes;
}
int32_t CodeInfo::GetHeader()
{
fHeader = reinterpret_cast<Header*>(fAddress);
uint16_t minCp = fHeader->minCp;
uint16_t maxCp = fHeader->maxCp;
// get master table, it always is in direct mode
fMappings = reinterpret_cast<ArrayOf16bits*>(reinterpret_cast<uint32_t*>(fAddress + fHeader->mappings));
// this is actually beyond the real 32 bit address, but just to have an offset that
// is clearly out of bounds without recalculating it again
fMaxCount = fHeader->MaxCount(fMappings);
cout << "min/max: " << minCp << "/" << maxCp << " count " << static_cast<int>(fMaxCount) << endl;
cout << "size of top level mappings: " << static_cast<int>(fMappings->count) << endl;
if (minCp == maxCp && fMappings->count == 0) {
cerr << "### unexpected min/max in input file-> exit" << endl;
return FAILED;
}
return SUCCEED;
}
void CodeInfo::ClearResource()
{
(void)munmap(fAddress, fFileSize);
fAddress = nullptr;
(void)fclose(fFile);
fFile = nullptr;
fFileSize = 0;
}
int32_t CodeInfo::GetCodeInfo(uint16_t code)
{
fType = PathType::PATTERN;
this->fCode = code;
this->fIndex = 0;
fOffset = fHeader->CodeOffset(code, fMappings);
if (fOffset == fMaxCount) {
cout << hex << char(code) << " unable to map, contiue straight" << endl;
return FAILED;
}
// previous entry end
uint32_t baseOffset =
*reinterpret_cast<uint32_t*>(reinterpret_cast<uint32_t*>(fAddress + fHeader->toc) + fOffset - 1);
uint32_t initialValue = *(reinterpret_cast<uint32_t*>(fAddress + fHeader->toc) + fOffset);
fType = static_cast<PathType>(initialValue >> SHIFT_BITS_30);
// direct and pairs need to have offset different from zero
if (initialValue == 0 && (fType == PathType::DIRECT || fType == PathType::PAIRS)) {
cout << char(code) << " is not in main dict, contiue straight" << endl;
return FAILED;
}
// base offset is 16 bit
fStaticOffset = reinterpret_cast<uint16_t*>(fAddress + HYPHEN_BASE_CODE_SHIFT * baseOffset);
// get a subtable according character
// once: read as 32bit, the rest of the access will be 16bit (13bit for offsets)
fNextOffset = (initialValue & 0x3fffffff);
cout << hex << baseOffset << " top level code: 0x" << hex << static_cast<int>(code) <<
" starting with offset: 0x" << hex << fOffset << " table-offset 0x" << fNextOffset << endl;
return SUCCEED;
}
void CodeInfo::ProcessPattern(const size_t& offset, vector<uint8_t>& result, bool direct)
{
cout << "direct : " << direct << " " << hex << fNextOffset << endl;
uint16_t poffset = 0;
if (direct && (fHeader->version >> 0x18) >= 0x2) {
poffset = *(reinterpret_cast<uint16_t*>(fAddress) + fNextOffset + (fHeader->version & 0xffff));
} else {
poffset = *(fStaticOffset + fNextOffset);
}
fNextOffset++; // there now is always at least pattern count before next node
if (!poffset) {
return;
}
uint16_t count = (poffset >> 0xc) * 0x4; // patterns are padded to 4 byte arrays.
// to save bits, the count is multiplied by four
poffset = 0xfff & poffset;
// if we have reached pattern, apply it to result
auto p = reinterpret_cast<const Pattern*>(fAddress + poffset);
if (count != 0) {
cout << "Node with a pattern, count " << count << hex << " offset: " << poffset << endl;
size_t i = 0;
for (size_t j = offset - fIndex; j < result.size() && i < count; j++) {
cout << " " << static_cast<int>(j) << ": pattern index: " << i << " value: 0x" << hex
<< static_cast<int>(p->patterns[i]) << endl;
result[j] = std::max(result[j], (p->patterns[i]));
i++;
}
}
}
// Moves the cursor one character back in the word through a table that is
// indexed directly by the code offset.
//
// target: the word being processed.
// offset: index in the word where the current walk started.
// Returns true when the walk has to stop (start of the word reached or the
// code lies outside the header's code range), false when the cursor was moved
// to the next node.
bool CodeInfo::ProcessDirect(const std::vector<uint16_t>& target, const size_t& offset)
{
    // running out of characters should never be the case here
    bool stop = (fIndex == offset);
    if (!stop) {
        // resolve new code point
        ++fIndex;
        fCode = target[offset - fIndex];
        fOffset = fHeader->CodeOffset(fCode);
        stop = (fHeader->minCp != fHeader->maxCp && fOffset > fHeader->maxCp);
    }
    if (stop) {
        std::cout << "# break loop on direct" << std::endl;
        return true;
    }

    // lower 14 bits: offset of the next node, upper bits: its type
    const auto entry = (fStaticOffset + fNextOffset)[fOffset];
    fNextOffset = entry & 0x3fff;
    fType = static_cast<PathType>(entry >> SHIFT_BITS_14);
    std::cout << " found direct: " << static_cast<char>(fCode) << " : " << std::hex << entry
              << " with offset: " << fNextOffset << std::endl;
    return false;
}
// Matches stored runs of codes against the characters that precede the
// current position in the word and applies the pattern that follows each
// fully matched run.
//
// Each round reads one counted run at the cursor. The round ends the walk
// when the run is longer than the remaining part of the word or when any code
// differs. After a full match the cursor is moved behind the run, the pattern
// found there is applied, and another round starts if the next word in the
// table is non-zero and `offset` is larger than the run length.
//
// target: the word being processed.
// offset: index in the word where the current walk started.
// result: per-position values, updated in place by ProcessPattern().
void CodeInfo::ProcessLinear(const std::vector<uint16_t>& target, const size_t& offset, vector<uint8_t>& result)
{
    for (;;) {
        const auto* run = reinterpret_cast<const ArrayOf16bits*>(fStaticOffset + fNextOffset);
        const auto runLength = run->count;
        ++fIndex;
        std::cout << "# linear " << offset << " " << fIndex << std::endl;

        // the run must not be longer than what is left of the word
        const bool tooLong = fIndex > offset || runLength > (offset - fIndex + 1);
        if (tooLong) {
            std::cout << "# break loop on linear " << offset << " " << fIndex << std::endl;
            return;
        }

        // compare the run against the rest of the string
        for (auto k = 0; k < runLength; k++) {
            const auto expected = run->codes[k];
            const auto actual = target[offset - fIndex];
            std::cout << " linear " << offset << " index: " << k << " value: " << std::hex
                      << static_cast<int>(expected) << " vs " << static_cast<int>(actual) << std::endl;
            if (expected != actual) {
                return;
            }
            ++fIndex;
        }

        // whole run matched: skip the items plus the count word, apply pattern
        fNextOffset += runLength + 1;
        --fIndex; // the next round increments it again
        ProcessPattern(offset, result, false);

        // peek if there is more to come
        const bool moreToCome = *(fStaticOffset + fNextOffset) != 0 && offset > runLength;
        if (!moreToCome) {
            return;
        }
    }
}
// Moves the cursor one character back in the word through a list of
// (code, value) pairs.
//
// The list at the cursor is scanned in steps of HYPHEN_BASE_CODE_SHIFT. The
// scan ends early as soon as a stored code is larger than the wanted one (the
// list is sorted). On a hit the value next to the code supplies the next node
// offset (lower 14 bits) and its type (upper bits).
//
// target: the word being processed.
// offset: index in the word where the current walk started.
// Returns true when the walk has to stop (start of the word reached or no
// matching pair), false when the cursor was moved to the next node.
bool CodeInfo::ProcessNextCode(const std::vector<uint16_t>& target, const size_t& offset)
{
    // resolve new code point
    if (fIndex == offset) { // should detect this sooner
        std::cout << "# break loop on pairs" << std::endl;
        return true;
    }

    const auto* pairs = reinterpret_cast<const ArrayOf16bits*>(fStaticOffset + fNextOffset);
    const uint16_t entries = pairs->count;
    ++fIndex;
    const uint16_t wanted = target[offset - fIndex];
    std::cout << " continue to value pairs with size: " << entries << " and code '" << static_cast<int>(wanted)
              << "'" << std::endl;

    // check pairs, array is sorted (but small)
    for (size_t j = 0; j < entries; j += HYPHEN_BASE_CODE_SHIFT) {
        const uint16_t key = pairs->codes[j];
        std::cout << " checking pair: " << j << " value: " << std::hex << static_cast<int>(key) << " vs "
                  << static_cast<int>(wanted) << std::endl;
        if (key > wanted) {
            break;
        }
        if (key != wanted) {
            continue;
        }

        fCode = wanted;
        std::cout << " new value pair in : 0x" << j << " with code 0x" << std::hex << static_cast<int>(fCode) << "'"
                  << std::endl;
        fOffset = fHeader->CodeOffset(fCode);
        if (fHeader->minCp != fHeader->maxCp && fOffset > fHeader->maxCp) {
            std::cout << "# could not resolve debug offset in pairs" << std::endl;
        }
        const uint16_t value = pairs->codes[j + 1];
        fNextOffset = value & 0x3fff;
        fType = static_cast<PathType>(value >> SHIFT_BITS_14);
        return false;
    }

    std::cout << "# break loop on pairs" << std::endl;
    return true;
}
// Prints the computed value for every position of the word.
//
// A header line with both sizes is always printed. The per-position lines
// ("<code in hex>: <value>") are only printed when `result` has no more
// entries than `target`, because each result entry is printed next to the
// target code at the same index.
//
// result: per-position values.
// target: the UTF-16 codes of the word the values belong to.
void PrintResult(const std::vector<uint8_t>& result, const std::vector<uint16_t>& target)
{
    std::cout << std::dec << "result size: " << result.size() << " while expecting " << target.size() << std::endl;
    if (result.size() > target.size()) {
        // target[i] would be read past its end
        return;
    }
    for (size_t i = 0; i < result.size(); ++i) {
        std::cout << std::hex << static_cast<int>(target[i]) << ": " << std::to_string(result[i]) << std::endl;
    }
}
// Opens the pattern file and reads its header into `codeInfo`.
//
// When the file was opened but the header is rejected, the resources held by
// `codeInfo` are released again before returning.
//
// Returns true when `codeInfo` is ready for lookups, false otherwise.
bool InitializeCodeInfo(OHOS::Hyphenate::CodeInfo& codeInfo, const char* filePath)
{
    const bool opened = (codeInfo.OpenPatFile(filePath) == SUCCEED);
    if (!opened) {
        return false;
    }
    const bool headerOk = (codeInfo.GetHeader() == SUCCEED);
    if (!headerOk) {
        codeInfo.ClearResource();
    }
    return headerOk;
}
// Walks the tables for the word position `i`, starting from the node that
// `codeInfo` currently points to.
//
// Every round first applies the pattern of the current node and then
// advances according to the node type: a PATTERN node ends the walk, a DIRECT
// node and any remaining type step one character back (and end the walk when
// the step fails), a LINEAR node is processed as a whole and ends the walk.
void ProcessCodeLoop(OHOS::Hyphenate::CodeInfo& codeInfo, const std::vector<uint16_t>& target, size_t i,
    std::vector<uint8_t>& result)
{
    using OHOS::Hyphenate::PathType;
    for (bool finished = false; !finished;) {
        std::cout << "#loop c: '" << codeInfo.fCode << "' starting with offset: 0x" << std::hex << codeInfo.fOffset <<
            " table-offset 0x" << codeInfo.fNextOffset << " index: " << codeInfo.fIndex << std::endl;
        const bool isPattern = (codeInfo.fType == PathType::PATTERN);
        codeInfo.ProcessPattern(i, result, isPattern);
        switch (codeInfo.fType) {
            case PathType::PATTERN:
                finished = true;
                break;
            case PathType::DIRECT:
                finished = codeInfo.ProcessDirect(target, i);
                break;
            case PathType::LINEAR:
                codeInfo.ProcessLinear(target, i, result);
                finished = true;
                break;
            default:
                finished = codeInfo.ProcessNextCode(target, i);
                break;
        }
    }
}
// Runs the table walk for every position of the word, from the last position
// down to position 1 (position 0 is never used as a starting point).
//
// Positions whose code has no top-level entry are skipped. An empty `target`
// is accepted and leaves `result` untouched.
//
// codeInfo: lookup state with an opened file and a valid header.
// target:   the UTF-16 codes of the word.
// result:   per-position values, updated in place.
void ProcessCodeInfo(OHOS::Hyphenate::CodeInfo& codeInfo, const std::vector<uint16_t>& target,
    std::vector<uint8_t>& result)
{
    if (target.empty()) {
        // size() - 1 below would wrap around to the largest size_t value
        return;
    }
    for (size_t i = target.size() - 1; i != 0; --i) {
        if (codeInfo.GetCodeInfo(target[i]) != SUCCEED) {
            continue;
        }
        codeInfo.fIndex = 0;
        ProcessCodeLoop(codeInfo, target, i, result);
    }
}
int32_t HyphenReader::Read(const char* filePath, const std::vector<uint16_t>& utf16Target) const
{
CodeInfo codeInfo;
if (!InitializeCodeInfo(codeInfo, filePath)) {
return FAILED;
}
std::vector<uint8_t> result(utf16Target.size(), 0);
ProcessCodeInfo(codeInfo, utf16Target, result);
codeInfo.ClearResource();
PrintResult(result, utf16Target);
return SUCCEED;
}
} // namespace OHOS::Hyphenate
namespace {
// position of the test word in argv
constexpr size_t ARG_NUM = 2;

// Validates the command line and converts the test word to its lookup form.
//
// Returns the converted word, or an empty vector (after printing the usage
// line) when the number of arguments is wrong.
// NOTE(review): GetInputWord() always adds a leading and a trailing '.', so the
// second emptiness check presumably never triggers - confirm.
std::vector<uint16_t> CheckArgs(int argc, char** argv)
{
    const auto printUsage = [] {
        cout << "usage: './hyphen hyph-en-us.hpb <mytestword>' " << endl;
    };
    if (argc != 3) { // 3: valid argument number
        printUsage();
        return {};
    }
    std::vector<uint16_t> word = OHOS::Hyphenate::GetInputWord(argv[ARG_NUM]);
    if (word.empty()) {
        printUsage();
    }
    return word;
}
} // namespace
int main(int argc, char** argv)
{
std::vector<uint16_t> target = CheckArgs(argc, argv);
if (target.empty()) {
return FAILED;
}
OHOS::Hyphenate::HyphenReader hyphenReader;
return hyphenReader.Read(argv[1], target);
}