gecko-dev/intl/lwbrk/rulebrk.c
Sylvestre Ledru e77bfc655d Bug 1519636 - Reformat recent changes to the Google coding style r=Ehsan
# ignore-this-changeset

Differential Revision: https://phabricator.services.mozilla.com/D38057

--HG--
extra : moz-landing-system : lando
2019-07-16 07:33:44 +00:00

389 lines
9.0 KiB
C
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#define TH_UNICODE
#include <stdlib.h>
#include <stdint.h>
#include <assert.h>
#include "th_char.h"
#define th_isalpha(c) (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))
#define th_isspace(c) ((c) == ' ' || (c) == '\t')
/*
/////////////////////////////////////////////////
// Thai character type array
*/
typedef unsigned short twb_t;
extern const twb_t _TwbType[0x100 - 0xa0];
/*
// bit definition
*/
#define VRS 0x0001
#define VRE 0x0002
#define VRX 0x0004
#define VRA 0x0008
#define VLA 0x0010
#define VLO 0x0020
#define VLI 0x0040
#define VC 0x0080
#define CC 0x0100
#define CS 0x0200
#define C2 0x0400
#define CHB 0x0800
#define CHE 0x1000
#define MT 0x2000
/*
//_#define me 0x2000
*/
#define M 0x4000
#define T 0x8000
#define VL (VLA | VLO | VLI)
#define VR (VRS | VRE | VRX)
#define NE (VL | VRS)
#define NB (VR | M)
#define V (VL | VR)
#define CX (CC | CS)
#define C (CX | VC)
#define A (C | V | M)
#define twbtype(c) (_TwbType[th_zcode(c)])
#ifndef TRUE
# define TRUE 1
# define FALSE 0
#endif
#define RETURN(b) return (b)
/*
/////////////////////////////////////////////////
*/
/*
 * Decide whether a Thai word break may be placed at a candidate position.
 *
 * The candidate position is the boundary just before rstr[0]. The function
 * looks at up to three characters to the left and up to three characters to
 * the right (rstr[0..2]) and applies a fixed list of rules, ordered from
 * "certain" to "likely", to their class flags (see _TwbType).
 *
 * pstr  : start of the text to the left of the candidate position.
 * left  : number of characters available to the left; the left context is
 *         read as (pstr + left)[-1], [-2], ... down to [-left].
 * rstr  : pointer to the character at the candidate position. The caller in
 *         this file always passes rstr == pstr + left.
 * right : number of characters readable starting at rstr (must be >= 1).
 *
 * Returns
 *   -1 : do not break here (also returned for invalid arguments, or when
 *        rstr[0] is not a Thai character of a class in mask A);
 *    0 : break at the candidate position;
 *    1, 2 : break that many characters to the right of the candidate
 *        position (TrbFollowing adds the value to its cursor).
 */
int TrbWordBreakPos(const th_char* pstr, int left, const th_char* rstr,
                    int right) {
  const th_char* lstr;
  th_char _c[6];
  twb_t _t[6];
  /* c(i)/t(i): character and class flags at offset i (-3..2) from the
   * candidate position; offset 0 is rstr[0], offset -1 is the character
   * immediately before it. */
#define c(i) (_c[(i) + 3])
#define t(i) (_t[(i) + 3])
  int i, j;

  if (left < 0) return -1;
  if (right < 1) return -1;
  /* Only form the left-context pointer once `left` is known to be sane. */
  lstr = pstr + left;

  /*
  // get c(0), t(0)
  */
  c(0) = rstr[0]; /* may be '\0' */
  if (!th_isthai(c(0))) return -1;
  t(0) = twbtype(c(0));
  if (!(t(0) & A)) return -1;

  /*
  // get c(-1), t(-1)
  */
  if (left >= 1) {
    c(-1) = lstr[-1];
    if (!th_isthai(c(-1))) return 0;
    t(-1) = twbtype(c(-1));
    if (!(t(-1) & A)) return 0; /* handle punctuation marks here */
  } else {
    c(-1) = 0;
    t(-1) = 0;
  }

  /*
  // get c(1..2), t(1..2)
  //
  // The first character that is not Thai, or has no class in A, ends the
  // right context: `right` is shrunk to that offset and the same slot is
  // revisited (i-- cancels the loop's i++) so it gets zero-filled.
  */
  for (i = 1; i <= 2; i++) {
    if (i >= right) {
      c(i) = 0;
      t(i) = 0;
    } else {
      c(i) = rstr[i]; /* may be '\0'; */
      if (!th_isthai(c(i))) {
        right = i--;
      } else {
        t(i) = twbtype(c(i));
        if (!(t(i) & A)) right = i--;
      }
    }
  }

  /*
  // get c(-2..-3), t(-2..-3)
  //
  // j walks the input to the left, i is the slot being filled. A character
  // that is not Thai / not in A ends the left context: left is set to 0 so
  // that every later iteration takes the zero-fill branch.
  */
  for (i = -2, j = -2; i >= -3; j--) {
    if (j < -left) {
      c(i) = 0;
      t(i) = 0;
      i--;
    } else {
      c(i) = lstr[j];
      if (!th_isthai(c(i))) {
        left = 0;
      } else {
        t(i) = twbtype(c(i));
        if (!(t(i) & A)) {
          left = 0;
        } else if ((t(i + 1) & MT) && ((t(i) & VR) || (t(i + 2) & VR))) {
          /* Slot i+1 holds an MT mark next to a VR vowel: drop the mark by
           * overwriting its slot with this character. i is not advanced, so
           * slot i is refilled from the next character to the left. */
          c(i + 1) = c(i);
          t(i + 1) = t(i);
        } else {
          i--;
        }
      }
    }
  }

  /*
  // prohibit the unlikely
  */
  if ((t(-1) & C) && (t(0) & C)) {
    if ((t(-1) & CHE) || (t(0) & CHB)) return -1;
  }

  /*
  // special case : vlao, C, sara_a|aa / !sara_a
  //
  // Both vowel tests apply to c(-1). The previous code compared c(-0), i.e.
  // c(0), against TH_SARA_AA, which could report a break directly in front
  // of SARA AA -- a position the "prohibit break" rule below rejects.
  */
  if ((t(-3) & (VLA | VLO)) && (t(-2) & C) && (c(0) != TH_SARA_A) &&
      (c(-1) == TH_SARA_A || c(-1) == TH_SARA_AA))
    return 0;

  /*
  // prohibit break
  */
  if (t(0) & NB) return -1;
  if (t(-1) & NE) return -1;

  /*
  // apply 100% rules
  */
  if (t(-1) & VRE) {
    if (c(-2) == TH_SARA_AA && c(-1) == TH_SARA_A) return 0;
    return -1; /* usually too short syllable, part of word */
  }
  if (t(-2) & VRE) return -1;

  if ((t(0) & C) && (t(1) & (VR | MT)) &&
      (c(2) != TH_THANTHAKHAT)) { /*?C, NB */
    if ((t(-1) & (VRS | VRX)) && c(1) == TH_SARA_I) return -1; /* exception */
    if (t(-1) & (V | M)) return 0; /* !C/ C, NB */
    if (t(-2) & VRS) return 0;     /* VRS, C / C, NB */
    if (!(t(0) & C2) && c(1) == TH_SARA_I) { /* / !C2 or /c, sara_i */
      if (t(-2) & VRX) return 0; /* VRX, C / C, NB ? 100%? */
      if (t(-2) & VC) return 0;  /* VC, C / C, NB ? 100% */
    }
  }
  if ((t(-1) & VRX) && (t(0) & CC)) return 0; /* VRX/ CC */
  if ((t(-2) & VRS) && (t(-1) & C) && (t(0) & (V | M)))
    return 0; /* VRS, C/ !C */

  if ((t(0) & CX) && (t(1) & C2) && (c(2) != TH_THANTHAKHAT)) {
    if ((t(-2) & A) && (t(-1) & CX)) return 0;  /* A, CX / CX, C2 */
    if ((t(-2) & CX) && (t(-1) & MT)) return 0; /* CX, MT / CX, C2 */
  }

  /*
  // apply 90% rules
  */
  if (t(0) & VL) return 0;
  if (t(1) & VL) return -1;
  if (c(-1) == TH_THANTHAKHAT && c(-2) != TH_RORUA && c(-2) != TH_LOLING)
    return 0;

  /*
  // apply 80% rules
  */
  if (t(0) & CHE) {
    if ((t(-2) & VRS) && (t(-1) & C)) return 0; /* VRS, C/ CHE */
    if (t(-1) & VC) return 0;                   /* VC/ CHE */
  }
  if (t(-1) & CHB) {
    if ((t(0) & C) && (t(1) & VR)) return 0; /* CHB/ CC, VR */
    if (t(0) & VC) return 0;                 /* CHB/ VC */
  }

  if ((t(-2) & VL) && (t(1) & VR)) { /* VL, C? C, VR */
    if (t(-2) & VLI) {
      return 0; /* VLI,C/C,VR .*/
    } else { /* vlao, C ? C , VR */
      if (c(1) == TH_SARA_A) return 2; /* vlao, C, C, sara_a/ */
      if (t(-2) & VLO) return 0;       /* VLO, C/ C, !sara_a */
      if (!(t(1) & VRA)) return 0;     /* VLA, C/ C, !vca */
    }
  }

  /* C,MT,C */
  if ((t(-2) & C) && (t(-1) & MT) && (t(0) & CX)) return 1;

  return -1;
}
int TrbFollowing(const th_char* begin, int length, int offset)
/*
//(ThBreakIterator *this, int offset)
*/
{
const th_char* w = begin + offset;
const th_char* end = begin + length;
while (w < end && *w && !th_isthai(*w) && th_isspace(*w)) w++;
if (w < end && *w && !th_isthai(*w)) {
int english = FALSE;
while (w < end && *w && !th_isthai(*w) && !th_isspace(*w)) {
if (th_isalpha(*w)) english = TRUE;
w++;
}
if (english || w == end || (!th_isthai(*w) && th_isspace(*w)))
return w - begin;
}
if (w == end || *w == 0 || !th_isthai(*w)) return w - begin;
w++;
if (w < end && *w && th_isthai(*w)) {
int brk = TrbWordBreakPos(begin, w - begin, w, end - w);
while (brk < 0) {
w++;
if (w == end || *w == 0 || !th_isthai(*w)) break;
brk = TrbWordBreakPos(begin, w - begin, w, end - w);
}
if (brk > 0) w += brk;
}
if (w < end && *w && !th_isthai(*w)) {
while (w < end && *w && !th_isthai(*w) && !th_isalpha(*w) &&
!th_isspace(*w))
w++;
}
return w - begin;
}
/*
/////////////////////////////////////////////////
*/
/*
 * Class-flag table consulted through the twbtype() macro, which indexes it
 * with th_zcode(c). It holds 0x60 entries; the label in each row comment is
 * the hexadecimal code (a0..ff) that the row stands for, so row "a0" is
 * index 0.
 * NOTE(review): the labels look like byte values of an 8-bit Thai code page,
 * with th_zcode() mapping a character onto (label - 0xa0); th_zcode() lives
 * in th_char.h and is not visible here -- confirm.
 *
 * An entry of 0, or of T alone, has no bit of mask A, so TrbWordBreakPos
 * treats that character like non-Thai text.
 */
const twb_t _TwbType[0x100 - 0xa0] = {
#if 0
/* 80 */ T,
/* 81-8f */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
/* 90 */ T,
/* 91-9f */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
#endif
/* a0 */ 0,
/* a1 */ CS,
/* a2 */ CS | CHE,
/* a3 */ CC | CHE,
/* a4 */ CS | CHE,
/* a5 */ CC | CHE,
/* a6 */ CS,
/* a7 */ CS | CHB,
/* a8 */ CS,
/* a9 */ CC | CHE,
/* aa */ CS,
/* ab */ CC | CHE,
/* ac */ CC | CHB | CHE,
/* ad */ CS | CHB,
/* ae */ CS | CHB,
/* af */ CS | CHB,
/* b0 */ CS,
/* b1 */ CS | CHB | CHE,
/* b2 */ CS | CHB | CHE,
/* b3 */ CS | CHB,
/* b4 */ CS,
/* b5 */ CS,
/* b6 */ CS,
/* b7 */ CS,
/* b8 */ CS,
/* b9 */ CS,
/* ba */ CS,
/* bb */ CS,
/* bc */ CC | CHE,
/* bd */ CC | CHE,
/* be */ CS,
/* bf */ CS,
/* c0 */ CS | CHE,
/* c1 */ CS,
/* c2 */ CS,
/* c3 */ CS | C2 | CHE, /* ? add CHE */
/* c4 */ VC | CHE,
/* c5 */ CS | C2,
/* c6 */ VC | CHE,
/* c7 */ VC | C2,
/* c8 */ CS,
/* c9 */ CS | CHB,
/* ca */ CS | CHE,
/* cb */ CC | CHE,
/* cc */ CS | CHB | CHE,
/* cd */ VC,
/* ce */ CC | CHE,
/* cf */ T,
/* d0 */ VRE | VRA,
/* d1 */ VRS,
/* d2 */ VRX | VRA,
/* d3 */ VRE,
/* d4 */ VRX | VRA,
/* d5 */ VRX | VRA,
/* d6 */ VRS,
/* d7 */ VRS | VRA,
/* d8 */ VRX,
/* d9 */ VRX,
/* da */ T,
/* db */ 0,
/* dc */ 0,
/* dd */ 0,
/* de */ 0,
/* df */ T,
/* e0 */ VLA,
/* e1 */ VLO,
/* e2 */ VLO,
/* e3 */ VLI,
/* e4 */ VLI,
/* e5 */ VRE,
/* e6 */ M,
/* e7 */ M,
/* e8 */ M | MT,
/* e9 */ M | MT,
/* ea */ M | MT,
/* eb */ M | MT,
/* ec */ M,
/* ed */ T,
/* ee */ T,
/* ef */ T,
/* f0 */ T,
/* f1 */ T,
/* f2 */ T,
/* f3 */ T,
/* f4 */ T,
/* f5 */ T,
/* f6 */ T,
/* f7 */ T,
/* f8 */ T,
/* f9 */ T,
/* fa */ T,
/* fb */ T,
/* fc */ 0,
/* fd */ 0,
/* fe */ 0,
/* ff */ 0};