mirror of
https://github.com/mozilla/gecko-dev.git
synced 2024-11-05 08:35:26 +00:00
790694c449
--HG-- rename : intl/lwbrk/src/Makefile.in => intl/lwbrk/Makefile.in rename : intl/lwbrk/src/crashtests/416721.html => intl/lwbrk/crashtests/416721.html rename : intl/lwbrk/src/crashtests/crashtests.list => intl/lwbrk/crashtests/crashtests.list rename : intl/lwbrk/src/jisx4051class.h => intl/lwbrk/jisx4051class.h rename : intl/lwbrk/src/jisx4051pairtable.txt => intl/lwbrk/jisx4051pairtable.txt rename : intl/lwbrk/src/nsCarbonBreaker.cpp => intl/lwbrk/nsCarbonBreaker.cpp rename : intl/lwbrk/src/nsComplexBreaker.h => intl/lwbrk/nsComplexBreaker.h rename : intl/lwbrk/public/nsILineBreaker.h => intl/lwbrk/nsILineBreaker.h rename : intl/lwbrk/idl/nsISemanticUnitScanner.idl => intl/lwbrk/nsISemanticUnitScanner.idl rename : intl/lwbrk/public/nsIWordBreaker.h => intl/lwbrk/nsIWordBreaker.h rename : intl/lwbrk/src/nsJISx4051LineBreaker.cpp => intl/lwbrk/nsJISx4051LineBreaker.cpp rename : intl/lwbrk/src/nsJISx4051LineBreaker.h => intl/lwbrk/nsJISx4051LineBreaker.h rename : intl/lwbrk/public/nsLWBrkCIID.h => intl/lwbrk/nsLWBrkCIID.h rename : intl/lwbrk/src/nsPangoBreaker.cpp => intl/lwbrk/nsPangoBreaker.cpp rename : intl/lwbrk/src/nsRuleBreaker.cpp => intl/lwbrk/nsRuleBreaker.cpp rename : intl/lwbrk/src/nsSampleWordBreaker.cpp => intl/lwbrk/nsSampleWordBreaker.cpp rename : intl/lwbrk/src/nsSampleWordBreaker.h => intl/lwbrk/nsSampleWordBreaker.h rename : intl/lwbrk/src/nsSemanticUnitScanner.cpp => intl/lwbrk/nsSemanticUnitScanner.cpp rename : intl/lwbrk/src/nsSemanticUnitScanner.h => intl/lwbrk/nsSemanticUnitScanner.h rename : intl/lwbrk/src/nsUniscribeBreaker.cpp => intl/lwbrk/nsUniscribeBreaker.cpp rename : intl/lwbrk/src/rulebrk.c => intl/lwbrk/rulebrk.c rename : intl/lwbrk/src/rulebrk.h => intl/lwbrk/rulebrk.h rename : intl/lwbrk/src/th_char.h => intl/lwbrk/th_char.h
376 lines
7.9 KiB
C
376 lines
7.9 KiB
C
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#define TH_UNICODE

#include <stdlib.h>
#include <assert.h>
#include "th_char.h"

/* ASCII-only helpers used while scanning the non-Thai parts of the text. */
#define th_isalpha(c) (((c)>='a'&&(c)<='z')||((c)>='A'&&(c)<='Z'))
#define th_isspace(c) ((c)==' '||(c)=='\t')

/*
 * Thai character type array
 *
 * Each entry of _TwbType is a bit set built from the flags below.  The table
 * is indexed through twbtype(), i.e. by th_zcode() of the character.
 */
typedef unsigned short twb_t;
extern const twb_t _TwbType[0x100-0xa0];

/*
 * Bit definitions
 *
 * NOTE(review): judging by the table entries that carry them, the V*-prefixed
 * bits classify vowels, the C*-prefixed bits classify consonants and M/MT
 * classify marks -- confirm against the original rule description.
 */
#define VRS 0x0001
#define VRE 0x0002
#define VRX 0x0004

#define VRA 0x0008

#define VLA 0x0010
#define VLO 0x0020
#define VLI 0x0040

#define VC 0x0080

#define CC 0x0100
#define CS 0x0200

#define C2 0x0400
#define CHB 0x0800
#define CHE 0x1000

#define MT 0x2000
/* 0x2000 was once also considered for a (disabled) "me" flag. */
#define M 0x4000

#define T 0x8000

/* Composite masks. */
#define VL (VLA|VLO|VLI)
#define VR (VRS|VRE|VRX)
#define NE (VL|VRS)   /* TrbWordBreakPos refuses a break right after these  */
#define NB (VR|M)     /* TrbWordBreakPos refuses a break right before these */
#define V (VL|VR)
#define CX (CC|CS)
#define C (CX|VC)
#define A (C|V|M)     /* characters the break rules reason about at all     */

/* Type bits of character c; c must be accepted by th_zcode(). */
#define twbtype(c) (_TwbType[th_zcode(c)])

#ifndef TRUE
#define TRUE 1
#define FALSE 0
#endif
#define RETURN(b) return (b)

/*
|
||
/////////////////////////////////////////////////
|
||
*/
|
||
|
||
int TrbWordBreakPos(const th_char *pstr, int left,
|
||
const th_char *rstr, int right)
|
||
/* const ThBreakIterator *it, const th_char **p)*/
|
||
{
|
||
/*
|
||
//int left, right;
|
||
//const th_char *s = *p;
|
||
*/
|
||
const th_char *lstr = pstr + left;
|
||
th_char _c[6];
|
||
twb_t _t[6];
|
||
#define c(i) (_c[(i)+3])
|
||
#define t(i) (_t[(i)+3])
|
||
int i, j;
|
||
|
||
/*
|
||
//left = s - it->begin;
|
||
*/
|
||
if(left < 0) return -1;
|
||
/*
|
||
//right = (it->end == NULL) ? 4 : it->begin - s;
|
||
*/
|
||
if(right < 1) return -1;
|
||
|
||
/*
|
||
// get c(0), t(0)
|
||
*/
|
||
c(0) = rstr[0]; /* may be '\0' */
|
||
if(!th_isthai(c(0))) return -1;
|
||
t(0) = twbtype(c(0));
|
||
if(!(t(0) & A)) return -1;
|
||
|
||
/*
|
||
// get c(-1), t(-1)
|
||
*/
|
||
if(left >= 1) {
|
||
c(-1) = lstr[-1];
|
||
if(!th_isthai(c(-1))) return 0;
|
||
t(-1) = twbtype(c(-1));
|
||
if(!(t(-1) & A)) return 0; /* handle punctuation marks here */
|
||
} else { c(-1) = 0; t(-1) = 0; }
|
||
|
||
/*
|
||
// get c(1..2), t(1..2)
|
||
*/
|
||
for(i = 1; i <= 2; i++) {
|
||
if(i >= right) { c(i) = 0; t(i) = 0; }
|
||
else {
|
||
c(i) = rstr[i]; /* may be '\0'; */
|
||
if(!th_isthai(c(i))) right = i--;
|
||
else {
|
||
t(i) = twbtype(c(i));
|
||
if(!(t(i) & A)) right = i--;
|
||
}
|
||
}
|
||
}
|
||
/*
|
||
// get c(-2..-3), t(-2..-3)
|
||
*/
|
||
for(i = -2, j = -2; i >= -3 ; j--) {
|
||
if(j < -left) { c(i) = 0; t(i) = 0; i--; }
|
||
else {
|
||
c(i) = lstr[j];
|
||
if(!th_isthai(c(i))) left = 0;
|
||
else {
|
||
t(i) = (twb_t)(th_isthai(c(i)) ? twbtype(c(i)) : 0);
|
||
if(!(t(i) & A)) left = 0;
|
||
else {
|
||
if((t(i+1) & MT) && ((t(i) & VR) || (t(i+2) & VR))) {
|
||
c(i+1) = c(i); t(i+1) = t(i);
|
||
} else i--;
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
/*
|
||
// prohibit the unlikely
|
||
*/
|
||
if((t(-1) & C) && (t(0) & C)) {
|
||
if((t(-1) & CHE) || (t(0) & CHB)) return -1;
|
||
}
|
||
/*
|
||
// special case : vlao, C/ sara_a|aa, !sara_a
|
||
*/
|
||
if((t(-3) & (VLA|VLO)) && (t(-2) & C) && (c(0) != TH_SARA_A) &&
|
||
(c(-1) == TH_SARA_A || c(-0) == TH_SARA_AA)) return 0;
|
||
|
||
/*
|
||
// prohibit break
|
||
*/
|
||
if(t(0) & NB) return -1;
|
||
if(t(-1) & NE) return -1;
|
||
|
||
|
||
/*
|
||
// apply 100% rules
|
||
*/
|
||
if(t(-1) & VRE) {
|
||
if(c(-2) == TH_SARA_AA && c(-1) == TH_SARA_A) return 0;
|
||
return -1; /* usually too short syllable, part of word */
|
||
}
|
||
|
||
if(t(-2) & VRE) return -1;
|
||
|
||
if((t(0) & C) && (t(1) & (VR|MT)) && (c(2) != TH_THANTHAKHAT)) { /*?C, NB */
|
||
if((t(-1) & (VRS|VRX)) && c(1) == TH_SARA_I) return -1; /* exception */
|
||
if(t(-1) & (V|M)) return 0; /* !C/ C, NB */
|
||
if(t(-2) & VRS) return 0; /* VRS, C / C, NB */
|
||
if(!(t(0) & C2) && c(1) == TH_SARA_I) { /* / !C2 or /c, sara_i */
|
||
if(t(-2) & VRX) return 0; /* VRX, C / C, NB ? 100%? */
|
||
if(t(-2) & VC) return 0; /* VC, C / C, NB ? 100% */
|
||
}
|
||
}
|
||
if((t(-1) & VRX) && (t(0) & CC)) return 0; /* VRX/ CC */
|
||
if((t(-2) & VRS) && (t(-1) & C) && (t(0) & (V|M))) return 0;/* VRS, C/ !C */
|
||
|
||
|
||
if((t(0) & CX) && (t(1) & C2) && (c(2) != TH_THANTHAKHAT)) {
|
||
if((t(-2) & A) && (t(-1) & CX)) return 0; /* A, CX / CX, C2 */
|
||
if((t(-2) & CX) && (t(-1) & MT)) return 0; /* CX, MT / CX, C2 */
|
||
}
|
||
/*
|
||
// apply 90% rules
|
||
*/
|
||
if(t(0) & VL) return 0;
|
||
if(t(1) & VL) return -1;
|
||
if(c(-1) == TH_THANTHAKHAT && c(-2) != TH_RORUA && c(-2) != TH_LOLING) return 0;
|
||
|
||
/*
|
||
//return -1;
|
||
// apply 80% rules
|
||
*/
|
||
if(t(0) & CHE) {
|
||
if((t(-2) & VRS) && (t(-1) & C)) return 0; /* VRS, C/ CHE */
|
||
/*if(t(-1) & VRX) return 0; // VRX/ CHE */
|
||
if(t(-1) & VC) return 0; /* VC/ CHE */
|
||
}
|
||
if(t(-1) & CHB) {
|
||
if((t(0) & C) && (t(1) & VR)) return 0; /* CHB/ CC, VR */
|
||
if(t(0) & VC) return 0; /* CHB/ VC */
|
||
}
|
||
|
||
if((t(-2) & VL) && (t(1) & VR)) { /* VL, C? C, VR */
|
||
if(t(-2) & VLI) return 0; /* VLI,C/C,VR .*/
|
||
else { /* vlao, C ? C , VR */
|
||
if(c(1) == TH_SARA_A) return 2; /* vlao, C, C, sara_a/ */
|
||
if(t(-2) & VLO) return 0; /* VLO, C/ C, !sara_a */
|
||
if(!(t(1) & VRA)) return 0; /* VLA, C/ C, !vca */
|
||
}
|
||
}
|
||
/* C,MT,C */
|
||
if((t(-2) & C) && (t(-1) & MT) && (t(0) & CX)) return 1;
|
||
|
||
return -1;
|
||
}
|
||
|
||
|
||
int TrbFollowing(const th_char *begin, int length, int offset)
|
||
/*
|
||
//(ThBreakIterator *this, int offset)
|
||
*/
|
||
{
|
||
const th_char *w = begin + offset;
|
||
const th_char *end = begin + length;
|
||
while(w < end && *w && !th_isthai(*w) && th_isspace(*w)) w++;
|
||
|
||
if(w < end && *w && !th_isthai(*w)) {
|
||
int english = FALSE;
|
||
while(w < end && *w && !th_isthai(*w) && !th_isspace(*w)) {
|
||
if(th_isalpha(*w)) english = TRUE;
|
||
w++;
|
||
}
|
||
if(english || w == end ||
|
||
(!th_isthai(*w) && th_isspace(*w))) return w - begin;
|
||
}
|
||
if(w == end || *w == 0 || !th_isthai(*w)) return w - begin;
|
||
w++;
|
||
if(w < end && *w && th_isthai(*w)) {
|
||
int brk = TrbWordBreakPos(begin, w-begin, w, end-w);
|
||
while (brk < 0) {
|
||
w++;
|
||
if(w == end || *w == 0 || !th_isthai(*w)) break;
|
||
brk = TrbWordBreakPos(begin, w-begin, w, end-w);
|
||
}
|
||
if (brk > 0) w += brk;
|
||
}
|
||
if(w < end && *w && !th_isthai(*w)) {
|
||
while(w < end && *w && !th_isthai(*w) &&
|
||
!th_isalpha(*w) && !th_isspace(*w)) w++;
|
||
}
|
||
return w - begin;
|
||
}
|
||
|
||
|
||
/*
|
||
/////////////////////////////////////////////////
|
||
*/
|
||
const twb_t _TwbType[0x100-0xa0] = {
|
||
#if 0
|
||
/* 80 <20> */ T,
|
||
/* 81-8f */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||
/* 90 <20> */ T,
|
||
/* 91-9f */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||
#endif
|
||
/* a0 <20> */ 0,
|
||
/* a1 <20> */ CS,
|
||
/* a2 <20> */ CS | CHE,
|
||
/* a3 <20> */ CC | CHE,
|
||
/* a4 <20> */ CS | CHE,
|
||
/* a5 <20> */ CC | CHE,
|
||
/* a6 <20> */ CS,
|
||
/* a7 <20> */ CS | CHB,
|
||
/* a8 <20> */ CS,
|
||
/* a9 <20> */ CC | CHE,
|
||
/* aa <20> */ CS,
|
||
/* ab <20> */ CC | CHE,
|
||
/* ac <20> */ CC | CHB | CHE,
|
||
/* ad <20> */ CS | CHB,
|
||
/* ae <20> */ CS | CHB,
|
||
/* af <20> */ CS | CHB,
|
||
/* b0 <20> */ CS,
|
||
/* b1 <20> */ CS | CHB | CHE,
|
||
/* b2 <20> */ CS | CHB | CHE,
|
||
/* b3 <20> */ CS | CHB,
|
||
/* b4 <20> */ CS,
|
||
/* b5 <20> */ CS,
|
||
/* b6 <20> */ CS,
|
||
/* b7 <20> */ CS,
|
||
/* b8 <20> */ CS,
|
||
/* b9 <20> */ CS,
|
||
/* ba <20> */ CS,
|
||
/* bb <20> */ CS,
|
||
/* bc <20> */ CC | CHE,
|
||
/* bd <20> */ CC | CHE,
|
||
/* be <20> */ CS,
|
||
/* bf <20> */ CS,
|
||
/* c0 <20> */ CS | CHE,
|
||
/* c1 <20> */ CS,
|
||
/* c2 <20> */ CS,
|
||
/* c3 <20> */ CS | C2 | CHE, /* ? add CHE */
|
||
/* c4 <20> */ VC | CHE,
|
||
/* c5 <20> */ CS | C2,
|
||
/* c6 <20> */ VC | CHE,
|
||
/* c7 <20> */ VC | C2,
|
||
/* c8 <20> */ CS,
|
||
/* c9 <20> */ CS | CHB,
|
||
/* ca <20> */ CS | CHE,
|
||
/* cb <20> */ CC | CHE,
|
||
/* CC <20> */ CS | CHB | CHE,
|
||
/* cd <20> */ VC,
|
||
/* ce <20> */ CC | CHE,
|
||
/* cf <20> */ T,
|
||
/* d0 <20> */ VRE | VRA,
|
||
/* d1 <20> */ VRS,
|
||
/* d2 <20> */ VRX | VRA,
|
||
/* d3 <20> */ VRE,
|
||
/* d4 <20> */ VRX | VRA,
|
||
/* d5 <20> */ VRX | VRA,
|
||
/* d6 <20> */ VRS,
|
||
/* d7 <20> */ VRS | VRA,
|
||
/* d8 <20> */ VRX,
|
||
/* d9 <20> */ VRX,
|
||
/* da <20> */ T,
|
||
/* db <20> */ 0,
|
||
/* dc <20> */ 0,
|
||
/* dd <20> */ 0,
|
||
/* de <20> */ 0,
|
||
/* df <20> */ T,
|
||
/* e0 <20> */ VLA,
|
||
/* e1 <20> */ VLO,
|
||
/* e2 <20> */ VLO,
|
||
/* e3 <20> */ VLI,
|
||
/* e4 <20> */ VLI,
|
||
/* e5 <20> */ VRE,
|
||
/* e6 <20> */ M,
|
||
/* e7 <20> */ M,
|
||
/* e8 <20> */ M | MT,
|
||
/* e9 <20> */ M | MT,
|
||
/* ea <20> */ M | MT,
|
||
/* eb <20> */ M | MT,
|
||
/* ec <20> */ M,
|
||
/* ed <20> */ T,
|
||
/* ee <20> */ T,
|
||
/* ef <20> */ T,
|
||
/* f0 <20> */ T,
|
||
/* f1 <20> */ T,
|
||
/* f2 <20> */ T,
|
||
/* f3 <20> */ T,
|
||
/* f4 <20> */ T,
|
||
/* f5 <20> */ T,
|
||
/* f6 <20> */ T,
|
||
/* f7 <20> */ T,
|
||
/* f8 <20> */ T,
|
||
/* f9 <20> */ T,
|
||
/* fa <20> */ T,
|
||
/* fb <20> */ T,
|
||
/* fc <20> */ 0,
|
||
/* fd <20> */ 0,
|
||
/* fe <20> */ 0,
|
||
/* ff <20> */ 0
|
||
};
|