Check in Thai rule-based line breaker (temporary until the real one). Thanks Samphan Raruenrom

This commit is contained in:
ftang%netscape.com 1999-06-07 20:22:11 +00:00
parent cbc6fd02d4
commit eb500c658e
3 changed files with 436 additions and 0 deletions

363
intl/lwbrk/src/rulebrk.c Normal file
View File

@ -0,0 +1,363 @@
/*
The contents of this file are subject to the Mozilla Public License
Version 1.0 (the "License"); you may not use this file except in
compliance with the License. You may obtain a copy of the License at
http://www.mozilla.org/MPL/
Software distributed under the License is distributed on an "AS IS"
basis, WITHOUT WARRANTY OF ANY KIND, either express or implied.
See the License for the specific language governing rights and
limitations under the License.
The Original Code is LibInThai.
The Initial Developer of the Original Code is Samphan Raruenrom.
Portions created by Samphan Raruenrom are Copyright (C) 1998
Samphan Raruenrom. All Rights Reserved.
Contributor(s):
*/
#define TH_UNICODE
#include <stdlib.h>
#include <assert.h>
#include "th_char.h"
/* Non-zero when c is an ASCII letter, 'a'..'z' or 'A'..'Z'.
   The argument is evaluated more than once, so pass no side effects. */
#define th_isalpha(c) (('a'<=(c)&&(c)<='z')||('A'<=(c)&&(c)<='Z'))
/* Non-zero when c is a blank: a space or a horizontal tab.
   The argument is evaluated more than once, so pass no side effects. */
#define th_isspace(c) (' '==(c)||'\t'==(c))
/*
/////////////////////////////////////////////////
// Thai character type array
*/
/* Bit mask of character classes for one character. The class bits defined
   below run up to 0x8000, so the type needs at least 16 bits. */
typedef unsigned short twb_t; /* platform dependent */
/* Class table with one entry per code, looked up through twbtype() as
   _TwbType[th_zcode(c)]; the definition is at the end of this file.
   NOTE(review): a leading underscore followed by an upper-case letter is a
   reserved identifier in C; a rename would have to touch the definition too. */
extern const twb_t _TwbType[0x100-0xa0];
/*
// bit definition
*/
/*
 * Class bits stored in _TwbType[]. The character lists below are read off
 * the table at the end of this file; the expansions of the abbreviations
 * are not stated anywhere in the source, so treat them as presumed.
 */
#define VRS 0x0001 /* MAIHANAKAT, SARA_UE, SARA_UEE; part of NE */
#define VRE 0x0002 /* SARA_A, SARA_AM, LAKKHANGYAO */
#define VRX 0x0004 /* SARA_AA, SARA_I, SARA_II, SARA_U, SARA_UU */
#define VRA 0x0008 /* extra flag on SARA_A, SARA_AA, SARA_I, SARA_II, SARA_UEE */
#define VLA 0x0010 /* SARA_E */
#define VLO 0x0020 /* SARA_AE, SARA_O */
#define VLI 0x0040 /* MAIMUAN, MAIMALAI */
#define VC 0x0080 /* RU, LU, WOWAEN, OANG */
#define CC 0x0100 /* one of the two consonant classes (see table) */
#define CS 0x0200 /* the other, more common, consonant class (see table) */
#define C2 0x0400 /* extra flag on RORUA, LOLING, WOWAEN */
#define CHB 0x0800 /* consonant flag: on c(0), vetoes a break between two consonants */
#define CHE 0x1000 /* consonant flag: on c(-1), vetoes a break between two consonants */
#define MT 0x2000 /* MAIEK, MAITHO, MAITRI, MAICHATTAWA */
//_#define me 0x2000
#define M 0x4000 /* MAIYAMOK through THANTHAKHAT */
#define T 0x8000 /* PAIYANNOI, PHINTHU, BAHT, NIKHAHIT..KHOMUT (incl. digits); not part of A */
/* Composite classes. */
#define VL (VLA|VLO|VLI) /* presumably vowels written to the left of the consonant */
#define VR (VRS|VRE|VRX) /* presumably vowels written to the right of / around the consonant */
#define NE (VL|VRS) /* TrbWordBreakPos refuses a break right after these */
#define NB (VR|M) /* TrbWordBreakPos refuses a break right before these */
#define V (VL|VR)
#define CX (CC|CS)
#define C (CX|VC)
#define A (C|V|M) /* everything the rules handle; other classes end the context */
/* Class mask of character c; c must satisfy th_isthai(c) so the index is in range. */
#define twbtype(c) (_TwbType[th_zcode(c)])
#ifndef TRUE
#define TRUE 1
#define FALSE 0
#endif
/* Plain return wrapper; not used in the code visible in this file. */
#define RETURN(b) return (b)
/*
/////////////////////////////////////////////////
*/
/*
 * Rule-based test for a word break at one position inside Thai text.
 *
 * pstr  : start of the text; pstr + left is the position being examined.
 * left  : number of characters available to the left of the position.
 * rstr  : pointer to the character at the position, called c(0) below.
 * right : number of characters available starting at rstr.
 *
 * Up to three characters on the left, c(-3)..c(-1), and two more on the
 * right, c(1)..c(2), are classified through _TwbType and matched against
 * the rules. Slots outside the text, or beyond a character that is not
 * Thai or not in class A, are filled with zero.
 *
 * Returns -1 when left < 0, right < 1, c(0) is not a class-A Thai
 * character, or no rule allows a break. Otherwise returns a non-negative
 * offset k (0, 1 or 2), meaning "break before rstr[k]". Returns 0 at once
 * when the character before the position is not a class-A Thai character.
 */
int TrbWordBreakPos(const th_char *pstr, int left,
                    const th_char *rstr, int right)
{
  const th_char *lstr;
  th_char _c[6];
  twb_t _t[6];
/* c(i)/t(i): character and class at relative position i, for i in -3..2. */
#define c(i) (_c[(i)+3])
#define t(i) (_t[(i)+3])
  int i, j;

  if(left < 0) return -1;
  if(right < 1) return -1;
  /* Formed only after the guard so a negative left never yields a pointer. */
  lstr = pstr + left;

  /* get c(0), t(0) */
  c(0) = rstr[0]; /* may be '\0' */
  if(!th_isthai(c(0))) return -1;
  t(0) = twbtype(c(0));
  if(!(t(0) & A)) return -1;

  /* get c(-1), t(-1) */
  if(left >= 1) {
    c(-1) = lstr[-1];
    if(!th_isthai(c(-1))) return 0;
    t(-1) = twbtype(c(-1));
    if(!(t(-1) & A)) return 0; /* handle punctuation marks here */
  } else { c(-1) = 0; t(-1) = 0; }

  /* get c(1..2), t(1..2) */
  for(i = 1; i <= 2; i++) {
    if(i >= right) { c(i) = 0; t(i) = 0; }
    else {
      c(i) = rstr[i]; /* may be '\0' */
      /* "right = i--" truncates the right context at i and repeats the
         same slot, which the next pass then zero-fills. */
      if(!th_isthai(c(i))) right = i--;
      else {
        t(i) = twbtype(c(i));
        if(!(t(i) & A)) right = i--;
      }
    }
  }

  /* get c(-2..-3), t(-2..-3); i is the slot being filled, j the text index */
  for(i = -2, j = -2; i >= -3 ; j--) {
    if(j < -left) { c(i) = 0; t(i) = 0; i--; }
    else {
      c(i) = lstr[j];
      /* Setting left to 0 makes every later pass take the zero-fill branch. */
      if(!th_isthai(c(i))) left = 0;
      else {
        t(i) = twbtype(c(i));
        if(!(t(i) & A)) left = 0;
        else {
          /* A tone mark (MT) in slot i+1 next to a VR vowel is dropped:
             this character takes its slot and slot i is refilled. */
          if((t(i+1) & MT) && ((t(i) & VR) || (t(i+2) & VR))) {
            c(i+1) = c(i); t(i+1) = t(i);
          } else i--;
        }
      }
    }
  }

  /* prohibit the unlikely */
  if((t(-1) & C) && (t(0) & C)) {
    if((t(-1) & CHE) || (t(0) & CHB)) return -1;
  }
  /* special case : vlao, C, sara_a|aa / !sara_a
     The vowel test is on c(-1). The previous code tested c(-0), i.e. c(0),
     against SARA_AA, which allowed a break directly before SARA_AA even
     though the NB rule just below forbids breaking before a VR vowel. */
  if((t(-3) & (VLA|VLO)) && (t(-2) & C) && (c(0) != TH_SARA_A) &&
     (c(-1) == TH_SARA_A || c(-1) == TH_SARA_AA)) return 0;

  /* prohibit break */
  if(t(0) & NB) return -1;
  if(t(-1) & NE) return -1;

  /* apply 100% rules */
  if(t(-1) & VRE) {
    if(c(-2) == TH_SARA_AA && c(-1) == TH_SARA_A) return 0;
    return -1; /* usually too short syllable, part of word */
  }
  if(t(-2) & VRE) return -1;
  if((t(0) & C) && (t(1) & (VR|MT)) && (c(2) != TH_THANTHAKHAT)) { /* ?C, NB */
    if((t(-1) & (VRS|VRX)) && c(1) == TH_SARA_I) return -1; /* exception */
    if(t(-1) & (V|M)) return 0; /* !C/ C, NB */
    if(t(-2) & VRS) return 0; /* VRS, C / C, NB */
    if(!(t(0) & C2) && c(1) == TH_SARA_I) { /* / !C2 or /c, sara_i */
      if(t(-2) & VRX) return 0; /* VRX, C / C, NB ? 100%? */
      if(t(-2) & VC) return 0; /* VC, C / C, NB ? 100% */
    }
  }
  if((t(-1) & VRX) && (t(0) & CC)) return 0; /* VRX/ CC */
  if((t(-2) & VRS) && (t(-1) & C) && (t(0) & (V|M))) return 0; /* VRS, C/ !C */
  if((t(0) & CX) && (t(1) & C2) && (c(2) != TH_THANTHAKHAT)) {
    if((t(-2) & A) && (t(-1) & CX)) return 0; /* A, CX / CX, C2 */
    if((t(-2) & CX) && (t(-1) & MT)) return 0; /* CX, MT / CX, C2 */
  }

  /* apply 90% rules */
  if(t(0) & VL) return 0;
  if(t(1) & VL) return -1;
  if(c(-1) == TH_THANTHAKHAT && c(-2) != TH_RORUA && c(-2) != TH_LOLING) return 0;

  /* apply 80% rules */
  if(t(0) & CHE) {
    if((t(-2) & VRS) && (t(-1) & C)) return 0; /* VRS, C/ CHE */
    /* a "VRX/ CHE" rule was tried here and left disabled by the author */
    if(t(-1) & VC) return 0; /* VC/ CHE */
  }
  if(t(-1) & CHB) {
    if((t(0) & C) && (t(1) & VR)) return 0; /* CHB/ CC, VR */
    if(t(0) & VC) return 0; /* CHB/ VC */
  }
  if((t(-2) & VL) && (t(1) & VR)) { /* VL, C? C, VR */
    if(t(-2) & VLI) return 0; /* VLI,C/C,VR */
    else { /* vlao, C ? C , VR */
      if(c(1) == TH_SARA_A) return 2; /* vlao, C, C, sara_a/ */
      if(t(-2) & VLO) return 0; /* VLO, C/ C, !sara_a */
      if(!(t(1) & VRA)) return 0; /* VLA, C/ C, !vca */
    }
  }
  /* C,MT,C/ */
  if((t(-2) & C) && (t(-1) & MT) && (t(0) & CX)) return 1;
  return -1;
}
/* The slot accessors are private to TrbWordBreakPos. */
#undef c
#undef t
/*
 * Find the break that follows `offset` in the text begin[0..length).
 *
 * Scanning also stops at a '\0' character. The steps are:
 *   1. skip blanks (space / tab);
 *   2. if a non-Thai run starts here, consume it up to the next blank or
 *      Thai character; return there when the run held an ASCII letter,
 *      hit the end of the text, or stopped at a blank;
 *   3. return when the scan is at the end, at '\0', or not on a Thai
 *      character;
 *   4. otherwise step over the first Thai character and ask
 *      TrbWordBreakPos at each following Thai character until it reports
 *      a break, adding its result when that result is positive;
 *   5. finally swallow trailing non-Thai characters that are neither
 *      ASCII letters nor blanks.
 *
 * Returns the break position as an offset from `begin`.
 */
int TrbFollowing(const th_char *begin, int length, int offset)
{
  const th_char *const limit = begin + length;
  const th_char *cur = begin + offset;

  /* A blank is never '\0' and never Thai, so th_isspace alone decides. */
  for(; cur < limit && th_isspace(*cur); cur++) {
  }

  if(cur < limit && *cur != 0 && !th_isthai(*cur)) {
    int sawLetter = FALSE;

    for(; cur < limit && *cur != 0 && !th_isthai(*cur) && !th_isspace(*cur);
        cur++) {
      if(th_isalpha(*cur)) {
        sawLetter = TRUE;
      }
    }
    if(sawLetter || cur == limit || th_isspace(*cur)) {
      return cur - begin;
    }
  }

  if(cur == limit || *cur == 0 || !th_isthai(*cur)) {
    return cur - begin;
  }

  cur++;
  if(cur < limit && *cur != 0 && th_isthai(*cur)) {
    int brk;

    for(;;) {
      brk = TrbWordBreakPos(begin, cur - begin, cur, limit - cur);
      if(brk >= 0) {
        break;
      }
      cur++;
      if(cur == limit || *cur == 0 || !th_isthai(*cur)) {
        break;
      }
    }
    if(brk > 0) {
      cur += brk;
    }
  }

  /* Trailing non-Thai characters that are neither letters nor blanks. */
  while(cur < limit && *cur != 0 &&
        !(th_isthai(*cur) || th_isalpha(*cur) || th_isspace(*cur))) {
    cur++;
  }
  return cur - begin;
}
/*
/////////////////////////////////////////////////
*/
/*
 * Class mask for every code from TH_THAIBEGIN onward, indexed by
 * th_zcode(c). Each comment gives the 8-bit code (a0..ff) followed by the
 * character's TH_CHARNAME name without the TH_ prefix; with TH_UNICODE
 * the same slot holds U+0E00 + (code - 0xa0).
 */
const twb_t _TwbType[0x100-0xa0] = {
#if 0
/* 80 */ T,
/* 81-8f */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
/* 90 */ T,
/* 91-9f */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
#endif
/* a0 THAIBEGIN */ 0,
/* a1 KOKAI */ CS,
/* a2 KHOKHAI */ CS | CHE,
/* a3 KHOKHUAT */ CC | CHE,
/* a4 KHOKHWAI */ CS | CHE,
/* a5 KHOKHON */ CC | CHE,
/* a6 KHORAKHANG */ CS,
/* a7 NGONGU */ CS | CHB,
/* a8 CHOCHAN */ CS,
/* a9 CHOCHING */ CC | CHE,
/* aa CHOCHANG */ CS,
/* ab SOSO */ CC | CHE,
/* ac CHOCHOE */ CC | CHB | CHE,
/* ad YOYING */ CS | CHB,
/* ae DOCHADA */ CS | CHB,
/* af TOPATAK */ CS | CHB,
/* b0 THOTHAN */ CS,
/* b1 THONANGMONTHO */ CS | CHB | CHE,
/* b2 THOPHUTHAO */ CS | CHB | CHE,
/* b3 NONEN */ CS | CHB,
/* b4 DODEK */ CS,
/* b5 TOTAO */ CS,
/* b6 THOTHUNG */ CS,
/* b7 THOTHAHAN */ CS,
/* b8 THOTHONG */ CS,
/* b9 NONU */ CS,
/* ba BOBAIMAI */ CS,
/* bb POPLA */ CS,
/* bc PHOPHUNG */ CC | CHE,
/* bd FOFA */ CC | CHE,
/* be PHOPHAN */ CS,
/* bf FOFAN */ CS,
/* c0 PHOSAMPHAO */ CS | CHE,
/* c1 MOMA */ CS,
/* c2 YOYAK */ CS,
/* c3 RORUA */ CS | C2 | CHE, /* ? add CHE */
/* c4 RU */ VC | CHE,
/* c5 LOLING */ CS | C2,
/* c6 LU */ VC | CHE,
/* c7 WOWAEN */ VC | C2,
/* c8 SOSALA */ CS,
/* c9 SORUSI */ CS | CHB,
/* ca SOSUA */ CS | CHE,
/* cb HOHIP */ CC | CHE,
/* cc LOCHULA */ CS | CHB | CHE,
/* cd OANG */ VC,
/* ce HONOKHUK */ CC | CHE,
/* cf PAIYANNOI */ T,
/* d0 SARA_A */ VRE | VRA,
/* d1 MAIHANAKAT */ VRS,
/* d2 SARA_AA */ VRX | VRA,
/* d3 SARA_AM */ VRE,
/* d4 SARA_I */ VRX | VRA,
/* d5 SARA_II */ VRX | VRA,
/* d6 SARA_UE */ VRS,
/* d7 SARA_UEE */ VRS | VRA,
/* d8 SARA_U */ VRX,
/* d9 SARA_UU */ VRX,
/* da PHINTHU */ T,
/* db REM_CHERNG_ */ 0,
/* dc TAC_WBRK_ */ 0,
/* dd UNDEF_DD */ 0,
/* de UNDEF_DE */ 0,
/* df BAHT */ T,
/* e0 SARA_E */ VLA,
/* e1 SARA_AE */ VLO,
/* e2 SARA_O */ VLO,
/* e3 MAIMUAN */ VLI,
/* e4 MAIMALAI */ VLI,
/* e5 LAKKHANGYAO */ VRE,
/* e6 MAIYAMOK */ M,
/* e7 MAITAIKHU */ M,
/* e8 MAIEK */ M | MT,
/* e9 MAITHO */ M | MT,
/* ea MAITRI */ M | MT,
/* eb MAICHATTAWA */ M | MT,
/* ec THANTHAKHAT */ M,
/* ed NIKHAHIT */ T,
/* ee YAMAKKAN */ T,
/* ef FONGMAN */ T,
/* f0 THAIZERO */ T,
/* f1 THAIONE */ T,
/* f2 THAITWO */ T,
/* f3 THAITHREE */ T,
/* f4 THAIFOUR */ T,
/* f5 THAIFIVE */ T,
/* f6 THAISIX */ T,
/* f7 THAISEVEN */ T,
/* f8 THAIEIGHT */ T,
/* f9 THAININE */ T,
/* fa ANGKHANKHU */ T,
/* fb KHOMUT */ T,
/* fc UNDEF_FC */ 0,
/* fd UNDEF_FD */ 0,
/* fe UNDEF_FE */ 0,
/* ff THAIEND */ 0
};

26
intl/lwbrk/src/rulebrk.h Normal file
View File

@ -0,0 +1,26 @@
/*
Copyright (c) 1999 Samphan Raruenrom <samphan@thai.com>
Permission to use, copy, modify, distribute and sell this software
and its documentation for any purpose is hereby granted without fee,
provided that the above copyright notice appear in all copies and
that both that copyright notice and this permission notice appear
in supporting documentation. Samphan Raruenrom makes no
representations about the suitability of this software for any
purpose. It is provided "as is" without express or implied warranty.
*/
/* Public interface of the rule-based Thai word breaker (rulebrk.c). */
#ifndef __RULEBRK_H__
#define __RULEBRK_H__
#include "th_char.h"
#ifdef __cplusplus
extern "C" {
#endif
/*
 * Test for a word break at the position pstr + left.
 *   pstr  : start of the text.
 *   left  : number of characters available to the left of the position.
 *   rstr  : pointer to the character at the position.
 *   right : number of characters available starting at rstr.
 * Returns -1 when no break is found here, otherwise a non-negative offset
 * k (0, 1 or 2) meaning "break before rstr[k]". The caller in rulebrk.c
 * always passes rstr == pstr + left.
 */
int TrbWordBreakPos(const th_char *pstr, int left,
const th_char *rstr, int right);
/*
 * Find the break following `offset` in the text begin[0..length); a '\0'
 * character also ends the scan. Returns the break position as an offset
 * from `begin`.
 */
int TrbFollowing(const th_char *begin, int length, int offset);
#ifdef __cplusplus
}
#endif
#endif

47
intl/lwbrk/src/th_char.h Normal file
View File

@ -0,0 +1,47 @@
/*
Copyright (c) 1999 Samphan Raruenrom <samphan@thai.com>
Permission to use, copy, modify, distribute and sell this software
and its documentation for any purpose is hereby granted without fee,
provided that the above copyright notice appear in all copies and
that both that copyright notice and this permission notice appear
in supporting documentation. Samphan Raruenrom makes no
representations about the suitability of this software for any
purpose. It is provided "as is" without express or implied warranty.
*/
#ifndef __TH_CHAR_H__
#define __TH_CHAR_H__
#include "nscore.h"
/* Unsigned 8-bit character; it is th_char when TH_UNICODE is not defined
   (presumably TIS-620 text, to be confirmed). */
typedef unsigned char tis_char;
#ifdef TH_UNICODE
/* Unicode build: th_char is PRUnichar (expected to come from nscore.h) and
   the Thai range is 0x0e00..0x0e5f inclusive. */
typedef PRUnichar th_char;
#define TH_THAIBEGIN_ 0x0e00
#define th_isthai(c) (0x0e00 <= (c) && (c) <= 0x0e5f)
#else
/* 8-bit build: every code from 0xa0 upward counts as Thai. */
typedef tis_char th_char;
#define TH_THAIBEGIN_ 0xa0
#define th_isthai(c) ((c) >= 0xa0)
#endif
/* Offset of c from the first Thai code; for a character that satisfies
   th_isthai() this lies in 0..0x5f in either build. */
#define th_zcode(c) ((c) - TH_THAIBEGIN_)
/*
 * Names for the Thai codes. The enumerators are consecutive, starting at
 * TH_THAIBEGIN (= TH_THAIBEGIN_, i.e. 0x0e00 with TH_UNICODE and 0xa0
 * otherwise) and ending at TH_THAIEND = TH_THAIBEGIN + 0x5f, so
 * th_zcode(TH_xxx) is the enumerator's position in this list.
 * The trailing-underscore and TH_UNDEF_* names fill positions that carry
 * no class bits in the _TwbType table in rulebrk.c.
 */
enum TH_CHARNAME {
TH_THAIBEGIN = TH_THAIBEGIN_,
TH_KOKAI,TH_KHOKHAI,TH_KHOKHUAT,TH_KHOKHWAI,TH_KHOKHON,TH_KHORAKHANG,
TH_NGONGU,TH_CHOCHAN,TH_CHOCHING,TH_CHOCHANG,TH_SOSO,TH_CHOCHOE,TH_YOYING,
TH_DOCHADA,TH_TOPATAK,TH_THOTHAN,TH_THONANGMONTHO,TH_THOPHUTHAO,TH_NONEN,
TH_DODEK,TH_TOTAO,TH_THOTHUNG,TH_THOTHAHAN,TH_THOTHONG,TH_NONU,TH_BOBAIMAI,
TH_POPLA,TH_PHOPHUNG,TH_FOFA,TH_PHOPHAN,TH_FOFAN,TH_PHOSAMPHAO,TH_MOMA,
TH_YOYAK,TH_RORUA,TH_RU,TH_LOLING,TH_LU,TH_WOWAEN,TH_SOSALA,TH_SORUSI,
TH_SOSUA,TH_HOHIP,TH_LOCHULA,TH_OANG,TH_HONOKHUK,TH_PAIYANNOI,TH_SARA_A,
TH_MAIHANAKAT,TH_SARA_AA,TH_SARA_AM,TH_SARA_I,TH_SARA_II,TH_SARA_UE,
TH_SARA_UEE,TH_SARA_U,TH_SARA_UU,TH_PHINTHU,TH_REM_CHERNG_,TH_TAC_WBRK_,
TH_UNDEF_DD,TH_UNDEF_DE,TH_BAHT,TH_SARA_E,TH_SARA_AE,TH_SARA_O,TH_MAIMUAN,
TH_MAIMALAI,TH_LAKKHANGYAO,TH_MAIYAMOK,TH_MAITAIKHU,TH_MAIEK,TH_MAITHO,
TH_MAITRI,TH_MAICHATTAWA,TH_THANTHAKHAT,TH_NIKHAHIT,TH_YAMAKKAN,TH_FONGMAN,
TH_THAIZERO,TH_THAIONE,TH_THAITWO,TH_THAITHREE,TH_THAIFOUR,TH_THAIFIVE,
TH_THAISIX,TH_THAISEVEN,TH_THAIEIGHT,TH_THAININE,TH_ANGKHANKHU,TH_KHOMUT,
TH_UNDEF_FC,TH_UNDEF_FD,TH_UNDEF_FE,TH_THAIEND
};
#endif