mirror of
https://github.com/mozilla/gecko-dev.git
synced 2024-11-25 13:51:41 +00:00
515 lines
12 KiB
C
515 lines
12 KiB
C
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*-
 *
 * The contents of this file are subject to the Netscape Public License
 * Version 1.0 (the "NPL"); you may not use this file except in
 * compliance with the NPL.  You may obtain a copy of the NPL at
 * http://www.mozilla.org/NPL/
 *
 * Software distributed under the NPL is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the NPL
 * for the specific language governing rights and limitations under the
 * NPL.
 *
 * The Initial Developer of this code under the NPL is Netscape
 * Communications Corporation.  Portions created by Netscape are
 * Copyright (C) 1998 Netscape Communications Corporation.  All Rights
 * Reserved.
 */
|
|
/* kinsukof.c */
|
|
|
|
#include "intlpriv.h"
|
|
|
|
/*
 * Line-breaking prohibition tables, defined in kinsukod.c.
 *
 * There is one ProhibitBegin_xxx / ProhibitEnd_xxx pair per supported
 * charset.  Each table is an array of strings that ends with an entry
 * whose first byte is NUL.
 */

/* Japanese */
extern const char *ProhibitBegin_SJIS[];
extern const char *ProhibitEnd_SJIS[];
extern const char *ProhibitBegin_EUCJP[];
extern const char *ProhibitEnd_EUCJP[];

/* Chinese */
extern const char *ProhibitBegin_BIG5[];
extern const char *ProhibitEnd_BIG5[];
extern const char *ProhibitBegin_GB[];
extern const char *ProhibitEnd_GB[];
extern const char *ProhibitBegin_CNS[];
extern const char *ProhibitEnd_CNS[];

/* Korean */
extern const char *ProhibitBegin_KSC[];
extern const char *ProhibitEnd_KSC[];

/* Unicode */
extern const char *ProhibitBegin_UTF8[];
extern const char *ProhibitEnd_UTF8[];
|
|
|
|
/*
 * Return the non-breaking space as a NUL-terminated, one-byte string.
 *
 * The byte is 0x07 when XP_MAC is defined and 0xA0 otherwise.  The
 * win_csid argument is accepted but not consulted.
 */
PUBLIC const char *INTL_NonBreakingSpace(uint16 win_csid)
{
#if defined(XP_MAC)
    const char *nbsp = "\07";   /* 0x07 */
#else
    const char *nbsp = "\240";  /* 0xA0 */
#endif

    return nbsp;
}
|
|
/*
|
|
INTL_CharClass is used for multibyte to divide character to different type
|
|
*/
|
|
#define IN_BETWEEN(a,b,c) (((a) <= (b)) && ((b) <= (c)))
|
|
PUBLIC int
|
|
INTL_CharClass(int charset, unsigned char *pstr)
|
|
{
|
|
int c1, c2, c3;
|
|
|
|
c1 = *pstr;
|
|
|
|
switch (charset)
|
|
{
|
|
case CS_SJIS:
|
|
/*
|
|
SEVEN_BIT_CHAR: [0x00-0x7F]
|
|
HALFWIDTH_PRONOUNCE_CHAR: [0xA0-0xE0]
|
|
FULLWIDTH_ASCII_CHAR: [0x82] [0x60-0x9A]
|
|
[0x83] [0x9f-0xB6] ( Really no ASCII but Greek and Cyrillic )
|
|
[0x83] [0xBF-0x8F]
|
|
[0x84] [0x40-0x60]
|
|
[0x84] [0x70-0x8F]
|
|
FULLWIDTH_PRONOUNCE_CHAR: [0x82] [0x9F-0xF1]
|
|
[0x83] [0x40-0x96]
|
|
[0x81] [0x5B-0x5D]
|
|
KANJI_CHAR: [0x88-0xFC] [xxxxxxxxx] (Except above)
|
|
|
|
Note: We count Cyrillic and Greek as FULLWIDTH_ASCII_CHAR
|
|
|
|
*/
|
|
if (c1 < 0x80)
|
|
return SEVEN_BIT_CHAR;
|
|
|
|
if (IN_BETWEEN(0xA0, c1, 0xE0))
|
|
return HALFWIDTH_PRONOUNCE_CHAR;
|
|
|
|
c2 = *(pstr + 1);
|
|
|
|
switch(c1)
|
|
{
|
|
case 0x81:
|
|
if(IN_BETWEEN(0x5B, c2, 0x5D))
|
|
return FULLWIDTH_PRONOUNCE_CHAR;
|
|
break;
|
|
|
|
case 0x82:
|
|
if(IN_BETWEEN(0x60, c2, 0x9A))
|
|
return FULLWIDTH_ASCII_CHAR;
|
|
|
|
if(IN_BETWEEN(0x9F, c2, 0xF1))
|
|
return FULLWIDTH_PRONOUNCE_CHAR;
|
|
break;
|
|
|
|
case 0x83:
|
|
if(IN_BETWEEN(0x9F, c2, 0xB6) || IN_BETWEEN(0xBF, c2, 0xD0))
|
|
return FULLWIDTH_ASCII_CHAR;
|
|
|
|
if(IN_BETWEEN(0x40, c2, 0x96))
|
|
return FULLWIDTH_PRONOUNCE_CHAR;
|
|
break;
|
|
|
|
case 0x84:
|
|
if(IN_BETWEEN(0x40, c2, 0x8F) || IN_BETWEEN(0xBF, c2, 0xD0))
|
|
return FULLWIDTH_ASCII_CHAR;
|
|
break;
|
|
}
|
|
|
|
|
|
if (IN_BETWEEN(0x88, c1, 0xFC))
|
|
return KANJI_CHAR;
|
|
|
|
return UNCLASSIFIED_CHAR;
|
|
|
|
case CS_EUCJP: /* TO BE TEST ON UNIX */
|
|
/*
|
|
SEVEN_BIT_CHAR: [0x00-0x7F]
|
|
HALFWIDTH_PRONOUNCE_CHAR: [0x8E]
|
|
FULLWIDTH_ASCII_CHAR: [0xA3] [0xC1-0xDA]
|
|
[0xE1-0xFA]
|
|
[0xA6] [0xA1-0xB8]
|
|
[0xC1-0xD8]
|
|
[0xA7] [0xA1-0xC1]
|
|
[0xD1-0xF1]
|
|
[0x8F] [0xA6-0xAF]
|
|
FULLWIDTH_PRONOUNCE_CHAR: [0xA4] [xxxxxxx]
|
|
[0xA5] [xxxxxxx]
|
|
[0x81] [0x5B-0x5D]
|
|
KANJI_CHAR: [0xB0-0xFF] [xxxx]
|
|
[0x8F] [>0xB0]
|
|
|
|
Note: We count Cyrillic and Greek as FULLWIDTH_ASCII_CHAR
|
|
|
|
*/
|
|
if (c1 < 0x80)
|
|
return SEVEN_BIT_CHAR;
|
|
|
|
c2 = *(pstr + 1);
|
|
|
|
switch(c1)
|
|
{
|
|
case 0x8E:
|
|
return HALFWIDTH_PRONOUNCE_CHAR;
|
|
|
|
case 0x8F:
|
|
if(IN_BETWEEN(0xA6, c2, 0xAF))
|
|
return FULLWIDTH_ASCII_CHAR;
|
|
break;
|
|
|
|
case 0xA3:
|
|
if(IN_BETWEEN(0xC1, c2, 0xDA) || IN_BETWEEN(0xE1, c2, 0xFA))
|
|
return FULLWIDTH_ASCII_CHAR;
|
|
break;
|
|
|
|
case 0xA4: case 0xA5:
|
|
return FULLWIDTH_PRONOUNCE_CHAR;
|
|
|
|
case 0xA6:
|
|
if(IN_BETWEEN(0xA1, c2, 0xB8) || IN_BETWEEN(0xC1, c2, 0xD8))
|
|
return FULLWIDTH_ASCII_CHAR;
|
|
break;
|
|
|
|
case 0xA7:
|
|
if(IN_BETWEEN(0xA1, c2, 0xC1) || IN_BETWEEN(0xD1, c2, 0xF1))
|
|
return FULLWIDTH_ASCII_CHAR;
|
|
break;
|
|
}
|
|
|
|
|
|
if(
|
|
(c1 >= 0xB0) ||
|
|
((c1 == 0x8F) && (c2 > 0xB0))
|
|
)
|
|
{
|
|
return KANJI_CHAR;
|
|
}
|
|
|
|
return UNCLASSIFIED_CHAR;
|
|
|
|
case CS_KSC_8BIT:
|
|
/*
|
|
SEVEN_BIT_CHAR: [0x00-0x80]
|
|
HALFWIDTH_PRONOUNCE_CHAR: None
|
|
FULLWIDTH_ASCII_CHAR: [0xA3] [0xC1-0xDA]
|
|
[0xE1-0xFA]
|
|
[0xA5] [0xC1-0xD8]
|
|
[0xE1-0xF8]
|
|
[0xAC] [0xA1-0xC2]
|
|
[0xD1-0xF2]
|
|
FULLWIDTH_PRONOUNCE_CHAR: [0xA4] [0xA1-0xFE]
|
|
[0xB0-0xC8] [xxxxxxxxx]
|
|
KANJI_CHAR: [0xCA-0xFD] [xxxxxxxxx]
|
|
|
|
Note: We didn't handle Hiragana and Katakana here
|
|
We count Cyrillic and Greek as FULLWIDTH_ASCII_CHAR
|
|
|
|
*/
|
|
if (c1 < 0x80)
|
|
return SEVEN_BIT_CHAR;
|
|
|
|
c2 = *(pstr + 1);
|
|
if (
|
|
((c1== 0xA3) && (IN_BETWEEN(0xC1, c2, 0xDA) || IN_BETWEEN(0xE1, c2, 0xFA))) ||
|
|
((c1== 0xA5) && (IN_BETWEEN(0xC1, c2, 0xD8) || IN_BETWEEN(0xE1, c2, 0xF8))) ||
|
|
((c1== 0xAC) && (IN_BETWEEN(0xA1, c2, 0xC2) || IN_BETWEEN(0xD1, c2, 0xF2)))
|
|
)
|
|
{
|
|
return FULLWIDTH_ASCII_CHAR;
|
|
}
|
|
|
|
if (
|
|
((c1== 0xA4) && (IN_BETWEEN(0xA1, c2, 0xFE))) ||
|
|
(IN_BETWEEN(0xB0, c1, 0xC8))
|
|
)
|
|
{
|
|
return FULLWIDTH_PRONOUNCE_CHAR;
|
|
}
|
|
|
|
if (IN_BETWEEN(0xCA, c1, 0xFD))
|
|
return KANJI_CHAR;
|
|
|
|
return UNCLASSIFIED_CHAR;
|
|
|
|
case CS_GB_8BIT:
|
|
/*
|
|
SEVEN_BIT_CHAR: [0x00-0x7F]
|
|
HALFWIDTH_PRONOUNCE_CHAR:
|
|
FULLWIDTH_ASCII_CHAR: [0xA3] [0xC1-0xDA]
|
|
[0xE1-0xFA]
|
|
[0xA6] [0xA1-0xB8] Greek
|
|
[0xC1-0xD8]
|
|
[0xA7] [0xA1-0xC1] Cyrillic
|
|
[0xD1-0xF1]
|
|
[0xA8] [0xA1-0xBA] European
|
|
FULLWIDTH_PRONOUNCE_CHAR: [0xA4,0xA5,0xA8] [xxxx]
|
|
KANJI_CHAR:
|
|
*/
|
|
if (c1 < 0x80)
|
|
return SEVEN_BIT_CHAR;
|
|
|
|
c2 = *(pstr + 1);
|
|
if (
|
|
((c1== 0xA3) && (IN_BETWEEN(0xC1, c2, 0xDA) || IN_BETWEEN(0xE1, c2, 0xFA))) ||
|
|
((c1== 0xA6) && (IN_BETWEEN(0xA1, c2, 0xB8) || IN_BETWEEN(0xC1, c2, 0xD8))) ||
|
|
((c1== 0xA7) && (IN_BETWEEN(0xA1, c2, 0xC1) || IN_BETWEEN(0xD1, c2, 0xF1))) ||
|
|
((c1== 0xA8) && (IN_BETWEEN(0xA1, c2, 0xBA)) )
|
|
)
|
|
{
|
|
return FULLWIDTH_ASCII_CHAR;
|
|
}
|
|
|
|
if ((c1 == 0xA4) || (c1 == 0xA5) || (c1 == 0xA8))
|
|
return FULLWIDTH_PRONOUNCE_CHAR;
|
|
|
|
if (IN_BETWEEN(0xB0, c1, 0xF7))
|
|
return KANJI_CHAR;
|
|
|
|
return UNCLASSIFIED_CHAR;
|
|
|
|
case CS_BIG5:
|
|
/*
|
|
SEVEN_BIT_CHAR: [0x00-0x7F]
|
|
HALFWIDTH_PRONOUNCE_CHAR:
|
|
FULLWIDTH_ASCII_CHAR: [0xA2] [0xCF-0xFF]
|
|
[0xA3] [0x40-0x73]
|
|
FULLWIDTH_PRONOUNCE_CHAR: [0xA3] [0x74-0x7E]
|
|
[0xA1-0xBF]
|
|
KANJI_CHAR: [0xA4-0xFF] [xxxxxxxxx]
|
|
*/
|
|
if (c1 < 0x80)
|
|
return SEVEN_BIT_CHAR;
|
|
|
|
c2 = *(pstr + 1);
|
|
|
|
switch(c1)
|
|
{
|
|
case 0xA2:
|
|
if (IN_BETWEEN(0xCF, c2, 0xFF))
|
|
return FULLWIDTH_ASCII_CHAR;
|
|
break;
|
|
|
|
case 0xA3:
|
|
if (IN_BETWEEN(0x74, c2, 0x7E) || IN_BETWEEN(0xA1, c2, 0xBF))
|
|
return FULLWIDTH_PRONOUNCE_CHAR;
|
|
|
|
if (IN_BETWEEN(0x40, c2, 0x73))
|
|
return FULLWIDTH_ASCII_CHAR;
|
|
|
|
break;
|
|
}
|
|
|
|
if (c1 >= 0xA4)
|
|
return KANJI_CHAR;
|
|
|
|
return UNCLASSIFIED_CHAR;
|
|
|
|
case CS_CNS_8BIT: /* TO BE TEST ON UNIX */
|
|
/*
|
|
SEVEN_BIT_CHAR: [0x00-0x7F]
|
|
HALFWIDTH_PRONOUNCE_CHAR:
|
|
FULLWIDTH_ASCII_CHAR: [0xA4] [0xC1-0xFE]
|
|
[0xA5] [0xA1-0xC6]
|
|
FULLWIDTH_PRONOUNCE_CHAR: [0xA5] [0xC7-0xF0]
|
|
KANJI_CHAR: [0xC4-0xFF] [xxxxxxxxx]
|
|
[0x8E]
|
|
*/
|
|
if (c1 < 0x80)
|
|
return SEVEN_BIT_CHAR;
|
|
|
|
c2 = *(pstr + 1);
|
|
|
|
switch(c1)
|
|
{
|
|
case 0xA4:
|
|
if(IN_BETWEEN(0xC1, c2, 0xFE))
|
|
return FULLWIDTH_ASCII_CHAR;
|
|
break;
|
|
|
|
case 0xA5:
|
|
if(IN_BETWEEN(0xC7, c2, 0xF0))
|
|
return FULLWIDTH_PRONOUNCE_CHAR;
|
|
|
|
if(IN_BETWEEN(0xA1, c2, 0xC6))
|
|
return FULLWIDTH_ASCII_CHAR;
|
|
break;
|
|
}
|
|
|
|
if (IN_BETWEEN(0xC4, c1, 0x8E))
|
|
return KANJI_CHAR;
|
|
|
|
return UNCLASSIFIED_CHAR;
|
|
|
|
case CS_UTF8:
|
|
/*
|
|
SEVEN_BIT_CHAR:
|
|
|
|
FULLWIDTH_ASCII_CHAR:
|
|
U+0000 - U+10FF
|
|
[C0-E0] [xxxx] Done
|
|
[E1] [80-83] [xxxx] Done
|
|
U+1E00 - U+1FFF
|
|
[E1] [B8-BF] Done
|
|
U+FF21 - U+FF3A
|
|
[EF] [BC] [A1-BA] Done
|
|
U+FF41 - U+FF5A
|
|
[EF] [BD] [81-9A] Done
|
|
|
|
FULLWIDTH_PRONOUNCE_CHAR:
|
|
U+1100 - U+11FF
|
|
[E1] [84-87] Done
|
|
U+3040 - U+318F
|
|
[E3] [81-85] [xx] Done
|
|
[E3] [86] [80-8F] Done
|
|
U+FF66 - U+FFDC
|
|
[EF] [BD] [AC-]
|
|
[EF] [BE]
|
|
[EF] [BF] [-9C]
|
|
U+AC00 - U+D7FF
|
|
[EA] [B0-] Done
|
|
[EB-EC] [xxx] Done
|
|
[ED] [-9F] Done
|
|
|
|
KANJI_CHAR:
|
|
U+4E00 - U+9FFF
|
|
[E4] [B8-] Done
|
|
[E5-E9] [xx] Done
|
|
*/
|
|
if (c1 < 0x80)
|
|
return SEVEN_BIT_CHAR;
|
|
|
|
if (IN_BETWEEN(0xC0, c1, 0xE0))
|
|
{
|
|
return FULLWIDTH_ASCII_CHAR;
|
|
}
|
|
|
|
c2 = *(pstr + 1);
|
|
|
|
switch(c1)
|
|
{
|
|
case 0xE1:
|
|
if (IN_BETWEEN(0x80, c2, 0x83) || IN_BETWEEN(0xB8, c2, 0xBF))
|
|
return FULLWIDTH_ASCII_CHAR;
|
|
if (IN_BETWEEN(0x84, c2, 0x87))
|
|
return FULLWIDTH_PRONOUNCE_CHAR;
|
|
|
|
break;
|
|
|
|
case 0xE3:
|
|
if (IN_BETWEEN(0x81, c2, 0x85))
|
|
return FULLWIDTH_PRONOUNCE_CHAR;
|
|
|
|
if (c2 == 0x86)
|
|
{
|
|
c3 = *(pstr + 2);
|
|
if (IN_BETWEEN(0x80, c3, 0x8F))
|
|
return FULLWIDTH_PRONOUNCE_CHAR;
|
|
}
|
|
|
|
break;
|
|
|
|
case 0xE4:
|
|
if (c2 >= 0xB8)
|
|
return KANJI_CHAR;
|
|
break;
|
|
|
|
case 0xE5: case 0xE6: case 0xE7: case 0xE8: case 0xE9:
|
|
return KANJI_CHAR;
|
|
break;
|
|
|
|
case 0xEA:
|
|
if (c2 >= 0xB0)
|
|
return FULLWIDTH_PRONOUNCE_CHAR;
|
|
break;
|
|
|
|
case 0xEB: case 0xEC:
|
|
return FULLWIDTH_PRONOUNCE_CHAR;
|
|
break;
|
|
|
|
case 0xED:
|
|
if (c2 <= 0x9F)
|
|
return FULLWIDTH_PRONOUNCE_CHAR;
|
|
break;
|
|
|
|
case 0xEF:
|
|
c3 = *(pstr + 2);
|
|
switch(c2)
|
|
{
|
|
case 0xBC:
|
|
if (IN_BETWEEN(0xA1, c3, 0xBA))
|
|
return FULLWIDTH_ASCII_CHAR;
|
|
break;
|
|
|
|
case 0xBD:
|
|
if (IN_BETWEEN(0x81, c3, 0x9A))
|
|
return FULLWIDTH_ASCII_CHAR;
|
|
if (c3 >= 0xAC)
|
|
return FULLWIDTH_PRONOUNCE_CHAR;
|
|
break;
|
|
|
|
case 0xBE:
|
|
return FULLWIDTH_PRONOUNCE_CHAR;
|
|
break;
|
|
|
|
case 0xBF:
|
|
if (c3 <= 0x9C)
|
|
return FULLWIDTH_PRONOUNCE_CHAR;
|
|
break;
|
|
}
|
|
break;
|
|
}
|
|
|
|
return UNCLASSIFIED_CHAR;
|
|
default:
|
|
break;
|
|
}
|
|
|
|
return UNCLASSIFIED_CHAR;
|
|
}
|
|
|
|
#define IF_A_IN_ARRAY_B_THEN_RETURN_C(a,b,c) \
|
|
{ \
|
|
int j; \
|
|
for (j = 0; (b)[j][0]; j++) \
|
|
if (XP_STRNCMP((char *)a, (b)[j], XP_STRLEN((b)[j])) == 0) \
|
|
return (c); \
|
|
}
|
|
|
|
#define IF_PROHIBIT_CLASS_THEN_RETURN(a,ba,ea) \
|
|
{ \
|
|
IF_A_IN_ARRAY_B_THEN_RETURN_C(a,ba,PROHIBIT_BEGIN_OF_LINE); \
|
|
IF_A_IN_ARRAY_B_THEN_RETURN_C(a,ea,PROHIBIT_END_OF_LINE); \
|
|
}
|
|
|
|
PUBLIC int INTL_KinsokuClass(int16 win_csid, unsigned char *pstr)
|
|
{
|
|
switch (win_csid)
|
|
{
|
|
case CS_SJIS:
|
|
IF_PROHIBIT_CLASS_THEN_RETURN(pstr,ProhibitBegin_SJIS,ProhibitEnd_SJIS);
|
|
break;
|
|
case CS_EUCJP:
|
|
IF_PROHIBIT_CLASS_THEN_RETURN(pstr,ProhibitBegin_EUCJP,ProhibitEnd_EUCJP);
|
|
break;
|
|
case CS_GB_8BIT:
|
|
IF_PROHIBIT_CLASS_THEN_RETURN(pstr,ProhibitBegin_GB,ProhibitEnd_GB);
|
|
break;
|
|
case CS_BIG5:
|
|
IF_PROHIBIT_CLASS_THEN_RETURN(pstr,ProhibitBegin_BIG5,ProhibitEnd_BIG5);
|
|
break;
|
|
case CS_CNS_8BIT:
|
|
IF_PROHIBIT_CLASS_THEN_RETURN(pstr,ProhibitBegin_CNS,ProhibitEnd_CNS);
|
|
break;
|
|
case CS_KSC_8BIT:
|
|
IF_PROHIBIT_CLASS_THEN_RETURN(pstr,ProhibitBegin_KSC,ProhibitEnd_KSC);
|
|
break;
|
|
case CS_UTF8:
|
|
IF_PROHIBIT_CLASS_THEN_RETURN(pstr,ProhibitBegin_UTF8,ProhibitEnd_UTF8);
|
|
if( *pstr <= 0xE2) /* UCS2 < 0x2000 */
|
|
return PROHIBIT_WORD_BREAK;
|
|
break;
|
|
}
|
|
|
|
return PROHIBIT_NOWHERE;
|
|
}
|
|
|
|
|
|
|