gecko-dev/lib/libi18n/autocvt.c

337 lines
10 KiB
C
Raw Normal View History

1998-03-28 02:44:41 +00:00
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*-
*
* The contents of this file are subject to the Netscape Public License
* Version 1.0 (the "NPL"); you may not use this file except in
* compliance with the NPL. You may obtain a copy of the NPL at
* http://www.mozilla.org/NPL/
*
* Software distributed under the NPL is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the NPL
* for the specific language governing rights and limitations under the
* NPL.
*
* The Initial Developer of this code under the NPL is Netscape
* Communications Corporation. Portions created by Netscape are
* Copyright (C) 1998 Netscape Communications Corporation. All Rights
* Reserved.
*/
/* autodetect.c */
/*
* CODESET 1st Byte 2nd Byte 3rd Byte
* JIS 0x21-0x7E 0x21-0x7E n/a
* SJIS 0xA1-0xDF n/a n/a
* 0x81-0x9F 0x40-0xFC n/a
* 0xE0-0xEF 0x40-0xFC n/a
* EUCJP 0x8E (SS2) 0xA1-0xDF n/a
* 0xA1-0xFE 0xA1-0xFE n/a
* 0x8F (SS3) 0xA1-0xFE 0xA1-0xFE
* Invalid 7F,80,A0,FF
*/
#include "intlpriv.h"
/*
 * When non-zero, a 0xA0 byte (no-break space) met while scanning is skipped
 * by the detectors in this file instead of being treated as an illegal byte.
 */
#define ALLOW_NBSP 1
/*
 * Escape sequences whose presence identifies the data as 7-bit JIS:
 *
 *   JIS X 0201-Roman       ESC ( J
 *   Half-width Katakana    ESC ( I
 *   JIS X 0208-1978        ESC $ @
 *   JIS X 0208-1983        ESC $ B
 *   JIS X 0212-1990        ESC $ ( D
 *
 * IsJISEscSeq(cp, len) is non-zero when the `len` bytes starting at `cp`
 * begin with one of the sequences above.
 *
 * The length is tested before any byte is read, and every use of an argument
 * is parenthesized so that expressions such as `buf + 1` may be passed.
 * Both arguments are evaluated more than once; do not pass expressions with
 * side effects.
 */
#define IsJISEscSeq(cp, len)                                                \
    (((len) > 2) && ((cp)[0] == ESC) && (                                   \
        (((cp)[1] == '$') && (((cp)[2] == 'B') || ((cp)[2] == '@'))) ||     \
        (((cp)[1] == '(') && (((cp)[2] == 'J') || ((cp)[2] == 'I'))) ||     \
        (((len) > 3) && ((cp)[1] == '$') && ((cp)[2] == '(') &&             \
                        ((cp)[3] == 'D')) ))
/*
 * Byte-class predicates used by the detectors.  Each argument is evaluated
 * more than once, so pass only side-effect-free expressions.
 */
#define IsRoman(c)              ((c) <= 0x7F)                        /* 7-bit byte */
#define IsSJIS2ndByte(c)        (((c) >= 0x40) && ((c) <= 0xFC))     /* 0x40-0xFC */
#define IsLoSJIS2ndByte(c)      (((c) >= 0x40) && ((c) <= 0xA0))     /* 0x40-0xA0 */
#define IsHiSJIS2ndByte(c)      (((c) >= 0xA1) && ((c) <= 0xFC))     /* 0xA1-0xFC */
#define IsEUCJPKana(b1)         (((b1) >= 0xA1) && ((b1) <= 0xDF))   /* 0xA1-0xDF */
#define IsEUCJPKanji(b1or2)     (((b1or2) >= 0xA1) && ((b1or2) <= 0xFE)) /* 0xA1-0xFE */

/* Tri-state verdict returned by isSJIS() and isEUCJP(). */
#define YES     1
#define NO      0
#define MAYBE   (-1)
/*
 * Scan `len` bytes at `cp` looking for evidence for or against Shift-JIS.
 *
 * Returns:
 *   YES    a two-byte sequence was found that is legal SJIS and whose lead
 *          byte is in 0x81-0x9F in a form EUC-JP cannot produce;
 *   NO     an illegal SJIS lead byte or second byte was found;
 *   MAYBE  the whole buffer was scanned (or it ended in the middle of a
 *          two-byte character) without finding anything illegal.
 */
PRIVATE int
isSJIS(const unsigned char *cp, int32 len)
{
    while (len) {
        if (IsRoman(*cp)) {
            cp++, len--;
        } else if (*cp == 0x80) {           /* illegal SJIS 1st byte */
            return NO;
        } else if (*cp < 0xA0) {            /* byte 1 of 2byte SJIS 1st range */
            if (len > 1) {
                if (IsSJIS2ndByte(cp[1])) {
                    /*
                     * 0x8E and 0x8F followed by a byte above 0xA0 could also
                     * be an EUC-JP single-shift sequence, so that pair stays
                     * ambiguous.  Every other pair here is SJIS for certain.
                     */
                    if ((*cp != 0x8E && *cp != 0x8F) || (cp[1] <= 0xA0))
                        return YES;
                    cp += 2, len -= 2;      /* valid 2 byte SJIS */
                } else {
                    return NO;              /* invalid SJIS 2nd byte */
                }
            } else {
                break;                      /* buffer ended w/1of2 byte SJIS */
            }
        } else if (*cp == 0xA0) {           /* not a legal SJIS byte */
#if ALLOW_NBSP
            cp++, len--;                    /* allow nbsp */
#else
            /*
             * Without this the loop would never advance past the 0xA0 byte.
             * isEUCJP() rejects the byte the same way.
             */
            return NO;
#endif
        } else if (*cp < 0xE0) {            /* SJIS half-width kana */
            cp++, len--;
        } else if (*cp < 0xF0) {            /* byte 1 of 2byte SJIS 2nd range */
            if (len > 1) {
                if (IsSJIS2ndByte(cp[1])) {
                    cp += 2, len -= 2;      /* valid 2 byte SJIS */
                } else {
                    return NO;              /* invalid SJIS */
                }
            } else {
                break;                      /* buffer ended w/1of2 byte SJIS */
            }
        } else {
            return NO;                      /* invalid SJIS 1st byte */
        }
    }
    return MAYBE;                           /* No illegal SJIS values found */
}
/*
 * Scan `len` bytes at `cp` looking for evidence for or against EUC-JP.
 *
 * Returns:
 *   YES    a valid two-byte character with lead byte 0xF0-0xFE was found,
 *          which is legal EUC-JP but not a legal SJIS lead byte;
 *   NO     a byte sequence that is illegal in EUC-JP was found;
 *   MAYBE  the whole buffer was scanned (or it ended in the middle of a
 *          multi-byte character) without finding anything decisive.
 */
PRIVATE int
isEUCJP(const unsigned char *cp, int32 len)
{
    while (len) {
        if (IsRoman(*cp)) {                 /* Roman */
            cp++, len--;
        } else if (*cp == SS2) {            /* EUCJP JIS201 half-width kana */
            if (len > 1) {
                if (IsEUCJPKana(cp[1])) {
                    cp += 2, len -= 2;      /* valid half-width kana */
                } else {
                    return NO;              /* invalid 2of2 byte EUC */
                }
            } else {
                break;                      /* buffer ended w/1of2 byte EUC */
            }
        } else if (*cp == SS3) {            /* EUCJP JIS212 */
            if (len > 1) {
                if (IsEUCJPKanji(cp[1])) {
                    if (len > 2) {
                        if (IsEUCJPKanji(cp[2])) {
                            /*
                             * Consume all three bytes.  Advancing by only two
                             * would re-read the last byte of this character
                             * as the lead byte of the next one.
                             */
                            cp += 3, len -= 3;  /* valid 3 byte EUCJP */
                        } else {
                            return NO;      /* invalid 3of3 byte EUCJP */
                        }
                    } else {
                        break;              /* buffer ended w/2of3 byte EUCJP */
                    }
                } else {
                    return NO;              /* invalid 2of3 byte EUCJP */
                }
            } else {
                break;                      /* buffer ended w/1of3 byte EUCJP */
            }
        } else if (*cp < 0xA0) {
            /*
             * The remaining bytes below 0xA0 (the single-shift bytes were
             * handled above) never start an EUC-JP character; see the table
             * at the top of this file.
             */
            return NO;
        } else if (*cp == 0xA0) {           /* illegal EUCJP byte */
#if ALLOW_NBSP
            cp++, len--;                    /* allow nbsp */
#else
            return NO;
#endif
        } else if (*cp < 0xF0) {            /* EUCJP JIS208 (overlaps SJIS) */
            if (len > 1) {
                if (IsEUCJPKanji(cp[1])) {
                    cp += 2, len -= 2;      /* valid 2 byte EUCJP */
                } else {
                    return NO;              /* invalid 2of2 byte EUCJP */
                }
            } else {
                break;                      /* buffer ended w/1of2 byte EUCJP */
            }
        } else if (*cp < 0xFF) {            /* EUCJP JIS208 only: */
            if (len > 1) {
                if (IsEUCJPKanji(cp[1])) {
                    return YES;             /* valid 2 byte EUCJP, invalid SJIS */
                } else {
                    return NO;              /* invalid 2of2 byte EUCJP */
                }
            } else {
                break;                      /* buffer ended w/1of2 byte EUCJP */
            }
        } else {
            return NO;                      /* invalid EUCJP 1st byte: 0xFF */
        }
    }
    return MAYBE;
}
/*
 * Guess which Japanese encoding the `len` bytes at `buf` are written in.
 *
 * defaultCSID  charset hint from the caller.  While USE_ACKBAR_LOGIC is
 *              enabled it is overwritten below before it is consulted.
 * buf, len     the data to examine.
 *
 * Returns:
 *   CS_JIS      a JIS escape sequence was found in the leading 7-bit data;
 *   CS_SJIS     the data is, or is most likely, Shift-JIS;
 *   CS_EUCJP    the data is, or is most likely, EUC-JP;
 *   CS_DEFAULT  a byte that is illegal in both SJIS and EUC-JP was found;
 *   CS_ASCII    nothing decisive was found in this buffer.
 */
MODULE_PRIVATE int16
intl_detect_JCSID (uint16 defaultCSID, const unsigned char *buf, int32 len)
{
    register const unsigned char *cp = buf;
    int sjisFlag;
    int eucjpFlag;

    /* JIS is 7bit.  Scan to end of 7bit data or legitimate JIS ESC sequence. */
    while (len && (IsRoman(*cp) || (*cp == 0xA0))) {    /* allow nbsp */
        if (IsJISEscSeq(cp, len))
            return CS_JIS;
        cp++, len--;
    }

    /* If len > 0, must be either SJIS or EUC because there's 8bit data */
    while (len) {
        if (*cp == 0x80) {
            return CS_DEFAULT;      /* illegal byte1 (SJIS & EUCJP) */
        }
        /*
         * 0x81-0x8D and 0x90-0x9F are SJIS lead bytes but can never start an
         * EUC-JP character.  Only 0x8E and 0x8F stay ambiguous below 0xA0.
         */
        if ((*cp < 0x8E) || ((*cp > 0x8F) && (*cp < 0xA0)))
            return CS_SJIS;         /* Illegal EUCJP 1st byte */
        if (*cp == 0xA0) {
#if ALLOW_NBSP
            cp++; len--;
            continue;               /* allow nbsp */
#else
            return CS_DEFAULT;      /* illegal byte1 (SJIS & EUCJP) */
#endif
        }
        if ((*cp > 0xEF) && (*cp < 0xFF))   /* illegal SJIS 1st byte */
            return CS_EUCJP;
        if (*cp == 0xFF) {
            return CS_DEFAULT;      /* illegal byte1 (SJIS & EUCJP) */
        }

        /* At this point, 1st byte is 0x8E, 0x8F, or 0xA1-0xEF. */
        /* If 1st Byte is 0xE0-0xEF inclusive, then it's 2byte SJIS or EUC */
        if ((*cp > 0xDF) && (*cp < 0xF0)) {
            if (len > 1) {
                /*
                 * 0x40 is the lowest legal SJIS second byte, so only bytes
                 * below it are illegal in both encodings.
                 */
                if (cp[1] < 0x40) { /* illegal byte2 (SJIS & EUCJP) */
                    return CS_DEFAULT;
                }
                if (cp[1] < 0xA1)
                    return CS_SJIS;     /* Illegal EUCJP 2nd byte */
                if (cp[1] > 0xFC)
                    return CS_EUCJP;    /* illegal SJIS 2nd byte */
                cp += 2, len -= 2;      /* Skip 2 byte character */
                /* Gobble up single byte characters and continue outer loop */
                while (len && IsRoman(*cp)) {
                    cp++, len--;
                }
                continue;
            } else {
                len = 0;
                break;                  /* No more chars in buffer */
            }
        }
        /* 1st Byte is 0x8E, 0x8F or 0xA1-0xDF inclusive:
         * 1byte SJIS kana or 1of2 byte SJIS or EUC
         */
        break;                          /* break and handle ambiguous cases */
    }

    if (len) {
        eucjpFlag = isEUCJP(cp, len);
        if (YES == eucjpFlag)
            return CS_EUCJP;
        sjisFlag = isSJIS(cp, len);
        if (YES == sjisFlag)
            return CS_SJIS;
        /* Neither one is YES, look at NO : MAYBE Pair */
        if ((NO == eucjpFlag) && (MAYBE == sjisFlag))
            return CS_SJIS;
        if ((MAYBE == eucjpFlag) && (NO == sjisFlag))
            return CS_EUCJP;
    }

/* Some servers relied upon the previous Nav3.0 default for ambiguous SJIS/EUC encoding. */
#define USE_ACKBAR_LOGIC 1

    /* Now, both are NO or both are MAYBE, look at default */
    if (len) {                          /* Must be ambiguous -- EUC or SJIS */
#if USE_ACKBAR_LOGIC
        /* The caller's hint is deliberately replaced here. */
#ifdef XP_MAC
        defaultCSID = CS_SJIS_AUTO;     /* simulate Akbar old charset hints */
#else
        defaultCSID = CS_JIS;
#endif
        if (defaultCSID == CS_SJIS) {
            eucjpFlag = isEUCJP(cp, len);
            if (eucjpFlag == YES)
                return CS_EUCJP;
            else
                return CS_SJIS;
        } else if (defaultCSID == CS_EUCJP) {
            sjisFlag = isSJIS(cp, len);
            if (sjisFlag == YES)
                return CS_SJIS;
            else
                return CS_EUCJP;
        } else {                        /* default is JIS */
            sjisFlag = isSJIS(cp, len);
            if (sjisFlag == YES)
                return CS_SJIS;
            eucjpFlag = isEUCJP(cp, len);
            if (eucjpFlag == YES)
                return CS_EUCJP;
            if (sjisFlag == NO) {
                if (eucjpFlag != NO)    /* SJIS-NO, EUCJP-MAYBE */
                    return CS_EUCJP;
                /* both NO: fall through to the CS_ASCII return below */
            } else {
                if (eucjpFlag == NO)    /* SJIS-MAYBE, EUCJP-NO */
                    return CS_SJIS;
                else {                  /* both MAYBE */
                    return CS_EUCJP;    /* have to pick one... */
                }
            }
        }
#else
        if (CS_SJIS == defaultCSID) {
            if (MAYBE == sjisFlag)
                return CS_SJIS;
        } else if (CS_EUCJP == defaultCSID) {
            if (MAYBE == eucjpFlag)
                return CS_EUCJP;
        } else {                        /* default is JIS */
            if ((MAYBE == eucjpFlag) && (MAYBE == sjisFlag)) /* pick one- EUC */
                return CS_EUCJP;
        }
#endif
    }
    return CS_ASCII;                    /* Could be any of the 3... */
}
/*
 * Auto Detect Japanese Char Code Conversion.
 *
 * Runs intl_detect_JCSID() on the block `s` of `l` bytes, using the
 * conversion object's default charset (with CS_AUTO masked off) as the hint.
 *
 * - If detection yields CS_ASCII, the length `l` is stored on `obj` and `s`
 *   is returned unconverted, so detection is attempted again on the next
 *   block.
 * - Otherwise a converter from the chosen source charset to the object's
 *   target charset is requested, the detection result is reported, and the
 *   block is converted.  If no converter function ends up installed, `s` is
 *   returned unmodified with its length stored on `obj`.
 */
MODULE_PRIVATE unsigned char *
autoJCCC (CCCDataObject obj, const unsigned char *s, int32 l)
{
    int16  src_csid;
    uint16 reported_csid;

    /* Try to determine the document's Japanese CSID. */
    src_csid = intl_detect_JCSID(
                   (uint16)(INTL_GetCCCDefaultCSID(obj) & ~CS_AUTO),
                   (const unsigned char *) s, l);

    /* Undecided: hand the data back untouched and try again next block. */
    if (CS_ASCII == src_csid) {
        INTL_SetCCCLen(obj, l);
        return (unsigned char *) s;
    }

    if (CS_DEFAULT != src_csid) {
        reported_csid = src_csid | CS_AUTO;
    } else {
        /* Found unexpected chars: fall back to the default charset. */
        src_csid = INTL_GetCCCDefaultCSID(obj) & ~CS_AUTO;
        reported_csid = CS_DEFAULT;
    }

    /* Set up the converter function for successive stream data blocks. */
    (void) INTL_GetCharCodeConverter(src_csid, INTL_GetCCCToCSID(obj), obj);
    INTL_CallCCCReportAutoDetect(obj, reported_csid);

    /*
     * With a converter installed, the initial block must be converted by
     * calling it directly here; successive calls to the converter are made
     * from net_CharCodeConv().
     */
    if (INTL_GetCCCCvtfunc(obj) != NULL) {
        return (unsigned char *)(INTL_GetCCCCvtfunc(obj)) (obj, (const unsigned char *)s, l);
    }

    /* No conversion needed: return the current block unmodified. */
    INTL_SetCCCLen(obj, l);
    return (unsigned char *) s;
}