mirror of
https://github.com/mozilla/gecko-dev.git
synced 2025-01-24 21:58:06 +00:00
337 lines
10 KiB
C
337 lines
10 KiB
C
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*-
|
|
*
|
|
* The contents of this file are subject to the Netscape Public License
|
|
* Version 1.0 (the "NPL"); you may not use this file except in
|
|
* compliance with the NPL. You may obtain a copy of the NPL at
|
|
* http://www.mozilla.org/NPL/
|
|
*
|
|
* Software distributed under the NPL is distributed on an "AS IS" basis,
|
|
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the NPL
|
|
* for the specific language governing rights and limitations under the
|
|
* NPL.
|
|
*
|
|
* The Initial Developer of this code under the NPL is Netscape
|
|
* Communications Corporation. Portions created by Netscape are
|
|
* Copyright (C) 1998 Netscape Communications Corporation. All Rights
|
|
* Reserved.
|
|
*/
|
|
/* autodetect.c */
|
|
|
|
/*
|
|
* CODESET 1st Byte 2nd Byte 3rd Byte
|
|
* JIS 0x21-0x7E 0x21-0x7E n/a
|
|
* SJIS 0xA1-0xDF n/a n/a
|
|
* 0x81-0x9F 0x40-0xFC n/a
|
|
* 0xE0-0xEF 0x40-0xFC n/a
|
|
* EUCJP 0x8E (SS2) 0xA1-0xDF n/a
|
|
* 0xA1-0xFE 0xA1-0xFE n/a
|
|
* 0x8F (SS3) 0xA1-0xFE 0xA1-0xFE
|
|
* Invalid 7F,80,A0,FF
|
|
*/
|
|
|
|
#include "intlpriv.h"
|
|
|
|
#define ALLOW_NBSP 1
|
|
|
|
/*
|
|
* JIS X 0201-Roman ESC ( J
|
|
* Half-width Katakana ESC ( I
|
|
* JIS X 0208-1978 ESC $ @
|
|
* JIS X 0208-1983 ESC $ B
|
|
* JIS X 0212-1990 ESC $ ( D
|
|
*/
|
|
#define IsJISEscSeq(cp, len) \
|
|
((cp[0] == ESC) && ((len) > 2) && ( \
|
|
((cp[1] == '$') && (cp[2] == 'B')) || \
|
|
((cp[1] == '$') && (cp[2] == '@')) || \
|
|
((cp[1] == '(') && (cp[2] == 'J')) || \
|
|
((cp[1] == '(') && (cp[2] == 'I')) || \
|
|
(((len) > 3) && (cp[1] == '$') && (cp[2] == '(') && (cp[3] == 'D')) ) )
|
|
|
|
#define IsRoman(c) ((c) < 0x80)
|
|
#define IsSJIS2ndByte(c) (((c) > 0x3F) && ((c) < 0xFD))
|
|
#define IsLoSJIS2ndByte(c) (((c) > 0x3F) && ((c) < 0xA1))
|
|
#define IsHiSJIS2ndByte(c) (((c) > 0xA0) && ((c) < 0xFD))
|
|
#define IsEUCJPKana(b1) (((b1) > 0xA0) && ((b1) < 0xE0))
|
|
#define IsEUCJPKanji(b1or2) (((b1or2) > 0xA0) && ((b1or2) < 0xFF))
|
|
|
|
#define YES 1
|
|
#define NO 0
|
|
#define MAYBE -1
|
|
|
|
PRIVATE int
|
|
isSJIS(const unsigned char *cp, int32 len)
|
|
{
|
|
while (len) {
|
|
if (IsRoman(*cp)) {
|
|
cp++, len--;
|
|
} else if (*cp == 0x80) { /* illegal SJIS 1st byte */
|
|
return NO;
|
|
} else if ((*cp < 0xA0)) { /* byte 1 of 2byte SJIS 1st range */
|
|
if (len > 1) {
|
|
if (IsSJIS2ndByte(cp[1])) {
|
|
if ((*cp != 0x8E && *cp != 0x8F) || (*(cp+1) <= 0xA0))
|
|
return YES;
|
|
cp += 2, len -= 2; /* valid 2 byte SJIS */
|
|
} else {
|
|
return NO; /* invalid SJIS 2nd byte */
|
|
}
|
|
} else
|
|
break; /* buffer ended w/1of2 byte SJIS */
|
|
} else if (*cp == 0xA0) { /* illegal EUCJP byte */
|
|
#if ALLOW_NBSP
|
|
cp++, len--; /* allow nbsp */
|
|
#endif
|
|
} else if (*cp < 0xE0) { /* SJIS half-width kana */
|
|
cp++, len--;
|
|
} else if (*cp < 0xF0) { /* byte 1 of 2byte SJIS 2nd range */
|
|
if (len > 1) {
|
|
if (IsSJIS2ndByte(cp[1])) {
|
|
cp += 2, len -= 2; /* valid 2 byte SJIS */
|
|
} else {
|
|
return NO; /* invalid SJIS */
|
|
}
|
|
} else
|
|
break; /* buffer ended w/1of2 byte SJIS */
|
|
} else {
|
|
return NO; /* invalid SJIS 1st byte */
|
|
}
|
|
}
|
|
return MAYBE; /* No illegal SJIS values found */
|
|
}
|
|
|
|
PRIVATE int
|
|
isEUCJP(const unsigned char *cp, int32 len)
|
|
{
|
|
while (len) {
|
|
if (IsRoman(*cp)) { /* Roman */
|
|
cp++, len--;
|
|
} else if (*cp == SS2) { /* EUCJP JIS201 half-width kana */
|
|
if (len > 1) {
|
|
if (IsEUCJPKana(cp[1]))
|
|
cp += 2, len -= 2; /* valid half-width kana */
|
|
else
|
|
return NO; /* invalid 2of3 byte EUC */
|
|
} else
|
|
break; /* buffer ended w/1of2 byte EUC */
|
|
} else if (*cp == SS3) { /* EUCJP JIS212 */
|
|
if (len > 1) {
|
|
if (IsEUCJPKanji(cp[1])) {
|
|
if (len > 2) {
|
|
if (IsEUCJPKanji(cp[2]))
|
|
cp += 2, len -= 2; /* valid 3 byte EUCJP */
|
|
else
|
|
return NO; /* invalid 3of3 byte EUCJP */
|
|
} else
|
|
break; /* buffer ended w/2of3 byte EUCJP */
|
|
} else
|
|
return NO; /* invalid 2of3 byte EUCJP */
|
|
} else
|
|
break; /* buffer ended w/1of3 byte EUCJP */
|
|
} else if (*cp == 0xA0) { /* illegal EUCJP byte */
|
|
#if ALLOW_NBSP
|
|
cp++, len--; /* allow nbsp */
|
|
#else
|
|
return NO;
|
|
#endif
|
|
} else if (*cp < 0xF0) { /* EUCJP JIS208 (overlaps SJIS) */
|
|
if (len > 1) {
|
|
if (IsEUCJPKanji(cp[1]))
|
|
cp += 2, len -= 2; /* valid 2 byte EUCJP */
|
|
else
|
|
return NO; /* invalid 2of2 byte EUCJP */
|
|
} else
|
|
break; /* buffer ended w/1of2 byte EUCJP */
|
|
} else if (*cp < 0xFF) { /* EUCJP JIS208 only: */
|
|
if (len > 1) {
|
|
if (IsEUCJPKanji(cp[1]))
|
|
return YES; /* valid 2 byte EUCJP, invalid SJIS */
|
|
else
|
|
return NO; /* invalid 2of2 byte EUCJP */
|
|
} else
|
|
break; /* buffer ended w/1of2 byte EUCJP */
|
|
} else {
|
|
return NO; /* invalid EUCJP 1st byte: 0xFF */
|
|
}
|
|
}
|
|
return MAYBE;
|
|
}
|
|
|
|
MODULE_PRIVATE int16
|
|
intl_detect_JCSID (uint16 defaultCSID, const unsigned char *buf, int32 len)
|
|
{
|
|
register const unsigned char *cp = buf;
|
|
int sjisFlag;
|
|
int eucjpFlag;
|
|
|
|
/* JIS is 7bit. Scan to end of 7bit data or legitimate JIS ESC sequence. */
|
|
while (len && (IsRoman(*cp) || (*cp == 0xA0))) { /* allow nbsp */
|
|
if (IsJISEscSeq(cp, len))
|
|
return CS_JIS;
|
|
cp++, len--;
|
|
}
|
|
|
|
/* If len > 0, must be either SJIS or EUC because there's 8bit data */
|
|
while (len) {
|
|
if (*cp == 0x80) {
|
|
return CS_DEFAULT;/* illegal byte1 (SJIS & EUCJP) */
|
|
}
|
|
if (*cp < 0x8E)
|
|
return CS_SJIS; /* Illegal EUCJP 1st byte */
|
|
if (*cp == 0xA0) {
|
|
#if ALLOW_NBSP
|
|
cp++; len--;
|
|
continue; /* allow nbsp */
|
|
#else
|
|
return CS_DEFAULT;/* illegal byte1 (SJIS & EUCJP) */
|
|
#endif
|
|
}
|
|
if ( (*cp > 0xEF) && (*cp < 0xFF) ) /* illegal SJIS 1st byte */
|
|
return CS_EUCJP;
|
|
if (*cp == 0xFF) {
|
|
return CS_DEFAULT;/* illegal byte1 (SJIS & EUCJP) */
|
|
}
|
|
|
|
/* At this point. 1st byte is 0x8E, 0x8F, or 0xA1-0xEF. */
|
|
/* If 1st Byte is 0xE0-0xEF inclusive, then it's 2byte SJIS or EUC */
|
|
if ((*cp > 0xDF) && (*cp < 0xF0)) {
|
|
if (len > 1) {
|
|
if (cp[1] < 0x41) { /* illegal byte2 (SJIS & EUCJP) */
|
|
return CS_DEFAULT;
|
|
}
|
|
if (cp[1] < 0xA1)
|
|
return CS_SJIS; /* Illegal EUCJP 2nd byte */
|
|
if (cp[1] > 0xFC)
|
|
return CS_EUCJP; /* illegal SJIS 2nd byte */
|
|
cp += 2, len -= 2; /* Skip 2 byte character */
|
|
|
|
/* Gobble up single byte characters and continue outer loop */
|
|
while (len && IsRoman(*cp)) {
|
|
cp++, len--;
|
|
}
|
|
continue;
|
|
} else {
|
|
len = 0;
|
|
break; /* No more chars in buffer */
|
|
}
|
|
}
|
|
/* 1st Byte is 0xA1-DF inclusive:
|
|
* 1byte SJIS kana or 1of2 byte SJIS or EUC
|
|
*/
|
|
break; /* break and handle ambiguous cases */
|
|
}
|
|
|
|
if (len) {
|
|
eucjpFlag = isEUCJP(cp, len);
|
|
if (YES == eucjpFlag)
|
|
return CS_EUCJP;
|
|
|
|
sjisFlag = isSJIS(cp, len);
|
|
if (YES == sjisFlag)
|
|
return CS_SJIS;
|
|
|
|
/* Neither one is YES, look at NO : MAYBE Pair */
|
|
if ((NO == eucjpFlag) && (MAYBE == sjisFlag))
|
|
return CS_SJIS;
|
|
if ((MAYBE == eucjpFlag) && (NO == sjisFlag))
|
|
return CS_EUCJP;
|
|
}
|
|
|
|
/* Some servers relied upon the previous Nav3.0 default for ambiguous SJIS/EUC encoding. */
|
|
#define USE_ACKBAR_LOGIC 1
|
|
|
|
/* Now, both are NO or both are MAYBE, look at default */
|
|
if (len) { /* Must be ambiguous -- EUC or SJIS */
|
|
#if USE_ACKBAR_LOGIC
|
|
|
|
#ifdef XP_MAC
|
|
defaultCSID = CS_SJIS_AUTO; /* simulate Akbar old charset hints */
|
|
#else
|
|
defaultCSID = CS_JIS;
|
|
#endif
|
|
if (defaultCSID == CS_SJIS) {
|
|
eucjpFlag = isEUCJP(cp, len);
|
|
if (eucjpFlag == YES)
|
|
return CS_EUCJP;
|
|
else
|
|
return CS_SJIS;
|
|
} else if (defaultCSID == CS_EUCJP) {
|
|
sjisFlag = isSJIS(cp, len);
|
|
if (sjisFlag == YES)
|
|
return CS_SJIS;
|
|
else
|
|
return CS_EUCJP;
|
|
} else { /* default is JIS */
|
|
sjisFlag = isSJIS(cp, len);
|
|
if (sjisFlag == YES)
|
|
return CS_SJIS;
|
|
eucjpFlag = isEUCJP(cp, len);
|
|
if (eucjpFlag == YES)
|
|
return CS_EUCJP;
|
|
if (sjisFlag == NO) {
|
|
if (eucjpFlag != NO) /* SJIS-NO, EUCJP-MAYBE */
|
|
return CS_EUCJP;
|
|
} else {
|
|
if (eucjpFlag == NO) /* SJIS-MAYBE, EUCJP-NO */
|
|
return CS_SJIS;
|
|
else { /* both MAYBE */
|
|
return CS_EUCJP; /* have to pick one... */
|
|
}
|
|
}
|
|
}
|
|
#else
|
|
if (CS_SJIS == defaultCSID) {
|
|
if (MAYBE == sjisFlag)
|
|
return CS_SJIS;
|
|
} else if (CS_EUCJP == defaultCSID) {
|
|
if (MAYBE == eucjpFlag)
|
|
return CS_EUCJP;
|
|
} else { /* default is JIS */
|
|
if ((MAYBE == eucjpFlag) && (MAYBE == sjisFlag)) /* pick one- EUC */
|
|
return CS_EUCJP;
|
|
}
|
|
#endif
|
|
}
|
|
return CS_ASCII; /* Could be any of the 3... */
|
|
}
|
|
|
|
/* Auto Detect Japanese Char Code Conversion */
|
|
MODULE_PRIVATE unsigned char *
|
|
autoJCCC (CCCDataObject obj, const unsigned char *s, int32 l)
|
|
{
|
|
int16 doc_csid = 0;
|
|
uint16 detected_doc_csid;
|
|
|
|
/* try to determine doc Japanese CSID. */
|
|
doc_csid = intl_detect_JCSID((uint16)(INTL_GetCCCDefaultCSID(obj)&~CS_AUTO),
|
|
(const unsigned char *) s,l);
|
|
if (doc_csid == CS_ASCII) { /* return s unconverted and */
|
|
INTL_SetCCCLen(obj, l);
|
|
return (unsigned char *)s; /* autodetect next block of stream data */
|
|
}
|
|
if (doc_csid == CS_DEFAULT) { /* found unexpected chars */
|
|
doc_csid = INTL_GetCCCDefaultCSID(obj) & ~CS_AUTO;
|
|
detected_doc_csid = CS_DEFAULT;
|
|
} else {
|
|
detected_doc_csid = doc_csid | CS_AUTO;
|
|
}
|
|
/* Setup converter function for success streams data blocks */
|
|
(void) INTL_GetCharCodeConverter(doc_csid, INTL_GetCCCToCSID(obj), obj);
|
|
INTL_CallCCCReportAutoDetect(obj, detected_doc_csid);
|
|
|
|
/* If no conversion needed, change put_block module for successive
|
|
* data blocks. For current data block, return unmodified buffer.
|
|
*/
|
|
if (INTL_GetCCCCvtfunc(obj) == NULL) {
|
|
INTL_SetCCCLen(obj, l);
|
|
return((unsigned char *) s);
|
|
}
|
|
/* For initial block, must call converter directly. Success calls
|
|
* to the converter will be called directly from net_CharCodeConv()
|
|
*/
|
|
return (unsigned char *)(INTL_GetCCCCvtfunc(obj)) (obj, (const unsigned char *)s, l);
|
|
}
|
|
|
|
|
|
|