gecko-dev/lib/libi18n/euc2jis.c

/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*-
 *
 * The contents of this file are subject to the Netscape Public License
 * Version 1.0 (the "NPL"); you may not use this file except in
 * compliance with the NPL.  You may obtain a copy of the NPL at
 * http://www.mozilla.org/NPL/
 *
 * Software distributed under the NPL is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the NPL
 * for the specific language governing rights and limitations under the
 * NPL.
 *
 * The Initial Developer of this code under the NPL is Netscape
 * Communications Corporation.  Portions created by Netscape are
 * Copyright (C) 1998 Netscape Communications Corporation.  All Rights
 * Reserved.
 */
/*	euc2jis.c	*/

#include "intlpriv.h"
#if defined(MOZ_MAIL_NEWS)
#include "katakana.h"
#endif


extern int MK_OUT_OF_MEMORY;


/* mz_euc2jis(obj, eucbuf, eucbufsz)
 * Args:
 *	obj:		Conversion state object.  It carries the current JIS
 *		mode across calls, so use the same obj for the next call to
 *		mz_euc2jis() for the rest of the current EUC data.  First call
 *		should initialize mode to ASCII (0).
 *	eucbuf:		Ptr to a buf of EUC chars
 *	eucbufsz:	Size in bytes of eucbuf
 *	uncvtbuf:	(Obtained from obj.)  If the entire buffer was converted,
 *		uncvtbuf[0] will be nul, else it holds the EUC chars that were
 *		NOT converted; they are processed first on the next call to
 *		mz_euc2jis(), with the additional EUC chars appended.
 * Return:
 *	Returns NULL on failure, otherwise it returns a pointer to a buffer of
 *	converted characters.  Caller must XP_FREE() this memory.
 *
 * Description:
 *
 *	Allocate destination buffer.
 *
 *	Output JIS ESC sequences based upon which EUC code set.
 *
 *  No conversion is needed for ASCII/JIS Roman characters.
 *
 *	Clear 8th bit of 1-byte Half-width Katakana.  Half-width Katakana
 *  is not widely used and its ESC sequence may not be recognized
 *  by some software.  Its use on the internet is discouraged...
 *
 *	Clear 8th bits of 2-byte JIS X 0212-1990 chars.  JIS X 0212
 *  is not widely used and its ESC sequence may not be recognized
 *  by some software.  These chars do not have corresponding chars
 *	in JIS X 0208 or SJIS.
 *
 *	Clear 8th bits of 2-byte JIS X 0208-1990 chars.  These are the commonly
 *	used chars (along with JIS-Roman).
 *
 *	Bytes which do not fall in the EUC valid character codes are treated
 *	like JIS-Roman.
 *
 *	If either the EUC buffer does not contain a complete EUC char or the
 *	dest buffer is full, then the unconverted EUC is saved for the caller.
 *	Caller should supply more data and call mz_euc2jis() again.
 */


/*
 * Convert one chunk of EUC bytes to JIS, inserting ESC sequences whenever
 * the code set changes.
 *
 *	obj:		conversion state; supplies the current JIS mode and the
 *				buffer of bytes left unconverted by the previous call.
 *	eucbuf:		EUC bytes to convert.
 *	eucbufsz:	number of bytes in eucbuf.
 *
 * Returns a newly allocated, nul-terminated buffer (its length, not counting
 * the nul, is stored with INTL_SetCCCLen()); the caller releases it with
 * XP_FREE().  Returns NULL when the allocation fails (MK_OUT_OF_MEMORY is
 * stored with INTL_SetCCCRetval()) or when saved bytes plus the new data
 * still do not form a convertible character; in the latter case the merged
 * bytes stay in the unconverted buffer for the next call.
 *
 * A trailing incomplete character is stored in the unconverted buffer and
 * is processed first on the next call.
 */
MODULE_PRIVATE unsigned char *
mz_euc2jis(	CCCDataObject		obj,
			const unsigned char	*eucbuf,	/* EUC buffer for conversion	*/
			int32				eucbufsz)	/* EUC buffer size in bytes		*/
{
	unsigned char			*tobuf = NULL;
	int32					tobufsz;
	register unsigned char	*tobufp, *eucp;		/* current byte in bufs	*/
	register unsigned char	*tobufep, *eucep;	/* end of buffers		*/
	unsigned char			*savep;				/* write cursor in uncvtbuf */
	int32					uncvtlen;			/* bytes saved by prev. call */
	int32					merged = 0;			/* eucbuf bytes copied to uncvtbuf */
	int32					consumed;			/* uncvtbuf bytes converted */
	unsigned char *uncvtbuf = INTL_GetCCCUncvtbuf(obj);
										/* Last slot that may hold data; the
										 * one after it is kept for the nul.
										 * NOTE(review): assumes uncvtbuf has
										 * UNCVTBUF_SIZE bytes - confirm.
										 */
	unsigned char *uncvtlimit = uncvtbuf + UNCVTBUF_SIZE - 1;
#if defined(MOZ_MAIL_NEWS)
	unsigned char kanabuf[4];					/* for half-width kana */
	uint32	byteused;							/* for half-width kana */
#endif

										/* Allocate a dest buffer:		*/
		/* JIS is longer than EUC because of ESC seq.  Worst case for a
		 * single half-width kana:
		 *	ESC ( I KANA ESC ( J
		 * The buffer is sized at 5 times the input length (saved bytes
		 * included) plus 8 bytes of slack for the final ESC sequence and
		 * the nul byte.
		 */
	uncvtlen = (int32)strlen((const char *)uncvtbuf);
	tobufsz = eucbufsz + uncvtlen + ((eucbufsz + uncvtlen)<<2) + 8;
	if ((tobuf = (unsigned char *)XP_ALLOC(tobufsz)) == (unsigned char *)NULL) {
		INTL_SetCCCRetval(obj, MK_OUT_OF_MEMORY);
		return(NULL);
	}
										/* Initialize pointers, etc.	*/
	eucp = (unsigned char *)eucbuf;
	eucep = eucp + eucbufsz - 1;

							/* If prev. unconverted chars, append new chars
							 * to them and try to process the merged bytes.
							 * Stop one short of the bound so the nul
							 * terminator below stays inside it.
							 */
	if (uncvtbuf[0] != '\0') {
		savep = uncvtbuf + uncvtlen;
		while (savep < uncvtlimit && eucp <= eucep) {
			*savep++ = *eucp++;
			merged++;
		}
		*savep = '\0';						/* nul terminate	*/
		eucp = uncvtbuf;					/* process unconverted first */
		eucep = savep - 1;
	}

	tobufp = tobuf;
	tobufep = tobufp + tobufsz - 2;		/* save space for terminating null */

WHILELOOP:
									/* While EUC data && space in dest. buf. */
	while ((tobufp <= tobufep) && (eucp <= eucep)) {
		if (*eucp < SS2) {				/* ASCII/JIS-Roman or invalid EUC	*/
			if (INTL_GetCCCJismode(obj) != JIS_Roman) {
				InsASCII_ESC(tobufp, obj);
			}
			*tobufp++ = *eucp++;
		} else if (*eucp == SS2) {		/* Half-width Katakana			*/
			if (eucp+1 > eucep)			/* No 2nd byte in EUC buffer?	*/
				break;
#if defined(MOZ_MAIL_NEWS)
			if (!INTL_GetCCCCvtflag_SendHankakuKana(obj)) {
				if (INTL_GetCCCJismode(obj) != JIS_208_83) {
					Ins208_83_ESC(tobufp, obj);
				}
										/* Pass the remaining byte count as a
										 * pointer difference instead of
										 * casting the pointers to uint32.
										 */
				INTL_SjisHalf2FullKana(eucp, (uint32)(eucep - eucp + 1), kanabuf, &byteused);
				*tobufp++ = kanabuf[0] & 0x7F;
				*tobufp++ = kanabuf[1] & 0x7F;
				eucp += byteused;
			} else {
				if (INTL_GetCCCJismode(obj) != JIS_HalfKana) {
					InsHalfKana_ESC(tobufp, obj);
				}
				eucp++;							/* skip SS2	*/
				*tobufp++ = *eucp & 0x7F;
				eucp++;
			}
#else
			if (INTL_GetCCCJismode(obj) != JIS_HalfKana) {
				InsHalfKana_ESC(tobufp, obj);
			}
			eucp++;							/* skip SS2	*/
			*tobufp++ = *eucp & 0x7F;
			eucp++;
#endif

		} else if (*eucp == SS3) {		/* JIS X 0212-1990				*/
			if (eucp+2 > eucep)			/* No 2nd & 3rd bytes in EUC buffer? */
				break;
			if (*(eucp+1) <= 0xA0 || *(eucp+2) <= 0xA0) { /* Invalid EUC212	*/
				if (INTL_GetCCCJismode(obj) != JIS_Roman) {
					InsASCII_ESC(tobufp, obj);
				}
				*tobufp++ = *eucp++;			/* process 1 byte as Roman */
			} else {
				if (INTL_GetCCCJismode(obj) != JIS_212_90) {
					Ins212_90_ESC(tobufp, obj);
				}
				eucp++;							/* skip SS3	*/
				*tobufp++ = *eucp & 0x7F;
				eucp++;
				*tobufp++ = *eucp & 0x7F;
				eucp++;
			}
		} else if (*eucp < 0xA0) {		/* Invalid EUC: treat as Roman	*/
			if (INTL_GetCCCJismode(obj) != JIS_Roman) {
				InsASCII_ESC(tobufp, obj);
			}
			*tobufp++ = *eucp++;
		} else {						/* JIS X 0208-1990				*/
			if (eucp+1 > eucep)			/* No 2nd byte in EUC buffer?	*/
				break;
			if (*(eucp+1) < 0xA0) {		/* 1st byte OK, check if 2nd is valid */
				if (INTL_GetCCCJismode(obj) != JIS_Roman) {
					InsASCII_ESC(tobufp, obj);
				}
				*tobufp++ = *eucp++;			/* process 1 byte as Roman	*/
			} else {
				if (INTL_GetCCCJismode(obj) != JIS_208_83) {
					Ins208_83_ESC(tobufp, obj);
				}
				*tobufp++ = *eucp & 0x7F;
				eucp++;
				*tobufp++ = *eucp & 0x7F;
				eucp++;
			}
		}
	}

	if (uncvtbuf[0] != '\0') {
										/* Just processed the merged bytes in
										 * uncvtbuf (only true on the first
										 * pass; it is cleared below before
										 * looping again).
										 */
		consumed = (int32)(eucp - uncvtbuf);

										/* If nothing was converted, there
										 * was not enough EUC data.  Stop and
										 * get more data; the merged bytes
										 * stay in uncvtbuf.  Release the
										 * dest buffer, which the caller
										 * never sees on this path.
										 */
		if (consumed == 0) {
			XP_FREE(tobuf);
			return(NULL);
		}

		if (consumed < uncvtlen) {
										/* Some previously saved bytes are
										 * still unconverted (e.g. an invalid
										 * lead byte was emitted alone and the
										 * rest is incomplete).  Mapping eucp
										 * back into eucbuf would give a
										 * negative offset, so keep the tail
										 * in uncvtbuf: slide it to the front
										 * and skip the eucbuf bytes that were
										 * already merged into it.
										 */
			savep = uncvtbuf;
			while (eucp <= eucep)
				*savep++ = *eucp++;
			*savep = '\0';
			eucp = (unsigned char *)eucbuf + merged;
			eucep = (unsigned char *)eucbuf + eucbufsz - 1;
		} else {
										/* Some of eucbuf may have been
										 * processed as part of the merged
										 * bytes; resume right after them so
										 * they are not processed twice.
										 */
			eucp = (unsigned char *)eucbuf + (consumed - uncvtlen);
			eucep = (unsigned char *)eucbuf + eucbufsz - 1;
			uncvtbuf[0] = '\0';		/* No more unconverted chars.	*/
			goto WHILELOOP;				/* Process new data				*/
		}
	}

										/* Always leave the output in Roman
										 * mode at the end of a chunk.
										 */
	if (INTL_GetCCCJismode(obj) != JIS_Roman) {
		INTL_SetCCCJismode(obj, JIS_Roman);
		InsASCII_ESC(tobufp, obj);
	}

	*tobufp = '\0';						/* null terminate dest. data */
	INTL_SetCCCLen(obj,  tobufp - tobuf);			/* length not counting null	*/

	if (eucp <= eucep) {				/* unconverted EUC?			*/
										/* Save it after anything already kept
										 * in uncvtbuf (normally nothing), and
										 * never write past the bound.
										 */
		savep = uncvtbuf + strlen((const char *)uncvtbuf);
		while (eucp <= eucep && savep < uncvtlimit)
			*savep++ = *eucp++;
		*savep = '\0';					/* null terminate			*/
	}
	return(tobuf);
}
Free the lizard 1998-03-28 02:44:41 +00:00			`/* -- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 --`
			`*`
			`* The contents of this file are subject to the Netscape Public License`
			`* Version 1.0 (the "NPL"); you may not use this file except in`
			`* compliance with the NPL. You may obtain a copy of the NPL at`
			`* http://www.mozilla.org/NPL/`
			`*`
			`* Software distributed under the NPL is distributed on an "AS IS" basis,`
			`* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the NPL`
			`* for the specific language governing rights and limitations under the`
			`* NPL.`
			`*`
			`* The Initial Developer of this code under the NPL is Netscape`
			`* Communications Corporation. Portions created by Netscape are`
			`* Copyright (C) 1998 Netscape Communications Corporation. All Rights`
			`* Reserved.`
			`*/`
			`/* euc2jis.c */`

			`#include "intlpriv.h"`
Normandy Merge .... 1998-06-22 21:21:28 +00:00			`#if defined(MOZ_MAIL_NEWS)`
Free the lizard 1998-03-28 02:44:41 +00:00			`#include "katakana.h"`
			`#endif`


			`extern int MK_OUT_OF_MEMORY;`


			`/* net_euc2jis(obj, eucbuf, eucbufsz)`
			`* Args:`
			`* eucbuf: Ptr to a buf of EUC chars`
			`* eucbufsz: Size in bytes of eucbuf`
			`* obj->eucmode: Ptr to encoding mode, use as arg for next call to`
			`* mz_euc2jis() for rest of current EUC data. First call should`
			`* initialize mode to ASCII (0).`
			`* uncvtbuf: If entire buffer was converted, uncvtbuf[0] will be nul,`
			`* else this points to EUC chars that were NOT converted`
			`* and mz_euc2jis() with additional EUC chars appended.`
			`* Return:`
			`* Returns NULL on failure, otherwise it returns a pointer to a buffer of`
			`* converted characters. Caller must XP_FREE() this memory.`
			`*`
			`* Description:`
			`*`
			`* Allocate destination buffer.`
			`*`
			`* Ouput JIS ESC sequences based upon which EUC code set.`
			`*`
			`* No conversion is needed for ASCII/JIS Roman characters.`
			`*`
			`* Clear 8th bit of 1-byte Half-width Katakana. Half-width Katakana`
			`* is not widely used and its ESC sequence may not be recognized`
			`* by some software. It's use on the internet is discouraged...`
			`*`
			`* Clear 8th bits of 2-byte JIS X 212-1990 chars. JIS-212`
			`* is not widely used and its ESC sequence may not be recognized`
			`* by some software. These chars do not have corresponding chars`
			`* in JIS-208 or SJIS.`
			`*`
			`* Clear 8th bits of 2-byte JIS X 208-1993 chars. These are the commonly`
			`* used chars (along with JIS-Roman).`
			`*`
			`* Bytes which do not fall in the EUC valid character codes are treated`
			`* like JIS-Roman.`
			`*`
			`* If either EUC buffer does not contain a complete EUC char or dest buffer`
			`* is full, then return unconverted EUC to caller. Caller should`
			`* append more data and recall mz_euc2jis.`
			`*/`


			`MODULE_PRIVATE unsigned char *`
			`mz_euc2jis( CCCDataObject obj,`
			`const unsigned char eucbuf, / EUC buffer for conversion */`
			`int32 eucbufsz) /* EUC buffer size in bytes */`
			`{`
			`unsigned char *tobuf = NULL;`
			`int32 tobufsz;`
			`register unsigned char tobufp, eucp; /* current byte in bufs */`
			`register unsigned char tobufep, eucep; /* end of buffers */`
			`int32 uncvtlen;`
			`unsigned char *uncvtbuf = INTL_GetCCCUncvtbuf(obj);`
Normandy Merge .... 1998-06-22 21:21:28 +00:00			`#if defined(MOZ_MAIL_NEWS)`
			`unsigned char kanabuf[4]; /* for half-width kana */`
			`uint32 byteused; /* for half-width kana */`
Free the lizard 1998-03-28 02:44:41 +00:00			`#endif`

			`/* Allocate a dest buffer: */`
			`/* JIS is longer than EUC because of ESC seq. In the worst case`
			`* ( <SS2> <Half-width Kana> <Roman> ... ), the converted JIS will`
			`* be 2-2/3 times the size of the original EUC + 1 for nul byte.`
			`* Worst case: single half-width kana:`
			`* ESC ( I KANA ESC ( J`
			`*/`
			`uncvtlen = strlen((const char *)uncvtbuf);`
			`/* 3 times length of EUC */`
			`tobufsz = eucbufsz + uncvtlen + ((eucbufsz + uncvtlen)<<2) + 8;`
			`if ((tobuf = (unsigned char )XP_ALLOC(tobufsz)) == (unsigned char )NULL) {`
			`INTL_SetCCCRetval(obj, MK_OUT_OF_MEMORY);`
			`return(NULL);`
			`}`
			`/* Initialize pointers, etc. */`
			`eucp = (unsigned char *)eucbuf;`
			`eucep = eucp + eucbufsz - 1;`

			`#define uncvtp tobufp /* use tobufp as temp */`
			`/* If prev. unconverted chars, append unconverted`
			`* chars w/new chars and try to process.`
			`*/`
			`if (uncvtbuf[0] != '\0') {`
			`uncvtp = (unsigned char *)uncvtbuf + uncvtlen;`
			`while (uncvtp < ((unsigned char *)uncvtbuf + UNCVTBUF_SIZE) &&`
			`eucp <= eucep)`
			`uncvtp++ = eucp++;`
			`uncvtp = '\0'; / nul terminate */`
			`eucp = (unsigned char )uncvtbuf; / process unconverted first */`
			`eucep = uncvtp - 1;`
			`}`
			`#undef uncvtp`

			`tobufp = tobuf;`
			`tobufep = tobufp + tobufsz - 2; /* save space for terminating null */`

			`WHILELOOP:`
			`/* While EUC data && space in dest. buf. */`
			`while ((tobufp <= tobufep) && (eucp <= eucep)) {`
			`if (eucp < SS2) { / ASCII/JIS-Roman or invalid EUC */`
			`if (INTL_GetCCCJismode(obj) != JIS_Roman) {`
			`InsASCII_ESC(tobufp, obj);`
			`}`
			`tobufp++ = eucp++;`
			`} else if (eucp == SS2) { / Half-width Katakana */`
			`if (eucp+1 > eucep) /* No 2nd byte in EUC buffer? */`
			`break;`
Normandy Merge .... 1998-06-22 21:21:28 +00:00			`#if defined(MOZ_MAIL_NEWS)`
			`if (!INTL_GetCCCCvtflag_SendHankakuKana(obj)) {`
			`if (INTL_GetCCCJismode(obj) != JIS_208_83) {`
			`Ins208_83_ESC(tobufp, obj);`
			`}`
			`INTL_SjisHalf2FullKana(eucp, (uint32)eucep - (uint32)eucp + 1, kanabuf, &byteused);`
			`*tobufp++ = kanabuf[0] & 0x7F;`
			`*tobufp++ = kanabuf[1] & 0x7F;`
			`eucp += byteused;`
			`} else {`
			`if (INTL_GetCCCJismode(obj) != JIS_HalfKana) {`
			`InsHalfKana_ESC(tobufp, obj);`
			`}`
			`eucp++; /* skip SS2 */`
			`tobufp++ = eucp & 0x7F;`
			`eucp++;`
Free the lizard 1998-03-28 02:44:41 +00:00			`}`
			`#else`
			`if (INTL_GetCCCJismode(obj) != JIS_HalfKana) {`
			`InsHalfKana_ESC(tobufp, obj);`
			`}`
			`eucp++; /* skip SS2 */`
			`tobufp++ = eucp & 0x7F;`
			`eucp++;`
			`#endif`

			`} else if (eucp == SS3) { / JIS X 0212-1990 */`
			`if (eucp+2 > eucep) /* No 2nd & 3rd bytes in EUC buffer? */`
			`break;`
			`if ((eucp+1) <= 0xA0 \|\| (eucp+2) <= 0xA0) { /* Invalid EUC212 */`
			`if (INTL_GetCCCJismode(obj) != JIS_Roman) {`
			`InsASCII_ESC(tobufp, obj);`
			`}`
			`tobufp++ = eucp++; /* process 1 byte as Roman */`
			`} else {`
			`if (INTL_GetCCCJismode(obj) != JIS_212_90) {`
			`Ins212_90_ESC(tobufp, obj);`
			`}`
			`eucp++; /* skip SS3 */`
			`tobufp++ = eucp & 0x7F;`
			`eucp++;`
			`tobufp++ = eucp & 0x7F;`
			`eucp++;`
			`}`
			`} else if (eucp < 0xA0) { / Invalid EUC: treat as Roman */`
			`if (INTL_GetCCCJismode(obj) != JIS_Roman) {`
			`InsASCII_ESC(tobufp, obj);`
			`}`
			`tobufp++ = eucp++;`
			`} else { /* JIS X 0208-1990 */`
			`if (eucp+1 > eucep) /* No 2nd byte in EUC buffer? */`
			`break;`
			`if ((eucp+1) < 0xA0) { / 1st byte OK, check if 2nd is valid */`
			`if (INTL_GetCCCJismode(obj) != JIS_Roman) {`
			`InsASCII_ESC(tobufp, obj);`
			`}`
			`tobufp++ = eucp++; /* process 1 byte as Roman */`
			`} else {`
			`if (INTL_GetCCCJismode(obj) != JIS_208_83) {`
			`Ins208_83_ESC(tobufp, obj);`
			`}`
			`tobufp++ = eucp & 0x7F;`
			`eucp++;`
			`tobufp++ = eucp & 0x7F;`
			`eucp++;`
			`}`
			`}`
			`}`

			`if (uncvtbuf[0] != '\0') {`
			`/* Just processed unconverted chars:`
			`* eucp pts to 1st unprocessed char in`
			`* eucbuf. Some may have been processed`
			`* while processing unconverted chars,`
			`* so set up ptrs not to process them`
			`* twice.`
			`*/`
			`/* If nothing was converted, this can`
			`* only happen if there was not`
			`* enough EUC data. Stop and get`
			`* more data.`
			`*/`
			`if (eucp == (unsigned char )uncvtbuf) { / Nothing converted */`
			`*tobufp = '\0';`
			`return(NULL);`
			`}`
			`eucp = (unsigned char *)eucbuf +`
			`(eucp - (unsigned char *)uncvtbuf - uncvtlen);`
			`eucep = (unsigned char )eucbuf + eucbufsz - 1; / save space for nul */`
			`uncvtbuf[0] = '\0'; /* No more uncoverted chars. */`
			`goto WHILELOOP; /* Process new data */`
			`}`

			`if (INTL_GetCCCJismode(obj) != JIS_Roman) {`
			`INTL_SetCCCJismode(obj, JIS_Roman);`
			`InsASCII_ESC(tobufp, obj);`
			`}`

			`tobufp = '\0'; / null terminate dest. data */`
			`INTL_SetCCCLen(obj, tobufp - tobuf); /* length not counting null */`

			`if (eucp <= eucep) { /* uncoverted EUC? */`
			`tobufp = (unsigned char )uncvtbuf;/ reuse the tobufp as a TEMP */`
			`while (eucp <= eucep)`
			`tobufp++ = eucp++;`
			`tobufp = '\0'; / null terminate */`
			`}`
			`return(tobuf);`
			`}`