gecko-dev/intl/unicharutil/nsUnicodeNormalizer.cpp

/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */

/* This file is modified from JPNIC's mDNKit, it is under both MPL and
 * JPNIC's license.
 */

/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/*
 * Copyright (c) 2000,2002 Japan Network Information Center.
 * All rights reserved.
 *
 * By using this file, you agree to the terms and conditions set forth bellow.
 *
 * 			LICENSE TERMS AND CONDITIONS
 *
 * The following License Terms and Conditions apply, unless a different
 * license is obtained from Japan Network Information Center ("JPNIC"),
 * a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda,
 * Chiyoda-ku, Tokyo 101-0047, Japan.
 *
 * 1. Use, Modification and Redistribution (including distribution of any
 *    modified or derived work) in source and/or binary forms is permitted
 *    under this License Terms and Conditions.
 *
 * 2. Redistribution of source code must retain the copyright notices as they
 *    appear in each source code file, this License Terms and Conditions.
 *
 * 3. Redistribution in binary form must reproduce the Copyright Notice,
 *    this License Terms and Conditions, in the documentation and/or other
 *    materials provided with the distribution.  For the purposes of binary
 *    distribution the "Copyright Notice" refers to the following language:
 *    "Copyright (c) 2000-2002 Japan Network Information Center.  All rights reserved."
 *
 * 4. The name of JPNIC may not be used to endorse or promote products
 *    derived from this Software without specific prior written approval of
 *    JPNIC.
 *
 * 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC
 *    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 *    PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL JPNIC BE LIABLE
 *    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 *    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 *    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 *    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 *    WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 *    OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 *    ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
 */

#include <string.h>

#include "nsMemory.h"
#include "nsUnicodeNormalizer.h"
#include "nsString.h"
#include "mozilla/BinarySearch.h"

NS_IMPL_ISUPPORTS(nsUnicodeNormalizer, nsIUnicodeNormalizer)


nsUnicodeNormalizer::nsUnicodeNormalizer()
{
}

nsUnicodeNormalizer::~nsUnicodeNormalizer()
{
}


#define END_BIT		0x80000000


/*
 * Some constants for Hangul decomposition/composition.
 * These things were taken from unicode book.
 */
#define SBase		0xac00
#define LBase		0x1100
#define VBase		0x1161
#define TBase		0x11a7
#define LCount		19
#define VCount		21
#define TCount		28
#define SLast		(SBase + LCount * VCount * TCount)

struct composition {
	uint32_t c2;	/* 2nd character */
	uint32_t comp;	/* composed character */
};


#include "normalization_data.h"

/*
 * Macro for multi-level index table.
 */
#define LOOKUPTBL(vprefix, mprefix, v) \
	DMAP(vprefix)[\
		IMAP(vprefix)[\
			IMAP(vprefix)[IDX0(mprefix, v)] + IDX1(mprefix, v)\
		]\
	].tbl[IDX2(mprefix, v)]

#define IDX0(mprefix, v) IDX_0(v, BITS1(mprefix), BITS2(mprefix))
#define IDX1(mprefix, v) IDX_1(v, BITS1(mprefix), BITS2(mprefix))
#define IDX2(mprefix, v) IDX_2(v, BITS1(mprefix), BITS2(mprefix))

#define IDX_0(v, bits1, bits2)	((v) >> ((bits1) + (bits2)))
#define IDX_1(v, bits1, bits2)	(((v) >> (bits2)) & ((1 << (bits1)) - 1))
#define IDX_2(v, bits1, bits2)	((v) & ((1 << (bits2)) - 1))

#define BITS1(mprefix)	mprefix ## _BITS_1
#define BITS2(mprefix)	mprefix ## _BITS_2

#define IMAP(vprefix)	vprefix ## _imap
#define DMAP(vprefix)	vprefix ## _table
#define SEQ(vprefix)	vprefix ## _seq

static int32_t
canonclass(uint32_t c) {
	/* Look up canonicalclass table. */
	return (LOOKUPTBL(canon_class, CANON_CLASS, c));
}

static int32_t
decompose_char(uint32_t c, const uint32_t **seqp)
{
	/* Look up decomposition table. */
	int32_t seqidx = LOOKUPTBL(decompose, DECOMP, c);
	*seqp = SEQ(decompose) + (seqidx & ~DECOMP_COMPAT);
	return (seqidx);
}

static int32_t
compose_char(uint32_t c,
				const struct composition **compp)
{
	/* Look up composition table. */
	int32_t seqidx = LOOKUPTBL(compose, CANON_COMPOSE, c);
	*compp = SEQ(compose) + (seqidx & 0xffff);
	return (seqidx >> 16);
}

static nsresult
mdn__unicode_decompose(int32_t compat, uint32_t *v, size_t vlen,
		       uint32_t c, int32_t *decomp_lenp)
{
	uint32_t *vorg = v;
	int32_t seqidx;
	const uint32_t *seq;

	//assert(v != nullptr && vlen >= 0 && decomp_lenp != nullptr);

	/*
	 * First, check for Hangul.
	 */
	if (SBase <= c && c < SLast) {
		int32_t idx, t_offset, v_offset, l_offset;

		idx = c - SBase;
		t_offset = idx % TCount;
		idx /= TCount;
		v_offset = idx % VCount;
		l_offset = idx / VCount;
		if ((t_offset == 0 && vlen < 2) || (t_offset > 0 && vlen < 3))
			return (NS_ERROR_UNORM_MOREOUTPUT);
		*v++ = LBase + l_offset;
		*v++ = VBase + v_offset;
		if (t_offset > 0)
			*v++ = TBase + t_offset;
		*decomp_lenp = v - vorg;
		return (NS_OK);
	}

	/*
	 * Look up decomposition table.  If no decomposition is defined
	 * or if it is a compatibility decomosition when canonical
	 * decomposition requested, return 'NS_SUCCESS_UNORM_NOTFOUND'.
	 */
	seqidx = decompose_char(c, &seq);
	if (seqidx == 0 || (compat == 0 && (seqidx & DECOMP_COMPAT) != 0))
		return (NS_SUCCESS_UNORM_NOTFOUND);

	/*
	 * Copy the decomposed sequence.  The end of the sequence are
	 * marked with END_BIT.
	 */
	do {
		uint32_t c;
		int32_t dlen;
		nsresult r;

		c = *seq & ~END_BIT;

		/* Decompose recursively. */
		r = mdn__unicode_decompose(compat, v, vlen, c, &dlen);
		if (r == NS_OK) {
			v += dlen;
			vlen -= dlen;
		} else if (r == NS_SUCCESS_UNORM_NOTFOUND) {
			if (vlen < 1)
				return (NS_ERROR_UNORM_MOREOUTPUT);
			*v++ = c;
			vlen--;
		} else {
			return (r);
		}

	} while ((*seq++ & END_BIT) == 0);

	*decomp_lenp = v - vorg;

	return (NS_OK);
}

static int32_t
mdn__unicode_iscompositecandidate(uint32_t c)
{
	const struct composition *dummy;

	/* Check for Hangul */
	if ((LBase <= c && c < LBase + LCount) || (SBase <= c && c < SLast))
		return (1);

	/*
	 * Look up composition table.  If there are no composition
	 * that begins with the given character, it is not a
	 * composition candidate.
	 */
	if (compose_char(c, &dummy) == 0)
		return (0);
	else
		return (1);
}

namespace {

struct SequenceAdaptor
{
	const composition* const mSequence;
	explicit SequenceAdaptor(const composition* aSequence) : mSequence(aSequence) {}
	uint32_t operator[](size_t aIdx) const {
		return mSequence[aIdx].c2;
	}
};

} // namespace

static nsresult
mdn__unicode_compose(uint32_t c1, uint32_t c2, uint32_t *compp)
{
	int32_t n;
	const struct composition *cseq;

	//assert(compp != nullptr);

	/*
	 * Check for Hangul.
	 */
	if (LBase <= c1 && c1 < LBase + LCount &&
	    VBase <= c2 && c2 < VBase + VCount) {
		/*
		 * Hangul L and V.
		 */
		*compp = SBase +
			((c1 - LBase) * VCount + (c2 - VBase)) * TCount;
		return (NS_OK);
	} else if (SBase <= c1 && c1 < SLast &&
		   TBase <= c2 && c2 < TBase + TCount &&
		   (c1 - SBase) % TCount == 0) {
		/*
		 * Hangul LV and T.
		 */
		*compp = c1 + (c2 - TBase);
		return (NS_OK);
	}

	/*
	 * Look up composition table.  If the result is 0, no composition
	 * is defined.  Otherwise, upper 16bits of the result contains
	 * the number of composition that begins with 'c1', and the lower
	 * 16bits is the offset in 'compose_seq'.
	 */
	if ((n = compose_char(c1, &cseq)) == 0)
		return (NS_SUCCESS_UNORM_NOTFOUND);

	/*
	 * The composite sequences are sorted by the 2nd character 'c2'.
	 * So we can use binary search.
	 */
	size_t idx;
	if (mozilla::BinarySearch(SequenceAdaptor(cseq), 0, n, c2, &idx)) {
		*compp = cseq[idx].comp;
		return (NS_OK);
	}

	return (NS_SUCCESS_UNORM_NOTFOUND);
}


#define WORKBUF_SIZE		128
#define WORKBUF_SIZE_MAX	10000

typedef struct {
	int32_t cur;		/* pointing now processing character */
	int32_t last;		/* pointing just after the last character */
	int32_t size;		/* size of UCS and CLASS array */
	uint32_t *ucs;	/* UCS-4 characters */
	int32_t *cclass;		/* and their canonical classes */
	uint32_t ucs_buf[WORKBUF_SIZE];	/* local buffer */
	int32_t class_buf[WORKBUF_SIZE];		/* ditto */
} workbuf_t;

static nsresult	decompose(workbuf_t *wb, uint32_t c, int32_t compat);
static void		get_class(workbuf_t *wb);
static void		reorder(workbuf_t *wb);
static void		compose(workbuf_t *wb);
static nsresult flush_before_cur(workbuf_t *wb, nsAString& aToStr);
static void		workbuf_init(workbuf_t *wb);
static void		workbuf_free(workbuf_t *wb);
static nsresult	workbuf_extend(workbuf_t *wb);
static nsresult	workbuf_append(workbuf_t *wb, uint32_t c);
static void		workbuf_shift(workbuf_t *wb, int32_t shift);
static void		workbuf_removevoid(workbuf_t *wb);


static nsresult
mdn_normalize(bool do_composition, bool compat,
	  const nsAString& aSrcStr, nsAString& aToStr)
{
	workbuf_t wb;
	nsresult r = NS_OK;
	/*
	 * Initialize working buffer.
	 */
	workbuf_init(&wb);

	nsAString::const_iterator start, end;
	aSrcStr.BeginReading(start);
	aSrcStr.EndReading(end);

	while (start != end) {
		uint32_t c;
		char16_t curChar;

		//assert(wb.cur == wb.last);

		/*
		 * Get one character from 'from'.
		 */
		curChar= *start++;

		if (NS_IS_HIGH_SURROGATE(curChar) && start != end && NS_IS_LOW_SURROGATE(*(start)) ) {
			c = SURROGATE_TO_UCS4(curChar, *start);
			++start;
		} else {
			c = curChar;
		}

		/*
		 * Decompose it.
		 */
		if ((r = decompose(&wb, c, compat)) != NS_OK)
			break;

		/*
		 * Get canonical class.
		 */
		get_class(&wb);

		/*
		 * Reorder & compose.
		 */
		for (; wb.cur < wb.last; wb.cur++) {
			if (wb.cur == 0) {
				continue;
			} else if (wb.cclass[wb.cur] > 0) {
				/*
				 * This is not a starter. Try reordering.
				 * Note that characters up to it are
				 * already in canonical order.
				 */
				reorder(&wb);
				continue;
			}

			/*
			 * This is a starter character, and there are
			 * some characters before it.  Those characters
			 * have been reordered properly, and
			 * ready for composition.
			 */
			if (do_composition && wb.cclass[0] == 0)
				compose(&wb);

			/*
			 * If CUR points to a starter character,
			 * then process of characters before CUR are
			 * already finished, because any further
			 * reordering/composition for them are blocked
			 * by the starter CUR points.
			 */
			if (wb.cur > 0 && wb.cclass[wb.cur] == 0) {
				/* Flush everything before CUR. */
				r = flush_before_cur(&wb, aToStr);
				if (r != NS_OK)
					break;
			}
		}
	}

	if (r == NS_OK) {
		if (do_composition && wb.cur > 0 && wb.cclass[0] == 0) {
			/*
			 * There is some characters left in WB.
			 * They are ordered, but not composed yet.
			 * Now CUR points just after the last character in WB,
			 * and since compose() tries to compose characters
			 * between top and CUR inclusive, we must make CUR
			 * one character back during compose().
			 */
			wb.cur--;
			compose(&wb);
			wb.cur++;
		}
		/*
		 * Call this even when WB.CUR == 0, to make TO
		 * NUL-terminated.
		 */
		r = flush_before_cur(&wb, aToStr);
	}

	workbuf_free(&wb);

	return (r);
}

static nsresult
decompose(workbuf_t *wb, uint32_t c, int32_t compat) {
	nsresult r;
	int32_t dec_len;

again:
	r = mdn__unicode_decompose(compat, wb->ucs + wb->last,
				   wb->size - wb->last, c, &dec_len);
	switch (r) {
	case NS_OK:
		wb->last += dec_len;
		return (NS_OK);
	case NS_SUCCESS_UNORM_NOTFOUND:
		return (workbuf_append(wb, c));
	case NS_ERROR_UNORM_MOREOUTPUT:
		if ((r = workbuf_extend(wb)) != NS_OK)
			return (r);
		if (wb->size > WORKBUF_SIZE_MAX) {
			// "mdn__unormalize_form*: " "working buffer too large\n"
			return (NS_ERROR_FAILURE);
		}
		goto again;
	default:
		return (r);
	}
	/* NOTREACHED */
}

static void
get_class(workbuf_t *wb) {
	int32_t i;

	for (i = wb->cur; i < wb->last; i++)
		wb->cclass[i] = canonclass(wb->ucs[i]);
}

static void
reorder(workbuf_t *wb) {
	uint32_t c;
	int32_t i;
	int32_t cclass;

	//assert(wb != nullptr);

	i = wb->cur;
	c = wb->ucs[i];
	cclass = wb->cclass[i];

	while (i > 0 && wb->cclass[i - 1] > cclass) {
		wb->ucs[i] = wb->ucs[i - 1];
		wb->cclass[i] =wb->cclass[i - 1];
		i--;
		wb->ucs[i] = c;
		wb->cclass[i] = cclass;
	}
}

static void
compose(workbuf_t *wb) {
	int32_t cur;
	uint32_t *ucs;
	int32_t *cclass;
	int32_t last_class;
	int32_t nvoids;
	int32_t i;

	//assert(wb != nullptr && wb->cclass[0] == 0);

	cur = wb->cur;
	ucs = wb->ucs;
	cclass = wb->cclass;

	/*
	 * If there are no decomposition sequence that begins with
	 * the top character, composition is impossible.
	 */
	if (!mdn__unicode_iscompositecandidate(ucs[0]))
		return;

	last_class = 0;
	nvoids = 0;
	for (i = 1; i <= cur; i++) {
		uint32_t c;
		int32_t cl = cclass[i];

		if ((last_class < cl || cl == 0) &&
		    mdn__unicode_compose(ucs[0], ucs[i],
					 &c) == NS_OK) {
			/*
			 * Replace the top character with the composed one.
			 */
			ucs[0] = c;
			cclass[0] = canonclass(c);

			cclass[i] = -1;	/* void this character */
			nvoids++;
		} else {
			last_class = cl;
		}
	}

	/* Purge void characters, if any. */
	if (nvoids > 0)
		workbuf_removevoid(wb);
}

static nsresult
flush_before_cur(workbuf_t *wb, nsAString& aToStr)
{
	int32_t i;

	for (i = 0; i < wb->cur; i++) {
		if (!IS_IN_BMP(wb->ucs[i])) {
			aToStr.Append((char16_t)H_SURROGATE(wb->ucs[i]));
			aToStr.Append((char16_t)L_SURROGATE(wb->ucs[i]));
		} else {
			aToStr.Append((char16_t)(wb->ucs[i]));
		}
	}

	workbuf_shift(wb, wb->cur);

	return (NS_OK);
}

static void
workbuf_init(workbuf_t *wb) {
	wb->cur = 0;
	wb->last = 0;
	wb->size = WORKBUF_SIZE;
	wb->ucs = wb->ucs_buf;
	wb->cclass = wb->class_buf;
}

static void
workbuf_free(workbuf_t *wb) {
	if (wb->ucs != wb->ucs_buf) {
		free(wb->ucs);
		free(wb->cclass);
	}
}

static nsresult
workbuf_extend(workbuf_t *wb) {
	int32_t newsize = wb->size * 3;

	if (wb->ucs == wb->ucs_buf) {
		wb->ucs = (uint32_t*)moz_xmalloc(sizeof(wb->ucs[0]) * newsize);
		if (!wb->ucs)
			return NS_ERROR_OUT_OF_MEMORY;
		wb->cclass = (int32_t*)moz_xmalloc(sizeof(wb->cclass[0]) * newsize);
		if (!wb->cclass) {
			free(wb->ucs);
			wb->ucs = nullptr;
			return NS_ERROR_OUT_OF_MEMORY;
		}
	} else {
		void* buf = moz_xrealloc(wb->ucs, sizeof(wb->ucs[0]) * newsize);
		if (!buf)
			return NS_ERROR_OUT_OF_MEMORY;
		wb->ucs = (uint32_t*)buf;
		buf = moz_xrealloc(wb->cclass, sizeof(wb->cclass[0]) * newsize);
		if (!buf)
			return NS_ERROR_OUT_OF_MEMORY;
		wb->cclass = (int32_t*)buf;
	}
	return (NS_OK);
}

static nsresult
workbuf_append(workbuf_t *wb, uint32_t c) {
	nsresult r;

	if (wb->last >= wb->size && (r = workbuf_extend(wb)) != NS_OK)
		return (r);
	wb->ucs[wb->last++] = c;
	return (NS_OK);
}

static void
workbuf_shift(workbuf_t *wb, int32_t shift) {
	int32_t nmove;

	//assert(wb != nullptr && wb->cur >= shift);

	nmove = wb->last - shift;
	memmove(&wb->ucs[0], &wb->ucs[shift],
		      nmove * sizeof(wb->ucs[0]));
	memmove(&wb->cclass[0], &wb->cclass[shift],
		      nmove * sizeof(wb->cclass[0]));
	wb->cur -= shift;
	wb->last -= shift;
}

static void
workbuf_removevoid(workbuf_t *wb) {
	int32_t i, j;
	int32_t last = wb->last;

	for (i = j = 0; i < last; i++) {
		if (wb->cclass[i] >= 0) {
			if (j < i) {
				wb->ucs[j] = wb->ucs[i];
				wb->cclass[j] = wb->cclass[i];
			}
			j++;
		}
	}
	wb->cur -= last - j;
	wb->last = j;
}

nsresult
nsUnicodeNormalizer::NormalizeUnicodeNFD( const nsAString& aSrc, nsAString& aDest)
{
  return mdn_normalize(false, false, aSrc, aDest);
}

nsresult
nsUnicodeNormalizer::NormalizeUnicodeNFC( const nsAString& aSrc, nsAString& aDest)
{
  return mdn_normalize(true, false, aSrc, aDest);
}

nsresult
nsUnicodeNormalizer::NormalizeUnicodeNFKD( const nsAString& aSrc, nsAString& aDest)
{
  return mdn_normalize(false, true, aSrc, aDest);
}

nsresult
nsUnicodeNormalizer::NormalizeUnicodeNFKC( const nsAString& aSrc, nsAString& aDest)
{
  return mdn_normalize(true, true, aSrc, aDest);
}

bool
nsUnicodeNormalizer::Compose(uint32_t a, uint32_t b, uint32_t *ab)
{
  return mdn__unicode_compose(a, b, ab) == NS_OK;
}

bool
nsUnicodeNormalizer::DecomposeNonRecursively(uint32_t c, uint32_t *c1, uint32_t *c2)
{
  // We can't use mdn__unicode_decompose here, because that does a recursive
  // decomposition that may yield more than two characters, but the harfbuzz
  // callback wants just a single-step decomp that is guaranteed to produce
  // no more than two characters. So we do a low-level lookup in the table
  // of decomp sequences.
  const uint32_t *seq;
  uint32_t seqidx = decompose_char(c, &seq);
  if (seqidx == 0 || ((seqidx & DECOMP_COMPAT) != 0)) {
    return false;
  }
  *c1 = *seq & ~END_BIT;
  if (*seq & END_BIT) {
    *c2 = 0;
  } else {
    *c2 = *++seq & ~END_BIT;
  }
  return true;
}