gecko-dev/intl/uconv/nsUnicodeToUTF8.cpp

/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

//----------------------------------------------------------------------
// Global functions and data [declaration]
#include "nsUnicodeToUTF8.h"
#include "mozilla/CheckedInt.h"

// XPCOM boilerplate macro invoked for nsUnicodeToUTF8 with nsIUnicodeEncoder as
// the listed interface. NOTE(review): presumably this expands to the
// nsISupports methods (AddRef/Release/QueryInterface) for the class -- confirm
// against the macro definition, which is not visible in this file.
NS_IMPL_ISUPPORTS(nsUnicodeToUTF8, nsIUnicodeEncoder)

//----------------------------------------------------------------------
// nsUnicodeToUTF8 class [implementation]

/**
 * Report an upper bound on the number of UTF-8 bytes needed to encode
 * aSrcLength UTF-16 code units.
 *
 * @param aSrc        Source buffer; not inspected by this function.
 * @param aSrcLength  Number of UTF-16 code units that will be converted.
 * @param aDestLength [out] Receives aSrcLength * 3 + 3. Must not be null.
 * @return NS_OK on success, or NS_ERROR_OUT_OF_MEMORY when the bound does not
 *         fit in an int32_t (in which case *aDestLength is left untouched).
 */
NS_IMETHODIMP nsUnicodeToUTF8::GetMaxLength(const char16_t* aSrc,
                                            int32_t aSrcLength,
                                            int32_t* aDestLength)
{
  MOZ_ASSERT(aDestLength);

  // Every UTF-16 code unit expands to at most 3 UTF-8 bytes. The extra 3 bytes
  // cover a high surrogate left pending by a previous buffer: if the first
  // code unit of this buffer does not complete the pair, the pending half is
  // written out as a 3-byte sequence of its own.
  const mozilla::CheckedInt32 worstCase =
    mozilla::CheckedInt32(aSrcLength) * 3 + 3;

  if (worstCase.isValid()) {
    *aDestLength = worstCase.value();
    return NS_OK;
  }

  return NS_ERROR_OUT_OF_MEMORY;
}

/**
 * Encode a run of UTF-16 code units as UTF-8.
 *
 * The encoder is stateful across calls: when the input ends on a high
 * surrogate (0xD800-0xDBFF) that code unit is stashed in mHighSurrogate and
 * combined with the first code unit of the next call. Unpaired surrogates are
 * replaced with U+FFFD (bytes EF BF BD).
 *
 * @param aSrc        UTF-16 input.
 * @param aSrcLength  [in] number of code units available in aSrc.
 *                    [out] on NS_OK_UENC_MOREOUTPUT, the number of code units
 *                    consumed; otherwise left unchanged (all input consumed).
 * @param aDest       Output buffer for UTF-8 bytes.
 * @param aDestLength [in] capacity of aDest in bytes.
 *                    [out] number of bytes written to aDest.
 * @return NS_OK when all input was encoded;
 *         NS_OK_UENC_MOREINPUT when the input ended in the middle of a
 *         surrogate pair (the high half is kept for the next call);
 *         NS_OK_UENC_MOREOUTPUT when aDest is too small for the next
 *         character.
 */
NS_IMETHODIMP nsUnicodeToUTF8::Convert(const char16_t* aSrc,
                                       int32_t* aSrcLength,
                                       char* aDest,
                                       int32_t* aDestLength)
{
  const char16_t* src = aSrc;
  const char16_t* srcEnd = aSrc + *aSrcLength;
  char* dest = aDest;
  int32_t destLen = *aDestLength;
  // Declared up front (uninitialized) so that the gotos below do not jump over
  // an initialization.
  uint32_t n;

  // Complete the surrogate pair left pending by the previous conversion.
  if (mHighSurrogate) {
    if (src >= srcEnd) {
      // No input to pair the pending high surrogate with: keep waiting.
      // This check must happen before *src is read below.
      *aDestLength = 0;
      return NS_OK_UENC_MOREINPUT;
    }
    if (*aDestLength < 4) {
      // Worst case for this step is a 4-byte sequence; nothing is consumed.
      *aSrcLength = 0;
      *aDestLength = 0;
      return NS_OK_UENC_MOREOUTPUT;
    }
    if (*src < (char16_t)0xdc00 || *src > (char16_t)0xdfff) { //not a pair
      // The pending high surrogate is unpaired. *src is deliberately not
      // consumed here; the main loop below encodes it.
      *dest++ = (char)0xef; //replacement character
      *dest++ = (char)0xbf;
      *dest++ = (char)0xbd;
      destLen -= 3;
    } else {
      // Combine the halves into a supplementary-plane code point and emit it
      // as a 4-byte sequence.
      n = ((mHighSurrogate - (char16_t)0xd800) << 10) +
              (*src - (char16_t)0xdc00) + 0x10000;
      *dest++ = (char)0xf0 | (n >> 18);
      *dest++ = (char)0x80 | ((n >> 12) & 0x3f);
      *dest++ = (char)0x80 | ((n >> 6) & 0x3f);
      *dest++ = (char)0x80 | (n & 0x3f);
      ++src;
      destLen -= 4;
    }
    mHighSurrogate = 0;
  }

  while (src < srcEnd) {
    if ( *src <= 0x007f) {
      // U+0000..U+007F: one byte.
      if (destLen < 1)
        goto error_more_output;
      *dest++ = (char)*src;
      --destLen;
    } else if (*src <= 0x07ff) {
      // U+0080..U+07FF: two bytes.
      if (destLen < 2)
        goto error_more_output;
      *dest++ = (char)0xc0 | (*src >> 6);
      *dest++ = (char)0x80 | (*src & 0x003f);
      destLen -= 2;
    } else if (*src >= (char16_t)0xd800 && *src <= (char16_t)0xdfff) {
      if (*src >= (char16_t)0xdc00) { //not a pair
        // Low surrogate with no preceding high surrogate.
        if (destLen < 3)
          goto error_more_output;
        *dest++ = (char)0xef; //replacement character
        *dest++ = (char)0xbf;
        *dest++ = (char)0xbd;
        destLen -= 3;
        ++src;
        continue;
      }
      if ((src+1) >= srcEnd) {
        // The high surrogate is the last code unit of this buffer: remember it
        // and ask for the other half. *aSrcLength is left as is because the
        // whole input, including this code unit, counts as consumed.
        mHighSurrogate = *src;
        *aDestLength = dest - aDest;
        return NS_OK_UENC_MOREINPUT;
      }
      //handle surrogate
      if (destLen < 4)
        goto error_more_output;
      if (*(src+1) < (char16_t)0xdc00 || *(src+1) > 0xdfff) { //not a pair
        // Only the high surrogate is consumed (by the ++src at the bottom of
        // the loop); the following code unit is encoded on the next iteration.
        *dest++ = (char)0xef; //replacement character
        *dest++ = (char)0xbf;
        *dest++ = (char)0xbd;
        destLen -= 3;
      } else {
        n = ((*src - (char16_t)0xd800) << 10) + (*(src+1) - (char16_t)0xdc00) + (uint32_t)0x10000;
        *dest++ = (char)0xf0 | (n >> 18);
        *dest++ = (char)0x80 | ((n >> 12) & 0x3f);
        *dest++ = (char)0x80 | ((n >> 6) & 0x3f);
        *dest++ = (char)0x80 | (n & 0x3f);
        destLen -= 4;
        // Skip the low surrogate; the high one is skipped at the loop bottom.
        ++src;
      }
    } else {
      if (destLen < 3)
        goto error_more_output;
      // Remaining BMP code units (U+0800..U+FFFF minus surrogates): 3 bytes.
      *dest++ = (char)0xe0 | (*src >> 12);
      *dest++ = (char)0x80 | ((*src >> 6) & 0x003f);
      *dest++ = (char)0x80 | (*src & 0x003f);
      destLen -= 3;
    }
    ++src;
  }

  *aDestLength = dest - aDest;
  return NS_OK;

error_more_output:
  // Report how far we got so the caller can resume with a larger buffer.
  *aSrcLength = src - aSrc;
  *aDestLength = dest - aDest;
  return NS_OK_UENC_MOREOUTPUT;
}

/**
 * Flush encoder state at the end of the input stream.
 *
 * If a high surrogate is still pending in mHighSurrogate it can no longer be
 * paired, so U+FFFD (bytes EF BF BD) is written in its place and the pending
 * state is cleared.
 *
 * @param aDest       Output buffer.
 * @param aDestLength [in] capacity of aDest in bytes.
 *                    [out] number of bytes written (0 or 3).
 * @return NS_OK on success, or NS_OK_UENC_MOREOUTPUT when a surrogate is
 *         pending but aDest has room for fewer than 3 bytes (the pending state
 *         is kept in that case).
 */
NS_IMETHODIMP nsUnicodeToUTF8::Finish(char * aDest, int32_t * aDestLength)
{
  // Nothing pending: nothing to flush.
  if (!mHighSurrogate) {
    *aDestLength = 0;
    return NS_OK;
  }

  // The replacement character needs three bytes of output.
  if (*aDestLength < 3) {
    *aDestLength = 0;
    return NS_OK_UENC_MOREOUTPUT;
  }

  // U+FFFD REPLACEMENT CHARACTER in UTF-8.
  aDest[0] = (char)0xef;
  aDest[1] = (char)0xbf;
  aDest[2] = (char)0xbd;

  mHighSurrogate = 0;
  *aDestLength = 3;
  return NS_OK;
}