gecko-dev/intl/uconv/ucvlatin/nsUnicodeToTSCII.cpp

/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim:expandtab:shiftwidth=2:tabstop=2:
 */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "nsUnicodeToTSCII.h"
#include "nsMemory.h"
#include "tamil.h"

/* 
 *  TSCII is an 8-bit encoding consisting of:
 *  0x00..0x7F:       ASCII
 *  0x80..0x90, 0x95..0x9F, 0xAB..0xFE:
 *                    Tamil letters and glyphs
 *  0xA1..0xA5, 0xAA: Tamil combining letters (after the base character)
 *  0xA6..0xA8:       Tamil combining letters (before the base character)
 *  0x91..0x94:       Punctuation
 *  0xA9:             Symbols
 */

//----------------------------------------------------------------------
// Class nsUnicodeToTSCII [implementation]

// Reference-counting / interface-query boilerplate for nsUnicodeToTSCII,
// which is exposed through the single interface nsIUnicodeEncoder.
NS_IMPL_ISUPPORTS1(nsUnicodeToTSCII, nsIUnicodeEncoder)
  
/* 
 * During Unicode (UTF-16) to TSCII conversion, mBuffer contains 
 * the last byte (or sometimes the last two bytes) still to be output.
 * This can be:
 *   0x00                     Nothing pending.
 *   0xB8..0xC9, 0x83..0x86   A consonant.
 *   0xEC, 0x8A               A consonant with VIRAMA sign (final or joining).
 *   0x87, 0xC38A             Two consonants combined through a VIRAMA sign. 
 */

// Direct lookup from a code point in the Tamil block to its TSCII byte.
// The index is (code point - 0x0B80), one row per eight code points; an entry
// of 0x00 means the code point has no single-byte entry in this table.
static const uint8_t UnicharToTSCII[128] =
{
  /* 0x0B80 */ 0x00, 0x00, 0x00, 0xb7, 0x00, 0xab, 0xac, 0xfe,
  /* 0x0B88 */ 0xae, 0xaf, 0xb0, 0x00, 0x00, 0x00, 0xb1, 0xb2,
  /* 0x0B90 */ 0xb3, 0x00, 0xb4, 0xb5, 0xb6, 0xb8, 0x00, 0x00,
  /* 0x0B98 */ 0x00, 0xb9, 0xba, 0x00, 0x83, 0x00, 0xbb, 0xbc,
  /* 0x0BA0 */ 0x00, 0x00, 0x00, 0xbd, 0xbe, 0x00, 0x00, 0x00,
  /* 0x0BA8 */ 0xbf, 0xc9, 0xc0, 0x00, 0x00, 0x00, 0xc1, 0xc2,
  /* 0x0BB0 */ 0xc3, 0xc8, 0xc4, 0xc7, 0xc6, 0xc5, 0x00, 0x84,
  /* 0x0BB8 */ 0x85, 0x86, 0x00, 0x00, 0x00, 0x00, 0xa1, 0xa2,
  /* 0x0BC0 */ 0xa3, 0xa4, 0xa5, 0x00, 0x00, 0x00, 0xa6, 0xa7,
  /* 0x0BC8 */ 0xa8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  /* 0x0BD0 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xaa,
  /* 0x0BD8 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  /* 0x0BE0 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x81,
  /* 0x0BE8 */ 0x8d, 0x8e, 0x8f, 0x90, 0x95, 0x96, 0x97, 0x98,
  /* 0x0BF0 */ 0x9d, 0x9e, 0x9f, 0x00, 0x00, 0x00, 0x00, 0x00,
  /* 0x0BF8 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};

// TSCII glyph for "consonant + vowel sign U".
// Convert() indexes this with (buffered consonant - TSC_KA).
static const uint8_t consonant_with_u[18] =
{
  0xcc, 0x99, 0xcd, 0x9a, 0xce, 0xcf,
  0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5,
  0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb
};

// TSCII glyph for "consonant + vowel sign UU".
// Convert() indexes this with (buffered consonant - TSC_KA).
static const uint8_t consonant_with_uu[18] =
{
  0xdc, 0x9b, 0xdd, 0x9c, 0xde, 0xdf,
  0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5,
  0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb
};

// TSCII glyph for "consonant + VIRAMA" (dead consonant).
// Convert() indexes this with (buffered consonant - TSC_KA).
static const uint8_t consonant_with_virama[18] =
{
  0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1,
  0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
  0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd
};


// Modified implementation of Unicode to TSCII converter in glibc by 
// Bruno Haible.  My modifications are based on Unicode 3.0 chap. 9 and 
// the code chart for Tamil. 
/**
 * Encodes a run of UTF-16 code units as TSCII.
 *
 * A consonant (or one of the dead-consonant / conjunct states) is held back
 * in mBuffer so that it can be combined or reordered with the code unit that
 * follows it.  Whatever is still buffered once the input is exhausted is
 * written out at the end of this call.
 *
 * @param aSrc        UTF-16 input.
 * @param aSrcLength  [in] number of code units in aSrc;
 *                    [out] number of code units consumed.  When
 *                    NS_ERROR_UENC_NOMAPPING is returned the count includes
 *                    the offending code unit.
 * @param aDest       output buffer.
 * @param aDestLength [in] capacity of aDest in bytes; [out] bytes written.
 * @return NS_OK when all input was consumed and the buffer flushed,
 *         NS_OK_UENC_MOREOUTPUT when aDest is too small for what remains,
 *         NS_ERROR_UENC_NOMAPPING when a code unit has no TSCII equivalent.
 */
NS_IMETHODIMP 
nsUnicodeToTSCII::Convert(const PRUnichar * aSrc, int32_t * aSrcLength, 
                          char * aDest, int32_t * aDestLength)
{
  const PRUnichar * src = aSrc;
  const PRUnichar * srcEnd = aSrc + *aSrcLength;
  char * dest = aDest;
  char * destEnd = dest + *aDestLength;

  // The loop condition guarantees room for one output byte; every path that
  // writes more than one byte checks the remaining space itself.
  while (src < srcEnd && dest < destEnd) {
    PRUnichar ch = *src;
    if (mBuffer) {
      // Attempt to combine the last character with this one.
      uint32_t last = mBuffer;

      // last : consonant
      if (IS_TSC_CONSONANT(last)) {
        if (ch == UNI_VOWELSIGN_U && IS_TSC_CONSONANT1(last)) {
          *dest++ = consonant_with_u[last - TSC_KA];
          mBuffer = 0;
          ++src;
          continue;
        }

        if (ch == UNI_VOWELSIGN_UU && IS_TSC_CONSONANT1(last)) {
          *dest++ = consonant_with_uu[last - TSC_KA];
          mBuffer = 0;
          ++src;
          continue;
        }

        // reorder. vowel sign goes to the left of consonant
        if (IS_UNI_LEFT_VOWELSIGN(ch)) {
          if (dest + 2 > destEnd)
            goto error_more_output;
          *dest++ = TSC_LEFT_VOWELSIGN(ch);
          *dest++ = last;
          mBuffer = 0;
          ++src;
          continue;
        }

        // split and reorder. consonant goes between the two parts
        if (IS_UNI_2PARTS_VOWELSIGN(ch)) {
          if (dest + 3 > destEnd)
            goto error_more_output;
          *dest++ = TSC_LEFT_VOWEL_PART(ch);
          *dest++ = last;
          *dest++ = TSC_RIGHT_VOWEL_PART(ch);
          mBuffer = 0;
          ++src;
          continue;
        }

        // Virama
        if (ch == UNI_VIRAMA) {
          // consonant KA can form a conjunct with consonant SSA(SHA).
          // buffer dead consonant 'K' for now.
          if (last == TSC_KA) {
            mBuffer = TSC_KA_DEAD;
          }
          // SA can form a conjunct when followed by 'RA'.
          // buffer dead consonant 'S' for now.
          else if (last == TSC_SA) {
            mBuffer = TSC_SA_DEAD;
          }
          else {
            *dest++ = IS_TSC_CONSONANT1(last) ?
              consonant_with_virama[last - TSC_KA] : last + 5;
            mBuffer = 0;
          }
          ++src;
          continue;
        }

        // consonant TA forms a ligature with vowel 'I' or 'II'.
        if (last == TSC_TA && (ch == UNI_VOWELSIGN_I || ch == UNI_VOWELSIGN_II)) {
          *dest++ = ch - (UNI_VOWELSIGN_I - TSC_TI_LIGA);
          mBuffer = 0;
          ++src;
          continue;
        }
      }
      else if (last == TSC_KA_DEAD) {
        // Kd + SSA =  K.SSA
        if (ch == UNI_SSA) {
          mBuffer = TSC_KSSA;
          ++src;
          continue;
        }
      }
      else if (last == TSC_SA_DEAD) {
        // Sd + RA = S.RA. Buffer RA (high byte) + Sd (low byte).
        if (ch == UNI_RA) {
          mBuffer = 0xc38a;
          ++src;
          continue;
        }
      }
      else if (last == TSC_KSSA) {
        if (ch == UNI_VIRAMA) {
          *dest++ = (char) TSC_KSSA_DEAD;
          mBuffer = 0;
          ++src;
          continue;
        }

        // vowel splitting/reordering should be done around conjuncts as well.
        // reorder. vowel sign goes to the left of consonant
        if (IS_UNI_LEFT_VOWELSIGN(ch)) {
          if (dest + 2 > destEnd)
            goto error_more_output;
          *dest++ = TSC_LEFT_VOWELSIGN(ch);
          *dest++ = last;
          mBuffer = 0;
          ++src;
          continue;
        }

        // split and reorder. consonant goes between the two parts
        if (IS_UNI_2PARTS_VOWELSIGN(ch)) {
          if (dest + 3 > destEnd)
            goto error_more_output;
          *dest++ = TSC_LEFT_VOWEL_PART(ch);
          *dest++ = last;
          *dest++ = TSC_RIGHT_VOWEL_PART(ch);
          mBuffer = 0;
          ++src;
          continue;
        }
      }
      else {
        NS_ASSERTION(last == 0xc38a, "No other value can be buffered");
        if (ch == UNI_VOWELSIGN_II) {
          *dest++ = (char) TSC_SRII_LIGA;
          mBuffer = 0;
          ++src;
          continue;
        }

        // No ligature: emit the dead SA and keep RA buffered.  The current
        // code unit is deliberately NOT consumed here; it is looked at again
        // on the next iteration so that it can combine with RA (or be
        // encoded on its own) instead of being dropped.
        *dest++ = (char) TSC_SA_DEAD;
        mBuffer = TSC_RA;
        continue;
      }

      // Nothing combined: output the buffered character and re-examine the
      // current code unit with an empty buffer.  Only single-byte states
      // reach this point; the two-byte S.RA state is fully handled above.
      *dest++ = last & 0xff;
      mBuffer = 0;
      continue;
    }

    if (ch < 0x80)   // Plain ASCII character.
      *dest++ = (char)ch;
    else if (IS_UNI_TAMIL(ch)) {
      uint8_t t = UnicharToTSCII[ch - UNI_TAMIL_START];

      if (t != 0) {
        if (IS_TSC_CONSONANT(t))
          mBuffer = (uint32_t) t;   // hold back: may combine with next unit
        else
          *dest++ = t;
      }
      else if (IS_UNI_2PARTS_VOWELSIGN(ch)) {
        // actually this is an illegal sequence (vowel sign without a base).
        if (dest + 2 > destEnd)
          goto error_more_output;

        *dest++ = TSC_LEFT_VOWEL_PART(ch);
        *dest++ = TSC_RIGHT_VOWEL_PART(ch);
      }
      else
        goto error_no_mapping;
    }
    else if (ch == 0x00A9)
      *dest++ = (char)ch;
    else if (IS_UNI_SINGLE_QUOTE(ch))
      *dest++ = ch - UNI_LEFT_SINGLE_QUOTE + TSC_LEFT_SINGLE_QUOTE;
    else if (IS_UNI_DOUBLE_QUOTE(ch))
      *dest++ = ch - UNI_LEFT_DOUBLE_QUOTE + TSC_LEFT_DOUBLE_QUOTE;
    else
      goto error_no_mapping;

    /* Now that we wrote the output increment the input pointer.  */
    ++src;
  }

  // flush the buffer
  if (mBuffer >> 8) {
    // Two pending bytes (S.RA).  The low byte is the dead SA and must come
    // out first, followed by RA in the high byte.
    if (dest + 2 > destEnd)
      goto error_more_output;
    *dest++ = mBuffer & 0xff;
    *dest++ = (mBuffer >> 8) & 0xff;
    mBuffer = 0;
  }
  else if (mBuffer) {
    // Write out the last character, a single byte.
    if (dest >= destEnd)
      goto error_more_output;
    *dest++ = mBuffer & 0xff;
    mBuffer = 0;
  }

  // The loop also stops when aDest fills up; input left over at this point
  // means the caller has to come back with more output space.
  if (src < srcEnd)
    goto error_more_output;

  *aSrcLength = src - aSrc;
  *aDestLength = dest - aDest;
  return NS_OK;

error_more_output:
  *aSrcLength = src - aSrc;
  *aDestLength = dest - aDest;
  return NS_OK_UENC_MOREOUTPUT;

error_no_mapping:
  // Count the unmappable code unit as consumed so that the caller can both
  // identify it (aSrc[*aSrcLength - 1]) and resume after it.
  *aSrcLength = (src - aSrc) + 1;
  *aDestLength = dest - aDest;
  return NS_ERROR_UENC_NOMAPPING;
}

/**
 * Writes out whatever is still pending in mBuffer and clears it.
 *
 * @param aDest       output buffer.
 * @param aDestLength [in] capacity of aDest in bytes; [out] bytes written
 *                    (0, 1 or 2).
 * @return NS_OK on success (including when nothing was pending), or
 *         NS_OK_UENC_MOREOUTPUT when aDest cannot hold the pending bytes; in
 *         that case nothing is written and mBuffer is left untouched.
 */
NS_IMETHODIMP 
nsUnicodeToTSCII::Finish(char* aDest, int32_t* aDestLength)
{
  if (!mBuffer) {
    *aDestLength = 0;
    return NS_OK;
  }

  // A value above 0xff is the two-byte S.RA state (0xc38a).
  const int32_t pending = (mBuffer >> 8) ? 2 : 1;
  if (*aDestLength < pending) {
    *aDestLength = 0;
    return NS_OK_UENC_MOREOUTPUT;
  }

  // The low byte always goes first.  For the two-byte state that is the dead
  // SA, followed by RA from the high byte -- the same order Convert() uses
  // when it splits this state.
  *aDest++ = mBuffer & 0xff;
  if (pending == 2)
    *aDest++ = (mBuffer >> 8) & 0xff;

  mBuffer = 0;
  *aDestLength = pending;
  return NS_OK;
}

//================================================================
/**
 * Returns the encoder to its initial state by discarding any pending
 * (buffered but not yet written) output.  Always succeeds.
 */
NS_IMETHODIMP
nsUnicodeToTSCII::Reset()
{
  mBuffer = 0;   // 0 == nothing pending

  return NS_OK;
}

/**
 * Reports an upper bound for the number of bytes Convert() may produce.
 *
 * @param aSrc        unused.
 * @param aSrcLength  number of UTF-16 code units to be converted.
 * @param aDestLength [out] twice aSrcLength.
 */
NS_IMETHODIMP
nsUnicodeToTSCII::GetMaxLength(const PRUnichar * aSrc, int32_t aSrcLength,
                               int32_t * aDestLength)
{
  // Some Tamil letters can be decomposed into 2 glyphs in TSCII.
  static const int32_t kMaxBytesPerUnit = 2;

  *aDestLength = aSrcLength * kMaxBytesPerUnit;
  return NS_OK;
}


/**
 * Accepts an error-behaviour request without recording it: all three
 * arguments are ignored and NS_OK is returned unconditionally.
 */
NS_IMETHODIMP
nsUnicodeToTSCII::SetOutputErrorBehavior(int32_t aBehavior,
                                         nsIUnicharEncoder *aEncoder,
                                         PRUnichar aChar)
{
  // Intentionally a no-op in this class.
  return NS_OK;
}


// Code units substituted for TSCII bytes 0x80..0x9f, indexed by
// (byte - 0x80).  Same as the mapping of the C1 (0x80-0x9f) part of
// Windows-1252 to Unicode.
static const PRUnichar gTSCIIToTTF[32] = {
  /* 0x80 */ 0x20AC, 0x0081, 0x201A, 0x0192,
  /* 0x84 */ 0x201E, 0x2026, 0x2020, 0x2021,
  /* 0x88 */ 0x02C6, 0x2030, 0x0160, 0x2039,
  /* 0x8c */ 0x0152, 0x008D, 0x017D, 0x008F,
  /* 0x90 */ 0x0090, 0x2018, 0x2019, 0x201C,
  /* 0x94 */ 0x201D, 0x2022, 0x2013, 0x2014,
  /* 0x98 */ 0x02DC, 0x2122, 0x0161, 0x203A,
  /* 0x9c */ 0x0153, 0x009D, 0x017E, 0x0178
};

//----------------------------------------------------------------------
// Class nsUnicodeToTamilTTF [implementation]
//
// Reference-counting / interface-query boilerplate: nsUnicodeToTamilTTF adds
// no interfaces of its own and defers to its base class nsUnicodeToTSCII.
NS_IMPL_ISUPPORTS_INHERITED0(nsUnicodeToTamilTTF, nsUnicodeToTSCII)

/**
 * Encodes UTF-16 Tamil text for a TSCII-encoded "pseudo-Unicode" font.
 *
 * The text is first converted to TSCII with nsUnicodeToTSCII::Convert() into
 * an intermediate buffer; each TSCII byte is then widened to a 16-bit value
 * and stored in aDest as two bytes, high byte first.
 *
 * @param aSrc        UTF-16 input.
 * @param aSrcLength  [in] number of code units in aSrc; [out] as set by
 *                    nsUnicodeToTSCII::Convert().
 * @param aDest       output buffer.
 * @param aDestLength [in] capacity of aDest in bytes; [out] bytes written.
 * @return the status of nsUnicodeToTSCII::Convert(), or
 *         NS_ERROR_OUT_OF_MEMORY if the intermediate buffer cannot be
 *         allocated (the length arguments are then left untouched).
 */
NS_IMETHODIMP 
nsUnicodeToTamilTTF::Convert(const PRUnichar * aSrc, 
                             int32_t * aSrcLength, char * aDest, 
                             int32_t * aDestLength)
{
  int32_t maxLen;
  GetMaxLength(aSrc, *aSrcLength, &maxLen);
  NS_ASSERTION(maxLen <= *aDestLength, "insufficient dest. buffer size");

  // TSCII converter is a single byte encoder and takes half the space 
  // taken by TamilTTF encoder.
  int32_t medLen = maxLen / 2;

  // The assertion above disappears in release builds, so never produce more
  // TSCII bytes than aDest can hold once each one is widened to two bytes.
  const int32_t room = *aDestLength / 2;
  if (medLen > room)
    medLen = room;
  if (medLen < 0)
    medLen = 0;

  char *med = mStaticBuffer;
  if (medLen > CHAR_BUFFER_SIZE) {
    med = static_cast<char *>(nsMemory::Alloc(medLen));
    if (!med)
      return NS_ERROR_OUT_OF_MEMORY;
  }

  // On return medLen holds the number of TSCII bytes actually produced, even
  // when the conversion stopped early.
  const nsresult rv = nsUnicodeToTSCII::Convert(aSrc, aSrcLength, med, &medLen);

  // widen 8bit TSCII to pseudo-Unicode font encoding of TSCII-Tamil font.
  // This is done for partial results as well, so that *aDestLength always
  // describes what is really in aDest.
  int32_t written = 0;
  for (int32_t i = 0; i < medLen; i++) {
    const uint8_t tscii = uint8_t(med[i]);
    // Only C1 part(0x80-0x9f) needs to be mapped as if they're CP1252.
    PRUnichar ucs2 = (tscii & 0xe0) == 0x80 ? 
                     gTSCIIToTTF[tscii & 0x7f] : tscii;
    // A lot of TSCII fonts are still based on TSCII 1.6 so that 
    // they have Tamil vowel 'I' at 0xad instead of 0xfe.
    if (ucs2 == 0xfe)
      ucs2 = 0xad;
    aDest[written++] = uint8_t((ucs2 & 0xff00) >> 8);
    aDest[written++] = uint8_t(ucs2 & 0x00ff);
  }

  *aDestLength = written;

  if (med != mStaticBuffer)
    nsMemory::Free(med);

  return rv;
}

/**
 * Reports an upper bound for the number of bytes Convert() may produce.
 *
 * @param aSrc        unused.
 * @param aSrcLength  number of UTF-16 code units to be converted.
 * @param aDestLength [out] (aSrcLength + 1) * 4.
 */
NS_IMETHODIMP
nsUnicodeToTamilTTF::GetMaxLength(const PRUnichar * aSrc, int32_t aSrcLength,
                                  int32_t * aDestLength)
{
  // Each Tamil character can generate at most two presentation forms,
  // and each of those is 'extended' to a 16bit short, which accounts
  // for an additional factor of 2.
  const int32_t units = aSrcLength + 1;

  *aDestLength = units * 4;
  return NS_OK;
}

/**
 * Records how unmappable characters should be handled.
 *
 * @param aBehavior  requested behaviour; stored in mErrBehavior.
 * @param aEncoder   callback encoder; stored in mErrEncoder.  Must not be
 *                   null when aBehavior is kOnError_CallBack.
 * @param aChar      stored in mErrChar.
 * @return NS_ERROR_NULL_POINTER if the call-back behaviour is requested
 *         without an encoder (nothing is stored then), NS_OK otherwise.
 */
NS_IMETHODIMP
nsUnicodeToTamilTTF::SetOutputErrorBehavior(int32_t aBehavior,
                                            nsIUnicharEncoder *aEncoder,
                                            PRUnichar aChar)
{
  const bool wantsCallback = (aBehavior == kOnError_CallBack);
  if (wantsCallback && aEncoder == nullptr) {
    return NS_ERROR_NULL_POINTER;
  }

  mErrEncoder = aEncoder;
  mErrBehavior = aBehavior;
  mErrChar = aChar;

  return NS_OK;
}