gecko-dev/netwerk/mime/nsMIMEHeaderParamImpl.cpp

/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set sw=4 ts=8 et tw=80 : */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include <string.h>
#include "prmem.h"
#include "prprf.h"
#include "plstr.h"
#include "plbase64.h"
#include "nsCRT.h"
#include "nsMemory.h"
#include "nsTArray.h"
#include "nsCOMPtr.h"
#include "nsEscape.h"
#include "nsIUTF8ConverterService.h"
#include "nsUConvCID.h"
#include "nsIServiceManager.h"
#include "nsMIMEHeaderParamImpl.h"
#include "nsReadableUtils.h"
#include "nsNativeCharsetUtils.h"
#include "nsError.h"
#include "nsIUnicodeDecoder.h"
#include "mozilla/dom/EncodingUtils.h"

using mozilla::dom::EncodingUtils;

// Forward declarations of helpers that are used before their definitions
// further down in this file.
// The static functions declared below are moved from
// mailnews/mime/src/comi18n.cpp

static char *DecodeQ(const char *, uint32_t);
static bool Is7bitNonAsciiString(const char *, uint32_t);
static void CopyRawHeader(const char *, uint32_t, const char *, nsACString &);
static nsresult DecodeRFC2047Str(const char *, const char *, bool, nsACString&);
static nsresult internalDecodeParameter(const nsACString&, const char*,
                                        const char*, bool, bool, nsACString&);

// Evaluates to true when the charset name |cset| (a C string) begins,
// compared case-insensitively, with "ISO-2022", "HZ-GB" or "UTF-7".
// XXX The chance of UTF-7 being used in the message header is really
// low, but in theory it's possible.
#define IS_7BIT_NON_ASCII_CHARSET(cset)            \
    (!nsCRT::strncasecmp((cset), "ISO-2022", 8) || \
     !nsCRT::strncasecmp((cset), "HZ-GB", 5)    || \
     !nsCRT::strncasecmp((cset), "UTF-7", 5))

// nsISupports boilerplate for nsMIMEHeaderParamImpl, listing
// nsIMIMEHeaderParam as the implemented interface.
NS_IMPL_ISUPPORTS(nsMIMEHeaderParamImpl, nsIMIMEHeaderParam)

// Looks up the parameter aParamName in aHeaderVal by forwarding to
// DoGetParameter with MIME_FIELD_ENCODING as the decoding mode. All other
// arguments are passed through unchanged and the result of DoGetParameter
// is returned as is.
NS_IMETHODIMP
nsMIMEHeaderParamImpl::GetParameter(const nsACString& aHeaderVal,
                                    const char *aParamName,
                                    const nsACString& aFallbackCharset,
                                    bool aTryLocaleCharset,
                                    char **aLang, nsAString& aResult)
{
  return DoGetParameter(aHeaderVal, aParamName, MIME_FIELD_ENCODING,
                        aFallbackCharset, aTryLocaleCharset, aLang, aResult);
}

// Same as GetParameter, except that DoGetParameter is called with
// HTTP_FIELD_ENCODING as the decoding mode. All other arguments are passed
// through unchanged and the result of DoGetParameter is returned as is.
NS_IMETHODIMP
nsMIMEHeaderParamImpl::GetParameterHTTP(const nsACString& aHeaderVal,
                                        const char *aParamName,
                                        const nsACString& aFallbackCharset,
                                        bool aTryLocaleCharset,
                                        char **aLang, nsAString& aResult)
{
  return DoGetParameter(aHeaderVal, aParamName, HTTP_FIELD_ENCODING,
                        aFallbackCharset, aTryLocaleCharset, aLang, aResult);
}

// Extracts the parameter aParamName from aHeaderVal and stores it in aResult
// as UTF-16.
//
//   aDecoding          passed on to DoParameterInternal
//   aFallbackCharset   if non-empty, tried first to convert the decoded
//                      value to UTF-8
//   aTryLocaleCharset  if true (and the native charset is not UTF-8), a value
//                      that is not valid UTF-8 is converted from the native
//                      charset as a last resort
//   aLang              passed on to DoParameterInternal, which fills it in
//
// Returns the failure code of DoParameterInternal or internalDecodeParameter
// if either fails; aResult is empty in that case.
nsresult
nsMIMEHeaderParamImpl::DoGetParameter(const nsACString& aHeaderVal,
                                      const char *aParamName,
                                      ParamDecoding aDecoding,
                                      const nsACString& aFallbackCharset,
                                      bool aTryLocaleCharset,
                                      char **aLang, nsAString& aResult)
{
    aResult.Truncate();

    // Step 1: pull the raw parameter value out of the header (decoding
    // RFC 2231/5987 when applicable, as selected by aDecoding; 5987 being a
    // subset of 2231) together with the charset it declares, if any.
    nsXPIDLCString rawValue;
    nsXPIDLCString declaredCharset;
    nsresult rv =
        DoParameterInternal(PromiseFlatCString(aHeaderVal).get(), aParamName,
                            aDecoding, getter_Copies(declaredCharset), aLang,
                            getter_Copies(rawValue));
    if (NS_FAILED(rv)) {
        return rv;
    }

    // Step 2: charset conversion and RFC 2047 decoding, if necessary.
    // The last flag used to be (aDecoding == MIME_FIELD_ENCODING); it is now
    // always true, see bug 875615.
    nsAutoCString decodedValue;
    rv = internalDecodeParameter(rawValue, declaredCharset.get(), nullptr,
                                 false, true, decodedValue);
    NS_ENSURE_SUCCESS(rv, rv);

    // Step 3: give the caller-supplied fallback charset the first shot.
    if (!aFallbackCharset.IsEmpty()) {
        nsAutoCString fallbackEncoding;
        EncodingUtils::FindEncodingForLabel(aFallbackCharset,
                                            fallbackEncoding);
        nsAutoCString converted;
        nsCOMPtr<nsIUTF8ConverterService>
          converter(do_GetService(NS_UTF8CONVERTERSERVICE_CONTRACTID));
        if (converter) {
            const bool fallbackIsNotUTF8 =
                !fallbackEncoding.EqualsLiteral("UTF-8");
            nsresult convRv = converter->ConvertStringToUTF8(
                decodedValue, PromiseFlatCString(aFallbackCharset).get(),
                false, fallbackIsNotUTF8, 1, converted);
            if (NS_SUCCEEDED(convRv)) {
                CopyUTF8toUTF16(converted, aResult);
                return NS_OK;
            }
        }
    }

    // Step 4: the value may already be UTF-8.
    if (IsUTF8(decodedValue)) {
        CopyUTF8toUTF16(decodedValue, aResult);
        return NS_OK;
    }

    // Step 5: last resorts - native charset if requested, else a plain
    // ASCII-to-UTF-16 copy.
    if (aTryLocaleCharset && !NS_IsNativeUTF8()) {
        return NS_CopyNativeToUnicode(decodedValue, aResult);
    }

    CopyASCIItoUTF16(decodedValue, aResult);
    return NS_OK;
}

// Strips the backslashes of backslash-escaped characters ("quoted-pair"s)
// from a quoted-string. The NUL-terminated buffer |src| is rewritten in
// place and can only get shorter. A backslash that is the very last
// character has nothing to escape and is kept.
void RemoveQuotedStringEscapes(char *src)
{
  const char *in = src;
  char *out = src;

  while (*in != '\0') {
    if (*in == '\\' && in[1] != '\0') {
      // drop the backslash, keep the character it escapes
      ++in;
    }
    *out++ = *in++;
  }

  *out = '\0';
}

// true if character is an ASCII hex digit ([0-9a-fA-F])
bool IsHexDigit(char aChar)
{
  char c = aChar;

  return (c >= 'a' && c <= 'f') ||
         (c >= 'A' && c <= 'F') ||
         (c >= '0' && c <= '9');
}

// Validate that the first |len| characters of aValue are syntactically valid
// with respect to %-escapes: every '%' must be followed by two hex digits,
// and both digits must lie inside the first |len| characters.
//
//   aValue  start of the value; does not need to be NUL-terminated at |len|
//   len     number of characters to inspect
//
// Returns false as soon as a broken or truncated escape is found, true
// otherwise (including for an empty range).
bool IsValidPercentEscaped(const char *aValue, int32_t len)
{
  for (int32_t i = 0; i < len; i++) {
    if (aValue[i] != '%') {
      continue;
    }

    // An escape needs three characters ("%hh"). Check the remaining length
    // first so that we never look at characters beyond the given range.
    if (len - i < 3 ||
        !IsHexDigit(aValue[i + 1]) || !IsHexDigit(aValue[i + 2])) {
      return false;
    }
  }
  return true;
}

// Support for continuations (RFC 2231, Section 3)

// only a sane number supported: this is the largest continuation segment
// number accepted by parseSegmentNumber() and addContinuation()
#define MAX_CONTINUATIONS 999

// One segment of a parameter value that is split into continuations.
// The segment does not own its text: |value| points at |length| characters
// stored elsewhere.
class Continuation {
  public:
    Continuation(const char *aValue, uint32_t aLength,
                 bool aNeedsPercentDecoding, bool aWasQuotedString)
      : value(aValue),
        length(aLength),
        needsPercentDecoding(aNeedsPercentDecoding),
        wasQuotedString(aWasQuotedString)
    {
    }

    // Creates an empty placeholder (null value, zero length); a default
    // constructor is needed for nsTArray.
    Continuation()
      : value(nullptr),
        length(0),
        needsPercentDecoding(false),
        wasQuotedString(false)
    {
    }

    ~Continuation() {}

    // start of the segment text; null for a placeholder
    const char *value;
    // number of characters of |value| that belong to the segment
    uint32_t length;
    // segment still contains %-escapes that have to be decoded
    bool needsPercentDecoding;
    // segment was written as a quoted-string
    bool wasQuotedString;
};

// Combine segments into a single string.
//
// The segments are concatenated in index order; concatenation stops at the
// first segment without a value (a gap in the numbering). Each segment is
// percent-decoded and/or has its quoted-string escapes removed as indicated
// by its flags. aArray itself is not modified.
//
// Returns a newly allocated (nsMemory::Alloc) NUL-terminated string, or
// nullptr if there are no segments, the combined value is empty, or the
// allocation failed.
char *combineContinuations(nsTArray<Continuation>& aArray)
{
  // Sanity check
  if (aArray.Length() == 0)
    return nullptr;

  // Get an upper bound for the length (decoding only ever shrinks a segment)
  uint32_t length = 0;
  for (uint32_t i = 0; i < aArray.Length(); i++) {
    length += aArray[i].length;
  }

  // Allocate
  char *result = (char *) nsMemory::Alloc(length + 1);
  if (!result) {
    // Handle OOM
    NS_WARNING("Out of memory\n");
    return nullptr;
  }

  // Concatenate. |end| always points at the terminating NUL of what has
  // been assembled so far, so we don't have to rescan the whole result for
  // every segment.
  *result = '\0';
  char *end = result;

  for (uint32_t i = 0; i < aArray.Length(); i++) {
    const Continuation& cont = aArray[i];
    if (! cont.value) break;

    char *segment = end;
    strncat(segment, cont.value, cont.length);
    if (cont.needsPercentDecoding) {
      nsUnescape(segment);
    }
    if (cont.wasQuotedString) {
      RemoveQuotedStringEscapes(segment);
    }

    // decoding may have shortened the segment; find its new end
    end = segment + strlen(segment);
  }

  // return null if empty value
  if (*result == '\0') {
    nsMemory::Free(result);
    result = nullptr;
  }

  return result;
}

// Add a continuation segment to aArray at position aIndex, growing the array
// if necessary.
//
//   aIndex                  segment number
//   aValue, aLength         text of the segment (not copied)
//   aNeedsPercentDecoding   segment uses %-escapes
//   aWasQuotedString        segment was written as a quoted-string
//
// Returns false (and leaves aArray untouched) if the segment has already been
// seen, if aIndex exceeds MAX_CONTINUATIONS, or if both flags are set at the
// same time; returns true otherwise.
bool addContinuation(nsTArray<Continuation>& aArray, uint32_t aIndex,
                     const char *aValue, uint32_t aLength,
                     bool aNeedsPercentDecoding, bool aWasQuotedString)
{
  if (aIndex < aArray.Length() && aArray[aIndex].value) {
    NS_WARNING("duplicate RFC2231 continuation segment #\n");
    return false;
  }

  if (aIndex > MAX_CONTINUATIONS) {
    NS_WARNING("RFC2231 continuation segment # exceeds limit\n");
    return false;
  }

  if (aNeedsPercentDecoding && aWasQuotedString) {
    NS_WARNING("RFC2231 continuation segment can't use percent encoding and quoted string form at the same time\n");
    return false;
  }

  Continuation cont(aValue, aLength, aNeedsPercentDecoding, aWasQuotedString);

  // slots created by growing the array are default-constructed placeholders
  // with a null value
  if (aArray.Length() <= aIndex) {
    aArray.SetLength(aIndex + 1);
  }
  aArray[aIndex] = cont;

  return true;
}

// Parse the decimal segment number stored in the first aLen characters of
// aValue.
//
// Returns the number, or -1 if aLen is less than 1, the number has a leading
// '0' (a lone "0" is fine), a non-digit is encountered, or the value exceeds
// MAX_CONTINUATIONS.
int32_t parseSegmentNumber(const char *aValue, int32_t aLen)
{
  if (aLen < 1) {
    NS_WARNING("segment number missing\n");
    return -1;
  }

  if (aValue[0] == '0' && aLen > 1) {
    NS_WARNING("leading '0' not allowed in segment number\n");
    return -1;
  }

  int32_t parsed = 0;
  const char *end = aValue + aLen;

  for (const char *p = aValue; p != end; ++p) {
    const char digit = *p;
    if (digit < '0' || digit > '9') {
      NS_WARNING("invalid characters in segment number\n");
      return -1;
    }

    // checking the limit after every digit keeps |parsed| from overflowing
    parsed = parsed * 10 + (digit - '0');
    if (parsed > MAX_CONTINUATIONS) {
      NS_WARNING("Segment number exceeds sane size\n");
      return -1;
    }
  }

  return parsed;
}

// Check that the NUL-terminated octet sequence aOctets can be converted to
// UTF-8 when interpreted in the charset aCharset.
//
// Returns true only if the UTF-8 converter service is available and the
// conversion returns NS_OK; the converted text itself is discarded.
bool IsValidOctetSequenceForCharset(nsACString& aCharset, const char *aOctets)
{
  nsCOMPtr<nsIUTF8ConverterService> converter(do_GetService
    (NS_UTF8CONVERTERSERVICE_CONTRACTID));
  if (!converter) {
    NS_WARNING("Can't get UTF8ConverterService\n");
    return false;
  }

  nsAutoCString rawOctets;
  rawOctets.Assign(aOctets);
  nsAutoCString converted;

  const nsresult rv =
    converter->ConvertStringToUTF8(rawOctets,
                                   PromiseFlatCString(aCharset).get(),
                                   false, false, 1, converted);
  if (rv == NS_OK) {
    return true;
  }

  // we can't decode; charset may be unsupported, or the octet sequence
  // is broken (illegal or incomplete octet sequence contained)
  NS_WARNING("RFC2231/5987 parameter value does not decode according to specified charset\n");
  return false;
}

// moved almost verbatim from mimehdrs.cpp
// char *
// MimeHeaders_get_parameter (const char *header_value, const char *parm_name,
//                            char **charset, char **language)
//
// The format of these header lines  is
// <token> [ ';' <token> '=' <token-or-quoted-string> ]*
//
// This entry point only forwards to DoParameterInternal, selecting
// MIME_FIELD_ENCODING as the decoding mode; see there for the meaning of the
// arguments and the return value.
NS_IMETHODIMP
nsMIMEHeaderParamImpl::GetParameterInternal(const char *aHeaderValue,
                                            const char *aParamName,
                                            char **aCharset,
                                            char **aLang,
                                            char **aResult)
{
  return DoParameterInternal(aHeaderValue, aParamName, MIME_FIELD_ENCODING,
                             aCharset, aLang, aResult);
}


// Extract the raw value of the parameter aParamName from the header field
// value aHeaderValue, which has the form
//   <token> [ ';' <token> '=' <token-or-quoted-string> ]*
//
//   aHeaderValue  NUL-terminated header field value; must not be null/empty
//   aParamName    name of the parameter to look up; if null or empty the
//                 first (unnamed) token of the header is returned instead
//   aDecoding     currently not consulted (continuations are always
//                 accepted, see the comment on acceptContinuations below)
//   aCharset      optional out: charset declared by an RFC 2231/5987 encoded
//                 value; set to null if there is none
//   aLang         optional out: language declared by an RFC 2231/5987 encoded
//                 value; set to null if there is none
//   aResult       out: the parameter value, percent-decoded and with
//                 quoted-string escapes removed where applicable, but not
//                 yet converted from its charset
//
// All strings handed out are freshly allocated; ownership passes to the
// caller (DoGetParameter adopts them through getter_Copies).
//
// Returns NS_OK if a value was found, NS_ERROR_INVALID_ARG for bad arguments
// or if the parameter is not present,
// NS_ERROR_FIRST_HEADER_FIELD_COMPONENT_EMPTY if the unnamed first token was
// requested but is empty, and NS_ERROR_OUT_OF_MEMORY on allocation failure.
nsresult
nsMIMEHeaderParamImpl::DoParameterInternal(const char *aHeaderValue,
                                           const char *aParamName,
                                           ParamDecoding aDecoding,
                                           char **aCharset,
                                           char **aLang,
                                           char **aResult)
{

  if (!aHeaderValue ||  !*aHeaderValue || !aResult)
    return NS_ERROR_INVALID_ARG;

  *aResult = nullptr;

  if (aCharset) *aCharset = nullptr;
  if (aLang) *aLang = nullptr;

  nsAutoCString charset;

  // change to (aDecoding != HTTP_FIELD_ENCODING) when we want to disable
  // them for HTTP header fields later on, see bug 776324
  bool acceptContinuations = true;

  const char *str = aHeaderValue;

  // skip leading white space.
  for (; *str &&  nsCRT::IsAsciiSpace(*str); ++str)
    ;
  const char *start = str;

  // aParamName is empty. return the first (possibly) _unnamed_ 'parameter'
  // For instance, return 'inline' in the following case:
  // Content-Disposition: inline; filename=.....
  if (!aParamName || !*aParamName)
    {
      for (; *str && *str != ';' && !nsCRT::IsAsciiSpace(*str); ++str)
        ;
      if (str == start)
        return NS_ERROR_FIRST_HEADER_FIELD_COMPONENT_EMPTY;

      *aResult = (char *) nsMemory::Clone(start, (str - start) + 1);
      NS_ENSURE_TRUE(*aResult, NS_ERROR_OUT_OF_MEMORY);
      (*aResult)[str - start] = '\0';  // null-terminate
      return NS_OK;
    }

  /* Skip forward to first ';' */
  for (; *str && *str != ';' && *str != ','; ++str)
    ;
  if (*str)
    str++;
  /* Skip over following whitespace */
  for (; *str && nsCRT::IsAsciiSpace(*str); ++str)
    ;

  // Some broken http servers just specify parameters
  // like 'filename' without specifying disposition
  // method. Rewind to the first non-white-space
  // character.

  if (!*str)
    str = start;

  // RFC2231 - The legitimate parm format can be:
  // A. title=ThisIsTitle
  // B. title*=us-ascii'en-us'This%20is%20wierd.
  // C. title*0*=us-ascii'en'This%20is%20wierd.%20We
  //    title*1*=have%20to%20support%20this.
  //    title*2="Else..."
  // D. title*0="Hey, what you think you are doing?"
  //    title*1="There is no charset and lang info."
  // RFC5987: only A and B

  // collect results for the different algorithms (plain filename,
  // RFC5987/2231-encoded filename, + continuations) separately and decide
  // which to use at the end
  char *caseAResult = nullptr;
  char *caseBResult = nullptr;
  char *caseCDResult = nullptr;

  // collect continuation segments; they point into aHeaderValue and do not
  // own any memory
  nsTArray<Continuation> segments;


  // our copies of the charset parameter, kept separately as they might
  // differ for the two formats
  nsDependentCSubstring charsetB, charsetCD;

  nsDependentCSubstring lang;

  int32_t paramLen = strlen(aParamName);

  while (*str) {
    // find name/value

    const char *nameStart = str;
    const char *nameEnd = nullptr;
    const char *valueStart = str;
    const char *valueEnd = nullptr;
    bool isQuotedString = false;

    NS_ASSERTION(!nsCRT::IsAsciiSpace(*str), "should be after whitespace.");

    // Skip forward to the end of this token.
    for (; *str && !nsCRT::IsAsciiSpace(*str) && *str != '=' && *str != ';'; str++)
      ;
    nameEnd = str;

    int32_t nameLen = nameEnd - nameStart;

    // Skip over whitespace, '=', and whitespace
    while (nsCRT::IsAsciiSpace(*str)) ++str;
    if (!*str) {
      break;
    }
    if (*str++ != '=') {
      // don't accept parameters without "="
      goto increment_str;
    }
    while (nsCRT::IsAsciiSpace(*str)) ++str;

    if (*str != '"') {
      // The value is a token, not a quoted string.
      valueStart = str;
      for (valueEnd = str;
           *valueEnd && !nsCRT::IsAsciiSpace (*valueEnd) && *valueEnd != ';';
           valueEnd++)
        ;
      str = valueEnd;
    } else {
      isQuotedString = true;

      ++str;
      valueStart = str;
      for (valueEnd = str; *valueEnd; ++valueEnd) {
        if (*valueEnd == '\\' && *(valueEnd + 1))
          ++valueEnd;
        else if (*valueEnd == '"')
          break;
      }
      str = valueEnd;
      // *valueEnd != null means that *valueEnd is quote character.
      if (*valueEnd)
        str++;
    }

    // See if this is the simplest case (case A above),
    // a 'single' line value with no charset and lang.
    // If so, remember it and keep looking for an encoded alternative.
    if (nameLen == paramLen &&
        !nsCRT::strncasecmp(nameStart, aParamName, paramLen)) {

      if (caseAResult) {
        // we already have one caseA result, ignore subsequent ones
        goto increment_str;
      }

      // if the parameter spans across multiple lines we have to strip out the
      //     line continuation -- jht 4/29/98
      nsAutoCString tempStr(valueStart, valueEnd - valueStart);
      tempStr.StripChars("\r\n");
      char *res = ToNewCString(tempStr);
      if (!res) {
        // we are about to bail out; don't leak a case B value that was
        // collected from an earlier parameter
        nsMemory::Free(caseBResult);
      }
      NS_ENSURE_TRUE(res, NS_ERROR_OUT_OF_MEMORY);

      if (isQuotedString)
        RemoveQuotedStringEscapes(res);

      caseAResult = res;
      // keep going, we may find a RFC 2231/5987 encoded alternative
    }
    // case B, C, and D
    else if (nameLen > paramLen &&
             !nsCRT::strncasecmp(nameStart, aParamName, paramLen) &&
             *(nameStart + paramLen) == '*') {

      // 1st char past '*'
      const char *cp = nameStart + paramLen + 1;

      // if param name ends in "*" we need do to RFC5987 "ext-value" decoding
      bool needExtDecoding = *(nameEnd - 1) == '*';

      bool caseB = nameLen == paramLen + 1;
      bool caseCStart = (*cp == '0') && needExtDecoding;

      // parse the segment number
      int32_t segmentNumber = -1;
      if (!caseB) {
        int32_t segLen = (nameEnd - cp) - (needExtDecoding ? 1 : 0);
        segmentNumber = parseSegmentNumber(cp, segLen);

        if (segmentNumber == -1) {
          acceptContinuations = false;
          goto increment_str;
        }
      }

      // CaseB and start of CaseC: requires charset and optional language
      // in quotes (quotes required even if lang is blank)
      if (caseB || (caseCStart && acceptContinuations)) {
        // look for single quotation mark(')
        // NOTE(review): PL_strchr searches up to the end of the header, not
        // just up to valueEnd, so a quote in a later parameter can be picked
        // up here; rawValLength then becomes negative and the value is
        // dropped by the "rawValLength > 0" check below.
        const char *sQuote1 = PL_strchr(valueStart, 0x27);
        const char *sQuote2 = sQuote1 ? PL_strchr(sQuote1 + 1, 0x27) : nullptr;

        // Two single quotation marks must be present even in
        // absence of charset and lang.
        if (!sQuote1 || !sQuote2) {
          NS_WARNING("Mandatory two single quotes are missing in header parameter\n");
        }

        const char *charsetStart = nullptr;
        int32_t charsetLength = 0;
        const char *langStart = nullptr;
        int32_t langLength = 0;
        const char *rawValStart = nullptr;
        int32_t rawValLength = 0;

        if (sQuote2 && sQuote1) {
          // both delimiters present: charSet'lang'rawVal
          rawValStart = sQuote2 + 1;
          rawValLength = valueEnd - rawValStart;

          langStart = sQuote1 + 1;
          langLength = sQuote2 - langStart;

          charsetStart = valueStart;
          charsetLength = sQuote1 - charsetStart;
        }
        else if (sQuote1) {
          // one delimiter; assume charset'rawVal
          rawValStart = sQuote1 + 1;
          rawValLength = valueEnd - rawValStart;

          charsetStart = valueStart;
          charsetLength = sQuote1 - valueStart;
        }
        else {
          // no delimiter: just rawVal
          rawValStart = valueStart;
          rawValLength = valueEnd - valueStart;
        }

        if (langLength != 0) {
          lang.Assign(langStart, langLength);
        }

        // keep the charset for later
        if (caseB) {
          charsetB.Assign(charsetStart, charsetLength);
        } else {
          // if caseCorD
          charsetCD.Assign(charsetStart, charsetLength);
        }

        // non-empty value part
        if (rawValLength > 0) {
          if (!caseBResult && caseB) {
            if (!IsValidPercentEscaped(rawValStart, rawValLength)) {
              goto increment_str;
            }

            // allocate buffer for the raw value
            char *tmpResult = (char *) nsMemory::Clone(rawValStart, rawValLength + 1);
            if (!tmpResult) {
              goto increment_str;
            }
            *(tmpResult + rawValLength) = 0;

            nsUnescape(tmpResult);
            caseBResult = tmpResult;
          } else {
            // caseC
            bool added = addContinuation(segments, 0, rawValStart,
                                         rawValLength, needExtDecoding,
                                         isQuotedString);

            if (!added) {
              // continuation not added, stop processing them
              acceptContinuations = false;
            }
          }
        }
      }  // end of if-block :  title*0*=  or  title*=
      // caseD: a line of multiline param with no need for unescaping : title*[0-9]=
      // or 2nd or later lines of a caseC param : title*[1-9]*=
      else if (acceptContinuations && segmentNumber != -1) {
        uint32_t valueLength = valueEnd - valueStart;

        bool added = addContinuation(segments, segmentNumber, valueStart,
                                     valueLength, needExtDecoding,
                                     isQuotedString);

        if (!added) {
          // continuation not added, stop processing them
          acceptContinuations = false;
        }
      } // end of if-block :  title*[0-9]= or title*[1-9]*=
    }

    // str now points after the end of the value.
    //   skip over whitespace, ';', whitespace.
increment_str:
    while (nsCRT::IsAsciiSpace(*str)) ++str;
    if (*str == ';') {
      ++str;
    } else {
      // stop processing the header field; either we are done or the
      // separator was missing
      break;
    }
    while (nsCRT::IsAsciiSpace(*str)) ++str;
  }

  caseCDResult = combineContinuations(segments);

  if (caseBResult && !charsetB.IsEmpty()) {
    // check that the 2231/5987 result decodes properly given the
    // specified character set
    if (!IsValidOctetSequenceForCharset(charsetB, caseBResult)) {
      // discard the unusable value; it has to be freed here because the
      // pointer is no longer reachable once it has been cleared
      nsMemory::Free(caseBResult);
      caseBResult = nullptr;
    }
  }

  if (caseCDResult && !charsetCD.IsEmpty()) {
    // check that the 2231/5987 result decodes properly given the
    // specified character set
    if (!IsValidOctetSequenceForCharset(charsetCD, caseCDResult)) {
      // same as above: free before dropping the pointer
      nsMemory::Free(caseCDResult);
      caseCDResult = nullptr;
    }
  }

  if (caseBResult) {
    // prefer simple 5987 format over 2231 with continuations
    *aResult = caseBResult;
    caseBResult = nullptr;
    charset.Assign(charsetB);
  }
  else if (caseCDResult) {
    // prefer 2231/5987 with or without continuations over plain format
    *aResult = caseCDResult;
    caseCDResult = nullptr;
    charset.Assign(charsetCD);
  }
  else if (caseAResult) {
    *aResult = caseAResult;
    caseAResult = nullptr;
  }

  // free unused stuff (the pointer that was handed out has been cleared)
  nsMemory::Free(caseAResult);
  nsMemory::Free(caseBResult);
  nsMemory::Free(caseCDResult);

  // if we have a result
  if (*aResult) {
    // then return charset and lang as well
    if (aLang && !lang.IsEmpty()) {
      uint32_t len = lang.Length();
      *aLang = (char *) nsMemory::Clone(lang.BeginReading(), len + 1);
      if (*aLang) {
        *(*aLang + len) = 0;
      }
    }
    if (aCharset && !charset.IsEmpty()) {
      uint32_t len = charset.Length();
      *aCharset = (char *) nsMemory::Clone(charset.BeginReading(), len + 1);
      if (*aCharset) {
        *(*aCharset + len) = 0;
      }
    }
  }

  return *aResult ? NS_OK : NS_ERROR_INVALID_ARG;
}

// Decode a header value that may contain RFC 2047 encoded-words and store
// the outcome in aResult.
//
//   aHeaderVal         NUL-terminated header value
//   aDefaultCharset    charset assumed for raw text; may be null
//   aOverrideCharset   handed on to DecodeRFC2047Str
//   aEatContinuations  if true, line folding is removed from the result
//
// Returns NS_ERROR_INVALID_ARG if aHeaderVal is null and NS_OK otherwise
// (an empty header yields an empty result).
nsresult
internalDecodeRFC2047Header(const char* aHeaderVal, const char* aDefaultCharset,
                            bool aOverrideCharset, bool aEatContinuations,
                            nsACString& aResult)
{
  aResult.Truncate();
  if (!aHeaderVal) {
    return NS_ERROR_INVALID_ARG;
  }
  if (*aHeaderVal == '\0') {
    return NS_OK;
  }

  // If aHeaderVal is RFC 2047 encoded or is not a UTF-8 string  but
  // aDefaultCharset is specified, decodes RFC 2047 encoding and converts
  // to UTF-8. Otherwise, just strips away CRLF.
  const bool needsDecoding =
    PL_strstr(aHeaderVal, "=?") ||
    (aDefaultCharset &&
     (!IsUTF8(nsDependentCString(aHeaderVal)) ||
      Is7bitNonAsciiString(aHeaderVal, strlen(aHeaderVal))));

  if (needsDecoding) {
    DecodeRFC2047Str(aHeaderVal, aDefaultCharset, aOverrideCharset, aResult);
  } else {
    aResult = aHeaderVal;
    // a plain copy needs unfolding only if it contains a line break at all
    aEatContinuations =
      aEatContinuations &&
      (PL_strchr(aHeaderVal, '\n') || PL_strchr(aHeaderVal, '\r'));
  }

  if (!aEatContinuations) {
    return NS_OK;
  }

  // unfold: a line break followed by a tab becomes one space, any remaining
  // CR/LF is dropped
  nsAutoCString unfolded(aResult);
  unfolded.ReplaceSubstring("\n\t", " ");
  unfolded.ReplaceSubstring("\r\t", " ");
  unfolded.StripChars("\r\n");
  aResult = unfolded;

  return NS_OK;
}

// Interface entry point for RFC 2047 header decoding. Forwards all of its
// arguments unchanged to internalDecodeRFC2047Header and returns its result.
NS_IMETHODIMP
nsMIMEHeaderParamImpl::DecodeRFC2047Header(const char* aHeaderVal,
                                           const char* aDefaultCharset,
                                           bool aOverrideCharset,
                                           bool aEatContinuations,
                                           nsACString& aResult)
{
  return internalDecodeRFC2047Header(aHeaderVal, aDefaultCharset,
                                     aOverrideCharset, aEatContinuations,
                                     aResult);
}

// Tells whether aChar may appear unescaped in an RFC 5987 value, i.e.
// whether it is an "attr-char" (RFC 5987, Section 3.2.1): an ASCII letter,
// an ASCII digit, or one of  ! # $ & + - . ^ _ ` | ~
bool IsRFC5987AttrChar(char aChar)
{
  const bool isLetter = (aChar >= 'a' && aChar <= 'z') ||
                        (aChar >= 'A' && aChar <= 'Z');
  const bool isDigit = aChar >= '0' && aChar <= '9';
  if (isLetter || isDigit) {
    return true;
  }

  switch (aChar) {
    case '!': case '#': case '$': case '&':
    case '+': case '-': case '.': case '^':
    case '_': case '`': case '|': case '~':
      return true;
    default:
      return false;
  }
}

// Percent-decode aValue in place: the string is copied into a scratch
// buffer, unescaped there with nsUnescape and assigned back.
// Returns false if the scratch buffer could not be allocated (aValue is
// left untouched in that case), true otherwise.
bool PercentDecode(nsACString& aValue)
{
  char *scratch = (char *) nsMemory::Alloc(aValue.Length() + 1);
  if (scratch) {
    strcpy(scratch, PromiseFlatCString(aValue).get());
    nsUnescape(scratch);
    aValue.Assign(scratch);
    nsMemory::Free(scratch);
    return true;
  }

  return false;
}

// Decode a parameter value using the encoding defined in RFC 5987
//
// charset  "'" [ language ] "'" value-chars
//
//   aParamVal  the encoded value; must be pure US-ASCII
//   aLang      out: the language part (may be empty)
//   aResult    out: the decoded value as UTF-16
//
// Returns NS_ERROR_INVALID_ARG if the value is malformed (non-ASCII input,
// broken %-escape, character not allowed in value-chars, not exactly two
// single quotes) or if the charset is anything other than UTF-8, and
// NS_ERROR_OUT_OF_MEMORY if percent-decoding runs out of memory. Failures
// of the UTF-8 converter service are passed on.
NS_IMETHODIMP
nsMIMEHeaderParamImpl::DecodeRFC5987Param(const nsACString& aParamVal,
                                          nsACString& aLang,
                                          nsAString& aResult)
{
  nsAutoCString charset;
  nsAutoCString language;
  nsAutoCString value;

  uint32_t delimiters = 0;

  // Bind the flattened string to a reference so that it stays alive for the
  // whole parse. Taking .get() of the temporary directly would leave us with
  // a dangling pointer whenever flattening has to make a copy.
  const nsAFlatCString& flatParam = PromiseFlatCString(aParamVal);
  const char *c = flatParam.get();

  while (*c) {
    char tc = *c++;

    if (tc == '\'') {
      // single quote
      delimiters++;
    } else if (((unsigned char)tc) >= 128) {
      // fail early, not ASCII
      NS_WARNING("non-US-ASCII character in RFC5987-encoded param");
      return NS_ERROR_INVALID_ARG;
    } else {
      if (delimiters == 0) {
        // valid characters are checked later implicitly
        charset.Append(tc);
      } else if (delimiters == 1) {
        // no value checking for now
        language.Append(tc);
      } else if (delimiters == 2) {
        if (IsRFC5987AttrChar(tc)) {
          value.Append(tc);
        } else if (tc == '%') {
          // the string is NUL-terminated, so looking at c[0] and then c[1]
          // cannot run past its end (a NUL is not a hex digit)
          if (!IsHexDigit(c[0]) || !IsHexDigit(c[1])) {
            // we expect two more characters
            NS_WARNING("broken %-escape in RFC5987-encoded param");
            return NS_ERROR_INVALID_ARG;
          }
          value.Append(tc);
          // we consume two more
          value.Append(*c++);
          value.Append(*c++);
        } else {
          // character not allowed here
          NS_WARNING("invalid character in RFC5987-encoded param");
          return NS_ERROR_INVALID_ARG;
        }
      }
      // anything after a third quote is skipped here; the delimiter count
      // check below rejects such input
    }
  }

  if (delimiters != 2) {
    NS_WARNING("missing delimiters in RFC5987-encoded param");
    return NS_ERROR_INVALID_ARG;
  }

  // abort early for unsupported encodings
  if (!charset.LowerCaseEqualsLiteral("utf-8")) {
    NS_WARNING("unsupported charset in RFC5987-encoded param");
    return NS_ERROR_INVALID_ARG;
  }

  // percent-decode
  if (!PercentDecode(value)) {
    return NS_ERROR_OUT_OF_MEMORY;
  }

  // return the language
  aLang.Assign(language);

  // finally convert octet sequence to UTF-8 and be done
  nsresult rv = NS_OK;
  nsCOMPtr<nsIUTF8ConverterService> cvtUTF8 =
    do_GetService(NS_UTF8CONVERTERSERVICE_CONTRACTID, &rv);
  NS_ENSURE_SUCCESS(rv, rv);

  nsAutoCString utf8;
  rv = cvtUTF8->ConvertStringToUTF8(value, charset.get(), true, false, 1, utf8);
  NS_ENSURE_SUCCESS(rv, rv);

  CopyUTF8toUTF16(utf8, aResult);
  return NS_OK;
}

// Turn a raw parameter value into its decoded form in aResult.
//
//   aParamValue       the raw value
//   aCharset          charset of the value if it came from RFC 2231/5987
//                     encoding; may be null or empty
//   aDefaultCharset   handed on to the RFC 2047 decoder
//   aOverrideCharset  handed on to the RFC 2047 decoder
//   aDecode2047       whether RFC 2047 decoding is attempted
//
// With a non-empty aCharset (and the converter service available) the value
// is converted from that charset and the converter's result is returned.
// Otherwise backslash quoting is removed and, if requested, RFC 2047
// decoding is tried on top of that.
nsresult
internalDecodeParameter(const nsACString& aParamValue, const char* aCharset,
                        const char* aDefaultCharset, bool aOverrideCharset,
                        bool aDecode2047, nsACString& aResult)
{
  aResult.Truncate();

  // If aCharset is given, aParamValue was obtained from RFC2231/5987
  // encoding and we're pretty sure that it's in aCharset.
  if (aCharset && *aCharset) {
    nsCOMPtr<nsIUTF8ConverterService> converter(do_GetService(NS_UTF8CONVERTERSERVICE_CONTRACTID));
    if (converter) {
      return converter->ConvertStringToUTF8(aParamValue, aCharset,
                                            true, true, 1, aResult);
    }
  }

  const nsAFlatCString& flatValue = PromiseFlatCString(aParamValue);
  nsACString::const_iterator cur, end;
  flatValue.BeginReading(cur);
  flatValue.EndReading(end);

  // strip '\' when used to quote CR, LF, '"' and '\'
  nsAutoCString unescaped;
  while (cur != end) {
    if (*cur == '\\') {
      nsACString::const_iterator next = cur;
      ++next;
      // The backslash is dropped only if it really quotes one of the four
      // characters; at the end of the string, or in front of anything else,
      // it is copied like an ordinary character.
      if (next != end &&
          (*next == nsCRT::CR || *next == nsCRT::LF ||
           *next == '"' || *next == '\\')) {
        cur = next;
      }
    }
    unescaped.Append(*cur);
    ++cur;
  }

  aResult = unescaped;
  if (!aDecode2047) {
    return NS_OK;
  }

  // Try RFC 2047 encoding, instead.
  nsAutoCString decoded;
  nsresult rv = internalDecodeRFC2047Header(unescaped.get(), aDefaultCharset,
                                            aOverrideCharset, true, decoded);

  // keep the merely unquoted value unless decoding produced something
  if (NS_SUCCEEDED(rv) && !decoded.IsEmpty()) {
    aResult = decoded;
  }

  return rv;
}

// Interface entry point for decoding a raw parameter value. Forwards to
// internalDecodeParameter with RFC 2047 decoding always enabled (the
// aDecode2047 argument is passed as true) and returns its result.
NS_IMETHODIMP
nsMIMEHeaderParamImpl::DecodeParameter(const nsACString& aParamValue,
                                       const char* aCharset,
                                       const char* aDefaultCharset,
                                       bool aOverrideCharset,
                                       nsACString& aResult)
{
  return internalDecodeParameter(aParamValue, aCharset, aDefaultCharset,
                                 aOverrideCharset, true, aResult);
}

// True if |c| is an ASCII hex digit: '0'-'9' (0x30-0x39), 'A'-'F'
// (0x41-0x46) or 'a'-'f' (0x61-0x66). Note that |c| is evaluated several
// times.
#define ISHEXCHAR(c) \
        ((0x30 <= uint8_t(c) && uint8_t(c) <= 0x39)  ||  \
         (0x41 <= uint8_t(c) && uint8_t(c) <= 0x46)  ||  \
         (0x61 <= uint8_t(c) && uint8_t(c) <= 0x66))

// Decode Q encoding (RFC 2047).
//
// Decodes the |length| characters starting at |in|: "=hh" (two hex digits)
// becomes the octet hh, '_' becomes a space, any other 7-bit character is
// copied. Tabs in the decoded text are turned into spaces afterwards.
//
// Returns a NUL-terminated buffer obtained from PR_Calloc, or nullptr if the
// allocation fails or the input is malformed (broken '=' escape, or a
// character with the high bit set).
// static
char *DecodeQ(const char *in, uint32_t length)
{
  char *decoded = (char *)PR_Calloc(length + 1, sizeof(char));
  if (!decoded) {
    return nullptr;
  }

  char *out = decoded;
  while (length > 0) {
    if (*in == '=') {
      // check if |in| in the form of '=hh'  where h is [0-9a-fA-F].
      if (length < 3 || !ISHEXCHAR(in[1]) || !ISHEXCHAR(in[2])) {
        PR_Free(decoded);
        return nullptr;
      }
      unsigned octet = 0;
      PR_sscanf(in + 1, "%2X", &octet);
      *out++ = (char) octet;
      in += 3;
      length -= 3;
    } else if (*in == '_') {
      *out++ = ' ';
      in++;
      length--;
    } else {
      if (*in & 0x80) {
        // 8-bit characters are not allowed in Q-encoded text
        PR_Free(decoded);
        return nullptr;
      }
      *out++ = *in++;
      length--;
    }
  }
  *out = '\0';

  // normalize tabs to spaces (stops at the first NUL in the decoded text)
  for (char *p = decoded; *p; ++p) {
    if (*p == '\t') {
      *p = ' ';
    }
  }

  return decoded;
}

// check if input is HZ (a 7bit encoding for simplified Chinese : RFC 1843)
// or has  ESC which may be an  indication that  it's in one of many ISO
// 2022 7bit  encodings (e.g. ISO-2022-JP(-2)/CN : see RFC 1468, 1922, 1554).
//
//   input  the text to inspect; only the first |len| characters are looked
//          at, so it does not have to be NUL-terminated
//   len    number of characters to inspect
//
// Returns false as soon as a character with the high bit set is found, true
// as soon as an ESC (0x1B) is found, and otherwise true only if the scan
// ends having seen a complete HZ sequence ("~~" or "~{" ... "~}").
// static
bool Is7bitNonAsciiString(const char *input, uint32_t len)
{
  int32_t c;

  enum { hz_initial, // No HZ seen yet
         hz_escaped, // Inside an HZ ~{ escape sequence
         hz_seen, // Have seen at least one complete HZ sequence
         hz_notpresent // Have seen something that is not legal HZ
  } hz_state;

  hz_state = hz_initial;
  while (len) {
    c = uint8_t(*input++);
    len--;
    if (c & 0x80) return false;
    if (c == 0x1B) return true;
    if (c == '~') {
      // Look at the character after the '~' only if it is still inside the
      // range. A '~' at the very end is treated as if it were followed by
      // NUL, which is what a NUL-terminated input yields anyway. This also
      // guarantees that |len| is non-zero when it is decremented below.
      const char next = len ? *input : '\0';

      switch (hz_state) {
      case hz_initial:
      case hz_seen:
        if (next == '{') {
          hz_state = hz_escaped;
        } else if (next == '~') {
          // ~~ is the HZ encoding of ~.  Skip over second ~ as well
          hz_state = hz_seen;
          input++;
          len--;
        } else {
          hz_state = hz_notpresent;
        }
        break;

      case hz_escaped:
        if (next == '}') hz_state = hz_seen;
        break;
      default:
        break;
      }
    }
  }
  return hz_state == hz_seen;
}

#define REPLACEMENT_CHAR "\357\277\275" // EF BF BD (UTF-8 encoding of U+FFFD)

// copy 'raw' sequences of octets in aInput to aOutput.
// If aDefaultCharset is specified, the input is assumed to be in the
// charset and converted to UTF-8. Otherwise, a blind copy is made.
// If aDefaultCharset is specified, but the conversion to UTF-8
// is not successful, each octet is replaced by Unicode replacement
// chars. *aOutput is advanced by the number of output octets.
// static
void CopyRawHeader(const char *aInput, uint32_t aLen,
                   const char *aDefaultCharset, nsACString &aOutput)
{
  int32_t c;

  // If aDefaultCharset is not specified, make a blind copy.
  if (!aDefaultCharset || !*aDefaultCharset) {
    aOutput.Append(aInput, aLen);
    return;
  }

  // Copy as long as it's US-ASCII.  An ESC may indicate ISO 2022
  // A ~ may indicate it is HZ
  while (aLen && (c = uint8_t(*aInput++)) != 0x1B && c != '~' && !(c & 0x80)) {
    aOutput.Append(char(c));
    aLen--;
  }
  if (!aLen) {
    return;
  }
  aInput--;

  // skip ASCIIness/UTF8ness test if aInput is supected to be a 7bit non-ascii
  // string and aDefaultCharset is a 7bit non-ascii charset.
  bool skipCheck = (c == 0x1B || c == '~') &&
                     IS_7BIT_NON_ASCII_CHARSET(aDefaultCharset);

  // If not UTF-8, treat as default charset
  nsCOMPtr<nsIUTF8ConverterService>
    cvtUTF8(do_GetService(NS_UTF8CONVERTERSERVICE_CONTRACTID));
  nsAutoCString utf8Text;
  if (cvtUTF8 &&
      NS_SUCCEEDED(
      cvtUTF8->ConvertStringToUTF8(Substring(aInput, aInput + aLen),
                                   aDefaultCharset, skipCheck, true, 1,
                                   utf8Text))) {
    aOutput.Append(utf8Text);
  } else { // replace each octet with Unicode replacement char in UTF-8.
    for (uint32_t i = 0; i < aLen; i++) {
      c = uint8_t(*aInput++);
      if (c & 0x80)
        aOutput.Append(REPLACEMENT_CHAR);
      else
        aOutput.Append(char(c));
    }
  }
}

// Decode the payload of an RFC 2047 encoded-word and append it, converted to
// UTF-8, to aResult.
//
//   aEncoded / aLen  the encoded text (between the 2nd and 3rd '?')
//   aQOrBase64       'Q' for Q-encoding, 'B' for base64
//   aCharset         charset label the decoded octets are converted from
//
// Returns NS_ERROR_INVALID_ARG if aQOrBase64 is neither 'Q' nor 'B' or if the
// payload cannot be decoded, the failure code from obtaining or running the
// UTF-8 converter service if that fails, and NS_OK otherwise.  aResult is
// only appended to on success.
nsresult DecodeQOrBase64Str(const char *aEncoded, size_t aLen, char aQOrBase64,
                            const char *aCharset, nsACString &aResult)
{
  NS_ASSERTION(aQOrBase64 == 'Q' || aQOrBase64 == 'B', "Should be 'Q' or 'B'");

  char *rawOctets = nullptr;
  switch (aQOrBase64) {
    case 'Q':
      rawOctets = DecodeQ(aEncoded, aLen);
      break;
    case 'B':
      rawOctets = PL_Base64Decode(aEncoded, aLen, nullptr);
      break;
    default:
      return NS_ERROR_INVALID_ARG;
  }
  if (!rawOctets) {
    return NS_ERROR_INVALID_ARG;
  }

  nsresult rv;
  nsCOMPtr<nsIUTF8ConverterService>
    converter(do_GetService(NS_UTF8CONVERTERSERVICE_CONTRACTID, &rv));
  nsAutoCString converted;
  if (NS_SUCCEEDED(rv)) {
    // skip ASCIIness/UTF8ness test if aCharset is 7bit non-ascii charset.
    const bool skipCheck = IS_7BIT_NON_ASCII_CHARSET(aCharset);
    rv = converter->ConvertStringToUTF8(nsDependentCString(rawOctets),
                                        aCharset, skipCheck,
                                        true, 1, converted);
  }

  // The decoded buffer is ours to release whether or not conversion worked.
  PR_Free(rawOctets);

  if (NS_FAILED(rv)) {
    return rv;
  }

  aResult.Append(converted);
  return NS_OK;
}

// RFC 2047 "especials": characters that are not allowed inside the charset
// (or encoding) token of an encoded-word.
static const char especials[] = "()<>@,;:\\\"/[]?.=";

// |decode_mime_part2_str| taken from comi18n.c
// Decode RFC2047-encoded words in the input and convert the result to UTF-8.
// If aOverrideCharset is true, charset in RFC2047-encoded words is
// ignored and aDefaultCharset is assumed, instead. aDefaultCharset
// is also used to convert raw octets (without RFC 2047 encoding) to UTF-8.
//
// The decoded text is appended to aResult; afterwards every TAB in aResult
// is replaced by a space. Malformed encoded-words are copied through
// verbatim, and an encoded-word whose payload cannot be decoded/converted
// contributes its undecoded payload instead. The function therefore always
// returns NS_OK.
//
// Consecutive encoded-words with the same charset and encoding whose payload
// does not end in '=' are buffered and decoded together (bug 493544), so
// that a multi-byte sequence split across words is converted as one unit.
//static
nsresult DecodeRFC2047Str(const char *aHeader, const char *aDefaultCharset,
                          bool aOverrideCharset, nsACString &aResult)
{
  // p: start of the current "=?" candidate (then just past it)
  // q: scanning cursor; ends up on the encoding character ('Q' or 'B')
  // r: the '?' that closes the encoded text
  const char *p, *q = nullptr, *r;
  const char *begin; // tracking pointer for where we are in the input buffer
  int32_t isLastEncodedWord = 0;
  const char *charsetStart, *charsetEnd;
  nsAutoCString prevCharset, curCharset;
  nsAutoCString encodedText; // buffered payload of not-yet-decoded words
  char prevEncoding = '\0', curEncoding;
  nsresult rv;

  begin = aHeader;

  // To avoid buffer realloc, if possible, set capacity in advance. No
  // matter what,  more than 3x expansion can never happen for all charsets
  // supported by Mozilla. SCSU/BCSU with the sliding window set to a
  // non-BMP block may be exceptions, but Mozilla does not support them.
  // Neither any known mail/news program use them. Even if there's, we're
  // safe because we don't use a raw *char any more.
  aResult.SetCapacity(3 * strlen(aHeader));

  while ((p = PL_strstr(begin, "=?")) != nullptr) {
    if (isLastEncodedWord) {
      // See if it's all whitespace.
      for (q = begin; q < p; ++q) {
        if (!PL_strchr(" \t\r\n", *q)) break;
      }
    }

    // Whitespace-only text between two encoded-words is dropped; anything
    // else ends the run of adjacent encoded-words.
    if (!isLastEncodedWord || q < p) {
      if (!encodedText.IsEmpty()) {
        rv = DecodeQOrBase64Str(encodedText.get(), encodedText.Length(),
                                prevEncoding, prevCharset.get(), aResult);
        if (NS_FAILED(rv)) {
          aResult.Append(encodedText);
        }
        encodedText.Truncate();
        prevCharset.Truncate();
        prevEncoding = '\0';
      }
      // copy the part before the encoded-word
      CopyRawHeader(begin, p - begin, aDefaultCharset, aResult);
      begin = p;
    }

    p += 2;

    // Get charset info
    charsetStart = p;
    charsetEnd = nullptr;
    for (q = p; *q != '?'; q++) {
      // Also catches the terminating NUL, so the scan cannot run off the end.
      if (*q <= ' ' || PL_strchr(especials, *q)) {
        goto badsyntax;
      }

      // RFC 2231 section 5
      if (!charsetEnd && *q == '*') {
        charsetEnd = q;
      }
    }
    if (!charsetEnd) {
      charsetEnd = q;
    }

    q++;
    curEncoding = nsCRT::ToUpper(*q);
    if (curEncoding != 'Q' && curEncoding != 'B')
      goto badsyntax;

    if (q[1] != '?')
      goto badsyntax;

    // Find the end of the encoded text; control characters (including the
    // terminating NUL) make the word malformed.
    for (r = q + 2; *r != '?'; r++) {
      if (*r < ' ') goto badsyntax;
    }
    if (r[1] != '=')
        goto badsyntax;
    else if (r == q + 2) {
        // it's empty, skip
        begin = r + 2;
        isLastEncodedWord = 1;
        continue;
    }

    curCharset.Assign(charsetStart, charsetEnd - charsetStart);
    // Override charset if requested.  Never override labeled UTF-8.
    // Use default charset instead of UNKNOWN-8BIT
    if ((aOverrideCharset && 0 != nsCRT::strcasecmp(curCharset.get(), "UTF-8"))
    || (aDefaultCharset && 0 == nsCRT::strcasecmp(curCharset.get(), "UNKNOWN-8BIT"))
    ) {
      curCharset = aDefaultCharset;
    }

    // NOTE: R and bDecoded are declared without initializers on purpose: the
    // |goto badsyntax| statements above jump over these declarations.
    const char *R; // end of the payload actually handed to the decoder
    R = r;
    if (curEncoding == 'B') {
      // bug 227290. ignore an extraneous '=' at the end.
      // (# of characters in B-encoded part has to be a multiple of 4)
      int32_t n = r - (q + 2);
      R -= (n % 4 == 1 && !PL_strncmp(r - 3, "===", 3)) ? 1 : 0;
    }
    // Bug 493544. Don't decode the encoded text until it ends
    if (R[-1] != '='
      && (prevCharset.IsEmpty()
        || (curCharset == prevCharset && curEncoding == prevEncoding))
    ) {
      encodedText.Append(q + 2, R - (q + 2));
      prevCharset = curCharset;
      prevEncoding = curEncoding;

      begin = r + 2;
      isLastEncodedWord = 1;
      continue;
    }

    bool bDecoded; // If the current line has been decoded.
    bDecoded = false;
    if (!encodedText.IsEmpty()) {
      if (curCharset == prevCharset && curEncoding == prevEncoding) {
        // Same charset/encoding: decode the buffered text and this word as
        // a single unit.
        encodedText.Append(q + 2, R - (q + 2));
        bDecoded = true;
      }
      rv = DecodeQOrBase64Str(encodedText.get(), encodedText.Length(),
                              prevEncoding, prevCharset.get(), aResult);
      if (NS_FAILED(rv)) {
        aResult.Append(encodedText);
      }
      encodedText.Truncate();
      prevCharset.Truncate();
      prevEncoding = '\0';
    }
    if (!bDecoded) {
      rv = DecodeQOrBase64Str(q + 2, R - (q + 2), curEncoding,
                              curCharset.get(), aResult);
      if (NS_FAILED(rv)) {
        // encodedText is always empty here, so fall back to the undecoded
        // payload of the current word, like the other failure paths do for
        // the buffered text.
        aResult.Append(q + 2, R - (q + 2));
      }
    }

    begin = r + 2;
    isLastEncodedWord = 1;
    continue;

  badsyntax:
    // Flush whatever was buffered from preceding well-formed words first.
    if (!encodedText.IsEmpty()) {
      rv = DecodeQOrBase64Str(encodedText.get(), encodedText.Length(),
                              prevEncoding, prevCharset.get(), aResult);
      if (NS_FAILED(rv)) {
        aResult.Append(encodedText);
      }
      encodedText.Truncate();
      prevCharset.Truncate();
    }
    // copy the part before the encoded-word (p is already past the "=?", so
    // the "=?" itself is copied too and the search resumes right after it)
    aResult.Append(begin, p - begin);
    begin = p;
    isLastEncodedWord = 0;
  }

  // Decode anything still buffered when the input ran out of "=?".
  if (!encodedText.IsEmpty()) {
    rv = DecodeQOrBase64Str(encodedText.get(), encodedText.Length(),
                            prevEncoding, prevCharset.get(), aResult);
    if (NS_FAILED(rv)) {
      aResult.Append(encodedText);
    }
  }

  // put the tail back
  CopyRawHeader(begin, strlen(begin), aDefaultCharset, aResult);

  // Normalize TABs to spaces in the whole result.
  nsAutoCString tempStr(aResult);
  tempStr.ReplaceChar('\t', ' ');
  aResult = tempStr;

  return NS_OK;
}