
// -----------------------------------------------------------------------------
//
// Interface:                   G B - 1 8 0 3 0
//
// Description: helper class to convert GB-18030 coded string (chinese)
//                into UTF-8 string.
//
// Note: supports only a subset of entire GB-18030 code page
//
// Author: Martin Koch, Fa. ESE, 2013-09-18
//

//lint -e750 unreferenced macros

#include <cstring>

// components header
#include "asf/codecs/gb18030/GB18030Util.h"

///////////////////////////////////////////////////////////////////////////////
//
//      Extract of                ICU4C 4.6.1
//
//            ICU - International Components for Unicode
//                (http://site.icu-project.org/home)
//
//      Tiny, but just sufficient to encode Unicode code points as UTF-8
//
//      ICU defines macros handling many cases of low-level
//      string operations inline
//
///////////////////////////////////////////////////////////////////////////////
//
//      UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
//
//          Unicode Data Files include all data files under the directories
//      http://www.unicode.org/Public/, http://www.unicode.org/reports/, and
//      http://www.unicode.org/cldr/data/. Unicode Data Files do not include PDF
//      online code charts under the directory http://www.unicode.org/Public/.
//      Software includes any source code published in the Unicode Standard or under
//      the directories http://www.unicode.org/Public/,
//      http://www.unicode.org/reports/, and http://www.unicode.org/cldr/data/.
//
//          NOTICE TO USER: Carefully read the following legal agreement. BY
//      DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S DATA FILES
//      ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"), YOU UNEQUIVOCALLY ACCEPT, AND
//      AGREE TO BE BOUND BY, ALL OF THE TERMS AND CONDITIONS OF THIS AGREEMENT. IF
//      YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE THE DATA
//      FILES OR SOFTWARE.
//
//          COPYRIGHT AND PERMISSION NOTICE
//
//          Copyright © 1991-2011 Unicode, Inc. All rights reserved. Distributed under
//      the Terms of Use in http://www.unicode.org/copyright.html.
//
//          Permission is hereby granted, free of charge, to any person obtaining a
//      copy of the Unicode data files and any associated documentation (the "Data
//      Files") or Unicode software and any associated documentation (the "Software")
//      to deal in the Data Files or Software without restriction, including without
//      limitation the rights to use, copy, modify, merge, publish, distribute, and/or
//      sell copies of the Data Files or Software, and to permit persons to whom the
//      Data Files or Software are furnished to do so, provided that (a) the above
//      copyright notice(s) and this permission notice appear with all copies of the
//      Data Files or Software, (b) both the above copyright notice(s) and this
//      permission notice appear in associated documentation, and (c) there is clear
//      notice in each modified Data File or in the Software as well as in the
//      documentation associated with the Data File(s) or Software that the data or
//      software has been modified.
//
//          THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
//      KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
//      MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD
//      PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN
//      THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
//      DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
//      PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
//      ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE
//      DATA FILES OR SOFTWARE.
//
//          Except as contained in this notice, the name of a copyright holder shall
//      not be used in advertising or otherwise to promote the sale, use or other
//      dealings in these Data Files or Software without prior written authorization
//      of the copyright holder.
//
///////////////////////////////////////////////////////////////////////////////
//
//                        common\unicode\umachine.h
//
//      This file defines basic types and constants for utf.h to be
//      platform-independent.
//
//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

/**
 * Define UChar32 as a type for single Unicode code points.
 * UChar32 is a signed 32-bit integer (same as int32_t).
 *
 * The Unicode code point range is 0..0x10ffff.
 * All other values (negative or >=0x110000) are illegal as Unicode code points.
 * They may be used as sentinel values to indicate "done", "error"
 * or similar non-code point conditions.
 *
 * Before ICU 2.4 (Jitterbug 2146), UChar32 was defined
 * to be wchar_t if that is 32 bits wide (wchar_t may be signed or unsigned)
 * or else to be uint32_t.
 * That is, the definition of UChar32 was platform-dependent.
 *
 * @see U_SENTINEL
 * @stable ICU 2.4
 */
// typedef int32_t UChar32;

//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
//
//                          common\unicode\utf.h
//
//      This file defines macros for checking whether a code point is
//      a surrogate or a non-character etc.
//
//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

/**
 * This value is intended for sentinel values for APIs that
 * (take or) return single code points (UChar32).
 * It is outside of the Unicode code point range 0..0x10ffff.
 *
 * For example, a "done" or "error" value in a new API
 * could be indicated with U_SENTINEL.
 *
 * ICU APIs designed before ICU 2.4 usually define service-specific "done"
 * values, mostly 0xffff.
 * Those may need to be distinguished from
 * actual U+ffff text contents by calling functions like
 * CharacterIterator::hasNext() or UnicodeString::length().
 *
 * @return -1
 * @see UChar32
 * @stable ICU 2.4
 */

// Parenthesized so the negative literal stays intact inside larger expressions.
#define U_SENTINEL (-1)

/**
 * Is this code point a surrogate (U+d800..U+dfff)?
 *
 * Masking away the low 11 bits leaves exactly 0xd800 for the 2048 values
 * 0xd800..0xdfff and nothing else, so a single compare is sufficient.
 * The argument is expanded only once.
 *
 * @param c 32-bit code point
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U_IS_SURROGATE(c) (((c)&0xfffff800) == 0xd800)

//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
//
//                       common\unicode\utf8.h
//
//      This file defines macros to deal with 8-bit Unicode (UTF-8)
//      code units (bytes) and strings.
//
//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

/**
 * How many code units (bytes) are used for the UTF-8 encoding
 * of this Unicode code point?
 *
 * The argument is cast to uint32_t before every comparison, so a negative
 * value (such as U_SENTINEL) compares as greater than 0x10ffff and yields 0.
 * The macro expands its argument several times; pass an expression
 * without side effects.
 *
 * @param c 32-bit code point
 * @return 1..4, or 0 if c is a surrogate or not a Unicode code point
 * @stable ICU 2.4
 */
#define U8_LENGTH(c)                                                                              \
    ((uint32_t)(c) <= 0x7f                                                                        \
         ? 1                                                                                      \
         : ((uint32_t)(c) <= 0x7ff                                                                \
                ? 2                                                                               \
                : ((uint32_t)(c) <= 0xd7ff ? 3                                                    \
                                           : ((uint32_t)(c) <= 0xdfff || (uint32_t)(c) > 0x10ffff \
                                                  ? 0                                             \
                                                  : ((uint32_t)(c) <= 0xffff ? 3 : 4)))))

/**
 * Append a code point to a string, overwriting 1 to 4 bytes.
 * The offset points to the current end of the string contents
 * and is advanced (post-increment).
 * "Unsafe" macro, assumes a valid code point and sufficient space in the string.
 * Otherwise, the result is undefined.
 *
 * No validation is performed: a surrogate (U+d800..U+dfff) is written as
 * 3 bytes and a value above 0x10ffff as 4 bytes, whereas U8_LENGTH reports
 * 0 for both. Callers that size a buffer with U8_LENGTH must therefore
 * filter such values before appending.
 *
 * The macro expands to a brace-enclosed block (a statement, not an
 * expression) and evaluates s, i and c several times; pass expressions
 * without side effects.
 *
 * @param s uint8_t * string buffer (written to)
 * @param i string offset, must be a modifiable lvalue
 * @param c code point to append
 * @see U8_APPEND
 * @stable ICU 2.4
 */
#define U8_APPEND_UNSAFE(s, i, c)                                        \
    {                                                                    \
        if ((uint32_t)(c) <= 0x7f) {                                     \
            (s)[(i)++] = (uint8_t)(c);                                   \
        } else {                                                         \
            if ((uint32_t)(c) <= 0x7ff) {                                \
                (s)[(i)++] = (uint8_t)(((c) >> 6) | 0xc0);               \
            } else {                                                     \
                if ((uint32_t)(c) <= 0xffff) {                           \
                    (s)[(i)++] = (uint8_t)(((c) >> 12) | 0xe0);          \
                } else {                                                 \
                    (s)[(i)++] = (uint8_t)(((c) >> 18) | 0xf0);          \
                    (s)[(i)++] = (uint8_t)((((c) >> 12) & 0x3f) | 0x80); \
                }                                                        \
                (s)[(i)++] = (uint8_t)((((c) >> 6) & 0x3f) | 0x80);      \
            }                                                            \
            (s)[(i)++] = (uint8_t)(((c)&0x3f) | 0x80);                   \
        }                                                                \
    }

//
//      E N D  of                ICU4C 4.6.1
//
///////////////////////////////////////////////////////////////////////////////

namespace utf {

// Declaration of the conversion table lookup (not defined in the visible part
// of this file; presumably it lives next to the conversion table itself).
// s32Parse() passes the bytes of one GB-18030 character packed into
// GB18030Val with the first byte in the most significant position
// (two-byte: (b1 << 8) + b2, four-byte: b1..b4 in bits 31..0) and only uses
// ru32CodePoint when the call returns true.
extern bool bFindCodePoint(uint32_t GB18030Val, uint32_t& ru32CodePoint);

// ==========================================================================
//
//                         D e f i n i t i o n s
//

/** Maximum length/number of bytes of a UTF-8 character.
 *  NOTE(review): U8_LENGTH never reports more than 4 bytes; the value 8 looks
 *  like a generous buffer size rather than an encoding limit - confirm. */
#define MOST_FI_C_MAX_LENGTH_OF_UTF8_CHARACTER 8
/** Maximum length/number of bytes of a GB18030 character. */
#define MOST_FI_C_MAX_LENGTH_OF_GB18030_CHARACTER sizeof(uint32_t)
/** Maximum length/number of bytes of a Unicode character (even if
 *  MOST_FI_GB18030_Unicode_Mapping_Table contains only two byte unicode values). */
#define MOST_FI_C_MAX_LENGTH_OF_UNICODE_CHARACTER sizeof(uint32_t)

/** Divisor for halving a value. */
#define MOST_FI_HALF_DIVISOR 2

#define MOST_FI_VAL_TWO 0x02             /**< value 0x02; s32Parse subtracts it from 0x80 to get 0x7E */
#define MOST_FI_HEX_VAL_THIRTY 0x30      /**< value 0x30; lowest second byte of a four-byte sequence in s32Parse */
#define MOST_FI_HEX_VAL_THIRTY_NINE 0x39 /**< value 0x39; highest second byte of a four-byte sequence in s32Parse */
#define MOST_FI_HEX_VAL_FOURTY 0x40      /**< value 0x40; lowest second byte of a two-byte sequence in s32Parse */

// Shift widths in bits.
#define MOST_FI_SHIFT_6_BITS 0x06
//#define MOST_FI_SHIFT_7_BITS                  0x07
#define MOST_FI_SHIFT_8_BITS 0x08
//#define MOST_FI_SHIFT_10_BITS                 0x0A
#define MOST_FI_SHIFT_12_BITS 0x0C
#define MOST_FI_SHIFT_16_BITS 0x10
#define MOST_FI_SHIFT_18_BITS 0x12
#define MOST_FI_SHIFT_24_BITS 0x18

/** Exclusive upper limit for 1 byte UTF-8 coding: values below 0x80 take one
 *  byte. The expansion is an unparenthesized cast expression. */
#define MOST_FI_UTF8_UNICODE_LIMIT1 (uint32_t)0x80
/** Exclusive upper limit for 2 byte UTF-8 coding: values below 0x800 take at
 *  most two bytes. */
#define MOST_FI_UTF8_UNICODE_LIMIT2 (uint32_t)0x800
/** Largest valid code point for a BMP value. */
#define MOST_FI_UTF16_MAX_BMP (uint32_t)0x0000FFFF
/** Largest value representable in 31 bits. */
#define MOST_FI_UTF16_MAX_UTF32 (uint32_t)0x7FFFFFFF
/** Value 0xFE; s32Parse uses it as the largest accepted GB-18030 lead byte.
 *  Note that the identifier ends in the digit 0 ("TW0"), not the letter O. */
#define MOST_FI_UTF8_CHAR_LIMIT_MINUS_TW0 0xFE

// ==========================================================================
//
//                        I m p l e m e n t a t i o n
//

// --------------------------------------------------------------------------

/**
 * Default constructor: creates a reader without parsed content, i.e. with a
 * code point count of zero and no code point array allocated.
 */
tclGB18030Reader::tclGB18030Reader()
    : _u32CodePointCount(0),
      _pau32CodePoints(NULL) {
}

// --------------------------------------------------------------------------

/**
 * Virtual destructor: releases the code point array owned by this reader.
 */
tclGB18030Reader::~tclGB18030Reader() {
    // delete[] on a null pointer is a no-op, so no guard is required
    delete[] _pau32CodePoints;
}

// --------------------------------------------------------------------------

int32_t tclGB18030Reader::s32Parse(const char* sGB18030String, const uint32_t u32MaxLength) {
    // parse GB-18030 coded string, setup internal array of Unicode code points
    // and return number of read bytes from sGB18030String
    //
    // 2009-09-19  VTeam-Köhler   Initial Revision derived from media player.
    // 2013-09-19  Martin Koch    Ported to UTF component

    if (NULL == sGB18030String) return -1;

    // clear previous array
    delete[] _pau32CodePoints;
    _pau32CodePoints = 0;
    _u32CodePointCount = 0;

    // determine maximally possible number of unicode code points
    // and allocate array
    uint32_t u32MaxCodePointCount = strlen(sGB18030String);
    if (u32MaxLength < u32MaxCodePointCount) u32MaxCodePointCount = u32MaxLength;
    _pau32CodePoints = new uint32_t[u32MaxCodePointCount];

    uint32_t u32CodePoint;
    int32_t s32Position = 0;
    while (_pau32CodePoints && _u32CodePointCount < u32MaxCodePointCount) {
        uint8_t b1 = sGB18030String[s32Position++]; /* byte 1 of gb18030 character */
        if (0 == b1)
            break;
        else if (b1 <= (MOST_FI_UTF8_UNICODE_LIMIT1 - 1)) {
            _pau32CodePoints[_u32CodePointCount++] = b1;
        } else if (((MOST_FI_UTF8_UNICODE_LIMIT1 + 1) <= b1) &&
                   (b1 <= MOST_FI_UTF8_CHAR_LIMIT_MINUS_TW0)) {
            uint8_t b2 = sGB18030String[s32Position++];
            if (((MOST_FI_HEX_VAL_FOURTY <= b2) &&
                 (b2 <= (MOST_FI_UTF8_UNICODE_LIMIT1 - MOST_FI_VAL_TWO))) ||
                ((MOST_FI_UTF8_UNICODE_LIMIT1 <= b2) && (b2 <= 0xFE))) {
                uint32_t u32GB18030char = (b1 << MOST_FI_SHIFT_8_BITS) + b2;
                if (bFindCodePoint(u32GB18030char, u32CodePoint)) {
                    _pau32CodePoints[_u32CodePointCount++] = u32CodePoint;
                }
            } else {
                if ((MOST_FI_HEX_VAL_THIRTY <= b2) && (b2 <= MOST_FI_HEX_VAL_THIRTY_NINE)) {
                    uint8_t b3 = sGB18030String[s32Position++];
                    if (b3 == 0) {
                        //#ifdef VARIANT_S_FTR_ENABLE_TRC_GEN
                        //  ETG_TRACE_ERR(("GB18030 stream is corrupted!"));
                        //#endif
                        // stream is corrupted
                        return -1;
                    }

                    uint8_t b4 = sGB18030String[s32Position++];
                    uint32_t u32GB18030char = ((uint32_t)b1 << 24) + (b2 << 16) + (b3 << 8) + b4;
                    if (bFindCodePoint(u32GB18030char, u32CodePoint)) {
                        _pau32CodePoints[_u32CodePointCount++] = u32CodePoint;
                    }
                } else {
                    //#ifdef VARIANT_S_FTR_ENABLE_TRC_GEN
                    //  ETG_TRACE_ERR(("GB18030 stream is corrupted!"));
                    //#endif
                    // stream is corrupted
                    return -1;
                }
            }
        } else {
            //#ifdef VARIANT_S_FTR_ENABLE_TRC_GEN
            //  ETG_TRACE_ERR(("GB18030 stream is corrupted!"));
            //#endif
            // stream is corrupted
            return -1;
        }
        //#ifdef VARIANT_S_FTR_ENABLE_TRC_GEN
        //  ETG_TRACE_USR2(("GB18030 array, char%2u: %8x,", outpos-1,
        //  p_GB18030_Unicode_Mapping_Table[outpos-1]));
        //#endif
    }

    return s32Position;
}

// --------------------------------------------------------------------------

/**
 * Determine the buffer size needed to hold the stored code points as a
 * UTF-8 string.
 *
 * @return number of bytes required, including one byte for the string
 *         termination; code points for which U8_LENGTH yields 0 do not
 *         contribute.
 */
uint32_t tclGB18030Reader::u32GetUtf8Length() const {
    uint32_t u32Total = 1;  // the terminating zero always needs one byte

    if (0 == _pau32CodePoints) return u32Total;

    const uint32_t* const pEnd = _pau32CodePoints + _u32CodePointCount;
    for (const uint32_t* pCurrent = _pau32CodePoints; pCurrent != pEnd; ++pCurrent) {
        const uint32_t u32CodePoint = *pCurrent;
        u32Total += U8_LENGTH(u32CodePoint);
    }
    return u32Total;
}

// --------------------------------------------------------------------------

/**
 * Write the stored Unicode code points as a zero-terminated UTF-8 string
 * into the given buffer.
 *
 * Conversion stops at the first code point that no longer fits together
 * with the string termination, so the result may be truncated but is
 * always terminated. Code points that have no UTF-8 encoding (U8_LENGTH
 * yields 0: surrogates and values above 0x10ffff) are skipped, which keeps
 * the number of written bytes consistent with U8_LENGTH-based sizing.
 *
 * @param pBuffer          destination buffer; NULL is rejected.
 * @param u32BufferLength  size of pBuffer in bytes; 0 is rejected.
 * @return number of bytes written including the string termination, or 0 if
 *         pBuffer is NULL or u32BufferLength is 0.
 */
uint32_t tclGB18030Reader::u32ConvertToUtf8(uint8_t* pBuffer, uint32_t u32BufferLength) const {
    if ((NULL == pBuffer) || (0 == u32BufferLength)) return 0;

    // Invariant: u32Written < u32BufferLength, i.e. there is always room
    // left for the string termination.
    uint32_t u32Written = 0;

    if (_pau32CodePoints) {
        for (uint32_t i = 0; i < _u32CodePointCount; ++i) {
            const uint32_t u32CodePoint = _pau32CodePoints[i];
            const uint32_t u32Needed = U8_LENGTH(u32CodePoint);

            // U8_APPEND_UNSAFE does not validate its input and would emit
            // 3 or 4 bytes for a value U8_LENGTH reports as 0 bytes long,
            // bypassing the space check below.
            if (0 == u32Needed) continue;

            // Stop when the character plus the termination does not fit;
            // the subtraction cannot wrap because of the invariant above.
            if (u32Needed >= (u32BufferLength - u32Written)) break;

            U8_APPEND_UNSAFE(pBuffer, u32Written, u32CodePoint);
        }
    }

    // append string termination
    pBuffer[u32Written++] = '\0';

    return u32Written;
}

}  // namespace utf
