/// \file GracenotePhoneticConversion.cpp
///
/// Conversion for X-SAMPA phonemes to L&H+
///
/// Copyright: (c) 2011 Robert Bosch GmbH
///
/// \author Ingo Reise CM-AI/PJ-GM28 (external.ingo.reise@de.bosch.com)
///

#include "GracenoteWrapperLib.h"
#include "gn_phonetic_conversion.h"

/// NULL-terminated token list that verifyLHP() looks up for language code "USA_eng".
/// verifyLHP() accepts the first entry that is a prefix of the remaining input,
/// so a longer token (e.g. "t&S") has to stay ahead of its shorter prefix ("t").
char *USA_eng_alphabet[] = {"e&I", "O&I", "a&I", "a&U", "o&U", "t&S", "d&Z",
                            "E0", "nK", "R+", "r6", "n%)", "I%)", "\'2", "*.", "*,", "*!", "*?", "*;", "*:",
                            "A", "E", "i", "I", "O", "u", "$", "@", "^", "U",
                            "j", "w", "p", "b", "t", "d", "k", "g", "f", "v", "s", "z", "S", "Z", "h", "m", "n", "l", "?", "T", "D",
                            ".", "\'", "\"", "#", "_",
                            NULL };

/// NULL-terminated token list that verifyLHP() looks up for language code "MEX_spa".
/// verifyLHP() accepts the first entry that is a prefix of the remaining input,
/// so a longer token (e.g. "r6") has to stay ahead of its shorter prefix ("r").
char *MEX_spa_alphabet[] = {"t&S", "d&Z",
                            "*!\\", "*?\\",
							"nK", "n~", "r6",
                            "*.", "*,", "*!", "*?", "*;", "*:",
                            "a", "e", "i", "o", "u",
                            "j", "w", "r", "p", "b", "t", "d", "k", "g", "f", "s", "z", "S", "x", "G", "m", "n", "l", "D", "B", "J",
                            ".", "\'", "\"", "#", "_",
                            NULL };

/// NULL-terminated token list that verifyLHP() looks up for language code "DEU_ger".
/// verifyLHP() accepts the first entry that is a prefix of the remaining input,
/// so longer tokens precede their prefixes: "E+%~" before "E+" before "E".
char *DEU_ger_alphabet[] = {"E+%~",
                            "A%~", "O%~", "E%~", "t&S", "d&Z", "t&s", "p&f", "a&u", "O&y", "a&i",
                            "\'2", "*.", "*,", "*!", "*?", "*;", "*:", "E:", "e+", "E+", "i:", "e:", "a:", "y:", "u:", "o:", "nK",
                            "a", "E", "I", "O", "Y", "$", "U",
                            "j", "p", "b", "t", "d", "k", "g", "f", "v", "s", "z", "S", "Z", "x", "h", "m", "n", "l", "?", "R", "C",
                            ".", "\'", "\"", "#", "_",
                            NULL };

/// NULL-terminated token list that verifyLHP() looks up for language code "CHN_qad".
/// verifyLHP() strips the CHN_qad_supra strings from the phoneme before matching
/// against this list. It accepts the first entry that is a prefix of the remaining
/// input, so longer tokens precede their prefixes: "t&s+{" before "t&s+" before
/// "t&s" before "t".
char *CHN_qad_alphabet[] = {"$%\"~r\"", "O%\"~r\"", "A%\"~r\"",
                            "o&U%r", "a&U%r", "t&s+{", "t&c~{",
                            "t&c~", "t&s{", "t&s+", "o-%r",
                            "e&I", "a&I", "a&U", "o&U", "I%\\", "I%0", "E%r", "$%r", "a%r", "O%r", "u%r", "t&s",
                            "o-", "nK", "c~", "s+", "z+", "p{", "t{", "k{",
                            "*.", "*,", "*!", "*?", "*;", "*:",
                            "a", "A", "E", "i", "o", "O", "u", "y", "$",
                            "j", "w", "p", "t", "k", "f", "s", "m", "n", "l", "X",
                            "#", "_", ".",
                            NULL };
/// NULL-terminated list of suprasegmental strings. For language code "CHN_qad",
/// verifyLHP() erases every occurrence of each entry (in the listed order) from
/// the phoneme before checking the remainder against CHN_qad_alphabet.
char *CHN_qad_supra[] = { "214", "55", "35", "51", "11", NULL };

/// NULL-terminated token list that verifyLHP() looks up for language code "CAN_fre".
/// verifyLHP() accepts the first entry that is a prefix of the remaining input,
/// so longer tokens precede their prefixes: "e+%~" before "e+" before "e".
char *CAN_fre_alphabet[] = {"t&s", "d&z", "e%~", "@%~", "o%~", "e+%~",
                            "e+", "E+", "nK", "n~", "h\\",
                            "*.", "*,", "*!", "*?", "*;", "*:",
                            "a", "A", "e", "E", "i" , "I", "o", "O", "u", "y", "Y", "$", "U",
                            "j", "w", "p", "b", "t", "d", "k", "g", "f", "v", "s", "z", "S", "Z", "m", "n", "l", "?", "R",
                            ".", "\'", "\"", "#", "_",
                            NULL };

/// NULL-terminated token list that verifyLHP() looks up for language code "FRA_fre".
/// verifyLHP() accepts the first entry that is a prefix of the remaining input,
/// so longer tokens precede their prefixes: "E+%~" before "E+" before "E".
char *FRA_fre_alphabet[] = {"E+%~",
                            "A%~", "O%~", "E%~", "h\\",
                            "e+", "E+", "nK", "n~",
                            "*.", "*,", "*!", "*?", "*;", "*:",
                            "a", "A", "e", "E", "i", "o", "O", "u" , "y", "$",
                            "j", "w", "p", "b", "t", "d", "k", "g", "f", "v", "s", "z", "S", "Z", "m", "n", "l", "?", "R",
                            ".", "\'", "\"", "#", "_",
                            NULL };

/// NULL-terminated token list that verifyLHP() looks up for language code "ESP_spa".
/// verifyLHP() accepts the first entry that is a prefix of the remaining input,
/// so a longer token (e.g. "r6") has to stay ahead of its shorter prefix ("r").
char *ESP_spa_alphabet[] = {"t&S", "d&Z",
                            "*!\\", "*?\\", 
							"*.", "*,", "*!", "*?", "*;", "*:",
                            "n~", "r6",
                            "B", "J",
                            "a", "e", "i", "o", "u",
                            "j", "w", "r", "p", "b", "t", "d", "k" , "g", "f", "s", "x", "G", "m", "n", "l", "T", "D",
                            ".", "\'", "\"", "#", "_",
                            NULL };

/// NULL-terminated token list that verifyLHP() looks up for language code "GBR_eng".
/// verifyLHP() accepts the first entry that is a prefix of the remaining input,
/// so a longer token (e.g. "I&$") has to stay ahead of its shorter prefix ("I").
char *GBR_eng_alphabet[] = {"I&$", "E&$", "U&$", "e&I", "O&I", "a&I", "a&U", "o&U", "t&S", "d&Z", "n%)", "l%)",
                            "A+", "E0", "nK", "R+",
                            "\'2", "*.", "*,", "*!", "*?", "*;", "*:",
                            "A", "E", "i", "I", "O", "u", "$", "@", "^", "U",
                            "j", "w", "p", "b", "t", "d", "k", "g", "f", "v", "s", "z", "S", "Z", "h", "m", "n", "l", "?", "T", "D",
                            ".", "\'", "\"", "#", "_",
                            NULL };

/// NULL-terminated token list that verifyLHP() looks up for language code "AUS_eng".
/// verifyLHP() accepts the first entry that is a prefix of the remaining input,
/// so a longer token (e.g. "I&$") has to stay ahead of its shorter prefix ("I").
char *AUS_eng_alphabet[] = {"I&$", "E&$", "U&$", "e&I", "O&I", "a&I", "a&U", "o&U", "t&S", "d&Z", "n%)", "l%)",
                            "A+", "E0", "nK", "R+",
                            "\'2", "*.", "*,", "*!", "*?", "*;", "*:",
                            "A", "E", "i", "I", "O", "u", "$", "@", "^", "U",
                            "j", "w", "p", "b", "t", "d", "k", "g", "f", "v", "s", "z", "S", "Z", "h", "m", "n", "l", "T", "D",
                            ".", "\'", "\"", "#", "_",
                            NULL };
							
/// NULL-terminated token list that verifyLHP() looks up for language code "RUS_rus".
/// verifyLHP() accepts the first entry that is a prefix of the remaining input,
/// so longer tokens precede their prefixes: "t&S%j" before "t&S" before "t".
char *RUS_rus_alphabet[] = { "t&S%j", "d&Z%j",
                             "t&S", "d&Z", "t&s", "d&z",
                             "p%j", "b%j", "t%j", "d%j", "k%j", "g%j", "f%j", "v%j", "s%j", "z%j", "S%j", "Z%j", "x%j", "m%j", "n%j", "l%j", "r%j",
                             "\'2", "*.", "*,", "*!", "*?", "*;", "*:",
                             "i0", "o-",
                             "a", "e", "E", "i", "I", "o", "u", "$", "^",
                             "j", "r", "p", "b", "t", "d", "k", "g", "f", "v", "s", "z", "S", "Z", "x", "G", "m", "n", "l", "?",
                             ".", "\'", "\"", "#", "_",
                             NULL };

/// NULL-terminated token list that verifyLHP() registers for its NLD entry.
/// verifyLHP() accepts the first entry that is a prefix of the remaining input,
/// so longer tokens precede their prefixes: "E+%~" before "E+" before "E".
char *NLD_dut_alphabet[] = {"E+%~", 
							"A%~", "O%~", "E%~", "E&i", "^&y", "A&u",
							"E:", "e+", "E+", "nK", 
							"\'2", "*.", "*,", "*!", "*?", "*;", "*:",
							"a", "A", "e", "E", "i", "I", "o", "O", "u", "y", "$", "^", 
							"j", "w", "r", "p", "b", "t", "d", "k" , "g", "f", "v", "s", "z", "S", "Z", "x", "G", "h", "m", "n", "l", "?", "V",
                            ".", "\'", "\"", "#", "_",
                            NULL };

/// NULL-terminated token list that verifyLHP() looks up for language code "PRT_por".
/// verifyLHP() accepts the first entry that is a prefix of the remaining input,
/// so longer tokens precede their prefixes: "a0%~" before "a0" before "a".
char *PRT_por_alphabet[] = {"a0%~",
							"e%~", "o%~", "i%~", "u%~", "j%~", "w%~", "k&s",
							"a0", "R\\", "n~", "l~", "r6",
							"\'2", "*.", "*,", "*!", "*?", "*;", "*:",
							"a", "e", "E", "i", "o", "O", "u", "$",
							"j", "w", "p", "b", "t", "d", "k", "g", "f", "v", "s", "z", "S", "Z", "m", "n", "l",
                            ".", "\'", "\"", "#", "_",
                            NULL };
							
/// NULL-terminated token list that verifyLHP() looks up for language code "ITA_ita".
/// verifyLHP() accepts the first entry that is a prefix of the remaining input,
/// so longer tokens precede their prefixes: "t&S:" before "t&S" before "t".
/// NOTE(review): unlike the other tables this one has no "*:" entry - confirm
/// whether that omission is intended.
char *ITA_ita_alphabet[] = {"t&S:", "d&Z:", "t&s:", "d&z:",
							"n~:", "l~:", "t&S", "d&Z", "t&s", "d&z",
							"\'2", // not in Nuance spec but is supported by recognizer/TTS.
							"n~", "r:", "p:", "b:", "t:", "d:", "k:", "g:", "f:", "s:", "S:", "m:", "n:", "l~", "l:", "v:", 
							"*.", "*,", "*!", "*?", "*;",
							"a", "e", "E", "i", "o", "O", "u",
                            "j", "w", "r", "p", "b", "t", "d", "k", "g", "f", "v", "s", "z", "S", "Z", "m", "n", "l",
							".", "\'", "\"", "#", "_",
							NULL };

/// NULL-terminated token list that verifyLHP() looks up for language code "KOR_kor".
/// verifyLHP() accepts the first entry that is a prefix of the remaining input,
/// so longer tokens precede their prefixes: "u-&i" before "u-" before "u".
char *KOR_kor_alphabet[] = {"u-&i",
							"e+", "u-", "nK", "h6", "c{", "j-", "p{", "t{", "k{",
							"p`", "t`", "k`", "c`", "s`",
							"*.", "*,", "*!", "*?", "*;", "*:",
							"a", "e", "E", "i", "o", "O", "u",
							"j", "w", "r", "p", "b", "t", "d", "k", "g", "s", "h", "m", "n", "l", "?", "c",
							".", "#", "_",
                            NULL };

/**
 * Check that a phoneme consists only of tokens from the alphabet table of
 * the given language.
 *
 * The phoneme is scanned left to right. At each position the first table
 * entry that is a prefix of the remaining text is consumed. If no entry
 * matches, a message is written to cerr, one character is skipped and the
 * scan continues, so every offending position is reported.
 *
 * \param[in] phoneme Phoneme string to verify (taken by value; suprasegmentals
 *                    are stripped from the local copy for "CHN_qad")
 * \param[in] lang    Language id; its string form selects the alphabet table
 * \return true if every part of the phoneme matched a table entry, or if no
 *         alphabet table exists for the language; false otherwise
 */
bool tclGracenoteWrapper::verifyLHP(string phoneme, tclLanguageId lang)
{
    // Alphabet tables by language code. Kept as a static table so that no
    // container has to be rebuilt on every call.
    static const struct
    {
        const char *code;
        char **alphabet;
    } alphabets[] =
    {
        { "DEU_ger", DEU_ger_alphabet },
        { "MEX_spa", MEX_spa_alphabet },
        { "ESP_spa", ESP_spa_alphabet },
        { "USA_eng", USA_eng_alphabet },
        { "GBR_eng", GBR_eng_alphabet },
        { "AUS_eng", AUS_eng_alphabet },
        { "CHN_qad", CHN_qad_alphabet },
        { "CAN_fre", CAN_fre_alphabet },
        { "FRA_fre", FRA_fre_alphabet },
        { "RUS_rus", RUS_rus_alphabet },
        // gn_speech_langs spells the Dutch code "NLD_dut"; the former key
        // "NLD_Dut" never matched it, so Dutch was silently left unchecked.
        { "NLD_dut", NLD_dut_alphabet },
        { "NLD_Dut", NLD_dut_alphabet }, // old spelling, kept for compatibility
        { "PRT_por", PRT_por_alphabet },
        { "ITA_ita", ITA_ita_alphabet },
        { "KOR_kor", KOR_kor_alphabet }
    };

    const char *langCode = (char *) (unsigned char *) lang;
    if (langCode == NULL)
    {
        // No language string at all: treated like a language without alphabet
        return true;
    }
    const string GNLanguageCode(langCode);

    char **alphabet = NULL;
    for (size_t i = 0; i < sizeof(alphabets) / sizeof(alphabets[0]); ++i)
    {
        if (GNLanguageCode == alphabets[i].code)
        {
            alphabet = alphabets[i].alphabet;
            break;
        }
    }
    if (alphabet == NULL)
    {
        // No alphabet for this language, nothing to verify against
        return true;
    }

    // Chinese phonemes may contain suprasegmentals nearly anywhere,
    // so it is impossible to verify them using a fixed list.
    // To avoid this, all suprasegmentals are removed first; after this
    // the phonemes are checked versus the fixed phoneme list.
    if (GNLanguageCode == "CHN_qad")
    {
        for (char **supra = CHN_qad_supra; *supra != NULL; ++supra)
        {
            const size_t supraLen = strlen(*supra);
            size_t pos;
            while (string::npos != (pos = phoneme.find(*supra)))
                phoneme.erase(pos, supraLen);
        }
    }

    bool retval = true;
    const char *ptr = phoneme.c_str();
    while (*ptr != '\0')
    {
        // Length of the first table entry that prefixes the remaining text
        size_t matched = 0;
        for (char **code = alphabet; *code != NULL; ++code)
        {
            const size_t codeLen = strlen(*code);
            if (0 == strncmp(*code, ptr, codeLen))
            {
                matched = codeLen;
                break;
            }
        }

        if (matched == 0)
        {
            cerr << "Unknown code in " << lang << " phoneme: " << phoneme << " at " << ptr << endl;
            retval = false;
            matched = 1; // skip the offending character and keep checking
        }
        ptr += matched;
    }
    return retval;
}


/*
 * Language Identifier Strings
 * Copied from xsampa_to_lnh_conversion.c from Gracenote
 *
 * XSAMPA2LHplus() compares the first 7 characters of the language id against
 * these entries and passes the position of the matching entry as index to
 * gn_phonetic_conversion_lnh_map_get() and
 * gn_phonetic_conversion_lnh_post_process_map_get(). The order of the entries
 * is therefore significant and must not be changed.
 * The list is terminated by NULL.
 */
char *gn_speech_langs[] =
    {
        "USA_eng",
        "MEX_spa",
        "CAN_fre",
        "GBR_eng",
        "FRA_fre",
        "ESP_spa",
        "DEU_ger",
        "ITA_ita",
        "JPN_jpn",
        "CHN_qad",
        "RUS_rus",
        "NLD_dut",
        "KOR_kor",
        "BRA_por",
        "PRT_por",
        "TUR_tur",
        "AUS_eng",
        NULL // Added I. Reise
    };

/* **************************************************************************
* Function  GN_XSAMPA2LHplus
* *************************************************************************/ 
/**
 * Convert X-SAMPA-phoneme to L&H+
 * This unused implementation uses the original slow GN-implementation
 * (xsampa_to_lnh_conversion).
 *
 * \param[in] xsampastring Phoneme in X-SAMPA format
 * \param[in] query_lang Spoken language of string. For some languages a special postprocessing is done
 * \return Phoneme in L&H+-Format incl. stressmarks; an empty string if the
 *         conversion delivered no final result
 */
string tclGracenoteWrapper::GN_XSAMPA2LHplus(const string &xsampastring,
        const tclLanguageId query_lang)
{
    gn_uchar_t* lnh_conversion = NULL;          /* lnh converted transcription */
    gn_uchar_t* lnh_final_conversion = NULL;    /* lnh final converted transcription */

    xsampa_to_lnh_conversion(query_lang,
                             (gn_uchar_t*) xsampastring.c_str(),
                             &lnh_conversion,
                             &lnh_final_conversion);

    // Building a std::string from a null pointer is undefined behaviour, so
    // the output pointer is checked before it is used, not only before it
    // is freed.
    string result;
    if (lnh_final_conversion)
    {
        result = (char *) lnh_final_conversion;
        gnepal_mem_free(lnh_final_conversion);
    }

    if (lnh_conversion)
        gnepal_mem_free(lnh_conversion);

    return result;
}


/* **************************************************************************
 * Function  XSAMPA2LHplus
 * *************************************************************************/ 
/**
 * Convert X-SAMPA-phoneme to L&H+
 * This is a identical but much faster implementation of the very inefficient 
 * GN-algorithm implemented in gn_phonetic_conversion.c
 * 
 * \param[in] xsampastring Phoneme in X-SAMPA format
 * \param[in] query_lang Spoken language of string. For some languages as special postprocessing is done
 * \return Phoneme in L&H+-Format incl. stressmarks
 */

string tclGracenoteWrapper::XSAMPA2LHplus(
    const string src,
    const tclLanguageId lang)
{
	char orig_array[tclGracenoteWrapperInterface::MAX_PHONEME_LENGTH];
    strncpy(orig_array, src.c_str(), tclGracenoteWrapperInterface::MAX_PHONEME_LENGTH);
    orig_array[tclGracenoteWrapperInterface::MAX_PHONEME_LENGTH-1] = '\0';

    char *orig = orig_array;

    char trans_array[tclGracenoteWrapperInterface::MAX_PHONEME_LENGTH];
    char * trans = trans_array;
    *trans = '\0';

	// check size
	if(src.size() > (tclGracenoteWrapperInterface::MAX_PHONEME_LENGTH -1))
	{
		//source string is too long and has been cut
		return trans;
	}
        
    // cout << "Converting " << src << endl;

    // Processing is done in two steps,
    // Step 1 uses a ruleset for a language
    // step 2 uses an optional ruleset for some languages

    // First get language index
    int index = -1;
    for (int i = 0; gn_speech_langs[i] != NULL; ++i)
    {
        if (0 == strncmp((char*) (unsigned char*) lang, gn_speech_langs[i], 7))
        {
            index = i;
            break;
        }
    }
    if(index == -1)
    {
    	// language not found...
    	return trans;
    }
    
    for (int step = 1; step <= 2; ++step)
    {
        // pointer to first rule in ruleset
        gn_phonetic_conversion_map_t *first_rule = (gn_phonetic_conversion_map_t *) GN_NULL;

        if (step == 1) // General processing
        {
            if (GNEX_PHOCVRT_NoError != gn_phonetic_conversion_lnh_map_get(&first_rule, index)
                || first_rule == NULL
                || first_rule->algorithm_id == PHONETIC_MAP_ALGORITHM_NONE
               )
                break; // Skip processing
        }
        else // step 2, post processing for some languages
        {
            if (GNEX_PHOCVRT_NoError != gn_phonetic_conversion_lnh_post_process_map_get(&first_rule, index)
                || first_rule == NULL
                || first_rule->algorithm_id == PHONETIC_MAP_ALGORITHM_NONE
               )
                break; // Skip postprocessing
        }

        if(first_rule->algorithm_id == PHONETIC_MAP_ALGORITHM_2)
        {
        	gn_uchar_t* lnh_cnv = NULL;
        	gnex_error_t error = gn_phonetic_conversion_map_convert(first_rule, (gn_uchar_t*)orig, &lnh_cnv);
        	if(lnh_cnv!=NULL)
        	{
        		strncpy(trans, (char*)lnh_cnv, tclGracenoteWrapperInterface::MAX_PHONEME_LENGTH);
        		trans[tclGracenoteWrapperInterface::MAX_PHONEME_LENGTH-1]='\0';
        		gnepal_mem_free(lnh_cnv);
        	}
        }
        else
        {
			for (gn_phonetic_conversion_rule_t * rule = first_rule->rules;
					rule->src_token != GN_NULL;
                ++rule)
    	    {
            char *tp = trans;
            for (char *op = orig; *op != '\0';)
            {
                size_t stl = strlen((char*)rule->src_token); // Source token length
                size_t ttl = strlen((char*)rule->trg_token); // Target token length
                // Prevent overflows of trans_array for very long phonemes
                if (tp - trans + ttl >= (size_t) (tclGracenoteWrapperInterface::MAX_PHONEME_LENGTH - 1))
                {
                	*trans = '\0';
                	break;
                }
                // Check if rule matches here
                if (0 == strncmp(op, (char*)rule->src_token, stl))
                {
                    // cout << "IR " << rule->src_token << "->" << rule->trg_token << " at " << op << endl;
                    memcpy(tp, rule->trg_token, ttl);
                    tp += ttl;
                    op += stl;
                }
                else
                    *tp++ = *op++;
            }
            *tp = '\0';
            swap(orig, trans);
        }
        // Just revert the final superfluous swap action
        swap(orig, trans);
        }
    }
    return trans;
}


