diff --git a/src/LangBulgarianModel.cpp b/src/LangBulgarianModel.cpp index 379cc5f..9828ab5 100644 --- a/src/LangBulgarianModel.cpp +++ b/src/LangBulgarianModel.cpp @@ -37,10 +37,10 @@ #include "nsSBCharSetProber.h" /**************************************************************** -255: Control characters that usually does not exist in any text -254: Carriage/Return -253: symbol (punctuation) that does not belong to word -252: 0 - 9 +CTR: Control characters that usually does not exist in any text +RET: Carriage/Return +SYM: symbol (punctuation) that does not belong to word +NUM: 0 - 9 *****************************************************************/ @@ -50,14 +50,14 @@ static const unsigned char Latin5_BulgarianCharToOrderMap[] = { -255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 -255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 -+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20 -252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30 -253, 77, 90, 99,100, 72,109,107,101, 79,185, 81,102, 76, 94, 82, //40 -110,186,108, 91, 74,119, 84, 96,111,187,115,253,253,253,253,253, //50 -253, 65, 69, 70, 66, 63, 68,112,103, 92,194,104, 95, 86, 87, 71, //60 -116,195, 85, 93, 97,113,196,197,198,199,200,253,253,253,253,253, //70 +CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, //00 +CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, //10 +SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, //20 +NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, //30 +SYM, 77, 90, 99,100, 72,109,107,101, 79,185, 81,102, 76, 94, 82, //40 +110,186,108, 91, 74,119, 84, 96,111,187,115,SYM,SYM,SYM,SYM,SYM, //50 +SYM, 65, 69, 70, 66, 63, 68,112,103, 92,194,104, 95, 86, 87, 71, //60 +116,195, 85, 93, 97,113,196,197,198,199,200,SYM,SYM,SYM,SYM,SYM, //70 194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209, //80 210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225, //90 81,226,227,228,229,230,105,231,232,233,234,235,236, 45,237,238, //a0 @@ -65,27 +65,27 @@ static const unsigned char Latin5_BulgarianCharToOrderMap[] = 39, 28, 34, 51, 48, 49, 53, 50, 54, 57, 61,239, 67,240, 60, 56, //c0 1, 18, 9, 20, 11, 3, 23, 15, 2, 26, 12, 10, 14, 6, 4, 13, //d0 7, 8, 5, 19, 29, 25, 22, 21, 27, 24, 17, 75, 52,241, 42, 16, //e0 - 62,242,243,244, 58,245, 98,246,247,248,249,250,251, 91,252,253, //f0 + 62,242,243,244, 58,245, 98,246,247,248,249,250,251, 91,NUM,SYM, //f0 }; static const unsigned char win1251BulgarianCharToOrderMap[] = { -255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 -255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 -+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20 -252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30 -253, 77, 90, 99,100, 72,109,107,101, 79,185, 81,102, 76, 94, 82, //40 -110,186,108, 91, 74,119, 84, 96,111,187,115,253,253,253,253,253, //50 -253, 65, 69, 70, 66, 63, 68,112,103, 92,194,104, 95, 86, 87, 71, //60 -116,195, 85, 93, 97,113,196,197,198,199,200,253,253,253,253,253, //70 +CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, //00 +CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, //10 +SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, //20 +NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, //30 +SYM, 77, 90, 99,100, 72,109,107,101, 79,185, 81,102, 76, 94, 82, //40 +110,186,108, 91, 74,119, 84, 96,111,187,115,SYM,SYM,SYM,SYM,SYM, //50 +SYM, 65, 69, 70, 66, 63, 68,112,103, 92,194,104, 95, 86, 87, 71, //60 +116,195, 85, 93, 97,113,196,197,198,199,200,SYM,SYM,SYM,SYM,SYM, //70 206,207,208,209,210,211,212,213,120,214,215,216,217,218,219,220, //80 221, 78, 64, 83,121, 98,117,105,222,223,224,225,226,227,228,229, //90 88,230,231,232,233,122, 89,106,234,235,236,237,238, 45,239,240, //a0 73, 80,118,114,241,242,243,244,245, 62, 58,246,247,248,249,250, //b0 31, 32, 35, 43, 37, 44, 55, 47, 40, 59, 33, 46, 38, 36, 41, 30, //c0 - 39, 28, 34, 51, 48, 49, 53, 50, 54, 57, 61,251, 67,252, 60, 56, //d0 + 39, 28, 34, 51, 48, 49, 53, 50, 54, 57, 61,251, 67,NUM, 60, 56, //d0 1, 18, 9, 20, 11, 3, 23, 15, 2, 26, 12, 10, 14, 6, 4, 13, //e0 - 7, 8, 5, 19, 29, 25, 22, 21, 27, 24, 17, 75, 52,253, 42, 16, //f0 + 7, 8, 5, 19, 29, 25, 22, 21, 27, 24, 17, 75, 52,SYM, 42, 16, //f0 }; //Model Table: diff --git a/src/LangCyrillicModel.cpp b/src/LangCyrillicModel.cpp index 9be222d..727835f 100644 --- a/src/LangCyrillicModel.cpp +++ b/src/LangCyrillicModel.cpp @@ -43,18 +43,18 @@ //Character Mapping Table: static const unsigned char KOI8R_CharToOrderMap[] = { -255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 -255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 -+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20 -252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30 -253,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, //40 -155,156,157,158,159,160,161,162,163,164,165,253,253,253,253,253, //50 -253, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, //60 - 67,179, 78, 73,180,181, 79,182,183,184,185,253,253,253,253,253, //70 +CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, //00 +CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, //10 +SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, //20 +NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, //30 +SYM,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, //40 +155,156,157,158,159,160,161,162,163,164,165,SYM,SYM,SYM,SYM,SYM, //50 +SYM, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, //60 + 67,179, 78, 73,180,181, 79,182,183,184,185,SYM,SYM,SYM,SYM,SYM, //70 191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206, //80 207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222, //90 223,224,225, 68,226,227,228,229,230,231,232,233,234,235,236,237, //a0 -238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253, //b0 +238,239,240,241,242,243,244,245,246,247,248,249,250,251,NUM,SYM, //b0 27, 3, 21, 28, 13, 2, 39, 19, 26, 4, 23, 11, 8, 12, 5, 1, //c0 15, 16, 9, 7, 6, 14, 24, 10, 17, 18, 20, 25, 30, 29, 22, 54, //d0 59, 37, 44, 58, 41, 48, 53, 46, 55, 42, 60, 36, 49, 38, 31, 34, //e0 @@ -63,18 +63,18 @@ static const unsigned char KOI8R_CharToOrderMap[] = static const unsigned char win1251_CharToOrderMap[] = { -255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 -255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 -+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20 -252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30 -253,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, //40 -155,156,157,158,159,160,161,162,163,164,165,253,253,253,253,253, //50 -253, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, //60 - 67,179, 78, 73,180,181, 79,182,183,184,185,253,253,253,253,253, //70 +CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, //00 +CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, //10 +SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, //20 +NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, //30 +SYM,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, //40 +155,156,157,158,159,160,161,162,163,164,165,SYM,SYM,SYM,SYM,SYM, //50 +SYM, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, //60 + 67,179, 78, 73,180,181, 79,182,183,184,185,SYM,SYM,SYM,SYM,SYM, //70 191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206, 207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222, 223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238, -239,240,241,242,243,244,245,246, 68,247,248,249,250,251,252,253, +239,240,241,242,243,244,245,246, 68,247,248,249,250,251,NUM,SYM, 37, 44, 33, 46, 41, 48, 56, 51, 42, 60, 36, 49, 38, 31, 34, 35, 45, 32, 40, 52, 53, 55, 58, 50, 57, 63, 70, 62, 61, 47, 59, 43, 3, 21, 10, 19, 13, 2, 24, 20, 4, 23, 11, 8, 12, 5, 1, 15, @@ -83,14 +83,14 @@ static const unsigned char win1251_CharToOrderMap[] = static const unsigned char latin5_CharToOrderMap[] = { -255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 -255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 -+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20 -252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30 -253,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, //40 -155,156,157,158,159,160,161,162,163,164,165,253,253,253,253,253, //50 -253, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, //60 - 67,179, 78, 73,180,181, 79,182,183,184,185,253,253,253,253,253, //70 +CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, //00 +CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, //10 +SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, //20 +NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, //30 +SYM,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, //40 +155,156,157,158,159,160,161,162,163,164,165,SYM,SYM,SYM,SYM,SYM, //50 +SYM, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, //60 + 67,179, 78, 73,180,181, 79,182,183,184,185,SYM,SYM,SYM,SYM,SYM, //70 191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206, 207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222, 223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238, @@ -98,39 +98,39 @@ static const unsigned char latin5_CharToOrderMap[] = 45, 32, 40, 52, 53, 55, 58, 50, 57, 63, 70, 62, 61, 47, 59, 43, 3, 21, 10, 19, 13, 2, 24, 20, 4, 23, 11, 8, 12, 5, 1, 15, 9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27, 16, -239, 68,240,241,242,243,244,245,246,247,248,249,250,251,252,255, +239, 68,240,241,242,243,244,245,246,247,248,249,250,251,NUM,CTR, }; static const unsigned char macCyrillic_CharToOrderMap[] = { -255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 -255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 -+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20 -252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30 -253,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, //40 -155,156,157,158,159,160,161,162,163,164,165,253,253,253,253,253, //50 -253, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, //60 - 67,179, 78, 73,180,181, 79,182,183,184,185,253,253,253,253,253, //70 +CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, //00 +CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, //10 +SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, //20 +NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, //30 +SYM,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, //40 +155,156,157,158,159,160,161,162,163,164,165,SYM,SYM,SYM,SYM,SYM, //50 +SYM, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, //60 + 67,179, 78, 73,180,181, 79,182,183,184,185,SYM,SYM,SYM,SYM,SYM, //70 37, 44, 33, 46, 41, 48, 56, 51, 42, 60, 36, 49, 38, 31, 34, 35, 45, 32, 40, 52, 53, 55, 58, 50, 57, 63, 70, 62, 61, 47, 59, 43, 191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206, 207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222, 223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238, -239,240,241,242,243,244,245,246,247,248,249,250,251,252, 68, 16, +239,240,241,242,243,244,245,246,247,248,249,250,251,NUM, 68, 16, 3, 21, 10, 19, 13, 2, 24, 20, 4, 23, 11, 8, 12, 5, 1, 15, - 9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27,255, + 9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27,CTR, }; static const unsigned char IBM855_CharToOrderMap[] = { -255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 -255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 -+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20 -252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30 -253,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, //40 -155,156,157,158,159,160,161,162,163,164,165,253,253,253,253,253, //50 -253, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, //60 - 67,179, 78, 73,180,181, 79,182,183,184,185,253,253,253,253,253, //70 +CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, //00 +CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, //10 +SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, //20 +NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, //30 +SYM,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, //40 +155,156,157,158,159,160,161,162,163,164,165,SYM,SYM,SYM,SYM,SYM, //50 +SYM, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, //60 + 67,179, 78, 73,180,181, 79,182,183,184,185,SYM,SYM,SYM,SYM,SYM, //70 191,192,193,194, 68,195,196,197,198,199,200,201,202,203,204,205, 206,207,208,209,210,211,212,213,214,215,216,217, 27, 59, 54, 70, 3, 37, 21, 44, 28, 58, 13, 41, 2, 48, 39, 53, 19, 46,218,219, @@ -138,19 +138,19 @@ static const unsigned char IBM855_CharToOrderMap[] = 230,231,232,233,234,235, 11, 36,236,237,238,239,240,241,242,243, 8, 49, 12, 38, 5, 31, 1, 34, 15,244,245,246,247, 35, 16,248, 43, 9, 45, 7, 32, 6, 40, 14, 52, 24, 56, 10, 33, 17, 61,249, -250, 18, 62, 20, 51, 25, 57, 30, 47, 29, 63, 22, 50,251,252,255, +250, 18, 62, 20, 51, 25, 57, 30, 47, 29, 63, 22, 50,251,NUM,CTR, }; static const unsigned char IBM866_CharToOrderMap[] = { -255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 -255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 -+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20 -252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30 -253,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, //40 -155,156,157,158,159,160,161,162,163,164,165,253,253,253,253,253, //50 -253, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, //60 - 67,179, 78, 73,180,181, 79,182,183,184,185,253,253,253,253,253, //70 +CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, //00 +CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, //10 +SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, //20 +NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, //30 +SYM,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, //40 +155,156,157,158,159,160,161,162,163,164,165,SYM,SYM,SYM,SYM,SYM, //50 +SYM, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, //60 + 67,179, 78, 73,180,181, 79,182,183,184,185,SYM,SYM,SYM,SYM,SYM, //70 37, 44, 33, 46, 41, 48, 56, 51, 42, 60, 36, 49, 38, 31, 34, 35, 45, 32, 40, 52, 53, 55, 58, 50, 57, 63, 70, 62, 61, 47, 59, 43, 3, 21, 10, 19, 13, 2, 24, 20, 4, 23, 11, 8, 12, 5, 1, 15, @@ -158,7 +158,7 @@ static const unsigned char IBM866_CharToOrderMap[] = 207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222, 223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238, 9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27, 16, -239, 68,240,241,242,243,244,245,246,247,248,249,250,251,252,255, +239, 68,240,241,242,243,244,245,246,247,248,249,250,251,NUM,CTR, }; //Model Table: diff --git a/src/LangGreekModel.cpp b/src/LangGreekModel.cpp index 8040c99..4a16200 100644 --- a/src/LangGreekModel.cpp +++ b/src/LangGreekModel.cpp @@ -37,54 +37,54 @@ #include "nsSBCharSetProber.h" /**************************************************************** -255: Control characters that usually does not exist in any text -254: Carriage/Return -253: symbol (punctuation) that does not belong to word -252: 0 - 9 +CTR: Control characters that usually does not exist in any text +RET: Carriage/Return +SYM: symbol (punctuation) that does not belong to word +NUM: 0 - 9 *****************************************************************/ //Character Mapping Table: static const unsigned char Latin7_CharToOrderMap[] = { -255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 -255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 -+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20 -252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30 -253, 82,100,104, 94, 98,101,116,102,111,187,117, 92, 88,113, 85, //40 - 79,118,105, 83, 67,114,119, 95, 99,109,188,253,253,253,253,253, //50 -253, 72, 70, 80, 81, 60, 96, 93, 89, 68,120, 97, 77, 86, 69, 55, //60 - 78,115, 65, 66, 58, 76,106,103, 87,107,112,253,253,253,253,253, //70 -255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //80 -255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //90 -+253,233, 90,253,253,253,253,253,253,253,253,253,253, 74,253,253, //a0 -253,253,253,253,247,248, 61, 36, 46, 71, 73,253, 54,253,108,123, //b0 +CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, //00 +CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, //10 +SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, //20 +NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, //30 +SYM, 82,100,104, 94, 98,101,116,102,111,187,117, 92, 88,113, 85, //40 + 79,118,105, 83, 67,114,119, 95, 99,109,188,SYM,SYM,SYM,SYM,SYM, //50 +SYM, 72, 70, 80, 81, 60, 96, 93, 89, 68,120, 97, 77, 86, 69, 55, //60 + 78,115, 65, 66, 58, 76,106,103, 87,107,112,SYM,SYM,SYM,SYM,SYM, //70 +CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, //80 +CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, //90 ++SYM,233, 90,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 74,SYM,SYM, //a0 +SYM,SYM,SYM,SYM,247,248, 61, 36, 46, 71, 73,SYM, 54,SYM,108,123, //b0 110, 31, 51, 43, 41, 34, 91, 40, 52, 47, 44, 53, 38, 49, 59, 39, //c0 35, 48,250, 37, 33, 45, 56, 50, 84, 57,120,121, 17, 18, 22, 15, //d0 124, 1, 29, 20, 21, 3, 32, 13, 25, 5, 11, 16, 10, 6, 30, 4, //e0 - 9, 8, 14, 7, 2, 12, 28, 23, 42, 24, 64, 75, 19, 26, 27,253, //f0 + 9, 8, 14, 7, 2, 12, 28, 23, 42, 24, 64, 75, 19, 26, 27,SYM, //f0 }; static const unsigned char win1253_CharToOrderMap[] = { -255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 -255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 -+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20 -252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30 -253, 82,100,104, 94, 98,101,116,102,111,187,117, 92, 88,113, 85, //40 - 79,118,105, 83, 67,114,119, 95, 99,109,188,253,253,253,253,253, //50 -253, 72, 70, 80, 81, 60, 96, 93, 89, 68,120, 97, 77, 86, 69, 55, //60 - 78,115, 65, 66, 58, 76,106,103, 87,107,112,253,253,253,253,253, //70 -255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //80 -255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //90 -+253,233, 61,253,253,253,253,253,253,253,253,253,253, 74,253,253, //a0 -253,253,253,253,247,253,253, 36, 46, 71, 73,253, 54,253,108,123, //b0 +CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, //00 +CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, //10 +SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, //20 +NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, //30 +SYM, 82,100,104, 94, 98,101,116,102,111,187,117, 92, 88,113, 85, //40 + 79,118,105, 83, 67,114,119, 95, 99,109,188,SYM,SYM,SYM,SYM,SYM, //50 +SYM, 72, 70, 80, 81, 60, 96, 93, 89, 68,120, 97, 77, 86, 69, 55, //60 + 78,115, 65, 66, 58, 76,106,103, 87,107,112,SYM,SYM,SYM,SYM,SYM, //70 +CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, //80 +CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, //90 +SYM,233, 61,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 74,SYM,SYM, //a0 +SYM,SYM,SYM,SYM,247,SYM,SYM, 36, 46, 71, 73,SYM, 54,SYM,108,123, //b0 110, 31, 51, 43, 41, 34, 91, 40, 52, 47, 44, 53, 38, 49, 59, 39, //c0 35, 48,250, 37, 33, 45, 56, 50, 84, 57,120,121, 17, 18, 22, 15, //d0 124, 1, 29, 20, 21, 3, 32, 13, 25, 5, 11, 16, 10, 6, 30, 4, //e0 - 9, 8, 14, 7, 2, 12, 28, 23, 42, 24, 64, 75, 19, 26, 27,253, //f0 + 9, 8, 14, 7, 2, 12, 28, 23, 42, 24, 64, 75, 19, 26, 27,SYM, //f0 }; //Model Table: diff --git a/src/LangHebrewModel.cpp b/src/LangHebrewModel.cpp index 523203e..dcc1e7a 100644 --- a/src/LangHebrewModel.cpp +++ b/src/LangHebrewModel.cpp @@ -41,10 +41,10 @@ /**************************************************************** -255: Control characters that usually does not exist in any text -254: Carriage/Return -253: symbol (punctuation) that does not belong to word -252: 0 - 9 +CTR: Control characters that usually does not exist in any text +RET: Carriage/Return +SYM: symbol (punctuation) that does not belong to word +NUM: 0 - 9 *****************************************************************/ @@ -52,14 +52,14 @@ //Character Mapping Table: static const unsigned char win1255_CharToOrderMap[] = { -255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 -255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 -+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20 -252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30 -253, 69, 91, 79, 80, 92, 89, 97, 90, 68,111,112, 82, 73, 95, 85, //40 - 78,121, 86, 71, 67,102,107, 84,114,103,115,253,253,253,253,253, //50 -253, 50, 74, 60, 61, 42, 76, 70, 64, 53,105, 93, 56, 65, 54, 49, //60 - 66,110, 51, 43, 44, 63, 81, 77, 98, 75,108,253,253,253,253,253, //70 +CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, //00 +CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, //10 +SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, //20 +NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, //30 +SYM, 69, 91, 79, 80, 92, 89, 97, 90, 68,111,112, 82, 73, 95, 85, //40 + 78,121, 86, 71, 67,102,107, 84,114,103,115,SYM,SYM,SYM,SYM,SYM, //50 +SYM, 50, 74, 60, 61, 42, 76, 70, 64, 53,105, 93, 56, 65, 54, 49, //60 + 66,110, 51, 43, 44, 63, 81, 77, 98, 75,108,SYM,SYM,SYM,SYM,SYM, //70 124,202,203,204,205, 40, 58,206,207,208,209,210,211,212,213,214, 215, 83, 52, 47, 46, 72, 32, 94,216,113,217,109,218,219,220,221, 34,116,222,118,100,223,224,117,119,104,125,225,226, 87, 99,227, @@ -67,7 +67,7 @@ static const unsigned char win1255_CharToOrderMap[] = 30, 59, 41, 88, 33, 37, 36, 31, 29, 35,235, 62, 28,236,126,237, 238, 38, 45,239,240,241,242,243,127,244,245,246,247,248,249,250, 9, 8, 20, 16, 3, 2, 24, 14, 22, 1, 25, 15, 4, 11, 6, 23, - 12, 19, 13, 26, 18, 27, 21, 17, 7, 10, 5,251,252,128, 96,253, + 12, 19, 13, 26, 18, 27, 21, 17, 7, 10, 5,251,NUM,128, 96,SYM, }; //Model Table: diff --git a/src/LangHungarianModel.cpp b/src/LangHungarianModel.cpp index 7eecf4e..72c25d5 100644 --- a/src/LangHungarianModel.cpp +++ b/src/LangHungarianModel.cpp @@ -37,24 +37,24 @@ #include "nsSBCharSetProber.h" /**************************************************************** -255: Control characters that usually does not exist in any text -254: Carriage/Return -253: symbol (punctuation) that does not belong to word -252: 0 - 9 +CTR: Control characters that usually does not exist in any text +RET: Carriage/Return +SYM: symbol (punctuation) that does not belong to word +NUM: 0 - 9 *****************************************************************/ //Character Mapping Table: static const unsigned char Latin2_HungarianCharToOrderMap[] = { -255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 -255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 -+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20 -252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30 -253, 28, 40, 54, 45, 32, 50, 49, 38, 39, 53, 36, 41, 34, 35, 47, - 46, 71, 43, 33, 37, 57, 48, 64, 68, 55, 52,253,253,253,253,253, -253, 2, 18, 26, 17, 1, 27, 12, 20, 9, 22, 7, 6, 13, 4, 8, - 23, 67, 10, 5, 3, 21, 19, 65, 62, 16, 11,253,253,253,253,253, +CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, //00 +CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, //10 +SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, //20 +NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, //30 +SYM, 28, 40, 54, 45, 32, 50, 49, 38, 39, 53, 36, 41, 34, 35, 47, + 46, 71, 43, 33, 37, 57, 48, 64, 68, 55, 52,SYM,SYM,SYM,SYM,SYM, +SYM, 2, 18, 26, 17, 1, 27, 12, 20, 9, 22, 7, 6, 13, 4, 8, + 23, 67, 10, 5, 3, 21, 19, 65, 62, 16, 11,SYM,SYM,SYM,SYM,SYM, 159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174, 175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190, 191,192,193,194,195,196,197, 75,198,199,200,201,202,203,204,205, @@ -62,19 +62,19 @@ static const unsigned char Latin2_HungarianCharToOrderMap[] = 221, 51, 81,222, 78,223,224,225,226, 44,227,228,229, 61,230,231, 232,233,234, 58,235, 66, 59,236,237,238, 60, 69, 63,239,240,241, 82, 14, 74,242, 70, 80,243, 72,244, 15, 83, 77, 84, 30, 76, 85, -245,246,247, 25, 73, 42, 24,248,249,250, 31, 56, 29,251,252,253, +245,246,247, 25, 73, 42, 24,248,249,250, 31, 56, 29,251,NUM,SYM, }; static const unsigned char win1250HungarianCharToOrderMap[] = { -255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 -255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 -+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20 -252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30 -253, 28, 40, 54, 45, 32, 50, 49, 38, 39, 53, 36, 41, 34, 35, 47, - 46, 72, 43, 33, 37, 57, 48, 64, 68, 55, 52,253,253,253,253,253, -253, 2, 18, 26, 17, 1, 27, 12, 20, 9, 22, 7, 6, 13, 4, 8, - 23, 67, 10, 5, 3, 21, 19, 65, 62, 16, 11,253,253,253,253,253, +CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, //00 +CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, //10 +SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, //20 +NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, //30 +SYM, 28, 40, 54, 45, 32, 50, 49, 38, 39, 53, 36, 41, 34, 35, 47, + 46, 72, 43, 33, 37, 57, 48, 64, 68, 55, 52,SYM,SYM,SYM,SYM,SYM, +SYM, 2, 18, 26, 17, 1, 27, 12, 20, 9, 22, 7, 6, 13, 4, 8, + 23, 67, 10, 5, 3, 21, 19, 65, 62, 16, 11,SYM,SYM,SYM,SYM,SYM, 161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176, 177,178,179,180, 78,181, 69,182,183,184,185,186,187,188,189,190, 191,192,193,194,195,196,197, 76,198,199,200,201,202,203,204,205, @@ -82,7 +82,7 @@ static const unsigned char win1250HungarianCharToOrderMap[] = 221, 51, 83,222, 80,223,224,225,226, 44,227,228,229, 61,230,231, 232,233,234, 58,235, 66, 59,236,237,238, 60, 70, 63,239,240,241, 84, 14, 75,242, 71, 82,243, 73,244, 15, 85, 79, 86, 30, 77, 87, -245,246,247, 25, 74, 42, 24,248,249,250, 31, 56, 29,251,252,253, +245,246,247, 25, 74, 42, 24,248,249,250, 31, 56, 29,251,NUM,SYM, }; //Model Table: diff --git a/src/LangThaiModel.cpp b/src/LangThaiModel.cpp index 8145ffa..e8db744 100644 --- a/src/LangThaiModel.cpp +++ b/src/LangThaiModel.cpp @@ -39,10 +39,10 @@ /**************************************************************** -255: Control characters that usually does not exist in any text -254: Carriage/Return -253: symbol (punctuation) that does not belong to word -252: 0 - 9 +CTR: Control characters that usually does not exist in any text +RET: Carriage/Return +SYM: symbol (punctuation) that does not belong to word +NUM: 0 - 9 *****************************************************************/ @@ -51,14 +51,14 @@ //Character Mapping Table: static const unsigned char TIS620CharToOrderMap[] = { -255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 -255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 -+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20 -252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30 -253,182,106,107,100,183,184,185,101, 94,186,187,108,109,110,111, //40 -188,189,190, 89, 95,112,113,191,192,193,194,253,253,253,253,253, //50 -253, 64, 72, 73,114, 74,115,116,102, 81,201,117, 90,103, 78, 82, //60 - 96,202, 91, 79, 84,104,105, 97, 98, 92,203,253,253,253,253,253, //70 +CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, //00 +CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, //10 +SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, //20 +NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, //30 +SYM,182,106,107,100,183,184,185,101, 94,186,187,108,109,110,111, //40 +188,189,190, 89, 95,112,113,191,192,193,194,SYM,SYM,SYM,SYM,SYM, //50 +SYM, 64, 72, 73,114, 74,115,116,102, 81,201,117, 90,103, 78, 82, //60 + 96,202, 91, 79, 84,104,105, 97, 98, 92,203,SYM,SYM,SYM,SYM,SYM, //70 209,210,211,212,213, 88,214,215,216,217,218,219,220,118,221,222, 223,224, 99, 85, 83,225,226,227,228,229,230,231,232,233,234,235, 236, 5, 30,237, 24,238, 75, 8, 26, 52, 34, 51,119, 47, 58, 57, @@ -66,7 +66,7 @@ static const unsigned char TIS620CharToOrderMap[] = 45, 9, 16, 2, 61, 15,239, 12, 42, 46, 18, 21, 76, 4, 66, 63, 22, 10, 1, 36, 23, 13, 40, 27, 32, 35, 86,240,241,242,243,244, 11, 28, 41, 29, 33,245, 50, 37, 6, 7, 67, 77, 38, 93,246,247, - 68, 56, 59, 65, 69, 60, 70, 80, 71, 87,248,249,250,251,252,253, + 68, 56, 59, 65, 69, 60, 70, 80, 71, 87,248,249,250,251,NUM,SYM, }; diff --git a/src/nsSBCharSetProber.h b/src/nsSBCharSetProber.h index d7180dc..f7ef33a 100644 --- a/src/nsSBCharSetProber.h +++ b/src/nsSBCharSetProber.h @@ -40,6 +40,19 @@ #include "nsCharSetProber.h" +/** Codepoints **/ + +/* Illegal codepoints.*/ +#define ILL 255 +/* Control character. */ +#define CTR 254 +/* Symbols and punctuation that does not belong to words. */ +#define SYM 253 +/* Return/Line feeds. */ +#define RET 252 +/* Numbers 0-9. */ +#define NUM 251 + #define SAMPLE_SIZE 64 #define SB_ENOUGH_REL_THRESHOLD 1024 #define POSITIVE_SHORTCUT_THRESHOLD (float)0.95