00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026 #include "nsLatin1Prober.h"
00027 #include <stdio.h>
00028 #include <stdlib.h>
00029
// Character classes. Latin1_CharToClass assigns one of these to every byte
// value, and Latin1ClassModel is indexed by
// (previous class * CLASS_NUM + current class).
00030 #define UDF 0 // undefined byte; every class pair involving it scores 0 in Latin1ClassModel
00031 #define OTH 1 // other: anything not classified as a letter (controls, digits, punctuation, symbols)
00032 #define ASC 2 // ASCII capital letter (A-Z)
00033 #define ASS 3 // ASCII small letter (a-z)
00034 #define ACV 4 // accented capital vowel
00035 #define ACO 5 // accented capital letter other than a vowel
00036 #define ASV 6 // accented small vowel
00037 #define ASO 7 // accented small letter other than a vowel
00038 #define CLASS_NUM 8 // total number of classes; also the row width of Latin1ClassModel
00039
00040 namespace kencodingprober {
00041 static unsigned char Latin1_CharToClass[] =
00042 {
00043 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,
00044 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,
00045 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,
00046 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,
00047 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,
00048 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,
00049 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,
00050 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,
00051 OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
00052 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
00053 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
00054 ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH,
00055 OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS,
00056 ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,
00057 ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,
00058 ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH,
00059 OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH,
00060 OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF,
00061 UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH,
00062 OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO,
00063 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,
00064 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,
00065 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,
00066 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,
00067 ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO,
00068 ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV,
00069 ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH,
00070 ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO,
00071 ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO,
00072 ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV,
00073 ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH,
00074 ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO,
00075 };
00076
00077
00078
00079
00080
00081
00082
// Transition model for pairs of consecutive character classes, stored as a
// flattened 8 x 8 matrix: the row is the class of the previous character, the
// column is the class of the current one (HandleData() reads entry
// [previous * CLASS_NUM + current]).
//
// Each entry is a frequency category used as an index into mFreqCounter:
//   0 - the pair rules this prober out (HandleData() switches to eNotMe)
//   1 - counted, and heavily penalised by GetConfidence()
//   2 - counted, neither rewarded nor penalised
//   3 - counted, and rewarded by GetConfidence()
//
// The table is a read-only lookup and is therefore declared const.
static const unsigned char Latin1ClassModel[] =
{
// UDF OTH ASC ASS ACV ACO ASV ASO      <- current class
    0,  0,  0,  0,  0,  0,  0,  0,   // previous: UDF
    0,  3,  3,  3,  3,  3,  3,  3,   // previous: OTH
    0,  3,  3,  3,  3,  3,  3,  3,   // previous: ASC
    0,  3,  3,  3,  1,  1,  3,  3,   // previous: ASS
    0,  3,  3,  3,  1,  2,  1,  2,   // previous: ACV
    0,  3,  3,  3,  3,  3,  3,  3,   // previous: ACO
    0,  3,  1,  3,  1,  1,  1,  3,   // previous: ASV
    0,  3,  1,  3,  1,  1,  3,  3,   // previous: ASO
};
00095
00096 void nsLatin1Prober::Reset(void)
00097 {
00098 mState = eDetecting;
00099 mLastCharClass = OTH;
00100 for (int i = 0; i < FREQ_CAT_NUM; i++)
00101 mFreqCounter[i] = 0;
00102 }
00103
00104
00105 nsProbingState nsLatin1Prober::HandleData(const char* aBuf, unsigned int aLen)
00106 {
00107 char *newBuf1 = 0;
00108 unsigned int newLen1 = 0;
00109
00110 if (!FilterWithEnglishLetters(aBuf, aLen, &newBuf1, newLen1)) {
00111 newBuf1 = (char*)aBuf;
00112 newLen1 = aLen;
00113 }
00114
00115 unsigned char charClass;
00116 unsigned char freq;
00117 for (unsigned int i = 0; i < newLen1; i++)
00118 {
00119 charClass = Latin1_CharToClass[(unsigned char)newBuf1[i]];
00120 freq = Latin1ClassModel[mLastCharClass*CLASS_NUM + charClass];
00121 if (freq == 0) {
00122 mState = eNotMe;
00123 break;
00124 }
00125 mFreqCounter[freq]++;
00126 mLastCharClass = charClass;
00127 }
00128
00129 if (newBuf1 != aBuf)
00130 free(newBuf1);
00131
00132 return mState;
00133 }
00134
00135 float nsLatin1Prober::GetConfidence(void)
00136 {
00137 if (mState == eNotMe)
00138 return 0.01f;
00139
00140 float confidence;
00141 unsigned int total = 0;
00142 for (int i = 0; i < FREQ_CAT_NUM; i++)
00143 total += mFreqCounter[i];
00144
00145 if(!total)
00146 confidence = 0.0f;
00147 else
00148 {
00149 confidence = mFreqCounter[3]*1.0f / total;
00150 confidence -= mFreqCounter[1]*20.0f/total;
00151 }
00152
00153 if (confidence < 0.0f)
00154 confidence = 0.0f;
00155
00156
00157
00158 confidence *= 0.50f;
00159
00160 return confidence;
00161 }
00162
#ifdef DEBUG_PROBE
// Debug aid, compiled only when DEBUG_PROBE is defined: prints the current
// confidence and the charset name to stdout.
void nsLatin1Prober::DumpStatus()
{
  // printf receives a float as double anyway; make the promotion explicit.
  const double confidence = GetConfidence();
  printf(" Latin1Prober: %1.3f [%s]\r\n", confidence, GetCharSetName());
}
#endif
00169 }
00170
00171