• Skip to content
  • Skip to link menu
KDE 4.3 API Reference
  • KDE API Reference
  • kdelibs
  • Sitemap
  • Contact Us
 

KDECore

nsUniversalDetector.cpp

Go to the documentation of this file.
00001 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
00002 /*  -*- C++ -*-
00003 *  Copyright (C) 1998 <developer@mozilla.org>
00004 *  Copyright (C) 2008 <wkai@gmail.com>
00005 *
00006 *  Permission is hereby granted, free of charge, to any person obtaining
00007 *  a copy of this software and associated documentation files (the
00008 *  "Software"), to deal in the Software without restriction, including
00009 *  without limitation the rights to use, copy, modify, merge, publish,
00010 *  distribute, sublicense, and/or sell copies of the Software, and to
00011 *  permit persons to whom the Software is furnished to do so, subject to
00012 *  the following conditions:
00013 *
00014 *  The above copyright notice and this permission notice shall be included 
00015 *  in all copies or substantial portions of the Software.
00016 *
00017 *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
00018 *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
00019 *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
00020 *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
00021 *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
00022 *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
00023 *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
00024 */
00025 
00026 #include "nsUniversalDetector.h"
00027 
00028 #include "nsMBCSGroupProber.h"
00029 #include "nsSBCSGroupProber.h"
00030 #include "nsEscCharsetProber.h"
00031 #include "nsLatin1Prober.h"
00032 
00033 namespace kencodingprober {
00034 nsUniversalDetector::nsUniversalDetector()
00035 {
00036   mDone = false;
00037   mBestGuess = -1;   //illegal value as signal
00038   mInTag = false;
00039   mEscCharSetProber = 0;
00040 
00041   mStart = true;
00042   mDetectedCharset = 0;
00043   mGotData = false;
00044   mInputState = ePureAscii;
00045   mLastChar = '\0';
00046 
00047   unsigned int i;
00048   for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
00049     mCharSetProbers[i] = 0;
00050 }
00051 
00052 nsUniversalDetector::~nsUniversalDetector() 
00053 {
00054   for (int i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
00055     if (mCharSetProbers[i])      
00056       delete mCharSetProbers[i];
00057   if (mEscCharSetProber)
00058     delete mEscCharSetProber;
00059 }
00060 
00061 void 
00062 nsUniversalDetector::Reset()
00063 {
00064   mDone = false;
00065   mBestGuess = -1;   //illegal value as signal
00066   mInTag = false;
00067 
00068   mStart = true;
00069   mDetectedCharset = 0;
00070   mGotData = false;
00071   mInputState = ePureAscii;
00072   mLastChar = '\0';
00073 
00074   if (mEscCharSetProber)
00075     mEscCharSetProber->Reset();
00076 
00077   unsigned int i;
00078   for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
00079     if (mCharSetProbers[i])
00080       mCharSetProbers[i]->Reset();
00081 }
00082 
00083 //---------------------------------------------------------------------
// Confidence thresholds, expressed as single-precision values.
// MINIMUM_THRESHOLD is the floor a prober's confidence must exceed before its
// answer is reported by the detector.
#define SHORTCUT_THRESHOLD      0.95f
#define MINIMUM_THRESHOLD       0.20f
00086 
00087 nsProbingState nsUniversalDetector::HandleData(const char* aBuf, unsigned int aLen)
00088 {
00089   if(mDone) 
00090     return eFoundIt;
00091 
00092   if (aLen > 0)
00093     mGotData = true;
00094 
00095   unsigned int i;
00096   for (i = 0; i < aLen; i++)
00097   {
00098     //other than 0xa0, if every othe character is ascii, the page is ascii
00099     if (aBuf[i] & '\x80' && aBuf[i] != '\xA0')  //Since many Ascii only page contains NBSP 
00100     {
00101       //we got a non-ascii byte (high-byte)
00102       if (mInputState != eHighbyte)
00103       {
00104         //adjust state
00105         mInputState = eHighbyte;
00106 
00107         //kill mEscCharSetProber if it is active
00108         if (mEscCharSetProber) {
00109           delete mEscCharSetProber;
00110           mEscCharSetProber = 0;
00111         }
00112 
00113         //start multibyte and singlebyte charset prober
00114         if (0 == mCharSetProbers[0])
00115           mCharSetProbers[0] = new nsMBCSGroupProber;
00116         if (0 == mCharSetProbers[1])
00117           mCharSetProbers[1] = new nsSBCSGroupProber;
00118         if (0 == mCharSetProbers[2])
00119           mCharSetProbers[2] = new nsLatin1Prober; 
00120       }
00121     }
00122     else
00123     {
00124       //ok, just pure ascii so far
00125       if ( ePureAscii == mInputState &&
00126         (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~')) )
00127       {
00128         //found escape character or HZ "~{"
00129         mInputState = eEscAscii;
00130       }
00131           
00132       mLastChar = aBuf[i];
00133     }
00134   }
00135 
00136   nsProbingState st = eDetecting;
00137   switch (mInputState)
00138   {
00139   case eEscAscii:
00140     if (0 == mEscCharSetProber) {
00141       mEscCharSetProber = new nsEscCharSetProber;
00142     }
00143     st = mEscCharSetProber->HandleData(aBuf, aLen);
00144     if (st == eFoundIt)
00145     {
00146       mDone = true;
00147       mDetectedCharset = mEscCharSetProber->GetCharSetName();
00148     }
00149     break;
00150   case eHighbyte:
00151     for (i = 0; i < NUM_OF_CHARSET_PROBERS; ++i)
00152     {
00153       st = mCharSetProbers[i]->HandleData(aBuf, aLen);
00154       if (st == eFoundIt) 
00155       {
00156         mDone = true;
00157         mDetectedCharset = mCharSetProbers[i]->GetCharSetName();
00158       } 
00159     }
00160     break;
00161 
00162   default:  //pure ascii
00163     mDetectedCharset = "UTF-8";
00164   }
00165   return st;
00166 }
00167 
00168 
00169 //---------------------------------------------------------------------
00170 const char* nsUniversalDetector::GetCharSetName()
00171 {
00172   if (mDetectedCharset)
00173     return mDetectedCharset;
00174   switch (mInputState)
00175   {
00176   case eHighbyte:
00177     {
00178       float proberConfidence;
00179       float maxProberConfidence = (float)0.0;
00180       int maxProber = 0;
00181 
00182       for (int i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
00183       {
00184         proberConfidence = mCharSetProbers[i]->GetConfidence();
00185         if (proberConfidence > maxProberConfidence)
00186         {
00187           maxProberConfidence = proberConfidence;
00188           maxProber = i;
00189         }
00190       }
00191       //do not report anything because we are not confident of it, that's in fact a negative answer
00192       if (maxProberConfidence > MINIMUM_THRESHOLD)
00193         return mCharSetProbers[maxProber]->GetCharSetName();
00194     }
00195   case eEscAscii:
00196     break;
00197   default:           // pure ascii
00198       ;
00199   }
00200   return "UTF-8";
00201 
00202 }
00203 
00204 //---------------------------------------------------------------------
00205 float nsUniversalDetector::GetConfidence()
00206 {
00207   if (!mGotData)
00208   {
00209     // we haven't got any data yet, return immediately
00210     // caller program sometimes call DataEnd before anything has been sent to detector
00211     return MINIMUM_THRESHOLD;
00212   }
00213   if (mDetectedCharset)
00214     return 0.99f;
00215   switch (mInputState)
00216   {
00217   case eHighbyte:
00218     {
00219       float proberConfidence;
00220       float maxProberConfidence = (float)0.0;
00221       int maxProber = 0;
00222 
00223       for (int i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
00224       {
00225         proberConfidence = mCharSetProbers[i]->GetConfidence();
00226         if (proberConfidence > maxProberConfidence)
00227         {
00228           maxProberConfidence = proberConfidence;
00229           maxProber = i;
00230         }
00231       }
00232       //do not report anything because we are not confident of it, that's in fact a negative answer
00233       if (maxProberConfidence > MINIMUM_THRESHOLD)
00234         return mCharSetProbers[maxProber]->GetConfidence();
00235     }
00236   case eEscAscii:
00237     break;
00238   default:           // pure ascii
00239       ;
00240   }
00241   return MINIMUM_THRESHOLD;
00242 }
00243 
00244 nsProbingState nsUniversalDetector::GetState()
00245 {
00246     if (mDone)
00247         return eFoundIt;
00248     else
00249         return eDetecting;
00250 }
00251 }
00252 
00253 

KDECore

Skip menu "KDECore"
  • Main Page
  • Modules
  • Namespace List
  • Class Hierarchy
  • Alphabetical List
  • Class List
  • File List
  • Namespace Members
  • Class Members
  • Related Pages

kdelibs

Skip menu "kdelibs"
  • DNSSD
  • Interfaces
  •   KHexEdit
  •   KMediaPlayer
  •   KSpeech
  •   KTextEditor
  • Kate
  • kconf_update
  • KDE3Support
  •   KUnitTest
  • KDECore
  • KDED
  • KDEsu
  • KDEUI
  • KDocTools
  • KFile
  • KHTML
  • KImgIO
  • KInit
  • kio
  • KIOSlave
  • KJS
  •   KJS-API
  •   WTF
  • kjsembed
  • KNewStuff
  • KParts
  • KPty
  • Kross
  • KUtils
  • Nepomuk
  • Plasma
  • Solid
  • Sonnet
  • ThreadWeaver
Generated for kdelibs by doxygen 1.6.1
This website is maintained by Adriaan de Groot and Allen Winter.
KDE® and the K Desktop Environment® logo are registered trademarks of KDE e.V. | Legal