makedent.cpp

00001 /* vim: set sw=8: -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
00002 /* enchant
00003  * Copyright (C) 2003 Dom Lachowicz
00004  *
00005  * This library is free software; you can redistribute it and/or
00006  * modify it under the terms of the GNU Lesser General Public
00007  * License as published by the Free Software Foundation; either
00008  * version 2.1 of the License, or (at your option) any later version.
00009  *
00010  * This library is distributed in the hope that it will be useful,
00011  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00012  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00013  * Lesser General Public License for more details.
00014  *
00015  * You should have received a copy of the GNU Lesser General Public
00016  * License along with this library; if not, write to the
00017  * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
00018  * Boston, MA 02110-1301, USA.
00019  *
00020  * In addition, as a special exception, Dom Lachowicz
00021  * gives permission to link the code of this program with
00022  * non-LGPL Spelling Provider libraries (eg: a MSFT Office
00023  * spell checker backend) and distribute linked combinations including
00024  * the two.  You must obey the GNU Lesser General Public License in all
00025  * respects for all of the code used other than said providers.  If you modify
00026  * this file, you may extend this exception to your version of the
00027  * file, but you are not obligated to do so.  If you do not wish to
00028  * do so, delete this exception statement from your version.
00029  */
00030 
00031 /*
00032  * Copyright 1988, 1989, 1992, 1993, Geoff Kuenning, Granada Hills, CA
00033  * All rights reserved.
00034  *
00035  * Redistribution and use in source and binary forms, with or without
00036  * modification, are permitted provided that the following conditions
00037  * are met:
00038  *
00039  * 1. Redistributions of source code must retain the above copyright
00040  *    notice, this list of conditions and the following disclaimer.
00041  * 2. Redistributions in binary form must reproduce the above copyright
00042  *    notice, this list of conditions and the following disclaimer in the
00043  *    documentation and/or other materials provided with the distribution.
00044  * 3. All modifications to the source code must be clearly marked as
00045  *    such.  Binary redistributions based on modified source code
00046  *    must be clearly marked as modified versions in the documentation
00047  *    and/or other materials provided with the distribution.
00048  * 4. All advertising materials mentioning features or use of this software
00049  *    must display the following acknowledgment:
00050  *      This product includes software developed by Geoff Kuenning and
00051  *      other unpaid contributors.
00052  * 5. The name of Geoff Kuenning may not be used to endorse or promote
00053  *    products derived from this software without specific prior
00054  *    written permission.
00055  *
00056  * THIS SOFTWARE IS PROVIDED BY GEOFF KUENNING AND CONTRIBUTORS ``AS IS'' AND
00057  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00058  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00059  * ARE DISCLAIMED.  IN NO EVENT SHALL GEOFF KUENNING OR CONTRIBUTORS BE LIABLE
00060  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
00061  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
00062  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
00063  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00064  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
00065  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
00066  * SUCH DAMAGE.
00067  */
00068 
00069 /*
00070  * $Log$
00071  * Revision 1.2  2004/02/01 04:46:46  zrusin
00072  * Both ispell and aspell plugins are not working properly. We can start switching.
00073  *
00074  * Revision 1.1  2004/01/31 16:44:12  zrusin
00075  * ISpell plugin.
00076  *
00077  * Revision 1.4  2003/08/14 17:51:28  dom
00078  * update license - exception clause should be Lesser GPL
00079  *
00080  * Revision 1.3  2003/07/28 20:40:27  dom
00081  * fix up the license clause, further win32-registry proof some directory getting functions
00082  *
00083  * Revision 1.2  2003/07/16 22:52:49  dom
00084  * LGPL + exception license
00085  *
00086  * Revision 1.1  2003/07/15 01:15:08  dom
00087  * ispell enchant backend
00088  *
00089  * Revision 1.3  2003/02/12 02:10:38  hippietrail
00090  *
00091  * C casts -> C++ casts
00092  * Improved const-correctness due to changing casts
00093  * Fixed some warnings
00094  *
00095  * Revision 1.2  2003/01/29 05:50:12  hippietrail
00096  *
00097  * Fixed my mess in EncodingManager.
00098  * Changed many C casts to C++ casts.
00099  *
00100  * Revision 1.1  2003/01/24 05:52:35  hippietrail
00101  *
00102  * Refactored ispell code. Old ispell global variables had been put into
00103  * an allocated structure, a pointer to which was passed to many functions.
00104  * I have now made all such functions and variables private members of the
00105  * ISpellChecker class. It was C OO, now it's C++ OO.
00106  *
00107  * I've fixed the makefiles and tested compilation but am unable to test
00108  * operation. Please back out my changes if they cause problems which
00109  * are not obvious or easy to fix.
00110  *
00111  * Revision 1.8  2003/01/06 18:48:40  dom
00112  * ispell cleanup, start of using new 'add' save features
00113  *
00114  * Revision 1.7  2003/01/04 19:09:04  dom
00115  * some tidying... bug pissing me off...
00116  *
00117  * Revision 1.6  2002/09/19 05:31:18  hippietrail
00118  *
00119  * More Ispell cleanup.  Conditional globals and DEREF macros are removed.
00120  * K&R function declarations removed, converted to Doxygen style comments
00121  * where possible.  No code has been changed (I hope).  Compiles for me but
00122  * unable to test.
00123  *
00124  * Revision 1.5  2002/09/17 03:03:30  hippietrail
00125  *
00126  * After seeking permission on the developer list I've reformatted all the
00127  * spelling source which seemed to have parts which used 2, 3, 4, and 8
00128  * spaces for tabs.  It should all look good with our standard 4-space
00129  * tabs now.
00130  * I've concentrated just on indentation in the actual code.  More prettying
00131  * could be done.
00132  * * NO code changes were made *
00133  *
00134  * Revision 1.4  2002/09/13 17:20:13  mpritchett
00135  * Fix more warnings for Linux build
00136  *
00137  * Revision 1.3  2002/03/22 14:31:57  dom
00138  * fix mg's compile problem
00139  *
00140  * Revision 1.2  2001/05/12 16:05:42  thomasf
00141  * Big pseudo changes to ispell to make it pass around a structure rather
00142  * than rely on all sorts of gloabals willy nilly here and there.  Also
00143  * fixed our spelling class to work with accepting suggestions once more.
00144  * This code is dirty, gross and ugly (not to mention still not supporting
00145  * multiple hash sized just yet) but it works on my machine and will no
00146  * doubt break other machines.
00147  *
00148  * Revision 1.1  2001/04/15 16:01:24  tomas_f
00149  * moving to spell/xp
00150  *
00151  * Revision 1.6  1999/12/21 18:46:29  sterwill
00152  * ispell patch for non-English dictionaries by Henrik Berg <henrik@lansen.se>
00153  *
00154  * Revision 1.5  1999/10/20 03:19:35  paul
00155  * Hacked ispell code to ignore any characters that don't fit in the lookup tables loaded from the dictionary.  It ain't pretty, but at least we don't crash there any more.
00156  *
00157  * Revision 1.4  1999/04/13 17:12:51  jeff
00158  * Applied "Darren O. Benham" <gecko@benham.net> spell check changes.
00159  * Fixed crash on Win32 with the new code.
00160  *
00161  * Revision 1.3  1998/12/29 14:55:33  eric
00162  *
00163  * I've doctored the ispell code pretty extensively here.  It is now
00164  * warning-free on Win32.  It also *works* on Win32 now, since I
00165  * replaced all the I/O calls with ANSI standard ones.
00166  *
00167  * Revision 1.3  1998/12/29 14:55:33  eric
00168  *
00169  * I've doctored the ispell code pretty extensively here.  It is now
00170  * warning-free on Win32.  It also *works* on Win32 now, since I
00171  * replaced all the I/O calls with ANSI standard ones.
00172  *
00173  * Revision 1.2  1998/12/28 23:11:30  eric
00174  *
00175  * modified spell code and integration to build on Windows.
00176  * This is still a hack.
00177  *
00178  * Actually, it doesn't yet WORK on Windows.  It just builds.
00179  * SpellCheckInit is failing for some reason.
00180  *
00181  * Revision 1.1  1998/12/28 18:04:43  davet
00182  * Spell checker code stripped from ispell.  At this point, there are
00183  * two external routines...  the Init routine, and a check-a-word routine
00184  * which returns a boolean value, and takes a 16 bit char string.
00185  * The code resembles the ispell code as much as possible still.
00186  *
00187  * Revision 1.45  1994/12/27  23:08:52  geoff
00188  * Add code to makedent to reject words that contain non-word characters.
00189  * This helps protect people who use ISO 8-bit characters when ispell
00190  * isn't configured for that option.
00191  *
00192  * Revision 1.44  1994/10/25  05:46:20  geoff
00193  * Fix some incorrect declarations in the lint versions of some routines.
00194  *
00195  * Revision 1.43  1994/09/16  03:32:34  geoff
00196  * Issue an error message for bad affix flags
00197  *
00198  * Revision 1.42  1994/02/07  04:23:43  geoff
00199  * Correctly identify the deformatter when changing file types
00200  *
00201  * Revision 1.41  1994/01/25  07:11:55  geoff
00202  * Get rid of all old RCS log lines in preparation for the 3.1 release.
00203  *
00204  */
00205 
00206 #include <stdlib.h>
00207 #include <string.h>
00208 #include <ctype.h>
00209 
00210 #include "ispell_checker.h"
00211 #include "msgs.h"
00212 
/*
 * Forward declarations.  Each parameter list is wrapped in P(), a macro
 * that is not defined in this file (presumably the usual prototype
 * compatibility wrapper -- confirm in the ispell headers).
 * makedent and toutent are declared here but no definition for them
 * appears in this file.
 */
int     makedent P ((char * lbuf, int lbuflen, struct dent * ent));
/* Prototypes of routines that are no longer compiled, kept for reference: */
/*int       combinecaps P ((struct dent * hdr, struct dent * newent));
#ifndef NO_CAPITALIZATION_SUPPORT
static void forcevheader P ((struct dent * hdrp, struct dent * oldp,
          struct dent * newp));
#endif / * NO_CAPITALIZATION_SUPPORT * /
static int  combine_two_entries P ((struct dent * hdrp,
          struct dent * oldp, struct dent * newp));
static int  acoversb P ((struct dent * enta, struct dent * entb));
*/
/*static int    issubset P ((struct dent * ent1, struct dent * ent2));
static void combineaffixes P ((struct dent * ent1, struct dent * ent2));*/

void        toutent P ((FILE * outfile, struct dent * hent,
          int onlykeep));
/* Output helpers that are no longer compiled, kept for reference: */
/*static void   toutword P ((FILE * outfile, char * word,
          struct dent * cent));
static void flagout P ((FILE * outfile, int flag));
*/
/* Free-standing ichar_t string helpers; only declared (and defined
 * further down) when ICHAR_IS_CHAR is not defined. */
#ifndef ICHAR_IS_CHAR
ichar_t *   icharcpy P ((ichar_t * out, ichar_t * in));
int     icharlen P ((ichar_t * str));
int     icharcmp P ((ichar_t * s1, ichar_t * s2));
int     icharncmp P ((ichar_t * s1, ichar_t * s2, int n));
#endif /* ICHAR_IS_CHAR */
00238 
00239 /*static int    has_marker;*/
00240 
00241 /*
00242  * Fill in a directory entry, including setting the capitalization flags, and
00243  * allocate and initialize memory for the d->word field.  Returns -1
00244  * if there was trouble.  The input word must be in canonical form.
00245 int makedent (lbuf, lbuflen, d)
00246 This function is not used by AbiWord.  I don't know if it'll be needed for 
00247 other abi documents
00248  */
00249     
00250 #ifndef NO_CAPITALIZATION_SUPPORT
00251 
00259 long
00260 ISpellChecker::whatcap (ichar_t *word)
00261 {
00262     register ichar_t *  p;
00263 
00264     for (p = word;  *p;  p++)
00265     {
00266         if (mylower (*p))
00267             break;
00268     }
00269     if (*p == '\0')
00270         return ALLCAPS;
00271     else
00272     {
00273         for (  ;  *p;  p++)
00274         {
00275             if (myupper (*p))
00276                 break;
00277         }
00278         if (*p == '\0')
00279         {
00280             /*
00281             ** No uppercase letters follow the lowercase ones.
00282             ** If there is more than one uppercase letter, it's
00283             ** "followcase". If only the first one is capitalized,
00284             ** it's "capitalize".  If there are no capitals
00285             ** at all, it's ANYCASE.
00286             */
00287             if (myupper (word[0]))
00288             {
00289                 for (p = word + 1;  *p != '\0';  p++)
00290                 {
00291                     if (myupper (*p))
00292                         return FOLLOWCASE;
00293                 }
00294                 return CAPITALIZED;
00295             }
00296             else
00297                 return ANYCASE;
00298         }
00299         else
00300             return FOLLOWCASE;  /* .../lower/upper */
00301     }
00302 }
00303 
00312 int ISpellChecker::addvheader ( struct dent *dp)
00313 {
00314     register struct dent *  tdent; /* Copy of entry */
00315 
00316     /*
00317     ** Add a second entry with the correct capitalization, and then make
00318     ** dp into a special dummy entry.
00319     */
00320     tdent = static_cast<struct dent *>(malloc(sizeof (struct dent)));
00321     if (tdent == NULL)
00322     {
00323         fprintf (stderr, MAKEDENT_C_NO_WORD_SPACE, dp->word);
00324         return -1;
00325     }
00326     *tdent = *dp;
00327     if (captype (tdent->flagfield) != FOLLOWCASE)
00328         tdent->word = NULL;
00329     else
00330     {
00331         /* Followcase words need a copy of the capitalization */
00332         tdent->word = static_cast<char *>(malloc (static_cast<unsigned int>(strlen(tdent->word)) + 1));
00333         if (tdent->word == NULL)
00334         {
00335             fprintf (stderr, MAKEDENT_C_NO_WORD_SPACE, dp->word);
00336             free (reinterpret_cast<char *>(tdent));
00337             return -1;
00338         }
00339         strcpy (tdent->word, dp->word);
00340     }
00341     chupcase (dp->word);
00342     dp->next = tdent;
00343     dp->flagfield &= ~CAPTYPEMASK;
00344     dp->flagfield |= (ALLCAPS | MOREVARIANTS);
00345     return 0;
00346 }
00347 #endif /* NO_CAPITALIZATION_SUPPORT */
00348 
00349 /*
00350 ** Combine and resolve the entries describing two capitalizations of the same
00351 ** word.  This may require allocating yet more entries.
00352 **
00353 ** Hdrp is a pointer into a hash table.  If the word covered by hdrp has
00354 ** variations, hdrp must point to the header.  Newp is a pointer to temporary
00355 ** storage, and space is malloc'ed if newp is to be kept.  The newp->word
00356 ** field must have been allocated with mymalloc, so that this routine may free
00357 ** the space if it keeps newp but not the word.
00358 **
00359 ** Return value:  0 if the word was added, 1 if the word was combined
00360 ** with an existing entry, and -1 if trouble occurred (e.g., malloc).
00361 ** If 1 is returned, newp->word may have been freed using myfree.
00362 **
00363 ** Life is made much more difficult by the KEEP flag's possibilities.  We
00364 ** must ensure that a !KEEP word doesn't find its way into the personal
00365 ** dictionary as a result of this routine's actions.  However, a !KEEP
00366 ** word that has affixes must have come from the main dictionary, so it
00367 ** is acceptable to combine entries in that case (got that?).
00368 **
00369 ** The net result of all this is a set of rules that is a bloody pain
00370 ** to figure out.  Basically, we want to choose one of the following actions:
00371 **
00372 **  (1) Add newp's affixes and KEEP flag to oldp, and discard newp.
00373 **  (2) Add oldp's affixes and KEEP flag to newp, replace oldp with
00374 **      newp, and discard newp.
00375 #ifndef NO_CAPITALIZATION_SUPPORT
00376 **  (3) Insert newp as a new entry in the variants list.  If there is
00377 **      currently no variant header, this requires adding one.  Adding a
00378 **      header splits into two sub-cases:
00379 **
00380 **      (3a) If oldp is ALLCAPS and the KEEP flags match, just turn it
00381 **      into the header.
00382 **      (3b) Otherwise, add a new entry to serve as the header.
00383 **      To ease list linking, this is done by copying oldp into
00384 **      the new entry, and then performing (3a).
00385 **
00386 **      After newp has been added as a variant, its affixes and KEEP
00387 **      flag are OR-ed into the variant header.
00388 #endif
00389 **
00390 ** So how to choose which?  The default is always case (3), which adds newp
00391 ** as a new entry in the variants list.  Cases (1) and (2) are symmetrical
00392 ** except for which entry is discarded.  We can use case (1) or (2) whenever
00393 ** one entry "covers" the other.  "Covering" is defined as follows:
00394 **
00395 **  (4) For entries with matching capitalization types, A covers B
00396 **      if:
00397 **
00398 **      (4a) B's affix flags are a subset of A's, or the KEEP flags
00399 **       match, and
00400 **      (4b) either the KEEP flags match, or A's KEEP flag is set.
00401 **      (Since A has more suffixes, combining B with it won't
00402 **      cause any extra suffixes to be added to the dictionary.)
00403 **      (4c) If the words are FOLLOWCASE, the capitalizations match
00404 **      exactly.
00405 **
00406 #ifndef NO_CAPITALIZATION_SUPPORT
00407 **  (5) For entries with mismatched capitalization types, A covers B
00408 **      if (4a) and (4b) are true, and:
00409 **
00410 **      (5a) B is ALLCAPS, or
00411 **      (5b) A is ANYCASE, and B is CAPITALIZED.
00412 #endif
00413 **
00414 ** For any "hdrp" without variants, oldp is the same as hdrp.  Otherwise,
00415 ** the above tests are applied using each variant in turn for oldp.
00416 int combinecaps (hdrp, newp)
00417 static void forcevheader (hdrp, oldp, newp)
00418 static int combine_two_entries (hdrp, oldp, newp)
00419 static int acoversb (enta, entb)
00420 */
00421 
00422 /*
00423  * \param s
00424  */
00425 void
00426 ISpellChecker::upcase (ichar_t *s)
00427 {
00428 
00429     while (*s)
00430     {
00431         *s = mytoupper (*s);
00432         s++;
00433     }
00434 }
00435 
00436 /*
00437  * \param s
00438  */
00439 void
00440 ISpellChecker::lowcase (ichar_t *s)
00441 {
00442 
00443     while (*s)
00444     {
00445         *s = mytolower (*s);
00446         s++;
00447     }
00448 }
00449 
00456 void
00457 ISpellChecker::chupcase (char *s)
00458 {
00459     ichar_t *   is;
00460 
00461     is = strtosichar (s, 1);
00462     upcase (is);
00463     ichartostr (s, is, strlen (s) + 1, 1);
00464 }
00465 
00466 /*
00467 ** See if one affix field is a subset of another.  Returns NZ if ent1
00468 ** is a subset of ent2.  The KEEP flag is not taken into consideration.
00469 static int issubset (ent1, ent2)
00470 static void combineaffixes (ent1, ent2)
00471 */
00472 
00473 /*
00474 ** Write out a dictionary entry, including capitalization variants.
00475 ** If onlykeep is true, only those variants with KEEP set will be
00476 ** written.
00477 Removed -- not used by Abiword
00478 void toutent_ (toutfile, hent, onlykeep)
00479 static void toutword (toutfile, word, cent)
00480 static void flagout (toutfile, flag)
00481 */
00482 
00498 int
00499 ISpellChecker::stringcharlen (char *bufp, int canonical)
00500 {
00501 #ifdef SLOWMULTIPLY
00502     static char *   sp[MAXSTRINGCHARS];
00503     static int      inited = 0;
00504 #endif /* SLOWMULTIPLY */
00505     register char * bufcur;
00506     register char * stringcur;
00507     register int    stringno;
00508     register int    lowstringno;
00509     register int    highstringno;
00510     int         dupwanted;
00511 
00512 #ifdef SLOWMULTIPLY
00513     if (!inited)
00514     {
00515         inited = 1;
00516         for (stringno = 0;  stringno < MAXSTRINGCHARS;  stringno++)
00517             sp[stringno] = &hashheader.stringchars[stringno][0];
00518     }
00519 #endif /* SLOWMULTIPLY */
00520     lowstringno = 0;
00521     highstringno = m_hashheader.nstrchars - 1;
00522     dupwanted = canonical ? 0 : m_defdupchar;
00523     while (lowstringno <= highstringno)
00524     {
00525         stringno = (lowstringno + highstringno) >> 1;
00526 #ifdef SLOWMULTIPLY
00527         stringcur = sp[stringno];
00528 #else /* SLOWMULTIPLY */
00529         stringcur = &m_hashheader.stringchars[stringno][0];
00530 #endif /* SLOWMULTIPLY */
00531         bufcur = bufp;
00532         while (*stringcur)
00533         {
00534 #ifdef NO8BIT
00535             if (((*bufcur++ ^ *stringcur) & 0x7F) != 0)
00536 #else /* NO8BIT */
00537             if (*bufcur++ != *stringcur)
00538 #endif /* NO8BIT */
00539                 break;
00540             /*
00541             ** We can't use autoincrement above because of the
00542             ** test below.
00543             */
00544             stringcur++;
00545         }
00546         if (*stringcur == '\0')
00547         {
00548             if (m_hashheader.dupnos[stringno] == dupwanted)
00549             {
00550                 /* We have a match */
00551                 m_laststringch = m_hashheader.stringdups[stringno];
00552 #ifdef SLOWMULTIPLY
00553                 return stringcur - sp[stringno];
00554 #else /* SLOWMULTIPLY */
00555                 return stringcur - &m_hashheader.stringchars[stringno][0];
00556 #endif /* SLOWMULTIPLY */
00557             }
00558             else
00559                 --stringcur;
00560         }
00561         /* No match - choose which side to search on */
00562 #ifdef NO8BIT
00563         if ((*--bufcur & 0x7F) < (*stringcur & 0x7F))
00564             highstringno = stringno - 1;
00565         else if ((*bufcur & 0x7F) > (*stringcur & 0x7F))
00566             lowstringno = stringno + 1;
00567 #else /* NO8BIT */
00568         if (*--bufcur < *stringcur)
00569             highstringno = stringno - 1;
00570         else if (*bufcur > *stringcur)
00571             lowstringno = stringno + 1;
00572 #endif /* NO8BIT */
00573         else if (dupwanted < m_hashheader.dupnos[stringno])
00574             highstringno = stringno - 1;
00575         else
00576             lowstringno = stringno + 1;
00577     }
00578     m_laststringch = static_cast<unsigned int>(-1);
00579     return 0;           /* Not a string character */
00580 }
00581 
00582 /* MACROS CONVERTED TO FUNCTIONS
00583 ** These macros are similar to the ones above, but they take into account
00584 ** the possibility of string characters.  Note well that they take a POINTER,
00585 ** not a character.
00586 **
00587 ** The "l_" versions set "len" to the length of the string character as a
00588 ** handy side effect.  (Note that the global "laststringch" is also set,
00589 ** and sometimes used, by these macros.)
00590 **
00591 ** The "l1_" versions go one step further and guarantee that the "len"
00592 ** field is valid for *all* characters, being set to 1 even if the macro
00593 ** returns false.  This macro is a great example of how NOT to write
00594 ** readable C.
00595 */
/*
 * True when ptr points at a string character (see stringcharlen()).
 * The lead byte is widened through unsigned char, as l1_isstringch
 * does: isstringstart() takes an ichar_t and rejects values >= SET_SIZE,
 * so where plain char is signed a byte >= 0x80 passed directly would
 * never be recognised.
 */
#define isstringch(ptr, canon)  \
                (isstringstart (static_cast<unsigned char>(*(ptr))) \
                  &&  stringcharlen ((ptr), (canon)) > 0)
00598 /*
00599 int isstringch(char *ptr, int canon) {
00600     return (isstringstart (*(ptr)) && (len = stringcharlen ((ptr), (canon))) > 0);
00601 }
00602 */
00603 
/*
 * Like isstringch, but also stores the result of stringcharlen() in len
 * (len is left untouched when the lead byte is not a string start).
 * The lead byte is widened through unsigned char, as l1_isstringch
 * does, so that bytes >= 0x80 are not turned into out-of-range ichar_t
 * values where plain char is signed.
 */
#define l_isstringch(ptr, len, canon)   \
                (isstringstart (static_cast<unsigned char>(*(ptr))) \
                  &&  (len = stringcharlen ((ptr), (canon))) \
                    > 0)
00608 /*
00609 int l_isstringch(char *ptr, int len, int canon) {
00610     return (isstringstart (*(ptr)) &&  (len = stringcharlen ((ptr), (canon))) > 0);
00611 }
00612 */
00613 
/*
 * Like l_isstringch, but len is always left valid: it holds the length
 * reported by stringcharlen() when the macro yields true, and 1 in every
 * case where it yields false.
 */
#define l1_isstringch(ptr, len, canon)  \
                (len = 1, \
                  (isstringstart (static_cast<unsigned char>(*(ptr))) \
                    &&  (len = stringcharlen ((ptr), (canon))) > 0) \
                      ? 1 : (len = 1, 0))
00621 /*
00622 int l1_isstringch(char *ptr, int len, int canon) {
00623     return (len = 1, isstringstart ((unsigned char)(*(ptr))) &&  
00624            ((len = stringcharlen ((ptr), (canon))) > 0 ? 1 : (len = 1, 0)));
00625 }
00626 */
00627 
00628 /*** END MACRO CONVERSION ***/
00629 
00641 int
00642 ISpellChecker::strtoichar (ichar_t *out, char *in, int outlen, int canonical)
00643 {
00644     register int len = 1;       /* Length of next character */
00645 
00646     outlen /= sizeof (ichar_t);     /* Convert to an ichar_t count */
00647     for (  ;  --outlen > 0  &&  *in != '\0';  in += len)
00648     {
00649         if (l1_isstringch (in, len , canonical)) {
00650             *out++ = SET_SIZE + m_laststringch;
00651         } else {
00652             *out++ = (unsigned char)( *in );
00653         }
00654     }
00655     *out = 0;
00656     return outlen <= 0;
00657 }
00658 
00674 int
00675 ISpellChecker::ichartostr ( char *out, ichar_t *in, int outlen, int canonical)
00676 {
00677     register int    ch;     /* Next character to store */
00678     register int    i;      /* Index into duplicates list */
00679     register char * scharp;     /* Pointer into a string char */
00680 
00681     while (--outlen > 0  &&  (ch = *in++) != 0)
00682     {
00683         if (ch < SET_SIZE)
00684             *out++ = static_cast<char>(ch);
00685         else
00686         {
00687             ch -= SET_SIZE;
00688             if (!canonical)
00689             {
00690                 for (i = m_hashheader.nstrchars;  --i >= 0;  )
00691                 {
00692                     if (m_hashheader.dupnos[i] == m_defdupchar
00693                       &&  (static_cast<int>(m_hashheader.stringdups[i])) == ch)
00694                     {
00695                         ch = i;
00696                         break;
00697                     }
00698                 }
00699             }
00700             scharp = m_hashheader.stringchars[static_cast<unsigned>(ch)];
00701             while ((*out++ = *scharp++) != '\0')
00702                 ;
00703             out--;
00704         }
00705     }
00706     *out = '\0';
00707     return outlen <= 0;
00708 }
00709 
00718 ichar_t *
00719 ISpellChecker::strtosichar ( char *in, int canonical)
00720 {
00721     static ichar_t  out[STRTOSICHAR_SIZE / sizeof (ichar_t)];
00722 
00723     if (strtoichar (out, in, sizeof out, canonical))
00724         fprintf (stderr, WORD_TOO_LONG (in));
00725     return out;
00726 }
00727 
00736 char *
00737 ISpellChecker::ichartosstr (ichar_t *in, int canonical)
00738 {
00739     static char     out[ICHARTOSSTR_SIZE];
00740 
00741     if (ichartostr (out, in, sizeof out, canonical))
00742         fprintf (stderr, WORD_TOO_LONG (out));
00743     return out;
00744 }
00745 
00754 char *
00755 ISpellChecker::printichar (int in)
00756 {
00757     static char     out[MAXSTRINGCHARLEN + 1];
00758 
00759     if (in < SET_SIZE)
00760     {
00761         out[0] = static_cast<char>(in);
00762         out[1] = '\0';
00763     }
00764     else
00765         strcpy (out, m_hashheader.stringchars[static_cast<unsigned>(in) - SET_SIZE]);
00766     return out;
00767 }
00768 
00769 #ifndef ICHAR_IS_CHAR
00770 
00778 ichar_t *
00779 icharcpy (ichar_t *out, ichar_t *in)
00780 {
00781     ichar_t *       origout;    /* Copy of destination for return */
00782 
00783     origout = out;
00784     while ((*out++ = *in++) != 0)
00785         ;
00786     return origout;
00787 }
00788 
00796 int
00797 icharlen (ichar_t * in)
00798 {
00799     register int    len;        /* Length so far */
00800 
00801     for (len = 0;  *in++ != 0;  len++)
00802         ;
00803     return len;
00804 }
00805 
00814 int
00815 icharcmp (ichar_t * s1, ichar_t * s2)
00816 {
00817 
00818     while (*s1 != 0)
00819     {
00820         if (*s1++ != *s2++)
00821             return *--s1 - *--s2;
00822     }
00823     return *s1 - *s2;
00824 }
00825 
00835 int
00836 icharncmp (ichar_t *s1, ichar_t *s2, int n)
00837 {
00838 
00839     while (--n >= 0  &&  *s1 != 0)
00840     {
00841         if (*s1++ != *s2++)
00842             return *--s1 - *--s2;
00843     }
00844     if (n < 0)
00845         return 0;
00846     else
00847         return *s1 - *s2;
00848 }
00849 
00850 #endif /* ICHAR_IS_CHAR */
00851 
00852 /*
00853  * \param istate
00854  * \param name
00855  * \param searchnames
00856  * \param deformatter
00857  *
00858  * \return
00859  */
00860 int
00861 ISpellChecker::findfiletype (const char *name, int searchnames, int *deformatter)
00862 {
00863     char *      cp;     /* Pointer into suffix list */
00864     int         cplen;      /* Length of current suffix */
00865     register int    i;      /* Index into type table */
00866     int         len;        /* Length of the name */
00867 
00868     /*
00869      * Note:  for now, the deformatter is set to 1 for tex, 0 for nroff.
00870      * Further, we assume that it's one or the other, so that a test
00871      * for tex is sufficient.  This needs to be generalized.
00872      */
00873     len = strlen (name);
00874     if (searchnames)
00875     {
00876         for (i = 0;  i < m_hashheader.nstrchartype;  i++)
00877         {
00878             if (strcmp (name, m_chartypes[i].name) == 0)
00879             {
00880                 if (deformatter != NULL)
00881                     *deformatter =
00882                       (strcmp (m_chartypes[i].deformatter, "tex") == 0);
00883                 return i;
00884             }
00885         }
00886     }
00887     for (i = 0;  i < m_hashheader.nstrchartype;  i++)
00888     {
00889         for (cp = m_chartypes[i].suffixes;  *cp != '\0';  cp += cplen + 1)
00890         {
00891             cplen = strlen (cp);
00892             if (len >= cplen  &&  strcmp (&name[len - cplen], cp) == 0)
00893             {
00894                 if (deformatter != NULL)
00895                     *deformatter =
00896                       (strcmp (m_chartypes[i].deformatter, "tex") == 0);
00897                 return i;
00898             }
00899         }
00900     }
00901     return -1;
00902 }
00903 
00904 /*
00905     HACK: macros replaced with function implementations 
00906     so we could do a side-effect-free check for unicode
00907     characters which aren't in hashheader
00908 
00909     TODO: this is just a workaround to keep us from crashing. 
00910     more sophisticated logic needed here. 
00911 */
00912 char ISpellChecker::myupper(ichar_t c)
00913 {
00914     if (c < (SET_SIZE + MAXSTRINGCHARS))
00915         return m_hashheader.upperchars[c];
00916     else
00917         return 0;
00918 }
00919 
00920 char ISpellChecker::mylower(ichar_t c)
00921 {
00922     if (c < (SET_SIZE + MAXSTRINGCHARS))
00923         return m_hashheader.lowerchars[c];
00924     else
00925         return 0;
00926 }
00927 
00928 int myspace(ichar_t c)
00929 {
00930     return ((c > 0)  &&  (c < 0x80) &&  isspace(static_cast<unsigned char>(c)));
00931 }
00932 
00933 char ISpellChecker::iswordch(ichar_t c)
00934 {
00935     if (c < (SET_SIZE + MAXSTRINGCHARS))
00936         return m_hashheader.wordchars[c];
00937     else
00938         return 0;
00939 }
00940 
00941 char ISpellChecker::isboundarych(ichar_t c)
00942 {
00943     if (c < (SET_SIZE + MAXSTRINGCHARS))
00944         return m_hashheader.boundarychars[c];
00945     else
00946         return 0;
00947 }
00948 
00949 char ISpellChecker::isstringstart(ichar_t c)
00950 {
00951     if (c < (SET_SIZE))
00952         return m_hashheader.stringstarts[static_cast<unsigned char>(c)];
00953     else
00954         return 0;
00955 }
00956 
00957 ichar_t ISpellChecker::mytolower(ichar_t c)
00958 {
00959     if (c < (SET_SIZE + MAXSTRINGCHARS))
00960         return m_hashheader.lowerconv[c];
00961     else
00962         return c;
00963 }
00964 
00965 ichar_t ISpellChecker::mytoupper (ichar_t c)
00966 {
00967     if (c < (SET_SIZE + MAXSTRINGCHARS))
00968         return m_hashheader.upperconv[c];
00969     else
00970         return c;
00971 }
00972 
KDE Home | KDE Accessibility Home | Description of Access Keys