lexer.cpp

00001 // -*- c-basic-offset: 2 -*-
00002 /*
00003  *  This file is part of the KDE libraries
00004  *  Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
00005  *
00006  *  This library is free software; you can redistribute it and/or
00007  *  modify it under the terms of the GNU Library General Public
00008  *  License as published by the Free Software Foundation; either
00009  *  version 2 of the License, or (at your option) any later version.
00010  *
00011  *  This library is distributed in the hope that it will be useful,
00012  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
00013  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00014  *  Library General Public License for more details.
00015  *
00016  *  You should have received a copy of the GNU Library General Public License
00017  *  along with this library; see the file COPYING.LIB.  If not, write to
00018  *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
00019  *  Boston, MA 02110-1301, USA.
00020  *
00021  */
00022 
00023 #ifdef HAVE_CONFIG_H
00024 #include <config.h>
00025 #endif
00026 
00027 #include <ctype.h>
00028 #include <stdlib.h>
00029 #include <stdio.h>
00030 #include <string.h>
00031 #include <assert.h>
00032 
00033 #include "value.h"
00034 #include "object.h"
00035 #include "types.h"
00036 #include "interpreter.h"
00037 #include "nodes.h"
00038 #include "lexer.h"
00039 #include "identifier.h"
00040 #include "lookup.h"
00041 #include "internal.h"
00042 #include "dtoa.h"
00043 
00044 // we can't specify the namespace in yacc's C output, so do it here
00045 using namespace KJS;
00046 
00047 static Lexer *currLexer = 0;
00048 
00049 #ifndef KDE_USE_FINAL
00050 #include "grammar.h"
00051 #endif
00052 
00053 #include "lexer.lut.h"
00054 
00055 extern YYLTYPE yylloc; // global bison variable holding token info
00056 
00057 // a bridge for yacc from the C world to C++
00058 int kjsyylex()
00059 {
00060   return Lexer::curr()->lex();
00061 }
00062 
00063 Lexer::Lexer()
00064   : yylineno(1),
00065     size8(128), size16(128), restrKeyword(false),
00066     convertNextIdentifier(false), stackToken(-1), lastToken(-1), pos(0),
00067     code(0), length(0),
00068 #ifndef KJS_PURE_ECMA
00069     bol(true),
00070 #endif
00071     current(0), next1(0), next2(0), next3(0),
00072     strings(0), numStrings(0), stringsCapacity(0),
00073     identifiers(0), numIdentifiers(0), identifiersCapacity(0)
00074 {
00075   // allocate space for read buffers
00076   buffer8 = new char[size8];
00077   buffer16 = new UChar[size16];
00078   currLexer = this;
00079 }
00080 
00081 Lexer::~Lexer()
00082 {
00083   delete [] buffer8;
00084   delete [] buffer16;
00085 }
00086 
00087 Lexer *Lexer::curr()
00088 {
00089   if (!currLexer) {
00090     // create singleton instance
00091     currLexer = new Lexer();
00092   }
00093   return currLexer;
00094 }
00095 
00096 #ifdef KJS_DEBUG_MEM
00097 void Lexer::globalClear()
00098 {
00099   delete currLexer;
00100   currLexer = 0L;
00101 }
00102 #endif
00103 
00104 void Lexer::setCode(const UChar *c, unsigned int len)
00105 {
00106   yylineno = 1;
00107   restrKeyword = false;
00108   delimited = false;
00109   convertNextIdentifier = false;
00110   stackToken = -1;
00111   lastToken = -1;
00112   foundBad = false;
00113   pos = 0;
00114   code = c;
00115   length = len;
00116   skipLF = false;
00117   skipCR = false;
00118 #ifndef KJS_PURE_ECMA
00119   bol = true;
00120 #endif
00121 
00122   // read first characters
00123   current = (length > 0) ? code[0].uc : -1;
00124   next1 = (length > 1) ? code[1].uc : -1;
00125   next2 = (length > 2) ? code[2].uc : -1;
00126   next3 = (length > 3) ? code[3].uc : -1;
00127 }
00128 
00129 void Lexer::shift(unsigned int p)
00130 {
00131   while (p--) {
00132     pos++;
00133     current = next1;
00134     next1 = next2;
00135     next2 = next3;
00136     next3 = (pos + 3 < length) ? code[pos+3].uc : -1;
00137   }
00138 }
00139 
00140 // called on each new line
00141 void Lexer::nextLine()
00142 {
00143   yylineno++;
00144 #ifndef KJS_PURE_ECMA
00145   bol = true;
00146 #endif
00147 }
00148 
00149 void Lexer::setDone(State s)
00150 {
00151   state = s;
00152   done = true;
00153 }
00154 
00155 int Lexer::lex()
00156 {
00157   int token = 0;
00158   state = Start;
00159   unsigned short stringType = 0; // either single or double quotes
00160   pos8 = pos16 = 0;
00161   done = false;
00162   terminator = false;
00163   skipLF = false;
00164   skipCR = false;
00165 
00166   // did we push a token on the stack previously ?
00167   // (after an automatic semicolon insertion)
00168   if (stackToken >= 0) {
00169     setDone(Other);
00170     token = stackToken;
00171     stackToken = 0;
00172   }
00173 
00174   while (!done) {
00175     if (skipLF && current != '\n') // found \r but not \n afterwards
00176         skipLF = false;
00177     if (skipCR && current != '\r') // found \n but not \r afterwards
00178         skipCR = false;
00179     if (skipLF || skipCR) // found \r\n or \n\r -> eat the second one
00180     {
00181         skipLF = false;
00182         skipCR = false;
00183         shift(1);
00184     }
00185 
00186     bool cr = (current == '\r');
00187     bool lf = (current == '\n');
00188     if (cr)
00189       skipLF = true;
00190     else if (lf)
00191       skipCR = true;
00192     bool isLineTerminator = cr || lf;
00193 
00194     switch (state) {
00195     case Start:
00196       if (isWhiteSpace(current)) {
00197         // do nothing
00198       } else if (current == '/' && next1 == '/') {
00199         shift(1);
00200         state = InSingleLineComment;
00201       } else if (current == '/' && next1 == '*') {
00202         shift(1);
00203         state = InMultiLineComment;
00204       } else if (current == -1) {
00205         if (!terminator && !delimited) {
00206           // automatic semicolon insertion if program incomplete
00207           token = ';';
00208           stackToken = 0;
00209           setDone(Other);
00210         } else
00211           setDone(Eof);
00212       } else if (isLineTerminator) {
00213         nextLine();
00214         terminator = true;
00215         if (restrKeyword) {
00216           token = ';';
00217           setDone(Other);
00218         }
00219       } else if (current == '"' || current == '\'') {
00220         state = InString;
00221         stringType = current;
00222       } else if (isIdentLetter(current)) {
00223         record16(current);
00224         state = InIdentifierOrKeyword;
00225       } else if (current == '\\') {
00226         state = InIdentifierUnicodeEscapeStart;
00227       } else if (current == '0') {
00228         record8(current);
00229         state = InNum0;
00230       } else if (isDecimalDigit(current)) {
00231         record8(current);
00232         state = InNum;
00233       } else if (current == '.' && isDecimalDigit(next1)) {
00234         record8(current);
00235         state = InDecimal;
00236 #ifndef KJS_PURE_ECMA
00237         // <!-- marks the beginning of a line comment (for www usage)
00238       } else if (current == '<' && next1 == '!' &&
00239                  next2 == '-' && next3 == '-') {
00240         shift(3);
00241         state = InSingleLineComment;
00242         // same for -->
00243       } else if (bol && current == '-' && next1 == '-' &&  next2 == '>') {
00244         shift(2);
00245         state = InSingleLineComment;
00246 #endif
00247       } else {
00248         token = matchPunctuator(current, next1, next2, next3);
00249         if (token != -1) {
00250           setDone(Other);
00251         } else {
00252           //      cerr << "encountered unknown character" << endl;
00253           setDone(Bad);
00254         }
00255       }
00256       break;
00257     case InString:
00258       if (current == stringType) {
00259         shift(1);
00260         setDone(String);
00261       } else if (current == -1 || isLineTerminator) {
00262         setDone(Bad);
00263       } else if (current == '\\') {
00264         state = InEscapeSequence;
00265       } else {
00266         record16(current);
00267       }
00268       break;
00269     // Escape Sequences inside of strings
00270     case InEscapeSequence:
00271       if (isOctalDigit(current)) {
00272         if (current >= '0' && current <= '3' &&
00273             isOctalDigit(next1) && isOctalDigit(next2)) {
00274           record16(convertOctal(current, next1, next2));
00275           shift(2);
00276           state = InString;
00277         } else if (isOctalDigit(current) && isOctalDigit(next1)) {
00278           record16(convertOctal('0', current, next1));
00279           shift(1);
00280           state = InString;
00281         } else if (isOctalDigit(current)) {
00282           record16(convertOctal('0', '0', current));
00283           state = InString;
00284         } else {
00285           setDone(Bad);
00286         }
00287       } else if (current == 'x')
00288         state = InHexEscape;
00289       else if (current == 'u')
00290         state = InUnicodeEscape;
00291       else {
00292     if (isLineTerminator)
00293       nextLine();
00294         record16(singleEscape(current));
00295         state = InString;
00296       }
00297       break;
00298     case InHexEscape:
00299       if (isHexDigit(current) && isHexDigit(next1)) {
00300         state = InString;
00301         record16(convertHex(current, next1));
00302         shift(1);
00303       } else if (current == stringType) {
00304         record16('x');
00305         shift(1);
00306         setDone(String);
00307       } else {
00308         record16('x');
00309         record16(current);
00310         state = InString;
00311       }
00312       break;
00313     case InUnicodeEscape:
00314       if (isHexDigit(current) && isHexDigit(next1) &&
00315           isHexDigit(next2) && isHexDigit(next3)) {
00316         record16(convertUnicode(current, next1, next2, next3));
00317         shift(3);
00318         state = InString;
00319       } else if (current == stringType) {
00320         record16('u');
00321         shift(1);
00322         setDone(String);
00323       } else {
00324         setDone(Bad);
00325       }
00326       break;
00327     case InSingleLineComment:
00328       if (isLineTerminator) {
00329         nextLine();
00330         terminator = true;
00331         if (restrKeyword) {
00332           token = ';';
00333           setDone(Other);
00334         } else
00335           state = Start;
00336       } else if (current == -1) {
00337         setDone(Eof);
00338       }
00339       break;
00340     case InMultiLineComment:
00341       if (current == -1) {
00342         setDone(Bad);
00343       } else if (isLineTerminator) {
00344         nextLine();
00345       } else if (current == '*' && next1 == '/') {
00346         state = Start;
00347         shift(1);
00348       }
00349       break;
00350     case InIdentifierOrKeyword:
00351     case InIdentifier:
00352       if (isIdentLetter(current) || isDecimalDigit(current))
00353         record16(current);
00354       else if (current == '\\')
00355         state = InIdentifierUnicodeEscapeStart;
00356       else
00357         setDone(state == InIdentifierOrKeyword ? IdentifierOrKeyword : Identifier);
00358       break;
00359     case InNum0:
00360       if (current == 'x' || current == 'X') {
00361         record8(current);
00362         state = InHex;
00363       } else if (current == '.') {
00364         record8(current);
00365         state = InDecimal;
00366       } else if (current == 'e' || current == 'E') {
00367         record8(current);
00368         state = InExponentIndicator;
00369       } else if (isOctalDigit(current)) {
00370         record8(current);
00371         state = InOctal;
00372       } else if (isDecimalDigit(current)) {
00373         record8(current);
00374         state = InDecimal;
00375       } else {
00376         setDone(Number);
00377       }
00378       break;
00379     case InHex:
00380       if (isHexDigit(current)) {
00381         record8(current);
00382       } else {
00383         setDone(Hex);
00384       }
00385       break;
00386     case InOctal:
00387       if (isOctalDigit(current)) {
00388         record8(current);
00389       }
00390       else if (isDecimalDigit(current)) {
00391         record8(current);
00392         state = InDecimal;
00393       } else
00394         setDone(Octal);
00395       break;
00396     case InNum:
00397       if (isDecimalDigit(current)) {
00398         record8(current);
00399       } else if (current == '.') {
00400         record8(current);
00401         state = InDecimal;
00402       } else if (current == 'e' || current == 'E') {
00403         record8(current);
00404         state = InExponentIndicator;
00405       } else
00406         setDone(Number);
00407       break;
00408     case InDecimal:
00409       if (isDecimalDigit(current)) {
00410         record8(current);
00411       } else if (current == 'e' || current == 'E') {
00412         record8(current);
00413         state = InExponentIndicator;
00414       } else
00415         setDone(Number);
00416       break;
00417     case InExponentIndicator:
00418       if (current == '+' || current == '-') {
00419         record8(current);
00420       } else if (isDecimalDigit(current)) {
00421         record8(current);
00422         state = InExponent;
00423       } else
00424         setDone(Bad);
00425       break;
00426     case InExponent:
00427       if (isDecimalDigit(current)) {
00428         record8(current);
00429       } else
00430         setDone(Number);
00431       break;
00432     case InIdentifierUnicodeEscapeStart:
00433       if (current == 'u')
00434         state = InIdentifierUnicodeEscape;
00435       else
00436         setDone(Bad);
00437       break;
00438     case InIdentifierUnicodeEscape:
00439       if (isHexDigit(current) && isHexDigit(next1) && isHexDigit(next2) && isHexDigit(next3)) {
00440         record16(convertUnicode(current, next1, next2, next3));
00441         shift(3);
00442         state = InIdentifier;
00443       } else {
00444         setDone(Bad);
00445       }
00446       break;
00447     default:
00448       assert(!"Unhandled state in switch statement");
00449     }
00450 
00451     // move on to the next character
00452     if (!done)
00453       shift(1);
00454 #ifndef KJS_PURE_ECMA
00455     if (state != Start && state != InSingleLineComment)
00456       bol = false;
00457 #endif
00458   }
00459 
00460   // no identifiers allowed directly after numeric literal, e.g. "3in" is bad
00461   if ((state == Number || state == Octal || state == Hex)
00462       && isIdentLetter(current))
00463     state = Bad;
00464 
00465   // terminate string
00466   buffer8[pos8] = '\0';
00467 
00468 #ifdef KJS_DEBUG_LEX
00469   fprintf(stderr, "line: %d ", lineNo());
00470   fprintf(stderr, "yytext (%x): ", buffer8[0]);
00471   fprintf(stderr, "%s ", buffer8);
00472 #endif
00473 
00474   long double dval = 0;
00475   if (state == Number) {
00476     dval = kjs_strtod(buffer8, 0L);
00477   } else if (state == Hex) { // scan hex numbers
00478     dval = 0;
00479     if (buffer8[0] == '0' && (buffer8[1] == 'x' || buffer8[1] == 'X')) {
00480       for (const char *p = buffer8+2; *p; p++) {
00481     if (!isHexDigit(*p)) {
00482       dval = 0;
00483       break;
00484     }
00485     dval = dval * 16 + convertHex(*p);
00486       }
00487     }
00488     state = Number;
00489   } else if (state == Octal) {   // scan octal number
00490     dval = 0;
00491     if (buffer8[0] == '0') {
00492       for (const char *p = buffer8+1; *p; p++) {
00493     if (*p < '0' || *p > '7') {
00494       dval = 0;
00495       break;
00496     }
00497     dval = dval * 8 + *p - '0';
00498       }
00499     }
00500     state = Number;
00501   }
00502 
00503 #ifdef KJS_DEBUG_LEX
00504   switch (state) {
00505   case Eof:
00506     printf("(EOF)\n");
00507     break;
00508   case Other:
00509     printf("(Other)\n");
00510     break;
00511   case Identifier:
00512   case IdentifierOrKeyword:
00513     printf("(Identifier)/(Keyword)\n");
00514     break;
00515   case String:
00516     printf("(String)\n");
00517     break;
00518   case Number:
00519     printf("(Number)\n");
00520     break;
00521   default:
00522     printf("(unknown)");
00523   }
00524 #endif
00525 
00526   if (state != Identifier && state != IdentifierOrKeyword &&
00527       convertNextIdentifier)
00528     convertNextIdentifier = false;
00529 
00530   restrKeyword = false;
00531   delimited = false;
00532   kjsyylloc.first_line = yylineno; // ???
00533   kjsyylloc.last_line = yylineno;
00534 
00535   switch (state) {
00536   case Eof:
00537     token = 0;
00538     break;
00539   case Other:
00540     if(token == '}' || token == ';') {
00541       delimited = true;
00542     }
00543     break;
00544   case IdentifierOrKeyword:
00545     if ((token = Lookup::find(&mainTable, buffer16, pos16)) < 0) {
00546   case Identifier:
00547       // Lookup for keyword failed, means this is an identifier
00548       // Apply anonymous-function hack below (convert the identifier)
00549       if (convertNextIdentifier) {
00550         convertNextIdentifier = false;
00551 #ifdef KJS_VERBOSE
00552         UString debugstr(buffer16, pos16); fprintf(stderr,"Anonymous function hack: eating identifier %s\n",debugstr.ascii());
00553 #endif
00554     token = FUNCEXPRIDENT;
00555       } else {
00556     token = IDENT;
00557       }
00558       /* TODO: close leak on parse error. same holds true for String */
00559       kjsyylval.ident = makeIdentifier(buffer16, pos16);
00560       break;
00561     }
00562 
00563     convertNextIdentifier = false;
00564     // Hack for "f = function somename() { ... }", too hard to get into the grammar
00565     // Same for building an array with function pointers ( 'name', func1, 'name2', func2 )
00566     // There are lots of other uses, we really have to get this into the grammar
00567     if ( token == FUNCTION &&
00568          ( lastToken == '=' || lastToken == ',' || lastToken == '(' ||
00569        lastToken == ':' || lastToken == RETURN ) )
00570             convertNextIdentifier = true;
00571 
00572     if (token == CONTINUE || token == BREAK ||
00573         token == RETURN || token == THROW)
00574       restrKeyword = true;
00575     break;
00576   case String:
00577     kjsyylval.ustr = makeUString(buffer16, pos16);
00578     token = STRING;
00579     break;
00580   case Number:
00581     kjsyylval.dval = dval;
00582     token = NUMBER;
00583     break;
00584   case Bad:
00585     foundBad = true;
00586     return -1;
00587   default:
00588     assert(!"unhandled numeration value in switch");
00589     return -1;
00590   }
00591   lastToken = token;
00592   return token;
00593 }
00594 
00595 bool Lexer::isWhiteSpace(unsigned short c)
00596 {
00597   return (c == ' ' || c == '\t' ||
00598           c == 0x0b || c == 0x0c || c == 0xa0);
00599 }
00600 
00601 bool Lexer::isIdentLetter(unsigned short c)
00602 {
00603   // Allow any character in the Unicode categories
00604   // Uppercase letter (Lu), Lowercase letter (Ll),
00605   // Titlecase letter (Lt)", Modifier letter (Lm),
00606   // Other letter (Lo), or Letter number (Nl).
00607   // Also see: http://www.unicode.org/Public/UNIDATA/UnicodeData.txt */
00608   return (c >= 'a' && c <= 'z' ||
00609           c >= 'A' && c <= 'Z' ||
00610           // A with grave - O with diaeresis
00611           c >= 0x00c0 && c <= 0x00d6 ||
00612           // O with stroke - o with diaeresis
00613           c >= 0x00d8 && c <= 0x00f6 ||
00614           // o with stroke - turned h with fishook and tail
00615           c >= 0x00f8 && c <= 0x02af ||
00616           // Greek etc. TODO: not precise
00617           c >= 0x0388 && c <= 0x1ffc ||
00618           c == '$' || c == '_');
00619   /* TODO: use complete category table */
00620 }
00621 
00622 bool Lexer::isDecimalDigit(unsigned short c)
00623 {
00624   return (c >= '0' && c <= '9');
00625 }
00626 
00627 bool Lexer::isHexDigit(unsigned short c)
00628 {
00629   return (c >= '0' && c <= '9' ||
00630           c >= 'a' && c <= 'f' ||
00631           c >= 'A' && c <= 'F');
00632 }
00633 
00634 bool Lexer::isOctalDigit(unsigned short c)
00635 {
00636   return (c >= '0' && c <= '7');
00637 }
00638 
00639 int Lexer::matchPunctuator(unsigned short c1, unsigned short c2,
00640                               unsigned short c3, unsigned short c4)
00641 {
00642   if (c1 == '>' && c2 == '>' && c3 == '>' && c4 == '=') {
00643     shift(4);
00644     return URSHIFTEQUAL;
00645   } else if (c1 == '=' && c2 == '=' && c3 == '=') {
00646     shift(3);
00647     return STREQ;
00648   } else if (c1 == '!' && c2 == '=' && c3 == '=') {
00649     shift(3);
00650     return STRNEQ;
00651    } else if (c1 == '>' && c2 == '>' && c3 == '>') {
00652     shift(3);
00653     return URSHIFT;
00654   } else if (c1 == '<' && c2 == '<' && c3 == '=') {
00655     shift(3);
00656     return LSHIFTEQUAL;
00657   } else if (c1 == '>' && c2 == '>' && c3 == '=') {
00658     shift(3);
00659     return RSHIFTEQUAL;
00660   } else if (c1 == '<' && c2 == '=') {
00661     shift(2);
00662     return LE;
00663   } else if (c1 == '>' && c2 == '=') {
00664     shift(2);
00665     return GE;
00666   } else if (c1 == '!' && c2 == '=') {
00667     shift(2);
00668     return NE;
00669   } else if (c1 == '+' && c2 == '+') {
00670     shift(2);
00671     if (terminator)
00672       return AUTOPLUSPLUS;
00673     else
00674       return PLUSPLUS;
00675   } else if (c1 == '-' && c2 == '-') {
00676     shift(2);
00677     if (terminator)
00678       return AUTOMINUSMINUS;
00679     else
00680       return MINUSMINUS;
00681   } else if (c1 == '=' && c2 == '=') {
00682     shift(2);
00683     return EQEQ;
00684   } else if (c1 == '+' && c2 == '=') {
00685     shift(2);
00686     return PLUSEQUAL;
00687   } else if (c1 == '-' && c2 == '=') {
00688     shift(2);
00689     return MINUSEQUAL;
00690   } else if (c1 == '*' && c2 == '=') {
00691     shift(2);
00692     return MULTEQUAL;
00693   } else if (c1 == '/' && c2 == '=') {
00694     shift(2);
00695     return DIVEQUAL;
00696   } else if (c1 == '&' && c2 == '=') {
00697     shift(2);
00698     return ANDEQUAL;
00699   } else if (c1 == '^' && c2 == '=') {
00700     shift(2);
00701     return XOREQUAL;
00702   } else if (c1 == '%' && c2 == '=') {
00703     shift(2);
00704     return MODEQUAL;
00705   } else if (c1 == '|' && c2 == '=') {
00706     shift(2);
00707     return OREQUAL;
00708   } else if (c1 == '<' && c2 == '<') {
00709     shift(2);
00710     return LSHIFT;
00711   } else if (c1 == '>' && c2 == '>') {
00712     shift(2);
00713     return RSHIFT;
00714   } else if (c1 == '&' && c2 == '&') {
00715     shift(2);
00716     return AND;
00717   } else if (c1 == '|' && c2 == '|') {
00718     shift(2);
00719     return OR;
00720   }
00721 
00722   switch(c1) {
00723     case '=':
00724     case '>':
00725     case '<':
00726     case ',':
00727     case '!':
00728     case '~':
00729     case '?':
00730     case ':':
00731     case '.':
00732     case '+':
00733     case '-':
00734     case '*':
00735     case '/':
00736     case '&':
00737     case '|':
00738     case '^':
00739     case '%':
00740     case '(':
00741     case ')':
00742     case '{':
00743     case '}':
00744     case '[':
00745     case ']':
00746     case ';':
00747       shift(1);
00748       return static_cast<int>(c1);
00749     default:
00750       return -1;
00751   }
00752 }
00753 
00754 unsigned short Lexer::singleEscape(unsigned short c) const
00755 {
00756   switch(c) {
00757   case 'b':
00758     return 0x08;
00759   case 't':
00760     return 0x09;
00761   case 'n':
00762     return 0x0A;
00763   case 'v':
00764     return 0x0B;
00765   case 'f':
00766     return 0x0C;
00767   case 'r':
00768     return 0x0D;
00769   case '"':
00770     return 0x22;
00771   case '\'':
00772     return 0x27;
00773   case '\\':
00774     return 0x5C;
00775   default:
00776     return c;
00777   }
00778 }
00779 
00780 unsigned short Lexer::convertOctal(unsigned short c1, unsigned short c2,
00781                                       unsigned short c3) const
00782 {
00783   return ((c1 - '0') * 64 + (c2 - '0') * 8 + c3 - '0');
00784 }
00785 
00786 unsigned char Lexer::convertHex(unsigned short c)
00787 {
00788   if (c >= '0' && c <= '9')
00789     return (c - '0');
00790   else if (c >= 'a' && c <= 'f')
00791     return (c - 'a' + 10);
00792   else
00793     return (c - 'A' + 10);
00794 }
00795 
00796 unsigned char Lexer::convertHex(unsigned short c1, unsigned short c2)
00797 {
00798   return ((convertHex(c1) << 4) + convertHex(c2));
00799 }
00800 
00801 UChar Lexer::convertUnicode(unsigned short c1, unsigned short c2,
00802                                      unsigned short c3, unsigned short c4)
00803 {
00804   return UChar((convertHex(c1) << 4) + convertHex(c2),
00805                (convertHex(c3) << 4) + convertHex(c4));
00806 }
00807 
00808 void Lexer::record8(unsigned short c)
00809 {
00810   assert(c <= 0xff);
00811 
00812   // enlarge buffer if full
00813   if (pos8 >= size8 - 1) {
00814     char *tmp = new char[2 * size8];
00815     memcpy(tmp, buffer8, size8 * sizeof(char));
00816     delete [] buffer8;
00817     buffer8 = tmp;
00818     size8 *= 2;
00819   }
00820 
00821   buffer8[pos8++] = (char) c;
00822 }
00823 
00824 void Lexer::record16(int c)
00825 {
00826   assert(c >= 0);
00827   //assert(c <= USHRT_MAX);
00828   record16(UChar(static_cast<unsigned short>(c)));
00829 }
00830 
00831 void Lexer::record16(UChar c)
00832 {
00833   // enlarge buffer if full
00834   if (pos16 >= size16 - 1) {
00835     UChar *tmp = new UChar[2 * size16];
00836     memcpy(tmp, buffer16, size16 * sizeof(UChar));
00837     delete [] buffer16;
00838     buffer16 = tmp;
00839     size16 *= 2;
00840   }
00841 
00842   buffer16[pos16++] = c;
00843 }
00844 
00845 bool Lexer::scanRegExp()
00846 {
00847   pos16 = 0;
00848   bool lastWasEscape = false;
00849   bool inBrackets = false;
00850 
00851   while (1) {
00852     if (current == '\r' || current == '\n' || current == -1)
00853       return false;
00854     else if (current != '/' || lastWasEscape == true || inBrackets == true)
00855     {
00856         // keep track of '[' and ']'
00857         if ( !lastWasEscape ) {
00858           if ( current == '[' && !inBrackets )
00859             inBrackets = true;
00860           if ( current == ']' && inBrackets )
00861             inBrackets = false;
00862         }
00863         record16(current);
00864         lastWasEscape =
00865             !lastWasEscape && (current == '\\');
00866     }
00867     else { // end of regexp
00868       pattern = UString(buffer16, pos16);
00869       pos16 = 0;
00870       shift(1);
00871       break;
00872     }
00873     shift(1);
00874   }
00875 
00876   while (isIdentLetter(current)) {
00877     record16(current);
00878     shift(1);
00879   }
00880   flags = UString(buffer16, pos16);
00881 
00882   return true;
00883 }
00884 
00885 
00886 void Lexer::doneParsing()
00887 {
00888   for (unsigned i = 0; i < numIdentifiers; i++) {
00889     delete identifiers[i];
00890   }
00891   free(identifiers);
00892   identifiers = 0;
00893   numIdentifiers = 0;
00894   identifiersCapacity = 0;
00895 
00896   for (unsigned i = 0; i < numStrings; i++) {
00897     delete strings[i];
00898   }
00899   free(strings);
00900   strings = 0;
00901   numStrings = 0;
00902   stringsCapacity = 0;
00903 }
00904 
00905 const int initialCapacity = 64;
00906 const int growthFactor = 2;
00907 
00908 Identifier *Lexer::makeIdentifier(UChar *buffer, unsigned int pos)
00909 {
00910   if (numIdentifiers == identifiersCapacity) {
00911     identifiersCapacity = (identifiersCapacity == 0) ? initialCapacity : identifiersCapacity *growthFactor;
00912     identifiers = (KJS::Identifier **)realloc(identifiers, sizeof(KJS::Identifier *) * identifiersCapacity);
00913   }
00914 
00915   KJS::Identifier *identifier = new KJS::Identifier(buffer, pos);
00916   identifiers[numIdentifiers++] = identifier;
00917   return identifier;
00918 }
00919 
00920 UString *Lexer::makeUString(UChar *buffer, unsigned int pos)
00921 {
00922   if (numStrings == stringsCapacity) {
00923     stringsCapacity = (stringsCapacity == 0) ? initialCapacity : stringsCapacity *growthFactor;
00924     strings = (UString **)realloc(strings, sizeof(UString *) * stringsCapacity);
00925   }
00926 
00927   UString *string = new UString(buffer, pos);
00928   strings[numStrings++] = string;
00929   return string;
00930 }
KDE Home | KDE Accessibility Home | Description of Access Keys