Frames | No Frames |
1: /* StreamTokenizer.java -- parses streams of characters into tokens 2: Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003 Free Software Foundation 3: 4: This file is part of GNU Classpath. 5: 6: GNU Classpath is free software; you can redistribute it and/or modify 7: it under the terms of the GNU General Public License as published by 8: the Free Software Foundation; either version 2, or (at your option) 9: any later version. 10: 11: GNU Classpath is distributed in the hope that it will be useful, but 12: WITHOUT ANY WARRANTY; without even the implied warranty of 13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14: General Public License for more details. 15: 16: You should have received a copy of the GNU General Public License 17: along with GNU Classpath; see the file COPYING. If not, write to the 18: Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 19: 02110-1301 USA. 20: 21: Linking this library statically or dynamically with other modules is 22: making a combined work based on this library. Thus, the terms and 23: conditions of the GNU General Public License cover the whole 24: combination. 25: 26: As a special exception, the copyright holders of this library give you 27: permission to link this library with independent modules to produce an 28: executable, regardless of the license terms of these independent 29: modules, and to copy and distribute the resulting executable under 30: terms of your choice, provided that you also meet, for each linked 31: independent module, the terms and conditions of the license of that 32: module. An independent module is a module which is not derived from 33: or based on this library. If you modify this library, you may extend 34: this exception to your version of the library, but you are not 35: obligated to do so. If you do not wish to do so, delete this 36: exception statement from your version. */ 37: 38: package java.io; 39: 40: import gnu.java.lang.CPStringBuilder; 41: 42: /** 43: * This class parses streams of characters into tokens. There are a 44: * million-zillion flags that can be set to control the parsing, as 45: * described under the various method headings. 46: * 47: * @author Warren Levy (warrenl@cygnus.com) 48: * @date October 25, 1998. 49: */ 50: /* Written using "Java Class Libraries", 2nd edition, ISBN 0-201-31002-3 51: * "The Java Language Specification", ISBN 0-201-63451-1 52: * plus online API docs for JDK 1.2 beta from http://www.javasoft.com. 53: * Status: Believed complete and correct. 54: */ 55: 56: public class StreamTokenizer 57: { 58: /** A constant indicating that the end of the stream has been read. */ 59: public static final int TT_EOF = -1; 60: 61: /** A constant indicating that the end of the line has been read. */ 62: public static final int TT_EOL = '\n'; 63: 64: /** A constant indicating that a number token has been read. */ 65: public static final int TT_NUMBER = -2; 66: 67: /** A constant indicating that a word token has been read. */ 68: public static final int TT_WORD = -3; 69: 70: /** A constant indicating that no tokens have been read yet. */ 71: private static final int TT_NONE = -4; 72: 73: /** 74: * Contains the type of the token read resulting from a call to nextToken 75: * The rules are as follows: 76: * <ul> 77: * <li>For a token consisting of a single ordinary character, this is the 78: * value of that character.</li> 79: * <li>For a quoted string, this is the value of the quote character</li> 80: * <li>For a word, this is TT_WORD</li> 81: * <li>For a number, this is TT_NUMBER</li> 82: * <li>For the end of the line, this is TT_EOL</li> 83: * <li>For the end of the stream, this is TT_EOF</li> 84: * </ul> 85: */ 86: public int ttype = TT_NONE; 87: 88: /** The String associated with word and string tokens. */ 89: public String sval; 90: 91: /** The numeric value associated with number tokens. */ 92: public double nval; 93: 94: /* Indicates whether end-of-line is recognized as a token. */ 95: private boolean eolSignificant = false; 96: 97: /* Indicates whether word tokens are automatically made lower case. */ 98: private boolean lowerCase = false; 99: 100: /* Indicates whether C++ style comments are recognized and skipped. */ 101: private boolean slashSlash = false; 102: 103: /* Indicates whether C style comments are recognized and skipped. */ 104: private boolean slashStar = false; 105: 106: /* Attribute tables of each byte from 0x00 to 0xFF. */ 107: private boolean[] whitespace = new boolean[256]; 108: private boolean[] alphabetic = new boolean[256]; 109: private boolean[] numeric = new boolean[256]; 110: private boolean[] quote = new boolean[256]; 111: private boolean[] comment = new boolean[256]; 112: 113: /* The Reader associated with this class. */ 114: private PushbackReader in; 115: 116: /* Indicates if a token has been pushed back. */ 117: private boolean pushedBack = false; 118: 119: /* Contains the current line number of the reader. */ 120: private int lineNumber = 1; 121: 122: /** 123: * This method reads bytes from an <code>InputStream</code> and tokenizes 124: * them. For details on how this method operates by default, see 125: * <code>StreamTokenizer(Reader)</code>. 126: * 127: * @param is The <code>InputStream</code> to read from 128: * 129: * @deprecated Since JDK 1.1. 130: */ 131: public StreamTokenizer(InputStream is) 132: { 133: this(new InputStreamReader(is)); 134: } 135: 136: /** 137: * This method initializes a new <code>StreamTokenizer</code> to read 138: * characters from a <code>Reader</code> and parse them. The char values 139: * have their hight bits masked so that the value is treated a character 140: * in the range of 0x0000 to 0x00FF. 141: * <p> 142: * This constructor sets up the parsing table to parse the stream in the 143: * following manner: 144: * <ul> 145: * <li>The values 'A' through 'Z', 'a' through 'z' and 0xA0 through 0xFF 146: * are initialized as alphabetic</li> 147: * <li>The values 0x00 through 0x20 are initialized as whitespace</li> 148: * <li>The values '\'' and '"' are initialized as quote characters</li> 149: * <li>'/' is a comment character</li> 150: * <li>Numbers will be parsed</li> 151: * <li>EOL is not treated as significant</li> 152: * <li>C and C++ (//) comments are not recognized</li> 153: * </ul> 154: * 155: * @param r The <code>Reader</code> to read chars from 156: */ 157: public StreamTokenizer(Reader r) 158: { 159: in = new PushbackReader(r); 160: 161: whitespaceChars(0x00, 0x20); 162: wordChars('A', 'Z'); 163: wordChars('a', 'z'); 164: wordChars(0xA0, 0xFF); 165: commentChar('/'); 166: quoteChar('\''); 167: quoteChar('"'); 168: parseNumbers(); 169: } 170: 171: /** 172: * This method sets the comment attribute on the specified 173: * character. Other attributes for the character are cleared. 174: * 175: * @param ch The character to set the comment attribute for, passed as an int 176: */ 177: public void commentChar(int ch) 178: { 179: if (ch >= 0 && ch <= 255) 180: { 181: comment[ch] = true; 182: whitespace[ch] = false; 183: alphabetic[ch] = false; 184: numeric[ch] = false; 185: quote[ch] = false; 186: } 187: } 188: 189: /** 190: * This method sets a flag that indicates whether or not the end of line 191: * sequence terminates and is a token. The defaults to <code>false</code> 192: * 193: * @param flag <code>true</code> if EOF is significant, <code>false</code> 194: * otherwise 195: */ 196: public void eolIsSignificant(boolean flag) 197: { 198: eolSignificant = flag; 199: } 200: 201: /** 202: * This method returns the current line number. Note that if the 203: * <code>pushBack()</code> method is called, it has no effect on the 204: * line number returned by this method. 205: * 206: * @return The current line number 207: */ 208: public int lineno() 209: { 210: return lineNumber; 211: } 212: 213: /** 214: * This method sets a flag that indicates whether or not alphabetic 215: * tokens that are returned should be converted to lower case. 216: * 217: * @param flag <code>true</code> to convert to lower case, 218: * <code>false</code> otherwise 219: */ 220: public void lowerCaseMode(boolean flag) 221: { 222: lowerCase = flag; 223: } 224: 225: private boolean isWhitespace(int ch) 226: { 227: return (ch >= 0 && ch <= 255 && whitespace[ch]); 228: } 229: 230: private boolean isAlphabetic(int ch) 231: { 232: return ((ch > 255) || (ch >= 0 && alphabetic[ch])); 233: } 234: 235: private boolean isNumeric(int ch) 236: { 237: return (ch >= 0 && ch <= 255 && numeric[ch]); 238: } 239: 240: private boolean isQuote(int ch) 241: { 242: return (ch >= 0 && ch <= 255 && quote[ch]); 243: } 244: 245: private boolean isComment(int ch) 246: { 247: return (ch >= 0 && ch <= 255 && comment[ch]); 248: } 249: 250: /** 251: * This method reads the next token from the stream. It sets the 252: * <code>ttype</code> variable to the appropriate token type and 253: * returns it. It also can set <code>sval</code> or <code>nval</code> 254: * as described below. The parsing strategy is as follows: 255: * <ul> 256: * <li>Skip any whitespace characters.</li> 257: * <li>If a numeric character is encountered, attempt to parse a numeric 258: * value. Leading '-' characters indicate a numeric only if followed by 259: * another non-'-' numeric. The value of the numeric token is terminated 260: * by either the first non-numeric encountered, or the second occurrence of 261: * '-' or '.'. The token type returned is TT_NUMBER and <code>nval</code> 262: * is set to the value parsed.</li> 263: * <li>If an alphabetic character is parsed, all subsequent characters 264: * are read until the first non-alphabetic or non-numeric character is 265: * encountered. The token type returned is TT_WORD and the value parsed 266: * is stored in <code>sval</code>. If lower case mode is set, the token 267: * stored in <code>sval</code> is converted to lower case. The end of line 268: * sequence terminates a word only if EOL signficance has been turned on. 269: * The start of a comment also terminates a word. Any character with a 270: * non-alphabetic and non-numeric attribute (such as white space, a quote, 271: * or a commet) are treated as non-alphabetic and terminate the word.</li> 272: * <li>If a comment character is parsed, then all remaining characters on 273: * the current line are skipped and another token is parsed. Any EOL or 274: * EOF's encountered are not discarded, but rather terminate the comment.</li> 275: * <li>If a quote character is parsed, then all characters up to the 276: * second occurrence of the same quote character are parsed into a 277: * <code>String</code>. This <code>String</code> is stored as 278: * <code>sval</code>, but is not converted to lower case, even if lower case 279: * mode is enabled. The token type returned is the value of the quote 280: * character encountered. Any escape sequences 281: * (\b (backspace), \t (HTAB), \n (linefeed), \f (form feed), \r 282: * (carriage return), \" (double quote), \' (single quote), \\ 283: * (backslash), \XXX (octal esacpe)) are converted to the appropriate 284: * char values. Invalid esacape sequences are left in untranslated. 285: * Unicode characters like ('\ u0000') are not recognized. </li> 286: * <li>If the C++ comment sequence "//" is encountered, and the parser 287: * is configured to handle that sequence, then the remainder of the line 288: * is skipped and another token is read exactly as if a character with 289: * the comment attribute was encountered.</li> 290: * <li>If the C comment sequence "/*" is encountered, and the parser 291: * is configured to handle that sequence, then all characters up to and 292: * including the comment terminator sequence are discarded and another 293: * token is parsed.</li> 294: * <li>If all cases above are not met, then the character is an ordinary 295: * character that is parsed as a token by itself. The char encountered 296: * is returned as the token type.</li> 297: * </ul> 298: * 299: * @return The token type 300: * @exception IOException If an I/O error occurs 301: */ 302: public int nextToken() throws IOException 303: { 304: if (pushedBack) 305: { 306: pushedBack = false; 307: if (ttype != TT_NONE) 308: return ttype; 309: } 310: 311: sval = null; 312: int ch; 313: 314: // Skip whitespace. Deal with EOL along the way. 315: while (isWhitespace(ch = in.read())) 316: if (ch == '\n' || ch == '\r') 317: { 318: lineNumber++; 319: 320: // Throw away \n if in combination with \r. 321: if (ch == '\r' && (ch = in.read()) != '\n') 322: { 323: if (ch != TT_EOF) 324: in.unread(ch); 325: } 326: if (eolSignificant) 327: return (ttype = TT_EOL); 328: } 329: 330: if (ch == '/') 331: if ((ch = in.read()) == '/' && slashSlash) 332: { 333: while ((ch = in.read()) != '\n' && ch != '\r' && ch != TT_EOF) 334: ; 335: 336: if (ch != TT_EOF) 337: in.unread(ch); 338: return nextToken(); // Recursive, but not too deep in normal cases 339: } 340: else if (ch == '*' && slashStar) 341: { 342: while (true) 343: { 344: ch = in.read(); 345: if (ch == '*') 346: { 347: if ((ch = in.read()) == '/') 348: break; 349: else if (ch != TT_EOF) 350: in.unread(ch); 351: } 352: else if (ch == '\n' || ch == '\r') 353: { 354: lineNumber++; 355: if (ch == '\r' && (ch = in.read()) != '\n') 356: { 357: if (ch != TT_EOF) 358: in.unread(ch); 359: } 360: } 361: else if (ch == TT_EOF) 362: { 363: break; 364: } 365: } 366: return nextToken(); // Recursive, but not too deep in normal cases 367: } 368: else 369: { 370: if (ch != TT_EOF) 371: in.unread(ch); 372: ch = '/'; 373: } 374: 375: if (ch == TT_EOF) 376: ttype = TT_EOF; 377: else if (isNumeric(ch)) 378: { 379: boolean isNegative = false; 380: if (ch == '-') 381: { 382: // Read ahead to see if this is an ordinary '-' rather than numeric. 383: ch = in.read(); 384: if (isNumeric(ch) && ch != '-') 385: { 386: isNegative = true; 387: } 388: else 389: { 390: if (ch != TT_EOF) 391: in.unread(ch); 392: return (ttype = '-'); 393: } 394: } 395: 396: CPStringBuilder tokbuf = new CPStringBuilder(); 397: tokbuf.append((char) ch); 398: 399: int decCount = 0; 400: while (isNumeric(ch = in.read()) && ch != '-') 401: if (ch == '.' && decCount++ > 0) 402: break; 403: else 404: tokbuf.append((char) ch); 405: 406: if (ch != TT_EOF) 407: in.unread(ch); 408: ttype = TT_NUMBER; 409: try 410: { 411: nval = Double.valueOf(tokbuf.toString()).doubleValue(); 412: } 413: catch (NumberFormatException _) 414: { 415: nval = 0.0; 416: } 417: if (isNegative) 418: nval = -nval; 419: } 420: else if (isAlphabetic(ch)) 421: { 422: CPStringBuilder tokbuf = new CPStringBuilder(); 423: tokbuf.append((char) ch); 424: while (isAlphabetic(ch = in.read()) || isNumeric(ch)) 425: tokbuf.append((char) ch); 426: if (ch != TT_EOF) 427: in.unread(ch); 428: ttype = TT_WORD; 429: sval = tokbuf.toString(); 430: if (lowerCase) 431: sval = sval.toLowerCase(); 432: } 433: else if (isComment(ch)) 434: { 435: while ((ch = in.read()) != '\n' && ch != '\r' && ch != TT_EOF) 436: ; 437: 438: if (ch != TT_EOF) 439: in.unread(ch); 440: return nextToken(); // Recursive, but not too deep in normal cases. 441: } 442: else if (isQuote(ch)) 443: { 444: ttype = ch; 445: CPStringBuilder tokbuf = new CPStringBuilder(); 446: while ((ch = in.read()) != ttype && ch != '\n' && ch != '\r' && 447: ch != TT_EOF) 448: { 449: if (ch == '\\') 450: switch (ch = in.read()) 451: { 452: case 'a': ch = 0x7; 453: break; 454: case 'b': ch = '\b'; 455: break; 456: case 'f': ch = 0xC; 457: break; 458: case 'n': ch = '\n'; 459: break; 460: case 'r': ch = '\r'; 461: break; 462: case 't': ch = '\t'; 463: break; 464: case 'v': ch = 0xB; 465: break; 466: case '\n': ch = '\n'; 467: break; 468: case '\r': ch = '\r'; 469: break; 470: case '\"': 471: case '\'': 472: case '\\': 473: break; 474: default: 475: int ch1, nextch; 476: if ((nextch = ch1 = ch) >= '0' && ch <= '7') 477: { 478: ch -= '0'; 479: if ((nextch = in.read()) >= '0' && nextch <= '7') 480: { 481: ch = ch * 8 + nextch - '0'; 482: if ((nextch = in.read()) >= '0' && nextch <= '7' && 483: ch1 >= '0' && ch1 <= '3') 484: { 485: ch = ch * 8 + nextch - '0'; 486: nextch = in.read(); 487: } 488: } 489: } 490: 491: if (nextch != TT_EOF) 492: in.unread(nextch); 493: } 494: 495: tokbuf.append((char) ch); 496: } 497: 498: // Throw away matching quote char. 499: if (ch != ttype && ch != TT_EOF) 500: in.unread(ch); 501: 502: sval = tokbuf.toString(); 503: } 504: else 505: { 506: ttype = ch; 507: } 508: 509: return ttype; 510: } 511: 512: private void resetChar(int ch) 513: { 514: whitespace[ch] = alphabetic[ch] = numeric[ch] = quote[ch] = comment[ch] = 515: false; 516: } 517: 518: /** 519: * This method makes the specified character an ordinary character. This 520: * means that none of the attributes (whitespace, alphabetic, numeric, 521: * quote, or comment) will be set on this character. This character will 522: * parse as its own token. 523: * 524: * @param ch The character to make ordinary, passed as an int 525: */ 526: public void ordinaryChar(int ch) 527: { 528: if (ch >= 0 && ch <= 255) 529: resetChar(ch); 530: } 531: 532: /** 533: * This method makes all the characters in the specified range, range 534: * terminators included, ordinary. This means the none of the attributes 535: * (whitespace, alphabetic, numeric, quote, or comment) will be set on 536: * any of the characters in the range. This makes each character in this 537: * range parse as its own token. 538: * 539: * @param low The low end of the range of values to set the whitespace 540: * attribute for 541: * @param hi The high end of the range of values to set the whitespace 542: * attribute for 543: */ 544: public void ordinaryChars(int low, int hi) 545: { 546: if (low < 0) 547: low = 0; 548: if (hi > 255) 549: hi = 255; 550: for (int i = low; i <= hi; i++) 551: resetChar(i); 552: } 553: 554: /** 555: * This method sets the numeric attribute on the characters '0' - '9' and 556: * the characters '.' and '-'. 557: * When this method is used, the result of giving other attributes 558: * (whitespace, quote, or comment) to the numeric characters may 559: * vary depending on the implementation. For example, if 560: * parseNumbers() and then whitespaceChars('1', '1') are called, 561: * this implementation reads "121" as 2, while some other implementation 562: * will read it as 21. 563: */ 564: public void parseNumbers() 565: { 566: for (int i = 0; i <= 9; i++) 567: numeric['0' + i] = true; 568: 569: numeric['.'] = true; 570: numeric['-'] = true; 571: } 572: 573: /** 574: * Puts the current token back into the StreamTokenizer so 575: * <code>nextToken</code> will return the same value on the next call. 576: * May cause the lineno method to return an incorrect value 577: * if lineno is called before the next call to nextToken. 578: */ 579: public void pushBack() 580: { 581: pushedBack = true; 582: } 583: 584: /** 585: * This method sets the quote attribute on the specified character. 586: * Other attributes for the character are cleared. 587: * 588: * @param ch The character to set the quote attribute for, passed as an int. 589: */ 590: public void quoteChar(int ch) 591: { 592: if (ch >= 0 && ch <= 255) 593: { 594: quote[ch] = true; 595: comment[ch] = false; 596: whitespace[ch] = false; 597: alphabetic[ch] = false; 598: numeric[ch] = false; 599: } 600: } 601: 602: /** 603: * This method removes all attributes (whitespace, alphabetic, numeric, 604: * quote, and comment) from all characters. It is equivalent to calling 605: * <code>ordinaryChars(0x00, 0xFF)</code>. 606: * 607: * @see #ordinaryChars(int, int) 608: */ 609: public void resetSyntax() 610: { 611: ordinaryChars(0x00, 0xFF); 612: } 613: 614: /** 615: * This method sets a flag that indicates whether or not "C++" language style 616: * comments ("//" comments through EOL ) are handled by the parser. 617: * If this is <code>true</code> commented out sequences are skipped and 618: * ignored by the parser. This defaults to <code>false</code>. 619: * 620: * @param flag <code>true</code> to recognized and handle "C++" style 621: * comments, <code>false</code> otherwise 622: */ 623: public void slashSlashComments(boolean flag) 624: { 625: slashSlash = flag; 626: } 627: 628: /** 629: * This method sets a flag that indicates whether or not "C" language style 630: * comments (with nesting not allowed) are handled by the parser. 631: * If this is <code>true</code> commented out sequences are skipped and 632: * ignored by the parser. This defaults to <code>false</code>. 633: * 634: * @param flag <code>true</code> to recognized and handle "C" style comments, 635: * <code>false</code> otherwise 636: */ 637: public void slashStarComments(boolean flag) 638: { 639: slashStar = flag; 640: } 641: 642: /** 643: * This method returns the current token value as a <code>String</code> in 644: * the form "Token[x], line n", where 'n' is the current line numbers and 645: * 'x' is determined as follows. 646: * <p> 647: * <ul> 648: * <li>If no token has been read, then 'x' is "NOTHING" and 'n' is 0</li> 649: * <li>If <code>ttype</code> is TT_EOF, then 'x' is "EOF"</li> 650: * <li>If <code>ttype</code> is TT_EOL, then 'x' is "EOL"</li> 651: * <li>If <code>ttype</code> is TT_WORD, then 'x' is <code>sval</code></li> 652: * <li>If <code>ttype</code> is TT_NUMBER, then 'x' is "n=strnval" where 653: * 'strnval' is <code>String.valueOf(nval)</code>.</li> 654: * <li>If <code>ttype</code> is a quote character, then 'x' is 655: * <code>sval</code></li> 656: * <li>For all other cases, 'x' is <code>ttype</code></li> 657: * </ul> 658: */ 659: public String toString() 660: { 661: String tempstr; 662: if (ttype == TT_EOF) 663: tempstr = "EOF"; 664: else if (ttype == TT_EOL) 665: tempstr = "EOL"; 666: else if (ttype == TT_WORD) 667: tempstr = sval; 668: else if (ttype == TT_NUMBER) 669: tempstr = "n=" + nval; 670: else if (ttype == TT_NONE) 671: tempstr = "NOTHING"; 672: else // must be an ordinary char. 673: tempstr = "\'" + (char) ttype + "\'"; 674: 675: return "Token[" + tempstr + "], line " + lineno(); 676: } 677: 678: /** 679: * This method sets the whitespace attribute for all characters in the 680: * specified range, range terminators included. 681: * 682: * @param low The low end of the range of values to set the whitespace 683: * attribute for 684: * @param hi The high end of the range of values to set the whitespace 685: * attribute for 686: */ 687: public void whitespaceChars(int low, int hi) 688: { 689: if (low < 0) 690: low = 0; 691: if (hi > 255) 692: hi = 255; 693: for (int i = low; i <= hi; i++) 694: { 695: resetChar(i); 696: whitespace[i] = true; 697: } 698: } 699: 700: /** 701: * This method sets the alphabetic attribute for all characters in the 702: * specified range, range terminators included. 703: * 704: * @param low The low end of the range of values to set the alphabetic 705: * attribute for 706: * @param hi The high end of the range of values to set the alphabetic 707: * attribute for 708: */ 709: public void wordChars(int low, int hi) 710: { 711: if (low < 0) 712: low = 0; 713: if (hi > 255) 714: hi = 255; 715: for (int i = low; i <= hi; i++) 716: alphabetic[i] = true; 717: } 718: }