Source for gnu.xml.xpath.XPathTokenizer

   1: /* XPathTokenizer.java --
   2:    Copyright (C) 2004 Free Software Foundation, Inc.
   3: 
   4: This file is part of GNU Classpath.
   5: 
   6: GNU Classpath is free software; you can redistribute it and/or modify
   7: it under the terms of the GNU General Public License as published by
   8: the Free Software Foundation; either version 2, or (at your option)
   9: any later version.
  10: 
  11: GNU Classpath is distributed in the hope that it will be useful, but
  12: WITHOUT ANY WARRANTY; without even the implied warranty of
  13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14: General Public License for more details.
  15: 
  16: You should have received a copy of the GNU General Public License
  17: along with GNU Classpath; see the file COPYING.  If not, write to the
  18: Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  19: 02110-1301 USA.
  20: 
  21: Linking this library statically or dynamically with other modules is
  22: making a combined work based on this library.  Thus, the terms and
  23: conditions of the GNU General Public License cover the whole
  24: combination.
  25: 
  26: As a special exception, the copyright holders of this library give you
  27: permission to link this library with independent modules to produce an
  28: executable, regardless of the license terms of these independent
  29: modules, and to copy and distribute the resulting executable under
  30: terms of your choice, provided that you also meet, for each linked
  31: independent module, the terms and conditions of the license of that
  32: module.  An independent module is a module which is not derived from
  33: or based on this library.  If you modify this library, you may extend
  34: this exception to your version of the library, but you are not
  35: obligated to do so.  If you do not wish to do so, delete this
  36: exception statement from your version. */
  37: 
  38: package gnu.xml.xpath;
  39: 
  40: import gnu.java.lang.CPStringBuilder;
  41: 
  42: import java.io.BufferedReader;
  43: import java.io.IOException;
  44: import java.io.Reader;
  45: import java.io.StringReader;
  46: import java.util.Map;
  47: import java.util.TreeMap;
  48: 
  49: /*import antlr.Token;
  50: import antlr.TokenStream;
  51: import antlr.TokenStreamException;
  52: import antlr.TokenStreamIOException;*/
  53: 
  54: /**
  55:  * XPath 1.0 expression tokenizer.
  56:  *
  57:  * @author <a href='mailto:dog@gnu.org'>Chris Burdess</a>
  58:  */
  59: public class XPathTokenizer
  60: implements XPathParser.yyInput
  61: //implements TokenStream
  62: {
  63: 
  64:   static class XPathToken
  65:   //extends Token
  66:   {
  67: 
  68:     int type;
  69:     String val;
  70: 
  71:     XPathToken (int type)
  72:     {
  73:       this (type, null);
  74:     }
  75: 
  76:     XPathToken (int type, String val)
  77:     {
  78:       //super (type);
  79:       this.type = type;
  80:       this.val = val;
  81:     }
  82: 
  83:     public String getText ()
  84:     {
  85:       return val;
  86:     }
  87: 
  88:     public String toString ()
  89:     {
  90:       return val;
  91:     }
  92: 
  93:   }
  94: 
  /**
   * Maps each reserved XPath word (axis names, operator names and node type
   * names) to the corresponding <code>XPathParser</code> token code.
   * Names that are not present in this map are ordinary names.
   */
  static final Map<String,Integer> keywords = new TreeMap<String,Integer> ();
  static
  {
    // Integer.valueOf is used instead of the deprecated Integer constructor;
    // it may return cached instances rather than always allocating.

    // Axis names
    keywords.put ("ancestor", Integer.valueOf (XPathParser.ANCESTOR));
    keywords.put ("ancestor-or-self", Integer.valueOf (XPathParser.ANCESTOR_OR_SELF));
    keywords.put ("attribute", Integer.valueOf (XPathParser.ATTRIBUTE));
    keywords.put ("child", Integer.valueOf (XPathParser.CHILD));
    keywords.put ("descendant", Integer.valueOf (XPathParser.DESCENDANT));
    keywords.put ("descendant-or-self", Integer.valueOf (XPathParser.DESCENDANT_OR_SELF));
    keywords.put ("following", Integer.valueOf (XPathParser.FOLLOWING));
    keywords.put ("following-sibling", Integer.valueOf (XPathParser.FOLLOWING_SIBLING));
    keywords.put ("namespace", Integer.valueOf (XPathParser.NAMESPACE));
    keywords.put ("parent", Integer.valueOf (XPathParser.PARENT));
    keywords.put ("preceding", Integer.valueOf (XPathParser.PRECEDING));
    keywords.put ("preceding-sibling", Integer.valueOf (XPathParser.PRECEDING_SIBLING));
    keywords.put ("self", Integer.valueOf (XPathParser.SELF));
    // Operator names
    keywords.put ("div", Integer.valueOf (XPathParser.DIV));
    keywords.put ("mod", Integer.valueOf (XPathParser.MOD));
    keywords.put ("or", Integer.valueOf (XPathParser.OR));
    keywords.put ("and", Integer.valueOf (XPathParser.AND));
    // Node type names
    keywords.put ("comment", Integer.valueOf (XPathParser.COMMENT));
    keywords.put ("processing-instruction", Integer.valueOf (XPathParser.PROCESSING_INSTRUCTION));
    keywords.put ("text", Integer.valueOf (XPathParser.TEXT));
    keywords.put ("node", Integer.valueOf (XPathParser.NODE));
  }
 120: 
 121:   Reader in;
 122:   XPathToken token;
 123:   XPathToken lastToken;
 124: 
 125:   public XPathTokenizer (String expr)
 126:   {
 127:     this (new StringReader (expr));
 128:   }
 129: 
 130:   XPathTokenizer (Reader in)
 131:   {
 132:     this.in = in.markSupported () ? in : new BufferedReader (in);
 133:   }
 134: 
 135:   /* Begin ANTLR specific *
 136: 
 137:   public Token nextToken ()
 138:     throws TokenStreamException
 139:   {
 140:     try
 141:       {
 142:         if (!advance ())
 143:           {
 144:             throw new TokenStreamException ("eof");
 145:           }
 146:         token ();
 147:         return token;
 148:       }
 149:     catch (IOException e)
 150:       {
 151:         throw new TokenStreamIOException (e);
 152:       }
 153:   }
 154: 
 155:   * End ANTLR specific */
 156: 
 157:   public boolean advance ()
 158:     throws IOException
 159:   {
 160:     lastToken = token;
 161:     int c = in.read ();
 162:     switch (c)
 163:       {
 164:       case -1: // eof
 165:         return false;
 166:       case 0x20:
 167:       case 0x09:
 168:       case 0x0d:
 169:       case 0x0a: // skip whitespace
 170:         return advance ();
 171:       case 0x22: // "
 172:       case 0x27: // '
 173:         token = consume_literal (c);
 174:         break;
 175:       case 0x28: // (
 176:         token = new XPathToken (XPathParser.LP);
 177:         break;
 178:       case 0x29: // )
 179:         token = new XPathToken (XPathParser.RP);
 180:         break;
 181:       case 0x5b: // [
 182:         token = new XPathToken (XPathParser.LB);
 183:         break;
 184:       case 0x5d: // ]
 185:         token = new XPathToken (XPathParser.RB);
 186:         break;
 187:       case 0x2c: // ,
 188:         token = new XPathToken (XPathParser.COMMA);
 189:         break;
 190:       case 0x7c: // |
 191:         token = new XPathToken (XPathParser.PIPE);
 192:         break;
 193:       case 0x2f: // /
 194:         in.mark (1);
 195:         int d1 = in.read ();
 196:         if (d1 == 0x2f)
 197:           {
 198:             token = new XPathToken (XPathParser.DOUBLE_SLASH);
 199:           }
 200:         else
 201:           {
 202:             in.reset ();
 203:             token = new XPathToken (XPathParser.SLASH);
 204:           }
 205:         break;
 206:       case 0x3d: // =
 207:         token = new XPathToken (XPathParser.EQ);
 208:         break;
 209:       case 0x21: // !
 210:         in.mark (1);
 211:         int d2 = in.read ();
 212:         if (d2 == 0x3d) // =
 213:           {
 214:             token = new XPathToken (XPathParser.NE);
 215:           }
 216:         else
 217:           {
 218:             in.reset ();
 219:             token = new XPathToken (XPathParser.yyErrorCode);
 220:           }
 221:         break;
 222:       case 0x3e: // >
 223:         in.mark (1);
 224:         int d3 = in.read ();
 225:         if (d3 == 0x3d) // =
 226:           {
 227:             token = new XPathToken (XPathParser.GTE);
 228:           }
 229:         else
 230:           {
 231:             in.reset ();
 232:             token = new XPathToken (XPathParser.GT);
 233:           }
 234:         break;
 235:       case 0x3c: // <
 236:         in.mark (1);
 237:         int d4 = in.read ();
 238:         if (d4 == 0x3d) // =
 239:           {
 240:             token = new XPathToken (XPathParser.LTE);
 241:           }
 242:         else
 243:           {
 244:             in.reset ();
 245:             token = new XPathToken (XPathParser.LT);
 246:           }
 247:         break;
 248:       case 0x2b: // +
 249:         token = new XPathToken (XPathParser.PLUS);
 250:         break;
 251:       case 0x2d: // -
 252:         token = new XPathToken (XPathParser.MINUS);
 253:         break;
 254:       case 0x40: // @
 255:         token = new XPathToken (XPathParser.AT);
 256:         break;
 257:       case 0x2a: // *
 258:         token = new XPathToken (XPathParser.STAR);
 259:         break;
 260:       case 0x24: // $
 261:         token = new XPathToken (XPathParser.DOLLAR);
 262:         break;
 263:       case 0x3a: // :
 264:         in.mark (1);
 265:         int d5 = in.read ();
 266:         if (d5 == 0x3a)
 267:           {
 268:             token = new XPathToken (XPathParser.DOUBLE_COLON);
 269:           }
 270:         else
 271:           {
 272:             in.reset ();
 273:             token = new XPathToken (XPathParser.COLON);
 274:           }
 275:         break;
 276:       case 0x2e: // .
 277:         in.mark (1);
 278:         int d6 = in.read ();
 279:         if (d6 == 0x2e)
 280:           {
 281:             token = new XPathToken (XPathParser.DOUBLE_DOT);
 282:           }
 283:         else
 284:           {
 285:             in.reset ();
 286:             token = new XPathToken (XPathParser.DOT);
 287:           }
 288:         break;
 289:       default:
 290:         if (c >= 0x30 && c <= 0x39)
 291:           {
 292:             token = consume_digits (c);
 293:           }
 294:         else if (c == 0x5f || Character.isLetter ((char) c))
 295:           {
 296:             token = consume_name (c);
 297:           }
 298:         else
 299:           {
 300:             token = new XPathToken (XPathParser.yyErrorCode);
 301:           }
 302:       }
 303:     return true;
 304:   }
 305: 
 306:   public int token ()
 307:   {
 308:     return token.type;
 309:   }
 310: 
 311:   public Object value ()
 312:   {
 313:     return token.val;
 314:   }
 315: 
 316:   XPathToken consume_literal (int delimiter)
 317:     throws IOException
 318:   {
 319:     CPStringBuilder buf = new CPStringBuilder ();
 320:     while (true)
 321:       {
 322:         int c = in.read ();
 323:         if (c == -1)
 324:           {
 325:             return new XPathToken (XPathParser.yyErrorCode);
 326:           }
 327:         else if (c == delimiter)
 328:           {
 329:             return new XPathToken (XPathParser.LITERAL, buf.toString ());
 330:           }
 331:         else
 332:           {
 333:             buf.append ((char) c);
 334:           }
 335:       }
 336:   }
 337: 
 338:   XPathToken consume_digits (int c)
 339:     throws IOException
 340:   {
 341:     CPStringBuilder buf = new CPStringBuilder ();
 342:     buf.append ((char) c);
 343:     while (true)
 344:       {
 345:         in.mark (1);
 346:         c = in.read ();
 347:         if (c >= 0x30 && c <= 0x39)
 348:           {
 349:             buf.append ((char) c);
 350:           }
 351:         else
 352:           {
 353:             in.reset ();
 354:             return new XPathToken (XPathParser.DIGITS, buf.toString ());
 355:           }
 356:       }
 357:   }
 358: 
 359:   XPathToken consume_name (int c)
 360:     throws IOException
 361:   {
 362:     CPStringBuilder buf = new CPStringBuilder ();
 363:     buf.append ((char) c);
 364:     while (true)
 365:       {
 366:         in.mark (1);
 367:         c = in.read ();
 368:         if (isNameChar (c))
 369:           {
 370:             buf.append ((char) c);
 371:           }
 372:         else
 373:           {
 374:             in.reset ();
 375:             String name = buf.toString ();
 376:             Integer keyword = (Integer) keywords.get (name);
 377:             if (keyword == null)
 378:               {
 379:                 return new XPathToken (XPathParser.NAME, name);
 380:               }
 381:             else
 382:               {
 383:                 int val = keyword.intValue ();
 384:                 switch (val)
 385:                   {
 386:                   case XPathParser.NODE:
 387:                   case XPathParser.COMMENT:
 388:                   case XPathParser.TEXT:
 389:                   case XPathParser.PROCESSING_INSTRUCTION:
 390:                     // Consume subsequent (
 391:                     in.mark (1);
 392:                     do
 393:                       {
 394:                         c = in.read ();
 395:                       }
 396:                     while (c == 0x20 || c == 0x09);
 397:                     if (c != 0x28)
 398:                       {
 399:                         in.reset ();
 400:                         return new XPathToken (XPathParser.NAME, name);
 401:                       }
 402:                     break;
 403:                   case XPathParser.CHILD:
 404:                   case XPathParser.PARENT:
 405:                   case XPathParser.SELF:
 406:                   case XPathParser.DESCENDANT:
 407:                   case XPathParser.ANCESTOR:
 408:                   case XPathParser.DESCENDANT_OR_SELF:
 409:                   case XPathParser.ANCESTOR_OR_SELF:
 410:                   case XPathParser.ATTRIBUTE:
 411:                   case XPathParser.NAMESPACE:
 412:                   case XPathParser.FOLLOWING:
 413:                   case XPathParser.FOLLOWING_SIBLING:
 414:                   case XPathParser.PRECEDING:
 415:                   case XPathParser.PRECEDING_SIBLING:
 416:                     // Check that this is an axis specifier
 417:                     in.mark(1);
 418:                     do
 419:                       {
 420:                         c = in.read();
 421:                       }
 422:                     while (c == 0x20 || c == 0x09);
 423:                     if (c == 0x3a)
 424:                       {
 425:                         c = in.read();
 426:                         if (c == 0x3a)
 427:                           {
 428:                             in.reset();
 429:                             return new XPathToken(val);
 430:                           }
 431:                       }
 432:                     in.reset();
 433:                     return new XPathToken(XPathParser.NAME, name);
 434:                   case XPathParser.DIV:
 435:                   case XPathParser.MOD:
 436:                     // May be a name
 437:                     if (lastToken == null)
 438:                       {
 439:                         return new XPathToken(XPathParser.NAME, name);
 440:                       }
 441:                     switch (lastToken.type)
 442:                       {
 443:                       case XPathParser.LP:
 444:                       case XPathParser.LB:
 445:                       case XPathParser.COMMA:
 446:                       case XPathParser.PIPE:
 447:                       case XPathParser.EQ:
 448:                       case XPathParser.NE:
 449:                       case XPathParser.GT:
 450:                       case XPathParser.LT:
 451:                       case XPathParser.GTE:
 452:                       case XPathParser.LTE:
 453:                       case XPathParser.PLUS:
 454:                       case XPathParser.MINUS:
 455:                       case XPathParser.STAR:
 456:                       case XPathParser.AT:
 457:                       case XPathParser.DOLLAR:
 458:                       case XPathParser.COLON:
 459:                       case XPathParser.DOUBLE_COLON:
 460:                       case XPathParser.DIV:
 461:                       case XPathParser.MOD:
 462:                       case XPathParser.OR:
 463:                       case XPathParser.AND:
 464:                       case XPathParser.SLASH:
 465:                         return new XPathToken(XPathParser.NAME, name);
 466:                       }
 467:                     break;
 468:                   }
 469:                 return new XPathToken (val);
 470:               }
 471:           }
 472:       }
 473:   }
 474: 
 475:   boolean isNameChar (int c)
 476:   {
 477:     /* Name */
 478:     return (c == 0x5f
 479:             || c == 0x2d
 480:             || c == 0x2e
 481:             || (c >= 0x30 && c <= 0x39)
 482:             /* CombiningChar */
 483:             || (c >= 0x0300 && c <= 0x0345)
 484:             || (c >= 0x0360 && c <= 0x0361)
 485:             || (c >= 0x0483 && c <= 0x0486)
 486:             || (c >= 0x0591 && c <= 0x05A1)
 487:             || (c >= 0x05A3 && c <= 0x05B9)
 488:             || (c >= 0x05BB && c <= 0x05BD)
 489:             || c == 0x05BF
 490:             || (c >= 0x05C1 && c <= 0x05C2)
 491:             || c == 0x05C4
 492:             || (c >= 0x064B && c <= 0x0652)
 493:             || c == 0x0670
 494:             || (c >= 0x06D6 && c <= 0x06DC)
 495:             || (c >= 0x06DD && c <= 0x06DF)
 496:             || (c >= 0x06E0 && c <= 0x06E4)
 497:             || (c >= 0x06E7 && c <= 0x06E8)
 498:             || (c >= 0x06EA && c <= 0x06ED)
 499:             || (c >= 0x0901 && c <= 0x0903)
 500:             || c == 0x093C
 501:             || (c >= 0x093E && c <= 0x094C)
 502:             || c == 0x094D
 503:             || (c >= 0x0951 && c <= 0x0954)
 504:             || (c >= 0x0962 && c <= 0x0963)
 505:             || (c >= 0x0981 && c <= 0x0983)
 506:             || c == 0x09BC
 507:             || c == 0x09BE
 508:             || c == 0x09BF
 509:             || (c >= 0x09C0 && c <= 0x09C4)
 510:             || (c >= 0x09C7 && c <= 0x09C8)
 511:             || (c >= 0x09CB && c <= 0x09CD)
 512:             || c == 0x09D7
 513:             || (c >= 0x09E2 && c <= 0x09E3)
 514:             || c == 0x0A02
 515:             || c == 0x0A3C
 516:             || c == 0x0A3E
 517:             || c == 0x0A3F
 518:             || (c >= 0x0A40 && c <= 0x0A42)
 519:             || (c >= 0x0A47 && c <= 0x0A48)
 520:             || (c >= 0x0A4B && c <= 0x0A4D)
 521:             || (c >= 0x0A70 && c <= 0x0A71)
 522:             || (c >= 0x0A81 && c <= 0x0A83)
 523:             || c == 0x0ABC
 524:             || (c >= 0x0ABE && c <= 0x0AC5)
 525:             || (c >= 0x0AC7 && c <= 0x0AC9)
 526:             || (c >= 0x0ACB && c <= 0x0ACD)
 527:             || (c >= 0x0B01 && c <= 0x0B03)
 528:             || c == 0x0B3C
 529:             || (c >= 0x0B3E && c <= 0x0B43)
 530:             || (c >= 0x0B47 && c <= 0x0B48)
 531:             || (c >= 0x0B4B && c <= 0x0B4D)
 532:             || (c >= 0x0B56 && c <= 0x0B57)
 533:             || (c >= 0x0B82 && c <= 0x0B83)
 534:             || (c >= 0x0BBE && c <= 0x0BC2)
 535:             || (c >= 0x0BC6 && c <= 0x0BC8)
 536:             || (c >= 0x0BCA && c <= 0x0BCD)
 537:             || c == 0x0BD7
 538:             || (c >= 0x0C01 && c <= 0x0C03)
 539:             || (c >= 0x0C3E && c <= 0x0C44)
 540:             || (c >= 0x0C46 && c <= 0x0C48)
 541:             || (c >= 0x0C4A && c <= 0x0C4D)
 542:             || (c >= 0x0C55 && c <= 0x0C56)
 543:             || (c >= 0x0C82 && c <= 0x0C83)
 544:             || (c >= 0x0CBE && c <= 0x0CC4)
 545:             || (c >= 0x0CC6 && c <= 0x0CC8)
 546:             || (c >= 0x0CCA && c <= 0x0CCD)
 547:             || (c >= 0x0CD5 && c <= 0x0CD6)
 548:             || (c >= 0x0D02 && c <= 0x0D03)
 549:             || (c >= 0x0D3E && c <= 0x0D43)
 550:             || (c >= 0x0D46 && c <= 0x0D48)
 551:             || (c >= 0x0D4A && c <= 0x0D4D)
 552:             || c == 0x0D57
 553:             || c == 0x0E31
 554:             || (c >= 0x0E34 && c <= 0x0E3A)
 555:             || (c >= 0x0E47 && c <= 0x0E4E)
 556:             || c == 0x0EB1
 557:             || (c >= 0x0EB4 && c <= 0x0EB9)
 558:             || (c >= 0x0EBB && c <= 0x0EBC)
 559:             || (c >= 0x0EC8 && c <= 0x0ECD)
 560:             || (c >= 0x0F18 && c <= 0x0F19)
 561:             || c == 0x0F35
 562:             || c == 0x0F37
 563:             || c == 0x0F39
 564:             || c == 0x0F3E
 565:             || c == 0x0F3F
 566:             || (c >= 0x0F71 && c <= 0x0F84)
 567:             || (c >= 0x0F86 && c <= 0x0F8B)
 568:             || (c >= 0x0F90 && c <= 0x0F95)
 569:             || c == 0x0F97
 570:             || (c >= 0x0F99 && c <= 0x0FAD)
 571:             || (c >= 0x0FB1 && c <= 0x0FB7)
 572:             || c == 0x0FB9
 573:             || (c >= 0x20D0 && c <= 0x20DC)
 574:             || c == 0x20E1
 575:             || (c >= 0x302A && c <= 0x302F)
 576:             || c == 0x3099
 577:             || c == 0x309A
 578:             /* Extender */
 579:             || c == 0x00B7
 580:             || c == 0x02D0
 581:             || c == 0x02D1
 582:             || c == 0x0387
 583:             || c == 0x0640
 584:             || c == 0x0E46
 585:             || c == 0x0EC6
 586:             || c == 0x3005
 587:             || (c >= 0x3031 && c <= 0x3035)
 588:             || (c >= 0x309D && c <= 0x309E)
 589:             || (c >= 0x30FC && c <= 0x30FE)
 590:             /* Name */
 591:             || Character.isLetter ((char) c));
 592:   }
 593: 
 594: }