Source for java.util.regex.Matcher

   1: /* Matcher.java -- Instance of a regular expression applied to a char sequence.
   2:    Copyright (C) 2002, 2004, 2006 Free Software Foundation, Inc.
   3: 
   4: This file is part of GNU Classpath.
   5: 
   6: GNU Classpath is free software; you can redistribute it and/or modify
   7: it under the terms of the GNU General Public License as published by
   8: the Free Software Foundation; either version 2, or (at your option)
   9: any later version.
  10: 
  11: GNU Classpath is distributed in the hope that it will be useful, but
  12: WITHOUT ANY WARRANTY; without even the implied warranty of
  13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14: General Public License for more details.
  15: 
  16: You should have received a copy of the GNU General Public License
  17: along with GNU Classpath; see the file COPYING.  If not, write to the
  18: Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  19: 02110-1301 USA.
  20: 
  21: Linking this library statically or dynamically with other modules is
  22: making a combined work based on this library.  Thus, the terms and
  23: conditions of the GNU General Public License cover the whole
  24: combination.
  25: 
  26: As a special exception, the copyright holders of this library give you
  27: permission to link this library with independent modules to produce an
  28: executable, regardless of the license terms of these independent
  29: modules, and to copy and distribute the resulting executable under
  30: terms of your choice, provided that you also meet, for each linked
  31: independent module, the terms and conditions of the license of that
  32: module.  An independent module is a module which is not derived from
  33: or based on this library.  If you modify this library, you may extend
  34: this exception to your version of the library, but you are not
  35: obligated to do so.  If you do not wish to do so, delete this
  36: exception statement from your version. */
  37: 
  38: 
  39: package java.util.regex;
  40: 
  41: import gnu.java.lang.CPStringBuilder;
  42: 
  43: import gnu.java.util.regex.CharIndexed;
  44: import gnu.java.util.regex.RE;
  45: import gnu.java.util.regex.REMatch;
  46: 
  47: /**
  48:  * Instance of a regular expression applied to a char sequence.
  49:  *
  50:  * @since 1.4
  51:  */
  52: public final class Matcher implements MatchResult
  53: {
  /**
   * The pattern applied by this matcher.  Replaced by
   * {@link #usePattern(Pattern)}.
   */
  private Pattern pattern;

  /**
   * The character sequence being matched.  Replaced by
   * {@link #reset(CharSequence)}.
   */
  private CharSequence input;

  // We use CharIndexed as an input object to the getMatch method in order
  // that /\G/ (the end of the previous match) may work.  The information
  // of the previous match is stored in the CharIndexed object.
  private CharIndexed inputCharIndexed;

  /**
   * The index handed to the regular expression engine by
   * {@link #find()}; updated to the end index of each
   * successful match.
   */
  private int position;

  /**
   * The index of the first input character that has not yet been
   * copied out by {@link #appendReplacement(StringBuffer,String)}.
   */
  private int appendPosition;

  /**
   * The result of the last match operation, or {@code null} if no
   * match has been attempted, the last attempt failed or the matcher
   * has been reset.
   */
  private REMatch match;

  /**
   * The start of the region of the input on which to match.
   */
  private int regionStart;

  /**
   * The end of the region of the input on which to match.
   */
  private int regionEnd;

  /**
   * True if the match process should look beyond the
   * region marked by regionStart to regionEnd when
   * performing lookAhead, lookBehind and boundary
   * matching.
   */
  private boolean transparentBounds;

  /**
   * The flags that affect the anchoring bounds.
   * If {@link #hasAnchoringBounds()} is {@code true},
   * the match process will honour the
   * anchoring bounds: ^, \A, \Z, \z and $.  If
   * {@link #hasAnchoringBounds()} is {@code false},
   * the anchors are ignored and appropriate flags,
   * stored in this variable, are used to provide this
   * behaviour.  The value is zero while anchoring bounds
   * are in use and {@code RE.REG_NOTBOL|RE.REG_NOTEOL}
   * otherwise (see {@link #useAnchoringBounds(boolean)}).
   */
  private int anchoringBounds;
  93: 
  /**
   * Creates a matcher applying {@code pattern} to {@code input}.
   * The region initially spans the whole input, the bounds are
   * opaque and anchoring bounds are in use.
   *
   * @param pattern the pattern to match with.
   * @param input the character sequence to match against.
   */
  Matcher(Pattern pattern, CharSequence input)
  {
    // Bounds behaviour: opaque, anchors honoured.
    this.transparentBounds = false;
    this.anchoringBounds = 0;

    // What is matched, and with what.
    this.pattern = pattern;
    this.input = input;
    this.inputCharIndexed = RE.makeCharIndexed(input, 0);

    // The default region is the complete input.
    this.regionStart = 0;
    this.regionEnd = input.length();
  }
 104: 
 105:   /**
 106:    * Changes the pattern used by the {@link Matcher} to
 107:    * the one specified.  Existing match information is lost,
 108:    * but the input and the matcher's position within it is
 109:    * retained.
 110:    *
 111:    * @param newPattern the new pattern to use.
 112:    * @return this matcher.
 113:    * @throws IllegalArgumentException if {@code newPattern} is
 114:    *                                  {@code null}.
 115:    * @since 1.5
 116:    */
 117:   public Matcher usePattern(Pattern newPattern)
 118:   {
 119:     if (newPattern == null)
 120:       throw new IllegalArgumentException("The new pattern was null.");
 121:     pattern = newPattern;
 122:     match = null;
 123: 
 124:     return this;
 125:   }
 126: 
 127:   /**
 128:    * @param sb The target string buffer
 129:    * @param replacement The replacement string
 130:    *
 131:    * @exception IllegalStateException If no match has yet been attempted,
 132:    * or if the previous match operation failed
 133:    * @exception IndexOutOfBoundsException If the replacement string refers
 134:    * to a capturing group that does not exist in the pattern
 135:    */
 136:   public Matcher appendReplacement (StringBuffer sb, String replacement)
 137:     throws IllegalStateException
 138:   {
 139:     assertMatchOp();
 140:     sb.append(input.subSequence(appendPosition,
 141:                                 match.getStartIndex()).toString());
 142:     sb.append(RE.getReplacement(replacement, match,
 143:         RE.REG_REPLACE_USE_BACKSLASHESCAPE));
 144:     appendPosition = match.getEndIndex();
 145:     return this;
 146:   }
 147: 
 148:   /**
 149:    * @param sb The target string buffer
 150:    */
 151:   public StringBuffer appendTail (StringBuffer sb)
 152:   {
 153:     sb.append(input.subSequence(appendPosition, input.length()).toString());
 154:     return sb;
 155:   }
 156: 
 157:   /**
 158:    * @exception IllegalStateException If no match has yet been attempted,
 159:    * or if the previous match operation failed
 160:    */
 161:   public int end ()
 162:     throws IllegalStateException
 163:   {
 164:     assertMatchOp();
 165:     return match.getEndIndex();
 166:   }
 167: 
 168:   /**
 169:    * @param group The index of a capturing group in this matcher's pattern
 170:    *
 171:    * @exception IllegalStateException If no match has yet been attempted,
 172:    * or if the previous match operation failed
 173:    * @exception IndexOutOfBoundsException If the replacement string refers
 174:    * to a capturing group that does not exist in the pattern
 175:    */
 176:   public int end (int group)
 177:     throws IllegalStateException
 178:   {
 179:     assertMatchOp();
 180:     return match.getEndIndex(group);
 181:   }
 182: 
 183:   public boolean find ()
 184:   {
 185:     boolean first = (match == null);
 186:     if (transparentBounds || (regionStart == 0 && regionEnd == input.length()))
 187:       match = pattern.getRE().getMatch(inputCharIndexed, position, anchoringBounds);
 188:     else
 189:       match = pattern.getRE().getMatch(input.subSequence(regionStart, regionEnd),
 190:                                        position, anchoringBounds);
 191:     if (match != null)
 192:       {
 193:         int endIndex = match.getEndIndex();
 194:         // Is the match within input limits?
 195:         if (endIndex > input.length())
 196:           {
 197:             match = null;
 198:             return false;
 199:           }
 200:         // Are we stuck at the same position?
 201:         if (!first && endIndex == position)
 202:           {
 203:             match = null;
 204:             // Not at the end of the input yet?
 205:             if (position < input.length() - 1)
 206:               {
 207:                 position++;
 208:                 return find(position);
 209:               }
 210:             else
 211:               return false;
 212:           }
 213:         position = endIndex;
 214:         return true;
 215:       }
 216:     return false;
 217:   }
 218: 
 219:   /**
 220:    * @param start The index to start the new pattern matching
 221:    *
 222:    * @exception IndexOutOfBoundsException If the replacement string refers
 223:    * to a capturing group that does not exist in the pattern
 224:    */
 225:   public boolean find (int start)
 226:   {
 227:     if (transparentBounds || (regionStart == 0 && regionEnd == input.length()))
 228:       match = pattern.getRE().getMatch(inputCharIndexed, start, anchoringBounds);
 229:     else
 230:       match = pattern.getRE().getMatch(input.subSequence(regionStart, regionEnd),
 231:                                        start, anchoringBounds);
 232:     if (match != null)
 233:       {
 234:         position = match.getEndIndex();
 235:         return true;
 236:       }
 237:     return false;
 238:   }
 239: 
 240:   /**
 241:    * @exception IllegalStateException If no match has yet been attempted,
 242:    * or if the previous match operation failed
 243:    */
 244:   public String group ()
 245:   {
 246:     assertMatchOp();
 247:     return match.toString();
 248:   }
 249: 
 250:   /**
 251:    * @param group The index of a capturing group in this matcher's pattern
 252:    *
 253:    * @exception IllegalStateException If no match has yet been attempted,
 254:    * or if the previous match operation failed
 255:    * @exception IndexOutOfBoundsException If the replacement string refers
 256:    * to a capturing group that does not exist in the pattern
 257:    */
 258:   public String group (int group)
 259:     throws IllegalStateException
 260:   {
 261:     assertMatchOp();
 262:     return match.toString(group);
 263:   }
 264: 
 265:   /**
 266:    * @param replacement The replacement string
 267:    */
 268:   public String replaceFirst (String replacement)
 269:   {
 270:     reset();
 271:     // Semantics might not quite match
 272:     return pattern.getRE().substitute(input, replacement, position,
 273:         RE.REG_REPLACE_USE_BACKSLASHESCAPE);
 274:   }
 275: 
 276:   /**
 277:    * @param replacement The replacement string
 278:    */
 279:   public String replaceAll (String replacement)
 280:   {
 281:     reset();
 282:     return pattern.getRE().substituteAll(input, replacement, position,
 283:         RE.REG_REPLACE_USE_BACKSLASHESCAPE);
 284:   }
 285: 
 286:   public int groupCount ()
 287:   {
 288:     return pattern.getRE().getNumSubs();
 289:   }
 290: 
 291:   public boolean lookingAt ()
 292:   {
 293:     if (transparentBounds || (regionStart == 0 && regionEnd == input.length()))
 294:       match = pattern.getRE().getMatch(inputCharIndexed, regionStart,
 295:                                        anchoringBounds|RE.REG_FIX_STARTING_POSITION|RE.REG_ANCHORINDEX);
 296:     else
 297:       match = pattern.getRE().getMatch(input.subSequence(regionStart, regionEnd), 0,
 298:                                        anchoringBounds|RE.REG_FIX_STARTING_POSITION);
 299:     if (match != null)
 300:       {
 301:         if (match.getStartIndex() == 0)
 302:           {
 303:             position = match.getEndIndex();
 304:             return true;
 305:           }
 306:         match = null;
 307:       }
 308:     return false;
 309:   }
 310: 
 311:   /**
 312:    * Attempts to match the entire input sequence against the pattern.
 313:    *
 314:    * If the match succeeds then more information can be obtained via the
 315:    * start, end, and group methods.
 316:    *
 317:    * @see #start()
 318:    * @see #end()
 319:    * @see #group()
 320:    */
 321:   public boolean matches ()
 322:   {
 323:     if (transparentBounds || (regionStart == 0 && regionEnd == input.length()))
 324:       match = pattern.getRE().getMatch(inputCharIndexed, regionStart,
 325:                                        anchoringBounds|RE.REG_TRY_ENTIRE_MATCH|RE.REG_FIX_STARTING_POSITION|RE.REG_ANCHORINDEX);
 326:     else
 327:       match = pattern.getRE().getMatch(input.subSequence(regionStart, regionEnd), 0,
 328:                                        anchoringBounds|RE.REG_TRY_ENTIRE_MATCH|RE.REG_FIX_STARTING_POSITION);
 329:     if (match != null)
 330:       {
 331:         if (match.getStartIndex() == 0)
 332:           {
 333:             position = match.getEndIndex();
 334:             if (position == input.length())
 335:                 return true;
 336:           }
 337:         match = null;
 338:       }
 339:     return false;
 340:   }
 341: 
 342:   /**
 343:    * Returns the Pattern that is interpreted by this Matcher
 344:    */
 345:   public Pattern pattern ()
 346:   {
 347:     return pattern;
 348:   }
 349: 
 350:   /**
 351:    * Resets the internal state of the matcher, including
 352:    * resetting the region to its default state of encompassing
 353:    * the whole input.  The state of {@link #hasTransparentBounds()}
 354:    * and {@link #hasAnchoringBounds()} are unaffected.
 355:    *
 356:    * @return a reference to this matcher.
 357:    * @see #regionStart()
 358:    * @see #regionEnd()
 359:    * @see #hasTransparentBounds()
 360:    * @see #hasAnchoringBounds()
 361:    */
 362:   public Matcher reset ()
 363:   {
 364:     position = 0;
 365:     match = null;
 366:     regionStart = 0;
 367:     regionEnd = input.length();
 368:     appendPosition = 0;
 369:     return this;
 370:   }
 371: 
 372:   /**
 373:    * Resets the internal state of the matcher, including
 374:    * resetting the region to its default state of encompassing
 375:    * the whole input.  The state of {@link #hasTransparentBounds()}
 376:    * and {@link #hasAnchoringBounds()} are unaffected.
 377:    *
 378:    * @param input The new input character sequence.
 379:    * @return a reference to this matcher.
 380:    * @see #regionStart()
 381:    * @see #regionEnd()
 382:    * @see #hasTransparentBounds()
 383:    * @see #hasAnchoringBounds()
 384:    */
 385:   public Matcher reset (CharSequence input)
 386:   {
 387:     this.input = input;
 388:     this.inputCharIndexed = RE.makeCharIndexed(input, 0);
 389:     return reset();
 390:   }
 391: 
 392:   /**
 393:    * @return the index of a capturing group in this matcher's pattern
 394:    *
 395:    * @exception IllegalStateException If no match has yet been attempted,
 396:    * or if the previous match operation failed
 397:    */
 398:   public int start ()
 399:     throws IllegalStateException
 400:   {
 401:     assertMatchOp();
 402:     return match.getStartIndex();
 403:   }
 404: 
 405:   /**
 406:    * @param group The index of a capturing group in this matcher's pattern
 407:    *
 408:    * @exception IllegalStateException If no match has yet been attempted,
 409:    * or if the previous match operation failed
 410:    * @exception IndexOutOfBoundsException If the replacement string refers
 411:    * to a capturing group that does not exist in the pattern
 412:    */
 413:   public int start (int group)
 414:     throws IllegalStateException
 415:   {
 416:     assertMatchOp();
 417:     return match.getStartIndex(group);
 418:   }
 419: 
 420:   /**
 421:    * @return True if and only if the matcher hit the end of input.
 422:    * @since 1.5
 423:    */
 424:   public boolean hitEnd()
 425:   {
 426:     return inputCharIndexed.hitEnd();
 427:   }
 428: 
 429:   /**
 430:    * @return A string expression of this matcher.
 431:    */
 432:   public String toString()
 433:   {
 434:     CPStringBuilder sb = new CPStringBuilder();
 435:     sb.append(this.getClass().getName())
 436:       .append("[pattern=").append(pattern.pattern())
 437:       .append(" region=").append(regionStart).append(",").append(regionEnd)
 438:       .append(" anchoringBounds=").append(anchoringBounds == 0)
 439:       .append(" transparentBounds=").append(transparentBounds)
 440:       .append(" lastmatch=").append(match == null ? "" : match.toString())
 441:       .append("]");
 442:     return sb.toString();
 443:   }
 444: 
 445:   private void assertMatchOp()
 446:   {
 447:     if (match == null) throw new IllegalStateException();
 448:   }
 449: 
 450:   /**
 451:    * <p>
 452:    * Defines the region of the input on which to match.
 453:    * By default, the {@link Matcher} attempts to match
 454:    * the whole string (from 0 to the length of the input),
 455:    * but a region between {@code start} (inclusive) and
 456:    * {@code end} (exclusive) on which to match may instead
 457:    * be defined using this method.
 458:    * </p>
 459:    * <p>
 460:    * The behaviour of region matching is further affected
 461:    * by the use of transparent or opaque bounds (see
 462:    * {@link #useTransparentBounds(boolean)}) and whether or not
 463:    * anchors ({@code ^} and {@code $}) are in use
 464:    * (see {@link #useAnchoringBounds(boolean)}).  With transparent
 465:    * bounds, the matcher is aware of input outside the bounds
 466:    * set by this method, whereas, with opaque bounds (the default)
 467:    * only the input within the bounds is used.  The use of
 468:    * anchors are affected by this setting; with transparent
 469:    * bounds, anchors will match the beginning of the real input,
 470:    * while with opaque bounds they match the beginning of the
 471:    * region.  {@link #useAnchoringBounds(boolean)} can be used
 472:    * to turn on or off the matching of anchors.
 473:    * </p>
 474:    *
 475:    * @param start the start of the region (inclusive).
 476:    * @param end the end of the region (exclusive).
 477:    * @return a reference to this matcher.
 478:    * @throws IndexOutOfBoundsException if either {@code start} or
 479:    *                                   {@code end} are less than zero,
 480:    *                                   if either {@code start} or
 481:    *                                   {@code end} are greater than the
 482:    *                                   length of the input, or if
 483:    *                                   {@code start} is greater than
 484:    *                                   {@code end}.
 485:    * @see #regionStart()
 486:    * @see #regionEnd()
 487:    * @see #hasTransparentBounds()
 488:    * @see #useTransparentBounds(boolean)
 489:    * @see #hasAnchoringBounds()
 490:    * @see #useAnchoringBounds(boolean)
 491:    * @since 1.5
 492:    */
 493:   public Matcher region(int start, int end)
 494:   {
 495:     int length = input.length();
 496:     if (start < 0)
 497:       throw new IndexOutOfBoundsException("The start position was less than zero.");
 498:     if (start >= length)
 499:       throw new IndexOutOfBoundsException("The start position is after the end of the input.");
 500:     if (end < 0)
 501:       throw new IndexOutOfBoundsException("The end position was less than zero.");
 502:     if (end > length)
 503:       throw new IndexOutOfBoundsException("The end position is after the end of the input.");
 504:     if (start > end)
 505:       throw new IndexOutOfBoundsException("The start position is after the end position.");
 506:     reset();
 507:     regionStart = start;
 508:     regionEnd = end;
 509:     return this;
 510:   }
 511: 
 512:   /**
 513:    * The start of the region on which to perform matches (inclusive).
 514:    *
 515:    * @return the start index of the region.
 516:    * @see #region(int,int)
 517:    * #see #regionEnd()
 518:    * @since 1.5
 519:    */
 520:   public int regionStart()
 521:   {
 522:     return regionStart;
 523:   }
 524: 
 525:   /**
 526:    * The end of the region on which to perform matches (exclusive).
 527:    *
 528:    * @return the end index of the region.
 529:    * @see #region(int,int)
 530:    * @see #regionStart()
 531:    * @since 1.5
 532:    */
 533:   public int regionEnd()
 534:   {
 535:     return regionEnd;
 536:   }
 537: 
 538:   /**
 539:    * Returns true if the bounds of the region marked by
 540:    * {@link #regionStart()} and {@link #regionEnd()} are
 541:    * transparent.  When these bounds are transparent, the
 542:    * matching process can look beyond them to perform
 543:    * lookahead, lookbehind and boundary matching operations.
 544:    * By default, the bounds are opaque.
 545:    *
 546:    * @return true if the bounds of the matching region are
 547:    *         transparent.
 548:    * @see #useTransparentBounds(boolean)
 549:    * @see #region(int,int)
 550:    * @see #regionStart()
 551:    * @see #regionEnd()
 552:    * @since 1.5
 553:    */
 554:   public boolean hasTransparentBounds()
 555:   {
 556:     return transparentBounds;
 557:   }
 558: 
 559:   /**
 560:    * Sets the transparency of the bounds of the region
 561:    * marked by {@link #regionStart()} and {@link #regionEnd()}.
 562:    * A value of {@code true} makes the bounds transparent,
 563:    * so the matcher can see beyond them to perform lookahead,
 564:    * lookbehind and boundary matching operations.  A value
 565:    * of {@code false} (the default) makes the bounds opaque,
 566:    * restricting the match to the input region denoted
 567:    * by {@link #regionStart()} and {@link #regionEnd()}.
 568:    *
 569:    * @param transparent true if the bounds should be transparent.
 570:    * @return a reference to this matcher.
 571:    * @see #hasTransparentBounds()
 572:    * @see #region(int,int)
 573:    * @see #regionStart()
 574:    * @see #regionEnd()
 575:    * @since 1.5
 576:    */
 577:   public Matcher useTransparentBounds(boolean transparent)
 578:   {
 579:     transparentBounds = transparent;
 580:     return this;
 581:   }
 582: 
 583:   /**
 584:    * Returns true if the matcher will honour the use of
 585:    * the anchoring bounds: {@code ^}, {@code \A}, {@code \Z},
 586:    * {@code \z} and {@code $}.  By default, the anchors
 587:    * are used.  Note that the effect of the anchors is
 588:    * also affected by {@link #hasTransparentBounds()}.
 589:    *
 590:    * @return true if the matcher will attempt to match
 591:    *         the anchoring bounds.
 592:    * @see #useAnchoringBounds(boolean)
 593:    * @see #hasTransparentBounds()
 594:    * @since 1.5
 595:    */
 596:   public boolean hasAnchoringBounds()
 597:   {
 598:     return anchoringBounds == 0;
 599:   }
 600: 
 601:   /**
 602:    * Enables or disables the use of the anchoring bounds:
 603:    * {@code ^}, {@code \A}, {@code \Z}, {@code \z} and
 604:    * {@code $}. By default, their use is enabled.  When
 605:    * disabled, the matcher will not attempt to match
 606:    * the anchors.
 607:    *
 608:    * @param useAnchors true if anchoring bounds should be used.
 609:    * @return a reference to this matcher.
 610:    * @since 1.5
 611:    * @see #hasAnchoringBounds()
 612:    */
 613:   public Matcher useAnchoringBounds(boolean useAnchors)
 614:   {
 615:     if (useAnchors)
 616:       anchoringBounds = 0;
 617:     else
 618:       anchoringBounds = RE.REG_NOTBOL|RE.REG_NOTEOL;
 619:     return this;
 620:   }
 621: 
 622:   /**
 623:    * Returns a read-only snapshot of the current state of
 624:    * the {@link Matcher} as a {@link MatchResult}.  Any
 625:    * subsequent changes to this instance are not reflected
 626:    * in the returned {@link MatchResult}.
 627:    *
 628:    * @return a {@link MatchResult} instance representing the
 629:    *         current state of the {@link Matcher}.
 630:    */
 631:   public MatchResult toMatchResult()
 632:   {
 633:     Matcher snapshot = new Matcher(pattern, input);
 634:     if (match != null)
 635:       snapshot.match = (REMatch) match.clone();
 636:     return snapshot;
 637:   }
 638: 
 639:   /**
 640:    * Returns a literalized string of s where characters {@code $} and {@code
 641:    * \\} are escaped.
 642:    *
 643:    * @param s the string to literalize.
 644:    * @return the literalized string.
 645:    * @since 1.5
 646:    */
 647:   public static String quoteReplacement(String s)
 648:   {
 649:     if (s == null)
 650:       throw new NullPointerException();
 651:     CPStringBuilder sb = new CPStringBuilder();
 652:     for (int i = 0; i < s.length(); i++)
 653:     {
 654:       char ch = s.charAt(i);
 655:       if (ch == '$' || ch == '\\')
 656:         sb.append('\\');
 657:       sb.append(ch);
 658:     }
 659:     return sb.toString();
 660:   }
 661: 
 662: }