Source for gnu.java.util.regex.RE

   1: /* gnu/regexp/RE.java
   2:    Copyright (C) 2006 Free Software Foundation, Inc.
   3: 
   4: This file is part of GNU Classpath.
   5: 
   6: GNU Classpath is free software; you can redistribute it and/or modify
   7: it under the terms of the GNU General Public License as published by
   8: the Free Software Foundation; either version 2, or (at your option)
   9: any later version.
  10: 
  11: GNU Classpath is distributed in the hope that it will be useful, but
  12: WITHOUT ANY WARRANTY; without even the implied warranty of
  13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14: General Public License for more details.
  15: 
  16: You should have received a copy of the GNU General Public License
  17: along with GNU Classpath; see the file COPYING.  If not, write to the
  18: Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  19: 02110-1301 USA.
  20: 
  21: Linking this library statically or dynamically with other modules is
  22: making a combined work based on this library.  Thus, the terms and
  23: conditions of the GNU General Public License cover the whole
  24: combination.
  25: 
  26: As a special exception, the copyright holders of this library give you
  27: permission to link this library with independent modules to produce an
  28: executable, regardless of the license terms of these independent
  29: modules, and to copy and distribute the resulting executable under
  30: terms of your choice, provided that you also meet, for each linked
  31: independent module, the terms and conditions of the license of that
  32: module.  An independent module is a module which is not derived from
  33: or based on this library.  If you modify this library, you may extend
  34: this exception to your version of the library, but you are not
  35: obligated to do so.  If you do not wish to do so, delete this
  36: exception statement from your version. */
  37: 
  38: package gnu.java.util.regex;
  39: 
  40: import gnu.java.lang.CPStringBuilder;
  41: 
  42: import java.io.InputStream;
  43: import java.io.Serializable;
  44: 
  45: import java.util.ArrayList;
  46: import java.util.List;
  47: import java.util.Locale;
  48: import java.util.PropertyResourceBundle;
  49: import java.util.ResourceBundle;
  50: 
  51: /**
  52:  * RE provides the user interface for compiling and matching regular
  53:  * expressions.
  54:  * <P>
  55:  * A regular expression object (class RE) is compiled by constructing it
  56:  * from a String, StringBuffer or character array, with optional
  57:  * compilation flags (below)
  58:  * and an optional syntax specification (see RESyntax; if not specified,
  59:  * <code>RESyntax.RE_SYNTAX_PERL5</code> is used).
  60:  * <P>
  61:  * Once compiled, a regular expression object is reusable as well as
  62:  * threadsafe: multiple threads can use the RE instance simultaneously
  63:  * to match against different input text.
  64:  * <P>
  65:  * Various methods attempt to match input text against a compiled
  66:  * regular expression.  These methods are:
  67:  * <LI><code>isMatch</code>: returns true if the input text in its
  68:  * entirety matches the regular expression pattern.
  69:  * <LI><code>getMatch</code>: returns the first match found in the
  70:  * input text, or null if no match is found.
  71:  * <LI><code>getAllMatches</code>: returns an array of all
  72:  * non-overlapping matches found in the input text.  If no matches are
  73:  * found, the array is zero-length.
  74:  * <LI><code>substitute</code>: substitute the first occurrence of the
  75:  * pattern in the input text with a replacement string (which may
  76:  * include metacharacters $0-$9, see REMatch.substituteInto).
  77:  * <LI><code>substituteAll</code>: same as above, but repeat for each
  78:  * match before returning.
  79:  * <LI><code>getMatchEnumeration</code>: returns an REMatchEnumeration
  80:  * object that allows iteration over the matches (see
  81:  * REMatchEnumeration for some reasons why you may want to do this
  82:  * instead of using <code>getAllMatches</code>).
  83:  * <P>
  84:  *
  85:  * These methods all have similar argument lists.  The input can be a
  86:  * CharIndexed, String, a character array, a StringBuffer, or an
  87:  * InputStream of some sort.  Note that when using an
  88:  * InputStream, the stream read position cannot be guaranteed after
  89:  * attempting a match (this is not a bug, but a consequence of the way
  90:  * regular expressions work).  Using an REMatchEnumeration can
  91:  * eliminate most positioning problems.
  92:  *
  93:  * Although the input object can be of various types, it is recommended
  94:  * that it should be a CharIndexed because {@link CharIndexed#getLastMatch()}
  95:  * can show the last match found on this input, which helps the expression
  96:  * \G work as the end of the previous match.
  97:  *
  98:  * <P>
  99:  *
 100:  * The optional index argument specifies the offset from the beginning
 101:  * of the text at which the search should start (see the descriptions
 102:  * of some of the execution flags for how this can affect positional
 103:  * pattern operators).  For an InputStream, this means an
 104:  * offset from the current read position, so subsequent calls with the
 105:  * same index argument on an InputStream will not
 106:  * necessarily access the same position on the stream, whereas
 107:  * repeated searches at a given index in a fixed string will return
 108:  * consistent results.
 109:  *
 110:  * <P>
 111:  * You can optionally affect the execution environment by using a
 112:  * combination of execution flags (constants listed below).
 113:  *
 114:  * <P>
 115:  * All operations on a regular expression are performed in a
 116:  * thread-safe manner.
 117:  *
 118:  * @author <A HREF="mailto:wes@cacas.org">Wes Biggs</A>
 119:  * @version 1.1.5-dev, to be released
 120:  */
 121: 
 122: public class RE extends REToken
 123: {
 124: 
 125:   private static final class IntPair implements Serializable
 126:   {
 127:     public int first, second;
 128:   }
 129: 
 130:   private static final class CharUnit implements Serializable
 131:   {
 132:     public char ch;
 133:     public boolean bk;
 134:   }
 135: 
 136:   // This String will be returned by getVersion()
 137:   private static final String VERSION = "1.1.5-dev";
 138: 
 139:   // The localized strings are kept in a separate file
 140:   // Used by getLocalizedMessage().
 141:   private static ResourceBundle messages;
 142: 
 143:   // Name of the bundle that contains the localized messages.
 144:   private static final String bundle = "gnu/java/util/regex/MessagesBundle";
 145: 
 146:   // These are, respectively, the first and last tokens in our linked list
 147:   // If there is only one token, firstToken == lastToken
 148:   private REToken firstToken, lastToken;
 149: 
 150:   // This is the number of subexpressions in this regular expression,
 151:   // with a minimum value of zero.  Returned by getNumSubs()
 152:   private int numSubs;
 153: 
 154:     /** Minimum length, in characters, of any possible match. */
 155:   private int minimumLength;
 156:   private int maximumLength;
 157: 
 158:   /**
 159:    * Compilation flag. Do  not  differentiate  case.   Subsequent
 160:    * searches  using  this  RE will be case insensitive.
 161:    */
 162:   public static final int REG_ICASE = 0x02;
 163: 
 164:   /**
 165:    * Compilation flag. The match-any-character operator (dot)
 166:    * will match a newline character.  When set this overrides the syntax
 167:    * bit RE_DOT_NEWLINE (see RESyntax for details).  This is equivalent to
 168:    * the "/s" operator in Perl.
 169:    */
 170:   public static final int REG_DOT_NEWLINE = 0x04;
 171: 
 172:   /**
 173:    * Compilation flag. Use multiline mode.  In this mode, the ^ and $
 174:    * anchors will match based on newlines within the input. This is
 175:    * equivalent to the "/m" operator in Perl.
 176:    */
 177:   public static final int REG_MULTILINE = 0x08;
 178: 
 179:   /**
 180:    * Execution flag.
 181:    * The match-beginning operator (^) will not match at the beginning
 182:    * of the input string. Useful for matching on a substring when you
 183:    * know the context of the input is such that position zero of the
 184:    * input to the match test is not actually position zero of the text.
 185:    * <P>
 186:    * This example demonstrates the results of various ways of matching on
 187:    * a substring.
 188:    * <P>
 189:    * <CODE>
 190:    * String s = "food bar fool";<BR>
 191:    * RE exp = new RE("^foo.");<BR>
 192:    * REMatch m0 = exp.getMatch(s);<BR>
 193:    * REMatch m1 = exp.getMatch(s.substring(8));<BR>
 194:    * REMatch m2 = exp.getMatch(s.substring(8),0,RE.REG_NOTBOL); <BR>
 195:    * REMatch m3 = exp.getMatch(s,8);                            <BR>
 196:    * REMatch m4 = exp.getMatch(s,8,RE.REG_ANCHORINDEX);         <BR>
 197:    * <P>
 198:    * // Results:<BR>
 199:    * //  m0.toString(): "food"<BR>
 200:    * //  m1.toString(): "fool"<BR>
 201:    * //  m2.toString(): null<BR>
 202:    * //  m3.toString(): null<BR>
 203:    * //  m4.toString(): "fool"<BR>
 204:    * </CODE>
 205:    */
 206:   public static final int REG_NOTBOL = 0x10;
 207: 
 208:   /**
 209:    * Execution flag.
 210:    * The match-end operator ($) does not match at the end
 211:    * of the input string. Useful for matching on substrings.
 212:    */
 213:   public static final int REG_NOTEOL = 0x20;
 214: 
 215:   /**
 216:    * Execution flag.
 217:    * When a match method is invoked that starts matching at a non-zero
 218:    * index into the input, treat the input as if it begins at the index
 219:    * given.  The effect of this flag is that the engine does not "see"
 220:    * any text in the input before the given index.  This is useful so
 221:    * that the match-beginning operator (^) matches not at position 0
 222:    * in the input string, but at the position the search started at
 223:    * (based on the index input given to the getMatch function).  See
 224:    * the example under REG_NOTBOL.  It also affects the use of the \&lt;
 225:    * and \b operators.
 226:    */
 227:   public static final int REG_ANCHORINDEX = 0x40;
 228: 
 229:   /**
 230:    * Execution flag.
 231:    * The substitute and substituteAll methods will not attempt to
 232:    * interpolate occurrences of $1-$9 in the replacement text with
 233:    * the corresponding subexpressions.  For example, you may want to
 234:    * replace all matches of "one dollar" with "$1".
 235:    */
 236:   public static final int REG_NO_INTERPOLATE = 0x80;
 237: 
 238:   /**
 239:    * Execution flag.
 240:    * Try to match the whole input string. An implicit match-end operator
 241:    * is added to this regexp.
 242:    */
 243:   public static final int REG_TRY_ENTIRE_MATCH = 0x0100;
 244: 
 245:   /**
 246:    * Execution flag.
 247:    * The substitute and substituteAll methods will treat the
 248:    * character '\' in the replacement as an escape to a literal
 249:    * character. In this case "\n", "\$", "\\", "\x40" and "\012"
 250:    * will become "n", "$", "\", "x40" and "012" respectively.
 251:    * This flag has no effect if REG_NO_INTERPOLATE is set on.
 252:    */
 253:   public static final int REG_REPLACE_USE_BACKSLASHESCAPE = 0x0200;
 254: 
 255:   /**
 256:    * Compilation flag. Allow whitespace and comments in pattern.
 257:    * This is equivalent to the "/x" operator in Perl.
 258:    */
 259:   public static final int REG_X_COMMENTS = 0x0400;
 260: 
 261:   /**
 262:    * Compilation flag. If set, REG_ICASE is effective only for US-ASCII.
 263:    */
 264:   public static final int REG_ICASE_USASCII = 0x0800;
 265: 
 266:   /**
 267:    * Execution flag.
 268:    * Do not move the position at which the search begins.  If not set,
 269:    * the starting position will be moved until a match is found.
 270:    */
 271:   public static final int REG_FIX_STARTING_POSITION = 0x1000;
 272: 
 273:   /** Returns a string representing the version of the gnu.regexp package. */
 274:   public static final String version ()
 275:   {
 276:     return VERSION;
 277:   }
 278: 
 279:   // Retrieves a message from the ResourceBundle
 280:   static final String getLocalizedMessage (String key)
 281:   {
 282:     if (messages == null)
 283:       messages =
 284:         PropertyResourceBundle.getBundle (bundle, Locale.getDefault ());
 285:     return messages.getString (key);
 286:   }
 287: 
 288:   /**
 289:    * Constructs a regular expression pattern buffer without any compilation
 290:    * flags set, and using the default syntax (RESyntax.RE_SYNTAX_PERL5).
 291:    *
 292:    * @param pattern A regular expression pattern, in the form of a String,
 293:    *   StringBuffer or char[].  Other input types will be converted to
 294:    *   strings using the toString() method.
 295:    * @exception REException The input pattern could not be parsed.
 296:    * @exception NullPointerException The pattern was null.
 297:    */
 298:   public RE (Object pattern) throws REException
 299:   {
 300:     this (pattern, 0, RESyntax.RE_SYNTAX_PERL5, 0, 0);
 301:   }
 302: 
 303:   /**
 304:    * Constructs a regular expression pattern buffer using the specified
 305:    * compilation flags and the default syntax (RESyntax.RE_SYNTAX_PERL5).
 306:    *
 307:    * @param pattern A regular expression pattern, in the form of a String,
 308:    *   StringBuffer, or char[].  Other input types will be converted to
 309:    *   strings using the toString() method.
 310:    * @param cflags The logical OR of any combination of the compilation flags listed above.
 311:    * @exception REException The input pattern could not be parsed.
 312:    * @exception NullPointerException The pattern was null.
 313:    */
 314:   public RE (Object pattern, int cflags) throws REException
 315:   {
 316:     this (pattern, cflags, RESyntax.RE_SYNTAX_PERL5, 0, 0);
 317:   }
 318: 
 319:   /**
 320:    * Constructs a regular expression pattern buffer using the specified
 321:    * compilation flags and regular expression syntax.
 322:    *
 323:    * @param pattern A regular expression pattern, in the form of a String,
 324:    *   StringBuffer, or char[].  Other input types will be converted to
 325:    *   strings using the toString() method.
 326:    * @param cflags The logical OR of any combination of the compilation flags listed above.
 327:    * @param syntax The type of regular expression syntax to use.
 328:    * @exception REException The input pattern could not be parsed.
 329:    * @exception NullPointerException The pattern was null.
 330:    */
 331:   public RE (Object pattern, int cflags, RESyntax syntax) throws REException
 332:   {
 333:     this (pattern, cflags, syntax, 0, 0);
 334:   }
 335: 
 336:   // internal constructor used for alternation
 337:   private RE (REToken first, REToken last, int subs, int subIndex,
 338:               int minLength, int maxLength)
 339:   {
 340:     super (subIndex);
 341:     firstToken = first;
 342:     lastToken = last;
 343:     numSubs = subs;
 344:     minimumLength = minLength;
 345:     maximumLength = maxLength;
 346:     addToken (new RETokenEndSub (subIndex));
 347:   }
 348: 
 349:   private RE (Object patternObj, int cflags, RESyntax syntax, int myIndex,
 350:               int nextSub) throws REException
 351:   {
 352:     super (myIndex);            // Subexpression index of this token.
 353:     initialize (patternObj, cflags, syntax, myIndex, nextSub);
 354:   }
 355: 
 356:   // For use by subclasses
 357:   protected RE ()
 358:   {
 359:     super (0);
 360:   }
 361: 
 362:   // The meat of construction
 363:   protected void initialize (Object patternObj, int cflags, RESyntax syntax,
 364:                              int myIndex, int nextSub) throws REException
 365:   {
 366:     char[] pattern;
 367:     if (patternObj instanceof String)
 368:       {
 369:         pattern = ((String) patternObj).toCharArray ();
 370:       }
 371:     else if (patternObj instanceof char[])
 372:       {
 373:         pattern = (char[]) patternObj;
 374:       }
 375:     else if (patternObj instanceof StringBuffer)
 376:       {
 377:         pattern = new char[((StringBuffer) patternObj).length ()];
 378:         ((StringBuffer) patternObj).getChars (0, pattern.length, pattern, 0);
 379:       }
 380:     else if (patternObj instanceof StringBuilder)
 381:       {
 382:         pattern = new char[((StringBuilder) patternObj).length ()];
 383:         ((StringBuilder) patternObj).getChars (0, pattern.length, pattern, 0);
 384:       }
 385:     else if (patternObj instanceof CPStringBuilder)
 386:       {
 387:         pattern = new char[((CPStringBuilder) patternObj).length ()];
 388:         ((CPStringBuilder) patternObj).getChars (0, pattern.length, pattern,
 389:                                                  0);
 390:       }
 391:     else
 392:       {
 393:         pattern = patternObj.toString ().toCharArray ();
 394:       }
 395: 
 396:     int pLength = pattern.length;
 397: 
 398:     numSubs = 0;                // Number of subexpressions in this token.
 399:     ArrayList < REToken > branches = null;
 400: 
 401:     // linked list of tokens (sort of -- some closed loops can exist)
 402:     firstToken = lastToken = null;
 403: 
 404:     // Precalculate these so we don't pay for the math every time we
 405:     // need to access them.
 406:     boolean insens = ((cflags & REG_ICASE) > 0);
 407:     boolean insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
 408: 
 409:     // Parse pattern into tokens.  Does anyone know if it's more efficient
 410:     // to use char[] than a String.charAt()?  I'm assuming so.
 411: 
 412:     // index tracks the position in the char array
 413:     int index = 0;
 414: 
 415:     // this will be the current parse character (pattern[index])
 416:     CharUnit unit = new CharUnit ();
 417: 
 418:     // This is used for {x,y} calculations
 419:     IntPair minMax = new IntPair ();
 420: 
 421:     // Buffer a token so we can create a TokenRepeated, etc.
 422:     REToken currentToken = null;
 423:     boolean quot = false;
 424: 
 425:     // Saved syntax and flags.
 426:     RESyntax savedSyntax = null;
 427:     int savedCflags = 0;
 428:     boolean flagsSaved = false;
 429: 
 430:     while (index < pLength)
 431:       {
 432:         // read the next character unit (including backslash escapes)
 433:         index = getCharUnit (pattern, index, unit, quot);
 434: 
 435:         if (unit.bk)
 436:           if (unit.ch == 'Q')
 437:             {
 438:               quot = true;
 439:               continue;
 440:             }
 441:           else if (unit.ch == 'E')
 442:             {
 443:               quot = false;
 444:               continue;
 445:             }
 446:         if (quot)
 447:           unit.bk = false;
 448: 
 449:         if (((cflags & REG_X_COMMENTS) > 0) && (!unit.bk) && (!quot))
 450:           {
 451:             if (Character.isWhitespace (unit.ch))
 452:               {
 453:                 continue;
 454:               }
 455:             if (unit.ch == '#')
 456:               {
 457:                 for (int i = index; i < pLength; i++)
 458:                   {
 459:                     if (pattern[i] == '\n')
 460:                       {
 461:                         index = i + 1;
 462:                         continue;
 463:                       }
 464:                     else if (pattern[i] == '\r')
 465:                       {
 466:                         if (i + 1 < pLength && pattern[i + 1] == '\n')
 467:                           {
 468:                             index = i + 2;
 469:                           }
 470:                         else
 471:                           {
 472:                             index = i + 1;
 473:                           }
 474:                         continue;
 475:                       }
 476:                   }
 477:                 index = pLength;
 478:                 continue;
 479:               }
 480:           }
 481: 
 482:         // ALTERNATION OPERATOR
 483:         //  \| or | (if RE_NO_BK_VBAR) or newline (if RE_NEWLINE_ALT)
 484:         //  not available if RE_LIMITED_OPS is set
 485: 
 486:         // TODO: the '\n' literal here should be a test against REToken.newline,
 487:         // which unfortunately may be more than a single character.
 488:         if (((unit.ch == '|'
 489:               && (syntax.get (RESyntax.RE_NO_BK_VBAR) ^ (unit.bk || quot)))
 490:              || (syntax.get (RESyntax.RE_NEWLINE_ALT) && (unit.ch == '\n')
 491:                  && !(unit.bk || quot)))
 492:             && !syntax.get (RESyntax.RE_LIMITED_OPS))
 493:           {
 494:             // make everything up to here be a branch. create vector if nec.
 495:             addToken (currentToken);
 496:             RE theBranch =
 497:               new RE (firstToken, lastToken, numSubs, subIndex, minimumLength,
 498:                       maximumLength);
 499:             minimumLength = 0;
 500:             maximumLength = 0;
 501:             if (branches == null)
 502:               {
 503:                 branches = new ArrayList < REToken > ();
 504:               }
 505:             branches.add (theBranch);
 506:             firstToken = lastToken = currentToken = null;
 507:           }
 508: 
 509:         // INTERVAL OPERATOR:
 510:         //  {x} | {x,} | {x,y}  (RE_INTERVALS && RE_NO_BK_BRACES)
 511:         //  \{x\} | \{x,\} | \{x,y\} (RE_INTERVALS && !RE_NO_BK_BRACES)
 512:         //
 513:         // OPEN QUESTION:
 514:         //  what is proper interpretation of '{' at start of string?
 515:         //
 516:         // This method used to check "repeat.empty.token" to avoid such regexp
 517:         // as "(a*){2,}", but now "repeat.empty.token" is allowed.
 518: 
 519:         else if ((unit.ch == '{') && syntax.get (RESyntax.RE_INTERVALS)
 520:                  && (syntax.
 521:                      get (RESyntax.RE_NO_BK_BRACES) ^ (unit.bk || quot)))
 522:           {
 523:             int newIndex = getMinMax (pattern, index, minMax, syntax);
 524:             if (newIndex > index)
 525:               {
 526:                 if (minMax.first > minMax.second)
 527:                   throw new
 528:                     REException (getLocalizedMessage ("interval.order"),
 529:                                  REException.REG_BADRPT, newIndex);
 530:                 if (currentToken == null)
 531:                   throw new
 532:                     REException (getLocalizedMessage ("repeat.no.token"),
 533:                                  REException.REG_BADRPT, newIndex);
 534:                 if (currentToken instanceof RETokenRepeated)
 535:                   throw new
 536:                     REException (getLocalizedMessage ("repeat.chained"),
 537:                                  REException.REG_BADRPT, newIndex);
 538:                 if (currentToken instanceof RETokenWordBoundary
 539:                     || currentToken instanceof RETokenWordBoundary)
 540:                   throw new
 541:                     REException (getLocalizedMessage ("repeat.assertion"),
 542:                                  REException.REG_BADRPT, newIndex);
 543:                 index = newIndex;
 544:                 currentToken =
 545:                   setRepeated (currentToken, minMax.first, minMax.second,
 546:                                index);
 547:               }
 548:             else
 549:               {
 550:                 addToken (currentToken);
 551:                 currentToken = new RETokenChar (subIndex, unit.ch, insens);
 552:                 if (insensUSASCII)
 553:                   currentToken.unicodeAware = false;
 554:               }
 555:           }
 556: 
 557:         // LIST OPERATOR:
 558:         //  [...] | [^...]
 559: 
 560:         else if ((unit.ch == '[') && !(unit.bk || quot))
 561:           {
 562:             // Create a new RETokenOneOf
 563:             ParseCharClassResult result =
 564:               parseCharClass (subIndex, pattern, index, pLength, cflags,
 565:                               syntax, 0);
 566:             addToken (currentToken);
 567:             currentToken = result.token;
 568:             index = result.index;
 569:           }
 570: 
 571:         // SUBEXPRESSIONS
 572:         //  (...) | \(...\) depending on RE_NO_BK_PARENS
 573: 
 574:         else if ((unit.ch == '(')
 575:                  && (syntax.
 576:                      get (RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot)))
 577:           {
 578:             boolean pure = false;
 579:             boolean comment = false;
 580:             boolean lookAhead = false;
 581:             boolean lookBehind = false;
 582:             boolean independent = false;
 583:             boolean negativelh = false;
 584:             boolean negativelb = false;
 585:             if ((index + 1 < pLength) && (pattern[index] == '?'))
 586:               {
 587:                 switch (pattern[index + 1])
 588:                   {
 589:                   case '!':
 590:                     if (syntax.get (RESyntax.RE_LOOKAHEAD))
 591:                       {
 592:                         pure = true;
 593:                         negativelh = true;
 594:                         lookAhead = true;
 595:                         index += 2;
 596:                       }
 597:                     break;
 598:                   case '=':
 599:                     if (syntax.get (RESyntax.RE_LOOKAHEAD))
 600:                       {
 601:                         pure = true;
 602:                         lookAhead = true;
 603:                         index += 2;
 604:                       }
 605:                     break;
 606:                   case '<':
 607:                     // We assume that if the syntax supports look-ahead,
 608:                     // it also supports look-behind.
 609:                     if (syntax.get (RESyntax.RE_LOOKAHEAD))
 610:                       {
 611:                         index++;
 612:                         switch (pattern[index + 1])
 613:                           {
 614:                           case '!':
 615:                             pure = true;
 616:                             negativelb = true;
 617:                             lookBehind = true;
 618:                             index += 2;
 619:                             break;
 620:                           case '=':
 621:                             pure = true;
 622:                             lookBehind = true;
 623:                             index += 2;
 624:                           }
 625:                       }
 626:                     break;
 627:                   case '>':
 628:                     // We assume that if the syntax supports look-ahead,
 629:                     // it also supports independent group.
 630:                     if (syntax.get (RESyntax.RE_LOOKAHEAD))
 631:                       {
 632:                         pure = true;
 633:                         independent = true;
 634:                         index += 2;
 635:                       }
 636:                     break;
 637:                   case 'i':
 638:                   case 'd':
 639:                   case 'm':
 640:                   case 's':
 641:                   case 'u':
 642:                   case 'x':
 643:                   case '-':
 644:                     if (!syntax.get (RESyntax.RE_EMBEDDED_FLAGS))
 645:                       break;
 646:                     // Set or reset syntax flags.
 647:                     int flagIndex = index + 1;
 648:                     int endFlag = -1;
 649:                     RESyntax newSyntax = new RESyntax (syntax);
 650:                     int newCflags = cflags;
 651:                     boolean negate = false;
 652:                     while (flagIndex < pLength && endFlag < 0)
 653:                       {
 654:                         switch (pattern[flagIndex])
 655:                           {
 656:                           case 'i':
 657:                             if (negate)
 658:                               newCflags &= ~REG_ICASE;
 659:                             else
 660:                               newCflags |= REG_ICASE;
 661:                             flagIndex++;
 662:                             break;
 663:                           case 'd':
 664:                             if (negate)
 665:                               newSyntax.setLineSeparator (RESyntax.
 666:                                                           DEFAULT_LINE_SEPARATOR);
 667:                             else
 668:                               newSyntax.setLineSeparator ("\n");
 669:                             flagIndex++;
 670:                             break;
 671:                           case 'm':
 672:                             if (negate)
 673:                               newCflags &= ~REG_MULTILINE;
 674:                             else
 675:                               newCflags |= REG_MULTILINE;
 676:                             flagIndex++;
 677:                             break;
 678:                           case 's':
 679:                             if (negate)
 680:                               newCflags &= ~REG_DOT_NEWLINE;
 681:                             else
 682:                               newCflags |= REG_DOT_NEWLINE;
 683:                             flagIndex++;
 684:                             break;
 685:                           case 'u':
 686:                             if (negate)
 687:                               newCflags |= REG_ICASE_USASCII;
 688:                             else
 689:                               newCflags &= ~REG_ICASE_USASCII;
 690:                             flagIndex++;
 691:                             break;
 692:                           case 'x':
 693:                             if (negate)
 694:                               newCflags &= ~REG_X_COMMENTS;
 695:                             else
 696:                               newCflags |= REG_X_COMMENTS;
 697:                             flagIndex++;
 698:                             break;
 699:                           case '-':
 700:                             negate = true;
 701:                             flagIndex++;
 702:                             break;
 703:                           case ':':
 704:                           case ')':
 705:                             endFlag = pattern[flagIndex];
 706:                             break;
 707:                           default:
 708:                             throw new
 709:                               REException (getLocalizedMessage
 710:                                            ("repeat.no.token"),
 711:                                            REException.REG_BADRPT, index);
 712:                           }
 713:                       }
 714:                     if (endFlag == ')')
 715:                       {
 716:                         syntax = newSyntax;
 717:                         cflags = newCflags;
 718:                         insens = ((cflags & REG_ICASE) > 0);
 719:                         insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
 720:                         // This can be treated as though it were a comment.
 721:                         comment = true;
 722:                         index = flagIndex - 1;
 723:                         break;
 724:                       }
 725:                     if (endFlag == ':')
 726:                       {
 727:                         savedSyntax = syntax;
 728:                         savedCflags = cflags;
 729:                         flagsSaved = true;
 730:                         syntax = newSyntax;
 731:                         cflags = newCflags;
 732:                         insens = ((cflags & REG_ICASE) > 0);
 733:                         insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
 734:                         index = flagIndex - 1;
 735:                         // Fall through to the next case.
 736:                       }
 737:                     else
 738:                       {
 739:                         throw new
 740:                           REException (getLocalizedMessage
 741:                                        ("unmatched.paren"),
 742:                                        REException.REG_ESUBREG, index);
 743:                       }
 744:                   case ':':
 745:                     if (syntax.get (RESyntax.RE_PURE_GROUPING))
 746:                       {
 747:                         pure = true;
 748:                         index += 2;
 749:                       }
 750:                     break;
 751:                   case '#':
 752:                     if (syntax.get (RESyntax.RE_COMMENTS))
 753:                       {
 754:                         comment = true;
 755:                       }
 756:                     break;
 757:                   default:
 758:                     throw new
 759:                       REException (getLocalizedMessage ("repeat.no.token"),
 760:                                    REException.REG_BADRPT, index);
 761:                   }
 762:               }
 763: 
 764:             if (index >= pLength)
 765:               {
 766:                 throw new
 767:                   REException (getLocalizedMessage ("unmatched.paren"),
 768:                                REException.REG_ESUBREG, index);
 769:               }
 770: 
 771:             // find end of subexpression
 772:             int endIndex = index;
 773:             int nextIndex = index;
 774:             int nested = 0;
 775: 
 776:             while (((nextIndex =
 777:                      getCharUnit (pattern, endIndex, unit, false)) > 0)
 778:                    && !(nested == 0 && (unit.ch == ')')
 779:                         && (syntax.
 780:                             get (RESyntax.RE_NO_BK_PARENS) ^ (unit.bk
 781:                                                               || quot))))
 782:               {
 783:                 if ((endIndex = nextIndex) >= pLength)
 784:                   throw new
 785:                     REException (getLocalizedMessage ("subexpr.no.end"),
 786:                                  REException.REG_ESUBREG, nextIndex);
 787:                 else
 788:               if ((unit.ch == '[') && !(unit.bk || quot))
 789:                 {
 790:                   // I hate to do something similar to the LIST OPERATOR matters
 791:                   // above, but ...
 792:                   int listIndex = nextIndex;
 793:                   if (listIndex < pLength && pattern[listIndex] == '^')
 794:                     listIndex++;
 795:                   if (listIndex < pLength && pattern[listIndex] == ']')
 796:                     listIndex++;
 797:                   int listEndIndex = -1;
 798:                   int listNest = 0;
 799:                   while (listIndex < pLength && listEndIndex < 0)
 800:                     {
 801:                       switch (pattern[listIndex++])
 802:                         {
 803:                         case '\\':
 804:                           listIndex++;
 805:                           break;
 806:                         case '[':
 807:                           // Sun's API document says that regexp like "[a-d[m-p]]"
 808:                           // is legal. Even something like "[[[^]]]]" is accepted.
 809:                           listNest++;
 810:                           if (listIndex < pLength
 811:                               && pattern[listIndex] == '^')
 812:                             listIndex++;
 813:                           if (listIndex < pLength
 814:                               && pattern[listIndex] == ']')
 815:                             listIndex++;
 816:                           break;
 817:                         case ']':
 818:                           if (listNest == 0)
 819:                             listEndIndex = listIndex;
 820:                           listNest--;
 821:                           break;
 822:                         }
 823:                     }
 824:                   if (listEndIndex >= 0)
 825:                     {
 826:                       nextIndex = listEndIndex;
 827:                       if ((endIndex = nextIndex) >= pLength)
 828:                         throw new
 829:                           REException (getLocalizedMessage ("subexpr.no.end"),
 830:                                        REException.REG_ESUBREG, nextIndex);
 831:                       else
 832:                       continue;
 833:                     }
 834:                   throw new
 835:                     REException (getLocalizedMessage ("subexpr.no.end"),
 836:                                  REException.REG_ESUBREG, nextIndex);
 837:                 }
 838:               else if (unit.ch == '('
 839:                        && (syntax.
 840:                            get (RESyntax.RE_NO_BK_PARENS) ^ (unit.bk
 841:                                                              || quot)))
 842:                 nested++;
 843:               else if (unit.ch == ')'
 844:                        && (syntax.
 845:                            get (RESyntax.RE_NO_BK_PARENS) ^ (unit.bk
 846:                                                              || quot)))
 847:                 nested--;
 848:               }
 849: 
 850:             // endIndex is now position at a ')','\)'
 851:             // nextIndex is end of string or position after ')' or '\)'
 852: 
 853:             if (comment)
 854:               index = nextIndex;
 855:             else
 856:               {                 // not a comment
 857:                 // create RE subexpression as token.
 858:                 addToken (currentToken);
 859:                 if (!pure)
 860:                   {
 861:                     numSubs++;
 862:                   }
 863: 
 864:                 int useIndex = (pure || lookAhead || lookBehind
 865:                                 || independent) ? 0 : nextSub + numSubs;
 866:                 currentToken =
 867:                   new RE (String.valueOf (pattern, index, endIndex - index).
 868:                           toCharArray (), cflags, syntax, useIndex,
 869:                           nextSub + numSubs);
 870:                 numSubs += ((RE) currentToken).getNumSubs ();
 871: 
 872:                 if (lookAhead)
 873:                   {
 874:                     currentToken =
 875:                       new RETokenLookAhead (currentToken, negativelh);
 876:                   }
 877:                 else if (lookBehind)
 878:                   {
 879:                     currentToken =
 880:                       new RETokenLookBehind (currentToken, negativelb);
 881:                   }
 882:                 else if (independent)
 883:                   {
 884:                     currentToken = new RETokenIndependent (currentToken);
 885:                   }
 886: 
 887:                 index = nextIndex;
 888:                 if (flagsSaved)
 889:                   {
 890:                     syntax = savedSyntax;
 891:                     cflags = savedCflags;
 892:                     insens = ((cflags & REG_ICASE) > 0);
 893:                     insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
 894:                     flagsSaved = false;
 895:                   }
 896:               }                 // not a comment
 897:           }                     // subexpression
 898: 
 899:         // UNMATCHED RIGHT PAREN
 900:         // ) or \) throw exception if
 901:         // !syntax.get(RESyntax.RE_UNMATCHED_RIGHT_PAREN_ORD)
 902:         else if (!syntax.get (RESyntax.RE_UNMATCHED_RIGHT_PAREN_ORD)
 903:                  && ((unit.ch == ')')
 904:                      && (syntax.
 905:                          get (RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot))))
 906:           {
 907:             throw new REException (getLocalizedMessage ("unmatched.paren"),
 908:                                    REException.REG_EPAREN, index);
 909:           }
 910: 
 911:         // START OF LINE OPERATOR
 912:         //  ^
 913: 
 914:         else if ((unit.ch == '^') && !(unit.bk || quot))
 915:           {
 916:             addToken (currentToken);
 917:             currentToken = null;
 918:             RETokenStart token = null;
 919:             if ((cflags & REG_MULTILINE) > 0)
 920:               {
 921:                 String sep = syntax.getLineSeparator ();
 922:                 if (sep == null)
 923:                   {
 924:                     token = new RETokenStart (subIndex, null, true);
 925:                   }
 926:                 else
 927:                   {
 928:                     token = new RETokenStart (subIndex, sep);
 929:                   }
 930:               }
 931:             else
 932:               {
 933:                 token = new RETokenStart (subIndex, null);
 934:               }
 935:             addToken (token);
 936:           }
 937: 
 938:         // END OF LINE OPERATOR
 939:         //  $
 940: 
 941:         else if ((unit.ch == '$') && !(unit.bk || quot))
 942:           {
 943:             addToken (currentToken);
 944:             currentToken = null;
 945:             RETokenEnd token = null;
 946:             if ((cflags & REG_MULTILINE) > 0)
 947:               {
 948:                 String sep = syntax.getLineSeparator ();
 949:                 if (sep == null)
 950:                   {
 951:                     token = new RETokenEnd (subIndex, null, true);
 952:                   }
 953:                 else
 954:                   {
 955:                     token = new RETokenEnd (subIndex, sep);
 956:                   }
 957:               }
 958:             else
 959:               {
 960:                 token = new RETokenEnd (subIndex, null);
 961:               }
 962:             addToken (token);
 963:           }
 964: 
 965:         // MATCH-ANY-CHARACTER OPERATOR (except possibly newline and null)
 966:         //  .
 967: 
 968:         else if ((unit.ch == '.') && !(unit.bk || quot))
 969:           {
 970:             addToken (currentToken);
 971:             currentToken =
 972:               new RETokenAny (subIndex, syntax.get (RESyntax.RE_DOT_NEWLINE)
 973:                               || ((cflags & REG_DOT_NEWLINE) > 0),
 974:                               syntax.get (RESyntax.RE_DOT_NOT_NULL));
 975:           }
 976: 
 977:         // ZERO-OR-MORE REPEAT OPERATOR
 978:         //  *
 979:         //
 980:         // This method used to check "repeat.empty.token" to avoid such regexp
 981:         // as "(a*)*", but now "repeat.empty.token" is allowed.
 982: 
 983:         else if ((unit.ch == '*') && !(unit.bk || quot))
 984:           {
 985:             if (currentToken == null)
 986:               throw new REException (getLocalizedMessage ("repeat.no.token"),
 987:                                      REException.REG_BADRPT, index);
 988:             if (currentToken instanceof RETokenRepeated)
 989:               throw new REException (getLocalizedMessage ("repeat.chained"),
 990:                                      REException.REG_BADRPT, index);
 991:             if (currentToken instanceof RETokenWordBoundary
 992:                 || currentToken instanceof RETokenWordBoundary)
 993:               throw new REException (getLocalizedMessage ("repeat.assertion"),
 994:                                      REException.REG_BADRPT, index);
 995:             currentToken =
 996:               setRepeated (currentToken, 0, Integer.MAX_VALUE, index);
 997:           }
 998: 
 999:         // ONE-OR-MORE REPEAT OPERATOR / POSSESSIVE MATCHING OPERATOR
1000:         //  + | \+ depending on RE_BK_PLUS_QM
1001:         //  not available if RE_LIMITED_OPS is set
1002:         //
1003:         // This method used to check "repeat.empty.token" to avoid such regexp
1004:         // as "(a*)+", but now "repeat.empty.token" is allowed.
1005: 
1006:         else if ((unit.ch == '+') && !syntax.get (RESyntax.RE_LIMITED_OPS)
1007:                  && (!syntax.
1008:                      get (RESyntax.RE_BK_PLUS_QM) ^ (unit.bk || quot)))
1009:           {
1010:             if (currentToken == null)
1011:               throw new REException (getLocalizedMessage ("repeat.no.token"),
1012:                                      REException.REG_BADRPT, index);
1013: 
1014:             // Check for possessive matching on RETokenRepeated
1015:             if (currentToken instanceof RETokenRepeated)
1016:               {
1017:                 RETokenRepeated tokenRep = (RETokenRepeated) currentToken;
1018:                 if (syntax.get (RESyntax.RE_POSSESSIVE_OPS)
1019:                     && !tokenRep.isPossessive () && !tokenRep.isStingy ())
1020:                   tokenRep.makePossessive ();
1021:                 else
1022:                   throw new
1023:                     REException (getLocalizedMessage ("repeat.chained"),
1024:                                  REException.REG_BADRPT, index);
1025: 
1026:               }
1027:             else if (currentToken instanceof RETokenWordBoundary
1028:                      || currentToken instanceof RETokenWordBoundary)
1029:               throw new REException (getLocalizedMessage ("repeat.assertion"),
1030:                                      REException.REG_BADRPT, index);
1031:             else
1032:             currentToken =
1033:               setRepeated (currentToken, 1, Integer.MAX_VALUE, index);
1034:           }
1035: 
1036:         // ZERO-OR-ONE REPEAT OPERATOR / STINGY MATCHING OPERATOR
1037:         //  ? | \? depending on RE_BK_PLUS_QM
1038:         //  not available if RE_LIMITED_OPS is set
1039:         //  stingy matching if RE_STINGY_OPS is set and it follows a quantifier
1040: 
1041:         else if ((unit.ch == '?') && !syntax.get (RESyntax.RE_LIMITED_OPS)
1042:                  && (!syntax.
1043:                      get (RESyntax.RE_BK_PLUS_QM) ^ (unit.bk || quot)))
1044:           {
1045:             if (currentToken == null)
1046:               throw new REException (getLocalizedMessage ("repeat.no.token"),
1047:                                      REException.REG_BADRPT, index);
1048: 
1049:             // Check for stingy matching on RETokenRepeated
1050:             if (currentToken instanceof RETokenRepeated)
1051:               {
1052:                 RETokenRepeated tokenRep = (RETokenRepeated) currentToken;
1053:                 if (syntax.get (RESyntax.RE_STINGY_OPS)
1054:                     && !tokenRep.isStingy () && !tokenRep.isPossessive ())
1055:                   tokenRep.makeStingy ();
1056:                 else
1057:                   throw new
1058:                     REException (getLocalizedMessage ("repeat.chained"),
1059:                                  REException.REG_BADRPT, index);
1060:               }
1061:             else if (currentToken instanceof RETokenWordBoundary
1062:                      || currentToken instanceof RETokenWordBoundary)
1063:               throw new REException (getLocalizedMessage ("repeat.assertion"),
1064:                                      REException.REG_BADRPT, index);
1065:             else
1066:             currentToken = setRepeated (currentToken, 0, 1, index);
1067:           }
1068: 
1069:         // OCTAL CHARACTER
1070:         //  \0377
1071: 
1072:         else if (unit.bk && (unit.ch == '0')
1073:                  && syntax.get (RESyntax.RE_OCTAL_CHAR))
1074:           {
1075:             CharExpression ce =
1076:               getCharExpression (pattern, index - 2, pLength, syntax);
1077:             if (ce == null)
1078:               throw new REException ("invalid octal character",
1079:                                      REException.REG_ESCAPE, index);
1080:             index = index - 2 + ce.len;
1081:             addToken (currentToken);
1082:             currentToken = new RETokenChar (subIndex, ce.ch, insens);
1083:             if (insensUSASCII)
1084:               currentToken.unicodeAware = false;
1085:           }
1086: 
1087:         // BACKREFERENCE OPERATOR
1088:         //  \1 \2 ... \9 and \10 \11 \12 ...
1089:         // not available if RE_NO_BK_REFS is set
1090:         // Perl recognizes \10, \11, and so on only if enough number of
1091:         // parentheses have opened before it, otherwise they are treated
1092:         // as aliases of \010, \011, ... (octal characters).  In case of
1093:         // Sun's JDK, octal character expression must always begin with \0.
1094:         // We will do as JDK does. But FIXME, take a look at "(a)(b)\29".
1095:         // JDK treats \2 as a back reference to the 2nd group because
1096:         // there are only two groups. But in our poor implementation,
1097:         // we cannot help but treat \29 as a back reference to the 29th group.
1098: 
1099:         else if (unit.bk && Character.isDigit (unit.ch)
1100:                  && !syntax.get (RESyntax.RE_NO_BK_REFS))
1101:           {
1102:             addToken (currentToken);
1103:             int numBegin = index - 1;
1104:             int numEnd = pLength;
1105:             for (int i = index; i < pLength; i++)
1106:               {
1107:                 if (!Character.isDigit (pattern[i]))
1108:                   {
1109:                     numEnd = i;
1110:                     break;
1111:                   }
1112:               }
1113:             int num = parseInt (pattern, numBegin, numEnd - numBegin, 10);
1114: 
1115:             currentToken = new RETokenBackRef (subIndex, num, insens);
1116:             if (insensUSASCII)
1117:               currentToken.unicodeAware = false;
1118:             index = numEnd;
1119:           }
1120: 
1121:         // START OF STRING OPERATOR
1122:         //  \A if RE_STRING_ANCHORS is set
1123: 
1124:         else if (unit.bk && (unit.ch == 'A')
1125:                  && syntax.get (RESyntax.RE_STRING_ANCHORS))
1126:           {
1127:             addToken (currentToken);
1128:             currentToken = new RETokenStart (subIndex, null);
1129:           }
1130: 
1131:         // WORD BREAK OPERATOR
1132:         //  \b if ????
1133: 
1134:         else if (unit.bk && (unit.ch == 'b')
1135:                  && syntax.get (RESyntax.RE_STRING_ANCHORS))
1136:           {
1137:             addToken (currentToken);
1138:             currentToken =
1139:               new RETokenWordBoundary (subIndex,
1140:                                        RETokenWordBoundary.
1141:                                        BEGIN | RETokenWordBoundary.END,
1142:                                        false);
1143:           }
1144: 
1145:         // WORD BEGIN OPERATOR
1146:         //  \< if ????
1147:         else if (unit.bk && (unit.ch == '<'))
1148:           {
1149:             addToken (currentToken);
1150:             currentToken =
1151:               new RETokenWordBoundary (subIndex, RETokenWordBoundary.BEGIN,
1152:                                        false);
1153:           }
1154: 
1155:         // WORD END OPERATOR
1156:         //  \> if ????
1157:         else if (unit.bk && (unit.ch == '>'))
1158:           {
1159:             addToken (currentToken);
1160:             currentToken =
1161:               new RETokenWordBoundary (subIndex, RETokenWordBoundary.END,
1162:                                        false);
1163:           }
1164: 
1165:         // NON-WORD BREAK OPERATOR
1166:         // \B if ????
1167: 
1168:         else if (unit.bk && (unit.ch == 'B')
1169:                  && syntax.get (RESyntax.RE_STRING_ANCHORS))
1170:           {
1171:             addToken (currentToken);
1172:             currentToken =
1173:               new RETokenWordBoundary (subIndex,
1174:                                        RETokenWordBoundary.
1175:                                        BEGIN | RETokenWordBoundary.END, true);
1176:           }
1177: 
1178: 
1179:         // DIGIT OPERATOR
1180:         //  \d if RE_CHAR_CLASS_ESCAPES is set
1181: 
1182:         else if (unit.bk && (unit.ch == 'd')
1183:                  && syntax.get (RESyntax.RE_CHAR_CLASS_ESCAPES))
1184:           {
1185:             addToken (currentToken);
1186:             currentToken =
1187:               new RETokenPOSIX (subIndex, RETokenPOSIX.DIGIT, insens, false);
1188:             if (insensUSASCII)
1189:               currentToken.unicodeAware = false;
1190:           }
1191: 
1192:         // NON-DIGIT OPERATOR
1193:         //  \D
1194: 
1195:         else if (unit.bk && (unit.ch == 'D')
1196:                  && syntax.get (RESyntax.RE_CHAR_CLASS_ESCAPES))
1197:           {
1198:             addToken (currentToken);
1199:             currentToken =
1200:               new RETokenPOSIX (subIndex, RETokenPOSIX.DIGIT, insens, true);
1201:             if (insensUSASCII)
1202:               currentToken.unicodeAware = false;
1203:           }
1204: 
1205:         // NEWLINE ESCAPE
1206:         //  \n
1207: 
1208:         else if (unit.bk && (unit.ch == 'n'))
1209:           {
1210:             addToken (currentToken);
1211:             currentToken = new RETokenChar (subIndex, '\n', false);
1212:           }
1213: 
1214:         // RETURN ESCAPE
1215:         //  \r
1216: 
1217:         else if (unit.bk && (unit.ch == 'r'))
1218:           {
1219:             addToken (currentToken);
1220:             currentToken = new RETokenChar (subIndex, '\r', false);
1221:           }
1222: 
1223:         // WHITESPACE OPERATOR
1224:         //  \s if RE_CHAR_CLASS_ESCAPES is set
1225: 
1226:         else if (unit.bk && (unit.ch == 's')
1227:                  && syntax.get (RESyntax.RE_CHAR_CLASS_ESCAPES))
1228:           {
1229:             addToken (currentToken);
1230:             currentToken =
1231:               new RETokenPOSIX (subIndex, RETokenPOSIX.SPACE, insens, false);
1232:             if (insensUSASCII)
1233:               currentToken.unicodeAware = false;
1234:           }
1235: 
1236:         // NON-WHITESPACE OPERATOR
1237:         //  \S
1238: 
1239:         else if (unit.bk && (unit.ch == 'S')
1240:                  && syntax.get (RESyntax.RE_CHAR_CLASS_ESCAPES))
1241:           {
1242:             addToken (currentToken);
1243:             currentToken =
1244:               new RETokenPOSIX (subIndex, RETokenPOSIX.SPACE, insens, true);
1245:             if (insensUSASCII)
1246:               currentToken.unicodeAware = false;
1247:           }
1248: 
1249:         // TAB ESCAPE
1250:         //  \t
1251: 
1252:         else if (unit.bk && (unit.ch == 't'))
1253:           {
1254:             addToken (currentToken);
1255:             currentToken = new RETokenChar (subIndex, '\t', false);
1256:           }
1257: 
1258:         // ALPHANUMERIC OPERATOR
1259:         //  \w
1260: 
1261:         else if (unit.bk && (unit.ch == 'w')
1262:                  && syntax.get (RESyntax.RE_CHAR_CLASS_ESCAPES))
1263:           {
1264:             addToken (currentToken);
1265:             currentToken =
1266:               new RETokenPOSIX (subIndex, RETokenPOSIX.ALNUM, insens, false);
1267:             if (insensUSASCII)
1268:               currentToken.unicodeAware = false;
1269:           }
1270: 
1271:         // NON-ALPHANUMERIC OPERATOR
1272:         //  \W
1273: 
1274:         else if (unit.bk && (unit.ch == 'W')
1275:                  && syntax.get (RESyntax.RE_CHAR_CLASS_ESCAPES))
1276:           {
1277:             addToken (currentToken);
1278:             currentToken =
1279:               new RETokenPOSIX (subIndex, RETokenPOSIX.ALNUM, insens, true);
1280:             if (insensUSASCII)
1281:               currentToken.unicodeAware = false;
1282:           }
1283: 
1284:         // END OF STRING OPERATOR
1285:         //  \Z, \z
1286: 
1287:         // FIXME: \Z and \z are different in that if the input string
1288:         // ends with a line terminator, \Z matches the position before
1289:         // the final terminator.  This special behavior of \Z is yet
1290:         // to be implemented.
1291: 
1292:         else if (unit.bk && (unit.ch == 'Z' || unit.ch == 'z') &&
1293:                  syntax.get (RESyntax.RE_STRING_ANCHORS))
1294:           {
1295:             addToken (currentToken);
1296:             currentToken = new RETokenEnd (subIndex, null);
1297:           }
1298: 
1299:         // HEX CHARACTER, UNICODE CHARACTER
1300:         //  \x1B, \u1234
1301: 
1302:         else
1303:           if ((unit.bk && (unit.ch == 'x')
1304:                && syntax.get (RESyntax.RE_HEX_CHAR)) || (unit.bk
1305:                                                          && (unit.ch == 'u')
1306:                                                          && syntax.
1307:                                                          get (RESyntax.
1308:                                                               RE_UNICODE_CHAR)))
1309:           {
1310:             CharExpression ce =
1311:               getCharExpression (pattern, index - 2, pLength, syntax);
1312:             if (ce == null)
1313:               throw new REException ("invalid hex character",
1314:                                      REException.REG_ESCAPE, index);
1315:             index = index - 2 + ce.len;
1316:             addToken (currentToken);
1317:             currentToken = new RETokenChar (subIndex, ce.ch, insens);
1318:             if (insensUSASCII)
1319:               currentToken.unicodeAware = false;
1320:           }
1321: 
1322:         // NAMED PROPERTY
1323:         // \p{prop}, \P{prop}
1324: 
1325:         else
1326:           if ((unit.bk && (unit.ch == 'p')
1327:                && syntax.get (RESyntax.RE_NAMED_PROPERTY)) || (unit.bk
1328:                                                                && (unit.ch ==
1329:                                                                    'P')
1330:                                                                && syntax.
1331:                                                                get (RESyntax.
1332:                                                                     RE_NAMED_PROPERTY)))
1333:           {
1334:             NamedProperty np = getNamedProperty (pattern, index - 2, pLength);
1335:             if (np == null)
1336:               throw new REException ("invalid escape sequence",
1337:                                      REException.REG_ESCAPE, index);
1338:             index = index - 2 + np.len;
1339:             addToken (currentToken);
1340:             currentToken =
1341:               getRETokenNamedProperty (subIndex, np, insens, index);
1342:             if (insensUSASCII)
1343:               currentToken.unicodeAware = false;
1344:           }
1345: 
1346:         // END OF PREVIOUS MATCH
1347:         //  \G
1348: 
1349:         else if (unit.bk && (unit.ch == 'G') &&
1350:                  syntax.get (RESyntax.RE_STRING_ANCHORS))
1351:           {
1352:             addToken (currentToken);
1353:             currentToken = new RETokenEndOfPreviousMatch (subIndex);
1354:           }
1355: 
1356:         // NON-SPECIAL CHARACTER (or escape to make literal)
1357:         //  c | \* for example
1358: 
1359:         else
1360:           {                     // not a special character
1361:             addToken (currentToken);
1362:             currentToken = new RETokenChar (subIndex, unit.ch, insens);
1363:             if (insensUSASCII)
1364:               currentToken.unicodeAware = false;
1365:           }
1366:       }                         // end while
1367: 
1368:     // Add final buffered token and an EndSub marker
1369:     addToken (currentToken);
1370: 
1371:     if (branches != null)
1372:       {
1373:         branches.
1374:           add (new
1375:                RE (firstToken, lastToken, numSubs, subIndex, minimumLength,
1376:                    maximumLength));
1377:         branches.trimToSize (); // compact the Vector
1378:         minimumLength = 0;
1379:         maximumLength = 0;
1380:         firstToken = lastToken = null;
1381:         addToken (new RETokenOneOf (subIndex, branches, false));
1382:       }
1383:     else
1384:       addToken (new RETokenEndSub (subIndex));
1385: 
1386:   }
1387: 
1388:   private static class ParseCharClassResult // Plain holder for the values handed back by parseCharClass ().
1389:   {
1390:     RETokenOneOf token; // Presumably the token built for the parsed [...] list; assigned outside this class -- confirm.
1391:     int index; // NOTE(review): looks like the pattern position where parsing stopped -- confirm in parseCharClass.
1392:     boolean returnAtAndOperator = false; // Off unless set. NOTE(review): presumably reports a return at an "and" operator (cf. RETURN_AT_AND) -- confirm.
1393:   }
1394: 
1395:   /**
1396:    * Parse [...] or [^...] and make an RETokenOneOf instance.
1397:    * @param subIndex subIndex to be given to the created RETokenOneOf instance.
1398:    * @param pattern Input array of characters to be parsed.
1399:    * @param index Index pointing to the character next to the beginning '['.
1400:    * @param pLength Limit of the input array.
1401:    * @param cflags Compilation flags used to parse the pattern.
1402:    * @param pflags Flags that affect the behavior of this method.
1403:    * @param syntax Syntax used to parse the pattern.
1404:    */
1405:   private static ParseCharClassResult parseCharClass (int subIndex,
1406:                                                       char[]pattern,
1407:                                                       int index, int pLength,
1408:                                                       int cflags,
1409:                                                       RESyntax syntax,
1410:                                                       int pflags) throws
1411:     REException
1412:   {
1413: 
1414:     boolean insens = ((cflags & REG_ICASE) > 0);
1415:     boolean insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
1416:     final ArrayList < REToken > options = new ArrayList < REToken > ();
1417:       ArrayList < Object > addition = new ArrayList < Object > ();
1418:     boolean additionAndAppeared = false;
1419:     final int RETURN_AT_AND = 0x01;
1420:     boolean returnAtAndOperator = ((pflags & RETURN_AT_AND) != 0);
1421:     boolean negative = false;
1422:     char ch;
1423: 
1424:     char lastChar = 0;
1425:     boolean lastCharIsSet = false;
1426:     if (index == pLength)
1427:       throw new REException (getLocalizedMessage ("unmatched.bracket"),
1428:                              REException.REG_EBRACK, index);
1429: 
1430:     // Check for initial caret, negation
1431:     if ((ch = pattern[index]) == '^')
1432:       {
1433:         negative = true;
1434:         if (++index == pLength)
1435:           throw new REException (getLocalizedMessage ("class.no.end"),
1436:                                  REException.REG_EBRACK, index);
1437:           ch = pattern[index];
1438:       }
1439: 
1440:     // Check for leading right bracket literal
1441:     if (ch == ']')
1442:       {
1443:         lastChar = ch;
1444:         lastCharIsSet = true;
1445:         if (++index == pLength)
1446:           throw new REException (getLocalizedMessage ("class.no.end"),
1447:                                  REException.REG_EBRACK, index);
1448:       }
1449: 
1450:     while ((ch = pattern[index++]) != ']')
1451:       {
1452:         if ((ch == '-') && (lastCharIsSet))
1453:           {
1454:             if (index == pLength)
1455:               throw new REException (getLocalizedMessage ("class.no.end"),
1456:                                      REException.REG_EBRACK, index);
1457:             if ((ch = pattern[index]) == ']')
1458:               {
1459:                 RETokenChar t = new RETokenChar (subIndex, lastChar, insens);
1460:                 if (insensUSASCII)
1461:                   t.unicodeAware = false;
1462:                 options.add (t);
1463:                 lastChar = '-';
1464:               }
1465:             else
1466:               {
1467:                 if ((ch == '\\')
1468:                     && syntax.get (RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS))
1469:                   {
1470:                     CharExpression ce =
1471:                       getCharExpression (pattern, index, pLength, syntax);
1472:                     if (ce == null)
1473:                       throw new REException ("invalid escape sequence",
1474:                                              REException.REG_ESCAPE, index);
1475:                     ch = ce.ch;
1476:                     index = index + ce.len - 1;
1477:                   }
1478:                 RETokenRange t =
1479:                   new RETokenRange (subIndex, lastChar, ch, insens);
1480:                 if (insensUSASCII)
1481:                   t.unicodeAware = false;
1482:                 options.add (t);
1483:                 lastChar = 0;
1484:                 lastCharIsSet = false;
1485:                 index++;
1486:               }
1487:           }
1488:         else if ((ch == '\\')
1489:                  && syntax.get (RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS))
1490:           {
1491:             if (index == pLength)
1492:               throw new REException (getLocalizedMessage ("class.no.end"),
1493:                                      REException.REG_EBRACK, index);
1494:             int posixID = -1;
1495:             boolean negate = false;
1496:             char asciiEsc = 0;
1497:             boolean asciiEscIsSet = false;
1498:             NamedProperty np = null;
1499:             if (("dswDSW".indexOf (pattern[index]) != -1)
1500:                 && syntax.get (RESyntax.RE_CHAR_CLASS_ESC_IN_LISTS))
1501:               {
1502:                 switch (pattern[index])
1503:                   {
1504:                   case 'D':
1505:                     negate = true;
1506:                   case 'd':
1507:                     posixID = RETokenPOSIX.DIGIT;
1508:                     break;
1509:                   case 'S':
1510:                     negate = true;
1511:                   case 's':
1512:                     posixID = RETokenPOSIX.SPACE;
1513:                     break;
1514:                   case 'W':
1515:                     negate = true;
1516:                   case 'w':
1517:                     posixID = RETokenPOSIX.ALNUM;
1518:                     break;
1519:                   }
1520:               }
1521:             if (("pP".indexOf (pattern[index]) != -1)
1522:                 && syntax.get (RESyntax.RE_NAMED_PROPERTY))
1523:               {
1524:                 np = getNamedProperty (pattern, index - 1, pLength);
1525:                 if (np == null)
1526:                   throw new REException ("invalid escape sequence",
1527:                                          REException.REG_ESCAPE, index);
1528:                 index = index - 1 + np.len - 1;
1529:               }
1530:             else
1531:               {
1532:                 CharExpression ce =
1533:                   getCharExpression (pattern, index - 1, pLength, syntax);
1534:                 if (ce == null)
1535:                   throw new REException ("invalid escape sequence",
1536:                                          REException.REG_ESCAPE, index);
1537:                 asciiEsc = ce.ch;
1538:                 asciiEscIsSet = true;
1539:                 index = index - 1 + ce.len - 1;
1540:               }
1541:             if (lastCharIsSet)
1542:               {
1543:                 RETokenChar t = new RETokenChar (subIndex, lastChar, insens);
1544:                 if (insensUSASCII)
1545:                   t.unicodeAware = false;
1546:                 options.add (t);
1547:               }
1548: 
1549:             if (posixID != -1)
1550:               {
1551:                 RETokenPOSIX t =
1552:                   new RETokenPOSIX (subIndex, posixID, insens, negate);
1553:                 if (insensUSASCII)
1554:                   t.unicodeAware = false;
1555:                 options.add (t);
1556:               }
1557:             else if (np != null)
1558:               {
1559:                 RETokenNamedProperty t =
1560:                   getRETokenNamedProperty (subIndex, np, insens, index);
1561:                 if (insensUSASCII)
1562:                   t.unicodeAware = false;
1563:                 options.add (t);
1564:               }
1565:             else if (asciiEscIsSet)
1566:               {
1567:                 lastChar = asciiEsc;
1568:                 lastCharIsSet = true;
1569:               }
1570:             else
1571:               {
1572:                 lastChar = pattern[index];
1573:                 lastCharIsSet = true;
1574:               }
1575:             ++index;
1576:           }
1577:         else if ((ch == '[') && (syntax.get (RESyntax.RE_CHAR_CLASSES))
1578:                  && (index < pLength) && (pattern[index] == ':'))
1579:           {
1580:             CPStringBuilder posixSet = new CPStringBuilder ();
1581:             index = getPosixSet (pattern, index + 1, posixSet);
1582:             int posixId = RETokenPOSIX.intValue (posixSet.toString ());
1583:             if (posixId != -1)
1584:               {
1585:                 RETokenPOSIX t =
1586:                   new RETokenPOSIX (subIndex, posixId, insens, false);
1587:                 if (insensUSASCII)
1588:                   t.unicodeAware = false;
1589:                 options.add (t);
1590:               }
1591:           }
1592:         else if ((ch == '[') && (syntax.get (RESyntax.RE_NESTED_CHARCLASS)))
1593:           {
1594:             ParseCharClassResult result =
1595:               parseCharClass (subIndex, pattern, index, pLength, cflags,
1596:                               syntax, 0);
1597:             addition.add (result.token);
1598:             addition.add ("|");
1599:             index = result.index;
1600:           }
1601:         else if ((ch == '&') &&
1602:                  (syntax.get (RESyntax.RE_NESTED_CHARCLASS)) &&
1603:                  (index < pLength) && (pattern[index] == '&'))
1604:           {
1605:             if (returnAtAndOperator)
1606:               {
1607:                 ParseCharClassResult result = new ParseCharClassResult ();
1608:                 options.trimToSize ();
1609:                 if (additionAndAppeared)
1610:                   addition.add ("&");
1611:                 if (addition.size () == 0)
1612:                   addition = null;
1613:                 result.token = new RETokenOneOf (subIndex,
1614:                                                  options, addition, negative);
1615:                 result.index = index - 1;
1616:                 result.returnAtAndOperator = true;
1617:                 return result;
1618:               }
1619:             // The precedence of the operator "&&" is the lowest.
1620:             // So we postpone adding "&" until other elements
1621:             // are added. And we insert Boolean.FALSE at the
1622:             // beginning of the list of tokens following "&&".
1623:             // So, "&&[a-b][k-m]" will be stored in the Vecter
1624:             // addition in this order:
1625:             //     Boolean.FALSE, [a-b], "|", [k-m], "|", "&"
1626:             if (additionAndAppeared)
1627:               addition.add ("&");
1628:             addition.add (Boolean.FALSE);
1629:             additionAndAppeared = true;
1630: 
1631:             // The part on which "&&" operates may be either
1632:             //   (1) explicitly enclosed by []
1633:             //   or
1634:             //   (2) not enclosed by [] and terminated by the
1635:             //       next "&&" or the end of the character list.
1636:             //  Let the preceding else if block do the case (1).
1637:             //  We must do something in case of (2).
1638:             if ((index + 1 < pLength) && (pattern[index + 1] != '['))
1639:               {
1640:                 ParseCharClassResult result =
1641:                   parseCharClass (subIndex, pattern, index + 1, pLength,
1642:                                   cflags, syntax,
1643:                                   RETURN_AT_AND);
1644:                 addition.add (result.token);
1645:                 addition.add ("|");
1646:                 // If the method returned at the next "&&", it is OK.
1647:                 // Otherwise we have eaten the mark of the end of this
1648:                 // character list "]".  In this case we must give back
1649:                 // the end mark.
1650:                 index = (result.returnAtAndOperator ?
1651:                          result.index : result.index - 1);
1652:               }
1653:           }
1654:         else
1655:           {
1656:             if (lastCharIsSet)
1657:               {
1658:                 RETokenChar t = new RETokenChar (subIndex, lastChar, insens);
1659:                 if (insensUSASCII)
1660:                   t.unicodeAware = false;
1661:                 options.add (t);
1662:               }
1663:             lastChar = ch;
1664:             lastCharIsSet = true;
1665:           }
1666:         if (index == pLength)
1667:           throw new REException (getLocalizedMessage ("class.no.end"),
1668:                                  REException.REG_EBRACK, index);
1669:       }                         // while in list
1670:     // Out of list, index is one past ']'
1671: 
1672:     if (lastCharIsSet)
1673:       {
1674:         RETokenChar t = new RETokenChar (subIndex, lastChar, insens);
1675:         if (insensUSASCII)
1676:           t.unicodeAware = false;
1677:         options.add (t);
1678:       }
1679: 
1680:     ParseCharClassResult result = new ParseCharClassResult ();
1681:     // Create a new RETokenOneOf
1682:     options.trimToSize ();
1683:     if (additionAndAppeared)
1684:       addition.add ("&");
1685:     if (addition.size () == 0)
1686:       addition = null;
1687:     result.token = new RETokenOneOf (subIndex, options, addition, negative);
1688:     result.index = index;
1689:     return result;
1690:   }
1691: 
1692:   private static int getCharUnit (char[]input, int index, CharUnit unit,
1693:                                   boolean quot) throws REException
1694:   {
1695:     unit.ch = input[index++];
1696:     unit.bk = (unit.ch == '\\'
1697:                && (!quot || index >= input.length || input[index] == 'E'));
1698:     if (unit.bk)
1699:       if (index < input.length)
1700:         unit.ch = input[index++];
1701:       else
1702:         throw new REException (getLocalizedMessage ("ends.with.backslash"),
1703:                                REException.REG_ESCAPE, index);
1704:     return index;
1705:   }
1706: 
1707:   private static int parseInt (char[]input, int pos, int len, int radix)
1708:   {
1709:     int ret = 0;
1710:     for (int i = pos; i < pos + len; i++)
1711:       {
1712:         ret = ret * radix + Character.digit (input[i], radix);
1713:       }
1714:     return ret;
1715:   }
1716: 
1717:   /**
1718:    * This class represents various expressions for a character.
1719:    * "a"      : 'a' itself.
1720:    * "\0123"  : Octal char 0123
1721:    * "\x1b"   : Hex char 0x1b
1722:    * "\u1234" : Unicode char \u1234
1723:    */
1724:   private static class CharExpression
1725:   {
1726:     /** character represented by this expression */
1727:     char ch;
1728:     /** String expression */
1729:     String expr;
1730:     /** length of this expression */
1731:     int len;
1732:     public String toString ()
1733:     {
1734:       return expr;
1735:     }
1736:   }
1737: 
1738:   private static CharExpression getCharExpression (char[]input, int pos,
1739:                                                    int lim, RESyntax syntax)
1740:   {
1741:     CharExpression ce = new CharExpression ();
1742:     char c = input[pos];
1743:     if (c == '\\')
1744:       {
1745:         if (pos + 1 >= lim)
1746:           return null;
1747:         c = input[pos + 1];
1748:         switch (c)
1749:           {
1750:           case 't':
1751:             ce.ch = '\t';
1752:             ce.len = 2;
1753:             break;
1754:           case 'n':
1755:             ce.ch = '\n';
1756:             ce.len = 2;
1757:             break;
1758:           case 'r':
1759:             ce.ch = '\r';
1760:             ce.len = 2;
1761:             break;
1762:           case 'x':
1763:           case 'u':
1764:             if ((c == 'x' && syntax.get (RESyntax.RE_HEX_CHAR)) ||
1765:                 (c == 'u' && syntax.get (RESyntax.RE_UNICODE_CHAR)))
1766:               {
1767:                 int l = 0;
1768:                 int expectedLength = (c == 'x' ? 2 : 4);
1769:                 for (int i = pos + 2; i < pos + 2 + expectedLength; i++)
1770:                   {
1771:                     if (i >= lim)
1772:                       break;
1773:                     if (!((input[i] >= '0' && input[i] <= '9') ||
1774:                           (input[i] >= 'A' && input[i] <= 'F') ||
1775:                           (input[i] >= 'a' && input[i] <= 'f')))
1776:                       break;
1777:                     l++;
1778:                   }
1779:                 if (l != expectedLength)
1780:                   return null;
1781:                 ce.ch = (char) (parseInt (input, pos + 2, l, 16));
1782:                 ce.len = l + 2;
1783:               }
1784:             else
1785:               {
1786:                 ce.ch = c;
1787:                 ce.len = 2;
1788:               }
1789:             break;
1790:           case '0':
1791:             if (syntax.get (RESyntax.RE_OCTAL_CHAR))
1792:               {
1793:                 int l = 0;
1794:                 for (int i = pos + 2; i < pos + 2 + 3; i++)
1795:                   {
1796:                     if (i >= lim)
1797:                       break;
1798:                     if (input[i] < '0' || input[i] > '7')
1799:                       break;
1800:                     l++;
1801:                   }
1802:                 if (l == 3 && input[pos + 2] > '3')
1803:                   l--;
1804:                 if (l <= 0)
1805:                   return null;
1806:                 ce.ch = (char) (parseInt (input, pos + 2, l, 8));
1807:                 ce.len = l + 2;
1808:               }
1809:             else
1810:               {
1811:                 ce.ch = c;
1812:                 ce.len = 2;
1813:               }
1814:             break;
1815:           default:
1816:             ce.ch = c;
1817:             ce.len = 2;
1818:             break;
1819:           }
1820:       }
1821:     else
1822:       {
1823:         ce.ch = input[pos];
1824:         ce.len = 1;
1825:       }
1826:     ce.expr = new String (input, pos, ce.len);
1827:     return ce;
1828:   }
1829: 
1830:   /**
1831:    * This class represents a substring in a pattern string expressing
1832:    * a named property.
1833:    * "\pA"      : Property named "A"
1834:    * "\p{prop}" : Property named "prop"
1835:    * "\PA"      : Property named "A" (Negated)
1836:    * "\P{prop}" : Property named "prop" (Negated)
1837:    */
1838:   private static class NamedProperty
1839:   {
1840:     /** Property name */
1841:     String name;
1842:     /** Negated or not */
1843:     boolean negate;
1844:     /** length of this expression */
1845:     int len;
1846:   }
1847: 
1848:   private static NamedProperty getNamedProperty (char[]input, int pos,
1849:                                                  int lim)
1850:   {
1851:     NamedProperty np = new NamedProperty ();
1852:     char c = input[pos];
1853:     if (c == '\\')
1854:       {
1855:         if (++pos >= lim)
1856:           return null;
1857:         c = input[pos++];
1858:         switch (c)
1859:           {
1860:           case 'p':
1861:             np.negate = false;
1862:             break;
1863:           case 'P':
1864:             np.negate = true;
1865:             break;
1866:           default:
1867:             return null;
1868:           }
1869:         c = input[pos++];
1870:         if (c == '{')
1871:           {
1872:             int p = -1;
1873:             for (int i = pos; i < lim; i++)
1874:               {
1875:                 if (input[i] == '}')
1876:                   {
1877:                     p = i;
1878:                     break;
1879:                   }
1880:               }
1881:             if (p < 0)
1882:               return null;
1883:             int len = p - pos;
1884:             np.name = new String (input, pos, len);
1885:             np.len = len + 4;
1886:           }
1887:         else
1888:           {
1889:             np.name = new String (input, pos - 1, 1);
1890:             np.len = 3;
1891:           }
1892:         return np;
1893:       }
1894:     else
1895:       return null;
1896:   }
1897: 
1898:   private static RETokenNamedProperty getRETokenNamedProperty (int subIndex,
1899:                                                                NamedProperty
1900:                                                                np,
1901:                                                                boolean insens,
1902:                                                                int index)
1903:     throws REException
1904:   {
1905:     try
1906:     {
1907:       return new RETokenNamedProperty (subIndex, np.name, insens, np.negate);
1908:     }
1909:     catch (REException e)
1910:     {
1911:       REException ree;
1912:       ree = new REException (e.getMessage (), REException.REG_ESCAPE, index);
1913:       ree.initCause (e);
1914:       throw ree;
1915:     }
1916:   }
1917: 
1918:   /**
1919:    * Checks if the regular expression matches the input in its entirety.
1920:    *
1921:    * @param input The input text.
1922:    */
1923:   public boolean isMatch (Object input)
1924:   {
1925:     return isMatch (input, 0, 0);
1926:   }
1927: 
1928:   /**
1929:    * Checks if the input string, starting from index, is an exact match of
1930:    * this regular expression.
1931:    *
1932:    * @param input The input text.
1933:    * @param index The offset index at which the search should be begin.
1934:    */
1935:   public boolean isMatch (Object input, int index)
1936:   {
1937:     return isMatch (input, index, 0);
1938:   }
1939: 
1940: 
1941:   /**
1942:    * Checks if the input, starting from index and using the specified
1943:    * execution flags, is an exact match of this regular expression.
1944:    *
1945:    * @param input The input text.
1946:    * @param index The offset index at which the search should be begin.
1947:    * @param eflags The logical OR of any execution flags above.
1948:    */
1949:   public boolean isMatch (Object input, int index, int eflags)
1950:   {
1951:     return isMatchImpl (makeCharIndexed (input, index), index, eflags);
1952:   }
1953: 
1954:   private boolean isMatchImpl (CharIndexed input, int index, int eflags)
1955:   {
1956:     if (firstToken == null)     // Trivial case
1957:       return (input.charAt (0) == CharIndexed.OUT_OF_BOUNDS);
1958:     REMatch m = new REMatch (numSubs, index, eflags);
1959:     if (firstToken.match (input, m))
1960:       {
1961:         if (m != null)
1962:           {
1963:             if (input.charAt (m.index) == CharIndexed.OUT_OF_BOUNDS)
1964:               {
1965:                 return true;
1966:               }
1967:           }
1968:       }
1969:     return false;
1970:   }
1971: 
1972:   /**
1973:    * Returns the maximum number of subexpressions in this regular expression.
1974:    * If the expression contains branches, the value returned will be the
1975:    * maximum subexpressions in any of the branches.
1976:    */
1977:   public int getNumSubs ()
1978:   {
1979:     return numSubs;
1980:   }
1981: 
1982:   // Overrides REToken.setUncle
1983:   void setUncle (REToken uncle)
1984:   {
1985:     if (lastToken != null)
1986:       {
1987:         lastToken.setUncle (uncle);
1988:       }
1989:     else
1990:       super.setUncle (uncle);   // to deal with empty subexpressions
1991:   }
1992: 
1993:   // Overrides REToken.chain
1994: 
1995:   boolean chain (REToken next)
1996:   {
1997:     super.chain (next);
1998:     setUncle (next);
1999:     return true;
2000:   }
2001: 
2002:   /**
2003:    * Returns the minimum number of characters that could possibly
2004:    * constitute a match of this regular expression.
2005:    */
2006:   public int getMinimumLength ()
2007:   {
2008:     return minimumLength;
2009:   }
2010: 
2011:   public int getMaximumLength ()
2012:   {
2013:     return maximumLength;
2014:   }
2015: 
2016:   /**
2017:    * Returns an array of all matches found in the input.
2018:    *
2019:    * If the regular expression allows the empty string to match, it will
2020:    * substitute matches at all positions except the end of the input.
2021:    *
2022:    * @param input The input text.
2023:    * @return a non-null (but possibly zero-length) array of matches
2024:    */
2025:   public REMatch[] getAllMatches (Object input)
2026:   {
2027:     return getAllMatches (input, 0, 0);
2028:   }
2029: 
2030:   /**
2031:    * Returns an array of all matches found in the input,
2032:    * beginning at the specified index position.
2033:    *
2034:    * If the regular expression allows the empty string to match, it will
2035:    * substitute matches at all positions except the end of the input.
2036:    *
2037:    * @param input The input text.
2038:    * @param index The offset index at which the search should be begin.
2039:    * @return a non-null (but possibly zero-length) array of matches
2040:    */
2041:   public REMatch[] getAllMatches (Object input, int index)
2042:   {
2043:     return getAllMatches (input, index, 0);
2044:   }
2045: 
2046:   /**
2047:    * Returns an array of all matches found in the input string,
2048:    * beginning at the specified index position and using the specified
2049:    * execution flags.
2050:    *
2051:    * If the regular expression allows the empty string to match, it will
2052:    * substitute matches at all positions except the end of the input.
2053:    *
2054:    * @param input The input text.
2055:    * @param index The offset index at which the search should be begin.
2056:    * @param eflags The logical OR of any execution flags above.
2057:    * @return a non-null (but possibly zero-length) array of matches
2058:    */
2059:   public REMatch[] getAllMatches (Object input, int index, int eflags)
2060:   {
2061:     return getAllMatchesImpl (makeCharIndexed (input, index), index, eflags);
2062:   }
2063: 
2064:   // this has been changed since 1.03 to be non-overlapping matches
2065:   private REMatch[] getAllMatchesImpl (CharIndexed input, int index,
2066:                                        int eflags)
2067:   {
2068:     List < REMatch > all = new ArrayList < REMatch > ();
2069:     REMatch m = null;
2070:     while ((m = getMatchImpl (input, index, eflags, null)) != null)
2071:       {
2072:         all.add (m);
2073:         index = m.getEndIndex ();
2074:         if (m.end[0] == 0)
2075:           {                     // handle pathological case of zero-length match
2076:             index++;
2077:             input.move (1);
2078:           }
2079:         else
2080:           {
2081:             input.move (m.end[0]);
2082:           }
2083:         if (!input.isValid ())
2084:           break;
2085:       }
2086:     return all.toArray (new REMatch[all.size ()]);
2087:   }
2088: 
2089:   /* Implements abstract method REToken.match() */
2090:   boolean match (CharIndexed input, REMatch mymatch)
2091:   {
2092:     input.setHitEnd (mymatch);
2093:     if (firstToken == null)
2094:       {
2095:         return next (input, mymatch);
2096:       }
2097: 
2098:     // Note the start of this subexpression
2099:     mymatch.start1[subIndex] = mymatch.index;
2100: 
2101:     return firstToken.match (input, mymatch);
2102:   }
2103: 
2104:   REMatch findMatch (CharIndexed input, REMatch mymatch)
2105:   {
2106:     if (mymatch.backtrackStack == null)
2107:       mymatch.backtrackStack = new BacktrackStack ();
2108:     boolean b = match (input, mymatch);
2109:     if (b)
2110:       {
2111:         return mymatch;
2112:       }
2113:     return null;
2114:   }
2115: 
2116:   /**
2117:    * Returns the first match found in the input.  If no match is found,
2118:    * null is returned.
2119:    *
2120:    * @param input The input text.
2121:    * @return An REMatch instance referencing the match, or null if none.
2122:    */
2123:   public REMatch getMatch (Object input)
2124:   {
2125:     return getMatch (input, 0, 0);
2126:   }
2127: 
2128:   /**
2129:    * Returns the first match found in the input, beginning
2130:    * the search at the specified index.  If no match is found,
2131:    * returns null.
2132:    *
2133:    * @param input The input text.
2134:    * @param index The offset within the text to begin looking for a match.
2135:    * @return An REMatch instance referencing the match, or null if none.
2136:    */
2137:   public REMatch getMatch (Object input, int index)
2138:   {
2139:     return getMatch (input, index, 0);
2140:   }
2141: 
2142:   /**
2143:    * Returns the first match found in the input, beginning
2144:    * the search at the specified index, and using the specified
2145:    * execution flags.  If no match is found, returns null.
2146:    *
2147:    * @param input The input text.
2148:    * @param index The offset index at which the search should be begin.
2149:    * @param eflags The logical OR of any execution flags above.
2150:    * @return An REMatch instance referencing the match, or null if none.
2151:    */
2152:   public REMatch getMatch (Object input, int index, int eflags)
2153:   {
2154:     return getMatch (input, index, eflags, null);
2155:   }
2156: 
2157:   /**
2158:    * Returns the first match found in the input, beginning the search
2159:    * at the specified index, and using the specified execution flags.
2160:    * If no match is found, returns null.  If a StringBuffer is
2161:    * provided and is non-null, the contents of the input text from the
2162:    * index to the beginning of the match (or to the end of the input,
2163:    * if there is no match) are appended to the StringBuffer.
2164:    *
2165:    * @param input The input text.
2166:    * @param index The offset index at which the search should be begin.
2167:    * @param eflags The logical OR of any execution flags above.
2168:    * @param buffer The StringBuffer to save pre-match text in.
2169:    * @return An REMatch instance referencing the match, or null if none.  */
2170:   public REMatch getMatch (Object input, int index, int eflags,
2171:                            CPStringBuilder buffer)
2172:   {
2173:     return getMatchImpl (makeCharIndexed (input, index), index, eflags,
2174:                          buffer);
2175:   }
2176: 
2177:   REMatch getMatchImpl (CharIndexed input, int anchor, int eflags,
2178:                         CPStringBuilder buffer)
2179:   {
2180:     boolean tryEntireMatch = ((eflags & REG_TRY_ENTIRE_MATCH) != 0);
2181:     boolean doMove = ((eflags & REG_FIX_STARTING_POSITION) == 0);
2182:     RE re = (tryEntireMatch ? (RE) this.clone () : this);
2183:     if (tryEntireMatch)
2184:       {
2185:         RETokenEnd reEnd = new RETokenEnd (0, null);
2186:         reEnd.setFake (true);
2187:         re.chain (reEnd);
2188:       }
2189:     // Create a new REMatch to hold results
2190:     REMatch mymatch = new REMatch (numSubs, anchor, eflags);
2191:     do
2192:       {
2193:         /* The following potimization is commented out because
2194:            the matching should be tried even if the length of
2195:            input is obviously too short in order that
2196:            java.util.regex.Matcher#hitEnd() may work correctly.
2197:            // Optimization: check if anchor + minimumLength > length
2198:            if (minimumLength == 0 || input.charAt(minimumLength-1) != CharIndexed.OUT_OF_BOUNDS) {
2199:          */
2200:         if (re.match (input, mymatch))
2201:           {
2202:             REMatch best = mymatch;
2203:             // We assume that the match that coms first is the best.
2204:             // And the following "The longer, the better" rule has
2205:             // been commented out. The longest is not neccesarily
2206:             // the best. For example, "a" out of "aaa" is the best
2207:             // match for /a+?/.
2208:             /*
2209:                // Find best match of them all to observe leftmost longest
2210:                while ((mymatch = mymatch.next) != null) {
2211:                if (mymatch.index > best.index) {
2212:                best = mymatch;
2213:                }
2214:                }
2215:              */
2216:             best.end[0] = best.index;
2217:             best.finish (input);
2218:             input.setLastMatch (best);
2219:             return best;
2220:           }
2221:         /* End of the optimization commented out
2222:            }
2223:          */
2224:         mymatch.clear (++anchor);
2225:         // Append character to buffer if needed
2226:         if (buffer != null && input.charAt (0) != CharIndexed.OUT_OF_BOUNDS)
2227:           {
2228:             buffer.append (input.charAt (0));
2229:           }
2230:         // java.util.regex.Matcher#hitEnd() requires that the search should
2231:         // be tried at the end of input, so we use move1(1) instead of move(1)
2232:       }
2233:     while (doMove && input.move1 (1));
2234: 
2235:     // Special handling at end of input for e.g. "$"
2236:     if (minimumLength == 0)
2237:       {
2238:         if (match (input, mymatch))
2239:           {
2240:             mymatch.finish (input);
2241:             return mymatch;
2242:           }
2243:       }
2244: 
2245:     return null;
2246:   }
2247: 
2248:   /**
2249:    * Returns an REMatchEnumeration that can be used to iterate over the
2250:    * matches found in the input text.
2251:    *
2252:    * @param input The input text.
2253:    * @return A non-null REMatchEnumeration instance.
2254:    */
2255:   public REMatchEnumeration getMatchEnumeration (Object input)
2256:   {
2257:     return getMatchEnumeration (input, 0, 0);
2258:   }
2259: 
2260: 
2261:   /**
2262:    * Returns an REMatchEnumeration that can be used to iterate over the
2263:    * matches found in the input text.
2264:    *
2265:    * @param input The input text.
2266:    * @param index The offset index at which the search should be begin.
2267:    * @return A non-null REMatchEnumeration instance, with its input cursor
2268:    *  set to the index position specified.
2269:    */
2270:   public REMatchEnumeration getMatchEnumeration (Object input, int index)
2271:   {
2272:     return getMatchEnumeration (input, index, 0);
2273:   }
2274: 
2275:   /**
2276:    * Returns an REMatchEnumeration that can be used to iterate over the
2277:    * matches found in the input text.
2278:    *
2279:    * @param input The input text.
2280:    * @param index The offset index at which the search should be begin.
2281:    * @param eflags The logical OR of any execution flags above.
2282:    * @return A non-null REMatchEnumeration instance, with its input cursor
2283:    *  set to the index position specified.
2284:    */
2285:   public REMatchEnumeration getMatchEnumeration (Object input, int index,
2286:                                                  int eflags)
2287:   {
2288:     return new REMatchEnumeration (this, makeCharIndexed (input, index),
2289:                                    index, eflags);
2290:   }
2291: 
2292: 
2293:   /**
2294:    * Substitutes the replacement text for the first match found in the input.
2295:    *
2296:    * @param input The input text.
2297:    * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
2298:    * @return A String interpolating the substituted text.
2299:    * @see REMatch#substituteInto
2300:    */
2301:   public String substitute (Object input, String replace)
2302:   {
2303:     return substitute (input, replace, 0, 0);
2304:   }
2305: 
2306:   /**
2307:    * Substitutes the replacement text for the first match found in the input
2308:    * beginning at the specified index position.  Specifying an index
2309:    * effectively causes the regular expression engine to throw away the
2310:    * specified number of characters.
2311:    *
2312:    * @param input The input text.
2313:    * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
2314:    * @param index The offset index at which the search should be begin.
2315:    * @return A String containing the substring of the input, starting
2316:    *   at the index position, and interpolating the substituted text.
2317:    * @see REMatch#substituteInto
2318:    */
2319:   public String substitute (Object input, String replace, int index)
2320:   {
2321:     return substitute (input, replace, index, 0);
2322:   }
2323: 
2324:   /**
2325:    * Substitutes the replacement text for the first match found in the input
2326:    * string, beginning at the specified index position and using the
2327:    * specified execution flags.
2328:    *
2329:    * @param input The input text.
2330:    * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
2331:    * @param index The offset index at which the search should be begin.
2332:    * @param eflags The logical OR of any execution flags above.
2333:    * @return A String containing the substring of the input, starting
2334:    *   at the index position, and interpolating the substituted text.
2335:    * @see REMatch#substituteInto
2336:    */
2337:   public String substitute (Object input, String replace, int index,
2338:                             int eflags)
2339:   {
2340:     return substituteImpl (makeCharIndexed (input, index), replace, index,
2341:                            eflags);
2342:   }
2343: 
2344:   private String substituteImpl (CharIndexed input, String replace, int index,
2345:                                  int eflags)
2346:   {
2347:     CPStringBuilder buffer = new CPStringBuilder ();
2348:     REMatch m = getMatchImpl (input, index, eflags, buffer);
2349:     if (m == null)
2350:       return buffer.toString ();
2351:     buffer.append (getReplacement (replace, m, eflags));
2352:     if (input.move (m.end[0]))
2353:       {
2354:         do
2355:           {
2356:             buffer.append (input.charAt (0));
2357:           }
2358:         while (input.move (1));
2359:       }
2360:     return buffer.toString ();
2361:   }
2362: 
2363:   /**
2364:    * Substitutes the replacement text for each non-overlapping match found
2365:    * in the input text.
2366:    *
2367:    * @param input The input text.
2368:    * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
2369:    * @return A String interpolating the substituted text.
2370:    * @see REMatch#substituteInto
2371:    */
2372:   public String substituteAll (Object input, String replace)
2373:   {
2374:     return substituteAll (input, replace, 0, 0);
2375:   }
2376: 
2377:   /**
2378:    * Substitutes the replacement text for each non-overlapping match found
2379:    * in the input text, starting at the specified index.
2380:    *
2381:    * If the regular expression allows the empty string to match, it will
2382:    * substitute matches at all positions except the end of the input.
2383:    *
2384:    * @param input The input text.
2385:    * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
2386:    * @param index The offset index at which the search should be begin.
2387:    * @return A String containing the substring of the input, starting
2388:    *   at the index position, and interpolating the substituted text.
2389:    * @see REMatch#substituteInto
2390:    */
2391:   public String substituteAll (Object input, String replace, int index)
2392:   {
2393:     return substituteAll (input, replace, index, 0);
2394:   }
2395: 
2396:   /**
2397:    * Substitutes the replacement text for each non-overlapping match found
2398:    * in the input text, starting at the specified index and using the
2399:    * specified execution flags.
2400:    *
2401:    * @param input The input text.
2402:    * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
2403:    * @param index The offset index at which the search should be begin.
2404:    * @param eflags The logical OR of any execution flags above.
2405:    * @return A String containing the substring of the input, starting
2406:    *   at the index position, and interpolating the substituted text.
2407:    * @see REMatch#substituteInto
2408:    */
2409:   public String substituteAll (Object input, String replace, int index,
2410:                                int eflags)
2411:   {
2412:     return substituteAllImpl (makeCharIndexed (input, index), replace, index,
2413:                               eflags);
2414:   }
2415: 
2416:   private String substituteAllImpl (CharIndexed input, String replace,
2417:                                     int index, int eflags)
2418:   {
2419:     CPStringBuilder buffer = new CPStringBuilder ();
2420:     REMatch m;
2421:     while ((m = getMatchImpl (input, index, eflags, buffer)) != null)
2422:       {
2423:         buffer.append (getReplacement (replace, m, eflags));
2424:         index = m.getEndIndex ();
2425:         if (m.end[0] == 0)
2426:           {
2427:             char ch = input.charAt (0);
2428:             if (ch != CharIndexed.OUT_OF_BOUNDS)
2429:               buffer.append (ch);
2430:             input.move (1);
2431:           }
2432:         else
2433:           {
2434:             input.move (m.end[0]);
2435:           }
2436: 
2437:         if (!input.isValid ())
2438:           break;
2439:       }
2440:     return buffer.toString ();
2441:   }
2442: 
2443:   public static String getReplacement (String replace, REMatch m, int eflags)
2444:   {
2445:     if ((eflags & REG_NO_INTERPOLATE) > 0)
2446:       return replace;
2447:     else
2448:       {
2449:         if ((eflags & REG_REPLACE_USE_BACKSLASHESCAPE) > 0)
2450:           {
2451:             CPStringBuilder sb = new CPStringBuilder ();
2452:             int l = replace.length ();
2453:             for (int i = 0; i < l; i++)
2454:               {
2455:                 char c = replace.charAt (i);
2456:                 switch (c)
2457:                   {
2458:                   case '\\':
2459:                     i++;
2460:                     // Let StringIndexOutOfBoundsException be thrown.
2461:                     sb.append (replace.charAt (i));
2462:                     break;
2463:                   case '$':
2464:                     int i1 = i + 1;
2465:                     while (i1 < replace.length () &&
2466:                            Character.isDigit (replace.charAt (i1)))
2467:                       i1++;
2468:                     sb.append (m.substituteInto (replace.substring (i, i1)));
2469:                     i = i1 - 1;
2470:                     break;
2471:                   default:
2472:                     sb.append (c);
2473:                   }
2474:               }
2475:             return sb.toString ();
2476:           }
2477:         else
2478:           return m.substituteInto (replace);
2479:       }
2480:   }
2481: 
2482:   /* Helper function for constructor */
2483:   private void addToken (REToken next)
2484:   {
2485:     if (next == null)
2486:       return;
2487:     minimumLength += next.getMinimumLength ();
2488:     int nmax = next.getMaximumLength ();
2489:     if (nmax < Integer.MAX_VALUE && maximumLength < Integer.MAX_VALUE)
2490:       maximumLength += nmax;
2491:     else
2492:       maximumLength = Integer.MAX_VALUE;
2493: 
2494:     if (firstToken == null)
2495:       {
2496:         lastToken = firstToken = next;
2497:       }
2498:     else
2499:       {
2500:         // if chain returns false, it "rejected" the token due to
2501:         // an optimization, and next was combined with lastToken
2502:         if (lastToken.chain (next))
2503:           {
2504:             lastToken = next;
2505:           }
2506:       }
2507:   }
2508: 
2509:   private static REToken setRepeated (REToken current, int min, int max,
2510:                                       int index) throws REException
2511:   {
2512:     if (current == null)
2513:       throw new REException (getLocalizedMessage ("repeat.no.token"),
2514:                              REException.REG_BADRPT, index);
2515:       return new RETokenRepeated (current.subIndex, current, min, max);
2516:   }
2517: 
2518:   private static int getPosixSet (char[]pattern, int index,
2519:                                   CPStringBuilder buf)
2520:   {
2521:     // Precondition: pattern[index-1] == ':'
2522:     // we will return pos of closing ']'.
2523:     int i;
2524:     for (i = index; i < (pattern.length - 1); i++)
2525:       {
2526:         if ((pattern[i] == ':') && (pattern[i + 1] == ']'))
2527:           return i + 2;
2528:         buf.append (pattern[i]);
2529:       }
2530:     return index;               // didn't match up
2531:   }
2532: 
2533:   private int getMinMax (char[]input, int index, IntPair minMax,
2534:                          RESyntax syntax) throws REException
2535:   {
2536:     // Precondition: input[index-1] == '{', minMax != null
2537: 
2538:     boolean mustMatch = !syntax.get (RESyntax.RE_NO_BK_BRACES);
2539:     int startIndex = index;
2540:     if (index == input.length)
2541:       {
2542:         if (mustMatch)
2543:           throw new REException (getLocalizedMessage ("unmatched.brace"),
2544:                                  REException.REG_EBRACE, index);
2545:         else
2546:           return startIndex;
2547:       }
2548: 
2549:     int min, max = 0;
2550:     CharUnit unit = new CharUnit ();
2551:     CPStringBuilder buf = new CPStringBuilder ();
2552: 
2553:     // Read string of digits
2554:     do
2555:       {
2556:         index = getCharUnit (input, index, unit, false);
2557:         if (Character.isDigit (unit.ch))
2558:           buf.append (unit.ch);
2559:       }
2560:     while ((index != input.length) && Character.isDigit (unit.ch));
2561: 
2562:     // Check for {} tomfoolery
2563:     if (buf.length () == 0)
2564:       {
2565:         if (mustMatch)
2566:           throw new REException (getLocalizedMessage ("interval.error"),
2567:                                  REException.REG_EBRACE, index);
2568:         else
2569:         return startIndex;
2570:       }
2571: 
2572:     min = Integer.parseInt (buf.toString ());
2573: 
2574:     if ((unit.ch == '}') && (syntax.get (RESyntax.RE_NO_BK_BRACES) ^ unit.bk))
2575:       max = min;
2576:     else if (index == input.length)
2577:       if (mustMatch)
2578:         throw new REException (getLocalizedMessage ("interval.no.end"),
2579:                                REException.REG_EBRACE, index);
2580:     else
2581:     return startIndex;
2582:     else
2583:   if ((unit.ch == ',') && !unit.bk)
2584:     {
2585:       buf = new CPStringBuilder ();
2586:       // Read string of digits
2587:       while (((index =
2588:                getCharUnit (input, index, unit, false)) != input.length)
2589:              && Character.isDigit (unit.ch))
2590:         buf.append (unit.ch);
2591: 
2592:       if (!
2593:           ((unit.ch == '}')
2594:            && (syntax.get (RESyntax.RE_NO_BK_BRACES) ^ unit.bk)))
2595:         if (mustMatch)
2596:           throw new REException (getLocalizedMessage ("interval.error"),
2597:                                  REException.REG_EBRACE, index);
2598:       else
2599:       return startIndex;
2600: 
2601:       // This is the case of {x,}
2602:       if (buf.length () == 0)
2603:         max = Integer.MAX_VALUE;
2604:       else
2605:         max = Integer.parseInt (buf.toString ());
2606:     }
2607:   else if (mustMatch)
2608:     throw new REException (getLocalizedMessage ("interval.error"),
2609:                            REException.REG_EBRACE, index);
2610:   else
2611:   return startIndex;
2612: 
2613:   // We know min and max now, and they are valid.
2614: 
2615:   minMax.first = min;
2616:   minMax.second = max;
2617: 
2618:   // return the index following the '}'
2619:   return index;
2620:   }
2621: 
2622:    /**
2623:     * Return a human readable form of the compiled regular expression,
2624:     * useful for debugging.
2625:     */
2626:   public String toString ()
2627:   {
2628:     CPStringBuilder sb = new CPStringBuilder ();
2629:     dump (sb);
2630:     return sb.toString ();
2631:   }
2632: 
2633:   void dump (CPStringBuilder os)
2634:   {
2635:     os.append ("(?#startRE subIndex=" + subIndex + ")");
2636:     if (subIndex == 0)
2637:       os.append ("?:");
2638:     if (firstToken != null)
2639:       firstToken.dumpAll (os);
2640:     if (subIndex == 0)
2641:       os.append (")");
2642:     os.append ("(?#endRE subIndex=" + subIndex + ")");
2643:   }
2644: 
2645:   // Cast input appropriately or throw exception
2646:   // This method was originally a private method, but has been made
2647:   // public because java.util.regex.Matcher uses this.
2648:   public static CharIndexed makeCharIndexed (Object input, int index)
2649:   {
2650:     // The case where input is already a CharIndexed is supposed
2651:     // be the most likely because this is the case with
2652:     // java.util.regex.Matcher.
2653:     // We could let a String or a CharSequence fall through
2654:     // to final input, but since it'a very likely input type,
2655:     // we check it first.
2656:     if (input instanceof CharIndexed)
2657:       {
2658:         CharIndexed ci = (CharIndexed) input;
2659:         ci.setAnchor (index);
2660:         return ci;
2661:       }
2662:     else if (input instanceof CharSequence)
2663:       return new CharIndexedCharSequence ((CharSequence) input, index);
2664:     else if (input instanceof String)
2665:       return new CharIndexedString ((String) input, index);
2666:     else if (input instanceof char[])
2667:       return new CharIndexedCharArray ((char[]) input, index);
2668:     else if (input instanceof StringBuffer)
2669:       return new CharIndexedStringBuffer ((StringBuffer) input, index);
2670:     else if (input instanceof InputStream)
2671:       return new CharIndexedInputStream ((InputStream) input, index);
2672:     else
2673:       return new CharIndexedString (input.toString (), index);
2674:   }
2675: }