Source for gnu.java.util.regex.RESyntax

   1: /* gnu/regexp/RESyntax.java
   2:    Copyright (C) 2006 Free Software Foundation, Inc.
   3: 
   4: This file is part of GNU Classpath.
   5: 
   6: GNU Classpath is free software; you can redistribute it and/or modify
   7: it under the terms of the GNU General Public License as published by
   8: the Free Software Foundation; either version 2, or (at your option)
   9: any later version.
  10: 
  11: GNU Classpath is distributed in the hope that it will be useful, but
  12: WITHOUT ANY WARRANTY; without even the implied warranty of
  13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14: General Public License for more details.
  15: 
  16: You should have received a copy of the GNU General Public License
  17: along with GNU Classpath; see the file COPYING.  If not, write to the
  18: Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  19: 02110-1301 USA.
  20: 
  21: Linking this library statically or dynamically with other modules is
  22: making a combined work based on this library.  Thus, the terms and
  23: conditions of the GNU General Public License cover the whole
  24: combination.
  25: 
  26: As a special exception, the copyright holders of this library give you
  27: permission to link this library with independent modules to produce an
  28: executable, regardless of the license terms of these independent
  29: modules, and to copy and distribute the resulting executable under
  30: terms of your choice, provided that you also meet, for each linked
  31: independent module, the terms and conditions of the license of that
  32: module.  An independent module is a module which is not derived from
  33: or based on this library.  If you modify this library, you may extend
  34: this exception to your version of the library, but you are not
  35: obligated to do so.  If you do not wish to do so, delete this
  36: exception statement from your version. */
  37: 
  38: 
  39: package gnu.java.util.regex;
  40: import java.io.Serializable;
  41: import java.util.BitSet;
  42: 
  43: /**
  44:  * An RESyntax specifies the way a regular expression will be compiled.
  45:  * This class provides a number of predefined useful constants for
  46:  * emulating popular regular expression syntaxes.  Additionally the
  47:  * user may construct his or her own syntax, using any combination of the
  48:  * syntax bit constants.  The syntax is an optional argument to any of the
  49:  * matching methods on class RE.
  50:  *
  51:  * @author <A HREF="mailto:wes@cacas.org">Wes Biggs</A>
  52:  */
  53: 
  54: public final class RESyntax implements Serializable
  55: {
  56:   static final String DEFAULT_LINE_SEPARATOR =
  57:     System.getProperty ("line.separator");
  58: 
  59:   private BitSet bits;
  60: 
  61:   // true for the constant defined syntaxes
  62:   private boolean isFinal = false;
  63: 
  64:   private String lineSeparator = DEFAULT_LINE_SEPARATOR;
  65: 
  66:   // Values for constants are bit indexes
  67: 
  68:   /**
  69:    * Syntax bit. Backslash is an escape character in lists.
  70:    */
  71:   public static final int RE_BACKSLASH_ESCAPE_IN_LISTS = 0;
  72: 
  73:   /**
  74:    * Syntax bit. Use \? instead of ? and \+ instead of +.
  75:    */
  76:   public static final int RE_BK_PLUS_QM = 1;
  77: 
  78:   /**
  79:    * Syntax bit. POSIX character classes ([:...:]) in lists are allowed.
  80:    */
  81:   public static final int RE_CHAR_CLASSES = 2;
  82: 
  83:   /**
  84:    * Syntax bit. ^ and $ are special everywhere.
  85:    * <B>Not implemented.</B>
  86:    */
  87:   public static final int RE_CONTEXT_INDEP_ANCHORS = 3;
  88: 
  89:   /**
  90:    * Syntax bit. Repetition operators are only special in valid positions.
  91:    * <B>Not implemented.</B>
  92:    */
  93:   public static final int RE_CONTEXT_INDEP_OPS = 4;
  94: 
  95:   /**
  96:    * Syntax bit. Repetition and alternation operators are invalid
  97:    * at start and end of pattern and other places.
  98:    * <B>Not implemented</B>.
  99:    */
 100:   public static final int RE_CONTEXT_INVALID_OPS = 5;
 101: 
 102:   /**
 103:    * Syntax bit. Match-any-character operator (.) matches a newline.
 104:    */
 105:   public static final int RE_DOT_NEWLINE = 6;
 106: 
 107:   /**
 108:    * Syntax bit. Match-any-character operator (.) does not match a null.
 109:    */
 110:   public static final int RE_DOT_NOT_NULL = 7;
 111: 
 112:   /**
 113:    * Syntax bit. Intervals ({x}, {x,}, {x,y}) are allowed.
 114:    */
 115:   public static final int RE_INTERVALS = 8;
 116: 
 117:   /**
 118:    * Syntax bit. No alternation (|), match one-or-more (+), or
 119:    * match zero-or-one (?) operators.
 120:    */
 121:   public static final int RE_LIMITED_OPS = 9;
 122: 
 123:   /**
 124:    * Syntax bit. Newline is an alternation operator.
 125:    */
 126:   public static final int RE_NEWLINE_ALT = 10;  // impl.
 127: 
 128:   /**
 129:    * Syntax bit. Intervals use { } instead of \{ \}
 130:    */
 131:   public static final int RE_NO_BK_BRACES = 11;
 132: 
 133:   /**
 134:    * Syntax bit. Grouping uses ( ) instead of \( \).
 135:    */
 136:   public static final int RE_NO_BK_PARENS = 12;
 137: 
 138:   /**
 139:    * Syntax bit. Backreferences not allowed.
 140:    */
 141:   public static final int RE_NO_BK_REFS = 13;
 142: 
 143:   /**
 144:    * Syntax bit. Alternation uses | instead of \|
 145:    */
 146:   public static final int RE_NO_BK_VBAR = 14;
 147: 
 148:   /**
 149:    * Syntax bit. <B>Not implemented</B>.
 150:    */
 151:   public static final int RE_NO_EMPTY_RANGES = 15;
 152: 
 153:   /**
 154:    * Syntax bit. An unmatched right parenthesis (')' or '\)', depending
 155:    * on RE_NO_BK_PARENS) will throw an exception when compiling.
 156:    */
 157:   public static final int RE_UNMATCHED_RIGHT_PAREN_ORD = 16;
 158: 
 159:   /**
 160:    * Syntax bit. <B>Not implemented.</B>
 161:    */
 162:   public static final int RE_HAT_LISTS_NOT_NEWLINE = 17;
 163: 
 164:   /**
 165:    * Syntax bit.  Stingy matching is allowed (+?, *?, ??, {x,y}?).
 166:    */
 167:   public static final int RE_STINGY_OPS = 18;
 168: 
 169:   /**
 170:    * Syntax bit. Allow character class escapes (\d, \D, \s, \S, \w, \W).
 171:    */
 172:   public static final int RE_CHAR_CLASS_ESCAPES = 19;
 173: 
 174:   /**
 175:    * Syntax bit. Allow use of (?:xxx) grouping (subexpression is not saved).
 176:    */
 177:   public static final int RE_PURE_GROUPING = 20;
 178: 
 179:   /**
 180:    * Syntax bit. Allow use of (?=xxx) and (?!xxx) apply the subexpression
 181:    * to the text following the current position without consuming that text.
 182:    */
 183:   public static final int RE_LOOKAHEAD = 21;
 184: 
 185:   /**
 186:    * Syntax bit. Allow beginning- and end-of-string anchors (\A, \Z).
 187:    */
 188:   public static final int RE_STRING_ANCHORS = 22;
 189: 
 190:   /**
 191:    * Syntax bit. Allow embedded comments, (?#comment), as in Perl5.
 192:    */
 193:   public static final int RE_COMMENTS = 23;
 194: 
 195:   /**
 196:    * Syntax bit. Allow character class escapes within lists, as in Perl5.
 197:    */
 198:   public static final int RE_CHAR_CLASS_ESC_IN_LISTS = 24;
 199: 
 200:   /**
 201:    * Syntax bit.  Possessive matching is allowed (++, *+, ?+, {x,y}+).
 202:    */
 203:   public static final int RE_POSSESSIVE_OPS = 25;
 204: 
 205:   /**
 206:    * Syntax bit.  Allow embedded flags, (?is-x), as in Perl5.
 207:    */
 208:   public static final int RE_EMBEDDED_FLAGS = 26;
 209: 
 210:   /**
 211:    * Syntax bit.  Allow octal char (\0377), as in Perl5.
 212:    */
 213:   public static final int RE_OCTAL_CHAR = 27;
 214: 
 215:   /**
 216:    * Syntax bit.  Allow hex char (\x1b), as in Perl5.
 217:    */
 218:   public static final int RE_HEX_CHAR = 28;
 219: 
 220:   /**
 221:    * Syntax bit.  Allow Unicode char (\u1234), as in Java 1.4.
 222:    */
 223:   public static final int RE_UNICODE_CHAR = 29;
 224: 
 225:   /**
 226:    * Syntax bit.  Allow named property (\p{P}, \P{p}), as in Perl5.
 227:    */
 228:   public static final int RE_NAMED_PROPERTY = 30;
 229: 
 230:   /**
 231:    * Syntax bit.  Allow nested characterclass ([a-z&&[^p-r]]), as in Java 1.4.
 232:    */
 233:   public static final int RE_NESTED_CHARCLASS = 31;
 234: 
 235:   private static final int BIT_TOTAL = 32;
 236: 
 237:   /**
 238:    * Predefined syntax.
 239:    * Emulates regular expression support in the awk utility.
 240:    */
 241:   public static final RESyntax RE_SYNTAX_AWK;
 242: 
 243:   /**
 244:    * Predefined syntax.
 245:    * Emulates regular expression support in the ed utility.
 246:    */
 247:   public static final RESyntax RE_SYNTAX_ED;
 248: 
 249:   /**
 250:    * Predefined syntax.
 251:    * Emulates regular expression support in the egrep utility.
 252:    */
 253:   public static final RESyntax RE_SYNTAX_EGREP;
 254: 
 255:   /**
 256:    * Predefined syntax.
 257:    * Emulates regular expression support in the GNU Emacs editor.
 258:    */
 259:   public static final RESyntax RE_SYNTAX_EMACS;
 260: 
 261:   /**
 262:    * Predefined syntax.
 263:    * Emulates regular expression support in the grep utility.
 264:    */
 265:   public static final RESyntax RE_SYNTAX_GREP;
 266: 
 267:   /**
 268:    * Predefined syntax.
 269:    * Emulates regular expression support in the POSIX awk specification.
 270:    */
 271:   public static final RESyntax RE_SYNTAX_POSIX_AWK;
 272: 
 273:   /**
 274:    * Predefined syntax.
 275:    * Emulates POSIX basic regular expression support.
 276:    */
 277:   public static final RESyntax RE_SYNTAX_POSIX_BASIC;
 278: 
 279:   /**
 280:    * Predefined syntax.
 281:    * Emulates regular expression support in the POSIX egrep specification.
 282:    */
 283:   public static final RESyntax RE_SYNTAX_POSIX_EGREP;
 284: 
 285:   /**
 286:    * Predefined syntax.
 287:    * Emulates POSIX extended regular expression support.
 288:    */
 289:   public static final RESyntax RE_SYNTAX_POSIX_EXTENDED;
 290: 
 291:   /**
 292:    * Predefined syntax.
 293:    * Emulates POSIX basic minimal regular expressions.
 294:    */
 295:   public static final RESyntax RE_SYNTAX_POSIX_MINIMAL_BASIC;
 296: 
 297:   /**
 298:    * Predefined syntax.
 299:    * Emulates POSIX extended minimal regular expressions.
 300:    */
 301:   public static final RESyntax RE_SYNTAX_POSIX_MINIMAL_EXTENDED;
 302: 
 303:   /**
 304:    * Predefined syntax.
 305:    * Emulates regular expression support in the sed utility.
 306:    */
 307:   public static final RESyntax RE_SYNTAX_SED;
 308: 
 309:   /**
 310:    * Predefined syntax.
 311:    * Emulates regular expression support in Larry Wall's perl, version 4,
 312:    */
 313:   public static final RESyntax RE_SYNTAX_PERL4;
 314: 
 315:   /**
 316:    * Predefined syntax.
 317:    * Emulates regular expression support in Larry Wall's perl, version 4,
 318:    * using single line mode (/s modifier).
 319:    */
 320:   public static final RESyntax RE_SYNTAX_PERL4_S;       // single line mode (/s)
 321: 
 322:   /**
 323:    * Predefined syntax.
 324:    * Emulates regular expression support in Larry Wall's perl, version 5.
 325:    */
 326:   public static final RESyntax RE_SYNTAX_PERL5;
 327: 
 328:   /**
 329:    * Predefined syntax.
 330:    * Emulates regular expression support in Larry Wall's perl, version 5,
 331:    * using single line mode (/s modifier).
 332:    */
 333:   public static final RESyntax RE_SYNTAX_PERL5_S;
 334: 
 335:     /**
 336:      * Predefined syntax.
 337:      * Emulates regular expression support in Java 1.4's java.util.regex
 338:      * package.
 339:      */
 340:   public static final RESyntax RE_SYNTAX_JAVA_1_4;
 341: 
 342:   static
 343:   {
 344:     // Define syntaxes
 345: 
 346:     RE_SYNTAX_EMACS = new RESyntax ().makeFinal ();
 347: 
 348:     RESyntax RE_SYNTAX_POSIX_COMMON =
 349:       new RESyntax ().set (RE_CHAR_CLASSES).set (RE_DOT_NEWLINE).
 350:       set (RE_DOT_NOT_NULL).set (RE_INTERVALS).set (RE_NO_EMPTY_RANGES).
 351:       makeFinal ();
 352: 
 353:       RE_SYNTAX_POSIX_BASIC =
 354:       new RESyntax (RE_SYNTAX_POSIX_COMMON).set (RE_BK_PLUS_QM).makeFinal ();
 355: 
 356:       RE_SYNTAX_POSIX_EXTENDED =
 357:       new RESyntax (RE_SYNTAX_POSIX_COMMON).set (RE_CONTEXT_INDEP_ANCHORS).
 358:       set (RE_CONTEXT_INDEP_OPS).set (RE_NO_BK_BRACES).set (RE_NO_BK_PARENS).
 359:       set (RE_NO_BK_VBAR).set (RE_UNMATCHED_RIGHT_PAREN_ORD).makeFinal ();
 360: 
 361:       RE_SYNTAX_AWK =
 362:       new RESyntax ().set (RE_BACKSLASH_ESCAPE_IN_LISTS).
 363:       set (RE_DOT_NOT_NULL).set (RE_NO_BK_PARENS).set (RE_NO_BK_REFS).
 364:       set (RE_NO_BK_VBAR).set (RE_NO_EMPTY_RANGES).
 365:       set (RE_UNMATCHED_RIGHT_PAREN_ORD).makeFinal ();
 366: 
 367:       RE_SYNTAX_POSIX_AWK =
 368:       new RESyntax (RE_SYNTAX_POSIX_EXTENDED).
 369:       set (RE_BACKSLASH_ESCAPE_IN_LISTS).makeFinal ();
 370: 
 371:       RE_SYNTAX_GREP =
 372:       new RESyntax ().set (RE_BK_PLUS_QM).set (RE_CHAR_CLASSES).
 373:       set (RE_HAT_LISTS_NOT_NEWLINE).set (RE_INTERVALS).set (RE_NEWLINE_ALT).
 374:       makeFinal ();
 375: 
 376:       RE_SYNTAX_EGREP =
 377:       new RESyntax ().set (RE_CHAR_CLASSES).set (RE_CONTEXT_INDEP_ANCHORS).
 378:       set (RE_CONTEXT_INDEP_OPS).set (RE_HAT_LISTS_NOT_NEWLINE).
 379:       set (RE_NEWLINE_ALT).set (RE_NO_BK_PARENS).set (RE_NO_BK_VBAR).
 380:       makeFinal ();
 381: 
 382:       RE_SYNTAX_POSIX_EGREP =
 383:       new RESyntax (RE_SYNTAX_EGREP).set (RE_INTERVALS).set (RE_NO_BK_BRACES).
 384:       makeFinal ();
 385: 
 386:     /* P1003.2/D11.2, section 4.20.7.1, lines 5078ff.  */
 387: 
 388:       RE_SYNTAX_ED = new RESyntax (RE_SYNTAX_POSIX_BASIC).makeFinal ();
 389: 
 390:       RE_SYNTAX_SED = new RESyntax (RE_SYNTAX_POSIX_BASIC).makeFinal ();
 391: 
 392:       RE_SYNTAX_POSIX_MINIMAL_BASIC =
 393:       new RESyntax (RE_SYNTAX_POSIX_COMMON).set (RE_LIMITED_OPS).makeFinal ();
 394: 
 395:     /* Differs from RE_SYNTAX_POSIX_EXTENDED in that RE_CONTEXT_INVALID_OPS
 396:        replaces RE_CONTEXT_INDEP_OPS and RE_NO_BK_REFS is added. */
 397: 
 398:       RE_SYNTAX_POSIX_MINIMAL_EXTENDED =
 399:       new RESyntax (RE_SYNTAX_POSIX_COMMON).set (RE_CONTEXT_INDEP_ANCHORS).
 400:       set (RE_CONTEXT_INVALID_OPS).set (RE_NO_BK_BRACES).
 401:       set (RE_NO_BK_PARENS).set (RE_NO_BK_REFS).set (RE_NO_BK_VBAR).
 402:       set (RE_UNMATCHED_RIGHT_PAREN_ORD).makeFinal ();
 403: 
 404:     /* There is no official Perl spec, but here's a "best guess" */
 405: 
 406:       RE_SYNTAX_PERL4 = new RESyntax ().set (RE_BACKSLASH_ESCAPE_IN_LISTS).set (RE_CONTEXT_INDEP_ANCHORS).set (RE_CONTEXT_INDEP_OPS)    // except for '{', apparently
 407:       .set (RE_INTERVALS).set (RE_NO_BK_BRACES).set (RE_NO_BK_PARENS).set (RE_NO_BK_VBAR).set (RE_NO_EMPTY_RANGES).set (RE_CHAR_CLASS_ESCAPES)  // \d,\D,\w,\W,\s,\S
 408:       .makeFinal ();
 409: 
 410:       RE_SYNTAX_PERL4_S =
 411:       new RESyntax (RE_SYNTAX_PERL4).set (RE_DOT_NEWLINE).makeFinal ();
 412: 
 413:       RE_SYNTAX_PERL5 = new RESyntax (RE_SYNTAX_PERL4).set (RE_PURE_GROUPING)   // (?:)
 414:       .set (RE_STINGY_OPS)      // *?,??,+?,{}?
 415:       .set (RE_LOOKAHEAD)       // (?=)(?!)
 416:       .set (RE_STRING_ANCHORS)  // \A,\Z
 417:       .set (RE_CHAR_CLASS_ESC_IN_LISTS) // \d,\D,\w,\W,\s,\S within []
 418:       .set (RE_COMMENTS)        // (?#)
 419:       .set (RE_EMBEDDED_FLAGS)  // (?imsx-imsx)
 420:       .set (RE_OCTAL_CHAR)      // \0377
 421:       .set (RE_HEX_CHAR)        // \x1b
 422:       .set (RE_NAMED_PROPERTY)  // \p{prop}, \P{prop}
 423:       .makeFinal ();
 424: 
 425:       RE_SYNTAX_PERL5_S =
 426:       new RESyntax (RE_SYNTAX_PERL5).set (RE_DOT_NEWLINE).makeFinal ();
 427: 
 428:       RE_SYNTAX_JAVA_1_4 = new RESyntax (RE_SYNTAX_PERL5)
 429:       // XXX
 430:       .set (RE_POSSESSIVE_OPS)  // *+,?+,++,{}+
 431:       .set (RE_UNICODE_CHAR)    // \u1234
 432:       .set (RE_NESTED_CHARCLASS)        // [a-z&&[^p-r]]
 433:       .makeFinal ();
 434:   }
 435: 
 436:   /**
 437:    * Construct a new syntax object with all bits turned off.
 438:    * This is equivalent to RE_SYNTAX_EMACS.
 439:    */
 440:   public RESyntax ()
 441:   {
 442:     bits = new BitSet (BIT_TOTAL);
 443:   }
 444: 
 445:     /**
 446:      * Called internally when constructing predefined syntaxes
 447:      * so their interpretation cannot vary.  Conceivably useful
 448:      * for your syntaxes as well.  Causes IllegalAccessError to
 449:      * be thrown if any attempt to modify the syntax is made.
 450:      *
 451:      * @return this object for convenient chaining
 452:      */
 453:   public RESyntax makeFinal ()
 454:   {
 455:     isFinal = true;
 456:     return this;
 457:   }
 458: 
 459:   /**
 460:    * Construct a new syntax object with all bits set the same
 461:    * as the other syntax.
 462:    */
 463:   public RESyntax (RESyntax other)
 464:   {
 465:     bits = (BitSet) other.bits.clone ();
 466:   }
 467: 
 468:   /**
 469:    * Check if a given bit is set in this syntax.
 470:    */
 471:   public boolean get (int index)
 472:   {
 473:     return bits.get (index);
 474:   }
 475: 
 476:   /**
 477:    * Set a given bit in this syntax.
 478:    *
 479:    * @param index the constant (RESyntax.RE_xxx) bit to set.
 480:    * @return a reference to this object for easy chaining.
 481:    */
 482:   public RESyntax set (int index)
 483:   {
 484:     if (isFinal)
 485:       throw new IllegalAccessError (RE.getLocalizedMessage ("syntax.final"));
 486:     bits.set (index);
 487:     return this;
 488:   }
 489: 
 490:   /**
 491:    * Clear a given bit in this syntax.
 492:    *
 493:    * @param index the constant (RESyntax.RE_xxx) bit to clear.
 494:    * @return a reference to this object for easy chaining.
 495:    */
 496:   public RESyntax clear (int index)
 497:   {
 498:     if (isFinal)
 499:       throw new IllegalAccessError (RE.getLocalizedMessage ("syntax.final"));
 500:     bits.clear (index);
 501:     return this;
 502:   }
 503: 
 504:     /**
 505:      * Changes the line separator string for regular expressions
 506:      * created using this RESyntax.  The default separator is the
 507:      * value returned by the system property "line.separator", which
 508:      * should be correct when reading platform-specific files from a
 509:      * filesystem.  However, many programs may collect input from
 510:      * sources where the line separator is differently specified (for
 511:      * example, in the applet environment, the text box widget
 512:      * interprets line breaks as single-character newlines,
 513:      * regardless of the host platform.
 514:      *
 515:      * Note that setting the line separator to a character or
 516:      * characters that have specific meaning within the current syntax
 517:      * can cause unexpected chronosynclastic infundibula.
 518:      *
 519:      * @return this object for convenient chaining
 520:      */
 521:   public RESyntax setLineSeparator (String aSeparator)
 522:   {
 523:     if (isFinal)
 524:       throw new IllegalAccessError (RE.getLocalizedMessage ("syntax.final"));
 525:     lineSeparator = aSeparator;
 526:     return this;
 527:   }
 528: 
 529:     /**
 530:      * Returns the currently active line separator string.  The default
 531:      * is the platform-dependent system property "line.separator".
 532:      */
 533:   public String getLineSeparator ()
 534:   {
 535:     return lineSeparator;
 536:   }
 537: }