Frames | No Frames |
1: /* gnu/regexp/RESyntax.java 2: Copyright (C) 2006 Free Software Foundation, Inc. 3: 4: This file is part of GNU Classpath. 5: 6: GNU Classpath is free software; you can redistribute it and/or modify 7: it under the terms of the GNU General Public License as published by 8: the Free Software Foundation; either version 2, or (at your option) 9: any later version. 10: 11: GNU Classpath is distributed in the hope that it will be useful, but 12: WITHOUT ANY WARRANTY; without even the implied warranty of 13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14: General Public License for more details. 15: 16: You should have received a copy of the GNU General Public License 17: along with GNU Classpath; see the file COPYING. If not, write to the 18: Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 19: 02110-1301 USA. 20: 21: Linking this library statically or dynamically with other modules is 22: making a combined work based on this library. Thus, the terms and 23: conditions of the GNU General Public License cover the whole 24: combination. 25: 26: As a special exception, the copyright holders of this library give you 27: permission to link this library with independent modules to produce an 28: executable, regardless of the license terms of these independent 29: modules, and to copy and distribute the resulting executable under 30: terms of your choice, provided that you also meet, for each linked 31: independent module, the terms and conditions of the license of that 32: module. An independent module is a module which is not derived from 33: or based on this library. If you modify this library, you may extend 34: this exception to your version of the library, but you are not 35: obligated to do so. If you do not wish to do so, delete this 36: exception statement from your version. */ 37: 38: 39: package gnu.java.util.regex; 40: import java.io.Serializable; 41: import java.util.BitSet; 42: 43: /** 44: * An RESyntax specifies the way a regular expression will be compiled. 45: * This class provides a number of predefined useful constants for 46: * emulating popular regular expression syntaxes. Additionally the 47: * user may construct his or her own syntax, using any combination of the 48: * syntax bit constants. The syntax is an optional argument to any of the 49: * matching methods on class RE. 50: * 51: * @author <A HREF="mailto:wes@cacas.org">Wes Biggs</A> 52: */ 53: 54: public final class RESyntax implements Serializable 55: { 56: static final String DEFAULT_LINE_SEPARATOR = 57: System.getProperty ("line.separator"); 58: 59: private BitSet bits; 60: 61: // true for the constant defined syntaxes 62: private boolean isFinal = false; 63: 64: private String lineSeparator = DEFAULT_LINE_SEPARATOR; 65: 66: // Values for constants are bit indexes 67: 68: /** 69: * Syntax bit. Backslash is an escape character in lists. 70: */ 71: public static final int RE_BACKSLASH_ESCAPE_IN_LISTS = 0; 72: 73: /** 74: * Syntax bit. Use \? instead of ? and \+ instead of +. 75: */ 76: public static final int RE_BK_PLUS_QM = 1; 77: 78: /** 79: * Syntax bit. POSIX character classes ([:...:]) in lists are allowed. 80: */ 81: public static final int RE_CHAR_CLASSES = 2; 82: 83: /** 84: * Syntax bit. ^ and $ are special everywhere. 85: * <B>Not implemented.</B> 86: */ 87: public static final int RE_CONTEXT_INDEP_ANCHORS = 3; 88: 89: /** 90: * Syntax bit. Repetition operators are only special in valid positions. 91: * <B>Not implemented.</B> 92: */ 93: public static final int RE_CONTEXT_INDEP_OPS = 4; 94: 95: /** 96: * Syntax bit. Repetition and alternation operators are invalid 97: * at start and end of pattern and other places. 98: * <B>Not implemented</B>. 99: */ 100: public static final int RE_CONTEXT_INVALID_OPS = 5; 101: 102: /** 103: * Syntax bit. Match-any-character operator (.) matches a newline. 104: */ 105: public static final int RE_DOT_NEWLINE = 6; 106: 107: /** 108: * Syntax bit. Match-any-character operator (.) does not match a null. 109: */ 110: public static final int RE_DOT_NOT_NULL = 7; 111: 112: /** 113: * Syntax bit. Intervals ({x}, {x,}, {x,y}) are allowed. 114: */ 115: public static final int RE_INTERVALS = 8; 116: 117: /** 118: * Syntax bit. No alternation (|), match one-or-more (+), or 119: * match zero-or-one (?) operators. 120: */ 121: public static final int RE_LIMITED_OPS = 9; 122: 123: /** 124: * Syntax bit. Newline is an alternation operator. 125: */ 126: public static final int RE_NEWLINE_ALT = 10; // impl. 127: 128: /** 129: * Syntax bit. Intervals use { } instead of \{ \} 130: */ 131: public static final int RE_NO_BK_BRACES = 11; 132: 133: /** 134: * Syntax bit. Grouping uses ( ) instead of \( \). 135: */ 136: public static final int RE_NO_BK_PARENS = 12; 137: 138: /** 139: * Syntax bit. Backreferences not allowed. 140: */ 141: public static final int RE_NO_BK_REFS = 13; 142: 143: /** 144: * Syntax bit. Alternation uses | instead of \| 145: */ 146: public static final int RE_NO_BK_VBAR = 14; 147: 148: /** 149: * Syntax bit. <B>Not implemented</B>. 150: */ 151: public static final int RE_NO_EMPTY_RANGES = 15; 152: 153: /** 154: * Syntax bit. An unmatched right parenthesis (')' or '\)', depending 155: * on RE_NO_BK_PARENS) will throw an exception when compiling. 156: */ 157: public static final int RE_UNMATCHED_RIGHT_PAREN_ORD = 16; 158: 159: /** 160: * Syntax bit. <B>Not implemented.</B> 161: */ 162: public static final int RE_HAT_LISTS_NOT_NEWLINE = 17; 163: 164: /** 165: * Syntax bit. Stingy matching is allowed (+?, *?, ??, {x,y}?). 166: */ 167: public static final int RE_STINGY_OPS = 18; 168: 169: /** 170: * Syntax bit. Allow character class escapes (\d, \D, \s, \S, \w, \W). 171: */ 172: public static final int RE_CHAR_CLASS_ESCAPES = 19; 173: 174: /** 175: * Syntax bit. Allow use of (?:xxx) grouping (subexpression is not saved). 176: */ 177: public static final int RE_PURE_GROUPING = 20; 178: 179: /** 180: * Syntax bit. Allow use of (?=xxx) and (?!xxx) apply the subexpression 181: * to the text following the current position without consuming that text. 182: */ 183: public static final int RE_LOOKAHEAD = 21; 184: 185: /** 186: * Syntax bit. Allow beginning- and end-of-string anchors (\A, \Z). 187: */ 188: public static final int RE_STRING_ANCHORS = 22; 189: 190: /** 191: * Syntax bit. Allow embedded comments, (?#comment), as in Perl5. 192: */ 193: public static final int RE_COMMENTS = 23; 194: 195: /** 196: * Syntax bit. Allow character class escapes within lists, as in Perl5. 197: */ 198: public static final int RE_CHAR_CLASS_ESC_IN_LISTS = 24; 199: 200: /** 201: * Syntax bit. Possessive matching is allowed (++, *+, ?+, {x,y}+). 202: */ 203: public static final int RE_POSSESSIVE_OPS = 25; 204: 205: /** 206: * Syntax bit. Allow embedded flags, (?is-x), as in Perl5. 207: */ 208: public static final int RE_EMBEDDED_FLAGS = 26; 209: 210: /** 211: * Syntax bit. Allow octal char (\0377), as in Perl5. 212: */ 213: public static final int RE_OCTAL_CHAR = 27; 214: 215: /** 216: * Syntax bit. Allow hex char (\x1b), as in Perl5. 217: */ 218: public static final int RE_HEX_CHAR = 28; 219: 220: /** 221: * Syntax bit. Allow Unicode char (\u1234), as in Java 1.4. 222: */ 223: public static final int RE_UNICODE_CHAR = 29; 224: 225: /** 226: * Syntax bit. Allow named property (\p{P}, \P{p}), as in Perl5. 227: */ 228: public static final int RE_NAMED_PROPERTY = 30; 229: 230: /** 231: * Syntax bit. Allow nested characterclass ([a-z&&[^p-r]]), as in Java 1.4. 232: */ 233: public static final int RE_NESTED_CHARCLASS = 31; 234: 235: private static final int BIT_TOTAL = 32; 236: 237: /** 238: * Predefined syntax. 239: * Emulates regular expression support in the awk utility. 240: */ 241: public static final RESyntax RE_SYNTAX_AWK; 242: 243: /** 244: * Predefined syntax. 245: * Emulates regular expression support in the ed utility. 246: */ 247: public static final RESyntax RE_SYNTAX_ED; 248: 249: /** 250: * Predefined syntax. 251: * Emulates regular expression support in the egrep utility. 252: */ 253: public static final RESyntax RE_SYNTAX_EGREP; 254: 255: /** 256: * Predefined syntax. 257: * Emulates regular expression support in the GNU Emacs editor. 258: */ 259: public static final RESyntax RE_SYNTAX_EMACS; 260: 261: /** 262: * Predefined syntax. 263: * Emulates regular expression support in the grep utility. 264: */ 265: public static final RESyntax RE_SYNTAX_GREP; 266: 267: /** 268: * Predefined syntax. 269: * Emulates regular expression support in the POSIX awk specification. 270: */ 271: public static final RESyntax RE_SYNTAX_POSIX_AWK; 272: 273: /** 274: * Predefined syntax. 275: * Emulates POSIX basic regular expression support. 276: */ 277: public static final RESyntax RE_SYNTAX_POSIX_BASIC; 278: 279: /** 280: * Predefined syntax. 281: * Emulates regular expression support in the POSIX egrep specification. 282: */ 283: public static final RESyntax RE_SYNTAX_POSIX_EGREP; 284: 285: /** 286: * Predefined syntax. 287: * Emulates POSIX extended regular expression support. 288: */ 289: public static final RESyntax RE_SYNTAX_POSIX_EXTENDED; 290: 291: /** 292: * Predefined syntax. 293: * Emulates POSIX basic minimal regular expressions. 294: */ 295: public static final RESyntax RE_SYNTAX_POSIX_MINIMAL_BASIC; 296: 297: /** 298: * Predefined syntax. 299: * Emulates POSIX extended minimal regular expressions. 300: */ 301: public static final RESyntax RE_SYNTAX_POSIX_MINIMAL_EXTENDED; 302: 303: /** 304: * Predefined syntax. 305: * Emulates regular expression support in the sed utility. 306: */ 307: public static final RESyntax RE_SYNTAX_SED; 308: 309: /** 310: * Predefined syntax. 311: * Emulates regular expression support in Larry Wall's perl, version 4, 312: */ 313: public static final RESyntax RE_SYNTAX_PERL4; 314: 315: /** 316: * Predefined syntax. 317: * Emulates regular expression support in Larry Wall's perl, version 4, 318: * using single line mode (/s modifier). 319: */ 320: public static final RESyntax RE_SYNTAX_PERL4_S; // single line mode (/s) 321: 322: /** 323: * Predefined syntax. 324: * Emulates regular expression support in Larry Wall's perl, version 5. 325: */ 326: public static final RESyntax RE_SYNTAX_PERL5; 327: 328: /** 329: * Predefined syntax. 330: * Emulates regular expression support in Larry Wall's perl, version 5, 331: * using single line mode (/s modifier). 332: */ 333: public static final RESyntax RE_SYNTAX_PERL5_S; 334: 335: /** 336: * Predefined syntax. 337: * Emulates regular expression support in Java 1.4's java.util.regex 338: * package. 339: */ 340: public static final RESyntax RE_SYNTAX_JAVA_1_4; 341: 342: static 343: { 344: // Define syntaxes 345: 346: RE_SYNTAX_EMACS = new RESyntax ().makeFinal (); 347: 348: RESyntax RE_SYNTAX_POSIX_COMMON = 349: new RESyntax ().set (RE_CHAR_CLASSES).set (RE_DOT_NEWLINE). 350: set (RE_DOT_NOT_NULL).set (RE_INTERVALS).set (RE_NO_EMPTY_RANGES). 351: makeFinal (); 352: 353: RE_SYNTAX_POSIX_BASIC = 354: new RESyntax (RE_SYNTAX_POSIX_COMMON).set (RE_BK_PLUS_QM).makeFinal (); 355: 356: RE_SYNTAX_POSIX_EXTENDED = 357: new RESyntax (RE_SYNTAX_POSIX_COMMON).set (RE_CONTEXT_INDEP_ANCHORS). 358: set (RE_CONTEXT_INDEP_OPS).set (RE_NO_BK_BRACES).set (RE_NO_BK_PARENS). 359: set (RE_NO_BK_VBAR).set (RE_UNMATCHED_RIGHT_PAREN_ORD).makeFinal (); 360: 361: RE_SYNTAX_AWK = 362: new RESyntax ().set (RE_BACKSLASH_ESCAPE_IN_LISTS). 363: set (RE_DOT_NOT_NULL).set (RE_NO_BK_PARENS).set (RE_NO_BK_REFS). 364: set (RE_NO_BK_VBAR).set (RE_NO_EMPTY_RANGES). 365: set (RE_UNMATCHED_RIGHT_PAREN_ORD).makeFinal (); 366: 367: RE_SYNTAX_POSIX_AWK = 368: new RESyntax (RE_SYNTAX_POSIX_EXTENDED). 369: set (RE_BACKSLASH_ESCAPE_IN_LISTS).makeFinal (); 370: 371: RE_SYNTAX_GREP = 372: new RESyntax ().set (RE_BK_PLUS_QM).set (RE_CHAR_CLASSES). 373: set (RE_HAT_LISTS_NOT_NEWLINE).set (RE_INTERVALS).set (RE_NEWLINE_ALT). 374: makeFinal (); 375: 376: RE_SYNTAX_EGREP = 377: new RESyntax ().set (RE_CHAR_CLASSES).set (RE_CONTEXT_INDEP_ANCHORS). 378: set (RE_CONTEXT_INDEP_OPS).set (RE_HAT_LISTS_NOT_NEWLINE). 379: set (RE_NEWLINE_ALT).set (RE_NO_BK_PARENS).set (RE_NO_BK_VBAR). 380: makeFinal (); 381: 382: RE_SYNTAX_POSIX_EGREP = 383: new RESyntax (RE_SYNTAX_EGREP).set (RE_INTERVALS).set (RE_NO_BK_BRACES). 384: makeFinal (); 385: 386: /* P1003.2/D11.2, section 4.20.7.1, lines 5078ff. */ 387: 388: RE_SYNTAX_ED = new RESyntax (RE_SYNTAX_POSIX_BASIC).makeFinal (); 389: 390: RE_SYNTAX_SED = new RESyntax (RE_SYNTAX_POSIX_BASIC).makeFinal (); 391: 392: RE_SYNTAX_POSIX_MINIMAL_BASIC = 393: new RESyntax (RE_SYNTAX_POSIX_COMMON).set (RE_LIMITED_OPS).makeFinal (); 394: 395: /* Differs from RE_SYNTAX_POSIX_EXTENDED in that RE_CONTEXT_INVALID_OPS 396: replaces RE_CONTEXT_INDEP_OPS and RE_NO_BK_REFS is added. */ 397: 398: RE_SYNTAX_POSIX_MINIMAL_EXTENDED = 399: new RESyntax (RE_SYNTAX_POSIX_COMMON).set (RE_CONTEXT_INDEP_ANCHORS). 400: set (RE_CONTEXT_INVALID_OPS).set (RE_NO_BK_BRACES). 401: set (RE_NO_BK_PARENS).set (RE_NO_BK_REFS).set (RE_NO_BK_VBAR). 402: set (RE_UNMATCHED_RIGHT_PAREN_ORD).makeFinal (); 403: 404: /* There is no official Perl spec, but here's a "best guess" */ 405: 406: RE_SYNTAX_PERL4 = new RESyntax ().set (RE_BACKSLASH_ESCAPE_IN_LISTS).set (RE_CONTEXT_INDEP_ANCHORS).set (RE_CONTEXT_INDEP_OPS) // except for '{', apparently 407: .set (RE_INTERVALS).set (RE_NO_BK_BRACES).set (RE_NO_BK_PARENS).set (RE_NO_BK_VBAR).set (RE_NO_EMPTY_RANGES).set (RE_CHAR_CLASS_ESCAPES) // \d,\D,\w,\W,\s,\S 408: .makeFinal (); 409: 410: RE_SYNTAX_PERL4_S = 411: new RESyntax (RE_SYNTAX_PERL4).set (RE_DOT_NEWLINE).makeFinal (); 412: 413: RE_SYNTAX_PERL5 = new RESyntax (RE_SYNTAX_PERL4).set (RE_PURE_GROUPING) // (?:) 414: .set (RE_STINGY_OPS) // *?,??,+?,{}? 415: .set (RE_LOOKAHEAD) // (?=)(?!) 416: .set (RE_STRING_ANCHORS) // \A,\Z 417: .set (RE_CHAR_CLASS_ESC_IN_LISTS) // \d,\D,\w,\W,\s,\S within [] 418: .set (RE_COMMENTS) // (?#) 419: .set (RE_EMBEDDED_FLAGS) // (?imsx-imsx) 420: .set (RE_OCTAL_CHAR) // \0377 421: .set (RE_HEX_CHAR) // \x1b 422: .set (RE_NAMED_PROPERTY) // \p{prop}, \P{prop} 423: .makeFinal (); 424: 425: RE_SYNTAX_PERL5_S = 426: new RESyntax (RE_SYNTAX_PERL5).set (RE_DOT_NEWLINE).makeFinal (); 427: 428: RE_SYNTAX_JAVA_1_4 = new RESyntax (RE_SYNTAX_PERL5) 429: // XXX 430: .set (RE_POSSESSIVE_OPS) // *+,?+,++,{}+ 431: .set (RE_UNICODE_CHAR) // \u1234 432: .set (RE_NESTED_CHARCLASS) // [a-z&&[^p-r]] 433: .makeFinal (); 434: } 435: 436: /** 437: * Construct a new syntax object with all bits turned off. 438: * This is equivalent to RE_SYNTAX_EMACS. 439: */ 440: public RESyntax () 441: { 442: bits = new BitSet (BIT_TOTAL); 443: } 444: 445: /** 446: * Called internally when constructing predefined syntaxes 447: * so their interpretation cannot vary. Conceivably useful 448: * for your syntaxes as well. Causes IllegalAccessError to 449: * be thrown if any attempt to modify the syntax is made. 450: * 451: * @return this object for convenient chaining 452: */ 453: public RESyntax makeFinal () 454: { 455: isFinal = true; 456: return this; 457: } 458: 459: /** 460: * Construct a new syntax object with all bits set the same 461: * as the other syntax. 462: */ 463: public RESyntax (RESyntax other) 464: { 465: bits = (BitSet) other.bits.clone (); 466: } 467: 468: /** 469: * Check if a given bit is set in this syntax. 470: */ 471: public boolean get (int index) 472: { 473: return bits.get (index); 474: } 475: 476: /** 477: * Set a given bit in this syntax. 478: * 479: * @param index the constant (RESyntax.RE_xxx) bit to set. 480: * @return a reference to this object for easy chaining. 481: */ 482: public RESyntax set (int index) 483: { 484: if (isFinal) 485: throw new IllegalAccessError (RE.getLocalizedMessage ("syntax.final")); 486: bits.set (index); 487: return this; 488: } 489: 490: /** 491: * Clear a given bit in this syntax. 492: * 493: * @param index the constant (RESyntax.RE_xxx) bit to clear. 494: * @return a reference to this object for easy chaining. 495: */ 496: public RESyntax clear (int index) 497: { 498: if (isFinal) 499: throw new IllegalAccessError (RE.getLocalizedMessage ("syntax.final")); 500: bits.clear (index); 501: return this; 502: } 503: 504: /** 505: * Changes the line separator string for regular expressions 506: * created using this RESyntax. The default separator is the 507: * value returned by the system property "line.separator", which 508: * should be correct when reading platform-specific files from a 509: * filesystem. However, many programs may collect input from 510: * sources where the line separator is differently specified (for 511: * example, in the applet environment, the text box widget 512: * interprets line breaks as single-character newlines, 513: * regardless of the host platform. 514: * 515: * Note that setting the line separator to a character or 516: * characters that have specific meaning within the current syntax 517: * can cause unexpected chronosynclastic infundibula. 518: * 519: * @return this object for convenient chaining 520: */ 521: public RESyntax setLineSeparator (String aSeparator) 522: { 523: if (isFinal) 524: throw new IllegalAccessError (RE.getLocalizedMessage ("syntax.final")); 525: lineSeparator = aSeparator; 526: return this; 527: } 528: 529: /** 530: * Returns the currently active line separator string. The default 531: * is the platform-dependent system property "line.separator". 532: */ 533: public String getLineSeparator () 534: { 535: return lineSeparator; 536: } 537: }