Frames | No Frames |
1: /* gnu/regexp/REMatch.java 2: Copyright (C) 2006 Free Software Foundation, Inc. 3: 4: This file is part of GNU Classpath. 5: 6: GNU Classpath is free software; you can redistribute it and/or modify 7: it under the terms of the GNU General Public License as published by 8: the Free Software Foundation; either version 2, or (at your option) 9: any later version. 10: 11: GNU Classpath is distributed in the hope that it will be useful, but 12: WITHOUT ANY WARRANTY; without even the implied warranty of 13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14: General Public License for more details. 15: 16: You should have received a copy of the GNU General Public License 17: along with GNU Classpath; see the file COPYING. If not, write to the 18: Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 19: 02110-1301 USA. 20: 21: Linking this library statically or dynamically with other modules is 22: making a combined work based on this library. Thus, the terms and 23: conditions of the GNU General Public License cover the whole 24: combination. 25: 26: As a special exception, the copyright holders of this library give you 27: permission to link this library with independent modules to produce an 28: executable, regardless of the license terms of these independent 29: modules, and to copy and distribute the resulting executable under 30: terms of your choice, provided that you also meet, for each linked 31: independent module, the terms and conditions of the license of that 32: module. An independent module is a module which is not derived from 33: or based on this library. If you modify this library, you may extend 34: this exception to your version of the library, but you are not 35: obligated to do so. If you do not wish to do so, delete this 36: exception statement from your version. */ 37: 38: 39: package gnu.java.util.regex; 40: 41: import gnu.java.lang.CPStringBuilder; 42: 43: import java.io.Serializable; 44: 45: /** 46: * An instance of this class represents a match 47: * completed by a gnu.regexp matching function. It can be used 48: * to obtain relevant information about the location of a match 49: * or submatch. 50: * 51: * @author <A HREF="mailto:wes@cacas.org">Wes Biggs</A> 52: */ 53: public final class REMatch implements Serializable, Cloneable 54: { 55: private String matchedText; 56: private CharIndexed matchedCharIndexed; 57: 58: // These variables are package scope for fast access within the engine 59: int eflags; // execution flags this match was made using 60: 61: // Offset in source text where match was tried. This is zero-based; 62: // the actual position in the source text is given by (offset + anchor). 63: int offset; 64: 65: // Anchor position refers to the index into the source input 66: // at which the matching operation began. 67: // This is also useful for the ANCHORINDEX option. 68: int anchor; 69: 70: // Package scope; used by RE. 71: int index; // used while matching to mark current match position in input 72: // start1[i] is set when the i-th subexp starts. And start1[i] is copied 73: // to start[i] when the i-th subexp ends. So start[i] keeps the previously 74: // assigned value while the i-th subexp is being processed. This makes 75: // backreference to the i-th subexp within the i-th subexp possible. 76: int[] start; // start positions (relative to offset) for each (sub)exp. 77: int[] start1; // start positions (relative to offset) for each (sub)exp. 78: int[] end; // end positions for the same 79: // start[i] == -1 or end[i] == -1 means that the start/end position is void. 80: // start[i] == p or end[i] == p where p < 0 and p != -1 means that 81: // the actual start/end position is (p+1). Start/end positions may 82: // become negative when the subexpression is in a RETokenLookBehind. 83: boolean empty; // empty string matched. This flag is used only within 84: // RETokenRepeated. 85: 86: BacktrackStack backtrackStack; 87: 88: public Object clone () 89: { 90: try 91: { 92: REMatch copy = (REMatch) super.clone (); 93: 94: copy.start = (int[]) start.clone (); 95: copy.start1 = (int[]) start1.clone (); 96: copy.end = (int[]) end.clone (); 97: 98: return copy; 99: } 100: catch (CloneNotSupportedException e) 101: { 102: throw new Error (); // doesn't happen 103: } 104: } 105: 106: void assignFrom (REMatch other) 107: { 108: start = other.start; 109: start1 = other.start1; 110: end = other.end; 111: index = other.index; 112: backtrackStack = other.backtrackStack; 113: } 114: 115: REMatch (int subs, int anchor, int eflags) 116: { 117: start = new int[subs + 1]; 118: start1 = new int[subs + 1]; 119: end = new int[subs + 1]; 120: this.anchor = anchor; 121: this.eflags = eflags; 122: clear (anchor); 123: } 124: 125: void finish (CharIndexed text) 126: { 127: start[0] = 0; 128: CPStringBuilder sb = new CPStringBuilder (); 129: int i; 130: for (i = 0; i < end[0]; i++) 131: sb.append (text.charAt (i)); 132: matchedText = sb.toString (); 133: matchedCharIndexed = text; 134: for (i = 0; i < start.length; i++) 135: { 136: // If any subexpressions didn't terminate, they don't count 137: // TODO check if this code ever gets hit 138: if ((start[i] == -1) ^ (end[i] == -1)) 139: { 140: start[i] = -1; 141: end[i] = -1; 142: } 143: } 144: backtrackStack = null; 145: } 146: 147: /** Clears the current match and moves the offset to the new index. */ 148: void clear (int index) 149: { 150: offset = index; 151: this.index = 0; 152: for (int i = 0; i < start.length; i++) 153: { 154: start[i] = start1[i] = end[i] = -1; 155: } 156: backtrackStack = null; 157: } 158: 159: /** 160: * Returns the string matching the pattern. This makes it convenient 161: * to write code like the following: 162: * <P> 163: * <code> 164: * REMatch myMatch = myExpression.getMatch(myString);<br> 165: * if (myMatch != null) System.out.println("Regexp found: "+myMatch); 166: * </code> 167: */ 168: public String toString () 169: { 170: return matchedText; 171: } 172: 173: /** 174: * Returns the index within the input text where the match in its entirety 175: * began. 176: */ 177: public int getStartIndex () 178: { 179: return offset + start[0]; 180: } 181: 182: /** 183: * Returns the index within the input string where the match in 184: * its entirety ends. The return value is the next position after 185: * the end of the string; therefore, a match created by the 186: * following call: 187: * 188: * <P> 189: * <code>REMatch myMatch = myExpression.getMatch(myString);</code> 190: * <P> 191: * can be viewed (given that myMatch is not null) by creating 192: * <P> 193: * <code>String theMatch = myString.substring(myMatch.getStartIndex(), 194: * myMatch.getEndIndex());</code> 195: * <P> 196: * But you can save yourself that work, since the <code>toString()</code> 197: * method (above) does exactly that for you. 198: */ 199: public int getEndIndex () 200: { 201: return offset + end[0]; 202: } 203: 204: /** 205: * Returns the string matching the given subexpression. The subexpressions 206: * are indexed starting with one, not zero. That is, the subexpression 207: * identified by the first set of parentheses in a regular expression 208: * could be retrieved from an REMatch by calling match.toString(1). 209: * 210: * @param sub Index of the subexpression. 211: */ 212: public String toString (int sub) 213: { 214: if ((sub >= start.length) || sub < 0) 215: throw new IndexOutOfBoundsException ("No group " + sub); 216: if (start[sub] == -1) 217: return null; 218: if (start[sub] >= 0 && end[sub] <= matchedText.length ()) 219: return (matchedText.substring (start[sub], end[sub])); 220: else 221: { 222: // This case occurs with RETokenLookAhead or RETokenLookBehind. 223: CPStringBuilder sb = new CPStringBuilder (); 224: int s = start[sub]; 225: int e = end[sub]; 226: if (s < 0) 227: s += 1; 228: if (e < 0) 229: e += 1; 230: for (int i = start[0] + s; i < start[0] + e; i++) 231: sb.append (matchedCharIndexed.charAt (i)); 232: return sb.toString (); 233: } 234: } 235: 236: /** 237: * Returns the index within the input string used to generate this match 238: * where subexpression number <i>sub</i> begins, or <code>-1</code> if 239: * the subexpression does not exist. The initial position is zero. 240: * 241: * @param sub Subexpression index 242: * @deprecated Use getStartIndex(int) instead. 243: */ 244: public int getSubStartIndex (int sub) 245: { 246: if (sub >= start.length) 247: return -1; 248: int x = start[sub]; 249: return (x == -1) ? x : (x >= 0) ? offset + x : offset + x + 1; 250: } 251: 252: /** 253: * Returns the index within the input string used to generate this match 254: * where subexpression number <i>sub</i> begins, or <code>-1</code> if 255: * the subexpression does not exist. The initial position is zero. 256: * 257: * @param sub Subexpression index 258: * @since gnu.regexp 1.1.0 259: */ 260: public int getStartIndex (int sub) 261: { 262: if (sub >= start.length) 263: return -1; 264: int x = start[sub]; 265: return (x == -1) ? x : (x >= 0) ? offset + x : offset + x + 1; 266: } 267: 268: /** 269: * Returns the index within the input string used to generate this match 270: * where subexpression number <i>sub</i> ends, or <code>-1</code> if 271: * the subexpression does not exist. The initial position is zero. 272: * 273: * @param sub Subexpression index 274: * @deprecated Use getEndIndex(int) instead 275: */ 276: public int getSubEndIndex (int sub) 277: { 278: if (sub >= start.length) 279: return -1; 280: int x = end[sub]; 281: return (x == -1) ? x : (x >= 0) ? offset + x : offset + x + 1; 282: } 283: 284: /** 285: * Returns the index within the input string used to generate this match 286: * where subexpression number <i>sub</i> ends, or <code>-1</code> if 287: * the subexpression does not exist. The initial position is zero. 288: * 289: * @param sub Subexpression index 290: */ 291: public int getEndIndex (int sub) 292: { 293: if (sub >= start.length) 294: return -1; 295: int x = end[sub]; 296: return (x == -1) ? x : (x >= 0) ? offset + x : offset + x + 1; 297: } 298: 299: /** 300: * Substitute the results of this match to create a new string. 301: * This is patterned after PERL, so the tokens to watch out for are 302: * <code>$0</code> through <code>$9</code>. <code>$0</code> matches 303: * the full substring matched; <code>$<i>n</i></code> matches 304: * subexpression number <i>n</i>. 305: * <code>$10, $11, ...</code> may match the 10th, 11th, ... subexpressions 306: * if such subexpressions exist. 307: * 308: * @param input A string consisting of literals and <code>$<i>n</i></code> tokens. 309: */ 310: public String substituteInto (String input) 311: { 312: // a la Perl, $0 is whole thing, $1 - $9 are subexpressions 313: CPStringBuilder output = new CPStringBuilder (); 314: int pos; 315: for (pos = 0; pos < input.length () - 1; pos++) 316: { 317: if ((input.charAt (pos) == '$') 318: && (Character.isDigit (input.charAt (pos + 1)))) 319: { 320: int val = Character.digit (input.charAt (++pos), 10); 321: int pos1 = pos + 1; 322: while (pos1 < input.length () && 323: Character.isDigit (input.charAt (pos1))) 324: { 325: int val1 = 326: val * 10 + Character.digit (input.charAt (pos1), 10); 327: if (val1 >= start.length) 328: break; 329: pos1++; 330: val = val1; 331: } 332: pos = pos1 - 1; 333: 334: if (val < start.length) 335: { 336: output.append (toString (val)); 337: } 338: } 339: else 340: output.append (input.charAt (pos)); 341: } 342: if (pos < input.length ()) 343: output.append (input.charAt (pos)); 344: return output.toString (); 345: } 346: 347: /* The following are used for debugging purpose 348: public static String d(REMatch m) { 349: if (m == null) return "null"; 350: else return "[" + m.index + "]"; 351: } 352: 353: public String substringUptoIndex(CharIndexed input) { 354: StringBuffer sb = new StringBuffer(); 355: for (int i = 0; i < index; i++) { 356: sb.append(input.charAt(i)); 357: } 358: return sb.toString(); 359: } 360: */ 361: 362: }