Source for gnu.java.util.regex.REMatch

   1: /* gnu/regexp/REMatch.java
   2:    Copyright (C) 2006 Free Software Foundation, Inc.
   3: 
   4: This file is part of GNU Classpath.
   5: 
   6: GNU Classpath is free software; you can redistribute it and/or modify
   7: it under the terms of the GNU General Public License as published by
   8: the Free Software Foundation; either version 2, or (at your option)
   9: any later version.
  10: 
  11: GNU Classpath is distributed in the hope that it will be useful, but
  12: WITHOUT ANY WARRANTY; without even the implied warranty of
  13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14: General Public License for more details.
  15: 
  16: You should have received a copy of the GNU General Public License
  17: along with GNU Classpath; see the file COPYING.  If not, write to the
  18: Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  19: 02110-1301 USA.
  20: 
  21: Linking this library statically or dynamically with other modules is
  22: making a combined work based on this library.  Thus, the terms and
  23: conditions of the GNU General Public License cover the whole
  24: combination.
  25: 
  26: As a special exception, the copyright holders of this library give you
  27: permission to link this library with independent modules to produce an
  28: executable, regardless of the license terms of these independent
  29: modules, and to copy and distribute the resulting executable under
  30: terms of your choice, provided that you also meet, for each linked
  31: independent module, the terms and conditions of the license of that
  32: module.  An independent module is a module which is not derived from
  33: or based on this library.  If you modify this library, you may extend
  34: this exception to your version of the library, but you are not
  35: obligated to do so.  If you do not wish to do so, delete this
  36: exception statement from your version. */
  37: 
  38: 
  39: package gnu.java.util.regex;
  40: 
  41: import gnu.java.lang.CPStringBuilder;
  42: 
  43: import java.io.Serializable;
  44: 
  45: /**
  46:  * An instance of this class represents a match
  47:  * completed by a gnu.regexp matching function. It can be used
  48:  * to obtain relevant information about the location of a match
  49:  * or submatch.
  50:  *
  51:  * @author <A HREF="mailto:wes@cacas.org">Wes Biggs</A>
  52:  */
  53: public final class REMatch implements Serializable, Cloneable
  54: {
  55:   private String matchedText;
  56:   private CharIndexed matchedCharIndexed;
  57: 
  58:   // These variables are package scope for fast access within the engine
  59:   int eflags;                   // execution flags this match was made using
  60: 
  61:   // Offset in source text where match was tried.  This is zero-based;
  62:   // the actual position in the source text is given by (offset + anchor).
  63:   int offset;
  64: 
  65:   // Anchor position refers to the index into the source input
  66:   // at which the matching operation began.
  67:   // This is also useful for the ANCHORINDEX option.
  68:   int anchor;
  69: 
  70:   // Package scope; used by RE.
  71:   int index;                    // used while matching to mark current match position in input
  72:   // start1[i] is set when the i-th subexp starts. And start1[i] is copied
  73:   // to start[i] when the i-th subexp ends.  So start[i] keeps the previously
  74:   // assigned value while the i-th subexp is being processed. This makes
  75:   // backreference to the i-th subexp within the i-th subexp possible.
  76:   int[] start;                  // start positions (relative to offset) for each (sub)exp.
  77:   int[] start1;                 // start positions (relative to offset) for each (sub)exp.
  78:   int[] end;                    // end positions for the same
  79:   // start[i] == -1 or end[i] == -1 means that the start/end position is void.
  80:   // start[i] == p or end[i] == p where p < 0 and p != -1 means that
  81:   // the actual start/end position is (p+1). Start/end positions may
  82:   // become negative when the subexpression is in a RETokenLookBehind.
  83:   boolean empty;                // empty string matched. This flag is used only within
  84:   // RETokenRepeated.
  85: 
  86:   BacktrackStack backtrackStack;
  87: 
  88:   public Object clone ()
  89:   {
  90:     try
  91:     {
  92:       REMatch copy = (REMatch) super.clone ();
  93: 
  94:         copy.start = (int[]) start.clone ();
  95:         copy.start1 = (int[]) start1.clone ();
  96:         copy.end = (int[]) end.clone ();
  97: 
  98:         return copy;
  99:     }
 100:     catch (CloneNotSupportedException e)
 101:     {
 102:       throw new Error ();       // doesn't happen
 103:     }
 104:   }
 105: 
 106:   void assignFrom (REMatch other)
 107:   {
 108:     start = other.start;
 109:     start1 = other.start1;
 110:     end = other.end;
 111:     index = other.index;
 112:     backtrackStack = other.backtrackStack;
 113:   }
 114: 
 115:   REMatch (int subs, int anchor, int eflags)
 116:   {
 117:     start = new int[subs + 1];
 118:     start1 = new int[subs + 1];
 119:     end = new int[subs + 1];
 120:     this.anchor = anchor;
 121:     this.eflags = eflags;
 122:     clear (anchor);
 123:   }
 124: 
 125:   void finish (CharIndexed text)
 126:   {
 127:     start[0] = 0;
 128:     CPStringBuilder sb = new CPStringBuilder ();
 129:     int i;
 130:     for (i = 0; i < end[0]; i++)
 131:       sb.append (text.charAt (i));
 132:     matchedText = sb.toString ();
 133:     matchedCharIndexed = text;
 134:     for (i = 0; i < start.length; i++)
 135:       {
 136:         // If any subexpressions didn't terminate, they don't count
 137:         // TODO check if this code ever gets hit
 138:         if ((start[i] == -1) ^ (end[i] == -1))
 139:           {
 140:             start[i] = -1;
 141:             end[i] = -1;
 142:           }
 143:       }
 144:     backtrackStack = null;
 145:   }
 146: 
 147:     /** Clears the current match and moves the offset to the new index. */
 148:   void clear (int index)
 149:   {
 150:     offset = index;
 151:     this.index = 0;
 152:     for (int i = 0; i < start.length; i++)
 153:       {
 154:         start[i] = start1[i] = end[i] = -1;
 155:       }
 156:     backtrackStack = null;
 157:   }
 158: 
 159:     /**
 160:      * Returns the string matching the pattern.  This makes it convenient
 161:      * to write code like the following:
 162:      * <P>
 163:      * <code>
 164:      * REMatch myMatch = myExpression.getMatch(myString);<br>
 165:      * if (myMatch != null) System.out.println("Regexp found: "+myMatch);
 166:      * </code>
 167:      */
 168:   public String toString ()
 169:   {
 170:     return matchedText;
 171:   }
 172: 
 173:     /**
 174:      * Returns the index within the input text where the match in its entirety
 175:      * began.
 176:      */
 177:   public int getStartIndex ()
 178:   {
 179:     return offset + start[0];
 180:   }
 181: 
 182:     /**
 183:      * Returns the index within the input string where the match in
 184:      * its entirety ends.  The return value is the next position after
 185:      * the end of the string; therefore, a match created by the
 186:      * following call:
 187:      *
 188:      * <P>
 189:      * <code>REMatch myMatch = myExpression.getMatch(myString);</code>
 190:      * <P>
 191:      * can be viewed (given that myMatch is not null) by creating
 192:      * <P>
 193:      * <code>String theMatch = myString.substring(myMatch.getStartIndex(),
 194:      * myMatch.getEndIndex());</code>
 195:      * <P>
 196:      * But you can save yourself that work, since the <code>toString()</code>
 197:      * method (above) does exactly that for you.
 198:      */
 199:   public int getEndIndex ()
 200:   {
 201:     return offset + end[0];
 202:   }
 203: 
 204:     /**
 205:      * Returns the string matching the given subexpression.  The subexpressions
 206:      * are indexed starting with one, not zero.  That is, the subexpression
 207:      * identified by the first set of parentheses in a regular expression
 208:      * could be retrieved from an REMatch by calling match.toString(1).
 209:      *
 210:      * @param sub Index of the subexpression.
 211:      */
 212:   public String toString (int sub)
 213:   {
 214:     if ((sub >= start.length) || sub < 0)
 215:       throw new IndexOutOfBoundsException ("No group " + sub);
 216:     if (start[sub] == -1)
 217:       return null;
 218:     if (start[sub] >= 0 && end[sub] <= matchedText.length ())
 219:       return (matchedText.substring (start[sub], end[sub]));
 220:     else
 221:       {
 222:         // This case occurs with RETokenLookAhead or RETokenLookBehind.
 223:         CPStringBuilder sb = new CPStringBuilder ();
 224:         int s = start[sub];
 225:         int e = end[sub];
 226:         if (s < 0)
 227:           s += 1;
 228:         if (e < 0)
 229:           e += 1;
 230:         for (int i = start[0] + s; i < start[0] + e; i++)
 231:           sb.append (matchedCharIndexed.charAt (i));
 232:         return sb.toString ();
 233:       }
 234:   }
 235: 
 236:     /**
 237:      * Returns the index within the input string used to generate this match
 238:      * where subexpression number <i>sub</i> begins, or <code>-1</code> if
 239:      * the subexpression does not exist.  The initial position is zero.
 240:      *
 241:      * @param sub Subexpression index
 242:      * @deprecated Use getStartIndex(int) instead.
 243:      */
 244:   public int getSubStartIndex (int sub)
 245:   {
 246:     if (sub >= start.length)
 247:       return -1;
 248:     int x = start[sub];
 249:     return (x == -1) ? x : (x >= 0) ? offset + x : offset + x + 1;
 250:   }
 251: 
 252:     /**
 253:      * Returns the index within the input string used to generate this match
 254:      * where subexpression number <i>sub</i> begins, or <code>-1</code> if
 255:      * the subexpression does not exist.  The initial position is zero.
 256:      *
 257:      * @param sub Subexpression index
 258:      * @since gnu.regexp 1.1.0
 259:      */
 260:   public int getStartIndex (int sub)
 261:   {
 262:     if (sub >= start.length)
 263:       return -1;
 264:     int x = start[sub];
 265:     return (x == -1) ? x : (x >= 0) ? offset + x : offset + x + 1;
 266:   }
 267: 
 268:     /**
 269:      * Returns the index within the input string used to generate this match
 270:      * where subexpression number <i>sub</i> ends, or <code>-1</code> if
 271:      * the subexpression does not exist.  The initial position is zero.
 272:      *
 273:      * @param sub Subexpression index
 274:      * @deprecated Use getEndIndex(int) instead
 275:      */
 276:   public int getSubEndIndex (int sub)
 277:   {
 278:     if (sub >= start.length)
 279:       return -1;
 280:     int x = end[sub];
 281:     return (x == -1) ? x : (x >= 0) ? offset + x : offset + x + 1;
 282:   }
 283: 
 284:     /**
 285:      * Returns the index within the input string used to generate this match
 286:      * where subexpression number <i>sub</i> ends, or <code>-1</code> if
 287:      * the subexpression does not exist.  The initial position is zero.
 288:      *
 289:      * @param sub Subexpression index
 290:      */
 291:   public int getEndIndex (int sub)
 292:   {
 293:     if (sub >= start.length)
 294:       return -1;
 295:     int x = end[sub];
 296:     return (x == -1) ? x : (x >= 0) ? offset + x : offset + x + 1;
 297:   }
 298: 
 299:     /**
 300:      * Substitute the results of this match to create a new string.
 301:      * This is patterned after PERL, so the tokens to watch out for are
 302:      * <code>$0</code> through <code>$9</code>.  <code>$0</code> matches
 303:      * the full substring matched; <code>$<i>n</i></code> matches
 304:      * subexpression number <i>n</i>.
 305:      * <code>$10, $11, ...</code> may match the 10th, 11th, ... subexpressions
 306:      * if such subexpressions exist.
 307:      *
 308:      * @param input A string consisting of literals and <code>$<i>n</i></code> tokens.
 309:      */
 310:   public String substituteInto (String input)
 311:   {
 312:     // a la Perl, $0 is whole thing, $1 - $9 are subexpressions
 313:     CPStringBuilder output = new CPStringBuilder ();
 314:     int pos;
 315:     for (pos = 0; pos < input.length () - 1; pos++)
 316:       {
 317:         if ((input.charAt (pos) == '$')
 318:             && (Character.isDigit (input.charAt (pos + 1))))
 319:           {
 320:             int val = Character.digit (input.charAt (++pos), 10);
 321:             int pos1 = pos + 1;
 322:             while (pos1 < input.length () &&
 323:                    Character.isDigit (input.charAt (pos1)))
 324:               {
 325:                 int val1 =
 326:                   val * 10 + Character.digit (input.charAt (pos1), 10);
 327:                 if (val1 >= start.length)
 328:                   break;
 329:                 pos1++;
 330:                 val = val1;
 331:               }
 332:             pos = pos1 - 1;
 333: 
 334:             if (val < start.length)
 335:               {
 336:                 output.append (toString (val));
 337:               }
 338:           }
 339:         else
 340:           output.append (input.charAt (pos));
 341:       }
 342:     if (pos < input.length ())
 343:       output.append (input.charAt (pos));
 344:     return output.toString ();
 345:   }
 346: 
 347: /*  The following are used for debugging purpose
 348:     public static String d(REMatch m) {
 349:         if (m == null) return "null";
 350:         else return "[" + m.index + "]";
 351:     }
 352: 
 353:     public String substringUptoIndex(CharIndexed input) {
 354:         StringBuffer sb = new StringBuffer();
 355:         for (int i = 0; i < index; i++) {
 356:             sb.append(input.charAt(i));
 357:         }
 358:         return sb.toString();
 359:     }
 360: */
 361: 
 362: }