1:
37:
38: package ;
39:
40: import ;
41:
42: import ;
43: import ;
44:
45: import ;
46: import ;
47: import ;
48: import ;
49: import ;
50:
51:
121:
122: public class RE extends REToken
123: {
124:
125: private static final class IntPair implements Serializable
126: {
127: public int first, second;
128: }
129:
130: private static final class CharUnit implements Serializable
131: {
132: public char ch;
133: public boolean bk;
134: }
135:
136:
137: private static final String VERSION = "1.1.5-dev";
138:
139:
140:
141: private static ResourceBundle messages;
142:
143:
144: private static final String bundle = "gnu/java/util/regex/MessagesBundle";
145:
146:
147:
148: private REToken firstToken, lastToken;
149:
150:
151:
152: private int numSubs;
153:
154:
155: private int minimumLength;
156: private int maximumLength;
157:
158:
162: public static final int REG_ICASE = 0x02;
163:
164:
170: public static final int REG_DOT_NEWLINE = 0x04;
171:
172:
177: public static final int REG_MULTILINE = 0x08;
178:
179:
206: public static final int REG_NOTBOL = 0x10;
207:
208:
213: public static final int REG_NOTEOL = 0x20;
214:
215:
227: public static final int REG_ANCHORINDEX = 0x40;
228:
229:
236: public static final int REG_NO_INTERPOLATE = 0x80;
237:
238:
243: public static final int REG_TRY_ENTIRE_MATCH = 0x0100;
244:
245:
253: public static final int REG_REPLACE_USE_BACKSLASHESCAPE = 0x0200;
254:
255:
259: public static final int REG_X_COMMENTS = 0x0400;
260:
261:
264: public static final int REG_ICASE_USASCII = 0x0800;
265:
266:
271: public static final int REG_FIX_STARTING_POSITION = 0x1000;
272:
273:
274: public static final String version ()
275: {
276: return VERSION;
277: }
278:
279:
280: static final String getLocalizedMessage (String key)
281: {
282: if (messages == null)
283: messages =
284: PropertyResourceBundle.getBundle (bundle, Locale.getDefault ());
285: return messages.getString (key);
286: }
287:
288:
/**
 * Compiles <code>pattern</code> with no compilation flags and the
 * RESyntax.RE_SYNTAX_PERL5 syntax.
 *
 * @param pattern the pattern; see initialize for the accepted types
 * @throws REException if the pattern cannot be compiled
 */
public RE (Object pattern) throws REException
{
  this (pattern,
        0,                           // no compilation flags
        RESyntax.RE_SYNTAX_PERL5,
        0,                           // subexpression index of the whole RE
        0);                          // no subexpressions precede it
}
302:
303:
/**
 * Compiles <code>pattern</code> with the given compilation flags and
 * the RESyntax.RE_SYNTAX_PERL5 syntax.
 *
 * @param pattern the pattern; see initialize for the accepted types
 * @param cflags bitwise OR of REG_* compilation flags
 * @throws REException if the pattern cannot be compiled
 */
public RE (Object pattern, int cflags) throws REException
{
  this (pattern,
        cflags,
        RESyntax.RE_SYNTAX_PERL5,
        0,                           // subexpression index of the whole RE
        0);                          // no subexpressions precede it
}
318:
319:
/**
 * Compiles <code>pattern</code> with the given compilation flags and
 * syntax.
 *
 * @param pattern the pattern; see initialize for the accepted types
 * @param cflags bitwise OR of REG_* compilation flags
 * @param syntax the syntax the pattern is written in
 * @throws REException if the pattern cannot be compiled
 */
public RE (Object pattern, int cflags, RESyntax syntax) throws REException
{
  this (pattern,
        cflags,
        syntax,
        0,                           // subexpression index of the whole RE
        0);                          // no subexpressions precede it
}
335:
336:
/**
 * Wraps an already built token chain in a new RE and terminates it
 * with a RETokenEndSub.  The compiler uses this to turn the tokens of
 * one alternative into a branch.
 *
 * @param first head of the token chain
 * @param last tail of the token chain
 * @param subs number of subexpressions in the chain
 * @param subIndex subexpression index, passed to the superclass
 * @param minLength value stored in minimumLength
 * @param maxLength value stored in maximumLength
 */
private RE (REToken first, REToken last, int subs, int subIndex,
            int minLength, int maxLength)
{
  super (subIndex);
  this.firstToken = first;
  this.lastToken = last;
  this.numSubs = subs;
  this.minimumLength = minLength;
  this.maximumLength = maxLength;
  // Must come last: the end marker is appended to the chain set up above.
  addToken (new RETokenEndSub (subIndex));
}
348:
/**
 * Compiles <code>patternObj</code> as the expression with subexpression
 * index <code>myIndex</code>.  All public constructors, and the
 * compiler itself when it meets a parenthesised group, end up here.
 *
 * @param patternObj the pattern; see initialize for the accepted types
 * @param cflags bitwise OR of REG_* compilation flags
 * @param syntax the syntax the pattern is written in
 * @param myIndex subexpression index, passed to the superclass
 * @param nextSub number of subexpressions that precede this one
 * @throws REException if the pattern cannot be compiled
 */
private RE (Object patternObj, int cflags, RESyntax syntax, int myIndex,
            int nextSub) throws REException
{
  super (myIndex);
  initialize (patternObj, cflags, syntax, myIndex, nextSub);
}
355:
356:
/**
 * Creates an RE with subexpression index 0 and no tokens.  Nothing is
 * compiled here; a subclass is expected to call initialize itself.
 */
protected RE ()
{
  super (0);
}
361:
362:
363: protected void initialize (Object patternObj, int cflags, RESyntax syntax,
364: int myIndex, int nextSub) throws REException
365: {
366: char[] pattern;
367: if (patternObj instanceof String)
368: {
369: pattern = ((String) patternObj).toCharArray ();
370: }
371: else if (patternObj instanceof char[])
372: {
373: pattern = (char[]) patternObj;
374: }
375: else if (patternObj instanceof StringBuffer)
376: {
377: pattern = new char[((StringBuffer) patternObj).length ()];
378: ((StringBuffer) patternObj).getChars (0, pattern.length, pattern, 0);
379: }
380: else if (patternObj instanceof StringBuilder)
381: {
382: pattern = new char[((StringBuilder) patternObj).length ()];
383: ((StringBuilder) patternObj).getChars (0, pattern.length, pattern, 0);
384: }
385: else if (patternObj instanceof CPStringBuilder)
386: {
387: pattern = new char[((CPStringBuilder) patternObj).length ()];
388: ((CPStringBuilder) patternObj).getChars (0, pattern.length, pattern,
389: 0);
390: }
391: else
392: {
393: pattern = patternObj.toString ().toCharArray ();
394: }
395:
396: int pLength = pattern.length;
397:
398: numSubs = 0;
399: ArrayList < REToken > branches = null;
400:
401:
402: firstToken = lastToken = null;
403:
404:
405:
406: boolean insens = ((cflags & REG_ICASE) > 0);
407: boolean insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
408:
409:
410:
411:
412:
413: int index = 0;
414:
415:
416: CharUnit unit = new CharUnit ();
417:
418:
419: IntPair minMax = new IntPair ();
420:
421:
422: REToken currentToken = null;
423: boolean quot = false;
424:
425:
426: RESyntax savedSyntax = null;
427: int savedCflags = 0;
428: boolean flagsSaved = false;
429:
430: while (index < pLength)
431: {
432:
433: index = getCharUnit (pattern, index, unit, quot);
434:
435: if (unit.bk)
436: if (unit.ch == 'Q')
437: {
438: quot = true;
439: continue;
440: }
441: else if (unit.ch == 'E')
442: {
443: quot = false;
444: continue;
445: }
446: if (quot)
447: unit.bk = false;
448:
449: if (((cflags & REG_X_COMMENTS) > 0) && (!unit.bk) && (!quot))
450: {
451: if (Character.isWhitespace (unit.ch))
452: {
453: continue;
454: }
455: if (unit.ch == '#')
456: {
457: for (int i = index; i < pLength; i++)
458: {
459: if (pattern[i] == '\n')
460: {
461: index = i + 1;
462: continue;
463: }
464: else if (pattern[i] == '\r')
465: {
466: if (i + 1 < pLength && pattern[i + 1] == '\n')
467: {
468: index = i + 2;
469: }
470: else
471: {
472: index = i + 1;
473: }
474: continue;
475: }
476: }
477: index = pLength;
478: continue;
479: }
480: }
481:
482:
483:
484:
485:
486:
487:
488: if (((unit.ch == '|'
489: && (syntax.get (RESyntax.RE_NO_BK_VBAR) ^ (unit.bk || quot)))
490: || (syntax.get (RESyntax.RE_NEWLINE_ALT) && (unit.ch == '\n')
491: && !(unit.bk || quot)))
492: && !syntax.get (RESyntax.RE_LIMITED_OPS))
493: {
494:
495: addToken (currentToken);
496: RE theBranch =
497: new RE (firstToken, lastToken, numSubs, subIndex, minimumLength,
498: maximumLength);
499: minimumLength = 0;
500: maximumLength = 0;
501: if (branches == null)
502: {
503: branches = new ArrayList < REToken > ();
504: }
505: branches.add (theBranch);
506: firstToken = lastToken = currentToken = null;
507: }
508:
509:
510:
511:
512:
513:
514:
515:
516:
517:
518:
519: else if ((unit.ch == '{') && syntax.get (RESyntax.RE_INTERVALS)
520: && (syntax.
521: get (RESyntax.RE_NO_BK_BRACES) ^ (unit.bk || quot)))
522: {
523: int newIndex = getMinMax (pattern, index, minMax, syntax);
524: if (newIndex > index)
525: {
526: if (minMax.first > minMax.second)
527: throw new
528: REException (getLocalizedMessage ("interval.order"),
529: REException.REG_BADRPT, newIndex);
530: if (currentToken == null)
531: throw new
532: REException (getLocalizedMessage ("repeat.no.token"),
533: REException.REG_BADRPT, newIndex);
534: if (currentToken instanceof RETokenRepeated)
535: throw new
536: REException (getLocalizedMessage ("repeat.chained"),
537: REException.REG_BADRPT, newIndex);
538: if (currentToken instanceof RETokenWordBoundary
539: || currentToken instanceof RETokenWordBoundary)
540: throw new
541: REException (getLocalizedMessage ("repeat.assertion"),
542: REException.REG_BADRPT, newIndex);
543: index = newIndex;
544: currentToken =
545: setRepeated (currentToken, minMax.first, minMax.second,
546: index);
547: }
548: else
549: {
550: addToken (currentToken);
551: currentToken = new RETokenChar (subIndex, unit.ch, insens);
552: if (insensUSASCII)
553: currentToken.unicodeAware = false;
554: }
555: }
556:
557:
558:
559:
560: else if ((unit.ch == '[') && !(unit.bk || quot))
561: {
562:
563: ParseCharClassResult result =
564: parseCharClass (subIndex, pattern, index, pLength, cflags,
565: syntax, 0);
566: addToken (currentToken);
567: currentToken = result.token;
568: index = result.index;
569: }
570:
571:
572:
573:
574: else if ((unit.ch == '(')
575: && (syntax.
576: get (RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot)))
577: {
578: boolean pure = false;
579: boolean comment = false;
580: boolean lookAhead = false;
581: boolean lookBehind = false;
582: boolean independent = false;
583: boolean negativelh = false;
584: boolean negativelb = false;
585: if ((index + 1 < pLength) && (pattern[index] == '?'))
586: {
587: switch (pattern[index + 1])
588: {
589: case '!':
590: if (syntax.get (RESyntax.RE_LOOKAHEAD))
591: {
592: pure = true;
593: negativelh = true;
594: lookAhead = true;
595: index += 2;
596: }
597: break;
598: case '=':
599: if (syntax.get (RESyntax.RE_LOOKAHEAD))
600: {
601: pure = true;
602: lookAhead = true;
603: index += 2;
604: }
605: break;
606: case '<':
607:
608:
609: if (syntax.get (RESyntax.RE_LOOKAHEAD))
610: {
611: index++;
612: switch (pattern[index + 1])
613: {
614: case '!':
615: pure = true;
616: negativelb = true;
617: lookBehind = true;
618: index += 2;
619: break;
620: case '=':
621: pure = true;
622: lookBehind = true;
623: index += 2;
624: }
625: }
626: break;
627: case '>':
628:
629:
630: if (syntax.get (RESyntax.RE_LOOKAHEAD))
631: {
632: pure = true;
633: independent = true;
634: index += 2;
635: }
636: break;
637: case 'i':
638: case 'd':
639: case 'm':
640: case 's':
641: case 'u':
642: case 'x':
643: case '-':
644: if (!syntax.get (RESyntax.RE_EMBEDDED_FLAGS))
645: break;
646:
647: int flagIndex = index + 1;
648: int endFlag = -1;
649: RESyntax newSyntax = new RESyntax (syntax);
650: int newCflags = cflags;
651: boolean negate = false;
652: while (flagIndex < pLength && endFlag < 0)
653: {
654: switch (pattern[flagIndex])
655: {
656: case 'i':
657: if (negate)
658: newCflags &= ~REG_ICASE;
659: else
660: newCflags |= REG_ICASE;
661: flagIndex++;
662: break;
663: case 'd':
664: if (negate)
665: newSyntax.setLineSeparator (RESyntax.
666: DEFAULT_LINE_SEPARATOR);
667: else
668: newSyntax.setLineSeparator ("\n");
669: flagIndex++;
670: break;
671: case 'm':
672: if (negate)
673: newCflags &= ~REG_MULTILINE;
674: else
675: newCflags |= REG_MULTILINE;
676: flagIndex++;
677: break;
678: case 's':
679: if (negate)
680: newCflags &= ~REG_DOT_NEWLINE;
681: else
682: newCflags |= REG_DOT_NEWLINE;
683: flagIndex++;
684: break;
685: case 'u':
686: if (negate)
687: newCflags |= REG_ICASE_USASCII;
688: else
689: newCflags &= ~REG_ICASE_USASCII;
690: flagIndex++;
691: break;
692: case 'x':
693: if (negate)
694: newCflags &= ~REG_X_COMMENTS;
695: else
696: newCflags |= REG_X_COMMENTS;
697: flagIndex++;
698: break;
699: case '-':
700: negate = true;
701: flagIndex++;
702: break;
703: case ':':
704: case ')':
705: endFlag = pattern[flagIndex];
706: break;
707: default:
708: throw new
709: REException (getLocalizedMessage
710: ("repeat.no.token"),
711: REException.REG_BADRPT, index);
712: }
713: }
714: if (endFlag == ')')
715: {
716: syntax = newSyntax;
717: cflags = newCflags;
718: insens = ((cflags & REG_ICASE) > 0);
719: insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
720:
721: comment = true;
722: index = flagIndex - 1;
723: break;
724: }
725: if (endFlag == ':')
726: {
727: savedSyntax = syntax;
728: savedCflags = cflags;
729: flagsSaved = true;
730: syntax = newSyntax;
731: cflags = newCflags;
732: insens = ((cflags & REG_ICASE) > 0);
733: insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
734: index = flagIndex - 1;
735:
736: }
737: else
738: {
739: throw new
740: REException (getLocalizedMessage
741: ("unmatched.paren"),
742: REException.REG_ESUBREG, index);
743: }
744: case ':':
745: if (syntax.get (RESyntax.RE_PURE_GROUPING))
746: {
747: pure = true;
748: index += 2;
749: }
750: break;
751: case '#':
752: if (syntax.get (RESyntax.RE_COMMENTS))
753: {
754: comment = true;
755: }
756: break;
757: default:
758: throw new
759: REException (getLocalizedMessage ("repeat.no.token"),
760: REException.REG_BADRPT, index);
761: }
762: }
763:
764: if (index >= pLength)
765: {
766: throw new
767: REException (getLocalizedMessage ("unmatched.paren"),
768: REException.REG_ESUBREG, index);
769: }
770:
771:
772: int endIndex = index;
773: int nextIndex = index;
774: int nested = 0;
775:
776: while (((nextIndex =
777: getCharUnit (pattern, endIndex, unit, false)) > 0)
778: && !(nested == 0 && (unit.ch == ')')
779: && (syntax.
780: get (RESyntax.RE_NO_BK_PARENS) ^ (unit.bk
781: || quot))))
782: {
783: if ((endIndex = nextIndex) >= pLength)
784: throw new
785: REException (getLocalizedMessage ("subexpr.no.end"),
786: REException.REG_ESUBREG, nextIndex);
787: else
788: if ((unit.ch == '[') && !(unit.bk || quot))
789: {
790:
791:
792: int listIndex = nextIndex;
793: if (listIndex < pLength && pattern[listIndex] == '^')
794: listIndex++;
795: if (listIndex < pLength && pattern[listIndex] == ']')
796: listIndex++;
797: int listEndIndex = -1;
798: int listNest = 0;
799: while (listIndex < pLength && listEndIndex < 0)
800: {
801: switch (pattern[listIndex++])
802: {
803: case '\\':
804: listIndex++;
805: break;
806: case '[':
807:
808:
809: listNest++;
810: if (listIndex < pLength
811: && pattern[listIndex] == '^')
812: listIndex++;
813: if (listIndex < pLength
814: && pattern[listIndex] == ']')
815: listIndex++;
816: break;
817: case ']':
818: if (listNest == 0)
819: listEndIndex = listIndex;
820: listNest--;
821: break;
822: }
823: }
824: if (listEndIndex >= 0)
825: {
826: nextIndex = listEndIndex;
827: if ((endIndex = nextIndex) >= pLength)
828: throw new
829: REException (getLocalizedMessage ("subexpr.no.end"),
830: REException.REG_ESUBREG, nextIndex);
831: else
832: continue;
833: }
834: throw new
835: REException (getLocalizedMessage ("subexpr.no.end"),
836: REException.REG_ESUBREG, nextIndex);
837: }
838: else if (unit.ch == '('
839: && (syntax.
840: get (RESyntax.RE_NO_BK_PARENS) ^ (unit.bk
841: || quot)))
842: nested++;
843: else if (unit.ch == ')'
844: && (syntax.
845: get (RESyntax.RE_NO_BK_PARENS) ^ (unit.bk
846: || quot)))
847: nested--;
848: }
849:
850:
851:
852:
853: if (comment)
854: index = nextIndex;
855: else
856: {
857:
858: addToken (currentToken);
859: if (!pure)
860: {
861: numSubs++;
862: }
863:
864: int useIndex = (pure || lookAhead || lookBehind
865: || independent) ? 0 : nextSub + numSubs;
866: currentToken =
867: new RE (String.valueOf (pattern, index, endIndex - index).
868: toCharArray (), cflags, syntax, useIndex,
869: nextSub + numSubs);
870: numSubs += ((RE) currentToken).getNumSubs ();
871:
872: if (lookAhead)
873: {
874: currentToken =
875: new RETokenLookAhead (currentToken, negativelh);
876: }
877: else if (lookBehind)
878: {
879: currentToken =
880: new RETokenLookBehind (currentToken, negativelb);
881: }
882: else if (independent)
883: {
884: currentToken = new RETokenIndependent (currentToken);
885: }
886:
887: index = nextIndex;
888: if (flagsSaved)
889: {
890: syntax = savedSyntax;
891: cflags = savedCflags;
892: insens = ((cflags & REG_ICASE) > 0);
893: insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
894: flagsSaved = false;
895: }
896: }
897: }
898:
899:
900:
901:
902: else if (!syntax.get (RESyntax.RE_UNMATCHED_RIGHT_PAREN_ORD)
903: && ((unit.ch == ')')
904: && (syntax.
905: get (RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot))))
906: {
907: throw new REException (getLocalizedMessage ("unmatched.paren"),
908: REException.REG_EPAREN, index);
909: }
910:
911:
912:
913:
914: else if ((unit.ch == '^') && !(unit.bk || quot))
915: {
916: addToken (currentToken);
917: currentToken = null;
918: RETokenStart token = null;
919: if ((cflags & REG_MULTILINE) > 0)
920: {
921: String sep = syntax.getLineSeparator ();
922: if (sep == null)
923: {
924: token = new RETokenStart (subIndex, null, true);
925: }
926: else
927: {
928: token = new RETokenStart (subIndex, sep);
929: }
930: }
931: else
932: {
933: token = new RETokenStart (subIndex, null);
934: }
935: addToken (token);
936: }
937:
938:
939:
940:
941: else if ((unit.ch == '$') && !(unit.bk || quot))
942: {
943: addToken (currentToken);
944: currentToken = null;
945: RETokenEnd token = null;
946: if ((cflags & REG_MULTILINE) > 0)
947: {
948: String sep = syntax.getLineSeparator ();
949: if (sep == null)
950: {
951: token = new RETokenEnd (subIndex, null, true);
952: }
953: else
954: {
955: token = new RETokenEnd (subIndex, sep);
956: }
957: }
958: else
959: {
960: token = new RETokenEnd (subIndex, null);
961: }
962: addToken (token);
963: }
964:
965:
966:
967:
968: else if ((unit.ch == '.') && !(unit.bk || quot))
969: {
970: addToken (currentToken);
971: currentToken =
972: new RETokenAny (subIndex, syntax.get (RESyntax.RE_DOT_NEWLINE)
973: || ((cflags & REG_DOT_NEWLINE) > 0),
974: syntax.get (RESyntax.RE_DOT_NOT_NULL));
975: }
976:
977:
978:
979:
980:
981:
982:
983: else if ((unit.ch == '*') && !(unit.bk || quot))
984: {
985: if (currentToken == null)
986: throw new REException (getLocalizedMessage ("repeat.no.token"),
987: REException.REG_BADRPT, index);
988: if (currentToken instanceof RETokenRepeated)
989: throw new REException (getLocalizedMessage ("repeat.chained"),
990: REException.REG_BADRPT, index);
991: if (currentToken instanceof RETokenWordBoundary
992: || currentToken instanceof RETokenWordBoundary)
993: throw new REException (getLocalizedMessage ("repeat.assertion"),
994: REException.REG_BADRPT, index);
995: currentToken =
996: setRepeated (currentToken, 0, Integer.MAX_VALUE, index);
997: }
998:
999:
1000:
1001:
1002:
1003:
1004:
1005:
1006: else if ((unit.ch == '+') && !syntax.get (RESyntax.RE_LIMITED_OPS)
1007: && (!syntax.
1008: get (RESyntax.RE_BK_PLUS_QM) ^ (unit.bk || quot)))
1009: {
1010: if (currentToken == null)
1011: throw new REException (getLocalizedMessage ("repeat.no.token"),
1012: REException.REG_BADRPT, index);
1013:
1014:
1015: if (currentToken instanceof RETokenRepeated)
1016: {
1017: RETokenRepeated tokenRep = (RETokenRepeated) currentToken;
1018: if (syntax.get (RESyntax.RE_POSSESSIVE_OPS)
1019: && !tokenRep.isPossessive () && !tokenRep.isStingy ())
1020: tokenRep.makePossessive ();
1021: else
1022: throw new
1023: REException (getLocalizedMessage ("repeat.chained"),
1024: REException.REG_BADRPT, index);
1025:
1026: }
1027: else if (currentToken instanceof RETokenWordBoundary
1028: || currentToken instanceof RETokenWordBoundary)
1029: throw new REException (getLocalizedMessage ("repeat.assertion"),
1030: REException.REG_BADRPT, index);
1031: else
1032: currentToken =
1033: setRepeated (currentToken, 1, Integer.MAX_VALUE, index);
1034: }
1035:
1036:
1037:
1038:
1039:
1040:
1041: else if ((unit.ch == '?') && !syntax.get (RESyntax.RE_LIMITED_OPS)
1042: && (!syntax.
1043: get (RESyntax.RE_BK_PLUS_QM) ^ (unit.bk || quot)))
1044: {
1045: if (currentToken == null)
1046: throw new REException (getLocalizedMessage ("repeat.no.token"),
1047: REException.REG_BADRPT, index);
1048:
1049:
1050: if (currentToken instanceof RETokenRepeated)
1051: {
1052: RETokenRepeated tokenRep = (RETokenRepeated) currentToken;
1053: if (syntax.get (RESyntax.RE_STINGY_OPS)
1054: && !tokenRep.isStingy () && !tokenRep.isPossessive ())
1055: tokenRep.makeStingy ();
1056: else
1057: throw new
1058: REException (getLocalizedMessage ("repeat.chained"),
1059: REException.REG_BADRPT, index);
1060: }
1061: else if (currentToken instanceof RETokenWordBoundary
1062: || currentToken instanceof RETokenWordBoundary)
1063: throw new REException (getLocalizedMessage ("repeat.assertion"),
1064: REException.REG_BADRPT, index);
1065: else
1066: currentToken = setRepeated (currentToken, 0, 1, index);
1067: }
1068:
1069:
1070:
1071:
1072: else if (unit.bk && (unit.ch == '0')
1073: && syntax.get (RESyntax.RE_OCTAL_CHAR))
1074: {
1075: CharExpression ce =
1076: getCharExpression (pattern, index - 2, pLength, syntax);
1077: if (ce == null)
1078: throw new REException ("invalid octal character",
1079: REException.REG_ESCAPE, index);
1080: index = index - 2 + ce.len;
1081: addToken (currentToken);
1082: currentToken = new RETokenChar (subIndex, ce.ch, insens);
1083: if (insensUSASCII)
1084: currentToken.unicodeAware = false;
1085: }
1086:
1087:
1088:
1089:
1090:
1091:
1092:
1093:
1094:
1095:
1096:
1097:
1098:
1099: else if (unit.bk && Character.isDigit (unit.ch)
1100: && !syntax.get (RESyntax.RE_NO_BK_REFS))
1101: {
1102: addToken (currentToken);
1103: int numBegin = index - 1;
1104: int numEnd = pLength;
1105: for (int i = index; i < pLength; i++)
1106: {
1107: if (!Character.isDigit (pattern[i]))
1108: {
1109: numEnd = i;
1110: break;
1111: }
1112: }
1113: int num = parseInt (pattern, numBegin, numEnd - numBegin, 10);
1114:
1115: currentToken = new RETokenBackRef (subIndex, num, insens);
1116: if (insensUSASCII)
1117: currentToken.unicodeAware = false;
1118: index = numEnd;
1119: }
1120:
1121:
1122:
1123:
1124: else if (unit.bk && (unit.ch == 'A')
1125: && syntax.get (RESyntax.RE_STRING_ANCHORS))
1126: {
1127: addToken (currentToken);
1128: currentToken = new RETokenStart (subIndex, null);
1129: }
1130:
1131:
1132:
1133:
1134: else if (unit.bk && (unit.ch == 'b')
1135: && syntax.get (RESyntax.RE_STRING_ANCHORS))
1136: {
1137: addToken (currentToken);
1138: currentToken =
1139: new RETokenWordBoundary (subIndex,
1140: RETokenWordBoundary.
1141: BEGIN | RETokenWordBoundary.END,
1142: false);
1143: }
1144:
1145:
1146:
1147: else if (unit.bk && (unit.ch == '<'))
1148: {
1149: addToken (currentToken);
1150: currentToken =
1151: new RETokenWordBoundary (subIndex, RETokenWordBoundary.BEGIN,
1152: false);
1153: }
1154:
1155:
1156:
1157: else if (unit.bk && (unit.ch == '>'))
1158: {
1159: addToken (currentToken);
1160: currentToken =
1161: new RETokenWordBoundary (subIndex, RETokenWordBoundary.END,
1162: false);
1163: }
1164:
1165:
1166:
1167:
1168: else if (unit.bk && (unit.ch == 'B')
1169: && syntax.get (RESyntax.RE_STRING_ANCHORS))
1170: {
1171: addToken (currentToken);
1172: currentToken =
1173: new RETokenWordBoundary (subIndex,
1174: RETokenWordBoundary.
1175: BEGIN | RETokenWordBoundary.END, true);
1176: }
1177:
1178:
1179:
1180:
1181:
1182: else if (unit.bk && (unit.ch == 'd')
1183: && syntax.get (RESyntax.RE_CHAR_CLASS_ESCAPES))
1184: {
1185: addToken (currentToken);
1186: currentToken =
1187: new RETokenPOSIX (subIndex, RETokenPOSIX.DIGIT, insens, false);
1188: if (insensUSASCII)
1189: currentToken.unicodeAware = false;
1190: }
1191:
1192:
1193:
1194:
1195: else if (unit.bk && (unit.ch == 'D')
1196: && syntax.get (RESyntax.RE_CHAR_CLASS_ESCAPES))
1197: {
1198: addToken (currentToken);
1199: currentToken =
1200: new RETokenPOSIX (subIndex, RETokenPOSIX.DIGIT, insens, true);
1201: if (insensUSASCII)
1202: currentToken.unicodeAware = false;
1203: }
1204:
1205:
1206:
1207:
1208: else if (unit.bk && (unit.ch == 'n'))
1209: {
1210: addToken (currentToken);
1211: currentToken = new RETokenChar (subIndex, '\n', false);
1212: }
1213:
1214:
1215:
1216:
1217: else if (unit.bk && (unit.ch == 'r'))
1218: {
1219: addToken (currentToken);
1220: currentToken = new RETokenChar (subIndex, '\r', false);
1221: }
1222:
1223:
1224:
1225:
1226: else if (unit.bk && (unit.ch == 's')
1227: && syntax.get (RESyntax.RE_CHAR_CLASS_ESCAPES))
1228: {
1229: addToken (currentToken);
1230: currentToken =
1231: new RETokenPOSIX (subIndex, RETokenPOSIX.SPACE, insens, false);
1232: if (insensUSASCII)
1233: currentToken.unicodeAware = false;
1234: }
1235:
1236:
1237:
1238:
1239: else if (unit.bk && (unit.ch == 'S')
1240: && syntax.get (RESyntax.RE_CHAR_CLASS_ESCAPES))
1241: {
1242: addToken (currentToken);
1243: currentToken =
1244: new RETokenPOSIX (subIndex, RETokenPOSIX.SPACE, insens, true);
1245: if (insensUSASCII)
1246: currentToken.unicodeAware = false;
1247: }
1248:
1249:
1250:
1251:
1252: else if (unit.bk && (unit.ch == 't'))
1253: {
1254: addToken (currentToken);
1255: currentToken = new RETokenChar (subIndex, '\t', false);
1256: }
1257:
1258:
1259:
1260:
1261: else if (unit.bk && (unit.ch == 'w')
1262: && syntax.get (RESyntax.RE_CHAR_CLASS_ESCAPES))
1263: {
1264: addToken (currentToken);
1265: currentToken =
1266: new RETokenPOSIX (subIndex, RETokenPOSIX.ALNUM, insens, false);
1267: if (insensUSASCII)
1268: currentToken.unicodeAware = false;
1269: }
1270:
1271:
1272:
1273:
1274: else if (unit.bk && (unit.ch == 'W')
1275: && syntax.get (RESyntax.RE_CHAR_CLASS_ESCAPES))
1276: {
1277: addToken (currentToken);
1278: currentToken =
1279: new RETokenPOSIX (subIndex, RETokenPOSIX.ALNUM, insens, true);
1280: if (insensUSASCII)
1281: currentToken.unicodeAware = false;
1282: }
1283:
1284:
1285:
1286:
1287:
1288:
1289:
1290:
1291:
1292: else if (unit.bk && (unit.ch == 'Z' || unit.ch == 'z') &&
1293: syntax.get (RESyntax.RE_STRING_ANCHORS))
1294: {
1295: addToken (currentToken);
1296: currentToken = new RETokenEnd (subIndex, null);
1297: }
1298:
1299:
1300:
1301:
1302: else
1303: if ((unit.bk && (unit.ch == 'x')
1304: && syntax.get (RESyntax.RE_HEX_CHAR)) || (unit.bk
1305: && (unit.ch == 'u')
1306: && syntax.
1307: get (RESyntax.
1308: RE_UNICODE_CHAR)))
1309: {
1310: CharExpression ce =
1311: getCharExpression (pattern, index - 2, pLength, syntax);
1312: if (ce == null)
1313: throw new REException ("invalid hex character",
1314: REException.REG_ESCAPE, index);
1315: index = index - 2 + ce.len;
1316: addToken (currentToken);
1317: currentToken = new RETokenChar (subIndex, ce.ch, insens);
1318: if (insensUSASCII)
1319: currentToken.unicodeAware = false;
1320: }
1321:
1322:
1323:
1324:
1325: else
1326: if ((unit.bk && (unit.ch == 'p')
1327: && syntax.get (RESyntax.RE_NAMED_PROPERTY)) || (unit.bk
1328: && (unit.ch ==
1329: 'P')
1330: && syntax.
1331: get (RESyntax.
1332: RE_NAMED_PROPERTY)))
1333: {
1334: NamedProperty np = getNamedProperty (pattern, index - 2, pLength);
1335: if (np == null)
1336: throw new REException ("invalid escape sequence",
1337: REException.REG_ESCAPE, index);
1338: index = index - 2 + np.len;
1339: addToken (currentToken);
1340: currentToken =
1341: getRETokenNamedProperty (subIndex, np, insens, index);
1342: if (insensUSASCII)
1343: currentToken.unicodeAware = false;
1344: }
1345:
1346:
1347:
1348:
1349: else if (unit.bk && (unit.ch == 'G') &&
1350: syntax.get (RESyntax.RE_STRING_ANCHORS))
1351: {
1352: addToken (currentToken);
1353: currentToken = new RETokenEndOfPreviousMatch (subIndex);
1354: }
1355:
1356:
1357:
1358:
1359: else
1360: {
1361: addToken (currentToken);
1362: currentToken = new RETokenChar (subIndex, unit.ch, insens);
1363: if (insensUSASCII)
1364: currentToken.unicodeAware = false;
1365: }
1366: }
1367:
1368:
1369: addToken (currentToken);
1370:
1371: if (branches != null)
1372: {
1373: branches.
1374: add (new
1375: RE (firstToken, lastToken, numSubs, subIndex, minimumLength,
1376: maximumLength));
1377: branches.trimToSize ();
1378: minimumLength = 0;
1379: maximumLength = 0;
1380: firstToken = lastToken = null;
1381: addToken (new RETokenOneOf (subIndex, branches, false));
1382: }
1383: else
1384: addToken (new RETokenEndSub (subIndex));
1385:
1386: }
1387:
1388: private static class ParseCharClassResult
1389: {
1390: RETokenOneOf token;
1391: int index;
1392: boolean returnAtAndOperator = false;
1393: }
1394:
1395:
1405: private static ParseCharClassResult parseCharClass (int subIndex,
1406: char[]pattern,
1407: int index, int pLength,
1408: int cflags,
1409: RESyntax syntax,
1410: int pflags) throws
1411: REException
1412: {
1413:
1414: boolean insens = ((cflags & REG_ICASE) > 0);
1415: boolean insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
1416: final ArrayList < REToken > options = new ArrayList < REToken > ();
1417: ArrayList < Object > addition = new ArrayList < Object > ();
1418: boolean additionAndAppeared = false;
1419: final int RETURN_AT_AND = 0x01;
1420: boolean returnAtAndOperator = ((pflags & RETURN_AT_AND) != 0);
1421: boolean negative = false;
1422: char ch;
1423:
1424: char lastChar = 0;
1425: boolean lastCharIsSet = false;
1426: if (index == pLength)
1427: throw new REException (getLocalizedMessage ("unmatched.bracket"),
1428: REException.REG_EBRACK, index);
1429:
1430:
1431: if ((ch = pattern[index]) == '^')
1432: {
1433: negative = true;
1434: if (++index == pLength)
1435: throw new REException (getLocalizedMessage ("class.no.end"),
1436: REException.REG_EBRACK, index);
1437: ch = pattern[index];
1438: }
1439:
1440:
1441: if (ch == ']')
1442: {
1443: lastChar = ch;
1444: lastCharIsSet = true;
1445: if (++index == pLength)
1446: throw new REException (getLocalizedMessage ("class.no.end"),
1447: REException.REG_EBRACK, index);
1448: }
1449:
1450: while ((ch = pattern[index++]) != ']')
1451: {
1452: if ((ch == '-') && (lastCharIsSet))
1453: {
1454: if (index == pLength)
1455: throw new REException (getLocalizedMessage ("class.no.end"),
1456: REException.REG_EBRACK, index);
1457: if ((ch = pattern[index]) == ']')
1458: {
1459: RETokenChar t = new RETokenChar (subIndex, lastChar, insens);
1460: if (insensUSASCII)
1461: t.unicodeAware = false;
1462: options.add (t);
1463: lastChar = '-';
1464: }
1465: else
1466: {
1467: if ((ch == '\\')
1468: && syntax.get (RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS))
1469: {
1470: CharExpression ce =
1471: getCharExpression (pattern, index, pLength, syntax);
1472: if (ce == null)
1473: throw new REException ("invalid escape sequence",
1474: REException.REG_ESCAPE, index);
1475: ch = ce.ch;
1476: index = index + ce.len - 1;
1477: }
1478: RETokenRange t =
1479: new RETokenRange (subIndex, lastChar, ch, insens);
1480: if (insensUSASCII)
1481: t.unicodeAware = false;
1482: options.add (t);
1483: lastChar = 0;
1484: lastCharIsSet = false;
1485: index++;
1486: }
1487: }
1488: else if ((ch == '\\')
1489: && syntax.get (RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS))
1490: {
1491: if (index == pLength)
1492: throw new REException (getLocalizedMessage ("class.no.end"),
1493: REException.REG_EBRACK, index);
1494: int posixID = -1;
1495: boolean negate = false;
1496: char asciiEsc = 0;
1497: boolean asciiEscIsSet = false;
1498: NamedProperty np = null;
1499: if (("dswDSW".indexOf (pattern[index]) != -1)
1500: && syntax.get (RESyntax.RE_CHAR_CLASS_ESC_IN_LISTS))
1501: {
1502: switch (pattern[index])
1503: {
1504: case 'D':
1505: negate = true;
1506: case 'd':
1507: posixID = RETokenPOSIX.DIGIT;
1508: break;
1509: case 'S':
1510: negate = true;
1511: case 's':
1512: posixID = RETokenPOSIX.SPACE;
1513: break;
1514: case 'W':
1515: negate = true;
1516: case 'w':
1517: posixID = RETokenPOSIX.ALNUM;
1518: break;
1519: }
1520: }
1521: if (("pP".indexOf (pattern[index]) != -1)
1522: && syntax.get (RESyntax.RE_NAMED_PROPERTY))
1523: {
1524: np = getNamedProperty (pattern, index - 1, pLength);
1525: if (np == null)
1526: throw new REException ("invalid escape sequence",
1527: REException.REG_ESCAPE, index);
1528: index = index - 1 + np.len - 1;
1529: }
1530: else
1531: {
1532: CharExpression ce =
1533: getCharExpression (pattern, index - 1, pLength, syntax);
1534: if (ce == null)
1535: throw new REException ("invalid escape sequence",
1536: REException.REG_ESCAPE, index);
1537: asciiEsc = ce.ch;
1538: asciiEscIsSet = true;
1539: index = index - 1 + ce.len - 1;
1540: }
1541: if (lastCharIsSet)
1542: {
1543: RETokenChar t = new RETokenChar (subIndex, lastChar, insens);
1544: if (insensUSASCII)
1545: t.unicodeAware = false;
1546: options.add (t);
1547: }
1548:
1549: if (posixID != -1)
1550: {
1551: RETokenPOSIX t =
1552: new RETokenPOSIX (subIndex, posixID, insens, negate);
1553: if (insensUSASCII)
1554: t.unicodeAware = false;
1555: options.add (t);
1556: }
1557: else if (np != null)
1558: {
1559: RETokenNamedProperty t =
1560: getRETokenNamedProperty (subIndex, np, insens, index);
1561: if (insensUSASCII)
1562: t.unicodeAware = false;
1563: options.add (t);
1564: }
1565: else if (asciiEscIsSet)
1566: {
1567: lastChar = asciiEsc;
1568: lastCharIsSet = true;
1569: }
1570: else
1571: {
1572: lastChar = pattern[index];
1573: lastCharIsSet = true;
1574: }
1575: ++index;
1576: }
1577: else if ((ch == '[') && (syntax.get (RESyntax.RE_CHAR_CLASSES))
1578: && (index < pLength) && (pattern[index] == ':'))
1579: {
1580: CPStringBuilder posixSet = new CPStringBuilder ();
1581: index = getPosixSet (pattern, index + 1, posixSet);
1582: int posixId = RETokenPOSIX.intValue (posixSet.toString ());
1583: if (posixId != -1)
1584: {
1585: RETokenPOSIX t =
1586: new RETokenPOSIX (subIndex, posixId, insens, false);
1587: if (insensUSASCII)
1588: t.unicodeAware = false;
1589: options.add (t);
1590: }
1591: }
1592: else if ((ch == '[') && (syntax.get (RESyntax.RE_NESTED_CHARCLASS)))
1593: {
1594: ParseCharClassResult result =
1595: parseCharClass (subIndex, pattern, index, pLength, cflags,
1596: syntax, 0);
1597: addition.add (result.token);
1598: addition.add ("|");
1599: index = result.index;
1600: }
1601: else if ((ch == '&') &&
1602: (syntax.get (RESyntax.RE_NESTED_CHARCLASS)) &&
1603: (index < pLength) && (pattern[index] == '&'))
1604: {
1605: if (returnAtAndOperator)
1606: {
1607: ParseCharClassResult result = new ParseCharClassResult ();
1608: options.trimToSize ();
1609: if (additionAndAppeared)
1610: addition.add ("&");
1611: if (addition.size () == 0)
1612: addition = null;
1613: result.token = new RETokenOneOf (subIndex,
1614: options, addition, negative);
1615: result.index = index - 1;
1616: result.returnAtAndOperator = true;
1617: return result;
1618: }
1619:
1620:
1621:
1622:
1623:
1624:
1625:
1626: if (additionAndAppeared)
1627: addition.add ("&");
1628: addition.add (Boolean.FALSE);
1629: additionAndAppeared = true;
1630:
1631:
1632:
1633:
1634:
1635:
1636:
1637:
1638: if ((index + 1 < pLength) && (pattern[index + 1] != '['))
1639: {
1640: ParseCharClassResult result =
1641: parseCharClass (subIndex, pattern, index + 1, pLength,
1642: cflags, syntax,
1643: RETURN_AT_AND);
1644: addition.add (result.token);
1645: addition.add ("|");
1646:
1647:
1648:
1649:
1650: index = (result.returnAtAndOperator ?
1651: result.index : result.index - 1);
1652: }
1653: }
1654: else
1655: {
1656: if (lastCharIsSet)
1657: {
1658: RETokenChar t = new RETokenChar (subIndex, lastChar, insens);
1659: if (insensUSASCII)
1660: t.unicodeAware = false;
1661: options.add (t);
1662: }
1663: lastChar = ch;
1664: lastCharIsSet = true;
1665: }
1666: if (index == pLength)
1667: throw new REException (getLocalizedMessage ("class.no.end"),
1668: REException.REG_EBRACK, index);
1669: }
1670:
1671:
1672: if (lastCharIsSet)
1673: {
1674: RETokenChar t = new RETokenChar (subIndex, lastChar, insens);
1675: if (insensUSASCII)
1676: t.unicodeAware = false;
1677: options.add (t);
1678: }
1679:
1680: ParseCharClassResult result = new ParseCharClassResult ();
1681:
1682: options.trimToSize ();
1683: if (additionAndAppeared)
1684: addition.add ("&");
1685: if (addition.size () == 0)
1686: addition = null;
1687: result.token = new RETokenOneOf (subIndex, options, addition, negative);
1688: result.index = index;
1689: return result;
1690: }
1691:
1692: private static int getCharUnit (char[]input, int index, CharUnit unit,
1693: boolean quot) throws REException
1694: {
1695: unit.ch = input[index++];
1696: unit.bk = (unit.ch == '\\'
1697: && (!quot || index >= input.length || input[index] == 'E'));
1698: if (unit.bk)
1699: if (index < input.length)
1700: unit.ch = input[index++];
1701: else
1702: throw new REException (getLocalizedMessage ("ends.with.backslash"),
1703: REException.REG_ESCAPE, index);
1704: return index;
1705: }
1706:
1707: private static int parseInt (char[]input, int pos, int len, int radix)
1708: {
1709: int ret = 0;
1710: for (int i = pos; i < pos + len; i++)
1711: {
1712: ret = ret * radix + Character.digit (input[i], radix);
1713: }
1714: return ret;
1715: }
1716:
1717:
1724: private static class CharExpression
1725: {
1726:
1727: char ch;
1728:
1729: String expr;
1730:
1731: int len;
1732: public String toString ()
1733: {
1734: return expr;
1735: }
1736: }
1737:
1738: private static CharExpression getCharExpression (char[]input, int pos,
1739: int lim, RESyntax syntax)
1740: {
1741: CharExpression ce = new CharExpression ();
1742: char c = input[pos];
1743: if (c == '\\')
1744: {
1745: if (pos + 1 >= lim)
1746: return null;
1747: c = input[pos + 1];
1748: switch (c)
1749: {
1750: case 't':
1751: ce.ch = '\t';
1752: ce.len = 2;
1753: break;
1754: case 'n':
1755: ce.ch = '\n';
1756: ce.len = 2;
1757: break;
1758: case 'r':
1759: ce.ch = '\r';
1760: ce.len = 2;
1761: break;
1762: case 'x':
1763: case 'u':
1764: if ((c == 'x' && syntax.get (RESyntax.RE_HEX_CHAR)) ||
1765: (c == 'u' && syntax.get (RESyntax.RE_UNICODE_CHAR)))
1766: {
1767: int l = 0;
1768: int expectedLength = (c == 'x' ? 2 : 4);
1769: for (int i = pos + 2; i < pos + 2 + expectedLength; i++)
1770: {
1771: if (i >= lim)
1772: break;
1773: if (!((input[i] >= '0' && input[i] <= '9') ||
1774: (input[i] >= 'A' && input[i] <= 'F') ||
1775: (input[i] >= 'a' && input[i] <= 'f')))
1776: break;
1777: l++;
1778: }
1779: if (l != expectedLength)
1780: return null;
1781: ce.ch = (char) (parseInt (input, pos + 2, l, 16));
1782: ce.len = l + 2;
1783: }
1784: else
1785: {
1786: ce.ch = c;
1787: ce.len = 2;
1788: }
1789: break;
1790: case '0':
1791: if (syntax.get (RESyntax.RE_OCTAL_CHAR))
1792: {
1793: int l = 0;
1794: for (int i = pos + 2; i < pos + 2 + 3; i++)
1795: {
1796: if (i >= lim)
1797: break;
1798: if (input[i] < '0' || input[i] > '7')
1799: break;
1800: l++;
1801: }
1802: if (l == 3 && input[pos + 2] > '3')
1803: l--;
1804: if (l <= 0)
1805: return null;
1806: ce.ch = (char) (parseInt (input, pos + 2, l, 8));
1807: ce.len = l + 2;
1808: }
1809: else
1810: {
1811: ce.ch = c;
1812: ce.len = 2;
1813: }
1814: break;
1815: default:
1816: ce.ch = c;
1817: ce.len = 2;
1818: break;
1819: }
1820: }
1821: else
1822: {
1823: ce.ch = input[pos];
1824: ce.len = 1;
1825: }
1826: ce.expr = new String (input, pos, ce.len);
1827: return ce;
1828: }
1829:
1830:
1838: private static class NamedProperty
1839: {
1840:
1841: String name;
1842:
1843: boolean negate;
1844:
1845: int len;
1846: }
1847:
1848: private static NamedProperty getNamedProperty (char[]input, int pos,
1849: int lim)
1850: {
1851: NamedProperty np = new NamedProperty ();
1852: char c = input[pos];
1853: if (c == '\\')
1854: {
1855: if (++pos >= lim)
1856: return null;
1857: c = input[pos++];
1858: switch (c)
1859: {
1860: case 'p':
1861: np.negate = false;
1862: break;
1863: case 'P':
1864: np.negate = true;
1865: break;
1866: default:
1867: return null;
1868: }
1869: c = input[pos++];
1870: if (c == '{')
1871: {
1872: int p = -1;
1873: for (int i = pos; i < lim; i++)
1874: {
1875: if (input[i] == '}')
1876: {
1877: p = i;
1878: break;
1879: }
1880: }
1881: if (p < 0)
1882: return null;
1883: int len = p - pos;
1884: np.name = new String (input, pos, len);
1885: np.len = len + 4;
1886: }
1887: else
1888: {
1889: np.name = new String (input, pos - 1, 1);
1890: np.len = 3;
1891: }
1892: return np;
1893: }
1894: else
1895: return null;
1896: }
1897:
1898: private static RETokenNamedProperty getRETokenNamedProperty (int subIndex,
1899: NamedProperty
1900: np,
1901: boolean insens,
1902: int index)
1903: throws REException
1904: {
1905: try
1906: {
1907: return new RETokenNamedProperty (subIndex, np.name, insens, np.negate);
1908: }
1909: catch (REException e)
1910: {
1911: REException ree;
1912: ree = new REException (e.getMessage (), REException.REG_ESCAPE, index);
1913: ree.initCause (e);
1914: throw ree;
1915: }
1916: }
1917:
1918:
1923: public boolean isMatch (Object input)
1924: {
1925: return isMatch (input, 0, 0);
1926: }
1927:
1928:
1935: public boolean isMatch (Object input, int index)
1936: {
1937: return isMatch (input, index, 0);
1938: }
1939:
1940:
1941:
1949: public boolean isMatch (Object input, int index, int eflags)
1950: {
1951: return isMatchImpl (makeCharIndexed (input, index), index, eflags);
1952: }
1953:
1954: private boolean isMatchImpl (CharIndexed input, int index, int eflags)
1955: {
1956: if (firstToken == null)
1957: return (input.charAt (0) == CharIndexed.OUT_OF_BOUNDS);
1958: REMatch m = new REMatch (numSubs, index, eflags);
1959: if (firstToken.match (input, m))
1960: {
1961: if (m != null)
1962: {
1963: if (input.charAt (m.index) == CharIndexed.OUT_OF_BOUNDS)
1964: {
1965: return true;
1966: }
1967: }
1968: }
1969: return false;
1970: }
1971:
1972:
1977: public int getNumSubs ()
1978: {
1979: return numSubs;
1980: }
1981:
1982:
1983: void setUncle (REToken uncle)
1984: {
1985: if (lastToken != null)
1986: {
1987: lastToken.setUncle (uncle);
1988: }
1989: else
1990: super.setUncle (uncle);
1991: }
1992:
1993:
1994:
1995: boolean chain (REToken next)
1996: {
1997: super.chain (next);
1998: setUncle (next);
1999: return true;
2000: }
2001:
2002:
2006: public int getMinimumLength ()
2007: {
2008: return minimumLength;
2009: }
2010:
2011: public int getMaximumLength ()
2012: {
2013: return maximumLength;
2014: }
2015:
2016:
2025: public REMatch[] getAllMatches (Object input)
2026: {
2027: return getAllMatches (input, 0, 0);
2028: }
2029:
2030:
2041: public REMatch[] getAllMatches (Object input, int index)
2042: {
2043: return getAllMatches (input, index, 0);
2044: }
2045:
2046:
2059: public REMatch[] getAllMatches (Object input, int index, int eflags)
2060: {
2061: return getAllMatchesImpl (makeCharIndexed (input, index), index, eflags);
2062: }
2063:
2064:
2065: private REMatch[] getAllMatchesImpl (CharIndexed input, int index,
2066: int eflags)
2067: {
2068: List < REMatch > all = new ArrayList < REMatch > ();
2069: REMatch m = null;
2070: while ((m = getMatchImpl (input, index, eflags, null)) != null)
2071: {
2072: all.add (m);
2073: index = m.getEndIndex ();
2074: if (m.end[0] == 0)
2075: {
2076: index++;
2077: input.move (1);
2078: }
2079: else
2080: {
2081: input.move (m.end[0]);
2082: }
2083: if (!input.isValid ())
2084: break;
2085: }
2086: return all.toArray (new REMatch[all.size ()]);
2087: }
2088:
2089:
2090: boolean match (CharIndexed input, REMatch mymatch)
2091: {
2092: input.setHitEnd (mymatch);
2093: if (firstToken == null)
2094: {
2095: return next (input, mymatch);
2096: }
2097:
2098:
2099: mymatch.start1[subIndex] = mymatch.index;
2100:
2101: return firstToken.match (input, mymatch);
2102: }
2103:
2104: REMatch findMatch (CharIndexed input, REMatch mymatch)
2105: {
2106: if (mymatch.backtrackStack == null)
2107: mymatch.backtrackStack = new BacktrackStack ();
2108: boolean b = match (input, mymatch);
2109: if (b)
2110: {
2111: return mymatch;
2112: }
2113: return null;
2114: }
2115:
2116:
2123: public REMatch getMatch (Object input)
2124: {
2125: return getMatch (input, 0, 0);
2126: }
2127:
2128:
2137: public REMatch getMatch (Object input, int index)
2138: {
2139: return getMatch (input, index, 0);
2140: }
2141:
2142:
2152: public REMatch getMatch (Object input, int index, int eflags)
2153: {
2154: return getMatch (input, index, eflags, null);
2155: }
2156:
2157:
2170: public REMatch getMatch (Object input, int index, int eflags,
2171: CPStringBuilder buffer)
2172: {
2173: return getMatchImpl (makeCharIndexed (input, index), index, eflags,
2174: buffer);
2175: }
2176:
2177: REMatch getMatchImpl (CharIndexed input, int anchor, int eflags,
2178: CPStringBuilder buffer)
2179: {
2180: boolean tryEntireMatch = ((eflags & REG_TRY_ENTIRE_MATCH) != 0);
2181: boolean doMove = ((eflags & REG_FIX_STARTING_POSITION) == 0);
2182: RE re = (tryEntireMatch ? (RE) this.clone () : this);
2183: if (tryEntireMatch)
2184: {
2185: RETokenEnd reEnd = new RETokenEnd (0, null);
2186: reEnd.setFake (true);
2187: re.chain (reEnd);
2188: }
2189:
2190: REMatch mymatch = new REMatch (numSubs, anchor, eflags);
2191: do
2192: {
2193:
2200: if (re.match (input, mymatch))
2201: {
2202: REMatch best = mymatch;
2203:
2204:
2205:
2206:
2207:
2208:
2216: best.end[0] = best.index;
2217: best.finish (input);
2218: input.setLastMatch (best);
2219: return best;
2220: }
2221:
2224: mymatch.clear (++anchor);
2225:
2226: if (buffer != null && input.charAt (0) != CharIndexed.OUT_OF_BOUNDS)
2227: {
2228: buffer.append (input.charAt (0));
2229: }
2230:
2231:
2232: }
2233: while (doMove && input.move1 (1));
2234:
2235:
2236: if (minimumLength == 0)
2237: {
2238: if (match (input, mymatch))
2239: {
2240: mymatch.finish (input);
2241: return mymatch;
2242: }
2243: }
2244:
2245: return null;
2246: }
2247:
2248:
2255: public REMatchEnumeration getMatchEnumeration (Object input)
2256: {
2257: return getMatchEnumeration (input, 0, 0);
2258: }
2259:
2260:
2261:
2270: public REMatchEnumeration getMatchEnumeration (Object input, int index)
2271: {
2272: return getMatchEnumeration (input, index, 0);
2273: }
2274:
2275:
2285: public REMatchEnumeration getMatchEnumeration (Object input, int index,
2286: int eflags)
2287: {
2288: return new REMatchEnumeration (this, makeCharIndexed (input, index),
2289: index, eflags);
2290: }
2291:
2292:
2293:
2301: public String substitute (Object input, String replace)
2302: {
2303: return substitute (input, replace, 0, 0);
2304: }
2305:
2306:
2319: public String substitute (Object input, String replace, int index)
2320: {
2321: return substitute (input, replace, index, 0);
2322: }
2323:
2324:
2337: public String substitute (Object input, String replace, int index,
2338: int eflags)
2339: {
2340: return substituteImpl (makeCharIndexed (input, index), replace, index,
2341: eflags);
2342: }
2343:
2344: private String substituteImpl (CharIndexed input, String replace, int index,
2345: int eflags)
2346: {
2347: CPStringBuilder buffer = new CPStringBuilder ();
2348: REMatch m = getMatchImpl (input, index, eflags, buffer);
2349: if (m == null)
2350: return buffer.toString ();
2351: buffer.append (getReplacement (replace, m, eflags));
2352: if (input.move (m.end[0]))
2353: {
2354: do
2355: {
2356: buffer.append (input.charAt (0));
2357: }
2358: while (input.move (1));
2359: }
2360: return buffer.toString ();
2361: }
2362:
2363:
2372: public String substituteAll (Object input, String replace)
2373: {
2374: return substituteAll (input, replace, 0, 0);
2375: }
2376:
2377:
2391: public String substituteAll (Object input, String replace, int index)
2392: {
2393: return substituteAll (input, replace, index, 0);
2394: }
2395:
2396:
2409: public String substituteAll (Object input, String replace, int index,
2410: int eflags)
2411: {
2412: return substituteAllImpl (makeCharIndexed (input, index), replace, index,
2413: eflags);
2414: }
2415:
2416: private String substituteAllImpl (CharIndexed input, String replace,
2417: int index, int eflags)
2418: {
2419: CPStringBuilder buffer = new CPStringBuilder ();
2420: REMatch m;
2421: while ((m = getMatchImpl (input, index, eflags, buffer)) != null)
2422: {
2423: buffer.append (getReplacement (replace, m, eflags));
2424: index = m.getEndIndex ();
2425: if (m.end[0] == 0)
2426: {
2427: char ch = input.charAt (0);
2428: if (ch != CharIndexed.OUT_OF_BOUNDS)
2429: buffer.append (ch);
2430: input.move (1);
2431: }
2432: else
2433: {
2434: input.move (m.end[0]);
2435: }
2436:
2437: if (!input.isValid ())
2438: break;
2439: }
2440: return buffer.toString ();
2441: }
2442:
2443: public static String getReplacement (String replace, REMatch m, int eflags)
2444: {
2445: if ((eflags & REG_NO_INTERPOLATE) > 0)
2446: return replace;
2447: else
2448: {
2449: if ((eflags & REG_REPLACE_USE_BACKSLASHESCAPE) > 0)
2450: {
2451: CPStringBuilder sb = new CPStringBuilder ();
2452: int l = replace.length ();
2453: for (int i = 0; i < l; i++)
2454: {
2455: char c = replace.charAt (i);
2456: switch (c)
2457: {
2458: case '\\':
2459: i++;
2460:
2461: sb.append (replace.charAt (i));
2462: break;
2463: case '$':
2464: int i1 = i + 1;
2465: while (i1 < replace.length () &&
2466: Character.isDigit (replace.charAt (i1)))
2467: i1++;
2468: sb.append (m.substituteInto (replace.substring (i, i1)));
2469: i = i1 - 1;
2470: break;
2471: default:
2472: sb.append (c);
2473: }
2474: }
2475: return sb.toString ();
2476: }
2477: else
2478: return m.substituteInto (replace);
2479: }
2480: }
2481:
2482:
2483: private void addToken (REToken next)
2484: {
2485: if (next == null)
2486: return;
2487: minimumLength += next.getMinimumLength ();
2488: int nmax = next.getMaximumLength ();
2489: if (nmax < Integer.MAX_VALUE && maximumLength < Integer.MAX_VALUE)
2490: maximumLength += nmax;
2491: else
2492: maximumLength = Integer.MAX_VALUE;
2493:
2494: if (firstToken == null)
2495: {
2496: lastToken = firstToken = next;
2497: }
2498: else
2499: {
2500:
2501:
2502: if (lastToken.chain (next))
2503: {
2504: lastToken = next;
2505: }
2506: }
2507: }
2508:
2509: private static REToken setRepeated (REToken current, int min, int max,
2510: int index) throws REException
2511: {
2512: if (current == null)
2513: throw new REException (getLocalizedMessage ("repeat.no.token"),
2514: REException.REG_BADRPT, index);
2515: return new RETokenRepeated (current.subIndex, current, min, max);
2516: }
2517:
2518: private static int getPosixSet (char[]pattern, int index,
2519: CPStringBuilder buf)
2520: {
2521:
2522:
2523: int i;
2524: for (i = index; i < (pattern.length - 1); i++)
2525: {
2526: if ((pattern[i] == ':') && (pattern[i + 1] == ']'))
2527: return i + 2;
2528: buf.append (pattern[i]);
2529: }
2530: return index;
2531: }
2532:
2533: private int getMinMax (char[]input, int index, IntPair minMax,
2534: RESyntax syntax) throws REException
2535: {
2536:
2537:
2538: boolean mustMatch = !syntax.get (RESyntax.RE_NO_BK_BRACES);
2539: int startIndex = index;
2540: if (index == input.length)
2541: {
2542: if (mustMatch)
2543: throw new REException (getLocalizedMessage ("unmatched.brace"),
2544: REException.REG_EBRACE, index);
2545: else
2546: return startIndex;
2547: }
2548:
2549: int min, max = 0;
2550: CharUnit unit = new CharUnit ();
2551: CPStringBuilder buf = new CPStringBuilder ();
2552:
2553:
2554: do
2555: {
2556: index = getCharUnit (input, index, unit, false);
2557: if (Character.isDigit (unit.ch))
2558: buf.append (unit.ch);
2559: }
2560: while ((index != input.length) && Character.isDigit (unit.ch));
2561:
2562:
2563: if (buf.length () == 0)
2564: {
2565: if (mustMatch)
2566: throw new REException (getLocalizedMessage ("interval.error"),
2567: REException.REG_EBRACE, index);
2568: else
2569: return startIndex;
2570: }
2571:
2572: min = Integer.parseInt (buf.toString ());
2573:
2574: if ((unit.ch == '}') && (syntax.get (RESyntax.RE_NO_BK_BRACES) ^ unit.bk))
2575: max = min;
2576: else if (index == input.length)
2577: if (mustMatch)
2578: throw new REException (getLocalizedMessage ("interval.no.end"),
2579: REException.REG_EBRACE, index);
2580: else
2581: return startIndex;
2582: else
2583: if ((unit.ch == ',') && !unit.bk)
2584: {
2585: buf = new CPStringBuilder ();
2586:
2587: while (((index =
2588: getCharUnit (input, index, unit, false)) != input.length)
2589: && Character.isDigit (unit.ch))
2590: buf.append (unit.ch);
2591:
2592: if (!
2593: ((unit.ch == '}')
2594: && (syntax.get (RESyntax.RE_NO_BK_BRACES) ^ unit.bk)))
2595: if (mustMatch)
2596: throw new REException (getLocalizedMessage ("interval.error"),
2597: REException.REG_EBRACE, index);
2598: else
2599: return startIndex;
2600:
2601:
2602: if (buf.length () == 0)
2603: max = Integer.MAX_VALUE;
2604: else
2605: max = Integer.parseInt (buf.toString ());
2606: }
2607: else if (mustMatch)
2608: throw new REException (getLocalizedMessage ("interval.error"),
2609: REException.REG_EBRACE, index);
2610: else
2611: return startIndex;
2612:
2613:
2614:
2615: minMax.first = min;
2616: minMax.second = max;
2617:
2618:
2619: return index;
2620: }
2621:
2622:
2626: public String toString ()
2627: {
2628: CPStringBuilder sb = new CPStringBuilder ();
2629: dump (sb);
2630: return sb.toString ();
2631: }
2632:
2633: void dump (CPStringBuilder os)
2634: {
2635: os.append ("(?#startRE subIndex=" + subIndex + ")");
2636: if (subIndex == 0)
2637: os.append ("?:");
2638: if (firstToken != null)
2639: firstToken.dumpAll (os);
2640: if (subIndex == 0)
2641: os.append (")");
2642: os.append ("(?#endRE subIndex=" + subIndex + ")");
2643: }
2644:
2645:
2646:
2647:
2648: public static CharIndexed makeCharIndexed (Object input, int index)
2649: {
2650:
2651:
2652:
2653:
2654:
2655:
2656: if (input instanceof CharIndexed)
2657: {
2658: CharIndexed ci = (CharIndexed) input;
2659: ci.setAnchor (index);
2660: return ci;
2661: }
2662: else if (input instanceof CharSequence)
2663: return new CharIndexedCharSequence ((CharSequence) input, index);
2664: else if (input instanceof String)
2665: return new CharIndexedString ((String) input, index);
2666: else if (input instanceof char[])
2667: return new CharIndexedCharArray ((char[]) input, index);
2668: else if (input instanceof StringBuffer)
2669: return new CharIndexedStringBuffer ((StringBuffer) input, index);
2670: else if (input instanceof InputStream)
2671: return new CharIndexedInputStream ((InputStream) input, index);
2672: else
2673: return new CharIndexedString (input.toString (), index);
2674: }
2675: }