Frames | No Frames |
1: /* URI.java -- An URI class 2: Copyright (C) 2002, 2004, 2005, 2006, 2008 Free Software Foundation, Inc. 3: 4: This file is part of GNU Classpath. 5: 6: GNU Classpath is free software; you can redistribute it and/or modify 7: it under the terms of the GNU General Public License as published by 8: the Free Software Foundation; either version 2, or (at your option) 9: any later version. 10: 11: GNU Classpath is distributed in the hope that it will be useful, but 12: WITHOUT ANY WARRANTY; without even the implied warranty of 13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14: General Public License for more details. 15: 16: You should have received a copy of the GNU General Public License 17: along with GNU Classpath; see the file COPYING. If not, write to the 18: Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 19: 02110-1301 USA. 20: 21: Linking this library statically or dynamically with other modules is 22: making a combined work based on this library. Thus, the terms and 23: conditions of the GNU General Public License cover the whole 24: combination. 25: 26: As a special exception, the copyright holders of this library give you 27: permission to link this library with independent modules to produce an 28: executable, regardless of the license terms of these independent 29: modules, and to copy and distribute the resulting executable under 30: terms of your choice, provided that you also meet, for each linked 31: independent module, the terms and conditions of the license of that 32: module. An independent module is a module which is not derived from 33: or based on this library. If you modify this library, you may extend 34: this exception to your version of the library, but you are not 35: obligated to do so. If you do not wish to do so, delete this 36: exception statement from your version. 
*/ 37: 38: 39: package java.net; 40: 41: import gnu.java.lang.CPStringBuilder; 42: 43: import java.io.IOException; 44: import java.io.ObjectInputStream; 45: import java.io.ObjectOutputStream; 46: import java.io.Serializable; 47: import java.util.regex.Matcher; 48: import java.util.regex.Pattern; 49: 50: /** 51: * <p> 52: * A URI instance represents that defined by 53: * <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC3986</a>, 54: * with some deviations. 55: * </p> 56: * <p> 57: * At its highest level, a URI consists of: 58: * </p> 59: * <code>[<em>scheme</em><strong>:</strong>]<em>scheme-specific-part</em> 60: * [<strong>#</strong><em>fragment</em>]</code> 61: * </p> 62: * <p> 63: * where <strong>#</strong> and <strong>:</strong> are literal characters, 64: * and those parts enclosed in square brackets are optional. 65: * </p> 66: * <p> 67: * There are two main types of URI. An <em>opaque</em> URI is one 68: * which just consists of the above three parts, and is not further 69: * defined. An example of such a URI would be <em>mailto:</em> URI. 70: * In contrast, <em>hierarchical</em> URIs give further definition 71: * to the scheme-specific part, so as represent some part of a hierarchical 72: * structure. 73: * </p> 74: * <p> 75: * <code>[<strong>//</strong><em>authority</em>][<em>path</em>] 76: * [<strong>?</strong><em>query</em>]</code> 77: * </p> 78: * <p> 79: * with <strong>/</strong> and <strong>?</strong> being literal characters. 80: * When server-based, the authority section is further subdivided into: 81: * </p> 82: * <p> 83: * <code>[<em>user-info</em><strong>@</strong>]<em>host</em> 84: * [<strong>:</strong><em>port</em>]</code> 85: * </p> 86: * <p> 87: * with <strong>@</strong> and <strong>:</strong> as literal characters. 88: * Authority sections that are not server-based are said to be registry-based. 89: * </p> 90: * <p> 91: * Hierarchical URIs can be either relative or absolute. 
Absolute URIs 92: * always specify a scheme, while relative URIs 93: * don't. Opaque URIs are always absolute. 94: * </p> 95: * <p> 96: * Each part of the URI may have one of three states: undefined, empty 97: * or containing some content. The former two of these are represented 98: * by <code>null</code> and the empty string in Java, respectively. 99: * The scheme-specific part may never be undefined. It also follows from 100: * this that the path sub-part may also not be undefined, so as to ensure 101: * the former. 102: * </p> 103: * <h2>Character Escaping and Quoting</h2> 104: * <p> 105: * The characters that can be used within a valid URI are restricted. 106: * There are two main classes of characters which can't be used as is 107: * within the URI: 108: * </p> 109: * <ol> 110: * <li><strong>Characters outside the US-ASCII character set</strong>. 111: * These have to be <strong>escaped</strong> in order to create 112: * an RFC-compliant URI; this means replacing the character with the 113: * appropriate hexadecimal value, preceded by a `%'.</li> 114: * <li><strong>Illegal characters</strong> (e.g. space characters, 115: * control characters) are quoted, which results in them being encoded 116: * in the same way as non-US-ASCII characters.</li> 117: * </ol> 118: * <p> 119: * The set of valid characters differs depending on the section of the URI: 120: * </p> 121: * <ul> 122: * <li><strong>Scheme</strong>: Must be an alphanumeric, `-', `.' 
or '+'.</li> 123: * <li><strong>Authority</strong>:Composed of the username, host, port, `@' 124: * and `:'.</li> 125: * <li><strong>Username</strong>: Allows unreserved or percent-encoded 126: * characters, sub-delimiters and `:'.</li> 127: * <li><strong>Host</strong>: Allows unreserved or percent-encoded 128: * characters, sub-delimiters and square brackets (`[' and `]') for IPv6 129: * addresses.</li> 130: * <li><strong>Port</strong>: Digits only.</li> 131: * <li><strong>Path</strong>: Allows the path characters and `/'. 132: * <li><strong>Query</strong>: Allows the path characters, `?' and '/'. 133: * <li><strong>Fragment</strong>: Allows the path characters, `?' and '/'. 134: * </ul> 135: * <p> 136: * These definitions reference the following sets of characters: 137: * </p> 138: * <ul> 139: * <li><strong>Unreserved characters</strong>: The alphanumerics plus 140: * `-', `.', `_', and `~'.</li> 141: * <li><strong>Sub-delimiters</strong>: `!', `$', `&', `(', `)', `*', 142: * `+', `,', `;', `=' and the single-quote itself.</li> 143: * <li><strong>Path characters</strong>: Unreserved and percent-encoded 144: * characters and the sub-delimiters along with `@' and `:'.</li> 145: * </ul> 146: * <p> 147: * The constructors and accessor methods allow the use and retrieval of 148: * URI components which contain non-US-ASCII characters directly. 149: * They are only escaped when the <code>toASCIIString()</code> method 150: * is used. In contrast, illegal characters are always quoted, with the 151: * exception of the return values of the non-raw accessors. 152: * </p> 153: * 154: * @author Ito Kazumitsu (ito.kazumitsu@hitachi-cable.co.jp) 155: * @author Dalibor Topic (robilad@kaffe.org) 156: * @author Michael Koch (konqueror@gmx.de) 157: * @author Andrew John Hughes (gnu_andrew@member.fsf.org) 158: * @since 1.4 159: */ 160: public final class URI 161: implements Comparable<URI>, Serializable 162: { 163: /** 164: * For serialization compatability. 
165: */ 166: static final long serialVersionUID = -6052424284110960213L; 167: 168: /** 169: * Regular expression for parsing URIs. 170: * 171: * Taken from RFC 2396, Appendix B. 172: * This expression doesn't parse IPv6 addresses. 173: */ 174: private static final String URI_REGEXP = 175: "^(([^:/?#]+):)?((//([^/?#]*))?([^?#]*)(\\?([^#]*))?)?(#(.*))?"; 176: 177: /** 178: * Regular expression for parsing the authority segment. 179: */ 180: private static final String AUTHORITY_REGEXP = 181: "(([^?#]*)@)?([^?#:]*)(:([0-9]*))?"; 182: 183: /** 184: * Valid characters (taken from rfc2396/3986) 185: */ 186: private static final String RFC2396_DIGIT = "0123456789"; 187: private static final String RFC2396_LOWALPHA = "abcdefghijklmnopqrstuvwxyz"; 188: private static final String RFC2396_UPALPHA = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; 189: private static final String RFC2396_ALPHA = 190: RFC2396_LOWALPHA + RFC2396_UPALPHA; 191: private static final String RFC2396_ALPHANUM = RFC2396_DIGIT + RFC2396_ALPHA; 192: private static final String RFC3986_UNRESERVED = RFC2396_ALPHANUM + "-._~"; 193: private static final String RFC3986_SUBDELIMS = "!$&'()*+,;="; 194: private static final String RFC3986_REG_NAME = 195: RFC3986_UNRESERVED + RFC3986_SUBDELIMS + "%"; 196: private static final String RFC3986_PCHAR = RFC3986_UNRESERVED + 197: RFC3986_SUBDELIMS + ":@%"; 198: private static final String RFC3986_SEGMENT = RFC3986_PCHAR; 199: private static final String RFC3986_PATH_SEGMENTS = RFC3986_SEGMENT + "/"; 200: private static final String RFC3986_SSP = RFC3986_PCHAR + "?/"; 201: private static final String RFC3986_HOST = RFC3986_REG_NAME + "[]"; 202: private static final String RFC3986_USERINFO = RFC3986_REG_NAME + ":"; 203: 204: /** 205: * Index of scheme component in parsed URI. 206: */ 207: private static final int SCHEME_GROUP = 2; 208: 209: /** 210: * Index of scheme-specific-part in parsed URI. 
211: */ 212: private static final int SCHEME_SPEC_PART_GROUP = 3; 213: 214: /** 215: * Index of authority component in parsed URI. 216: */ 217: private static final int AUTHORITY_GROUP = 5; 218: 219: /** 220: * Index of path component in parsed URI. 221: */ 222: private static final int PATH_GROUP = 6; 223: 224: /** 225: * Index of query component in parsed URI. 226: */ 227: private static final int QUERY_GROUP = 8; 228: 229: /** 230: * Index of fragment component in parsed URI. 231: */ 232: private static final int FRAGMENT_GROUP = 10; 233: 234: /** 235: * Index of userinfo component in parsed authority section. 236: */ 237: private static final int AUTHORITY_USERINFO_GROUP = 2; 238: 239: /** 240: * Index of host component in parsed authority section. 241: */ 242: private static final int AUTHORITY_HOST_GROUP = 3; 243: 244: /** 245: * Index of port component in parsed authority section. 246: */ 247: private static final int AUTHORITY_PORT_GROUP = 5; 248: 249: /** 250: * The compiled version of the URI regular expression. 251: */ 252: private static final Pattern URI_PATTERN; 253: 254: /** 255: * The compiled version of the authority regular expression. 256: */ 257: private static final Pattern AUTHORITY_PATTERN; 258: 259: /** 260: * The set of valid hexadecimal characters. 
261: */ 262: private static final String HEX = "0123456789ABCDEF"; 263: 264: private transient String scheme; 265: private transient String rawSchemeSpecificPart; 266: private transient String schemeSpecificPart; 267: private transient String rawAuthority; 268: private transient String authority; 269: private transient String rawUserInfo; 270: private transient String userInfo; 271: private transient String rawHost; 272: private transient String host; 273: private transient int port = -1; 274: private transient String rawPath; 275: private transient String path; 276: private transient String rawQuery; 277: private transient String query; 278: private transient String rawFragment; 279: private transient String fragment; 280: private String string; 281: 282: /** 283: * Static initializer to pre-compile the regular expressions. 284: */ 285: static 286: { 287: URI_PATTERN = Pattern.compile(URI_REGEXP); 288: AUTHORITY_PATTERN = Pattern.compile(AUTHORITY_REGEXP); 289: } 290: 291: private void readObject(ObjectInputStream is) 292: throws ClassNotFoundException, IOException 293: { 294: this.string = (String) is.readObject(); 295: try 296: { 297: parseURI(this.string); 298: } 299: catch (URISyntaxException x) 300: { 301: // Should not happen. 302: throw new RuntimeException(x); 303: } 304: } 305: 306: private void writeObject(ObjectOutputStream os) throws IOException 307: { 308: if (string == null) 309: string = toString(); 310: os.writeObject(string); 311: } 312: 313: /** 314: * <p> 315: * Returns the string content of the specified group of the supplied 316: * matcher. The returned value is modified according to the following: 317: * </p> 318: * <ul> 319: * <li>If the resulting string has a length greater than 0, then 320: * that string is returned.</li> 321: * <li>If a string of zero length, is matched, then the content 322: * of the preceding group is considered. 
If this is also an empty 323: * string, then <code>null</code> is returned to indicate an undefined 324: * value. Otherwise, the value is truly the empty string and this is 325: * the returned value.</li> 326: * </ul> 327: * <p> 328: * This method is used for matching against all parts of the URI 329: * that may be either undefined or empty (i.e. all those but the 330: * scheme-specific part and the path). In each case, the preceding 331: * group is the content of the original group, along with some 332: * additional distinguishing feature. For example, the preceding 333: * group for the query includes the preceding question mark, 334: * while that of the fragment includes the hash symbol. The presence 335: * of these features enables disambiguation between the two cases 336: * of a completely unspecified value and a simple non-existant value. 337: * The scheme differs in that it will never return an empty string; 338: * the delimiter follows the scheme rather than preceding it, so 339: * it becomes part of the following section. The same is true 340: * of the user information. 341: * </p> 342: * 343: * @param match the matcher, which contains the results of the URI 344: * matched against the URI regular expression. 345: * @return either the matched content, <code>null</code> for undefined 346: * values, or an empty string for a URI part with empty content. 347: */ 348: private static String getURIGroup(Matcher match, int group) 349: { 350: String matched = match.group(group); 351: if (matched == null || matched.length() == 0) 352: { 353: String prevMatched = match.group(group -1); 354: if (prevMatched == null || prevMatched.length() == 0) 355: return null; 356: else 357: return ""; 358: } 359: return matched; 360: } 361: 362: /** 363: * Sets fields of this URI by parsing the given string. 
364: * 365: * @param str The string to parse 366: * 367: * @exception URISyntaxException If the given string violates RFC 2396 368: */ 369: private void parseURI(String str) throws URISyntaxException 370: { 371: Matcher matcher = URI_PATTERN.matcher(str); 372: 373: if (matcher.matches()) 374: { 375: scheme = getURIGroup(matcher, SCHEME_GROUP); 376: rawSchemeSpecificPart = matcher.group(SCHEME_SPEC_PART_GROUP); 377: schemeSpecificPart = unquote(rawSchemeSpecificPart); 378: if (!isOpaque()) 379: { 380: rawAuthority = getURIGroup(matcher, AUTHORITY_GROUP); 381: rawPath = matcher.group(PATH_GROUP); 382: rawQuery = getURIGroup(matcher, QUERY_GROUP); 383: } 384: rawFragment = getURIGroup(matcher, FRAGMENT_GROUP); 385: } 386: else 387: throw new URISyntaxException(str, 388: "doesn't match URI regular expression"); 389: parseServerAuthority(); 390: 391: // We must eagerly unquote the parts, because this is the only time 392: // we may throw an exception. 393: authority = unquote(rawAuthority); 394: userInfo = unquote(rawUserInfo); 395: host = unquote(rawHost); 396: path = unquote(rawPath); 397: query = unquote(rawQuery); 398: fragment = unquote(rawFragment); 399: } 400: 401: /** 402: * Unquote "%" + hex quotes characters 403: * 404: * @param str The string to unquote or null. 405: * 406: * @return The unquoted string or null if str was null. 407: * 408: * @exception URISyntaxException If the given string contains invalid 409: * escape sequences. 
410: */ 411: private static String unquote(String str) throws URISyntaxException 412: { 413: if (str == null) 414: return null; 415: byte[] buf = new byte[str.length()]; 416: int pos = 0; 417: for (int i = 0; i < str.length(); i++) 418: { 419: char c = str.charAt(i); 420: if (c == '%') 421: { 422: if (i + 2 >= str.length()) 423: throw new URISyntaxException(str, "Invalid quoted character"); 424: int hi = Character.digit(str.charAt(++i), 16); 425: int lo = Character.digit(str.charAt(++i), 16); 426: if (lo < 0 || hi < 0) 427: throw new URISyntaxException(str, "Invalid quoted character"); 428: buf[pos++] = (byte) (hi * 16 + lo); 429: } 430: else 431: buf[pos++] = (byte) c; 432: } 433: try 434: { 435: return new String(buf, 0, pos, "utf-8"); 436: } 437: catch (java.io.UnsupportedEncodingException x2) 438: { 439: throw (Error) new InternalError().initCause(x2); 440: } 441: } 442: 443: /** 444: * Quote characters illegal in URIs in given string. 445: * 446: * Replace illegal characters by encoding their UTF-8 447: * representation as "%" + hex code for each resulting 448: * UTF-8 character. 449: * 450: * @param str The string to quote 451: * 452: * @return The quoted string. 453: */ 454: private static String quote(String str) 455: { 456: return quote(str, RFC3986_SSP); 457: } 458: 459: /** 460: * Quote characters illegal in URI authorities in given string. 461: * 462: * Replace illegal characters by encoding their UTF-8 463: * representation as "%" + hex code for each resulting 464: * UTF-8 character. 465: * 466: * @param str The string to quote 467: * 468: * @return The quoted string. 469: */ 470: private static String quoteAuthority(String str) 471: { 472: // Technically, we should be using RFC2396_AUTHORITY, but 473: // it contains no additional characters. 474: return quote(str, RFC3986_REG_NAME); 475: } 476: 477: /** 478: * Quotes the characters in the supplied string that are not part of 479: * the specified set of legal characters. 
480: * 481: * @param str the string to quote 482: * @param legalCharacters the set of legal characters 483: * 484: * @return the quoted string. 485: */ 486: private static String quote(String str, String legalCharacters) 487: { 488: CPStringBuilder sb = new CPStringBuilder(str.length()); 489: for (int i = 0; i < str.length(); i++) 490: { 491: char c = str.charAt(i); 492: if ((legalCharacters.indexOf(c) == -1) 493: && (c <= 127)) 494: { 495: sb.append('%'); 496: sb.append(HEX.charAt(c / 16)); 497: sb.append(HEX.charAt(c % 16)); 498: } 499: else 500: sb.append(c); 501: } 502: return sb.toString(); 503: } 504: 505: /** 506: * Quote characters illegal in URI hosts in given string. 507: * 508: * Replace illegal characters by encoding their UTF-8 509: * representation as "%" + hex code for each resulting 510: * UTF-8 character. 511: * 512: * @param str The string to quote 513: * 514: * @return The quoted string. 515: */ 516: private static String quoteHost(String str) 517: { 518: return quote(str, RFC3986_HOST); 519: } 520: 521: /** 522: * Quote characters illegal in URI paths in given string. 523: * 524: * Replace illegal characters by encoding their UTF-8 525: * representation as "%" + hex code for each resulting 526: * UTF-8 character. 527: * 528: * @param str The string to quote 529: * 530: * @return The quoted string. 531: */ 532: private static String quotePath(String str) 533: { 534: // Technically, we should be using RFC2396_PATH, but 535: // it contains no additional characters. 536: return quote(str, RFC3986_PATH_SEGMENTS); 537: } 538: 539: /** 540: * Quote characters illegal in URI user infos in given string. 541: * 542: * Replace illegal characters by encoding their UTF-8 543: * representation as "%" + hex code for each resulting 544: * UTF-8 character. 545: * 546: * @param str The string to quote 547: * 548: * @return The quoted string. 
549: */ 550: private static String quoteUserInfo(String str) 551: { 552: return quote(str, RFC3986_USERINFO); 553: } 554: 555: /** 556: * Creates an URI from the given string 557: * 558: * @param str The string to create the URI from 559: * 560: * @exception URISyntaxException If the given string violates RFC 2396 561: * @exception NullPointerException If str is null 562: */ 563: public URI(String str) throws URISyntaxException 564: { 565: this.string = str; 566: parseURI(str); 567: } 568: 569: /** 570: * Create an URI from the given components 571: * 572: * @param scheme The scheme name 573: * @param userInfo The username and authorization info 574: * @param host The hostname 575: * @param port The port number 576: * @param path The path 577: * @param query The query 578: * @param fragment The fragment 579: * 580: * @exception URISyntaxException If the given string violates RFC 2396 581: */ 582: public URI(String scheme, String userInfo, String host, int port, 583: String path, String query, String fragment) 584: throws URISyntaxException 585: { 586: this((scheme == null ? "" : scheme + ":") 587: + (userInfo == null && host == null && port == -1 ? "" : "//") 588: + (userInfo == null ? "" : quoteUserInfo(userInfo) + "@") 589: + (host == null ? "" : quoteHost(host)) 590: + (port == -1 ? "" : ":" + String.valueOf(port)) 591: + (path == null ? "" : quotePath(path)) 592: + (query == null ? "" : "?" + quote(query)) 593: + (fragment == null ? "" : "#" + quote(fragment))); 594: } 595: 596: /** 597: * Create an URI from the given components 598: * 599: * @param scheme The scheme name 600: * @param authority The authority 601: * @param path The apth 602: * @param query The query 603: * @param fragment The fragment 604: * 605: * @exception URISyntaxException If the given string violates RFC 2396 606: */ 607: public URI(String scheme, String authority, String path, String query, 608: String fragment) throws URISyntaxException 609: { 610: this((scheme == null ? 
"" : scheme + ":") 611: + (authority == null ? "" : "//" + quoteAuthority(authority)) 612: + (path == null ? "" : quotePath(path)) 613: + (query == null ? "" : "?" + quote(query)) 614: + (fragment == null ? "" : "#" + quote(fragment))); 615: } 616: 617: /** 618: * Create an URI from the given components 619: * 620: * @param scheme The scheme name 621: * @param host The hostname 622: * @param path The path 623: * @param fragment The fragment 624: * 625: * @exception URISyntaxException If the given string violates RFC 2396 626: */ 627: public URI(String scheme, String host, String path, String fragment) 628: throws URISyntaxException 629: { 630: this(scheme, null, host, -1, path, null, fragment); 631: } 632: 633: /** 634: * Create an URI from the given components 635: * 636: * @param scheme The scheme name 637: * @param ssp The scheme specific part 638: * @param fragment The fragment 639: * 640: * @exception URISyntaxException If the given string violates RFC 2396 641: */ 642: public URI(String scheme, String ssp, String fragment) 643: throws URISyntaxException 644: { 645: this((scheme == null ? "" : scheme + ":") 646: + (ssp == null ? "" : quote(ssp)) 647: + (fragment == null ? "" : "#" + quote(fragment))); 648: } 649: 650: /** 651: * Create an URI from the given string 652: * 653: * @param str The string to create the URI from 654: * 655: * @exception IllegalArgumentException If the given string violates RFC 2396 656: * @exception NullPointerException If str is null 657: */ 658: public static URI create(String str) 659: { 660: try 661: { 662: return new URI(str); 663: } 664: catch (URISyntaxException e) 665: { 666: throw (IllegalArgumentException) new IllegalArgumentException() 667: .initCause(e); 668: } 669: } 670: 671: /** 672: * Attempts to parse this URI's authority component, if defined, 673: * into user-information, host, and port components. 
The purpose 674: * of this method was to disambiguate between some authority sections, 675: * which form invalid server-based authories, but valid registry 676: * based authorities. In the updated RFC 3986, the authority section 677: * is defined differently, with registry-based authorities part of 678: * the host section. Thus, this method is now simply an explicit 679: * way of parsing any authority section. 680: * 681: * @return the URI, with the authority section parsed into user 682: * information, host and port components. 683: * @throws URISyntaxException if the given string violates RFC 2396 684: */ 685: public URI parseServerAuthority() throws URISyntaxException 686: { 687: if (rawAuthority != null) 688: { 689: Matcher matcher = AUTHORITY_PATTERN.matcher(rawAuthority); 690: 691: if (matcher.matches()) 692: { 693: rawUserInfo = getURIGroup(matcher, AUTHORITY_USERINFO_GROUP); 694: rawHost = getURIGroup(matcher, AUTHORITY_HOST_GROUP); 695: 696: String portStr = getURIGroup(matcher, AUTHORITY_PORT_GROUP); 697: 698: if (portStr != null && ! portStr.isEmpty()) 699: try 700: { 701: port = Integer.parseInt(portStr); 702: } 703: catch (NumberFormatException e) 704: { 705: URISyntaxException use = 706: new URISyntaxException 707: (string, "doesn't match URI regular expression"); 708: use.initCause(e); 709: throw use; 710: } 711: } 712: else 713: throw new URISyntaxException(string, 714: "doesn't match URI regular expression"); 715: } 716: return this; 717: } 718: 719: /** 720: * <p> 721: * Returns a normalized version of the URI. If the URI is opaque, 722: * or its path is already in normal form, then this URI is simply 723: * returned. Otherwise, the following transformation of the path 724: * element takes place: 725: * </p> 726: * <ol> 727: * <li>All `.' segments are removed.</li> 728: * <li>Each `..' segment which can be paired with a prior non-`..' segment 729: * is removed along with the preceding segment.</li> 730: * <li>A `.' 
segment is added to the front if the first segment contains 731: * a colon (`:'). This is a deviation from the RFC, which prevents 732: * confusion between the path and the scheme.</li> 733: * </ol> 734: * <p> 735: * The resulting URI will be free of `.' and `..' segments, barring those 736: * that were prepended or which couldn't be paired, respectively. 737: * </p> 738: * 739: * @return the normalized URI. 740: */ 741: public URI normalize() 742: { 743: if (isOpaque() || path.indexOf("/./") == -1 && path.indexOf("/../") == -1) 744: return this; 745: try 746: { 747: return new URI(scheme, authority, normalizePath(path), query, 748: fragment); 749: } 750: catch (URISyntaxException e) 751: { 752: throw (Error) new InternalError("Normalized URI variant could not "+ 753: "be constructed").initCause(e); 754: } 755: } 756: 757: /** 758: * <p> 759: * Normalize the given path. The following transformation takes place: 760: * </p> 761: * <ol> 762: * <li>All `.' segments are removed.</li> 763: * <li>Each `..' segment which can be paired with a prior non-`..' segment 764: * is removed along with the preceding segment.</li> 765: * <li>A `.' segment is added to the front if the first segment contains 766: * a colon (`:'). This is a deviation from the RFC, which prevents 767: * confusion between the path and the scheme.</li> 768: * </ol> 769: * <p> 770: * The resulting URI will be free of `.' and `..' segments, barring those 771: * that were prepended or which couldn't be paired, respectively. 772: * </p> 773: * 774: * @param relativePath the relative path to be normalized. 775: * @return the normalized path. 776: */ 777: private String normalizePath(String relativePath) 778: { 779: /* 780: This follows the algorithm in section 5.2.4. of RFC3986, 781: but doesn't modify the input buffer. 
782: */ 783: CPStringBuilder input = new CPStringBuilder(relativePath); 784: CPStringBuilder output = new CPStringBuilder(); 785: int start = 0; 786: while (start < input.length()) 787: { 788: /* A */ 789: if (input.indexOf("../",start) == start) 790: { 791: start += 3; 792: continue; 793: } 794: if (input.indexOf("./",start) == start) 795: { 796: start += 2; 797: continue; 798: } 799: /* B */ 800: if (input.indexOf("/./",start) == start) 801: { 802: start += 2; 803: continue; 804: } 805: if (input.indexOf("/.",start) == start 806: && input.charAt(start + 2) != '.') 807: { 808: start += 1; 809: input.setCharAt(start,'/'); 810: continue; 811: } 812: /* C */ 813: if (input.indexOf("/../",start) == start) 814: { 815: start += 3; 816: removeLastSegment(output); 817: continue; 818: } 819: if (input.indexOf("/..",start) == start) 820: { 821: start += 2; 822: input.setCharAt(start,'/'); 823: removeLastSegment(output); 824: continue; 825: } 826: /* D */ 827: if (start == input.length() - 1 && input.indexOf(".",start) == start) 828: { 829: input.delete(0,1); 830: continue; 831: } 832: if (start == input.length() - 2 && input.indexOf("..",start) == start) 833: { 834: input.delete(0,2); 835: continue; 836: } 837: /* E */ 838: int indexOfSlash = input.indexOf("/",start); 839: while (indexOfSlash == start) 840: { 841: output.append("/"); 842: ++start; 843: indexOfSlash = input.indexOf("/",start); 844: } 845: if (indexOfSlash == -1) 846: indexOfSlash = input.length(); 847: output.append(input.substring(start, indexOfSlash)); 848: start = indexOfSlash; 849: } 850: return output.toString(); 851: } 852: 853: /** 854: * Removes the last segment of the path from the specified buffer. 855: * 856: * @param buffer the buffer containing the path. 
857: */ 858: private void removeLastSegment(CPStringBuilder buffer) 859: { 860: int lastSlash = buffer.lastIndexOf("/"); 861: if (lastSlash == -1) 862: buffer.setLength(0); 863: else 864: buffer.setLength(lastSlash); 865: } 866: 867: /** 868: * Resolves the given URI against this URI 869: * 870: * @param uri The URI to resolve against this URI 871: * 872: * @return The resulting URI, or null when it couldn't be resolved 873: * for some reason. 874: * 875: * @throws NullPointerException if uri is null 876: */ 877: public URI resolve(URI uri) 878: { 879: if (uri.isAbsolute()) 880: return uri; 881: if (uri.isOpaque()) 882: return uri; 883: 884: String scheme = uri.getScheme(); 885: String schemeSpecificPart = uri.getSchemeSpecificPart(); 886: String authority = uri.getAuthority(); 887: String path = uri.getPath(); 888: String query = uri.getQuery(); 889: String fragment = uri.getFragment(); 890: 891: try 892: { 893: if (fragment != null && path != null && path.equals("") 894: && scheme == null && authority == null && query == null) 895: return new URI(this.scheme, this.schemeSpecificPart, fragment); 896: 897: if (authority == null) 898: { 899: authority = this.authority; 900: if (path == null) 901: path = ""; 902: if (! 
(path.startsWith("/"))) 903: { 904: CPStringBuilder basepath = new CPStringBuilder(this.path); 905: int i = this.path.lastIndexOf('/'); 906: 907: if (i >= 0) 908: basepath.delete(i + 1, basepath.length()); 909: 910: basepath.append(path); 911: path = normalizePath(basepath.toString()); 912: } 913: } 914: return new URI(this.scheme, authority, path, query, fragment); 915: } 916: catch (URISyntaxException e) 917: { 918: throw (Error) new InternalError("Resolved URI variant could not "+ 919: "be constructed").initCause(e); 920: } 921: } 922: 923: /** 924: * Resolves the given URI string against this URI 925: * 926: * @param str The URI as string to resolve against this URI 927: * 928: * @return The resulting URI 929: * 930: * @throws IllegalArgumentException If the given URI string 931: * violates RFC 2396 932: * @throws NullPointerException If uri is null 933: */ 934: public URI resolve(String str) throws IllegalArgumentException 935: { 936: return resolve(create(str)); 937: } 938: 939: /** 940: * <p> 941: * Relativizes the given URI against this URI. 
The following 942: * algorithm is used: 943: * </p> 944: * <ul> 945: * <li>If either URI is opaque, the given URI is returned.</li> 946: * <li>If the schemes of the URIs differ, the given URI is returned.</li> 947: * <li>If the authority components of the URIs differ, then the given 948: * URI is returned.</li> 949: * <li>If the path of this URI is not a prefix of the supplied URI, 950: * then the given URI is returned.</li> 951: * <li>If all the above conditions hold, a new URI is created using the 952: * query and fragment components of the given URI, along with a path 953: * computed by removing the path of this URI from the start of the path 954: * of the supplied URI.</li> 955: * </ul> 956: * 957: * @param uri the URI to relativize agsint this URI 958: * @return the resulting URI 959: * @throws NullPointerException if the uri is null 960: */ 961: public URI relativize(URI uri) 962: { 963: if (isOpaque() || uri.isOpaque()) 964: return uri; 965: if (scheme == null && uri.getScheme() != null) 966: return uri; 967: if (scheme != null && !(scheme.equals(uri.getScheme()))) 968: return uri; 969: if (rawAuthority == null && uri.getRawAuthority() != null) 970: return uri; 971: if (rawAuthority != null && !(rawAuthority.equals(uri.getRawAuthority()))) 972: return uri; 973: String basePath = rawPath; 974: if (!(uri.getRawPath().equals(rawPath))) 975: { 976: if (!(basePath.endsWith("/"))) 977: basePath = basePath.concat("/"); 978: if (!(uri.getRawPath().startsWith(basePath))) 979: return uri; 980: } 981: try 982: { 983: return new URI(null, null, 984: uri.getRawPath().substring(basePath.length()), 985: uri.getRawQuery(), uri.getRawFragment()); 986: } 987: catch (URISyntaxException e) 988: { 989: throw (Error) new InternalError("Relativized URI variant could not "+ 990: "be constructed").initCause(e); 991: } 992: } 993: 994: /** 995: * Creates an URL from an URI 996: * 997: * @throws MalformedURLException If a protocol handler for the URL could 998: * not be found, or if 
some other error occurred while constructing the URL 999: * @throws IllegalArgumentException If the URI is not absolute 1000: */ 1001: public URL toURL() throws IllegalArgumentException, MalformedURLException 1002: { 1003: if (isAbsolute()) 1004: return new URL(this.toString()); 1005: 1006: throw new IllegalArgumentException("not absolute"); 1007: } 1008: 1009: /** 1010: * Returns the scheme of the URI 1011: */ 1012: public String getScheme() 1013: { 1014: return scheme; 1015: } 1016: 1017: /** 1018: * Tells whether this URI is absolute or not 1019: */ 1020: public boolean isAbsolute() 1021: { 1022: return scheme != null; 1023: } 1024: 1025: /** 1026: * Tell whether this URI is opaque or not 1027: */ 1028: public boolean isOpaque() 1029: { 1030: return ((scheme != null) && ! (schemeSpecificPart.startsWith("/"))); 1031: } 1032: 1033: /** 1034: * Returns the raw scheme specific part of this URI. 1035: * The scheme-specific part is never undefined, though it may be empty 1036: */ 1037: public String getRawSchemeSpecificPart() 1038: { 1039: return rawSchemeSpecificPart; 1040: } 1041: 1042: /** 1043: * Returns the decoded scheme specific part of this URI. 
1044: */ 1045: public String getSchemeSpecificPart() 1046: { 1047: return schemeSpecificPart; 1048: } 1049: 1050: /** 1051: * Returns the raw authority part of this URI 1052: */ 1053: public String getRawAuthority() 1054: { 1055: return rawAuthority; 1056: } 1057: 1058: /** 1059: * Returns the decoded authority part of this URI 1060: */ 1061: public String getAuthority() 1062: { 1063: return authority; 1064: } 1065: 1066: /** 1067: * Returns the raw user info part of this URI 1068: */ 1069: public String getRawUserInfo() 1070: { 1071: return rawUserInfo; 1072: } 1073: 1074: /** 1075: * Returns the decoded user info part of this URI 1076: */ 1077: public String getUserInfo() 1078: { 1079: return userInfo; 1080: } 1081: 1082: /** 1083: * Returns the hostname of the URI 1084: */ 1085: public String getHost() 1086: { 1087: return host; 1088: } 1089: 1090: /** 1091: * Returns the port number of the URI 1092: */ 1093: public int getPort() 1094: { 1095: return port; 1096: } 1097: 1098: /** 1099: * Returns the raw path part of this URI 1100: */ 1101: public String getRawPath() 1102: { 1103: return rawPath; 1104: } 1105: 1106: /** 1107: * Returns the path of the URI 1108: */ 1109: public String getPath() 1110: { 1111: return path; 1112: } 1113: 1114: /** 1115: * Returns the raw query part of this URI 1116: */ 1117: public String getRawQuery() 1118: { 1119: return rawQuery; 1120: } 1121: 1122: /** 1123: * Returns the query of the URI 1124: */ 1125: public String getQuery() 1126: { 1127: return query; 1128: } 1129: 1130: /** 1131: * Return the raw fragment part of this URI 1132: */ 1133: public String getRawFragment() 1134: { 1135: return rawFragment; 1136: } 1137: 1138: /** 1139: * Returns the fragment of the URI 1140: */ 1141: public String getFragment() 1142: { 1143: return fragment; 1144: } 1145: 1146: /** 1147: * <p> 1148: * Compares the URI with the given object for equality. If the 1149: * object is not a <code>URI</code>, then the method returns false. 
Otherwise, the following criteria are observed:
   * </p>
   * <ul>
   * <li>The scheme of the URIs must either be null (undefined) in both cases,
   * or equal, ignoring case.</li>
   * <li>The raw fragment of the URIs must either be null (undefined) in both
   * cases, or equal, ignoring case.</li>
   * <li>Both URIs must be of the same type (opaque or hierarchical)</li>
   * <li><strong>For opaque URIs:</strong>
   * <ul>
   * <li>The raw scheme-specific parts must be equal.</li>
   * </ul>
   * </li>
   * <li><strong>For hierarchical URIs:</strong>
   * <ul>
   * <li>The raw paths must be equal, ignoring case.</li>
   * <li>The raw queries are either both undefined or both equal, ignoring
   * case.</li>
   * <li>The raw authority sections are either both undefined or:
   * <ul>
   * <li><strong>For registry-based authorities:</strong>
   * <ul><li>they are equal.</li></ul>
   * </li>
   * <li><strong>For server-based authorities:</strong>
   * <ul>
   * <li>the hosts are equal, ignoring case</li>
   * <li>the ports are equal</li>
   * <li>the user information components are equal</li>
   * </ul>
   * </li>
   * </ul>
   * </li>
   * </ul>
   * </li>
   * </ul>
   *
   * @param obj the object to compare the URI with.
   * @return <code>true</code> if the objects are equal, according to
   * the specification above.
1182: */ 1183: public boolean equals(Object obj) 1184: { 1185: if (!(obj instanceof URI)) 1186: return false; 1187: URI uriObj = (URI) obj; 1188: if (scheme == null) 1189: { 1190: if (uriObj.getScheme() != null) 1191: return false; 1192: } 1193: else 1194: if (!(scheme.equalsIgnoreCase(uriObj.getScheme()))) 1195: return false; 1196: if (rawFragment == null) 1197: { 1198: if (uriObj.getRawFragment() != null) 1199: return false; 1200: } 1201: else 1202: if (!(rawFragment.equalsIgnoreCase(uriObj.getRawFragment()))) 1203: return false; 1204: boolean opaqueThis = isOpaque(); 1205: boolean opaqueObj = uriObj.isOpaque(); 1206: if (opaqueThis && opaqueObj) 1207: return rawSchemeSpecificPart.equals(uriObj.getRawSchemeSpecificPart()); 1208: else if (!opaqueThis && !opaqueObj) 1209: { 1210: boolean common = rawPath.equalsIgnoreCase(uriObj.getRawPath()) 1211: && ((rawQuery == null && uriObj.getRawQuery() == null) 1212: || rawQuery.equalsIgnoreCase(uriObj.getRawQuery())); 1213: if (rawAuthority == null && uriObj.getRawAuthority() == null) 1214: return common; 1215: if (host == null) 1216: return common 1217: && rawAuthority.equalsIgnoreCase(uriObj.getRawAuthority()); 1218: return common 1219: && host.equalsIgnoreCase(uriObj.getHost()) 1220: && port == uriObj.getPort() 1221: && (rawUserInfo == null ? 1222: uriObj.getRawUserInfo() == null : 1223: rawUserInfo.equalsIgnoreCase(uriObj.getRawUserInfo())); 1224: } 1225: else 1226: return false; 1227: } 1228: 1229: /** 1230: * Computes the hashcode of the URI 1231: */ 1232: public int hashCode() 1233: { 1234: return (getScheme() == null ? 0 : 13 * getScheme().hashCode()) 1235: + 17 * getRawSchemeSpecificPart().hashCode() 1236: + (getRawFragment() == null ? 0 : 21 + getRawFragment().hashCode()); 1237: } 1238: 1239: /** 1240: * Compare the URI with another URI. 1241: * Undefined components are taken to be less than any other component. 
The following criteria are observed:
   * <ul>
   * <li>Two URIs with different schemes are compared according to their
   * scheme, regardless of case.</li>
   * <li>A hierarchical URI is less than an opaque URI with the same
   * scheme.</li>
   * <li><strong>For opaque URIs:</strong>
   * <ul>
   * <li>URIs with differing scheme-specific parts are ordered according
   * to the ordering of the scheme-specific part.</li>
   * <li>URIs with the same scheme-specific part are ordered by the
   * raw fragment.</li>
   * </ul>
   * </li>
   * <li><strong>For hierarchical URIs:</strong>
   * <ul>
   * <li>URIs are ordered according to their raw authority sections,
   * if they are unequal.
   * <ul>
   * <li><strong>For registry-based authorities:</strong>
   * <ul><li>they are ordered according to the ordering of the authority
   * component.</li></ul>
   * </li>
   * <li><strong>For server-based authorities:</strong>
   * <ul>
   * <li>URIs are ordered according to the raw user information.</li>
   * <li>URIs with the same user information are ordered by the host,
   * ignoring case.</li>
   * <li>URIs with the same host are ordered by the port.</li>
   * </ul>
   * </li>
   * </ul>
   * </li>
   * <li>URIs with the same authority section are ordered by the raw path.</li>
   * <li>URIs with the same path are ordered by their raw query.</li>
   * <li>URIs with the same query are ordered by their raw fragments.</li>
   * </ul>
   * </li>
   * </ul>
   *
   * @param uri The other URI to compare this URI with
   * @return a negative integer, zero or a positive integer depending
   * on whether this URI is less than, equal to or greater
   * than that supplied, respectively.
1280: */ 1281: public int compareTo(URI uri) 1282: throws ClassCastException 1283: { 1284: if (scheme == null && uri.getScheme() != null) 1285: return -1; 1286: if (scheme != null) 1287: { 1288: int sCompare = scheme.compareToIgnoreCase(uri.getScheme()); 1289: if (sCompare != 0) 1290: return sCompare; 1291: } 1292: boolean opaqueThis = isOpaque(); 1293: boolean opaqueObj = uri.isOpaque(); 1294: if (opaqueThis && !opaqueObj) 1295: return 1; 1296: if (!opaqueThis && opaqueObj) 1297: return -1; 1298: if (opaqueThis) 1299: { 1300: int ssCompare = 1301: rawSchemeSpecificPart.compareTo(uri.getRawSchemeSpecificPart()); 1302: if (ssCompare == 0) 1303: return compareFragments(uri); 1304: else 1305: return ssCompare; 1306: } 1307: if (rawAuthority == null && uri.getRawAuthority() != null) 1308: return -1; 1309: if (rawAuthority != null) 1310: { 1311: int aCompare = rawAuthority.compareTo(uri.getRawAuthority()); 1312: if (aCompare != 0) 1313: { 1314: if (host == null) 1315: return aCompare; 1316: if (rawUserInfo == null && uri.getRawUserInfo() != null) 1317: return -1; 1318: int uCompare = rawUserInfo.compareTo(uri.getRawUserInfo()); 1319: if (uCompare != 0) 1320: return uCompare; 1321: if (host == null && uri.getHost() != null) 1322: return -1; 1323: int hCompare = host.compareTo(uri.getHost()); 1324: if (hCompare != 0) 1325: return hCompare; 1326: int uriPort = uri.getPort(); 1327: return (uriPort == port) ? 0 : (uriPort > port) ? 
-1 : 1; 1328: } 1329: } 1330: if (rawPath == null && uri.getRawPath() != null) 1331: return -1; 1332: if (rawPath != null) 1333: { 1334: int pCompare = rawPath.compareTo(uri.getRawPath()); 1335: if (pCompare != 0) 1336: return pCompare; 1337: } 1338: if (rawQuery == null && uri.getRawQuery() != null) 1339: return -1; 1340: if (rawQuery != null) 1341: { 1342: int qCompare = rawQuery.compareTo(uri.getRawQuery()); 1343: if (qCompare != 0) 1344: return qCompare; 1345: } 1346: return compareFragments(uri); 1347: } 1348: 1349: /** 1350: * Compares the fragment of this URI with that of the supplied URI. 1351: * 1352: * @param uri the URI to compare with this one. 1353: * @return a negative integer, zero or a positive integer depending 1354: * on whether this uri's fragment is less than, equal to 1355: * or greater than the fragment of the uri supplied, respectively. 1356: */ 1357: private int compareFragments(URI uri) 1358: { 1359: if (rawFragment == null && uri.getRawFragment() != null) 1360: return -1; 1361: else if (rawFragment == null) 1362: return 0; 1363: else 1364: return rawFragment.compareTo(uri.getRawFragment()); 1365: } 1366: 1367: /** 1368: * Returns the URI as a String. If the URI was created using a constructor, 1369: * then this will be the same as the original input string. 1370: * 1371: * @return a string representation of the URI. 1372: */ 1373: public String toString() 1374: { 1375: return (scheme == null ? "" : scheme + ":") 1376: + rawSchemeSpecificPart 1377: + (rawFragment == null ? "" : "#" + rawFragment); 1378: } 1379: 1380: /** 1381: * Returns the URI as US-ASCII string. This is the same as the result 1382: * from <code>toString()</code> for URIs that don't contain any non-US-ASCII 1383: * characters. Otherwise, the non-US-ASCII characters are replaced 1384: * by their percent-encoded representations. 1385: * 1386: * @return a string representation of the URI, containing only US-ASCII 1387: * characters. 
1388: */ 1389: public String toASCIIString() 1390: { 1391: String strRep = toString(); 1392: boolean inNonAsciiBlock = false; 1393: CPStringBuilder buffer = new CPStringBuilder(); 1394: CPStringBuilder encBuffer = null; 1395: for (int i = 0; i < strRep.length(); i++) 1396: { 1397: char c = strRep.charAt(i); 1398: if (c <= 127) 1399: { 1400: if (inNonAsciiBlock) 1401: { 1402: buffer.append(escapeCharacters(encBuffer.toString())); 1403: inNonAsciiBlock = false; 1404: } 1405: buffer.append(c); 1406: } 1407: else 1408: { 1409: if (!inNonAsciiBlock) 1410: { 1411: encBuffer = new CPStringBuilder(); 1412: inNonAsciiBlock = true; 1413: } 1414: encBuffer.append(c); 1415: } 1416: } 1417: return buffer.toString(); 1418: } 1419: 1420: /** 1421: * Converts the non-ASCII characters in the supplied string 1422: * to their equivalent percent-encoded representations. 1423: * That is, they are replaced by "%" followed by their hexadecimal value. 1424: * 1425: * @param str a string including non-ASCII characters. 1426: * @return the string with the non-ASCII characters converted to their 1427: * percent-encoded representations. 1428: */ 1429: private static String escapeCharacters(String str) 1430: { 1431: try 1432: { 1433: CPStringBuilder sb = new CPStringBuilder(); 1434: // this is far from optimal, but it works 1435: byte[] utf8 = str.getBytes("utf-8"); 1436: for (int j = 0; j < utf8.length; j++) 1437: { 1438: sb.append('%'); 1439: sb.append(HEX.charAt((utf8[j] & 0xff) / 16)); 1440: sb.append(HEX.charAt((utf8[j] & 0xff) % 16)); 1441: } 1442: return sb.toString(); 1443: } 1444: catch (java.io.UnsupportedEncodingException x) 1445: { 1446: throw (Error) new InternalError("Escaping error").initCause(x); 1447: } 1448: } 1449: 1450: }