1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/emul/compact/src/main/java/java/net/URI.java Sat Sep 07 13:51:24 2013 +0200
1.3 @@ -0,0 +1,3524 @@
1.4 +/*
1.5 + * Copyright (c) 2000, 2011, Oracle and/or its affiliates. All rights reserved.
1.6 + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
1.7 + *
1.8 + * This code is free software; you can redistribute it and/or modify it
1.9 + * under the terms of the GNU General Public License version 2 only, as
1.10 + * published by the Free Software Foundation. Oracle designates this
1.11 + * particular file as subject to the "Classpath" exception as provided
1.12 + * by Oracle in the LICENSE file that accompanied this code.
1.13 + *
1.14 + * This code is distributed in the hope that it will be useful, but WITHOUT
1.15 + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
1.16 + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
1.17 + * version 2 for more details (a copy is included in the LICENSE file that
1.18 + * accompanied this code).
1.19 + *
1.20 + * You should have received a copy of the GNU General Public License version
1.21 + * 2 along with this work; if not, write to the Free Software Foundation,
1.22 + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
1.23 + *
1.24 + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
1.25 + * or visit www.oracle.com if you need additional information or have any
1.26 + * questions.
1.27 + */
1.28 +
1.29 +package java.net;
1.30 +
1.31 +import java.io.IOException;
1.32 +import java.io.InvalidObjectException;
1.33 +import java.io.ObjectInputStream;
1.34 +import java.io.ObjectOutputStream;
1.35 +import java.io.Serializable;
1.36 +import java.nio.ByteBuffer;
1.37 +import java.nio.CharBuffer;
1.38 +import java.nio.charset.CharsetDecoder;
1.39 +import java.nio.charset.CharsetEncoder;
1.40 +import java.nio.charset.CoderResult;
1.41 +import java.nio.charset.CodingErrorAction;
1.42 +import java.nio.charset.CharacterCodingException;
1.43 +import java.text.Normalizer;
1.44 +import sun.nio.cs.ThreadLocalCoders;
1.45 +
1.46 +import java.lang.Character; // for javadoc
1.47 +import java.lang.NullPointerException; // for javadoc
1.48 +
1.49 +
1.50 +/**
1.51 + * Represents a Uniform Resource Identifier (URI) reference.
1.52 + *
1.53 + * <p> Aside from some minor deviations noted below, an instance of this
1.54 + * class represents a URI reference as defined by
1.55 + * <a href="http://www.ietf.org/rfc/rfc2396.txt"><i>RFC 2396: Uniform
1.56 + * Resource Identifiers (URI): Generic Syntax</i></a>, amended by <a
1.57 + * href="http://www.ietf.org/rfc/rfc2732.txt"><i>RFC 2732: Format for
1.58 + * Literal IPv6 Addresses in URLs</i></a>. The Literal IPv6 address format
1.59 + * also supports scope_ids. The syntax and usage of scope_ids is described
1.60 + * <a href="Inet6Address.html#scoped">here</a>.
1.61 + * This class provides constructors for creating URI instances from
1.62 + * their components or by parsing their string forms, methods for accessing the
1.63 + * various components of an instance, and methods for normalizing, resolving,
1.64 + * and relativizing URI instances. Instances of this class are immutable.
1.65 + *
1.66 + *
1.67 + * <h4> URI syntax and components </h4>
1.68 + *
1.69 + * At the highest level a URI reference (hereinafter simply "URI") in string
1.70 + * form has the syntax
1.71 + *
1.72 + * <blockquote>
1.73 + * [<i>scheme</i><tt><b>:</b></tt><i></i>]<i>scheme-specific-part</i>[<tt><b>#</b></tt><i>fragment</i>]
1.74 + * </blockquote>
1.75 + *
1.76 + * where square brackets [...] delineate optional components and the characters
1.77 + * <tt><b>:</b></tt> and <tt><b>#</b></tt> stand for themselves.
1.78 + *
1.79 + * <p> An <i>absolute</i> URI specifies a scheme; a URI that is not absolute is
1.80 + * said to be <i>relative</i>. URIs are also classified according to whether
1.81 + * they are <i>opaque</i> or <i>hierarchical</i>.
1.82 + *
1.83 + * <p> An <i>opaque</i> URI is an absolute URI whose scheme-specific part does
1.84 + * not begin with a slash character (<tt>'/'</tt>). Opaque URIs are not
1.85 + * subject to further parsing. Some examples of opaque URIs are:
1.86 + *
1.87 + * <blockquote><table cellpadding=0 cellspacing=0 summary="layout">
1.88 + * <tr><td><tt>mailto:java-net@java.sun.com</tt></td></tr>
1.89 + * <tr><td><tt>news:comp.lang.java</tt></td></tr>
1.90 + * <tr><td><tt>urn:isbn:096139210x</tt></td></tr>
1.91 + * </table></blockquote>
1.92 + *
1.93 + * <p> A <i>hierarchical</i> URI is either an absolute URI whose
1.94 + * scheme-specific part begins with a slash character, or a relative URI, that
1.95 + * is, a URI that does not specify a scheme. Some examples of hierarchical
1.96 + * URIs are:
1.97 + *
1.98 + * <blockquote>
1.99 + * <tt>http://java.sun.com/j2se/1.3/</tt><br>
1.100 + * <tt>docs/guide/collections/designfaq.html#28</tt><br>
1.101 + * <tt>../../../demo/jfc/SwingSet2/src/SwingSet2.java</tt><br>
1.102 + * <tt>file:///~/calendar</tt>
1.103 + * </blockquote>
1.104 + *
1.105 + * <p> A hierarchical URI is subject to further parsing according to the syntax
1.106 + *
1.107 + * <blockquote>
1.108 + * [<i>scheme</i><tt><b>:</b></tt>][<tt><b>//</b></tt><i>authority</i>][<i>path</i>][<tt><b>?</b></tt><i>query</i>][<tt><b>#</b></tt><i>fragment</i>]
1.109 + * </blockquote>
1.110 + *
1.111 + * where the characters <tt><b>:</b></tt>, <tt><b>/</b></tt>,
1.112 + * <tt><b>?</b></tt>, and <tt><b>#</b></tt> stand for themselves. The
1.113 + * scheme-specific part of a hierarchical URI consists of the characters
1.114 + * between the scheme and fragment components.
1.115 + *
1.116 + * <p> The authority component of a hierarchical URI is, if specified, either
1.117 + * <i>server-based</i> or <i>registry-based</i>. A server-based authority
1.118 + * parses according to the familiar syntax
1.119 + *
1.120 + * <blockquote>
1.121 + * [<i>user-info</i><tt><b>@</b></tt>]<i>host</i>[<tt><b>:</b></tt><i>port</i>]
1.122 + * </blockquote>
1.123 + *
1.124 + * where the characters <tt><b>@</b></tt> and <tt><b>:</b></tt> stand for
1.125 + * themselves. Nearly all URI schemes currently in use are server-based. An
1.126 + * authority component that does not parse in this way is considered to be
1.127 + * registry-based.
1.128 + *
1.129 + * <p> The path component of a hierarchical URI is itself said to be absolute
1.130 + * if it begins with a slash character (<tt>'/'</tt>); otherwise it is
1.131 + * relative. The path of a hierarchical URI that is either absolute or
1.132 + * specifies an authority is always absolute.
1.133 + *
1.134 + * <p> All told, then, a URI instance has the following nine components:
1.135 + *
1.136 + * <blockquote><table summary="Describes the components of a URI:scheme,scheme-specific-part,authority,user-info,host,port,path,query,fragment">
1.137 + * <tr><th><i>Component</i></th><th><i>Type</i></th></tr>
1.138 + * <tr><td>scheme</td><td><tt>String</tt></td></tr>
1.139 + * <tr><td>scheme-specific-part </td><td><tt>String</tt></td></tr>
1.140 + * <tr><td>authority</td><td><tt>String</tt></td></tr>
1.141 + * <tr><td>user-info</td><td><tt>String</tt></td></tr>
1.142 + * <tr><td>host</td><td><tt>String</tt></td></tr>
1.143 + * <tr><td>port</td><td><tt>int</tt></td></tr>
1.144 + * <tr><td>path</td><td><tt>String</tt></td></tr>
1.145 + * <tr><td>query</td><td><tt>String</tt></td></tr>
1.146 + * <tr><td>fragment</td><td><tt>String</tt></td></tr>
1.147 + * </table></blockquote>
1.148 + *
1.149 + * In a given instance any particular component is either <i>undefined</i> or
1.150 + * <i>defined</i> with a distinct value. Undefined string components are
1.151 + * represented by <tt>null</tt>, while undefined integer components are
1.152 + * represented by <tt>-1</tt>. A string component may be defined to have the
1.153 + * empty string as its value; this is not equivalent to that component being
1.154 + * undefined.
1.155 + *
1.156 + * <p> Whether a particular component is or is not defined in an instance
1.157 + * depends upon the type of the URI being represented. An absolute URI has a
1.158 + * scheme component. An opaque URI has a scheme, a scheme-specific part, and
1.159 + * possibly a fragment, but has no other components. A hierarchical URI always
1.160 + * has a path (though it may be empty) and a scheme-specific-part (which at
1.161 + * least contains the path), and may have any of the other components. If the
1.162 + * authority component is present and is server-based then the host component
1.163 + * will be defined and the user-information and port components may be defined.
1.164 + *
1.165 + *
1.166 + * <h4> Operations on URI instances </h4>
1.167 + *
1.168 + * The key operations supported by this class are those of
1.169 + * <i>normalization</i>, <i>resolution</i>, and <i>relativization</i>.
1.170 + *
1.171 + * <p> <i>Normalization</i> is the process of removing unnecessary <tt>"."</tt>
1.172 + * and <tt>".."</tt> segments from the path component of a hierarchical URI.
1.173 + * Each <tt>"."</tt> segment is simply removed. A <tt>".."</tt> segment is
1.174 + * removed only if it is preceded by a non-<tt>".."</tt> segment.
1.175 + * Normalization has no effect upon opaque URIs.
1.176 + *
1.177 + * <p> <i>Resolution</i> is the process of resolving one URI against another,
1.178 + * <i>base</i> URI. The resulting URI is constructed from components of both
1.179 + * URIs in the manner specified by RFC 2396, taking components from the
1.180 + * base URI for those not specified in the original. For hierarchical URIs,
1.181 + * the path of the original is resolved against the path of the base and then
1.182 + * normalized. The result, for example, of resolving
1.183 + *
1.184 + * <blockquote>
1.185 + * <tt>docs/guide/collections/designfaq.html#28 </tt>(1)
1.186 + * </blockquote>
1.187 + *
1.188 + * against the base URI <tt>http://java.sun.com/j2se/1.3/</tt> is the result
1.189 + * URI
1.190 + *
1.191 + * <blockquote>
1.192 + * <tt>http://java.sun.com/j2se/1.3/docs/guide/collections/designfaq.html#28</tt>
1.193 + * </blockquote>
1.194 + *
1.195 + * Resolving the relative URI
1.196 + *
1.197 + * <blockquote>
1.198 + * <tt>../../../demo/jfc/SwingSet2/src/SwingSet2.java </tt>(2)
1.199 + * </blockquote>
1.200 + *
1.201 + * against this result yields, in turn,
1.202 + *
1.203 + * <blockquote>
1.204 + * <tt>http://java.sun.com/j2se/1.3/demo/jfc/SwingSet2/src/SwingSet2.java</tt>
1.205 + * </blockquote>
1.206 + *
1.207 + * Resolution of both absolute and relative URIs, and of both absolute and
1.208 + * relative paths in the case of hierarchical URIs, is supported. Resolving
1.209 + * the URI <tt>file:///~/calendar</tt> against any other URI simply yields the
1.210 + * original URI, since it is absolute. Resolving the relative URI (2) above
1.211 + * against the relative base URI (1) yields the normalized, but still relative,
1.212 + * URI
1.213 + *
1.214 + * <blockquote>
1.215 + * <tt>demo/jfc/SwingSet2/src/SwingSet2.java</tt>
1.216 + * </blockquote>
1.217 + *
1.218 + * <p> <i>Relativization</i>, finally, is the inverse of resolution: For any
1.219 + * two normalized URIs <i>u</i> and <i>v</i>,
1.220 + *
1.221 + * <blockquote>
1.222 + * <i>u</i><tt>.relativize(</tt><i>u</i><tt>.resolve(</tt><i>v</i><tt>)).equals(</tt><i>v</i><tt>)</tt> and<br>
1.223 + * <i>u</i><tt>.resolve(</tt><i>u</i><tt>.relativize(</tt><i>v</i><tt>)).equals(</tt><i>v</i><tt>)</tt> .<br>
1.224 + * </blockquote>
1.225 + *
1.226 + * This operation is often useful when constructing a document containing URIs
1.227 + * that must be made relative to the base URI of the document wherever
1.228 + * possible. For example, relativizing the URI
1.229 + *
1.230 + * <blockquote>
1.231 + * <tt>http://java.sun.com/j2se/1.3/docs/guide/index.html</tt>
1.232 + * </blockquote>
1.233 + *
1.234 + * against the base URI
1.235 + *
1.236 + * <blockquote>
1.237 + * <tt>http://java.sun.com/j2se/1.3</tt>
1.238 + * </blockquote>
1.239 + *
1.240 + * yields the relative URI <tt>docs/guide/index.html</tt>.
1.241 + *
1.242 + *
1.243 + * <h4> Character categories </h4>
1.244 + *
1.245 + * RFC 2396 specifies precisely which characters are permitted in the
1.246 + * various components of a URI reference. The following categories, most of
1.247 + * which are taken from that specification, are used below to describe these
1.248 + * constraints:
1.249 + *
1.250 + * <blockquote><table cellspacing=2 summary="Describes categories alpha,digit,alphanum,unreserved,punct,reserved,escaped,and other">
1.251 + * <tr><th valign=top><i>alpha</i></th>
1.252 + * <td>The US-ASCII alphabetic characters,
1.253 + * <tt>'A'</tt> through <tt>'Z'</tt>
1.254 + * and <tt>'a'</tt> through <tt>'z'</tt></td></tr>
1.255 + * <tr><th valign=top><i>digit</i></th>
1.256 + * <td>The US-ASCII decimal digit characters,
1.257 + * <tt>'0'</tt> through <tt>'9'</tt></td></tr>
1.258 + * <tr><th valign=top><i>alphanum</i></th>
1.259 + * <td>All <i>alpha</i> and <i>digit</i> characters</td></tr>
1.260 + * <tr><th valign=top><i>unreserved</i> </th>
1.261 + * <td>All <i>alphanum</i> characters together with those in the string
1.262 + * <tt>"_-!.~'()*"</tt></td></tr>
1.263 + * <tr><th valign=top><i>punct</i></th>
1.264 + * <td>The characters in the string <tt>",;:$&amp;+="</tt></td></tr>
1.265 + * <tr><th valign=top><i>reserved</i></th>
1.266 + * <td>All <i>punct</i> characters together with those in the string
1.267 + * <tt>"?/[]@"</tt></td></tr>
1.268 + * <tr><th valign=top><i>escaped</i></th>
1.269 + * <td>Escaped octets, that is, triplets consisting of the percent
1.270 + * character (<tt>'%'</tt>) followed by two hexadecimal digits
1.271 + * (<tt>'0'</tt>-<tt>'9'</tt>, <tt>'A'</tt>-<tt>'F'</tt>, and
1.272 + * <tt>'a'</tt>-<tt>'f'</tt>)</td></tr>
1.273 + * <tr><th valign=top><i>other</i></th>
1.274 + * <td>The Unicode characters that are not in the US-ASCII character set,
1.275 + * are not control characters (according to the {@link
1.276 + * java.lang.Character#isISOControl(char) Character.isISOControl}
1.277 + * method), and are not space characters (according to the {@link
1.278 + * java.lang.Character#isSpaceChar(char) Character.isSpaceChar}
1.279 + * method) <i>(<b>Deviation from RFC 2396</b>, which is
1.280 + * limited to US-ASCII)</i></td></tr>
1.281 + * </table></blockquote>
1.282 + *
1.283 + * <p><a name="legal-chars"></a> The set of all legal URI characters consists of
1.284 + * the <i>unreserved</i>, <i>reserved</i>, <i>escaped</i>, and <i>other</i>
1.285 + * characters.
1.286 + *
1.287 + *
1.288 + * <h4> Escaped octets, quotation, encoding, and decoding </h4>
1.289 + *
1.290 + * RFC 2396 allows escaped octets to appear in the user-info, path, query, and
1.291 + * fragment components. Escaping serves two purposes in URIs:
1.292 + *
1.293 + * <ul>
1.294 + *
1.295 + * <li><p> To <i>encode</i> non-US-ASCII characters when a URI is required to
1.296 + * conform strictly to RFC 2396 by not containing any <i>other</i>
1.297 + * characters. </p></li>
1.298 + *
1.299 + * <li><p> To <i>quote</i> characters that are otherwise illegal in a
1.300 + * component. The user-info, path, query, and fragment components differ
1.301 + * slightly in terms of which characters are considered legal and illegal.
1.302 + * </p></li>
1.303 + *
1.304 + * </ul>
1.305 + *
1.306 + * These purposes are served in this class by three related operations:
1.307 + *
1.308 + * <ul>
1.309 + *
1.310 + * <li><p><a name="encode"></a> A character is <i>encoded</i> by replacing it
1.311 + * with the sequence of escaped octets that represent that character in the
1.312 + * UTF-8 character set. The Euro currency symbol (<tt>'&#92;u20AC'</tt>),
1.313 + * for example, is encoded as <tt>"%E2%82%AC"</tt>. <i>(<b>Deviation from
1.314 + * RFC 2396</b>, which does not specify any particular character
1.315 + * set.)</i> </p></li>
1.316 + *
1.317 + * <li><p><a name="quote"></a> An illegal character is <i>quoted</i> simply by
1.318 + * encoding it. The space character, for example, is quoted by replacing it
1.319 + * with <tt>"%20"</tt>. UTF-8 contains US-ASCII, hence for US-ASCII
1.320 + * characters this transformation has exactly the effect required by
1.321 + * RFC 2396. </p></li>
1.322 + *
1.323 + * <li><p><a name="decode"></a>
1.324 + * A sequence of escaped octets is <i>decoded</i> by
1.325 + * replacing it with the sequence of characters that it represents in the
1.326 + * UTF-8 character set. UTF-8 contains US-ASCII, hence decoding has the
1.327 + * effect of de-quoting any quoted US-ASCII characters as well as that of
1.328 + * decoding any encoded non-US-ASCII characters. If a <a
1.329 + * href="../nio/charset/CharsetDecoder.html#ce">decoding error</a> occurs
1.330 + * when decoding the escaped octets then the erroneous octets are replaced by
1.331 + * <tt>'&#92;uFFFD'</tt>, the Unicode replacement character. </p></li>
1.332 + *
1.333 + * </ul>
1.334 + *
1.335 + * These operations are exposed in the constructors and methods of this class
1.336 + * as follows:
1.337 + *
1.338 + * <ul>
1.339 + *
1.340 + * <li><p> The {@link #URI(java.lang.String) <code>single-argument
1.341 + * constructor</code>} requires any illegal characters in its argument to be
1.342 + * quoted and preserves any escaped octets and <i>other</i> characters that
1.343 + * are present. </p></li>
1.344 + *
1.345 + * <li><p> The {@link
1.346 + * #URI(java.lang.String,java.lang.String,java.lang.String,int,java.lang.String,java.lang.String,java.lang.String)
1.347 + * <code>multi-argument constructors</code>} quote illegal characters as
1.348 + * required by the components in which they appear. The percent character
1.349 + * (<tt>'%'</tt>) is always quoted by these constructors. Any <i>other</i>
1.350 + * characters are preserved. </p></li>
1.351 + *
1.352 + * <li><p> The {@link #getRawUserInfo() getRawUserInfo}, {@link #getRawPath()
1.353 + * getRawPath}, {@link #getRawQuery() getRawQuery}, {@link #getRawFragment()
1.354 + * getRawFragment}, {@link #getRawAuthority() getRawAuthority}, and {@link
1.355 + * #getRawSchemeSpecificPart() getRawSchemeSpecificPart} methods return the
1.356 + * values of their corresponding components in raw form, without interpreting
1.357 + * any escaped octets. The strings returned by these methods may contain
1.358 + * both escaped octets and <i>other</i> characters, and will not contain any
1.359 + * illegal characters. </p></li>
1.360 + *
1.361 + * <li><p> The {@link #getUserInfo() getUserInfo}, {@link #getPath()
1.362 + * getPath}, {@link #getQuery() getQuery}, {@link #getFragment()
1.363 + * getFragment}, {@link #getAuthority() getAuthority}, and {@link
1.364 + * #getSchemeSpecificPart() getSchemeSpecificPart} methods decode any escaped
1.365 + * octets in their corresponding components. The strings returned by these
1.366 + * methods may contain both <i>other</i> characters and illegal characters,
1.367 + * and will not contain any escaped octets. </p></li>
1.368 + *
1.369 + * <li><p> The {@link #toString() toString} method returns a URI string with
1.370 + * all necessary quotation but which may contain <i>other</i> characters.
1.371 + * </p></li>
1.372 + *
1.373 + * <li><p> The {@link #toASCIIString() toASCIIString} method returns a fully
1.374 + * quoted and encoded URI string that does not contain any <i>other</i>
1.375 + * characters. </p></li>
1.376 + *
1.377 + * </ul>
1.378 + *
1.379 + *
1.380 + * <h4> Identities </h4>
1.381 + *
1.382 + * For any URI <i>u</i>, it is always the case that
1.383 + *
1.384 + * <blockquote>
1.385 + * <tt>new URI(</tt><i>u</i><tt>.toString()).equals(</tt><i>u</i><tt>)</tt> .
1.386 + * </blockquote>
1.387 + *
1.388 + * For any URI <i>u</i> that does not contain redundant syntax such as two
1.389 + * slashes before an empty authority (as in <tt>file:///tmp/</tt> ) or a
1.390 + * colon following a host name but no port (as in
1.391 + * <tt>http://java.sun.com:</tt> ), and that does not encode characters
1.392 + * except those that must be quoted, the following identities also hold:
1.393 + *
1.394 + * <blockquote>
1.395 + * <tt>new URI(</tt><i>u</i><tt>.getScheme(),<br>
1.396 + * </tt><i>u</i><tt>.getSchemeSpecificPart(),<br>
1.397 + * </tt><i>u</i><tt>.getFragment())<br>
1.398 + * .equals(</tt><i>u</i><tt>)</tt>
1.399 + * </blockquote>
1.400 + *
1.401 + * in all cases,
1.402 + *
1.403 + * <blockquote>
1.404 + * <tt>new URI(</tt><i>u</i><tt>.getScheme(),<br>
1.405 + * </tt><i>u</i><tt>.getAuthority(),<br>
1.406 + * </tt><i>u</i><tt>.getPath(), </tt><i>u</i><tt>.getQuery(),<br>
1.407 + * </tt><i>u</i><tt>.getFragment())<br>
1.408 + * .equals(</tt><i>u</i><tt>)</tt>
1.409 + * </blockquote>
1.410 + *
1.411 + * if <i>u</i> is hierarchical, and
1.412 + *
1.413 + * <blockquote>
1.414 + * <tt>new URI(</tt><i>u</i><tt>.getScheme(),<br>
1.415 + * </tt><i>u</i><tt>.getUserInfo(), </tt><i>u</i><tt>.getHost(), </tt><i>u</i><tt>.getPort(),<br>
1.416 + * </tt><i>u</i><tt>.getPath(), </tt><i>u</i><tt>.getQuery(),<br>
1.417 + * </tt><i>u</i><tt>.getFragment())<br>
1.418 + * .equals(</tt><i>u</i><tt>)</tt>
1.419 + * </blockquote>
1.420 + *
1.421 + * if <i>u</i> is hierarchical and has either no authority or a server-based
1.422 + * authority.
1.423 + *
1.424 + *
1.425 + * <h4> URIs, URLs, and URNs </h4>
1.426 + *
1.427 + * A URI is a uniform resource <i>identifier</i> while a URL is a uniform
1.428 + * resource <i>locator</i>. Hence every URL is a URI, abstractly speaking, but
1.429 + * not every URI is a URL. This is because there is another subcategory of
1.430 + * URIs, uniform resource <i>names</i> (URNs), which name resources but do not
1.431 + * specify how to locate them. The <tt>mailto</tt>, <tt>news</tt>, and
1.432 + * <tt>isbn</tt> URIs shown above are examples of URNs.
1.433 + *
1.434 + * <p> The conceptual distinction between URIs and URLs is reflected in the
1.435 + * differences between this class and the {@link URL} class.
1.436 + *
1.437 + * <p> An instance of this class represents a URI reference in the syntactic
1.438 + * sense defined by RFC 2396. A URI may be either absolute or relative.
1.439 + * A URI string is parsed according to the generic syntax without regard to the
1.440 + * scheme, if any, that it specifies. No lookup of the host, if any, is
1.441 + * performed, and no scheme-dependent stream handler is constructed. Equality,
1.442 + * hashing, and comparison are defined strictly in terms of the character
1.443 + * content of the instance. In other words, a URI instance is little more than
1.444 + * a structured string that supports the syntactic, scheme-independent
1.445 + * operations of comparison, normalization, resolution, and relativization.
1.446 + *
1.447 + * <p> An instance of the {@link URL} class, by contrast, represents the
1.448 + * syntactic components of a URL together with some of the information required
1.449 + * to access the resource that it describes. A URL must be absolute, that is,
1.450 + * it must always specify a scheme. A URL string is parsed according to its
1.451 + * scheme. A stream handler is always established for a URL, and in fact it is
1.452 + * impossible to create a URL instance for a scheme for which no handler is
1.453 + * available. Equality and hashing depend upon both the scheme and the
1.454 + * Internet address of the host, if any; comparison is not defined. In other
1.455 + * words, a URL is a structured string that supports the syntactic operation of
1.456 + * resolution as well as the network I/O operations of looking up the host and
1.457 + * opening a connection to the specified resource.
1.458 + *
1.459 + *
1.460 + * @author Mark Reinhold
1.461 + * @since 1.4
1.462 + *
1.463 + * @see <a href="http://www.ietf.org/rfc/rfc2279.txt"><i>RFC 2279: UTF-8, a
1.464 + * transformation format of ISO 10646</i></a>, <br><a
1.465 + * href="http://www.ietf.org/rfc/rfc2373.txt"><i>RFC 2373: IPv6 Addressing
1.466 + * Architecture</i></a>, <br><a
1.467 + * href="http://www.ietf.org/rfc/rfc2396.txt"><i>RFC 2396: Uniform
1.468 + * Resource Identifiers (URI): Generic Syntax</i></a>, <br><a
1.469 + * href="http://www.ietf.org/rfc/rfc2732.txt"><i>RFC 2732: Format for
1.470 + * Literal IPv6 Addresses in URLs</i></a>, <br><a
1.471 + * href="URISyntaxException.html">URISyntaxException</a>
1.472 + */
1.473 +
1.474 +public final class URI
1.475 + implements Comparable<URI>, Serializable
1.476 +{
1.477 +
1.478 + // Note: Comments containing the word "ASSERT" indicate places where a
1.479 + // throw of an InternalError should be replaced by an appropriate assertion
1.480 + // statement once asserts are enabled in the build.
1.481 +
1.482 + static final long serialVersionUID = -6052424284110960213L; // explicit serialization version id
1.483 +
1.484 +
1.485 + // -- Properties and components of this instance --
1.486 +
1.487 + // Components of all URIs: [<scheme>:]<scheme-specific-part>[#<fragment>]
1.488 + private transient String scheme; // null ==> relative URI
1.489 + private transient String fragment; // presumably null ==> undefined, as the class Javadoc states for string components
1.490 +
1.491 + // Hierarchical URI components: [//<authority>]<path>[?<query>]
1.492 + private transient String authority; // Registry or server
1.493 +
1.494 + // Server-based authority: [<userInfo>@]<host>[:<port>]
1.495 + private transient String userInfo; // presumably null ==> undefined, as the class Javadoc states for string components
1.496 + private transient String host; // null ==> registry-based
1.497 + private transient int port = -1; // -1 ==> undefined
1.498 +
1.499 + // Remaining components of hierarchical URIs
1.500 + private transient String path; // null ==> opaque
1.501 + private transient String query; // presumably null ==> undefined, as the class Javadoc states for string components
1.502 +
1.503 + // The remaining fields may be computed on demand
1.504 +
1.505 + private volatile transient String schemeSpecificPart;
1.506 + private volatile transient int hash; // Zero ==> undefined
1.507 +
1.508 + private volatile transient String decodedUserInfo = null; // decoded* fields start as null; presumably filled on first use by the decoding getters (confirm)
1.509 + private volatile transient String decodedAuthority = null;
1.510 + private volatile transient String decodedPath = null;
1.511 + private volatile transient String decodedQuery = null;
1.512 + private volatile transient String decodedFragment = null;
1.513 + private volatile transient String decodedSchemeSpecificPart = null;
1.514 +
1.515 + /**
1.516 + * The string form of this URI.
1.517 + *
1.518 + * @serial
1.519 + */
1.520 + private volatile String string; // The only serializable field (every other instance field is transient)
1.521 +
1.522 +
1.523 +
1.524 + // -- Constructors and factories --
1.525 +
1.526 + private URI() { } // Used internally; empty body, so every field keeps its declared initial value (e.g. port = -1)
1.527 +
1.528 + /**
1.529 + * Constructs a URI by parsing the given string.
1.530 + *
1.531 + * <p> This constructor parses the given string exactly as specified by the
1.532 + * grammar in <a
1.533 + * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>,
1.534 + * Appendix A, <b><i>except for the following deviations:</i></b> </p>
1.535 + *
1.536 + * <ul type=disc>
1.537 + *
1.538 + * <li><p> An empty authority component is permitted as long as it is
1.539 + * followed by a non-empty path, a query component, or a fragment
1.540 + * component. This allows the parsing of URIs such as
1.541 + * <tt>"file:///foo/bar"</tt>, which seems to be the intent of
1.542 + * RFC 2396 although the grammar does not permit it. If the
1.543 + * authority component is empty then the user-information, host, and port
1.544 + * components are undefined. </p></li>
1.545 + *
1.546 + * <li><p> Empty relative paths are permitted; this seems to be the
1.547 + * intent of RFC 2396 although the grammar does not permit it. The
1.548 + * primary consequence of this deviation is that a standalone fragment
1.549 + * such as <tt>"#foo"</tt> parses as a relative URI with an empty path
1.550 + * and the given fragment, and can be usefully <a
1.551 + * href="#resolve-frag">resolved</a> against a base URI. </p></li>
1.552 + *
1.553 + * <li><p> IPv4 addresses in host components are parsed rigorously, as
1.554 + * specified by <a
1.555 + * href="http://www.ietf.org/rfc/rfc2732.txt">RFC 2732</a>: Each
1.556 + * element of a dotted-quad address must contain no more than three
1.557 + * decimal digits. Each element is further constrained to have a value
1.558 + * no greater than 255. </p></li>
1.559 + *
1.560 + * <li> <p> Hostnames in host components that comprise only a single
1.561 + * domain label are permitted to start with an <i>alphanum</i>
1.562 + * character. This seems to be the intent of <a
1.563 + * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>
1.564 + * section 3.2.2 although the grammar does not permit it. The
1.565 + * consequence of this deviation is that the authority component of a
1.566 + * hierarchical URI such as <tt>s://123</tt>, will parse as a server-based
1.567 + * authority. </p></li>
1.568 + *
1.569 + * <li><p> IPv6 addresses are permitted for the host component. An IPv6
1.570 + * address must be enclosed in square brackets (<tt>'['</tt> and
1.571 + * <tt>']'</tt>) as specified by <a
1.572 + * href="http://www.ietf.org/rfc/rfc2732.txt">RFC 2732</a>. The
1.573 + * IPv6 address itself must parse according to <a
1.574 + * href="http://www.ietf.org/rfc/rfc2373.txt">RFC 2373</a>. IPv6
1.575 + * addresses are further constrained to describe no more than sixteen
1.576 + * bytes of address information, a constraint implicit in RFC 2373
1.577 + * but not expressible in the grammar. </p></li>
1.578 + *
1.579 + * <li><p> Characters in the <i>other</i> category are permitted wherever
1.580 + * RFC 2396 permits <i>escaped</i> octets, that is, in the
1.581 + * user-information, path, query, and fragment components, as well as in
1.582 + * the authority component if the authority is registry-based. This
1.583 + * allows URIs to contain Unicode characters beyond those in the US-ASCII
1.584 + * character set. </p></li>
1.585 + *
1.586 + * </ul>
1.587 + *
1.588 + * @param str The string to be parsed into a URI
1.589 + *
1.590 + * @throws NullPointerException
1.591 + * If <tt>str</tt> is <tt>null</tt>
1.592 + *
1.593 + * @throws URISyntaxException
1.594 + * If the given string violates RFC 2396, as augmented
1.595 + * by the above deviations
1.596 + */
1.597 + public URI(String str) throws URISyntaxException {
1.598 + new Parser(str).parse(false); // Parser is declared outside this excerpt; presumably false means the authority need not be server-based (the component constructor passes true) - confirm
1.599 + }
1.600 +
1.601 + /**
1.602 + * Constructs a hierarchical URI from the given components.
1.603 + *
1.604 + * <p> If a scheme is given then the path, if also given, must either be
1.605 + * empty or begin with a slash character (<tt>'/'</tt>). Otherwise a
1.606 + * component of the new URI may be left undefined by passing <tt>null</tt>
1.607 + * for the corresponding parameter or, in the case of the <tt>port</tt>
1.608 + * parameter, by passing <tt>-1</tt>.
1.609 + *
1.610 + * <p> This constructor first builds a URI string from the given components
1.611 + * according to the rules specified in <a
1.612 + * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>,
1.613 + * section 5.2, step 7: </p>
1.614 + *
1.615 + * <ol>
1.616 + *
1.617 + * <li><p> Initially, the result string is empty. </p></li>
1.618 + *
1.619 + * <li><p> If a scheme is given then it is appended to the result,
1.620 + * followed by a colon character (<tt>':'</tt>). </p></li>
1.621 + *
1.622 + * <li><p> If user information, a host, or a port are given then the
1.623 + * string <tt>"//"</tt> is appended. </p></li>
1.624 + *
1.625 + * <li><p> If user information is given then it is appended, followed by
1.626 + * a commercial-at character (<tt>'@'</tt>). Any character not in the
1.627 + * <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i>
1.628 + * categories is <a href="#quote">quoted</a>. </p></li>
1.629 + *
1.630 + * <li><p> If a host is given then it is appended. If the host is a
1.631 + * literal IPv6 address but is not enclosed in square brackets
1.632 + * (<tt>'['</tt> and <tt>']'</tt>) then the square brackets are added.
1.633 + * </p></li>
1.634 + *
1.635 + * <li><p> If a port number is given then a colon character
1.636 + * (<tt>':'</tt>) is appended, followed by the port number in decimal.
1.637 + * </p></li>
1.638 + *
1.639 + * <li><p> If a path is given then it is appended. Any character not in
1.640 + * the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i>
1.641 + * categories, and not equal to the slash character (<tt>'/'</tt>) or the
1.642 + * commercial-at character (<tt>'@'</tt>), is quoted. </p></li>
1.643 + *
1.644 + * <li><p> If a query is given then a question-mark character
1.645 + * (<tt>'?'</tt>) is appended, followed by the query. Any character that
1.646 + * is not a <a href="#legal-chars">legal URI character</a> is quoted.
1.647 + * </p></li>
1.648 + *
1.649 + * <li><p> Finally, if a fragment is given then a hash character
1.650 + * (<tt>'#'</tt>) is appended, followed by the fragment. Any character
1.651 + * that is not a legal URI character is quoted. </p></li>
1.652 + *
1.653 + * </ol>
1.654 + *
1.655 + * <p> The resulting URI string is then parsed as if by invoking the {@link
1.656 + * #URI(String)} constructor and then invoking the {@link
1.657 + * #parseServerAuthority()} method upon the result; this may cause a {@link
1.658 + * URISyntaxException} to be thrown. </p>
1.659 + *
1.660 + * @param scheme Scheme name
1.661 + * @param userInfo User name and authorization information
1.662 + * @param host Host name
1.663 + * @param port Port number
1.664 + * @param path Path
1.665 + * @param query Query
1.666 + * @param fragment Fragment
1.667 + *
1.668 + * @throws URISyntaxException
1.669 + * If both a scheme and a path are given but the path is relative,
1.670 + * if the URI string constructed from the given components violates
1.671 + * RFC 2396, or if the authority component of the string is
1.672 + * present but cannot be parsed as a server-based authority
1.673 + */
1.674 + public URI(String scheme,
1.675 + String userInfo, String host, int port,
1.676 + String path, String query, String fragment)
1.677 + throws URISyntaxException
1.678 + {
1.679 + String s = toString(scheme, null,
1.680 + null, userInfo, host, port,
1.681 + path, query, fragment);
1.682 + checkPath(s, scheme, path);
1.683 + new Parser(s).parse(true);
1.684 + }
1.685 +
1.686 + /**
1.687 + * Constructs a hierarchical URI from the given components.
1.688 + *
1.689 + * <p> If a scheme is given then the path, if also given, must either be
1.690 + * empty or begin with a slash character (<tt>'/'</tt>). Otherwise a
1.691 + * component of the new URI may be left undefined by passing <tt>null</tt>
1.692 + * for the corresponding parameter.
1.693 + *
1.694 + * <p> This constructor first builds a URI string from the given components
1.695 + * according to the rules specified in <a
1.696 + * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>,
1.697 + * section 5.2, step 7: </p>
1.698 + *
1.699 + * <ol>
1.700 + *
1.701 + * <li><p> Initially, the result string is empty. </p></li>
1.702 + *
1.703 + * <li><p> If a scheme is given then it is appended to the result,
1.704 + * followed by a colon character (<tt>':'</tt>). </p></li>
1.705 + *
1.706 + * <li><p> If an authority is given then the string <tt>"//"</tt> is
1.707 + * appended, followed by the authority. If the authority contains a
1.708 + * literal IPv6 address then the address must be enclosed in square
1.709 + * brackets (<tt>'['</tt> and <tt>']'</tt>). Any character not in the
1.710 + * <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i>
1.711 + * categories, and not equal to the commercial-at character
1.712 + * (<tt>'@'</tt>), is <a href="#quote">quoted</a>. </p></li>
1.713 + *
1.714 + * <li><p> If a path is given then it is appended. Any character not in
1.715 + * the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i>
1.716 + * categories, and not equal to the slash character (<tt>'/'</tt>) or the
1.717 + * commercial-at character (<tt>'@'</tt>), is quoted. </p></li>
1.718 + *
1.719 + * <li><p> If a query is given then a question-mark character
1.720 + * (<tt>'?'</tt>) is appended, followed by the query. Any character that
1.721 + * is not a <a href="#legal-chars">legal URI character</a> is quoted.
1.722 + * </p></li>
1.723 + *
1.724 + * <li><p> Finally, if a fragment is given then a hash character
1.725 + * (<tt>'#'</tt>) is appended, followed by the fragment. Any character
1.726 + * that is not a legal URI character is quoted. </p></li>
1.727 + *
1.728 + * </ol>
1.729 + *
1.730 + * <p> The resulting URI string is then parsed as if by invoking the {@link
1.731 + * #URI(String)} constructor and then invoking the {@link
1.732 + * #parseServerAuthority()} method upon the result; this may cause a {@link
1.733 + * URISyntaxException} to be thrown. </p>
1.734 + *
1.735 + * @param scheme Scheme name
1.736 + * @param authority Authority
1.737 + * @param path Path
1.738 + * @param query Query
1.739 + * @param fragment Fragment
1.740 + *
1.741 + * @throws URISyntaxException
1.742 + * If both a scheme and a path are given but the path is relative,
1.743 + * if the URI string constructed from the given components violates
1.744 + * RFC 2396, or if the authority component of the string is
1.745 + * present but cannot be parsed as a server-based authority
1.746 + */
1.747 + public URI(String scheme,
1.748 + String authority,
1.749 + String path, String query, String fragment)
1.750 + throws URISyntaxException
1.751 + {
1.752 + String s = toString(scheme, null,
1.753 + authority, null, null, -1,
1.754 + path, query, fragment);
1.755 + checkPath(s, scheme, path);
1.756 + new Parser(s).parse(false);
1.757 + }
1.758 +
1.759 + /**
1.760 + * Constructs a hierarchical URI from the given components.
1.761 + *
1.762 + * <p> A component may be left undefined by passing <tt>null</tt>.
1.763 + *
1.764 + * <p> This convenience constructor works as if by invoking the
1.765 + * seven-argument constructor as follows:
1.766 + *
1.767 + * <blockquote><tt>
1.768 + * new {@link #URI(String, String, String, int, String, String, String)
1.769 + * URI}(scheme, null, host, -1, path, null, fragment);
1.770 + * </tt></blockquote>
1.771 + *
1.772 + * @param scheme Scheme name
1.773 + * @param host Host name
1.774 + * @param path Path
1.775 + * @param fragment Fragment
1.776 + *
1.777 + * @throws URISyntaxException
1.778 + * If the URI string constructed from the given components
1.779 + * violates RFC 2396
1.780 + */
1.781 + public URI(String scheme, String host, String path, String fragment)
1.782 + throws URISyntaxException
1.783 + {
1.784 + this(scheme, null, host, -1, path, null, fragment); // userInfo = null, port = -1, query = null
1.785 + }
1.786 +
1.787 + /**
1.788 + * Constructs a URI from the given components.
1.789 + *
1.790 + * <p> A component may be left undefined by passing <tt>null</tt>.
1.791 + *
1.792 + * <p> This constructor first builds a URI in string form using the given
1.793 + * components as follows: </p>
1.794 + *
1.795 + * <ol>
1.796 + *
1.797 + * <li><p> Initially, the result string is empty. </p></li>
1.798 + *
1.799 + * <li><p> If a scheme is given then it is appended to the result,
1.800 + * followed by a colon character (<tt>':'</tt>). </p></li>
1.801 + *
1.802 + * <li><p> If a scheme-specific part is given then it is appended. Any
1.803 + * character that is not a <a href="#legal-chars">legal URI character</a>
1.804 + * is <a href="#quote">quoted</a>. </p></li>
1.805 + *
1.806 + * <li><p> Finally, if a fragment is given then a hash character
1.807 + * (<tt>'#'</tt>) is appended to the string, followed by the fragment.
1.808 + * Any character that is not a legal URI character is quoted. </p></li>
1.809 + *
1.810 + * </ol>
1.811 + *
1.812 + * <p> The resulting URI string is then parsed in order to create the new
1.813 + * URI instance as if by invoking the {@link #URI(String)} constructor;
1.814 + * this may cause a {@link URISyntaxException} to be thrown. </p>
1.815 + *
1.816 + * @param scheme Scheme name
1.817 + * @param ssp Scheme-specific part
1.818 + * @param fragment Fragment
1.819 + *
1.820 + * @throws URISyntaxException
1.821 + * If the URI string constructed from the given components
1.822 + * violates RFC 2396
1.823 + */
1.824 + public URI(String scheme, String ssp, String fragment)
1.825 + throws URISyntaxException
1.826 + {
1.827 + new Parser(toString(scheme, ssp,
1.828 + null, null, null, -1,
1.829 + null, null, fragment))
1.830 + .parse(false);
1.831 + }
1.832 +
1.833 + /**
1.834 + * Creates a URI by parsing the given string.
1.835 + *
1.836 + * <p> This convenience factory method works as if by invoking the {@link
1.837 + * #URI(String)} constructor; any {@link URISyntaxException} thrown by the
1.838 + * constructor is caught and wrapped in a new {@link
1.839 + * IllegalArgumentException} object, which is then thrown.
1.840 + *
1.841 + * <p> This method is provided for use in situations where it is known that
1.842 + * the given string is a legal URI, for example for URI constants declared
1.843 + * within a program, and so it would be considered a programming error
1.844 + * for the string not to parse as such. The constructors, which throw
1.845 + * {@link URISyntaxException} directly, should be used in situations where a
1.846 + * URI is being constructed from user input or from some other source that
1.847 + * may be prone to errors. </p>
1.848 + *
1.849 + * @param str The string to be parsed into a URI
1.850 + * @return The new URI
1.851 + *
1.852 + * @throws NullPointerException
1.853 + * If <tt>str</tt> is <tt>null</tt>
1.854 + *
1.855 + * @throws IllegalArgumentException
1.856 + * If the given string violates RFC 2396
1.857 + */
1.858 + public static URI create(String str) { // unchecked-exception variant of new URI(str)
1.859 + try {
1.860 + return new URI(str);
1.861 + } catch (URISyntaxException syntaxError) { // rethrown unchecked, original kept as the cause
1.862 + throw new IllegalArgumentException(syntaxError.getMessage(), syntaxError);
1.863 + }
1.864 + }
1.865 +
1.866 +
1.867 + // -- Operations --
1.868 +
1.869 + /**
1.870 + * Attempts to parse this URI's authority component, if defined, into
1.871 + * user-information, host, and port components.
1.872 + *
1.873 + * <p> If this URI's authority component has already been recognized as
1.874 + * being server-based then it will already have been parsed into
1.875 + * user-information, host, and port components. In this case, or if this
1.876 + * URI has no authority component, this method simply returns this URI.
1.877 + *
1.878 + * <p> Otherwise this method attempts once more to parse the authority
1.879 + * component into user-information, host, and port components, and throws
1.880 + * an exception describing why the authority component could not be parsed
1.881 + * in that way.
1.882 + *
1.883 + * <p> This method is provided because the generic URI syntax specified in
1.884 + * <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>
1.885 + * cannot always distinguish a malformed server-based authority from a
1.886 + * legitimate registry-based authority. It must therefore treat some
1.887 + * instances of the former as instances of the latter. The authority
1.888 + * component in the URI string <tt>"//foo:bar"</tt>, for example, is not a
1.889 + * legal server-based authority but it is legal as a registry-based
1.890 + * authority.
1.891 + *
1.892 + * <p> In many common situations, for example when working with URIs that are
1.893 + * known to be either URNs or URLs, the hierarchical URIs being used will
1.894 + * always be server-based. They therefore must either be parsed as such or
1.895 + * treated as an error. In these cases a statement such as
1.896 + *
1.897 + * <blockquote>
1.898 + * <tt>URI </tt><i>u</i><tt> = new URI(str).parseServerAuthority();</tt>
1.899 + * </blockquote>
1.900 + *
1.901 + * <p> can be used to ensure that <i>u</i> always refers to a URI that, if
1.902 + * it has an authority component, has a server-based authority with proper
1.903 + * user-information, host, and port components. Invoking this method also
1.904 + * ensures that if the authority could not be parsed in that way then an
1.905 + * appropriate diagnostic message can be issued based upon the exception
1.906 + * that is thrown. </p>
1.907 + *
1.908 + * @return A URI whose authority field has been parsed
1.909 + * as a server-based authority
1.910 + *
1.911 + * @throws URISyntaxException
1.912 + * If the authority component of this URI is defined
1.913 + * but cannot be parsed as a server-based authority
1.914 + * according to RFC 2396
1.915 + */
1.916 + public URI parseServerAuthority()
1.917 + throws URISyntaxException
1.918 + {
1.919 + // The error from the original parse is not cached (that would need more
1.920 + // fields or a more obscure representation), so simply parse once more.
1.921 + final boolean needsReparse = (host == null) && (authority != null);
1.922 + if (needsReparse) {
1.923 + defineString();
1.924 + new Parser(string).parse(true);
1.925 + }
1.926 + return this;
1.927 + }
1.928 +
1.929 + /**
1.930 + * Normalizes this URI's path.
1.931 + *
1.932 + * <p> If this URI is opaque, or if its path is already in normal form,
1.933 + * then this URI is returned. Otherwise a new URI is constructed that is
1.934 + * identical to this URI except that its path is computed by normalizing
1.935 + * this URI's path in a manner consistent with <a
1.936 + * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>,
1.937 + * section 5.2, step 6, sub-steps c through f; that is:
1.938 + * </p>
1.939 + *
1.940 + * <ol>
1.941 + *
1.942 + * <li><p> All <tt>"."</tt> segments are removed. </p></li>
1.943 + *
1.944 + * <li><p> If a <tt>".."</tt> segment is preceded by a non-<tt>".."</tt>
1.945 + * segment then both of these segments are removed. This step is
1.946 + * repeated until it is no longer applicable. </p></li>
1.947 + *
1.948 + * <li><p> If the path is relative, and if its first segment contains a
1.949 + * colon character (<tt>':'</tt>), then a <tt>"."</tt> segment is
1.950 + * prepended. This prevents a relative URI with a path such as
1.951 + * <tt>"a:b/c/d"</tt> from later being re-parsed as an opaque URI with a
1.952 + * scheme of <tt>"a"</tt> and a scheme-specific part of <tt>"b/c/d"</tt>.
1.953 + * <b><i>(Deviation from RFC 2396)</i></b> </p></li>
1.954 + *
1.955 + * </ol>
1.956 + *
1.957 + * <p> A normalized path will begin with one or more <tt>".."</tt> segments
1.958 + * if there were insufficient non-<tt>".."</tt> segments preceding them to
1.959 + * allow their removal. A normalized path will begin with a <tt>"."</tt>
1.960 + * segment if one was inserted by step 3 above. Otherwise, a normalized
1.961 + * path will not contain any <tt>"."</tt> or <tt>".."</tt> segments. </p>
1.962 + *
1.963 + * @return A URI equivalent to this URI,
1.964 + * but whose path is in normal form
1.965 + */
1.966 + public URI normalize() {
1.967 + return normalize(this); // hands this URI to the one-argument normalize(URI) overload
1.968 + }
1.969 +
1.970 + /**
1.971 + * Resolves the given URI against this URI.
1.972 + *
1.973 + * <p> If the given URI is already absolute, or if this URI is opaque, then
1.974 + * the given URI is returned.
1.975 + *
1.976 + * <p><a name="resolve-frag"></a> If the given URI's fragment component is
1.977 + * defined, its path component is empty, and its scheme, authority, and
1.978 + * query components are undefined, then a URI with the given fragment but
1.979 + * with all other components equal to those of this URI is returned. This
1.980 + * allows a URI representing a standalone fragment reference, such as
1.981 + * <tt>"#foo"</tt>, to be usefully resolved against a base URI.
1.982 + *
1.983 + * <p> Otherwise this method constructs a new hierarchical URI in a manner
1.984 + * consistent with <a
1.985 + * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>,
1.986 + * section 5.2; that is: </p>
1.987 + *
1.988 + * <ol>
1.989 + *
1.990 + * <li><p> A new URI is constructed with this URI's scheme and the given
1.991 + * URI's query and fragment components. </p></li>
1.992 + *
1.993 + * <li><p> If the given URI has an authority component then the new URI's
1.994 + * authority and path are taken from the given URI. </p></li>
1.995 + *
1.996 + * <li><p> Otherwise the new URI's authority component is copied from
1.997 + * this URI, and its path is computed as follows: </p>
1.998 + *
1.999 + * <ol type=a>
1.1000 + *
1.1001 + * <li><p> If the given URI's path is absolute then the new URI's path
1.1002 + * is taken from the given URI. </p></li>
1.1003 + *
1.1004 + * <li><p> Otherwise the given URI's path is relative, and so the new
1.1005 + * URI's path is computed by resolving the path of the given URI
1.1006 + * against the path of this URI. This is done by concatenating all but
1.1007 + * the last segment of this URI's path, if any, with the given URI's
1.1008 + * path and then normalizing the result as if by invoking the {@link
1.1009 + * #normalize() normalize} method. </p></li>
1.1010 + *
1.1011 + * </ol></li>
1.1012 + *
1.1013 + * </ol>
1.1014 + *
1.1015 + * <p> The result of this method is absolute if, and only if, either this
1.1016 + * URI is absolute or the given URI is absolute. </p>
1.1017 + *
1.1018 + * @param uri The URI to be resolved against this URI
1.1019 + * @return The resulting URI
1.1020 + *
1.1021 + * @throws NullPointerException
1.1022 + * If <tt>uri</tt> is <tt>null</tt>
1.1023 + */
1.1024 + public URI resolve(URI uri) {
1.1025 + return resolve(this, uri); // two-argument overload; this URI goes first, the given URI second
1.1026 + }
1.1027 +
1.1028 + /**
1.1029 + * Constructs a new URI by parsing the given string and then resolving it
1.1030 + * against this URI.
1.1031 + *
1.1032 + * <p> This convenience method works as if invoking it were equivalent to
1.1033 + * evaluating the expression <tt>{@link #resolve(java.net.URI)
1.1034 + * resolve}(URI.{@link #create(String) create}(str))</tt>. </p>
1.1035 + *
1.1036 + * @param str The string to be parsed into a URI
1.1037 + * @return The resulting URI
1.1038 + *
1.1039 + * @throws NullPointerException
1.1040 + * If <tt>str</tt> is <tt>null</tt>
1.1041 + *
1.1042 + * @throws IllegalArgumentException
1.1043 + * If the given string violates RFC 2396
1.1044 + */
1.1045 + public URI resolve(String str) {
1.1046 + return resolve(URI.create(str)); // create() rethrows a URISyntaxException as IllegalArgumentException
1.1047 + }
1.1048 +
1.1049 + /**
1.1050 + * Relativizes the given URI against this URI.
1.1051 + *
1.1052 + * <p> The relativization of the given URI against this URI is computed as
1.1053 + * follows: </p>
1.1054 + *
1.1055 + * <ol>
1.1056 + *
1.1057 + * <li><p> If either this URI or the given URI are opaque, or if the
1.1058 + * scheme and authority components of the two URIs are not identical, or
1.1059 + * if the path of this URI is not a prefix of the path of the given URI,
1.1060 + * then the given URI is returned. </p></li>
1.1061 + *
1.1062 + * <li><p> Otherwise a new relative hierarchical URI is constructed with
1.1063 + * query and fragment components taken from the given URI and with a path
1.1064 + * component computed by removing this URI's path from the beginning of
1.1065 + * the given URI's path. </p></li>
1.1066 + *
1.1067 + * </ol>
1.1068 + *
1.1069 + * @param uri The URI to be relativized against this URI
1.1070 + * @return The resulting URI
1.1071 + *
1.1072 + * @throws NullPointerException
1.1073 + * If <tt>uri</tt> is <tt>null</tt>
1.1074 + */
1.1075 + public URI relativize(URI uri) {
1.1076 + return relativize(this, uri); // two-argument overload; this URI goes first, the given URI second
1.1077 + }
1.1078 +
1.1079 + /**
1.1080 + * Constructs a URL from this URI.
1.1081 + *
1.1082 + * <p> This convenience method works as if invoking it were equivalent to
1.1083 + * evaluating the expression <tt>new URL(this.toString())</tt> after
1.1084 + * first checking that this URI is absolute. </p>
1.1085 + *
1.1086 + * @return A URL constructed from this URI
1.1087 + *
1.1088 + * @throws IllegalArgumentException
1.1089 + * If this URI is not absolute
1.1090 + *
1.1091 + * @throws MalformedURLException
1.1092 + * If a protocol handler for the URL could not be found,
1.1093 + * or if some other error occurred while constructing the URL
1.1094 + */
1.1095 + public URL toURL()
1.1096 + throws MalformedURLException {
1.1097 + if (isAbsolute()) // only a URI that has a scheme is handed to the URL constructor
1.1098 + return new URL(toString());
1.1099 + throw new IllegalArgumentException("URI is not absolute");
1.1100 + }
1.1101 +
1.1102 + // -- Component access methods --
1.1103 +
1.1104 + /**
1.1105 + * Returns the scheme component of this URI.
1.1106 + *
1.1107 + * <p> The scheme component of a URI, if defined, only contains characters
1.1108 + * in the <i>alphanum</i> category and in the string <tt>"-.+"</tt>. A
1.1109 + * scheme always starts with an <i>alpha</i> character. <p>
1.1110 + *
1.1111 + * The scheme component of a URI cannot contain escaped octets, hence this
1.1112 + * method does not perform any decoding.
1.1113 + *
1.1114 + * @return The scheme component of this URI,
1.1115 + * or <tt>null</tt> if the scheme is undefined
1.1116 + */
1.1117 + public String getScheme() {
1.1118 + return this.scheme; // returned as stored, without any decoding
1.1119 + }
1.1120 +
1.1121 + /**
1.1122 + * Tells whether or not this URI is absolute.
1.1123 + *
1.1124 + * <p> A URI is absolute if, and only if, it has a scheme component. </p>
1.1125 + *
1.1126 + * @return <tt>true</tt> if, and only if, this URI is absolute
1.1127 + */
1.1128 + public boolean isAbsolute() {
1.1129 + return !(this.scheme == null); // absolute means a scheme is present
1.1130 + }
1.1131 +
1.1132 + /**
1.1133 + * Tells whether or not this URI is opaque.
1.1134 + *
1.1135 + * <p> A URI is opaque if, and only if, it is absolute and its
1.1136 + * scheme-specific part does not begin with a slash character ('/').
1.1137 + * An opaque URI has a scheme, a scheme-specific part, and possibly
1.1138 + * a fragment; all other components are undefined. </p>
1.1139 + *
1.1140 + * @return <tt>true</tt> if, and only if, this URI is opaque
1.1141 + */
1.1142 + public boolean isOpaque() {
1.1143 + return path == null;
1.1144 + }
1.1145 +
1.1146 + /**
1.1147 + * Returns the raw scheme-specific part of this URI. The scheme-specific
1.1148 + * part is never undefined, though it may be empty.
1.1149 + *
1.1150 + * <p> The scheme-specific part of a URI only contains legal URI
1.1151 + * characters. </p>
1.1152 + *
1.1153 + * @return The raw scheme-specific part of this URI
1.1154 + * (never <tt>null</tt>)
1.1155 + */
1.1156 + public String getRawSchemeSpecificPart() {
1.1157 + defineSchemeSpecificPart(); // runs before the field is read; presumably fills it in on demand -- confirm
1.1158 + return this.schemeSpecificPart;
1.1159 + }
1.1160 +
1.1161 + /**
1.1162 + * Returns the decoded scheme-specific part of this URI.
1.1163 + *
1.1164 + * <p> The string returned by this method is equal to that returned by the
1.1165 + * {@link #getRawSchemeSpecificPart() getRawSchemeSpecificPart} method
1.1166 + * except that all sequences of escaped octets are <a
1.1167 + * href="#decode">decoded</a>. </p>
1.1168 + *
1.1169 + * @return The decoded scheme-specific part of this URI
1.1170 + * (never <tt>null</tt>)
1.1171 + */
1.1172 + public String getSchemeSpecificPart() {
1.1173 + if (decodedSchemeSpecificPart == null)
1.1174 + decodedSchemeSpecificPart = decode(getRawSchemeSpecificPart());
1.1175 + return decodedSchemeSpecificPart;
1.1176 + }
1.1177 +
1.1178 + /**
1.1179 + * Returns the raw authority component of this URI.
1.1180 + *
1.1181 + * <p> The authority component of a URI, if defined, only contains the
1.1182 + * commercial-at character (<tt>'@'</tt>) and characters in the
1.1183 + * <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, and <i>other</i>
1.1184 + * categories. If the authority is server-based then it is further
1.1185 + * constrained to have valid user-information, host, and port
1.1186 + * components. </p>
1.1187 + *
1.1188 + * @return The raw authority component of this URI,
1.1189 + * or <tt>null</tt> if the authority is undefined
1.1190 + */
1.1191 + public String getRawAuthority() {
1.1192 + return this.authority; // raw form, escaped octets left untouched
1.1193 + }
1.1194 +
1.1195 + /**
1.1196 + * Returns the decoded authority component of this URI.
1.1197 + *
1.1198 + * <p> The string returned by this method is equal to that returned by the
1.1199 + * {@link #getRawAuthority() getRawAuthority} method except that all
1.1200 + * sequences of escaped octets are <a href="#decode">decoded</a>. </p>
1.1201 + *
1.1202 + * @return The decoded authority component of this URI,
1.1203 + * or <tt>null</tt> if the authority is undefined
1.1204 + */
1.1205 + public String getAuthority() {
1.1206 + if (decodedAuthority == null)
1.1207 + decodedAuthority = decode(authority);
1.1208 + return decodedAuthority;
1.1209 + }
1.1210 +
1.1211 + /**
1.1212 + * Returns the raw user-information component of this URI.
1.1213 + *
1.1214 + * <p> The user-information component of a URI, if defined, only contains
1.1215 + * characters in the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, and
1.1216 + * <i>other</i> categories. </p>
1.1217 + *
1.1218 + * @return The raw user-information component of this URI,
1.1219 + * or <tt>null</tt> if the user information is undefined
1.1220 + */
1.1221 + public String getRawUserInfo() {
1.1222 + return this.userInfo; // raw form, escaped octets left untouched
1.1223 + }
1.1224 +
1.1225 + /**
1.1226 + * Returns the decoded user-information component of this URI.
1.1227 + *
1.1228 + * <p> The string returned by this method is equal to that returned by the
1.1229 + * {@link #getRawUserInfo() getRawUserInfo} method except that all
1.1230 + * sequences of escaped octets are <a href="#decode">decoded</a>. </p>
1.1231 + *
1.1232 + * @return The decoded user-information component of this URI,
1.1233 + * or <tt>null</tt> if the user information is undefined
1.1234 + */
1.1235 + public String getUserInfo() {
1.1236 + if ((decodedUserInfo == null) && (userInfo != null))
1.1237 + decodedUserInfo = decode(userInfo);
1.1238 + return decodedUserInfo;
1.1239 + }
1.1240 +
1.1241 + /**
1.1242 + * Returns the host component of this URI.
1.1243 + *
1.1244 + * <p> The host component of a URI, if defined, will have one of the
1.1245 + * following forms: </p>
1.1246 + *
1.1247 + * <ul type=disc>
1.1248 + *
1.1249 + * <li><p> A domain name consisting of one or more <i>labels</i>
1.1250 + * separated by period characters (<tt>'.'</tt>), optionally followed by
1.1251 + * a period character. Each label consists of <i>alphanum</i> characters
1.1252 + * as well as hyphen characters (<tt>'-'</tt>), though hyphens never
1.1253 + * occur as the first or last characters in a label. The rightmost
1.1254 + * label of a domain name consisting of two or more labels, begins
1.1255 + * with an <i>alpha</i> character. </li>
1.1256 + *
1.1257 + * <li><p> A dotted-quad IPv4 address of the form
1.1258 + * <i>digit</i><tt>+.</tt><i>digit</i><tt>+.</tt><i>digit</i><tt>+.</tt><i>digit</i><tt>+</tt>,
1.1259 + * where no <i>digit</i> sequence is longer than three characters and no
1.1260 + * sequence has a value larger than 255. </p></li>
1.1261 + *
1.1262 + * <li><p> An IPv6 address enclosed in square brackets (<tt>'['</tt> and
1.1263 + * <tt>']'</tt>) and consisting of hexadecimal digits, colon characters
1.1264 + * (<tt>':'</tt>), and possibly an embedded IPv4 address. The full
1.1265 + * syntax of IPv6 addresses is specified in <a
1.1266 + * href="http://www.ietf.org/rfc/rfc2373.txt"><i>RFC 2373: IPv6
1.1267 + * Addressing Architecture</i></a>. </p></li>
1.1268 + *
1.1269 + * </ul>
1.1270 + *
1.1271 + * The host component of a URI cannot contain escaped octets, hence this
1.1272 + * method does not perform any decoding.
1.1273 + *
1.1274 + * @return The host component of this URI,
1.1275 + * or <tt>null</tt> if the host is undefined
1.1276 + */
1.1277 + public String getHost() {
1.1278 + return this.host; // returned as stored, without any decoding
1.1279 + }
1.1280 +
1.1281 + /**
1.1282 + * Returns the port number of this URI.
1.1283 + *
1.1284 + * <p> The port component of a URI, if defined, is a non-negative
1.1285 + * integer. </p>
1.1286 + *
1.1287 + * @return The port component of this URI,
1.1288 + * or <tt>-1</tt> if the port is undefined
1.1289 + */
1.1290 + public int getPort() {
1.1291 + return port;
1.1292 + }
1.1293 +
1.1294 + /**
1.1295 + * Returns the raw path component of this URI.
1.1296 + *
1.1297 + * <p> The path component of a URI, if defined, only contains the slash
1.1298 + * character (<tt>'/'</tt>), the commercial-at character (<tt>'@'</tt>),
1.1299 + * and characters in the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>,
1.1300 + * and <i>other</i> categories. </p>
1.1301 + *
1.1302 + * @return The path component of this URI,
1.1303 + * or <tt>null</tt> if the path is undefined
1.1304 + */
1.1305 + public String getRawPath() {
1.1306 + return this.path; // raw form, escaped octets left untouched
1.1307 + }
1.1308 +
1.1309 + /**
1.1310 + * Returns the decoded path component of this URI.
1.1311 + *
1.1312 + * <p> The string returned by this method is equal to that returned by the
1.1313 + * {@link #getRawPath() getRawPath} method except that all sequences of
1.1314 + * escaped octets are <a href="#decode">decoded</a>. </p>
1.1315 + *
1.1316 + * @return The decoded path component of this URI,
1.1317 + * or <tt>null</tt> if the path is undefined
1.1318 + */
1.1319 + public String getPath() {
1.1320 + if ((decodedPath == null) && (path != null))
1.1321 + decodedPath = decode(path);
1.1322 + return decodedPath;
1.1323 + }
1.1324 +
1.1325 + /**
1.1326 + * Returns the raw query component of this URI.
1.1327 + *
1.1328 + * <p> The query component of a URI, if defined, only contains legal URI
1.1329 + * characters. </p>
1.1330 + *
1.1331 + * @return The raw query component of this URI,
1.1332 + * or <tt>null</tt> if the query is undefined
1.1333 + */
1.1334 + public String getRawQuery() {
1.1335 + return this.query; // raw form, escaped octets left untouched
1.1336 + }
1.1337 +
1.1338 + /**
1.1339 + * Returns the decoded query component of this URI.
1.1340 + *
1.1341 + * <p> The string returned by this method is equal to that returned by the
1.1342 + * {@link #getRawQuery() getRawQuery} method except that all sequences of
1.1343 + * escaped octets are <a href="#decode">decoded</a>. </p>
1.1344 + *
1.1345 + * @return The decoded query component of this URI,
1.1346 + * or <tt>null</tt> if the query is undefined
1.1347 + */
1.1348 + public String getQuery() {
1.1349 + if ((decodedQuery == null) && (query != null))
1.1350 + decodedQuery = decode(query);
1.1351 + return decodedQuery;
1.1352 + }
1.1353 +
1.1354 + /**
1.1355 + * Returns the raw fragment component of this URI.
1.1356 + *
1.1357 + * <p> The fragment component of a URI, if defined, only contains legal URI
1.1358 + * characters. </p>
1.1359 + *
1.1360 + * @return The raw fragment component of this URI,
1.1361 + * or <tt>null</tt> if the fragment is undefined
1.1362 + */
1.1363 + public String getRawFragment() {
1.1364 + return this.fragment; // raw form, escaped octets left untouched
1.1365 + }
1.1366 +
1.1367 + /**
1.1368 + * Returns the decoded fragment component of this URI.
1.1369 + *
1.1370 + * <p> The string returned by this method is equal to that returned by the
1.1371 + * {@link #getRawFragment() getRawFragment} method except that all
1.1372 + * sequences of escaped octets are <a href="#decode">decoded</a>. </p>
1.1373 + *
1.1374 + * @return The decoded fragment component of this URI,
1.1375 + * or <tt>null</tt> if the fragment is undefined
1.1376 + */
1.1377 + public String getFragment() {
1.1378 + if ((decodedFragment == null) && (fragment != null))
1.1379 + decodedFragment = decode(fragment);
1.1380 + return decodedFragment;
1.1381 + }
1.1382 +
1.1383 +
1.1384 + // -- Equality, comparison, hash code, toString, and serialization --
1.1385 +
1.1386 + /**
1.1387 + * Tests this URI for equality with another object.
1.1388 + *
1.1389 + * <p> If the given object is not a URI then this method immediately
1.1390 + * returns <tt>false</tt>.
1.1391 + *
1.1392 + * <p> For two URIs to be considered equal requires that either both are
1.1393 + * opaque or both are hierarchical. Their schemes must either both be
1.1394 + * undefined or else be equal without regard to case. Their fragments
1.1395 + * must either both be undefined or else be equal.
1.1396 + *
1.1397 + * <p> For two opaque URIs to be considered equal, their scheme-specific
1.1398 + * parts must be equal.
1.1399 + *
1.1400 + * <p> For two hierarchical URIs to be considered equal, their paths must
1.1401 + * be equal and their queries must either both be undefined or else be
1.1402 + * equal. Their authorities must either both be undefined, or both be
1.1403 + * registry-based, or both be server-based. If their authorities are
1.1404 + * defined and are registry-based, then they must be equal. If their
1.1405 + * authorities are defined and are server-based, then their hosts must be
1.1406 + * equal without regard to case, their port numbers must be equal, and
1.1407 + * their user-information components must be equal.
1.1408 + *
1.1409 + * <p> When testing the user-information, path, query, fragment, authority,
1.1410 + * or scheme-specific parts of two URIs for equality, the raw forms rather
1.1411 + * than the encoded forms of these components are compared and the
1.1412 + * hexadecimal digits of escaped octets are compared without regard to
1.1413 + * case.
1.1414 + *
1.1415 + * <p> This method satisfies the general contract of the {@link
1.1416 + * java.lang.Object#equals(Object) Object.equals} method. </p>
1.1417 + *
1.1418 + * @param ob The object to which this object is to be compared
1.1419 + *
1.1420 + * @return <tt>true</tt> if, and only if, the given object is a URI that
1.1421 + * is identical to this URI
1.1422 + */
1.1423 + public boolean equals(Object ob) {
1.1424 + if (ob == this)
1.1425 + return true;
1.1426 + if (!(ob instanceof URI))
1.1427 + return false;
1.1428 + URI that = (URI)ob;
1.1429 + if (this.isOpaque() != that.isOpaque()) return false;
1.1430 + if (!equalIgnoringCase(this.scheme, that.scheme)) return false;
1.1431 + if (!equal(this.fragment, that.fragment)) return false;
1.1432 +
1.1433 + // Opaque
1.1434 + if (this.isOpaque())
1.1435 + return equal(this.schemeSpecificPart, that.schemeSpecificPart);
1.1436 +
1.1437 + // Hierarchical
1.1438 + if (!equal(this.path, that.path)) return false;
1.1439 + if (!equal(this.query, that.query)) return false;
1.1440 +
1.1441 + // Authorities
1.1442 + if (this.authority == that.authority) return true;
1.1443 + if (this.host != null) {
1.1444 + // Server-based
1.1445 + if (!equal(this.userInfo, that.userInfo)) return false;
1.1446 + if (!equalIgnoringCase(this.host, that.host)) return false;
1.1447 + if (this.port != that.port) return false;
1.1448 + } else if (this.authority != null) {
1.1449 + // Registry-based
1.1450 + if (!equal(this.authority, that.authority)) return false;
1.1451 + } else if (this.authority != that.authority) {
1.1452 + return false;
1.1453 + }
1.1454 +
1.1455 + return true;
1.1456 + }
1.1457 +
1.1458 + /**
1.1459 + * Returns a hash-code value for this URI. The hash code is based upon all
1.1460 + * of the URI's components, and satisfies the general contract of the
1.1461 + * {@link java.lang.Object#hashCode() Object.hashCode} method.
1.1462 + *
1.1463 + * @return A hash-code value for this URI
1.1464 + */
1.1465 + public int hashCode() {
1.1466 + if (hash != 0)
1.1467 + return hash;
1.1468 + int h = hashIgnoringCase(0, scheme);
1.1469 + h = hash(h, fragment);
1.1470 + if (isOpaque()) {
1.1471 + h = hash(h, schemeSpecificPart);
1.1472 + } else {
1.1473 + h = hash(h, path);
1.1474 + h = hash(h, query);
1.1475 + if (host != null) {
1.1476 + h = hash(h, userInfo);
1.1477 + h = hashIgnoringCase(h, host);
1.1478 + h += 1949 * port;
1.1479 + } else {
1.1480 + h = hash(h, authority);
1.1481 + }
1.1482 + }
1.1483 + hash = h;
1.1484 + return h;
1.1485 + }
1.1486 +
1.1487 + /**
1.1488 + * Compares this URI to another object, which must be a URI.
1.1489 + *
1.1490 + * <p> When comparing corresponding components of two URIs, if one
1.1491 + * component is undefined but the other is defined then the first is
1.1492 + * considered to be less than the second. Unless otherwise noted, string
1.1493 + * components are ordered according to their natural, case-sensitive
1.1494 + * ordering as defined by the {@link java.lang.String#compareTo(Object)
1.1495 + * String.compareTo} method. String components that are subject to
1.1496 + * encoding are compared by comparing their raw forms rather than their
1.1497 + * encoded forms.
1.1498 + *
1.1499 + * <p> The ordering of URIs is defined as follows: </p>
1.1500 + *
1.1501 + * <ul type=disc>
1.1502 + *
1.1503 + * <li><p> Two URIs with different schemes are ordered according to the
1.1504 + * ordering of their schemes, without regard to case. </p></li>
1.1505 + *
1.1506 + * <li><p> A hierarchical URI is considered to be less than an opaque URI
1.1507 + * with an identical scheme. </p></li>
1.1508 + *
1.1509 + * <li><p> Two opaque URIs with identical schemes are ordered according
1.1510 + * to the ordering of their scheme-specific parts. </p></li>
1.1511 + *
1.1512 + * <li><p> Two opaque URIs with identical schemes and scheme-specific
1.1513 + * parts are ordered according to the ordering of their
1.1514 + * fragments. </p></li>
1.1515 + *
1.1516 + * <li><p> Two hierarchical URIs with identical schemes are ordered
1.1517 + * according to the ordering of their authority components: </p>
1.1518 + *
1.1519 + * <ul type=disc>
1.1520 + *
1.1521 + * <li><p> If both authority components are server-based then the URIs
1.1522 + * are ordered according to their user-information components; if these
1.1523 + * components are identical then the URIs are ordered according to the
1.1524 + * ordering of their hosts, without regard to case; if the hosts are
1.1525 + * identical then the URIs are ordered according to the ordering of
1.1526 + * their ports. </p></li>
1.1527 + *
1.1528 + * <li><p> If one or both authority components are registry-based then
1.1529 + * the URIs are ordered according to the ordering of their authority
1.1530 + * components. </p></li>
1.1531 + *
1.1532 + * </ul></li>
1.1533 + *
1.1534 + * <li><p> Finally, two hierarchical URIs with identical schemes and
1.1535 + * authority components are ordered according to the ordering of their
1.1536 + * paths; if their paths are identical then they are ordered according to
1.1537 + * the ordering of their queries; if the queries are identical then they
1.1538 + * are ordered according to the order of their fragments. </p></li>
1.1539 + *
1.1540 + * </ul>
1.1541 + *
1.1542 + * <p> This method satisfies the general contract of the {@link
1.1543 + * java.lang.Comparable#compareTo(Object) Comparable.compareTo}
1.1544 + * method. </p>
1.1545 + *
1.1546 + * @param that
1.1547 + * The object to which this URI is to be compared
1.1548 + *
1.1549 + * @return A negative integer, zero, or a positive integer as this URI is
1.1550 + * less than, equal to, or greater than the given URI
1.1551 + *
1.1552 + * @throws ClassCastException
1.1553 + * If the given object is not a URI
1.1554 + */
1.1555 + public int compareTo(URI that) {
1.1556 + int c;
1.1557 +
1.1558 + if ((c = compareIgnoringCase(this.scheme, that.scheme)) != 0)
1.1559 + return c;
1.1560 +
1.1561 + if (this.isOpaque()) {
1.1562 + if (that.isOpaque()) {
1.1563 + // Both opaque
1.1564 + if ((c = compare(this.schemeSpecificPart,
1.1565 + that.schemeSpecificPart)) != 0)
1.1566 + return c;
1.1567 + return compare(this.fragment, that.fragment);
1.1568 + }
1.1569 + return +1; // Opaque > hierarchical
1.1570 + } else if (that.isOpaque()) {
1.1571 + return -1; // Hierarchical < opaque
1.1572 + }
1.1573 +
1.1574 + // Hierarchical
1.1575 + if ((this.host != null) && (that.host != null)) {
1.1576 + // Both server-based
1.1577 + if ((c = compare(this.userInfo, that.userInfo)) != 0)
1.1578 + return c;
1.1579 + if ((c = compareIgnoringCase(this.host, that.host)) != 0)
1.1580 + return c;
1.1581 + if ((c = this.port - that.port) != 0)
1.1582 + return c;
1.1583 + } else {
1.1584 + // If one or both authorities are registry-based then we simply
1.1585 + // compare them in the usual, case-sensitive way. If one is
1.1586 + // registry-based and one is server-based then the strings are
1.1587 + // guaranteed to be unequal, hence the comparison will never return
1.1588 + // zero and the compareTo and equals methods will remain
1.1589 + // consistent.
1.1590 + if ((c = compare(this.authority, that.authority)) != 0) return c;
1.1591 + }
1.1592 +
1.1593 + if ((c = compare(this.path, that.path)) != 0) return c;
1.1594 + if ((c = compare(this.query, that.query)) != 0) return c;
1.1595 + return compare(this.fragment, that.fragment);
1.1596 + }
1.1597 +
1.1598 + /**
1.1599 + * Returns the content of this URI as a string.
1.1600 + *
1.1601 + * <p> If this URI was created by invoking one of the constructors in this
1.1602 + * class then a string equivalent to the original input string, or to the
1.1603 + * string computed from the originally-given components, as appropriate, is
1.1604 + * returned. Otherwise this URI was created by normalization, resolution,
1.1605 + * or relativization, and so a string is constructed from this URI's
1.1606 + * components according to the rules specified in <a
1.1607 + * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>,
1.1608 + * section 5.2, step 7. </p>
1.1609 + *
1.1610 + * @return The string form of this URI
1.1611 + */
1.1612 + public String toString() {
1.1613 + defineString();
1.1614 + return string;
1.1615 + }
1.1616 +
1.1617 + /**
1.1618 + * Returns the content of this URI as a US-ASCII string.
1.1619 + *
1.1620 + * <p> If this URI does not contain any characters in the <i>other</i>
1.1621 + * category then an invocation of this method will return the same value as
1.1622 + * an invocation of the {@link #toString() toString} method. Otherwise
1.1623 + * this method works as if by invoking that method and then <a
1.1624 + * href="#encode">encoding</a> the result. </p>
1.1625 + *
1.1626 + * @return The string form of this URI, encoded as needed
1.1627 + * so that it only contains characters in the US-ASCII
1.1628 + * charset
1.1629 + */
1.1630 + public String toASCIIString() {
1.1631 + defineString();
1.1632 + return encode(string);
1.1633 + }
1.1634 +
1.1635 +
1.1636 + // -- Serialization support --
1.1637 +
1.1638 + /**
1.1639 + * Saves the content of this URI to the given serial stream.
1.1640 + *
1.1641 + * <p> The only serializable field of a URI instance is its <tt>string</tt>
1.1642 + * field. That field is given a value, if it does not have one already,
1.1643 + * and then the {@link java.io.ObjectOutputStream#defaultWriteObject()}
1.1644 + * method of the given object-output stream is invoked. </p>
1.1645 + *
1.1646 + * @param os The object-output stream to which this object
1.1647 + * is to be written
1.1648 + */
1.1649 + private void writeObject(ObjectOutputStream os)
1.1650 + throws IOException
1.1651 + {
1.1652 + defineString();
1.1653 + os.defaultWriteObject(); // Writes the string field only
1.1654 + }
1.1655 +
1.1656 + /**
1.1657 + * Reconstitutes a URI from the given serial stream.
1.1658 + *
1.1659 + * <p> The {@link java.io.ObjectInputStream#defaultReadObject()} method is
1.1660 + * invoked to read the value of the <tt>string</tt> field. The result is
1.1661 + * then parsed in the usual way.
1.1662 + *
1.1663 + * @param is The object-input stream from which this object
1.1664 + * is being read
1.1665 + */
1.1666 + private void readObject(ObjectInputStream is)
1.1667 + throws ClassNotFoundException, IOException
1.1668 + {
1.1669 + port = -1; // Argh
1.1670 + is.defaultReadObject();
1.1671 + try {
1.1672 + new Parser(string).parse(false);
1.1673 + } catch (URISyntaxException x) {
1.1674 + IOException y = new InvalidObjectException("Invalid URI");
1.1675 + y.initCause(x);
1.1676 + throw y;
1.1677 + }
1.1678 + }
1.1679 +
1.1680 +
1.1681 + // -- End of public methods --
1.1682 +
1.1683 +
1.1684 + // -- Utility methods for string-field comparison and hashing --
1.1685 +
1.1686 + // These methods return appropriate values for null string arguments,
1.1687 + // thereby simplifying the equals, hashCode, and compareTo methods.
1.1688 + //
1.1689 + // The case-ignoring methods should only be applied to strings whose
1.1690 + // characters are all known to be US-ASCII. Because of this restriction,
1.1691 + // these methods are faster than the similar methods in the String class.
1.1692 +
1.1693 + // US-ASCII only
1.1694 + private static int toLower(char c) {
1.1695 + if ((c >= 'A') && (c <= 'Z'))
1.1696 + return c + ('a' - 'A');
1.1697 + return c;
1.1698 + }
1.1699 +
1.1700 + private static boolean equal(String s, String t) {
1.1701 + if (s == t) return true;
1.1702 + if ((s != null) && (t != null)) {
1.1703 + if (s.length() != t.length())
1.1704 + return false;
1.1705 + if (s.indexOf('%') < 0)
1.1706 + return s.equals(t);
1.1707 + int n = s.length();
1.1708 + for (int i = 0; i < n;) {
1.1709 + char c = s.charAt(i);
1.1710 + char d = t.charAt(i);
1.1711 + if (c != '%') {
1.1712 + if (c != d)
1.1713 + return false;
1.1714 + i++;
1.1715 + continue;
1.1716 + }
1.1717 + i++;
1.1718 + if (toLower(s.charAt(i)) != toLower(t.charAt(i)))
1.1719 + return false;
1.1720 + i++;
1.1721 + if (toLower(s.charAt(i)) != toLower(t.charAt(i)))
1.1722 + return false;
1.1723 + i++;
1.1724 + }
1.1725 + return true;
1.1726 + }
1.1727 + return false;
1.1728 + }
1.1729 +
1.1730 + // US-ASCII only
1.1731 + private static boolean equalIgnoringCase(String s, String t) {
1.1732 + if (s == t) return true;
1.1733 + if ((s != null) && (t != null)) {
1.1734 + int n = s.length();
1.1735 + if (t.length() != n)
1.1736 + return false;
1.1737 + for (int i = 0; i < n; i++) {
1.1738 + if (toLower(s.charAt(i)) != toLower(t.charAt(i)))
1.1739 + return false;
1.1740 + }
1.1741 + return true;
1.1742 + }
1.1743 + return false;
1.1744 + }
1.1745 +
1.1746 + private static int hash(int hash, String s) {
1.1747 + if (s == null) return hash;
1.1748 + return hash * 127 + s.hashCode();
1.1749 + }
1.1750 +
1.1751 + // US-ASCII only
1.1752 + private static int hashIgnoringCase(int hash, String s) {
1.1753 + if (s == null) return hash;
1.1754 + int h = hash;
1.1755 + int n = s.length();
1.1756 + for (int i = 0; i < n; i++)
1.1757 + h = 31 * h + toLower(s.charAt(i));
1.1758 + return h;
1.1759 + }
1.1760 +
1.1761 + private static int compare(String s, String t) {
1.1762 + if (s == t) return 0;
1.1763 + if (s != null) {
1.1764 + if (t != null)
1.1765 + return s.compareTo(t);
1.1766 + else
1.1767 + return +1;
1.1768 + } else {
1.1769 + return -1;
1.1770 + }
1.1771 + }
1.1772 +
1.1773 + // US-ASCII only
1.1774 + private static int compareIgnoringCase(String s, String t) {
1.1775 + if (s == t) return 0;
1.1776 + if (s != null) {
1.1777 + if (t != null) {
1.1778 + int sn = s.length();
1.1779 + int tn = t.length();
1.1780 + int n = sn < tn ? sn : tn;
1.1781 + for (int i = 0; i < n; i++) {
1.1782 + int c = toLower(s.charAt(i)) - toLower(t.charAt(i));
1.1783 + if (c != 0)
1.1784 + return c;
1.1785 + }
1.1786 + return sn - tn;
1.1787 + }
1.1788 + return +1;
1.1789 + } else {
1.1790 + return -1;
1.1791 + }
1.1792 + }
1.1793 +
1.1794 +
1.1795 + // -- String construction --
1.1796 +
1.1797 + // If a scheme is given then the path, if given, must be absolute
1.1798 + //
1.1799 + private static void checkPath(String s, String scheme, String path)
1.1800 + throws URISyntaxException
1.1801 + {
1.1802 + if (scheme != null) {
1.1803 + if ((path != null)
1.1804 + && ((path.length() > 0) && (path.charAt(0) != '/')))
1.1805 + throw new URISyntaxException(s,
1.1806 + "Relative path in absolute URI");
1.1807 + }
1.1808 + }
1.1809 +
1.1810 + private void appendAuthority(StringBuffer sb,
1.1811 + String authority,
1.1812 + String userInfo,
1.1813 + String host,
1.1814 + int port)
1.1815 + {
1.1816 + if (host != null) {
1.1817 + sb.append("//");
1.1818 + if (userInfo != null) {
1.1819 + sb.append(quote(userInfo, L_USERINFO, H_USERINFO));
1.1820 + sb.append('@');
1.1821 + }
1.1822 + boolean needBrackets = ((host.indexOf(':') >= 0)
1.1823 + && !host.startsWith("[")
1.1824 + && !host.endsWith("]"));
1.1825 + if (needBrackets) sb.append('[');
1.1826 + sb.append(host);
1.1827 + if (needBrackets) sb.append(']');
1.1828 + if (port != -1) {
1.1829 + sb.append(':');
1.1830 + sb.append(port);
1.1831 + }
1.1832 + } else if (authority != null) {
1.1833 + sb.append("//");
1.1834 + if (authority.startsWith("[")) {
1.1835 + // authority should (but may not) contain an embedded IPv6 address
1.1836 + int end = authority.indexOf("]");
1.1837 + String doquote = authority, dontquote = "";
1.1838 + if (end != -1 && authority.indexOf(":") != -1) {
1.1839 + // the authority contains an IPv6 address
1.1840 + if (end == authority.length()) {
1.1841 + dontquote = authority;
1.1842 + doquote = "";
1.1843 + } else {
1.1844 + dontquote = authority.substring(0 , end + 1);
1.1845 + doquote = authority.substring(end + 1);
1.1846 + }
1.1847 + }
1.1848 + sb.append(dontquote);
1.1849 + sb.append(quote(doquote,
1.1850 + L_REG_NAME | L_SERVER,
1.1851 + H_REG_NAME | H_SERVER));
1.1852 + } else {
1.1853 + sb.append(quote(authority,
1.1854 + L_REG_NAME | L_SERVER,
1.1855 + H_REG_NAME | H_SERVER));
1.1856 + }
1.1857 + }
1.1858 + }
1.1859 +
1.1860 + private void appendSchemeSpecificPart(StringBuffer sb,
1.1861 + String opaquePart,
1.1862 + String authority,
1.1863 + String userInfo,
1.1864 + String host,
1.1865 + int port,
1.1866 + String path,
1.1867 + String query)
1.1868 + {
1.1869 + if (opaquePart != null) {
1.1870 + /* check if SSP begins with an IPv6 address
1.1871 + * because we must not quote a literal IPv6 address
1.1872 + */
1.1873 + if (opaquePart.startsWith("//[")) {
1.1874 + int end = opaquePart.indexOf("]");
1.1875 + if (end != -1 && opaquePart.indexOf(":")!=-1) {
1.1876 + String doquote, dontquote;
1.1877 + if (end == opaquePart.length()) {
1.1878 + dontquote = opaquePart;
1.1879 + doquote = "";
1.1880 + } else {
1.1881 + dontquote = opaquePart.substring(0,end+1);
1.1882 + doquote = opaquePart.substring(end+1);
1.1883 + }
1.1884 + sb.append (dontquote);
1.1885 + sb.append(quote(doquote, L_URIC, H_URIC));
1.1886 + }
1.1887 + } else {
1.1888 + sb.append(quote(opaquePart, L_URIC, H_URIC));
1.1889 + }
1.1890 + } else {
1.1891 + appendAuthority(sb, authority, userInfo, host, port);
1.1892 + if (path != null)
1.1893 + sb.append(quote(path, L_PATH, H_PATH));
1.1894 + if (query != null) {
1.1895 + sb.append('?');
1.1896 + sb.append(quote(query, L_URIC, H_URIC));
1.1897 + }
1.1898 + }
1.1899 + }
1.1900 +
1.1901 + private void appendFragment(StringBuffer sb, String fragment) {
1.1902 + if (fragment != null) {
1.1903 + sb.append('#');
1.1904 + sb.append(quote(fragment, L_URIC, H_URIC));
1.1905 + }
1.1906 + }
1.1907 +
1.1908 + private String toString(String scheme,
1.1909 + String opaquePart,
1.1910 + String authority,
1.1911 + String userInfo,
1.1912 + String host,
1.1913 + int port,
1.1914 + String path,
1.1915 + String query,
1.1916 + String fragment)
1.1917 + {
1.1918 + StringBuffer sb = new StringBuffer();
1.1919 + if (scheme != null) {
1.1920 + sb.append(scheme);
1.1921 + sb.append(':');
1.1922 + }
1.1923 + appendSchemeSpecificPart(sb, opaquePart,
1.1924 + authority, userInfo, host, port,
1.1925 + path, query);
1.1926 + appendFragment(sb, fragment);
1.1927 + return sb.toString();
1.1928 + }
1.1929 +
1.1930 + private void defineSchemeSpecificPart() {
1.1931 + if (schemeSpecificPart != null) return;
1.1932 + StringBuffer sb = new StringBuffer();
1.1933 + appendSchemeSpecificPart(sb, null, getAuthority(), getUserInfo(),
1.1934 + host, port, getPath(), getQuery());
1.1935 + if (sb.length() == 0) return;
1.1936 + schemeSpecificPart = sb.toString();
1.1937 + }
1.1938 +
1.1939 + private void defineString() {
1.1940 + if (string != null) return;
1.1941 +
1.1942 + StringBuffer sb = new StringBuffer();
1.1943 + if (scheme != null) {
1.1944 + sb.append(scheme);
1.1945 + sb.append(':');
1.1946 + }
1.1947 + if (isOpaque()) {
1.1948 + sb.append(schemeSpecificPart);
1.1949 + } else {
1.1950 + if (host != null) {
1.1951 + sb.append("//");
1.1952 + if (userInfo != null) {
1.1953 + sb.append(userInfo);
1.1954 + sb.append('@');
1.1955 + }
1.1956 + boolean needBrackets = ((host.indexOf(':') >= 0)
1.1957 + && !host.startsWith("[")
1.1958 + && !host.endsWith("]"));
1.1959 + if (needBrackets) sb.append('[');
1.1960 + sb.append(host);
1.1961 + if (needBrackets) sb.append(']');
1.1962 + if (port != -1) {
1.1963 + sb.append(':');
1.1964 + sb.append(port);
1.1965 + }
1.1966 + } else if (authority != null) {
1.1967 + sb.append("//");
1.1968 + sb.append(authority);
1.1969 + }
1.1970 + if (path != null)
1.1971 + sb.append(path);
1.1972 + if (query != null) {
1.1973 + sb.append('?');
1.1974 + sb.append(query);
1.1975 + }
1.1976 + }
1.1977 + if (fragment != null) {
1.1978 + sb.append('#');
1.1979 + sb.append(fragment);
1.1980 + }
1.1981 + string = sb.toString();
1.1982 + }
1.1983 +
1.1984 +
1.1985 + // -- Normalization, resolution, and relativization --
1.1986 +
1.1987 + // RFC2396 5.2 (6)
1.1988 + private static String resolvePath(String base, String child,
1.1989 + boolean absolute)
1.1990 + {
1.1991 + int i = base.lastIndexOf('/');
1.1992 + int cn = child.length();
1.1993 + String path = "";
1.1994 +
1.1995 + if (cn == 0) {
1.1996 + // 5.2 (6a)
1.1997 + if (i >= 0)
1.1998 + path = base.substring(0, i + 1);
1.1999 + } else {
1.2000 + StringBuffer sb = new StringBuffer(base.length() + cn);
1.2001 + // 5.2 (6a)
1.2002 + if (i >= 0)
1.2003 + sb.append(base.substring(0, i + 1));
1.2004 + // 5.2 (6b)
1.2005 + sb.append(child);
1.2006 + path = sb.toString();
1.2007 + }
1.2008 +
1.2009 + // 5.2 (6c-f)
1.2010 + String np = normalize(path);
1.2011 +
1.2012 + // 5.2 (6g): If the result is absolute but the path begins with "../",
1.2013 + // then we simply leave the path as-is
1.2014 +
1.2015 + return np;
1.2016 + }
1.2017 +
1.2018 + // RFC2396 5.2
1.2019 + private static URI resolve(URI base, URI child) {
1.2020 + // check if child if opaque first so that NPE is thrown
1.2021 + // if child is null.
1.2022 + if (child.isOpaque() || base.isOpaque())
1.2023 + return child;
1.2024 +
1.2025 + // 5.2 (2): Reference to current document (lone fragment)
1.2026 + if ((child.scheme == null) && (child.authority == null)
1.2027 + && child.path.equals("") && (child.fragment != null)
1.2028 + && (child.query == null)) {
1.2029 + if ((base.fragment != null)
1.2030 + && child.fragment.equals(base.fragment)) {
1.2031 + return base;
1.2032 + }
1.2033 + URI ru = new URI();
1.2034 + ru.scheme = base.scheme;
1.2035 + ru.authority = base.authority;
1.2036 + ru.userInfo = base.userInfo;
1.2037 + ru.host = base.host;
1.2038 + ru.port = base.port;
1.2039 + ru.path = base.path;
1.2040 + ru.fragment = child.fragment;
1.2041 + ru.query = base.query;
1.2042 + return ru;
1.2043 + }
1.2044 +
1.2045 + // 5.2 (3): Child is absolute
1.2046 + if (child.scheme != null)
1.2047 + return child;
1.2048 +
1.2049 + URI ru = new URI(); // Resolved URI
1.2050 + ru.scheme = base.scheme;
1.2051 + ru.query = child.query;
1.2052 + ru.fragment = child.fragment;
1.2053 +
1.2054 + // 5.2 (4): Authority
1.2055 + if (child.authority == null) {
1.2056 + ru.authority = base.authority;
1.2057 + ru.host = base.host;
1.2058 + ru.userInfo = base.userInfo;
1.2059 + ru.port = base.port;
1.2060 +
1.2061 + String cp = (child.path == null) ? "" : child.path;
1.2062 + if ((cp.length() > 0) && (cp.charAt(0) == '/')) {
1.2063 + // 5.2 (5): Child path is absolute
1.2064 + ru.path = child.path;
1.2065 + } else {
1.2066 + // 5.2 (6): Resolve relative path
1.2067 + ru.path = resolvePath(base.path, cp, base.isAbsolute());
1.2068 + }
1.2069 + } else {
1.2070 + ru.authority = child.authority;
1.2071 + ru.host = child.host;
1.2072 + ru.userInfo = child.userInfo;
1.2073 + ru.host = child.host;
1.2074 + ru.port = child.port;
1.2075 + ru.path = child.path;
1.2076 + }
1.2077 +
1.2078 + // 5.2 (7): Recombine (nothing to do here)
1.2079 + return ru;
1.2080 + }
1.2081 +
1.2082 + // If the given URI's path is normal then return the URI;
1.2083 + // o.w., return a new URI containing the normalized path.
1.2084 + //
1.2085 + private static URI normalize(URI u) {
1.2086 + if (u.isOpaque() || (u.path == null) || (u.path.length() == 0))
1.2087 + return u;
1.2088 +
1.2089 + String np = normalize(u.path);
1.2090 + if (np == u.path)
1.2091 + return u;
1.2092 +
1.2093 + URI v = new URI();
1.2094 + v.scheme = u.scheme;
1.2095 + v.fragment = u.fragment;
1.2096 + v.authority = u.authority;
1.2097 + v.userInfo = u.userInfo;
1.2098 + v.host = u.host;
1.2099 + v.port = u.port;
1.2100 + v.path = np;
1.2101 + v.query = u.query;
1.2102 + return v;
1.2103 + }
1.2104 +
1.2105 + // If both URIs are hierarchical, their scheme and authority components are
1.2106 + // identical, and the base path is a prefix of the child's path, then
1.2107 + // return a relative URI that, when resolved against the base, yields the
1.2108 + // child; otherwise, return the child.
1.2109 + //
1.2110 + private static URI relativize(URI base, URI child) {
1.2111 + // check if child if opaque first so that NPE is thrown
1.2112 + // if child is null.
1.2113 + if (child.isOpaque() || base.isOpaque())
1.2114 + return child;
1.2115 + if (!equalIgnoringCase(base.scheme, child.scheme)
1.2116 + || !equal(base.authority, child.authority))
1.2117 + return child;
1.2118 +
1.2119 + String bp = normalize(base.path);
1.2120 + String cp = normalize(child.path);
1.2121 + if (!bp.equals(cp)) {
1.2122 + if (!bp.endsWith("/"))
1.2123 + bp = bp + "/";
1.2124 + if (!cp.startsWith(bp))
1.2125 + return child;
1.2126 + }
1.2127 +
1.2128 + URI v = new URI();
1.2129 + v.path = cp.substring(bp.length());
1.2130 + v.query = child.query;
1.2131 + v.fragment = child.fragment;
1.2132 + return v;
1.2133 + }
1.2134 +
1.2135 +
1.2136 +
1.2137 + // -- Path normalization --
1.2138 +
1.2139 + // The following algorithm for path normalization avoids the creation of a
1.2140 + // string object for each segment, as well as the use of a string buffer to
1.2141 + // compute the final result, by using a single char array and editing it in
1.2142 + // place. The array is first split into segments, replacing each slash
1.2143 + // with '\0' and creating a segment-index array, each element of which is
1.2144 + // the index of the first char in the corresponding segment. We then walk
1.2145 + // through both arrays, removing ".", "..", and other segments as necessary
1.2146 + // by setting their entries in the index array to -1. Finally, the two
1.2147 + // arrays are used to rejoin the segments and compute the final result.
1.2148 + //
1.2149 + // This code is based upon src/solaris/native/java/io/canonicalize_md.c
1.2150 +
1.2151 +
1.2152 + // Check the given path to see if it might need normalization. A path
1.2153 + // might need normalization if it contains duplicate slashes, a "."
1.2154 + // segment, or a ".." segment. Return -1 if no further normalization is
1.2155 + // possible, otherwise return the number of segments found.
1.2156 + //
1.2157 + // This method takes a string argument rather than a char array so that
1.2158 + // this test can be performed without invoking path.toCharArray().
1.2159 + //
1.2160 + static private int needsNormalization(String path) {
1.2161 + boolean normal = true;
1.2162 + int ns = 0; // Number of segments
1.2163 + int end = path.length() - 1; // Index of last char in path
1.2164 + int p = 0; // Index of next char in path
1.2165 +
1.2166 + // Skip initial slashes
1.2167 + while (p <= end) {
1.2168 + if (path.charAt(p) != '/') break;
1.2169 + p++;
1.2170 + }
1.2171 + if (p > 1) normal = false;
1.2172 +
1.2173 + // Scan segments
1.2174 + while (p <= end) {
1.2175 +
1.2176 + // Looking at "." or ".." ?
1.2177 + if ((path.charAt(p) == '.')
1.2178 + && ((p == end)
1.2179 + || ((path.charAt(p + 1) == '/')
1.2180 + || ((path.charAt(p + 1) == '.')
1.2181 + && ((p + 1 == end)
1.2182 + || (path.charAt(p + 2) == '/')))))) {
1.2183 + normal = false;
1.2184 + }
1.2185 + ns++;
1.2186 +
1.2187 + // Find beginning of next segment
1.2188 + while (p <= end) {
1.2189 + if (path.charAt(p++) != '/')
1.2190 + continue;
1.2191 +
1.2192 + // Skip redundant slashes
1.2193 + while (p <= end) {
1.2194 + if (path.charAt(p) != '/') break;
1.2195 + normal = false;
1.2196 + p++;
1.2197 + }
1.2198 +
1.2199 + break;
1.2200 + }
1.2201 + }
1.2202 +
1.2203 + return normal ? -1 : ns;
1.2204 + }
1.2205 +
1.2206 +
1.2207 + // Split the given path into segments, replacing slashes with nulls and
1.2208 + // filling in the given segment-index array.
1.2209 + //
1.2210 + // Preconditions:
1.2211 + // segs.length == Number of segments in path
1.2212 + //
1.2213 + // Postconditions:
1.2214 + // All slashes in path replaced by '\0'
1.2215 + // segs[i] == Index of first char in segment i (0 <= i < segs.length)
1.2216 + //
1.2217 + static private void split(char[] path, int[] segs) {
1.2218 + int end = path.length - 1; // Index of last char in path
1.2219 + int p = 0; // Index of next char in path
1.2220 + int i = 0; // Index of current segment
1.2221 +
1.2222 + // Skip initial slashes
1.2223 + while (p <= end) {
1.2224 + if (path[p] != '/') break;
1.2225 + path[p] = '\0';
1.2226 + p++;
1.2227 + }
1.2228 +
1.2229 + while (p <= end) {
1.2230 +
1.2231 + // Note start of segment
1.2232 + segs[i++] = p++;
1.2233 +
1.2234 + // Find beginning of next segment
1.2235 + while (p <= end) {
1.2236 + if (path[p++] != '/')
1.2237 + continue;
1.2238 + path[p - 1] = '\0';
1.2239 +
1.2240 + // Skip redundant slashes
1.2241 + while (p <= end) {
1.2242 + if (path[p] != '/') break;
1.2243 + path[p++] = '\0';
1.2244 + }
1.2245 + break;
1.2246 + }
1.2247 + }
1.2248 +
1.2249 + if (i != segs.length)
1.2250 + throw new InternalError(); // ASSERT
1.2251 + }
1.2252 +
1.2253 +
1.2254 + // Join the segments in the given path according to the given segment-index
1.2255 + // array, ignoring those segments whose index entries have been set to -1,
1.2256 + // and inserting slashes as needed. Return the length of the resulting
1.2257 + // path.
1.2258 + //
1.2259 + // Preconditions:
1.2260 + // segs[i] == -1 implies segment i is to be ignored
1.2261 + // path computed by split, as above, with '\0' having replaced '/'
1.2262 + //
1.2263 + // Postconditions:
1.2264 + // path[0] .. path[return value] == Resulting path
1.2265 + //
1.2266 + static private int join(char[] path, int[] segs) {
1.2267 + int ns = segs.length; // Number of segments
1.2268 + int end = path.length - 1; // Index of last char in path
1.2269 + int p = 0; // Index of next path char to write
1.2270 +
1.2271 + if (path[p] == '\0') {
1.2272 + // Restore initial slash for absolute paths
1.2273 + path[p++] = '/';
1.2274 + }
1.2275 +
1.2276 + for (int i = 0; i < ns; i++) {
1.2277 + int q = segs[i]; // Current segment
1.2278 + if (q == -1)
1.2279 + // Ignore this segment
1.2280 + continue;
1.2281 +
1.2282 + if (p == q) {
1.2283 + // We're already at this segment, so just skip to its end
1.2284 + while ((p <= end) && (path[p] != '\0'))
1.2285 + p++;
1.2286 + if (p <= end) {
1.2287 + // Preserve trailing slash
1.2288 + path[p++] = '/';
1.2289 + }
1.2290 + } else if (p < q) {
1.2291 + // Copy q down to p
1.2292 + while ((q <= end) && (path[q] != '\0'))
1.2293 + path[p++] = path[q++];
1.2294 + if (q <= end) {
1.2295 + // Preserve trailing slash
1.2296 + path[p++] = '/';
1.2297 + }
1.2298 + } else
1.2299 + throw new InternalError(); // ASSERT false
1.2300 + }
1.2301 +
1.2302 + return p;
1.2303 + }
1.2304 +
1.2305 +
1.2306 + // Remove "." segments from the given path, and remove segment pairs
1.2307 + // consisting of a non-".." segment followed by a ".." segment.
1.2308 + //
1.2309 + private static void removeDots(char[] path, int[] segs) {
1.2310 + int ns = segs.length;
1.2311 + int end = path.length - 1;
1.2312 +
1.2313 + for (int i = 0; i < ns; i++) {
1.2314 + int dots = 0; // Number of dots found (0, 1, or 2)
1.2315 +
1.2316 + // Find next occurrence of "." or ".."
1.2317 + do {
1.2318 + int p = segs[i];
1.2319 + if (path[p] == '.') {
1.2320 + if (p == end) {
1.2321 + dots = 1;
1.2322 + break;
1.2323 + } else if (path[p + 1] == '\0') {
1.2324 + dots = 1;
1.2325 + break;
1.2326 + } else if ((path[p + 1] == '.')
1.2327 + && ((p + 1 == end)
1.2328 + || (path[p + 2] == '\0'))) {
1.2329 + dots = 2;
1.2330 + break;
1.2331 + }
1.2332 + }
1.2333 + i++;
1.2334 + } while (i < ns);
1.2335 + if ((i > ns) || (dots == 0))
1.2336 + break;
1.2337 +
1.2338 + if (dots == 1) {
1.2339 + // Remove this occurrence of "."
1.2340 + segs[i] = -1;
1.2341 + } else {
1.2342 + // If there is a preceding non-".." segment, remove both that
1.2343 + // segment and this occurrence of ".."; otherwise, leave this
1.2344 + // ".." segment as-is.
1.2345 + int j;
1.2346 + for (j = i - 1; j >= 0; j--) {
1.2347 + if (segs[j] != -1) break;
1.2348 + }
1.2349 + if (j >= 0) {
1.2350 + int q = segs[j];
1.2351 + if (!((path[q] == '.')
1.2352 + && (path[q + 1] == '.')
1.2353 + && (path[q + 2] == '\0'))) {
1.2354 + segs[i] = -1;
1.2355 + segs[j] = -1;
1.2356 + }
1.2357 + }
1.2358 + }
1.2359 + }
1.2360 + }
1.2361 +
1.2362 +
1.2363 + // DEVIATION: If the normalized path is relative, and if the first
1.2364 + // segment could be parsed as a scheme name, then prepend a "." segment
1.2365 + //
1.2366 + private static void maybeAddLeadingDot(char[] path, int[] segs) {
1.2367 +
1.2368 + if (path[0] == '\0')
1.2369 + // The path is absolute
1.2370 + return;
1.2371 +
1.2372 + int ns = segs.length;
1.2373 + int f = 0; // Index of first segment
1.2374 + while (f < ns) {
1.2375 + if (segs[f] >= 0)
1.2376 + break;
1.2377 + f++;
1.2378 + }
1.2379 + if ((f >= ns) || (f == 0))
1.2380 + // The path is empty, or else the original first segment survived,
1.2381 + // in which case we already know that no leading "." is needed
1.2382 + return;
1.2383 +
1.2384 + int p = segs[f];
1.2385 + while ((p < path.length) && (path[p] != ':') && (path[p] != '\0')) p++;
1.2386 + if (p >= path.length || path[p] == '\0')
1.2387 + // No colon in first segment, so no "." needed
1.2388 + return;
1.2389 +
1.2390 + // At this point we know that the first segment is unused,
1.2391 + // hence we can insert a "." segment at that position
1.2392 + path[0] = '.';
1.2393 + path[1] = '\0';
1.2394 + segs[0] = 0;
1.2395 + }
1.2396 +
1.2397 +
1.2398 + // Normalize the given path string. A normal path string has no empty
1.2399 + // segments (i.e., occurrences of "//"), no segments equal to ".", and no
1.2400 + // segments equal to ".." that are preceded by a segment not equal to "..".
1.2401 + // In contrast to Unix-style pathname normalization, for URI paths we
1.2402 + // always retain trailing slashes.
1.2403 + //
1.2404 + private static String normalize(String ps) {
1.2405 +
1.2406 + // Does this path need normalization?
1.2407 + int ns = needsNormalization(ps); // Number of segments
1.2408 + if (ns < 0)
1.2409 + // Nope -- just return it
1.2410 + return ps;
1.2411 +
1.2412 + char[] path = ps.toCharArray(); // Path in char-array form
1.2413 +
1.2414 + // Split path into segments
1.2415 + int[] segs = new int[ns]; // Segment-index array
1.2416 + split(path, segs);
1.2417 +
1.2418 + // Remove dots
1.2419 + removeDots(path, segs);
1.2420 +
1.2421 + // Prevent scheme-name confusion
1.2422 + maybeAddLeadingDot(path, segs);
1.2423 +
1.2424 + // Join the remaining segments and return the result
1.2425 + String s = new String(path, 0, join(path, segs));
1.2426 + if (s.equals(ps)) {
1.2427 + // string was already normalized
1.2428 + return ps;
1.2429 + }
1.2430 + return s;
1.2431 + }
1.2432 +
1.2433 +
1.2434 +
1.2435 + // -- Character classes for parsing --
1.2436 +
1.2437 + // RFC2396 precisely specifies which characters in the US-ASCII charset are
1.2438 + // permissible in the various components of a URI reference. We here
1.2439 + // define a set of mask pairs to aid in enforcing these restrictions. Each
1.2440 + // mask pair consists of two longs, a low mask and a high mask. Taken
1.2441 + // together they represent a 128-bit mask, where bit i is set iff the
1.2442 + // character with value i is permitted.
1.2443 + //
1.2444 + // This approach is more efficient than sequentially searching arrays of
1.2445 + // permitted characters. It could be made still more efficient by
1.2446 + // precompiling the mask information so that a character's presence in a
1.2447 + // given mask could be determined by a single table lookup.
1.2448 +
1.2449 + // Compute the low-order mask for the characters in the given string
1.2450 + private static long lowMask(String chars) {
1.2451 + int n = chars.length();
1.2452 + long m = 0;
1.2453 + for (int i = 0; i < n; i++) {
1.2454 + char c = chars.charAt(i);
1.2455 + if (c < 64)
1.2456 + m |= (1L << c);
1.2457 + }
1.2458 + return m;
1.2459 + }
1.2460 +
1.2461 + // Compute the high-order mask for the characters in the given string
1.2462 + private static long highMask(String chars) {
1.2463 + int n = chars.length();
1.2464 + long m = 0;
1.2465 + for (int i = 0; i < n; i++) {
1.2466 + char c = chars.charAt(i);
1.2467 + if ((c >= 64) && (c < 128))
1.2468 + m |= (1L << (c - 64));
1.2469 + }
1.2470 + return m;
1.2471 + }
1.2472 +
1.2473 + // Compute a low-order mask for the characters
1.2474 + // between first and last, inclusive
1.2475 + private static long lowMask(char first, char last) {
1.2476 + long m = 0;
1.2477 + int f = Math.max(Math.min(first, 63), 0);
1.2478 + int l = Math.max(Math.min(last, 63), 0);
1.2479 + for (int i = f; i <= l; i++)
1.2480 + m |= 1L << i;
1.2481 + return m;
1.2482 + }
1.2483 +
1.2484 + // Compute a high-order mask for the characters
1.2485 + // between first and last, inclusive
1.2486 + private static long highMask(char first, char last) {
1.2487 + long m = 0;
1.2488 + int f = Math.max(Math.min(first, 127), 64) - 64;
1.2489 + int l = Math.max(Math.min(last, 127), 64) - 64;
1.2490 + for (int i = f; i <= l; i++)
1.2491 + m |= 1L << i;
1.2492 + return m;
1.2493 + }
1.2494 +
1.2495 + // Tell whether the given character is permitted by the given mask pair
1.2496 + private static boolean match(char c, long lowMask, long highMask) {
1.2497 + if (c == 0) // 0 doesn't have a slot in the mask. So, it never matches.
1.2498 + return false;
1.2499 + if (c < 64)
1.2500 + return ((1L << c) & lowMask) != 0;
1.2501 + if (c < 128)
1.2502 + return ((1L << (c - 64)) & highMask) != 0;
1.2503 + return false;
1.2504 + }
1.2505 +
1.2506 + // Character-class masks, in reverse order from RFC2396 because
1.2507 + // initializers for static fields cannot make forward references.
1.2508 +
1.2509 + // digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" |
1.2510 + // "8" | "9"
1.2511 + private static final long L_DIGIT = lowMask('0', '9');
1.2512 + private static final long H_DIGIT = 0L;
1.2513 +
1.2514 + // upalpha = "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" |
1.2515 + // "J" | "K" | "L" | "M" | "N" | "O" | "P" | "Q" | "R" |
1.2516 + // "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z"
1.2517 + private static final long L_UPALPHA = 0L;
1.2518 + private static final long H_UPALPHA = highMask('A', 'Z');
1.2519 +
1.2520 + // lowalpha = "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" |
1.2521 + // "j" | "k" | "l" | "m" | "n" | "o" | "p" | "q" | "r" |
1.2522 + // "s" | "t" | "u" | "v" | "w" | "x" | "y" | "z"
1.2523 + private static final long L_LOWALPHA = 0L;
1.2524 + private static final long H_LOWALPHA = highMask('a', 'z');
1.2525 +
1.2526 + // alpha = lowalpha | upalpha
1.2527 + private static final long L_ALPHA = L_LOWALPHA | L_UPALPHA;
1.2528 + private static final long H_ALPHA = H_LOWALPHA | H_UPALPHA;
1.2529 +
1.2530 + // alphanum = alpha | digit
1.2531 + private static final long L_ALPHANUM = L_DIGIT | L_ALPHA;
1.2532 + private static final long H_ALPHANUM = H_DIGIT | H_ALPHA;
1.2533 +
1.2534 + // hex = digit | "A" | "B" | "C" | "D" | "E" | "F" |
1.2535 + // "a" | "b" | "c" | "d" | "e" | "f"
1.2536 + private static final long L_HEX = L_DIGIT;
1.2537 + private static final long H_HEX = highMask('A', 'F') | highMask('a', 'f');
1.2538 +
1.2539 + // mark = "-" | "_" | "." | "!" | "~" | "*" | "'" |
1.2540 + // "(" | ")"
1.2541 + private static final long L_MARK = lowMask("-_.!~*'()");
1.2542 + private static final long H_MARK = highMask("-_.!~*'()");
1.2543 +
1.2544 + // unreserved = alphanum | mark
1.2545 + private static final long L_UNRESERVED = L_ALPHANUM | L_MARK;
1.2546 + private static final long H_UNRESERVED = H_ALPHANUM | H_MARK;
1.2547 +
1.2548 + // reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
1.2549 + // "$" | "," | "[" | "]"
1.2550 + // Added per RFC2732: "[", "]"
1.2551 + private static final long L_RESERVED = lowMask(";/?:@&=+$,[]");
1.2552 + private static final long H_RESERVED = highMask(";/?:@&=+$,[]");
1.2553 +
1.2554 + // The zero'th bit is used to indicate that escape pairs and non-US-ASCII
1.2555 + // characters are allowed; this is handled by the scanEscape method below.
1.2556 + private static final long L_ESCAPED = 1L;
1.2557 + private static final long H_ESCAPED = 0L;
1.2558 +
1.2559 + // uric = reserved | unreserved | escaped
1.2560 + private static final long L_URIC = L_RESERVED | L_UNRESERVED | L_ESCAPED;
1.2561 + private static final long H_URIC = H_RESERVED | H_UNRESERVED | H_ESCAPED;
1.2562 +
1.2563 + // pchar = unreserved | escaped |
1.2564 + // ":" | "@" | "&" | "=" | "+" | "$" | ","
1.2565 + private static final long L_PCHAR
1.2566 + = L_UNRESERVED | L_ESCAPED | lowMask(":@&=+$,");
1.2567 + private static final long H_PCHAR
1.2568 + = H_UNRESERVED | H_ESCAPED | highMask(":@&=+$,");
1.2569 +
1.2570 + // All valid path characters
1.2571 + private static final long L_PATH = L_PCHAR | lowMask(";/");
1.2572 + private static final long H_PATH = H_PCHAR | highMask(";/");
1.2573 +
1.2574 + // Dash, for use in domainlabel and toplabel
1.2575 + private static final long L_DASH = lowMask("-");
1.2576 + private static final long H_DASH = highMask("-");
1.2577 +
1.2578 + // Dot, for use in hostnames
1.2579 + private static final long L_DOT = lowMask(".");
1.2580 + private static final long H_DOT = highMask(".");
1.2581 +
1.2582 + // userinfo = *( unreserved | escaped |
1.2583 + // ";" | ":" | "&" | "=" | "+" | "$" | "," )
1.2584 + private static final long L_USERINFO
1.2585 + = L_UNRESERVED | L_ESCAPED | lowMask(";:&=+$,");
1.2586 + private static final long H_USERINFO
1.2587 + = H_UNRESERVED | H_ESCAPED | highMask(";:&=+$,");
1.2588 +
1.2589 + // reg_name = 1*( unreserved | escaped | "$" | "," |
1.2590 + // ";" | ":" | "@" | "&" | "=" | "+" )
1.2591 + private static final long L_REG_NAME
1.2592 + = L_UNRESERVED | L_ESCAPED | lowMask("$,;:@&=+");
1.2593 + private static final long H_REG_NAME
1.2594 + = H_UNRESERVED | H_ESCAPED | highMask("$,;:@&=+");
1.2595 +
1.2596 + // All valid characters for server-based authorities
1.2597 + private static final long L_SERVER
1.2598 + = L_USERINFO | L_ALPHANUM | L_DASH | lowMask(".:@[]");
1.2599 + private static final long H_SERVER
1.2600 + = H_USERINFO | H_ALPHANUM | H_DASH | highMask(".:@[]");
1.2601 +
1.2602 + // Special case of server authority that represents an IPv6 address
1.2603 + // In this case, a % does not signify an escape sequence
1.2604 + private static final long L_SERVER_PERCENT
1.2605 + = L_SERVER | lowMask("%");
1.2606 + private static final long H_SERVER_PERCENT
1.2607 + = H_SERVER | highMask("%");
1.2608 + private static final long L_LEFT_BRACKET = lowMask("[");
1.2609 + private static final long H_LEFT_BRACKET = highMask("[");
1.2610 +
1.2611 + // scheme = alpha *( alpha | digit | "+" | "-" | "." )
1.2612 + private static final long L_SCHEME = L_ALPHA | L_DIGIT | lowMask("+-.");
1.2613 + private static final long H_SCHEME = H_ALPHA | H_DIGIT | highMask("+-.");
1.2614 +
1.2615 + // uric_no_slash = unreserved | escaped | ";" | "?" | ":" | "@" |
1.2616 + // "&" | "=" | "+" | "$" | ","
1.2617 + private static final long L_URIC_NO_SLASH
1.2618 + = L_UNRESERVED | L_ESCAPED | lowMask(";?:@&=+$,");
1.2619 + private static final long H_URIC_NO_SLASH
1.2620 + = H_UNRESERVED | H_ESCAPED | highMask(";?:@&=+$,");
1.2621 +
1.2622 +
1.2623 + // -- Escaping and encoding --
1.2624 +
1.2625 + private final static char[] hexDigits = {
1.2626 + '0', '1', '2', '3', '4', '5', '6', '7',
1.2627 + '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'
1.2628 + };
1.2629 +
1.2630 + private static void appendEscape(StringBuffer sb, byte b) {
1.2631 + sb.append('%');
1.2632 + sb.append(hexDigits[(b >> 4) & 0x0f]);
1.2633 + sb.append(hexDigits[(b >> 0) & 0x0f]);
1.2634 + }
1.2635 +
1.2636 + private static void appendEncoded(StringBuffer sb, char c) {
1.2637 + ByteBuffer bb = null;
1.2638 + try {
1.2639 + bb = ThreadLocalCoders.encoderFor("UTF-8")
1.2640 + .encode(CharBuffer.wrap("" + c));
1.2641 + } catch (CharacterCodingException x) {
1.2642 + assert false;
1.2643 + }
1.2644 + while (bb.hasRemaining()) {
1.2645 + int b = bb.get() & 0xff;
1.2646 + if (b >= 0x80)
1.2647 + appendEscape(sb, (byte)b);
1.2648 + else
1.2649 + sb.append((char)b);
1.2650 + }
1.2651 + }
1.2652 +
1.2653 + // Quote any characters in s that are not permitted
1.2654 + // by the given mask pair
1.2655 + //
1.2656 + private static String quote(String s, long lowMask, long highMask) {
1.2657 + int n = s.length();
1.2658 + StringBuffer sb = null;
1.2659 + boolean allowNonASCII = ((lowMask & L_ESCAPED) != 0);
1.2660 + for (int i = 0; i < s.length(); i++) {
1.2661 + char c = s.charAt(i);
1.2662 + if (c < '\u0080') {
1.2663 + if (!match(c, lowMask, highMask)) {
1.2664 + if (sb == null) {
1.2665 + sb = new StringBuffer();
1.2666 + sb.append(s.substring(0, i));
1.2667 + }
1.2668 + appendEscape(sb, (byte)c);
1.2669 + } else {
1.2670 + if (sb != null)
1.2671 + sb.append(c);
1.2672 + }
1.2673 + } else if (allowNonASCII
1.2674 + && (Character.isSpaceChar(c)
1.2675 + || Character.isISOControl(c))) {
1.2676 + if (sb == null) {
1.2677 + sb = new StringBuffer();
1.2678 + sb.append(s.substring(0, i));
1.2679 + }
1.2680 + appendEncoded(sb, c);
1.2681 + } else {
1.2682 + if (sb != null)
1.2683 + sb.append(c);
1.2684 + }
1.2685 + }
1.2686 + return (sb == null) ? s : sb.toString();
1.2687 + }
1.2688 +
1.2689 + // Encodes all characters >= \u0080 into escaped, normalized UTF-8 octets,
1.2690 + // assuming that s is otherwise legal
1.2691 + //
1.2692 + private static String encode(String s) {
1.2693 + int n = s.length();
1.2694 + if (n == 0)
1.2695 + return s;
1.2696 +
1.2697 + // First check whether we actually need to encode
1.2698 + for (int i = 0;;) {
1.2699 + if (s.charAt(i) >= '\u0080')
1.2700 + break;
1.2701 + if (++i >= n)
1.2702 + return s;
1.2703 + }
1.2704 +
1.2705 + String ns = Normalizer.normalize(s, Normalizer.Form.NFC);
1.2706 + ByteBuffer bb = null;
1.2707 + try {
1.2708 + bb = ThreadLocalCoders.encoderFor("UTF-8")
1.2709 + .encode(CharBuffer.wrap(ns));
1.2710 + } catch (CharacterCodingException x) {
1.2711 + assert false;
1.2712 + }
1.2713 +
1.2714 + StringBuffer sb = new StringBuffer();
1.2715 + while (bb.hasRemaining()) {
1.2716 + int b = bb.get() & 0xff;
1.2717 + if (b >= 0x80)
1.2718 + appendEscape(sb, (byte)b);
1.2719 + else
1.2720 + sb.append((char)b);
1.2721 + }
1.2722 + return sb.toString();
1.2723 + }
1.2724 +
1.2725 + private static int decode(char c) {
1.2726 + if ((c >= '0') && (c <= '9'))
1.2727 + return c - '0';
1.2728 + if ((c >= 'a') && (c <= 'f'))
1.2729 + return c - 'a' + 10;
1.2730 + if ((c >= 'A') && (c <= 'F'))
1.2731 + return c - 'A' + 10;
1.2732 + assert false;
1.2733 + return -1;
1.2734 + }
1.2735 +
1.2736 + private static byte decode(char c1, char c2) {
1.2737 + return (byte)( ((decode(c1) & 0xf) << 4)
1.2738 + | ((decode(c2) & 0xf) << 0));
1.2739 + }
1.2740 +
1.2741 + // Evaluates all escapes in s, applying UTF-8 decoding if needed. Assumes
1.2742 + // that escapes are well-formed syntactically, i.e., of the form %XX. If a
1.2743 + // sequence of escaped octets is not valid UTF-8 then the erroneous octets
1.2744 + // are replaced with '\uFFFD'.
1.2745 + // Exception: any "%" found between "[]" is left alone. It is an IPv6 literal
1.2746 + // with a scope_id
1.2747 + //
1.2748 + private static String decode(String s) {
1.2749 + if (s == null)
1.2750 + return s;
1.2751 + int n = s.length();
1.2752 + if (n == 0)
1.2753 + return s;
1.2754 + if (s.indexOf('%') < 0)
1.2755 + return s;
1.2756 +
1.2757 + StringBuffer sb = new StringBuffer(n);
1.2758 + ByteBuffer bb = ByteBuffer.allocate(n);
1.2759 + CharBuffer cb = CharBuffer.allocate(n);
1.2760 + CharsetDecoder dec = ThreadLocalCoders.decoderFor("UTF-8")
1.2761 + .onMalformedInput(CodingErrorAction.REPLACE)
1.2762 + .onUnmappableCharacter(CodingErrorAction.REPLACE);
1.2763 +
1.2764 + // This is not horribly efficient, but it will do for now
1.2765 + char c = s.charAt(0);
1.2766 + boolean betweenBrackets = false;
1.2767 +
1.2768 + for (int i = 0; i < n;) {
1.2769 + assert c == s.charAt(i); // Loop invariant
1.2770 + if (c == '[') {
1.2771 + betweenBrackets = true;
1.2772 + } else if (betweenBrackets && c == ']') {
1.2773 + betweenBrackets = false;
1.2774 + }
1.2775 + if (c != '%' || betweenBrackets) {
1.2776 + sb.append(c);
1.2777 + if (++i >= n)
1.2778 + break;
1.2779 + c = s.charAt(i);
1.2780 + continue;
1.2781 + }
1.2782 + bb.clear();
1.2783 + int ui = i;
1.2784 + for (;;) {
1.2785 + assert (n - i >= 2);
1.2786 + bb.put(decode(s.charAt(++i), s.charAt(++i)));
1.2787 + if (++i >= n)
1.2788 + break;
1.2789 + c = s.charAt(i);
1.2790 + if (c != '%')
1.2791 + break;
1.2792 + }
1.2793 + bb.flip();
1.2794 + cb.clear();
1.2795 + dec.reset();
1.2796 + CoderResult cr = dec.decode(bb, cb, true);
1.2797 + assert cr.isUnderflow();
1.2798 + cr = dec.flush(cb);
1.2799 + assert cr.isUnderflow();
1.2800 + sb.append(cb.flip().toString());
1.2801 + }
1.2802 +
1.2803 + return sb.toString();
1.2804 + }
1.2805 +
1.2806 +
1.2807 + // -- Parsing --
1.2808 +
1.2809 + // For convenience we wrap the input URI string in a new instance of the
1.2810 + // following internal class. This saves always having to pass the input
1.2811 + // string as an argument to each internal scan/parse method.
1.2812 +
1.2813 + private class Parser {
1.2814 +
1.2815 + private String input; // URI input string
1.2816 + private boolean requireServerAuthority = false;
1.2817 +
1.2818 + Parser(String s) {
1.2819 + input = s;
1.2820 + string = s;
1.2821 + }
1.2822 +
1.2823 + // -- Methods for throwing URISyntaxException in various ways --
1.2824 +
1.2825 + private void fail(String reason) throws URISyntaxException {
1.2826 + throw new URISyntaxException(input, reason);
1.2827 + }
1.2828 +
1.2829 + private void fail(String reason, int p) throws URISyntaxException {
1.2830 + throw new URISyntaxException(input, reason, p);
1.2831 + }
1.2832 +
1.2833 + private void failExpecting(String expected, int p)
1.2834 + throws URISyntaxException
1.2835 + {
1.2836 + fail("Expected " + expected, p);
1.2837 + }
1.2838 +
1.2839 + private void failExpecting(String expected, String prior, int p)
1.2840 + throws URISyntaxException
1.2841 + {
1.2842 + fail("Expected " + expected + " following " + prior, p);
1.2843 + }
1.2844 +
1.2845 +
1.2846 + // -- Simple access to the input string --
1.2847 +
1.2848 + // Return a substring of the input string
1.2849 + //
1.2850 + private String substring(int start, int end) {
1.2851 + return input.substring(start, end);
1.2852 + }
1.2853 +
1.2854 + // Return the char at position p,
1.2855 + // assuming that p < input.length()
1.2856 + //
1.2857 + private char charAt(int p) {
1.2858 + return input.charAt(p);
1.2859 + }
1.2860 +
1.2861 + // Tells whether start < end and, if so, whether charAt(start) == c
1.2862 + //
1.2863 + private boolean at(int start, int end, char c) {
1.2864 + return (start < end) && (charAt(start) == c);
1.2865 + }
1.2866 +
1.2867 + // Tells whether start + s.length() < end and, if so,
1.2868 + // whether the chars at the start position match s exactly
1.2869 + //
1.2870 + private boolean at(int start, int end, String s) {
1.2871 + int p = start;
1.2872 + int sn = s.length();
1.2873 + if (sn > end - p)
1.2874 + return false;
1.2875 + int i = 0;
1.2876 + while (i < sn) {
1.2877 + if (charAt(p++) != s.charAt(i)) {
1.2878 + break;
1.2879 + }
1.2880 + i++;
1.2881 + }
1.2882 + return (i == sn);
1.2883 + }
1.2884 +
1.2885 +
1.2886 + // -- Scanning --
1.2887 +
1.2888 + // The various scan and parse methods that follow use a uniform
1.2889 + // convention of taking the current start position and end index as
1.2890 + // their first two arguments. The start is inclusive while the end is
1.2891 + // exclusive, just as in the String class, i.e., a start/end pair
1.2892 + // denotes the right-open interval [start, end) of the input string.
1.2893 + //
1.2894 + // These methods never proceed past the end position. They may return
1.2895 + // -1 to indicate outright failure, but more often they simply return
1.2896 + // the position of the first char after the last char scanned. Thus
1.2897 + // a typical idiom is
1.2898 + //
1.2899 + // int p = start;
1.2900 + // int q = scan(p, end, ...);
1.2901 + // if (q > p)
1.2902 + // // We scanned something
1.2903 + // ...;
1.2904 + // else if (q == p)
1.2905 + // // We scanned nothing
1.2906 + // ...;
1.2907 + // else if (q == -1)
1.2908 + // // Something went wrong
1.2909 + // ...;
1.2910 +
1.2911 +
1.2912 + // Scan a specific char: If the char at the given start position is
1.2913 + // equal to c, return the index of the next char; otherwise, return the
1.2914 + // start position.
1.2915 + //
1.2916 + private int scan(int start, int end, char c) {
1.2917 + if ((start < end) && (charAt(start) == c))
1.2918 + return start + 1;
1.2919 + return start;
1.2920 + }
1.2921 +
1.2922 + // Scan forward from the given start position. Stop at the first char
1.2923 + // in the err string (in which case -1 is returned), or the first char
1.2924 + // in the stop string (in which case the index of the preceding char is
1.2925 + // returned), or the end of the input string (in which case the length
1.2926 + // of the input string is returned). May return the start position if
1.2927 + // nothing matches.
1.2928 + //
1.2929 + private int scan(int start, int end, String err, String stop) {
1.2930 + int p = start;
1.2931 + while (p < end) {
1.2932 + char c = charAt(p);
1.2933 + if (err.indexOf(c) >= 0)
1.2934 + return -1;
1.2935 + if (stop.indexOf(c) >= 0)
1.2936 + break;
1.2937 + p++;
1.2938 + }
1.2939 + return p;
1.2940 + }
1.2941 +
1.2942 + // Scan a potential escape sequence, starting at the given position,
1.2943 + // with the given first char (i.e., charAt(start) == c).
1.2944 + //
1.2945 + // This method assumes that if escapes are allowed then visible
1.2946 + // non-US-ASCII chars are also allowed.
1.2947 + //
1.2948 + private int scanEscape(int start, int n, char first)
1.2949 + throws URISyntaxException
1.2950 + {
1.2951 + int p = start;
1.2952 + char c = first;
1.2953 + if (c == '%') {
1.2954 + // Process escape pair
1.2955 + if ((p + 3 <= n)
1.2956 + && match(charAt(p + 1), L_HEX, H_HEX)
1.2957 + && match(charAt(p + 2), L_HEX, H_HEX)) {
1.2958 + return p + 3;
1.2959 + }
1.2960 + fail("Malformed escape pair", p);
1.2961 + } else if ((c > 128)
1.2962 + && !Character.isSpaceChar(c)
1.2963 + && !Character.isISOControl(c)) {
1.2964 + // Allow unescaped but visible non-US-ASCII chars
1.2965 + return p + 1;
1.2966 + }
1.2967 + return p;
1.2968 + }
1.2969 +
1.2970 + // Scan chars that match the given mask pair
1.2971 + //
1.2972 + private int scan(int start, int n, long lowMask, long highMask)
1.2973 + throws URISyntaxException
1.2974 + {
1.2975 + int p = start;
1.2976 + while (p < n) {
1.2977 + char c = charAt(p);
1.2978 + if (match(c, lowMask, highMask)) {
1.2979 + p++;
1.2980 + continue;
1.2981 + }
1.2982 + if ((lowMask & L_ESCAPED) != 0) {
1.2983 + int q = scanEscape(p, n, c);
1.2984 + if (q > p) {
1.2985 + p = q;
1.2986 + continue;
1.2987 + }
1.2988 + }
1.2989 + break;
1.2990 + }
1.2991 + return p;
1.2992 + }
1.2993 +
1.2994 + // Check that each of the chars in [start, end) matches the given mask
1.2995 + //
1.2996 + private void checkChars(int start, int end,
1.2997 + long lowMask, long highMask,
1.2998 + String what)
1.2999 + throws URISyntaxException
1.3000 + {
1.3001 + int p = scan(start, end, lowMask, highMask);
1.3002 + if (p < end)
1.3003 + fail("Illegal character in " + what, p);
1.3004 + }
1.3005 +
1.3006 + // Check that the char at position p matches the given mask
1.3007 + //
1.3008 + private void checkChar(int p,
1.3009 + long lowMask, long highMask,
1.3010 + String what)
1.3011 + throws URISyntaxException
1.3012 + {
1.3013 + checkChars(p, p + 1, lowMask, highMask, what);
1.3014 + }
1.3015 +
1.3016 +
1.3017 + // -- Parsing --
1.3018 +
1.3019 + // [<scheme>:]<scheme-specific-part>[#<fragment>]
1.3020 + //
1.3021 + void parse(boolean rsa) throws URISyntaxException {
1.3022 + requireServerAuthority = rsa;
1.3023 + int ssp; // Start of scheme-specific part
1.3024 + int n = input.length();
1.3025 + int p = scan(0, n, "/?#", ":");
1.3026 + if ((p >= 0) && at(p, n, ':')) {
1.3027 + if (p == 0)
1.3028 + failExpecting("scheme name", 0);
1.3029 + checkChar(0, L_ALPHA, H_ALPHA, "scheme name");
1.3030 + checkChars(1, p, L_SCHEME, H_SCHEME, "scheme name");
1.3031 + scheme = substring(0, p);
1.3032 + p++; // Skip ':'
1.3033 + ssp = p;
1.3034 + if (at(p, n, '/')) {
1.3035 + p = parseHierarchical(p, n);
1.3036 + } else {
1.3037 + int q = scan(p, n, "", "#");
1.3038 + if (q <= p)
1.3039 + failExpecting("scheme-specific part", p);
1.3040 + checkChars(p, q, L_URIC, H_URIC, "opaque part");
1.3041 + p = q;
1.3042 + }
1.3043 + } else {
1.3044 + ssp = 0;
1.3045 + p = parseHierarchical(0, n);
1.3046 + }
1.3047 + schemeSpecificPart = substring(ssp, p);
1.3048 + if (at(p, n, '#')) {
1.3049 + checkChars(p + 1, n, L_URIC, H_URIC, "fragment");
1.3050 + fragment = substring(p + 1, n);
1.3051 + p = n;
1.3052 + }
1.3053 + if (p < n)
1.3054 + fail("end of URI", p);
1.3055 + }
1.3056 +
1.3057 + // [//authority]<path>[?<query>]
1.3058 + //
1.3059 + // DEVIATION from RFC2396: We allow an empty authority component as
1.3060 + // long as it's followed by a non-empty path, query component, or
1.3061 + // fragment component. This is so that URIs such as "file:///foo/bar"
1.3062 + // will parse. This seems to be the intent of RFC2396, though the
1.3063 + // grammar does not permit it. If the authority is empty then the
1.3064 + // userInfo, host, and port components are undefined.
1.3065 + //
1.3066 + // DEVIATION from RFC2396: We allow empty relative paths. This seems
1.3067 + // to be the intent of RFC2396, but the grammar does not permit it.
1.3068 + // The primary consequence of this deviation is that "#f" parses as a
1.3069 + // relative URI with an empty path.
1.3070 + //
1.3071 + private int parseHierarchical(int start, int n)
1.3072 + throws URISyntaxException
1.3073 + {
1.3074 + int p = start;
1.3075 + if (at(p, n, '/') && at(p + 1, n, '/')) {
1.3076 + p += 2;
1.3077 + int q = scan(p, n, "", "/?#");
1.3078 + if (q > p) {
1.3079 + p = parseAuthority(p, q);
1.3080 + } else if (q < n) {
1.3081 + // DEVIATION: Allow empty authority prior to non-empty
1.3082 + // path, query component or fragment identifier
1.3083 + } else
1.3084 + failExpecting("authority", p);
1.3085 + }
1.3086 + int q = scan(p, n, "", "?#"); // DEVIATION: May be empty
1.3087 + checkChars(p, q, L_PATH, H_PATH, "path");
1.3088 + path = substring(p, q);
1.3089 + p = q;
1.3090 + if (at(p, n, '?')) {
1.3091 + p++;
1.3092 + q = scan(p, n, "", "#");
1.3093 + checkChars(p, q, L_URIC, H_URIC, "query");
1.3094 + query = substring(p, q);
1.3095 + p = q;
1.3096 + }
1.3097 + return p;
1.3098 + }
1.3099 +
1.3100 + // authority = server | reg_name
1.3101 + //
1.3102 + // Ambiguity: An authority that is a registry name rather than a server
1.3103 + // might have a prefix that parses as a server. We use the fact that
1.3104 + // the authority component is always followed by '/' or the end of the
1.3105 + // input string to resolve this: If the complete authority did not
1.3106 + // parse as a server then we try to parse it as a registry name.
1.3107 + //
1.3108 + private int parseAuthority(int start, int n)
1.3109 + throws URISyntaxException
1.3110 + {
1.3111 + int p = start;
1.3112 + int q = p;
1.3113 + URISyntaxException ex = null;
1.3114 +
1.3115 + boolean serverChars;
1.3116 + boolean regChars;
1.3117 +
1.3118 + if (scan(p, n, "", "]") > p) {
1.3119 + // contains a literal IPv6 address, therefore % is allowed
1.3120 + serverChars = (scan(p, n, L_SERVER_PERCENT, H_SERVER_PERCENT) == n);
1.3121 + } else {
1.3122 + serverChars = (scan(p, n, L_SERVER, H_SERVER) == n);
1.3123 + }
1.3124 + regChars = (scan(p, n, L_REG_NAME, H_REG_NAME) == n);
1.3125 +
1.3126 + if (regChars && !serverChars) {
1.3127 + // Must be a registry-based authority
1.3128 + authority = substring(p, n);
1.3129 + return n;
1.3130 + }
1.3131 +
1.3132 + if (serverChars) {
1.3133 + // Might be (probably is) a server-based authority, so attempt
1.3134 + // to parse it as such. If the attempt fails, try to treat it
1.3135 + // as a registry-based authority.
1.3136 + try {
1.3137 + q = parseServer(p, n);
1.3138 + if (q < n)
1.3139 + failExpecting("end of authority", q);
1.3140 + authority = substring(p, n);
1.3141 + } catch (URISyntaxException x) {
1.3142 + // Undo results of failed parse
1.3143 + userInfo = null;
1.3144 + host = null;
1.3145 + port = -1;
1.3146 + if (requireServerAuthority) {
1.3147 + // If we're insisting upon a server-based authority,
1.3148 + // then just re-throw the exception
1.3149 + throw x;
1.3150 + } else {
1.3151 + // Save the exception in case it doesn't parse as a
1.3152 + // registry either
1.3153 + ex = x;
1.3154 + q = p;
1.3155 + }
1.3156 + }
1.3157 + }
1.3158 +
1.3159 + if (q < n) {
1.3160 + if (regChars) {
1.3161 + // Registry-based authority
1.3162 + authority = substring(p, n);
1.3163 + } else if (ex != null) {
1.3164 + // Re-throw exception; it was probably due to
1.3165 + // a malformed IPv6 address
1.3166 + throw ex;
1.3167 + } else {
1.3168 + fail("Illegal character in authority", q);
1.3169 + }
1.3170 + }
1.3171 +
1.3172 + return n;
1.3173 + }
1.3174 +
1.3175 +
1.3176 + // [<userinfo>@]<host>[:<port>]
1.3177 + //
1.3178 + private int parseServer(int start, int n)
1.3179 + throws URISyntaxException
1.3180 + {
1.3181 + int p = start;
1.3182 + int q;
1.3183 +
1.3184 + // userinfo
1.3185 + q = scan(p, n, "/?#", "@");
1.3186 + if ((q >= p) && at(q, n, '@')) {
1.3187 + checkChars(p, q, L_USERINFO, H_USERINFO, "user info");
1.3188 + userInfo = substring(p, q);
1.3189 + p = q + 1; // Skip '@'
1.3190 + }
1.3191 +
1.3192 + // hostname, IPv4 address, or IPv6 address
1.3193 + if (at(p, n, '[')) {
1.3194 + // DEVIATION from RFC2396: Support IPv6 addresses, per RFC2732
1.3195 + p++;
1.3196 + q = scan(p, n, "/?#", "]");
1.3197 + if ((q > p) && at(q, n, ']')) {
1.3198 + // look for a "%" scope id
1.3199 + int r = scan (p, q, "", "%");
1.3200 + if (r > p) {
1.3201 + parseIPv6Reference(p, r);
1.3202 + if (r+1 == q) {
1.3203 + fail ("scope id expected");
1.3204 + }
1.3205 + checkChars (r+1, q, L_ALPHANUM, H_ALPHANUM,
1.3206 + "scope id");
1.3207 + } else {
1.3208 + parseIPv6Reference(p, q);
1.3209 + }
1.3210 + host = substring(p-1, q+1);
1.3211 + p = q + 1;
1.3212 + } else {
1.3213 + failExpecting("closing bracket for IPv6 address", q);
1.3214 + }
1.3215 + } else {
1.3216 + q = parseIPv4Address(p, n);
1.3217 + if (q <= p)
1.3218 + q = parseHostname(p, n);
1.3219 + p = q;
1.3220 + }
1.3221 +
1.3222 + // port
1.3223 + if (at(p, n, ':')) {
1.3224 + p++;
1.3225 + q = scan(p, n, "", "/");
1.3226 + if (q > p) {
1.3227 + checkChars(p, q, L_DIGIT, H_DIGIT, "port number");
1.3228 + try {
1.3229 + port = Integer.parseInt(substring(p, q));
1.3230 + } catch (NumberFormatException x) {
1.3231 + fail("Malformed port number", p);
1.3232 + }
1.3233 + p = q;
1.3234 + }
1.3235 + }
1.3236 + if (p < n)
1.3237 + failExpecting("port number", p);
1.3238 +
1.3239 + return p;
1.3240 + }
1.3241 +
1.3242 + // Scan a string of decimal digits whose value fits in a byte
1.3243 + //
1.3244 + private int scanByte(int start, int n)
1.3245 + throws URISyntaxException
1.3246 + {
1.3247 + int p = start;
1.3248 + int q = scan(p, n, L_DIGIT, H_DIGIT);
1.3249 + if (q <= p) return q;
1.3250 + if (Integer.parseInt(substring(p, q)) > 255) return p;
1.3251 + return q;
1.3252 + }
1.3253 +
1.3254 + // Scan an IPv4 address.
1.3255 + //
1.3256 + // If the strict argument is true then we require that the given
1.3257 + // interval contain nothing besides an IPv4 address; if it is false
1.3258 + // then we only require that it start with an IPv4 address.
1.3259 + //
1.3260 + // If the interval does not consist of or start with (depending upon
1.3261 + // the strict argument) legal IPv4 address characters, i.e. digits and
1.3262 + // dots, then we return -1 immediately; otherwise we insist that these
1.3263 + // characters parse as a legal IPv4 address and throw an exception on failure.
1.3264 + //
1.3265 + // We assume that any string of decimal digits and dots must be an IPv4
1.3266 + // address. It won't parse as a hostname anyway, so making that
1.3267 + // assumption here allows more meaningful exceptions to be thrown.
1.3268 + //
1.3269 + private int scanIPv4Address(int start, int n, boolean strict)
1.3270 + throws URISyntaxException
1.3271 + {
1.3272 + int p = start;
1.3273 + int q;
1.3274 + int m = scan(p, n, L_DIGIT | L_DOT, H_DIGIT | H_DOT); // m: presumably the end of the leading run of digits and dots
1.3275 + if ((m <= p) || (strict && (m != n))) // empty run, or strict mode and the run does not reach n
1.3276 + return -1;
1.3277 + for (;;) { // not a real loop: the body runs once and every break falls through to fail() below
1.3278 + // Per RFC2732: At most three digits per byte
1.3279 + // Further constraint: Each element fits in a byte
1.3280 + if ((q = scanByte(p, m)) <= p) break; p = q;
1.3281 + if ((q = scan(p, m, '.')) <= p) break; p = q;
1.3282 + if ((q = scanByte(p, m)) <= p) break; p = q;
1.3283 + if ((q = scan(p, m, '.')) <= p) break; p = q;
1.3284 + if ((q = scanByte(p, m)) <= p) break; p = q;
1.3285 + if ((q = scan(p, m, '.')) <= p) break; p = q;
1.3286 + if ((q = scanByte(p, m)) <= p) break; p = q;
1.3287 + if (q < m) break; // digits or dots remain after the fourth byte
1.3288 + return q; // four byte values separated by three dots, nothing left over in the run
1.3289 + }
1.3290 + fail("Malformed IPv4 address", q); // q is the result of the scan step that failed
1.3291 + return -1; // presumably unreachable if fail() always throws; keeps the compiler satisfied
1.3292 + }
1.3293 +
1.3294 + // Take an IPv4 address: Throw an exception if the given interval
1.3295 + // contains anything except an IPv4 address
1.3296 + // (expected is the text passed to failExpecting); on success returns the scanned end index.
1.3297 + private int takeIPv4Address(int start, int n, String expected)
1.3298 + throws URISyntaxException
1.3299 + {
1.3300 + int p = scanIPv4Address(start, n, true); // strict: the whole interval must be digits and dots
1.3301 + if (p <= start) // includes the -1 returned when the interval is not purely digits and dots
1.3302 + failExpecting(expected, start);
1.3303 + return p;
1.3304 + }
1.3305 +
1.3306 + // Attempt to parse an IPv4 address, returning -1 on failure but
1.3307 + // allowing the given interval to contain [:<characters>] after
1.3308 + // the IPv4 address. On success the host field is set to the address text.
1.3309 + //
1.3310 + private int parseIPv4Address(int start, int n) {
1.3311 + int p;
1.3312 +
1.3313 + try {
1.3314 + p = scanIPv4Address(start, n, false); // non-strict: only a leading IPv4 address is required
1.3315 + } catch (URISyntaxException x) {
1.3316 + return -1; // malformed address is reported as "not an IPv4 address" rather than as an error
1.3317 + } catch (NumberFormatException nfe) {
1.3318 + return -1; // raised by Integer.parseInt in scanByte when a digit run overflows int
1.3319 + }
1.3320 +
1.3321 + if (p > start && p < n) {
1.3322 + // IPv4 address is followed by something - check that
1.3323 + // it's a ":" as this is the only valid character to
1.3324 + // follow an address.
1.3325 + if (charAt(p) != ':') {
1.3326 + p = -1;
1.3327 + }
1.3328 + }
1.3329 +
1.3330 + if (p > start)
1.3331 + host = substring(start, p); // record the host only when an address was actually accepted
1.3332 +
1.3333 + return p;
1.3334 + }
1.3335 +
1.3336 + // hostname = domainlabel [ "." ] | 1*( domainlabel "." ) toplabel [ "." ]
1.3337 + // domainlabel = alphanum | alphanum *( alphanum | "-" ) alphanum
1.3338 + // toplabel = alpha | alpha *( alphanum | "-" ) alphanum
1.3339 + // On success sets the host field and returns the index where the hostname ends.
1.3340 + private int parseHostname(int start, int n)
1.3341 + throws URISyntaxException
1.3342 + {
1.3343 + int p = start;
1.3344 + int q;
1.3345 + int l = -1; // Start of last parsed label
1.3346 +
1.3347 + do {
1.3348 + // domainlabel = alphanum [ *( alphanum | "-" ) alphanum ]
1.3349 + q = scan(p, n, L_ALPHANUM, H_ALPHANUM);
1.3350 + if (q <= p)
1.3351 + break;
1.3352 + l = p;
1.3353 + if (q > p) { // always true here: the q <= p case broke out of the loop above
1.3354 + p = q;
1.3355 + q = scan(p, n, L_ALPHANUM | L_DASH, H_ALPHANUM | H_DASH);
1.3356 + if (q > p) {
1.3357 + if (charAt(q - 1) == '-') // a label may not end with '-'
1.3358 + fail("Illegal character in hostname", q - 1);
1.3359 + p = q;
1.3360 + }
1.3361 + }
1.3362 + q = scan(p, n, '.'); // a dot after the label; without one the hostname ends here
1.3363 + if (q <= p)
1.3364 + break;
1.3365 + p = q;
1.3366 + } while (p < n);
1.3367 +
1.3368 + if ((p < n) && !at(p, n, ':')) // anything left over must begin with ':'
1.3369 + fail("Illegal character in hostname", p);
1.3370 +
1.3371 + if (l < 0) // no label was scanned at all
1.3372 + failExpecting("hostname", start);
1.3373 +
1.3374 + // for a fully qualified hostname check that the rightmost
1.3375 + // label starts with an alpha character.
1.3376 + if (l > start && !match(charAt(l), L_ALPHA, H_ALPHA)) { // l > start: the last label is not the first one
1.3377 + fail("Illegal character in hostname", l);
1.3378 + }
1.3379 +
1.3380 + host = substring(start, p);
1.3381 + return p;
1.3382 + }
1.3383 +
1.3384 +
1.3385 + // IPv6 address parsing, from RFC2373: IPv6 Addressing Architecture
1.3386 + //
1.3387 + // Bug: The grammar in RFC2373 Appendix B does not allow addresses of
1.3388 + // the form ::12.34.56.78, which are clearly shown in the examples
1.3389 + // earlier in the document. Here is the original grammar:
1.3390 + //
1.3391 + // IPv6address = hexpart [ ":" IPv4address ]
1.3392 + // hexpart = hexseq | hexseq "::" [ hexseq ] | "::" [ hexseq ]
1.3393 + // hexseq = hex4 *( ":" hex4)
1.3394 + // hex4 = 1*4HEXDIG
1.3395 + //
1.3396 + // We therefore use the following revised grammar:
1.3397 + //
1.3398 + // IPv6address = hexseq [ ":" IPv4address ]
1.3399 + // | hexseq [ "::" [ hexpost ] ]
1.3400 + // | "::" [ hexpost ]
1.3401 + // hexpost = hexseq | hexseq ":" IPv4address | IPv4address
1.3402 + // hexseq = hex4 *( ":" hex4)
1.3403 + // hex4 = 1*4HEXDIG
1.3404 + //
1.3405 + // This covers all and only the following cases:
1.3406 + //
1.3407 + // hexseq
1.3408 + // hexseq : IPv4address
1.3409 + // hexseq ::
1.3410 + // hexseq :: hexseq
1.3411 + // hexseq :: hexseq : IPv4address
1.3412 + // hexseq :: IPv4address
1.3413 + // :: hexseq
1.3414 + // :: hexseq : IPv4address
1.3415 + // :: IPv4address
1.3416 + // ::
1.3417 + //
1.3418 + // Additionally we constrain the IPv6 address as follows :-
1.3419 + //
1.3420 + // i. IPv6 addresses without compressed zeros should contain
1.3421 + // exactly 16 bytes.
1.3422 + //
1.3423 + // ii. IPv6 addresses with compressed zeros should contain
1.3424 + // fewer than 16 bytes.
1.3425 +
1.3426 + private int ipv6byteCount = 0; // bytes counted so far: +2 per hex group in scanHexSeq, +4 per embedded IPv4 address; NOTE(review): never reset in the code visible here
1.3427 +
1.3428 + private int parseIPv6Reference(int start, int n) // Check the text between start and n against the revised IPv6 grammar and the byte-count constraints
1.3429 + throws URISyntaxException
1.3430 + {
1.3431 + int p = start;
1.3432 + int q;
1.3433 + boolean compressedZeros = false; // set once a "::" has been seen
1.3434 +
1.3435 + q = scanHexSeq(p, n); // -1 when the text does not begin with a hex sequence
1.3436 +
1.3437 + if (q > p) {
1.3438 + p = q;
1.3439 + if (at(p, n, "::")) { // hexseq "::" [ hexpost ]
1.3440 + compressedZeros = true;
1.3441 + p = scanHexPost(p + 2, n);
1.3442 + } else if (at(p, n, ':')) { // hexseq ":" IPv4address
1.3443 + p = takeIPv4Address(p + 1, n, "IPv4 address");
1.3444 + ipv6byteCount += 4; // an embedded IPv4 address contributes four bytes
1.3445 + }
1.3446 + } else if (at(p, n, "::")) { // "::" [ hexpost ]
1.3447 + compressedZeros = true;
1.3448 + p = scanHexPost(p + 2, n);
1.3449 + }
1.3450 + if (p < n) // something is left over after the address
1.3451 + fail("Malformed IPv6 address", start);
1.3452 + if (ipv6byteCount > 16)
1.3453 + fail("IPv6 address too long", start);
1.3454 + if (!compressedZeros && ipv6byteCount < 16) // without "::" all 16 bytes must be written out
1.3455 + fail("IPv6 address too short", start);
1.3456 + if (compressedZeros && ipv6byteCount == 16) // "::" combined with a full 16 bytes is rejected
1.3457 + fail("Malformed IPv6 address", start);
1.3458 +
1.3459 + return p;
1.3460 + }
1.3461 +
1.3462 + private int scanHexPost(int start, int n) // Scan the part after "::" : nothing, hexseq, hexseq ":" IPv4address, or IPv4address
1.3463 + throws URISyntaxException
1.3464 + {
1.3465 + int p = start;
1.3466 + int q;
1.3467 +
1.3468 + if (p == n) // nothing follows the "::"
1.3469 + return p;
1.3470 +
1.3471 + q = scanHexSeq(p, n); // -1 when no hex sequence starts here
1.3472 + if (q > p) {
1.3473 + p = q;
1.3474 + if (at(p, n, ':')) { // hexseq ":" IPv4address
1.3475 + p++;
1.3476 + p = takeIPv4Address(p, n, "hex digits or IPv4 address");
1.3477 + ipv6byteCount += 4; // an embedded IPv4 address contributes four bytes
1.3478 + }
1.3479 + } else { // no hex sequence, so the rest must be an IPv4 address
1.3480 + p = takeIPv4Address(p, n, "hex digits or IPv4 address");
1.3481 + ipv6byteCount += 4;
1.3482 + }
1.3483 + return p;
1.3484 + }
1.3485 +
1.3486 + // Scan a hex sequence; return -1 if one could not be scanned
1.3487 + // Otherwise returns the index where scanning stopped; adds 2 to ipv6byteCount for each hex group accepted.
1.3488 + private int scanHexSeq(int start, int n)
1.3489 + throws URISyntaxException
1.3490 + {
1.3491 + int p = start;
1.3492 + int q;
1.3493 +
1.3494 + q = scan(p, n, L_HEX, H_HEX);
1.3495 + if (q <= p) // no hex digits at all
1.3496 + return -1;
1.3497 + if (at(q, n, '.')) // Beginning of IPv4 address
1.3498 + return -1;
1.3499 + if (q > p + 4) // more than four hex digits in one group
1.3500 + fail("IPv6 hexadecimal digit sequence too long", p);
1.3501 + ipv6byteCount += 2; // each hex group counts as two bytes
1.3502 + p = q;
1.3503 + while (p < n) {
1.3504 + if (!at(p, n, ':'))
1.3505 + break;
1.3506 + if (at(p + 1, n, ':'))
1.3507 + break; // "::"
1.3508 + p++; // step over the single ':'
1.3509 + q = scan(p, n, L_HEX, H_HEX);
1.3510 + if (q <= p)
1.3511 + failExpecting("digits for an IPv6 address", p);
1.3512 + if (at(q, n, '.')) { // Beginning of IPv4 address
1.3513 + p--; // back up onto the ':' so the caller sees ":" followed by the IPv4 address
1.3514 + break;
1.3515 + }
1.3516 + if (q > p + 4) // more than four hex digits in one group
1.3517 + fail("IPv6 hexadecimal digit sequence too long", p);
1.3518 + ipv6byteCount += 2;
1.3519 + p = q;
1.3520 + }
1.3521 +
1.3522 + return p;
1.3523 + }
1.3524 +
1.3525 + }
1.3526 +
1.3527 +}