hg/bck2brwsr: emul/compact/src/main/java/java/net/URI.java@724f3e1ea53e

     1 /*

     2  * Copyright (c) 2000, 2011, Oracle and/or its affiliates. All rights reserved.

     3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.

     4  *

     5  * This code is free software; you can redistribute it and/or modify it

     6  * under the terms of the GNU General Public License version 2 only, as

     7  * published by the Free Software Foundation.  Oracle designates this

     8  * particular file as subject to the "Classpath" exception as provided

     9  * by Oracle in the LICENSE file that accompanied this code.

    10  *

    11  * This code is distributed in the hope that it will be useful, but WITHOUT

    12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or

    13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License

    14  * version 2 for more details (a copy is included in the LICENSE file that

    15  * accompanied this code).

    16  *

    17  * You should have received a copy of the GNU General Public License version

    18  * 2 along with this work; if not, write to the Free Software Foundation,

    19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.

    20  *

    21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA

    22  * or visit www.oracle.com if you need additional information or have any

    23  * questions.

    24  */

    26 package java.net;

    28 import java.io.IOException;

    29 import java.io.InvalidObjectException;

    30 import java.io.ObjectInputStream;

    31 import java.io.ObjectOutputStream;

    32 import java.io.Serializable;

    33 import java.nio.ByteBuffer;

    34 import java.nio.CharBuffer;

    35 import java.nio.charset.CharsetDecoder;

    36 import java.nio.charset.CharsetEncoder;

    37 import java.nio.charset.CoderResult;

    38 import java.nio.charset.CodingErrorAction;

    39 import java.nio.charset.CharacterCodingException;

    40 import java.text.Normalizer;

    41 import sun.nio.cs.ThreadLocalCoders;

    43 import java.lang.Character;             // for javadoc

    44 import java.lang.NullPointerException;  // for javadoc

    47 /**

    48  * Represents a Uniform Resource Identifier (URI) reference.

    49  *

    50  * <p> Aside from some minor deviations noted below, an instance of this

    51  * class represents a URI reference as defined by

    52  * <a href="http://www.ietf.org/rfc/rfc2396.txt"><i>RFC&nbsp;2396: Uniform

    53  * Resource Identifiers (URI): Generic Syntax</i></a>, amended by <a

    54  * href="http://www.ietf.org/rfc/rfc2732.txt"><i>RFC&nbsp;2732: Format for

    55  * Literal IPv6 Addresses in URLs</i></a>. The Literal IPv6 address format

    56  * also supports scope_ids. The syntax and usage of scope_ids is described

    57  * <a href="Inet6Address.html#scoped">here</a>.

    58  * This class provides constructors for creating URI instances from

    59  * their components or by parsing their string forms, methods for accessing the

    60  * various components of an instance, and methods for normalizing, resolving,

    61  * and relativizing URI instances.  Instances of this class are immutable.

    62  *

    63  *

    64  * <h4> URI syntax and components </h4>

    65  *

    66  * At the highest level a URI reference (hereinafter simply "URI") in string

    67  * form has the syntax

    68  *

    69  * <blockquote>

    70  * [<i>scheme</i><tt><b>:</b></tt>]<i>scheme-specific-part</i>[<tt><b>#</b></tt><i>fragment</i>]

    71  * </blockquote>

    72  *

    73  * where square brackets [...] delineate optional components and the characters

    74  * <tt><b>:</b></tt> and <tt><b>#</b></tt> stand for themselves.

    75  *

    76  * <p> An <i>absolute</i> URI specifies a scheme; a URI that is not absolute is

    77  * said to be <i>relative</i>.  URIs are also classified according to whether

    78  * they are <i>opaque</i> or <i>hierarchical</i>.

    79  *

    80  * <p> An <i>opaque</i> URI is an absolute URI whose scheme-specific part does

    81  * not begin with a slash character (<tt>'/'</tt>).  Opaque URIs are not

    82  * subject to further parsing.  Some examples of opaque URIs are:

    83  *

    84  * <blockquote><table cellpadding=0 cellspacing=0 summary="layout">

    85  * <tr><td><tt>mailto:java-net@java.sun.com</tt></td></tr>

    86  * <tr><td><tt>news:comp.lang.java</tt></td></tr>

    87  * <tr><td><tt>urn:isbn:096139210x</tt></td></tr>

    88  * </table></blockquote>

    89  *

    90  * <p> A <i>hierarchical</i> URI is either an absolute URI whose

    91  * scheme-specific part begins with a slash character, or a relative URI, that

    92  * is, a URI that does not specify a scheme.  Some examples of hierarchical

    93  * URIs are:

    94  *

    95  * <blockquote>

    96  * <tt>http://java.sun.com/j2se/1.3/</tt><br>

    97  * <tt>docs/guide/collections/designfaq.html#28</tt><br>

    98  * <tt>../../../demo/jfc/SwingSet2/src/SwingSet2.java</tt><br>

    99  * <tt>file:///~/calendar</tt>

   100  * </blockquote>

   101  *

   102  * <p> A hierarchical URI is subject to further parsing according to the syntax

   103  *

   104  * <blockquote>

   105  * [<i>scheme</i><tt><b>:</b></tt>][<tt><b>//</b></tt><i>authority</i>][<i>path</i>][<tt><b>?</b></tt><i>query</i>][<tt><b>#</b></tt><i>fragment</i>]

   106  * </blockquote>

   107  *

   108  * where the characters <tt><b>:</b></tt>, <tt><b>/</b></tt>,

   109  * <tt><b>?</b></tt>, and <tt><b>#</b></tt> stand for themselves.  The

   110  * scheme-specific part of a hierarchical URI consists of the characters

   111  * between the scheme and fragment components.

   112  *

   113  * <p> The authority component of a hierarchical URI is, if specified, either

   114  * <i>server-based</i> or <i>registry-based</i>.  A server-based authority

   115  * parses according to the familiar syntax

   116  *

   117  * <blockquote>

   118  * [<i>user-info</i><tt><b>@</b></tt>]<i>host</i>[<tt><b>:</b></tt><i>port</i>]

   119  * </blockquote>

   120  *

   121  * where the characters <tt><b>@</b></tt> and <tt><b>:</b></tt> stand for

   122  * themselves.  Nearly all URI schemes currently in use are server-based.  An

   123  * authority component that does not parse in this way is considered to be

   124  * registry-based.

   125  *

   126  * <p> The path component of a hierarchical URI is itself said to be absolute

   127  * if it begins with a slash character (<tt>'/'</tt>); otherwise it is

   128  * relative.  The path of a hierarchical URI that is either absolute or

   129  * specifies an authority is always absolute.

   130  *

   131  * <p> All told, then, a URI instance has the following nine components:

   132  *

   133  * <blockquote><table summary="Describes the components of a URI:scheme,scheme-specific-part,authority,user-info,host,port,path,query,fragment">

   134  * <tr><th><i>Component</i></th><th><i>Type</i></th></tr>

   135  * <tr><td>scheme</td><td><tt>String</tt></td></tr>

   136  * <tr><td>scheme-specific-part&nbsp;&nbsp;&nbsp;&nbsp;</td><td><tt>String</tt></td></tr>

   137  * <tr><td>authority</td><td><tt>String</tt></td></tr>

   138  * <tr><td>user-info</td><td><tt>String</tt></td></tr>

   139  * <tr><td>host</td><td><tt>String</tt></td></tr>

   140  * <tr><td>port</td><td><tt>int</tt></td></tr>

   141  * <tr><td>path</td><td><tt>String</tt></td></tr>

   142  * <tr><td>query</td><td><tt>String</tt></td></tr>

   143  * <tr><td>fragment</td><td><tt>String</tt></td></tr>

   144  * </table></blockquote>

   145  *

   146  * In a given instance any particular component is either <i>undefined</i> or

   147  * <i>defined</i> with a distinct value.  Undefined string components are

   148  * represented by <tt>null</tt>, while undefined integer components are

   149  * represented by <tt>-1</tt>.  A string component may be defined to have the

   150  * empty string as its value; this is not equivalent to that component being

   151  * undefined.

   152  *

   153  * <p> Whether a particular component is or is not defined in an instance

   154  * depends upon the type of the URI being represented.  An absolute URI has a

   155  * scheme component.  An opaque URI has a scheme, a scheme-specific part, and

   156  * possibly a fragment, but has no other components.  A hierarchical URI always

   157  * has a path (though it may be empty) and a scheme-specific-part (which at

   158  * least contains the path), and may have any of the other components.  If the

   159  * authority component is present and is server-based then the host component

   160  * will be defined and the user-information and port components may be defined.

   161  *

   162  *

   163  * <h4> Operations on URI instances </h4>

   164  *

   165  * The key operations supported by this class are those of

   166  * <i>normalization</i>, <i>resolution</i>, and <i>relativization</i>.

   167  *

   168  * <p> <i>Normalization</i> is the process of removing unnecessary <tt>"."</tt>

   169  * and <tt>".."</tt> segments from the path component of a hierarchical URI.

   170  * Each <tt>"."</tt> segment is simply removed.  A <tt>".."</tt> segment is

   171  * removed only if it is preceded by a non-<tt>".."</tt> segment.

   172  * Normalization has no effect upon opaque URIs.

   173  *

   174  * <p> <i>Resolution</i> is the process of resolving one URI against another,

   175  * <i>base</i> URI.  The resulting URI is constructed from components of both

   176  * URIs in the manner specified by RFC&nbsp;2396, taking components from the

   177  * base URI for those not specified in the original.  For hierarchical URIs,

   178  * the path of the original is resolved against the path of the base and then

   179  * normalized.  The result, for example, of resolving

   180  *

   181  * <blockquote>

   182  * <tt>docs/guide/collections/designfaq.html#28&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</tt>(1)

   183  * </blockquote>

   184  *

   185  * against the base URI <tt>http://java.sun.com/j2se/1.3/</tt> is the result

   186  * URI

   187  *

   188  * <blockquote>

   189  * <tt>http://java.sun.com/j2se/1.3/docs/guide/collections/designfaq.html#28</tt>

   190  * </blockquote>

   191  *

   192  * Resolving the relative URI

   193  *

   194  * <blockquote>

   195  * <tt>../../../demo/jfc/SwingSet2/src/SwingSet2.java&nbsp;&nbsp;&nbsp;&nbsp;</tt>(2)

   196  * </blockquote>

   197  *

   198  * against this result yields, in turn,

   199  *

   200  * <blockquote>

   201  * <tt>http://java.sun.com/j2se/1.3/demo/jfc/SwingSet2/src/SwingSet2.java</tt>

   202  * </blockquote>

   203  *

   204  * Resolution of both absolute and relative URIs, and of both absolute and

   205  * relative paths in the case of hierarchical URIs, is supported.  Resolving

   206  * the URI <tt>file:///~/calendar</tt> against any other URI simply yields the

   207  * original URI, since it is absolute.  Resolving the relative URI (2) above

   208  * against the relative base URI (1) yields the normalized, but still relative,

   209  * URI

   210  *

   211  * <blockquote>

   212  * <tt>demo/jfc/SwingSet2/src/SwingSet2.java</tt>

   213  * </blockquote>

   214  *

   215  * <p> <i>Relativization</i>, finally, is the inverse of resolution: For any

   216  * two normalized URIs <i>u</i> and&nbsp;<i>v</i>,

   217  *

   218  * <blockquote>

   219  *   <i>u</i><tt>.relativize(</tt><i>u</i><tt>.resolve(</tt><i>v</i><tt>)).equals(</tt><i>v</i><tt>)</tt>&nbsp;&nbsp;and<br>

   220  *   <i>u</i><tt>.resolve(</tt><i>u</i><tt>.relativize(</tt><i>v</i><tt>)).equals(</tt><i>v</i><tt>)</tt>&nbsp;&nbsp;.<br>

   221  * </blockquote>

   222  *

   223  * This operation is often useful when constructing a document containing URIs

   224  * that must be made relative to the base URI of the document wherever

   225  * possible.  For example, relativizing the URI

   226  *

   227  * <blockquote>

   228  * <tt>http://java.sun.com/j2se/1.3/docs/guide/index.html</tt>

   229  * </blockquote>

   230  *

   231  * against the base URI

   232  *

   233  * <blockquote>

   234  * <tt>http://java.sun.com/j2se/1.3</tt>

   235  * </blockquote>

   236  *

   237  * yields the relative URI <tt>docs/guide/index.html</tt>.

   238  *

   239  *

   240  * <h4> Character categories </h4>

   241  *

   242  * RFC&nbsp;2396 specifies precisely which characters are permitted in the

   243  * various components of a URI reference.  The following categories, most of

   244  * which are taken from that specification, are used below to describe these

   245  * constraints:

   246  *

   247  * <blockquote><table cellspacing=2 summary="Describes categories alpha,digit,alphanum,unreserved,punct,reserved,escaped,and other">

   248  *   <tr><th valign=top><i>alpha</i></th>

   249  *       <td>The US-ASCII alphabetic characters,

   250  *        <tt>'A'</tt>&nbsp;through&nbsp;<tt>'Z'</tt>

   251  *        and <tt>'a'</tt>&nbsp;through&nbsp;<tt>'z'</tt></td></tr>

   252  *   <tr><th valign=top><i>digit</i></th>

   253  *       <td>The US-ASCII decimal digit characters,

   254  *       <tt>'0'</tt>&nbsp;through&nbsp;<tt>'9'</tt></td></tr>

   255  *   <tr><th valign=top><i>alphanum</i></th>

   256  *       <td>All <i>alpha</i> and <i>digit</i> characters</td></tr>

   257  *   <tr><th valign=top><i>unreserved</i>&nbsp;&nbsp;&nbsp;&nbsp;</th>

   258  *       <td>All <i>alphanum</i> characters together with those in the string

   259  *        <tt>"_-!.~'()*"</tt></td></tr>

   260  *   <tr><th valign=top><i>punct</i></th>

   261  *       <td>The characters in the string <tt>",;:$&+="</tt></td></tr>

   262  *   <tr><th valign=top><i>reserved</i></th>

   263  *       <td>All <i>punct</i> characters together with those in the string

   264  *        <tt>"?/[]@"</tt></td></tr>

   265  *   <tr><th valign=top><i>escaped</i></th>

   266  *       <td>Escaped octets, that is, triplets consisting of the percent

   267  *           character (<tt>'%'</tt>) followed by two hexadecimal digits

   268  *           (<tt>'0'</tt>-<tt>'9'</tt>, <tt>'A'</tt>-<tt>'F'</tt>, and

   269  *           <tt>'a'</tt>-<tt>'f'</tt>)</td></tr>

   270  *   <tr><th valign=top><i>other</i></th>

   271  *       <td>The Unicode characters that are not in the US-ASCII character set,

   272  *           are not control characters (according to the {@link

   273  *           java.lang.Character#isISOControl(char) Character.isISOControl}

   274  *           method), and are not space characters (according to the {@link

   275  *           java.lang.Character#isSpaceChar(char) Character.isSpaceChar}

   276  *           method)&nbsp;&nbsp;<i>(<b>Deviation from RFC 2396</b>, which is

   277  *           limited to US-ASCII)</i></td></tr>

   278  * </table></blockquote>

   279  *

   280  * <p><a name="legal-chars"></a> The set of all legal URI characters consists of

   281  * the <i>unreserved</i>, <i>reserved</i>, <i>escaped</i>, and <i>other</i>

   282  * characters.

   283  *

   284  *

   285  * <h4> Escaped octets, quotation, encoding, and decoding </h4>

   286  *

   287  * RFC 2396 allows escaped octets to appear in the user-info, path, query, and

   288  * fragment components.  Escaping serves two purposes in URIs:

   289  *

   290  * <ul>

   291  *

   292  *   <li><p> To <i>encode</i> non-US-ASCII characters when a URI is required to

   293  *   conform strictly to RFC&nbsp;2396 by not containing any <i>other</i>

   294  *   characters.  </p></li>

   295  *

   296  *   <li><p> To <i>quote</i> characters that are otherwise illegal in a

   297  *   component.  The user-info, path, query, and fragment components differ

   298  *   slightly in terms of which characters are considered legal and illegal.

   299  *   </p></li>

   300  *

   301  * </ul>

   302  *

   303  * These purposes are served in this class by three related operations:

   304  *

   305  * <ul>

   306  *

   307  *   <li><p><a name="encode"></a> A character is <i>encoded</i> by replacing it

   308  *   with the sequence of escaped octets that represent that character in the

   309  *   UTF-8 character set.  The Euro currency symbol (<tt>'&#92;u20AC'</tt>),

   310  *   for example, is encoded as <tt>"%E2%82%AC"</tt>.  <i>(<b>Deviation from

   311  *   RFC&nbsp;2396</b>, which does not specify any particular character

   312  *   set.)</i> </p></li>

   313  *

   314  *   <li><p><a name="quote"></a> An illegal character is <i>quoted</i> simply by

   315  *   encoding it.  The space character, for example, is quoted by replacing it

   316  *   with <tt>"%20"</tt>.  UTF-8 contains US-ASCII, hence for US-ASCII

   317  *   characters this transformation has exactly the effect required by

   318  *   RFC&nbsp;2396. </p></li>

   319  *

   320  *   <li><p><a name="decode"></a>

   321  *   A sequence of escaped octets is <i>decoded</i> by

   322  *   replacing it with the sequence of characters that it represents in the

   323  *   UTF-8 character set.  UTF-8 contains US-ASCII, hence decoding has the

   324  *   effect of de-quoting any quoted US-ASCII characters as well as that of

   325  *   decoding any encoded non-US-ASCII characters.  If a <a

   326  *   href="../nio/charset/CharsetDecoder.html#ce">decoding error</a> occurs

   327  *   when decoding the escaped octets then the erroneous octets are replaced by

   328  *   <tt>'&#92;uFFFD'</tt>, the Unicode replacement character.  </p></li>

   329  *

   330  * </ul>

   331  *

   332  * These operations are exposed in the constructors and methods of this class

   333  * as follows:

   334  *

   335  * <ul>

   336  *

   337  *   <li><p> The {@link #URI(java.lang.String) <code>single-argument

   338  *   constructor</code>} requires any illegal characters in its argument to be

   339  *   quoted and preserves any escaped octets and <i>other</i> characters that

   340  *   are present.  </p></li>

   341  *

   342  *   <li><p> The {@link

   343  *   #URI(java.lang.String,java.lang.String,java.lang.String,int,java.lang.String,java.lang.String,java.lang.String)

   344  *   <code>multi-argument constructors</code>} quote illegal characters as

   345  *   required by the components in which they appear.  The percent character

   346  *   (<tt>'%'</tt>) is always quoted by these constructors.  Any <i>other</i>

   347  *   characters are preserved.  </p></li>

   348  *

   349  *   <li><p> The {@link #getRawUserInfo() getRawUserInfo}, {@link #getRawPath()

   350  *   getRawPath}, {@link #getRawQuery() getRawQuery}, {@link #getRawFragment()

   351  *   getRawFragment}, {@link #getRawAuthority() getRawAuthority}, and {@link

   352  *   #getRawSchemeSpecificPart() getRawSchemeSpecificPart} methods return the

   353  *   values of their corresponding components in raw form, without interpreting

   354  *   any escaped octets.  The strings returned by these methods may contain

   355  *   both escaped octets and <i>other</i> characters, and will not contain any

   356  *   illegal characters.  </p></li>

   357  *

   358  *   <li><p> The {@link #getUserInfo() getUserInfo}, {@link #getPath()

   359  *   getPath}, {@link #getQuery() getQuery}, {@link #getFragment()

   360  *   getFragment}, {@link #getAuthority() getAuthority}, and {@link

   361  *   #getSchemeSpecificPart() getSchemeSpecificPart} methods decode any escaped

   362  *   octets in their corresponding components.  The strings returned by these

   363  *   methods may contain both <i>other</i> characters and illegal characters,

   364  *   and will not contain any escaped octets.  </p></li>

   365  *

   366  *   <li><p> The {@link #toString() toString} method returns a URI string with

   367  *   all necessary quotation but which may contain <i>other</i> characters.

   368  *   </p></li>

   369  *

   370  *   <li><p> The {@link #toASCIIString() toASCIIString} method returns a fully

   371  *   quoted and encoded URI string that does not contain any <i>other</i>

   372  *   characters.  </p></li>

   373  *

   374  * </ul>

   375  *

   376  *

   377  * <h4> Identities </h4>

   378  *

   379  * For any URI <i>u</i>, it is always the case that

   380  *

   381  * <blockquote>

   382  * <tt>new URI(</tt><i>u</i><tt>.toString()).equals(</tt><i>u</i><tt>)</tt>&nbsp;.

   383  * </blockquote>

   384  *

   385  * For any URI <i>u</i> that does not contain redundant syntax such as two

   386  * slashes before an empty authority (as in <tt>file:///tmp/</tt>&nbsp;) or a

   387  * colon following a host name but no port (as in

   388  * <tt>http://java.sun.com:</tt>&nbsp;), and that does not encode characters

   389  * except those that must be quoted, the following identities also hold:

   390  *

   391  * <blockquote>

   392  * <tt>new URI(</tt><i>u</i><tt>.getScheme(),<br>

   393  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</tt><i>u</i><tt>.getSchemeSpecificPart(),<br>

   394  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</tt><i>u</i><tt>.getFragment())<br>

   395  * .equals(</tt><i>u</i><tt>)</tt>

   396  * </blockquote>

   397  *

   398  * in all cases,

   399  *

   400  * <blockquote>

   401  * <tt>new URI(</tt><i>u</i><tt>.getScheme(),<br>

   402  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</tt><i>u</i><tt>.getAuthority(),<br>

   403  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</tt><i>u</i><tt>.getPath(),&nbsp;</tt><i>u</i><tt>.getQuery(),<br>

   404  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</tt><i>u</i><tt>.getFragment())<br>

   405  * .equals(</tt><i>u</i><tt>)</tt>

   406  * </blockquote>

   407  *

   408  * if <i>u</i> is hierarchical, and

   409  *

   410  * <blockquote>

   411  * <tt>new URI(</tt><i>u</i><tt>.getScheme(),<br>

   412  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</tt><i>u</i><tt>.getUserInfo(),&nbsp;</tt><i>u</i><tt>.getHost(),&nbsp;</tt><i>u</i><tt>.getPort(),<br>

   413  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</tt><i>u</i><tt>.getPath(),&nbsp;</tt><i>u</i><tt>.getQuery(),<br>

   414  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</tt><i>u</i><tt>.getFragment())<br>

   415  * .equals(</tt><i>u</i><tt>)</tt>

   416  * </blockquote>

   417  *

   418  * if <i>u</i> is hierarchical and has either no authority or a server-based

   419  * authority.

   420  *

   421  *

   422  * <h4> URIs, URLs, and URNs </h4>

   423  *

   424  * A URI is a uniform resource <i>identifier</i> while a URL is a uniform

   425  * resource <i>locator</i>.  Hence every URL is a URI, abstractly speaking, but

   426  * not every URI is a URL.  This is because there is another subcategory of

   427  * URIs, uniform resource <i>names</i> (URNs), which name resources but do not

   428  * specify how to locate them.  The <tt>mailto</tt>, <tt>news</tt>, and

   429  * <tt>isbn</tt> URIs shown above are examples of URNs.

   430  *

   431  * <p> The conceptual distinction between URIs and URLs is reflected in the

   432  * differences between this class and the {@link URL} class.

   433  *

   434  * <p> An instance of this class represents a URI reference in the syntactic

   435  * sense defined by RFC&nbsp;2396.  A URI may be either absolute or relative.

   436  * A URI string is parsed according to the generic syntax without regard to the

   437  * scheme, if any, that it specifies.  No lookup of the host, if any, is

   438  * performed, and no scheme-dependent stream handler is constructed.  Equality,

   439  * hashing, and comparison are defined strictly in terms of the character

   440  * content of the instance.  In other words, a URI instance is little more than

   441  * a structured string that supports the syntactic, scheme-independent

   442  * operations of comparison, normalization, resolution, and relativization.

   443  *

   444  * <p> An instance of the {@link URL} class, by contrast, represents the

   445  * syntactic components of a URL together with some of the information required

   446  * to access the resource that it describes.  A URL must be absolute, that is,

   447  * it must always specify a scheme.  A URL string is parsed according to its

   448  * scheme.  A stream handler is always established for a URL, and in fact it is

   449  * impossible to create a URL instance for a scheme for which no handler is

   450  * available.  Equality and hashing depend upon both the scheme and the

   451  * Internet address of the host, if any; comparison is not defined.  In other

   452  * words, a URL is a structured string that supports the syntactic operation of

   453  * resolution as well as the network I/O operations of looking up the host and

   454  * opening a connection to the specified resource.

   455  *

   456  *

   457  * @author Mark Reinhold

   458  * @since 1.4

   459  *

   460  * @see <a href="http://www.ietf.org/rfc/rfc2279.txt"><i>RFC&nbsp;2279: UTF-8, a

   461  * transformation format of ISO 10646</i></a>, <br><a

   462  * href="http://www.ietf.org/rfc/rfc2373.txt"><i>RFC&nbsp;2373: IPv6 Addressing

   463  * Architecture</i></a>, <br><a

   464  * href="http://www.ietf.org/rfc/rfc2396.txt"><i>RFC&nbsp;2396: Uniform

   465  * Resource Identifiers (URI): Generic Syntax</i></a>, <br><a

   466  * href="http://www.ietf.org/rfc/rfc2732.txt"><i>RFC&nbsp;2732: Format for

   467  * Literal IPv6 Addresses in URLs</i></a>, <br><a

   468  * href="URISyntaxException.html">URISyntaxException</a>

   469  */

   471 public final class URI

   472     implements Comparable<URI>, Serializable

   473 {

   475     // Note: Comments containing the word "ASSERT" indicate places where a

   476     // throw of an InternalError should be replaced by an appropriate assertion

   477     // statement once asserts are enabled in the build.

   479     static final long serialVersionUID = -6052424284110960213L;  // Serialization version id (class implements Serializable)

   482     // -- Properties and components of this instance --
           //
           // Every component field below is declared transient; only the
           // `string` field at the end of this group is left non-transient.

   484     // Components of all URIs: [<scheme>:]<scheme-specific-part>[#<fragment>]

   485     private transient String scheme;            // null ==> relative URI

   486     private transient String fragment;          // null ==> undefined

   488     // Hierarchical URI components: [//<authority>]<path>[?<query>]

   489     private transient String authority;         // Registry or server

   491     // Server-based authority: [<userInfo>@]<host>[:<port>]

   492     private transient String userInfo;          // null ==> undefined

   493     private transient String host;              // null ==> registry-based

   494     private transient int port = -1;            // -1 ==> undefined

   496     // Remaining components of hierarchical URIs

   497     private transient String path;              // null ==> opaque

   498     private transient String query;             // null ==> undefined

   500     // The remaining fields may be computed on demand

   502     private volatile transient String schemeSpecificPart;

   503     private volatile transient int hash;        // Zero ==> undefined

           // NOTE(review): the decoded* fields presumably hold the decoded
           // (escaped octets interpreted) forms of the matching raw
           // components, with null meaning "not yet computed" -- confirm
           // against the accessor methods, which are outside this view.
   505     private volatile transient String decodedUserInfo = null;

   506     private volatile transient String decodedAuthority = null;

   507     private volatile transient String decodedPath = null;

   508     private volatile transient String decodedQuery = null;

   509     private volatile transient String decodedFragment = null;

   510     private volatile transient String decodedSchemeSpecificPart = null;

   512     /**

   513      * The string form of this URI.

   514      *

   515      * @serial

   516      */

   517     private volatile String string;             // The only serializable field

   521     // -- Constructors and factories --

   523     private URI() { }                           // Used internally; fields keep their declared defaults (null strings, port = -1)

   525     /**

   526      * Constructs a URI by parsing the given string.

   527      *

   528      * <p> This constructor parses the given string exactly as specified by the

   529      * grammar in <a

   530      * href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,

   531      * Appendix&nbsp;A, <b><i>except for the following deviations:</i></b> </p>

   532      *

   533      * <ul type=disc>

   534      *

   535      *   <li><p> An empty authority component is permitted as long as it is

   536      *   followed by a non-empty path, a query component, or a fragment

   537      *   component.  This allows the parsing of URIs such as

   538      *   <tt>"file:///foo/bar"</tt>, which seems to be the intent of

   539      *   RFC&nbsp;2396 although the grammar does not permit it.  If the

   540      *   authority component is empty then the user-information, host, and port

   541      *   components are undefined. </p></li>

   542      *

   543      *   <li><p> Empty relative paths are permitted; this seems to be the

   544      *   intent of RFC&nbsp;2396 although the grammar does not permit it.  The

   545      *   primary consequence of this deviation is that a standalone fragment

   546      *   such as <tt>"#foo"</tt> parses as a relative URI with an empty path

   547      *   and the given fragment, and can be usefully <a

   548      *   href="#resolve-frag">resolved</a> against a base URI. </p></li>

   549      *

   550      *   <li><p> IPv4 addresses in host components are parsed rigorously, as

   551      *   specified by <a

   552      *   href="http://www.ietf.org/rfc/rfc2732.txt">RFC&nbsp;2732</a>: Each

   553      *   element of a dotted-quad address must contain no more than three

   554      *   decimal digits.  Each element is further constrained to have a value

   555      *   no greater than 255. </p></li>

   556      *

   557      *   <li> <p> Hostnames in host components that comprise only a single

   558      *   domain label are permitted to start with an <i>alphanum</i>

   559      *   character. This seems to be the intent of <a

   560      *   href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>

   561      *   section&nbsp;3.2.2 although the grammar does not permit it. The

   562      *   consequence of this deviation is that the authority component of a

   563      *   hierarchical URI such as <tt>s://123</tt>, will parse as a server-based

   564      *   authority. </p></li>

   565      *

   566      *   <li><p> IPv6 addresses are permitted for the host component.  An IPv6

   567      *   address must be enclosed in square brackets (<tt>'['</tt> and

   568      *   <tt>']'</tt>) as specified by <a

   569      *   href="http://www.ietf.org/rfc/rfc2732.txt">RFC&nbsp;2732</a>.  The

   570      *   IPv6 address itself must parse according to <a

   571      *   href="http://www.ietf.org/rfc/rfc2373.txt">RFC&nbsp;2373</a>.  IPv6

   572      *   addresses are further constrained to describe no more than sixteen

   573      *   bytes of address information, a constraint implicit in RFC&nbsp;2373

   574      *   but not expressible in the grammar. </p></li>

   575      *

   576      *   <li><p> Characters in the <i>other</i> category are permitted wherever

   577      *   RFC&nbsp;2396 permits <i>escaped</i> octets, that is, in the

   578      *   user-information, path, query, and fragment components, as well as in

   579      *   the authority component if the authority is registry-based.  This

   580      *   allows URIs to contain Unicode characters beyond those in the US-ASCII

   581      *   character set. </p></li>

   582      *

   583      * </ul>

   584      *

   585      * @param  str   The string to be parsed into a URI

   586      *

   587      * @throws  NullPointerException

   588      *          If <tt>str</tt> is <tt>null</tt>

   589      *

   590      * @throws  URISyntaxException

   591      *          If the given string violates RFC&nbsp;2396, as augmented

   592      *          by the above deviations

   593      */

   594     public URI(String str) throws URISyntaxException {

   595         new Parser(str).parse(false);

   596     }

   598     /**

   599      * Constructs a hierarchical URI from the given components.

   600      *

   601      * <p> If a scheme is given then the path, if also given, must either be

   602      * empty or begin with a slash character (<tt>'/'</tt>).  Otherwise a

   603      * component of the new URI may be left undefined by passing <tt>null</tt>

   604      * for the corresponding parameter or, in the case of the <tt>port</tt>

   605      * parameter, by passing <tt>-1</tt>.

   606      *

   607      * <p> This constructor first builds a URI string from the given components

   608      * according to the rules specified in <a

   609      * href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,

   610      * section&nbsp;5.2, step&nbsp;7: </p>

   611      *

   612      * <ol>

   613      *

   614      *   <li><p> Initially, the result string is empty. </p></li>

   615      *

   616      *   <li><p> If a scheme is given then it is appended to the result,

   617      *   followed by a colon character (<tt>':'</tt>).  </p></li>

   618      *

   619      *   <li><p> If user information, a host, or a port are given then the

   620      *   string <tt>"//"</tt> is appended.  </p></li>

   621      *

   622      *   <li><p> If user information is given then it is appended, followed by

   623      *   a commercial-at character (<tt>'@'</tt>).  Any character not in the

   624      *   <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i>

   625      *   categories is <a href="#quote">quoted</a>.  </p></li>

   626      *

   627      *   <li><p> If a host is given then it is appended.  If the host is a

   628      *   literal IPv6 address but is not enclosed in square brackets

   629      *   (<tt>'['</tt> and <tt>']'</tt>) then the square brackets are added.

   630      *   </p></li>

   631      *

   632      *   <li><p> If a port number is given then a colon character

   633      *   (<tt>':'</tt>) is appended, followed by the port number in decimal.

   634      *   </p></li>

   635      *

   636      *   <li><p> If a path is given then it is appended.  Any character not in

   637      *   the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i>

   638      *   categories, and not equal to the slash character (<tt>'/'</tt>) or the

   639      *   commercial-at character (<tt>'@'</tt>), is quoted.  </p></li>

   640      *

   641      *   <li><p> If a query is given then a question-mark character

   642      *   (<tt>'?'</tt>) is appended, followed by the query.  Any character that

   643      *   is not a <a href="#legal-chars">legal URI character</a> is quoted.

   644      *   </p></li>

   645      *

   646      *   <li><p> Finally, if a fragment is given then a hash character

   647      *   (<tt>'#'</tt>) is appended, followed by the fragment.  Any character

   648      *   that is not a legal URI character is quoted.  </p></li>

   649      *

   650      * </ol>

   651      *

   652      * <p> The resulting URI string is then parsed as if by invoking the {@link

   653      * #URI(String)} constructor and then invoking the {@link

   654      * #parseServerAuthority()} method upon the result; this may cause a {@link

   655      * URISyntaxException} to be thrown.  </p>

   656      *

   657      * @param   scheme    Scheme name

   658      * @param   userInfo  User name and authorization information

   659      * @param   host      Host name

   660      * @param   port      Port number

   661      * @param   path      Path

   662      * @param   query     Query

   663      * @param   fragment  Fragment

   664      *

   665      * @throws URISyntaxException

   666      *         If both a scheme and a path are given but the path is relative,

   667      *         if the URI string constructed from the given components violates

   668      *         RFC&nbsp;2396, or if the authority component of the string is

   669      *         present but cannot be parsed as a server-based authority

   670      */

   671     public URI(String scheme,

   672                String userInfo, String host, int port,

   673                String path, String query, String fragment)

   674         throws URISyntaxException

   675     {

   676         String s = toString(scheme, null,

   677                             null, userInfo, host, port,

   678                             path, query, fragment);

   679         checkPath(s, scheme, path);

   680         new Parser(s).parse(true);

   681     }

   683     /**

   684      * Constructs a hierarchical URI from the given components.

   685      *

   686      * <p> If a scheme is given then the path, if also given, must either be

   687      * empty or begin with a slash character (<tt>'/'</tt>).  Otherwise a

   688      * component of the new URI may be left undefined by passing <tt>null</tt>

   689      * for the corresponding parameter.

   690      *

   691      * <p> This constructor first builds a URI string from the given components

   692      * according to the rules specified in <a

   693      * href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,

   694      * section&nbsp;5.2, step&nbsp;7: </p>

   695      *

   696      * <ol>

   697      *

   698      *   <li><p> Initially, the result string is empty.  </p></li>

   699      *

   700      *   <li><p> If a scheme is given then it is appended to the result,

   701      *   followed by a colon character (<tt>':'</tt>).  </p></li>

   702      *

   703      *   <li><p> If an authority is given then the string <tt>"//"</tt> is

   704      *   appended, followed by the authority.  If the authority contains a

   705      *   literal IPv6 address then the address must be enclosed in square

   706      *   brackets (<tt>'['</tt> and <tt>']'</tt>).  Any character not in the

   707      *   <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i>

   708      *   categories, and not equal to the commercial-at character

   709      *   (<tt>'@'</tt>), is <a href="#quote">quoted</a>.  </p></li>

   710      *

   711      *   <li><p> If a path is given then it is appended.  Any character not in

   712      *   the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i>

   713      *   categories, and not equal to the slash character (<tt>'/'</tt>) or the

   714      *   commercial-at character (<tt>'@'</tt>), is quoted.  </p></li>

   715      *

   716      *   <li><p> If a query is given then a question-mark character

   717      *   (<tt>'?'</tt>) is appended, followed by the query.  Any character that

   718      *   is not a <a href="#legal-chars">legal URI character</a> is quoted.

   719      *   </p></li>

   720      *

   721      *   <li><p> Finally, if a fragment is given then a hash character

   722      *   (<tt>'#'</tt>) is appended, followed by the fragment.  Any character

   723      *   that is not a legal URI character is quoted.  </p></li>

   724      *

   725      * </ol>

   726      *

   727      * <p> The resulting URI string is then parsed as if by invoking the {@link

   728      * #URI(String)} constructor and then invoking the {@link

   729      * #parseServerAuthority()} method upon the result; this may cause a {@link

   730      * URISyntaxException} to be thrown.  </p>

   731      *

   732      * @param   scheme     Scheme name

   733      * @param   authority  Authority

   734      * @param   path       Path

   735      * @param   query      Query

   736      * @param   fragment   Fragment

   737      *

   738      * @throws URISyntaxException

   739      *         If both a scheme and a path are given but the path is relative,

   740      *         if the URI string constructed from the given components violates

   741      *         RFC&nbsp;2396, or if the authority component of the string is

   742      *         present but cannot be parsed as a server-based authority

   743      */

   744     public URI(String scheme,

   745                String authority,

   746                String path, String query, String fragment)

   747         throws URISyntaxException

   748     {

   749         String s = toString(scheme, null,

   750                             authority, null, null, -1,

   751                             path, query, fragment);

   752         checkPath(s, scheme, path);

   753         new Parser(s).parse(false);

   754     }

   756     /**

   757      * Constructs a hierarchical URI from the given components.

   758      *

   759      * <p> A component may be left undefined by passing <tt>null</tt>.

   760      *

   761      * <p> This convenience constructor works as if by invoking the

   762      * seven-argument constructor as follows:

   763      *

   764      * <blockquote><tt>

   765      * new&nbsp;{@link #URI(String, String, String, int, String, String, String)

   766      * URI}(scheme,&nbsp;null,&nbsp;host,&nbsp;-1,&nbsp;path,&nbsp;null,&nbsp;fragment);

   767      * </tt></blockquote>

   768      *

   769      * @param   scheme    Scheme name

   770      * @param   host      Host name

   771      * @param   path      Path

   772      * @param   fragment  Fragment

   773      *

   774      * @throws  URISyntaxException

   775      *          If the URI string constructed from the given components

   776      *          violates RFC&nbsp;2396

   777      */

   778     public URI(String scheme, String host, String path, String fragment)

   779         throws URISyntaxException

   780     {

   781         this(scheme, null, host, -1, path, null, fragment);

   782     }

   784     /**

   785      * Constructs a URI from the given components.

   786      *

   787      * <p> A component may be left undefined by passing <tt>null</tt>.

   788      *

   789      * <p> This constructor first builds a URI in string form using the given

   790      * components as follows:  </p>

   791      *

   792      * <ol>

   793      *

   794      *   <li><p> Initially, the result string is empty.  </p></li>

   795      *

   796      *   <li><p> If a scheme is given then it is appended to the result,

   797      *   followed by a colon character (<tt>':'</tt>).  </p></li>

   798      *

   799      *   <li><p> If a scheme-specific part is given then it is appended.  Any

   800      *   character that is not a <a href="#legal-chars">legal URI character</a>

   801      *   is <a href="#quote">quoted</a>.  </p></li>

   802      *

   803      *   <li><p> Finally, if a fragment is given then a hash character

   804      *   (<tt>'#'</tt>) is appended to the string, followed by the fragment.

   805      *   Any character that is not a legal URI character is quoted.  </p></li>

   806      *

   807      * </ol>

   808      *

   809      * <p> The resulting URI string is then parsed in order to create the new

   810      * URI instance as if by invoking the {@link #URI(String)} constructor;

   811      * this may cause a {@link URISyntaxException} to be thrown.  </p>

   812      *

   813      * @param   scheme    Scheme name

   814      * @param   ssp       Scheme-specific part

   815      * @param   fragment  Fragment

   816      *

   817      * @throws  URISyntaxException

   818      *          If the URI string constructed from the given components

   819      *          violates RFC&nbsp;2396

   820      */

   821     public URI(String scheme, String ssp, String fragment)

   822         throws URISyntaxException

   823     {

   824         new Parser(toString(scheme, ssp,

   825                             null, null, null, -1,

   826                             null, null, fragment))

   827             .parse(false);

   828     }

   830     /**

   831      * Creates a URI by parsing the given string.

   832      *

   833      * <p> This convenience factory method works as if by invoking the {@link

   834      * #URI(String)} constructor; any {@link URISyntaxException} thrown by the

   835      * constructor is caught and wrapped in a new {@link

   836      * IllegalArgumentException} object, which is then thrown.

   837      *

   838      * <p> This method is provided for use in situations where it is known that

   839      * the given string is a legal URI, for example for URI constants declared

   840      * within a program, and so it would be considered a programming error

   841      * for the string not to parse as such.  The constructors, which throw

   842      * {@link URISyntaxException} directly, should be used in situations where a

   843      * URI is being constructed from user input or from some other source that

   844      * may be prone to errors.  </p>

   845      *

   846      * @param  str   The string to be parsed into a URI

   847      * @return The new URI

   848      *

   849      * @throws  NullPointerException

   850      *          If <tt>str</tt> is <tt>null</tt>

   851      *

   852      * @throws  IllegalArgumentException

   853      *          If the given string violates RFC&nbsp;2396

   854      */

   855     public static URI create(String str) {

   856         try {

   857             return new URI(str);

   858         } catch (URISyntaxException x) {

   859             throw new IllegalArgumentException(x.getMessage(), x);

   860         }

   861     }

   864     // -- Operations --

   866     /**

   867      * Attempts to parse this URI's authority component, if defined, into

   868      * user-information, host, and port components.

   869      *

   870      * <p> If this URI's authority component has already been recognized as

   871      * being server-based then it will already have been parsed into

   872      * user-information, host, and port components.  In this case, or if this

   873      * URI has no authority component, this method simply returns this URI.

   874      *

   875      * <p> Otherwise this method attempts once more to parse the authority

   876      * component into user-information, host, and port components, and throws

   877      * an exception describing why the authority component could not be parsed

   878      * in that way.

   879      *

   880      * <p> This method is provided because the generic URI syntax specified in

   881      * <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>

   882      * cannot always distinguish a malformed server-based authority from a

   883      * legitimate registry-based authority.  It must therefore treat some

   884      * instances of the former as instances of the latter.  The authority

   885      * component in the URI string <tt>"//foo:bar"</tt>, for example, is not a

   886      * legal server-based authority but it is legal as a registry-based

   887      * authority.

   888      *

   889      * <p> In many common situations, for example when working with URIs that are

   890      * known to be either URNs or URLs, the hierarchical URIs being used will

   891      * always be server-based.  They therefore must either be parsed as such or

   892      * treated as an error.  In these cases a statement such as

   893      *

   894      * <blockquote>

   895      * <tt>URI </tt><i>u</i><tt> = new URI(str).parseServerAuthority();</tt>

   896      * </blockquote>

   897      *

   898      * <p> can be used to ensure that <i>u</i> always refers to a URI that, if

   899      * it has an authority component, has a server-based authority with proper

   900      * user-information, host, and port components.  Invoking this method also

   901      * ensures that if the authority could not be parsed in that way then an

   902      * appropriate diagnostic message can be issued based upon the exception

   903      * that is thrown. </p>

   904      *

   905      * @return  A URI whose authority field has been parsed

   906      *          as a server-based authority

   907      *

   908      * @throws  URISyntaxException

   909      *          If the authority component of this URI is defined

   910      *          but cannot be parsed as a server-based authority

   911      *          according to RFC&nbsp;2396

   912      */

   913     public URI parseServerAuthority()

   914         throws URISyntaxException

   915     {

   916         // We could be clever and cache the error message and index from the

   917         // exception thrown during the original parse, but that would require

   918         // either more fields or a more-obscure representation.

   919         if ((host != null) || (authority == null))

   920             return this;

   921         defineString();

   922         new Parser(string).parse(true);

   923         return this;

   924     }

   926     /**

   927      * Normalizes this URI's path.

   928      *

   929      * <p> If this URI is opaque, or if its path is already in normal form,

   930      * then this URI is returned.  Otherwise a new URI is constructed that is

   931      * identical to this URI except that its path is computed by normalizing

   932      * this URI's path in a manner consistent with <a

   933      * href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,

   934      * section&nbsp;5.2, step&nbsp;6, sub-steps&nbsp;c through&nbsp;f; that is:

   935      * </p>

   936      *

   937      * <ol>

   938      *

   939      *   <li><p> All <tt>"."</tt> segments are removed. </p></li>

   940      *

   941      *   <li><p> If a <tt>".."</tt> segment is preceded by a non-<tt>".."</tt>

   942      *   segment then both of these segments are removed.  This step is

   943      *   repeated until it is no longer applicable. </p></li>

   944      *

   945      *   <li><p> If the path is relative, and if its first segment contains a

   946      *   colon character (<tt>':'</tt>), then a <tt>"."</tt> segment is

   947      *   prepended.  This prevents a relative URI with a path such as

   948      *   <tt>"a:b/c/d"</tt> from later being re-parsed as an opaque URI with a

   949      *   scheme of <tt>"a"</tt> and a scheme-specific part of <tt>"b/c/d"</tt>.

   950      *   <b><i>(Deviation from RFC&nbsp;2396)</i></b> </p></li>

   951      *

   952      * </ol>

   953      *

   954      * <p> A normalized path will begin with one or more <tt>".."</tt> segments

   955      * if there were insufficient non-<tt>".."</tt> segments preceding them to

   956      * allow their removal.  A normalized path will begin with a <tt>"."</tt>

   957      * segment if one was inserted by step 3 above.  Otherwise, a normalized

   958      * path will not contain any <tt>"."</tt> or <tt>".."</tt> segments. </p>

   959      *

   960      * @return  A URI equivalent to this URI,

   961      *          but whose path is in normal form

   962      */

   963     public URI normalize() {

   964         return normalize(this);

   965     }

   967     /**

   968      * Resolves the given URI against this URI.

   969      *

   970      * <p> If the given URI is already absolute, or if this URI is opaque, then

   971      * the given URI is returned.

   972      *

   973      * <p><a name="resolve-frag"></a> If the given URI's fragment component is

   974      * defined, its path component is empty, and its scheme, authority, and

   975      * query components are undefined, then a URI with the given fragment but

   976      * with all other components equal to those of this URI is returned.  This

   977      * allows a URI representing a standalone fragment reference, such as

   978      * <tt>"#foo"</tt>, to be usefully resolved against a base URI.

   979      *

   980      * <p> Otherwise this method constructs a new hierarchical URI in a manner

   981      * consistent with <a

   982      * href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,

   983      * section&nbsp;5.2; that is: </p>

   984      *

   985      * <ol>

   986      *

   987      *   <li><p> A new URI is constructed with this URI's scheme and the given

   988      *   URI's query and fragment components. </p></li>

   989      *

   990      *   <li><p> If the given URI has an authority component then the new URI's

   991      *   authority and path are taken from the given URI. </p></li>

   992      *

   993      *   <li><p> Otherwise the new URI's authority component is copied from

   994      *   this URI, and its path is computed as follows: </p>

   995      *

   996      *   <ol type=a>

   997      *

   998      *     <li><p> If the given URI's path is absolute then the new URI's path

   999      *     is taken from the given URI. </p></li>

  1000      *

  1001      *     <li><p> Otherwise the given URI's path is relative, and so the new

  1002      *     URI's path is computed by resolving the path of the given URI

  1003      *     against the path of this URI.  This is done by concatenating all but

  1004      *     the last segment of this URI's path, if any, with the given URI's

  1005      *     path and then normalizing the result as if by invoking the {@link

  1006      *     #normalize() normalize} method. </p></li>

  1007      *

  1008      *   </ol></li>

  1009      *

  1010      * </ol>

  1011      *

  1012      * <p> The result of this method is absolute if, and only if, either this

  1013      * URI is absolute or the given URI is absolute.  </p>

  1014      *

  1015      * @param  uri  The URI to be resolved against this URI

  1016      * @return The resulting URI

  1017      *

  1018      * @throws  NullPointerException

  1019      *          If <tt>uri</tt> is <tt>null</tt>

  1020      */

  1021     public URI resolve(URI uri) {

  1022         return resolve(this, uri);

  1023     }

  1025     /**

  1026      * Constructs a new URI by parsing the given string and then resolving it

  1027      * against this URI.

  1028      *

  1029      * <p> This convenience method works as if invoking it were equivalent to

  1030      * evaluating the expression <tt>{@link #resolve(java.net.URI)

  1031      * resolve}(URI.{@link #create(String) create}(str))</tt>. </p>

  1032      *

  1033      * @param  str   The string to be parsed into a URI

  1034      * @return The resulting URI

  1035      *

  1036      * @throws  NullPointerException

  1037      *          If <tt>str</tt> is <tt>null</tt>

  1038      *

  1039      * @throws  IllegalArgumentException

  1040      *          If the given string violates RFC&nbsp;2396

  1041      */

  1042     public URI resolve(String str) {

  1043         return resolve(URI.create(str));

  1044     }

  1046     /**

  1047      * Relativizes the given URI against this URI.

  1048      *

  1049      * <p> The relativization of the given URI against this URI is computed as

  1050      * follows: </p>

  1051      *

  1052      * <ol>

  1053      *

  1054      *   <li><p> If either this URI or the given URI are opaque, or if the

  1055      *   scheme and authority components of the two URIs are not identical, or

  1056      *   if the path of this URI is not a prefix of the path of the given URI,

  1057      *   then the given URI is returned. </p></li>

  1058      *

  1059      *   <li><p> Otherwise a new relative hierarchical URI is constructed with

  1060      *   query and fragment components taken from the given URI and with a path

  1061      *   component computed by removing this URI's path from the beginning of

  1062      *   the given URI's path. </p></li>

  1063      *

  1064      * </ol>

  1065      *

  1066      * @param  uri  The URI to be relativized against this URI

  1067      * @return The resulting URI

  1068      *

  1069      * @throws  NullPointerException

  1070      *          If <tt>uri</tt> is <tt>null</tt>

  1071      */

  1072     public URI relativize(URI uri) {

  1073         return relativize(this, uri);

  1074     }

  1076     /**

  1077      * Constructs a URL from this URI.

  1078      *

  1079      * <p> This convenience method works as if invoking it were equivalent to

  1080      * evaluating the expression <tt>new&nbsp;URL(this.toString())</tt> after

  1081      * first checking that this URI is absolute. </p>

  1082      *

  1083      * @return  A URL constructed from this URI

  1084      *

  1085      * @throws  IllegalArgumentException

  1086      *          If this URI is not absolute

  1087      *

  1088      * @throws  MalformedURLException

  1089      *          If a protocol handler for the URL could not be found,

  1090      *          or if some other error occurred while constructing the URL

  1091      */

  1092     public URL toURL()

  1093         throws MalformedURLException {

  1094         if (!isAbsolute())

  1095             throw new IllegalArgumentException("URI is not absolute");

  1096         return new URL(toString());

  1097     }

  1099     // -- Component access methods --

  1101     /**

  1102      * Returns the scheme component of this URI.

  1103      *

  1104      * <p> The scheme component of a URI, if defined, only contains characters

  1105      * in the <i>alphanum</i> category and in the string <tt>"-.+"</tt>.  A

  1106      * scheme always starts with an <i>alpha</i> character. <p>

  1107      *

  1108      * The scheme component of a URI cannot contain escaped octets, hence this

  1109      * method does not perform any decoding.

  1110      *

  1111      * @return  The scheme component of this URI,

  1112      *          or <tt>null</tt> if the scheme is undefined

  1113      */

  1114     public String getScheme() {

  1115         return scheme;

  1116     }

  1118     /**

  1119      * Tells whether or not this URI is absolute.

  1120      *

  1121      * <p> A URI is absolute if, and only if, it has a scheme component. </p>

  1122      *

  1123      * @return  <tt>true</tt> if, and only if, this URI is absolute

  1124      */

  1125     public boolean isAbsolute() {

  1126         return scheme != null;

  1127     }

  1129     /**

  1130      * Tells whether or not this URI is opaque.

  1131      *

  1132      * <p> A URI is opaque if, and only if, it is absolute and its

  1133      * scheme-specific part does not begin with a slash character ('/').

  1134      * An opaque URI has a scheme, a scheme-specific part, and possibly

  1135      * a fragment; all other components are undefined. </p>

  1136      *

  1137      * @return  <tt>true</tt> if, and only if, this URI is opaque

  1138      */

  1139     public boolean isOpaque() {

  1140         return path == null;

  1141     }

  1143     /**

  1144      * Returns the raw scheme-specific part of this URI.  The scheme-specific

  1145      * part is never undefined, though it may be empty.

  1146      *

  1147      * <p> The scheme-specific part of a URI only contains legal URI

  1148      * characters. </p>

  1149      *

  1150      * @return  The raw scheme-specific part of this URI

  1151      *          (never <tt>null</tt>)

  1152      */

  1153     public String getRawSchemeSpecificPart() {

  1154         defineSchemeSpecificPart();

  1155         return schemeSpecificPart;

  1156     }

  1158     /**

  1159      * Returns the decoded scheme-specific part of this URI.

  1160      *

  1161      * <p> The string returned by this method is equal to that returned by the

  1162      * {@link #getRawSchemeSpecificPart() getRawSchemeSpecificPart} method

  1163      * except that all sequences of escaped octets are <a

  1164      * href="#decode">decoded</a>.  </p>

  1165      *

  1166      * @return  The decoded scheme-specific part of this URI

  1167      *          (never <tt>null</tt>)

  1168      */

  1169     public String getSchemeSpecificPart() {

  1170         if (decodedSchemeSpecificPart == null)

  1171             decodedSchemeSpecificPart = decode(getRawSchemeSpecificPart());

  1172         return decodedSchemeSpecificPart;

  1173     }

  1175     /**

  1176      * Returns the raw authority component of this URI.

  1177      *

  1178      * <p> The authority component of a URI, if defined, only contains the

  1179      * commercial-at character (<tt>'@'</tt>) and characters in the

  1180      * <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, and <i>other</i>

  1181      * categories.  If the authority is server-based then it is further

  1182      * constrained to have valid user-information, host, and port

  1183      * components. </p>

  1184      *

  1185      * @return  The raw authority component of this URI,

  1186      *          or <tt>null</tt> if the authority is undefined

  1187      */

  1188     public String getRawAuthority() {

  1189         return authority;

  1190     }

  1192     /**

  1193      * Returns the decoded authority component of this URI.

  1194      *

  1195      * <p> The string returned by this method is equal to that returned by the

  1196      * {@link #getRawAuthority() getRawAuthority} method except that all

  1197      * sequences of escaped octets are <a href="#decode">decoded</a>.  </p>

  1198      *

  1199      * @return  The decoded authority component of this URI,

  1200      *          or <tt>null</tt> if the authority is undefined

  1201      */

  1202     public String getAuthority() {

  1203         if (decodedAuthority == null)

  1204             decodedAuthority = decode(authority);

  1205         return decodedAuthority;

  1206     }

  1208     /**

  1209      * Returns the raw user-information component of this URI.

  1210      *

  1211      * <p> The user-information component of a URI, if defined, only contains

  1212      * characters in the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, and

  1213      * <i>other</i> categories. </p>

  1214      *

  1215      * @return  The raw user-information component of this URI,

  1216      *          or <tt>null</tt> if the user information is undefined

  1217      */

  1218     public String getRawUserInfo() {

  1219         return userInfo;

  1220     }

  1222     /**

  1223      * Returns the decoded user-information component of this URI.

  1224      *

  1225      * <p> The string returned by this method is equal to that returned by the

  1226      * {@link #getRawUserInfo() getRawUserInfo} method except that all

  1227      * sequences of escaped octets are <a href="#decode">decoded</a>.  </p>

  1228      *

  1229      * @return  The decoded user-information component of this URI,

  1230      *          or <tt>null</tt> if the user information is undefined

  1231      */

  1232     public String getUserInfo() {

  1233         if ((decodedUserInfo == null) && (userInfo != null))

  1234             decodedUserInfo = decode(userInfo);

  1235         return decodedUserInfo;

  1236     }

  1238     /**

  1239      * Returns the host component of this URI.

  1240      *

  1241      * <p> The host component of a URI, if defined, will have one of the

  1242      * following forms: </p>

  1243      *

  1244      * <ul type=disc>

  1245      *

  1246      *   <li><p> A domain name consisting of one or more <i>labels</i>

  1247      *   separated by period characters (<tt>'.'</tt>), optionally followed by

  1248      *   a period character.  Each label consists of <i>alphanum</i> characters

  1249      *   as well as hyphen characters (<tt>'-'</tt>), though hyphens never

  1250      *   occur as the first or last characters in a label. The rightmost

  1251      *   label of a domain name consisting of two or more labels, begins

  1252      *   with an <i>alpha</i> character. </li>

  1253      *

  1254      *   <li><p> A dotted-quad IPv4 address of the form

  1255      *   <i>digit</i><tt>+.</tt><i>digit</i><tt>+.</tt><i>digit</i><tt>+.</tt><i>digit</i><tt>+</tt>,

  1256      *   where no <i>digit</i> sequence is longer than three characters and no

  1257      *   sequence has a value larger than 255. </p></li>

  1258      *

  1259      *   <li><p> An IPv6 address enclosed in square brackets (<tt>'['</tt> and

  1260      *   <tt>']'</tt>) and consisting of hexadecimal digits, colon characters

  1261      *   (<tt>':'</tt>), and possibly an embedded IPv4 address.  The full

  1262      *   syntax of IPv6 addresses is specified in <a

  1263      *   href="http://www.ietf.org/rfc/rfc2373.txt"><i>RFC&nbsp;2373: IPv6

  1264      *   Addressing Architecture</i></a>.  </p></li>

  1265      *

  1266      * </ul>

  1267      *

  1268      * The host component of a URI cannot contain escaped octets, hence this

  1269      * method does not perform any decoding.

  1270      *

  1271      * @return  The host component of this URI,

  1272      *          or <tt>null</tt> if the host is undefined

  1273      */

  1274     public String getHost() {

  1275         return host;

  1276     }

  1278     /**

  1279      * Returns the port number of this URI.

  1280      *

  1281      * <p> The port component of a URI, if defined, is a non-negative

  1282      * integer. </p>

  1283      *

  1284      * @return  The port component of this URI,

  1285      *          or <tt>-1</tt> if the port is undefined

  1286      */

  1287     public int getPort() {

  1288         return port;

  1289     }

  1291     /**

  1292      * Returns the raw path component of this URI.

  1293      *

  1294      * <p> The path component of a URI, if defined, only contains the slash

  1295      * character (<tt>'/'</tt>), the commercial-at character (<tt>'@'</tt>),

  1296      * and characters in the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>,

  1297      * and <i>other</i> categories. </p>

  1298      *

  1299      * @return  The path component of this URI,

  1300      *          or <tt>null</tt> if the path is undefined

  1301      */

  1302     public String getRawPath() {

  1303         return path;

  1304     }

  1306     /**

  1307      * Returns the decoded path component of this URI.

  1308      *

  1309      * <p> The string returned by this method is equal to that returned by the

  1310      * {@link #getRawPath() getRawPath} method except that all sequences of

  1311      * escaped octets are <a href="#decode">decoded</a>.  </p>

  1312      *

  1313      * @return  The decoded path component of this URI,

  1314      *          or <tt>null</tt> if the path is undefined

  1315      */

  1316     public String getPath() {

  1317         if ((decodedPath == null) && (path != null))

  1318             decodedPath = decode(path);

  1319         return decodedPath;

  1320     }

  1322     /**

  1323      * Returns the raw query component of this URI.

  1324      *

  1325      * <p> The query component of a URI, if defined, only contains legal URI

  1326      * characters. </p>

  1327      *

  1328      * @return  The raw query component of this URI,

  1329      *          or <tt>null</tt> if the query is undefined

  1330      */

  1331     public String getRawQuery() {

  1332         return query;

  1333     }

  1335     /**

  1336      * Returns the decoded query component of this URI.

  1337      *

  1338      * <p> The string returned by this method is equal to that returned by the

  1339      * {@link #getRawQuery() getRawQuery} method except that all sequences of

  1340      * escaped octets are <a href="#decode">decoded</a>.  </p>

  1341      *

  1342      * @return  The decoded query component of this URI,

  1343      *          or <tt>null</tt> if the query is undefined

  1344      */

  1345     public String getQuery() {

  1346         if ((decodedQuery == null) && (query != null))

  1347             decodedQuery = decode(query);

  1348         return decodedQuery;

  1349     }

  1351     /**

  1352      * Returns the raw fragment component of this URI.

  1353      *

  1354      * <p> The fragment component of a URI, if defined, only contains legal URI

  1355      * characters. </p>

  1356      *

  1357      * @return  The raw fragment component of this URI,

  1358      *          or <tt>null</tt> if the fragment is undefined

  1359      */

  1360     public String getRawFragment() {

  1361         return fragment;

  1362     }

  1364     /**

  1365      * Returns the decoded fragment component of this URI.

  1366      *

  1367      * <p> The string returned by this method is equal to that returned by the

  1368      * {@link #getRawFragment() getRawFragment} method except that all

  1369      * sequences of escaped octets are <a href="#decode">decoded</a>.  </p>

  1370      *

  1371      * @return  The decoded fragment component of this URI,

  1372      *          or <tt>null</tt> if the fragment is undefined

  1373      */

  1374     public String getFragment() {

  1375         if ((decodedFragment == null) && (fragment != null))

  1376             decodedFragment = decode(fragment);

  1377         return decodedFragment;

  1378     }

  1381     // -- Equality, comparison, hash code, toString, and serialization --

  1383     /**

  1384      * Tests this URI for equality with another object.

  1385      *

  1386      * <p> If the given object is not a URI then this method immediately

  1387      * returns <tt>false</tt>.

  1388      *

  1389      * <p> For two URIs to be considered equal requires that either both are

  1390      * opaque or both are hierarchical.  Their schemes must either both be

  1391      * undefined or else be equal without regard to case. Their fragments

  1392      * must either both be undefined or else be equal.

  1393      *

  1394      * <p> For two opaque URIs to be considered equal, their scheme-specific

  1395      * parts must be equal.

  1396      *

  1397      * <p> For two hierarchical URIs to be considered equal, their paths must

  1398      * be equal and their queries must either both be undefined or else be

  1399      * equal.  Their authorities must either both be undefined, or both be

  1400      * registry-based, or both be server-based.  If their authorities are

  1401      * defined and are registry-based, then they must be equal.  If their

  1402      * authorities are defined and are server-based, then their hosts must be

  1403      * equal without regard to case, their port numbers must be equal, and

  1404      * their user-information components must be equal.

  1405      *

  1406      * <p> When testing the user-information, path, query, fragment, authority,

  1407      * or scheme-specific parts of two URIs for equality, the raw forms rather

  1408      * than the encoded forms of these components are compared and the

  1409      * hexadecimal digits of escaped octets are compared without regard to

  1410      * case.

  1411      *

  1412      * <p> This method satisfies the general contract of the {@link

  1413      * java.lang.Object#equals(Object) Object.equals} method. </p>

  1414      *

  1415      * @param   ob   The object to which this object is to be compared

  1416      *

  1417      * @return  <tt>true</tt> if, and only if, the given object is a URI that

  1418      *          is identical to this URI

  1419      */

  1420     public boolean equals(Object ob) {

  1421         if (ob == this)

  1422             return true;

  1423         if (!(ob instanceof URI))

  1424             return false;

  1425         URI that = (URI)ob;

  1426         if (this.isOpaque() != that.isOpaque()) return false;

  1427         if (!equalIgnoringCase(this.scheme, that.scheme)) return false;

  1428         if (!equal(this.fragment, that.fragment)) return false;

  1430         // Opaque

  1431         if (this.isOpaque())

  1432             return equal(this.schemeSpecificPart, that.schemeSpecificPart);

  1434         // Hierarchical

  1435         if (!equal(this.path, that.path)) return false;

  1436         if (!equal(this.query, that.query)) return false;

  1438         // Authorities

  1439         if (this.authority == that.authority) return true;

  1440         if (this.host != null) {

  1441             // Server-based

  1442             if (!equal(this.userInfo, that.userInfo)) return false;

  1443             if (!equalIgnoringCase(this.host, that.host)) return false;

  1444             if (this.port != that.port) return false;

  1445         } else if (this.authority != null) {

  1446             // Registry-based

  1447             if (!equal(this.authority, that.authority)) return false;

  1448         } else if (this.authority != that.authority) {

  1449             return false;

  1450         }

  1452         return true;

  1453     }

  1455     /**

  1456      * Returns a hash-code value for this URI.  The hash code is based upon all

  1457      * of the URI's components, and satisfies the general contract of the

  1458      * {@link java.lang.Object#hashCode() Object.hashCode} method.

  1459      *

  1460      * @return  A hash-code value for this URI

  1461      */

  1462     public int hashCode() {

  1463         if (hash != 0)

  1464             return hash;

  1465         int h = hashIgnoringCase(0, scheme);

  1466         h = hash(h, fragment);

  1467         if (isOpaque()) {

  1468             h = hash(h, schemeSpecificPart);

  1469         } else {

  1470             h = hash(h, path);

  1471             h = hash(h, query);

  1472             if (host != null) {

  1473                 h = hash(h, userInfo);

  1474                 h = hashIgnoringCase(h, host);

  1475                 h += 1949 * port;

  1476             } else {

  1477                 h = hash(h, authority);

  1478             }

  1479         }

  1480         hash = h;

  1481         return h;

  1482     }

  1484     /**

  1485      * Compares this URI to another object, which must be a URI.

  1486      *

  1487      * <p> When comparing corresponding components of two URIs, if one

  1488      * component is undefined but the other is defined then the first is

  1489      * considered to be less than the second.  Unless otherwise noted, string

  1490      * components are ordered according to their natural, case-sensitive

  1491      * ordering as defined by the {@link java.lang.String#compareTo(Object)

  1492      * String.compareTo} method.  String components that are subject to

  1493      * encoding are compared by comparing their raw forms rather than their

  1494      * encoded forms.

  1495      *

  1496      * <p> The ordering of URIs is defined as follows: </p>

  1497      *

  1498      * <ul type=disc>

  1499      *

  1500      *   <li><p> Two URIs with different schemes are ordered according to the

  1501      *   ordering of their schemes, without regard to case. </p></li>

  1502      *

  1503      *   <li><p> A hierarchical URI is considered to be less than an opaque URI

  1504      *   with an identical scheme. </p></li>

  1505      *

  1506      *   <li><p> Two opaque URIs with identical schemes are ordered according

  1507      *   to the ordering of their scheme-specific parts. </p></li>

  1508      *

  1509      *   <li><p> Two opaque URIs with identical schemes and scheme-specific

  1510      *   parts are ordered according to the ordering of their

  1511      *   fragments. </p></li>

  1512      *

  1513      *   <li><p> Two hierarchical URIs with identical schemes are ordered

  1514      *   according to the ordering of their authority components: </p>

  1515      *

  1516      *   <ul type=disc>

  1517      *

  1518      *     <li><p> If both authority components are server-based then the URIs

  1519      *     are ordered according to their user-information components; if these

  1520      *     components are identical then the URIs are ordered according to the

  1521      *     ordering of their hosts, without regard to case; if the hosts are

  1522      *     identical then the URIs are ordered according to the ordering of

  1523      *     their ports. </p></li>

  1524      *

  1525      *     <li><p> If one or both authority components are registry-based then

  1526      *     the URIs are ordered according to the ordering of their authority

  1527      *     components. </p></li>

  1528      *

  1529      *   </ul></li>

  1530      *

  1531      *   <li><p> Finally, two hierarchical URIs with identical schemes and

  1532      *   authority components are ordered according to the ordering of their

  1533      *   paths; if their paths are identical then they are ordered according to

  1534      *   the ordering of their queries; if the queries are identical then they

  1535      *   are ordered according to the order of their fragments. </p></li>

  1536      *

  1537      * </ul>

  1538      *

  1539      * <p> This method satisfies the general contract of the {@link

  1540      * java.lang.Comparable#compareTo(Object) Comparable.compareTo}

  1541      * method. </p>

  1542      *

  1543      * @param   that

  1544      *          The object to which this URI is to be compared

  1545      *

  1546      * @return  A negative integer, zero, or a positive integer as this URI is

  1547      *          less than, equal to, or greater than the given URI

  1548      *

  1549      * @throws  ClassCastException

  1550      *          If the given object is not a URI

  1551      */

  1552     public int compareTo(URI that) {

  1553         int c;

  1555         if ((c = compareIgnoringCase(this.scheme, that.scheme)) != 0)

  1556             return c;

  1558         if (this.isOpaque()) {

  1559             if (that.isOpaque()) {

  1560                 // Both opaque

  1561                 if ((c = compare(this.schemeSpecificPart,

  1562                                  that.schemeSpecificPart)) != 0)

  1563                     return c;

  1564                 return compare(this.fragment, that.fragment);

  1565             }

  1566             return +1;                  // Opaque > hierarchical

  1567         } else if (that.isOpaque()) {

  1568             return -1;                  // Hierarchical < opaque

  1569         }

  1571         // Hierarchical

  1572         if ((this.host != null) && (that.host != null)) {

  1573             // Both server-based

  1574             if ((c = compare(this.userInfo, that.userInfo)) != 0)

  1575                 return c;

  1576             if ((c = compareIgnoringCase(this.host, that.host)) != 0)

  1577                 return c;

  1578             if ((c = this.port - that.port) != 0)

  1579                 return c;

  1580         } else {

  1581             // If one or both authorities are registry-based then we simply

  1582             // compare them in the usual, case-sensitive way.  If one is

  1583             // registry-based and one is server-based then the strings are

  1584             // guaranteed to be unequal, hence the comparison will never return

  1585             // zero and the compareTo and equals methods will remain

  1586             // consistent.

  1587             if ((c = compare(this.authority, that.authority)) != 0) return c;

  1588         }

  1590         if ((c = compare(this.path, that.path)) != 0) return c;

  1591         if ((c = compare(this.query, that.query)) != 0) return c;

  1592         return compare(this.fragment, that.fragment);

  1593     }

  1595     /**

  1596      * Returns the content of this URI as a string.

  1597      *

  1598      * <p> If this URI was created by invoking one of the constructors in this

  1599      * class then a string equivalent to the original input string, or to the

  1600      * string computed from the originally-given components, as appropriate, is

  1601      * returned.  Otherwise this URI was created by normalization, resolution,

  1602      * or relativization, and so a string is constructed from this URI's

  1603      * components according to the rules specified in <a

  1604      * href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,

  1605      * section&nbsp;5.2, step&nbsp;7. </p>

  1606      *

  1607      * @return  The string form of this URI

  1608      */

  1609     public String toString() {

  1610         defineString();

  1611         return string;

  1612     }

  1614     /**

  1615      * Returns the content of this URI as a US-ASCII string.

  1616      *

  1617      * <p> If this URI does not contain any characters in the <i>other</i>

  1618      * category then an invocation of this method will return the same value as

  1619      * an invocation of the {@link #toString() toString} method.  Otherwise

  1620      * this method works as if by invoking that method and then <a

  1621      * href="#encode">encoding</a> the result.  </p>

  1622      *

  1623      * @return  The string form of this URI, encoded as needed

  1624      *          so that it only contains characters in the US-ASCII

  1625      *          charset

  1626      */

  1627     public String toASCIIString() {

  1628         defineString();

  1629         return encode(string);

  1630     }

  1633     // -- Serialization support --

  1635     /**

  1636      * Saves the content of this URI to the given serial stream.

  1637      *

  1638      * <p> The only serializable field of a URI instance is its <tt>string</tt>

  1639      * field.  That field is given a value, if it does not have one already,

  1640      * and then the {@link java.io.ObjectOutputStream#defaultWriteObject()}

  1641      * method of the given object-output stream is invoked. </p>

  1642      *

  1643      * @param  os  The object-output stream to which this object

  1644      *             is to be written

  1645      */

  1646     private void writeObject(ObjectOutputStream os)

  1647         throws IOException

  1648     {

  1649         defineString();

  1650         os.defaultWriteObject();        // Writes the string field only

  1651     }

  1653     /**

  1654      * Reconstitutes a URI from the given serial stream.

  1655      *

  1656      * <p> The {@link java.io.ObjectInputStream#defaultReadObject()} method is

  1657      * invoked to read the value of the <tt>string</tt> field.  The result is

  1658      * then parsed in the usual way.

  1659      *

  1660      * @param  is  The object-input stream from which this object

  1661      *             is being read

  1662      */

  1663     private void readObject(ObjectInputStream is)

  1664         throws ClassNotFoundException, IOException

  1665     {

  1666         port = -1;                      // Argh

  1667         is.defaultReadObject();

  1668         try {

  1669             new Parser(string).parse(false);

  1670         } catch (URISyntaxException x) {

  1671             IOException y = new InvalidObjectException("Invalid URI");

  1672             y.initCause(x);

  1673             throw y;

  1674         }

  1675     }

  1678     // -- End of public methods --

  1681     // -- Utility methods for string-field comparison and hashing --

  1683     // These methods return appropriate values for null string arguments,

  1684     // thereby simplifying the equals, hashCode, and compareTo methods.

  1685     //

  1686     // The case-ignoring methods should only be applied to strings whose

  1687     // characters are all known to be US-ASCII.  Because of this restriction,

  1688     // these methods are faster than the similar methods in the String class.

  1690     // US-ASCII only

  1691     private static int toLower(char c) {

  1692         if ((c >= 'A') && (c <= 'Z'))

  1693             return c + ('a' - 'A');

  1694         return c;

  1695     }

  1697     private static boolean equal(String s, String t) {

  1698         if (s == t) return true;

  1699         if ((s != null) && (t != null)) {

  1700             if (s.length() != t.length())

  1701                 return false;

  1702             if (s.indexOf('%') < 0)

  1703                 return s.equals(t);

  1704             int n = s.length();

  1705             for (int i = 0; i < n;) {

  1706                 char c = s.charAt(i);

  1707                 char d = t.charAt(i);

  1708                 if (c != '%') {

  1709                     if (c != d)

  1710                         return false;

  1711                     i++;

  1712                     continue;

  1713                 }

  1714                 i++;

  1715                 if (toLower(s.charAt(i)) != toLower(t.charAt(i)))

  1716                     return false;

  1717                 i++;

  1718                 if (toLower(s.charAt(i)) != toLower(t.charAt(i)))

  1719                     return false;

  1720                 i++;

  1721             }

  1722             return true;

  1723         }

  1724         return false;

  1725     }

  1727     // US-ASCII only

  1728     private static boolean equalIgnoringCase(String s, String t) {

  1729         if (s == t) return true;

  1730         if ((s != null) && (t != null)) {

  1731             int n = s.length();

  1732             if (t.length() != n)

  1733                 return false;

  1734             for (int i = 0; i < n; i++) {

  1735                 if (toLower(s.charAt(i)) != toLower(t.charAt(i)))

  1736                     return false;

  1737             }

  1738             return true;

  1739         }

  1740         return false;

  1741     }

  1743     private static int hash(int hash, String s) {

  1744         if (s == null) return hash;

  1745         return hash * 127 + s.hashCode();

  1746     }

  1748     // US-ASCII only

  1749     private static int hashIgnoringCase(int hash, String s) {

  1750         if (s == null) return hash;

  1751         int h = hash;

  1752         int n = s.length();

  1753         for (int i = 0; i < n; i++)

  1754             h = 31 * h + toLower(s.charAt(i));

  1755         return h;

  1756     }

  1758     private static int compare(String s, String t) {

  1759         if (s == t) return 0;

  1760         if (s != null) {

  1761             if (t != null)

  1762                 return s.compareTo(t);

  1763             else

  1764                 return +1;

  1765         } else {

  1766             return -1;

  1767         }

  1768     }

  1770     // US-ASCII only

  1771     private static int compareIgnoringCase(String s, String t) {

  1772         if (s == t) return 0;

  1773         if (s != null) {

  1774             if (t != null) {

  1775                 int sn = s.length();

  1776                 int tn = t.length();

  1777                 int n = sn < tn ? sn : tn;

  1778                 for (int i = 0; i < n; i++) {

  1779                     int c = toLower(s.charAt(i)) - toLower(t.charAt(i));

  1780                     if (c != 0)

  1781                         return c;

  1782                 }

  1783                 return sn - tn;

  1784             }

  1785             return +1;

  1786         } else {

  1787             return -1;

  1788         }

  1789     }

  1792     // -- String construction --

  1794     // If a scheme is given then the path, if given, must be absolute

  1795     //

  1796     private static void checkPath(String s, String scheme, String path)

  1797         throws URISyntaxException

  1798     {

  1799         if (scheme != null) {

  1800             if ((path != null)

  1801                 && ((path.length() > 0) && (path.charAt(0) != '/')))

  1802                 throw new URISyntaxException(s,

  1803                                              "Relative path in absolute URI");

  1804         }

  1805     }

  1807     private void appendAuthority(StringBuffer sb,

  1808                                  String authority,

  1809                                  String userInfo,

  1810                                  String host,

  1811                                  int port)

  1812     {

  1813         if (host != null) {

  1814             sb.append("//");

  1815             if (userInfo != null) {

  1816                 sb.append(quote(userInfo, L_USERINFO, H_USERINFO));

  1817                 sb.append('@');

  1818             }

  1819             boolean needBrackets = ((host.indexOf(':') >= 0)

  1820                                     && !host.startsWith("[")

  1821                                     && !host.endsWith("]"));

  1822             if (needBrackets) sb.append('[');

  1823             sb.append(host);

  1824             if (needBrackets) sb.append(']');

  1825             if (port != -1) {

  1826                 sb.append(':');

  1827                 sb.append(port);

  1828             }

  1829         } else if (authority != null) {

  1830             sb.append("//");

  1831             if (authority.startsWith("[")) {

  1832                 // authority should (but may not) contain an embedded IPv6 address

  1833                 int end = authority.indexOf("]");

  1834                 String doquote = authority, dontquote = "";

  1835                 if (end != -1 && authority.indexOf(":") != -1) {

  1836                     // the authority contains an IPv6 address

  1837                     if (end == authority.length()) {

  1838                         dontquote = authority;

  1839                         doquote = "";

  1840                     } else {

  1841                         dontquote = authority.substring(0 , end + 1);

  1842                         doquote = authority.substring(end + 1);

  1843                     }

  1844                 }

  1845                 sb.append(dontquote);

  1846                 sb.append(quote(doquote,

  1847                             L_REG_NAME | L_SERVER,

  1848                             H_REG_NAME | H_SERVER));

  1849             } else {

  1850                 sb.append(quote(authority,

  1851                             L_REG_NAME | L_SERVER,

  1852                             H_REG_NAME | H_SERVER));

  1853             }

  1854         }

  1855     }

  1857     private void appendSchemeSpecificPart(StringBuffer sb,

  1858                                           String opaquePart,

  1859                                           String authority,

  1860                                           String userInfo,

  1861                                           String host,

  1862                                           int port,

  1863                                           String path,

  1864                                           String query)

  1865     {

  1866         if (opaquePart != null) {

  1867             /* check if SSP begins with an IPv6 address

  1868              * because we must not quote a literal IPv6 address

  1869              */

  1870             if (opaquePart.startsWith("//[")) {

  1871                 int end =  opaquePart.indexOf("]");

  1872                 if (end != -1 && opaquePart.indexOf(":")!=-1) {

  1873                     String doquote, dontquote;

  1874                     if (end == opaquePart.length()) {

  1875                         dontquote = opaquePart;

  1876                         doquote = "";

  1877                     } else {

  1878                         dontquote = opaquePart.substring(0,end+1);

  1879                         doquote = opaquePart.substring(end+1);

  1880                     }

  1881                     sb.append (dontquote);

  1882                     sb.append(quote(doquote, L_URIC, H_URIC));

  1883                 }

  1884             } else {

  1885                 sb.append(quote(opaquePart, L_URIC, H_URIC));

  1886             }

  1887         } else {

  1888             appendAuthority(sb, authority, userInfo, host, port);

  1889             if (path != null)

  1890                 sb.append(quote(path, L_PATH, H_PATH));

  1891             if (query != null) {

  1892                 sb.append('?');

  1893                 sb.append(quote(query, L_URIC, H_URIC));

  1894             }

  1895         }

  1896     }

  1898     private void appendFragment(StringBuffer sb, String fragment) {

  1899         if (fragment != null) {

  1900             sb.append('#');

  1901             sb.append(quote(fragment, L_URIC, H_URIC));

  1902         }

  1903     }

  1905     private String toString(String scheme,

  1906                             String opaquePart,

  1907                             String authority,

  1908                             String userInfo,

  1909                             String host,

  1910                             int port,

  1911                             String path,

  1912                             String query,

  1913                             String fragment)

  1914     {

  1915         StringBuffer sb = new StringBuffer();

  1916         if (scheme != null) {

  1917             sb.append(scheme);

  1918             sb.append(':');

  1919         }

  1920         appendSchemeSpecificPart(sb, opaquePart,

  1921                                  authority, userInfo, host, port,

  1922                                  path, query);

  1923         appendFragment(sb, fragment);

  1924         return sb.toString();

  1925     }

  1927     private void defineSchemeSpecificPart() {

  1928         if (schemeSpecificPart != null) return;

  1929         StringBuffer sb = new StringBuffer();

  1930         appendSchemeSpecificPart(sb, null, getAuthority(), getUserInfo(),

  1931                                  host, port, getPath(), getQuery());

  1932         if (sb.length() == 0) return;

  1933         schemeSpecificPart = sb.toString();

  1934     }

  1936     private void defineString() {

  1937         if (string != null) return;

  1939         StringBuffer sb = new StringBuffer();

  1940         if (scheme != null) {

  1941             sb.append(scheme);

  1942             sb.append(':');

  1943         }

  1944         if (isOpaque()) {

  1945             sb.append(schemeSpecificPart);

  1946         } else {

  1947             if (host != null) {

  1948                 sb.append("//");

  1949                 if (userInfo != null) {

  1950                     sb.append(userInfo);

  1951                     sb.append('@');

  1952                 }

  1953                 boolean needBrackets = ((host.indexOf(':') >= 0)

  1954                                     && !host.startsWith("[")

  1955                                     && !host.endsWith("]"));

  1956                 if (needBrackets) sb.append('[');

  1957                 sb.append(host);

  1958                 if (needBrackets) sb.append(']');

  1959                 if (port != -1) {

  1960                     sb.append(':');

  1961                     sb.append(port);

  1962                 }

  1963             } else if (authority != null) {

  1964                 sb.append("//");

  1965                 sb.append(authority);

  1966             }

  1967             if (path != null)

  1968                 sb.append(path);

  1969             if (query != null) {

  1970                 sb.append('?');

  1971                 sb.append(query);

  1972             }

  1973         }

  1974         if (fragment != null) {

  1975             sb.append('#');

  1976             sb.append(fragment);

  1977         }

  1978         string = sb.toString();

  1979     }

  1982     // -- Normalization, resolution, and relativization --

  1984     // RFC2396 5.2 (6)

  1985     private static String resolvePath(String base, String child,

  1986                                       boolean absolute)

  1987     {

  1988         int i = base.lastIndexOf('/');

  1989         int cn = child.length();

  1990         String path = "";

  1992         if (cn == 0) {

  1993             // 5.2 (6a)

  1994             if (i >= 0)

  1995                 path = base.substring(0, i + 1);

  1996         } else {

  1997             StringBuffer sb = new StringBuffer(base.length() + cn);

  1998             // 5.2 (6a)

  1999             if (i >= 0)

  2000                 sb.append(base.substring(0, i + 1));

  2001             // 5.2 (6b)

  2002             sb.append(child);

  2003             path = sb.toString();

  2004         }

  2006         // 5.2 (6c-f)

  2007         String np = normalize(path);

  2009         // 5.2 (6g): If the result is absolute but the path begins with "../",

  2010         // then we simply leave the path as-is

  2012         return np;

  2013     }

  2015     // RFC2396 5.2

  2016     private static URI resolve(URI base, URI child) {

  2017         // check if child if opaque first so that NPE is thrown

  2018         // if child is null.

  2019         if (child.isOpaque() || base.isOpaque())

  2020             return child;

  2022         // 5.2 (2): Reference to current document (lone fragment)

  2023         if ((child.scheme == null) && (child.authority == null)

  2024             && child.path.equals("") && (child.fragment != null)

  2025             && (child.query == null)) {

  2026             if ((base.fragment != null)

  2027                 && child.fragment.equals(base.fragment)) {

  2028                 return base;

  2029             }

  2030             URI ru = new URI();

  2031             ru.scheme = base.scheme;

  2032             ru.authority = base.authority;

  2033             ru.userInfo = base.userInfo;

  2034             ru.host = base.host;

  2035             ru.port = base.port;

  2036             ru.path = base.path;

  2037             ru.fragment = child.fragment;

  2038             ru.query = base.query;

  2039             return ru;

  2040         }

  2042         // 5.2 (3): Child is absolute

  2043         if (child.scheme != null)

  2044             return child;

  2046         URI ru = new URI();             // Resolved URI

  2047         ru.scheme = base.scheme;

  2048         ru.query = child.query;

  2049         ru.fragment = child.fragment;

  2051         // 5.2 (4): Authority

  2052         if (child.authority == null) {

  2053             ru.authority = base.authority;

  2054             ru.host = base.host;

  2055             ru.userInfo = base.userInfo;

  2056             ru.port = base.port;

  2058             String cp = (child.path == null) ? "" : child.path;

  2059             if ((cp.length() > 0) && (cp.charAt(0) == '/')) {

  2060                 // 5.2 (5): Child path is absolute

  2061                 ru.path = child.path;

  2062             } else {

  2063                 // 5.2 (6): Resolve relative path

  2064                 ru.path = resolvePath(base.path, cp, base.isAbsolute());

  2065             }

  2066         } else {

  2067             ru.authority = child.authority;

  2068             ru.host = child.host;

  2069             ru.userInfo = child.userInfo;

  2070             ru.host = child.host;

  2071             ru.port = child.port;

  2072             ru.path = child.path;

  2073         }

  2075         // 5.2 (7): Recombine (nothing to do here)

  2076         return ru;

  2077     }

  2079     // If the given URI's path is normal then return the URI;

  2080     // o.w., return a new URI containing the normalized path.

  2081     //

  2082     private static URI normalize(URI u) {

  2083         if (u.isOpaque() || (u.path == null) || (u.path.length() == 0))

  2084             return u;

  2086         String np = normalize(u.path);

  2087         if (np == u.path)

  2088             return u;

  2090         URI v = new URI();

  2091         v.scheme = u.scheme;

  2092         v.fragment = u.fragment;

  2093         v.authority = u.authority;

  2094         v.userInfo = u.userInfo;

  2095         v.host = u.host;

  2096         v.port = u.port;

  2097         v.path = np;

  2098         v.query = u.query;

  2099         return v;

  2100     }

  2102     // If both URIs are hierarchical, their scheme and authority components are

  2103     // identical, and the base path is a prefix of the child's path, then

  2104     // return a relative URI that, when resolved against the base, yields the

  2105     // child; otherwise, return the child.

  2106     //

  2107     private static URI relativize(URI base, URI child) {

  2108         // check if child if opaque first so that NPE is thrown

  2109         // if child is null.

  2110         if (child.isOpaque() || base.isOpaque())

  2111             return child;

  2112         if (!equalIgnoringCase(base.scheme, child.scheme)

  2113             || !equal(base.authority, child.authority))

  2114             return child;

  2116         String bp = normalize(base.path);

  2117         String cp = normalize(child.path);

  2118         if (!bp.equals(cp)) {

  2119             if (!bp.endsWith("/"))

  2120                 bp = bp + "/";

  2121             if (!cp.startsWith(bp))

  2122                 return child;

  2123         }

  2125         URI v = new URI();

  2126         v.path = cp.substring(bp.length());

  2127         v.query = child.query;

  2128         v.fragment = child.fragment;

  2129         return v;

  2130     }

  2134     // -- Path normalization --

  2136     // The following algorithm for path normalization avoids the creation of a

  2137     // string object for each segment, as well as the use of a string buffer to

  2138     // compute the final result, by using a single char array and editing it in

  2139     // place.  The array is first split into segments, replacing each slash

  2140     // with '\0' and creating a segment-index array, each element of which is

  2141     // the index of the first char in the corresponding segment.  We then walk

  2142     // through both arrays, removing ".", "..", and other segments as necessary

  2143     // by setting their entries in the index array to -1.  Finally, the two

  2144     // arrays are used to rejoin the segments and compute the final result.

  2145     //

  2146     // This code is based upon src/solaris/native/java/io/canonicalize_md.c

  2149     // Check the given path to see if it might need normalization.  A path

  2150     // might need normalization if it contains duplicate slashes, a "."

  2151     // segment, or a ".." segment.  Return -1 if no further normalization is

  2152     // possible, otherwise return the number of segments found.

  2153     //

  2154     // This method takes a string argument rather than a char array so that

  2155     // this test can be performed without invoking path.toCharArray().

  2156     //

  2157     static private int needsNormalization(String path) {

  2158         boolean normal = true;

  2159         int ns = 0;                     // Number of segments

  2160         int end = path.length() - 1;    // Index of last char in path

  2161         int p = 0;                      // Index of next char in path

  2163         // Skip initial slashes

  2164         while (p <= end) {

  2165             if (path.charAt(p) != '/') break;

  2166             p++;

  2167         }

  2168         if (p > 1) normal = false;

  2170         // Scan segments

  2171         while (p <= end) {

  2173             // Looking at "." or ".." ?

  2174             if ((path.charAt(p) == '.')

  2175                 && ((p == end)

  2176                     || ((path.charAt(p + 1) == '/')

  2177                         || ((path.charAt(p + 1) == '.')

  2178                             && ((p + 1 == end)

  2179                                 || (path.charAt(p + 2) == '/')))))) {

  2180                 normal = false;

  2181             }

  2182             ns++;

  2184             // Find beginning of next segment

  2185             while (p <= end) {

  2186                 if (path.charAt(p++) != '/')

  2187                     continue;

  2189                 // Skip redundant slashes

  2190                 while (p <= end) {

  2191                     if (path.charAt(p) != '/') break;

  2192                     normal = false;

  2193                     p++;

  2194                 }

  2196                 break;

  2197             }

  2198         }

  2200         return normal ? -1 : ns;

  2201     }

  2204     // Split the given path into segments, replacing slashes with nulls and

  2205     // filling in the given segment-index array.

  2206     //

  2207     // Preconditions:

  2208     //   segs.length == Number of segments in path

  2209     //

  2210     // Postconditions:

  2211     //   All slashes in path replaced by '\0'

  2212     //   segs[i] == Index of first char in segment i (0 <= i < segs.length)

  2213     //

  2214     static private void split(char[] path, int[] segs) {

  2215         int end = path.length - 1;      // Index of last char in path

  2216         int p = 0;                      // Index of next char in path

  2217         int i = 0;                      // Index of current segment

  2219         // Skip initial slashes

  2220         while (p <= end) {

  2221             if (path[p] != '/') break;

  2222             path[p] = '\0';

  2223             p++;

  2224         }

  2226         while (p <= end) {

  2228             // Note start of segment

  2229             segs[i++] = p++;

  2231             // Find beginning of next segment

  2232             while (p <= end) {

  2233                 if (path[p++] != '/')

  2234                     continue;

  2235                 path[p - 1] = '\0';

  2237                 // Skip redundant slashes

  2238                 while (p <= end) {

  2239                     if (path[p] != '/') break;

  2240                     path[p++] = '\0';

  2241                 }

  2242                 break;

  2243             }

  2244         }

  2246         if (i != segs.length)

  2247             throw new InternalError();  // ASSERT

  2248     }

  2251     // Join the segments in the given path according to the given segment-index

  2252     // array, ignoring those segments whose index entries have been set to -1,

  2253     // and inserting slashes as needed.  Return the length of the resulting

  2254     // path.

  2255     //

  2256     // Preconditions:

  2257     //   segs[i] == -1 implies segment i is to be ignored

  2258     //   path computed by split, as above, with '\0' having replaced '/'

  2259     //

  2260     // Postconditions:

  2261     //   path[0] .. path[return value] == Resulting path

  2262     //

  2263     static private int join(char[] path, int[] segs) {

  2264         int ns = segs.length;           // Number of segments

  2265         int end = path.length - 1;      // Index of last char in path

  2266         int p = 0;                      // Index of next path char to write

  2268         if (path[p] == '\0') {

  2269             // Restore initial slash for absolute paths

  2270             path[p++] = '/';

  2271         }

  2273         for (int i = 0; i < ns; i++) {

  2274             int q = segs[i];            // Current segment

  2275             if (q == -1)

  2276                 // Ignore this segment

  2277                 continue;

  2279             if (p == q) {

  2280                 // We're already at this segment, so just skip to its end

  2281                 while ((p <= end) && (path[p] != '\0'))

  2282                     p++;

  2283                 if (p <= end) {

  2284                     // Preserve trailing slash

  2285                     path[p++] = '/';

  2286                 }

  2287             } else if (p < q) {

  2288                 // Copy q down to p

  2289                 while ((q <= end) && (path[q] != '\0'))

  2290                     path[p++] = path[q++];

  2291                 if (q <= end) {

  2292                     // Preserve trailing slash

  2293                     path[p++] = '/';

  2294                 }

  2295             } else

  2296                 throw new InternalError(); // ASSERT false

  2297         }

  2299         return p;

  2300     }

  2303     // Remove "." segments from the given path, and remove segment pairs

  2304     // consisting of a non-".." segment followed by a ".." segment.

  2305     //

  2306     private static void removeDots(char[] path, int[] segs) {

  2307         int ns = segs.length;

  2308         int end = path.length - 1;

  2310         for (int i = 0; i < ns; i++) {

  2311             int dots = 0;               // Number of dots found (0, 1, or 2)

  2313             // Find next occurrence of "." or ".."

  2314             do {

  2315                 int p = segs[i];

  2316                 if (path[p] == '.') {

  2317                     if (p == end) {

  2318                         dots = 1;

  2319                         break;

  2320                     } else if (path[p + 1] == '\0') {

  2321                         dots = 1;

  2322                         break;

  2323                     } else if ((path[p + 1] == '.')

  2324                                && ((p + 1 == end)

  2325                                    || (path[p + 2] == '\0'))) {

  2326                         dots = 2;

  2327                         break;

  2328                     }

  2329                 }

  2330                 i++;

  2331             } while (i < ns);

  2332             if ((i > ns) || (dots == 0))

  2333                 break;

  2335             if (dots == 1) {

  2336                 // Remove this occurrence of "."

  2337                 segs[i] = -1;

  2338             } else {

  2339                 // If there is a preceding non-".." segment, remove both that

  2340                 // segment and this occurrence of ".."; otherwise, leave this

  2341                 // ".." segment as-is.

  2342                 int j;

  2343                 for (j = i - 1; j >= 0; j--) {

  2344                     if (segs[j] != -1) break;

  2345                 }

  2346                 if (j >= 0) {

  2347                     int q = segs[j];

  2348                     if (!((path[q] == '.')

  2349                           && (path[q + 1] == '.')

  2350                           && (path[q + 2] == '\0'))) {

  2351                         segs[i] = -1;

  2352                         segs[j] = -1;

  2353                     }

  2354                 }

  2355             }

  2356         }

  2357     }

  2360     // DEVIATION: If the normalized path is relative, and if the first

  2361     // segment could be parsed as a scheme name, then prepend a "." segment

  2362     //

  2363     private static void maybeAddLeadingDot(char[] path, int[] segs) {

  2365         if (path[0] == '\0')

  2366             // The path is absolute

  2367             return;

  2369         int ns = segs.length;

  2370         int f = 0;                      // Index of first segment

  2371         while (f < ns) {

  2372             if (segs[f] >= 0)

  2373                 break;

  2374             f++;

  2375         }

  2376         if ((f >= ns) || (f == 0))

  2377             // The path is empty, or else the original first segment survived,

  2378             // in which case we already know that no leading "." is needed

  2379             return;

  2381         int p = segs[f];

  2382         while ((p < path.length) && (path[p] != ':') && (path[p] != '\0')) p++;

  2383         if (p >= path.length || path[p] == '\0')

  2384             // No colon in first segment, so no "." needed

  2385             return;

  2387         // At this point we know that the first segment is unused,

  2388         // hence we can insert a "." segment at that position

  2389         path[0] = '.';

  2390         path[1] = '\0';

  2391         segs[0] = 0;

  2392     }

  2395     // Normalize the given path string.  A normal path string has no empty

  2396     // segments (i.e., occurrences of "//"), no segments equal to ".", and no

  2397     // segments equal to ".." that are preceded by a segment not equal to "..".

  2398     // In contrast to Unix-style pathname normalization, for URI paths we

  2399     // always retain trailing slashes.

  2400     //

  2401     private static String normalize(String ps) {

  2403         // Does this path need normalization?

  2404         int ns = needsNormalization(ps);        // Number of segments

  2405         if (ns < 0)

  2406             // Nope -- just return it

  2407             return ps;

  2409         char[] path = ps.toCharArray();         // Path in char-array form

  2411         // Split path into segments

  2412         int[] segs = new int[ns];               // Segment-index array

  2413         split(path, segs);

  2415         // Remove dots

  2416         removeDots(path, segs);

  2418         // Prevent scheme-name confusion

  2419         maybeAddLeadingDot(path, segs);

  2421         // Join the remaining segments and return the result

  2422         String s = new String(path, 0, join(path, segs));

  2423         if (s.equals(ps)) {

  2424             // string was already normalized

  2425             return ps;

  2426         }

  2427         return s;

  2428     }

  2432     // -- Character classes for parsing --

  2434     // RFC2396 precisely specifies which characters in the US-ASCII charset are

  2435     // permissible in the various components of a URI reference.  We here

  2436     // define a set of mask pairs to aid in enforcing these restrictions.  Each

  2437     // mask pair consists of two longs, a low mask and a high mask.  Taken

  2438     // together they represent a 128-bit mask, where bit i is set iff the

  2439     // character with value i is permitted.

  2440     //

  2441     // This approach is more efficient than sequentially searching arrays of

  2442     // permitted characters.  It could be made still more efficient by

  2443     // precompiling the mask information so that a character's presence in a

  2444     // given mask could be determined by a single table lookup.

  2446     // Compute the low-order mask for the characters in the given string

  2447     private static long lowMask(String chars) {

  2448         int n = chars.length();

  2449         long m = 0;

  2450         for (int i = 0; i < n; i++) {

  2451             char c = chars.charAt(i);

  2452             if (c < 64)

  2453                 m |= (1L << c);

  2454         }

  2455         return m;

  2456     }

  2458     // Compute the high-order mask for the characters in the given string

  2459     private static long highMask(String chars) {

  2460         int n = chars.length();

  2461         long m = 0;

  2462         for (int i = 0; i < n; i++) {

  2463             char c = chars.charAt(i);

  2464             if ((c >= 64) && (c < 128))

  2465                 m |= (1L << (c - 64));

  2466         }

  2467         return m;

  2468     }

  2470     // Compute a low-order mask for the characters

  2471     // between first and last, inclusive

  2472     private static long lowMask(char first, char last) {

  2473         long m = 0;

  2474         int f = Math.max(Math.min(first, 63), 0);

  2475         int l = Math.max(Math.min(last, 63), 0);

  2476         for (int i = f; i <= l; i++)

  2477             m |= 1L << i;

  2478         return m;

  2479     }

  2481     // Compute a high-order mask for the characters

  2482     // between first and last, inclusive

  2483     private static long highMask(char first, char last) {

  2484         long m = 0;

  2485         int f = Math.max(Math.min(first, 127), 64) - 64;

  2486         int l = Math.max(Math.min(last, 127), 64) - 64;

  2487         for (int i = f; i <= l; i++)

  2488             m |= 1L << i;

  2489         return m;

  2490     }

  2492     // Tell whether the given character is permitted by the given mask pair

  2493     private static boolean match(char c, long lowMask, long highMask) {

  2494         if (c == 0) // 0 doesn't have a slot in the mask. So, it never matches.

  2495             return false;

  2496         if (c < 64)

  2497             return ((1L << c) & lowMask) != 0;

  2498         if (c < 128)

  2499             return ((1L << (c - 64)) & highMask) != 0;

  2500         return false;

  2501     }

  2503     // Character-class masks, in reverse order from RFC2396 because

  2504     // initializers for static fields cannot make forward references.

  2506     // digit    = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" |

  2507     //            "8" | "9"

  2508     private static final long L_DIGIT = lowMask('0', '9');

  2509     private static final long H_DIGIT = 0L;

  2511     // upalpha  = "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" |

  2512     //            "J" | "K" | "L" | "M" | "N" | "O" | "P" | "Q" | "R" |

  2513     //            "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z"

  2514     private static final long L_UPALPHA = 0L;

  2515     private static final long H_UPALPHA = highMask('A', 'Z');

  2517     // lowalpha = "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" |

  2518     //            "j" | "k" | "l" | "m" | "n" | "o" | "p" | "q" | "r" |

  2519     //            "s" | "t" | "u" | "v" | "w" | "x" | "y" | "z"

  2520     private static final long L_LOWALPHA = 0L;

  2521     private static final long H_LOWALPHA = highMask('a', 'z');

  2523     // alpha         = lowalpha | upalpha

  2524     private static final long L_ALPHA = L_LOWALPHA | L_UPALPHA;

  2525     private static final long H_ALPHA = H_LOWALPHA | H_UPALPHA;

  2527     // alphanum      = alpha | digit

  2528     private static final long L_ALPHANUM = L_DIGIT | L_ALPHA;

  2529     private static final long H_ALPHANUM = H_DIGIT | H_ALPHA;

  2531     // hex           = digit | "A" | "B" | "C" | "D" | "E" | "F" |

  2532     //                         "a" | "b" | "c" | "d" | "e" | "f"

  2533     private static final long L_HEX = L_DIGIT;

  2534     private static final long H_HEX = highMask('A', 'F') | highMask('a', 'f');

  2536     // mark          = "-" | "_" | "." | "!" | "~" | "*" | "'" |

  2537     //                 "(" | ")"

  2538     private static final long L_MARK = lowMask("-_.!~*'()");

  2539     private static final long H_MARK = highMask("-_.!~*'()");

  2541     // unreserved    = alphanum | mark

  2542     private static final long L_UNRESERVED = L_ALPHANUM | L_MARK;

  2543     private static final long H_UNRESERVED = H_ALPHANUM | H_MARK;

  2545     // reserved      = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |

  2546     //                 "$" | "," | "[" | "]"

  2547     // Added per RFC2732: "[", "]"

  2548     private static final long L_RESERVED = lowMask(";/?:@&=+$,[]");

  2549     private static final long H_RESERVED = highMask(";/?:@&=+$,[]");

  2551     // The zero'th bit is used to indicate that escape pairs and non-US-ASCII

  2552     // characters are allowed; this is handled by the scanEscape method below.

  2553     private static final long L_ESCAPED = 1L;

  2554     private static final long H_ESCAPED = 0L;

  2556     // uric          = reserved | unreserved | escaped

  2557     private static final long L_URIC = L_RESERVED | L_UNRESERVED | L_ESCAPED;

  2558     private static final long H_URIC = H_RESERVED | H_UNRESERVED | H_ESCAPED;

  2560     // pchar         = unreserved | escaped |

  2561     //                 ":" | "@" | "&" | "=" | "+" | "$" | ","

  2562     private static final long L_PCHAR

  2563         = L_UNRESERVED | L_ESCAPED | lowMask(":@&=+$,");

  2564     private static final long H_PCHAR

  2565         = H_UNRESERVED | H_ESCAPED | highMask(":@&=+$,");

  2567     // All valid path characters

  2568     private static final long L_PATH = L_PCHAR | lowMask(";/");

  2569     private static final long H_PATH = H_PCHAR | highMask(";/");

  2571     // Dash, for use in domainlabel and toplabel

  2572     private static final long L_DASH = lowMask("-");

  2573     private static final long H_DASH = highMask("-");

  2575     // Dot, for use in hostnames

  2576     private static final long L_DOT = lowMask(".");

  2577     private static final long H_DOT = highMask(".");

  2579     // userinfo      = *( unreserved | escaped |

  2580     //                    ";" | ":" | "&" | "=" | "+" | "$" | "," )

  2581     private static final long L_USERINFO

  2582         = L_UNRESERVED | L_ESCAPED | lowMask(";:&=+$,");

  2583     private static final long H_USERINFO

  2584         = H_UNRESERVED | H_ESCAPED | highMask(";:&=+$,");

  2586     // reg_name      = 1*( unreserved | escaped | "$" | "," |

  2587     //                     ";" | ":" | "@" | "&" | "=" | "+" )

  2588     private static final long L_REG_NAME

  2589         = L_UNRESERVED | L_ESCAPED | lowMask("$,;:@&=+");

  2590     private static final long H_REG_NAME

  2591         = H_UNRESERVED | H_ESCAPED | highMask("$,;:@&=+");

  2593     // All valid characters for server-based authorities

  2594     private static final long L_SERVER

  2595         = L_USERINFO | L_ALPHANUM | L_DASH | lowMask(".:@[]");

  2596     private static final long H_SERVER

  2597         = H_USERINFO | H_ALPHANUM | H_DASH | highMask(".:@[]");

  2599     // Special case of server authority that represents an IPv6 address

  2600     // In this case, a % does not signify an escape sequence

  2601     private static final long L_SERVER_PERCENT

  2602         = L_SERVER | lowMask("%");

  2603     private static final long H_SERVER_PERCENT

  2604         = H_SERVER | highMask("%");

  2605     private static final long L_LEFT_BRACKET = lowMask("[");

  2606     private static final long H_LEFT_BRACKET = highMask("[");

  2608     // scheme        = alpha *( alpha | digit | "+" | "-" | "." )

  2609     private static final long L_SCHEME = L_ALPHA | L_DIGIT | lowMask("+-.");

  2610     private static final long H_SCHEME = H_ALPHA | H_DIGIT | highMask("+-.");

  2612     // uric_no_slash = unreserved | escaped | ";" | "?" | ":" | "@" |

  2613     //                 "&" | "=" | "+" | "$" | ","

  2614     private static final long L_URIC_NO_SLASH

  2615         = L_UNRESERVED | L_ESCAPED | lowMask(";?:@&=+$,");

  2616     private static final long H_URIC_NO_SLASH

  2617         = H_UNRESERVED | H_ESCAPED | highMask(";?:@&=+$,");

  2620     // -- Escaping and encoding --

  2622     private final static char[] hexDigits = {

  2623         '0', '1', '2', '3', '4', '5', '6', '7',

  2624         '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'

  2625     };

  2627     private static void appendEscape(StringBuffer sb, byte b) {

  2628         sb.append('%');

  2629         sb.append(hexDigits[(b >> 4) & 0x0f]);

  2630         sb.append(hexDigits[(b >> 0) & 0x0f]);

  2631     }

  2633     private static void appendEncoded(StringBuffer sb, char c) {

  2634         ByteBuffer bb = null;

  2635         try {

  2636             bb = ThreadLocalCoders.encoderFor("UTF-8")

  2637                 .encode(CharBuffer.wrap("" + c));

  2638         } catch (CharacterCodingException x) {

  2639             assert false;

  2640         }

  2641         while (bb.hasRemaining()) {

  2642             int b = bb.get() & 0xff;

  2643             if (b >= 0x80)

  2644                 appendEscape(sb, (byte)b);

  2645             else

  2646                 sb.append((char)b);

  2647         }

  2648     }

  2650     // Quote any characters in s that are not permitted

  2651     // by the given mask pair

  2652     //

  2653     private static String quote(String s, long lowMask, long highMask) {

  2654         int n = s.length();

  2655         StringBuffer sb = null;

  2656         boolean allowNonASCII = ((lowMask & L_ESCAPED) != 0);

  2657         for (int i = 0; i < s.length(); i++) {

  2658             char c = s.charAt(i);

  2659             if (c < '\u0080') {

  2660                 if (!match(c, lowMask, highMask)) {

  2661                     if (sb == null) {

  2662                         sb = new StringBuffer();

  2663                         sb.append(s.substring(0, i));

  2664                     }

  2665                     appendEscape(sb, (byte)c);

  2666                 } else {

  2667                     if (sb != null)

  2668                         sb.append(c);

  2669                 }

  2670             } else if (allowNonASCII

  2671                        && (Character.isSpaceChar(c)

  2672                            || Character.isISOControl(c))) {

  2673                 if (sb == null) {

  2674                     sb = new StringBuffer();

  2675                     sb.append(s.substring(0, i));

  2676                 }

  2677                 appendEncoded(sb, c);

  2678             } else {

  2679                 if (sb != null)

  2680                     sb.append(c);

  2681             }

  2682         }

  2683         return (sb == null) ? s : sb.toString();

  2684     }

  2686     // Encodes all characters >= \u0080 into escaped, normalized UTF-8 octets,

  2687     // assuming that s is otherwise legal

  2688     //

  2689     private static String encode(String s) {

  2690         int n = s.length();

  2691         if (n == 0)

  2692             return s;

  2694         // First check whether we actually need to encode

  2695         for (int i = 0;;) {

  2696             if (s.charAt(i) >= '\u0080')

  2697                 break;

  2698             if (++i >= n)

  2699                 return s;

  2700         }

  2702         String ns = Normalizer.normalize(s, Normalizer.Form.NFC);

  2703         ByteBuffer bb = null;

  2704         try {

  2705             bb = ThreadLocalCoders.encoderFor("UTF-8")

  2706                 .encode(CharBuffer.wrap(ns));

  2707         } catch (CharacterCodingException x) {

  2708             assert false;

  2709         }

  2711         StringBuffer sb = new StringBuffer();

  2712         while (bb.hasRemaining()) {

  2713             int b = bb.get() & 0xff;

  2714             if (b >= 0x80)

  2715                 appendEscape(sb, (byte)b);

  2716             else

  2717                 sb.append((char)b);

  2718         }

  2719         return sb.toString();

  2720     }

  2722     private static int decode(char c) {

  2723         if ((c >= '0') && (c <= '9'))

  2724             return c - '0';

  2725         if ((c >= 'a') && (c <= 'f'))

  2726             return c - 'a' + 10;

  2727         if ((c >= 'A') && (c <= 'F'))

  2728             return c - 'A' + 10;

  2729         assert false;

  2730         return -1;

  2731     }

  2733     private static byte decode(char c1, char c2) {

  2734         return (byte)(  ((decode(c1) & 0xf) << 4)

  2735                       | ((decode(c2) & 0xf) << 0));

  2736     }

    // Evaluates all escapes in s, applying UTF-8 decoding if needed.  Assumes
    // that escapes are well-formed syntactically, i.e., of the form %XX.  If a
    // sequence of escaped octets is not valid UTF-8 then the erroneous octets
    // are replaced with '\uFFFD'.
    // Exception: any "%" found between "[]" is left alone. It is an IPv6 literal
    //            with a scope_id
    //
    // A null, empty, or '%'-free string is returned as-is (same reference).
    //
    private static String decode(String s) {
        if (s == null)
            return s;
        int n = s.length();
        if (n == 0)
            return s;
        if (s.indexOf('%') < 0)
            return s;

        StringBuffer sb = new StringBuffer(n);      // Decoded result
        ByteBuffer bb = ByteBuffer.allocate(n);     // Octets of one run of escapes
        CharBuffer cb = CharBuffer.allocate(n);     // Chars decoded from that run
        // Malformed or unmappable octets are replaced rather than reported
        CharsetDecoder dec = ThreadLocalCoders.decoderFor("UTF-8")
            .onMalformedInput(CodingErrorAction.REPLACE)
            .onUnmappableCharacter(CodingErrorAction.REPLACE);

        // This is not horribly efficient, but it will do for now
        char c = s.charAt(0);
        boolean betweenBrackets = false;

        for (int i = 0; i < n;) {
            assert c == s.charAt(i);    // Loop invariant
            // Track whether the current position lies inside "[...]"
            if (c == '[') {
                betweenBrackets = true;
            } else if (betweenBrackets && c == ']') {
                betweenBrackets = false;
            }
            // An ordinary char, or a '%' inside brackets, is copied unchanged
            if (c != '%' || betweenBrackets) {
                sb.append(c);
                if (++i >= n)
                    break;
                c = s.charAt(i);
                continue;
            }
            // c is the '%' that opens a run of consecutive escapes.  Collect
            // the octets of the whole run before decoding, so that a multi-byte
            // UTF-8 sequence spread over several escapes is decoded as a unit.
            bb.clear();
            int ui = i;                 // Start of the run; not read again below
            for (;;) {
                assert (n - i >= 2);
                // The two pre-increments step onto the two hex digits in turn
                bb.put(decode(s.charAt(++i), s.charAt(++i)));
                // Step past the second digit; stop at end of input or at the
                // first char that does not start another escape
                if (++i >= n)
                    break;
                c = s.charAt(i);
                if (c != '%')
                    break;
            }
            // Decode the collected octets and append the resulting chars
            bb.flip();
            cb.clear();
            dec.reset();
            CoderResult cr = dec.decode(bb, cb, true);
            assert cr.isUnderflow();
            cr = dec.flush(cb);
            assert cr.isUnderflow();
            sb.append(cb.flip().toString());
        }

        return sb.toString();
    }

  2804     // -- Parsing --

  2806     // For convenience we wrap the input URI string in a new instance of the

  2807     // following internal class.  This saves always having to pass the input

  2808     // string as an argument to each internal scan/parse method.

  2810     private class Parser {

  2812         private String input;           // URI input string

  2813         private boolean requireServerAuthority = false;

  2815         Parser(String s) {

  2816             input = s;

  2817             string = s;

  2818         }

  2820         // -- Methods for throwing URISyntaxException in various ways --

  2822         private void fail(String reason) throws URISyntaxException {

  2823             throw new URISyntaxException(input, reason);

  2824         }

  2826         private void fail(String reason, int p) throws URISyntaxException {

  2827             throw new URISyntaxException(input, reason, p);

  2828         }

  2830         private void failExpecting(String expected, int p)

  2831             throws URISyntaxException

  2832         {

  2833             fail("Expected " + expected, p);

  2834         }

  2836         private void failExpecting(String expected, String prior, int p)

  2837             throws URISyntaxException

  2838         {

  2839             fail("Expected " + expected + " following " + prior, p);

  2840         }

  2843         // -- Simple access to the input string --

  2845         // Return a substring of the input string

  2846         //

  2847         private String substring(int start, int end) {

  2848             return input.substring(start, end);

  2849         }

  2851         // Return the char at position p,

  2852         // assuming that p < input.length()

  2853         //

  2854         private char charAt(int p) {

  2855             return input.charAt(p);

  2856         }

  2858         // Tells whether start < end and, if so, whether charAt(start) == c

  2859         //

  2860         private boolean at(int start, int end, char c) {

  2861             return (start < end) && (charAt(start) == c);

  2862         }

  2864         // Tells whether start + s.length() < end and, if so,

  2865         // whether the chars at the start position match s exactly

  2866         //

  2867         private boolean at(int start, int end, String s) {

  2868             int p = start;

  2869             int sn = s.length();

  2870             if (sn > end - p)

  2871                 return false;

  2872             int i = 0;

  2873             while (i < sn) {

  2874                 if (charAt(p++) != s.charAt(i)) {

  2875                     break;

  2876                 }

  2877                 i++;

  2878             }

  2879             return (i == sn);

  2880         }

  2883         // -- Scanning --

  2885         // The various scan and parse methods that follow use a uniform

  2886         // convention of taking the current start position and end index as

  2887         // their first two arguments.  The start is inclusive while the end is

  2888         // exclusive, just as in the String class, i.e., a start/end pair

  2889         // denotes the half-open interval [start, end) of the input string.

  2890         //

  2891         // These methods never proceed past the end position.  They may return

  2892         // -1 to indicate outright failure, but more often they simply return

  2893         // the position of the first char after the last char scanned.  Thus

  2894         // a typical idiom is

  2895         //

  2896         //     int p = start;

  2897         //     int q = scan(p, end, ...);

  2898         //     if (q > p)

  2899         //         // We scanned something

  2900         //         ...;

  2901         //     else if (q == p)

  2902         //         // We scanned nothing

  2903         //         ...;

  2904         //     else if (q == -1)

  2905         //         // Something went wrong

  2906         //         ...;

  2909         // Scan a specific char: If the char at the given start position is

  2910         // equal to c, return the index of the next char; otherwise, return the

  2911         // start position.

  2912         //

  2913         private int scan(int start, int end, char c) {

  2914             if ((start < end) && (charAt(start) == c))

  2915                 return start + 1;

  2916             return start;

  2917         }

  2919         // Scan forward from the given start position.  Stop at the first char

  2920         // in the err string (in which case -1 is returned), or the first char

  2921         // in the stop string (in which case the index of the preceding char is

  2922         // returned), or the end of the input string (in which case the length

  2923         // of the input string is returned).  May return the start position if

  2924         // nothing matches.

  2925         //

  2926         private int scan(int start, int end, String err, String stop) {

  2927             int p = start;

  2928             while (p < end) {

  2929                 char c = charAt(p);

  2930                 if (err.indexOf(c) >= 0)

  2931                     return -1;

  2932                 if (stop.indexOf(c) >= 0)

  2933                     break;

  2934                 p++;

  2935             }

  2936             return p;

  2937         }

  2939         // Scan a potential escape sequence, starting at the given position,

  2940         // with the given first char (i.e., charAt(start) == c).

  2941         //

  2942         // This method assumes that if escapes are allowed then visible

  2943         // non-US-ASCII chars are also allowed.

  2944         //

  2945         private int scanEscape(int start, int n, char first)

  2946             throws URISyntaxException

  2947         {

  2948             int p = start;

  2949             char c = first;

  2950             if (c == '%') {

  2951                 // Process escape pair

  2952                 if ((p + 3 <= n)

  2953                     && match(charAt(p + 1), L_HEX, H_HEX)

  2954                     && match(charAt(p + 2), L_HEX, H_HEX)) {

  2955                     return p + 3;

  2956                 }

  2957                 fail("Malformed escape pair", p);

  2958             } else if ((c > 128)

  2959                        && !Character.isSpaceChar(c)

  2960                        && !Character.isISOControl(c)) {

  2961                 // Allow unescaped but visible non-US-ASCII chars

  2962                 return p + 1;

  2963             }

  2964             return p;

  2965         }

  2967         // Scan chars that match the given mask pair

  2968         //

  2969         private int scan(int start, int n, long lowMask, long highMask)

  2970             throws URISyntaxException

  2971         {

  2972             int p = start;

  2973             while (p < n) {

  2974                 char c = charAt(p);

  2975                 if (match(c, lowMask, highMask)) {

  2976                     p++;

  2977                     continue;

  2978                 }

  2979                 if ((lowMask & L_ESCAPED) != 0) {

  2980                     int q = scanEscape(p, n, c);

  2981                     if (q > p) {

  2982                         p = q;

  2983                         continue;

  2984                     }

  2985                 }

  2986                 break;

  2987             }

  2988             return p;

  2989         }

  2991         // Check that each of the chars in [start, end) matches the given mask

  2992         //

  2993         private void checkChars(int start, int end,

  2994                                 long lowMask, long highMask,

  2995                                 String what)

  2996             throws URISyntaxException

  2997         {

  2998             int p = scan(start, end, lowMask, highMask);

  2999             if (p < end)

  3000                 fail("Illegal character in " + what, p);

  3001         }

  3003         // Check that the char at position p matches the given mask

  3004         //

  3005         private void checkChar(int p,

  3006                                long lowMask, long highMask,

  3007                                String what)

  3008             throws URISyntaxException

  3009         {

  3010             checkChars(p, p + 1, lowMask, highMask, what);

  3011         }

  3014         // -- Parsing --

  3016         // [<scheme>:]<scheme-specific-part>[#<fragment>]

  3017         //

  3018         void parse(boolean rsa) throws URISyntaxException {

  3019             requireServerAuthority = rsa;

  3020             int ssp;                    // Start of scheme-specific part

  3021             int n = input.length();

  3022             int p = scan(0, n, "/?#", ":");

  3023             if ((p >= 0) && at(p, n, ':')) {

  3024                 if (p == 0)

  3025                     failExpecting("scheme name", 0);

  3026                 checkChar(0, L_ALPHA, H_ALPHA, "scheme name");

  3027                 checkChars(1, p, L_SCHEME, H_SCHEME, "scheme name");

  3028                 scheme = substring(0, p);

  3029                 p++;                    // Skip ':'

  3030                 ssp = p;

  3031                 if (at(p, n, '/')) {

  3032                     p = parseHierarchical(p, n);

  3033                 } else {

  3034                     int q = scan(p, n, "", "#");

  3035                     if (q <= p)

  3036                         failExpecting("scheme-specific part", p);

  3037                     checkChars(p, q, L_URIC, H_URIC, "opaque part");

  3038                     p = q;

  3039                 }

  3040             } else {

  3041                 ssp = 0;

  3042                 p = parseHierarchical(0, n);

  3043             }

  3044             schemeSpecificPart = substring(ssp, p);

  3045             if (at(p, n, '#')) {

  3046                 checkChars(p + 1, n, L_URIC, H_URIC, "fragment");

  3047                 fragment = substring(p + 1, n);

  3048                 p = n;

  3049             }

  3050             if (p < n)

  3051                 fail("end of URI", p);

  3052         }

  3054         // [//authority]<path>[?<query>]

  3055         //

  3056         // DEVIATION from RFC2396: We allow an empty authority component as

  3057         // long as it's followed by a non-empty path, query component, or

  3058         // fragment component.  This is so that URIs such as "file:///foo/bar"

  3059         // will parse.  This seems to be the intent of RFC2396, though the

  3060         // grammar does not permit it.  If the authority is empty then the

  3061         // userInfo, host, and port components are undefined.

  3062         //

  3063         // DEVIATION from RFC2396: We allow empty relative paths.  This seems

  3064         // to be the intent of RFC2396, but the grammar does not permit it.

  3065         // The primary consequence of this deviation is that "#f" parses as a

  3066         // relative URI with an empty path.

  3067         //

  3068         private int parseHierarchical(int start, int n)

  3069             throws URISyntaxException

  3070         {

  3071             int p = start;

  3072             if (at(p, n, '/') && at(p + 1, n, '/')) {

  3073                 p += 2;

  3074                 int q = scan(p, n, "", "/?#");

  3075                 if (q > p) {

  3076                     p = parseAuthority(p, q);

  3077                 } else if (q < n) {

  3078                     // DEVIATION: Allow empty authority prior to non-empty

  3079                     // path, query component or fragment identifier

  3080                 } else

  3081                     failExpecting("authority", p);

  3082             }

  3083             int q = scan(p, n, "", "?#"); // DEVIATION: May be empty

  3084             checkChars(p, q, L_PATH, H_PATH, "path");

  3085             path = substring(p, q);

  3086             p = q;

  3087             if (at(p, n, '?')) {

  3088                 p++;

  3089                 q = scan(p, n, "", "#");

  3090                 checkChars(p, q, L_URIC, H_URIC, "query");

  3091                 query = substring(p, q);

  3092                 p = q;

  3093             }

  3094             return p;

  3095         }

        // authority     = server | reg_name
        //
        // Ambiguity: An authority that is a registry name rather than a server
        // might have a prefix that parses as a server.  We use the fact that
        // the authority component is always followed by '/' or the end of the
        // input string to resolve this: If the complete authority did not
        // parse as a server then we try to parse it as a registry name.
        //
        // Parses [start, n) as the authority component and stores it in
        // authority; a successful server parse also leaves userInfo, host and
        // port set by parseServer.  Returns n, or throws if the interval is
        // neither a valid server nor a valid registry name.
        //
        private int parseAuthority(int start, int n)
            throws URISyntaxException
        {
            int p = start;
            int q = p;                  // Set by a successful server parse
            URISyntaxException ex = null;   // Saved server-parse failure

            boolean serverChars;        // All chars legal in a server authority?
            boolean regChars;           // All chars legal in a registry name?

            // NOTE(review): scan() returns n when no ']' occurs in [p, n), so
            // this test is also true for a non-empty authority that has no
            // brackets at all -- confirm that this is intended.
            if (scan(p, n, "", "]") > p) {
                // contains a literal IPv6 address, therefore % is allowed
                serverChars = (scan(p, n, L_SERVER_PERCENT, H_SERVER_PERCENT) == n);
            } else {
                serverChars = (scan(p, n, L_SERVER, H_SERVER) == n);
            }
            regChars = (scan(p, n, L_REG_NAME, H_REG_NAME) == n);

            if (regChars && !serverChars) {
                // Must be a registry-based authority
                authority = substring(p, n);
                return n;
            }

            if (serverChars) {
                // Might be (probably is) a server-based authority, so attempt
                // to parse it as such.  If the attempt fails, try to treat it
                // as a registry-based authority.
                try {
                    q = parseServer(p, n);
                    if (q < n)
                        failExpecting("end of authority", q);
                    authority = substring(p, n);
                } catch (URISyntaxException x) {
                    // Undo results of failed parse
                    userInfo = null;
                    host = null;
                    port = -1;
                    if (requireServerAuthority) {
                        // If we're insisting upon a server-based authority,
                        // then just re-throw the exception
                        throw x;
                    } else {
                        // Save the exception in case it doesn't parse as a
                        // registry either
                        ex = x;
                        q = p;
                    }
                }
            }

            // q < n here means that no server-based authority was accepted
            if (q < n) {
                if (regChars) {
                    // Registry-based authority
                    authority = substring(p, n);
                } else if (ex != null) {
                    // Re-throw exception; it was probably due to
                    // a malformed IPv6 address
                    throw ex;
                } else {
                    fail("Illegal character in authority", q);
                }
            }

            return n;
        }

        // [<userinfo>@]<host>[:<port>]
        //
        // Parses [start, n) as a server-based authority, storing the userInfo,
        // host and port components that it finds.  Returns the index reached;
        // anything left unparsed before n is reported as a syntax error.
        //
        private int parseServer(int start, int n)
            throws URISyntaxException
        {
            int p = start;
            int q;

            // userinfo
            q = scan(p, n, "/?#", "@");
            if ((q >= p) && at(q, n, '@')) {
                checkChars(p, q, L_USERINFO, H_USERINFO, "user info");
                userInfo = substring(p, q);
                p = q + 1;              // Skip '@'
            }

            // hostname, IPv4 address, or IPv6 address
            if (at(p, n, '[')) {
                // DEVIATION from RFC2396: Support IPv6 addresses, per RFC2732
                p++;
                q = scan(p, n, "/?#", "]");
                if ((q > p) && at(q, n, ']')) {
                    // look for a "%" scope id
                    // (scan returns q when there is no '%', so an address
                    // without a scope id also takes the first branch; there
                    // r == q and the scope-id check covers an empty interval)
                    int r = scan (p, q, "", "%");
                    if (r > p) {
                        parseIPv6Reference(p, r);
                        if (r+1 == q) {
                            fail ("scope id expected");
                        }
                        checkChars (r+1, q, L_ALPHANUM, H_ALPHANUM,
                                                "scope id");
                    } else {
                        parseIPv6Reference(p, q);
                    }
                    // The stored host keeps the enclosing brackets
                    host = substring(p-1, q+1);
                    p = q + 1;
                } else {
                    failExpecting("closing bracket for IPv6 address", q);
                }
            } else {
                // Try an IPv4 address first, then fall back to a hostname
                q = parseIPv4Address(p, n);
                if (q <= p)
                    q = parseHostname(p, n);
                p = q;
            }

            // port
            if (at(p, n, ':')) {
                p++;
                q = scan(p, n, "", "/");
                // An empty port (nothing after the ':') is accepted
                if (q > p) {
                    checkChars(p, q, L_DIGIT, H_DIGIT, "port number");
                    try {
                        port = Integer.parseInt(substring(p, q));
                    } catch (NumberFormatException x) {
                        fail("Malformed port number", p);
                    }
                    p = q;
                }
            }
            if (p < n)
                failExpecting("port number", p);

            return p;
        }

  3239         // Scan a string of decimal digits whose value fits in a byte

  3240         //

  3241         private int scanByte(int start, int n)

  3242             throws URISyntaxException

  3243         {

  3244             int p = start;

  3245             int q = scan(p, n, L_DIGIT, H_DIGIT);

  3246             if (q <= p) return q;

  3247             if (Integer.parseInt(substring(p, q)) > 255) return p;

  3248             return q;

  3249         }

        // Scan an IPv4 address.
        //
        // If the strict argument is true then we require that the given
        // interval contain nothing besides an IPv4 address; if it is false
        // then we only require that it start with an IPv4 address.
        //
        // If the interval does not contain or start with (depending upon the
        // strict argument) legal IPv4 address characters then we return -1
        // immediately; otherwise we insist that these characters parse as a
        // legal IPv4 address and throw an exception on failure.
        //
        // We assume that any string of decimal digits and dots must be an IPv4
        // address.  It won't parse as a hostname anyway, so making that
        // assumption here allows more meaningful exceptions to be thrown.
        //
        // On success the index just past the address is returned.
        //
        private int scanIPv4Address(int start, int n, boolean strict)
            throws URISyntaxException
        {
            int p = start;
            int q;
            // m is the end of the leading run of digits and dots
            int m = scan(p, n, L_DIGIT | L_DOT, H_DIGIT | H_DOT);
            if ((m <= p) || (strict && (m != n)))
                return -1;
            // The loop body runs at most once; for(;;) only serves as a block
            // that can be left with break when any element is missing
            for (;;) {
                // Per RFC2732: At most three digits per byte
                // Further constraint: Each element fits in a byte
                // NOTE(review): scanByte only checks the value, not the number
                // of digits -- confirm whether the three-digit limit is meant
                // to be enforced here.
                if ((q = scanByte(p, m)) <= p) break;   p = q;
                if ((q = scan(p, m, '.')) <= p) break;  p = q;
                if ((q = scanByte(p, m)) <= p) break;   p = q;
                if ((q = scan(p, m, '.')) <= p) break;  p = q;
                if ((q = scanByte(p, m)) <= p) break;   p = q;
                if ((q = scan(p, m, '.')) <= p) break;  p = q;
                if ((q = scanByte(p, m)) <= p) break;   p = q;
                // Digits or dots left over after the fourth byte are an error
                if (q < m) break;
                return q;
            }
            fail("Malformed IPv4 address", q);
            return -1;                  // Not reached: fail always throws
        }

  3291         // Take an IPv4 address: Throw an exception if the given interval

  3292         // contains anything except an IPv4 address

  3293         //

  3294         private int takeIPv4Address(int start, int n, String expected)

  3295             throws URISyntaxException

  3296         {

  3297             int p = scanIPv4Address(start, n, true);

  3298             if (p <= start)

  3299                 failExpecting(expected, start);

  3300             return p;

  3301         }

  3303         // Attempt to parse an IPv4 address, returning -1 on failure but

  3304         // allowing the given interval to contain [:<characters>] after

  3305         // the IPv4 address.

  3306         //

  3307         private int parseIPv4Address(int start, int n) {

  3308             int p;

  3310             try {

  3311                 p = scanIPv4Address(start, n, false);

  3312             } catch (URISyntaxException x) {

  3313                 return -1;

  3314             } catch (NumberFormatException nfe) {

  3315                 return -1;

  3316             }

  3318             if (p > start && p < n) {

  3319                 // IPv4 address is followed by something - check that

  3320                 // it's a ":" as this is the only valid character to

  3321                 // follow an address.

  3322                 if (charAt(p) != ':') {

  3323                     p = -1;

  3324                 }

  3325             }

  3327             if (p > start)

  3328                 host = substring(start, p);

  3330             return p;

  3331         }

        // hostname      = domainlabel [ "." ] | 1*( domainlabel "." ) toplabel [ "." ]
        // domainlabel   = alphanum | alphanum *( alphanum | "-" ) alphanum
        // toplabel      = alpha | alpha *( alphanum | "-" ) alphanum
        //
        // Parses a hostname at the start of [start, n); it must be followed by
        // ':' or extend to n.  On success the hostname is stored in host and
        // the index just past it is returned.
        //
        private int parseHostname(int start, int n)
            throws URISyntaxException
        {
            int p = start;
            int q;
            int l = -1;                 // Start of last parsed label

            do {
                // domainlabel = alphanum [ *( alphanum | "-" ) alphanum ]
                q = scan(p, n, L_ALPHANUM, H_ALPHANUM);
                if (q <= p)
                    break;
                l = p;
                if (q > p) {
                    p = q;
                    // Continue over alphanumerics and dashes; the label must
                    // not end in a dash
                    q = scan(p, n, L_ALPHANUM | L_DASH, H_ALPHANUM | H_DASH);
                    if (q > p) {
                        if (charAt(q - 1) == '-')
                            fail("Illegal character in hostname", q - 1);
                        p = q;
                    }
                }
                // A '.' after the label means another label may follow
                q = scan(p, n, '.');
                if (q <= p)
                    break;
                p = q;
            } while (p < n);

            // Only ':' (the start of a port) may follow the hostname
            if ((p < n) && !at(p, n, ':'))
                fail("Illegal character in hostname", p);

            // No label at all was parsed
            if (l < 0)
                failExpecting("hostname", start);

            // for a fully qualified hostname check that the rightmost
            // label starts with an alpha character.
            if (l > start && !match(charAt(l), L_ALPHA, H_ALPHA)) {
                fail("Illegal character in hostname", l);
            }

            host = substring(start, p);
            return p;
        }

  3382         // IPv6 address parsing, from RFC2373: IPv6 Addressing Architecture

  3383         //

  3384         // Bug: The grammar in RFC2373 Appendix B does not allow addresses of

  3385         // the form ::12.34.56.78, which are clearly shown in the examples

  3386         // earlier in the document.  Here is the original grammar:

  3387         //

  3388         //   IPv6address = hexpart [ ":" IPv4address ]

  3389         //   hexpart     = hexseq | hexseq "::" [ hexseq ] | "::" [ hexseq ]

  3390         //   hexseq      = hex4 *( ":" hex4)

  3391         //   hex4        = 1*4HEXDIG

  3392         //

  3393         // We therefore use the following revised grammar:

  3394         //

  3395         //   IPv6address = hexseq [ ":" IPv4address ]

  3396         //                 | hexseq [ "::" [ hexpost ] ]

  3397         //                 | "::" [ hexpost ]

  3398         //   hexpost     = hexseq | hexseq ":" IPv4address | IPv4address

  3399         //   hexseq      = hex4 *( ":" hex4)

  3400         //   hex4        = 1*4HEXDIG

  3401         //

  3402         // This covers all and only the following cases:

  3403         //

  3404         //   hexseq

  3405         //   hexseq : IPv4address

  3406         //   hexseq ::

  3407         //   hexseq :: hexseq

  3408         //   hexseq :: hexseq : IPv4address

  3409         //   hexseq :: IPv4address

  3410         //   :: hexseq

  3411         //   :: hexseq : IPv4address

  3412         //   :: IPv4address

  3413         //   ::

  3414         //

  3415         // Additionally we constrain the IPv6 address as follows :-

  3416         //

  3417         //  i.  IPv6 addresses without compressed zeros should contain

  3418         //      exactly 16 bytes.

  3419         //

  3420         //  ii. IPv6 addresses with compressed zeros should contain

  3421         //      less than 16 bytes.

  3423         private int ipv6byteCount = 0;

  3425         private int parseIPv6Reference(int start, int n)

  3426             throws URISyntaxException

  3427         {

  3428             int p = start;

  3429             int q;

  3430             boolean compressedZeros = false;

  3432             q = scanHexSeq(p, n);

  3434             if (q > p) {

  3435                 p = q;

  3436                 if (at(p, n, "::")) {

  3437                     compressedZeros = true;

  3438                     p = scanHexPost(p + 2, n);

  3439                 } else if (at(p, n, ':')) {

  3440                     p = takeIPv4Address(p + 1,  n, "IPv4 address");

  3441                     ipv6byteCount += 4;

  3442                 }

  3443             } else if (at(p, n, "::")) {

  3444                 compressedZeros = true;

  3445                 p = scanHexPost(p + 2, n);

  3446             }

  3447             if (p < n)

  3448                 fail("Malformed IPv6 address", start);

  3449             if (ipv6byteCount > 16)

  3450                 fail("IPv6 address too long", start);

  3451             if (!compressedZeros && ipv6byteCount < 16)

  3452                 fail("IPv6 address too short", start);

  3453             if (compressedZeros && ipv6byteCount == 16)

  3454                 fail("Malformed IPv6 address", start);

  3456             return p;

  3457         }

  3459         private int scanHexPost(int start, int n)

  3460             throws URISyntaxException

  3461         {

  3462             int p = start;

  3463             int q;

  3465             if (p == n)

  3466                 return p;

  3468             q = scanHexSeq(p, n);

  3469             if (q > p) {

  3470                 p = q;

  3471                 if (at(p, n, ':')) {

  3472                     p++;

  3473                     p = takeIPv4Address(p, n, "hex digits or IPv4 address");

  3474                     ipv6byteCount += 4;

  3475                 }

  3476             } else {

  3477                 p = takeIPv4Address(p, n, "hex digits or IPv4 address");

  3478                 ipv6byteCount += 4;

  3479             }

  3480             return p;

  3481         }

        // Scan a hex sequence; return -1 if one could not be scanned
        //
        // A hex sequence is one or more groups of one to four hex digits
        // separated by single ':' chars.  Scanning stops before a "::" and
        // before a ':' that introduces an IPv4 address.  Each group scanned
        // adds 2 to ipv6byteCount.  Returns the index at which scanning
        // stopped.
        //
        private int scanHexSeq(int start, int n)
            throws URISyntaxException
        {
            int p = start;
            int q;

            // First group
            q = scan(p, n, L_HEX, H_HEX);
            if (q <= p)
                return -1;
            if (at(q, n, '.'))          // Beginning of IPv4 address
                return -1;
            if (q > p + 4)
                fail("IPv6 hexadecimal digit sequence too long", p);
            ipv6byteCount += 2;
            p = q;
            // Further groups, each preceded by a single ':'
            while (p < n) {
                if (!at(p, n, ':'))
                    break;
                if (at(p + 1, n, ':'))
                    break;              // "::"
                p++;
                q = scan(p, n, L_HEX, H_HEX);
                if (q <= p)
                    failExpecting("digits for an IPv6 address", p);
                if (at(q, n, '.')) {    // Beginning of IPv4 address
                    // Step back onto the ':' so the caller sees it
                    p--;
                    break;
                }
                if (q > p + 4)
                    fail("IPv6 hexadecimal digit sequence too long", p);
                ipv6byteCount += 2;
                p = q;
            }

            return p;
        }

  3522     }

  3524 }

author	Jaroslav Tulach <jaroslav.tulach@apidesign.org>
	Sat, 07 Sep 2013 13:51:24 +0200
branch	jdk7-b147
changeset 1258	724f3e1ea53e
permissions	-rw-r--r--