hg/bck2brwsr: rt/emul/compact/src/main/java/java/net/URI.java@9926996eca2d

     1 /*

     2  * Copyright (c) 2000, 2011, Oracle and/or its affiliates. All rights reserved.

     3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.

     4  *

     5  * This code is free software; you can redistribute it and/or modify it

     6  * under the terms of the GNU General Public License version 2 only, as

     7  * published by the Free Software Foundation.  Oracle designates this

     8  * particular file as subject to the "Classpath" exception as provided

     9  * by Oracle in the LICENSE file that accompanied this code.

    10  *

    11  * This code is distributed in the hope that it will be useful, but WITHOUT

    12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or

    13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License

    14  * version 2 for more details (a copy is included in the LICENSE file that

    15  * accompanied this code).

    16  *

    17  * You should have received a copy of the GNU General Public License version

    18  * 2 along with this work; if not, write to the Free Software Foundation,

    19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.

    20  *

    21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA

    22  * or visit www.oracle.com if you need additional information or have any

    23  * questions.

    24  */

    26 package java.net;

    28 import java.io.IOException;

    29 import java.io.InvalidObjectException;

    30 import java.io.ObjectInputStream;

    31 import java.io.ObjectOutputStream;

    32 import java.io.Serializable;

    34 import java.lang.Character;             // for javadoc

    35 import java.lang.NullPointerException;  // for javadoc

    38 /**

    39  * Represents a Uniform Resource Identifier (URI) reference.

    40  *

    41  * <p> Aside from some minor deviations noted below, an instance of this

    42  * class represents a URI reference as defined by

    43  * <a href="http://www.ietf.org/rfc/rfc2396.txt"><i>RFC&nbsp;2396: Uniform

    44  * Resource Identifiers (URI): Generic Syntax</i></a>, amended by <a

    45  * href="http://www.ietf.org/rfc/rfc2732.txt"><i>RFC&nbsp;2732: Format for

    46  * Literal IPv6 Addresses in URLs</i></a>. The Literal IPv6 address format

    47  * also supports scope_ids. The syntax and usage of scope_ids is described

    48  * <a href="Inet6Address.html#scoped">here</a>.

    49  * This class provides constructors for creating URI instances from

    50  * their components or by parsing their string forms, methods for accessing the

    51  * various components of an instance, and methods for normalizing, resolving,

    52  * and relativizing URI instances.  Instances of this class are immutable.

    53  *

    54  *

    55  * <h4> URI syntax and components </h4>

    56  *

    57  * At the highest level a URI reference (hereinafter simply "URI") in string

    58  * form has the syntax

    59  *

    60  * <blockquote>

    61  * [<i>scheme</i><tt><b>:</b></tt>]<i>scheme-specific-part</i>[<tt><b>#</b></tt><i>fragment</i>]

    62  * </blockquote>

    63  *

    64  * where square brackets [...] delineate optional components and the characters

    65  * <tt><b>:</b></tt> and <tt><b>#</b></tt> stand for themselves.

    66  *

    67  * <p> An <i>absolute</i> URI specifies a scheme; a URI that is not absolute is

    68  * said to be <i>relative</i>.  URIs are also classified according to whether

    69  * they are <i>opaque</i> or <i>hierarchical</i>.

    70  *

    71  * <p> An <i>opaque</i> URI is an absolute URI whose scheme-specific part does

    72  * not begin with a slash character (<tt>'/'</tt>).  Opaque URIs are not

    73  * subject to further parsing.  Some examples of opaque URIs are:

    74  *

    75  * <blockquote><table cellpadding=0 cellspacing=0 summary="layout">

    76  * <tr><td><tt>mailto:java-net@java.sun.com</tt></td></tr>

    77  * <tr><td><tt>news:comp.lang.java</tt></td></tr>

    78  * <tr><td><tt>urn:isbn:096139210x</tt></td></tr>

    79  * </table></blockquote>

    80  *

    81  * <p> A <i>hierarchical</i> URI is either an absolute URI whose

    82  * scheme-specific part begins with a slash character, or a relative URI, that

    83  * is, a URI that does not specify a scheme.  Some examples of hierarchical

    84  * URIs are:

    85  *

    86  * <blockquote>

    87  * <tt>http://java.sun.com/j2se/1.3/</tt><br>

    88  * <tt>docs/guide/collections/designfaq.html#28</tt><br>

    89  * <tt>../../../demo/jfc/SwingSet2/src/SwingSet2.java</tt><br>

    90  * <tt>file:///~/calendar</tt>

    91  * </blockquote>

    92  *

    93  * <p> A hierarchical URI is subject to further parsing according to the syntax

    94  *

    95  * <blockquote>

    96  * [<i>scheme</i><tt><b>:</b></tt>][<tt><b>//</b></tt><i>authority</i>][<i>path</i>][<tt><b>?</b></tt><i>query</i>][<tt><b>#</b></tt><i>fragment</i>]

    97  * </blockquote>

    98  *

    99  * where the characters <tt><b>:</b></tt>, <tt><b>/</b></tt>,

   100  * <tt><b>?</b></tt>, and <tt><b>#</b></tt> stand for themselves.  The

   101  * scheme-specific part of a hierarchical URI consists of the characters

   102  * between the scheme and fragment components.

   103  *

   104  * <p> The authority component of a hierarchical URI is, if specified, either

   105  * <i>server-based</i> or <i>registry-based</i>.  A server-based authority

   106  * parses according to the familiar syntax

   107  *

   108  * <blockquote>

   109  * [<i>user-info</i><tt><b>@</b></tt>]<i>host</i>[<tt><b>:</b></tt><i>port</i>]

   110  * </blockquote>

   111  *

   112  * where the characters <tt><b>@</b></tt> and <tt><b>:</b></tt> stand for

   113  * themselves.  Nearly all URI schemes currently in use are server-based.  An

   114  * authority component that does not parse in this way is considered to be

   115  * registry-based.

   116  *

   117  * <p> The path component of a hierarchical URI is itself said to be absolute

   118  * if it begins with a slash character (<tt>'/'</tt>); otherwise it is

   119  * relative.  The path of a hierarchical URI that is either absolute or

   120  * specifies an authority is always absolute.

   121  *

   122  * <p> All told, then, a URI instance has the following nine components:

   123  *

   124  * <blockquote><table summary="Describes the components of a URI:scheme,scheme-specific-part,authority,user-info,host,port,path,query,fragment">

   125  * <tr><th><i>Component</i></th><th><i>Type</i></th></tr>

   126  * <tr><td>scheme</td><td><tt>String</tt></td></tr>

   127  * <tr><td>scheme-specific-part&nbsp;&nbsp;&nbsp;&nbsp;</td><td><tt>String</tt></td></tr>

   128  * <tr><td>authority</td><td><tt>String</tt></td></tr>

   129  * <tr><td>user-info</td><td><tt>String</tt></td></tr>

   130  * <tr><td>host</td><td><tt>String</tt></td></tr>

   131  * <tr><td>port</td><td><tt>int</tt></td></tr>

   132  * <tr><td>path</td><td><tt>String</tt></td></tr>

   133  * <tr><td>query</td><td><tt>String</tt></td></tr>

   134  * <tr><td>fragment</td><td><tt>String</tt></td></tr>

   135  * </table></blockquote>

   136  *

   137  * In a given instance any particular component is either <i>undefined</i> or

   138  * <i>defined</i> with a distinct value.  Undefined string components are

   139  * represented by <tt>null</tt>, while undefined integer components are

   140  * represented by <tt>-1</tt>.  A string component may be defined to have the

   141  * empty string as its value; this is not equivalent to that component being

   142  * undefined.

   143  *

   144  * <p> Whether a particular component is or is not defined in an instance

   145  * depends upon the type of the URI being represented.  An absolute URI has a

   146  * scheme component.  An opaque URI has a scheme, a scheme-specific part, and

   147  * possibly a fragment, but has no other components.  A hierarchical URI always

   148  * has a path (though it may be empty) and a scheme-specific-part (which at

   149  * least contains the path), and may have any of the other components.  If the

   150  * authority component is present and is server-based then the host component

   151  * will be defined and the user-information and port components may be defined.

   152  *

   153  *

   154  * <h4> Operations on URI instances </h4>

   155  *

   156  * The key operations supported by this class are those of

   157  * <i>normalization</i>, <i>resolution</i>, and <i>relativization</i>.

   158  *

   159  * <p> <i>Normalization</i> is the process of removing unnecessary <tt>"."</tt>

   160  * and <tt>".."</tt> segments from the path component of a hierarchical URI.

   161  * Each <tt>"."</tt> segment is simply removed.  A <tt>".."</tt> segment is

   162  * removed only if it is preceded by a non-<tt>".."</tt> segment.

   163  * Normalization has no effect upon opaque URIs.

   164  *

   165  * <p> <i>Resolution</i> is the process of resolving one URI against another,

   166  * <i>base</i> URI.  The resulting URI is constructed from components of both

   167  * URIs in the manner specified by RFC&nbsp;2396, taking components from the

   168  * base URI for those not specified in the original.  For hierarchical URIs,

   169  * the path of the original is resolved against the path of the base and then

   170  * normalized.  The result, for example, of resolving

   171  *

   172  * <blockquote>

   173  * <tt>docs/guide/collections/designfaq.html#28&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</tt>(1)

   174  * </blockquote>

   175  *

   176  * against the base URI <tt>http://java.sun.com/j2se/1.3/</tt> is the result

   177  * URI

   178  *

   179  * <blockquote>

   180  * <tt>http://java.sun.com/j2se/1.3/docs/guide/collections/designfaq.html#28</tt>

   181  * </blockquote>

   182  *

   183  * Resolving the relative URI

   184  *

   185  * <blockquote>

   186  * <tt>../../../demo/jfc/SwingSet2/src/SwingSet2.java&nbsp;&nbsp;&nbsp;&nbsp;</tt>(2)

   187  * </blockquote>

   188  *

   189  * against this result yields, in turn,

   190  *

   191  * <blockquote>

   192  * <tt>http://java.sun.com/j2se/1.3/demo/jfc/SwingSet2/src/SwingSet2.java</tt>

   193  * </blockquote>

   194  *

   195  * Resolution of both absolute and relative URIs, and of both absolute and

   196  * relative paths in the case of hierarchical URIs, is supported.  Resolving

   197  * the URI <tt>file:///~/calendar</tt> against any other URI simply yields the

   198  * original URI, since it is absolute.  Resolving the relative URI (2) above

   199  * against the relative base URI (1) yields the normalized, but still relative,

   200  * URI

   201  *

   202  * <blockquote>

   203  * <tt>demo/jfc/SwingSet2/src/SwingSet2.java</tt>

   204  * </blockquote>

   205  *

   206  * <p> <i>Relativization</i>, finally, is the inverse of resolution: For any

   207  * two normalized URIs <i>u</i> and&nbsp;<i>v</i>,

   208  *

   209  * <blockquote>

   210  *   <i>u</i><tt>.relativize(</tt><i>u</i><tt>.resolve(</tt><i>v</i><tt>)).equals(</tt><i>v</i><tt>)</tt>&nbsp;&nbsp;and<br>

   211  *   <i>u</i><tt>.resolve(</tt><i>u</i><tt>.relativize(</tt><i>v</i><tt>)).equals(</tt><i>v</i><tt>)</tt>&nbsp;&nbsp;.<br>

   212  * </blockquote>

   213  *

   214  * This operation is often useful when constructing a document containing URIs

   215  * that must be made relative to the base URI of the document wherever

   216  * possible.  For example, relativizing the URI

   217  *

   218  * <blockquote>

   219  * <tt>http://java.sun.com/j2se/1.3/docs/guide/index.html</tt>

   220  * </blockquote>

   221  *

   222  * against the base URI

   223  *

   224  * <blockquote>

   225  * <tt>http://java.sun.com/j2se/1.3</tt>

   226  * </blockquote>

   227  *

   228  * yields the relative URI <tt>docs/guide/index.html</tt>.

   229  *

   230  *

   231  * <h4> Character categories </h4>

   232  *

   233  * RFC&nbsp;2396 specifies precisely which characters are permitted in the

   234  * various components of a URI reference.  The following categories, most of

   235  * which are taken from that specification, are used below to describe these

   236  * constraints:

   237  *

   238  * <blockquote><table cellspacing=2 summary="Describes categories alpha,digit,alphanum,unreserved,punct,reserved,escaped,and other">

   239  *   <tr><th valign=top><i>alpha</i></th>

   240  *       <td>The US-ASCII alphabetic characters,

   241  *        <tt>'A'</tt>&nbsp;through&nbsp;<tt>'Z'</tt>

   242  *        and <tt>'a'</tt>&nbsp;through&nbsp;<tt>'z'</tt></td></tr>

   243  *   <tr><th valign=top><i>digit</i></th>

   244  *       <td>The US-ASCII decimal digit characters,

   245  *       <tt>'0'</tt>&nbsp;through&nbsp;<tt>'9'</tt></td></tr>

   246  *   <tr><th valign=top><i>alphanum</i></th>

   247  *       <td>All <i>alpha</i> and <i>digit</i> characters</td></tr>

   248  *   <tr><th valign=top><i>unreserved</i>&nbsp;&nbsp;&nbsp;&nbsp;</th>

   249  *       <td>All <i>alphanum</i> characters together with those in the string

   250  *        <tt>"_-!.~'()*"</tt></td></tr>

   251  *   <tr><th valign=top><i>punct</i></th>

   252  *       <td>The characters in the string <tt>",;:$&amp;+="</tt></td></tr>

   253  *   <tr><th valign=top><i>reserved</i></th>

   254  *       <td>All <i>punct</i> characters together with those in the string

   255  *        <tt>"?/[]@"</tt></td></tr>

   256  *   <tr><th valign=top><i>escaped</i></th>

   257  *       <td>Escaped octets, that is, triplets consisting of the percent

   258  *           character (<tt>'%'</tt>) followed by two hexadecimal digits

   259  *           (<tt>'0'</tt>-<tt>'9'</tt>, <tt>'A'</tt>-<tt>'F'</tt>, and

   260  *           <tt>'a'</tt>-<tt>'f'</tt>)</td></tr>

   261  *   <tr><th valign=top><i>other</i></th>

   262  *       <td>The Unicode characters that are not in the US-ASCII character set,

   263  *           are not control characters (according to the {@link

   264  *           java.lang.Character#isISOControl(char) Character.isISOControl}

   265  *           method), and are not space characters (according to the {@link

   266  *           java.lang.Character#isSpaceChar(char) Character.isSpaceChar}

   267  *           method)&nbsp;&nbsp;<i>(<b>Deviation from RFC 2396</b>, which is

   268  *           limited to US-ASCII)</i></td></tr>

   269  * </table></blockquote>

   270  *

   271  * <p><a name="legal-chars"></a> The set of all legal URI characters consists of

   272  * the <i>unreserved</i>, <i>reserved</i>, <i>escaped</i>, and <i>other</i>

   273  * characters.

   274  *

   275  *

   276  * <h4> Escaped octets, quotation, encoding, and decoding </h4>

   277  *

   278  * RFC 2396 allows escaped octets to appear in the user-info, path, query, and

   279  * fragment components.  Escaping serves two purposes in URIs:

   280  *

   281  * <ul>

   282  *

   283  *   <li><p> To <i>encode</i> non-US-ASCII characters when a URI is required to

   284  *   conform strictly to RFC&nbsp;2396 by not containing any <i>other</i>

   285  *   characters.  </p></li>

   286  *

   287  *   <li><p> To <i>quote</i> characters that are otherwise illegal in a

   288  *   component.  The user-info, path, query, and fragment components differ

   289  *   slightly in terms of which characters are considered legal and illegal.

   290  *   </p></li>

   291  *

   292  * </ul>

   293  *

   294  * These purposes are served in this class by three related operations:

   295  *

   296  * <ul>

   297  *

   298  *   <li><p><a name="encode"></a> A character is <i>encoded</i> by replacing it

   299  *   with the sequence of escaped octets that represent that character in the

   300  *   UTF-8 character set.  The Euro currency symbol (<tt>'&#92;u20AC'</tt>),

   301  *   for example, is encoded as <tt>"%E2%82%AC"</tt>.  <i>(<b>Deviation from

   302  *   RFC&nbsp;2396</b>, which does not specify any particular character

   303  *   set.)</i> </p></li>

   304  *

   305  *   <li><p><a name="quote"></a> An illegal character is <i>quoted</i> simply by

   306  *   encoding it.  The space character, for example, is quoted by replacing it

   307  *   with <tt>"%20"</tt>.  UTF-8 contains US-ASCII, hence for US-ASCII

   308  *   characters this transformation has exactly the effect required by

   309  *   RFC&nbsp;2396. </p></li>

   310  *

   311  *   <li><p><a name="decode"></a>

   312  *   A sequence of escaped octets is <i>decoded</i> by

   313  *   replacing it with the sequence of characters that it represents in the

   314  *   UTF-8 character set.  UTF-8 contains US-ASCII, hence decoding has the

   315  *   effect of de-quoting any quoted US-ASCII characters as well as that of

   316  *   decoding any encoded non-US-ASCII characters.  If a <a

   317  *   href="../nio/charset/CharsetDecoder.html#ce">decoding error</a> occurs

   318  *   when decoding the escaped octets then the erroneous octets are replaced by

   319  *   <tt>'&#92;uFFFD'</tt>, the Unicode replacement character.  </p></li>

   320  *

   321  * </ul>

   322  *

   323  * These operations are exposed in the constructors and methods of this class

   324  * as follows:

   325  *

   326  * <ul>

   327  *

   328  *   <li><p> The {@link #URI(java.lang.String) <code>single-argument

   329  *   constructor</code>} requires any illegal characters in its argument to be

   330  *   quoted and preserves any escaped octets and <i>other</i> characters that

   331  *   are present.  </p></li>

   332  *

   333  *   <li><p> The {@link

   334  *   #URI(java.lang.String,java.lang.String,java.lang.String,int,java.lang.String,java.lang.String,java.lang.String)

   335  *   <code>multi-argument constructors</code>} quote illegal characters as

   336  *   required by the components in which they appear.  The percent character

   337  *   (<tt>'%'</tt>) is always quoted by these constructors.  Any <i>other</i>

   338  *   characters are preserved.  </p></li>

   339  *

   340  *   <li><p> The {@link #getRawUserInfo() getRawUserInfo}, {@link #getRawPath()

   341  *   getRawPath}, {@link #getRawQuery() getRawQuery}, {@link #getRawFragment()

   342  *   getRawFragment}, {@link #getRawAuthority() getRawAuthority}, and {@link

   343  *   #getRawSchemeSpecificPart() getRawSchemeSpecificPart} methods return the

   344  *   values of their corresponding components in raw form, without interpreting

   345  *   any escaped octets.  The strings returned by these methods may contain

   346  *   both escaped octets and <i>other</i> characters, and will not contain any

   347  *   illegal characters.  </p></li>

   348  *

   349  *   <li><p> The {@link #getUserInfo() getUserInfo}, {@link #getPath()

   350  *   getPath}, {@link #getQuery() getQuery}, {@link #getFragment()

   351  *   getFragment}, {@link #getAuthority() getAuthority}, and {@link

   352  *   #getSchemeSpecificPart() getSchemeSpecificPart} methods decode any escaped

   353  *   octets in their corresponding components.  The strings returned by these

   354  *   methods may contain both <i>other</i> characters and illegal characters,

   355  *   and will not contain any escaped octets.  </p></li>

   356  *

   357  *   <li><p> The {@link #toString() toString} method returns a URI string with

   358  *   all necessary quotation but which may contain <i>other</i> characters.

   359  *   </p></li>

   360  *

   361  *   <li><p> The {@link #toASCIIString() toASCIIString} method returns a fully

   362  *   quoted and encoded URI string that does not contain any <i>other</i>

   363  *   characters.  </p></li>

   364  *

   365  * </ul>

   366  *

   367  *

   368  * <h4> Identities </h4>

   369  *

   370  * For any URI <i>u</i>, it is always the case that

   371  *

   372  * <blockquote>

   373  * <tt>new URI(</tt><i>u</i><tt>.toString()).equals(</tt><i>u</i><tt>)</tt>&nbsp;.

   374  * </blockquote>

   375  *

   376  * For any URI <i>u</i> that does not contain redundant syntax such as two

   377  * slashes before an empty authority (as in <tt>file:///tmp/</tt>&nbsp;) or a

   378  * colon following a host name but no port (as in

   379  * <tt>http://java.sun.com:</tt>&nbsp;), and that does not encode characters

   380  * except those that must be quoted, the following identities also hold:

   381  *

   382  * <blockquote>

   383  * <tt>new URI(</tt><i>u</i><tt>.getScheme(),<br>

   384  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</tt><i>u</i><tt>.getSchemeSpecificPart(),<br>

   385  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</tt><i>u</i><tt>.getFragment())<br>

   386  * .equals(</tt><i>u</i><tt>)</tt>

   387  * </blockquote>

   388  *

   389  * in all cases,

   390  *

   391  * <blockquote>

   392  * <tt>new URI(</tt><i>u</i><tt>.getScheme(),<br>

   393  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</tt><i>u</i><tt>.getUserInfo(),&nbsp;</tt><i>u</i><tt>.getAuthority(),<br>

   394  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</tt><i>u</i><tt>.getPath(),&nbsp;</tt><i>u</i><tt>.getQuery(),<br>

   395  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</tt><i>u</i><tt>.getFragment())<br>

   396  * .equals(</tt><i>u</i><tt>)</tt>

   397  * </blockquote>

   398  *

   399  * if <i>u</i> is hierarchical, and

   400  *

   401  * <blockquote>

   402  * <tt>new URI(</tt><i>u</i><tt>.getScheme(),<br>

   403  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</tt><i>u</i><tt>.getUserInfo(),&nbsp;</tt><i>u</i><tt>.getHost(),&nbsp;</tt><i>u</i><tt>.getPort(),<br>

   404  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</tt><i>u</i><tt>.getPath(),&nbsp;</tt><i>u</i><tt>.getQuery(),<br>

   405  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</tt><i>u</i><tt>.getFragment())<br>

   406  * .equals(</tt><i>u</i><tt>)</tt>

   407  * </blockquote>

   408  *

   409  * if <i>u</i> is hierarchical and has either no authority or a server-based

   410  * authority.

   411  *

   412  *

   413  * <h4> URIs, URLs, and URNs </h4>

   414  *

   415  * A URI is a uniform resource <i>identifier</i> while a URL is a uniform

   416  * resource <i>locator</i>.  Hence every URL is a URI, abstractly speaking, but

   417  * not every URI is a URL.  This is because there is another subcategory of

   418  * URIs, uniform resource <i>names</i> (URNs), which name resources but do not

   419  * specify how to locate them.  The <tt>mailto</tt>, <tt>news</tt>, and

   420  * <tt>isbn</tt> URIs shown above are examples of URNs.

   421  *

   422  * <p> The conceptual distinction between URIs and URLs is reflected in the

   423  * differences between this class and the {@link URL} class.

   424  *

   425  * <p> An instance of this class represents a URI reference in the syntactic

   426  * sense defined by RFC&nbsp;2396.  A URI may be either absolute or relative.

   427  * A URI string is parsed according to the generic syntax without regard to the

   428  * scheme, if any, that it specifies.  No lookup of the host, if any, is

   429  * performed, and no scheme-dependent stream handler is constructed.  Equality,

   430  * hashing, and comparison are defined strictly in terms of the character

   431  * content of the instance.  In other words, a URI instance is little more than

   432  * a structured string that supports the syntactic, scheme-independent

   433  * operations of comparison, normalization, resolution, and relativization.

   434  *

   435  * <p> An instance of the {@link URL} class, by contrast, represents the

   436  * syntactic components of a URL together with some of the information required

   437  * to access the resource that it describes.  A URL must be absolute, that is,

   438  * it must always specify a scheme.  A URL string is parsed according to its

   439  * scheme.  A stream handler is always established for a URL, and in fact it is

   440  * impossible to create a URL instance for a scheme for which no handler is

   441  * available.  Equality and hashing depend upon both the scheme and the

   442  * Internet address of the host, if any; comparison is not defined.  In other

   443  * words, a URL is a structured string that supports the syntactic operation of

   444  * resolution as well as the network I/O operations of looking up the host and

   445  * opening a connection to the specified resource.

   446  *

   447  *

   448  * @author Mark Reinhold

   449  * @since 1.4

   450  *

   451  * @see <a href="http://www.ietf.org/rfc/rfc2279.txt"><i>RFC&nbsp;2279: UTF-8, a

   452  * transformation format of ISO 10646</i></a>, <br><a

   453  * href="http://www.ietf.org/rfc/rfc2373.txt"><i>RFC&nbsp;2373: IPv6 Addressing

   454  * Architecture</i></a>, <br><a

   455  * href="http://www.ietf.org/rfc/rfc2396.txt"><i>RFC&nbsp;2396: Uniform

   456  * Resource Identifiers (URI): Generic Syntax</i></a>, <br><a

   457  * href="http://www.ietf.org/rfc/rfc2732.txt"><i>RFC&nbsp;2732: Format for

   458  * Literal IPv6 Addresses in URLs</i></a>, <br><a

   459  * href="URISyntaxException.html">URISyntaxException</a>

   460  */

   462 public final class URI

   463     implements Comparable<URI>, Serializable

   464 {

   466     // Note: Comments containing the word "ASSERT" indicate places where a

   467     // throw of an InternalError should be replaced by an appropriate assertion

   468     // statement once asserts are enabled in the build.

   470     static final long serialVersionUID = -6052424284110960213L;

   473     // -- Properties and components of this instance --

   475     // Components of all URIs: [<scheme>:]<scheme-specific-part>[#<fragment>]

   476     private transient String scheme;            // null ==> relative URI

   477     private transient String fragment;          // null ==> undefined

   479     // Hierarchical URI components: [//<authority>]<path>[?<query>]

   480     private transient String authority;         // Registry or server

   482     // Server-based authority: [<userInfo>@]<host>[:<port>]

   483     private transient String userInfo;          // null ==> undefined

   484     private transient String host;              // null ==> registry-based

   485     private transient int port = -1;            // -1 ==> undefined

   487     // Remaining components of hierarchical URIs

   488     private transient String path;              // null ==> opaque

   489     private transient String query;             // null ==> undefined

   491     // The remaining fields may be computed on demand

   493     private volatile transient String schemeSpecificPart;

   494     private volatile transient int hash;        // Zero ==> undefined

           // Decoded counterparts of the components above; each starts out null.
           // NOTE(review): the code that fills these in is outside this excerpt --
           // presumably the corresponding getXxx() accessors; confirm.
   496     private volatile transient String decodedUserInfo = null;

   497     private volatile transient String decodedAuthority = null;

   498     private volatile transient String decodedPath = null;

   499     private volatile transient String decodedQuery = null;

   500     private volatile transient String decodedFragment = null;

   501     private volatile transient String decodedSchemeSpecificPart = null;

   503     /**

   504      * The string form of this URI.  Every other instance field of this
            * class is declared <tt>transient</tt>, so this is the only state
            * written by default serialization.

   505      *

   506      * @serial

   507      */

   508     private volatile String string;             // The only serializable field

   512     // -- Constructors and factories --

   514     private URI() { }                           // Used internally; leaves every component at its declared default (port == -1)

   516     /**

   517      * Constructs a URI by parsing the given string.

   518      *

   519      * <p> This constructor parses the given string exactly as specified by the

   520      * grammar in <a

   521      * href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,

   522      * Appendix&nbsp;A, <b><i>except for the following deviations:</i></b> </p>

   523      *

   524      * <ul type=disc>

   525      *

   526      *   <li><p> An empty authority component is permitted as long as it is

   527      *   followed by a non-empty path, a query component, or a fragment

   528      *   component.  This allows the parsing of URIs such as

   529      *   <tt>"file:///foo/bar"</tt>, which seems to be the intent of

   530      *   RFC&nbsp;2396 although the grammar does not permit it.  If the

   531      *   authority component is empty then the user-information, host, and port

   532      *   components are undefined. </p></li>

   533      *

   534      *   <li><p> Empty relative paths are permitted; this seems to be the

   535      *   intent of RFC&nbsp;2396 although the grammar does not permit it.  The

   536      *   primary consequence of this deviation is that a standalone fragment

   537      *   such as <tt>"#foo"</tt> parses as a relative URI with an empty path

   538      *   and the given fragment, and can be usefully <a

   539      *   href="#resolve-frag">resolved</a> against a base URI. </p></li>

   540      *

   541      *   <li><p> IPv4 addresses in host components are parsed rigorously, as

   542      *   specified by <a

   543      *   href="http://www.ietf.org/rfc/rfc2732.txt">RFC&nbsp;2732</a>: Each

   544      *   element of a dotted-quad address must contain no more than three

   545      *   decimal digits.  Each element is further constrained to have a value

   546      *   no greater than 255. </p></li>

   547      *

   548      *   <li> <p> Hostnames in host components that comprise only a single

   549      *   domain label are permitted to start with an <i>alphanum</i>

   550      *   character. This seems to be the intent of <a

   551      *   href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>

   552      *   section&nbsp;3.2.2 although the grammar does not permit it. The

   553      *   consequence of this deviation is that the authority component of a

   554      *   hierarchical URI such as <tt>s://123</tt>, will parse as a server-based

   555      *   authority. </p></li>

   556      *

   557      *   <li><p> IPv6 addresses are permitted for the host component.  An IPv6

   558      *   address must be enclosed in square brackets (<tt>'['</tt> and

   559      *   <tt>']'</tt>) as specified by <a

   560      *   href="http://www.ietf.org/rfc/rfc2732.txt">RFC&nbsp;2732</a>.  The

   561      *   IPv6 address itself must parse according to <a

   562      *   href="http://www.ietf.org/rfc/rfc2373.txt">RFC&nbsp;2373</a>.  IPv6

   563      *   addresses are further constrained to describe no more than sixteen

   564      *   bytes of address information, a constraint implicit in RFC&nbsp;2373

   565      *   but not expressible in the grammar. </p></li>

   566      *

   567      *   <li><p> Characters in the <i>other</i> category are permitted wherever

   568      *   RFC&nbsp;2396 permits <i>escaped</i> octets, that is, in the

   569      *   user-information, path, query, and fragment components, as well as in

   570      *   the authority component if the authority is registry-based.  This

   571      *   allows URIs to contain Unicode characters beyond those in the US-ASCII

   572      *   character set. </p></li>

   573      *

   574      * </ul>

   575      *

   576      * @param  str   The string to be parsed into a URI

   577      *

   578      * @throws  NullPointerException

   579      *          If <tt>str</tt> is <tt>null</tt>

   580      *

   581      * @throws  URISyntaxException

   582      *          If the given string violates RFC&nbsp;2396, as augmented

   583      *          by the above deviations

   584      */

   585     public URI(String str) throws URISyntaxException {

               // Delegate all of the work to a Parser built over the input string.
               // NOTE(review): Parser and the meaning of the boolean passed to
               // parse() are declared outside this excerpt; presumably the parser
               // fills in this instance's component fields and is the source of
               // the documented NullPointerException / URISyntaxException -- confirm.
   586         new Parser(str).parse(false);

   587     }

   589     /**

   590      * Constructs a hierarchical URI from the given components.

   591      *

   592      * <p> If a scheme is given then the path, if also given, must either be

   593      * empty or begin with a slash character (<tt>'/'</tt>).  Otherwise a

   594      * component of the new URI may be left undefined by passing <tt>null</tt>

   595      * for the corresponding parameter or, in the case of the <tt>port</tt>

   596      * parameter, by passing <tt>-1</tt>.

   597      *

   598      * <p> This constructor first builds a URI string from the given components

   599      * according to the rules specified in <a

   600      * href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,

   601      * section&nbsp;5.2, step&nbsp;7: </p>

   602      *

   603      * <ol>

   604      *

   605      *   <li><p> Initially, the result string is empty. </p></li>

   606      *

   607      *   <li><p> If a scheme is given then it is appended to the result,

   608      *   followed by a colon character (<tt>':'</tt>).  </p></li>

   609      *

   610      *   <li><p> If user information, a host, or a port are given then the

   611      *   string <tt>"//"</tt> is appended.  </p></li>

   612      *

   613      *   <li><p> If user information is given then it is appended, followed by

   614      *   a commercial-at character (<tt>'@'</tt>).  Any character not in the

   615      *   <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i>

   616      *   categories is <a href="#quote">quoted</a>.  </p></li>

   617      *

   618      *   <li><p> If a host is given then it is appended.  If the host is a

   619      *   literal IPv6 address but is not enclosed in square brackets

   620      *   (<tt>'['</tt> and <tt>']'</tt>) then the square brackets are added.

   621      *   </p></li>

   622      *

   623      *   <li><p> If a port number is given then a colon character

   624      *   (<tt>':'</tt>) is appended, followed by the port number in decimal.

   625      *   </p></li>

   626      *

   627      *   <li><p> If a path is given then it is appended.  Any character not in

   628      *   the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i>

   629      *   categories, and not equal to the slash character (<tt>'/'</tt>) or the

   630      *   commercial-at character (<tt>'@'</tt>), is quoted.  </p></li>

   631      *

   632      *   <li><p> If a query is given then a question-mark character

   633      *   (<tt>'?'</tt>) is appended, followed by the query.  Any character that

   634      *   is not a <a href="#legal-chars">legal URI character</a> is quoted.

   635      *   </p></li>

   636      *

   637      *   <li><p> Finally, if a fragment is given then a hash character

   638      *   (<tt>'#'</tt>) is appended, followed by the fragment.  Any character

   639      *   that is not a legal URI character is quoted.  </p></li>

   640      *

   641      * </ol>

   642      *

   643      * <p> The resulting URI string is then parsed as if by invoking the {@link

   644      * #URI(String)} constructor and then invoking the {@link

   645      * #parseServerAuthority()} method upon the result; this may cause a {@link

   646      * URISyntaxException} to be thrown.  </p>

   647      *

   648      * @param   scheme    Scheme name

   649      * @param   userInfo  User name and authorization information

   650      * @param   host      Host name

   651      * @param   port      Port number

   652      * @param   path      Path

   653      * @param   query     Query

   654      * @param   fragment  Fragment

   655      *

   656      * @throws URISyntaxException

   657      *         If both a scheme and a path are given but the path is relative,

   658      *         if the URI string constructed from the given components violates

   659      *         RFC&nbsp;2396, or if the authority component of the string is

   660      *         present but cannot be parsed as a server-based authority

   661      */

   662     public URI(String scheme,

   663                String userInfo, String host, int port,

   664                String path, String query, String fragment)

   665         throws URISyntaxException

   666     {

   667         String s = toString(scheme, null,

   668                             null, userInfo, host, port,

   669                             path, query, fragment);

   670         checkPath(s, scheme, path);

   671         new Parser(s).parse(true);

   672     }

   674     /**

   675      * Constructs a hierarchical URI from the given components.

   676      *

   677      * <p> If a scheme is given then the path, if also given, must either be

   678      * empty or begin with a slash character (<tt>'/'</tt>).  Otherwise a

   679      * component of the new URI may be left undefined by passing <tt>null</tt>

   680      * for the corresponding parameter.

   681      *

   682      * <p> This constructor first builds a URI string from the given components

   683      * according to the rules specified in <a

   684      * href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,

   685      * section&nbsp;5.2, step&nbsp;7: </p>

   686      *

   687      * <ol>

   688      *

   689      *   <li><p> Initially, the result string is empty.  </p></li>

   690      *

   691      *   <li><p> If a scheme is given then it is appended to the result,

   692      *   followed by a colon character (<tt>':'</tt>).  </p></li>

   693      *

   694      *   <li><p> If an authority is given then the string <tt>"//"</tt> is

   695      *   appended, followed by the authority.  If the authority contains a

   696      *   literal IPv6 address then the address must be enclosed in square

   697      *   brackets (<tt>'['</tt> and <tt>']'</tt>).  Any character not in the

   698      *   <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i>

   699      *   categories, and not equal to the commercial-at character

   700      *   (<tt>'@'</tt>), is <a href="#quote">quoted</a>.  </p></li>

   701      *

   702      *   <li><p> If a path is given then it is appended.  Any character not in

   703      *   the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i>

   704      *   categories, and not equal to the slash character (<tt>'/'</tt>) or the

   705      *   commercial-at character (<tt>'@'</tt>), is quoted.  </p></li>

   706      *

   707      *   <li><p> If a query is given then a question-mark character

   708      *   (<tt>'?'</tt>) is appended, followed by the query.  Any character that

   709      *   is not a <a href="#legal-chars">legal URI character</a> is quoted.

   710      *   </p></li>

   711      *

   712      *   <li><p> Finally, if a fragment is given then a hash character

   713      *   (<tt>'#'</tt>) is appended, followed by the fragment.  Any character

   714      *   that is not a legal URI character is quoted.  </p></li>

   715      *

   716      * </ol>

   717      *

   718      * <p> The resulting URI string is then parsed as if by invoking the {@link

   719      * #URI(String)} constructor and then invoking the {@link

   720      * #parseServerAuthority()} method upon the result; this may cause a {@link

   721      * URISyntaxException} to be thrown.  </p>

   722      *

   723      * @param   scheme     Scheme name

   724      * @param   authority  Authority

   725      * @param   path       Path

   726      * @param   query      Query

   727      * @param   fragment   Fragment

   728      *

   729      * @throws URISyntaxException

   730      *         If both a scheme and a path are given but the path is relative,

   731      *         if the URI string constructed from the given components violates

   732      *         RFC&nbsp;2396, or if the authority component of the string is

   733      *         present but cannot be parsed as a server-based authority

   734      */

   735     public URI(String scheme,

   736                String authority,

   737                String path, String query, String fragment)

   738         throws URISyntaxException

   739     {

   740         String s = toString(scheme, null,

   741                             authority, null, null, -1,

   742                             path, query, fragment);

   743         checkPath(s, scheme, path);

   744         new Parser(s).parse(false);

   745     }

   747     /**

   748      * Constructs a hierarchical URI from the given components.

   749      *

   750      * <p> A component may be left undefined by passing <tt>null</tt>.

   751      *

   752      * <p> This convenience constructor works as if by invoking the

   753      * seven-argument constructor as follows:

   754      *

   755      * <blockquote><tt>

   756      * new&nbsp;{@link #URI(String, String, String, int, String, String, String)

   757      * URI}(scheme,&nbsp;null,&nbsp;host,&nbsp;-1,&nbsp;path,&nbsp;null,&nbsp;fragment);

   758      * </tt></blockquote>

   759      *

   760      * @param   scheme    Scheme name

   761      * @param   host      Host name

   762      * @param   path      Path

   763      * @param   fragment  Fragment

   764      *

   765      * @throws  URISyntaxException

   766      *          If the URI string constructed from the given components

   767      *          violates RFC&nbsp;2396

   768      */

   public URI(String scheme, String host, String path, String fragment)
       throws URISyntaxException
   {
       // Delegate to the seven-argument form with the remaining
       // components left undefined.
       this(scheme,
            null,       // userInfo
            host,
            -1,         // port
            path,
            null,       // query
            fragment);
   }

   775     /**

   776      * Constructs a URI from the given components.

   777      *

   778      * <p> A component may be left undefined by passing <tt>null</tt>.

   779      *

   780      * <p> This constructor first builds a URI in string form using the given

   781      * components as follows:  </p>

   782      *

   783      * <ol>

   784      *

   785      *   <li><p> Initially, the result string is empty.  </p></li>

   786      *

   787      *   <li><p> If a scheme is given then it is appended to the result,

   788      *   followed by a colon character (<tt>':'</tt>).  </p></li>

   789      *

   790      *   <li><p> If a scheme-specific part is given then it is appended.  Any

   791      *   character that is not a <a href="#legal-chars">legal URI character</a>

   792      *   is <a href="#quote">quoted</a>.  </p></li>

   793      *

   794      *   <li><p> Finally, if a fragment is given then a hash character

   795      *   (<tt>'#'</tt>) is appended to the string, followed by the fragment.

   796      *   Any character that is not a legal URI character is quoted.  </p></li>

   797      *

   798      * </ol>

   799      *

   800      * <p> The resulting URI string is then parsed in order to create the new

   801      * URI instance as if by invoking the {@link #URI(String)} constructor;

   802      * this may cause a {@link URISyntaxException} to be thrown.  </p>

   803      *

   804      * @param   scheme    Scheme name

   805      * @param   ssp       Scheme-specific part

   806      * @param   fragment  Fragment

   807      *

   808      * @throws  URISyntaxException

   809      *          If the URI string constructed from the given components

   810      *          violates RFC&nbsp;2396

   811      */

   812     public URI(String scheme, String ssp, String fragment)

   813         throws URISyntaxException

   814     {

   815         new Parser(toString(scheme, ssp,

   816                             null, null, null, -1,

   817                             null, null, fragment))

   818             .parse(false);

   819     }

   821     /**

   822      * Creates a URI by parsing the given string.

   823      *

   824      * <p> This convenience factory method works as if by invoking the {@link

   825      * #URI(String)} constructor; any {@link URISyntaxException} thrown by the

   826      * constructor is caught and wrapped in a new {@link

   827      * IllegalArgumentException} object, which is then thrown.

   828      *

   829      * <p> This method is provided for use in situations where it is known that

   830      * the given string is a legal URI, for example for URI constants declared

   831      * within a program, and so it would be considered a programming error

   832      * for the string not to parse as such.  The constructors, which throw

   833      * {@link URISyntaxException} directly, should be used in situations where a

   834      * URI is being constructed from user input or from some other source that

   835      * may be prone to errors.  </p>

   836      *

   837      * @param  str   The string to be parsed into a URI

   838      * @return The new URI

   839      *

   840      * @throws  NullPointerException

   841      *          If <tt>str</tt> is <tt>null</tt>

   842      *

   843      * @throws  IllegalArgumentException

   844      *          If the given string violates RFC&nbsp;2396

   845      */

   846     public static URI create(String str) {

   847         try {

   848             return new URI(str);

   849         } catch (URISyntaxException x) {

   850             throw new IllegalArgumentException(x.getMessage(), x);

   851         }

   852     }

   855     // -- Operations --

   857     /**

   858      * Attempts to parse this URI's authority component, if defined, into

   859      * user-information, host, and port components.

   860      *

   861      * <p> If this URI's authority component has already been recognized as

   862      * being server-based then it will already have been parsed into

   863      * user-information, host, and port components.  In this case, or if this

   864      * URI has no authority component, this method simply returns this URI.

   865      *

   866      * <p> Otherwise this method attempts once more to parse the authority

   867      * component into user-information, host, and port components, and throws

   868      * an exception describing why the authority component could not be parsed

   869      * in that way.

   870      *

   871      * <p> This method is provided because the generic URI syntax specified in

   872      * <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>

   873      * cannot always distinguish a malformed server-based authority from a

   874      * legitimate registry-based authority.  It must therefore treat some

   875      * instances of the former as instances of the latter.  The authority

   876      * component in the URI string <tt>"//foo:bar"</tt>, for example, is not a

   877      * legal server-based authority but it is legal as a registry-based

   878      * authority.

   879      *

   880      * <p> In many common situations, for example when working with URIs that are

   881      * known to be either URNs or URLs, the hierarchical URIs being used will

   882      * always be server-based.  They therefore must either be parsed as such or

   883      * treated as an error.  In these cases a statement such as

   884      *

   885      * <blockquote>

   886      * <tt>URI </tt><i>u</i><tt> = new URI(str).parseServerAuthority();</tt>

   887      * </blockquote>

   888      *

   889      * <p> can be used to ensure that <i>u</i> always refers to a URI that, if

   890      * it has an authority component, has a server-based authority with proper

   891      * user-information, host, and port components.  Invoking this method also

   892      * ensures that if the authority could not be parsed in that way then an

   893      * appropriate diagnostic message can be issued based upon the exception

   894      * that is thrown. </p>

   895      *

   896      * @return  A URI whose authority field has been parsed

   897      *          as a server-based authority

   898      *

   899      * @throws  URISyntaxException

   900      *          If the authority component of this URI is defined

   901      *          but cannot be parsed as a server-based authority

   902      *          according to RFC&nbsp;2396

   903      */

   904     public URI parseServerAuthority()

   905         throws URISyntaxException

   906     {

   907         // We could be clever and cache the error message and index from the

   908         // exception thrown during the original parse, but that would require

   909         // either more fields or a more-obscure representation.

   910         if ((host != null) || (authority == null))

   911             return this;

   912         defineString();

   913         new Parser(string).parse(true);

   914         return this;

   915     }

   917     /**

   918      * Normalizes this URI's path.

   919      *

   920      * <p> If this URI is opaque, or if its path is already in normal form,

   921      * then this URI is returned.  Otherwise a new URI is constructed that is

   922      * identical to this URI except that its path is computed by normalizing

   923      * this URI's path in a manner consistent with <a

   924      * href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,

   925      * section&nbsp;5.2, step&nbsp;6, sub-steps&nbsp;c through&nbsp;f; that is:

   926      * </p>

   927      *

   928      * <ol>

   929      *

   930      *   <li><p> All <tt>"."</tt> segments are removed. </p></li>

   931      *

   932      *   <li><p> If a <tt>".."</tt> segment is preceded by a non-<tt>".."</tt>

   933      *   segment then both of these segments are removed.  This step is

   934      *   repeated until it is no longer applicable. </p></li>

   935      *

   936      *   <li><p> If the path is relative, and if its first segment contains a

   937      *   colon character (<tt>':'</tt>), then a <tt>"."</tt> segment is

   938      *   prepended.  This prevents a relative URI with a path such as

   939      *   <tt>"a:b/c/d"</tt> from later being re-parsed as an opaque URI with a

   940      *   scheme of <tt>"a"</tt> and a scheme-specific part of <tt>"b/c/d"</tt>.

   941      *   <b><i>(Deviation from RFC&nbsp;2396)</i></b> </p></li>

   942      *

   943      * </ol>

   944      *

   945      * <p> A normalized path will begin with one or more <tt>".."</tt> segments

   946      * if there were insufficient non-<tt>".."</tt> segments preceding them to

   947      * allow their removal.  A normalized path will begin with a <tt>"."</tt>

   948      * segment if one was inserted by step 3 above.  Otherwise, a normalized

   949      * path will not contain any <tt>"."</tt> or <tt>".."</tt> segments. </p>

   950      *

   951      * @return  A URI equivalent to this URI,

   952      *          but whose path is in normal form

   953      */

   954     public URI normalize() {

   955         return normalize(this);

   956     }

   958     /**

   959      * Resolves the given URI against this URI.

   960      *

   961      * <p> If the given URI is already absolute, or if this URI is opaque, then

   962      * the given URI is returned.

   963      *

   964      * <p><a name="resolve-frag"></a> If the given URI's fragment component is

   965      * defined, its path component is empty, and its scheme, authority, and

   966      * query components are undefined, then a URI with the given fragment but

   967      * with all other components equal to those of this URI is returned.  This

   968      * allows a URI representing a standalone fragment reference, such as

   969      * <tt>"#foo"</tt>, to be usefully resolved against a base URI.

   970      *

   971      * <p> Otherwise this method constructs a new hierarchical URI in a manner

   972      * consistent with <a

   973      * href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,

   974      * section&nbsp;5.2; that is: </p>

   975      *

   976      * <ol>

   977      *

   978      *   <li><p> A new URI is constructed with this URI's scheme and the given

   979      *   URI's query and fragment components. </p></li>

   980      *

   981      *   <li><p> If the given URI has an authority component then the new URI's

   982      *   authority and path are taken from the given URI. </p></li>

   983      *

   984      *   <li><p> Otherwise the new URI's authority component is copied from

   985      *   this URI, and its path is computed as follows: </p>

   986      *

   987      *   <ol type=a>

   988      *

   989      *     <li><p> If the given URI's path is absolute then the new URI's path

   990      *     is taken from the given URI. </p></li>

   991      *

   992      *     <li><p> Otherwise the given URI's path is relative, and so the new

   993      *     URI's path is computed by resolving the path of the given URI

   994      *     against the path of this URI.  This is done by concatenating all but

   995      *     the last segment of this URI's path, if any, with the given URI's

   996      *     path and then normalizing the result as if by invoking the {@link

   997      *     #normalize() normalize} method. </p></li>

   998      *

   999      *   </ol></li>

  1000      *

  1001      * </ol>

  1002      *

  1003      * <p> The result of this method is absolute if, and only if, either this

  1004      * URI is absolute or the given URI is absolute.  </p>

  1005      *

  1006      * @param  uri  The URI to be resolved against this URI

  1007      * @return The resulting URI

  1008      *

  1009      * @throws  NullPointerException

  1010      *          If <tt>uri</tt> is <tt>null</tt>

  1011      */

  1012     public URI resolve(URI uri) {

  1013         return resolve(this, uri);

  1014     }

  1016     /**

  1017      * Constructs a new URI by parsing the given string and then resolving it

  1018      * against this URI.

  1019      *

  1020      * <p> This convenience method works as if invoking it were equivalent to

  1021      * evaluating the expression <tt>{@link #resolve(java.net.URI)

  1022      * resolve}(URI.{@link #create(String) create}(str))</tt>. </p>

  1023      *

  1024      * @param  str   The string to be parsed into a URI

  1025      * @return The resulting URI

  1026      *

  1027      * @throws  NullPointerException

  1028      *          If <tt>str</tt> is <tt>null</tt>

  1029      *

  1030      * @throws  IllegalArgumentException

  1031      *          If the given string violates RFC&nbsp;2396

  1032      */

  1033     public URI resolve(String str) {

  1034         return resolve(URI.create(str));

  1035     }

  1037     /**

  1038      * Relativizes the given URI against this URI.

  1039      *

  1040      * <p> The relativization of the given URI against this URI is computed as

  1041      * follows: </p>

  1042      *

  1043      * <ol>

  1044      *

  1045      *   <li><p> If either this URI or the given URI are opaque, or if the

  1046      *   scheme and authority components of the two URIs are not identical, or

  1047      *   if the path of this URI is not a prefix of the path of the given URI,

  1048      *   then the given URI is returned. </p></li>

  1049      *

  1050      *   <li><p> Otherwise a new relative hierarchical URI is constructed with

  1051      *   query and fragment components taken from the given URI and with a path

  1052      *   component computed by removing this URI's path from the beginning of

  1053      *   the given URI's path. </p></li>

  1054      *

  1055      * </ol>

  1056      *

  1057      * @param  uri  The URI to be relativized against this URI

  1058      * @return The resulting URI

  1059      *

  1060      * @throws  NullPointerException

  1061      *          If <tt>uri</tt> is <tt>null</tt>

  1062      */

  1063     public URI relativize(URI uri) {

  1064         return relativize(this, uri);

  1065     }

  1067     /**

  1068      * Constructs a URL from this URI.

  1069      *

  1070      * <p> This convenience method works as if invoking it were equivalent to

  1071      * evaluating the expression <tt>new&nbsp;URL(this.toString())</tt> after

  1072      * first checking that this URI is absolute. </p>

  1073      *

  1074      * @return  A URL constructed from this URI

  1075      *

  1076      * @throws  IllegalArgumentException

  1077      *          If this URI is not absolute

  1078      *

  1079      * @throws  MalformedURLException

  1080      *          If a protocol handler for the URL could not be found,

  1081      *          or if some other error occurred while constructing the URL

  1082      */

  1083     public URL toURL()

  1084         throws MalformedURLException {

  1085         if (!isAbsolute())

  1086             throw new IllegalArgumentException("URI is not absolute");

  1087         return new URL(toString());

  1088     }

  1090     // -- Component access methods --

  1092     /**

  1093      * Returns the scheme component of this URI.

  1094      *

  1095      * <p> The scheme component of a URI, if defined, only contains characters

  1096      * in the <i>alphanum</i> category and in the string <tt>"-.+"</tt>.  A

  1097      * scheme always starts with an <i>alpha</i> character. <p>

  1098      *

  1099      * The scheme component of a URI cannot contain escaped octets, hence this

  1100      * method does not perform any decoding.

  1101      *

  1102      * @return  The scheme component of this URI,

  1103      *          or <tt>null</tt> if the scheme is undefined

  1104      */

  1105     public String getScheme() {

  1106         return scheme;

  1107     }

  1109     /**

  1110      * Tells whether or not this URI is absolute.

  1111      *

  1112      * <p> A URI is absolute if, and only if, it has a scheme component. </p>

  1113      *

  1114      * @return  <tt>true</tt> if, and only if, this URI is absolute

  1115      */

  1116     public boolean isAbsolute() {

  1117         return scheme != null;

  1118     }

  1120     /**

  1121      * Tells whether or not this URI is opaque.

  1122      *

  1123      * <p> A URI is opaque if, and only if, it is absolute and its

  1124      * scheme-specific part does not begin with a slash character ('/').

  1125      * An opaque URI has a scheme, a scheme-specific part, and possibly

  1126      * a fragment; all other components are undefined. </p>

  1127      *

  1128      * @return  <tt>true</tt> if, and only if, this URI is opaque

  1129      */

  1130     public boolean isOpaque() {

  1131         return path == null;

  1132     }

  1134     /**

  1135      * Returns the raw scheme-specific part of this URI.  The scheme-specific

  1136      * part is never undefined, though it may be empty.

  1137      *

  1138      * <p> The scheme-specific part of a URI only contains legal URI

  1139      * characters. </p>

  1140      *

  1141      * @return  The raw scheme-specific part of this URI

  1142      *          (never <tt>null</tt>)

  1143      */

  1144     public String getRawSchemeSpecificPart() {

  1145         defineSchemeSpecificPart();

  1146         return schemeSpecificPart;

  1147     }

  1149     /**

  1150      * Returns the decoded scheme-specific part of this URI.

  1151      *

  1152      * <p> The string returned by this method is equal to that returned by the

  1153      * {@link #getRawSchemeSpecificPart() getRawSchemeSpecificPart} method

  1154      * except that all sequences of escaped octets are <a

  1155      * href="#decode">decoded</a>.  </p>

  1156      *

  1157      * @return  The decoded scheme-specific part of this URI

  1158      *          (never <tt>null</tt>)

  1159      */

  1160     public String getSchemeSpecificPart() {

  1161         if (decodedSchemeSpecificPart == null)

  1162             decodedSchemeSpecificPart = decode(getRawSchemeSpecificPart());

  1163         return decodedSchemeSpecificPart;

  1164     }

  1166     /**

  1167      * Returns the raw authority component of this URI.

  1168      *

  1169      * <p> The authority component of a URI, if defined, only contains the

  1170      * commercial-at character (<tt>'@'</tt>) and characters in the

  1171      * <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, and <i>other</i>

  1172      * categories.  If the authority is server-based then it is further

  1173      * constrained to have valid user-information, host, and port

  1174      * components. </p>

  1175      *

  1176      * @return  The raw authority component of this URI,

  1177      *          or <tt>null</tt> if the authority is undefined

  1178      */

  1179     public String getRawAuthority() {

  1180         return authority;

  1181     }

  1183     /**

  1184      * Returns the decoded authority component of this URI.

  1185      *

  1186      * <p> The string returned by this method is equal to that returned by the

  1187      * {@link #getRawAuthority() getRawAuthority} method except that all

  1188      * sequences of escaped octets are <a href="#decode">decoded</a>.  </p>

  1189      *

  1190      * @return  The decoded authority component of this URI,

  1191      *          or <tt>null</tt> if the authority is undefined

  1192      */

  1193     public String getAuthority() {

  1194         if (decodedAuthority == null)

  1195             decodedAuthority = decode(authority);

  1196         return decodedAuthority;

  1197     }

  1199     /**

  1200      * Returns the raw user-information component of this URI.

  1201      *

  1202      * <p> The user-information component of a URI, if defined, only contains

  1203      * characters in the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, and

  1204      * <i>other</i> categories. </p>

  1205      *

  1206      * @return  The raw user-information component of this URI,

  1207      *          or <tt>null</tt> if the user information is undefined

  1208      */

  1209     public String getRawUserInfo() {

  1210         return userInfo;

  1211     }

  1213     /**

  1214      * Returns the decoded user-information component of this URI.

  1215      *

  1216      * <p> The string returned by this method is equal to that returned by the

  1217      * {@link #getRawUserInfo() getRawUserInfo} method except that all

  1218      * sequences of escaped octets are <a href="#decode">decoded</a>.  </p>

  1219      *

  1220      * @return  The decoded user-information component of this URI,

  1221      *          or <tt>null</tt> if the user information is undefined

  1222      */

  1223     public String getUserInfo() {

  1224         if ((decodedUserInfo == null) && (userInfo != null))

  1225             decodedUserInfo = decode(userInfo);

  1226         return decodedUserInfo;

  1227     }

  1229     /**

  1230      * Returns the host component of this URI.

  1231      *

  1232      * <p> The host component of a URI, if defined, will have one of the

  1233      * following forms: </p>

  1234      *

  1235      * <ul type=disc>

  1236      *

  1237      *   <li><p> A domain name consisting of one or more <i>labels</i>

  1238      *   separated by period characters (<tt>'.'</tt>), optionally followed by

  1239      *   a period character.  Each label consists of <i>alphanum</i> characters

  1240      *   as well as hyphen characters (<tt>'-'</tt>), though hyphens never

  1241      *   occur as the first or last characters in a label. The rightmost

  1242      *   label of a domain name consisting of two or more labels, begins

  1243      *   with an <i>alpha</i> character. </p></li>

  1244      *

  1245      *   <li><p> A dotted-quad IPv4 address of the form

  1246      *   <i>digit</i><tt>+.</tt><i>digit</i><tt>+.</tt><i>digit</i><tt>+.</tt><i>digit</i><tt>+</tt>,

  1247      *   where no <i>digit</i> sequence is longer than three characters and no

  1248      *   sequence has a value larger than 255. </p></li>

  1249      *

  1250      *   <li><p> An IPv6 address enclosed in square brackets (<tt>'['</tt> and

  1251      *   <tt>']'</tt>) and consisting of hexadecimal digits, colon characters

  1252      *   (<tt>':'</tt>), and possibly an embedded IPv4 address.  The full

  1253      *   syntax of IPv6 addresses is specified in <a

  1254      *   href="http://www.ietf.org/rfc/rfc2373.txt"><i>RFC&nbsp;2373: IPv6

  1255      *   Addressing Architecture</i></a>.  </p></li>

  1256      *

  1257      * </ul>

  1258      *

  1259      * The host component of a URI cannot contain escaped octets, hence this

  1260      * method does not perform any decoding.

  1261      *

  1262      * @return  The host component of this URI,

  1263      *          or <tt>null</tt> if the host is undefined

  1264      */

  1265     public String getHost() {

  1266         return host;

  1267     }

  1269     /**

  1270      * Returns the port number of this URI.

  1271      *

  1272      * <p> The port component of a URI, if defined, is a non-negative

  1273      * integer. </p>

  1274      *

  1275      * @return  The port component of this URI,

  1276      *          or <tt>-1</tt> if the port is undefined

  1277      */

  1278     public int getPort() {

  1279         return port;

  1280     }

  1282     /**

  1283      * Returns the raw path component of this URI.

  1284      *

  1285      * <p> The path component of a URI, if defined, only contains the slash

  1286      * character (<tt>'/'</tt>), the commercial-at character (<tt>'@'</tt>),

  1287      * and characters in the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>,

  1288      * and <i>other</i> categories. </p>

  1289      *

  1290      * @return  The path component of this URI,

  1291      *          or <tt>null</tt> if the path is undefined

  1292      */

  1293     public String getRawPath() {

  1294         return path;

  1295     }

  1297     /**

  1298      * Returns the decoded path component of this URI.

  1299      *

  1300      * <p> The string returned by this method is equal to that returned by the

  1301      * {@link #getRawPath() getRawPath} method except that all sequences of

  1302      * escaped octets are <a href="#decode">decoded</a>.  </p>

  1303      *

  1304      * @return  The decoded path component of this URI,

  1305      *          or <tt>null</tt> if the path is undefined

  1306      */

  1307     public String getPath() {

  1308         if ((decodedPath == null) && (path != null))

  1309             decodedPath = decode(path);

  1310         return decodedPath;

  1311     }

  1313     /**

  1314      * Returns the raw query component of this URI.

  1315      *

  1316      * <p> The query component of a URI, if defined, only contains legal URI

  1317      * characters. </p>

  1318      *

  1319      * @return  The raw query component of this URI,

  1320      *          or <tt>null</tt> if the query is undefined

  1321      */

  1322     public String getRawQuery() {

  1323         return query;

  1324     }

  1326     /**

  1327      * Returns the decoded query component of this URI.

  1328      *

  1329      * <p> The string returned by this method is equal to that returned by the

  1330      * {@link #getRawQuery() getRawQuery} method except that all sequences of

  1331      * escaped octets are <a href="#decode">decoded</a>.  </p>

  1332      *

  1333      * @return  The decoded query component of this URI,

  1334      *          or <tt>null</tt> if the query is undefined

  1335      */

  1336     public String getQuery() {

  1337         if ((decodedQuery == null) && (query != null))

  1338             decodedQuery = decode(query);

  1339         return decodedQuery;

  1340     }

  1342     /**

  1343      * Returns the raw fragment component of this URI.

  1344      *

  1345      * <p> The fragment component of a URI, if defined, only contains legal URI

  1346      * characters. </p>

  1347      *

  1348      * @return  The raw fragment component of this URI,

  1349      *          or <tt>null</tt> if the fragment is undefined

  1350      */

  1351     public String getRawFragment() {

  1352         return fragment;

  1353     }

  1355     /**

  1356      * Returns the decoded fragment component of this URI.

  1357      *

  1358      * <p> The string returned by this method is equal to that returned by the

  1359      * {@link #getRawFragment() getRawFragment} method except that all

  1360      * sequences of escaped octets are <a href="#decode">decoded</a>.  </p>

  1361      *

  1362      * @return  The decoded fragment component of this URI,

  1363      *          or <tt>null</tt> if the fragment is undefined

  1364      */

  1365     public String getFragment() {

  1366         if ((decodedFragment == null) && (fragment != null))

  1367             decodedFragment = decode(fragment);

  1368         return decodedFragment;

  1369     }

  1372     // -- Equality, comparison, hash code, toString, and serialization --

  1374     /**

  1375      * Tests this URI for equality with another object.

  1376      *

  1377      * <p> If the given object is not a URI then this method immediately

  1378      * returns <tt>false</tt>.

  1379      *

  1380      * <p> For two URIs to be considered equal requires that either both are

  1381      * opaque or both are hierarchical.  Their schemes must either both be

  1382      * undefined or else be equal without regard to case. Their fragments

  1383      * must either both be undefined or else be equal.

  1384      *

  1385      * <p> For two opaque URIs to be considered equal, their scheme-specific

  1386      * parts must be equal.

  1387      *

  1388      * <p> For two hierarchical URIs to be considered equal, their paths must

  1389      * be equal and their queries must either both be undefined or else be

  1390      * equal.  Their authorities must either both be undefined, or both be

  1391      * registry-based, or both be server-based.  If their authorities are

  1392      * defined and are registry-based, then they must be equal.  If their

  1393      * authorities are defined and are server-based, then their hosts must be

  1394      * equal without regard to case, their port numbers must be equal, and

  1395      * their user-information components must be equal.

  1396      *

  1397      * <p> When testing the user-information, path, query, fragment, authority,

  1398      * or scheme-specific parts of two URIs for equality, the raw forms rather

  1399      * than the encoded forms of these components are compared and the

  1400      * hexadecimal digits of escaped octets are compared without regard to

  1401      * case.

  1402      *

  1403      * <p> This method satisfies the general contract of the {@link

  1404      * java.lang.Object#equals(Object) Object.equals} method. </p>

  1405      *

  1406      * @param   ob   The object to which this object is to be compared

  1407      *

  1408      * @return  <tt>true</tt> if, and only if, the given object is a URI that

  1409      *          is identical to this URI

  1410      */

  1411     public boolean equals(Object ob) {

  1412         if (ob == this)

  1413             return true;

  1414         if (!(ob instanceof URI))

  1415             return false;

  1416         URI that = (URI)ob;

  1417         if (this.isOpaque() != that.isOpaque()) return false;

  1418         if (!equalIgnoringCase(this.scheme, that.scheme)) return false;

  1419         if (!equal(this.fragment, that.fragment)) return false;

  1421         // Opaque

  1422         if (this.isOpaque())

  1423             return equal(this.schemeSpecificPart, that.schemeSpecificPart);

  1425         // Hierarchical

  1426         if (!equal(this.path, that.path)) return false;

  1427         if (!equal(this.query, that.query)) return false;

  1429         // Authorities

  1430         if (this.authority == that.authority) return true;

  1431         if (this.host != null) {

  1432             // Server-based

  1433             if (!equal(this.userInfo, that.userInfo)) return false;

  1434             if (!equalIgnoringCase(this.host, that.host)) return false;

  1435             if (this.port != that.port) return false;

  1436         } else if (this.authority != null) {

  1437             // Registry-based

  1438             if (!equal(this.authority, that.authority)) return false;

  1439         } else if (this.authority != that.authority) {

  1440             return false;

  1441         }

  1443         return true;

  1444     }

  1446     /**

  1447      * Returns a hash-code value for this URI.  The hash code is based upon all

  1448      * of the URI's components, and satisfies the general contract of the

  1449      * {@link java.lang.Object#hashCode() Object.hashCode} method.

  1450      *

  1451      * @return  A hash-code value for this URI

  1452      */

  1453     public int hashCode() {

  1454         if (hash != 0)

  1455             return hash;

  1456         int h = hashIgnoringCase(0, scheme);

  1457         h = hash(h, fragment);

  1458         if (isOpaque()) {

  1459             h = hash(h, schemeSpecificPart);

  1460         } else {

  1461             h = hash(h, path);

  1462             h = hash(h, query);

  1463             if (host != null) {

  1464                 h = hash(h, userInfo);

  1465                 h = hashIgnoringCase(h, host);

  1466                 h += 1949 * port;

  1467             } else {

  1468                 h = hash(h, authority);

  1469             }

  1470         }

  1471         hash = h;

  1472         return h;

  1473     }

  1475     /**

  1476      * Compares this URI to another object, which must be a URI.

  1477      *

  1478      * <p> When comparing corresponding components of two URIs, if one

  1479      * component is undefined but the other is defined then the first is

  1480      * considered to be less than the second.  Unless otherwise noted, string

  1481      * components are ordered according to their natural, case-sensitive

  1482      * ordering as defined by the {@link java.lang.String#compareTo(Object)

  1483      * String.compareTo} method.  String components that are subject to

  1484      * encoding are compared by comparing their raw forms rather than their

  1485      * encoded forms.

  1486      *

  1487      * <p> The ordering of URIs is defined as follows: </p>

  1488      *

  1489      * <ul type=disc>

  1490      *

  1491      *   <li><p> Two URIs with different schemes are ordered according to the

  1492      *   ordering of their schemes, without regard to case. </p></li>

  1493      *

  1494      *   <li><p> A hierarchical URI is considered to be less than an opaque URI

  1495      *   with an identical scheme. </p></li>

  1496      *

  1497      *   <li><p> Two opaque URIs with identical schemes are ordered according

  1498      *   to the ordering of their scheme-specific parts. </p></li>

  1499      *

  1500      *   <li><p> Two opaque URIs with identical schemes and scheme-specific

  1501      *   parts are ordered according to the ordering of their

  1502      *   fragments. </p></li>

  1503      *

  1504      *   <li><p> Two hierarchical URIs with identical schemes are ordered

  1505      *   according to the ordering of their authority components: </p>

  1506      *

  1507      *   <ul type=disc>

  1508      *

  1509      *     <li><p> If both authority components are server-based then the URIs

  1510      *     are ordered according to their user-information components; if these

  1511      *     components are identical then the URIs are ordered according to the

  1512      *     ordering of their hosts, without regard to case; if the hosts are

  1513      *     identical then the URIs are ordered according to the ordering of

  1514      *     their ports. </p></li>

  1515      *

  1516      *     <li><p> If one or both authority components are registry-based then

  1517      *     the URIs are ordered according to the ordering of their authority

  1518      *     components. </p></li>

  1519      *

  1520      *   </ul></li>

  1521      *

  1522      *   <li><p> Finally, two hierarchical URIs with identical schemes and

  1523      *   authority components are ordered according to the ordering of their

  1524      *   paths; if their paths are identical then they are ordered according to

  1525      *   the ordering of their queries; if the queries are identical then they

  1526      *   are ordered according to the order of their fragments. </p></li>

  1527      *

  1528      * </ul>

  1529      *

  1530      * <p> This method satisfies the general contract of the {@link

  1531      * java.lang.Comparable#compareTo(Object) Comparable.compareTo}

  1532      * method. </p>

  1533      *

  1534      * @param   that

  1535      *          The object to which this URI is to be compared

  1536      *

  1537      * @return  A negative integer, zero, or a positive integer as this URI is

  1538      *          less than, equal to, or greater than the given URI

  1539      *

  1540      * @throws  ClassCastException

  1541      *          If the given object is not a URI

  1542      */

  1543     public int compareTo(URI that) {

  1544         int c;

  1546         if ((c = compareIgnoringCase(this.scheme, that.scheme)) != 0)

  1547             return c;

  1549         if (this.isOpaque()) {

  1550             if (that.isOpaque()) {

  1551                 // Both opaque

  1552                 if ((c = compare(this.schemeSpecificPart,

  1553                                  that.schemeSpecificPart)) != 0)

  1554                     return c;

  1555                 return compare(this.fragment, that.fragment);

  1556             }

  1557             return +1;                  // Opaque > hierarchical

  1558         } else if (that.isOpaque()) {

  1559             return -1;                  // Hierarchical < opaque

  1560         }

  1562         // Hierarchical

  1563         if ((this.host != null) && (that.host != null)) {

  1564             // Both server-based

  1565             if ((c = compare(this.userInfo, that.userInfo)) != 0)

  1566                 return c;

  1567             if ((c = compareIgnoringCase(this.host, that.host)) != 0)

  1568                 return c;

  1569             if ((c = this.port - that.port) != 0)

  1570                 return c;

  1571         } else {

  1572             // If one or both authorities are registry-based then we simply

  1573             // compare them in the usual, case-sensitive way.  If one is

  1574             // registry-based and one is server-based then the strings are

  1575             // guaranteed to be unequal, hence the comparison will never return

  1576             // zero and the compareTo and equals methods will remain

  1577             // consistent.

  1578             if ((c = compare(this.authority, that.authority)) != 0) return c;

  1579         }

  1581         if ((c = compare(this.path, that.path)) != 0) return c;

  1582         if ((c = compare(this.query, that.query)) != 0) return c;

  1583         return compare(this.fragment, that.fragment);

  1584     }

  1586     /**

  1587      * Returns the content of this URI as a string.

  1588      *

  1589      * <p> If this URI was created by invoking one of the constructors in this

  1590      * class then a string equivalent to the original input string, or to the

  1591      * string computed from the originally-given components, as appropriate, is

  1592      * returned.  Otherwise this URI was created by normalization, resolution,

  1593      * or relativization, and so a string is constructed from this URI's

  1594      * components according to the rules specified in <a

  1595      * href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,

  1596      * section&nbsp;5.2, step&nbsp;7. </p>

  1597      *

  1598      * @return  The string form of this URI

  1599      */

  1600     public String toString() {

  1601         defineString();

  1602         return string;

  1603     }

  1605     /**

  1606      * Returns the content of this URI as a US-ASCII string.

  1607      *

  1608      * <p> If this URI does not contain any characters in the <i>other</i>

  1609      * category then an invocation of this method will return the same value as

  1610      * an invocation of the {@link #toString() toString} method.  Otherwise

  1611      * this method works as if by invoking that method and then <a

  1612      * href="#encode">encoding</a> the result.  </p>

  1613      *

  1614      * @return  The string form of this URI, encoded as needed

  1615      *          so that it only contains characters in the US-ASCII

  1616      *          charset

  1617      */

  1618     public String toASCIIString() {

  1619         defineString();

  1620         return encode(string);

  1621     }

  1624     // -- Serialization support --

  1626     /**

  1627      * Saves the content of this URI to the given serial stream.

  1628      *

  1629      * <p> The only serializable field of a URI instance is its <tt>string</tt>

  1630      * field.  That field is given a value, if it does not have one already,

  1631      * and then the {@link java.io.ObjectOutputStream#defaultWriteObject()}

  1632      * method of the given object-output stream is invoked. </p>

  1633      *

  1634      * @param  os  The object-output stream to which this object

  1635      *             is to be written

  1636      */

  1637     private void writeObject(ObjectOutputStream os)

  1638         throws IOException

  1639     {

  1640         defineString();

  1641         os.defaultWriteObject();        // Writes the string field only

  1642     }

  1644     /**

  1645      * Reconstitutes a URI from the given serial stream.

  1646      *

  1647      * <p> The {@link java.io.ObjectInputStream#defaultReadObject()} method is

  1648      * invoked to read the value of the <tt>string</tt> field.  The result is

  1649      * then parsed in the usual way.

  1650      *

  1651      * @param  is  The object-input stream from which this object

  1652      *             is being read

  1653      */

  1654     private void readObject(ObjectInputStream is)

  1655         throws ClassNotFoundException, IOException

  1656     {

  1657         port = -1;                      // Argh

  1658         is.defaultReadObject();

  1659         try {

  1660             new Parser(string).parse(false);

  1661         } catch (URISyntaxException x) {

  1662             IOException y = new InvalidObjectException("Invalid URI");

  1663             y.initCause(x);

  1664             throw y;

  1665         }

  1666     }

  1669     // -- End of public methods --

  1672     // -- Utility methods for string-field comparison and hashing --

  1674     // These methods return appropriate values for null string arguments,

  1675     // thereby simplifying the equals, hashCode, and compareTo methods.

  1676     //

  1677     // The case-ignoring methods should only be applied to strings whose

  1678     // characters are all known to be US-ASCII.  Because of this restriction,

  1679     // these methods are faster than the similar methods in the String class.

  1681     // US-ASCII only

  1682     private static int toLower(char c) {

  1683         if ((c >= 'A') && (c <= 'Z'))

  1684             return c + ('a' - 'A');

  1685         return c;

  1686     }

  1688     private static boolean equal(String s, String t) {

  1689         if (s == t) return true;

  1690         if ((s != null) && (t != null)) {

  1691             if (s.length() != t.length())

  1692                 return false;

  1693             if (s.indexOf('%') < 0)

  1694                 return s.equals(t);

  1695             int n = s.length();

  1696             for (int i = 0; i < n;) {

  1697                 char c = s.charAt(i);

  1698                 char d = t.charAt(i);

  1699                 if (c != '%') {

  1700                     if (c != d)

  1701                         return false;

  1702                     i++;

  1703                     continue;

  1704                 }

  1705                 i++;

  1706                 if (toLower(s.charAt(i)) != toLower(t.charAt(i)))

  1707                     return false;

  1708                 i++;

  1709                 if (toLower(s.charAt(i)) != toLower(t.charAt(i)))

  1710                     return false;

  1711                 i++;

  1712             }

  1713             return true;

  1714         }

  1715         return false;

  1716     }

  1718     // US-ASCII only

  1719     private static boolean equalIgnoringCase(String s, String t) {

  1720         if (s == t) return true;

  1721         if ((s != null) && (t != null)) {

  1722             int n = s.length();

  1723             if (t.length() != n)

  1724                 return false;

  1725             for (int i = 0; i < n; i++) {

  1726                 if (toLower(s.charAt(i)) != toLower(t.charAt(i)))

  1727                     return false;

  1728             }

  1729             return true;

  1730         }

  1731         return false;

  1732     }

  1734     private static int hash(int hash, String s) {

  1735         if (s == null) return hash;

  1736         return hash * 127 + s.hashCode();

  1737     }

  1739     // US-ASCII only

  1740     private static int hashIgnoringCase(int hash, String s) {

  1741         if (s == null) return hash;

  1742         int h = hash;

  1743         int n = s.length();

  1744         for (int i = 0; i < n; i++)

  1745             h = 31 * h + toLower(s.charAt(i));

  1746         return h;

  1747     }

  1749     private static int compare(String s, String t) {

  1750         if (s == t) return 0;

  1751         if (s != null) {

  1752             if (t != null)

  1753                 return s.compareTo(t);

  1754             else

  1755                 return +1;

  1756         } else {

  1757             return -1;

  1758         }

  1759     }

  1761     // US-ASCII only

  1762     private static int compareIgnoringCase(String s, String t) {

  1763         if (s == t) return 0;

  1764         if (s != null) {

  1765             if (t != null) {

  1766                 int sn = s.length();

  1767                 int tn = t.length();

  1768                 int n = sn < tn ? sn : tn;

  1769                 for (int i = 0; i < n; i++) {

  1770                     int c = toLower(s.charAt(i)) - toLower(t.charAt(i));

  1771                     if (c != 0)

  1772                         return c;

  1773                 }

  1774                 return sn - tn;

  1775             }

  1776             return +1;

  1777         } else {

  1778             return -1;

  1779         }

  1780     }

  1783     // -- String construction --

  1785     // If a scheme is given then the path, if given, must be absolute

  1786     //

  1787     private static void checkPath(String s, String scheme, String path)

  1788         throws URISyntaxException

  1789     {

  1790         if (scheme != null) {

  1791             if ((path != null)

  1792                 && ((path.length() > 0) && (path.charAt(0) != '/')))

  1793                 throw new URISyntaxException(s,

  1794                                              "Relative path in absolute URI");

  1795         }

  1796     }

  1798     private void appendAuthority(StringBuffer sb,

  1799                                  String authority,

  1800                                  String userInfo,

  1801                                  String host,

  1802                                  int port)

  1803     {

  1804         if (host != null) {

  1805             sb.append("//");

  1806             if (userInfo != null) {

  1807                 sb.append(quote(userInfo, L_USERINFO, H_USERINFO));

  1808                 sb.append('@');

  1809             }

  1810             boolean needBrackets = ((host.indexOf(':') >= 0)

  1811                                     && !host.startsWith("[")

  1812                                     && !host.endsWith("]"));

  1813             if (needBrackets) sb.append('[');

  1814             sb.append(host);

  1815             if (needBrackets) sb.append(']');

  1816             if (port != -1) {

  1817                 sb.append(':');

  1818                 sb.append(port);

  1819             }

  1820         } else if (authority != null) {

  1821             sb.append("//");

  1822             if (authority.startsWith("[")) {

  1823                 // authority should (but may not) contain an embedded IPv6 address

  1824                 int end = authority.indexOf("]");

  1825                 String doquote = authority, dontquote = "";

  1826                 if (end != -1 && authority.indexOf(":") != -1) {

  1827                     // the authority contains an IPv6 address

  1828                     if (end == authority.length()) {

  1829                         dontquote = authority;

  1830                         doquote = "";

  1831                     } else {

  1832                         dontquote = authority.substring(0 , end + 1);

  1833                         doquote = authority.substring(end + 1);

  1834                     }

  1835                 }

  1836                 sb.append(dontquote);

  1837                 sb.append(quote(doquote,

  1838                             L_REG_NAME | L_SERVER,

  1839                             H_REG_NAME | H_SERVER));

  1840             } else {

  1841                 sb.append(quote(authority,

  1842                             L_REG_NAME | L_SERVER,

  1843                             H_REG_NAME | H_SERVER));

  1844             }

  1845         }

  1846     }

  1848     private void appendSchemeSpecificPart(StringBuffer sb,

  1849                                           String opaquePart,

  1850                                           String authority,

  1851                                           String userInfo,

  1852                                           String host,

  1853                                           int port,

  1854                                           String path,

  1855                                           String query)

  1856     {

  1857         if (opaquePart != null) {

  1858             /* check if SSP begins with an IPv6 address

  1859              * because we must not quote a literal IPv6 address

  1860              */

  1861             if (opaquePart.startsWith("//[")) {

  1862                 int end =  opaquePart.indexOf("]");

  1863                 if (end != -1 && opaquePart.indexOf(":")!=-1) {

  1864                     String doquote, dontquote;

  1865                     if (end == opaquePart.length()) {

  1866                         dontquote = opaquePart;

  1867                         doquote = "";

  1868                     } else {

  1869                         dontquote = opaquePart.substring(0,end+1);

  1870                         doquote = opaquePart.substring(end+1);

  1871                     }

  1872                     sb.append (dontquote);

  1873                     sb.append(quote(doquote, L_URIC, H_URIC));

  1874                 }

  1875             } else {

  1876                 sb.append(quote(opaquePart, L_URIC, H_URIC));

  1877             }

  1878         } else {

  1879             appendAuthority(sb, authority, userInfo, host, port);

  1880             if (path != null)

  1881                 sb.append(quote(path, L_PATH, H_PATH));

  1882             if (query != null) {

  1883                 sb.append('?');

  1884                 sb.append(quote(query, L_URIC, H_URIC));

  1885             }

  1886         }

  1887     }

  1889     private void appendFragment(StringBuffer sb, String fragment) {

  1890         if (fragment != null) {

  1891             sb.append('#');

  1892             sb.append(quote(fragment, L_URIC, H_URIC));

  1893         }

  1894     }

  1896     private String toString(String scheme,

  1897                             String opaquePart,

  1898                             String authority,

  1899                             String userInfo,

  1900                             String host,

  1901                             int port,

  1902                             String path,

  1903                             String query,

  1904                             String fragment)

  1905     {

  1906         StringBuffer sb = new StringBuffer();

  1907         if (scheme != null) {

  1908             sb.append(scheme);

  1909             sb.append(':');

  1910         }

  1911         appendSchemeSpecificPart(sb, opaquePart,

  1912                                  authority, userInfo, host, port,

  1913                                  path, query);

  1914         appendFragment(sb, fragment);

  1915         return sb.toString();

  1916     }

  1918     private void defineSchemeSpecificPart() {

  1919         if (schemeSpecificPart != null) return;

  1920         StringBuffer sb = new StringBuffer();

  1921         appendSchemeSpecificPart(sb, null, getAuthority(), getUserInfo(),

  1922                                  host, port, getPath(), getQuery());

  1923         if (sb.length() == 0) return;

  1924         schemeSpecificPart = sb.toString();

  1925     }

  1927     private void defineString() {

  1928         if (string != null) return;

  1930         StringBuffer sb = new StringBuffer();

  1931         if (scheme != null) {

  1932             sb.append(scheme);

  1933             sb.append(':');

  1934         }

  1935         if (isOpaque()) {

  1936             sb.append(schemeSpecificPart);

  1937         } else {

  1938             if (host != null) {

  1939                 sb.append("//");

  1940                 if (userInfo != null) {

  1941                     sb.append(userInfo);

  1942                     sb.append('@');

  1943                 }

  1944                 boolean needBrackets = ((host.indexOf(':') >= 0)

  1945                                     && !host.startsWith("[")

  1946                                     && !host.endsWith("]"));

  1947                 if (needBrackets) sb.append('[');

  1948                 sb.append(host);

  1949                 if (needBrackets) sb.append(']');

  1950                 if (port != -1) {

  1951                     sb.append(':');

  1952                     sb.append(port);

  1953                 }

  1954             } else if (authority != null) {

  1955                 sb.append("//");

  1956                 sb.append(authority);

  1957             }

  1958             if (path != null)

  1959                 sb.append(path);

  1960             if (query != null) {

  1961                 sb.append('?');

  1962                 sb.append(query);

  1963             }

  1964         }

  1965         if (fragment != null) {

  1966             sb.append('#');

  1967             sb.append(fragment);

  1968         }

  1969         string = sb.toString();

  1970     }

  1973     // -- Normalization, resolution, and relativization --

  1975     // RFC2396 5.2 (6)

  1976     private static String resolvePath(String base, String child,

  1977                                       boolean absolute)

  1978     {

  1979         int i = base.lastIndexOf('/');

  1980         int cn = child.length();

  1981         String path = "";

  1983         if (cn == 0) {

  1984             // 5.2 (6a)

  1985             if (i >= 0)

  1986                 path = base.substring(0, i + 1);

  1987         } else {

  1988             StringBuffer sb = new StringBuffer(base.length() + cn);

  1989             // 5.2 (6a)

  1990             if (i >= 0)

  1991                 sb.append(base.substring(0, i + 1));

  1992             // 5.2 (6b)

  1993             sb.append(child);

  1994             path = sb.toString();

  1995         }

  1997         // 5.2 (6c-f)

  1998         String np = normalize(path);

  2000         // 5.2 (6g): If the result is absolute but the path begins with "../",

  2001         // then we simply leave the path as-is

  2003         return np;

  2004     }

  2006     // RFC2396 5.2

  2007     private static URI resolve(URI base, URI child) {

  2008         // check if child if opaque first so that NPE is thrown

  2009         // if child is null.

  2010         if (child.isOpaque() || base.isOpaque())

  2011             return child;

  2013         // 5.2 (2): Reference to current document (lone fragment)

  2014         if ((child.scheme == null) && (child.authority == null)

  2015             && child.path.equals("") && (child.fragment != null)

  2016             && (child.query == null)) {

  2017             if ((base.fragment != null)

  2018                 && child.fragment.equals(base.fragment)) {

  2019                 return base;

  2020             }

  2021             URI ru = new URI();

  2022             ru.scheme = base.scheme;

  2023             ru.authority = base.authority;

  2024             ru.userInfo = base.userInfo;

  2025             ru.host = base.host;

  2026             ru.port = base.port;

  2027             ru.path = base.path;

  2028             ru.fragment = child.fragment;

  2029             ru.query = base.query;

  2030             return ru;

  2031         }

  2033         // 5.2 (3): Child is absolute

  2034         if (child.scheme != null)

  2035             return child;

  2037         URI ru = new URI();             // Resolved URI

  2038         ru.scheme = base.scheme;

  2039         ru.query = child.query;

  2040         ru.fragment = child.fragment;

  2042         // 5.2 (4): Authority

  2043         if (child.authority == null) {

  2044             ru.authority = base.authority;

  2045             ru.host = base.host;

  2046             ru.userInfo = base.userInfo;

  2047             ru.port = base.port;

  2049             String cp = (child.path == null) ? "" : child.path;

  2050             if ((cp.length() > 0) && (cp.charAt(0) == '/')) {

  2051                 // 5.2 (5): Child path is absolute

  2052                 ru.path = child.path;

  2053             } else {

  2054                 // 5.2 (6): Resolve relative path

  2055                 ru.path = resolvePath(base.path, cp, base.isAbsolute());

  2056             }

  2057         } else {

  2058             ru.authority = child.authority;

  2059             ru.host = child.host;

  2060             ru.userInfo = child.userInfo;

  2061             ru.host = child.host;

  2062             ru.port = child.port;

  2063             ru.path = child.path;

  2064         }

  2066         // 5.2 (7): Recombine (nothing to do here)

  2067         return ru;

  2068     }

  2070     // If the given URI's path is normal then return the URI;

  2071     // o.w., return a new URI containing the normalized path.

  2072     //

  2073     private static URI normalize(URI u) {

  2074         if (u.isOpaque() || (u.path == null) || (u.path.length() == 0))

  2075             return u;

  2077         String np = normalize(u.path);

  2078         if (np == u.path)

  2079             return u;

  2081         URI v = new URI();

  2082         v.scheme = u.scheme;

  2083         v.fragment = u.fragment;

  2084         v.authority = u.authority;

  2085         v.userInfo = u.userInfo;

  2086         v.host = u.host;

  2087         v.port = u.port;

  2088         v.path = np;

  2089         v.query = u.query;

  2090         return v;

  2091     }

  2093     // If both URIs are hierarchical, their scheme and authority components are

  2094     // identical, and the base path is a prefix of the child's path, then

  2095     // return a relative URI that, when resolved against the base, yields the

  2096     // child; otherwise, return the child.

  2097     //

  2098     private static URI relativize(URI base, URI child) {

  2099         // check if child if opaque first so that NPE is thrown

  2100         // if child is null.

  2101         if (child.isOpaque() || base.isOpaque())

  2102             return child;

  2103         if (!equalIgnoringCase(base.scheme, child.scheme)

  2104             || !equal(base.authority, child.authority))

  2105             return child;

  2107         String bp = normalize(base.path);

  2108         String cp = normalize(child.path);

  2109         if (!bp.equals(cp)) {

  2110             if (!bp.endsWith("/"))

  2111                 bp = bp + "/";

  2112             if (!cp.startsWith(bp))

  2113                 return child;

  2114         }

  2116         URI v = new URI();

  2117         v.path = cp.substring(bp.length());

  2118         v.query = child.query;

  2119         v.fragment = child.fragment;

  2120         return v;

  2121     }

  2125     // -- Path normalization --

  2127     // The following algorithm for path normalization avoids the creation of a

  2128     // string object for each segment, as well as the use of a string buffer to

  2129     // compute the final result, by using a single char array and editing it in

  2130     // place.  The array is first split into segments, replacing each slash

  2131     // with '\0' and creating a segment-index array, each element of which is

  2132     // the index of the first char in the corresponding segment.  We then walk

  2133     // through both arrays, removing ".", "..", and other segments as necessary

  2134     // by setting their entries in the index array to -1.  Finally, the two

  2135     // arrays are used to rejoin the segments and compute the final result.

  2136     //

  2137     // This code is based upon src/solaris/native/java/io/canonicalize_md.c

  2140     // Check the given path to see if it might need normalization.  A path

  2141     // might need normalization if it contains duplicate slashes, a "."

  2142     // segment, or a ".." segment.  Return -1 if no further normalization is

  2143     // possible, otherwise return the number of segments found.

  2144     //

  2145     // This method takes a string argument rather than a char array so that

  2146     // this test can be performed without invoking path.toCharArray().

  2147     //

  2148     static private int needsNormalization(String path) {

  2149         boolean normal = true;

  2150         int ns = 0;                     // Number of segments

  2151         int end = path.length() - 1;    // Index of last char in path

  2152         int p = 0;                      // Index of next char in path

  2154         // Skip initial slashes

  2155         while (p <= end) {

  2156             if (path.charAt(p) != '/') break;

  2157             p++;

  2158         }

  2159         if (p > 1) normal = false;

  2161         // Scan segments

  2162         while (p <= end) {

  2164             // Looking at "." or ".." ?

  2165             if ((path.charAt(p) == '.')

  2166                 && ((p == end)

  2167                     || ((path.charAt(p + 1) == '/')

  2168                         || ((path.charAt(p + 1) == '.')

  2169                             && ((p + 1 == end)

  2170                                 || (path.charAt(p + 2) == '/')))))) {

  2171                 normal = false;

  2172             }

  2173             ns++;

  2175             // Find beginning of next segment

  2176             while (p <= end) {

  2177                 if (path.charAt(p++) != '/')

  2178                     continue;

  2180                 // Skip redundant slashes

  2181                 while (p <= end) {

  2182                     if (path.charAt(p) != '/') break;

  2183                     normal = false;

  2184                     p++;

  2185                 }

  2187                 break;

  2188             }

  2189         }

  2191         return normal ? -1 : ns;

  2192     }

  2195     // Split the given path into segments, replacing slashes with nulls and

  2196     // filling in the given segment-index array.

  2197     //

  2198     // Preconditions:

  2199     //   segs.length == Number of segments in path

  2200     //

  2201     // Postconditions:

  2202     //   All slashes in path replaced by '\0'

  2203     //   segs[i] == Index of first char in segment i (0 <= i < segs.length)

  2204     //

  2205     static private void split(char[] path, int[] segs) {

  2206         int end = path.length - 1;      // Index of last char in path

  2207         int p = 0;                      // Index of next char in path

  2208         int i = 0;                      // Index of current segment

  2210         // Skip initial slashes

  2211         while (p <= end) {

  2212             if (path[p] != '/') break;

  2213             path[p] = '\0';

  2214             p++;

  2215         }

  2217         while (p <= end) {

  2219             // Note start of segment

  2220             segs[i++] = p++;

  2222             // Find beginning of next segment

  2223             while (p <= end) {

  2224                 if (path[p++] != '/')

  2225                     continue;

  2226                 path[p - 1] = '\0';

  2228                 // Skip redundant slashes

  2229                 while (p <= end) {

  2230                     if (path[p] != '/') break;

  2231                     path[p++] = '\0';

  2232                 }

  2233                 break;

  2234             }

  2235         }

  2237         if (i != segs.length)

  2238             throw new InternalError();  // ASSERT

  2239     }

  2242     // Join the segments in the given path according to the given segment-index

  2243     // array, ignoring those segments whose index entries have been set to -1,

  2244     // and inserting slashes as needed.  Return the length of the resulting

  2245     // path.

  2246     //

  2247     // Preconditions:

  2248     //   segs[i] == -1 implies segment i is to be ignored

  2249     //   path computed by split, as above, with '\0' having replaced '/'

  2250     //

  2251     // Postconditions:

    //   path[0] .. path[return value - 1] == Resulting path

  2253     //

  2254     static private int join(char[] path, int[] segs) {

  2255         int ns = segs.length;           // Number of segments

  2256         int end = path.length - 1;      // Index of last char in path

  2257         int p = 0;                      // Index of next path char to write

  2259         if (path[p] == '\0') {

  2260             // Restore initial slash for absolute paths

  2261             path[p++] = '/';

  2262         }

  2264         for (int i = 0; i < ns; i++) {

  2265             int q = segs[i];            // Current segment

  2266             if (q == -1)

  2267                 // Ignore this segment

  2268                 continue;

  2270             if (p == q) {

  2271                 // We're already at this segment, so just skip to its end

  2272                 while ((p <= end) && (path[p] != '\0'))

  2273                     p++;

  2274                 if (p <= end) {

  2275                     // Preserve trailing slash

  2276                     path[p++] = '/';

  2277                 }

  2278             } else if (p < q) {

  2279                 // Copy q down to p

  2280                 while ((q <= end) && (path[q] != '\0'))

  2281                     path[p++] = path[q++];

  2282                 if (q <= end) {

  2283                     // Preserve trailing slash

  2284                     path[p++] = '/';

  2285                 }

  2286             } else

  2287                 throw new InternalError(); // ASSERT false

  2288         }

  2290         return p;

  2291     }

  2294     // Remove "." segments from the given path, and remove segment pairs

  2295     // consisting of a non-".." segment followed by a ".." segment.

  2296     //

  2297     private static void removeDots(char[] path, int[] segs) {

  2298         int ns = segs.length;

  2299         int end = path.length - 1;

  2301         for (int i = 0; i < ns; i++) {

  2302             int dots = 0;               // Number of dots found (0, 1, or 2)

  2304             // Find next occurrence of "." or ".."

  2305             do {

  2306                 int p = segs[i];

  2307                 if (path[p] == '.') {

  2308                     if (p == end) {

  2309                         dots = 1;

  2310                         break;

  2311                     } else if (path[p + 1] == '\0') {

  2312                         dots = 1;

  2313                         break;

  2314                     } else if ((path[p + 1] == '.')

  2315                                && ((p + 1 == end)

  2316                                    || (path[p + 2] == '\0'))) {

  2317                         dots = 2;

  2318                         break;

  2319                     }

  2320                 }

  2321                 i++;

  2322             } while (i < ns);

  2323             if ((i > ns) || (dots == 0))

  2324                 break;

  2326             if (dots == 1) {

  2327                 // Remove this occurrence of "."

  2328                 segs[i] = -1;

  2329             } else {

  2330                 // If there is a preceding non-".." segment, remove both that

  2331                 // segment and this occurrence of ".."; otherwise, leave this

  2332                 // ".." segment as-is.

  2333                 int j;

  2334                 for (j = i - 1; j >= 0; j--) {

  2335                     if (segs[j] != -1) break;

  2336                 }

  2337                 if (j >= 0) {

  2338                     int q = segs[j];

  2339                     if (!((path[q] == '.')

  2340                           && (path[q + 1] == '.')

  2341                           && (path[q + 2] == '\0'))) {

  2342                         segs[i] = -1;

  2343                         segs[j] = -1;

  2344                     }

  2345                 }

  2346             }

  2347         }

  2348     }

  2351     // DEVIATION: If the normalized path is relative, and if the first

  2352     // segment could be parsed as a scheme name, then prepend a "." segment

  2353     //

  2354     private static void maybeAddLeadingDot(char[] path, int[] segs) {

  2356         if (path[0] == '\0')

  2357             // The path is absolute

  2358             return;

  2360         int ns = segs.length;

  2361         int f = 0;                      // Index of first segment

  2362         while (f < ns) {

  2363             if (segs[f] >= 0)

  2364                 break;

  2365             f++;

  2366         }

  2367         if ((f >= ns) || (f == 0))

  2368             // The path is empty, or else the original first segment survived,

  2369             // in which case we already know that no leading "." is needed

  2370             return;

  2372         int p = segs[f];

  2373         while ((p < path.length) && (path[p] != ':') && (path[p] != '\0')) p++;

  2374         if (p >= path.length || path[p] == '\0')

  2375             // No colon in first segment, so no "." needed

  2376             return;

  2378         // At this point we know that the first segment is unused,

  2379         // hence we can insert a "." segment at that position

  2380         path[0] = '.';

  2381         path[1] = '\0';

  2382         segs[0] = 0;

  2383     }

  2386     // Normalize the given path string.  A normal path string has no empty

  2387     // segments (i.e., occurrences of "//"), no segments equal to ".", and no

  2388     // segments equal to ".." that are preceded by a segment not equal to "..".

  2389     // In contrast to Unix-style pathname normalization, for URI paths we

  2390     // always retain trailing slashes.

  2391     //

  2392     private static String normalize(String ps) {

  2394         // Does this path need normalization?

  2395         int ns = needsNormalization(ps);        // Number of segments

  2396         if (ns < 0)

  2397             // Nope -- just return it

  2398             return ps;

  2400         char[] path = ps.toCharArray();         // Path in char-array form

  2402         // Split path into segments

  2403         int[] segs = new int[ns];               // Segment-index array

  2404         split(path, segs);

  2406         // Remove dots

  2407         removeDots(path, segs);

  2409         // Prevent scheme-name confusion

  2410         maybeAddLeadingDot(path, segs);

  2412         // Join the remaining segments and return the result

  2413         String s = new String(path, 0, join(path, segs));

  2414         if (s.equals(ps)) {

  2415             // string was already normalized

  2416             return ps;

  2417         }

  2418         return s;

  2419     }

  2423     // -- Character classes for parsing --

  2425     // RFC2396 precisely specifies which characters in the US-ASCII charset are

  2426     // permissible in the various components of a URI reference.  We here

  2427     // define a set of mask pairs to aid in enforcing these restrictions.  Each

  2428     // mask pair consists of two longs, a low mask and a high mask.  Taken

  2429     // together they represent a 128-bit mask, where bit i is set iff the

  2430     // character with value i is permitted.

  2431     //

  2432     // This approach is more efficient than sequentially searching arrays of

  2433     // permitted characters.  It could be made still more efficient by

  2434     // precompiling the mask information so that a character's presence in a

  2435     // given mask could be determined by a single table lookup.

  2437     // Compute the low-order mask for the characters in the given string

  2438     private static long lowMask(String chars) {

  2439         int n = chars.length();

  2440         long m = 0;

  2441         for (int i = 0; i < n; i++) {

  2442             char c = chars.charAt(i);

  2443             if (c < 64)

  2444                 m |= (1L << c);

  2445         }

  2446         return m;

  2447     }

  2449     // Compute the high-order mask for the characters in the given string

  2450     private static long highMask(String chars) {

  2451         int n = chars.length();

  2452         long m = 0;

  2453         for (int i = 0; i < n; i++) {

  2454             char c = chars.charAt(i);

  2455             if ((c >= 64) && (c < 128))

  2456                 m |= (1L << (c - 64));

  2457         }

  2458         return m;

  2459     }

  2461     // Compute a low-order mask for the characters

  2462     // between first and last, inclusive

  2463     private static long lowMask(char first, char last) {

  2464         long m = 0;

  2465         int f = Math.max(Math.min(first, 63), 0);

  2466         int l = Math.max(Math.min(last, 63), 0);

  2467         for (int i = f; i <= l; i++)

  2468             m |= 1L << i;

  2469         return m;

  2470     }

  2472     // Compute a high-order mask for the characters

  2473     // between first and last, inclusive

  2474     private static long highMask(char first, char last) {

  2475         long m = 0;

  2476         int f = Math.max(Math.min(first, 127), 64) - 64;

  2477         int l = Math.max(Math.min(last, 127), 64) - 64;

  2478         for (int i = f; i <= l; i++)

  2479             m |= 1L << i;

  2480         return m;

  2481     }

  2483     // Tell whether the given character is permitted by the given mask pair

  2484     private static boolean match(char c, long lowMask, long highMask) {

  2485         if (c == 0) // 0 doesn't have a slot in the mask. So, it never matches.

  2486             return false;

  2487         if (c < 64)

  2488             return ((1L << c) & lowMask) != 0;

  2489         if (c < 128)

  2490             return ((1L << (c - 64)) & highMask) != 0;

  2491         return false;

  2492     }

  2494     // Character-class masks, in reverse order from RFC2396 because

  2495     // initializers for static fields cannot make forward references.

  2497     // digit    = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" |

  2498     //            "8" | "9"

    // All digits have values below 64, hence the empty high half.
    private static final long L_DIGIT = lowMask('0', '9');
    private static final long H_DIGIT = 0L;

    // upalpha  = "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" |
    //            "J" | "K" | "L" | "M" | "N" | "O" | "P" | "Q" | "R" |
    //            "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z"
    // All upper-case letters have values of 64 or more, hence the empty
    // low half.
    private static final long L_UPALPHA = 0L;
    private static final long H_UPALPHA = highMask('A', 'Z');

    // lowalpha = "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" |
    //            "j" | "k" | "l" | "m" | "n" | "o" | "p" | "q" | "r" |
    //            "s" | "t" | "u" | "v" | "w" | "x" | "y" | "z"
    private static final long L_LOWALPHA = 0L;
    private static final long H_LOWALPHA = highMask('a', 'z');

    // alpha         = lowalpha | upalpha
    private static final long L_ALPHA = L_LOWALPHA | L_UPALPHA;
    private static final long H_ALPHA = H_LOWALPHA | H_UPALPHA;

    // alphanum      = alpha | digit
    private static final long L_ALPHANUM = L_DIGIT | L_ALPHA;
    private static final long H_ALPHANUM = H_DIGIT | H_ALPHA;

    // hex           = digit | "A" | "B" | "C" | "D" | "E" | "F" |
    //                         "a" | "b" | "c" | "d" | "e" | "f"
    private static final long L_HEX = L_DIGIT;
    private static final long H_HEX = highMask('A', 'F') | highMask('a', 'f');

    // mark          = "-" | "_" | "." | "!" | "~" | "*" | "'" |
    //                 "(" | ")"
    // The same string is passed to both mask builders; each one picks out
    // only the chars that belong to its half.
    private static final long L_MARK = lowMask("-_.!~*'()");
    private static final long H_MARK = highMask("-_.!~*'()");

    // unreserved    = alphanum | mark
    private static final long L_UNRESERVED = L_ALPHANUM | L_MARK;
    private static final long H_UNRESERVED = H_ALPHANUM | H_MARK;

    // reserved      = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
    //                 "$" | "," | "[" | "]"
    // Added per RFC2732: "[", "]"
    private static final long L_RESERVED = lowMask(";/?:@&=+$,[]");
    private static final long H_RESERVED = highMask(";/?:@&=+$,[]");

    // The zero'th bit is used to indicate that escape pairs and non-US-ASCII
    // characters are allowed; this is handled by the scanEscape method below.
    // (The bit is free for this purpose because match() never lets the
    // char with value 0 match.)
    private static final long L_ESCAPED = 1L;
    private static final long H_ESCAPED = 0L;

    // uric          = reserved | unreserved | escaped
    private static final long L_URIC = L_RESERVED | L_UNRESERVED | L_ESCAPED;
    private static final long H_URIC = H_RESERVED | H_UNRESERVED | H_ESCAPED;

    // pchar         = unreserved | escaped |
    //                 ":" | "@" | "&" | "=" | "+" | "$" | ","
    private static final long L_PCHAR
        = L_UNRESERVED | L_ESCAPED | lowMask(":@&=+$,");
    private static final long H_PCHAR
        = H_UNRESERVED | H_ESCAPED | highMask(":@&=+$,");

    // All valid path characters
    // (pchar plus the separators ";" and "/")
    private static final long L_PATH = L_PCHAR | lowMask(";/");
    private static final long H_PATH = H_PCHAR | highMask(";/");

    // Dash, for use in domainlabel and toplabel
    private static final long L_DASH = lowMask("-");
    private static final long H_DASH = highMask("-");

    // Dot, for use in hostnames
    private static final long L_DOT = lowMask(".");
    private static final long H_DOT = highMask(".");

    // userinfo      = *( unreserved | escaped |
    //                    ";" | ":" | "&" | "=" | "+" | "$" | "," )
    private static final long L_USERINFO
        = L_UNRESERVED | L_ESCAPED | lowMask(";:&=+$,");
    private static final long H_USERINFO
        = H_UNRESERVED | H_ESCAPED | highMask(";:&=+$,");

    // reg_name      = 1*( unreserved | escaped | "$" | "," |
    //                     ";" | ":" | "@" | "&" | "=" | "+" )
    private static final long L_REG_NAME
        = L_UNRESERVED | L_ESCAPED | lowMask("$,;:@&=+");
    private static final long H_REG_NAME
        = H_UNRESERVED | H_ESCAPED | highMask("$,;:@&=+");

    // All valid characters for server-based authorities
    private static final long L_SERVER
        = L_USERINFO | L_ALPHANUM | L_DASH | lowMask(".:@[]");
    private static final long H_SERVER
        = H_USERINFO | H_ALPHANUM | H_DASH | highMask(".:@[]");

    // Special case of server authority that represents an IPv6 address
    // In this case, a % does not signify an escape sequence
    private static final long L_SERVER_PERCENT
        = L_SERVER | lowMask("%");
    private static final long H_SERVER_PERCENT
        = H_SERVER | highMask("%");
    // Mask pair that contains only "["
    private static final long L_LEFT_BRACKET = lowMask("[");
    private static final long H_LEFT_BRACKET = highMask("[");

    // scheme        = alpha *( alpha | digit | "+" | "-" | "." )
    // Note that the mask cannot express the rule that the first char
    // must be an alpha; it only describes the permitted char set.
    private static final long L_SCHEME = L_ALPHA | L_DIGIT | lowMask("+-.");
    private static final long H_SCHEME = H_ALPHA | H_DIGIT | highMask("+-.");

    // uric_no_slash = unreserved | escaped | ";" | "?" | ":" | "@" |
    //                 "&" | "=" | "+" | "$" | ","
    private static final long L_URIC_NO_SLASH
        = L_UNRESERVED | L_ESCAPED | lowMask(";?:@&=+$,");
    private static final long H_URIC_NO_SLASH
        = H_UNRESERVED | H_ESCAPED | highMask(";?:@&=+$,");

  2611     // -- Escaping and encoding --

  2613     private final static char[] hexDigits = {

  2614         '0', '1', '2', '3', '4', '5', '6', '7',

  2615         '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'

  2616     };

  2618     private static void appendEscape(StringBuffer sb, byte b) {

  2619         sb.append('%');

  2620         sb.append(hexDigits[(b >> 4) & 0x0f]);

  2621         sb.append(hexDigits[(b >> 0) & 0x0f]);

  2622     }

  2624     private static void appendEncoded(StringBuffer sb, char c) {

  2625         /*

  2626         ByteBuffer bb = null;

  2627         try {

  2628             bb = ThreadLocalCoders.encoderFor("UTF-8")

  2629                 .encode(CharBuffer.wrap("" + c));

  2630         } catch (CharacterCodingException x) {

  2631             assert false;

  2632         }

  2633         while (bb.hasRemaining()) {

  2634             int b = bb.get() & 0xff;

  2635             if (b >= 0x80)

  2636                 appendEscape(sb, (byte)b);

  2637             else

  2638                 sb.append((char)b);

  2639         }

  2640         */

  2641     }

  2643     // Quote any characters in s that are not permitted

  2644     // by the given mask pair

  2645     //

  2646     private static String quote(String s, long lowMask, long highMask) {

  2647         int n = s.length();

  2648         StringBuffer sb = null;

  2649         boolean allowNonASCII = ((lowMask & L_ESCAPED) != 0);

  2650         for (int i = 0; i < s.length(); i++) {

  2651             char c = s.charAt(i);

  2652             if (c < '\u0080') {

  2653                 if (!match(c, lowMask, highMask)) {

  2654                     if (sb == null) {

  2655                         sb = new StringBuffer();

  2656                         sb.append(s.substring(0, i));

  2657                     }

  2658                     appendEscape(sb, (byte)c);

  2659                 } else {

  2660                     if (sb != null)

  2661                         sb.append(c);

  2662                 }

  2663             } else if (allowNonASCII

  2664                        && (Character.isSpaceChar(c)

  2665                            || Character.isISOControl(c))) {

  2666                 if (sb == null) {

  2667                     sb = new StringBuffer();

  2668                     sb.append(s.substring(0, i));

  2669                 }

  2670                 appendEncoded(sb, c);

  2671             } else {

  2672                 if (sb != null)

  2673                     sb.append(c);

  2674             }

  2675         }

  2676         return (sb == null) ? s : sb.toString();

  2677     }

  2679     // Encodes all characters >= \u0080 into escaped, normalized UTF-8 octets,

  2680     // assuming that s is otherwise legal

  2681     //

  2682     private static String encode(String s) {

  2683         int n = s.length();

  2684         if (n == 0)

  2685             return s;

  2687         // First check whether we actually need to encode

  2688         for (int i = 0;;) {

  2689             if (s.charAt(i) >= '\u0080')

  2690                 break;

  2691             if (++i >= n)

  2692                 return s;

  2693         }

  2694 /*

  2695         String ns = Normalizer.normalize(s, Normalizer.Form.NFC);

  2696         ByteBuffer bb = null;

  2697         try {

  2698             bb = ThreadLocalCoders.encoderFor("UTF-8")

  2699                 .encode(CharBuffer.wrap(ns));

  2700         } catch (CharacterCodingException x) {

  2701             assert false;

  2702         }

  2703 */

  2704         StringBuffer sb = new StringBuffer();

  2705         /*

  2706         while (bb.hasRemaining()) {

  2707             int b = bb.get() & 0xff;

  2708             if (b >= 0x80)

  2709                 appendEscape(sb, (byte)b);

  2710             else

  2711                 sb.append((char)b);

  2712         }

  2713         */

  2714         return sb.toString();

  2715     }

  2717     private static int decode(char c) {

  2718         if ((c >= '0') && (c <= '9'))

  2719             return c - '0';

  2720         if ((c >= 'a') && (c <= 'f'))

  2721             return c - 'a' + 10;

  2722         if ((c >= 'A') && (c <= 'F'))

  2723             return c - 'A' + 10;

  2724         assert false;

  2725         return -1;

  2726     }

  2728     private static byte decode(char c1, char c2) {

  2729         return (byte)(  ((decode(c1) & 0xf) << 4)

  2730                       | ((decode(c2) & 0xf) << 0));

  2731     }

  2733     // Evaluates all escapes in s, applying UTF-8 decoding if needed.  Assumes

  2734     // that escapes are well-formed syntactically, i.e., of the form %XX.  If a

  2735     // sequence of escaped octets is not valid UTF-8 then the erroneous octets

  2736     // are replaced with '\uFFFD'.

  2737     // Exception: any "%" found between "[]" is left alone. It is an IPv6 literal

  2738     //            with a scope_id

  2739     //

  2740     private static String decode(String s) {

  2741         if (s == null)

  2742             return s;

  2743         int n = s.length();

  2744         if (n == 0)

  2745             return s;

  2746         if (s.indexOf('%') < 0)

  2747             return s;

  2749         StringBuffer sb = new StringBuffer(n);

  2750         /*

  2751         ByteBuffer bb = ByteBuffer.allocate(n);

  2752         CharBuffer cb = CharBuffer.allocate(n);

  2753         CharsetDecoder dec = ThreadLocalCoders.decoderFor("UTF-8")

  2754             .onMalformedInput(CodingErrorAction.REPLACE)

  2755             .onUnmappableCharacter(CodingErrorAction.REPLACE);

  2757         // This is not horribly efficient, but it will do for now

  2758         char c = s.charAt(0);

  2759         boolean betweenBrackets = false;

  2761         for (int i = 0; i < n;) {

  2762             assert c == s.charAt(i);    // Loop invariant

  2763             if (c == '[') {

  2764                 betweenBrackets = true;

  2765             } else if (betweenBrackets && c == ']') {

  2766                 betweenBrackets = false;

  2767             }

  2768             if (c != '%' || betweenBrackets) {

  2769                 sb.append(c);

  2770                 if (++i >= n)

  2771                     break;

  2772                 c = s.charAt(i);

  2773                 continue;

  2774             }

  2775             bb.clear();

  2776             int ui = i;

  2777             for (;;) {

  2778                 assert (n - i >= 2);

  2779                 bb.put(decode(s.charAt(++i), s.charAt(++i)));

  2780                 if (++i >= n)

  2781                     break;

  2782                 c = s.charAt(i);

  2783                 if (c != '%')

  2784                     break;

  2785             }

  2786             bb.flip();

  2787             cb.clear();

  2788             dec.reset();

  2789             CoderResult cr = dec.decode(bb, cb, true);

  2790             assert cr.isUnderflow();

  2791             cr = dec.flush(cb);

  2792             assert cr.isUnderflow();

  2793             sb.append(cb.flip().toString());

  2794         }

  2795 */

  2796         return sb.toString();

  2797     }

  2800     // -- Parsing --

  2802     // For convenience we wrap the input URI string in a new instance of the

  2803     // following internal class.  This saves always having to pass the input

  2804     // string as an argument to each internal scan/parse method.

  2806     private class Parser {

  2808         private String input;           // URI input string

  2809         private boolean requireServerAuthority = false;

  2811         Parser(String s) {

  2812             input = s;

  2813             string = s;

  2814         }

  2816         // -- Methods for throwing URISyntaxException in various ways --

  2818         private void fail(String reason) throws URISyntaxException {

  2819             throw new URISyntaxException(input, reason);

  2820         }

  2822         private void fail(String reason, int p) throws URISyntaxException {

  2823             throw new URISyntaxException(input, reason, p);

  2824         }

  2826         private void failExpecting(String expected, int p)

  2827             throws URISyntaxException

  2828         {

  2829             fail("Expected " + expected, p);

  2830         }

  2832         private void failExpecting(String expected, String prior, int p)

  2833             throws URISyntaxException

  2834         {

  2835             fail("Expected " + expected + " following " + prior, p);

  2836         }

  2839         // -- Simple access to the input string --

  2841         // Return a substring of the input string

  2842         //

  2843         private String substring(int start, int end) {

  2844             return input.substring(start, end);

  2845         }

  2847         // Return the char at position p,

  2848         // assuming that p < input.length()

  2849         //

  2850         private char charAt(int p) {

  2851             return input.charAt(p);

  2852         }

  2854         // Tells whether start < end and, if so, whether charAt(start) == c

  2855         //

  2856         private boolean at(int start, int end, char c) {

  2857             return (start < end) && (charAt(start) == c);

  2858         }

  2860         // Tells whether start + s.length() < end and, if so,

  2861         // whether the chars at the start position match s exactly

  2862         //

  2863         private boolean at(int start, int end, String s) {

  2864             int p = start;

  2865             int sn = s.length();

  2866             if (sn > end - p)

  2867                 return false;

  2868             int i = 0;

  2869             while (i < sn) {

  2870                 if (charAt(p++) != s.charAt(i)) {

  2871                     break;

  2872                 }

  2873                 i++;

  2874             }

  2875             return (i == sn);

  2876         }

  2879         // -- Scanning --

  2881         // The various scan and parse methods that follow use a uniform

  2882         // convention of taking the current start position and end index as

  2883         // their first two arguments.  The start is inclusive while the end is

  2884         // exclusive, just as in the String class, i.e., a start/end pair

  2885         // denotes the half-open interval [start, end) of the input string.

  2886         //

  2887         // These methods never proceed past the end position.  They may return

  2888         // -1 to indicate outright failure, but more often they simply return

  2889         // the position of the first char after the last char scanned.  Thus

  2890         // a typical idiom is

  2891         //

  2892         //     int p = start;

  2893         //     int q = scan(p, end, ...);

  2894         //     if (q > p)

  2895         //         // We scanned something

  2896         //         ...;

  2897         //     else if (q == p)

  2898         //         // We scanned nothing

  2899         //         ...;

  2900         //     else if (q == -1)

  2901         //         // Something went wrong

  2902         //         ...;

  2905         // Scan a specific char: If the char at the given start position is

  2906         // equal to c, return the index of the next char; otherwise, return the

  2907         // start position.

  2908         //

  2909         private int scan(int start, int end, char c) {

  2910             if ((start < end) && (charAt(start) == c))

  2911                 return start + 1;

  2912             return start;

  2913         }

  2915         // Scan forward from the given start position.  Stop at the first char

  2916         // in the err string (in which case -1 is returned), or the first char

  2917         // in the stop string (in which case the index of the preceding char is

  2918         // returned), or the end of the input string (in which case the length

  2919         // of the input string is returned).  May return the start position if

  2920         // nothing matches.

  2921         //

  2922         private int scan(int start, int end, String err, String stop) {

  2923             int p = start;

  2924             while (p < end) {

  2925                 char c = charAt(p);

  2926                 if (err.indexOf(c) >= 0)

  2927                     return -1;

  2928                 if (stop.indexOf(c) >= 0)

  2929                     break;

  2930                 p++;

  2931             }

  2932             return p;

  2933         }

  2935         // Scan a potential escape sequence, starting at the given position,

  2936         // with the given first char (i.e., charAt(start) == c).

  2937         //

  2938         // This method assumes that if escapes are allowed then visible

  2939         // non-US-ASCII chars are also allowed.

  2940         //

  2941         private int scanEscape(int start, int n, char first)

  2942             throws URISyntaxException

  2943         {

  2944             int p = start;

  2945             char c = first;

  2946             if (c == '%') {

  2947                 // Process escape pair

  2948                 if ((p + 3 <= n)

  2949                     && match(charAt(p + 1), L_HEX, H_HEX)

  2950                     && match(charAt(p + 2), L_HEX, H_HEX)) {

  2951                     return p + 3;

  2952                 }

  2953                 fail("Malformed escape pair", p);

  2954             } else if ((c > 128)

  2955                        && !Character.isSpaceChar(c)

  2956                        && !Character.isISOControl(c)) {

  2957                 // Allow unescaped but visible non-US-ASCII chars

  2958                 return p + 1;

  2959             }

  2960             return p;

  2961         }

  2963         // Scan chars that match the given mask pair

  2964         //

  2965         private int scan(int start, int n, long lowMask, long highMask)

  2966             throws URISyntaxException

  2967         {

  2968             int p = start;

  2969             while (p < n) {

  2970                 char c = charAt(p);

  2971                 if (match(c, lowMask, highMask)) {

  2972                     p++;

  2973                     continue;

  2974                 }

  2975                 if ((lowMask & L_ESCAPED) != 0) {

  2976                     int q = scanEscape(p, n, c);

  2977                     if (q > p) {

  2978                         p = q;

  2979                         continue;

  2980                     }

  2981                 }

  2982                 break;

  2983             }

  2984             return p;

  2985         }

  2987         // Check that each of the chars in [start, end) matches the given mask

  2988         //

  2989         private void checkChars(int start, int end,

  2990                                 long lowMask, long highMask,

  2991                                 String what)

  2992             throws URISyntaxException

  2993         {

  2994             int p = scan(start, end, lowMask, highMask);

  2995             if (p < end)

  2996                 fail("Illegal character in " + what, p);

  2997         }

  2999         // Check that the char at position p matches the given mask

  3000         //

  3001         private void checkChar(int p,

  3002                                long lowMask, long highMask,

  3003                                String what)

  3004             throws URISyntaxException

  3005         {

  3006             checkChars(p, p + 1, lowMask, highMask, what);

  3007         }

  3010         // -- Parsing --

  3012         // [<scheme>:]<scheme-specific-part>[#<fragment>]

  3013         //

  3014         void parse(boolean rsa) throws URISyntaxException {

  3015             requireServerAuthority = rsa;

  3016             int ssp;                    // Start of scheme-specific part

  3017             int n = input.length();

  3018             int p = scan(0, n, "/?#", ":");

  3019             if ((p >= 0) && at(p, n, ':')) {

  3020                 if (p == 0)

  3021                     failExpecting("scheme name", 0);

  3022                 checkChar(0, L_ALPHA, H_ALPHA, "scheme name");

  3023                 checkChars(1, p, L_SCHEME, H_SCHEME, "scheme name");

  3024                 scheme = substring(0, p);

  3025                 p++;                    // Skip ':'

  3026                 ssp = p;

  3027                 if (at(p, n, '/')) {

  3028                     p = parseHierarchical(p, n);

  3029                 } else {

  3030                     int q = scan(p, n, "", "#");

  3031                     if (q <= p)

  3032                         failExpecting("scheme-specific part", p);

  3033                     checkChars(p, q, L_URIC, H_URIC, "opaque part");

  3034                     p = q;

  3035                 }

  3036             } else {

  3037                 ssp = 0;

  3038                 p = parseHierarchical(0, n);

  3039             }

  3040             schemeSpecificPart = substring(ssp, p);

  3041             if (at(p, n, '#')) {

  3042                 checkChars(p + 1, n, L_URIC, H_URIC, "fragment");

  3043                 fragment = substring(p + 1, n);

  3044                 p = n;

  3045             }

  3046             if (p < n)

  3047                 fail("end of URI", p);

  3048         }

  3050         // [//authority]<path>[?<query>]

  3051         //

  3052         // DEVIATION from RFC2396: We allow an empty authority component as

  3053         // long as it's followed by a non-empty path, query component, or

  3054         // fragment component.  This is so that URIs such as "file:///foo/bar"

  3055         // will parse.  This seems to be the intent of RFC2396, though the

  3056         // grammar does not permit it.  If the authority is empty then the

  3057         // userInfo, host, and port components are undefined.

  3058         //

  3059         // DEVIATION from RFC2396: We allow empty relative paths.  This seems

  3060         // to be the intent of RFC2396, but the grammar does not permit it.

  3061         // The primary consequence of this deviation is that "#f" parses as a

  3062         // relative URI with an empty path.

  3063         //

  3064         private int parseHierarchical(int start, int n)

  3065             throws URISyntaxException

  3066         {

  3067             int p = start;

  3068             if (at(p, n, '/') && at(p + 1, n, '/')) {

  3069                 p += 2;

  3070                 int q = scan(p, n, "", "/?#");

  3071                 if (q > p) {

  3072                     p = parseAuthority(p, q);

  3073                 } else if (q < n) {

  3074                     // DEVIATION: Allow empty authority prior to non-empty

  3075                     // path, query component or fragment identifier

  3076                 } else

  3077                     failExpecting("authority", p);

  3078             }

  3079             int q = scan(p, n, "", "?#"); // DEVIATION: May be empty

  3080             checkChars(p, q, L_PATH, H_PATH, "path");

  3081             path = substring(p, q);

  3082             p = q;

  3083             if (at(p, n, '?')) {

  3084                 p++;

  3085                 q = scan(p, n, "", "#");

  3086                 checkChars(p, q, L_URIC, H_URIC, "query");

  3087                 query = substring(p, q);

  3088                 p = q;

  3089             }

  3090             return p;

  3091         }

  3093         // authority     = server | reg_name

  3094         //

  3095         // Ambiguity: An authority that is a registry name rather than a server

  3096         // might have a prefix that parses as a server.  We use the fact that

  3097         // the authority component is always followed by '/' or the end of the

  3098         // input string to resolve this: If the complete authority did not

  3099         // parse as a server then we try to parse it as a registry name.

  3100         //

  3101         private int parseAuthority(int start, int n)

  3102             throws URISyntaxException

  3103         {

  3104             int p = start;

  3105             int q = p;

  3106             URISyntaxException ex = null;

  3108             boolean serverChars;

  3109             boolean regChars;

  3111             if (scan(p, n, "", "]") > p) {

  3112                 // contains a literal IPv6 address, therefore % is allowed

  3113                 serverChars = (scan(p, n, L_SERVER_PERCENT, H_SERVER_PERCENT) == n);

  3114             } else {

  3115                 serverChars = (scan(p, n, L_SERVER, H_SERVER) == n);

  3116             }

  3117             regChars = (scan(p, n, L_REG_NAME, H_REG_NAME) == n);

  3119             if (regChars && !serverChars) {

  3120                 // Must be a registry-based authority

  3121                 authority = substring(p, n);

  3122                 return n;

  3123             }

  3125             if (serverChars) {

  3126                 // Might be (probably is) a server-based authority, so attempt

  3127                 // to parse it as such.  If the attempt fails, try to treat it

  3128                 // as a registry-based authority.

  3129                 try {

  3130                     q = parseServer(p, n);

  3131                     if (q < n)

  3132                         failExpecting("end of authority", q);

  3133                     authority = substring(p, n);

  3134                 } catch (URISyntaxException x) {

  3135                     // Undo results of failed parse

  3136                     userInfo = null;

  3137                     host = null;

  3138                     port = -1;

  3139                     if (requireServerAuthority) {

  3140                         // If we're insisting upon a server-based authority,

  3141                         // then just re-throw the exception

  3142                         throw x;

  3143                     } else {

  3144                         // Save the exception in case it doesn't parse as a

  3145                         // registry either

  3146                         ex = x;

  3147                         q = p;

  3148                     }

  3149                 }

  3150             }

  3152             if (q < n) {

  3153                 if (regChars) {

  3154                     // Registry-based authority

  3155                     authority = substring(p, n);

  3156                 } else if (ex != null) {

  3157                     // Re-throw exception; it was probably due to

  3158                     // a malformed IPv6 address

  3159                     throw ex;

  3160                 } else {

  3161                     fail("Illegal character in authority", q);

  3162                 }

  3163             }

  3165             return n;

  3166         }

  3169         // [<userinfo>@]<host>[:<port>]

  3170         //

  3171         private int parseServer(int start, int n)

  3172             throws URISyntaxException

  3173         {

  3174             int p = start;

  3175             int q;

  3177             // userinfo

  3178             q = scan(p, n, "/?#", "@");

  3179             if ((q >= p) && at(q, n, '@')) {

  3180                 checkChars(p, q, L_USERINFO, H_USERINFO, "user info");

  3181                 userInfo = substring(p, q);

  3182                 p = q + 1;              // Skip '@'

  3183             }

  3185             // hostname, IPv4 address, or IPv6 address

  3186             if (at(p, n, '[')) {

  3187                 // DEVIATION from RFC2396: Support IPv6 addresses, per RFC2732

  3188                 p++;

  3189                 q = scan(p, n, "/?#", "]");

  3190                 if ((q > p) && at(q, n, ']')) {

  3191                     // look for a "%" scope id

  3192                     int r = scan (p, q, "", "%");

  3193                     if (r > p) {

  3194                         parseIPv6Reference(p, r);

  3195                         if (r+1 == q) {

  3196                             fail ("scope id expected");

  3197                         }

  3198                         checkChars (r+1, q, L_ALPHANUM, H_ALPHANUM,

  3199                                                 "scope id");

  3200                     } else {

  3201                         parseIPv6Reference(p, q);

  3202                     }

  3203                     host = substring(p-1, q+1);

  3204                     p = q + 1;

  3205                 } else {

  3206                     failExpecting("closing bracket for IPv6 address", q);

  3207                 }

  3208             } else {

  3209                 q = parseIPv4Address(p, n);

  3210                 if (q <= p)

  3211                     q = parseHostname(p, n);

  3212                 p = q;

  3213             }

  3215             // port

  3216             if (at(p, n, ':')) {

  3217                 p++;

  3218                 q = scan(p, n, "", "/");

  3219                 if (q > p) {

  3220                     checkChars(p, q, L_DIGIT, H_DIGIT, "port number");

  3221                     try {

  3222                         port = Integer.parseInt(substring(p, q));

  3223                     } catch (NumberFormatException x) {

  3224                         fail("Malformed port number", p);

  3225                     }

  3226                     p = q;

  3227                 }

  3228             }

  3229             if (p < n)

  3230                 failExpecting("port number", p);

  3232             return p;

  3233         }

  3235         // Scan a string of decimal digits whose value fits in a byte

  3236         //

  3237         private int scanByte(int start, int n)

  3238             throws URISyntaxException

  3239         {

  3240             int p = start;

  3241             int q = scan(p, n, L_DIGIT, H_DIGIT);

  3242             if (q <= p) return q;

  3243             if (Integer.parseInt(substring(p, q)) > 255) return p;

  3244             return q;

  3245         }

  3247         // Scan an IPv4 address.

  3248         //

  3249         // If the strict argument is true then we require that the given

  3250         // interval contain nothing besides an IPv4 address; if it is false

  3251         // then we only require that it start with an IPv4 address.

  3252         //

  3253         // If the interval does not contain or start with (depending upon the

  3254         // strict argument) a legal IPv4 address characters then we return -1

  3255         // immediately; otherwise we insist that these characters parse as a

  3256         // legal IPv4 address and throw an exception on failure.

  3257         //

  3258         // We assume that any string of decimal digits and dots must be an IPv4

  3259         // address.  It won't parse as a hostname anyway, so making that

  3260         // assumption here allows more meaningful exceptions to be thrown.

  3261         //

  3262         private int scanIPv4Address(int start, int n, boolean strict)

  3263             throws URISyntaxException

  3264         {

  3265             int p = start;

  3266             int q;

  3267             int m = scan(p, n, L_DIGIT | L_DOT, H_DIGIT | H_DOT);

  3268             if ((m <= p) || (strict && (m != n)))

  3269                 return -1;

  3270             for (;;) {

  3271                 // Per RFC2732: At most three digits per byte

  3272                 // Further constraint: Each element fits in a byte

  3273                 if ((q = scanByte(p, m)) <= p) break;   p = q;

  3274                 if ((q = scan(p, m, '.')) <= p) break;  p = q;

  3275                 if ((q = scanByte(p, m)) <= p) break;   p = q;

  3276                 if ((q = scan(p, m, '.')) <= p) break;  p = q;

  3277                 if ((q = scanByte(p, m)) <= p) break;   p = q;

  3278                 if ((q = scan(p, m, '.')) <= p) break;  p = q;

  3279                 if ((q = scanByte(p, m)) <= p) break;   p = q;

  3280                 if (q < m) break;

  3281                 return q;

  3282             }

  3283             fail("Malformed IPv4 address", q);

  3284             return -1;

  3285         }

  3287         // Take an IPv4 address: Throw an exception if the given interval

  3288         // contains anything except an IPv4 address

  3289         //

  3290         private int takeIPv4Address(int start, int n, String expected)

  3291             throws URISyntaxException

  3292         {

  3293             int p = scanIPv4Address(start, n, true);

  3294             if (p <= start)

  3295                 failExpecting(expected, start);

  3296             return p;

  3297         }

  3299         // Attempt to parse an IPv4 address, returning -1 on failure but

  3300         // allowing the given interval to contain [:<characters>] after

  3301         // the IPv4 address.

  3302         //

  3303         private int parseIPv4Address(int start, int n) {

  3304             int p;

  3306             try {

  3307                 p = scanIPv4Address(start, n, false);

  3308             } catch (URISyntaxException x) {

  3309                 return -1;

  3310             } catch (NumberFormatException nfe) {

  3311                 return -1;

  3312             }

  3314             if (p > start && p < n) {

  3315                 // IPv4 address is followed by something - check that

  3316                 // it's a ":" as this is the only valid character to

  3317                 // follow an address.

  3318                 if (charAt(p) != ':') {

  3319                     p = -1;

  3320                 }

  3321             }

  3323             if (p > start)

  3324                 host = substring(start, p);

  3326             return p;

  3327         }

  3329         // hostname      = domainlabel [ "." ] | 1*( domainlabel "." ) toplabel [ "." ]

  3330         // domainlabel   = alphanum | alphanum *( alphanum | "-" ) alphanum

  3331         // toplabel      = alpha | alpha *( alphanum | "-" ) alphanum

  3332         //

  3333         private int parseHostname(int start, int n)

  3334             throws URISyntaxException

  3335         {

  3336             int p = start;

  3337             int q;

  3338             int l = -1;                 // Start of last parsed label

  3340             do {

  3341                 // domainlabel = alphanum [ *( alphanum | "-" ) alphanum ]

  3342                 q = scan(p, n, L_ALPHANUM, H_ALPHANUM);

  3343                 if (q <= p)

  3344                     break;

  3345                 l = p;

  3346                 if (q > p) {

  3347                     p = q;

  3348                     q = scan(p, n, L_ALPHANUM | L_DASH, H_ALPHANUM | H_DASH);

  3349                     if (q > p) {

  3350                         if (charAt(q - 1) == '-')

  3351                             fail("Illegal character in hostname", q - 1);

  3352                         p = q;

  3353                     }

  3354                 }

  3355                 q = scan(p, n, '.');

  3356                 if (q <= p)

  3357                     break;

  3358                 p = q;

  3359             } while (p < n);

  3361             if ((p < n) && !at(p, n, ':'))

  3362                 fail("Illegal character in hostname", p);

  3364             if (l < 0)

  3365                 failExpecting("hostname", start);

  3367             // for a fully qualified hostname check that the rightmost

  3368             // label starts with an alpha character.

  3369             if (l > start && !match(charAt(l), L_ALPHA, H_ALPHA)) {

  3370                 fail("Illegal character in hostname", l);

  3371             }

  3373             host = substring(start, p);

  3374             return p;

  3375         }

  3378         // IPv6 address parsing, from RFC2373: IPv6 Addressing Architecture

  3379         //

  3380         // Bug: The grammar in RFC2373 Appendix B does not allow addresses of

  3381         // the form ::12.34.56.78, which are clearly shown in the examples

  3382         // earlier in the document.  Here is the original grammar:

  3383         //

  3384         //   IPv6address = hexpart [ ":" IPv4address ]

  3385         //   hexpart     = hexseq | hexseq "::" [ hexseq ] | "::" [ hexseq ]

  3386         //   hexseq      = hex4 *( ":" hex4)

  3387         //   hex4        = 1*4HEXDIG

  3388         //

  3389         // We therefore use the following revised grammar:

  3390         //

  3391         //   IPv6address = hexseq [ ":" IPv4address ]

  3392         //                 | hexseq [ "::" [ hexpost ] ]

  3393         //                 | "::" [ hexpost ]

  3394         //   hexpost     = hexseq | hexseq ":" IPv4address | IPv4address

  3395         //   hexseq      = hex4 *( ":" hex4)

  3396         //   hex4        = 1*4HEXDIG

  3397         //

  3398         // This covers all and only the following cases:

  3399         //

  3400         //   hexseq

  3401         //   hexseq : IPv4address

  3402         //   hexseq ::

  3403         //   hexseq :: hexseq

  3404         //   hexseq :: hexseq : IPv4address

  3405         //   hexseq :: IPv4address

  3406         //   :: hexseq

  3407         //   :: hexseq : IPv4address

  3408         //   :: IPv4address

  3409         //   ::

  3410         //

  3411         // Additionally we constrain the IPv6 address as follows :-

  3412         //

  3413         //  i.  IPv6 addresses without compressed zeros should contain

  3414         //      exactly 16 bytes.

  3415         //

  3416         //  ii. IPv6 addresses with compressed zeros should contain

  3417         //      less than 16 bytes.

  3419         private int ipv6byteCount = 0;

  3421         private int parseIPv6Reference(int start, int n)

  3422             throws URISyntaxException

  3423         {

  3424             int p = start;

  3425             int q;

  3426             boolean compressedZeros = false;

  3428             q = scanHexSeq(p, n);

  3430             if (q > p) {

  3431                 p = q;

  3432                 if (at(p, n, "::")) {

  3433                     compressedZeros = true;

  3434                     p = scanHexPost(p + 2, n);

  3435                 } else if (at(p, n, ':')) {

  3436                     p = takeIPv4Address(p + 1,  n, "IPv4 address");

  3437                     ipv6byteCount += 4;

  3438                 }

  3439             } else if (at(p, n, "::")) {

  3440                 compressedZeros = true;

  3441                 p = scanHexPost(p + 2, n);

  3442             }

  3443             if (p < n)

  3444                 fail("Malformed IPv6 address", start);

  3445             if (ipv6byteCount > 16)

  3446                 fail("IPv6 address too long", start);

  3447             if (!compressedZeros && ipv6byteCount < 16)

  3448                 fail("IPv6 address too short", start);

  3449             if (compressedZeros && ipv6byteCount == 16)

  3450                 fail("Malformed IPv6 address", start);

  3452             return p;

  3453         }

  3455         private int scanHexPost(int start, int n)

  3456             throws URISyntaxException

  3457         {

  3458             int p = start;

  3459             int q;

  3461             if (p == n)

  3462                 return p;

  3464             q = scanHexSeq(p, n);

  3465             if (q > p) {

  3466                 p = q;

  3467                 if (at(p, n, ':')) {

  3468                     p++;

  3469                     p = takeIPv4Address(p, n, "hex digits or IPv4 address");

  3470                     ipv6byteCount += 4;

  3471                 }

  3472             } else {

  3473                 p = takeIPv4Address(p, n, "hex digits or IPv4 address");

  3474                 ipv6byteCount += 4;

  3475             }

  3476             return p;

  3477         }

  3479         // Scan a hex sequence; return -1 if one could not be scanned

  3480         //

  3481         private int scanHexSeq(int start, int n)

  3482             throws URISyntaxException

  3483         {

  3484             int p = start;

  3485             int q;

  3487             q = scan(p, n, L_HEX, H_HEX);

  3488             if (q <= p)

  3489                 return -1;

  3490             if (at(q, n, '.'))          // Beginning of IPv4 address

  3491                 return -1;

  3492             if (q > p + 4)

  3493                 fail("IPv6 hexadecimal digit sequence too long", p);

  3494             ipv6byteCount += 2;

  3495             p = q;

  3496             while (p < n) {

  3497                 if (!at(p, n, ':'))

  3498                     break;

  3499                 if (at(p + 1, n, ':'))

  3500                     break;              // "::"

  3501                 p++;

  3502                 q = scan(p, n, L_HEX, H_HEX);

  3503                 if (q <= p)

  3504                     failExpecting("digits for an IPv6 address", p);

  3505                 if (at(q, n, '.')) {    // Beginning of IPv4 address

  3506                     p--;

  3507                     break;

  3508                 }

  3509                 if (q > p + 4)

  3510                     fail("IPv6 hexadecimal digit sequence too long", p);

  3511                 ipv6byteCount += 2;

  3512                 p = q;

  3513             }

  3515             return p;

  3516         }

  3518     }

  3520 }

author	Jaroslav Tulach <jaroslav.tulach@apidesign.org>
	Thu, 31 Oct 2013 11:23:54 +0100
changeset 1398	9926996eca2d
parent 1259	d257b7a37635
permissions	-rw-r--r--