diff -r 000000000000 -r c880a8a8803b rt/emul/compact/src/main/java/sun/invoke/util/BytecodeName.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/rt/emul/compact/src/main/java/sun/invoke/util/BytecodeName.java Sat Aug 09 11:11:13 2014 +0200
@@ -0,0 +1,627 @@
+/*
+ * Copyright (c) 2007, 2011, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation. Oracle designates this
+ * particular file as subject to the "Classpath" exception as provided
+ * by Oracle in the LICENSE file that accompanied this code.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+package sun.invoke.util;
+
+/**
+ * Utility routines for dealing with bytecode-level names.
+ * Includes universal mangling rules for the JVM.
+ *
+ *
+ * <h3>Avoiding Dangerous Characters</h3>
+ *
+ *
+ * The JVM defines a very small set of characters which are illegal
+ * in name spellings. We will slightly extend and regularize this set
+ * into a group of dangerous characters.
+ * These characters will then be replaced, in mangled names, by escape sequences.
+ * In addition, accidental escape sequences must be further escaped.
+ * Finally, a special prefix will be applied if and only if
+ * the mangling would otherwise fail to begin with the escape character.
+ * This happens to cover the corner case of the null string,
+ * and also clearly marks symbols which need demangling.
+ *
+ *
+ * Dangerous characters are the union of all characters forbidden
+ * or otherwise restricted by the JVM specification,
+ * plus their mates, if they are brackets
+ * (<code>[</code> and <code>]</code>,
+ * <code>&lt;</code> and <code>&gt;</code>),
+ * plus, arbitrarily, the colon character <code>:</code>.
+ * There is no distinction between type, method, and field names.
+ * This makes it easier to convert between mangled names of different
+ * types, since they do not need to be decoded (demangled).
+ *
+ *
+ * The escape character is backslash \
+ * (also known as reverse solidus).
+ * This character is, until now, unheard of in bytecode names,
+ * but traditional in the proposed role.
+ *
+ *
+ * <h3>Replacement Characters</h3>
+ *
+ *
+ *
+ * Every escape sequence is two characters
+ * (in fact, two UTF8 bytes) beginning with
+ * the escape character and followed by a
+ * replacement character.
+ * (Since the replacement character is never a backslash,
+ * iterated manglings do not double in size.)
+ *
+ *
+ * Each dangerous character has some rough visual similarity
+ * to its corresponding replacement character.
+ * This makes mangled symbols easier to recognize by sight.
+ *
+ *
+ * The dangerous characters are
+ * <code>/</code> (forward slash, used to delimit package components),
+ * <code>.</code> (dot, also a package delimiter),
+ * <code>;</code> (semicolon, used in signatures),
+ * <code>$</code> (dollar, used in inner classes and synthetic members),
+ * <code>&lt;</code> (left angle),
+ * <code>&gt;</code> (right angle),
+ * <code>[</code> (left square bracket, used in array types),
+ * <code>]</code> (right square bracket, reserved in this scheme for language use),
+ * and <code>:</code> (colon, reserved in this scheme for language use).
+ * Their replacements are, respectively,
+ * <code>|</code> (vertical bar),
+ * <code>,</code> (comma),
+ * <code>?</code> (question mark),
+ * <code>%</code> (percent),
+ * <code>^</code> (caret),
+ * <code>_</code> (underscore),
+ * <code>{</code> (left curly bracket),
+ * <code>}</code> (right curly bracket), and
+ * <code>!</code> (exclamation mark).
+ * In addition, the replacement character for the escape character itself is
+ * <code>-</code> (hyphen),
+ * and the replacement character for the null prefix is
+ * <code>=</code> (equal sign).
+ *
+ *
+ * An escape character \
+ * followed by any of these replacement characters
+ * is an escape sequence, and there are no other escape sequences.
+ * An equal sign is only part of an escape sequence
+ * if it is the second character in the whole string, following a backslash.
+ * Two consecutive backslashes do not form an escape sequence.
+ *
+ *
+ * Each escape sequence replaces a so-called original character
+ * which is either one of the dangerous characters or the escape character.
+ * A null prefix replaces an initial null string, not a character.
+ *
+ *
+ * All this implies that escape sequences cannot overlap and may be
+ * determined all at once for a whole string. Note that a spelling
+ * string can contain accidental escapes, apparent escape
+ * sequences which must not be interpreted as manglings.
+ * These are disabled by replacing their leading backslash with an
+ * escape sequence (<code>\-</code>). To mangle a string, three logical steps
+ * are required, though they may be carried out in one pass:
+ *
+ * <ol>
+ * <li>In each accidental escape, replace the backslash with an escape sequence
+ * (<code>\-</code>).</li>
+ * <li>Replace each dangerous character with an escape sequence
+ * (<code>\|</code> for <code>/</code>, etc.).</li>
+ * <li>If the first two steps introduced any change, and
+ * if the string does not already begin with a backslash, prepend a null prefix
+ * (<code>\=</code>).</li>
+ * </ol>
+ *
+ *
+ * To demangle a mangled string that begins with an escape,
+ * remove any null prefix, and then replace (in parallel)
+ * each escape sequence by its original character.
+ * Spelling strings which contain accidental
+ * escapes must have them replaced, even if those
+ * strings do not contain dangerous characters.
+ * This restriction means that mangling a string always
+ * requires a scan of the string for escapes.
+ * But then, a scan would be required anyway,
+ * to check for dangerous characters.
+ *
+ *
+ * <h3>Nice Properties</h3>
+ *
+ *
+ * If a bytecode name does not contain any escape sequence,
+ * demangling is a no-op: The string demangles to itself.
+ * Such a string is called self-mangling.
+ * Almost all strings are self-mangling.
+ * In practice, to demangle almost any name “found in nature”,
+ * simply verify that it does not begin with a backslash.
+ *
+ *
+ * Mangling is a one-to-one function, while demangling
+ * is a many-to-one function.
+ * A mangled string is defined as validly mangled if
+ * it is in fact the unique mangling of its spelling string.
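+ * Three examples of invalidly mangled strings are <code>\=foo</code>,
+ * <code>\-bar</code>, and <code>baz\!</code>, which demangle to <code>foo</code>,
+ * <code>\bar</code>, and <code>baz\!</code>, but then remangle to <code>foo</code>,
+ * <code>\bar</code>, and <code>\=baz\-!</code>.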
+ * If a language back-end or runtime is using mangled names,
+ * it should never present an invalidly mangled bytecode
+ * name to the JVM. If the runtime encounters one,
+ * it should also report an error, since such an occurrence
+ * probably indicates a bug in name encoding which
+ * will lead to errors in linkage.
+ * However, this note does not propose that the JVM verifier
+ * detect invalidly mangled names.
+ *
+ *
+ * As a result of these rules, it is a simple matter to
+ * compute validly mangled substrings and concatenations
+ * of validly mangled strings, and (with a little care)
+ * these correspond to corresponding operations on their
+ * spelling strings.
+ *
+ *
+ * - Any prefix of a validly mangled string is also validly mangled,
+ * although a null prefix may need to be removed.
+ * - Any suffix of a validly mangled string is also validly mangled,
+ * although a null prefix may need to be added.
+ * - Two validly mangled strings, when concatenated,
+ * are also validly mangled, although any null prefix
+ * must be removed from the second string,
+ * and a trailing backslash on the first string may need escaping,
+ * if it would participate in an accidental escape when followed
+ * by the first character of the second string.
+ *
+ * If languages that include non-Java symbol spellings use this
+ * mangling convention, they will enjoy the following advantages:
+ *
+ *
+ * - They can interoperate via symbols they share in common.
+ * - Low-level tools, such as backtrace printers, will have readable displays.
+ * - Future JVM and language extensions can safely use the dangerous characters
+ * for structuring symbols, but will never interfere with valid spellings.
+ * - Runtimes and compilers can use standard libraries for mangling and demangling.
+ * - Occasional transliterations and name composition will be simple and regular,
+ * for classes, methods, and fields.
+ * - Bytecode names will continue to be compact.
+ * When mangled, spellings will at most double in length, either in
+ * UTF8 or UTF16 format, and most will not change at all.
+ *
+ *
+ *
+ * <h3>Suggestions for Human Readable Presentations</h3>
+ *
+ *
+ *
+ * For human readable displays of symbols,
+ * it will be better to present a string-like quoted
+ * representation of the spelling, because JVM users
+ * are generally familiar with such tokens.
+ * We suggest using single or double quotes before and after
+ * mangled symbols which are not valid Java identifiers,
+ * with quotes, backslashes, and non-printing characters
+ * escaped as if for literals in the Java language.
+ *
+ *
+ * For example, an HTML-like spelling
+ * <code>&lt;pre&gt;</code> mangles to
+ * <code>\^pre\_</code> and could
+ * display more cleanly as
+ * <code>'&lt;pre&gt;'</code>,
+ * with the quotes included.
+ * Such string-like conventions are not suitable
+ * for mangled bytecode names, in part because
+ * dangerous characters must be eliminated, rather
+ * than just quoted. Otherwise internally structured
+ * strings like package prefixes and method signatures
+ * could not be reliably parsed.
+ *
+ *
+ * In such human-readable displays, invalidly mangled
+ * names should not be demangled and quoted,
+ * for this would be misleading. Likewise, JVM symbols
+ * which contain dangerous characters (like dots in field
+ * names or brackets in method names) should not be
+ * simply quoted. The bytecode names
+ * <code>\=phase\,1</code> and
+ * <code>phase.1</code> are distinct,
+ * and in demangled displays they should be presented as
+ * <code>'phase.1'</code> and something like
+ * <code>'phase'.1</code>, respectively.
+ *
+ *
+ * @author John Rose
+ * @version 1.2, 02/06/2008
+ * @see http://blogs.sun.com/jrose/entry/symbolic_freedom_in_the_vm
+ */
+public class BytecodeName {
+ // The only constructor is private and has an empty body; every other member of this class is static.
+ private BytecodeName() { } // static only class
+
+    /**
+     * Converts a source name into the corresponding bytecode name.
+     * The source name should be unqualified: any syntactic markers it
+     * contains (dots, slashes, dollar signs, colons, etc.) are mangled
+     * rather than kept as punctuation.
+     * When nothing needs escaping, the argument itself is returned.
+     * @param s the source name
+     * @return a valid bytecode name which represents the source name
+     */
+    public static String toBytecodeName(String s) {
+        final String mangled = mangle(s);
+        // Sanity checks (only with -ea): either mangle() handed back the very
+        // same instance, or the result starts with the escape character ...
+        assert ((Object) mangled == s || looksMangled(mangled)) : mangled;
+        // ... and the result must demangle back to the original spelling.
+        assert s.equals(toSourceName(mangled)) : s;
+        return mangled;
+    }
+
+    /**
+     * Converts an unqualified bytecode name back into its source name.
+     * The bytecode name must not contain dangerous characters;
+     * in particular, it must not be qualified or segmented by colon {@code ':'}.
+     * Names that do not start with the escape character are returned as is.
+     * @param s the bytecode name
+     * @return the source name, which may possibly have unsafe characters
+     * @throws IllegalArgumentException if the bytecode name is not {@link #isSafeBytecodeName safe}
+     * @see #isSafeBytecodeName(java.lang.String)
+     */
+    public static String toSourceName(String s) {
+        checkSafeBytecodeName(s);
+        if (!looksMangled(s)) {
+            // self-mangling name: nothing to undo
+            return s;
+        }
+        final String source = demangle(s);
+        // With -ea, verify the input was the unique mangling of the result.
+        assert s.equals(mangle(source)) : s+" => "+source+" => "+mangle(source);
+        return source;
+    }
+
+    /**
+     * Given a bytecode name from a classfile, separate it into
+     * components delimited by dangerous characters.
+     * Each resulting array element is either a {@link Character} holding one
+     * dangerous character, or else a {@link String} holding the source name of
+     * a maximal non-empty run of safe characters.  Each such run is passed
+     * through {@link #toSourceName}, so it is demangled if it begins with
+     * the escape character.  An empty run between two adjacent dangerous
+     * characters produces no element.
+     * For example, the qualified class name {@code java/lang/String}
+     * will be parsed into the array {@code {"java", '/', "lang", '/', "String"}}.
+     * The name {@code <init>} will be parsed into {@code {'<', "init", '>'}}.
+     * The name {@code foo/bar$:baz} will be parsed into
+     * {@code {"foo", '/', "bar", '$', ':', "baz"}}.
+     * The name {@code ::\=:foo:\=bar\!baz} will be parsed into
+     * {@code {':', ':', "", ':', "foo", ':', "bar:baz"}}.
+     * @param s the bytecode name to split (may be qualified)
+     * @return a newly allocated array of components; it is empty if {@code s} is empty
+     */
+    public static Object[] parseBytecodeName(String s) {
+        // First pass only counts the components, second pass stores them.
+        final Object[] res = new Object[splitComponents(s, null)];
+        splitComponents(s, res);
+        return res;
+    }
+
+    /**
+     * Walks {@code s} once, splitting it at dangerous characters.
+     * @param s the bytecode name to split
+     * @param out where to store the components, or null to only count them
+     * @return the number of components found
+     */
+    private static int splitComponents(String s, Object[] out) {
+        final int slen = s.length();
+        int count = 0;
+        int runStart = 0;  // start of the pending run of safe characters
+        // i == slen is a virtual terminator which flushes the last run
+        for (int i = 0; i <= slen; i++) {
+            int whichDC = -1;
+            if (i < slen) {
+                whichDC = DANGEROUS_CHARS.indexOf(s.charAt(i));
+                // -1 (not listed) and 0 (backslash) both count as safe here
+                if (whichDC < DANGEROUS_CHAR_FIRST_INDEX) continue;
+            }
+            // got to end of string or next dangerous char
+            if (runStart < i) {
+                if (out != null)
+                    out[count] = toSourceName(s.substring(runStart, i));
+                count++;
+            }
+            if (whichDC >= DANGEROUS_CHAR_FIRST_INDEX) {
+                if (out != null)
+                    out[count] = DANGEROUS_CHARS_CA[whichDC];
+                count++;
+            }
+            runStart = i + 1;
+        }
+        return count;
+    }
+
+    /**
+     * Given a series of components, create a bytecode name for a classfile.
+     * This is the inverse of {@link #parseBytecodeName(java.lang.String)}.
+     * Each {@link String} component is treated as a source name and is
+     * mangled with {@link #toBytecodeName}.  Every other component
+     * (normally a {@link Character} holding a dangerous character, as
+     * produced by {@code parseBytecodeName}) is appended unchanged, using
+     * its string value.  The argument array itself is never modified.
+     * @param components a series of name components
+     * @return the concatenation of all components, after mangling the strings
+     * @throws NullPointerException if the array or any component is null
+     */
+    public static String unparseBytecodeName(Object[] components) {
+        Object[] mangled = null;  // copy of components, made only when one changes
+        for (int i = 0; i < components.length; i++) {
+            Object c = components[i];
+            if (c == null) {
+                // honor the documented contract instead of emitting the text "null"
+                throw new NullPointerException("component " + i + " is null");
+            }
+            if (!(c instanceof String))
+                continue;
+            String mc = toBytecodeName((String) c);
+            if (components.length == 1)
+                return mc;  // usual case
+            // toBytecodeName returns the same instance when nothing was escaped
+            if ((Object) mc != c) {
+                if (mangled == null)
+                    mangled = components.clone();
+                mangled[i] = mc;
+            }
+        }
+        return appendAll(mangled != null ? mangled : components);
+    }
+ private static String appendAll(Object[] components) {
+ if (components.length <= 1) {
+ if (components.length == 1) {
+ return String.valueOf(components[0]);
+ }
+ return "";
+ }
+ int slen = 0;
+ for (Object c : components) {
+ if (c instanceof String)
+ slen += String.valueOf(c).length();
+ else
+ slen += 1;
+ }
+ StringBuilder sb = new StringBuilder(slen);
+ for (Object c : components) {
+ sb.append(c);
+ }
+ return sb.toString();
+ }
+
+    /**
+     * Given a bytecode name, produce the corresponding display name.
+     * This is the source name, plus quotes if needed.
+     * Dangerous characters in the bytecode name are assumed to be
+     * punctuation and are passed through unchanged.
+     * Non-empty runs of non-dangerous characters are demangled
+     * if necessary, and each resulting name is quoted if it is not
+     * a valid Java identifier, or if it contains a dollar sign "$".
+     * Single quotes are used when quoting.
+     * Within quoted names, embedded single quotes and backslashes
+     * are further escaped by prepended backslashes.
+     *
+     * @param s the original bytecode name (which may be qualified)
+     * @return a human-readable presentation
+     */
+    public static String toDisplayName(String s) {
+        final Object[] parts = parseBytecodeName(s);
+        int index = 0;
+        for (Object part : parts) {
+            if (part instanceof String) {
+                // parseBytecodeName has already demangled this name
+                final String name = (String) part;
+                final boolean plainIdent = isJavaIdent(name) && name.indexOf('$') < 0;
+                if (!plainIdent) {
+                    parts[index] = quoteDisplay(name);
+                }
+            }
+            index++;
+        }
+        return appendAll(parts);
+    }
+ private static boolean isJavaIdent(String s) {
+ int slen = s.length();
+ if (slen == 0) return false;
+ if (!Character.isJavaIdentifierStart(s.charAt(0)))
+ return false;
+ for (int i = 1; i < slen; i++) {
+ if (!Character.isJavaIdentifierPart(s.charAt(i)))
+ return false;
+ }
+ return true;
+ }
+ private static String quoteDisplay(String s) {
+ // TO DO: Replace wierd characters in s by C-style escapes.
+ return "'"+s.replaceAll("['\\\\]", "\\\\$0")+"'";
+ }
+
+    /**
+     * Rejects names which are not safe bytecode names.
+     * @param s the proposed bytecode name
+     * @throws IllegalArgumentException if {@link #isSafeBytecodeName} reports false;
+     *         the offending name is the exception message
+     */
+    private static void checkSafeBytecodeName(String s)
+            throws IllegalArgumentException {
+        if (isSafeBytecodeName(s)) {
+            return;
+        }
+        throw new IllegalArgumentException(s);
+    }
+
+    /**
+     * Report whether a simple name is safe as a bytecode name.
+     * Such names are acceptable in class files as class, method, and field names.
+     * Additionally, they are free of "dangerous" characters, even if those
+     * characters are legal in some (or all) names in class files.
+     * The escape character (backslash) does not make a name unsafe.
+     * @param s the proposed bytecode name
+     * @return true if the name is non-empty and all of its characters are safe
+     */
+    public static boolean isSafeBytecodeName(String s) {
+        final int len = s.length();
+        if (len == 0) {
+            return false;
+        }
+        // one dangerous character anywhere spoils the whole name
+        for (int i = 0; i < len; i++) {
+            if (!isSafeBytecodeChar(s.charAt(i))) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    /**
+     * Report whether a character is safe in a bytecode name.
+     * This is true of any unicode character except the following
+     * <em>dangerous characters</em>: {@code ".;:$[]<>/"}.
+     * The escape character (backslash) is reported as safe.
+     * @param c the proposed character
+     * @return true if the character is safe to use in classfiles
+     */
+    public static boolean isSafeBytecodeChar(char c) {
+        // indexOf yields -1 (not listed) or 0 (backslash) for the safe characters
+        return DANGEROUS_CHARS.indexOf(c) < DANGEROUS_CHAR_FIRST_INDEX;
+    }
+
+    /**
+     * Reports whether a name begins with the escape character, which is
+     * how names in need of demangling are recognized.
+     * @param s the bytecode name to test
+     * @return true if {@code s} is non-empty and its first character is the escape character
+     */
+    private static boolean looksMangled(String s) {
+        // Guard the empty string: charAt(0) alone would throw on it.
+        return s.length() > 0 && s.charAt(0) == ESCAPE_C;
+    }
+
+ private static String mangle(String s) {
+ if (s.length() == 0)
+ return NULL_ESCAPE;
+
+ // build this lazily, when we first need an escape:
+ StringBuilder sb = null;
+
+ for (int i = 0, slen = s.length(); i < slen; i++) {
+ char c = s.charAt(i);
+
+ boolean needEscape = false;
+ if (c == ESCAPE_C) {
+ if (i+1 < slen) {
+ char c1 = s.charAt(i+1);
+ if ((i == 0 && c1 == NULL_ESCAPE_C)
+ || c1 != originalOfReplacement(c1)) {
+ // an accidental escape
+ needEscape = true;
+ }
+ }
+ } else {
+ needEscape = isDangerous(c);
+ }
+
+ if (!needEscape) {
+ if (sb != null) sb.append(c);
+ continue;
+ }
+
+ // build sb if this is the first escape
+ if (sb == null) {
+ sb = new StringBuilder(s.length()+10);
+ // mangled names must begin with a backslash:
+ if (s.charAt(0) != ESCAPE_C && i > 0)
+ sb.append(NULL_ESCAPE);
+ // append the string so far, which is unremarkable:
+ sb.append(s.substring(0, i));
+ }
+
+ // rewrite \ to \-, / to \|, etc.
+ sb.append(ESCAPE_C);
+ sb.append(replacementOf(c));
+ }
+
+ if (sb != null) return sb.toString();
+
+ return s;
+ }
+
+ private static String demangle(String s) {
+ // build this lazily, when we first meet an escape:
+ StringBuilder sb = null;
+
+ int stringStart = 0;
+ if (s.startsWith(NULL_ESCAPE))
+ stringStart = 2;
+
+ for (int i = stringStart, slen = s.length(); i < slen; i++) {
+ char c = s.charAt(i);
+
+ if (c == ESCAPE_C && i+1 < slen) {
+ // might be an escape sequence
+ char rc = s.charAt(i+1);
+ char oc = originalOfReplacement(rc);
+ if (oc != rc) {
+ // build sb if this is the first escape
+ if (sb == null) {
+ sb = new StringBuilder(s.length());
+ // append the string so far, which is unremarkable:
+ sb.append(s.substring(stringStart, i));
+ }
+ ++i; // skip both characters
+ c = oc;
+ }
+ }
+
+ if (sb != null)
+ sb.append(c);
+ }
+
+ if (sb != null) return sb.toString();
+
+ return s.substring(stringStart);
+ }
+
+    // All of these are constants: they are final so that no code can
+    // reassign them and silently change the mangling scheme.
+    /** The escape character which starts every escape sequence. */
+    static final char ESCAPE_C = '\\';
+    // empty escape sequence to avoid a null name or illegal prefix
+    static final char NULL_ESCAPE_C = '=';
+    static final String NULL_ESCAPE = ESCAPE_C+""+NULL_ESCAPE_C;
+
+    // The two strings are parallel: REPLACEMENT_CHARS.charAt(i) replaces DANGEROUS_CHARS.charAt(i).
+    static final String DANGEROUS_CHARS   = "\\/.;:$[]<>"; // \\ must be first
+    static final String REPLACEMENT_CHARS =  "-|,?!%{}^_";
+    static final int DANGEROUS_CHAR_FIRST_INDEX = 1; // index after \\
+    static final char[] DANGEROUS_CHARS_A   = DANGEROUS_CHARS.toCharArray();
+    static final char[] REPLACEMENT_CHARS_A = REPLACEMENT_CHARS.toCharArray();
+    /** Boxed copies of {@link #DANGEROUS_CHARS}, at the same indexes. */
+    static final Character[] DANGEROUS_CHARS_CA;
+    static {
+        final int n = DANGEROUS_CHARS.length();
+        Character[] boxed = new Character[n];
+        for (int i = 0; i < n; i++) {
+            boxed[i] = Character.valueOf(DANGEROUS_CHARS.charAt(i));
+        }
+        DANGEROUS_CHARS_CA = boxed;
+    }
+
+    /** One bit per character code below 128; set for every dangerous or replacement character. */
+    static final long[] SPECIAL_BITMAP = new long[2];  // 128 bits
+    static {
+        for (String group : new String[] { DANGEROUS_CHARS, REPLACEMENT_CHARS }) {
+            for (int i = 0; i < group.length(); i++) {
+                final char c = group.charAt(i);
+                // word = code / 64, bit = code % 64
+                SPECIAL_BITMAP[c >>> 6] |= 1L << (c & 63);
+            }
+        }
+    }
+ static boolean isSpecial(char c) {
+ if ((c >>> 6) < SPECIAL_BITMAP.length)
+ return ((SPECIAL_BITMAP[c >>> 6] >> c) & 1) != 0;
+ else
+ return false;
+ }
+ static char replacementOf(char c) {
+ if (!isSpecial(c)) return c;
+ int i = DANGEROUS_CHARS.indexOf(c);
+ if (i < 0) return c;
+ return REPLACEMENT_CHARS.charAt(i);
+ }
+ static char originalOfReplacement(char c) {
+ if (!isSpecial(c)) return c;
+ int i = REPLACEMENT_CHARS.indexOf(c);
+ if (i < 0) return c;
+ return DANGEROUS_CHARS.charAt(i);
+ }
+    /**
+     * Tests whether a character is one of the dangerous characters.
+     * The escape character, at index 0 of {@link #DANGEROUS_CHARS}, does not count.
+     * @param c the character to test
+     * @return true if {@code c} is dangerous
+     */
+    static boolean isDangerous(char c) {
+        return isSpecial(c)
+                && DANGEROUS_CHARS.indexOf(c) >= DANGEROUS_CHAR_FIRST_INDEX;
+    }
+ static int indexOfDangerousChar(String s, int from) {
+ for (int i = from, slen = s.length(); i < slen; i++) {
+ if (isDangerous(s.charAt(i)))
+ return i;
+ }
+ return -1;
+ }
+    /**
+     * Finds the last dangerous character at or before a given position.
+     * @param s the string to search
+     * @param from the index to start searching backward from; values past
+     *        the end of the string are clipped to the last index
+     * @return the index of the dangerous character found, or -1 if there is none
+     */
+    static int lastIndexOfDangerousChar(String s, int from) {
+        int i = Math.min(from, s.length() - 1);
+        while (i >= 0) {
+            if (isDangerous(s.charAt(i))) {
+                return i;
+            }
+            i--;
+        }
+        return -1;
+    }
+
+
+}