diff -r 000000000000 -r c880a8a8803b rt/emul/compact/src/main/java/sun/invoke/util/BytecodeName.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rt/emul/compact/src/main/java/sun/invoke/util/BytecodeName.java Sat Aug 09 11:11:13 2014 +0200 @@ -0,0 +1,627 @@ +/* + * Copyright (c) 2007, 2011, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +package sun.invoke.util; + +/** + * Utility routines for dealing with bytecode-level names. + * Includes universal mangling rules for the JVM. + * + *

Avoiding Dangerous Characters

+ * + *

+ * The JVM defines a very small set of characters which are illegal + * in name spellings. We will slightly extend and regularize this set + * into a group of dangerous characters. + * These characters will then be replaced, in mangled names, by escape sequences. + * In addition, accidental escape sequences must be further escaped. + * Finally, a special prefix will be applied if and only if + * the mangling would otherwise fail to begin with the escape character. + * This happens to cover the corner case of the null string, + * and also clearly marks symbols which need demangling. + *

+ *

+ * Dangerous characters are the union of all characters forbidden + * or otherwise restricted by the JVM specification, + * plus their mates, if they are brackets + * ([ and ], + * < and >), + * plus, arbitrarily, the colon character :. + * There is no distinction between type, method, and field names. + * This makes it easier to convert between mangled names of different + * types, since they do not need to be decoded (demangled). + *

+ *

+ * The escape character is backslash \ + * (also known as reverse solidus). + * This character is, until now, unheard of in bytecode names, + * but traditional in the proposed role. + * + *

+ *

Replacement Characters

+ * + * + *

+ * Every escape sequence is two characters + * (in fact, two UTF8 bytes) beginning with + * the escape character and followed by a + * replacement character. + * (Since the replacement character is never a backslash, + * iterated manglings do not double in size.) + *

+ *

+ * Each dangerous character has some rough visual similarity + * to its corresponding replacement character. + * This makes mangled symbols easier to recognize by sight. + *

+ *

+ * The dangerous characters are + * / (forward slash, used to delimit package components), + * . (dot, also a package delimiter), + * ; (semicolon, used in signatures), + * $ (dollar, used in inner classes and synthetic members), + * < (left angle), + * > (right angle), + * [ (left square bracket, used in array types), + * ] (right square bracket, reserved in this scheme for language use), + * and : (colon, reserved in this scheme for language use). + * Their replacements are, respectively, + * | (vertical bar), + * , (comma), + * ? (question mark), + * % (percent), + * ^ (caret), + * _ (underscore), and + * { (left curly bracket), + * } (right curly bracket), + * ! (exclamation mark). + * In addition, the replacement character for the escape character itself is + * - (hyphen), + * and the replacement character for the null prefix is + * = (equal sign). + *

+ *

+ * An escape character \ + * followed by any of these replacement characters + * is an escape sequence, and there are no other escape sequences. + * An equal sign is only part of an escape sequence + * if it is the second character in the whole string, following a backslash. + * Two consecutive backslashes do not form an escape sequence. + *

+ *

+ * Each escape sequence replaces a so-called original character + * which is either one of the dangerous characters or the escape character. + * A null prefix replaces an initial null string, not a character. + *

+ *

+ * All this implies that escape sequences cannot overlap and may be + * determined all at once for a whole string. Note that a spelling + * string can contain accidental escapes, apparent escape + * sequences which must not be interpreted as manglings. + * These are disabled by replacing their leading backslash with an + * escape sequence (\-). To mangle a string, three logical steps + * are required, though they may be carried out in one pass: + *

+ *
    + *
  1. In each accidental escape, replace the backslash with an escape sequence + * (\-).
  2. + *
  3. Replace each dangerous character with an escape sequence + * (\| for /, etc.).
  4. + *
  5. If the first two steps introduced any change, and + * if the string does not already begin with a backslash, prepend a null prefix (\=).
  6. + *
+ * + * To demangle a mangled string that begins with an escape, + * remove any null prefix, and then replace (in parallel) + * each escape sequence by its original character. + *

Spelling strings which contain accidental + * escapes must have them replaced, even if those + * strings do not contain dangerous characters. + * This restriction means that mangling a string always + * requires a scan of the string for escapes. + * But then, a scan would be required anyway, + * to check for dangerous characters. + * + *

+ *

Nice Properties

+ * + *

+ * If a bytecode name does not contain any escape sequence, + * demangling is a no-op: The string demangles to itself. + * Such a string is called self-mangling. + * Almost all strings are self-mangling. + * In practice, to demangle almost any name “found in nature”, + * simply verify that it does not begin with a backslash. + *

+ *

+ * Mangling is a one-to-one function, while demangling + * is a many-to-one function. + * A mangled string is defined as validly mangled if + * it is in fact the unique mangling of its spelling string. + * Three examples of invalidly mangled strings are \=foo, + * \-bar, and baz\!, which demangle to foo, \bar, and + * baz\!, but then remangle to foo, \bar, and \=baz\-!. + * If a language back-end or runtime is using mangled names, + * it should never present an invalidly mangled bytecode + * name to the JVM. If the runtime encounters one, + * it should also report an error, since such an occurrence + * probably indicates a bug in name encoding which + * will lead to errors in linkage. + * However, this note does not propose that the JVM verifier + * detect invalidly mangled names. + *

+ *

+ * As a result of these rules, it is a simple matter to + * compute validly mangled substrings and concatenations + * of validly mangled strings, and (with a little care) + * these correspond to corresponding operations on their + * spelling strings. + *

+ * + *

If languages that include non-Java symbol spellings use this + * mangling convention, they will enjoy the following advantages: + *

+ * + * + * + *

Suggestions for Human Readable Presentations

+ * + * + *

+ * For human readable displays of symbols, + * it will be better to present a string-like quoted + * representation of the spelling, because JVM users + * are generally familiar with such tokens. + * We suggest using single or double quotes before and after + * mangled symbols which are not valid Java identifiers, + * with quotes, backslashes, and non-printing characters + * escaped as if for literals in the Java language. + *

+ *

+ * For example, an HTML-like spelling + * <pre> mangles to + * \^pre\_ and could + * display more cleanly as + * '<pre>', + * with the quotes included. + * Such string-like conventions are not suitable + * for mangled bytecode names, in part because + * dangerous characters must be eliminated, rather + * than just quoted. Otherwise internally structured + * strings like package prefixes and method signatures + * could not be reliably parsed. + *

+ *

+ * In such human-readable displays, invalidly mangled + * names should not be demangled and quoted, + * for this would be misleading. Likewise, JVM symbols + * which contain dangerous characters (like dots in field + * names or brackets in method names) should not be + * simply quoted. The bytecode names + * \=phase\,1 and + * phase.1 are distinct, + * and in demangled displays they should be presented as + * 'phase.1' and something like + * 'phase'.1, respectively. + *

+ * + * @author John Rose + * @version 1.2, 02/06/2008 + * @see http://blogs.sun.com/jrose/entry/symbolic_freedom_in_the_vm + */ +public class BytecodeName { + private BytecodeName() { } // static only class + + /** Given a source name, produce the corresponding bytecode name. + * The source name should not be qualified, because any syntactic + * markers (dots, slashes, dollar signs, colons, etc.) will be mangled. + * @param s the source name + * @return a valid bytecode name which represents the source name + */ + public static String toBytecodeName(String s) { + String bn = mangle(s); + assert((Object)bn == s || looksMangled(bn)) : bn; + assert(s.equals(toSourceName(bn))) : s; + return bn; + } + + /** Given an unqualified bytecode name, produce the corresponding source name. + * The bytecode name must not contain dangerous characters. + * In particular, it must not be qualified or segmented by colon {@code ':'}. + * @param s the bytecode name + * @return the source name, which may possibly have unsafe characters + * @throws IllegalArgumentException if the bytecode name is not {@link #isSafeBytecodeName safe} + * @see #isSafeBytecodeName(java.lang.String) + */ + public static String toSourceName(String s) { + checkSafeBytecodeName(s); + String sn = s; + if (looksMangled(s)) { + sn = demangle(s); + assert(s.equals(mangle(sn))) : s+" => "+sn+" => "+mangle(sn); + } + return sn; + } + + /** + * Given a bytecode name from a classfile, separate it into + * components delimited by dangerous characters. + * Each resulting array element will be either a dangerous character, + * or else a safe bytecode name. + * (The safe name might possibly be mangled to hide further dangerous characters.) + * For example, the qualified class name {@code java/lang/String} + * will be parsed into the array {@code {"java", '/', "lang", '/', "String"}}. + * The name {@code <init>} will be parsed into { '<', "init", '>'}} + * The name {@code foo/bar$:baz} will be parsed into + * {@code {"foo", '/', "bar", '$', ':', "baz"}}. + * The name {@code ::\=:foo:\=bar\!baz} will be parsed into + * {@code {':', ':', "", ':', "foo", ':', "bar:baz"}}. + */ + public static Object[] parseBytecodeName(String s) { + int slen = s.length(); + Object[] res = null; + for (int pass = 0; pass <= 1; pass++) { + int fillp = 0; + int lasti = 0; + for (int i = 0; i <= slen; i++) { + int whichDC = -1; + if (i < slen) { + whichDC = DANGEROUS_CHARS.indexOf(s.charAt(i)); + if (whichDC < DANGEROUS_CHAR_FIRST_INDEX) continue; + } + // got to end of string or next dangerous char + if (lasti < i) { + // normal component + if (pass != 0) + res[fillp] = toSourceName(s.substring(lasti, i)); + fillp++; + lasti = i+1; + } + if (whichDC >= DANGEROUS_CHAR_FIRST_INDEX) { + if (pass != 0) + res[fillp] = DANGEROUS_CHARS_CA[whichDC]; + fillp++; + lasti = i+1; + } + } + if (pass != 0) break; + // between passes, build the result array + res = new Object[fillp]; + if (fillp <= 1 && lasti == 0) { + if (fillp != 0) res[0] = toSourceName(s); + break; + } + } + return res; + } + + /** + * Given a series of components, create a bytecode name for a classfile. + * This is the inverse of {@link #parseBytecodeName(java.lang.String)}. + * Each component must either be an interned one-character string of + * a dangerous character, or else a safe bytecode name. + * @param components a series of name components + * @return the concatenation of all components + * @throws IllegalArgumentException if any component contains an unsafe + * character, and is not an interned one-character string + * @throws NullPointerException if any component is null + */ + public static String unparseBytecodeName(Object[] components) { + Object[] components0 = components; + for (int i = 0; i < components.length; i++) { + Object c = components[i]; + if (c instanceof String) { + String mc = toBytecodeName((String) c); + if (i == 0 && components.length == 1) + return mc; // usual case + if ((Object)mc != c) { + if (components == components0) + components = components.clone(); + components[i] = c = mc; + } + } + } + return appendAll(components); + } + private static String appendAll(Object[] components) { + if (components.length <= 1) { + if (components.length == 1) { + return String.valueOf(components[0]); + } + return ""; + } + int slen = 0; + for (Object c : components) { + if (c instanceof String) + slen += String.valueOf(c).length(); + else + slen += 1; + } + StringBuilder sb = new StringBuilder(slen); + for (Object c : components) { + sb.append(c); + } + return sb.toString(); + } + + /** + * Given a bytecode name, produce the corresponding display name. + * This is the source name, plus quotes if needed. + * If the bytecode name contains dangerous characters, + * assume that they are being used as punctuation, + * and pass them through unchanged. + * Non-empty runs of non-dangerous characters are demangled + * if necessary, and the resulting names are quoted if + * they are not already valid Java identifiers, or if + * they contain a dangerous character (i.e., dollar sign "$"). + * Single quotes are used when quoting. + * Within quoted names, embedded single quotes and backslashes + * are further escaped by prepended backslashes. + * + * @param s the original bytecode name (which may be qualified) + * @return a human-readable presentation + */ + public static String toDisplayName(String s) { + Object[] components = parseBytecodeName(s); + for (int i = 0; i < components.length; i++) { + if (!(components[i] instanceof String)) + continue; + String sn = (String) components[i]; + // note that the name is already demangled! + //sn = toSourceName(sn); + if (!isJavaIdent(sn) || sn.indexOf('$') >=0 ) { + components[i] = quoteDisplay(sn); + } + } + return appendAll(components); + } + private static boolean isJavaIdent(String s) { + int slen = s.length(); + if (slen == 0) return false; + if (!Character.isJavaIdentifierStart(s.charAt(0))) + return false; + for (int i = 1; i < slen; i++) { + if (!Character.isJavaIdentifierPart(s.charAt(i))) + return false; + } + return true; + } + private static String quoteDisplay(String s) { + // TO DO: Replace wierd characters in s by C-style escapes. + return "'"+s.replaceAll("['\\\\]", "\\\\$0")+"'"; + } + + private static void checkSafeBytecodeName(String s) + throws IllegalArgumentException { + if (!isSafeBytecodeName(s)) { + throw new IllegalArgumentException(s); + } + } + + /** + * Report whether a simple name is safe as a bytecode name. + * Such names are acceptable in class files as class, method, and field names. + * Additionally, they are free of "dangerous" characters, even if those + * characters are legal in some (or all) names in class files. + * @param s the proposed bytecode name + * @return true if the name is non-empty and all of its characters are safe + */ + public static boolean isSafeBytecodeName(String s) { + if (s.length() == 0) return false; + // check occurrences of each DANGEROUS char + for (char xc : DANGEROUS_CHARS_A) { + if (xc == ESCAPE_C) continue; // not really that dangerous + if (s.indexOf(xc) >= 0) return false; + } + return true; + } + + /** + * Report whether a character is safe in a bytecode name. + * This is true of any unicode character except the following + * dangerous characters: {@code ".;:$[]<>/"}. + * @param s the proposed character + * @return true if the character is safe to use in classfiles + */ + public static boolean isSafeBytecodeChar(char c) { + return DANGEROUS_CHARS.indexOf(c) < DANGEROUS_CHAR_FIRST_INDEX; + } + + private static boolean looksMangled(String s) { + return s.charAt(0) == ESCAPE_C; + } + + private static String mangle(String s) { + if (s.length() == 0) + return NULL_ESCAPE; + + // build this lazily, when we first need an escape: + StringBuilder sb = null; + + for (int i = 0, slen = s.length(); i < slen; i++) { + char c = s.charAt(i); + + boolean needEscape = false; + if (c == ESCAPE_C) { + if (i+1 < slen) { + char c1 = s.charAt(i+1); + if ((i == 0 && c1 == NULL_ESCAPE_C) + || c1 != originalOfReplacement(c1)) { + // an accidental escape + needEscape = true; + } + } + } else { + needEscape = isDangerous(c); + } + + if (!needEscape) { + if (sb != null) sb.append(c); + continue; + } + + // build sb if this is the first escape + if (sb == null) { + sb = new StringBuilder(s.length()+10); + // mangled names must begin with a backslash: + if (s.charAt(0) != ESCAPE_C && i > 0) + sb.append(NULL_ESCAPE); + // append the string so far, which is unremarkable: + sb.append(s.substring(0, i)); + } + + // rewrite \ to \-, / to \|, etc. + sb.append(ESCAPE_C); + sb.append(replacementOf(c)); + } + + if (sb != null) return sb.toString(); + + return s; + } + + private static String demangle(String s) { + // build this lazily, when we first meet an escape: + StringBuilder sb = null; + + int stringStart = 0; + if (s.startsWith(NULL_ESCAPE)) + stringStart = 2; + + for (int i = stringStart, slen = s.length(); i < slen; i++) { + char c = s.charAt(i); + + if (c == ESCAPE_C && i+1 < slen) { + // might be an escape sequence + char rc = s.charAt(i+1); + char oc = originalOfReplacement(rc); + if (oc != rc) { + // build sb if this is the first escape + if (sb == null) { + sb = new StringBuilder(s.length()); + // append the string so far, which is unremarkable: + sb.append(s.substring(stringStart, i)); + } + ++i; // skip both characters + c = oc; + } + } + + if (sb != null) + sb.append(c); + } + + if (sb != null) return sb.toString(); + + return s.substring(stringStart); + } + + static char ESCAPE_C = '\\'; + // empty escape sequence to avoid a null name or illegal prefix + static char NULL_ESCAPE_C = '='; + static String NULL_ESCAPE = ESCAPE_C+""+NULL_ESCAPE_C; + + static final String DANGEROUS_CHARS = "\\/.;:$[]<>"; // \\ must be first + static final String REPLACEMENT_CHARS = "-|,?!%{}^_"; + static final int DANGEROUS_CHAR_FIRST_INDEX = 1; // index after \\ + static char[] DANGEROUS_CHARS_A = DANGEROUS_CHARS.toCharArray(); + static char[] REPLACEMENT_CHARS_A = REPLACEMENT_CHARS.toCharArray(); + static final Character[] DANGEROUS_CHARS_CA; + static { + Character[] dcca = new Character[DANGEROUS_CHARS.length()]; + for (int i = 0; i < dcca.length; i++) + dcca[i] = Character.valueOf(DANGEROUS_CHARS.charAt(i)); + DANGEROUS_CHARS_CA = dcca; + } + + static final long[] SPECIAL_BITMAP = new long[2]; // 128 bits + static { + String SPECIAL = DANGEROUS_CHARS + REPLACEMENT_CHARS; + //System.out.println("SPECIAL = "+SPECIAL); + for (char c : SPECIAL.toCharArray()) { + SPECIAL_BITMAP[c >>> 6] |= 1L << c; + } + } + static boolean isSpecial(char c) { + if ((c >>> 6) < SPECIAL_BITMAP.length) + return ((SPECIAL_BITMAP[c >>> 6] >> c) & 1) != 0; + else + return false; + } + static char replacementOf(char c) { + if (!isSpecial(c)) return c; + int i = DANGEROUS_CHARS.indexOf(c); + if (i < 0) return c; + return REPLACEMENT_CHARS.charAt(i); + } + static char originalOfReplacement(char c) { + if (!isSpecial(c)) return c; + int i = REPLACEMENT_CHARS.indexOf(c); + if (i < 0) return c; + return DANGEROUS_CHARS.charAt(i); + } + static boolean isDangerous(char c) { + if (!isSpecial(c)) return false; + return (DANGEROUS_CHARS.indexOf(c) >= DANGEROUS_CHAR_FIRST_INDEX); + } + static int indexOfDangerousChar(String s, int from) { + for (int i = from, slen = s.length(); i < slen; i++) { + if (isDangerous(s.charAt(i))) + return i; + } + return -1; + } + static int lastIndexOfDangerousChar(String s, int from) { + for (int i = Math.min(from, s.length()-1); i >= 0; i--) { + if (isDangerous(s.charAt(i))) + return i; + } + return -1; + } + + +}