jtulach@1348: /*
jtulach@1348: * Copyright (c) 1999, 2011, Oracle and/or its affiliates. All rights reserved.
jtulach@1348: * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
jtulach@1348: *
jtulach@1348: * This code is free software; you can redistribute it and/or modify it
jtulach@1348: * under the terms of the GNU General Public License version 2 only, as
jtulach@1348: * published by the Free Software Foundation. Oracle designates this
jtulach@1348: * particular file as subject to the "Classpath" exception as provided
jtulach@1348: * by Oracle in the LICENSE file that accompanied this code.
jtulach@1348: *
jtulach@1348: * This code is distributed in the hope that it will be useful, but WITHOUT
jtulach@1348: * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
jtulach@1348: * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
jtulach@1348: * version 2 for more details (a copy is included in the LICENSE file that
jtulach@1348: * accompanied this code).
jtulach@1348: *
jtulach@1348: * You should have received a copy of the GNU General Public License version
jtulach@1348: * 2 along with this work; if not, write to the Free Software Foundation,
jtulach@1348: * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
jtulach@1348: *
jtulach@1348: * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
jtulach@1348: * or visit www.oracle.com if you need additional information or have any
jtulach@1348: * questions.
jtulach@1348: */
jtulach@1348:
jtulach@1348: package java.util.regex;
jtulach@1348:
jtulach@1348: import java.util.Locale;
jtulach@1348: import java.util.Map;
jtulach@1348: import java.util.ArrayList;
jtulach@1348: import java.util.HashMap;
jtulach@1348: import java.util.Arrays;
jtulach@1348:
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * A compiled representation of a regular expression.
jtulach@1348: *
jtulach@1348: *
A regular expression, specified as a string, must first be compiled into
jtulach@1348: * an instance of this class. The resulting pattern can then be used to create
jtulach@1348: * a {@link Matcher} object that can match arbitrary {@link
jtulach@1348: * java.lang.CharSequence character sequences} against the regular
jtulach@1348: * expression. All of the state involved in performing a match resides in the
jtulach@1348: * matcher, so many matchers can share the same pattern.
jtulach@1348: *
jtulach@1348: * A typical invocation sequence is thus
jtulach@1348: *
jtulach@1348: *
jtulach@1348: * Pattern p = Pattern.{@link #compile compile}("a*b");
jtulach@1348: * Matcher m = p.{@link #matcher matcher}("aaaaab");
jtulach@1348: * boolean b = m.{@link Matcher#matches matches}();
jtulach@1348: *
jtulach@1348: * A {@link #matches matches} method is defined by this class as a
jtulach@1348: * convenience for when a regular expression is used just once. This method
jtulach@1348: * compiles an expression and matches an input sequence against it in a single
jtulach@1348: * invocation. The statement
jtulach@1348: *
jtulach@1348: *
jtulach@1348: * boolean b = Pattern.matches("a*b", "aaaaab");
jtulach@1348: *
jtulach@1348: * is equivalent to the three statements above, though for repeated matches it
jtulach@1348: * is less efficient since it does not allow the compiled pattern to be reused.
jtulach@1348: *
jtulach@1348: * Instances of this class are immutable and are safe for use by multiple
jtulach@1348: * concurrent threads. Instances of the {@link Matcher} class are not safe for
jtulach@1348: * such use.
jtulach@1348: *
jtulach@1348: *
jtulach@1348: *
jtulach@1348: * Summary of regular-expression constructs
jtulach@1348: *
jtulach@1348: *
jtulach@1348: *
jtulach@1348: *
jtulach@1348: * Construct |
jtulach@1348: * Matches |
jtulach@1348: *
jtulach@1348: *
jtulach@1348: * |
jtulach@1348: * Characters |
jtulach@1348: *
jtulach@1348: * x |
jtulach@1348: * The character x |
jtulach@1348: * \\ |
jtulach@1348: * The backslash character |
jtulach@1348: * \0n |
jtulach@1348: * The character with octal value 0n
jtulach@1348: * (0 <= n <= 7) |
jtulach@1348: * \0nn |
jtulach@1348: * The character with octal value 0nn
jtulach@1348: * (0 <= n <= 7) |
jtulach@1348: * \0mnn |
jtulach@1348: * The character with octal value 0mnn
jtulach@1348: * (0 <= m <= 3,
jtulach@1348: * 0 <= n <= 7) |
jtulach@1348: * \xhh |
jtulach@1348: * The character with hexadecimal value 0xhh |
jtulach@1348: * \uhhhh |
jtulach@1348: * The character with hexadecimal value 0xhhhh |
jtulach@1348: * \x{h...h} |
jtulach@1348: * The character with hexadecimal value 0xh...h
jtulach@1348: * ({@link java.lang.Character#MIN_CODE_POINT Character.MIN_CODE_POINT}
jtulach@1348: * <= 0xh...h <= 
jtulach@1348: * {@link java.lang.Character#MAX_CODE_POINT Character.MAX_CODE_POINT}) |
jtulach@1348: * \t |
jtulach@1348: * The tab character ('\u0009') |
jtulach@1348: * \n |
jtulach@1348: * The newline (line feed) character ('\u000A') |
jtulach@1348: * \r |
jtulach@1348: * The carriage-return character ('\u000D') |
jtulach@1348: * \f |
jtulach@1348: * The form-feed character ('\u000C') |
jtulach@1348: * \a |
jtulach@1348: * The alert (bell) character ('\u0007') |
jtulach@1348: * \e |
jtulach@1348: * The escape character ('\u001B') |
jtulach@1348: * \cx |
jtulach@1348: * The control character corresponding to x |
jtulach@1348: *
jtulach@1348: * |
jtulach@1348: * Character classes |
jtulach@1348: *
jtulach@1348: * [abc] |
jtulach@1348: * a, b, or c (simple class) |
jtulach@1348: * [^abc] |
jtulach@1348: * Any character except a, b, or c (negation) |
jtulach@1348: * [a-zA-Z] |
jtulach@1348: * a through z
jtulach@1348: * or A through Z, inclusive (range) |
jtulach@1348: * [a-d[m-p]] |
jtulach@1348: * a through d,
jtulach@1348: * or m through p: [a-dm-p] (union) |
jtulach@1348: * [a-z&&[def]] |
jtulach@1348: * d, e, or f (intersection) |
jtulach@1348: * [a-z&&[^bc]] |
jtulach@1348: * a through z,
jtulach@1348: * except for b and c: [ad-z] (subtraction) |
jtulach@1348: * [a-z&&[^m-p]] |
jtulach@1348: * a through z,
jtulach@1348: * and not m through p: [a-lq-z](subtraction) |
jtulach@1348: * |
jtulach@1348: *
jtulach@1348: * Predefined character classes |
jtulach@1348: *
jtulach@1348: * . |
jtulach@1348: * Any character (may or may not match line terminators) |
jtulach@1348: * \d |
jtulach@1348: * A digit: [0-9] |
jtulach@1348: * \D |
jtulach@1348: * A non-digit: [^0-9] |
jtulach@1348: * \s |
jtulach@1348: * A whitespace character: [ \t\n\x0B\f\r] |
jtulach@1348: * \S |
jtulach@1348: * A non-whitespace character: [^\s] |
jtulach@1348: * \w |
jtulach@1348: * A word character: [a-zA-Z_0-9] |
jtulach@1348: * \W |
jtulach@1348: * A non-word character: [^\w] |
jtulach@1348: *
jtulach@1348: * |
jtulach@1348: * POSIX character classes (US-ASCII only) |
jtulach@1348: *
jtulach@1348: * \p{Lower} |
jtulach@1348: * A lower-case alphabetic character: [a-z] |
jtulach@1348: * \p{Upper} |
jtulach@1348: * An upper-case alphabetic character:[A-Z] |
jtulach@1348: * \p{ASCII} |
jtulach@1348: * All ASCII:[\x00-\x7F] |
jtulach@1348: * \p{Alpha} |
jtulach@1348: * An alphabetic character:[\p{Lower}\p{Upper}] |
jtulach@1348: * \p{Digit} |
jtulach@1348: * A decimal digit: [0-9] |
jtulach@1348: * \p{Alnum} |
jtulach@1348: * An alphanumeric character:[\p{Alpha}\p{Digit}] |
jtulach@1348: * \p{Punct} |
jtulach@1348: * Punctuation: One of !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ |
jtulach@1348: *
jtulach@1348: * \p{Graph} |
jtulach@1348: * A visible character: [\p{Alnum}\p{Punct}] |
jtulach@1348: * \p{Print} |
jtulach@1348: * A printable character: [\p{Graph}\x20] |
jtulach@1348: * \p{Blank} |
jtulach@1348: * A space or a tab: [ \t] |
jtulach@1348: * \p{Cntrl} |
jtulach@1348: * A control character: [\x00-\x1F\x7F] |
jtulach@1348: * \p{XDigit} |
jtulach@1348: * A hexadecimal digit: [0-9a-fA-F] |
jtulach@1348: * \p{Space} |
jtulach@1348: * A whitespace character: [ \t\n\x0B\f\r] |
jtulach@1348: *
jtulach@1348: * |
jtulach@1348: * java.lang.Character classes (simple java character type) |
jtulach@1348: *
jtulach@1348: * \p{javaLowerCase} |
jtulach@1348: * Equivalent to java.lang.Character.isLowerCase() |
jtulach@1348: * \p{javaUpperCase} |
jtulach@1348: * Equivalent to java.lang.Character.isUpperCase() |
jtulach@1348: * \p{javaWhitespace} |
jtulach@1348: * Equivalent to java.lang.Character.isWhitespace() |
jtulach@1348: * \p{javaMirrored} |
jtulach@1348: * Equivalent to java.lang.Character.isMirrored() |
jtulach@1348: *
jtulach@1348: * |
jtulach@1348: * Classes for Unicode scripts, blocks, categories and binary properties |
jtulach@1348: * * \p{IsLatin} |
jtulach@1348: * A Latin script character (script) |
jtulach@1348: * \p{InGreek} |
jtulach@1348: * A character in the Greek block (block) |
jtulach@1348: * \p{Lu} |
jtulach@1348: * An uppercase letter (category) |
jtulach@1348: * \p{IsAlphabetic} |
jtulach@1348: * An alphabetic character (binary property) |
jtulach@1348: * \p{Sc} |
jtulach@1348: * A currency symbol |
jtulach@1348: * \P{InGreek} |
jtulach@1348: * Any character except one in the Greek block (negation) |
jtulach@1348: * [\p{L}&&[^\p{Lu}]] |
jtulach@1348: * Any letter except an uppercase letter (subtraction) |
jtulach@1348: *
jtulach@1348: * |
jtulach@1348: * Boundary matchers |
jtulach@1348: *
jtulach@1348: * ^ |
jtulach@1348: * The beginning of a line |
jtulach@1348: * $ |
jtulach@1348: * The end of a line |
jtulach@1348: * \b |
jtulach@1348: * A word boundary |
jtulach@1348: * \B |
jtulach@1348: * A non-word boundary |
jtulach@1348: * \A |
jtulach@1348: * The beginning of the input |
jtulach@1348: * \G |
jtulach@1348: * The end of the previous match |
jtulach@1348: * \Z |
jtulach@1348: * The end of the input but for the final
jtulach@1348: * terminator, if any |
jtulach@1348: * \z |
jtulach@1348: * The end of the input |
jtulach@1348: *
jtulach@1348: * |
jtulach@1348: * Greedy quantifiers |
jtulach@1348: *
jtulach@1348: * X? |
jtulach@1348: * X, once or not at all |
jtulach@1348: * X* |
jtulach@1348: * X, zero or more times |
jtulach@1348: * X+ |
jtulach@1348: * X, one or more times |
jtulach@1348: * X{n} |
jtulach@1348: * X, exactly n times |
jtulach@1348: * X{n,} |
jtulach@1348: * X, at least n times |
jtulach@1348: * X{n,m} |
jtulach@1348: * X, at least n but not more than m times |
jtulach@1348: *
jtulach@1348: * |
jtulach@1348: * Reluctant quantifiers |
jtulach@1348: *
jtulach@1348: * X?? |
jtulach@1348: * X, once or not at all |
jtulach@1348: * X*? |
jtulach@1348: * X, zero or more times |
jtulach@1348: * X+? |
jtulach@1348: * X, one or more times |
jtulach@1348: * X{n}? |
jtulach@1348: * X, exactly n times |
jtulach@1348: * X{n,}? |
jtulach@1348: * X, at least n times |
jtulach@1348: * X{n,m}? |
jtulach@1348: * X, at least n but not more than m times |
jtulach@1348: *
jtulach@1348: * |
jtulach@1348: * Possessive quantifiers |
jtulach@1348: *
jtulach@1348: * X?+ |
jtulach@1348: * X, once or not at all |
jtulach@1348: * X*+ |
jtulach@1348: * X, zero or more times |
jtulach@1348: * X++ |
jtulach@1348: * X, one or more times |
jtulach@1348: * X{n}+ |
jtulach@1348: * X, exactly n times |
jtulach@1348: * X{n,}+ |
jtulach@1348: * X, at least n times |
jtulach@1348: * X{n,m}+ |
jtulach@1348: * X, at least n but not more than m times |
jtulach@1348: *
jtulach@1348: * |
jtulach@1348: * Logical operators |
jtulach@1348: *
jtulach@1348: * XY |
jtulach@1348: * X followed by Y |
jtulach@1348: * X|Y |
jtulach@1348: * Either X or Y |
jtulach@1348: * (X) |
jtulach@1348: * X, as a capturing group |
jtulach@1348: *
jtulach@1348: * |
jtulach@1348: * Back references |
jtulach@1348: *
jtulach@1348: * \n |
jtulach@1348: * Whatever the nth
jtulach@1348: * capturing group matched |
jtulach@1348: *
jtulach@1348: * \k<name> |
jtulach@1348: * Whatever the
jtulach@1348: * named-capturing group "name" matched |
jtulach@1348: *
jtulach@1348: * |
jtulach@1348: * Quotation |
jtulach@1348: *
jtulach@1348: * \ |
jtulach@1348: * Nothing, but quotes the following character |
jtulach@1348: * \Q |
jtulach@1348: * Nothing, but quotes all characters until \E |
jtulach@1348: * \E |
jtulach@1348: * Nothing, but ends quoting started by \Q |
jtulach@1348: *
jtulach@1348: *
jtulach@1348: * |
jtulach@1348: * Special constructs (named-capturing and non-capturing) |
jtulach@1348: *
jtulach@1348: * (?<name>X) |
jtulach@1348: * X, as a named-capturing group |
jtulach@1348: * (?:X) |
jtulach@1348: * X, as a non-capturing group |
jtulach@1348: * (?idmsuxU-idmsuxU) |
jtulach@1348: * Nothing, but turns match flags i
jtulach@1348: * d m s
jtulach@1348: * u x U
jtulach@1348: * on - off |
jtulach@1348: * (?idmsux-idmsux:X) |
jtulach@1348: * X, as a non-capturing group with the
jtulach@1348: * given flags i d
jtulach@1348: * m s u
jtulach@1348: * x on - off |
jtulach@1348: * (?=X) |
jtulach@1348: * X, via zero-width positive lookahead |
jtulach@1348: * (?!X) |
jtulach@1348: * X, via zero-width negative lookahead |
jtulach@1348: * (?<=X) |
jtulach@1348: * X, via zero-width positive lookbehind |
jtulach@1348: * (?<!X) |
jtulach@1348: * X, via zero-width negative lookbehind |
jtulach@1348: * (?>X) |
jtulach@1348: * X, as an independent, non-capturing group |
jtulach@1348: *
jtulach@1348: *
jtulach@1348: *
jtulach@1348: *
jtulach@1348: *
jtulach@1348: *
jtulach@1348: *
jtulach@1348: * Backslashes, escapes, and quoting
jtulach@1348: *
jtulach@1348: * The backslash character ('\') serves to introduce escaped
jtulach@1348: * constructs, as defined in the table above, as well as to quote characters
jtulach@1348: * that otherwise would be interpreted as unescaped constructs. Thus the
jtulach@1348: * expression \\ matches a single backslash and \{ matches a
jtulach@1348: * left brace.
jtulach@1348: *
jtulach@1348: *
It is an error to use a backslash prior to any alphabetic character that
jtulach@1348: * does not denote an escaped construct; these are reserved for future
jtulach@1348: * extensions to the regular-expression language. A backslash may be used
jtulach@1348: * prior to a non-alphabetic character regardless of whether that character is
jtulach@1348: * part of an unescaped construct.
jtulach@1348: *
jtulach@1348: *
Backslashes within string literals in Java source code are interpreted
jtulach@1348: * as required by
jtulach@1348: * The Java™ Language Specification
jtulach@1348: * as either Unicode escapes (section 3.3) or other character escapes (section 3.10.6)
jtulach@1348: * It is therefore necessary to double backslashes in string
jtulach@1348: * literals that represent regular expressions to protect them from
jtulach@1348: * interpretation by the Java bytecode compiler. The string literal
jtulach@1348: * "\b", for example, matches a single backspace character when
jtulach@1348: * interpreted as a regular expression, while "\\b" matches a
jtulach@1348: * word boundary. The string literal "\(hello\)" is illegal
jtulach@1348: * and leads to a compile-time error; in order to match the string
jtulach@1348: * (hello) the string literal "\\(hello\\)"
jtulach@1348: * must be used.
jtulach@1348: *
jtulach@1348: *
jtulach@1348: * Character Classes
jtulach@1348: *
jtulach@1348: * Character classes may appear within other character classes, and
jtulach@1348: * may be composed by the union operator (implicit) and the intersection
jtulach@1348: * operator (&&).
jtulach@1348: * The union operator denotes a class that contains every character that is
jtulach@1348: * in at least one of its operand classes. The intersection operator
jtulach@1348: * denotes a class that contains every character that is in both of its
jtulach@1348: * operand classes.
jtulach@1348: *
jtulach@1348: *
The precedence of character-class operators is as follows, from
jtulach@1348: * highest to lowest:
jtulach@1348: *
jtulach@1348: *
jtulach@1348: * 1 |
jtulach@1348: * Literal escape |
jtulach@1348: * \x |
jtulach@1348: * 2 |
jtulach@1348: * Grouping |
jtulach@1348: * [...] |
jtulach@1348: * 3 |
jtulach@1348: * Range |
jtulach@1348: * a-z |
jtulach@1348: * 4 |
jtulach@1348: * Union |
jtulach@1348: * [a-e][i-u] |
jtulach@1348: * 5 |
jtulach@1348: * Intersection |
jtulach@1348: * [a-z&&[aeiou]] |
jtulach@1348: *
jtulach@1348: *
jtulach@1348: * Note that a different set of metacharacters are in effect inside
jtulach@1348: * a character class than outside a character class. For instance, the
jtulach@1348: * regular expression . loses its special meaning inside a
jtulach@1348: * character class, while the expression - becomes a range
jtulach@1348: * forming metacharacter.
jtulach@1348: *
jtulach@1348: *
jtulach@1348: * Line terminators
jtulach@1348: *
jtulach@1348: * A line terminator is a one- or two-character sequence that marks
jtulach@1348: * the end of a line of the input character sequence. The following are
jtulach@1348: * recognized as line terminators:
jtulach@1348: *
jtulach@1348: *
jtulach@1348: *
jtulach@1348: * - A newline (line feed) character ('\n'),
jtulach@1348: *
jtulach@1348: *
- A carriage-return character followed immediately by a newline
jtulach@1348: * character ("\r\n"),
jtulach@1348: *
jtulach@1348: *
- A standalone carriage-return character ('\r'),
jtulach@1348: *
jtulach@1348: *
- A next-line character ('\u0085'),
jtulach@1348: *
jtulach@1348: *
- A line-separator character ('\u2028'), or
jtulach@1348: *
jtulach@1348: *
- A paragraph-separator character ('\u2029).
jtulach@1348: *
jtulach@1348: *
jtulach@1348: * If {@link #UNIX_LINES} mode is activated, then the only line terminators
jtulach@1348: * recognized are newline characters.
jtulach@1348: *
jtulach@1348: *
The regular expression . matches any character except a line
jtulach@1348: * terminator unless the {@link #DOTALL} flag is specified.
jtulach@1348: *
jtulach@1348: *
By default, the regular expressions ^ and $ ignore
jtulach@1348: * line terminators and only match at the beginning and the end, respectively,
jtulach@1348: * of the entire input sequence. If {@link #MULTILINE} mode is activated then
jtulach@1348: * ^ matches at the beginning of input and after any line terminator
jtulach@1348: * except at the end of input. When in {@link #MULTILINE} mode $
jtulach@1348: * matches just before a line terminator or the end of the input sequence.
jtulach@1348: *
jtulach@1348: *
jtulach@1348: * Groups and capturing
jtulach@1348: *
jtulach@1348: *
jtulach@1348: * Group number
jtulach@1348: * Capturing groups are numbered by counting their opening parentheses from
jtulach@1348: * left to right. In the expression ((A)(B(C))), for example, there
jtulach@1348: * are four such groups:
jtulach@1348: *
jtulach@1348: *
jtulach@1348: * 1 |
jtulach@1348: * ((A)(B(C))) |
jtulach@1348: * 2 |
jtulach@1348: * (A) |
jtulach@1348: * 3 |
jtulach@1348: * (B(C)) |
jtulach@1348: * 4 |
jtulach@1348: * (C) |
jtulach@1348: *
jtulach@1348: *
jtulach@1348: * Group zero always stands for the entire expression.
jtulach@1348: *
jtulach@1348: *
Capturing groups are so named because, during a match, each subsequence
jtulach@1348: * of the input sequence that matches such a group is saved. The captured
jtulach@1348: * subsequence may be used later in the expression, via a back reference, and
jtulach@1348: * may also be retrieved from the matcher once the match operation is complete.
jtulach@1348: *
jtulach@1348: *
jtulach@1348: * Group name
jtulach@1348: * A capturing group can also be assigned a "name", a named-capturing group,
jtulach@1348: * and then be back-referenced later by the "name". Group names are composed of
jtulach@1348: * the following characters. The first character must be a letter.
jtulach@1348: *
jtulach@1348: *
jtulach@1348: * - The uppercase letters 'A' through 'Z'
jtulach@1348: * ('\u0041' through '\u005a'),
jtulach@1348: *
- The lowercase letters 'a' through 'z'
jtulach@1348: * ('\u0061' through '\u007a'),
jtulach@1348: *
- The digits '0' through '9'
jtulach@1348: * ('\u0030' through '\u0039'),
jtulach@1348: *
jtulach@1348: *
jtulach@1348: * A named-capturing group is still numbered as described in
jtulach@1348: * Group number.
jtulach@1348: *
jtulach@1348: *
The captured input associated with a group is always the subsequence
jtulach@1348: * that the group most recently matched. If a group is evaluated a second time
jtulach@1348: * because of quantification then its previously-captured value, if any, will
jtulach@1348: * be retained if the second evaluation fails. Matching the string
jtulach@1348: * "aba" against the expression (a(b)?)+, for example, leaves
jtulach@1348: * group two set to "b". All captured input is discarded at the
jtulach@1348: * beginning of each match.
jtulach@1348: *
jtulach@1348: *
Groups beginning with (? are either pure, non-capturing groups
jtulach@1348: * that do not capture text and do not count towards the group total, or
jtulach@1348: * named-capturing group.
jtulach@1348: *
jtulach@1348: *
Unicode support
jtulach@1348: *
jtulach@1348: * This class is in conformance with Level 1 of Unicode Technical
jtulach@1348: * Standard #18: Unicode Regular Expression, plus RL2.1
jtulach@1348: * Canonical Equivalents.
jtulach@1348: *
jtulach@1348: * Unicode escape sequences such as \u2014 in Java source code
jtulach@1348: * are processed as described in section 3.3 of
jtulach@1348: * The Java™ Language Specification.
jtulach@1348: * Such escape sequences are also implemented directly by the regular-expression
jtulach@1348: * parser so that Unicode escapes can be used in expressions that are read from
jtulach@1348: * files or from the keyboard. Thus the strings "\u2014" and
jtulach@1348: * "\\u2014", while not equal, compile into the same pattern, which
jtulach@1348: * matches the character with hexadecimal value 0x2014.
jtulach@1348: *
jtulach@1348: * A Unicode character can also be represented in a regular-expression by
jtulach@1348: * using its Hex notation(hexadecimal code point value) directly as described in construct
jtulach@1348: * \x{...}, for example a supplementary character U+2011F
jtulach@1348: * can be specified as \x{2011F}, instead of two consecutive
jtulach@1348: * Unicode escape sequences of the surrogate pair
jtulach@1348: * \uD840\uDD1F.
jtulach@1348: *
jtulach@1348: * Unicode scripts, blocks, categories and binary properties are written with
jtulach@1348: * the \p and \P constructs as in Perl.
jtulach@1348: * \p{prop} matches if
jtulach@1348: * the input has the property prop, while \P{prop}
jtulach@1348: * does not match if the input has that property.
jtulach@1348: *
jtulach@1348: * Scripts, blocks, categories and binary properties can be used both inside
jtulach@1348: * and outside of a character class.
jtulach@1348: *
jtulach@1348: *
jtulach@1348: * Scripts are specified either with the prefix {@code Is}, as in
jtulach@1348: * {@code IsHiragana}, or by using the {@code script} keyword (or its short
jtulach@1348: * form {@code sc})as in {@code script=Hiragana} or {@code sc=Hiragana}.
jtulach@1348: *
jtulach@1348: * The script names supported by Pattern
are the valid script names
jtulach@1348: * accepted and defined by
jtulach@1348: * {@link java.lang.Character.UnicodeScript#forName(String) UnicodeScript.forName}.
jtulach@1348: *
jtulach@1348: *
jtulach@1348: * Blocks are specified with the prefix {@code In}, as in
jtulach@1348: * {@code InMongolian}, or by using the keyword {@code block} (or its short
jtulach@1348: * form {@code blk}) as in {@code block=Mongolian} or {@code blk=Mongolian}.
jtulach@1348: *
jtulach@1348: * The block names supported by Pattern
are the valid block names
jtulach@1348: * accepted and defined by
jtulach@1348: * {@link java.lang.Character.UnicodeBlock#forName(String) UnicodeBlock.forName}.
jtulach@1348: *
jtulach@1348: *
jtulach@1348: * Categories may be specified with the optional prefix {@code Is}:
jtulach@1348: * Both {@code \p{L}} and {@code \p{IsL}} denote the category of Unicode
jtulach@1348: * letters. Same as scripts and blocks, categories can also be specified
jtulach@1348: * by using the keyword {@code general_category} (or its short form
jtulach@1348: * {@code gc}) as in {@code general_category=Lu} or {@code gc=Lu}.
jtulach@1348: *
jtulach@1348: * The supported categories are those of
jtulach@1348: *
jtulach@1348: * The Unicode Standard in the version specified by the
jtulach@1348: * {@link java.lang.Character Character} class. The category names are those
jtulach@1348: * defined in the Standard, both normative and informative.
jtulach@1348: *
jtulach@1348: *
jtulach@1348: * Binary properties are specified with the prefix {@code Is}, as in
jtulach@1348: * {@code IsAlphabetic}. The supported binary properties by Pattern
jtulach@1348: * are
jtulach@1348: *
jtulach@1348: * - Alphabetic
jtulach@1348: *
- Ideographic
jtulach@1348: *
- Letter
jtulach@1348: *
- Lowercase
jtulach@1348: *
- Uppercase
jtulach@1348: *
- Titlecase
jtulach@1348: *
- Punctuation
jtulach@1348: *
- Control
jtulach@1348: *
- White_Space
jtulach@1348: *
- Digit
jtulach@1348: *
- Hex_Digit
jtulach@1348: *
- Noncharacter_Code_Point
jtulach@1348: *
- Assigned
jtulach@1348: *
jtulach@1348:
jtulach@1348:
jtulach@1348: *
jtulach@1348: * Predefined Character classes and POSIX character classes are in
jtulach@1348: * conformance with the recommendation of Annex C: Compatibility Properties
jtulach@1348: * of Unicode Regular Expression
jtulach@1348: * , when {@link #UNICODE_CHARACTER_CLASS} flag is specified.
jtulach@1348: *
jtulach@1348: *
jtulach@1348: *
jtulach@1348: * Classes |
jtulach@1348: * Matches |
jtulach@1348: *
jtulach@1348: * \p{Lower} |
jtulach@1348: * A lowercase character:\p{IsLowercase} |
jtulach@1348: * \p{Upper} |
jtulach@1348: * An uppercase character:\p{IsUppercase} |
jtulach@1348: * \p{ASCII} |
jtulach@1348: * All ASCII:[\x00-\x7F] |
jtulach@1348: * \p{Alpha} |
jtulach@1348: * An alphabetic character:\p{IsAlphabetic} |
jtulach@1348: * \p{Digit} |
jtulach@1348: * A decimal digit character:p{IsDigit} |
jtulach@1348: * \p{Alnum} |
jtulach@1348: * An alphanumeric character:[\p{IsAlphabetic}\p{IsDigit}] |
jtulach@1348: * \p{Punct} |
jtulach@1348: * A punctuation character:p{IsPunctuation} |
jtulach@1348: * \p{Graph} |
jtulach@1348: * A visible character: [^\p{IsWhite_Space}\p{gc=Cc}\p{gc=Cs}\p{gc=Cn}] |
jtulach@1348: * \p{Print} |
jtulach@1348: * A printable character: [\p{Graph}\p{Blank}&&[^\p{Cntrl}]] |
jtulach@1348: * \p{Blank} |
jtulach@1348: * A space or a tab: [\p{IsWhite_Space}&&[^\p{gc=Zl}\p{gc=Zp}\x0a\x0b\x0c\x0d\x85]] |
jtulach@1348: * \p{Cntrl} |
jtulach@1348: * A control character: \p{gc=Cc} |
jtulach@1348: * \p{XDigit} |
jtulach@1348: * A hexadecimal digit: [\p{gc=Nd}\p{IsHex_Digit}] |
jtulach@1348: * \p{Space} |
jtulach@1348: * A whitespace character:\p{IsWhite_Space} |
jtulach@1348: * \d |
jtulach@1348: * A digit: \p{IsDigit} |
jtulach@1348: * \D |
jtulach@1348: * A non-digit: [^\d] |
jtulach@1348: * \s |
jtulach@1348: * A whitespace character: \p{IsWhite_Space} |
jtulach@1348: * \S |
jtulach@1348: * A non-whitespace character: [^\s] |
jtulach@1348: * \w |
jtulach@1348: * A word character: [\p{Alpha}\p{gc=Mn}\p{gc=Me}\p{gc=Mc}\p{Digit}\p{gc=Pc}] |
jtulach@1348: * \W |
jtulach@1348: * A non-word character: [^\w] |
jtulach@1348: *
jtulach@1348: *
jtulach@1348: *
jtulach@1348: * Categories that behave like the java.lang.Character
jtulach@1348: * boolean ismethodname methods (except for the deprecated ones) are
jtulach@1348: * available through the same \p{prop} syntax where
jtulach@1348: * the specified property has the name javamethodname.
jtulach@1348: *
jtulach@1348: * Comparison to Perl 5
jtulach@1348: *
jtulach@1348: * The Pattern
engine performs traditional NFA-based matching
jtulach@1348: * with ordered alternation as occurs in Perl 5.
jtulach@1348: *
jtulach@1348: *
Perl constructs not supported by this class:
jtulach@1348: *
jtulach@1348: *
jtulach@1348: * Predefined character classes (Unicode character)
jtulach@1348: *
\h A horizontal whitespace
jtulach@1348: *
\H A non horizontal whitespace
jtulach@1348: *
\v A vertical whitespace
jtulach@1348: *
\V A non vertical whitespace
jtulach@1348: *
\R Any Unicode linebreak sequence
jtulach@1348: * \u005cu000D\u005cu000A|[\u005cu000A\u005cu000B\u005cu000C\u005cu000D\u005cu0085\u005cu2028\u005cu2029]
jtulach@1348: *
\X Match Unicode
jtulach@1348: *
jtulach@1348: * extended grapheme cluster
jtulach@1348: *
jtulach@1348: *
jtulach@1348: * The backreference constructs, \g{n} for
jtulach@1348: * the nthcapturing group and
jtulach@1348: * \g{name} for
jtulach@1348: * named-capturing group.
jtulach@1348: *
jtulach@1348: *
jtulach@1348: * The named character construct, \N{name}
jtulach@1348: * for a Unicode character by its name.
jtulach@1348: *
jtulach@1348: *
jtulach@1348: * The conditional constructs
jtulach@1348: * (?(condition)X) and
jtulach@1348: * (?(condition)X|Y),
jtulach@1348: *
jtulach@1348: *
jtulach@1348: * The embedded code constructs (?{code})
jtulach@1348: * and (??{code}),
jtulach@1348: *
jtulach@1348: * The embedded comment syntax (?#comment), and
jtulach@1348: *
jtulach@1348: * The preprocessing operations \l \u,
jtulach@1348: * \L, and \U.
jtulach@1348: *
jtulach@1348: *
jtulach@1348: *
jtulach@1348: * Constructs supported by this class but not by Perl:
jtulach@1348: *
jtulach@1348: *
jtulach@1348: *
jtulach@1348: * Notable differences from Perl:
jtulach@1348: *
jtulach@1348: *
jtulach@1348: *
jtulach@1348: * In Perl, \1 through \9 are always interpreted
jtulach@1348: * as back references; a backslash-escaped number greater than 9 is
jtulach@1348: * treated as a back reference if at least that many subexpressions exist,
jtulach@1348: * otherwise it is interpreted, if possible, as an octal escape. In this
jtulach@1348: * class octal escapes must always begin with a zero. In this class,
jtulach@1348: * \1 through \9 are always interpreted as back
jtulach@1348: * references, and a larger number is accepted as a back reference if at
jtulach@1348: * least that many subexpressions exist at that point in the regular
jtulach@1348: * expression, otherwise the parser will drop digits until the number is
jtulach@1348: * smaller or equal to the existing number of groups or it is one digit.
jtulach@1348: *
jtulach@1348: *
jtulach@1348: * Perl uses the g flag to request a match that resumes
jtulach@1348: * where the last match left off. This functionality is provided implicitly
jtulach@1348: * by the {@link Matcher} class: Repeated invocations of the {@link
jtulach@1348: * Matcher#find find} method will resume where the last match left off,
jtulach@1348: * unless the matcher is reset.
jtulach@1348: *
jtulach@1348: * In Perl, embedded flags at the top level of an expression affect
jtulach@1348: * the whole expression. In this class, embedded flags always take effect
jtulach@1348: * at the point at which they appear, whether they are at the top level or
jtulach@1348: * within a group; in the latter case, flags are restored at the end of the
jtulach@1348: * group just as in Perl.
jtulach@1348: *
jtulach@1348: *
jtulach@1348: *
jtulach@1348: *
jtulach@1348: * For a more precise description of the behavior of regular expression
jtulach@1348: * constructs, please see
jtulach@1348: * Mastering Regular Expressions, 3nd Edition, Jeffrey E. F. Friedl,
jtulach@1348: * O'Reilly and Associates, 2006.
jtulach@1348: *
jtulach@1348: *
jtulach@1348: * @see java.lang.String#split(String, int)
jtulach@1348: * @see java.lang.String#split(String)
jtulach@1348: *
jtulach@1348: * @author Mike McCloskey
jtulach@1348: * @author Mark Reinhold
jtulach@1348: * @author JSR-51 Expert Group
jtulach@1348: * @since 1.4
jtulach@1348: * @spec JSR-51
jtulach@1348: */
jtulach@1348:
jtulach@1348: public final class Pattern
jtulach@1348: implements java.io.Serializable
jtulach@1348: {
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Regular expression modifier values. Instead of being passed as
jtulach@1348: * arguments, they can also be passed as inline modifiers.
jtulach@1348: * For example, the following statements have the same effect.
jtulach@1348: *
jtulach@1348: * RegExp r1 = RegExp.compile("abc", Pattern.I|Pattern.M);
jtulach@1348: * RegExp r2 = RegExp.compile("(?im)abc", 0);
jtulach@1348: *
jtulach@1348: *
jtulach@1348: * The flags are duplicated so that the familiar Perl match flag
jtulach@1348: * names are available.
jtulach@1348: */
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Enables Unix lines mode.
jtulach@1348: *
jtulach@1348: * In this mode, only the '\n' line terminator is recognized
jtulach@1348: * in the behavior of ., ^, and $.
jtulach@1348: *
jtulach@1348: *
Unix lines mode can also be enabled via the embedded flag
jtulach@1348: * expression (?d).
jtulach@1348: */
jtulach@1348: public static final int UNIX_LINES = 0x01;
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Enables case-insensitive matching.
jtulach@1348: *
jtulach@1348: *
By default, case-insensitive matching assumes that only characters
jtulach@1348: * in the US-ASCII charset are being matched. Unicode-aware
jtulach@1348: * case-insensitive matching can be enabled by specifying the {@link
jtulach@1348: * #UNICODE_CASE} flag in conjunction with this flag.
jtulach@1348: *
jtulach@1348: *
Case-insensitive matching can also be enabled via the embedded flag
jtulach@1348: * expression (?i).
jtulach@1348: *
jtulach@1348: *
Specifying this flag may impose a slight performance penalty.
jtulach@1348: */
jtulach@1348: public static final int CASE_INSENSITIVE = 0x02;
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Permits whitespace and comments in pattern.
jtulach@1348: *
jtulach@1348: * In this mode, whitespace is ignored, and embedded comments starting
jtulach@1348: * with # are ignored until the end of a line.
jtulach@1348: *
jtulach@1348: *
Comments mode can also be enabled via the embedded flag
jtulach@1348: * expression (?x).
jtulach@1348: */
jtulach@1348: public static final int COMMENTS = 0x04;
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Enables multiline mode.
jtulach@1348: *
jtulach@1348: *
In multiline mode the expressions ^ and $ match
jtulach@1348: * just after or just before, respectively, a line terminator or the end of
jtulach@1348: * the input sequence. By default these expressions only match at the
jtulach@1348: * beginning and the end of the entire input sequence.
jtulach@1348: *
jtulach@1348: *
Multiline mode can also be enabled via the embedded flag
jtulach@1348: * expression (?m).
jtulach@1348: */
jtulach@1348: public static final int MULTILINE = 0x08;
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Enables literal parsing of the pattern.
jtulach@1348: *
jtulach@1348: * When this flag is specified then the input string that specifies
jtulach@1348: * the pattern is treated as a sequence of literal characters.
jtulach@1348: * Metacharacters or escape sequences in the input sequence will be
jtulach@1348: * given no special meaning.
jtulach@1348: *
jtulach@1348: *
The flags CASE_INSENSITIVE and UNICODE_CASE retain their impact on
jtulach@1348: * matching when used in conjunction with this flag. The other flags
jtulach@1348: * become superfluous.
jtulach@1348: *
jtulach@1348: *
There is no embedded flag character for enabling literal parsing.
jtulach@1348: * @since 1.5
jtulach@1348: */
jtulach@1348: public static final int LITERAL = 0x10;
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Enables dotall mode.
jtulach@1348: *
jtulach@1348: *
In dotall mode, the expression . matches any character,
jtulach@1348: * including a line terminator. By default this expression does not match
jtulach@1348: * line terminators.
jtulach@1348: *
jtulach@1348: *
Dotall mode can also be enabled via the embedded flag
jtulach@1348: * expression (?s). (The s is a mnemonic for
jtulach@1348: * "single-line" mode, which is what this is called in Perl.)
jtulach@1348: */
jtulach@1348: public static final int DOTALL = 0x20;
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Enables Unicode-aware case folding.
jtulach@1348: *
jtulach@1348: * When this flag is specified then case-insensitive matching, when
jtulach@1348: * enabled by the {@link #CASE_INSENSITIVE} flag, is done in a manner
jtulach@1348: * consistent with the Unicode Standard. By default, case-insensitive
jtulach@1348: * matching assumes that only characters in the US-ASCII charset are being
jtulach@1348: * matched.
jtulach@1348: *
jtulach@1348: *
Unicode-aware case folding can also be enabled via the embedded flag
jtulach@1348: * expression (?u).
jtulach@1348: *
jtulach@1348: *
Specifying this flag may impose a performance penalty.
jtulach@1348: */
jtulach@1348: public static final int UNICODE_CASE = 0x40;
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Enables canonical equivalence.
jtulach@1348: *
jtulach@1348: * When this flag is specified then two characters will be considered
jtulach@1348: * to match if, and only if, their full canonical decompositions match.
jtulach@1348: * The expression "a\u030A", for example, will match the
jtulach@1348: * string "\u00E5" when this flag is specified. By default,
jtulach@1348: * matching does not take canonical equivalence into account.
jtulach@1348: *
jtulach@1348: *
There is no embedded flag character for enabling canonical
jtulach@1348: * equivalence.
jtulach@1348: *
jtulach@1348: *
Specifying this flag may impose a performance penalty.
jtulach@1348: */
jtulach@1348: public static final int CANON_EQ = 0x80;
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Enables the Unicode version of Predefined character classes and
jtulach@1348: * POSIX character classes.
jtulach@1348: *
jtulach@1348: * When this flag is specified then the (US-ASCII only)
jtulach@1348: * Predefined character classes and POSIX character classes
jtulach@1348: * are in conformance with
jtulach@1348: * Unicode Technical
jtulach@1348: * Standard #18: Unicode Regular Expression
jtulach@1348: * Annex C: Compatibility Properties.
jtulach@1348: *
jtulach@1348: * The UNICODE_CHARACTER_CLASS mode can also be enabled via the embedded
jtulach@1348: * flag expression (?U).
jtulach@1348: *
jtulach@1348: * The flag implies UNICODE_CASE, that is, it enables Unicode-aware case
jtulach@1348: * folding.
jtulach@1348: *
jtulach@1348: * Specifying this flag may impose a performance penalty.
jtulach@1348: * @since 1.7
jtulach@1348: */
jtulach@1348: public static final int UNICODE_CHARACTER_CLASS = 0x100;
jtulach@1348:
jtulach@1348: /* Pattern has only two serialized components: The pattern string
jtulach@1348: * and the flags, which are all that is needed to recompile the pattern
jtulach@1348: * when it is deserialized.
jtulach@1348: */
jtulach@1348:
jtulach@1348: /** use serialVersionUID from Merlin b59 for interoperability */
jtulach@1348: private static final long serialVersionUID = 5073258162644648461L;
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * The original regular-expression pattern string.
jtulach@1348: *
jtulach@1348: * @serial
jtulach@1348: */
jtulach@1348: private String pattern;
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * The original pattern flags.
jtulach@1348: *
jtulach@1348: * @serial
jtulach@1348: */
jtulach@1348: private int flags;
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Boolean indicating this Pattern is compiled; this is necessary in order
jtulach@1348: * to lazily compile deserialized Patterns.
jtulach@1348: */
jtulach@1348: private transient volatile boolean compiled = false;
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * The normalized pattern string.
jtulach@1348: */
jtulach@1348: private transient String normalizedPattern;
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * The starting point of state machine for the find operation. This allows
jtulach@1348: * a match to start anywhere in the input.
jtulach@1348: */
jtulach@1348: transient Node root;
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * The root of object tree for a match operation. The pattern is matched
jtulach@1348: * at the beginning. This may include a find that uses BnM or a First
jtulach@1348: * node.
jtulach@1348: */
jtulach@1348: transient Node matchRoot;
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Temporary storage used by parsing pattern slice.
jtulach@1348: */
jtulach@1348: transient int[] buffer;
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Map the "name" of the "named capturing group" to its group id
jtulach@1348: * node.
jtulach@1348: */
jtulach@1348: transient volatile Map namedGroups;
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Temporary storage used while parsing group references.
jtulach@1348: */
jtulach@1348: transient GroupHead[] groupNodes;
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Temporary null terminated code point array used by pattern compiling.
jtulach@1348: */
jtulach@1348: private transient int[] temp;
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * The number of capturing groups in this Pattern. Used by matchers to
jtulach@1348: * allocate storage needed to perform a match.
jtulach@1348: */
jtulach@1348: transient int capturingGroupCount;
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * The local variable count used by parsing tree. Used by matchers to
jtulach@1348: * allocate storage needed to perform a match.
jtulach@1348: */
jtulach@1348: transient int localCount;
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Index into the pattern string that keeps track of how much has been
jtulach@1348: * parsed.
jtulach@1348: */
jtulach@1348: private transient int cursor;
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Holds the length of the pattern string.
jtulach@1348: */
jtulach@1348: private transient int patternLength;
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * If the Start node might possibly match supplementary characters.
jtulach@1348: * It is set to true during compiling if
jtulach@1348: * (1) There is supplementary char in pattern, or
jtulach@1348: * (2) There is complement node of Category or Block
jtulach@1348: */
jtulach@1348: private transient boolean hasSupplementary;
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Compiles the given regular expression into a pattern.
jtulach@1348: *
jtulach@1348: * @param regex
jtulach@1348: * The expression to be compiled
jtulach@1348: *
jtulach@1348: * @throws PatternSyntaxException
jtulach@1348: * If the expression's syntax is invalid
jtulach@1348: */
jtulach@1348: public static Pattern compile(String regex) {
jtulach@1348: return new Pattern(regex, 0);
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Compiles the given regular expression into a pattern with the given
jtulach@1348: * flags.
jtulach@1348: *
jtulach@1348: * @param regex
jtulach@1348: * The expression to be compiled
jtulach@1348: *
jtulach@1348: * @param flags
jtulach@1348: * Match flags, a bit mask that may include
jtulach@1348: * {@link #CASE_INSENSITIVE}, {@link #MULTILINE}, {@link #DOTALL},
jtulach@1348: * {@link #UNICODE_CASE}, {@link #CANON_EQ}, {@link #UNIX_LINES},
jtulach@1348: * {@link #LITERAL}, {@link #UNICODE_CHARACTER_CLASS}
jtulach@1348: * and {@link #COMMENTS}
jtulach@1348: *
jtulach@1348: * @throws IllegalArgumentException
jtulach@1348: * If bit values other than those corresponding to the defined
jtulach@1348: * match flags are set in flags
jtulach@1348: *
jtulach@1348: * @throws PatternSyntaxException
jtulach@1348: * If the expression's syntax is invalid
jtulach@1348: */
jtulach@1348: public static Pattern compile(String regex, int flags) {
jtulach@1348: return new Pattern(regex, flags);
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Returns the regular expression from which this pattern was compiled.
jtulach@1348: *
jtulach@1348: *
jtulach@1348: * @return The source of this pattern
jtulach@1348: */
jtulach@1348: public String pattern() {
jtulach@1348: return pattern;
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Returns the string representation of this pattern. This
jtulach@1348: * is the regular expression from which this pattern was
jtulach@1348: * compiled.
jtulach@1348: *
jtulach@1348: * @return The string representation of this pattern
jtulach@1348: * @since 1.5
jtulach@1348: */
jtulach@1348: public String toString() {
jtulach@1348: return pattern;
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Creates a matcher that will match the given input against this pattern.
jtulach@1348: *
jtulach@1348: *
jtulach@1348: * @param input
jtulach@1348: * The character sequence to be matched
jtulach@1348: *
jtulach@1348: * @return A new matcher for this pattern
jtulach@1348: */
jtulach@1348: public Matcher matcher(CharSequence input) {
jtulach@1348: if (!compiled) {
jtulach@1348: synchronized(this) {
jtulach@1348: if (!compiled)
jtulach@1348: compile();
jtulach@1348: }
jtulach@1348: }
jtulach@1348: Matcher m = new Matcher(this, input);
jtulach@1348: return m;
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Returns this pattern's match flags.
jtulach@1348: *
jtulach@1348: * @return The match flags specified when this pattern was compiled
jtulach@1348: */
jtulach@1348: public int flags() {
jtulach@1348: return flags;
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Compiles the given regular expression and attempts to match the given
jtulach@1348: * input against it.
jtulach@1348: *
jtulach@1348: * An invocation of this convenience method of the form
jtulach@1348: *
jtulach@1348: *
jtulach@1348: * Pattern.matches(regex, input);
jtulach@1348: *
jtulach@1348: * behaves in exactly the same way as the expression
jtulach@1348: *
jtulach@1348: *
jtulach@1348: * Pattern.compile(regex).matcher(input).matches()
jtulach@1348: *
jtulach@1348: * If a pattern is to be used multiple times, compiling it once and reusing
jtulach@1348: * it will be more efficient than invoking this method each time.
jtulach@1348: *
jtulach@1348: * @param regex
jtulach@1348: * The expression to be compiled
jtulach@1348: *
jtulach@1348: * @param input
jtulach@1348: * The character sequence to be matched
jtulach@1348: *
jtulach@1348: * @throws PatternSyntaxException
jtulach@1348: * If the expression's syntax is invalid
jtulach@1348: */
jtulach@1348: public static boolean matches(String regex, CharSequence input) {
jtulach@1348: Pattern p = Pattern.compile(regex);
jtulach@1348: Matcher m = p.matcher(input);
jtulach@1348: return m.matches();
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Splits the given input sequence around matches of this pattern.
jtulach@1348: *
jtulach@1348: * The array returned by this method contains each substring of the
jtulach@1348: * input sequence that is terminated by another subsequence that matches
jtulach@1348: * this pattern or is terminated by the end of the input sequence. The
jtulach@1348: * substrings in the array are in the order in which they occur in the
jtulach@1348: * input. If this pattern does not match any subsequence of the input then
jtulach@1348: * the resulting array has just one element, namely the input sequence in
jtulach@1348: * string form.
jtulach@1348: *
jtulach@1348: *
The limit parameter controls the number of times the
jtulach@1348: * pattern is applied and therefore affects the length of the resulting
jtulach@1348: * array. If the limit n is greater than zero then the pattern
jtulach@1348: * will be applied at most n - 1 times, the array's
jtulach@1348: * length will be no greater than n, and the array's last entry
jtulach@1348: * will contain all input beyond the last matched delimiter. If n
jtulach@1348: * is non-positive then the pattern will be applied as many times as
jtulach@1348: * possible and the array can have any length. If n is zero then
jtulach@1348: * the pattern will be applied as many times as possible, the array can
jtulach@1348: * have any length, and trailing empty strings will be discarded.
jtulach@1348: *
jtulach@1348: *
The input "boo:and:foo", for example, yields the following
jtulach@1348: * results with these parameters:
jtulach@1348: *
jtulach@1348: *
jtulach@1348: * Regex |
jtulach@1348: * Limit |
jtulach@1348: * Result |
jtulach@1348: * : |
jtulach@1348: * 2 |
jtulach@1348: * { "boo", "and:foo" } |
jtulach@1348: * : |
jtulach@1348: * 5 |
jtulach@1348: * { "boo", "and", "foo" } |
jtulach@1348: * : |
jtulach@1348: * -2 |
jtulach@1348: * { "boo", "and", "foo" } |
jtulach@1348: * o |
jtulach@1348: * 5 |
jtulach@1348: * { "b", "", ":and:f", "", "" } |
jtulach@1348: * o |
jtulach@1348: * -2 |
jtulach@1348: * { "b", "", ":and:f", "", "" } |
jtulach@1348: * o |
jtulach@1348: * 0 |
jtulach@1348: * { "b", "", ":and:f" } |
jtulach@1348: *
jtulach@1348: *
jtulach@1348: *
jtulach@1348: * @param input
jtulach@1348: * The character sequence to be split
jtulach@1348: *
jtulach@1348: * @param limit
jtulach@1348: * The result threshold, as described above
jtulach@1348: *
jtulach@1348: * @return The array of strings computed by splitting the input
jtulach@1348: * around matches of this pattern
jtulach@1348: */
jtulach@1348: public String[] split(CharSequence input, int limit) {
jtulach@1348: int index = 0;
jtulach@1348: boolean matchLimited = limit > 0;
jtulach@1348: ArrayList matchList = new ArrayList<>();
jtulach@1348: Matcher m = matcher(input);
jtulach@1348:
jtulach@1348: // Add segments before each match found
jtulach@1348: while(m.find()) {
jtulach@1348: if (!matchLimited || matchList.size() < limit - 1) {
jtulach@1348: String match = input.subSequence(index, m.start()).toString();
jtulach@1348: matchList.add(match);
jtulach@1348: index = m.end();
jtulach@1348: } else if (matchList.size() == limit - 1) { // last one
jtulach@1348: String match = input.subSequence(index,
jtulach@1348: input.length()).toString();
jtulach@1348: matchList.add(match);
jtulach@1348: index = m.end();
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: // If no match was found, return this
jtulach@1348: if (index == 0)
jtulach@1348: return new String[] {input.toString()};
jtulach@1348:
jtulach@1348: // Add remaining segment
jtulach@1348: if (!matchLimited || matchList.size() < limit)
jtulach@1348: matchList.add(input.subSequence(index, input.length()).toString());
jtulach@1348:
jtulach@1348: // Construct result
jtulach@1348: int resultSize = matchList.size();
jtulach@1348: if (limit == 0)
jtulach@1348: while (resultSize > 0 && matchList.get(resultSize-1).equals(""))
jtulach@1348: resultSize--;
jtulach@1348: String[] result = new String[resultSize];
jtulach@1348: return matchList.subList(0, resultSize).toArray(result);
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Splits the given input sequence around matches of this pattern.
jtulach@1348: *
jtulach@1348: * This method works as if by invoking the two-argument {@link
jtulach@1348: * #split(java.lang.CharSequence, int) split} method with the given input
jtulach@1348: * sequence and a limit argument of zero. Trailing empty strings are
jtulach@1348: * therefore not included in the resulting array.
jtulach@1348: *
jtulach@1348: * The input "boo:and:foo", for example, yields the following
jtulach@1348: * results with these expressions:
jtulach@1348: *
jtulach@1348: *
jtulach@1348: * Regex |
jtulach@1348: * Result |
jtulach@1348: * : |
jtulach@1348: * { "boo", "and", "foo" } |
jtulach@1348: * o |
jtulach@1348: * { "b", "", ":and:f" } |
jtulach@1348: *
jtulach@1348: *
jtulach@1348: *
jtulach@1348: * @param input
jtulach@1348: * The character sequence to be split
jtulach@1348: *
jtulach@1348: * @return The array of strings computed by splitting the input
jtulach@1348: * around matches of this pattern
jtulach@1348: */
jtulach@1348: public String[] split(CharSequence input) {
jtulach@1348: return split(input, 0);
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Returns a literal pattern String
for the specified
jtulach@1348: * String
.
jtulach@1348: *
jtulach@1348: * This method produces a String
that can be used to
jtulach@1348: * create a Pattern
that would match the string
jtulach@1348: * s
as if it were a literal pattern.
Metacharacters
jtulach@1348: * or escape sequences in the input sequence will be given no special
jtulach@1348: * meaning.
jtulach@1348: *
jtulach@1348: * @param s The string to be literalized
jtulach@1348: * @return A literal string replacement
jtulach@1348: * @since 1.5
jtulach@1348: */
jtulach@1348: public static String quote(String s) {
jtulach@1348: int slashEIndex = s.indexOf("\\E");
jtulach@1348: if (slashEIndex == -1)
jtulach@1348: return "\\Q" + s + "\\E";
jtulach@1348:
jtulach@1348: StringBuilder sb = new StringBuilder(s.length() * 2);
jtulach@1348: sb.append("\\Q");
jtulach@1348: slashEIndex = 0;
jtulach@1348: int current = 0;
jtulach@1348: while ((slashEIndex = s.indexOf("\\E", current)) != -1) {
jtulach@1348: sb.append(s.substring(current, slashEIndex));
jtulach@1348: current = slashEIndex + 2;
jtulach@1348: sb.append("\\E\\\\E\\Q");
jtulach@1348: }
jtulach@1348: sb.append(s.substring(current, s.length()));
jtulach@1348: sb.append("\\E");
jtulach@1348: return sb.toString();
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Recompile the Pattern instance from a stream. The original pattern
jtulach@1348: * string is read in and the object tree is recompiled from it.
jtulach@1348: */
jtulach@1348: private void readObject(java.io.ObjectInputStream s)
jtulach@1348: throws java.io.IOException, ClassNotFoundException {
jtulach@1348:
jtulach@1348: // Read in all fields
jtulach@1348: s.defaultReadObject();
jtulach@1348:
jtulach@1348: // Initialize counts
jtulach@1348: capturingGroupCount = 1;
jtulach@1348: localCount = 0;
jtulach@1348:
jtulach@1348: // if length > 0, the Pattern is lazily compiled
jtulach@1348: compiled = false;
jtulach@1348: if (pattern.length() == 0) {
jtulach@1348: root = new Start(lastAccept);
jtulach@1348: matchRoot = lastAccept;
jtulach@1348: compiled = true;
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * This private constructor is used to create all Patterns. The pattern
jtulach@1348: * string and match flags are all that is needed to completely describe
jtulach@1348: * a Pattern. An empty pattern string results in an object tree with
jtulach@1348: * only a Start node and a LastNode node.
jtulach@1348: */
jtulach@1348: private Pattern(String p, int f) {
jtulach@1348: pattern = p;
jtulach@1348: flags = f;
jtulach@1348:
jtulach@1348: // to use UNICODE_CASE if UNICODE_CHARACTER_CLASS present
jtulach@1348: if ((flags & UNICODE_CHARACTER_CLASS) != 0)
jtulach@1348: flags |= UNICODE_CASE;
jtulach@1348:
jtulach@1348: // Reset group index count
jtulach@1348: capturingGroupCount = 1;
jtulach@1348: localCount = 0;
jtulach@1348:
jtulach@1348: if (pattern.length() > 0) {
jtulach@1348: compile();
jtulach@1348: } else {
jtulach@1348: root = new Start(lastAccept);
jtulach@1348: matchRoot = lastAccept;
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * The pattern is converted to normalizedD form and then a pure group
jtulach@1348: * is constructed to match canonical equivalences of the characters.
jtulach@1348: */
jtulach@1348: private void normalize() {
jtulach@1348: boolean inCharClass = false;
jtulach@1348: int lastCodePoint = -1;
jtulach@1348:
jtulach@1348: // Convert pattern into normalizedD form
jtulach@1350: normalizedPattern = Normalizer.normalize(pattern, Normalizer.NFD);
jtulach@1348: patternLength = normalizedPattern.length();
jtulach@1348:
jtulach@1348: // Modify pattern to match canonical equivalences
jtulach@1348: StringBuilder newPattern = new StringBuilder(patternLength);
jtulach@1348: for(int i=0; i= patternLength)
jtulach@1348: break;
jtulach@1348: c = normalizedPattern.codePointAt(i);
jtulach@1348: sequenceBuffer.appendCodePoint(c);
jtulach@1348: }
jtulach@1348: String ea = produceEquivalentAlternation(
jtulach@1348: sequenceBuffer.toString());
jtulach@1348: newPattern.setLength(newPattern.length()-Character.charCount(lastCodePoint));
jtulach@1348: newPattern.append("(?:").append(ea).append(")");
jtulach@1348: } else if (c == '[' && lastCodePoint != '\\') {
jtulach@1348: i = normalizeCharClass(newPattern, i);
jtulach@1348: } else {
jtulach@1348: newPattern.appendCodePoint(c);
jtulach@1348: }
jtulach@1348: lastCodePoint = c;
jtulach@1348: i += Character.charCount(c);
jtulach@1348: }
jtulach@1348: normalizedPattern = newPattern.toString();
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Complete the character class being parsed and add a set
jtulach@1348: * of alternations to it that will match the canonical equivalences
jtulach@1348: * of the characters within the class.
jtulach@1348: */
jtulach@1348: private int normalizeCharClass(StringBuilder newPattern, int i) {
jtulach@1348: StringBuilder charClass = new StringBuilder();
jtulach@1348: StringBuilder eq = null;
jtulach@1348: int lastCodePoint = -1;
jtulach@1348: String result;
jtulach@1348:
jtulach@1348: i++;
jtulach@1348: charClass.append("[");
jtulach@1348: while(true) {
jtulach@1348: int c = normalizedPattern.codePointAt(i);
jtulach@1348: StringBuilder sequenceBuffer;
jtulach@1348:
jtulach@1348: if (c == ']' && lastCodePoint != '\\') {
jtulach@1348: charClass.append((char)c);
jtulach@1348: break;
jtulach@1348: } else if (Character.getType(c) == Character.NON_SPACING_MARK) {
jtulach@1348: sequenceBuffer = new StringBuilder();
jtulach@1348: sequenceBuffer.appendCodePoint(lastCodePoint);
jtulach@1348: while(Character.getType(c) == Character.NON_SPACING_MARK) {
jtulach@1348: sequenceBuffer.appendCodePoint(c);
jtulach@1348: i += Character.charCount(c);
jtulach@1348: if (i >= normalizedPattern.length())
jtulach@1348: break;
jtulach@1348: c = normalizedPattern.codePointAt(i);
jtulach@1348: }
jtulach@1348: String ea = produceEquivalentAlternation(
jtulach@1348: sequenceBuffer.toString());
jtulach@1348:
jtulach@1348: charClass.setLength(charClass.length()-Character.charCount(lastCodePoint));
jtulach@1348: if (eq == null)
jtulach@1348: eq = new StringBuilder();
jtulach@1348: eq.append('|');
jtulach@1348: eq.append(ea);
jtulach@1348: } else {
jtulach@1348: charClass.appendCodePoint(c);
jtulach@1348: i++;
jtulach@1348: }
jtulach@1348: if (i == normalizedPattern.length())
jtulach@1348: throw error("Unclosed character class");
jtulach@1348: lastCodePoint = c;
jtulach@1348: }
jtulach@1348:
jtulach@1348: if (eq != null) {
jtulach@1348: result = "(?:"+charClass.toString()+eq.toString()+")";
jtulach@1348: } else {
jtulach@1348: result = charClass.toString();
jtulach@1348: }
jtulach@1348:
jtulach@1348: newPattern.append(result);
jtulach@1348: return i;
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Given a specific sequence composed of a regular character and
jtulach@1348: * combining marks that follow it, produce the alternation that will
jtulach@1348: * match all canonical equivalences of that sequence.
jtulach@1348: */
jtulach@1348: private String produceEquivalentAlternation(String source) {
jtulach@1348: int len = countChars(source, 0, 1);
jtulach@1348: if (source.length() == len)
jtulach@1348: // source has one character.
jtulach@1348: return source;
jtulach@1348:
jtulach@1348: String base = source.substring(0,len);
jtulach@1348: String combiningMarks = source.substring(len);
jtulach@1348:
jtulach@1348: String[] perms = producePermutations(combiningMarks);
jtulach@1348: StringBuilder result = new StringBuilder(source);
jtulach@1348:
jtulach@1348: // Add combined permutations
jtulach@1348: for(int x=0; x0)
jtulach@1348: result.append("|"+next);
jtulach@1348: next = composeOneStep(next);
jtulach@1348: if (next != null)
jtulach@1348: result.append("|"+produceEquivalentAlternation(next));
jtulach@1348: }
jtulach@1348: return result.toString();
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Returns an array of strings that have all the possible
jtulach@1348: * permutations of the characters in the input string.
jtulach@1348: * This is used to get a list of all possible orderings
jtulach@1348: * of a set of combining marks. Note that some of the permutations
jtulach@1348: * are invalid because of combining class collisions, and these
jtulach@1348: * possibilities must be removed because they are not canonically
jtulach@1348: * equivalent.
jtulach@1348: */
jtulach@1348: private String[] producePermutations(String input) {
jtulach@1348: if (input.length() == countChars(input, 0, 1))
jtulach@1348: return new String[] {input};
jtulach@1348:
jtulach@1348: if (input.length() == countChars(input, 0, 2)) {
jtulach@1348: int c0 = Character.codePointAt(input, 0);
jtulach@1348: int c1 = Character.codePointAt(input, Character.charCount(c0));
jtulach@1348: if (getClass(c1) == getClass(c0)) {
jtulach@1348: return new String[] {input};
jtulach@1348: }
jtulach@1348: String[] result = new String[2];
jtulach@1348: result[0] = input;
jtulach@1348: StringBuilder sb = new StringBuilder(2);
jtulach@1348: sb.appendCodePoint(c1);
jtulach@1348: sb.appendCodePoint(c0);
jtulach@1348: result[1] = sb.toString();
jtulach@1348: return result;
jtulach@1348: }
jtulach@1348:
jtulach@1348: int length = 1;
jtulach@1348: int nCodePoints = countCodePoints(input);
jtulach@1348: for(int x=1; x=0; y--) {
jtulach@1348: if (combClass[y] == combClass[x]) {
jtulach@1348: continue loop;
jtulach@1348: }
jtulach@1348: }
jtulach@1348: StringBuilder sb = new StringBuilder(input);
jtulach@1348: String otherChars = sb.delete(offset, offset+len).toString();
jtulach@1348: String[] subResult = producePermutations(otherChars);
jtulach@1348:
jtulach@1348: String prefix = input.substring(offset, offset+len);
jtulach@1348: for(int y=0; y= pLen - 1) // No \Q sequence found
jtulach@1348: return;
jtulach@1348: int j = i;
jtulach@1348: i += 2;
jtulach@1348: int[] newtemp = new int[j + 2*(pLen-i) + 2];
jtulach@1348: System.arraycopy(temp, 0, newtemp, 0, j);
jtulach@1348:
jtulach@1348: boolean inQuote = true;
jtulach@1348: while (i < pLen) {
jtulach@1348: int c = temp[i++];
jtulach@1348: if (! ASCII.isAscii(c) || ASCII.isAlnum(c)) {
jtulach@1348: newtemp[j++] = c;
jtulach@1348: } else if (c != '\\') {
jtulach@1348: if (inQuote) newtemp[j++] = '\\';
jtulach@1348: newtemp[j++] = c;
jtulach@1348: } else if (inQuote) {
jtulach@1348: if (temp[i] == 'E') {
jtulach@1348: i++;
jtulach@1348: inQuote = false;
jtulach@1348: } else {
jtulach@1348: newtemp[j++] = '\\';
jtulach@1348: newtemp[j++] = '\\';
jtulach@1348: }
jtulach@1348: } else {
jtulach@1348: if (temp[i] == 'Q') {
jtulach@1348: i++;
jtulach@1348: inQuote = true;
jtulach@1348: } else {
jtulach@1348: newtemp[j++] = c;
jtulach@1348: if (i != pLen)
jtulach@1348: newtemp[j++] = temp[i++];
jtulach@1348: }
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: patternLength = j;
jtulach@1348: temp = Arrays.copyOf(newtemp, j + 2); // double zero termination
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Copies regular expression to an int array and invokes the parsing
jtulach@1348: * of the expression which will create the object tree.
jtulach@1348: */
jtulach@1348: private void compile() {
jtulach@1348: // Handle canonical equivalences
jtulach@1348: if (has(CANON_EQ) && !has(LITERAL)) {
jtulach@1348: normalize();
jtulach@1348: } else {
jtulach@1348: normalizedPattern = pattern;
jtulach@1348: }
jtulach@1348: patternLength = normalizedPattern.length();
jtulach@1348:
jtulach@1348: // Copy pattern to int array for convenience
jtulach@1348: // Use double zero to terminate pattern
jtulach@1348: temp = new int[patternLength + 2];
jtulach@1348:
jtulach@1348: hasSupplementary = false;
jtulach@1348: int c, count = 0;
jtulach@1348: // Convert all chars into code points
jtulach@1348: for (int x = 0; x < patternLength; x += Character.charCount(c)) {
jtulach@1348: c = normalizedPattern.codePointAt(x);
jtulach@1348: if (isSupplementary(c)) {
jtulach@1348: hasSupplementary = true;
jtulach@1348: }
jtulach@1348: temp[count++] = c;
jtulach@1348: }
jtulach@1348:
jtulach@1348: patternLength = count; // patternLength now in code points
jtulach@1348:
jtulach@1348: if (! has(LITERAL))
jtulach@1348: RemoveQEQuoting();
jtulach@1348:
jtulach@1348: // Allocate all temporary objects here.
jtulach@1348: buffer = new int[32];
jtulach@1348: groupNodes = new GroupHead[10];
jtulach@1348: namedGroups = null;
jtulach@1348:
jtulach@1348: if (has(LITERAL)) {
jtulach@1348: // Literal pattern handling
jtulach@1348: matchRoot = newSlice(temp, patternLength, hasSupplementary);
jtulach@1348: matchRoot.next = lastAccept;
jtulach@1348: } else {
jtulach@1348: // Start recursive descent parsing
jtulach@1348: matchRoot = expr(lastAccept);
jtulach@1348: // Check extra pattern characters
jtulach@1348: if (patternLength != cursor) {
jtulach@1348: if (peek() == ')') {
jtulach@1348: throw error("Unmatched closing ')'");
jtulach@1348: } else {
jtulach@1348: throw error("Unexpected internal error");
jtulach@1348: }
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: // Peephole optimization
jtulach@1348: if (matchRoot instanceof Slice) {
jtulach@1348: root = BnM.optimize(matchRoot);
jtulach@1348: if (root == matchRoot) {
jtulach@1348: root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot);
jtulach@1348: }
jtulach@1348: } else if (matchRoot instanceof Begin || matchRoot instanceof First) {
jtulach@1348: root = matchRoot;
jtulach@1348: } else {
jtulach@1348: root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot);
jtulach@1348: }
jtulach@1348:
jtulach@1348: // Release temporary storage
jtulach@1348: temp = null;
jtulach@1348: buffer = null;
jtulach@1348: groupNodes = null;
jtulach@1348: patternLength = 0;
jtulach@1348: compiled = true;
jtulach@1348: }
jtulach@1348:
jtulach@1348: Map namedGroups() {
jtulach@1348: if (namedGroups == null)
jtulach@1348: namedGroups = new HashMap<>(2);
jtulach@1348: return namedGroups;
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Used to print out a subtree of the Pattern to help with debugging.
jtulach@1348: */
jtulach@1348: private static void printObjectTree(Node node) {
jtulach@1348: while(node != null) {
jtulach@1348: if (node instanceof Prolog) {
jtulach@1348: System.out.println(node);
jtulach@1348: printObjectTree(((Prolog)node).loop);
jtulach@1348: System.out.println("**** end contents prolog loop");
jtulach@1348: } else if (node instanceof Loop) {
jtulach@1348: System.out.println(node);
jtulach@1348: printObjectTree(((Loop)node).body);
jtulach@1348: System.out.println("**** end contents Loop body");
jtulach@1348: } else if (node instanceof Curly) {
jtulach@1348: System.out.println(node);
jtulach@1348: printObjectTree(((Curly)node).atom);
jtulach@1348: System.out.println("**** end contents Curly body");
jtulach@1348: } else if (node instanceof GroupCurly) {
jtulach@1348: System.out.println(node);
jtulach@1348: printObjectTree(((GroupCurly)node).atom);
jtulach@1348: System.out.println("**** end contents GroupCurly body");
jtulach@1348: } else if (node instanceof GroupTail) {
jtulach@1348: System.out.println(node);
jtulach@1348: System.out.println("Tail next is "+node.next);
jtulach@1348: return;
jtulach@1348: } else {
jtulach@1348: System.out.println(node);
jtulach@1348: }
jtulach@1348: node = node.next;
jtulach@1348: if (node != null)
jtulach@1348: System.out.println("->next:");
jtulach@1348: if (node == Pattern.accept) {
jtulach@1348: System.out.println("Accept Node");
jtulach@1348: node = null;
jtulach@1348: }
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Used to accumulate information about a subtree of the object graph
jtulach@1348: * so that optimizations can be applied to the subtree.
jtulach@1348: */
jtulach@1348: static final class TreeInfo {
jtulach@1348: int minLength;
jtulach@1348: int maxLength;
jtulach@1348: boolean maxValid;
jtulach@1348: boolean deterministic;
jtulach@1348:
jtulach@1348: TreeInfo() {
jtulach@1348: reset();
jtulach@1348: }
jtulach@1348: void reset() {
jtulach@1348: minLength = 0;
jtulach@1348: maxLength = 0;
jtulach@1348: maxValid = true;
jtulach@1348: deterministic = true;
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: /*
jtulach@1348: * The following private methods are mainly used to improve the
jtulach@1348: * readability of the code. In order to let the Java compiler easily
jtulach@1348: * inline them, we should not put many assertions or error checks in them.
jtulach@1348: */
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Indicates whether a particular flag is set or not.
jtulach@1348: */
jtulach@1348: private boolean has(int f) {
jtulach@1348: return (flags & f) != 0;
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Match next character, signal error if failed.
jtulach@1348: */
jtulach@1348: private void accept(int ch, String s) {
jtulach@1348: int testChar = temp[cursor++];
jtulach@1348: if (has(COMMENTS))
jtulach@1348: testChar = parsePastWhitespace(testChar);
jtulach@1348: if (ch != testChar) {
jtulach@1348: throw error(s);
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Mark the end of pattern with a specific character.
jtulach@1348: */
jtulach@1348: private void mark(int c) {
jtulach@1348: temp[patternLength] = c;
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Peek the next character, and do not advance the cursor.
jtulach@1348: */
jtulach@1348: private int peek() {
jtulach@1348: int ch = temp[cursor];
jtulach@1348: if (has(COMMENTS))
jtulach@1348: ch = peekPastWhitespace(ch);
jtulach@1348: return ch;
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Read the next character, and advance the cursor by one.
jtulach@1348: */
jtulach@1348: private int read() {
jtulach@1348: int ch = temp[cursor++];
jtulach@1348: if (has(COMMENTS))
jtulach@1348: ch = parsePastWhitespace(ch);
jtulach@1348: return ch;
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Read the next character, and advance the cursor by one,
jtulach@1348: * ignoring the COMMENTS setting
jtulach@1348: */
jtulach@1348: private int readEscaped() {
jtulach@1348: int ch = temp[cursor++];
jtulach@1348: return ch;
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Advance the cursor by one, and peek the next character.
jtulach@1348: */
jtulach@1348: private int next() {
jtulach@1348: int ch = temp[++cursor];
jtulach@1348: if (has(COMMENTS))
jtulach@1348: ch = peekPastWhitespace(ch);
jtulach@1348: return ch;
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Advance the cursor by one, and peek the next character,
jtulach@1348: * ignoring the COMMENTS setting
jtulach@1348: */
jtulach@1348: private int nextEscaped() {
jtulach@1348: int ch = temp[++cursor];
jtulach@1348: return ch;
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * If in xmode peek past whitespace and comments.
jtulach@1348: */
jtulach@1348: private int peekPastWhitespace(int ch) {
jtulach@1348: while (ASCII.isSpace(ch) || ch == '#') {
jtulach@1348: while (ASCII.isSpace(ch))
jtulach@1348: ch = temp[++cursor];
jtulach@1348: if (ch == '#') {
jtulach@1348: ch = peekPastLine();
jtulach@1348: }
jtulach@1348: }
jtulach@1348: return ch;
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * If in xmode parse past whitespace and comments.
jtulach@1348: */
jtulach@1348: private int parsePastWhitespace(int ch) {
jtulach@1348: while (ASCII.isSpace(ch) || ch == '#') {
jtulach@1348: while (ASCII.isSpace(ch))
jtulach@1348: ch = temp[cursor++];
jtulach@1348: if (ch == '#')
jtulach@1348: ch = parsePastLine();
jtulach@1348: }
jtulach@1348: return ch;
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * xmode parse past comment to end of line.
jtulach@1348: */
jtulach@1348: private int parsePastLine() {
jtulach@1348: int ch = temp[cursor++];
jtulach@1348: while (ch != 0 && !isLineSeparator(ch))
jtulach@1348: ch = temp[cursor++];
jtulach@1348: return ch;
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * xmode peek past comment to end of line.
jtulach@1348: */
jtulach@1348: private int peekPastLine() {
jtulach@1348: int ch = temp[++cursor];
jtulach@1348: while (ch != 0 && !isLineSeparator(ch))
jtulach@1348: ch = temp[++cursor];
jtulach@1348: return ch;
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Determines if character is a line separator in the current mode
jtulach@1348: */
jtulach@1348: private boolean isLineSeparator(int ch) {
jtulach@1348: if (has(UNIX_LINES)) {
jtulach@1348: return ch == '\n';
jtulach@1348: } else {
jtulach@1348: return (ch == '\n' ||
jtulach@1348: ch == '\r' ||
jtulach@1348: (ch|1) == '\u2029' ||
jtulach@1348: ch == '\u0085');
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Read the character after the next one, and advance the cursor by two.
jtulach@1348: */
jtulach@1348: private int skip() {
jtulach@1348: int i = cursor;
jtulach@1348: int ch = temp[i+1];
jtulach@1348: cursor = i + 2;
jtulach@1348: return ch;
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Unread one next character, and retreat cursor by one.
jtulach@1348: */
jtulach@1348: private void unread() {
jtulach@1348: cursor--;
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Internal method used for handling all syntax errors. The pattern is
jtulach@1348: * displayed with a pointer to aid in locating the syntax error.
jtulach@1348: */
jtulach@1348: private PatternSyntaxException error(String s) {
jtulach@1348: return new PatternSyntaxException(s, normalizedPattern, cursor - 1);
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Determines if there is any supplementary character or unpaired
jtulach@1348: * surrogate in the specified range.
jtulach@1348: */
jtulach@1348: private boolean findSupplementary(int start, int end) {
jtulach@1348: for (int i = start; i < end; i++) {
jtulach@1348: if (isSupplementary(temp[i]))
jtulach@1348: return true;
jtulach@1348: }
jtulach@1348: return false;
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Determines if the specified code point is a supplementary
jtulach@1348: * character or unpaired surrogate.
jtulach@1348: */
jtulach@1348: private static final boolean isSupplementary(int ch) {
jtulach@1348: return ch >= Character.MIN_SUPPLEMENTARY_CODE_POINT ||
jtulach@1348: Character.isSurrogate((char)ch);
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * The following methods handle the main parsing. They are sorted
jtulach@1348: * according to their precedence order, the lowest one first.
jtulach@1348: */
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * The expression is parsed with branch nodes added for alternations.
jtulach@1348: * This may be called recursively to parse sub expressions that may
jtulach@1348: * contain alternations.
jtulach@1348: */
jtulach@1348: private Node expr(Node end) {
jtulach@1348: Node prev = null;
jtulach@1348: Node firstTail = null;
jtulach@1348: Node branchConn = null;
jtulach@1348:
jtulach@1348: for (;;) {
jtulach@1348: Node node = sequence(end);
jtulach@1348: Node nodeTail = root; //double return
jtulach@1348: if (prev == null) {
jtulach@1348: prev = node;
jtulach@1348: firstTail = nodeTail;
jtulach@1348: } else {
jtulach@1348: // Branch
jtulach@1348: if (branchConn == null) {
jtulach@1348: branchConn = new BranchConn();
jtulach@1348: branchConn.next = end;
jtulach@1348: }
jtulach@1348: if (node == end) {
jtulach@1348: // if the node returned from sequence() is "end"
jtulach@1348: // we have an empty expr, set a null atom into
jtulach@1348: // the branch to indicate to go "next" directly.
jtulach@1348: node = null;
jtulach@1348: } else {
jtulach@1348: // the "tail.next" of each atom goes to branchConn
jtulach@1348: nodeTail.next = branchConn;
jtulach@1348: }
jtulach@1348: if (prev instanceof Branch) {
jtulach@1348: ((Branch)prev).add(node);
jtulach@1348: } else {
jtulach@1348: if (prev == end) {
jtulach@1348: prev = null;
jtulach@1348: } else {
jtulach@1348: // replace the "end" with "branchConn" at its tail.next
jtulach@1348: // when put the "prev" into the branch as the first atom.
jtulach@1348: firstTail.next = branchConn;
jtulach@1348: }
jtulach@1348: prev = new Branch(prev, node, branchConn);
jtulach@1348: }
jtulach@1348: }
jtulach@1348: if (peek() != '|') {
jtulach@1348: return prev;
jtulach@1348: }
jtulach@1348: next();
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Parsing of sequences between alternations.
jtulach@1348: */
jtulach@1348: private Node sequence(Node end) {
jtulach@1348: Node head = null;
jtulach@1348: Node tail = null;
jtulach@1348: Node node = null;
jtulach@1348: LOOP:
jtulach@1348: for (;;) {
jtulach@1348: int ch = peek();
jtulach@1348: switch (ch) {
jtulach@1348: case '(':
jtulach@1348: // Because group handles its own closure,
jtulach@1348: // we need to treat it differently
jtulach@1348: node = group0();
jtulach@1348: // Check for comment or flag group
jtulach@1348: if (node == null)
jtulach@1348: continue;
jtulach@1348: if (head == null)
jtulach@1348: head = node;
jtulach@1348: else
jtulach@1348: tail.next = node;
jtulach@1348: // Double return: Tail was returned in root
jtulach@1348: tail = root;
jtulach@1348: continue;
jtulach@1348: case '[':
jtulach@1348: node = clazz(true);
jtulach@1348: break;
jtulach@1348: case '\\':
jtulach@1348: ch = nextEscaped();
jtulach@1348: if (ch == 'p' || ch == 'P') {
jtulach@1348: boolean oneLetter = true;
jtulach@1348: boolean comp = (ch == 'P');
jtulach@1348: ch = next(); // Consume { if present
jtulach@1348: if (ch != '{') {
jtulach@1348: unread();
jtulach@1348: } else {
jtulach@1348: oneLetter = false;
jtulach@1348: }
jtulach@1348: node = family(oneLetter, comp);
jtulach@1348: } else {
jtulach@1348: unread();
jtulach@1348: node = atom();
jtulach@1348: }
jtulach@1348: break;
jtulach@1348: case '^':
jtulach@1348: next();
jtulach@1348: if (has(MULTILINE)) {
jtulach@1348: if (has(UNIX_LINES))
jtulach@1348: node = new UnixCaret();
jtulach@1348: else
jtulach@1348: node = new Caret();
jtulach@1348: } else {
jtulach@1348: node = new Begin();
jtulach@1348: }
jtulach@1348: break;
jtulach@1348: case '$':
jtulach@1348: next();
jtulach@1348: if (has(UNIX_LINES))
jtulach@1348: node = new UnixDollar(has(MULTILINE));
jtulach@1348: else
jtulach@1348: node = new Dollar(has(MULTILINE));
jtulach@1348: break;
jtulach@1348: case '.':
jtulach@1348: next();
jtulach@1348: if (has(DOTALL)) {
jtulach@1348: node = new All();
jtulach@1348: } else {
jtulach@1348: if (has(UNIX_LINES))
jtulach@1348: node = new UnixDot();
jtulach@1348: else {
jtulach@1348: node = new Dot();
jtulach@1348: }
jtulach@1348: }
jtulach@1348: break;
jtulach@1348: case '|':
jtulach@1348: case ')':
jtulach@1348: break LOOP;
jtulach@1348: case ']': // Now interpreting dangling ] and } as literals
jtulach@1348: case '}':
jtulach@1348: node = atom();
jtulach@1348: break;
jtulach@1348: case '?':
jtulach@1348: case '*':
jtulach@1348: case '+':
jtulach@1348: next();
jtulach@1348: throw error("Dangling meta character '" + ((char)ch) + "'");
jtulach@1348: case 0:
jtulach@1348: if (cursor >= patternLength) {
jtulach@1348: break LOOP;
jtulach@1348: }
jtulach@1348: // Fall through
jtulach@1348: default:
jtulach@1348: node = atom();
jtulach@1348: break;
jtulach@1348: }
jtulach@1348:
jtulach@1348: node = closure(node);
jtulach@1348:
jtulach@1348: if (head == null) {
jtulach@1348: head = tail = node;
jtulach@1348: } else {
jtulach@1348: tail.next = node;
jtulach@1348: tail = node;
jtulach@1348: }
jtulach@1348: }
jtulach@1348: if (head == null) {
jtulach@1348: return end;
jtulach@1348: }
jtulach@1348: tail.next = end;
jtulach@1348: root = tail; //double return
jtulach@1348: return head;
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Parse and add a new Single or Slice.
jtulach@1348: */
jtulach@1348: private Node atom() {
jtulach@1348: int first = 0;
jtulach@1348: int prev = -1;
jtulach@1348: boolean hasSupplementary = false;
jtulach@1348: int ch = peek();
jtulach@1348: for (;;) {
jtulach@1348: switch (ch) {
jtulach@1348: case '*':
jtulach@1348: case '+':
jtulach@1348: case '?':
jtulach@1348: case '{':
jtulach@1348: if (first > 1) {
jtulach@1348: cursor = prev; // Unwind one character
jtulach@1348: first--;
jtulach@1348: }
jtulach@1348: break;
jtulach@1348: case '$':
jtulach@1348: case '.':
jtulach@1348: case '^':
jtulach@1348: case '(':
jtulach@1348: case '[':
jtulach@1348: case '|':
jtulach@1348: case ')':
jtulach@1348: break;
jtulach@1348: case '\\':
jtulach@1348: ch = nextEscaped();
jtulach@1348: if (ch == 'p' || ch == 'P') { // Property
jtulach@1348: if (first > 0) { // Slice is waiting; handle it first
jtulach@1348: unread();
jtulach@1348: break;
jtulach@1348: } else { // No slice; just return the family node
jtulach@1348: boolean comp = (ch == 'P');
jtulach@1348: boolean oneLetter = true;
jtulach@1348: ch = next(); // Consume { if present
jtulach@1348: if (ch != '{')
jtulach@1348: unread();
jtulach@1348: else
jtulach@1348: oneLetter = false;
jtulach@1348: return family(oneLetter, comp);
jtulach@1348: }
jtulach@1348: }
jtulach@1348: unread();
jtulach@1348: prev = cursor;
jtulach@1348: ch = escape(false, first == 0);
jtulach@1348: if (ch >= 0) {
jtulach@1348: append(ch, first);
jtulach@1348: first++;
jtulach@1348: if (isSupplementary(ch)) {
jtulach@1348: hasSupplementary = true;
jtulach@1348: }
jtulach@1348: ch = peek();
jtulach@1348: continue;
jtulach@1348: } else if (first == 0) {
jtulach@1348: return root;
jtulach@1348: }
jtulach@1348: // Unwind meta escape sequence
jtulach@1348: cursor = prev;
jtulach@1348: break;
jtulach@1348: case 0:
jtulach@1348: if (cursor >= patternLength) {
jtulach@1348: break;
jtulach@1348: }
jtulach@1348: // Fall through
jtulach@1348: default:
jtulach@1348: prev = cursor;
jtulach@1348: append(ch, first);
jtulach@1348: first++;
jtulach@1348: if (isSupplementary(ch)) {
jtulach@1348: hasSupplementary = true;
jtulach@1348: }
jtulach@1348: ch = next();
jtulach@1348: continue;
jtulach@1348: }
jtulach@1348: break;
jtulach@1348: }
jtulach@1348: if (first == 1) {
jtulach@1348: return newSingle(buffer[0]);
jtulach@1348: } else {
jtulach@1348: return newSlice(buffer, first, hasSupplementary);
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: private void append(int ch, int len) {
jtulach@1348: if (len >= buffer.length) {
jtulach@1348: int[] tmp = new int[len+len];
jtulach@1348: System.arraycopy(buffer, 0, tmp, 0, len);
jtulach@1348: buffer = tmp;
jtulach@1348: }
jtulach@1348: buffer[len] = ch;
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Parses a backref greedily, taking as many numbers as it
jtulach@1348: * can. The first digit is always treated as a backref, but
jtulach@1348: * multi digit numbers are only treated as a backref if at
jtulach@1348: * least that many backrefs exist at this point in the regex.
jtulach@1348: */
jtulach@1348: private Node ref(int refNum) {
jtulach@1348: boolean done = false;
jtulach@1348: while(!done) {
jtulach@1348: int ch = peek();
jtulach@1348: switch(ch) {
jtulach@1348: case '0':
jtulach@1348: case '1':
jtulach@1348: case '2':
jtulach@1348: case '3':
jtulach@1348: case '4':
jtulach@1348: case '5':
jtulach@1348: case '6':
jtulach@1348: case '7':
jtulach@1348: case '8':
jtulach@1348: case '9':
jtulach@1348: int newRefNum = (refNum * 10) + (ch - '0');
jtulach@1348: // Add another number if it doesn't make a group
jtulach@1348: // that doesn't exist
jtulach@1348: if (capturingGroupCount - 1 < newRefNum) {
jtulach@1348: done = true;
jtulach@1348: break;
jtulach@1348: }
jtulach@1348: refNum = newRefNum;
jtulach@1348: read();
jtulach@1348: break;
jtulach@1348: default:
jtulach@1348: done = true;
jtulach@1348: break;
jtulach@1348: }
jtulach@1348: }
jtulach@1348: if (has(CASE_INSENSITIVE))
jtulach@1348: return new CIBackRef(refNum, has(UNICODE_CASE));
jtulach@1348: else
jtulach@1348: return new BackRef(refNum);
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Parses an escape sequence to determine the actual value that needs
jtulach@1348: * to be matched.
jtulach@1348: * If -1 is returned and create was true a new object was added to the tree
jtulach@1348: * to handle the escape sequence.
jtulach@1348: * If the returned value is greater than zero, it is the value that
jtulach@1348: * matches the escape sequence.
jtulach@1348: */
jtulach@1348: private int escape(boolean inclass, boolean create) {
jtulach@1348: int ch = skip();
jtulach@1348: switch (ch) {
jtulach@1348: case '0':
jtulach@1348: return o();
jtulach@1348: case '1':
jtulach@1348: case '2':
jtulach@1348: case '3':
jtulach@1348: case '4':
jtulach@1348: case '5':
jtulach@1348: case '6':
jtulach@1348: case '7':
jtulach@1348: case '8':
jtulach@1348: case '9':
jtulach@1348: if (inclass) break;
jtulach@1348: if (create) {
jtulach@1348: root = ref((ch - '0'));
jtulach@1348: }
jtulach@1348: return -1;
jtulach@1348: case 'A':
jtulach@1348: if (inclass) break;
jtulach@1348: if (create) root = new Begin();
jtulach@1348: return -1;
jtulach@1348: case 'B':
jtulach@1348: if (inclass) break;
jtulach@1348: if (create) root = new Bound(Bound.NONE, has(UNICODE_CHARACTER_CLASS));
jtulach@1348: return -1;
jtulach@1348: case 'C':
jtulach@1348: break;
jtulach@1348: case 'D':
jtulach@1348: if (create) root = has(UNICODE_CHARACTER_CLASS)
jtulach@1348: ? new Utype(UnicodeProp.DIGIT).complement()
jtulach@1348: : new Ctype(ASCII.DIGIT).complement();
jtulach@1348: return -1;
jtulach@1348: case 'E':
jtulach@1348: case 'F':
jtulach@1348: break;
jtulach@1348: case 'G':
jtulach@1348: if (inclass) break;
jtulach@1348: if (create) root = new LastMatch();
jtulach@1348: return -1;
jtulach@1348: case 'H':
jtulach@1348: case 'I':
jtulach@1348: case 'J':
jtulach@1348: case 'K':
jtulach@1348: case 'L':
jtulach@1348: case 'M':
jtulach@1348: case 'N':
jtulach@1348: case 'O':
jtulach@1348: case 'P':
jtulach@1348: case 'Q':
jtulach@1348: case 'R':
jtulach@1348: break;
jtulach@1348: case 'S':
jtulach@1348: if (create) root = has(UNICODE_CHARACTER_CLASS)
jtulach@1348: ? new Utype(UnicodeProp.WHITE_SPACE).complement()
jtulach@1348: : new Ctype(ASCII.SPACE).complement();
jtulach@1348: return -1;
jtulach@1348: case 'T':
jtulach@1348: case 'U':
jtulach@1348: case 'V':
jtulach@1348: break;
jtulach@1348: case 'W':
jtulach@1348: if (create) root = has(UNICODE_CHARACTER_CLASS)
jtulach@1348: ? new Utype(UnicodeProp.WORD).complement()
jtulach@1348: : new Ctype(ASCII.WORD).complement();
jtulach@1348: return -1;
jtulach@1348: case 'X':
jtulach@1348: case 'Y':
jtulach@1348: break;
jtulach@1348: case 'Z':
jtulach@1348: if (inclass) break;
jtulach@1348: if (create) {
jtulach@1348: if (has(UNIX_LINES))
jtulach@1348: root = new UnixDollar(false);
jtulach@1348: else
jtulach@1348: root = new Dollar(false);
jtulach@1348: }
jtulach@1348: return -1;
jtulach@1348: case 'a':
jtulach@1348: return '\007';
jtulach@1348: case 'b':
jtulach@1348: if (inclass) break;
jtulach@1348: if (create) root = new Bound(Bound.BOTH, has(UNICODE_CHARACTER_CLASS));
jtulach@1348: return -1;
jtulach@1348: case 'c':
jtulach@1348: return c();
jtulach@1348: case 'd':
jtulach@1348: if (create) root = has(UNICODE_CHARACTER_CLASS)
jtulach@1348: ? new Utype(UnicodeProp.DIGIT)
jtulach@1348: : new Ctype(ASCII.DIGIT);
jtulach@1348: return -1;
jtulach@1348: case 'e':
jtulach@1348: return '\033';
jtulach@1348: case 'f':
jtulach@1348: return '\f';
jtulach@1348: case 'g':
jtulach@1348: case 'h':
jtulach@1348: case 'i':
jtulach@1348: case 'j':
jtulach@1348: break;
jtulach@1348: case 'k':
jtulach@1348: if (inclass)
jtulach@1348: break;
jtulach@1348: if (read() != '<')
jtulach@1348: throw error("\\k is not followed by '<' for named capturing group");
jtulach@1348: String name = groupname(read());
jtulach@1348: if (!namedGroups().containsKey(name))
jtulach@1348: throw error("(named capturing group <"+ name+"> does not exit");
jtulach@1348: if (create) {
jtulach@1348: if (has(CASE_INSENSITIVE))
jtulach@1348: root = new CIBackRef(namedGroups().get(name), has(UNICODE_CASE));
jtulach@1348: else
jtulach@1348: root = new BackRef(namedGroups().get(name));
jtulach@1348: }
jtulach@1348: return -1;
jtulach@1348: case 'l':
jtulach@1348: case 'm':
jtulach@1348: break;
jtulach@1348: case 'n':
jtulach@1348: return '\n';
jtulach@1348: case 'o':
jtulach@1348: case 'p':
jtulach@1348: case 'q':
jtulach@1348: break;
jtulach@1348: case 'r':
jtulach@1348: return '\r';
jtulach@1348: case 's':
jtulach@1348: if (create) root = has(UNICODE_CHARACTER_CLASS)
jtulach@1348: ? new Utype(UnicodeProp.WHITE_SPACE)
jtulach@1348: : new Ctype(ASCII.SPACE);
jtulach@1348: return -1;
jtulach@1348: case 't':
jtulach@1348: return '\t';
jtulach@1348: case 'u':
jtulach@1348: return u();
jtulach@1348: case 'v':
jtulach@1348: return '\013';
jtulach@1348: case 'w':
jtulach@1348: if (create) root = has(UNICODE_CHARACTER_CLASS)
jtulach@1348: ? new Utype(UnicodeProp.WORD)
jtulach@1348: : new Ctype(ASCII.WORD);
jtulach@1348: return -1;
jtulach@1348: case 'x':
jtulach@1348: return x();
jtulach@1348: case 'y':
jtulach@1348: break;
jtulach@1348: case 'z':
jtulach@1348: if (inclass) break;
jtulach@1348: if (create) root = new End();
jtulach@1348: return -1;
jtulach@1348: default:
jtulach@1348: return ch;
jtulach@1348: }
jtulach@1348: throw error("Illegal/unsupported escape sequence");
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Parse a character class, and return the node that matches it.
jtulach@1348: *
jtulach@1348: * Consumes a ] on the way out if consume is true. Usually consume
jtulach@1348: * is true except for the case of [abc&&def] where def is a separate
jtulach@1348: * right hand node with "understood" brackets.
jtulach@1348: */
jtulach@1348: private CharProperty clazz(boolean consume) {
jtulach@1348: CharProperty prev = null;
jtulach@1348: CharProperty node = null;
jtulach@1348: BitClass bits = new BitClass();
jtulach@1348: boolean include = true;
jtulach@1348: boolean firstInClass = true;
jtulach@1348: int ch = next();
jtulach@1348: for (;;) {
jtulach@1348: switch (ch) {
jtulach@1348: case '^':
jtulach@1348: // Negates if first char in a class, otherwise literal
jtulach@1348: if (firstInClass) {
jtulach@1348: if (temp[cursor-1] != '[')
jtulach@1348: break;
jtulach@1348: ch = next();
jtulach@1348: include = !include;
jtulach@1348: continue;
jtulach@1348: } else {
jtulach@1348: // ^ not first in class, treat as literal
jtulach@1348: break;
jtulach@1348: }
jtulach@1348: case '[':
jtulach@1348: firstInClass = false;
jtulach@1348: node = clazz(true);
jtulach@1348: if (prev == null)
jtulach@1348: prev = node;
jtulach@1348: else
jtulach@1348: prev = union(prev, node);
jtulach@1348: ch = peek();
jtulach@1348: continue;
jtulach@1348: case '&':
jtulach@1348: firstInClass = false;
jtulach@1348: ch = next();
jtulach@1348: if (ch == '&') {
jtulach@1348: ch = next();
jtulach@1348: CharProperty rightNode = null;
jtulach@1348: while (ch != ']' && ch != '&') {
jtulach@1348: if (ch == '[') {
jtulach@1348: if (rightNode == null)
jtulach@1348: rightNode = clazz(true);
jtulach@1348: else
jtulach@1348: rightNode = union(rightNode, clazz(true));
jtulach@1348: } else { // abc&&def
jtulach@1348: unread();
jtulach@1348: rightNode = clazz(false);
jtulach@1348: }
jtulach@1348: ch = peek();
jtulach@1348: }
jtulach@1348: if (rightNode != null)
jtulach@1348: node = rightNode;
jtulach@1348: if (prev == null) {
jtulach@1348: if (rightNode == null)
jtulach@1348: throw error("Bad class syntax");
jtulach@1348: else
jtulach@1348: prev = rightNode;
jtulach@1348: } else {
jtulach@1348: prev = intersection(prev, node);
jtulach@1348: }
jtulach@1348: } else {
jtulach@1348: // treat as a literal &
jtulach@1348: unread();
jtulach@1348: break;
jtulach@1348: }
jtulach@1348: continue;
jtulach@1348: case 0:
jtulach@1348: firstInClass = false;
jtulach@1348: if (cursor >= patternLength)
jtulach@1348: throw error("Unclosed character class");
jtulach@1348: break;
jtulach@1348: case ']':
jtulach@1348: firstInClass = false;
jtulach@1348: if (prev != null) {
jtulach@1348: if (consume)
jtulach@1348: next();
jtulach@1348: return prev;
jtulach@1348: }
jtulach@1348: break;
jtulach@1348: default:
jtulach@1348: firstInClass = false;
jtulach@1348: break;
jtulach@1348: }
jtulach@1348: node = range(bits);
jtulach@1348: if (include) {
jtulach@1348: if (prev == null) {
jtulach@1348: prev = node;
jtulach@1348: } else {
jtulach@1348: if (prev != node)
jtulach@1348: prev = union(prev, node);
jtulach@1348: }
jtulach@1348: } else {
jtulach@1348: if (prev == null) {
jtulach@1348: prev = node.complement();
jtulach@1348: } else {
jtulach@1348: if (prev != node)
jtulach@1348: prev = setDifference(prev, node);
jtulach@1348: }
jtulach@1348: }
jtulach@1348: ch = peek();
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: private CharProperty bitsOrSingle(BitClass bits, int ch) {
jtulach@1348: /* Bits can only handle codepoints in [u+0000-u+00ff] range.
jtulach@1348: Use "single" node instead of bits when dealing with unicode
jtulach@1348: case folding for codepoints listed below.
jtulach@1348: (1)Uppercase out of range: u+00ff, u+00b5
jtulach@1348: toUpperCase(u+00ff) -> u+0178
jtulach@1348: toUpperCase(u+00b5) -> u+039c
jtulach@1348: (2)LatinSmallLetterLongS u+17f
jtulach@1348: toUpperCase(u+017f) -> u+0053
jtulach@1348: (3)LatinSmallLetterDotlessI u+131
jtulach@1348: toUpperCase(u+0131) -> u+0049
jtulach@1348: (4)LatinCapitalLetterIWithDotAbove u+0130
jtulach@1348: toLowerCase(u+0130) -> u+0069
jtulach@1348: (5)KelvinSign u+212a
jtulach@1348: toLowerCase(u+212a) ==> u+006B
jtulach@1348: (6)AngstromSign u+212b
jtulach@1348: toLowerCase(u+212b) ==> u+00e5
jtulach@1348: */
jtulach@1348: int d;
jtulach@1348: if (ch < 256 &&
jtulach@1348: !(has(CASE_INSENSITIVE) && has(UNICODE_CASE) &&
jtulach@1348: (ch == 0xff || ch == 0xb5 ||
jtulach@1348: ch == 0x49 || ch == 0x69 || //I and i
jtulach@1348: ch == 0x53 || ch == 0x73 || //S and s
jtulach@1348: ch == 0x4b || ch == 0x6b || //K and k
jtulach@1348: ch == 0xc5 || ch == 0xe5))) //A+ring
jtulach@1348: return bits.add(ch, flags());
jtulach@1348: return newSingle(ch);
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Parse a single character or a character range in a character class
jtulach@1348: * and return its representative node.
jtulach@1348: */
jtulach@1348: private CharProperty range(BitClass bits) {
jtulach@1348: int ch = peek();
jtulach@1348: if (ch == '\\') {
jtulach@1348: ch = nextEscaped();
jtulach@1348: if (ch == 'p' || ch == 'P') { // A property
jtulach@1348: boolean comp = (ch == 'P');
jtulach@1348: boolean oneLetter = true;
jtulach@1348: // Consume { if present
jtulach@1348: ch = next();
jtulach@1348: if (ch != '{')
jtulach@1348: unread();
jtulach@1348: else
jtulach@1348: oneLetter = false;
jtulach@1348: return family(oneLetter, comp);
jtulach@1348: } else { // ordinary escape
jtulach@1348: unread();
jtulach@1348: ch = escape(true, true);
jtulach@1348: if (ch == -1)
jtulach@1348: return (CharProperty) root;
jtulach@1348: }
jtulach@1348: } else {
jtulach@1348: ch = single();
jtulach@1348: }
jtulach@1348: if (ch >= 0) {
jtulach@1348: if (peek() == '-') {
jtulach@1348: int endRange = temp[cursor+1];
jtulach@1348: if (endRange == '[') {
jtulach@1348: return bitsOrSingle(bits, ch);
jtulach@1348: }
jtulach@1348: if (endRange != ']') {
jtulach@1348: next();
jtulach@1348: int m = single();
jtulach@1348: if (m < ch)
jtulach@1348: throw error("Illegal character range");
jtulach@1348: if (has(CASE_INSENSITIVE))
jtulach@1348: return caseInsensitiveRangeFor(ch, m);
jtulach@1348: else
jtulach@1348: return rangeFor(ch, m);
jtulach@1348: }
jtulach@1348: }
jtulach@1348: return bitsOrSingle(bits, ch);
jtulach@1348: }
jtulach@1348: throw error("Unexpected character '"+((char)ch)+"'");
jtulach@1348: }
jtulach@1348:
jtulach@1348: private int single() {
jtulach@1348: int ch = peek();
jtulach@1348: switch (ch) {
jtulach@1348: case '\\':
jtulach@1348: return escape(true, false);
jtulach@1348: default:
jtulach@1348: next();
jtulach@1348: return ch;
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Parses a Unicode character family and returns its representative node.
jtulach@1348: */
jtulach@1348: private CharProperty family(boolean singleLetter,
jtulach@1348: boolean maybeComplement)
jtulach@1348: {
jtulach@1348: next();
jtulach@1348: String name;
jtulach@1348: CharProperty node = null;
jtulach@1348:
jtulach@1348: if (singleLetter) {
jtulach@1348: int c = temp[cursor];
jtulach@1348: if (!Character.isSupplementaryCodePoint(c)) {
jtulach@1348: name = String.valueOf((char)c);
jtulach@1348: } else {
jtulach@1348: name = new String(temp, cursor, 1);
jtulach@1348: }
jtulach@1348: read();
jtulach@1348: } else {
jtulach@1348: int i = cursor;
jtulach@1348: mark('}');
jtulach@1348: while(read() != '}') {
jtulach@1348: }
jtulach@1348: mark('\000');
jtulach@1348: int j = cursor;
jtulach@1348: if (j > patternLength)
jtulach@1348: throw error("Unclosed character family");
jtulach@1348: if (i + 1 >= j)
jtulach@1348: throw error("Empty character family");
jtulach@1348: name = new String(temp, i, j-i-1);
jtulach@1348: }
jtulach@1348:
jtulach@1348: int i = name.indexOf('=');
jtulach@1348: if (i != -1) {
jtulach@1348: // property construct \p{name=value}
jtulach@1348: String value = name.substring(i + 1);
jtulach@1348: name = name.substring(0, i).toLowerCase(Locale.ENGLISH);
jtulach@1350: /* if ("sc".equals(name) || "script".equals(name)) {
jtulach@1348: node = unicodeScriptPropertyFor(value);
jtulach@1348: } else if ("blk".equals(name) || "block".equals(name)) {
jtulach@1348: node = unicodeBlockPropertyFor(value);
jtulach@1350: } else*/ if ("gc".equals(name) || "general_category".equals(name)) {
jtulach@1348: node = charPropertyNodeFor(value);
jtulach@1348: } else {
jtulach@1348: throw error("Unknown Unicode property {name=<" + name + ">, "
jtulach@1348: + "value=<" + value + ">}");
jtulach@1348: }
jtulach@1348: } else {
jtulach@1350: /*if (name.startsWith("In")) {
jtulach@1348: // \p{inBlockName}
jtulach@1348: node = unicodeBlockPropertyFor(name.substring(2));
jtulach@1348: } else if (name.startsWith("Is")) {
jtulach@1348: // \p{isGeneralCategory} and \p{isScriptName}
jtulach@1348: name = name.substring(2);
jtulach@1348: UnicodeProp uprop = UnicodeProp.forName(name);
jtulach@1348: if (uprop != null)
jtulach@1348: node = new Utype(uprop);
jtulach@1348: if (node == null)
jtulach@1348: node = CharPropertyNames.charPropertyFor(name);
jtulach@1348: if (node == null)
jtulach@1348: node = unicodeScriptPropertyFor(name);
jtulach@1350: } else*/ {
jtulach@1348: if (has(UNICODE_CHARACTER_CLASS)) {
jtulach@1348: UnicodeProp uprop = UnicodeProp.forPOSIXName(name);
jtulach@1348: if (uprop != null)
jtulach@1348: node = new Utype(uprop);
jtulach@1348: }
jtulach@1348: if (node == null)
jtulach@1348: node = charPropertyNodeFor(name);
jtulach@1348: }
jtulach@1348: }
jtulach@1348: if (maybeComplement) {
jtulach@1350: if (node instanceof Category /*|| node instanceof Block*/)
jtulach@1348: hasSupplementary = true;
jtulach@1348: node = node.complement();
jtulach@1348: }
jtulach@1348: return node;
jtulach@1348: }
jtulach@1348:
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Returns a CharProperty matching all characters belong to
jtulach@1348: * a UnicodeScript.
jtulach@1350: *
jtulach@1348: private CharProperty unicodeScriptPropertyFor(String name) {
jtulach@1348: final Character.UnicodeScript script;
jtulach@1348: try {
jtulach@1348: script = Character.UnicodeScript.forName(name);
jtulach@1348: } catch (IllegalArgumentException iae) {
jtulach@1348: throw error("Unknown character script name {" + name + "}");
jtulach@1348: }
jtulach@1348: return new Script(script);
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Returns a CharProperty matching all characters in a UnicodeBlock.
jtulach@1350: *
jtulach@1348: private CharProperty unicodeBlockPropertyFor(String name) {
jtulach@1348: final Character.UnicodeBlock block;
jtulach@1348: try {
jtulach@1348: block = Character.UnicodeBlock.forName(name);
jtulach@1348: } catch (IllegalArgumentException iae) {
jtulach@1348: throw error("Unknown character block name {" + name + "}");
jtulach@1348: }
jtulach@1348: return new Block(block);
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Returns a CharProperty matching all characters in a named property.
jtulach@1348: */
jtulach@1348: private CharProperty charPropertyNodeFor(String name) {
jtulach@1348: CharProperty p = CharPropertyNames.charPropertyFor(name);
jtulach@1348: if (p == null)
jtulach@1348: throw error("Unknown character property name {" + name + "}");
jtulach@1348: return p;
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Parses and returns the name of a "named capturing group", the trailing
jtulach@1348: * ">" is consumed after parsing.
jtulach@1348: */
jtulach@1348: private String groupname(int ch) {
jtulach@1348: StringBuilder sb = new StringBuilder();
jtulach@1348: sb.append(Character.toChars(ch));
jtulach@1348: while (ASCII.isLower(ch=read()) || ASCII.isUpper(ch) ||
jtulach@1348: ASCII.isDigit(ch)) {
jtulach@1348: sb.append(Character.toChars(ch));
jtulach@1348: }
jtulach@1348: if (sb.length() == 0)
jtulach@1348: throw error("named capturing group has 0 length name");
jtulach@1348: if (ch != '>')
jtulach@1348: throw error("named capturing group is missing trailing '>'");
jtulach@1348: return sb.toString();
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Parses a group and returns the head node of a set of nodes that process
jtulach@1348: * the group. Sometimes a double return system is used where the tail is
jtulach@1348: * returned in root.
jtulach@1348: */
jtulach@1348: private Node group0() {
jtulach@1348: boolean capturingGroup = false;
jtulach@1348: Node head = null;
jtulach@1348: Node tail = null;
jtulach@1348: int save = flags;
jtulach@1348: root = null;
jtulach@1348: int ch = next();
jtulach@1348: if (ch == '?') {
jtulach@1348: ch = skip();
jtulach@1348: switch (ch) {
jtulach@1348: case ':': // (?:xxx) pure group
jtulach@1348: head = createGroup(true);
jtulach@1348: tail = root;
jtulach@1348: head.next = expr(tail);
jtulach@1348: break;
jtulach@1348: case '=': // (?=xxx) and (?!xxx) lookahead
jtulach@1348: case '!':
jtulach@1348: head = createGroup(true);
jtulach@1348: tail = root;
jtulach@1348: head.next = expr(tail);
jtulach@1348: if (ch == '=') {
jtulach@1348: head = tail = new Pos(head);
jtulach@1348: } else {
jtulach@1348: head = tail = new Neg(head);
jtulach@1348: }
jtulach@1348: break;
jtulach@1348: case '>': // (?>xxx) independent group
jtulach@1348: head = createGroup(true);
jtulach@1348: tail = root;
jtulach@1348: head.next = expr(tail);
jtulach@1348: head = tail = new Ques(head, INDEPENDENT);
jtulach@1348: break;
jtulach@1348: case '<': // (? is already defined");
jtulach@1348: capturingGroup = true;
jtulach@1348: head = createGroup(false);
jtulach@1348: tail = root;
jtulach@1348: namedGroups().put(name, capturingGroupCount-1);
jtulach@1348: head.next = expr(tail);
jtulach@1348: break;
jtulach@1348: }
jtulach@1348: int start = cursor;
jtulach@1348: head = createGroup(true);
jtulach@1348: tail = root;
jtulach@1348: head.next = expr(tail);
jtulach@1348: tail.next = lookbehindEnd;
jtulach@1348: TreeInfo info = new TreeInfo();
jtulach@1348: head.study(info);
jtulach@1348: if (info.maxValid == false) {
jtulach@1348: throw error("Look-behind group does not have "
jtulach@1348: + "an obvious maximum length");
jtulach@1348: }
jtulach@1348: boolean hasSupplementary = findSupplementary(start, patternLength);
jtulach@1348: if (ch == '=') {
jtulach@1348: head = tail = (hasSupplementary ?
jtulach@1348: new BehindS(head, info.maxLength,
jtulach@1348: info.minLength) :
jtulach@1348: new Behind(head, info.maxLength,
jtulach@1348: info.minLength));
jtulach@1348: } else if (ch == '!') {
jtulach@1348: head = tail = (hasSupplementary ?
jtulach@1348: new NotBehindS(head, info.maxLength,
jtulach@1348: info.minLength) :
jtulach@1348: new NotBehind(head, info.maxLength,
jtulach@1348: info.minLength));
jtulach@1348: } else {
jtulach@1348: throw error("Unknown look-behind group");
jtulach@1348: }
jtulach@1348: break;
jtulach@1348: case '$':
jtulach@1348: case '@':
jtulach@1348: throw error("Unknown group type");
jtulach@1348: default: // (?xxx:) inlined match flags
jtulach@1348: unread();
jtulach@1348: addFlag();
jtulach@1348: ch = read();
jtulach@1348: if (ch == ')') {
jtulach@1348: return null; // Inline modifier only
jtulach@1348: }
jtulach@1348: if (ch != ':') {
jtulach@1348: throw error("Unknown inline modifier");
jtulach@1348: }
jtulach@1348: head = createGroup(true);
jtulach@1348: tail = root;
jtulach@1348: head.next = expr(tail);
jtulach@1348: break;
jtulach@1348: }
jtulach@1348: } else { // (xxx) a regular group
jtulach@1348: capturingGroup = true;
jtulach@1348: head = createGroup(false);
jtulach@1348: tail = root;
jtulach@1348: head.next = expr(tail);
jtulach@1348: }
jtulach@1348:
jtulach@1348: accept(')', "Unclosed group");
jtulach@1348: flags = save;
jtulach@1348:
jtulach@1348: // Check for quantifiers
jtulach@1348: Node node = closure(head);
jtulach@1348: if (node == head) { // No closure
jtulach@1348: root = tail;
jtulach@1348: return node; // Dual return
jtulach@1348: }
jtulach@1348: if (head == tail) { // Zero length assertion
jtulach@1348: root = node;
jtulach@1348: return node; // Dual return
jtulach@1348: }
jtulach@1348:
jtulach@1348: if (node instanceof Ques) {
jtulach@1348: Ques ques = (Ques) node;
jtulach@1348: if (ques.type == POSSESSIVE) {
jtulach@1348: root = node;
jtulach@1348: return node;
jtulach@1348: }
jtulach@1348: tail.next = new BranchConn();
jtulach@1348: tail = tail.next;
jtulach@1348: if (ques.type == GREEDY) {
jtulach@1348: head = new Branch(head, null, tail);
jtulach@1348: } else { // Reluctant quantifier
jtulach@1348: head = new Branch(null, head, tail);
jtulach@1348: }
jtulach@1348: root = tail;
jtulach@1348: return head;
jtulach@1348: } else if (node instanceof Curly) {
jtulach@1348: Curly curly = (Curly) node;
jtulach@1348: if (curly.type == POSSESSIVE) {
jtulach@1348: root = node;
jtulach@1348: return node;
jtulach@1348: }
jtulach@1348: // Discover if the group is deterministic
jtulach@1348: TreeInfo info = new TreeInfo();
jtulach@1348: if (head.study(info)) { // Deterministic
jtulach@1348: GroupTail temp = (GroupTail) tail;
jtulach@1348: head = root = new GroupCurly(head.next, curly.cmin,
jtulach@1348: curly.cmax, curly.type,
jtulach@1348: ((GroupTail)tail).localIndex,
jtulach@1348: ((GroupTail)tail).groupIndex,
jtulach@1348: capturingGroup);
jtulach@1348: return head;
jtulach@1348: } else { // Non-deterministic
jtulach@1348: int temp = ((GroupHead) head).localIndex;
jtulach@1348: Loop loop;
jtulach@1348: if (curly.type == GREEDY)
jtulach@1348: loop = new Loop(this.localCount, temp);
jtulach@1348: else // Reluctant Curly
jtulach@1348: loop = new LazyLoop(this.localCount, temp);
jtulach@1348: Prolog prolog = new Prolog(loop);
jtulach@1348: this.localCount += 1;
jtulach@1348: loop.cmin = curly.cmin;
jtulach@1348: loop.cmax = curly.cmax;
jtulach@1348: loop.body = head;
jtulach@1348: tail.next = loop;
jtulach@1348: root = loop;
jtulach@1348: return prolog; // Dual return
jtulach@1348: }
jtulach@1348: }
jtulach@1348: throw error("Internal logic error");
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Create group head and tail nodes using double return. If the group is
jtulach@1348: * created with anonymous true then it is a pure group and should not
jtulach@1348: * affect group counting.
jtulach@1348: */
jtulach@1348: private Node createGroup(boolean anonymous) {
jtulach@1348: int localIndex = localCount++;
jtulach@1348: int groupIndex = 0;
jtulach@1348: if (!anonymous)
jtulach@1348: groupIndex = capturingGroupCount++;
jtulach@1348: GroupHead head = new GroupHead(localIndex);
jtulach@1348: root = new GroupTail(localIndex, groupIndex);
jtulach@1348: if (!anonymous && groupIndex < 10)
jtulach@1348: groupNodes[groupIndex] = head;
jtulach@1348: return head;
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Parses inlined match flags and set them appropriately.
jtulach@1348: */
jtulach@1348: private void addFlag() {
jtulach@1348: int ch = peek();
jtulach@1348: for (;;) {
jtulach@1348: switch (ch) {
jtulach@1348: case 'i':
jtulach@1348: flags |= CASE_INSENSITIVE;
jtulach@1348: break;
jtulach@1348: case 'm':
jtulach@1348: flags |= MULTILINE;
jtulach@1348: break;
jtulach@1348: case 's':
jtulach@1348: flags |= DOTALL;
jtulach@1348: break;
jtulach@1348: case 'd':
jtulach@1348: flags |= UNIX_LINES;
jtulach@1348: break;
jtulach@1348: case 'u':
jtulach@1348: flags |= UNICODE_CASE;
jtulach@1348: break;
jtulach@1348: case 'c':
jtulach@1348: flags |= CANON_EQ;
jtulach@1348: break;
jtulach@1348: case 'x':
jtulach@1348: flags |= COMMENTS;
jtulach@1348: break;
jtulach@1348: case 'U':
jtulach@1348: flags |= (UNICODE_CHARACTER_CLASS | UNICODE_CASE);
jtulach@1348: break;
jtulach@1348: case '-': // subFlag then fall through
jtulach@1348: ch = next();
jtulach@1348: subFlag();
jtulach@1348: default:
jtulach@1348: return;
jtulach@1348: }
jtulach@1348: ch = next();
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Parses the second part of inlined match flags and turns off
jtulach@1348: * flags appropriately.
jtulach@1348: */
jtulach@1348: private void subFlag() {
jtulach@1348: int ch = peek();
jtulach@1348: for (;;) {
jtulach@1348: switch (ch) {
jtulach@1348: case 'i':
jtulach@1348: flags &= ~CASE_INSENSITIVE;
jtulach@1348: break;
jtulach@1348: case 'm':
jtulach@1348: flags &= ~MULTILINE;
jtulach@1348: break;
jtulach@1348: case 's':
jtulach@1348: flags &= ~DOTALL;
jtulach@1348: break;
jtulach@1348: case 'd':
jtulach@1348: flags &= ~UNIX_LINES;
jtulach@1348: break;
jtulach@1348: case 'u':
jtulach@1348: flags &= ~UNICODE_CASE;
jtulach@1348: break;
jtulach@1348: case 'c':
jtulach@1348: flags &= ~CANON_EQ;
jtulach@1348: break;
jtulach@1348: case 'x':
jtulach@1348: flags &= ~COMMENTS;
jtulach@1348: break;
jtulach@1348: case 'U':
jtulach@1348: flags &= ~(UNICODE_CHARACTER_CLASS | UNICODE_CASE);
jtulach@1348: default:
jtulach@1348: return;
jtulach@1348: }
jtulach@1348: ch = next();
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: static final int MAX_REPS = 0x7FFFFFFF;
jtulach@1348:
jtulach@1348: static final int GREEDY = 0;
jtulach@1348:
jtulach@1348: static final int LAZY = 1;
jtulach@1348:
jtulach@1348: static final int POSSESSIVE = 2;
jtulach@1348:
jtulach@1348: static final int INDEPENDENT = 3;
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Processes repetition. If the next character peeked is a quantifier
jtulach@1348: * then new nodes must be appended to handle the repetition.
jtulach@1348: * Prev could be a single or a group, so it could be a chain of nodes.
jtulach@1348: */
jtulach@1348: private Node closure(Node prev) {
jtulach@1348: Node atom;
jtulach@1348: int ch = peek();
jtulach@1348: switch (ch) {
jtulach@1348: case '?':
jtulach@1348: ch = next();
jtulach@1348: if (ch == '?') {
jtulach@1348: next();
jtulach@1348: return new Ques(prev, LAZY);
jtulach@1348: } else if (ch == '+') {
jtulach@1348: next();
jtulach@1348: return new Ques(prev, POSSESSIVE);
jtulach@1348: }
jtulach@1348: return new Ques(prev, GREEDY);
jtulach@1348: case '*':
jtulach@1348: ch = next();
jtulach@1348: if (ch == '?') {
jtulach@1348: next();
jtulach@1348: return new Curly(prev, 0, MAX_REPS, LAZY);
jtulach@1348: } else if (ch == '+') {
jtulach@1348: next();
jtulach@1348: return new Curly(prev, 0, MAX_REPS, POSSESSIVE);
jtulach@1348: }
jtulach@1348: return new Curly(prev, 0, MAX_REPS, GREEDY);
jtulach@1348: case '+':
jtulach@1348: ch = next();
jtulach@1348: if (ch == '?') {
jtulach@1348: next();
jtulach@1348: return new Curly(prev, 1, MAX_REPS, LAZY);
jtulach@1348: } else if (ch == '+') {
jtulach@1348: next();
jtulach@1348: return new Curly(prev, 1, MAX_REPS, POSSESSIVE);
jtulach@1348: }
jtulach@1348: return new Curly(prev, 1, MAX_REPS, GREEDY);
jtulach@1348: case '{':
jtulach@1348: ch = temp[cursor+1];
jtulach@1348: if (ASCII.isDigit(ch)) {
jtulach@1348: skip();
jtulach@1348: int cmin = 0;
jtulach@1348: do {
jtulach@1348: cmin = cmin * 10 + (ch - '0');
jtulach@1348: } while (ASCII.isDigit(ch = read()));
jtulach@1348: int cmax = cmin;
jtulach@1348: if (ch == ',') {
jtulach@1348: ch = read();
jtulach@1348: cmax = MAX_REPS;
jtulach@1348: if (ch != '}') {
jtulach@1348: cmax = 0;
jtulach@1348: while (ASCII.isDigit(ch)) {
jtulach@1348: cmax = cmax * 10 + (ch - '0');
jtulach@1348: ch = read();
jtulach@1348: }
jtulach@1348: }
jtulach@1348: }
jtulach@1348: if (ch != '}')
jtulach@1348: throw error("Unclosed counted closure");
jtulach@1348: if (((cmin) | (cmax) | (cmax - cmin)) < 0)
jtulach@1348: throw error("Illegal repetition range");
jtulach@1348: Curly curly;
jtulach@1348: ch = peek();
jtulach@1348: if (ch == '?') {
jtulach@1348: next();
jtulach@1348: curly = new Curly(prev, cmin, cmax, LAZY);
jtulach@1348: } else if (ch == '+') {
jtulach@1348: next();
jtulach@1348: curly = new Curly(prev, cmin, cmax, POSSESSIVE);
jtulach@1348: } else {
jtulach@1348: curly = new Curly(prev, cmin, cmax, GREEDY);
jtulach@1348: }
jtulach@1348: return curly;
jtulach@1348: } else {
jtulach@1348: throw error("Illegal repetition");
jtulach@1348: }
jtulach@1348: default:
jtulach@1348: return prev;
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Utility method for parsing control escape sequences.
jtulach@1348: */
jtulach@1348: private int c() {
jtulach@1348: if (cursor < patternLength) {
jtulach@1348: return read() ^ 64;
jtulach@1348: }
jtulach@1348: throw error("Illegal control escape sequence");
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Utility method for parsing octal escape sequences.
jtulach@1348: */
jtulach@1348: private int o() {
jtulach@1348: int n = read();
jtulach@1348: if (((n-'0')|('7'-n)) >= 0) {
jtulach@1348: int m = read();
jtulach@1348: if (((m-'0')|('7'-m)) >= 0) {
jtulach@1348: int o = read();
jtulach@1348: if ((((o-'0')|('7'-o)) >= 0) && (((n-'0')|('3'-n)) >= 0)) {
jtulach@1348: return (n - '0') * 64 + (m - '0') * 8 + (o - '0');
jtulach@1348: }
jtulach@1348: unread();
jtulach@1348: return (n - '0') * 8 + (m - '0');
jtulach@1348: }
jtulach@1348: unread();
jtulach@1348: return (n - '0');
jtulach@1348: }
jtulach@1348: throw error("Illegal octal escape sequence");
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Utility method for parsing hexadecimal escape sequences.
jtulach@1348: */
jtulach@1348: private int x() {
jtulach@1348: int n = read();
jtulach@1348: if (ASCII.isHexDigit(n)) {
jtulach@1348: int m = read();
jtulach@1348: if (ASCII.isHexDigit(m)) {
jtulach@1348: return ASCII.toDigit(n) * 16 + ASCII.toDigit(m);
jtulach@1348: }
jtulach@1348: } else if (n == '{' && ASCII.isHexDigit(peek())) {
jtulach@1348: int ch = 0;
jtulach@1348: while (ASCII.isHexDigit(n = read())) {
jtulach@1348: ch = (ch << 4) + ASCII.toDigit(n);
jtulach@1348: if (ch > Character.MAX_CODE_POINT)
jtulach@1348: throw error("Hexadecimal codepoint is too big");
jtulach@1348: }
jtulach@1348: if (n != '}')
jtulach@1348: throw error("Unclosed hexadecimal escape sequence");
jtulach@1348: return ch;
jtulach@1348: }
jtulach@1348: throw error("Illegal hexadecimal escape sequence");
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Utility method for parsing unicode escape sequences.
jtulach@1348: */
jtulach@1348: private int cursor() {
jtulach@1348: return cursor;
jtulach@1348: }
jtulach@1348:
jtulach@1348: private void setcursor(int pos) {
jtulach@1348: cursor = pos;
jtulach@1348: }
jtulach@1348:
jtulach@1348: private int uxxxx() {
jtulach@1348: int n = 0;
jtulach@1348: for (int i = 0; i < 4; i++) {
jtulach@1348: int ch = read();
jtulach@1348: if (!ASCII.isHexDigit(ch)) {
jtulach@1348: throw error("Illegal Unicode escape sequence");
jtulach@1348: }
jtulach@1348: n = n * 16 + ASCII.toDigit(ch);
jtulach@1348: }
jtulach@1348: return n;
jtulach@1348: }
jtulach@1348:
jtulach@1348: private int u() {
jtulach@1348: int n = uxxxx();
jtulach@1348: if (Character.isHighSurrogate((char)n)) {
jtulach@1348: int cur = cursor();
jtulach@1348: if (read() == '\\' && read() == 'u') {
jtulach@1348: int n2 = uxxxx();
jtulach@1348: if (Character.isLowSurrogate((char)n2))
jtulach@1348: return Character.toCodePoint((char)n, (char)n2);
jtulach@1348: }
jtulach@1348: setcursor(cur);
jtulach@1348: }
jtulach@1348: return n;
jtulach@1348: }
jtulach@1348:
jtulach@1348: //
jtulach@1348: // Utility methods for code point support
jtulach@1348: //
jtulach@1348:
jtulach@1348: private static final int countChars(CharSequence seq, int index,
jtulach@1348: int lengthInCodePoints) {
jtulach@1348: // optimization
jtulach@1348: if (lengthInCodePoints == 1 && !Character.isHighSurrogate(seq.charAt(index))) {
jtulach@1348: assert (index >= 0 && index < seq.length());
jtulach@1348: return 1;
jtulach@1348: }
jtulach@1348: int length = seq.length();
jtulach@1348: int x = index;
jtulach@1348: if (lengthInCodePoints >= 0) {
jtulach@1348: assert (index >= 0 && index < length);
jtulach@1348: for (int i = 0; x < length && i < lengthInCodePoints; i++) {
jtulach@1348: if (Character.isHighSurrogate(seq.charAt(x++))) {
jtulach@1348: if (x < length && Character.isLowSurrogate(seq.charAt(x))) {
jtulach@1348: x++;
jtulach@1348: }
jtulach@1348: }
jtulach@1348: }
jtulach@1348: return x - index;
jtulach@1348: }
jtulach@1348:
jtulach@1348: assert (index >= 0 && index <= length);
jtulach@1348: if (index == 0) {
jtulach@1348: return 0;
jtulach@1348: }
jtulach@1348: int len = -lengthInCodePoints;
jtulach@1348: for (int i = 0; x > 0 && i < len; i++) {
jtulach@1348: if (Character.isLowSurrogate(seq.charAt(--x))) {
jtulach@1348: if (x > 0 && Character.isHighSurrogate(seq.charAt(x-1))) {
jtulach@1348: x--;
jtulach@1348: }
jtulach@1348: }
jtulach@1348: }
jtulach@1348: return index - x;
jtulach@1348: }
jtulach@1348:
jtulach@1348: private static final int countCodePoints(CharSequence seq) {
jtulach@1348: int length = seq.length();
jtulach@1348: int n = 0;
jtulach@1348: for (int i = 0; i < length; ) {
jtulach@1348: n++;
jtulach@1348: if (Character.isHighSurrogate(seq.charAt(i++))) {
jtulach@1348: if (i < length && Character.isLowSurrogate(seq.charAt(i))) {
jtulach@1348: i++;
jtulach@1348: }
jtulach@1348: }
jtulach@1348: }
jtulach@1348: return n;
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Creates a bit vector for matching Latin-1 values. A normal BitClass
jtulach@1348: * never matches values above Latin-1, and a complemented BitClass always
jtulach@1348: * matches values above Latin-1.
jtulach@1348: */
jtulach@1348: private static final class BitClass extends BmpCharProperty {
jtulach@1348: final boolean[] bits;
jtulach@1348: BitClass() { bits = new boolean[256]; }
jtulach@1348: private BitClass(boolean[] bits) { this.bits = bits; }
jtulach@1348: BitClass add(int c, int flags) {
jtulach@1348: assert c >= 0 && c <= 255;
jtulach@1348: if ((flags & CASE_INSENSITIVE) != 0) {
jtulach@1348: if (ASCII.isAscii(c)) {
jtulach@1348: bits[ASCII.toUpper(c)] = true;
jtulach@1348: bits[ASCII.toLower(c)] = true;
jtulach@1348: } else if ((flags & UNICODE_CASE) != 0) {
jtulach@1348: bits[Character.toLowerCase(c)] = true;
jtulach@1348: bits[Character.toUpperCase(c)] = true;
jtulach@1348: }
jtulach@1348: }
jtulach@1348: bits[c] = true;
jtulach@1348: return this;
jtulach@1348: }
jtulach@1348: boolean isSatisfiedBy(int ch) {
jtulach@1348: return ch < 256 && bits[ch];
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Returns a suitably optimized, single character matcher.
jtulach@1348: */
jtulach@1348: private CharProperty newSingle(final int ch) {
jtulach@1348: if (has(CASE_INSENSITIVE)) {
jtulach@1348: int lower, upper;
jtulach@1348: if (has(UNICODE_CASE)) {
jtulach@1348: upper = Character.toUpperCase(ch);
jtulach@1348: lower = Character.toLowerCase(upper);
jtulach@1348: if (upper != lower)
jtulach@1348: return new SingleU(lower);
jtulach@1348: } else if (ASCII.isAscii(ch)) {
jtulach@1348: lower = ASCII.toLower(ch);
jtulach@1348: upper = ASCII.toUpper(ch);
jtulach@1348: if (lower != upper)
jtulach@1348: return new SingleI(lower, upper);
jtulach@1348: }
jtulach@1348: }
jtulach@1348: if (isSupplementary(ch))
jtulach@1348: return new SingleS(ch); // Match a given Unicode character
jtulach@1348: return new Single(ch); // Match a given BMP character
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Utility method for creating a string slice matcher.
jtulach@1348: */
jtulach@1348: private Node newSlice(int[] buf, int count, boolean hasSupplementary) {
jtulach@1348: int[] tmp = new int[count];
jtulach@1348: if (has(CASE_INSENSITIVE)) {
jtulach@1348: if (has(UNICODE_CASE)) {
jtulach@1348: for (int i = 0; i < count; i++) {
jtulach@1348: tmp[i] = Character.toLowerCase(
jtulach@1348: Character.toUpperCase(buf[i]));
jtulach@1348: }
jtulach@1348: return hasSupplementary? new SliceUS(tmp) : new SliceU(tmp);
jtulach@1348: }
jtulach@1348: for (int i = 0; i < count; i++) {
jtulach@1348: tmp[i] = ASCII.toLower(buf[i]);
jtulach@1348: }
jtulach@1348: return hasSupplementary? new SliceIS(tmp) : new SliceI(tmp);
jtulach@1348: }
jtulach@1348: for (int i = 0; i < count; i++) {
jtulach@1348: tmp[i] = buf[i];
jtulach@1348: }
jtulach@1348: return hasSupplementary ? new SliceS(tmp) : new Slice(tmp);
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * The following classes are the building components of the object
jtulach@1348: * tree that represents a compiled regular expression. The object tree
jtulach@1348: * is made of individual elements that handle constructs in the Pattern.
jtulach@1348: * Each type of object knows how to match its equivalent construct with
jtulach@1348: * the match() method.
jtulach@1348: */
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Base class for all node classes. Subclasses should override the match()
jtulach@1348: * method as appropriate. This class is an accepting node, so its match()
jtulach@1348: * always returns true.
jtulach@1348: */
jtulach@1348: static class Node extends Object {
jtulach@1348: Node next;
jtulach@1348: Node() {
jtulach@1348: next = Pattern.accept;
jtulach@1348: }
jtulach@1348: /**
jtulach@1348: * This method implements the classic accept node.
jtulach@1348: */
jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) {
jtulach@1348: matcher.last = i;
jtulach@1348: matcher.groups[0] = matcher.first;
jtulach@1348: matcher.groups[1] = matcher.last;
jtulach@1348: return true;
jtulach@1348: }
jtulach@1348: /**
jtulach@1348: * This method is good for all zero length assertions.
jtulach@1348: */
jtulach@1348: boolean study(TreeInfo info) {
jtulach@1348: if (next != null) {
jtulach@1348: return next.study(info);
jtulach@1348: } else {
jtulach@1348: return info.deterministic;
jtulach@1348: }
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: static class LastNode extends Node {
jtulach@1348: /**
jtulach@1348: * This method implements the classic accept node with
jtulach@1348: * the addition of a check to see if the match occurred
jtulach@1348: * using all of the input.
jtulach@1348: */
jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) {
jtulach@1348: if (matcher.acceptMode == Matcher.ENDANCHOR && i != matcher.to)
jtulach@1348: return false;
jtulach@1348: matcher.last = i;
jtulach@1348: matcher.groups[0] = matcher.first;
jtulach@1348: matcher.groups[1] = matcher.last;
jtulach@1348: return true;
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Used for REs that can start anywhere within the input string.
jtulach@1348: * This basically tries to match repeatedly at each spot in the
jtulach@1348: * input string, moving forward after each try. An anchored search
jtulach@1348: * or a BnM will bypass this node completely.
jtulach@1348: */
jtulach@1348: static class Start extends Node {
jtulach@1348: int minLength;
jtulach@1348: Start(Node node) {
jtulach@1348: this.next = node;
jtulach@1348: TreeInfo info = new TreeInfo();
jtulach@1348: next.study(info);
jtulach@1348: minLength = info.minLength;
jtulach@1348: }
jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) {
jtulach@1348: if (i > matcher.to - minLength) {
jtulach@1348: matcher.hitEnd = true;
jtulach@1348: return false;
jtulach@1348: }
jtulach@1348: int guard = matcher.to - minLength;
jtulach@1348: for (; i <= guard; i++) {
jtulach@1348: if (next.match(matcher, i, seq)) {
jtulach@1348: matcher.first = i;
jtulach@1348: matcher.groups[0] = matcher.first;
jtulach@1348: matcher.groups[1] = matcher.last;
jtulach@1348: return true;
jtulach@1348: }
jtulach@1348: }
jtulach@1348: matcher.hitEnd = true;
jtulach@1348: return false;
jtulach@1348: }
jtulach@1348: boolean study(TreeInfo info) {
jtulach@1348: next.study(info);
jtulach@1348: info.maxValid = false;
jtulach@1348: info.deterministic = false;
jtulach@1348: return false;
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: /*
jtulach@1348: * StartS supports supplementary characters, including unpaired surrogates.
jtulach@1348: */
jtulach@1348: static final class StartS extends Start {
jtulach@1348: StartS(Node node) {
jtulach@1348: super(node);
jtulach@1348: }
jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) {
jtulach@1348: if (i > matcher.to - minLength) {
jtulach@1348: matcher.hitEnd = true;
jtulach@1348: return false;
jtulach@1348: }
jtulach@1348: int guard = matcher.to - minLength;
jtulach@1348: while (i <= guard) {
jtulach@1348: //if ((ret = next.match(matcher, i, seq)) || i == guard)
jtulach@1348: if (next.match(matcher, i, seq)) {
jtulach@1348: matcher.first = i;
jtulach@1348: matcher.groups[0] = matcher.first;
jtulach@1348: matcher.groups[1] = matcher.last;
jtulach@1348: return true;
jtulach@1348: }
jtulach@1348: if (i == guard)
jtulach@1348: break;
jtulach@1348: // Optimization to move to the next character. This is
jtulach@1348: // faster than countChars(seq, i, 1).
jtulach@1348: if (Character.isHighSurrogate(seq.charAt(i++))) {
jtulach@1348: if (i < seq.length() &&
jtulach@1348: Character.isLowSurrogate(seq.charAt(i))) {
jtulach@1348: i++;
jtulach@1348: }
jtulach@1348: }
jtulach@1348: }
jtulach@1348: matcher.hitEnd = true;
jtulach@1348: return false;
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Node to anchor at the beginning of input. This object implements the
jtulach@1348: * match for a \A sequence, and the caret anchor will use this if not in
jtulach@1348: * multiline mode.
jtulach@1348: */
jtulach@1348: static final class Begin extends Node {
jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) {
jtulach@1348: int fromIndex = (matcher.anchoringBounds) ?
jtulach@1348: matcher.from : 0;
jtulach@1348: if (i == fromIndex && next.match(matcher, i, seq)) {
jtulach@1348: matcher.first = i;
jtulach@1348: matcher.groups[0] = i;
jtulach@1348: matcher.groups[1] = matcher.last;
jtulach@1348: return true;
jtulach@1348: } else {
jtulach@1348: return false;
jtulach@1348: }
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Node to anchor at the end of input. This is the absolute end, so this
jtulach@1348: * should not match at the last newline before the end as $ will.
jtulach@1348: */
jtulach@1348: static final class End extends Node {
jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) {
jtulach@1348: int endIndex = (matcher.anchoringBounds) ?
jtulach@1348: matcher.to : matcher.getTextLength();
jtulach@1348: if (i == endIndex) {
jtulach@1348: matcher.hitEnd = true;
jtulach@1348: return next.match(matcher, i, seq);
jtulach@1348: }
jtulach@1348: return false;
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Node to anchor at the beginning of a line. This is essentially the
jtulach@1348: * object to match for the multiline ^.
jtulach@1348: */
jtulach@1348: static final class Caret extends Node {
jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) {
jtulach@1348: int startIndex = matcher.from;
jtulach@1348: int endIndex = matcher.to;
jtulach@1348: if (!matcher.anchoringBounds) {
jtulach@1348: startIndex = 0;
jtulach@1348: endIndex = matcher.getTextLength();
jtulach@1348: }
jtulach@1348: // Perl does not match ^ at end of input even after newline
jtulach@1348: if (i == endIndex) {
jtulach@1348: matcher.hitEnd = true;
jtulach@1348: return false;
jtulach@1348: }
jtulach@1348: if (i > startIndex) {
jtulach@1348: char ch = seq.charAt(i-1);
jtulach@1348: if (ch != '\n' && ch != '\r'
jtulach@1348: && (ch|1) != '\u2029'
jtulach@1348: && ch != '\u0085' ) {
jtulach@1348: return false;
jtulach@1348: }
jtulach@1348: // Should treat /r/n as one newline
jtulach@1348: if (ch == '\r' && seq.charAt(i) == '\n')
jtulach@1348: return false;
jtulach@1348: }
jtulach@1348: return next.match(matcher, i, seq);
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Node to anchor at the beginning of a line when in unixdot mode.
jtulach@1348: */
jtulach@1348: static final class UnixCaret extends Node {
jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) {
jtulach@1348: int startIndex = matcher.from;
jtulach@1348: int endIndex = matcher.to;
jtulach@1348: if (!matcher.anchoringBounds) {
jtulach@1348: startIndex = 0;
jtulach@1348: endIndex = matcher.getTextLength();
jtulach@1348: }
jtulach@1348: // Perl does not match ^ at end of input even after newline
jtulach@1348: if (i == endIndex) {
jtulach@1348: matcher.hitEnd = true;
jtulach@1348: return false;
jtulach@1348: }
jtulach@1348: if (i > startIndex) {
jtulach@1348: char ch = seq.charAt(i-1);
jtulach@1348: if (ch != '\n') {
jtulach@1348: return false;
jtulach@1348: }
jtulach@1348: }
jtulach@1348: return next.match(matcher, i, seq);
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Node to match the location where the last match ended.
jtulach@1348: * This is used for the \G construct.
jtulach@1348: */
jtulach@1348: static final class LastMatch extends Node {
jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) {
jtulach@1348: if (i != matcher.oldLast)
jtulach@1348: return false;
jtulach@1348: return next.match(matcher, i, seq);
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Node to anchor at the end of a line or the end of input based on the
jtulach@1348: * multiline mode.
jtulach@1348: *
jtulach@1348: * When not in multiline mode, the $ can only match at the very end
jtulach@1348: * of the input, unless the input ends in a line terminator in which
jtulach@1348: * it matches right before the last line terminator.
jtulach@1348: *
jtulach@1348: * Note that \r\n is considered an atomic line terminator.
jtulach@1348: *
jtulach@1348: * Like ^ the $ operator matches at a position, it does not match the
jtulach@1348: * line terminators themselves.
jtulach@1348: */
jtulach@1348: static final class Dollar extends Node {
jtulach@1348: boolean multiline;
jtulach@1348: Dollar(boolean mul) {
jtulach@1348: multiline = mul;
jtulach@1348: }
jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) {
jtulach@1348: int endIndex = (matcher.anchoringBounds) ?
jtulach@1348: matcher.to : matcher.getTextLength();
jtulach@1348: if (!multiline) {
jtulach@1348: if (i < endIndex - 2)
jtulach@1348: return false;
jtulach@1348: if (i == endIndex - 2) {
jtulach@1348: char ch = seq.charAt(i);
jtulach@1348: if (ch != '\r')
jtulach@1348: return false;
jtulach@1348: ch = seq.charAt(i + 1);
jtulach@1348: if (ch != '\n')
jtulach@1348: return false;
jtulach@1348: }
jtulach@1348: }
jtulach@1348: // Matches before any line terminator; also matches at the
jtulach@1348: // end of input
jtulach@1348: // Before line terminator:
jtulach@1348: // If multiline, we match here no matter what
jtulach@1348: // If not multiline, fall through so that the end
jtulach@1348: // is marked as hit; this must be a /r/n or a /n
jtulach@1348: // at the very end so the end was hit; more input
jtulach@1348: // could make this not match here
jtulach@1348: if (i < endIndex) {
jtulach@1348: char ch = seq.charAt(i);
jtulach@1348: if (ch == '\n') {
jtulach@1348: // No match between \r\n
jtulach@1348: if (i > 0 && seq.charAt(i-1) == '\r')
jtulach@1348: return false;
jtulach@1348: if (multiline)
jtulach@1348: return next.match(matcher, i, seq);
jtulach@1348: } else if (ch == '\r' || ch == '\u0085' ||
jtulach@1348: (ch|1) == '\u2029') {
jtulach@1348: if (multiline)
jtulach@1348: return next.match(matcher, i, seq);
jtulach@1348: } else { // No line terminator, no match
jtulach@1348: return false;
jtulach@1348: }
jtulach@1348: }
jtulach@1348: // Matched at current end so hit end
jtulach@1348: matcher.hitEnd = true;
jtulach@1348: // If a $ matches because of end of input, then more input
jtulach@1348: // could cause it to fail!
jtulach@1348: matcher.requireEnd = true;
jtulach@1348: return next.match(matcher, i, seq);
jtulach@1348: }
jtulach@1348: boolean study(TreeInfo info) {
jtulach@1348: next.study(info);
jtulach@1348: return info.deterministic;
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Node to anchor at the end of a line or the end of input based on the
jtulach@1348: * multiline mode when in unix lines mode.
jtulach@1348: */
jtulach@1348: static final class UnixDollar extends Node {
jtulach@1348: boolean multiline;
jtulach@1348: UnixDollar(boolean mul) {
jtulach@1348: multiline = mul;
jtulach@1348: }
jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) {
jtulach@1348: int endIndex = (matcher.anchoringBounds) ?
jtulach@1348: matcher.to : matcher.getTextLength();
jtulach@1348: if (i < endIndex) {
jtulach@1348: char ch = seq.charAt(i);
jtulach@1348: if (ch == '\n') {
jtulach@1348: // If not multiline, then only possible to
jtulach@1348: // match at very end or one before end
jtulach@1348: if (multiline == false && i != endIndex - 1)
jtulach@1348: return false;
jtulach@1348: // If multiline return next.match without setting
jtulach@1348: // matcher.hitEnd
jtulach@1348: if (multiline)
jtulach@1348: return next.match(matcher, i, seq);
jtulach@1348: } else {
jtulach@1348: return false;
jtulach@1348: }
jtulach@1348: }
jtulach@1348: // Matching because at the end or 1 before the end;
jtulach@1348: // more input could change this so set hitEnd
jtulach@1348: matcher.hitEnd = true;
jtulach@1348: // If a $ matches because of end of input, then more input
jtulach@1348: // could cause it to fail!
jtulach@1348: matcher.requireEnd = true;
jtulach@1348: return next.match(matcher, i, seq);
jtulach@1348: }
jtulach@1348: boolean study(TreeInfo info) {
jtulach@1348: next.study(info);
jtulach@1348: return info.deterministic;
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Abstract node class to match one character satisfying some
jtulach@1348: * boolean property.
jtulach@1348: */
jtulach@1348: private static abstract class CharProperty extends Node {
jtulach@1348: abstract boolean isSatisfiedBy(int ch);
jtulach@1348: CharProperty complement() {
jtulach@1348: return new CharProperty() {
jtulach@1348: boolean isSatisfiedBy(int ch) {
jtulach@1348: return ! CharProperty.this.isSatisfiedBy(ch);}};
jtulach@1348: }
jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) {
jtulach@1348: if (i < matcher.to) {
jtulach@1348: int ch = Character.codePointAt(seq, i);
jtulach@1348: return isSatisfiedBy(ch)
jtulach@1348: && next.match(matcher, i+Character.charCount(ch), seq);
jtulach@1348: } else {
jtulach@1348: matcher.hitEnd = true;
jtulach@1348: return false;
jtulach@1348: }
jtulach@1348: }
jtulach@1348: boolean study(TreeInfo info) {
jtulach@1348: info.minLength++;
jtulach@1348: info.maxLength++;
jtulach@1348: return next.study(info);
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Optimized version of CharProperty that works only for
jtulach@1348: * properties never satisfied by Supplementary characters.
jtulach@1348: */
jtulach@1348: private static abstract class BmpCharProperty extends CharProperty {
jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) {
jtulach@1348: if (i < matcher.to) {
jtulach@1348: return isSatisfiedBy(seq.charAt(i))
jtulach@1348: && next.match(matcher, i+1, seq);
jtulach@1348: } else {
jtulach@1348: matcher.hitEnd = true;
jtulach@1348: return false;
jtulach@1348: }
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Node class that matches a Supplementary Unicode character
jtulach@1348: */
jtulach@1348: static final class SingleS extends CharProperty {
jtulach@1348: final int c;
jtulach@1348: SingleS(int c) { this.c = c; }
jtulach@1348: boolean isSatisfiedBy(int ch) {
jtulach@1348: return ch == c;
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Optimization -- matches a given BMP character
jtulach@1348: */
jtulach@1348: static final class Single extends BmpCharProperty {
jtulach@1348: final int c;
jtulach@1348: Single(int c) { this.c = c; }
jtulach@1348: boolean isSatisfiedBy(int ch) {
jtulach@1348: return ch == c;
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Case insensitive matches a given BMP character
jtulach@1348: */
jtulach@1348: static final class SingleI extends BmpCharProperty {
jtulach@1348: final int lower;
jtulach@1348: final int upper;
jtulach@1348: SingleI(int lower, int upper) {
jtulach@1348: this.lower = lower;
jtulach@1348: this.upper = upper;
jtulach@1348: }
jtulach@1348: boolean isSatisfiedBy(int ch) {
jtulach@1348: return ch == lower || ch == upper;
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Unicode case insensitive matches a given Unicode character
jtulach@1348: */
jtulach@1348: static final class SingleU extends CharProperty {
jtulach@1348: final int lower;
jtulach@1348: SingleU(int lower) {
jtulach@1348: this.lower = lower;
jtulach@1348: }
jtulach@1348: boolean isSatisfiedBy(int ch) {
jtulach@1348: return lower == ch ||
jtulach@1348: lower == Character.toLowerCase(Character.toUpperCase(ch));
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Node class that matches a Unicode block.
jtulach@1350: *
jtulach@1348: static final class Block extends CharProperty {
jtulach@1348: final Character.UnicodeBlock block;
jtulach@1348: Block(Character.UnicodeBlock block) {
jtulach@1348: this.block = block;
jtulach@1348: }
jtulach@1348: boolean isSatisfiedBy(int ch) {
jtulach@1348: return block == Character.UnicodeBlock.of(ch);
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Node class that matches a Unicode script
jtulach@1350: *
jtulach@1348: static final class Script extends CharProperty {
jtulach@1348: final Character.UnicodeScript script;
jtulach@1348: Script(Character.UnicodeScript script) {
jtulach@1348: this.script = script;
jtulach@1348: }
jtulach@1348: boolean isSatisfiedBy(int ch) {
jtulach@1348: return script == Character.UnicodeScript.of(ch);
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Node class that matches a Unicode category.
jtulach@1348: */
jtulach@1348: static final class Category extends CharProperty {
jtulach@1348: final int typeMask;
jtulach@1348: Category(int typeMask) { this.typeMask = typeMask; }
jtulach@1348: boolean isSatisfiedBy(int ch) {
jtulach@1348: return (typeMask & (1 << Character.getType(ch))) != 0;
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Node class that matches a Unicode "type"
jtulach@1348: */
jtulach@1348: static final class Utype extends CharProperty {
jtulach@1348: final UnicodeProp uprop;
jtulach@1348: Utype(UnicodeProp uprop) { this.uprop = uprop; }
jtulach@1348: boolean isSatisfiedBy(int ch) {
jtulach@1348: return uprop.is(ch);
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Node class that matches a POSIX type.
jtulach@1348: */
jtulach@1348: static final class Ctype extends BmpCharProperty {
jtulach@1348: final int ctype;
jtulach@1348: Ctype(int ctype) { this.ctype = ctype; }
jtulach@1348: boolean isSatisfiedBy(int ch) {
jtulach@1348: return ch < 128 && ASCII.isType(ch, ctype);
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Base class for all Slice nodes
jtulach@1348: */
jtulach@1348: static class SliceNode extends Node {
jtulach@1348: int[] buffer;
jtulach@1348: SliceNode(int[] buf) {
jtulach@1348: buffer = buf;
jtulach@1348: }
jtulach@1348: boolean study(TreeInfo info) {
jtulach@1348: info.minLength += buffer.length;
jtulach@1348: info.maxLength += buffer.length;
jtulach@1348: return next.study(info);
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Node class for a case sensitive/BMP-only sequence of literal
jtulach@1348: * characters.
jtulach@1348: */
jtulach@1348: static final class Slice extends SliceNode {
jtulach@1348: Slice(int[] buf) {
jtulach@1348: super(buf);
jtulach@1348: }
jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) {
jtulach@1348: int[] buf = buffer;
jtulach@1348: int len = buf.length;
jtulach@1348: for (int j=0; j= matcher.to) {
jtulach@1348: matcher.hitEnd = true;
jtulach@1348: return false;
jtulach@1348: }
jtulach@1348: if (buf[j] != seq.charAt(i+j))
jtulach@1348: return false;
jtulach@1348: }
jtulach@1348: return next.match(matcher, i+len, seq);
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Node class for a case_insensitive/BMP-only sequence of literal
jtulach@1348: * characters.
jtulach@1348: */
jtulach@1348: static class SliceI extends SliceNode {
jtulach@1348: SliceI(int[] buf) {
jtulach@1348: super(buf);
jtulach@1348: }
jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) {
jtulach@1348: int[] buf = buffer;
jtulach@1348: int len = buf.length;
jtulach@1348: for (int j=0; j= matcher.to) {
jtulach@1348: matcher.hitEnd = true;
jtulach@1348: return false;
jtulach@1348: }
jtulach@1348: int c = seq.charAt(i+j);
jtulach@1348: if (buf[j] != c &&
jtulach@1348: buf[j] != ASCII.toLower(c))
jtulach@1348: return false;
jtulach@1348: }
jtulach@1348: return next.match(matcher, i+len, seq);
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Node class for a unicode_case_insensitive/BMP-only sequence of
jtulach@1348: * literal characters. Uses unicode case folding.
jtulach@1348: */
jtulach@1348: static final class SliceU extends SliceNode {
jtulach@1348: SliceU(int[] buf) {
jtulach@1348: super(buf);
jtulach@1348: }
jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) {
jtulach@1348: int[] buf = buffer;
jtulach@1348: int len = buf.length;
jtulach@1348: for (int j=0; j= matcher.to) {
jtulach@1348: matcher.hitEnd = true;
jtulach@1348: return false;
jtulach@1348: }
jtulach@1348: int c = seq.charAt(i+j);
jtulach@1348: if (buf[j] != c &&
jtulach@1348: buf[j] != Character.toLowerCase(Character.toUpperCase(c)))
jtulach@1348: return false;
jtulach@1348: }
jtulach@1348: return next.match(matcher, i+len, seq);
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Node class for a case sensitive sequence of literal characters
jtulach@1348: * including supplementary characters.
jtulach@1348: */
jtulach@1348: static final class SliceS extends SliceNode {
jtulach@1348: SliceS(int[] buf) {
jtulach@1348: super(buf);
jtulach@1348: }
jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) {
jtulach@1348: int[] buf = buffer;
jtulach@1348: int x = i;
jtulach@1348: for (int j = 0; j < buf.length; j++) {
jtulach@1348: if (x >= matcher.to) {
jtulach@1348: matcher.hitEnd = true;
jtulach@1348: return false;
jtulach@1348: }
jtulach@1348: int c = Character.codePointAt(seq, x);
jtulach@1348: if (buf[j] != c)
jtulach@1348: return false;
jtulach@1348: x += Character.charCount(c);
jtulach@1348: if (x > matcher.to) {
jtulach@1348: matcher.hitEnd = true;
jtulach@1348: return false;
jtulach@1348: }
jtulach@1348: }
jtulach@1348: return next.match(matcher, x, seq);
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Node class for a case insensitive sequence of literal characters
jtulach@1348: * including supplementary characters.
jtulach@1348: */
jtulach@1348: static class SliceIS extends SliceNode {
jtulach@1348: SliceIS(int[] buf) {
jtulach@1348: super(buf);
jtulach@1348: }
jtulach@1348: int toLower(int c) {
jtulach@1348: return ASCII.toLower(c);
jtulach@1348: }
jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) {
jtulach@1348: int[] buf = buffer;
jtulach@1348: int x = i;
jtulach@1348: for (int j = 0; j < buf.length; j++) {
jtulach@1348: if (x >= matcher.to) {
jtulach@1348: matcher.hitEnd = true;
jtulach@1348: return false;
jtulach@1348: }
jtulach@1348: int c = Character.codePointAt(seq, x);
jtulach@1348: if (buf[j] != c && buf[j] != toLower(c))
jtulach@1348: return false;
jtulach@1348: x += Character.charCount(c);
jtulach@1348: if (x > matcher.to) {
jtulach@1348: matcher.hitEnd = true;
jtulach@1348: return false;
jtulach@1348: }
jtulach@1348: }
jtulach@1348: return next.match(matcher, x, seq);
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Node class for a case insensitive sequence of literal characters.
jtulach@1348: * Uses unicode case folding.
jtulach@1348: */
jtulach@1348: static final class SliceUS extends SliceIS {
jtulach@1348: SliceUS(int[] buf) {
jtulach@1348: super(buf);
jtulach@1348: }
jtulach@1348: int toLower(int c) {
jtulach@1348: return Character.toLowerCase(Character.toUpperCase(c));
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: private static boolean inRange(int lower, int ch, int upper) {
jtulach@1348: return lower <= ch && ch <= upper;
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Returns node for matching characters within an explicit value range.
jtulach@1348: */
jtulach@1348: private static CharProperty rangeFor(final int lower,
jtulach@1348: final int upper) {
jtulach@1348: return new CharProperty() {
jtulach@1348: boolean isSatisfiedBy(int ch) {
jtulach@1348: return inRange(lower, ch, upper);}};
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Returns node for matching characters within an explicit value
jtulach@1348: * range in a case insensitive manner.
jtulach@1348: */
jtulach@1348: private CharProperty caseInsensitiveRangeFor(final int lower,
jtulach@1348: final int upper) {
jtulach@1348: if (has(UNICODE_CASE))
jtulach@1348: return new CharProperty() {
jtulach@1348: boolean isSatisfiedBy(int ch) {
jtulach@1348: if (inRange(lower, ch, upper))
jtulach@1348: return true;
jtulach@1348: int up = Character.toUpperCase(ch);
jtulach@1348: return inRange(lower, up, upper) ||
jtulach@1348: inRange(lower, Character.toLowerCase(up), upper);}};
jtulach@1348: return new CharProperty() {
jtulach@1348: boolean isSatisfiedBy(int ch) {
jtulach@1348: return inRange(lower, ch, upper) ||
jtulach@1348: ASCII.isAscii(ch) &&
jtulach@1348: (inRange(lower, ASCII.toUpper(ch), upper) ||
jtulach@1348: inRange(lower, ASCII.toLower(ch), upper));
jtulach@1348: }};
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Implements the Unicode category ALL and the dot metacharacter when
jtulach@1348: * in dotall mode.
jtulach@1348: */
jtulach@1348: static final class All extends CharProperty {
jtulach@1348: boolean isSatisfiedBy(int ch) {
jtulach@1348: return true;
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Node class for the dot metacharacter when dotall is not enabled.
jtulach@1348: */
jtulach@1348: static final class Dot extends CharProperty {
jtulach@1348: boolean isSatisfiedBy(int ch) {
jtulach@1348: return (ch != '\n' && ch != '\r'
jtulach@1348: && (ch|1) != '\u2029'
jtulach@1348: && ch != '\u0085');
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Node class for the dot metacharacter when dotall is not enabled
jtulach@1348: * but UNIX_LINES is enabled.
jtulach@1348: */
jtulach@1348: static final class UnixDot extends CharProperty {
jtulach@1348: boolean isSatisfiedBy(int ch) {
jtulach@1348: return ch != '\n';
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * The 0 or 1 quantifier. This one class implements all three types.
jtulach@1348: */
jtulach@1348: static final class Ques extends Node {
jtulach@1348: Node atom;
jtulach@1348: int type;
jtulach@1348: Ques(Node node, int type) {
jtulach@1348: this.atom = node;
jtulach@1348: this.type = type;
jtulach@1348: }
jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) {
jtulach@1348: switch (type) {
jtulach@1348: case GREEDY:
jtulach@1348: return (atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq))
jtulach@1348: || next.match(matcher, i, seq);
jtulach@1348: case LAZY:
jtulach@1348: return next.match(matcher, i, seq)
jtulach@1348: || (atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq));
jtulach@1348: case POSSESSIVE:
jtulach@1348: if (atom.match(matcher, i, seq)) i = matcher.last;
jtulach@1348: return next.match(matcher, i, seq);
jtulach@1348: default:
jtulach@1348: return atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq);
jtulach@1348: }
jtulach@1348: }
jtulach@1348: boolean study(TreeInfo info) {
jtulach@1348: if (type != INDEPENDENT) {
jtulach@1348: int minL = info.minLength;
jtulach@1348: atom.study(info);
jtulach@1348: info.minLength = minL;
jtulach@1348: info.deterministic = false;
jtulach@1348: return next.study(info);
jtulach@1348: } else {
jtulach@1348: atom.study(info);
jtulach@1348: return next.study(info);
jtulach@1348: }
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Handles the curly-brace style repetition with a specified minimum and
jtulach@1348: * maximum occurrences. The * quantifier is handled as a special case.
jtulach@1348: * This class handles the three types.
jtulach@1348: */
jtulach@1348: static final class Curly extends Node {
jtulach@1348: Node atom;
jtulach@1348: int type;
jtulach@1348: int cmin;
jtulach@1348: int cmax;
jtulach@1348:
jtulach@1348: Curly(Node node, int cmin, int cmax, int type) {
jtulach@1348: this.atom = node;
jtulach@1348: this.type = type;
jtulach@1348: this.cmin = cmin;
jtulach@1348: this.cmax = cmax;
jtulach@1348: }
jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) {
jtulach@1348: int j;
jtulach@1348: for (j = 0; j < cmin; j++) {
jtulach@1348: if (atom.match(matcher, i, seq)) {
jtulach@1348: i = matcher.last;
jtulach@1348: continue;
jtulach@1348: }
jtulach@1348: return false;
jtulach@1348: }
jtulach@1348: if (type == GREEDY)
jtulach@1348: return match0(matcher, i, j, seq);
jtulach@1348: else if (type == LAZY)
jtulach@1348: return match1(matcher, i, j, seq);
jtulach@1348: else
jtulach@1348: return match2(matcher, i, j, seq);
jtulach@1348: }
jtulach@1348: // Greedy match.
jtulach@1348: // i is the index to start matching at
jtulach@1348: // j is the number of atoms that have matched
jtulach@1348: boolean match0(Matcher matcher, int i, int j, CharSequence seq) {
jtulach@1348: if (j >= cmax) {
jtulach@1348: // We have matched the maximum... continue with the rest of
jtulach@1348: // the regular expression
jtulach@1348: return next.match(matcher, i, seq);
jtulach@1348: }
jtulach@1348: int backLimit = j;
jtulach@1348: while (atom.match(matcher, i, seq)) {
jtulach@1348: // k is the length of this match
jtulach@1348: int k = matcher.last - i;
jtulach@1348: if (k == 0) // Zero length match
jtulach@1348: break;
jtulach@1348: // Move up index and number matched
jtulach@1348: i = matcher.last;
jtulach@1348: j++;
jtulach@1348: // We are greedy so match as many as we can
jtulach@1348: while (j < cmax) {
jtulach@1348: if (!atom.match(matcher, i, seq))
jtulach@1348: break;
jtulach@1348: if (i + k != matcher.last) {
jtulach@1348: if (match0(matcher, matcher.last, j+1, seq))
jtulach@1348: return true;
jtulach@1348: break;
jtulach@1348: }
jtulach@1348: i += k;
jtulach@1348: j++;
jtulach@1348: }
jtulach@1348: // Handle backing off if match fails
jtulach@1348: while (j >= backLimit) {
jtulach@1348: if (next.match(matcher, i, seq))
jtulach@1348: return true;
jtulach@1348: i -= k;
jtulach@1348: j--;
jtulach@1348: }
jtulach@1348: return false;
jtulach@1348: }
jtulach@1348: return next.match(matcher, i, seq);
jtulach@1348: }
jtulach@1348: // Reluctant match. At this point, the minimum has been satisfied.
jtulach@1348: // i is the index to start matching at
jtulach@1348: // j is the number of atoms that have matched
jtulach@1348: boolean match1(Matcher matcher, int i, int j, CharSequence seq) {
jtulach@1348: for (;;) {
jtulach@1348: // Try finishing match without consuming any more
jtulach@1348: if (next.match(matcher, i, seq))
jtulach@1348: return true;
jtulach@1348: // At the maximum, no match found
jtulach@1348: if (j >= cmax)
jtulach@1348: return false;
jtulach@1348: // Okay, must try one more atom
jtulach@1348: if (!atom.match(matcher, i, seq))
jtulach@1348: return false;
jtulach@1348: // If we haven't moved forward then must break out
jtulach@1348: if (i == matcher.last)
jtulach@1348: return false;
jtulach@1348: // Move up index and number matched
jtulach@1348: i = matcher.last;
jtulach@1348: j++;
jtulach@1348: }
jtulach@1348: }
jtulach@1348: boolean match2(Matcher matcher, int i, int j, CharSequence seq) {
jtulach@1348: for (; j < cmax; j++) {
jtulach@1348: if (!atom.match(matcher, i, seq))
jtulach@1348: break;
jtulach@1348: if (i == matcher.last)
jtulach@1348: break;
jtulach@1348: i = matcher.last;
jtulach@1348: }
jtulach@1348: return next.match(matcher, i, seq);
jtulach@1348: }
jtulach@1348: boolean study(TreeInfo info) {
jtulach@1348: // Save original info
jtulach@1348: int minL = info.minLength;
jtulach@1348: int maxL = info.maxLength;
jtulach@1348: boolean maxV = info.maxValid;
jtulach@1348: boolean detm = info.deterministic;
jtulach@1348: info.reset();
jtulach@1348:
jtulach@1348: atom.study(info);
jtulach@1348:
jtulach@1348: int temp = info.minLength * cmin + minL;
jtulach@1348: if (temp < minL) {
jtulach@1348: temp = 0xFFFFFFF; // arbitrary large number
jtulach@1348: }
jtulach@1348: info.minLength = temp;
jtulach@1348:
jtulach@1348: if (maxV & info.maxValid) {
jtulach@1348: temp = info.maxLength * cmax + maxL;
jtulach@1348: info.maxLength = temp;
jtulach@1348: if (temp < maxL) {
jtulach@1348: info.maxValid = false;
jtulach@1348: }
jtulach@1348: } else {
jtulach@1348: info.maxValid = false;
jtulach@1348: }
jtulach@1348:
jtulach@1348: if (info.deterministic && cmin == cmax)
jtulach@1348: info.deterministic = detm;
jtulach@1348: else
jtulach@1348: info.deterministic = false;
jtulach@1348:
jtulach@1348: return next.study(info);
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Handles the curly-brace style repetition with a specified minimum and
jtulach@1348: * maximum occurrences in deterministic cases. This is an iterative
jtulach@1348: * optimization over the Prolog and Loop system which would handle this
jtulach@1348: * in a recursive way. The * quantifier is handled as a special case.
jtulach@1348: * If capture is true then this class saves group settings and ensures
jtulach@1348: * that groups are unset when backing off of a group match.
jtulach@1348: */
jtulach@1348: static final class GroupCurly extends Node {
jtulach@1348: Node atom;
jtulach@1348: int type;
jtulach@1348: int cmin;
jtulach@1348: int cmax;
jtulach@1348: int localIndex;
jtulach@1348: int groupIndex;
jtulach@1348: boolean capture;
jtulach@1348:
jtulach@1348: GroupCurly(Node node, int cmin, int cmax, int type, int local,
jtulach@1348: int group, boolean capture) {
jtulach@1348: this.atom = node;
jtulach@1348: this.type = type;
jtulach@1348: this.cmin = cmin;
jtulach@1348: this.cmax = cmax;
jtulach@1348: this.localIndex = local;
jtulach@1348: this.groupIndex = group;
jtulach@1348: this.capture = capture;
jtulach@1348: }
jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) {
jtulach@1348: int[] groups = matcher.groups;
jtulach@1348: int[] locals = matcher.locals;
jtulach@1348: int save0 = locals[localIndex];
jtulach@1348: int save1 = 0;
jtulach@1348: int save2 = 0;
jtulach@1348:
jtulach@1348: if (capture) {
jtulach@1348: save1 = groups[groupIndex];
jtulach@1348: save2 = groups[groupIndex+1];
jtulach@1348: }
jtulach@1348:
jtulach@1348: // Notify GroupTail there is no need to setup group info
jtulach@1348: // because it will be set here
jtulach@1348: locals[localIndex] = -1;
jtulach@1348:
jtulach@1348: boolean ret = true;
jtulach@1348: for (int j = 0; j < cmin; j++) {
jtulach@1348: if (atom.match(matcher, i, seq)) {
jtulach@1348: if (capture) {
jtulach@1348: groups[groupIndex] = i;
jtulach@1348: groups[groupIndex+1] = matcher.last;
jtulach@1348: }
jtulach@1348: i = matcher.last;
jtulach@1348: } else {
jtulach@1348: ret = false;
jtulach@1348: break;
jtulach@1348: }
jtulach@1348: }
jtulach@1348: if (ret) {
jtulach@1348: if (type == GREEDY) {
jtulach@1348: ret = match0(matcher, i, cmin, seq);
jtulach@1348: } else if (type == LAZY) {
jtulach@1348: ret = match1(matcher, i, cmin, seq);
jtulach@1348: } else {
jtulach@1348: ret = match2(matcher, i, cmin, seq);
jtulach@1348: }
jtulach@1348: }
jtulach@1348: if (!ret) {
jtulach@1348: locals[localIndex] = save0;
jtulach@1348: if (capture) {
jtulach@1348: groups[groupIndex] = save1;
jtulach@1348: groups[groupIndex+1] = save2;
jtulach@1348: }
jtulach@1348: }
jtulach@1348: return ret;
jtulach@1348: }
jtulach@1348: // Aggressive group match
jtulach@1348: boolean match0(Matcher matcher, int i, int j, CharSequence seq) {
jtulach@1348: int[] groups = matcher.groups;
jtulach@1348: int save0 = 0;
jtulach@1348: int save1 = 0;
jtulach@1348: if (capture) {
jtulach@1348: save0 = groups[groupIndex];
jtulach@1348: save1 = groups[groupIndex+1];
jtulach@1348: }
jtulach@1348: for (;;) {
jtulach@1348: if (j >= cmax)
jtulach@1348: break;
jtulach@1348: if (!atom.match(matcher, i, seq))
jtulach@1348: break;
jtulach@1348: int k = matcher.last - i;
jtulach@1348: if (k <= 0) {
jtulach@1348: if (capture) {
jtulach@1348: groups[groupIndex] = i;
jtulach@1348: groups[groupIndex+1] = i + k;
jtulach@1348: }
jtulach@1348: i = i + k;
jtulach@1348: break;
jtulach@1348: }
jtulach@1348: for (;;) {
jtulach@1348: if (capture) {
jtulach@1348: groups[groupIndex] = i;
jtulach@1348: groups[groupIndex+1] = i + k;
jtulach@1348: }
jtulach@1348: i = i + k;
jtulach@1348: if (++j >= cmax)
jtulach@1348: break;
jtulach@1348: if (!atom.match(matcher, i, seq))
jtulach@1348: break;
jtulach@1348: if (i + k != matcher.last) {
jtulach@1348: if (match0(matcher, i, j, seq))
jtulach@1348: return true;
jtulach@1348: break;
jtulach@1348: }
jtulach@1348: }
jtulach@1348: while (j > cmin) {
jtulach@1348: if (next.match(matcher, i, seq)) {
jtulach@1348: if (capture) {
jtulach@1348: groups[groupIndex+1] = i;
jtulach@1348: groups[groupIndex] = i - k;
jtulach@1348: }
jtulach@1348: i = i - k;
jtulach@1348: return true;
jtulach@1348: }
jtulach@1348: // backing off
jtulach@1348: if (capture) {
jtulach@1348: groups[groupIndex+1] = i;
jtulach@1348: groups[groupIndex] = i - k;
jtulach@1348: }
jtulach@1348: i = i - k;
jtulach@1348: j--;
jtulach@1348: }
jtulach@1348: break;
jtulach@1348: }
jtulach@1348: if (capture) {
jtulach@1348: groups[groupIndex] = save0;
jtulach@1348: groups[groupIndex+1] = save1;
jtulach@1348: }
jtulach@1348: return next.match(matcher, i, seq);
jtulach@1348: }
jtulach@1348: // Reluctant matching
jtulach@1348: boolean match1(Matcher matcher, int i, int j, CharSequence seq) {
jtulach@1348: for (;;) {
jtulach@1348: if (next.match(matcher, i, seq))
jtulach@1348: return true;
jtulach@1348: if (j >= cmax)
jtulach@1348: return false;
jtulach@1348: if (!atom.match(matcher, i, seq))
jtulach@1348: return false;
jtulach@1348: if (i == matcher.last)
jtulach@1348: return false;
jtulach@1348: if (capture) {
jtulach@1348: matcher.groups[groupIndex] = i;
jtulach@1348: matcher.groups[groupIndex+1] = matcher.last;
jtulach@1348: }
jtulach@1348: i = matcher.last;
jtulach@1348: j++;
jtulach@1348: }
jtulach@1348: }
jtulach@1348: // Possessive matching
jtulach@1348: boolean match2(Matcher matcher, int i, int j, CharSequence seq) {
jtulach@1348: for (; j < cmax; j++) {
jtulach@1348: if (!atom.match(matcher, i, seq)) {
jtulach@1348: break;
jtulach@1348: }
jtulach@1348: if (capture) {
jtulach@1348: matcher.groups[groupIndex] = i;
jtulach@1348: matcher.groups[groupIndex+1] = matcher.last;
jtulach@1348: }
jtulach@1348: if (i == matcher.last) {
jtulach@1348: break;
jtulach@1348: }
jtulach@1348: i = matcher.last;
jtulach@1348: }
jtulach@1348: return next.match(matcher, i, seq);
jtulach@1348: }
jtulach@1348: boolean study(TreeInfo info) {
jtulach@1348: // Save original info
jtulach@1348: int minL = info.minLength;
jtulach@1348: int maxL = info.maxLength;
jtulach@1348: boolean maxV = info.maxValid;
jtulach@1348: boolean detm = info.deterministic;
jtulach@1348: info.reset();
jtulach@1348:
jtulach@1348: atom.study(info);
jtulach@1348:
jtulach@1348: int temp = info.minLength * cmin + minL;
jtulach@1348: if (temp < minL) {
jtulach@1348: temp = 0xFFFFFFF; // Arbitrary large number
jtulach@1348: }
jtulach@1348: info.minLength = temp;
jtulach@1348:
jtulach@1348: if (maxV & info.maxValid) {
jtulach@1348: temp = info.maxLength * cmax + maxL;
jtulach@1348: info.maxLength = temp;
jtulach@1348: if (temp < maxL) {
jtulach@1348: info.maxValid = false;
jtulach@1348: }
jtulach@1348: } else {
jtulach@1348: info.maxValid = false;
jtulach@1348: }
jtulach@1348:
jtulach@1348: if (info.deterministic && cmin == cmax) {
jtulach@1348: info.deterministic = detm;
jtulach@1348: } else {
jtulach@1348: info.deterministic = false;
jtulach@1348: }
jtulach@1348:
jtulach@1348: return next.study(info);
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * A Guard node at the end of each atom node in a Branch. It
jtulach@1348: * serves the purpose of chaining the "match" operation to
jtulach@1348: * "next" but not the "study", so we can collect the TreeInfo
jtulach@1348: * of each atom node without including the TreeInfo of the
jtulach@1348: * "next".
jtulach@1348: */
jtulach@1348: static final class BranchConn extends Node {
jtulach@1348: BranchConn() {};
jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) {
jtulach@1348: return next.match(matcher, i, seq);
jtulach@1348: }
jtulach@1348: boolean study(TreeInfo info) {
jtulach@1348: return info.deterministic;
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Handles the branching of alternations. Note this is also used for
jtulach@1348: * the ? quantifier to branch between the case where it matches once
jtulach@1348: * and where it does not occur.
jtulach@1348: */
jtulach@1348: static final class Branch extends Node {
jtulach@1348: Node[] atoms = new Node[2];
jtulach@1348: int size = 2;
jtulach@1348: Node conn;
jtulach@1348: Branch(Node first, Node second, Node branchConn) {
jtulach@1348: conn = branchConn;
jtulach@1348: atoms[0] = first;
jtulach@1348: atoms[1] = second;
jtulach@1348: }
jtulach@1348:
jtulach@1348: void add(Node node) {
jtulach@1348: if (size >= atoms.length) {
jtulach@1348: Node[] tmp = new Node[atoms.length*2];
jtulach@1348: System.arraycopy(atoms, 0, tmp, 0, atoms.length);
jtulach@1348: atoms = tmp;
jtulach@1348: }
jtulach@1348: atoms[size++] = node;
jtulach@1348: }
jtulach@1348:
jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) {
jtulach@1348: for (int n = 0; n < size; n++) {
jtulach@1348: if (atoms[n] == null) {
jtulach@1348: if (conn.next.match(matcher, i, seq))
jtulach@1348: return true;
jtulach@1348: } else if (atoms[n].match(matcher, i, seq)) {
jtulach@1348: return true;
jtulach@1348: }
jtulach@1348: }
jtulach@1348: return false;
jtulach@1348: }
jtulach@1348:
jtulach@1348: boolean study(TreeInfo info) {
jtulach@1348: int minL = info.minLength;
jtulach@1348: int maxL = info.maxLength;
jtulach@1348: boolean maxV = info.maxValid;
jtulach@1348:
jtulach@1348: int minL2 = Integer.MAX_VALUE; //arbitrary large enough num
jtulach@1348: int maxL2 = -1;
jtulach@1348: for (int n = 0; n < size; n++) {
jtulach@1348: info.reset();
jtulach@1348: if (atoms[n] != null)
jtulach@1348: atoms[n].study(info);
jtulach@1348: minL2 = Math.min(minL2, info.minLength);
jtulach@1348: maxL2 = Math.max(maxL2, info.maxLength);
jtulach@1348: maxV = (maxV & info.maxValid);
jtulach@1348: }
jtulach@1348:
jtulach@1348: minL += minL2;
jtulach@1348: maxL += maxL2;
jtulach@1348:
jtulach@1348: info.reset();
jtulach@1348: conn.next.study(info);
jtulach@1348:
jtulach@1348: info.minLength += minL;
jtulach@1348: info.maxLength += maxL;
jtulach@1348: info.maxValid &= maxV;
jtulach@1348: info.deterministic = false;
jtulach@1348: return false;
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * The GroupHead saves the location where the group begins in the locals
jtulach@1348: * and restores them when the match is done.
jtulach@1348: *
jtulach@1348: * The matchRef is used when a reference to this group is accessed later
jtulach@1348: * in the expression. The locals will have a negative value in them to
jtulach@1348: * indicate that we do not want to unset the group if the reference
jtulach@1348: * doesn't match.
jtulach@1348: */
jtulach@1348: static final class GroupHead extends Node {
jtulach@1348: int localIndex;
jtulach@1348: GroupHead(int localCount) {
jtulach@1348: localIndex = localCount;
jtulach@1348: }
jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) {
jtulach@1348: int save = matcher.locals[localIndex];
jtulach@1348: matcher.locals[localIndex] = i;
jtulach@1348: boolean ret = next.match(matcher, i, seq);
jtulach@1348: matcher.locals[localIndex] = save;
jtulach@1348: return ret;
jtulach@1348: }
jtulach@1348: boolean matchRef(Matcher matcher, int i, CharSequence seq) {
jtulach@1348: int save = matcher.locals[localIndex];
jtulach@1348: matcher.locals[localIndex] = ~i; // HACK
jtulach@1348: boolean ret = next.match(matcher, i, seq);
jtulach@1348: matcher.locals[localIndex] = save;
jtulach@1348: return ret;
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Recursive reference to a group in the regular expression. It calls
jtulach@1348: * matchRef because if the reference fails to match we would not unset
jtulach@1348: * the group.
jtulach@1348: */
jtulach@1348: static final class GroupRef extends Node {
jtulach@1348: GroupHead head;
jtulach@1348: GroupRef(GroupHead head) {
jtulach@1348: this.head = head;
jtulach@1348: }
jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) {
jtulach@1348: return head.matchRef(matcher, i, seq)
jtulach@1348: && next.match(matcher, matcher.last, seq);
jtulach@1348: }
jtulach@1348: boolean study(TreeInfo info) {
jtulach@1348: info.maxValid = false;
jtulach@1348: info.deterministic = false;
jtulach@1348: return next.study(info);
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * The GroupTail handles the setting of group beginning and ending
jtulach@1348: * locations when groups are successfully matched. It must also be able to
jtulach@1348: * unset groups that have to be backed off of.
jtulach@1348: *
jtulach@1348: * The GroupTail node is also used when a previous group is referenced,
jtulach@1348: * and in that case no group information needs to be set.
jtulach@1348: */
jtulach@1348: static final class GroupTail extends Node {
jtulach@1348: int localIndex;
jtulach@1348: int groupIndex;
jtulach@1348: GroupTail(int localCount, int groupCount) {
jtulach@1348: localIndex = localCount;
jtulach@1348: groupIndex = groupCount + groupCount;
jtulach@1348: }
jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) {
jtulach@1348: int tmp = matcher.locals[localIndex];
jtulach@1348: if (tmp >= 0) { // This is the normal group case.
jtulach@1348: // Save the group so we can unset it if it
jtulach@1348: // backs off of a match.
jtulach@1348: int groupStart = matcher.groups[groupIndex];
jtulach@1348: int groupEnd = matcher.groups[groupIndex+1];
jtulach@1348:
jtulach@1348: matcher.groups[groupIndex] = tmp;
jtulach@1348: matcher.groups[groupIndex+1] = i;
jtulach@1348: if (next.match(matcher, i, seq)) {
jtulach@1348: return true;
jtulach@1348: }
jtulach@1348: matcher.groups[groupIndex] = groupStart;
jtulach@1348: matcher.groups[groupIndex+1] = groupEnd;
jtulach@1348: return false;
jtulach@1348: } else {
jtulach@1348: // This is a group reference case. We don't need to save any
jtulach@1348: // group info because it isn't really a group.
jtulach@1348: matcher.last = i;
jtulach@1348: return true;
jtulach@1348: }
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * This sets up a loop to handle a recursive quantifier structure.
jtulach@1348: */
jtulach@1348: static final class Prolog extends Node {
jtulach@1348: Loop loop;
jtulach@1348: Prolog(Loop loop) {
jtulach@1348: this.loop = loop;
jtulach@1348: }
jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) {
jtulach@1348: return loop.matchInit(matcher, i, seq);
jtulach@1348: }
jtulach@1348: boolean study(TreeInfo info) {
jtulach@1348: return loop.study(info);
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Handles the repetition count for a greedy Curly. The matchInit
jtulach@1348: * is called from the Prolog to save the index of where the group
jtulach@1348: * beginning is stored. A zero length group check occurs in the
jtulach@1348: * normal match but is skipped in the matchInit.
jtulach@1348: */
jtulach@1348: static class Loop extends Node {
jtulach@1348: Node body;
jtulach@1348: int countIndex; // local count index in matcher locals
jtulach@1348: int beginIndex; // group beginning index
jtulach@1348: int cmin, cmax;
jtulach@1348: Loop(int countIndex, int beginIndex) {
jtulach@1348: this.countIndex = countIndex;
jtulach@1348: this.beginIndex = beginIndex;
jtulach@1348: }
jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) {
jtulach@1348: // Avoid infinite loop in zero-length case.
jtulach@1348: if (i > matcher.locals[beginIndex]) {
jtulach@1348: int count = matcher.locals[countIndex];
jtulach@1348:
jtulach@1348: // This block is for before we reach the minimum
jtulach@1348: // iterations required for the loop to match
jtulach@1348: if (count < cmin) {
jtulach@1348: matcher.locals[countIndex] = count + 1;
jtulach@1348: boolean b = body.match(matcher, i, seq);
jtulach@1348: // If match failed we must backtrack, so
jtulach@1348: // the loop count should NOT be incremented
jtulach@1348: if (!b)
jtulach@1348: matcher.locals[countIndex] = count;
jtulach@1348: // Return success or failure since we are under
jtulach@1348: // minimum
jtulach@1348: return b;
jtulach@1348: }
jtulach@1348: // This block is for after we have the minimum
jtulach@1348: // iterations required for the loop to match
jtulach@1348: if (count < cmax) {
jtulach@1348: matcher.locals[countIndex] = count + 1;
jtulach@1348: boolean b = body.match(matcher, i, seq);
jtulach@1348: // If match failed we must backtrack, so
jtulach@1348: // the loop count should NOT be incremented
jtulach@1348: if (!b)
jtulach@1348: matcher.locals[countIndex] = count;
jtulach@1348: else
jtulach@1348: return true;
jtulach@1348: }
jtulach@1348: }
jtulach@1348: return next.match(matcher, i, seq);
jtulach@1348: }
jtulach@1348: boolean matchInit(Matcher matcher, int i, CharSequence seq) {
jtulach@1348: int save = matcher.locals[countIndex];
jtulach@1348: boolean ret = false;
jtulach@1348: if (0 < cmin) {
jtulach@1348: matcher.locals[countIndex] = 1;
jtulach@1348: ret = body.match(matcher, i, seq);
jtulach@1348: } else if (0 < cmax) {
jtulach@1348: matcher.locals[countIndex] = 1;
jtulach@1348: ret = body.match(matcher, i, seq);
jtulach@1348: if (ret == false)
jtulach@1348: ret = next.match(matcher, i, seq);
jtulach@1348: } else {
jtulach@1348: ret = next.match(matcher, i, seq);
jtulach@1348: }
jtulach@1348: matcher.locals[countIndex] = save;
jtulach@1348: return ret;
jtulach@1348: }
jtulach@1348: boolean study(TreeInfo info) {
jtulach@1348: info.maxValid = false;
jtulach@1348: info.deterministic = false;
jtulach@1348: return false;
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Handles the repetition count for a reluctant Curly. The matchInit
jtulach@1348: * is called from the Prolog to save the index of where the group
jtulach@1348: * beginning is stored. A zero length group check occurs in the
jtulach@1348: * normal match but is skipped in the matchInit.
jtulach@1348: */
jtulach@1348: static final class LazyLoop extends Loop {
jtulach@1348: LazyLoop(int countIndex, int beginIndex) {
jtulach@1348: super(countIndex, beginIndex);
jtulach@1348: }
jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) {
jtulach@1348: // Check for zero length group
jtulach@1348: if (i > matcher.locals[beginIndex]) {
jtulach@1348: int count = matcher.locals[countIndex];
jtulach@1348: if (count < cmin) {
jtulach@1348: matcher.locals[countIndex] = count + 1;
jtulach@1348: boolean result = body.match(matcher, i, seq);
jtulach@1348: // If match failed we must backtrack, so
jtulach@1348: // the loop count should NOT be incremented
jtulach@1348: if (!result)
jtulach@1348: matcher.locals[countIndex] = count;
jtulach@1348: return result;
jtulach@1348: }
jtulach@1348: if (next.match(matcher, i, seq))
jtulach@1348: return true;
jtulach@1348: if (count < cmax) {
jtulach@1348: matcher.locals[countIndex] = count + 1;
jtulach@1348: boolean result = body.match(matcher, i, seq);
jtulach@1348: // If match failed we must backtrack, so
jtulach@1348: // the loop count should NOT be incremented
jtulach@1348: if (!result)
jtulach@1348: matcher.locals[countIndex] = count;
jtulach@1348: return result;
jtulach@1348: }
jtulach@1348: return false;
jtulach@1348: }
jtulach@1348: return next.match(matcher, i, seq);
jtulach@1348: }
jtulach@1348: boolean matchInit(Matcher matcher, int i, CharSequence seq) {
jtulach@1348: int save = matcher.locals[countIndex];
jtulach@1348: boolean ret = false;
jtulach@1348: if (0 < cmin) {
jtulach@1348: matcher.locals[countIndex] = 1;
jtulach@1348: ret = body.match(matcher, i, seq);
jtulach@1348: } else if (next.match(matcher, i, seq)) {
jtulach@1348: ret = true;
jtulach@1348: } else if (0 < cmax) {
jtulach@1348: matcher.locals[countIndex] = 1;
jtulach@1348: ret = body.match(matcher, i, seq);
jtulach@1348: }
jtulach@1348: matcher.locals[countIndex] = save;
jtulach@1348: return ret;
jtulach@1348: }
jtulach@1348: boolean study(TreeInfo info) {
jtulach@1348: info.maxValid = false;
jtulach@1348: info.deterministic = false;
jtulach@1348: return false;
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Refers to a group in the regular expression. Attempts to match
jtulach@1348: * whatever the group referred to last matched.
jtulach@1348: */
jtulach@1348: static class BackRef extends Node {
jtulach@1348: int groupIndex;
jtulach@1348: BackRef(int groupCount) {
jtulach@1348: super();
jtulach@1348: groupIndex = groupCount + groupCount;
jtulach@1348: }
jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) {
jtulach@1348: int j = matcher.groups[groupIndex];
jtulach@1348: int k = matcher.groups[groupIndex+1];
jtulach@1348:
jtulach@1348: int groupSize = k - j;
jtulach@1348:
jtulach@1348: // If the referenced group didn't match, neither can this
jtulach@1348: if (j < 0)
jtulach@1348: return false;
jtulach@1348:
jtulach@1348: // If there isn't enough input left no match
jtulach@1348: if (i + groupSize > matcher.to) {
jtulach@1348: matcher.hitEnd = true;
jtulach@1348: return false;
jtulach@1348: }
jtulach@1348:
jtulach@1348: // Check each new char to make sure it matches what the group
jtulach@1348: // referenced matched last time around
jtulach@1348: for (int index=0; index matcher.to) {
jtulach@1348: matcher.hitEnd = true;
jtulach@1348: return false;
jtulach@1348: }
jtulach@1348:
jtulach@1348: // Check each new char to make sure it matches what the group
jtulach@1348: // referenced matched last time around
jtulach@1348: int x = i;
jtulach@1348: for (int index=0; index matcher.to) {
jtulach@1348: matcher.hitEnd = true;
jtulach@1348: return false;
jtulach@1348: }
jtulach@1348: if (atom.match(matcher, i, seq)) {
jtulach@1348: return next.match(matcher, matcher.last, seq);
jtulach@1348: }
jtulach@1348: i += countChars(seq, i, 1);
jtulach@1348: matcher.first++;
jtulach@1348: }
jtulach@1348: }
jtulach@1348: boolean study(TreeInfo info) {
jtulach@1348: atom.study(info);
jtulach@1348: info.maxValid = false;
jtulach@1348: info.deterministic = false;
jtulach@1348: return next.study(info);
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: static final class Conditional extends Node {
jtulach@1348: Node cond, yes, not;
jtulach@1348: Conditional(Node cond, Node yes, Node not) {
jtulach@1348: this.cond = cond;
jtulach@1348: this.yes = yes;
jtulach@1348: this.not = not;
jtulach@1348: }
jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) {
jtulach@1348: if (cond.match(matcher, i, seq)) {
jtulach@1348: return yes.match(matcher, i, seq);
jtulach@1348: } else {
jtulach@1348: return not.match(matcher, i, seq);
jtulach@1348: }
jtulach@1348: }
jtulach@1348: boolean study(TreeInfo info) {
jtulach@1348: int minL = info.minLength;
jtulach@1348: int maxL = info.maxLength;
jtulach@1348: boolean maxV = info.maxValid;
jtulach@1348: info.reset();
jtulach@1348: yes.study(info);
jtulach@1348:
jtulach@1348: int minL2 = info.minLength;
jtulach@1348: int maxL2 = info.maxLength;
jtulach@1348: boolean maxV2 = info.maxValid;
jtulach@1348: info.reset();
jtulach@1348: not.study(info);
jtulach@1348:
jtulach@1348: info.minLength = minL + Math.min(minL2, info.minLength);
jtulach@1348: info.maxLength = maxL + Math.max(maxL2, info.maxLength);
jtulach@1348: info.maxValid = (maxV & maxV2 & info.maxValid);
jtulach@1348: info.deterministic = false;
jtulach@1348: return next.study(info);
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Zero width positive lookahead.
jtulach@1348: */
jtulach@1348: static final class Pos extends Node {
jtulach@1348: Node cond;
jtulach@1348: Pos(Node cond) {
jtulach@1348: this.cond = cond;
jtulach@1348: }
jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) {
jtulach@1348: int savedTo = matcher.to;
jtulach@1348: boolean conditionMatched = false;
jtulach@1348:
jtulach@1348: // Relax transparent region boundaries for lookahead
jtulach@1348: if (matcher.transparentBounds)
jtulach@1348: matcher.to = matcher.getTextLength();
jtulach@1348: try {
jtulach@1348: conditionMatched = cond.match(matcher, i, seq);
jtulach@1348: } finally {
jtulach@1348: // Reinstate region boundaries
jtulach@1348: matcher.to = savedTo;
jtulach@1348: }
jtulach@1348: return conditionMatched && next.match(matcher, i, seq);
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Zero width negative lookahead.
jtulach@1348: */
jtulach@1348: static final class Neg extends Node {
jtulach@1348: Node cond;
jtulach@1348: Neg(Node cond) {
jtulach@1348: this.cond = cond;
jtulach@1348: }
jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) {
jtulach@1348: int savedTo = matcher.to;
jtulach@1348: boolean conditionMatched = false;
jtulach@1348:
jtulach@1348: // Relax transparent region boundaries for lookahead
jtulach@1348: if (matcher.transparentBounds)
jtulach@1348: matcher.to = matcher.getTextLength();
jtulach@1348: try {
jtulach@1348: if (i < matcher.to) {
jtulach@1348: conditionMatched = !cond.match(matcher, i, seq);
jtulach@1348: } else {
jtulach@1348: // If a negative lookahead succeeds then more input
jtulach@1348: // could cause it to fail!
jtulach@1348: matcher.requireEnd = true;
jtulach@1348: conditionMatched = !cond.match(matcher, i, seq);
jtulach@1348: }
jtulach@1348: } finally {
jtulach@1348: // Reinstate region boundaries
jtulach@1348: matcher.to = savedTo;
jtulach@1348: }
jtulach@1348: return conditionMatched && next.match(matcher, i, seq);
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * For use with lookbehinds; matches the position where the lookbehind
jtulach@1348: * was encountered.
jtulach@1348: */
jtulach@1348: static Node lookbehindEnd = new Node() {
jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) {
jtulach@1348: return i == matcher.lookbehindTo;
jtulach@1348: }
jtulach@1348: };
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Zero width positive lookbehind.
jtulach@1348: */
jtulach@1348: static class Behind extends Node {
jtulach@1348: Node cond;
jtulach@1348: int rmax, rmin;
jtulach@1348: Behind(Node cond, int rmax, int rmin) {
jtulach@1348: this.cond = cond;
jtulach@1348: this.rmax = rmax;
jtulach@1348: this.rmin = rmin;
jtulach@1348: }
jtulach@1348:
jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) {
jtulach@1348: int savedFrom = matcher.from;
jtulach@1348: boolean conditionMatched = false;
jtulach@1348: int startIndex = (!matcher.transparentBounds) ?
jtulach@1348: matcher.from : 0;
jtulach@1348: int from = Math.max(i - rmax, startIndex);
jtulach@1348: // Set end boundary
jtulach@1348: int savedLBT = matcher.lookbehindTo;
jtulach@1348: matcher.lookbehindTo = i;
jtulach@1348: // Relax transparent region boundaries for lookbehind
jtulach@1348: if (matcher.transparentBounds)
jtulach@1348: matcher.from = 0;
jtulach@1348: for (int j = i - rmin; !conditionMatched && j >= from; j--) {
jtulach@1348: conditionMatched = cond.match(matcher, j, seq);
jtulach@1348: }
jtulach@1348: matcher.from = savedFrom;
jtulach@1348: matcher.lookbehindTo = savedLBT;
jtulach@1348: return conditionMatched && next.match(matcher, i, seq);
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Zero width positive lookbehind, including supplementary
jtulach@1348: * characters or unpaired surrogates.
jtulach@1348: */
jtulach@1348: static final class BehindS extends Behind {
jtulach@1348: BehindS(Node cond, int rmax, int rmin) {
jtulach@1348: super(cond, rmax, rmin);
jtulach@1348: }
jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) {
jtulach@1348: int rmaxChars = countChars(seq, i, -rmax);
jtulach@1348: int rminChars = countChars(seq, i, -rmin);
jtulach@1348: int savedFrom = matcher.from;
jtulach@1348: int startIndex = (!matcher.transparentBounds) ?
jtulach@1348: matcher.from : 0;
jtulach@1348: boolean conditionMatched = false;
jtulach@1348: int from = Math.max(i - rmaxChars, startIndex);
jtulach@1348: // Set end boundary
jtulach@1348: int savedLBT = matcher.lookbehindTo;
jtulach@1348: matcher.lookbehindTo = i;
jtulach@1348: // Relax transparent region boundaries for lookbehind
jtulach@1348: if (matcher.transparentBounds)
jtulach@1348: matcher.from = 0;
jtulach@1348:
jtulach@1348: for (int j = i - rminChars;
jtulach@1348: !conditionMatched && j >= from;
jtulach@1348: j -= j>from ? countChars(seq, j, -1) : 1) {
jtulach@1348: conditionMatched = cond.match(matcher, j, seq);
jtulach@1348: }
jtulach@1348: matcher.from = savedFrom;
jtulach@1348: matcher.lookbehindTo = savedLBT;
jtulach@1348: return conditionMatched && next.match(matcher, i, seq);
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Zero width negative lookbehind.
jtulach@1348: */
jtulach@1348: static class NotBehind extends Node {
jtulach@1348: Node cond;
jtulach@1348: int rmax, rmin;
jtulach@1348: NotBehind(Node cond, int rmax, int rmin) {
jtulach@1348: this.cond = cond;
jtulach@1348: this.rmax = rmax;
jtulach@1348: this.rmin = rmin;
jtulach@1348: }
jtulach@1348:
jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) {
jtulach@1348: int savedLBT = matcher.lookbehindTo;
jtulach@1348: int savedFrom = matcher.from;
jtulach@1348: boolean conditionMatched = false;
jtulach@1348: int startIndex = (!matcher.transparentBounds) ?
jtulach@1348: matcher.from : 0;
jtulach@1348: int from = Math.max(i - rmax, startIndex);
jtulach@1348: matcher.lookbehindTo = i;
jtulach@1348: // Relax transparent region boundaries for lookbehind
jtulach@1348: if (matcher.transparentBounds)
jtulach@1348: matcher.from = 0;
jtulach@1348: for (int j = i - rmin; !conditionMatched && j >= from; j--) {
jtulach@1348: conditionMatched = cond.match(matcher, j, seq);
jtulach@1348: }
jtulach@1348: // Reinstate region boundaries
jtulach@1348: matcher.from = savedFrom;
jtulach@1348: matcher.lookbehindTo = savedLBT;
jtulach@1348: return !conditionMatched && next.match(matcher, i, seq);
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Zero width negative lookbehind, including supplementary
jtulach@1348: * characters or unpaired surrogates.
jtulach@1348: */
jtulach@1348: static final class NotBehindS extends NotBehind {
jtulach@1348: NotBehindS(Node cond, int rmax, int rmin) {
jtulach@1348: super(cond, rmax, rmin);
jtulach@1348: }
jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) {
jtulach@1348: int rmaxChars = countChars(seq, i, -rmax);
jtulach@1348: int rminChars = countChars(seq, i, -rmin);
jtulach@1348: int savedFrom = matcher.from;
jtulach@1348: int savedLBT = matcher.lookbehindTo;
jtulach@1348: boolean conditionMatched = false;
jtulach@1348: int startIndex = (!matcher.transparentBounds) ?
jtulach@1348: matcher.from : 0;
jtulach@1348: int from = Math.max(i - rmaxChars, startIndex);
jtulach@1348: matcher.lookbehindTo = i;
jtulach@1348: // Relax transparent region boundaries for lookbehind
jtulach@1348: if (matcher.transparentBounds)
jtulach@1348: matcher.from = 0;
jtulach@1348: for (int j = i - rminChars;
jtulach@1348: !conditionMatched && j >= from;
jtulach@1348: j -= j>from ? countChars(seq, j, -1) : 1) {
jtulach@1348: conditionMatched = cond.match(matcher, j, seq);
jtulach@1348: }
jtulach@1348: //Reinstate region boundaries
jtulach@1348: matcher.from = savedFrom;
jtulach@1348: matcher.lookbehindTo = savedLBT;
jtulach@1348: return !conditionMatched && next.match(matcher, i, seq);
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Returns the set union of two CharProperty nodes.
jtulach@1348: */
jtulach@1348: private static CharProperty union(final CharProperty lhs,
jtulach@1348: final CharProperty rhs) {
jtulach@1348: return new CharProperty() {
jtulach@1348: boolean isSatisfiedBy(int ch) {
jtulach@1348: return lhs.isSatisfiedBy(ch) || rhs.isSatisfiedBy(ch);}};
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Returns the set intersection of two CharProperty nodes.
jtulach@1348: */
jtulach@1348: private static CharProperty intersection(final CharProperty lhs,
jtulach@1348: final CharProperty rhs) {
jtulach@1348: return new CharProperty() {
jtulach@1348: boolean isSatisfiedBy(int ch) {
jtulach@1348: return lhs.isSatisfiedBy(ch) && rhs.isSatisfiedBy(ch);}};
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Returns the set difference of two CharProperty nodes.
jtulach@1348: */
jtulach@1348: private static CharProperty setDifference(final CharProperty lhs,
jtulach@1348: final CharProperty rhs) {
jtulach@1348: return new CharProperty() {
jtulach@1348: boolean isSatisfiedBy(int ch) {
jtulach@1348: return ! rhs.isSatisfiedBy(ch) && lhs.isSatisfiedBy(ch);}};
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Handles word boundaries. Includes a field to allow this one class to
jtulach@1348: * deal with the different types of word boundaries we can match. The word
jtulach@1348: * characters include underscores, letters, and digits. Non spacing marks
jtulach@1348: * can are also part of a word if they have a base character, otherwise
jtulach@1348: * they are ignored for purposes of finding word boundaries.
jtulach@1348: */
jtulach@1348: static final class Bound extends Node {
jtulach@1348: static int LEFT = 0x1;
jtulach@1348: static int RIGHT= 0x2;
jtulach@1348: static int BOTH = 0x3;
jtulach@1348: static int NONE = 0x4;
jtulach@1348: int type;
jtulach@1348: boolean useUWORD;
jtulach@1348: Bound(int n, boolean useUWORD) {
jtulach@1348: type = n;
jtulach@1348: this.useUWORD = useUWORD;
jtulach@1348: }
jtulach@1348:
jtulach@1348: boolean isWord(int ch) {
jtulach@1348: return useUWORD ? UnicodeProp.WORD.is(ch)
jtulach@1348: : (ch == '_' || Character.isLetterOrDigit(ch));
jtulach@1348: }
jtulach@1348:
jtulach@1348: int check(Matcher matcher, int i, CharSequence seq) {
jtulach@1348: int ch;
jtulach@1348: boolean left = false;
jtulach@1348: int startIndex = matcher.from;
jtulach@1348: int endIndex = matcher.to;
jtulach@1348: if (matcher.transparentBounds) {
jtulach@1348: startIndex = 0;
jtulach@1348: endIndex = matcher.getTextLength();
jtulach@1348: }
jtulach@1348: if (i > startIndex) {
jtulach@1348: ch = Character.codePointBefore(seq, i);
jtulach@1348: left = (isWord(ch) ||
jtulach@1348: ((Character.getType(ch) == Character.NON_SPACING_MARK)
jtulach@1348: && hasBaseCharacter(matcher, i-1, seq)));
jtulach@1348: }
jtulach@1348: boolean right = false;
jtulach@1348: if (i < endIndex) {
jtulach@1348: ch = Character.codePointAt(seq, i);
jtulach@1348: right = (isWord(ch) ||
jtulach@1348: ((Character.getType(ch) == Character.NON_SPACING_MARK)
jtulach@1348: && hasBaseCharacter(matcher, i, seq)));
jtulach@1348: } else {
jtulach@1348: // Tried to access char past the end
jtulach@1348: matcher.hitEnd = true;
jtulach@1348: // The addition of another char could wreck a boundary
jtulach@1348: matcher.requireEnd = true;
jtulach@1348: }
jtulach@1348: return ((left ^ right) ? (right ? LEFT : RIGHT) : NONE);
jtulach@1348: }
jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) {
jtulach@1348: return (check(matcher, i, seq) & type) > 0
jtulach@1348: && next.match(matcher, i, seq);
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Non spacing marks only count as word characters in bounds calculations
jtulach@1348: * if they have a base character.
jtulach@1348: */
jtulach@1348: private static boolean hasBaseCharacter(Matcher matcher, int i,
jtulach@1348: CharSequence seq)
jtulach@1348: {
jtulach@1348: int start = (!matcher.transparentBounds) ?
jtulach@1348: matcher.from : 0;
jtulach@1348: for (int x=i; x >= start; x--) {
jtulach@1348: int ch = Character.codePointAt(seq, x);
jtulach@1348: if (Character.isLetterOrDigit(ch))
jtulach@1348: return true;
jtulach@1348: if (Character.getType(ch) == Character.NON_SPACING_MARK)
jtulach@1348: continue;
jtulach@1348: return false;
jtulach@1348: }
jtulach@1348: return false;
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Attempts to match a slice in the input using the Boyer-Moore string
jtulach@1348: * matching algorithm. The algorithm is based on the idea that the
jtulach@1348: * pattern can be shifted farther ahead in the search text if it is
jtulach@1348: * matched right to left.
jtulach@1348: *
jtulach@1348: * The pattern is compared to the input one character at a time, from
jtulach@1348: * the rightmost character in the pattern to the left. If the characters
jtulach@1348: * all match the pattern has been found. If a character does not match,
jtulach@1348: * the pattern is shifted right a distance that is the maximum of two
jtulach@1348: * functions, the bad character shift and the good suffix shift. This
jtulach@1348: * shift moves the attempted match position through the input more
jtulach@1348: * quickly than a naive one position at a time check.
jtulach@1348: *
jtulach@1348: * The bad character shift is based on the character from the text that
jtulach@1348: * did not match. If the character does not appear in the pattern, the
jtulach@1348: * pattern can be shifted completely beyond the bad character. If the
jtulach@1348: * character does occur in the pattern, the pattern can be shifted to
jtulach@1348: * line the pattern up with the next occurrence of that character.
jtulach@1348: *
jtulach@1348: * The good suffix shift is based on the idea that some subset on the right
jtulach@1348: * side of the pattern has matched. When a bad character is found, the
jtulach@1348: * pattern can be shifted right by the pattern length if the subset does
jtulach@1348: * not occur again in pattern, or by the amount of distance to the
jtulach@1348: * next occurrence of the subset in the pattern.
jtulach@1348: *
jtulach@1348: * Boyer-Moore search methods adapted from code by Amy Yu.
jtulach@1348: */
jtulach@1348: static class BnM extends Node {
jtulach@1348: int[] buffer;
jtulach@1348: int[] lastOcc;
jtulach@1348: int[] optoSft;
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Pre calculates arrays needed to generate the bad character
jtulach@1348: * shift and the good suffix shift. Only the last seven bits
jtulach@1348: * are used to see if chars match; This keeps the tables small
jtulach@1348: * and covers the heavily used ASCII range, but occasionally
jtulach@1348: * results in an aliased match for the bad character shift.
jtulach@1348: */
jtulach@1348: static Node optimize(Node node) {
jtulach@1348: if (!(node instanceof Slice)) {
jtulach@1348: return node;
jtulach@1348: }
jtulach@1348:
jtulach@1348: int[] src = ((Slice) node).buffer;
jtulach@1348: int patternLength = src.length;
jtulach@1348: // The BM algorithm requires a bit of overhead;
jtulach@1348: // If the pattern is short don't use it, since
jtulach@1348: // a shift larger than the pattern length cannot
jtulach@1348: // be used anyway.
jtulach@1348: if (patternLength < 4) {
jtulach@1348: return node;
jtulach@1348: }
jtulach@1348: int i, j, k;
jtulach@1348: int[] lastOcc = new int[128];
jtulach@1348: int[] optoSft = new int[patternLength];
jtulach@1348: // Precalculate part of the bad character shift
jtulach@1348: // It is a table for where in the pattern each
jtulach@1348: // lower 7-bit value occurs
jtulach@1348: for (i = 0; i < patternLength; i++) {
jtulach@1348: lastOcc[src[i]&0x7F] = i + 1;
jtulach@1348: }
jtulach@1348: // Precalculate the good suffix shift
jtulach@1348: // i is the shift amount being considered
jtulach@1348: NEXT: for (i = patternLength; i > 0; i--) {
jtulach@1348: // j is the beginning index of suffix being considered
jtulach@1348: for (j = patternLength - 1; j >= i; j--) {
jtulach@1348: // Testing for good suffix
jtulach@1348: if (src[j] == src[j-i]) {
jtulach@1348: // src[j..len] is a good suffix
jtulach@1348: optoSft[j-1] = i;
jtulach@1348: } else {
jtulach@1348: // No match. The array has already been
jtulach@1348: // filled up with correct values before.
jtulach@1348: continue NEXT;
jtulach@1348: }
jtulach@1348: }
jtulach@1348: // This fills up the remaining of optoSft
jtulach@1348: // any suffix can not have larger shift amount
jtulach@1348: // then its sub-suffix. Why???
jtulach@1348: while (j > 0) {
jtulach@1348: optoSft[--j] = i;
jtulach@1348: }
jtulach@1348: }
jtulach@1348: // Set the guard value because of unicode compression
jtulach@1348: optoSft[patternLength-1] = 1;
jtulach@1348: if (node instanceof SliceS)
jtulach@1348: return new BnMS(src, lastOcc, optoSft, node.next);
jtulach@1348: return new BnM(src, lastOcc, optoSft, node.next);
jtulach@1348: }
jtulach@1348: BnM(int[] src, int[] lastOcc, int[] optoSft, Node next) {
jtulach@1348: this.buffer = src;
jtulach@1348: this.lastOcc = lastOcc;
jtulach@1348: this.optoSft = optoSft;
jtulach@1348: this.next = next;
jtulach@1348: }
jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) {
jtulach@1348: int[] src = buffer;
jtulach@1348: int patternLength = src.length;
jtulach@1348: int last = matcher.to - patternLength;
jtulach@1348:
jtulach@1348: // Loop over all possible match positions in text
jtulach@1348: NEXT: while (i <= last) {
jtulach@1348: // Loop over pattern from right to left
jtulach@1348: for (int j = patternLength - 1; j >= 0; j--) {
jtulach@1348: int ch = seq.charAt(i+j);
jtulach@1348: if (ch != src[j]) {
jtulach@1348: // Shift search to the right by the maximum of the
jtulach@1348: // bad character shift and the good suffix shift
jtulach@1348: i += Math.max(j + 1 - lastOcc[ch&0x7F], optoSft[j]);
jtulach@1348: continue NEXT;
jtulach@1348: }
jtulach@1348: }
jtulach@1348: // Entire pattern matched starting at i
jtulach@1348: matcher.first = i;
jtulach@1348: boolean ret = next.match(matcher, i + patternLength, seq);
jtulach@1348: if (ret) {
jtulach@1348: matcher.first = i;
jtulach@1348: matcher.groups[0] = matcher.first;
jtulach@1348: matcher.groups[1] = matcher.last;
jtulach@1348: return true;
jtulach@1348: }
jtulach@1348: i++;
jtulach@1348: }
jtulach@1348: // BnM is only used as the leading node in the unanchored case,
jtulach@1348: // and it replaced its Start() which always searches to the end
jtulach@1348: // if it doesn't find what it's looking for, so hitEnd is true.
jtulach@1348: matcher.hitEnd = true;
jtulach@1348: return false;
jtulach@1348: }
jtulach@1348: boolean study(TreeInfo info) {
jtulach@1348: info.minLength += buffer.length;
jtulach@1348: info.maxValid = false;
jtulach@1348: return next.study(info);
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * Supplementary support version of BnM(). Unpaired surrogates are
jtulach@1348: * also handled by this class.
jtulach@1348: */
jtulach@1348: static final class BnMS extends BnM {
jtulach@1348: int lengthInChars;
jtulach@1348:
jtulach@1348: BnMS(int[] src, int[] lastOcc, int[] optoSft, Node next) {
jtulach@1348: super(src, lastOcc, optoSft, next);
jtulach@1348: for (int x = 0; x < buffer.length; x++) {
jtulach@1348: lengthInChars += Character.charCount(buffer[x]);
jtulach@1348: }
jtulach@1348: }
jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) {
jtulach@1348: int[] src = buffer;
jtulach@1348: int patternLength = src.length;
jtulach@1348: int last = matcher.to - lengthInChars;
jtulach@1348:
jtulach@1348: // Loop over all possible match positions in text
jtulach@1348: NEXT: while (i <= last) {
jtulach@1348: // Loop over pattern from right to left
jtulach@1348: int ch;
jtulach@1348: for (int j = countChars(seq, i, patternLength), x = patternLength - 1;
jtulach@1348: j > 0; j -= Character.charCount(ch), x--) {
jtulach@1348: ch = Character.codePointBefore(seq, i+j);
jtulach@1348: if (ch != src[x]) {
jtulach@1348: // Shift search to the right by the maximum of the
jtulach@1348: // bad character shift and the good suffix shift
jtulach@1348: int n = Math.max(x + 1 - lastOcc[ch&0x7F], optoSft[x]);
jtulach@1348: i += countChars(seq, i, n);
jtulach@1348: continue NEXT;
jtulach@1348: }
jtulach@1348: }
jtulach@1348: // Entire pattern matched starting at i
jtulach@1348: matcher.first = i;
jtulach@1348: boolean ret = next.match(matcher, i + lengthInChars, seq);
jtulach@1348: if (ret) {
jtulach@1348: matcher.first = i;
jtulach@1348: matcher.groups[0] = matcher.first;
jtulach@1348: matcher.groups[1] = matcher.last;
jtulach@1348: return true;
jtulach@1348: }
jtulach@1348: i += countChars(seq, i, 1);
jtulach@1348: }
jtulach@1348: matcher.hitEnd = true;
jtulach@1348: return false;
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: ///////////////////////////////////////////////////////////////////////////////
jtulach@1348: ///////////////////////////////////////////////////////////////////////////////
jtulach@1348:
jtulach@1348: /**
jtulach@1348: * This must be the very first initializer.
jtulach@1348: */
jtulach@1348: static Node accept = new Node();
jtulach@1348:
jtulach@1348: static Node lastAccept = new LastNode();
jtulach@1348:
jtulach@1348: private static class CharPropertyNames {
jtulach@1348:
jtulach@1348: static CharProperty charPropertyFor(String name) {
jtulach@1348: CharPropertyFactory m = map.get(name);
jtulach@1348: return m == null ? null : m.make();
jtulach@1348: }
jtulach@1348:
jtulach@1348: private static abstract class CharPropertyFactory {
jtulach@1348: abstract CharProperty make();
jtulach@1348: }
jtulach@1348:
jtulach@1348: private static void defCategory(String name,
jtulach@1348: final int typeMask) {
jtulach@1348: map.put(name, new CharPropertyFactory() {
jtulach@1348: CharProperty make() { return new Category(typeMask);}});
jtulach@1348: }
jtulach@1348:
jtulach@1348: private static void defRange(String name,
jtulach@1348: final int lower, final int upper) {
jtulach@1348: map.put(name, new CharPropertyFactory() {
jtulach@1348: CharProperty make() { return rangeFor(lower, upper);}});
jtulach@1348: }
jtulach@1348:
jtulach@1348: private static void defCtype(String name,
jtulach@1348: final int ctype) {
jtulach@1348: map.put(name, new CharPropertyFactory() {
jtulach@1348: CharProperty make() { return new Ctype(ctype);}});
jtulach@1348: }
jtulach@1348:
jtulach@1348: private static abstract class CloneableProperty
jtulach@1348: extends CharProperty implements Cloneable
jtulach@1348: {
jtulach@1348: public CloneableProperty clone() {
jtulach@1348: try {
jtulach@1348: return (CloneableProperty) super.clone();
jtulach@1348: } catch (CloneNotSupportedException e) {
jtulach@1348: throw new AssertionError(e);
jtulach@1348: }
jtulach@1348: }
jtulach@1348: }
jtulach@1348:
jtulach@1348: private static void defClone(String name,
jtulach@1348: final CloneableProperty p) {
jtulach@1348: map.put(name, new CharPropertyFactory() {
jtulach@1348: CharProperty make() { return p.clone();}});
jtulach@1348: }
jtulach@1348:
jtulach@1348: private static final HashMap map
jtulach@1348: = new HashMap<>();
jtulach@1348:
jtulach@1348: static {
jtulach@1348: // Unicode character property aliases, defined in
jtulach@1348: // http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt
jtulach@1348: defCategory("Cn", 1<