jtulach@1348: /* jtulach@1348: * Copyright (c) 1999, 2011, Oracle and/or its affiliates. All rights reserved. jtulach@1348: * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. jtulach@1348: * jtulach@1348: * This code is free software; you can redistribute it and/or modify it jtulach@1348: * under the terms of the GNU General Public License version 2 only, as jtulach@1348: * published by the Free Software Foundation. Oracle designates this jtulach@1348: * particular file as subject to the "Classpath" exception as provided jtulach@1348: * by Oracle in the LICENSE file that accompanied this code. jtulach@1348: * jtulach@1348: * This code is distributed in the hope that it will be useful, but WITHOUT jtulach@1348: * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or jtulach@1348: * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License jtulach@1348: * version 2 for more details (a copy is included in the LICENSE file that jtulach@1348: * accompanied this code). jtulach@1348: * jtulach@1348: * You should have received a copy of the GNU General Public License version jtulach@1348: * 2 along with this work; if not, write to the Free Software Foundation, jtulach@1348: * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. jtulach@1348: * jtulach@1348: * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA jtulach@1348: * or visit www.oracle.com if you need additional information or have any jtulach@1348: * questions. jtulach@1348: */ jtulach@1348: jtulach@1348: package java.util.regex; jtulach@1348: jtulach@1348: import java.util.Locale; jtulach@1348: import java.util.Map; jtulach@1348: import java.util.ArrayList; jtulach@1348: import java.util.HashMap; jtulach@1348: import java.util.Arrays; jtulach@1348: jtulach@1348: jtulach@1348: /** jtulach@1348: * A compiled representation of a regular expression. jtulach@1348: * jtulach@1348: *

A regular expression, specified as a string, must first be compiled into jtulach@1348: * an instance of this class. The resulting pattern can then be used to create jtulach@1348: * a {@link Matcher} object that can match arbitrary {@link jtulach@1348: * java.lang.CharSequence character sequences} against the regular jtulach@1348: * expression. All of the state involved in performing a match resides in the jtulach@1348: * matcher, so many matchers can share the same pattern. jtulach@1348: * jtulach@1348: *

A typical invocation sequence is thus jtulach@1348: * jtulach@1348: *

jtulach@1348:  * Pattern p = Pattern.{@link #compile compile}("a*b");
jtulach@1348:  * Matcher m = p.{@link #matcher matcher}("aaaaab");
jtulach@1348:  * boolean b = m.{@link Matcher#matches matches}();
jtulach@1348: * jtulach@1348: *

A {@link #matches matches} method is defined by this class as a jtulach@1348: * convenience for when a regular expression is used just once. This method jtulach@1348: * compiles an expression and matches an input sequence against it in a single jtulach@1348: * invocation. The statement jtulach@1348: * jtulach@1348: *

jtulach@1348:  * boolean b = Pattern.matches("a*b", "aaaaab");
jtulach@1348: * jtulach@1348: * is equivalent to the three statements above, though for repeated matches it jtulach@1348: * is less efficient since it does not allow the compiled pattern to be reused. jtulach@1348: * jtulach@1348: *

Instances of this class are immutable and are safe for use by multiple jtulach@1348: * concurrent threads. Instances of the {@link Matcher} class are not safe for jtulach@1348: * such use. jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: *

Summary of regular-expression constructs

jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: *
ConstructMatches
 
Characters
xThe character x
\\The backslash character
\0nThe character with octal value 0n jtulach@1348: * (0 <= n <= 7)
\0nnThe character with octal value 0nn jtulach@1348: * (0 <= n <= 7)
\0mnnThe character with octal value 0mnn jtulach@1348: * (0 <= m <= 3, jtulach@1348: * 0 <= n <= 7)
\xhhThe character with hexadecimal value 0xhh
\uhhhhThe character with hexadecimal value 0xhhhh
\x{h...h}The character with hexadecimal value 0xh...h jtulach@1348: * ({@link java.lang.Character#MIN_CODE_POINT Character.MIN_CODE_POINT} jtulach@1348: *  <= 0xh...h <=  jtulach@1348: * {@link java.lang.Character#MAX_CODE_POINT Character.MAX_CODE_POINT})
\tThe tab character ('\u0009')
\nThe newline (line feed) character ('\u000A')
\rThe carriage-return character ('\u000D')
\fThe form-feed character ('\u000C')
\aThe alert (bell) character ('\u0007')
\eThe escape character ('\u001B')
\cxThe control character corresponding to x
 
Character classes
[abc]a, b, or c (simple class)
[^abc]Any character except a, b, or c (negation)
[a-zA-Z]a through z jtulach@1348: * or A through Z, inclusive (range)
[a-d[m-p]]a through d, jtulach@1348: * or m through p: [a-dm-p] (union)
[a-z&&[def]]d, e, or f (intersection)
[a-z&&[^bc]]a through z, jtulach@1348: * except for b and c: [ad-z] (subtraction)
[a-z&&[^m-p]]a through z, jtulach@1348: * and not m through p: [a-lq-z](subtraction)
 
Predefined character classes
.Any character (may or may not match line terminators)
\dA digit: [0-9]
\DA non-digit: [^0-9]
\sA whitespace character: [ \t\n\x0B\f\r]
\SA non-whitespace character: [^\s]
\wA word character: [a-zA-Z_0-9]
\WA non-word character: [^\w]
 
POSIX character classes (US-ASCII only)
\p{Lower}A lower-case alphabetic character: [a-z]
\p{Upper}An upper-case alphabetic character:[A-Z]
\p{ASCII}All ASCII:[\x00-\x7F]
\p{Alpha}An alphabetic character:[\p{Lower}\p{Upper}]
\p{Digit}A decimal digit: [0-9]
\p{Alnum}An alphanumeric character:[\p{Alpha}\p{Digit}]
\p{Punct}Punctuation: One of !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
\p{Graph}A visible character: [\p{Alnum}\p{Punct}]
\p{Print}A printable character: [\p{Graph}\x20]
\p{Blank}A space or a tab: [ \t]
\p{Cntrl}A control character: [\x00-\x1F\x7F]
\p{XDigit}A hexadecimal digit: [0-9a-fA-F]
\p{Space}A whitespace character: [ \t\n\x0B\f\r]
 
java.lang.Character classes (simple java character type)
\p{javaLowerCase}Equivalent to java.lang.Character.isLowerCase()
\p{javaUpperCase}Equivalent to java.lang.Character.isUpperCase()
\p{javaWhitespace}Equivalent to java.lang.Character.isWhitespace()
\p{javaMirrored}Equivalent to java.lang.Character.isMirrored()
 
Classes for Unicode scripts, blocks, categories and binary properties
\p{IsLatin}A Latin script character (script)
\p{InGreek}A character in the Greek block (block)
\p{Lu}An uppercase letter (category)
\p{IsAlphabetic}An alphabetic character (binary property)
\p{Sc}A currency symbol
\P{InGreek}Any character except one in the Greek block (negation)
[\p{L}&&[^\p{Lu}]] Any letter except an uppercase letter (subtraction)
 
Boundary matchers
^The beginning of a line
$The end of a line
\bA word boundary
\BA non-word boundary
\AThe beginning of the input
\GThe end of the previous match
\ZThe end of the input but for the final jtulach@1348: * terminator, if any
\zThe end of the input
 
Greedy quantifiers
X?X, once or not at all
X*X, zero or more times
X+X, one or more times
X{n}X, exactly n times
X{n,}X, at least n times
X{n,m}X, at least n but not more than m times
 
Reluctant quantifiers
X??X, once or not at all
X*?X, zero or more times
X+?X, one or more times
X{n}?X, exactly n times
X{n,}?X, at least n times
X{n,m}?X, at least n but not more than m times
 
Possessive quantifiers
X?+X, once or not at all
X*+X, zero or more times
X++X, one or more times
X{n}+X, exactly n times
X{n,}+X, at least n times
X{n,m}+X, at least n but not more than m times
 
Logical operators
XYX followed by Y
X|YEither X or Y
(X)X, as a capturing group
 
Back references
\nWhatever the nth jtulach@1348: * capturing group matched
\k<name>Whatever the jtulach@1348: * named-capturing group "name" matched
 
Quotation
\Nothing, but quotes the following character
\QNothing, but quotes all characters until \E
\ENothing, but ends quoting started by \Q
 
Special constructs (named-capturing and non-capturing)
(?<name>X)X, as a named-capturing group
(?:X)X, as a non-capturing group
(?idmsuxU-idmsuxU) Nothing, but turns match flags i jtulach@1348: * d m s jtulach@1348: * u x U jtulach@1348: * on - off
(?idmsux-idmsux:X)  X, as a non-capturing group with the jtulach@1348: * given flags i d jtulach@1348: * m s u jtulach@1348: * x on - off
(?=X)X, via zero-width positive lookahead
(?!X)X, via zero-width negative lookahead
(?<=X)X, via zero-width positive lookbehind
(?<!X)X, via zero-width negative lookbehind
(?>X)X, as an independent, non-capturing group
jtulach@1348: * jtulach@1348: *
jtulach@1348: * jtulach@1348: * jtulach@1348: *
jtulach@1348: *

Backslashes, escapes, and quoting

jtulach@1348: * jtulach@1348: *

The backslash character ('\') serves to introduce escaped jtulach@1348: * constructs, as defined in the table above, as well as to quote characters jtulach@1348: * that otherwise would be interpreted as unescaped constructs. Thus the jtulach@1348: * expression \\ matches a single backslash and \{ matches a jtulach@1348: * left brace. jtulach@1348: * jtulach@1348: *

It is an error to use a backslash prior to any alphabetic character that jtulach@1348: * does not denote an escaped construct; these are reserved for future jtulach@1348: * extensions to the regular-expression language. A backslash may be used jtulach@1348: * prior to a non-alphabetic character regardless of whether that character is jtulach@1348: * part of an unescaped construct. jtulach@1348: * jtulach@1348: *

Backslashes within string literals in Java source code are interpreted jtulach@1348: * as required by jtulach@1348: * The Java™ Language Specification jtulach@1348: * as either Unicode escapes (section 3.3) or other character escapes (section 3.10.6) jtulach@1348: * It is therefore necessary to double backslashes in string jtulach@1348: * literals that represent regular expressions to protect them from jtulach@1348: * interpretation by the Java bytecode compiler. The string literal jtulach@1348: * "\b", for example, matches a single backspace character when jtulach@1348: * interpreted as a regular expression, while "\\b" matches a jtulach@1348: * word boundary. The string literal "\(hello\)" is illegal jtulach@1348: * and leads to a compile-time error; in order to match the string jtulach@1348: * (hello) the string literal "\\(hello\\)" jtulach@1348: * must be used. jtulach@1348: * jtulach@1348: * jtulach@1348: *

Character Classes

jtulach@1348: * jtulach@1348: *

Character classes may appear within other character classes, and jtulach@1348: * may be composed by the union operator (implicit) and the intersection jtulach@1348: * operator (&&). jtulach@1348: * The union operator denotes a class that contains every character that is jtulach@1348: * in at least one of its operand classes. The intersection operator jtulach@1348: * denotes a class that contains every character that is in both of its jtulach@1348: * operand classes. jtulach@1348: * jtulach@1348: *

The precedence of character-class operators is as follows, from jtulach@1348: * highest to lowest: jtulach@1348: * jtulach@1348: *

jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: *
1    Literal escape    \x
2    Grouping[...]
3    Rangea-z
4    Union[a-e][i-u]
5    Intersection[a-z&&[aeiou]]
jtulach@1348: * jtulach@1348: *

Note that a different set of metacharacters are in effect inside jtulach@1348: * a character class than outside a character class. For instance, the jtulach@1348: * regular expression . loses its special meaning inside a jtulach@1348: * character class, while the expression - becomes a range jtulach@1348: * forming metacharacter. jtulach@1348: * jtulach@1348: * jtulach@1348: *

Line terminators

jtulach@1348: * jtulach@1348: *

A line terminator is a one- or two-character sequence that marks jtulach@1348: * the end of a line of the input character sequence. The following are jtulach@1348: * recognized as line terminators: jtulach@1348: * jtulach@1348: *

jtulach@1348: *

If {@link #UNIX_LINES} mode is activated, then the only line terminators jtulach@1348: * recognized are newline characters. jtulach@1348: * jtulach@1348: *

The regular expression . matches any character except a line jtulach@1348: * terminator unless the {@link #DOTALL} flag is specified. jtulach@1348: * jtulach@1348: *

By default, the regular expressions ^ and $ ignore jtulach@1348: * line terminators and only match at the beginning and the end, respectively, jtulach@1348: * of the entire input sequence. If {@link #MULTILINE} mode is activated then jtulach@1348: * ^ matches at the beginning of input and after any line terminator jtulach@1348: * except at the end of input. When in {@link #MULTILINE} mode $ jtulach@1348: * matches just before a line terminator or the end of the input sequence. jtulach@1348: * jtulach@1348: * jtulach@1348: *

Groups and capturing

jtulach@1348: * jtulach@1348: * jtulach@1348: *
Group number
jtulach@1348: *

Capturing groups are numbered by counting their opening parentheses from jtulach@1348: * left to right. In the expression ((A)(B(C))), for example, there jtulach@1348: * are four such groups:

jtulach@1348: * jtulach@1348: *
jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: *
1    ((A)(B(C)))
2    (A)
3    (B(C))
4    (C)
jtulach@1348: * jtulach@1348: *

Group zero always stands for the entire expression. jtulach@1348: * jtulach@1348: *

Capturing groups are so named because, during a match, each subsequence jtulach@1348: * of the input sequence that matches such a group is saved. The captured jtulach@1348: * subsequence may be used later in the expression, via a back reference, and jtulach@1348: * may also be retrieved from the matcher once the match operation is complete. jtulach@1348: * jtulach@1348: * jtulach@1348: *

Group name
jtulach@1348: *

A capturing group can also be assigned a "name", a named-capturing group, jtulach@1348: * and then be back-referenced later by the "name". Group names are composed of jtulach@1348: * the following characters. The first character must be a letter. jtulach@1348: * jtulach@1348: *

jtulach@1348: * jtulach@1348: *

A named-capturing group is still numbered as described in jtulach@1348: * Group number. jtulach@1348: * jtulach@1348: *

The captured input associated with a group is always the subsequence jtulach@1348: * that the group most recently matched. If a group is evaluated a second time jtulach@1348: * because of quantification then its previously-captured value, if any, will jtulach@1348: * be retained if the second evaluation fails. Matching the string jtulach@1348: * "aba" against the expression (a(b)?)+, for example, leaves jtulach@1348: * group two set to "b". All captured input is discarded at the jtulach@1348: * beginning of each match. jtulach@1348: * jtulach@1348: *

Groups beginning with (? are either pure, non-capturing groups jtulach@1348: * that do not capture text and do not count towards the group total, or jtulach@1348: * named-capturing group. jtulach@1348: * jtulach@1348: *

Unicode support

jtulach@1348: * jtulach@1348: *

This class is in conformance with Level 1 of Unicode Technical jtulach@1348: * Standard #18: Unicode Regular Expression, plus RL2.1 jtulach@1348: * Canonical Equivalents. jtulach@1348: *

jtulach@1348: * Unicode escape sequences such as \u2014 in Java source code jtulach@1348: * are processed as described in section 3.3 of jtulach@1348: * The Java™ Language Specification. jtulach@1348: * Such escape sequences are also implemented directly by the regular-expression jtulach@1348: * parser so that Unicode escapes can be used in expressions that are read from jtulach@1348: * files or from the keyboard. Thus the strings "\u2014" and jtulach@1348: * "\\u2014", while not equal, compile into the same pattern, which jtulach@1348: * matches the character with hexadecimal value 0x2014. jtulach@1348: *

jtulach@1348: * A Unicode character can also be represented in a regular-expression by jtulach@1348: * using its Hex notation(hexadecimal code point value) directly as described in construct jtulach@1348: * \x{...}, for example a supplementary character U+2011F jtulach@1348: * can be specified as \x{2011F}, instead of two consecutive jtulach@1348: * Unicode escape sequences of the surrogate pair jtulach@1348: * \uD840\uDD1F. jtulach@1348: *

jtulach@1348: * Unicode scripts, blocks, categories and binary properties are written with jtulach@1348: * the \p and \P constructs as in Perl. jtulach@1348: * \p{prop} matches if jtulach@1348: * the input has the property prop, while \P{prop} jtulach@1348: * does not match if the input has that property. jtulach@1348: *

jtulach@1348: * Scripts, blocks, categories and binary properties can be used both inside jtulach@1348: * and outside of a character class. jtulach@1348: * jtulach@1348: *

jtulach@1348: * Scripts are specified either with the prefix {@code Is}, as in jtulach@1348: * {@code IsHiragana}, or by using the {@code script} keyword (or its short jtulach@1348: * form {@code sc})as in {@code script=Hiragana} or {@code sc=Hiragana}. jtulach@1348: *

jtulach@1348: * The script names supported by Pattern are the valid script names jtulach@1348: * accepted and defined by jtulach@1348: * {@link java.lang.Character.UnicodeScript#forName(String) UnicodeScript.forName}. jtulach@1348: * jtulach@1348: *

jtulach@1348: * Blocks are specified with the prefix {@code In}, as in jtulach@1348: * {@code InMongolian}, or by using the keyword {@code block} (or its short jtulach@1348: * form {@code blk}) as in {@code block=Mongolian} or {@code blk=Mongolian}. jtulach@1348: *

jtulach@1348: * The block names supported by Pattern are the valid block names jtulach@1348: * accepted and defined by jtulach@1348: * {@link java.lang.Character.UnicodeBlock#forName(String) UnicodeBlock.forName}. jtulach@1348: *

jtulach@1348: * jtulach@1348: * Categories may be specified with the optional prefix {@code Is}: jtulach@1348: * Both {@code \p{L}} and {@code \p{IsL}} denote the category of Unicode jtulach@1348: * letters. Same as scripts and blocks, categories can also be specified jtulach@1348: * by using the keyword {@code general_category} (or its short form jtulach@1348: * {@code gc}) as in {@code general_category=Lu} or {@code gc=Lu}. jtulach@1348: *

jtulach@1348: * The supported categories are those of jtulach@1348: * jtulach@1348: * The Unicode Standard in the version specified by the jtulach@1348: * {@link java.lang.Character Character} class. The category names are those jtulach@1348: * defined in the Standard, both normative and informative. jtulach@1348: *

jtulach@1348: * jtulach@1348: * Binary properties are specified with the prefix {@code Is}, as in jtulach@1348: * {@code IsAlphabetic}. The supported binary properties by Pattern jtulach@1348: * are jtulach@1348: *

jtulach@1348: jtulach@1348: jtulach@1348: *

jtulach@1348: * Predefined Character classes and POSIX character classes are in jtulach@1348: * conformance with the recommendation of Annex C: Compatibility Properties jtulach@1348: * of Unicode Regular Expression jtulach@1348: * , when {@link #UNICODE_CHARACTER_CLASS} flag is specified. jtulach@1348: *

jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: *
ClassesMatches
\p{Lower}A lowercase character:\p{IsLowercase}
\p{Upper}An uppercase character:\p{IsUppercase}
\p{ASCII}All ASCII:[\x00-\x7F]
\p{Alpha}An alphabetic character:\p{IsAlphabetic}
\p{Digit}A decimal digit character:p{IsDigit}
\p{Alnum}An alphanumeric character:[\p{IsAlphabetic}\p{IsDigit}]
\p{Punct}A punctuation character:p{IsPunctuation}
\p{Graph}A visible character: [^\p{IsWhite_Space}\p{gc=Cc}\p{gc=Cs}\p{gc=Cn}]
\p{Print}A printable character: [\p{Graph}\p{Blank}&&[^\p{Cntrl}]]
\p{Blank}A space or a tab: [\p{IsWhite_Space}&&[^\p{gc=Zl}\p{gc=Zp}\x0a\x0b\x0c\x0d\x85]]
\p{Cntrl}A control character: \p{gc=Cc}
\p{XDigit}A hexadecimal digit: [\p{gc=Nd}\p{IsHex_Digit}]
\p{Space}A whitespace character:\p{IsWhite_Space}
\dA digit: \p{IsDigit}
\DA non-digit: [^\d]
\sA whitespace character: \p{IsWhite_Space}
\SA non-whitespace character: [^\s]
\wA word character: [\p{Alpha}\p{gc=Mn}\p{gc=Me}\p{gc=Mc}\p{Digit}\p{gc=Pc}]
\WA non-word character: [^\w]
jtulach@1348: *

jtulach@1348: * jtulach@1348: * Categories that behave like the java.lang.Character jtulach@1348: * boolean ismethodname methods (except for the deprecated ones) are jtulach@1348: * available through the same \p{prop} syntax where jtulach@1348: * the specified property has the name javamethodname. jtulach@1348: * jtulach@1348: *

Comparison to Perl 5

jtulach@1348: * jtulach@1348: *

The Pattern engine performs traditional NFA-based matching jtulach@1348: * with ordered alternation as occurs in Perl 5. jtulach@1348: * jtulach@1348: *

Perl constructs not supported by this class:

jtulach@1348: * jtulach@1348: *
jtulach@1348: * jtulach@1348: *

Constructs supported by this class but not by Perl:

jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: *

Notable differences from Perl:

jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: *

For a more precise description of the behavior of regular expression jtulach@1348: * constructs, please see jtulach@1348: * Mastering Regular Expressions, 3nd Edition, Jeffrey E. F. Friedl, jtulach@1348: * O'Reilly and Associates, 2006. jtulach@1348: *

jtulach@1348: * jtulach@1348: * @see java.lang.String#split(String, int) jtulach@1348: * @see java.lang.String#split(String) jtulach@1348: * jtulach@1348: * @author Mike McCloskey jtulach@1348: * @author Mark Reinhold jtulach@1348: * @author JSR-51 Expert Group jtulach@1348: * @since 1.4 jtulach@1348: * @spec JSR-51 jtulach@1348: */ jtulach@1348: jtulach@1348: public final class Pattern jtulach@1348: implements java.io.Serializable jtulach@1348: { jtulach@1348: jtulach@1348: /** jtulach@1348: * Regular expression modifier values. Instead of being passed as jtulach@1348: * arguments, they can also be passed as inline modifiers. jtulach@1348: * For example, the following statements have the same effect. jtulach@1348: *
jtulach@1348:      * RegExp r1 = RegExp.compile("abc", Pattern.I|Pattern.M);
jtulach@1348:      * RegExp r2 = RegExp.compile("(?im)abc", 0);
jtulach@1348:      * 
jtulach@1348: * jtulach@1348: * The flags are duplicated so that the familiar Perl match flag jtulach@1348: * names are available. jtulach@1348: */ jtulach@1348: jtulach@1348: /** jtulach@1348: * Enables Unix lines mode. jtulach@1348: * jtulach@1348: *

In this mode, only the '\n' line terminator is recognized jtulach@1348: * in the behavior of ., ^, and $. jtulach@1348: * jtulach@1348: *

Unix lines mode can also be enabled via the embedded flag jtulach@1348: * expression (?d). jtulach@1348: */ jtulach@1348: public static final int UNIX_LINES = 0x01; jtulach@1348: jtulach@1348: /** jtulach@1348: * Enables case-insensitive matching. jtulach@1348: * jtulach@1348: *

By default, case-insensitive matching assumes that only characters jtulach@1348: * in the US-ASCII charset are being matched. Unicode-aware jtulach@1348: * case-insensitive matching can be enabled by specifying the {@link jtulach@1348: * #UNICODE_CASE} flag in conjunction with this flag. jtulach@1348: * jtulach@1348: *

Case-insensitive matching can also be enabled via the embedded flag jtulach@1348: * expression (?i). jtulach@1348: * jtulach@1348: *

Specifying this flag may impose a slight performance penalty.

jtulach@1348: */ jtulach@1348: public static final int CASE_INSENSITIVE = 0x02; jtulach@1348: jtulach@1348: /** jtulach@1348: * Permits whitespace and comments in pattern. jtulach@1348: * jtulach@1348: *

In this mode, whitespace is ignored, and embedded comments starting jtulach@1348: * with # are ignored until the end of a line. jtulach@1348: * jtulach@1348: *

Comments mode can also be enabled via the embedded flag jtulach@1348: * expression (?x). jtulach@1348: */ jtulach@1348: public static final int COMMENTS = 0x04; jtulach@1348: jtulach@1348: /** jtulach@1348: * Enables multiline mode. jtulach@1348: * jtulach@1348: *

In multiline mode the expressions ^ and $ match jtulach@1348: * just after or just before, respectively, a line terminator or the end of jtulach@1348: * the input sequence. By default these expressions only match at the jtulach@1348: * beginning and the end of the entire input sequence. jtulach@1348: * jtulach@1348: *

Multiline mode can also be enabled via the embedded flag jtulach@1348: * expression (?m).

jtulach@1348: */ jtulach@1348: public static final int MULTILINE = 0x08; jtulach@1348: jtulach@1348: /** jtulach@1348: * Enables literal parsing of the pattern. jtulach@1348: * jtulach@1348: *

When this flag is specified then the input string that specifies jtulach@1348: * the pattern is treated as a sequence of literal characters. jtulach@1348: * Metacharacters or escape sequences in the input sequence will be jtulach@1348: * given no special meaning. jtulach@1348: * jtulach@1348: *

The flags CASE_INSENSITIVE and UNICODE_CASE retain their impact on jtulach@1348: * matching when used in conjunction with this flag. The other flags jtulach@1348: * become superfluous. jtulach@1348: * jtulach@1348: *

There is no embedded flag character for enabling literal parsing. jtulach@1348: * @since 1.5 jtulach@1348: */ jtulach@1348: public static final int LITERAL = 0x10; jtulach@1348: jtulach@1348: /** jtulach@1348: * Enables dotall mode. jtulach@1348: * jtulach@1348: *

In dotall mode, the expression . matches any character, jtulach@1348: * including a line terminator. By default this expression does not match jtulach@1348: * line terminators. jtulach@1348: * jtulach@1348: *

Dotall mode can also be enabled via the embedded flag jtulach@1348: * expression (?s). (The s is a mnemonic for jtulach@1348: * "single-line" mode, which is what this is called in Perl.)

jtulach@1348: */ jtulach@1348: public static final int DOTALL = 0x20; jtulach@1348: jtulach@1348: /** jtulach@1348: * Enables Unicode-aware case folding. jtulach@1348: * jtulach@1348: *

When this flag is specified then case-insensitive matching, when jtulach@1348: * enabled by the {@link #CASE_INSENSITIVE} flag, is done in a manner jtulach@1348: * consistent with the Unicode Standard. By default, case-insensitive jtulach@1348: * matching assumes that only characters in the US-ASCII charset are being jtulach@1348: * matched. jtulach@1348: * jtulach@1348: *

Unicode-aware case folding can also be enabled via the embedded flag jtulach@1348: * expression (?u). jtulach@1348: * jtulach@1348: *

Specifying this flag may impose a performance penalty.

jtulach@1348: */ jtulach@1348: public static final int UNICODE_CASE = 0x40; jtulach@1348: jtulach@1348: /** jtulach@1348: * Enables canonical equivalence. jtulach@1348: * jtulach@1348: *

When this flag is specified then two characters will be considered jtulach@1348: * to match if, and only if, their full canonical decompositions match. jtulach@1348: * The expression "a\u030A", for example, will match the jtulach@1348: * string "\u00E5" when this flag is specified. By default, jtulach@1348: * matching does not take canonical equivalence into account. jtulach@1348: * jtulach@1348: *

There is no embedded flag character for enabling canonical jtulach@1348: * equivalence. jtulach@1348: * jtulach@1348: *

Specifying this flag may impose a performance penalty.

jtulach@1348: */ jtulach@1348: public static final int CANON_EQ = 0x80; jtulach@1348: jtulach@1348: /** jtulach@1348: * Enables the Unicode version of Predefined character classes and jtulach@1348: * POSIX character classes. jtulach@1348: * jtulach@1348: *

When this flag is specified then the (US-ASCII only) jtulach@1348: * Predefined character classes and POSIX character classes jtulach@1348: * are in conformance with jtulach@1348: * Unicode Technical jtulach@1348: * Standard #18: Unicode Regular Expression jtulach@1348: * Annex C: Compatibility Properties. jtulach@1348: *

jtulach@1348: * The UNICODE_CHARACTER_CLASS mode can also be enabled via the embedded jtulach@1348: * flag expression (?U). jtulach@1348: *

jtulach@1348: * The flag implies UNICODE_CASE, that is, it enables Unicode-aware case jtulach@1348: * folding. jtulach@1348: *

jtulach@1348: * Specifying this flag may impose a performance penalty.

jtulach@1348: * @since 1.7 jtulach@1348: */ jtulach@1348: public static final int UNICODE_CHARACTER_CLASS = 0x100; jtulach@1348: jtulach@1348: /* Pattern has only two serialized components: The pattern string jtulach@1348: * and the flags, which are all that is needed to recompile the pattern jtulach@1348: * when it is deserialized. jtulach@1348: */ jtulach@1348: jtulach@1348: /** use serialVersionUID from Merlin b59 for interoperability */ jtulach@1348: private static final long serialVersionUID = 5073258162644648461L; jtulach@1348: jtulach@1348: /** jtulach@1348: * The original regular-expression pattern string. jtulach@1348: * jtulach@1348: * @serial jtulach@1348: */ jtulach@1348: private String pattern; jtulach@1348: jtulach@1348: /** jtulach@1348: * The original pattern flags. jtulach@1348: * jtulach@1348: * @serial jtulach@1348: */ jtulach@1348: private int flags; jtulach@1348: jtulach@1348: /** jtulach@1348: * Boolean indicating this Pattern is compiled; this is necessary in order jtulach@1348: * to lazily compile deserialized Patterns. jtulach@1348: */ jtulach@1348: private transient volatile boolean compiled = false; jtulach@1348: jtulach@1348: /** jtulach@1348: * The normalized pattern string. jtulach@1348: */ jtulach@1348: private transient String normalizedPattern; jtulach@1348: jtulach@1348: /** jtulach@1348: * The starting point of state machine for the find operation. This allows jtulach@1348: * a match to start anywhere in the input. jtulach@1348: */ jtulach@1348: transient Node root; jtulach@1348: jtulach@1348: /** jtulach@1348: * The root of object tree for a match operation. The pattern is matched jtulach@1348: * at the beginning. This may include a find that uses BnM or a First jtulach@1348: * node. jtulach@1348: */ jtulach@1348: transient Node matchRoot; jtulach@1348: jtulach@1348: /** jtulach@1348: * Temporary storage used by parsing pattern slice. jtulach@1348: */ jtulach@1348: transient int[] buffer; jtulach@1348: jtulach@1348: /** jtulach@1348: * Map the "name" of the "named capturing group" to its group id jtulach@1348: * node. jtulach@1348: */ jtulach@1348: transient volatile Map namedGroups; jtulach@1348: jtulach@1348: /** jtulach@1348: * Temporary storage used while parsing group references. jtulach@1348: */ jtulach@1348: transient GroupHead[] groupNodes; jtulach@1348: jtulach@1348: /** jtulach@1348: * Temporary null terminated code point array used by pattern compiling. jtulach@1348: */ jtulach@1348: private transient int[] temp; jtulach@1348: jtulach@1348: /** jtulach@1348: * The number of capturing groups in this Pattern. Used by matchers to jtulach@1348: * allocate storage needed to perform a match. jtulach@1348: */ jtulach@1348: transient int capturingGroupCount; jtulach@1348: jtulach@1348: /** jtulach@1348: * The local variable count used by parsing tree. Used by matchers to jtulach@1348: * allocate storage needed to perform a match. jtulach@1348: */ jtulach@1348: transient int localCount; jtulach@1348: jtulach@1348: /** jtulach@1348: * Index into the pattern string that keeps track of how much has been jtulach@1348: * parsed. jtulach@1348: */ jtulach@1348: private transient int cursor; jtulach@1348: jtulach@1348: /** jtulach@1348: * Holds the length of the pattern string. jtulach@1348: */ jtulach@1348: private transient int patternLength; jtulach@1348: jtulach@1348: /** jtulach@1348: * If the Start node might possibly match supplementary characters. jtulach@1348: * It is set to true during compiling if jtulach@1348: * (1) There is supplementary char in pattern, or jtulach@1348: * (2) There is complement node of Category or Block jtulach@1348: */ jtulach@1348: private transient boolean hasSupplementary; jtulach@1348: jtulach@1348: /** jtulach@1348: * Compiles the given regular expression into a pattern.

jtulach@1348: * jtulach@1348: * @param regex jtulach@1348: * The expression to be compiled jtulach@1348: * jtulach@1348: * @throws PatternSyntaxException jtulach@1348: * If the expression's syntax is invalid jtulach@1348: */ jtulach@1348: public static Pattern compile(String regex) { jtulach@1348: return new Pattern(regex, 0); jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Compiles the given regular expression into a pattern with the given jtulach@1348: * flags.

jtulach@1348: * jtulach@1348: * @param regex jtulach@1348: * The expression to be compiled jtulach@1348: * jtulach@1348: * @param flags jtulach@1348: * Match flags, a bit mask that may include jtulach@1348: * {@link #CASE_INSENSITIVE}, {@link #MULTILINE}, {@link #DOTALL}, jtulach@1348: * {@link #UNICODE_CASE}, {@link #CANON_EQ}, {@link #UNIX_LINES}, jtulach@1348: * {@link #LITERAL}, {@link #UNICODE_CHARACTER_CLASS} jtulach@1348: * and {@link #COMMENTS} jtulach@1348: * jtulach@1348: * @throws IllegalArgumentException jtulach@1348: * If bit values other than those corresponding to the defined jtulach@1348: * match flags are set in flags jtulach@1348: * jtulach@1348: * @throws PatternSyntaxException jtulach@1348: * If the expression's syntax is invalid jtulach@1348: */ jtulach@1348: public static Pattern compile(String regex, int flags) { jtulach@1348: return new Pattern(regex, flags); jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Returns the regular expression from which this pattern was compiled. jtulach@1348: *

jtulach@1348: * jtulach@1348: * @return The source of this pattern jtulach@1348: */ jtulach@1348: public String pattern() { jtulach@1348: return pattern; jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: *

Returns the string representation of this pattern. This jtulach@1348: * is the regular expression from which this pattern was jtulach@1348: * compiled.

jtulach@1348: * jtulach@1348: * @return The string representation of this pattern jtulach@1348: * @since 1.5 jtulach@1348: */ jtulach@1348: public String toString() { jtulach@1348: return pattern; jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Creates a matcher that will match the given input against this pattern. jtulach@1348: *

jtulach@1348: * jtulach@1348: * @param input jtulach@1348: * The character sequence to be matched jtulach@1348: * jtulach@1348: * @return A new matcher for this pattern jtulach@1348: */ jtulach@1348: public Matcher matcher(CharSequence input) { jtulach@1348: if (!compiled) { jtulach@1348: synchronized(this) { jtulach@1348: if (!compiled) jtulach@1348: compile(); jtulach@1348: } jtulach@1348: } jtulach@1348: Matcher m = new Matcher(this, input); jtulach@1348: return m; jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Returns this pattern's match flags.

jtulach@1348: * jtulach@1348: * @return The match flags specified when this pattern was compiled jtulach@1348: */ jtulach@1348: public int flags() { jtulach@1348: return flags; jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Compiles the given regular expression and attempts to match the given jtulach@1348: * input against it. jtulach@1348: * jtulach@1348: *

An invocation of this convenience method of the form jtulach@1348: * jtulach@1348: *

jtulach@1348:      * Pattern.matches(regex, input);
jtulach@1348: * jtulach@1348: * behaves in exactly the same way as the expression jtulach@1348: * jtulach@1348: *
jtulach@1348:      * Pattern.compile(regex).matcher(input).matches()
jtulach@1348: * jtulach@1348: *

If a pattern is to be used multiple times, compiling it once and reusing jtulach@1348: * it will be more efficient than invoking this method each time.

jtulach@1348: * jtulach@1348: * @param regex jtulach@1348: * The expression to be compiled jtulach@1348: * jtulach@1348: * @param input jtulach@1348: * The character sequence to be matched jtulach@1348: * jtulach@1348: * @throws PatternSyntaxException jtulach@1348: * If the expression's syntax is invalid jtulach@1348: */ jtulach@1348: public static boolean matches(String regex, CharSequence input) { jtulach@1348: Pattern p = Pattern.compile(regex); jtulach@1348: Matcher m = p.matcher(input); jtulach@1348: return m.matches(); jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Splits the given input sequence around matches of this pattern. jtulach@1348: * jtulach@1348: *

The array returned by this method contains each substring of the jtulach@1348: * input sequence that is terminated by another subsequence that matches jtulach@1348: * this pattern or is terminated by the end of the input sequence. The jtulach@1348: * substrings in the array are in the order in which they occur in the jtulach@1348: * input. If this pattern does not match any subsequence of the input then jtulach@1348: * the resulting array has just one element, namely the input sequence in jtulach@1348: * string form. jtulach@1348: * jtulach@1348: *

The limit parameter controls the number of times the jtulach@1348: * pattern is applied and therefore affects the length of the resulting jtulach@1348: * array. If the limit n is greater than zero then the pattern jtulach@1348: * will be applied at most n - 1 times, the array's jtulach@1348: * length will be no greater than n, and the array's last entry jtulach@1348: * will contain all input beyond the last matched delimiter. If n jtulach@1348: * is non-positive then the pattern will be applied as many times as jtulach@1348: * possible and the array can have any length. If n is zero then jtulach@1348: * the pattern will be applied as many times as possible, the array can jtulach@1348: * have any length, and trailing empty strings will be discarded. jtulach@1348: * jtulach@1348: *

The input "boo:and:foo", for example, yields the following jtulach@1348: * results with these parameters: jtulach@1348: * jtulach@1348: *

jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: *

Regex    

Limit    

Result    

:2{ "boo", "and:foo" }
:5{ "boo", "and", "foo" }
:-2{ "boo", "and", "foo" }
o5{ "b", "", ":and:f", "", "" }
o-2{ "b", "", ":and:f", "", "" }
o0{ "b", "", ":and:f" }
jtulach@1348: * jtulach@1348: * jtulach@1348: * @param input jtulach@1348: * The character sequence to be split jtulach@1348: * jtulach@1348: * @param limit jtulach@1348: * The result threshold, as described above jtulach@1348: * jtulach@1348: * @return The array of strings computed by splitting the input jtulach@1348: * around matches of this pattern jtulach@1348: */ jtulach@1348: public String[] split(CharSequence input, int limit) { jtulach@1348: int index = 0; jtulach@1348: boolean matchLimited = limit > 0; jtulach@1348: ArrayList matchList = new ArrayList<>(); jtulach@1348: Matcher m = matcher(input); jtulach@1348: jtulach@1348: // Add segments before each match found jtulach@1348: while(m.find()) { jtulach@1348: if (!matchLimited || matchList.size() < limit - 1) { jtulach@1348: String match = input.subSequence(index, m.start()).toString(); jtulach@1348: matchList.add(match); jtulach@1348: index = m.end(); jtulach@1348: } else if (matchList.size() == limit - 1) { // last one jtulach@1348: String match = input.subSequence(index, jtulach@1348: input.length()).toString(); jtulach@1348: matchList.add(match); jtulach@1348: index = m.end(); jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: // If no match was found, return this jtulach@1348: if (index == 0) jtulach@1348: return new String[] {input.toString()}; jtulach@1348: jtulach@1348: // Add remaining segment jtulach@1348: if (!matchLimited || matchList.size() < limit) jtulach@1348: matchList.add(input.subSequence(index, input.length()).toString()); jtulach@1348: jtulach@1348: // Construct result jtulach@1348: int resultSize = matchList.size(); jtulach@1348: if (limit == 0) jtulach@1348: while (resultSize > 0 && matchList.get(resultSize-1).equals("")) jtulach@1348: resultSize--; jtulach@1348: String[] result = new String[resultSize]; jtulach@1348: return matchList.subList(0, resultSize).toArray(result); jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Splits the given input sequence around matches of this pattern. jtulach@1348: * jtulach@1348: *

This method works as if by invoking the two-argument {@link jtulach@1348: * #split(java.lang.CharSequence, int) split} method with the given input jtulach@1348: * sequence and a limit argument of zero. Trailing empty strings are jtulach@1348: * therefore not included in the resulting array.

jtulach@1348: * jtulach@1348: *

The input "boo:and:foo", for example, yields the following jtulach@1348: * results with these expressions: jtulach@1348: * jtulach@1348: *

jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: * jtulach@1348: *

Regex    

Result

:{ "boo", "and", "foo" }
o{ "b", "", ":and:f" }
jtulach@1348: * jtulach@1348: * jtulach@1348: * @param input jtulach@1348: * The character sequence to be split jtulach@1348: * jtulach@1348: * @return The array of strings computed by splitting the input jtulach@1348: * around matches of this pattern jtulach@1348: */ jtulach@1348: public String[] split(CharSequence input) { jtulach@1348: return split(input, 0); jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Returns a literal pattern String for the specified jtulach@1348: * String. jtulach@1348: * jtulach@1348: *

This method produces a String that can be used to jtulach@1348: * create a Pattern that would match the string jtulach@1348: * s as if it were a literal pattern.

Metacharacters jtulach@1348: * or escape sequences in the input sequence will be given no special jtulach@1348: * meaning. jtulach@1348: * jtulach@1348: * @param s The string to be literalized jtulach@1348: * @return A literal string replacement jtulach@1348: * @since 1.5 jtulach@1348: */ jtulach@1348: public static String quote(String s) { jtulach@1348: int slashEIndex = s.indexOf("\\E"); jtulach@1348: if (slashEIndex == -1) jtulach@1348: return "\\Q" + s + "\\E"; jtulach@1348: jtulach@1348: StringBuilder sb = new StringBuilder(s.length() * 2); jtulach@1348: sb.append("\\Q"); jtulach@1348: slashEIndex = 0; jtulach@1348: int current = 0; jtulach@1348: while ((slashEIndex = s.indexOf("\\E", current)) != -1) { jtulach@1348: sb.append(s.substring(current, slashEIndex)); jtulach@1348: current = slashEIndex + 2; jtulach@1348: sb.append("\\E\\\\E\\Q"); jtulach@1348: } jtulach@1348: sb.append(s.substring(current, s.length())); jtulach@1348: sb.append("\\E"); jtulach@1348: return sb.toString(); jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Recompile the Pattern instance from a stream. The original pattern jtulach@1348: * string is read in and the object tree is recompiled from it. jtulach@1348: */ jtulach@1348: private void readObject(java.io.ObjectInputStream s) jtulach@1348: throws java.io.IOException, ClassNotFoundException { jtulach@1348: jtulach@1348: // Read in all fields jtulach@1348: s.defaultReadObject(); jtulach@1348: jtulach@1348: // Initialize counts jtulach@1348: capturingGroupCount = 1; jtulach@1348: localCount = 0; jtulach@1348: jtulach@1348: // if length > 0, the Pattern is lazily compiled jtulach@1348: compiled = false; jtulach@1348: if (pattern.length() == 0) { jtulach@1348: root = new Start(lastAccept); jtulach@1348: matchRoot = lastAccept; jtulach@1348: compiled = true; jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * This private constructor is used to create all Patterns. The pattern jtulach@1348: * string and match flags are all that is needed to completely describe jtulach@1348: * a Pattern. An empty pattern string results in an object tree with jtulach@1348: * only a Start node and a LastNode node. jtulach@1348: */ jtulach@1348: private Pattern(String p, int f) { jtulach@1348: pattern = p; jtulach@1348: flags = f; jtulach@1348: jtulach@1348: // to use UNICODE_CASE if UNICODE_CHARACTER_CLASS present jtulach@1348: if ((flags & UNICODE_CHARACTER_CLASS) != 0) jtulach@1348: flags |= UNICODE_CASE; jtulach@1348: jtulach@1348: // Reset group index count jtulach@1348: capturingGroupCount = 1; jtulach@1348: localCount = 0; jtulach@1348: jtulach@1348: if (pattern.length() > 0) { jtulach@1348: compile(); jtulach@1348: } else { jtulach@1348: root = new Start(lastAccept); jtulach@1348: matchRoot = lastAccept; jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * The pattern is converted to normalizedD form and then a pure group jtulach@1348: * is constructed to match canonical equivalences of the characters. jtulach@1348: */ jtulach@1348: private void normalize() { jtulach@1348: boolean inCharClass = false; jtulach@1348: int lastCodePoint = -1; jtulach@1348: jtulach@1348: // Convert pattern into normalizedD form jtulach@1350: normalizedPattern = Normalizer.normalize(pattern, Normalizer.NFD); jtulach@1348: patternLength = normalizedPattern.length(); jtulach@1348: jtulach@1348: // Modify pattern to match canonical equivalences jtulach@1348: StringBuilder newPattern = new StringBuilder(patternLength); jtulach@1348: for(int i=0; i= patternLength) jtulach@1348: break; jtulach@1348: c = normalizedPattern.codePointAt(i); jtulach@1348: sequenceBuffer.appendCodePoint(c); jtulach@1348: } jtulach@1348: String ea = produceEquivalentAlternation( jtulach@1348: sequenceBuffer.toString()); jtulach@1348: newPattern.setLength(newPattern.length()-Character.charCount(lastCodePoint)); jtulach@1348: newPattern.append("(?:").append(ea).append(")"); jtulach@1348: } else if (c == '[' && lastCodePoint != '\\') { jtulach@1348: i = normalizeCharClass(newPattern, i); jtulach@1348: } else { jtulach@1348: newPattern.appendCodePoint(c); jtulach@1348: } jtulach@1348: lastCodePoint = c; jtulach@1348: i += Character.charCount(c); jtulach@1348: } jtulach@1348: normalizedPattern = newPattern.toString(); jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Complete the character class being parsed and add a set jtulach@1348: * of alternations to it that will match the canonical equivalences jtulach@1348: * of the characters within the class. jtulach@1348: */ jtulach@1348: private int normalizeCharClass(StringBuilder newPattern, int i) { jtulach@1348: StringBuilder charClass = new StringBuilder(); jtulach@1348: StringBuilder eq = null; jtulach@1348: int lastCodePoint = -1; jtulach@1348: String result; jtulach@1348: jtulach@1348: i++; jtulach@1348: charClass.append("["); jtulach@1348: while(true) { jtulach@1348: int c = normalizedPattern.codePointAt(i); jtulach@1348: StringBuilder sequenceBuffer; jtulach@1348: jtulach@1348: if (c == ']' && lastCodePoint != '\\') { jtulach@1348: charClass.append((char)c); jtulach@1348: break; jtulach@1348: } else if (Character.getType(c) == Character.NON_SPACING_MARK) { jtulach@1348: sequenceBuffer = new StringBuilder(); jtulach@1348: sequenceBuffer.appendCodePoint(lastCodePoint); jtulach@1348: while(Character.getType(c) == Character.NON_SPACING_MARK) { jtulach@1348: sequenceBuffer.appendCodePoint(c); jtulach@1348: i += Character.charCount(c); jtulach@1348: if (i >= normalizedPattern.length()) jtulach@1348: break; jtulach@1348: c = normalizedPattern.codePointAt(i); jtulach@1348: } jtulach@1348: String ea = produceEquivalentAlternation( jtulach@1348: sequenceBuffer.toString()); jtulach@1348: jtulach@1348: charClass.setLength(charClass.length()-Character.charCount(lastCodePoint)); jtulach@1348: if (eq == null) jtulach@1348: eq = new StringBuilder(); jtulach@1348: eq.append('|'); jtulach@1348: eq.append(ea); jtulach@1348: } else { jtulach@1348: charClass.appendCodePoint(c); jtulach@1348: i++; jtulach@1348: } jtulach@1348: if (i == normalizedPattern.length()) jtulach@1348: throw error("Unclosed character class"); jtulach@1348: lastCodePoint = c; jtulach@1348: } jtulach@1348: jtulach@1348: if (eq != null) { jtulach@1348: result = "(?:"+charClass.toString()+eq.toString()+")"; jtulach@1348: } else { jtulach@1348: result = charClass.toString(); jtulach@1348: } jtulach@1348: jtulach@1348: newPattern.append(result); jtulach@1348: return i; jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Given a specific sequence composed of a regular character and jtulach@1348: * combining marks that follow it, produce the alternation that will jtulach@1348: * match all canonical equivalences of that sequence. jtulach@1348: */ jtulach@1348: private String produceEquivalentAlternation(String source) { jtulach@1348: int len = countChars(source, 0, 1); jtulach@1348: if (source.length() == len) jtulach@1348: // source has one character. jtulach@1348: return source; jtulach@1348: jtulach@1348: String base = source.substring(0,len); jtulach@1348: String combiningMarks = source.substring(len); jtulach@1348: jtulach@1348: String[] perms = producePermutations(combiningMarks); jtulach@1348: StringBuilder result = new StringBuilder(source); jtulach@1348: jtulach@1348: // Add combined permutations jtulach@1348: for(int x=0; x0) jtulach@1348: result.append("|"+next); jtulach@1348: next = composeOneStep(next); jtulach@1348: if (next != null) jtulach@1348: result.append("|"+produceEquivalentAlternation(next)); jtulach@1348: } jtulach@1348: return result.toString(); jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Returns an array of strings that have all the possible jtulach@1348: * permutations of the characters in the input string. jtulach@1348: * This is used to get a list of all possible orderings jtulach@1348: * of a set of combining marks. Note that some of the permutations jtulach@1348: * are invalid because of combining class collisions, and these jtulach@1348: * possibilities must be removed because they are not canonically jtulach@1348: * equivalent. jtulach@1348: */ jtulach@1348: private String[] producePermutations(String input) { jtulach@1348: if (input.length() == countChars(input, 0, 1)) jtulach@1348: return new String[] {input}; jtulach@1348: jtulach@1348: if (input.length() == countChars(input, 0, 2)) { jtulach@1348: int c0 = Character.codePointAt(input, 0); jtulach@1348: int c1 = Character.codePointAt(input, Character.charCount(c0)); jtulach@1348: if (getClass(c1) == getClass(c0)) { jtulach@1348: return new String[] {input}; jtulach@1348: } jtulach@1348: String[] result = new String[2]; jtulach@1348: result[0] = input; jtulach@1348: StringBuilder sb = new StringBuilder(2); jtulach@1348: sb.appendCodePoint(c1); jtulach@1348: sb.appendCodePoint(c0); jtulach@1348: result[1] = sb.toString(); jtulach@1348: return result; jtulach@1348: } jtulach@1348: jtulach@1348: int length = 1; jtulach@1348: int nCodePoints = countCodePoints(input); jtulach@1348: for(int x=1; x=0; y--) { jtulach@1348: if (combClass[y] == combClass[x]) { jtulach@1348: continue loop; jtulach@1348: } jtulach@1348: } jtulach@1348: StringBuilder sb = new StringBuilder(input); jtulach@1348: String otherChars = sb.delete(offset, offset+len).toString(); jtulach@1348: String[] subResult = producePermutations(otherChars); jtulach@1348: jtulach@1348: String prefix = input.substring(offset, offset+len); jtulach@1348: for(int y=0; y= pLen - 1) // No \Q sequence found jtulach@1348: return; jtulach@1348: int j = i; jtulach@1348: i += 2; jtulach@1348: int[] newtemp = new int[j + 2*(pLen-i) + 2]; jtulach@1348: System.arraycopy(temp, 0, newtemp, 0, j); jtulach@1348: jtulach@1348: boolean inQuote = true; jtulach@1348: while (i < pLen) { jtulach@1348: int c = temp[i++]; jtulach@1348: if (! ASCII.isAscii(c) || ASCII.isAlnum(c)) { jtulach@1348: newtemp[j++] = c; jtulach@1348: } else if (c != '\\') { jtulach@1348: if (inQuote) newtemp[j++] = '\\'; jtulach@1348: newtemp[j++] = c; jtulach@1348: } else if (inQuote) { jtulach@1348: if (temp[i] == 'E') { jtulach@1348: i++; jtulach@1348: inQuote = false; jtulach@1348: } else { jtulach@1348: newtemp[j++] = '\\'; jtulach@1348: newtemp[j++] = '\\'; jtulach@1348: } jtulach@1348: } else { jtulach@1348: if (temp[i] == 'Q') { jtulach@1348: i++; jtulach@1348: inQuote = true; jtulach@1348: } else { jtulach@1348: newtemp[j++] = c; jtulach@1348: if (i != pLen) jtulach@1348: newtemp[j++] = temp[i++]; jtulach@1348: } jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: patternLength = j; jtulach@1348: temp = Arrays.copyOf(newtemp, j + 2); // double zero termination jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Copies regular expression to an int array and invokes the parsing jtulach@1348: * of the expression which will create the object tree. jtulach@1348: */ jtulach@1348: private void compile() { jtulach@1348: // Handle canonical equivalences jtulach@1348: if (has(CANON_EQ) && !has(LITERAL)) { jtulach@1348: normalize(); jtulach@1348: } else { jtulach@1348: normalizedPattern = pattern; jtulach@1348: } jtulach@1348: patternLength = normalizedPattern.length(); jtulach@1348: jtulach@1348: // Copy pattern to int array for convenience jtulach@1348: // Use double zero to terminate pattern jtulach@1348: temp = new int[patternLength + 2]; jtulach@1348: jtulach@1348: hasSupplementary = false; jtulach@1348: int c, count = 0; jtulach@1348: // Convert all chars into code points jtulach@1348: for (int x = 0; x < patternLength; x += Character.charCount(c)) { jtulach@1348: c = normalizedPattern.codePointAt(x); jtulach@1348: if (isSupplementary(c)) { jtulach@1348: hasSupplementary = true; jtulach@1348: } jtulach@1348: temp[count++] = c; jtulach@1348: } jtulach@1348: jtulach@1348: patternLength = count; // patternLength now in code points jtulach@1348: jtulach@1348: if (! has(LITERAL)) jtulach@1348: RemoveQEQuoting(); jtulach@1348: jtulach@1348: // Allocate all temporary objects here. jtulach@1348: buffer = new int[32]; jtulach@1348: groupNodes = new GroupHead[10]; jtulach@1348: namedGroups = null; jtulach@1348: jtulach@1348: if (has(LITERAL)) { jtulach@1348: // Literal pattern handling jtulach@1348: matchRoot = newSlice(temp, patternLength, hasSupplementary); jtulach@1348: matchRoot.next = lastAccept; jtulach@1348: } else { jtulach@1348: // Start recursive descent parsing jtulach@1348: matchRoot = expr(lastAccept); jtulach@1348: // Check extra pattern characters jtulach@1348: if (patternLength != cursor) { jtulach@1348: if (peek() == ')') { jtulach@1348: throw error("Unmatched closing ')'"); jtulach@1348: } else { jtulach@1348: throw error("Unexpected internal error"); jtulach@1348: } jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: // Peephole optimization jtulach@1348: if (matchRoot instanceof Slice) { jtulach@1348: root = BnM.optimize(matchRoot); jtulach@1348: if (root == matchRoot) { jtulach@1348: root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot); jtulach@1348: } jtulach@1348: } else if (matchRoot instanceof Begin || matchRoot instanceof First) { jtulach@1348: root = matchRoot; jtulach@1348: } else { jtulach@1348: root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot); jtulach@1348: } jtulach@1348: jtulach@1348: // Release temporary storage jtulach@1348: temp = null; jtulach@1348: buffer = null; jtulach@1348: groupNodes = null; jtulach@1348: patternLength = 0; jtulach@1348: compiled = true; jtulach@1348: } jtulach@1348: jtulach@1348: Map namedGroups() { jtulach@1348: if (namedGroups == null) jtulach@1348: namedGroups = new HashMap<>(2); jtulach@1348: return namedGroups; jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Used to print out a subtree of the Pattern to help with debugging. jtulach@1348: */ jtulach@1348: private static void printObjectTree(Node node) { jtulach@1348: while(node != null) { jtulach@1348: if (node instanceof Prolog) { jtulach@1348: System.out.println(node); jtulach@1348: printObjectTree(((Prolog)node).loop); jtulach@1348: System.out.println("**** end contents prolog loop"); jtulach@1348: } else if (node instanceof Loop) { jtulach@1348: System.out.println(node); jtulach@1348: printObjectTree(((Loop)node).body); jtulach@1348: System.out.println("**** end contents Loop body"); jtulach@1348: } else if (node instanceof Curly) { jtulach@1348: System.out.println(node); jtulach@1348: printObjectTree(((Curly)node).atom); jtulach@1348: System.out.println("**** end contents Curly body"); jtulach@1348: } else if (node instanceof GroupCurly) { jtulach@1348: System.out.println(node); jtulach@1348: printObjectTree(((GroupCurly)node).atom); jtulach@1348: System.out.println("**** end contents GroupCurly body"); jtulach@1348: } else if (node instanceof GroupTail) { jtulach@1348: System.out.println(node); jtulach@1348: System.out.println("Tail next is "+node.next); jtulach@1348: return; jtulach@1348: } else { jtulach@1348: System.out.println(node); jtulach@1348: } jtulach@1348: node = node.next; jtulach@1348: if (node != null) jtulach@1348: System.out.println("->next:"); jtulach@1348: if (node == Pattern.accept) { jtulach@1348: System.out.println("Accept Node"); jtulach@1348: node = null; jtulach@1348: } jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Used to accumulate information about a subtree of the object graph jtulach@1348: * so that optimizations can be applied to the subtree. jtulach@1348: */ jtulach@1348: static final class TreeInfo { jtulach@1348: int minLength; jtulach@1348: int maxLength; jtulach@1348: boolean maxValid; jtulach@1348: boolean deterministic; jtulach@1348: jtulach@1348: TreeInfo() { jtulach@1348: reset(); jtulach@1348: } jtulach@1348: void reset() { jtulach@1348: minLength = 0; jtulach@1348: maxLength = 0; jtulach@1348: maxValid = true; jtulach@1348: deterministic = true; jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: /* jtulach@1348: * The following private methods are mainly used to improve the jtulach@1348: * readability of the code. In order to let the Java compiler easily jtulach@1348: * inline them, we should not put many assertions or error checks in them. jtulach@1348: */ jtulach@1348: jtulach@1348: /** jtulach@1348: * Indicates whether a particular flag is set or not. jtulach@1348: */ jtulach@1348: private boolean has(int f) { jtulach@1348: return (flags & f) != 0; jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Match next character, signal error if failed. jtulach@1348: */ jtulach@1348: private void accept(int ch, String s) { jtulach@1348: int testChar = temp[cursor++]; jtulach@1348: if (has(COMMENTS)) jtulach@1348: testChar = parsePastWhitespace(testChar); jtulach@1348: if (ch != testChar) { jtulach@1348: throw error(s); jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Mark the end of pattern with a specific character. jtulach@1348: */ jtulach@1348: private void mark(int c) { jtulach@1348: temp[patternLength] = c; jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Peek the next character, and do not advance the cursor. jtulach@1348: */ jtulach@1348: private int peek() { jtulach@1348: int ch = temp[cursor]; jtulach@1348: if (has(COMMENTS)) jtulach@1348: ch = peekPastWhitespace(ch); jtulach@1348: return ch; jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Read the next character, and advance the cursor by one. jtulach@1348: */ jtulach@1348: private int read() { jtulach@1348: int ch = temp[cursor++]; jtulach@1348: if (has(COMMENTS)) jtulach@1348: ch = parsePastWhitespace(ch); jtulach@1348: return ch; jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Read the next character, and advance the cursor by one, jtulach@1348: * ignoring the COMMENTS setting jtulach@1348: */ jtulach@1348: private int readEscaped() { jtulach@1348: int ch = temp[cursor++]; jtulach@1348: return ch; jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Advance the cursor by one, and peek the next character. jtulach@1348: */ jtulach@1348: private int next() { jtulach@1348: int ch = temp[++cursor]; jtulach@1348: if (has(COMMENTS)) jtulach@1348: ch = peekPastWhitespace(ch); jtulach@1348: return ch; jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Advance the cursor by one, and peek the next character, jtulach@1348: * ignoring the COMMENTS setting jtulach@1348: */ jtulach@1348: private int nextEscaped() { jtulach@1348: int ch = temp[++cursor]; jtulach@1348: return ch; jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * If in xmode peek past whitespace and comments. jtulach@1348: */ jtulach@1348: private int peekPastWhitespace(int ch) { jtulach@1348: while (ASCII.isSpace(ch) || ch == '#') { jtulach@1348: while (ASCII.isSpace(ch)) jtulach@1348: ch = temp[++cursor]; jtulach@1348: if (ch == '#') { jtulach@1348: ch = peekPastLine(); jtulach@1348: } jtulach@1348: } jtulach@1348: return ch; jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * If in xmode parse past whitespace and comments. jtulach@1348: */ jtulach@1348: private int parsePastWhitespace(int ch) { jtulach@1348: while (ASCII.isSpace(ch) || ch == '#') { jtulach@1348: while (ASCII.isSpace(ch)) jtulach@1348: ch = temp[cursor++]; jtulach@1348: if (ch == '#') jtulach@1348: ch = parsePastLine(); jtulach@1348: } jtulach@1348: return ch; jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * xmode parse past comment to end of line. jtulach@1348: */ jtulach@1348: private int parsePastLine() { jtulach@1348: int ch = temp[cursor++]; jtulach@1348: while (ch != 0 && !isLineSeparator(ch)) jtulach@1348: ch = temp[cursor++]; jtulach@1348: return ch; jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * xmode peek past comment to end of line. jtulach@1348: */ jtulach@1348: private int peekPastLine() { jtulach@1348: int ch = temp[++cursor]; jtulach@1348: while (ch != 0 && !isLineSeparator(ch)) jtulach@1348: ch = temp[++cursor]; jtulach@1348: return ch; jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Determines if character is a line separator in the current mode jtulach@1348: */ jtulach@1348: private boolean isLineSeparator(int ch) { jtulach@1348: if (has(UNIX_LINES)) { jtulach@1348: return ch == '\n'; jtulach@1348: } else { jtulach@1348: return (ch == '\n' || jtulach@1348: ch == '\r' || jtulach@1348: (ch|1) == '\u2029' || jtulach@1348: ch == '\u0085'); jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Read the character after the next one, and advance the cursor by two. jtulach@1348: */ jtulach@1348: private int skip() { jtulach@1348: int i = cursor; jtulach@1348: int ch = temp[i+1]; jtulach@1348: cursor = i + 2; jtulach@1348: return ch; jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Unread one next character, and retreat cursor by one. jtulach@1348: */ jtulach@1348: private void unread() { jtulach@1348: cursor--; jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Internal method used for handling all syntax errors. The pattern is jtulach@1348: * displayed with a pointer to aid in locating the syntax error. jtulach@1348: */ jtulach@1348: private PatternSyntaxException error(String s) { jtulach@1348: return new PatternSyntaxException(s, normalizedPattern, cursor - 1); jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Determines if there is any supplementary character or unpaired jtulach@1348: * surrogate in the specified range. jtulach@1348: */ jtulach@1348: private boolean findSupplementary(int start, int end) { jtulach@1348: for (int i = start; i < end; i++) { jtulach@1348: if (isSupplementary(temp[i])) jtulach@1348: return true; jtulach@1348: } jtulach@1348: return false; jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Determines if the specified code point is a supplementary jtulach@1348: * character or unpaired surrogate. jtulach@1348: */ jtulach@1348: private static final boolean isSupplementary(int ch) { jtulach@1348: return ch >= Character.MIN_SUPPLEMENTARY_CODE_POINT || jtulach@1348: Character.isSurrogate((char)ch); jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * The following methods handle the main parsing. They are sorted jtulach@1348: * according to their precedence order, the lowest one first. jtulach@1348: */ jtulach@1348: jtulach@1348: /** jtulach@1348: * The expression is parsed with branch nodes added for alternations. jtulach@1348: * This may be called recursively to parse sub expressions that may jtulach@1348: * contain alternations. jtulach@1348: */ jtulach@1348: private Node expr(Node end) { jtulach@1348: Node prev = null; jtulach@1348: Node firstTail = null; jtulach@1348: Node branchConn = null; jtulach@1348: jtulach@1348: for (;;) { jtulach@1348: Node node = sequence(end); jtulach@1348: Node nodeTail = root; //double return jtulach@1348: if (prev == null) { jtulach@1348: prev = node; jtulach@1348: firstTail = nodeTail; jtulach@1348: } else { jtulach@1348: // Branch jtulach@1348: if (branchConn == null) { jtulach@1348: branchConn = new BranchConn(); jtulach@1348: branchConn.next = end; jtulach@1348: } jtulach@1348: if (node == end) { jtulach@1348: // if the node returned from sequence() is "end" jtulach@1348: // we have an empty expr, set a null atom into jtulach@1348: // the branch to indicate to go "next" directly. jtulach@1348: node = null; jtulach@1348: } else { jtulach@1348: // the "tail.next" of each atom goes to branchConn jtulach@1348: nodeTail.next = branchConn; jtulach@1348: } jtulach@1348: if (prev instanceof Branch) { jtulach@1348: ((Branch)prev).add(node); jtulach@1348: } else { jtulach@1348: if (prev == end) { jtulach@1348: prev = null; jtulach@1348: } else { jtulach@1348: // replace the "end" with "branchConn" at its tail.next jtulach@1348: // when put the "prev" into the branch as the first atom. jtulach@1348: firstTail.next = branchConn; jtulach@1348: } jtulach@1348: prev = new Branch(prev, node, branchConn); jtulach@1348: } jtulach@1348: } jtulach@1348: if (peek() != '|') { jtulach@1348: return prev; jtulach@1348: } jtulach@1348: next(); jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Parsing of sequences between alternations. jtulach@1348: */ jtulach@1348: private Node sequence(Node end) { jtulach@1348: Node head = null; jtulach@1348: Node tail = null; jtulach@1348: Node node = null; jtulach@1348: LOOP: jtulach@1348: for (;;) { jtulach@1348: int ch = peek(); jtulach@1348: switch (ch) { jtulach@1348: case '(': jtulach@1348: // Because group handles its own closure, jtulach@1348: // we need to treat it differently jtulach@1348: node = group0(); jtulach@1348: // Check for comment or flag group jtulach@1348: if (node == null) jtulach@1348: continue; jtulach@1348: if (head == null) jtulach@1348: head = node; jtulach@1348: else jtulach@1348: tail.next = node; jtulach@1348: // Double return: Tail was returned in root jtulach@1348: tail = root; jtulach@1348: continue; jtulach@1348: case '[': jtulach@1348: node = clazz(true); jtulach@1348: break; jtulach@1348: case '\\': jtulach@1348: ch = nextEscaped(); jtulach@1348: if (ch == 'p' || ch == 'P') { jtulach@1348: boolean oneLetter = true; jtulach@1348: boolean comp = (ch == 'P'); jtulach@1348: ch = next(); // Consume { if present jtulach@1348: if (ch != '{') { jtulach@1348: unread(); jtulach@1348: } else { jtulach@1348: oneLetter = false; jtulach@1348: } jtulach@1348: node = family(oneLetter, comp); jtulach@1348: } else { jtulach@1348: unread(); jtulach@1348: node = atom(); jtulach@1348: } jtulach@1348: break; jtulach@1348: case '^': jtulach@1348: next(); jtulach@1348: if (has(MULTILINE)) { jtulach@1348: if (has(UNIX_LINES)) jtulach@1348: node = new UnixCaret(); jtulach@1348: else jtulach@1348: node = new Caret(); jtulach@1348: } else { jtulach@1348: node = new Begin(); jtulach@1348: } jtulach@1348: break; jtulach@1348: case '$': jtulach@1348: next(); jtulach@1348: if (has(UNIX_LINES)) jtulach@1348: node = new UnixDollar(has(MULTILINE)); jtulach@1348: else jtulach@1348: node = new Dollar(has(MULTILINE)); jtulach@1348: break; jtulach@1348: case '.': jtulach@1348: next(); jtulach@1348: if (has(DOTALL)) { jtulach@1348: node = new All(); jtulach@1348: } else { jtulach@1348: if (has(UNIX_LINES)) jtulach@1348: node = new UnixDot(); jtulach@1348: else { jtulach@1348: node = new Dot(); jtulach@1348: } jtulach@1348: } jtulach@1348: break; jtulach@1348: case '|': jtulach@1348: case ')': jtulach@1348: break LOOP; jtulach@1348: case ']': // Now interpreting dangling ] and } as literals jtulach@1348: case '}': jtulach@1348: node = atom(); jtulach@1348: break; jtulach@1348: case '?': jtulach@1348: case '*': jtulach@1348: case '+': jtulach@1348: next(); jtulach@1348: throw error("Dangling meta character '" + ((char)ch) + "'"); jtulach@1348: case 0: jtulach@1348: if (cursor >= patternLength) { jtulach@1348: break LOOP; jtulach@1348: } jtulach@1348: // Fall through jtulach@1348: default: jtulach@1348: node = atom(); jtulach@1348: break; jtulach@1348: } jtulach@1348: jtulach@1348: node = closure(node); jtulach@1348: jtulach@1348: if (head == null) { jtulach@1348: head = tail = node; jtulach@1348: } else { jtulach@1348: tail.next = node; jtulach@1348: tail = node; jtulach@1348: } jtulach@1348: } jtulach@1348: if (head == null) { jtulach@1348: return end; jtulach@1348: } jtulach@1348: tail.next = end; jtulach@1348: root = tail; //double return jtulach@1348: return head; jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Parse and add a new Single or Slice. jtulach@1348: */ jtulach@1348: private Node atom() { jtulach@1348: int first = 0; jtulach@1348: int prev = -1; jtulach@1348: boolean hasSupplementary = false; jtulach@1348: int ch = peek(); jtulach@1348: for (;;) { jtulach@1348: switch (ch) { jtulach@1348: case '*': jtulach@1348: case '+': jtulach@1348: case '?': jtulach@1348: case '{': jtulach@1348: if (first > 1) { jtulach@1348: cursor = prev; // Unwind one character jtulach@1348: first--; jtulach@1348: } jtulach@1348: break; jtulach@1348: case '$': jtulach@1348: case '.': jtulach@1348: case '^': jtulach@1348: case '(': jtulach@1348: case '[': jtulach@1348: case '|': jtulach@1348: case ')': jtulach@1348: break; jtulach@1348: case '\\': jtulach@1348: ch = nextEscaped(); jtulach@1348: if (ch == 'p' || ch == 'P') { // Property jtulach@1348: if (first > 0) { // Slice is waiting; handle it first jtulach@1348: unread(); jtulach@1348: break; jtulach@1348: } else { // No slice; just return the family node jtulach@1348: boolean comp = (ch == 'P'); jtulach@1348: boolean oneLetter = true; jtulach@1348: ch = next(); // Consume { if present jtulach@1348: if (ch != '{') jtulach@1348: unread(); jtulach@1348: else jtulach@1348: oneLetter = false; jtulach@1348: return family(oneLetter, comp); jtulach@1348: } jtulach@1348: } jtulach@1348: unread(); jtulach@1348: prev = cursor; jtulach@1348: ch = escape(false, first == 0); jtulach@1348: if (ch >= 0) { jtulach@1348: append(ch, first); jtulach@1348: first++; jtulach@1348: if (isSupplementary(ch)) { jtulach@1348: hasSupplementary = true; jtulach@1348: } jtulach@1348: ch = peek(); jtulach@1348: continue; jtulach@1348: } else if (first == 0) { jtulach@1348: return root; jtulach@1348: } jtulach@1348: // Unwind meta escape sequence jtulach@1348: cursor = prev; jtulach@1348: break; jtulach@1348: case 0: jtulach@1348: if (cursor >= patternLength) { jtulach@1348: break; jtulach@1348: } jtulach@1348: // Fall through jtulach@1348: default: jtulach@1348: prev = cursor; jtulach@1348: append(ch, first); jtulach@1348: first++; jtulach@1348: if (isSupplementary(ch)) { jtulach@1348: hasSupplementary = true; jtulach@1348: } jtulach@1348: ch = next(); jtulach@1348: continue; jtulach@1348: } jtulach@1348: break; jtulach@1348: } jtulach@1348: if (first == 1) { jtulach@1348: return newSingle(buffer[0]); jtulach@1348: } else { jtulach@1348: return newSlice(buffer, first, hasSupplementary); jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: private void append(int ch, int len) { jtulach@1348: if (len >= buffer.length) { jtulach@1348: int[] tmp = new int[len+len]; jtulach@1348: System.arraycopy(buffer, 0, tmp, 0, len); jtulach@1348: buffer = tmp; jtulach@1348: } jtulach@1348: buffer[len] = ch; jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Parses a backref greedily, taking as many numbers as it jtulach@1348: * can. The first digit is always treated as a backref, but jtulach@1348: * multi digit numbers are only treated as a backref if at jtulach@1348: * least that many backrefs exist at this point in the regex. jtulach@1348: */ jtulach@1348: private Node ref(int refNum) { jtulach@1348: boolean done = false; jtulach@1348: while(!done) { jtulach@1348: int ch = peek(); jtulach@1348: switch(ch) { jtulach@1348: case '0': jtulach@1348: case '1': jtulach@1348: case '2': jtulach@1348: case '3': jtulach@1348: case '4': jtulach@1348: case '5': jtulach@1348: case '6': jtulach@1348: case '7': jtulach@1348: case '8': jtulach@1348: case '9': jtulach@1348: int newRefNum = (refNum * 10) + (ch - '0'); jtulach@1348: // Add another number if it doesn't make a group jtulach@1348: // that doesn't exist jtulach@1348: if (capturingGroupCount - 1 < newRefNum) { jtulach@1348: done = true; jtulach@1348: break; jtulach@1348: } jtulach@1348: refNum = newRefNum; jtulach@1348: read(); jtulach@1348: break; jtulach@1348: default: jtulach@1348: done = true; jtulach@1348: break; jtulach@1348: } jtulach@1348: } jtulach@1348: if (has(CASE_INSENSITIVE)) jtulach@1348: return new CIBackRef(refNum, has(UNICODE_CASE)); jtulach@1348: else jtulach@1348: return new BackRef(refNum); jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Parses an escape sequence to determine the actual value that needs jtulach@1348: * to be matched. jtulach@1348: * If -1 is returned and create was true a new object was added to the tree jtulach@1348: * to handle the escape sequence. jtulach@1348: * If the returned value is greater than zero, it is the value that jtulach@1348: * matches the escape sequence. jtulach@1348: */ jtulach@1348: private int escape(boolean inclass, boolean create) { jtulach@1348: int ch = skip(); jtulach@1348: switch (ch) { jtulach@1348: case '0': jtulach@1348: return o(); jtulach@1348: case '1': jtulach@1348: case '2': jtulach@1348: case '3': jtulach@1348: case '4': jtulach@1348: case '5': jtulach@1348: case '6': jtulach@1348: case '7': jtulach@1348: case '8': jtulach@1348: case '9': jtulach@1348: if (inclass) break; jtulach@1348: if (create) { jtulach@1348: root = ref((ch - '0')); jtulach@1348: } jtulach@1348: return -1; jtulach@1348: case 'A': jtulach@1348: if (inclass) break; jtulach@1348: if (create) root = new Begin(); jtulach@1348: return -1; jtulach@1348: case 'B': jtulach@1348: if (inclass) break; jtulach@1348: if (create) root = new Bound(Bound.NONE, has(UNICODE_CHARACTER_CLASS)); jtulach@1348: return -1; jtulach@1348: case 'C': jtulach@1348: break; jtulach@1348: case 'D': jtulach@1348: if (create) root = has(UNICODE_CHARACTER_CLASS) jtulach@1348: ? new Utype(UnicodeProp.DIGIT).complement() jtulach@1348: : new Ctype(ASCII.DIGIT).complement(); jtulach@1348: return -1; jtulach@1348: case 'E': jtulach@1348: case 'F': jtulach@1348: break; jtulach@1348: case 'G': jtulach@1348: if (inclass) break; jtulach@1348: if (create) root = new LastMatch(); jtulach@1348: return -1; jtulach@1348: case 'H': jtulach@1348: case 'I': jtulach@1348: case 'J': jtulach@1348: case 'K': jtulach@1348: case 'L': jtulach@1348: case 'M': jtulach@1348: case 'N': jtulach@1348: case 'O': jtulach@1348: case 'P': jtulach@1348: case 'Q': jtulach@1348: case 'R': jtulach@1348: break; jtulach@1348: case 'S': jtulach@1348: if (create) root = has(UNICODE_CHARACTER_CLASS) jtulach@1348: ? new Utype(UnicodeProp.WHITE_SPACE).complement() jtulach@1348: : new Ctype(ASCII.SPACE).complement(); jtulach@1348: return -1; jtulach@1348: case 'T': jtulach@1348: case 'U': jtulach@1348: case 'V': jtulach@1348: break; jtulach@1348: case 'W': jtulach@1348: if (create) root = has(UNICODE_CHARACTER_CLASS) jtulach@1348: ? new Utype(UnicodeProp.WORD).complement() jtulach@1348: : new Ctype(ASCII.WORD).complement(); jtulach@1348: return -1; jtulach@1348: case 'X': jtulach@1348: case 'Y': jtulach@1348: break; jtulach@1348: case 'Z': jtulach@1348: if (inclass) break; jtulach@1348: if (create) { jtulach@1348: if (has(UNIX_LINES)) jtulach@1348: root = new UnixDollar(false); jtulach@1348: else jtulach@1348: root = new Dollar(false); jtulach@1348: } jtulach@1348: return -1; jtulach@1348: case 'a': jtulach@1348: return '\007'; jtulach@1348: case 'b': jtulach@1348: if (inclass) break; jtulach@1348: if (create) root = new Bound(Bound.BOTH, has(UNICODE_CHARACTER_CLASS)); jtulach@1348: return -1; jtulach@1348: case 'c': jtulach@1348: return c(); jtulach@1348: case 'd': jtulach@1348: if (create) root = has(UNICODE_CHARACTER_CLASS) jtulach@1348: ? new Utype(UnicodeProp.DIGIT) jtulach@1348: : new Ctype(ASCII.DIGIT); jtulach@1348: return -1; jtulach@1348: case 'e': jtulach@1348: return '\033'; jtulach@1348: case 'f': jtulach@1348: return '\f'; jtulach@1348: case 'g': jtulach@1348: case 'h': jtulach@1348: case 'i': jtulach@1348: case 'j': jtulach@1348: break; jtulach@1348: case 'k': jtulach@1348: if (inclass) jtulach@1348: break; jtulach@1348: if (read() != '<') jtulach@1348: throw error("\\k is not followed by '<' for named capturing group"); jtulach@1348: String name = groupname(read()); jtulach@1348: if (!namedGroups().containsKey(name)) jtulach@1348: throw error("(named capturing group <"+ name+"> does not exit"); jtulach@1348: if (create) { jtulach@1348: if (has(CASE_INSENSITIVE)) jtulach@1348: root = new CIBackRef(namedGroups().get(name), has(UNICODE_CASE)); jtulach@1348: else jtulach@1348: root = new BackRef(namedGroups().get(name)); jtulach@1348: } jtulach@1348: return -1; jtulach@1348: case 'l': jtulach@1348: case 'm': jtulach@1348: break; jtulach@1348: case 'n': jtulach@1348: return '\n'; jtulach@1348: case 'o': jtulach@1348: case 'p': jtulach@1348: case 'q': jtulach@1348: break; jtulach@1348: case 'r': jtulach@1348: return '\r'; jtulach@1348: case 's': jtulach@1348: if (create) root = has(UNICODE_CHARACTER_CLASS) jtulach@1348: ? new Utype(UnicodeProp.WHITE_SPACE) jtulach@1348: : new Ctype(ASCII.SPACE); jtulach@1348: return -1; jtulach@1348: case 't': jtulach@1348: return '\t'; jtulach@1348: case 'u': jtulach@1348: return u(); jtulach@1348: case 'v': jtulach@1348: return '\013'; jtulach@1348: case 'w': jtulach@1348: if (create) root = has(UNICODE_CHARACTER_CLASS) jtulach@1348: ? new Utype(UnicodeProp.WORD) jtulach@1348: : new Ctype(ASCII.WORD); jtulach@1348: return -1; jtulach@1348: case 'x': jtulach@1348: return x(); jtulach@1348: case 'y': jtulach@1348: break; jtulach@1348: case 'z': jtulach@1348: if (inclass) break; jtulach@1348: if (create) root = new End(); jtulach@1348: return -1; jtulach@1348: default: jtulach@1348: return ch; jtulach@1348: } jtulach@1348: throw error("Illegal/unsupported escape sequence"); jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Parse a character class, and return the node that matches it. jtulach@1348: * jtulach@1348: * Consumes a ] on the way out if consume is true. Usually consume jtulach@1348: * is true except for the case of [abc&&def] where def is a separate jtulach@1348: * right hand node with "understood" brackets. jtulach@1348: */ jtulach@1348: private CharProperty clazz(boolean consume) { jtulach@1348: CharProperty prev = null; jtulach@1348: CharProperty node = null; jtulach@1348: BitClass bits = new BitClass(); jtulach@1348: boolean include = true; jtulach@1348: boolean firstInClass = true; jtulach@1348: int ch = next(); jtulach@1348: for (;;) { jtulach@1348: switch (ch) { jtulach@1348: case '^': jtulach@1348: // Negates if first char in a class, otherwise literal jtulach@1348: if (firstInClass) { jtulach@1348: if (temp[cursor-1] != '[') jtulach@1348: break; jtulach@1348: ch = next(); jtulach@1348: include = !include; jtulach@1348: continue; jtulach@1348: } else { jtulach@1348: // ^ not first in class, treat as literal jtulach@1348: break; jtulach@1348: } jtulach@1348: case '[': jtulach@1348: firstInClass = false; jtulach@1348: node = clazz(true); jtulach@1348: if (prev == null) jtulach@1348: prev = node; jtulach@1348: else jtulach@1348: prev = union(prev, node); jtulach@1348: ch = peek(); jtulach@1348: continue; jtulach@1348: case '&': jtulach@1348: firstInClass = false; jtulach@1348: ch = next(); jtulach@1348: if (ch == '&') { jtulach@1348: ch = next(); jtulach@1348: CharProperty rightNode = null; jtulach@1348: while (ch != ']' && ch != '&') { jtulach@1348: if (ch == '[') { jtulach@1348: if (rightNode == null) jtulach@1348: rightNode = clazz(true); jtulach@1348: else jtulach@1348: rightNode = union(rightNode, clazz(true)); jtulach@1348: } else { // abc&&def jtulach@1348: unread(); jtulach@1348: rightNode = clazz(false); jtulach@1348: } jtulach@1348: ch = peek(); jtulach@1348: } jtulach@1348: if (rightNode != null) jtulach@1348: node = rightNode; jtulach@1348: if (prev == null) { jtulach@1348: if (rightNode == null) jtulach@1348: throw error("Bad class syntax"); jtulach@1348: else jtulach@1348: prev = rightNode; jtulach@1348: } else { jtulach@1348: prev = intersection(prev, node); jtulach@1348: } jtulach@1348: } else { jtulach@1348: // treat as a literal & jtulach@1348: unread(); jtulach@1348: break; jtulach@1348: } jtulach@1348: continue; jtulach@1348: case 0: jtulach@1348: firstInClass = false; jtulach@1348: if (cursor >= patternLength) jtulach@1348: throw error("Unclosed character class"); jtulach@1348: break; jtulach@1348: case ']': jtulach@1348: firstInClass = false; jtulach@1348: if (prev != null) { jtulach@1348: if (consume) jtulach@1348: next(); jtulach@1348: return prev; jtulach@1348: } jtulach@1348: break; jtulach@1348: default: jtulach@1348: firstInClass = false; jtulach@1348: break; jtulach@1348: } jtulach@1348: node = range(bits); jtulach@1348: if (include) { jtulach@1348: if (prev == null) { jtulach@1348: prev = node; jtulach@1348: } else { jtulach@1348: if (prev != node) jtulach@1348: prev = union(prev, node); jtulach@1348: } jtulach@1348: } else { jtulach@1348: if (prev == null) { jtulach@1348: prev = node.complement(); jtulach@1348: } else { jtulach@1348: if (prev != node) jtulach@1348: prev = setDifference(prev, node); jtulach@1348: } jtulach@1348: } jtulach@1348: ch = peek(); jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: private CharProperty bitsOrSingle(BitClass bits, int ch) { jtulach@1348: /* Bits can only handle codepoints in [u+0000-u+00ff] range. jtulach@1348: Use "single" node instead of bits when dealing with unicode jtulach@1348: case folding for codepoints listed below. jtulach@1348: (1)Uppercase out of range: u+00ff, u+00b5 jtulach@1348: toUpperCase(u+00ff) -> u+0178 jtulach@1348: toUpperCase(u+00b5) -> u+039c jtulach@1348: (2)LatinSmallLetterLongS u+17f jtulach@1348: toUpperCase(u+017f) -> u+0053 jtulach@1348: (3)LatinSmallLetterDotlessI u+131 jtulach@1348: toUpperCase(u+0131) -> u+0049 jtulach@1348: (4)LatinCapitalLetterIWithDotAbove u+0130 jtulach@1348: toLowerCase(u+0130) -> u+0069 jtulach@1348: (5)KelvinSign u+212a jtulach@1348: toLowerCase(u+212a) ==> u+006B jtulach@1348: (6)AngstromSign u+212b jtulach@1348: toLowerCase(u+212b) ==> u+00e5 jtulach@1348: */ jtulach@1348: int d; jtulach@1348: if (ch < 256 && jtulach@1348: !(has(CASE_INSENSITIVE) && has(UNICODE_CASE) && jtulach@1348: (ch == 0xff || ch == 0xb5 || jtulach@1348: ch == 0x49 || ch == 0x69 || //I and i jtulach@1348: ch == 0x53 || ch == 0x73 || //S and s jtulach@1348: ch == 0x4b || ch == 0x6b || //K and k jtulach@1348: ch == 0xc5 || ch == 0xe5))) //A+ring jtulach@1348: return bits.add(ch, flags()); jtulach@1348: return newSingle(ch); jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Parse a single character or a character range in a character class jtulach@1348: * and return its representative node. jtulach@1348: */ jtulach@1348: private CharProperty range(BitClass bits) { jtulach@1348: int ch = peek(); jtulach@1348: if (ch == '\\') { jtulach@1348: ch = nextEscaped(); jtulach@1348: if (ch == 'p' || ch == 'P') { // A property jtulach@1348: boolean comp = (ch == 'P'); jtulach@1348: boolean oneLetter = true; jtulach@1348: // Consume { if present jtulach@1348: ch = next(); jtulach@1348: if (ch != '{') jtulach@1348: unread(); jtulach@1348: else jtulach@1348: oneLetter = false; jtulach@1348: return family(oneLetter, comp); jtulach@1348: } else { // ordinary escape jtulach@1348: unread(); jtulach@1348: ch = escape(true, true); jtulach@1348: if (ch == -1) jtulach@1348: return (CharProperty) root; jtulach@1348: } jtulach@1348: } else { jtulach@1348: ch = single(); jtulach@1348: } jtulach@1348: if (ch >= 0) { jtulach@1348: if (peek() == '-') { jtulach@1348: int endRange = temp[cursor+1]; jtulach@1348: if (endRange == '[') { jtulach@1348: return bitsOrSingle(bits, ch); jtulach@1348: } jtulach@1348: if (endRange != ']') { jtulach@1348: next(); jtulach@1348: int m = single(); jtulach@1348: if (m < ch) jtulach@1348: throw error("Illegal character range"); jtulach@1348: if (has(CASE_INSENSITIVE)) jtulach@1348: return caseInsensitiveRangeFor(ch, m); jtulach@1348: else jtulach@1348: return rangeFor(ch, m); jtulach@1348: } jtulach@1348: } jtulach@1348: return bitsOrSingle(bits, ch); jtulach@1348: } jtulach@1348: throw error("Unexpected character '"+((char)ch)+"'"); jtulach@1348: } jtulach@1348: jtulach@1348: private int single() { jtulach@1348: int ch = peek(); jtulach@1348: switch (ch) { jtulach@1348: case '\\': jtulach@1348: return escape(true, false); jtulach@1348: default: jtulach@1348: next(); jtulach@1348: return ch; jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Parses a Unicode character family and returns its representative node. jtulach@1348: */ jtulach@1348: private CharProperty family(boolean singleLetter, jtulach@1348: boolean maybeComplement) jtulach@1348: { jtulach@1348: next(); jtulach@1348: String name; jtulach@1348: CharProperty node = null; jtulach@1348: jtulach@1348: if (singleLetter) { jtulach@1348: int c = temp[cursor]; jtulach@1348: if (!Character.isSupplementaryCodePoint(c)) { jtulach@1348: name = String.valueOf((char)c); jtulach@1348: } else { jtulach@1348: name = new String(temp, cursor, 1); jtulach@1348: } jtulach@1348: read(); jtulach@1348: } else { jtulach@1348: int i = cursor; jtulach@1348: mark('}'); jtulach@1348: while(read() != '}') { jtulach@1348: } jtulach@1348: mark('\000'); jtulach@1348: int j = cursor; jtulach@1348: if (j > patternLength) jtulach@1348: throw error("Unclosed character family"); jtulach@1348: if (i + 1 >= j) jtulach@1348: throw error("Empty character family"); jtulach@1348: name = new String(temp, i, j-i-1); jtulach@1348: } jtulach@1348: jtulach@1348: int i = name.indexOf('='); jtulach@1348: if (i != -1) { jtulach@1348: // property construct \p{name=value} jtulach@1348: String value = name.substring(i + 1); jtulach@1348: name = name.substring(0, i).toLowerCase(Locale.ENGLISH); jtulach@1350: /* if ("sc".equals(name) || "script".equals(name)) { jtulach@1348: node = unicodeScriptPropertyFor(value); jtulach@1348: } else if ("blk".equals(name) || "block".equals(name)) { jtulach@1348: node = unicodeBlockPropertyFor(value); jtulach@1350: } else*/ if ("gc".equals(name) || "general_category".equals(name)) { jtulach@1348: node = charPropertyNodeFor(value); jtulach@1348: } else { jtulach@1348: throw error("Unknown Unicode property {name=<" + name + ">, " jtulach@1348: + "value=<" + value + ">}"); jtulach@1348: } jtulach@1348: } else { jtulach@1350: /*if (name.startsWith("In")) { jtulach@1348: // \p{inBlockName} jtulach@1348: node = unicodeBlockPropertyFor(name.substring(2)); jtulach@1348: } else if (name.startsWith("Is")) { jtulach@1348: // \p{isGeneralCategory} and \p{isScriptName} jtulach@1348: name = name.substring(2); jtulach@1348: UnicodeProp uprop = UnicodeProp.forName(name); jtulach@1348: if (uprop != null) jtulach@1348: node = new Utype(uprop); jtulach@1348: if (node == null) jtulach@1348: node = CharPropertyNames.charPropertyFor(name); jtulach@1348: if (node == null) jtulach@1348: node = unicodeScriptPropertyFor(name); jtulach@1350: } else*/ { jtulach@1348: if (has(UNICODE_CHARACTER_CLASS)) { jtulach@1348: UnicodeProp uprop = UnicodeProp.forPOSIXName(name); jtulach@1348: if (uprop != null) jtulach@1348: node = new Utype(uprop); jtulach@1348: } jtulach@1348: if (node == null) jtulach@1348: node = charPropertyNodeFor(name); jtulach@1348: } jtulach@1348: } jtulach@1348: if (maybeComplement) { jtulach@1350: if (node instanceof Category /*|| node instanceof Block*/) jtulach@1348: hasSupplementary = true; jtulach@1348: node = node.complement(); jtulach@1348: } jtulach@1348: return node; jtulach@1348: } jtulach@1348: jtulach@1348: jtulach@1348: /** jtulach@1348: * Returns a CharProperty matching all characters belong to jtulach@1348: * a UnicodeScript. jtulach@1350: * jtulach@1348: private CharProperty unicodeScriptPropertyFor(String name) { jtulach@1348: final Character.UnicodeScript script; jtulach@1348: try { jtulach@1348: script = Character.UnicodeScript.forName(name); jtulach@1348: } catch (IllegalArgumentException iae) { jtulach@1348: throw error("Unknown character script name {" + name + "}"); jtulach@1348: } jtulach@1348: return new Script(script); jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Returns a CharProperty matching all characters in a UnicodeBlock. jtulach@1350: * jtulach@1348: private CharProperty unicodeBlockPropertyFor(String name) { jtulach@1348: final Character.UnicodeBlock block; jtulach@1348: try { jtulach@1348: block = Character.UnicodeBlock.forName(name); jtulach@1348: } catch (IllegalArgumentException iae) { jtulach@1348: throw error("Unknown character block name {" + name + "}"); jtulach@1348: } jtulach@1348: return new Block(block); jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Returns a CharProperty matching all characters in a named property. jtulach@1348: */ jtulach@1348: private CharProperty charPropertyNodeFor(String name) { jtulach@1348: CharProperty p = CharPropertyNames.charPropertyFor(name); jtulach@1348: if (p == null) jtulach@1348: throw error("Unknown character property name {" + name + "}"); jtulach@1348: return p; jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Parses and returns the name of a "named capturing group", the trailing jtulach@1348: * ">" is consumed after parsing. jtulach@1348: */ jtulach@1348: private String groupname(int ch) { jtulach@1348: StringBuilder sb = new StringBuilder(); jtulach@1348: sb.append(Character.toChars(ch)); jtulach@1348: while (ASCII.isLower(ch=read()) || ASCII.isUpper(ch) || jtulach@1348: ASCII.isDigit(ch)) { jtulach@1348: sb.append(Character.toChars(ch)); jtulach@1348: } jtulach@1348: if (sb.length() == 0) jtulach@1348: throw error("named capturing group has 0 length name"); jtulach@1348: if (ch != '>') jtulach@1348: throw error("named capturing group is missing trailing '>'"); jtulach@1348: return sb.toString(); jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Parses a group and returns the head node of a set of nodes that process jtulach@1348: * the group. Sometimes a double return system is used where the tail is jtulach@1348: * returned in root. jtulach@1348: */ jtulach@1348: private Node group0() { jtulach@1348: boolean capturingGroup = false; jtulach@1348: Node head = null; jtulach@1348: Node tail = null; jtulach@1348: int save = flags; jtulach@1348: root = null; jtulach@1348: int ch = next(); jtulach@1348: if (ch == '?') { jtulach@1348: ch = skip(); jtulach@1348: switch (ch) { jtulach@1348: case ':': // (?:xxx) pure group jtulach@1348: head = createGroup(true); jtulach@1348: tail = root; jtulach@1348: head.next = expr(tail); jtulach@1348: break; jtulach@1348: case '=': // (?=xxx) and (?!xxx) lookahead jtulach@1348: case '!': jtulach@1348: head = createGroup(true); jtulach@1348: tail = root; jtulach@1348: head.next = expr(tail); jtulach@1348: if (ch == '=') { jtulach@1348: head = tail = new Pos(head); jtulach@1348: } else { jtulach@1348: head = tail = new Neg(head); jtulach@1348: } jtulach@1348: break; jtulach@1348: case '>': // (?>xxx) independent group jtulach@1348: head = createGroup(true); jtulach@1348: tail = root; jtulach@1348: head.next = expr(tail); jtulach@1348: head = tail = new Ques(head, INDEPENDENT); jtulach@1348: break; jtulach@1348: case '<': // (? is already defined"); jtulach@1348: capturingGroup = true; jtulach@1348: head = createGroup(false); jtulach@1348: tail = root; jtulach@1348: namedGroups().put(name, capturingGroupCount-1); jtulach@1348: head.next = expr(tail); jtulach@1348: break; jtulach@1348: } jtulach@1348: int start = cursor; jtulach@1348: head = createGroup(true); jtulach@1348: tail = root; jtulach@1348: head.next = expr(tail); jtulach@1348: tail.next = lookbehindEnd; jtulach@1348: TreeInfo info = new TreeInfo(); jtulach@1348: head.study(info); jtulach@1348: if (info.maxValid == false) { jtulach@1348: throw error("Look-behind group does not have " jtulach@1348: + "an obvious maximum length"); jtulach@1348: } jtulach@1348: boolean hasSupplementary = findSupplementary(start, patternLength); jtulach@1348: if (ch == '=') { jtulach@1348: head = tail = (hasSupplementary ? jtulach@1348: new BehindS(head, info.maxLength, jtulach@1348: info.minLength) : jtulach@1348: new Behind(head, info.maxLength, jtulach@1348: info.minLength)); jtulach@1348: } else if (ch == '!') { jtulach@1348: head = tail = (hasSupplementary ? jtulach@1348: new NotBehindS(head, info.maxLength, jtulach@1348: info.minLength) : jtulach@1348: new NotBehind(head, info.maxLength, jtulach@1348: info.minLength)); jtulach@1348: } else { jtulach@1348: throw error("Unknown look-behind group"); jtulach@1348: } jtulach@1348: break; jtulach@1348: case '$': jtulach@1348: case '@': jtulach@1348: throw error("Unknown group type"); jtulach@1348: default: // (?xxx:) inlined match flags jtulach@1348: unread(); jtulach@1348: addFlag(); jtulach@1348: ch = read(); jtulach@1348: if (ch == ')') { jtulach@1348: return null; // Inline modifier only jtulach@1348: } jtulach@1348: if (ch != ':') { jtulach@1348: throw error("Unknown inline modifier"); jtulach@1348: } jtulach@1348: head = createGroup(true); jtulach@1348: tail = root; jtulach@1348: head.next = expr(tail); jtulach@1348: break; jtulach@1348: } jtulach@1348: } else { // (xxx) a regular group jtulach@1348: capturingGroup = true; jtulach@1348: head = createGroup(false); jtulach@1348: tail = root; jtulach@1348: head.next = expr(tail); jtulach@1348: } jtulach@1348: jtulach@1348: accept(')', "Unclosed group"); jtulach@1348: flags = save; jtulach@1348: jtulach@1348: // Check for quantifiers jtulach@1348: Node node = closure(head); jtulach@1348: if (node == head) { // No closure jtulach@1348: root = tail; jtulach@1348: return node; // Dual return jtulach@1348: } jtulach@1348: if (head == tail) { // Zero length assertion jtulach@1348: root = node; jtulach@1348: return node; // Dual return jtulach@1348: } jtulach@1348: jtulach@1348: if (node instanceof Ques) { jtulach@1348: Ques ques = (Ques) node; jtulach@1348: if (ques.type == POSSESSIVE) { jtulach@1348: root = node; jtulach@1348: return node; jtulach@1348: } jtulach@1348: tail.next = new BranchConn(); jtulach@1348: tail = tail.next; jtulach@1348: if (ques.type == GREEDY) { jtulach@1348: head = new Branch(head, null, tail); jtulach@1348: } else { // Reluctant quantifier jtulach@1348: head = new Branch(null, head, tail); jtulach@1348: } jtulach@1348: root = tail; jtulach@1348: return head; jtulach@1348: } else if (node instanceof Curly) { jtulach@1348: Curly curly = (Curly) node; jtulach@1348: if (curly.type == POSSESSIVE) { jtulach@1348: root = node; jtulach@1348: return node; jtulach@1348: } jtulach@1348: // Discover if the group is deterministic jtulach@1348: TreeInfo info = new TreeInfo(); jtulach@1348: if (head.study(info)) { // Deterministic jtulach@1348: GroupTail temp = (GroupTail) tail; jtulach@1348: head = root = new GroupCurly(head.next, curly.cmin, jtulach@1348: curly.cmax, curly.type, jtulach@1348: ((GroupTail)tail).localIndex, jtulach@1348: ((GroupTail)tail).groupIndex, jtulach@1348: capturingGroup); jtulach@1348: return head; jtulach@1348: } else { // Non-deterministic jtulach@1348: int temp = ((GroupHead) head).localIndex; jtulach@1348: Loop loop; jtulach@1348: if (curly.type == GREEDY) jtulach@1348: loop = new Loop(this.localCount, temp); jtulach@1348: else // Reluctant Curly jtulach@1348: loop = new LazyLoop(this.localCount, temp); jtulach@1348: Prolog prolog = new Prolog(loop); jtulach@1348: this.localCount += 1; jtulach@1348: loop.cmin = curly.cmin; jtulach@1348: loop.cmax = curly.cmax; jtulach@1348: loop.body = head; jtulach@1348: tail.next = loop; jtulach@1348: root = loop; jtulach@1348: return prolog; // Dual return jtulach@1348: } jtulach@1348: } jtulach@1348: throw error("Internal logic error"); jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Create group head and tail nodes using double return. If the group is jtulach@1348: * created with anonymous true then it is a pure group and should not jtulach@1348: * affect group counting. jtulach@1348: */ jtulach@1348: private Node createGroup(boolean anonymous) { jtulach@1348: int localIndex = localCount++; jtulach@1348: int groupIndex = 0; jtulach@1348: if (!anonymous) jtulach@1348: groupIndex = capturingGroupCount++; jtulach@1348: GroupHead head = new GroupHead(localIndex); jtulach@1348: root = new GroupTail(localIndex, groupIndex); jtulach@1348: if (!anonymous && groupIndex < 10) jtulach@1348: groupNodes[groupIndex] = head; jtulach@1348: return head; jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Parses inlined match flags and set them appropriately. jtulach@1348: */ jtulach@1348: private void addFlag() { jtulach@1348: int ch = peek(); jtulach@1348: for (;;) { jtulach@1348: switch (ch) { jtulach@1348: case 'i': jtulach@1348: flags |= CASE_INSENSITIVE; jtulach@1348: break; jtulach@1348: case 'm': jtulach@1348: flags |= MULTILINE; jtulach@1348: break; jtulach@1348: case 's': jtulach@1348: flags |= DOTALL; jtulach@1348: break; jtulach@1348: case 'd': jtulach@1348: flags |= UNIX_LINES; jtulach@1348: break; jtulach@1348: case 'u': jtulach@1348: flags |= UNICODE_CASE; jtulach@1348: break; jtulach@1348: case 'c': jtulach@1348: flags |= CANON_EQ; jtulach@1348: break; jtulach@1348: case 'x': jtulach@1348: flags |= COMMENTS; jtulach@1348: break; jtulach@1348: case 'U': jtulach@1348: flags |= (UNICODE_CHARACTER_CLASS | UNICODE_CASE); jtulach@1348: break; jtulach@1348: case '-': // subFlag then fall through jtulach@1348: ch = next(); jtulach@1348: subFlag(); jtulach@1348: default: jtulach@1348: return; jtulach@1348: } jtulach@1348: ch = next(); jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Parses the second part of inlined match flags and turns off jtulach@1348: * flags appropriately. jtulach@1348: */ jtulach@1348: private void subFlag() { jtulach@1348: int ch = peek(); jtulach@1348: for (;;) { jtulach@1348: switch (ch) { jtulach@1348: case 'i': jtulach@1348: flags &= ~CASE_INSENSITIVE; jtulach@1348: break; jtulach@1348: case 'm': jtulach@1348: flags &= ~MULTILINE; jtulach@1348: break; jtulach@1348: case 's': jtulach@1348: flags &= ~DOTALL; jtulach@1348: break; jtulach@1348: case 'd': jtulach@1348: flags &= ~UNIX_LINES; jtulach@1348: break; jtulach@1348: case 'u': jtulach@1348: flags &= ~UNICODE_CASE; jtulach@1348: break; jtulach@1348: case 'c': jtulach@1348: flags &= ~CANON_EQ; jtulach@1348: break; jtulach@1348: case 'x': jtulach@1348: flags &= ~COMMENTS; jtulach@1348: break; jtulach@1348: case 'U': jtulach@1348: flags &= ~(UNICODE_CHARACTER_CLASS | UNICODE_CASE); jtulach@1348: default: jtulach@1348: return; jtulach@1348: } jtulach@1348: ch = next(); jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: static final int MAX_REPS = 0x7FFFFFFF; jtulach@1348: jtulach@1348: static final int GREEDY = 0; jtulach@1348: jtulach@1348: static final int LAZY = 1; jtulach@1348: jtulach@1348: static final int POSSESSIVE = 2; jtulach@1348: jtulach@1348: static final int INDEPENDENT = 3; jtulach@1348: jtulach@1348: /** jtulach@1348: * Processes repetition. If the next character peeked is a quantifier jtulach@1348: * then new nodes must be appended to handle the repetition. jtulach@1348: * Prev could be a single or a group, so it could be a chain of nodes. jtulach@1348: */ jtulach@1348: private Node closure(Node prev) { jtulach@1348: Node atom; jtulach@1348: int ch = peek(); jtulach@1348: switch (ch) { jtulach@1348: case '?': jtulach@1348: ch = next(); jtulach@1348: if (ch == '?') { jtulach@1348: next(); jtulach@1348: return new Ques(prev, LAZY); jtulach@1348: } else if (ch == '+') { jtulach@1348: next(); jtulach@1348: return new Ques(prev, POSSESSIVE); jtulach@1348: } jtulach@1348: return new Ques(prev, GREEDY); jtulach@1348: case '*': jtulach@1348: ch = next(); jtulach@1348: if (ch == '?') { jtulach@1348: next(); jtulach@1348: return new Curly(prev, 0, MAX_REPS, LAZY); jtulach@1348: } else if (ch == '+') { jtulach@1348: next(); jtulach@1348: return new Curly(prev, 0, MAX_REPS, POSSESSIVE); jtulach@1348: } jtulach@1348: return new Curly(prev, 0, MAX_REPS, GREEDY); jtulach@1348: case '+': jtulach@1348: ch = next(); jtulach@1348: if (ch == '?') { jtulach@1348: next(); jtulach@1348: return new Curly(prev, 1, MAX_REPS, LAZY); jtulach@1348: } else if (ch == '+') { jtulach@1348: next(); jtulach@1348: return new Curly(prev, 1, MAX_REPS, POSSESSIVE); jtulach@1348: } jtulach@1348: return new Curly(prev, 1, MAX_REPS, GREEDY); jtulach@1348: case '{': jtulach@1348: ch = temp[cursor+1]; jtulach@1348: if (ASCII.isDigit(ch)) { jtulach@1348: skip(); jtulach@1348: int cmin = 0; jtulach@1348: do { jtulach@1348: cmin = cmin * 10 + (ch - '0'); jtulach@1348: } while (ASCII.isDigit(ch = read())); jtulach@1348: int cmax = cmin; jtulach@1348: if (ch == ',') { jtulach@1348: ch = read(); jtulach@1348: cmax = MAX_REPS; jtulach@1348: if (ch != '}') { jtulach@1348: cmax = 0; jtulach@1348: while (ASCII.isDigit(ch)) { jtulach@1348: cmax = cmax * 10 + (ch - '0'); jtulach@1348: ch = read(); jtulach@1348: } jtulach@1348: } jtulach@1348: } jtulach@1348: if (ch != '}') jtulach@1348: throw error("Unclosed counted closure"); jtulach@1348: if (((cmin) | (cmax) | (cmax - cmin)) < 0) jtulach@1348: throw error("Illegal repetition range"); jtulach@1348: Curly curly; jtulach@1348: ch = peek(); jtulach@1348: if (ch == '?') { jtulach@1348: next(); jtulach@1348: curly = new Curly(prev, cmin, cmax, LAZY); jtulach@1348: } else if (ch == '+') { jtulach@1348: next(); jtulach@1348: curly = new Curly(prev, cmin, cmax, POSSESSIVE); jtulach@1348: } else { jtulach@1348: curly = new Curly(prev, cmin, cmax, GREEDY); jtulach@1348: } jtulach@1348: return curly; jtulach@1348: } else { jtulach@1348: throw error("Illegal repetition"); jtulach@1348: } jtulach@1348: default: jtulach@1348: return prev; jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Utility method for parsing control escape sequences. jtulach@1348: */ jtulach@1348: private int c() { jtulach@1348: if (cursor < patternLength) { jtulach@1348: return read() ^ 64; jtulach@1348: } jtulach@1348: throw error("Illegal control escape sequence"); jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Utility method for parsing octal escape sequences. jtulach@1348: */ jtulach@1348: private int o() { jtulach@1348: int n = read(); jtulach@1348: if (((n-'0')|('7'-n)) >= 0) { jtulach@1348: int m = read(); jtulach@1348: if (((m-'0')|('7'-m)) >= 0) { jtulach@1348: int o = read(); jtulach@1348: if ((((o-'0')|('7'-o)) >= 0) && (((n-'0')|('3'-n)) >= 0)) { jtulach@1348: return (n - '0') * 64 + (m - '0') * 8 + (o - '0'); jtulach@1348: } jtulach@1348: unread(); jtulach@1348: return (n - '0') * 8 + (m - '0'); jtulach@1348: } jtulach@1348: unread(); jtulach@1348: return (n - '0'); jtulach@1348: } jtulach@1348: throw error("Illegal octal escape sequence"); jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Utility method for parsing hexadecimal escape sequences. jtulach@1348: */ jtulach@1348: private int x() { jtulach@1348: int n = read(); jtulach@1348: if (ASCII.isHexDigit(n)) { jtulach@1348: int m = read(); jtulach@1348: if (ASCII.isHexDigit(m)) { jtulach@1348: return ASCII.toDigit(n) * 16 + ASCII.toDigit(m); jtulach@1348: } jtulach@1348: } else if (n == '{' && ASCII.isHexDigit(peek())) { jtulach@1348: int ch = 0; jtulach@1348: while (ASCII.isHexDigit(n = read())) { jtulach@1348: ch = (ch << 4) + ASCII.toDigit(n); jtulach@1348: if (ch > Character.MAX_CODE_POINT) jtulach@1348: throw error("Hexadecimal codepoint is too big"); jtulach@1348: } jtulach@1348: if (n != '}') jtulach@1348: throw error("Unclosed hexadecimal escape sequence"); jtulach@1348: return ch; jtulach@1348: } jtulach@1348: throw error("Illegal hexadecimal escape sequence"); jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Utility method for parsing unicode escape sequences. jtulach@1348: */ jtulach@1348: private int cursor() { jtulach@1348: return cursor; jtulach@1348: } jtulach@1348: jtulach@1348: private void setcursor(int pos) { jtulach@1348: cursor = pos; jtulach@1348: } jtulach@1348: jtulach@1348: private int uxxxx() { jtulach@1348: int n = 0; jtulach@1348: for (int i = 0; i < 4; i++) { jtulach@1348: int ch = read(); jtulach@1348: if (!ASCII.isHexDigit(ch)) { jtulach@1348: throw error("Illegal Unicode escape sequence"); jtulach@1348: } jtulach@1348: n = n * 16 + ASCII.toDigit(ch); jtulach@1348: } jtulach@1348: return n; jtulach@1348: } jtulach@1348: jtulach@1348: private int u() { jtulach@1348: int n = uxxxx(); jtulach@1348: if (Character.isHighSurrogate((char)n)) { jtulach@1348: int cur = cursor(); jtulach@1348: if (read() == '\\' && read() == 'u') { jtulach@1348: int n2 = uxxxx(); jtulach@1348: if (Character.isLowSurrogate((char)n2)) jtulach@1348: return Character.toCodePoint((char)n, (char)n2); jtulach@1348: } jtulach@1348: setcursor(cur); jtulach@1348: } jtulach@1348: return n; jtulach@1348: } jtulach@1348: jtulach@1348: // jtulach@1348: // Utility methods for code point support jtulach@1348: // jtulach@1348: jtulach@1348: private static final int countChars(CharSequence seq, int index, jtulach@1348: int lengthInCodePoints) { jtulach@1348: // optimization jtulach@1348: if (lengthInCodePoints == 1 && !Character.isHighSurrogate(seq.charAt(index))) { jtulach@1348: assert (index >= 0 && index < seq.length()); jtulach@1348: return 1; jtulach@1348: } jtulach@1348: int length = seq.length(); jtulach@1348: int x = index; jtulach@1348: if (lengthInCodePoints >= 0) { jtulach@1348: assert (index >= 0 && index < length); jtulach@1348: for (int i = 0; x < length && i < lengthInCodePoints; i++) { jtulach@1348: if (Character.isHighSurrogate(seq.charAt(x++))) { jtulach@1348: if (x < length && Character.isLowSurrogate(seq.charAt(x))) { jtulach@1348: x++; jtulach@1348: } jtulach@1348: } jtulach@1348: } jtulach@1348: return x - index; jtulach@1348: } jtulach@1348: jtulach@1348: assert (index >= 0 && index <= length); jtulach@1348: if (index == 0) { jtulach@1348: return 0; jtulach@1348: } jtulach@1348: int len = -lengthInCodePoints; jtulach@1348: for (int i = 0; x > 0 && i < len; i++) { jtulach@1348: if (Character.isLowSurrogate(seq.charAt(--x))) { jtulach@1348: if (x > 0 && Character.isHighSurrogate(seq.charAt(x-1))) { jtulach@1348: x--; jtulach@1348: } jtulach@1348: } jtulach@1348: } jtulach@1348: return index - x; jtulach@1348: } jtulach@1348: jtulach@1348: private static final int countCodePoints(CharSequence seq) { jtulach@1348: int length = seq.length(); jtulach@1348: int n = 0; jtulach@1348: for (int i = 0; i < length; ) { jtulach@1348: n++; jtulach@1348: if (Character.isHighSurrogate(seq.charAt(i++))) { jtulach@1348: if (i < length && Character.isLowSurrogate(seq.charAt(i))) { jtulach@1348: i++; jtulach@1348: } jtulach@1348: } jtulach@1348: } jtulach@1348: return n; jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Creates a bit vector for matching Latin-1 values. A normal BitClass jtulach@1348: * never matches values above Latin-1, and a complemented BitClass always jtulach@1348: * matches values above Latin-1. jtulach@1348: */ jtulach@1348: private static final class BitClass extends BmpCharProperty { jtulach@1348: final boolean[] bits; jtulach@1348: BitClass() { bits = new boolean[256]; } jtulach@1348: private BitClass(boolean[] bits) { this.bits = bits; } jtulach@1348: BitClass add(int c, int flags) { jtulach@1348: assert c >= 0 && c <= 255; jtulach@1348: if ((flags & CASE_INSENSITIVE) != 0) { jtulach@1348: if (ASCII.isAscii(c)) { jtulach@1348: bits[ASCII.toUpper(c)] = true; jtulach@1348: bits[ASCII.toLower(c)] = true; jtulach@1348: } else if ((flags & UNICODE_CASE) != 0) { jtulach@1348: bits[Character.toLowerCase(c)] = true; jtulach@1348: bits[Character.toUpperCase(c)] = true; jtulach@1348: } jtulach@1348: } jtulach@1348: bits[c] = true; jtulach@1348: return this; jtulach@1348: } jtulach@1348: boolean isSatisfiedBy(int ch) { jtulach@1348: return ch < 256 && bits[ch]; jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Returns a suitably optimized, single character matcher. jtulach@1348: */ jtulach@1348: private CharProperty newSingle(final int ch) { jtulach@1348: if (has(CASE_INSENSITIVE)) { jtulach@1348: int lower, upper; jtulach@1348: if (has(UNICODE_CASE)) { jtulach@1348: upper = Character.toUpperCase(ch); jtulach@1348: lower = Character.toLowerCase(upper); jtulach@1348: if (upper != lower) jtulach@1348: return new SingleU(lower); jtulach@1348: } else if (ASCII.isAscii(ch)) { jtulach@1348: lower = ASCII.toLower(ch); jtulach@1348: upper = ASCII.toUpper(ch); jtulach@1348: if (lower != upper) jtulach@1348: return new SingleI(lower, upper); jtulach@1348: } jtulach@1348: } jtulach@1348: if (isSupplementary(ch)) jtulach@1348: return new SingleS(ch); // Match a given Unicode character jtulach@1348: return new Single(ch); // Match a given BMP character jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Utility method for creating a string slice matcher. jtulach@1348: */ jtulach@1348: private Node newSlice(int[] buf, int count, boolean hasSupplementary) { jtulach@1348: int[] tmp = new int[count]; jtulach@1348: if (has(CASE_INSENSITIVE)) { jtulach@1348: if (has(UNICODE_CASE)) { jtulach@1348: for (int i = 0; i < count; i++) { jtulach@1348: tmp[i] = Character.toLowerCase( jtulach@1348: Character.toUpperCase(buf[i])); jtulach@1348: } jtulach@1348: return hasSupplementary? new SliceUS(tmp) : new SliceU(tmp); jtulach@1348: } jtulach@1348: for (int i = 0; i < count; i++) { jtulach@1348: tmp[i] = ASCII.toLower(buf[i]); jtulach@1348: } jtulach@1348: return hasSupplementary? new SliceIS(tmp) : new SliceI(tmp); jtulach@1348: } jtulach@1348: for (int i = 0; i < count; i++) { jtulach@1348: tmp[i] = buf[i]; jtulach@1348: } jtulach@1348: return hasSupplementary ? new SliceS(tmp) : new Slice(tmp); jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * The following classes are the building components of the object jtulach@1348: * tree that represents a compiled regular expression. The object tree jtulach@1348: * is made of individual elements that handle constructs in the Pattern. jtulach@1348: * Each type of object knows how to match its equivalent construct with jtulach@1348: * the match() method. jtulach@1348: */ jtulach@1348: jtulach@1348: /** jtulach@1348: * Base class for all node classes. Subclasses should override the match() jtulach@1348: * method as appropriate. This class is an accepting node, so its match() jtulach@1348: * always returns true. jtulach@1348: */ jtulach@1348: static class Node extends Object { jtulach@1348: Node next; jtulach@1348: Node() { jtulach@1348: next = Pattern.accept; jtulach@1348: } jtulach@1348: /** jtulach@1348: * This method implements the classic accept node. jtulach@1348: */ jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) { jtulach@1348: matcher.last = i; jtulach@1348: matcher.groups[0] = matcher.first; jtulach@1348: matcher.groups[1] = matcher.last; jtulach@1348: return true; jtulach@1348: } jtulach@1348: /** jtulach@1348: * This method is good for all zero length assertions. jtulach@1348: */ jtulach@1348: boolean study(TreeInfo info) { jtulach@1348: if (next != null) { jtulach@1348: return next.study(info); jtulach@1348: } else { jtulach@1348: return info.deterministic; jtulach@1348: } jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: static class LastNode extends Node { jtulach@1348: /** jtulach@1348: * This method implements the classic accept node with jtulach@1348: * the addition of a check to see if the match occurred jtulach@1348: * using all of the input. jtulach@1348: */ jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) { jtulach@1348: if (matcher.acceptMode == Matcher.ENDANCHOR && i != matcher.to) jtulach@1348: return false; jtulach@1348: matcher.last = i; jtulach@1348: matcher.groups[0] = matcher.first; jtulach@1348: matcher.groups[1] = matcher.last; jtulach@1348: return true; jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Used for REs that can start anywhere within the input string. jtulach@1348: * This basically tries to match repeatedly at each spot in the jtulach@1348: * input string, moving forward after each try. An anchored search jtulach@1348: * or a BnM will bypass this node completely. jtulach@1348: */ jtulach@1348: static class Start extends Node { jtulach@1348: int minLength; jtulach@1348: Start(Node node) { jtulach@1348: this.next = node; jtulach@1348: TreeInfo info = new TreeInfo(); jtulach@1348: next.study(info); jtulach@1348: minLength = info.minLength; jtulach@1348: } jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) { jtulach@1348: if (i > matcher.to - minLength) { jtulach@1348: matcher.hitEnd = true; jtulach@1348: return false; jtulach@1348: } jtulach@1348: int guard = matcher.to - minLength; jtulach@1348: for (; i <= guard; i++) { jtulach@1348: if (next.match(matcher, i, seq)) { jtulach@1348: matcher.first = i; jtulach@1348: matcher.groups[0] = matcher.first; jtulach@1348: matcher.groups[1] = matcher.last; jtulach@1348: return true; jtulach@1348: } jtulach@1348: } jtulach@1348: matcher.hitEnd = true; jtulach@1348: return false; jtulach@1348: } jtulach@1348: boolean study(TreeInfo info) { jtulach@1348: next.study(info); jtulach@1348: info.maxValid = false; jtulach@1348: info.deterministic = false; jtulach@1348: return false; jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: /* jtulach@1348: * StartS supports supplementary characters, including unpaired surrogates. jtulach@1348: */ jtulach@1348: static final class StartS extends Start { jtulach@1348: StartS(Node node) { jtulach@1348: super(node); jtulach@1348: } jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) { jtulach@1348: if (i > matcher.to - minLength) { jtulach@1348: matcher.hitEnd = true; jtulach@1348: return false; jtulach@1348: } jtulach@1348: int guard = matcher.to - minLength; jtulach@1348: while (i <= guard) { jtulach@1348: //if ((ret = next.match(matcher, i, seq)) || i == guard) jtulach@1348: if (next.match(matcher, i, seq)) { jtulach@1348: matcher.first = i; jtulach@1348: matcher.groups[0] = matcher.first; jtulach@1348: matcher.groups[1] = matcher.last; jtulach@1348: return true; jtulach@1348: } jtulach@1348: if (i == guard) jtulach@1348: break; jtulach@1348: // Optimization to move to the next character. This is jtulach@1348: // faster than countChars(seq, i, 1). jtulach@1348: if (Character.isHighSurrogate(seq.charAt(i++))) { jtulach@1348: if (i < seq.length() && jtulach@1348: Character.isLowSurrogate(seq.charAt(i))) { jtulach@1348: i++; jtulach@1348: } jtulach@1348: } jtulach@1348: } jtulach@1348: matcher.hitEnd = true; jtulach@1348: return false; jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Node to anchor at the beginning of input. This object implements the jtulach@1348: * match for a \A sequence, and the caret anchor will use this if not in jtulach@1348: * multiline mode. jtulach@1348: */ jtulach@1348: static final class Begin extends Node { jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) { jtulach@1348: int fromIndex = (matcher.anchoringBounds) ? jtulach@1348: matcher.from : 0; jtulach@1348: if (i == fromIndex && next.match(matcher, i, seq)) { jtulach@1348: matcher.first = i; jtulach@1348: matcher.groups[0] = i; jtulach@1348: matcher.groups[1] = matcher.last; jtulach@1348: return true; jtulach@1348: } else { jtulach@1348: return false; jtulach@1348: } jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Node to anchor at the end of input. This is the absolute end, so this jtulach@1348: * should not match at the last newline before the end as $ will. jtulach@1348: */ jtulach@1348: static final class End extends Node { jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) { jtulach@1348: int endIndex = (matcher.anchoringBounds) ? jtulach@1348: matcher.to : matcher.getTextLength(); jtulach@1348: if (i == endIndex) { jtulach@1348: matcher.hitEnd = true; jtulach@1348: return next.match(matcher, i, seq); jtulach@1348: } jtulach@1348: return false; jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Node to anchor at the beginning of a line. This is essentially the jtulach@1348: * object to match for the multiline ^. jtulach@1348: */ jtulach@1348: static final class Caret extends Node { jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) { jtulach@1348: int startIndex = matcher.from; jtulach@1348: int endIndex = matcher.to; jtulach@1348: if (!matcher.anchoringBounds) { jtulach@1348: startIndex = 0; jtulach@1348: endIndex = matcher.getTextLength(); jtulach@1348: } jtulach@1348: // Perl does not match ^ at end of input even after newline jtulach@1348: if (i == endIndex) { jtulach@1348: matcher.hitEnd = true; jtulach@1348: return false; jtulach@1348: } jtulach@1348: if (i > startIndex) { jtulach@1348: char ch = seq.charAt(i-1); jtulach@1348: if (ch != '\n' && ch != '\r' jtulach@1348: && (ch|1) != '\u2029' jtulach@1348: && ch != '\u0085' ) { jtulach@1348: return false; jtulach@1348: } jtulach@1348: // Should treat /r/n as one newline jtulach@1348: if (ch == '\r' && seq.charAt(i) == '\n') jtulach@1348: return false; jtulach@1348: } jtulach@1348: return next.match(matcher, i, seq); jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Node to anchor at the beginning of a line when in unixdot mode. jtulach@1348: */ jtulach@1348: static final class UnixCaret extends Node { jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) { jtulach@1348: int startIndex = matcher.from; jtulach@1348: int endIndex = matcher.to; jtulach@1348: if (!matcher.anchoringBounds) { jtulach@1348: startIndex = 0; jtulach@1348: endIndex = matcher.getTextLength(); jtulach@1348: } jtulach@1348: // Perl does not match ^ at end of input even after newline jtulach@1348: if (i == endIndex) { jtulach@1348: matcher.hitEnd = true; jtulach@1348: return false; jtulach@1348: } jtulach@1348: if (i > startIndex) { jtulach@1348: char ch = seq.charAt(i-1); jtulach@1348: if (ch != '\n') { jtulach@1348: return false; jtulach@1348: } jtulach@1348: } jtulach@1348: return next.match(matcher, i, seq); jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Node to match the location where the last match ended. jtulach@1348: * This is used for the \G construct. jtulach@1348: */ jtulach@1348: static final class LastMatch extends Node { jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) { jtulach@1348: if (i != matcher.oldLast) jtulach@1348: return false; jtulach@1348: return next.match(matcher, i, seq); jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Node to anchor at the end of a line or the end of input based on the jtulach@1348: * multiline mode. jtulach@1348: * jtulach@1348: * When not in multiline mode, the $ can only match at the very end jtulach@1348: * of the input, unless the input ends in a line terminator in which jtulach@1348: * it matches right before the last line terminator. jtulach@1348: * jtulach@1348: * Note that \r\n is considered an atomic line terminator. jtulach@1348: * jtulach@1348: * Like ^ the $ operator matches at a position, it does not match the jtulach@1348: * line terminators themselves. jtulach@1348: */ jtulach@1348: static final class Dollar extends Node { jtulach@1348: boolean multiline; jtulach@1348: Dollar(boolean mul) { jtulach@1348: multiline = mul; jtulach@1348: } jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) { jtulach@1348: int endIndex = (matcher.anchoringBounds) ? jtulach@1348: matcher.to : matcher.getTextLength(); jtulach@1348: if (!multiline) { jtulach@1348: if (i < endIndex - 2) jtulach@1348: return false; jtulach@1348: if (i == endIndex - 2) { jtulach@1348: char ch = seq.charAt(i); jtulach@1348: if (ch != '\r') jtulach@1348: return false; jtulach@1348: ch = seq.charAt(i + 1); jtulach@1348: if (ch != '\n') jtulach@1348: return false; jtulach@1348: } jtulach@1348: } jtulach@1348: // Matches before any line terminator; also matches at the jtulach@1348: // end of input jtulach@1348: // Before line terminator: jtulach@1348: // If multiline, we match here no matter what jtulach@1348: // If not multiline, fall through so that the end jtulach@1348: // is marked as hit; this must be a /r/n or a /n jtulach@1348: // at the very end so the end was hit; more input jtulach@1348: // could make this not match here jtulach@1348: if (i < endIndex) { jtulach@1348: char ch = seq.charAt(i); jtulach@1348: if (ch == '\n') { jtulach@1348: // No match between \r\n jtulach@1348: if (i > 0 && seq.charAt(i-1) == '\r') jtulach@1348: return false; jtulach@1348: if (multiline) jtulach@1348: return next.match(matcher, i, seq); jtulach@1348: } else if (ch == '\r' || ch == '\u0085' || jtulach@1348: (ch|1) == '\u2029') { jtulach@1348: if (multiline) jtulach@1348: return next.match(matcher, i, seq); jtulach@1348: } else { // No line terminator, no match jtulach@1348: return false; jtulach@1348: } jtulach@1348: } jtulach@1348: // Matched at current end so hit end jtulach@1348: matcher.hitEnd = true; jtulach@1348: // If a $ matches because of end of input, then more input jtulach@1348: // could cause it to fail! jtulach@1348: matcher.requireEnd = true; jtulach@1348: return next.match(matcher, i, seq); jtulach@1348: } jtulach@1348: boolean study(TreeInfo info) { jtulach@1348: next.study(info); jtulach@1348: return info.deterministic; jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Node to anchor at the end of a line or the end of input based on the jtulach@1348: * multiline mode when in unix lines mode. jtulach@1348: */ jtulach@1348: static final class UnixDollar extends Node { jtulach@1348: boolean multiline; jtulach@1348: UnixDollar(boolean mul) { jtulach@1348: multiline = mul; jtulach@1348: } jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) { jtulach@1348: int endIndex = (matcher.anchoringBounds) ? jtulach@1348: matcher.to : matcher.getTextLength(); jtulach@1348: if (i < endIndex) { jtulach@1348: char ch = seq.charAt(i); jtulach@1348: if (ch == '\n') { jtulach@1348: // If not multiline, then only possible to jtulach@1348: // match at very end or one before end jtulach@1348: if (multiline == false && i != endIndex - 1) jtulach@1348: return false; jtulach@1348: // If multiline return next.match without setting jtulach@1348: // matcher.hitEnd jtulach@1348: if (multiline) jtulach@1348: return next.match(matcher, i, seq); jtulach@1348: } else { jtulach@1348: return false; jtulach@1348: } jtulach@1348: } jtulach@1348: // Matching because at the end or 1 before the end; jtulach@1348: // more input could change this so set hitEnd jtulach@1348: matcher.hitEnd = true; jtulach@1348: // If a $ matches because of end of input, then more input jtulach@1348: // could cause it to fail! jtulach@1348: matcher.requireEnd = true; jtulach@1348: return next.match(matcher, i, seq); jtulach@1348: } jtulach@1348: boolean study(TreeInfo info) { jtulach@1348: next.study(info); jtulach@1348: return info.deterministic; jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Abstract node class to match one character satisfying some jtulach@1348: * boolean property. jtulach@1348: */ jtulach@1348: private static abstract class CharProperty extends Node { jtulach@1348: abstract boolean isSatisfiedBy(int ch); jtulach@1348: CharProperty complement() { jtulach@1348: return new CharProperty() { jtulach@1348: boolean isSatisfiedBy(int ch) { jtulach@1348: return ! CharProperty.this.isSatisfiedBy(ch);}}; jtulach@1348: } jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) { jtulach@1348: if (i < matcher.to) { jtulach@1348: int ch = Character.codePointAt(seq, i); jtulach@1348: return isSatisfiedBy(ch) jtulach@1348: && next.match(matcher, i+Character.charCount(ch), seq); jtulach@1348: } else { jtulach@1348: matcher.hitEnd = true; jtulach@1348: return false; jtulach@1348: } jtulach@1348: } jtulach@1348: boolean study(TreeInfo info) { jtulach@1348: info.minLength++; jtulach@1348: info.maxLength++; jtulach@1348: return next.study(info); jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Optimized version of CharProperty that works only for jtulach@1348: * properties never satisfied by Supplementary characters. jtulach@1348: */ jtulach@1348: private static abstract class BmpCharProperty extends CharProperty { jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) { jtulach@1348: if (i < matcher.to) { jtulach@1348: return isSatisfiedBy(seq.charAt(i)) jtulach@1348: && next.match(matcher, i+1, seq); jtulach@1348: } else { jtulach@1348: matcher.hitEnd = true; jtulach@1348: return false; jtulach@1348: } jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Node class that matches a Supplementary Unicode character jtulach@1348: */ jtulach@1348: static final class SingleS extends CharProperty { jtulach@1348: final int c; jtulach@1348: SingleS(int c) { this.c = c; } jtulach@1348: boolean isSatisfiedBy(int ch) { jtulach@1348: return ch == c; jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Optimization -- matches a given BMP character jtulach@1348: */ jtulach@1348: static final class Single extends BmpCharProperty { jtulach@1348: final int c; jtulach@1348: Single(int c) { this.c = c; } jtulach@1348: boolean isSatisfiedBy(int ch) { jtulach@1348: return ch == c; jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Case insensitive matches a given BMP character jtulach@1348: */ jtulach@1348: static final class SingleI extends BmpCharProperty { jtulach@1348: final int lower; jtulach@1348: final int upper; jtulach@1348: SingleI(int lower, int upper) { jtulach@1348: this.lower = lower; jtulach@1348: this.upper = upper; jtulach@1348: } jtulach@1348: boolean isSatisfiedBy(int ch) { jtulach@1348: return ch == lower || ch == upper; jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Unicode case insensitive matches a given Unicode character jtulach@1348: */ jtulach@1348: static final class SingleU extends CharProperty { jtulach@1348: final int lower; jtulach@1348: SingleU(int lower) { jtulach@1348: this.lower = lower; jtulach@1348: } jtulach@1348: boolean isSatisfiedBy(int ch) { jtulach@1348: return lower == ch || jtulach@1348: lower == Character.toLowerCase(Character.toUpperCase(ch)); jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: jtulach@1348: /** jtulach@1348: * Node class that matches a Unicode block. jtulach@1350: * jtulach@1348: static final class Block extends CharProperty { jtulach@1348: final Character.UnicodeBlock block; jtulach@1348: Block(Character.UnicodeBlock block) { jtulach@1348: this.block = block; jtulach@1348: } jtulach@1348: boolean isSatisfiedBy(int ch) { jtulach@1348: return block == Character.UnicodeBlock.of(ch); jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Node class that matches a Unicode script jtulach@1350: * jtulach@1348: static final class Script extends CharProperty { jtulach@1348: final Character.UnicodeScript script; jtulach@1348: Script(Character.UnicodeScript script) { jtulach@1348: this.script = script; jtulach@1348: } jtulach@1348: boolean isSatisfiedBy(int ch) { jtulach@1348: return script == Character.UnicodeScript.of(ch); jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Node class that matches a Unicode category. jtulach@1348: */ jtulach@1348: static final class Category extends CharProperty { jtulach@1348: final int typeMask; jtulach@1348: Category(int typeMask) { this.typeMask = typeMask; } jtulach@1348: boolean isSatisfiedBy(int ch) { jtulach@1348: return (typeMask & (1 << Character.getType(ch))) != 0; jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Node class that matches a Unicode "type" jtulach@1348: */ jtulach@1348: static final class Utype extends CharProperty { jtulach@1348: final UnicodeProp uprop; jtulach@1348: Utype(UnicodeProp uprop) { this.uprop = uprop; } jtulach@1348: boolean isSatisfiedBy(int ch) { jtulach@1348: return uprop.is(ch); jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: jtulach@1348: /** jtulach@1348: * Node class that matches a POSIX type. jtulach@1348: */ jtulach@1348: static final class Ctype extends BmpCharProperty { jtulach@1348: final int ctype; jtulach@1348: Ctype(int ctype) { this.ctype = ctype; } jtulach@1348: boolean isSatisfiedBy(int ch) { jtulach@1348: return ch < 128 && ASCII.isType(ch, ctype); jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Base class for all Slice nodes jtulach@1348: */ jtulach@1348: static class SliceNode extends Node { jtulach@1348: int[] buffer; jtulach@1348: SliceNode(int[] buf) { jtulach@1348: buffer = buf; jtulach@1348: } jtulach@1348: boolean study(TreeInfo info) { jtulach@1348: info.minLength += buffer.length; jtulach@1348: info.maxLength += buffer.length; jtulach@1348: return next.study(info); jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Node class for a case sensitive/BMP-only sequence of literal jtulach@1348: * characters. jtulach@1348: */ jtulach@1348: static final class Slice extends SliceNode { jtulach@1348: Slice(int[] buf) { jtulach@1348: super(buf); jtulach@1348: } jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) { jtulach@1348: int[] buf = buffer; jtulach@1348: int len = buf.length; jtulach@1348: for (int j=0; j= matcher.to) { jtulach@1348: matcher.hitEnd = true; jtulach@1348: return false; jtulach@1348: } jtulach@1348: if (buf[j] != seq.charAt(i+j)) jtulach@1348: return false; jtulach@1348: } jtulach@1348: return next.match(matcher, i+len, seq); jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Node class for a case_insensitive/BMP-only sequence of literal jtulach@1348: * characters. jtulach@1348: */ jtulach@1348: static class SliceI extends SliceNode { jtulach@1348: SliceI(int[] buf) { jtulach@1348: super(buf); jtulach@1348: } jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) { jtulach@1348: int[] buf = buffer; jtulach@1348: int len = buf.length; jtulach@1348: for (int j=0; j= matcher.to) { jtulach@1348: matcher.hitEnd = true; jtulach@1348: return false; jtulach@1348: } jtulach@1348: int c = seq.charAt(i+j); jtulach@1348: if (buf[j] != c && jtulach@1348: buf[j] != ASCII.toLower(c)) jtulach@1348: return false; jtulach@1348: } jtulach@1348: return next.match(matcher, i+len, seq); jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Node class for a unicode_case_insensitive/BMP-only sequence of jtulach@1348: * literal characters. Uses unicode case folding. jtulach@1348: */ jtulach@1348: static final class SliceU extends SliceNode { jtulach@1348: SliceU(int[] buf) { jtulach@1348: super(buf); jtulach@1348: } jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) { jtulach@1348: int[] buf = buffer; jtulach@1348: int len = buf.length; jtulach@1348: for (int j=0; j= matcher.to) { jtulach@1348: matcher.hitEnd = true; jtulach@1348: return false; jtulach@1348: } jtulach@1348: int c = seq.charAt(i+j); jtulach@1348: if (buf[j] != c && jtulach@1348: buf[j] != Character.toLowerCase(Character.toUpperCase(c))) jtulach@1348: return false; jtulach@1348: } jtulach@1348: return next.match(matcher, i+len, seq); jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Node class for a case sensitive sequence of literal characters jtulach@1348: * including supplementary characters. jtulach@1348: */ jtulach@1348: static final class SliceS extends SliceNode { jtulach@1348: SliceS(int[] buf) { jtulach@1348: super(buf); jtulach@1348: } jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) { jtulach@1348: int[] buf = buffer; jtulach@1348: int x = i; jtulach@1348: for (int j = 0; j < buf.length; j++) { jtulach@1348: if (x >= matcher.to) { jtulach@1348: matcher.hitEnd = true; jtulach@1348: return false; jtulach@1348: } jtulach@1348: int c = Character.codePointAt(seq, x); jtulach@1348: if (buf[j] != c) jtulach@1348: return false; jtulach@1348: x += Character.charCount(c); jtulach@1348: if (x > matcher.to) { jtulach@1348: matcher.hitEnd = true; jtulach@1348: return false; jtulach@1348: } jtulach@1348: } jtulach@1348: return next.match(matcher, x, seq); jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Node class for a case insensitive sequence of literal characters jtulach@1348: * including supplementary characters. jtulach@1348: */ jtulach@1348: static class SliceIS extends SliceNode { jtulach@1348: SliceIS(int[] buf) { jtulach@1348: super(buf); jtulach@1348: } jtulach@1348: int toLower(int c) { jtulach@1348: return ASCII.toLower(c); jtulach@1348: } jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) { jtulach@1348: int[] buf = buffer; jtulach@1348: int x = i; jtulach@1348: for (int j = 0; j < buf.length; j++) { jtulach@1348: if (x >= matcher.to) { jtulach@1348: matcher.hitEnd = true; jtulach@1348: return false; jtulach@1348: } jtulach@1348: int c = Character.codePointAt(seq, x); jtulach@1348: if (buf[j] != c && buf[j] != toLower(c)) jtulach@1348: return false; jtulach@1348: x += Character.charCount(c); jtulach@1348: if (x > matcher.to) { jtulach@1348: matcher.hitEnd = true; jtulach@1348: return false; jtulach@1348: } jtulach@1348: } jtulach@1348: return next.match(matcher, x, seq); jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Node class for a case insensitive sequence of literal characters. jtulach@1348: * Uses unicode case folding. jtulach@1348: */ jtulach@1348: static final class SliceUS extends SliceIS { jtulach@1348: SliceUS(int[] buf) { jtulach@1348: super(buf); jtulach@1348: } jtulach@1348: int toLower(int c) { jtulach@1348: return Character.toLowerCase(Character.toUpperCase(c)); jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: private static boolean inRange(int lower, int ch, int upper) { jtulach@1348: return lower <= ch && ch <= upper; jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Returns node for matching characters within an explicit value range. jtulach@1348: */ jtulach@1348: private static CharProperty rangeFor(final int lower, jtulach@1348: final int upper) { jtulach@1348: return new CharProperty() { jtulach@1348: boolean isSatisfiedBy(int ch) { jtulach@1348: return inRange(lower, ch, upper);}}; jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Returns node for matching characters within an explicit value jtulach@1348: * range in a case insensitive manner. jtulach@1348: */ jtulach@1348: private CharProperty caseInsensitiveRangeFor(final int lower, jtulach@1348: final int upper) { jtulach@1348: if (has(UNICODE_CASE)) jtulach@1348: return new CharProperty() { jtulach@1348: boolean isSatisfiedBy(int ch) { jtulach@1348: if (inRange(lower, ch, upper)) jtulach@1348: return true; jtulach@1348: int up = Character.toUpperCase(ch); jtulach@1348: return inRange(lower, up, upper) || jtulach@1348: inRange(lower, Character.toLowerCase(up), upper);}}; jtulach@1348: return new CharProperty() { jtulach@1348: boolean isSatisfiedBy(int ch) { jtulach@1348: return inRange(lower, ch, upper) || jtulach@1348: ASCII.isAscii(ch) && jtulach@1348: (inRange(lower, ASCII.toUpper(ch), upper) || jtulach@1348: inRange(lower, ASCII.toLower(ch), upper)); jtulach@1348: }}; jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Implements the Unicode category ALL and the dot metacharacter when jtulach@1348: * in dotall mode. jtulach@1348: */ jtulach@1348: static final class All extends CharProperty { jtulach@1348: boolean isSatisfiedBy(int ch) { jtulach@1348: return true; jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Node class for the dot metacharacter when dotall is not enabled. jtulach@1348: */ jtulach@1348: static final class Dot extends CharProperty { jtulach@1348: boolean isSatisfiedBy(int ch) { jtulach@1348: return (ch != '\n' && ch != '\r' jtulach@1348: && (ch|1) != '\u2029' jtulach@1348: && ch != '\u0085'); jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Node class for the dot metacharacter when dotall is not enabled jtulach@1348: * but UNIX_LINES is enabled. jtulach@1348: */ jtulach@1348: static final class UnixDot extends CharProperty { jtulach@1348: boolean isSatisfiedBy(int ch) { jtulach@1348: return ch != '\n'; jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * The 0 or 1 quantifier. This one class implements all three types. jtulach@1348: */ jtulach@1348: static final class Ques extends Node { jtulach@1348: Node atom; jtulach@1348: int type; jtulach@1348: Ques(Node node, int type) { jtulach@1348: this.atom = node; jtulach@1348: this.type = type; jtulach@1348: } jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) { jtulach@1348: switch (type) { jtulach@1348: case GREEDY: jtulach@1348: return (atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq)) jtulach@1348: || next.match(matcher, i, seq); jtulach@1348: case LAZY: jtulach@1348: return next.match(matcher, i, seq) jtulach@1348: || (atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq)); jtulach@1348: case POSSESSIVE: jtulach@1348: if (atom.match(matcher, i, seq)) i = matcher.last; jtulach@1348: return next.match(matcher, i, seq); jtulach@1348: default: jtulach@1348: return atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq); jtulach@1348: } jtulach@1348: } jtulach@1348: boolean study(TreeInfo info) { jtulach@1348: if (type != INDEPENDENT) { jtulach@1348: int minL = info.minLength; jtulach@1348: atom.study(info); jtulach@1348: info.minLength = minL; jtulach@1348: info.deterministic = false; jtulach@1348: return next.study(info); jtulach@1348: } else { jtulach@1348: atom.study(info); jtulach@1348: return next.study(info); jtulach@1348: } jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Handles the curly-brace style repetition with a specified minimum and jtulach@1348: * maximum occurrences. The * quantifier is handled as a special case. jtulach@1348: * This class handles the three types. jtulach@1348: */ jtulach@1348: static final class Curly extends Node { jtulach@1348: Node atom; jtulach@1348: int type; jtulach@1348: int cmin; jtulach@1348: int cmax; jtulach@1348: jtulach@1348: Curly(Node node, int cmin, int cmax, int type) { jtulach@1348: this.atom = node; jtulach@1348: this.type = type; jtulach@1348: this.cmin = cmin; jtulach@1348: this.cmax = cmax; jtulach@1348: } jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) { jtulach@1348: int j; jtulach@1348: for (j = 0; j < cmin; j++) { jtulach@1348: if (atom.match(matcher, i, seq)) { jtulach@1348: i = matcher.last; jtulach@1348: continue; jtulach@1348: } jtulach@1348: return false; jtulach@1348: } jtulach@1348: if (type == GREEDY) jtulach@1348: return match0(matcher, i, j, seq); jtulach@1348: else if (type == LAZY) jtulach@1348: return match1(matcher, i, j, seq); jtulach@1348: else jtulach@1348: return match2(matcher, i, j, seq); jtulach@1348: } jtulach@1348: // Greedy match. jtulach@1348: // i is the index to start matching at jtulach@1348: // j is the number of atoms that have matched jtulach@1348: boolean match0(Matcher matcher, int i, int j, CharSequence seq) { jtulach@1348: if (j >= cmax) { jtulach@1348: // We have matched the maximum... continue with the rest of jtulach@1348: // the regular expression jtulach@1348: return next.match(matcher, i, seq); jtulach@1348: } jtulach@1348: int backLimit = j; jtulach@1348: while (atom.match(matcher, i, seq)) { jtulach@1348: // k is the length of this match jtulach@1348: int k = matcher.last - i; jtulach@1348: if (k == 0) // Zero length match jtulach@1348: break; jtulach@1348: // Move up index and number matched jtulach@1348: i = matcher.last; jtulach@1348: j++; jtulach@1348: // We are greedy so match as many as we can jtulach@1348: while (j < cmax) { jtulach@1348: if (!atom.match(matcher, i, seq)) jtulach@1348: break; jtulach@1348: if (i + k != matcher.last) { jtulach@1348: if (match0(matcher, matcher.last, j+1, seq)) jtulach@1348: return true; jtulach@1348: break; jtulach@1348: } jtulach@1348: i += k; jtulach@1348: j++; jtulach@1348: } jtulach@1348: // Handle backing off if match fails jtulach@1348: while (j >= backLimit) { jtulach@1348: if (next.match(matcher, i, seq)) jtulach@1348: return true; jtulach@1348: i -= k; jtulach@1348: j--; jtulach@1348: } jtulach@1348: return false; jtulach@1348: } jtulach@1348: return next.match(matcher, i, seq); jtulach@1348: } jtulach@1348: // Reluctant match. At this point, the minimum has been satisfied. jtulach@1348: // i is the index to start matching at jtulach@1348: // j is the number of atoms that have matched jtulach@1348: boolean match1(Matcher matcher, int i, int j, CharSequence seq) { jtulach@1348: for (;;) { jtulach@1348: // Try finishing match without consuming any more jtulach@1348: if (next.match(matcher, i, seq)) jtulach@1348: return true; jtulach@1348: // At the maximum, no match found jtulach@1348: if (j >= cmax) jtulach@1348: return false; jtulach@1348: // Okay, must try one more atom jtulach@1348: if (!atom.match(matcher, i, seq)) jtulach@1348: return false; jtulach@1348: // If we haven't moved forward then must break out jtulach@1348: if (i == matcher.last) jtulach@1348: return false; jtulach@1348: // Move up index and number matched jtulach@1348: i = matcher.last; jtulach@1348: j++; jtulach@1348: } jtulach@1348: } jtulach@1348: boolean match2(Matcher matcher, int i, int j, CharSequence seq) { jtulach@1348: for (; j < cmax; j++) { jtulach@1348: if (!atom.match(matcher, i, seq)) jtulach@1348: break; jtulach@1348: if (i == matcher.last) jtulach@1348: break; jtulach@1348: i = matcher.last; jtulach@1348: } jtulach@1348: return next.match(matcher, i, seq); jtulach@1348: } jtulach@1348: boolean study(TreeInfo info) { jtulach@1348: // Save original info jtulach@1348: int minL = info.minLength; jtulach@1348: int maxL = info.maxLength; jtulach@1348: boolean maxV = info.maxValid; jtulach@1348: boolean detm = info.deterministic; jtulach@1348: info.reset(); jtulach@1348: jtulach@1348: atom.study(info); jtulach@1348: jtulach@1348: int temp = info.minLength * cmin + minL; jtulach@1348: if (temp < minL) { jtulach@1348: temp = 0xFFFFFFF; // arbitrary large number jtulach@1348: } jtulach@1348: info.minLength = temp; jtulach@1348: jtulach@1348: if (maxV & info.maxValid) { jtulach@1348: temp = info.maxLength * cmax + maxL; jtulach@1348: info.maxLength = temp; jtulach@1348: if (temp < maxL) { jtulach@1348: info.maxValid = false; jtulach@1348: } jtulach@1348: } else { jtulach@1348: info.maxValid = false; jtulach@1348: } jtulach@1348: jtulach@1348: if (info.deterministic && cmin == cmax) jtulach@1348: info.deterministic = detm; jtulach@1348: else jtulach@1348: info.deterministic = false; jtulach@1348: jtulach@1348: return next.study(info); jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Handles the curly-brace style repetition with a specified minimum and jtulach@1348: * maximum occurrences in deterministic cases. This is an iterative jtulach@1348: * optimization over the Prolog and Loop system which would handle this jtulach@1348: * in a recursive way. The * quantifier is handled as a special case. jtulach@1348: * If capture is true then this class saves group settings and ensures jtulach@1348: * that groups are unset when backing off of a group match. jtulach@1348: */ jtulach@1348: static final class GroupCurly extends Node { jtulach@1348: Node atom; jtulach@1348: int type; jtulach@1348: int cmin; jtulach@1348: int cmax; jtulach@1348: int localIndex; jtulach@1348: int groupIndex; jtulach@1348: boolean capture; jtulach@1348: jtulach@1348: GroupCurly(Node node, int cmin, int cmax, int type, int local, jtulach@1348: int group, boolean capture) { jtulach@1348: this.atom = node; jtulach@1348: this.type = type; jtulach@1348: this.cmin = cmin; jtulach@1348: this.cmax = cmax; jtulach@1348: this.localIndex = local; jtulach@1348: this.groupIndex = group; jtulach@1348: this.capture = capture; jtulach@1348: } jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) { jtulach@1348: int[] groups = matcher.groups; jtulach@1348: int[] locals = matcher.locals; jtulach@1348: int save0 = locals[localIndex]; jtulach@1348: int save1 = 0; jtulach@1348: int save2 = 0; jtulach@1348: jtulach@1348: if (capture) { jtulach@1348: save1 = groups[groupIndex]; jtulach@1348: save2 = groups[groupIndex+1]; jtulach@1348: } jtulach@1348: jtulach@1348: // Notify GroupTail there is no need to setup group info jtulach@1348: // because it will be set here jtulach@1348: locals[localIndex] = -1; jtulach@1348: jtulach@1348: boolean ret = true; jtulach@1348: for (int j = 0; j < cmin; j++) { jtulach@1348: if (atom.match(matcher, i, seq)) { jtulach@1348: if (capture) { jtulach@1348: groups[groupIndex] = i; jtulach@1348: groups[groupIndex+1] = matcher.last; jtulach@1348: } jtulach@1348: i = matcher.last; jtulach@1348: } else { jtulach@1348: ret = false; jtulach@1348: break; jtulach@1348: } jtulach@1348: } jtulach@1348: if (ret) { jtulach@1348: if (type == GREEDY) { jtulach@1348: ret = match0(matcher, i, cmin, seq); jtulach@1348: } else if (type == LAZY) { jtulach@1348: ret = match1(matcher, i, cmin, seq); jtulach@1348: } else { jtulach@1348: ret = match2(matcher, i, cmin, seq); jtulach@1348: } jtulach@1348: } jtulach@1348: if (!ret) { jtulach@1348: locals[localIndex] = save0; jtulach@1348: if (capture) { jtulach@1348: groups[groupIndex] = save1; jtulach@1348: groups[groupIndex+1] = save2; jtulach@1348: } jtulach@1348: } jtulach@1348: return ret; jtulach@1348: } jtulach@1348: // Aggressive group match jtulach@1348: boolean match0(Matcher matcher, int i, int j, CharSequence seq) { jtulach@1348: int[] groups = matcher.groups; jtulach@1348: int save0 = 0; jtulach@1348: int save1 = 0; jtulach@1348: if (capture) { jtulach@1348: save0 = groups[groupIndex]; jtulach@1348: save1 = groups[groupIndex+1]; jtulach@1348: } jtulach@1348: for (;;) { jtulach@1348: if (j >= cmax) jtulach@1348: break; jtulach@1348: if (!atom.match(matcher, i, seq)) jtulach@1348: break; jtulach@1348: int k = matcher.last - i; jtulach@1348: if (k <= 0) { jtulach@1348: if (capture) { jtulach@1348: groups[groupIndex] = i; jtulach@1348: groups[groupIndex+1] = i + k; jtulach@1348: } jtulach@1348: i = i + k; jtulach@1348: break; jtulach@1348: } jtulach@1348: for (;;) { jtulach@1348: if (capture) { jtulach@1348: groups[groupIndex] = i; jtulach@1348: groups[groupIndex+1] = i + k; jtulach@1348: } jtulach@1348: i = i + k; jtulach@1348: if (++j >= cmax) jtulach@1348: break; jtulach@1348: if (!atom.match(matcher, i, seq)) jtulach@1348: break; jtulach@1348: if (i + k != matcher.last) { jtulach@1348: if (match0(matcher, i, j, seq)) jtulach@1348: return true; jtulach@1348: break; jtulach@1348: } jtulach@1348: } jtulach@1348: while (j > cmin) { jtulach@1348: if (next.match(matcher, i, seq)) { jtulach@1348: if (capture) { jtulach@1348: groups[groupIndex+1] = i; jtulach@1348: groups[groupIndex] = i - k; jtulach@1348: } jtulach@1348: i = i - k; jtulach@1348: return true; jtulach@1348: } jtulach@1348: // backing off jtulach@1348: if (capture) { jtulach@1348: groups[groupIndex+1] = i; jtulach@1348: groups[groupIndex] = i - k; jtulach@1348: } jtulach@1348: i = i - k; jtulach@1348: j--; jtulach@1348: } jtulach@1348: break; jtulach@1348: } jtulach@1348: if (capture) { jtulach@1348: groups[groupIndex] = save0; jtulach@1348: groups[groupIndex+1] = save1; jtulach@1348: } jtulach@1348: return next.match(matcher, i, seq); jtulach@1348: } jtulach@1348: // Reluctant matching jtulach@1348: boolean match1(Matcher matcher, int i, int j, CharSequence seq) { jtulach@1348: for (;;) { jtulach@1348: if (next.match(matcher, i, seq)) jtulach@1348: return true; jtulach@1348: if (j >= cmax) jtulach@1348: return false; jtulach@1348: if (!atom.match(matcher, i, seq)) jtulach@1348: return false; jtulach@1348: if (i == matcher.last) jtulach@1348: return false; jtulach@1348: if (capture) { jtulach@1348: matcher.groups[groupIndex] = i; jtulach@1348: matcher.groups[groupIndex+1] = matcher.last; jtulach@1348: } jtulach@1348: i = matcher.last; jtulach@1348: j++; jtulach@1348: } jtulach@1348: } jtulach@1348: // Possessive matching jtulach@1348: boolean match2(Matcher matcher, int i, int j, CharSequence seq) { jtulach@1348: for (; j < cmax; j++) { jtulach@1348: if (!atom.match(matcher, i, seq)) { jtulach@1348: break; jtulach@1348: } jtulach@1348: if (capture) { jtulach@1348: matcher.groups[groupIndex] = i; jtulach@1348: matcher.groups[groupIndex+1] = matcher.last; jtulach@1348: } jtulach@1348: if (i == matcher.last) { jtulach@1348: break; jtulach@1348: } jtulach@1348: i = matcher.last; jtulach@1348: } jtulach@1348: return next.match(matcher, i, seq); jtulach@1348: } jtulach@1348: boolean study(TreeInfo info) { jtulach@1348: // Save original info jtulach@1348: int minL = info.minLength; jtulach@1348: int maxL = info.maxLength; jtulach@1348: boolean maxV = info.maxValid; jtulach@1348: boolean detm = info.deterministic; jtulach@1348: info.reset(); jtulach@1348: jtulach@1348: atom.study(info); jtulach@1348: jtulach@1348: int temp = info.minLength * cmin + minL; jtulach@1348: if (temp < minL) { jtulach@1348: temp = 0xFFFFFFF; // Arbitrary large number jtulach@1348: } jtulach@1348: info.minLength = temp; jtulach@1348: jtulach@1348: if (maxV & info.maxValid) { jtulach@1348: temp = info.maxLength * cmax + maxL; jtulach@1348: info.maxLength = temp; jtulach@1348: if (temp < maxL) { jtulach@1348: info.maxValid = false; jtulach@1348: } jtulach@1348: } else { jtulach@1348: info.maxValid = false; jtulach@1348: } jtulach@1348: jtulach@1348: if (info.deterministic && cmin == cmax) { jtulach@1348: info.deterministic = detm; jtulach@1348: } else { jtulach@1348: info.deterministic = false; jtulach@1348: } jtulach@1348: jtulach@1348: return next.study(info); jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * A Guard node at the end of each atom node in a Branch. It jtulach@1348: * serves the purpose of chaining the "match" operation to jtulach@1348: * "next" but not the "study", so we can collect the TreeInfo jtulach@1348: * of each atom node without including the TreeInfo of the jtulach@1348: * "next". jtulach@1348: */ jtulach@1348: static final class BranchConn extends Node { jtulach@1348: BranchConn() {}; jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) { jtulach@1348: return next.match(matcher, i, seq); jtulach@1348: } jtulach@1348: boolean study(TreeInfo info) { jtulach@1348: return info.deterministic; jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Handles the branching of alternations. Note this is also used for jtulach@1348: * the ? quantifier to branch between the case where it matches once jtulach@1348: * and where it does not occur. jtulach@1348: */ jtulach@1348: static final class Branch extends Node { jtulach@1348: Node[] atoms = new Node[2]; jtulach@1348: int size = 2; jtulach@1348: Node conn; jtulach@1348: Branch(Node first, Node second, Node branchConn) { jtulach@1348: conn = branchConn; jtulach@1348: atoms[0] = first; jtulach@1348: atoms[1] = second; jtulach@1348: } jtulach@1348: jtulach@1348: void add(Node node) { jtulach@1348: if (size >= atoms.length) { jtulach@1348: Node[] tmp = new Node[atoms.length*2]; jtulach@1348: System.arraycopy(atoms, 0, tmp, 0, atoms.length); jtulach@1348: atoms = tmp; jtulach@1348: } jtulach@1348: atoms[size++] = node; jtulach@1348: } jtulach@1348: jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) { jtulach@1348: for (int n = 0; n < size; n++) { jtulach@1348: if (atoms[n] == null) { jtulach@1348: if (conn.next.match(matcher, i, seq)) jtulach@1348: return true; jtulach@1348: } else if (atoms[n].match(matcher, i, seq)) { jtulach@1348: return true; jtulach@1348: } jtulach@1348: } jtulach@1348: return false; jtulach@1348: } jtulach@1348: jtulach@1348: boolean study(TreeInfo info) { jtulach@1348: int minL = info.minLength; jtulach@1348: int maxL = info.maxLength; jtulach@1348: boolean maxV = info.maxValid; jtulach@1348: jtulach@1348: int minL2 = Integer.MAX_VALUE; //arbitrary large enough num jtulach@1348: int maxL2 = -1; jtulach@1348: for (int n = 0; n < size; n++) { jtulach@1348: info.reset(); jtulach@1348: if (atoms[n] != null) jtulach@1348: atoms[n].study(info); jtulach@1348: minL2 = Math.min(minL2, info.minLength); jtulach@1348: maxL2 = Math.max(maxL2, info.maxLength); jtulach@1348: maxV = (maxV & info.maxValid); jtulach@1348: } jtulach@1348: jtulach@1348: minL += minL2; jtulach@1348: maxL += maxL2; jtulach@1348: jtulach@1348: info.reset(); jtulach@1348: conn.next.study(info); jtulach@1348: jtulach@1348: info.minLength += minL; jtulach@1348: info.maxLength += maxL; jtulach@1348: info.maxValid &= maxV; jtulach@1348: info.deterministic = false; jtulach@1348: return false; jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * The GroupHead saves the location where the group begins in the locals jtulach@1348: * and restores them when the match is done. jtulach@1348: * jtulach@1348: * The matchRef is used when a reference to this group is accessed later jtulach@1348: * in the expression. The locals will have a negative value in them to jtulach@1348: * indicate that we do not want to unset the group if the reference jtulach@1348: * doesn't match. jtulach@1348: */ jtulach@1348: static final class GroupHead extends Node { jtulach@1348: int localIndex; jtulach@1348: GroupHead(int localCount) { jtulach@1348: localIndex = localCount; jtulach@1348: } jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) { jtulach@1348: int save = matcher.locals[localIndex]; jtulach@1348: matcher.locals[localIndex] = i; jtulach@1348: boolean ret = next.match(matcher, i, seq); jtulach@1348: matcher.locals[localIndex] = save; jtulach@1348: return ret; jtulach@1348: } jtulach@1348: boolean matchRef(Matcher matcher, int i, CharSequence seq) { jtulach@1348: int save = matcher.locals[localIndex]; jtulach@1348: matcher.locals[localIndex] = ~i; // HACK jtulach@1348: boolean ret = next.match(matcher, i, seq); jtulach@1348: matcher.locals[localIndex] = save; jtulach@1348: return ret; jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Recursive reference to a group in the regular expression. It calls jtulach@1348: * matchRef because if the reference fails to match we would not unset jtulach@1348: * the group. jtulach@1348: */ jtulach@1348: static final class GroupRef extends Node { jtulach@1348: GroupHead head; jtulach@1348: GroupRef(GroupHead head) { jtulach@1348: this.head = head; jtulach@1348: } jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) { jtulach@1348: return head.matchRef(matcher, i, seq) jtulach@1348: && next.match(matcher, matcher.last, seq); jtulach@1348: } jtulach@1348: boolean study(TreeInfo info) { jtulach@1348: info.maxValid = false; jtulach@1348: info.deterministic = false; jtulach@1348: return next.study(info); jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * The GroupTail handles the setting of group beginning and ending jtulach@1348: * locations when groups are successfully matched. It must also be able to jtulach@1348: * unset groups that have to be backed off of. jtulach@1348: * jtulach@1348: * The GroupTail node is also used when a previous group is referenced, jtulach@1348: * and in that case no group information needs to be set. jtulach@1348: */ jtulach@1348: static final class GroupTail extends Node { jtulach@1348: int localIndex; jtulach@1348: int groupIndex; jtulach@1348: GroupTail(int localCount, int groupCount) { jtulach@1348: localIndex = localCount; jtulach@1348: groupIndex = groupCount + groupCount; jtulach@1348: } jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) { jtulach@1348: int tmp = matcher.locals[localIndex]; jtulach@1348: if (tmp >= 0) { // This is the normal group case. jtulach@1348: // Save the group so we can unset it if it jtulach@1348: // backs off of a match. jtulach@1348: int groupStart = matcher.groups[groupIndex]; jtulach@1348: int groupEnd = matcher.groups[groupIndex+1]; jtulach@1348: jtulach@1348: matcher.groups[groupIndex] = tmp; jtulach@1348: matcher.groups[groupIndex+1] = i; jtulach@1348: if (next.match(matcher, i, seq)) { jtulach@1348: return true; jtulach@1348: } jtulach@1348: matcher.groups[groupIndex] = groupStart; jtulach@1348: matcher.groups[groupIndex+1] = groupEnd; jtulach@1348: return false; jtulach@1348: } else { jtulach@1348: // This is a group reference case. We don't need to save any jtulach@1348: // group info because it isn't really a group. jtulach@1348: matcher.last = i; jtulach@1348: return true; jtulach@1348: } jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * This sets up a loop to handle a recursive quantifier structure. jtulach@1348: */ jtulach@1348: static final class Prolog extends Node { jtulach@1348: Loop loop; jtulach@1348: Prolog(Loop loop) { jtulach@1348: this.loop = loop; jtulach@1348: } jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) { jtulach@1348: return loop.matchInit(matcher, i, seq); jtulach@1348: } jtulach@1348: boolean study(TreeInfo info) { jtulach@1348: return loop.study(info); jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Handles the repetition count for a greedy Curly. The matchInit jtulach@1348: * is called from the Prolog to save the index of where the group jtulach@1348: * beginning is stored. A zero length group check occurs in the jtulach@1348: * normal match but is skipped in the matchInit. jtulach@1348: */ jtulach@1348: static class Loop extends Node { jtulach@1348: Node body; jtulach@1348: int countIndex; // local count index in matcher locals jtulach@1348: int beginIndex; // group beginning index jtulach@1348: int cmin, cmax; jtulach@1348: Loop(int countIndex, int beginIndex) { jtulach@1348: this.countIndex = countIndex; jtulach@1348: this.beginIndex = beginIndex; jtulach@1348: } jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) { jtulach@1348: // Avoid infinite loop in zero-length case. jtulach@1348: if (i > matcher.locals[beginIndex]) { jtulach@1348: int count = matcher.locals[countIndex]; jtulach@1348: jtulach@1348: // This block is for before we reach the minimum jtulach@1348: // iterations required for the loop to match jtulach@1348: if (count < cmin) { jtulach@1348: matcher.locals[countIndex] = count + 1; jtulach@1348: boolean b = body.match(matcher, i, seq); jtulach@1348: // If match failed we must backtrack, so jtulach@1348: // the loop count should NOT be incremented jtulach@1348: if (!b) jtulach@1348: matcher.locals[countIndex] = count; jtulach@1348: // Return success or failure since we are under jtulach@1348: // minimum jtulach@1348: return b; jtulach@1348: } jtulach@1348: // This block is for after we have the minimum jtulach@1348: // iterations required for the loop to match jtulach@1348: if (count < cmax) { jtulach@1348: matcher.locals[countIndex] = count + 1; jtulach@1348: boolean b = body.match(matcher, i, seq); jtulach@1348: // If match failed we must backtrack, so jtulach@1348: // the loop count should NOT be incremented jtulach@1348: if (!b) jtulach@1348: matcher.locals[countIndex] = count; jtulach@1348: else jtulach@1348: return true; jtulach@1348: } jtulach@1348: } jtulach@1348: return next.match(matcher, i, seq); jtulach@1348: } jtulach@1348: boolean matchInit(Matcher matcher, int i, CharSequence seq) { jtulach@1348: int save = matcher.locals[countIndex]; jtulach@1348: boolean ret = false; jtulach@1348: if (0 < cmin) { jtulach@1348: matcher.locals[countIndex] = 1; jtulach@1348: ret = body.match(matcher, i, seq); jtulach@1348: } else if (0 < cmax) { jtulach@1348: matcher.locals[countIndex] = 1; jtulach@1348: ret = body.match(matcher, i, seq); jtulach@1348: if (ret == false) jtulach@1348: ret = next.match(matcher, i, seq); jtulach@1348: } else { jtulach@1348: ret = next.match(matcher, i, seq); jtulach@1348: } jtulach@1348: matcher.locals[countIndex] = save; jtulach@1348: return ret; jtulach@1348: } jtulach@1348: boolean study(TreeInfo info) { jtulach@1348: info.maxValid = false; jtulach@1348: info.deterministic = false; jtulach@1348: return false; jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Handles the repetition count for a reluctant Curly. The matchInit jtulach@1348: * is called from the Prolog to save the index of where the group jtulach@1348: * beginning is stored. A zero length group check occurs in the jtulach@1348: * normal match but is skipped in the matchInit. jtulach@1348: */ jtulach@1348: static final class LazyLoop extends Loop { jtulach@1348: LazyLoop(int countIndex, int beginIndex) { jtulach@1348: super(countIndex, beginIndex); jtulach@1348: } jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) { jtulach@1348: // Check for zero length group jtulach@1348: if (i > matcher.locals[beginIndex]) { jtulach@1348: int count = matcher.locals[countIndex]; jtulach@1348: if (count < cmin) { jtulach@1348: matcher.locals[countIndex] = count + 1; jtulach@1348: boolean result = body.match(matcher, i, seq); jtulach@1348: // If match failed we must backtrack, so jtulach@1348: // the loop count should NOT be incremented jtulach@1348: if (!result) jtulach@1348: matcher.locals[countIndex] = count; jtulach@1348: return result; jtulach@1348: } jtulach@1348: if (next.match(matcher, i, seq)) jtulach@1348: return true; jtulach@1348: if (count < cmax) { jtulach@1348: matcher.locals[countIndex] = count + 1; jtulach@1348: boolean result = body.match(matcher, i, seq); jtulach@1348: // If match failed we must backtrack, so jtulach@1348: // the loop count should NOT be incremented jtulach@1348: if (!result) jtulach@1348: matcher.locals[countIndex] = count; jtulach@1348: return result; jtulach@1348: } jtulach@1348: return false; jtulach@1348: } jtulach@1348: return next.match(matcher, i, seq); jtulach@1348: } jtulach@1348: boolean matchInit(Matcher matcher, int i, CharSequence seq) { jtulach@1348: int save = matcher.locals[countIndex]; jtulach@1348: boolean ret = false; jtulach@1348: if (0 < cmin) { jtulach@1348: matcher.locals[countIndex] = 1; jtulach@1348: ret = body.match(matcher, i, seq); jtulach@1348: } else if (next.match(matcher, i, seq)) { jtulach@1348: ret = true; jtulach@1348: } else if (0 < cmax) { jtulach@1348: matcher.locals[countIndex] = 1; jtulach@1348: ret = body.match(matcher, i, seq); jtulach@1348: } jtulach@1348: matcher.locals[countIndex] = save; jtulach@1348: return ret; jtulach@1348: } jtulach@1348: boolean study(TreeInfo info) { jtulach@1348: info.maxValid = false; jtulach@1348: info.deterministic = false; jtulach@1348: return false; jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Refers to a group in the regular expression. Attempts to match jtulach@1348: * whatever the group referred to last matched. jtulach@1348: */ jtulach@1348: static class BackRef extends Node { jtulach@1348: int groupIndex; jtulach@1348: BackRef(int groupCount) { jtulach@1348: super(); jtulach@1348: groupIndex = groupCount + groupCount; jtulach@1348: } jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) { jtulach@1348: int j = matcher.groups[groupIndex]; jtulach@1348: int k = matcher.groups[groupIndex+1]; jtulach@1348: jtulach@1348: int groupSize = k - j; jtulach@1348: jtulach@1348: // If the referenced group didn't match, neither can this jtulach@1348: if (j < 0) jtulach@1348: return false; jtulach@1348: jtulach@1348: // If there isn't enough input left no match jtulach@1348: if (i + groupSize > matcher.to) { jtulach@1348: matcher.hitEnd = true; jtulach@1348: return false; jtulach@1348: } jtulach@1348: jtulach@1348: // Check each new char to make sure it matches what the group jtulach@1348: // referenced matched last time around jtulach@1348: for (int index=0; index matcher.to) { jtulach@1348: matcher.hitEnd = true; jtulach@1348: return false; jtulach@1348: } jtulach@1348: jtulach@1348: // Check each new char to make sure it matches what the group jtulach@1348: // referenced matched last time around jtulach@1348: int x = i; jtulach@1348: for (int index=0; index matcher.to) { jtulach@1348: matcher.hitEnd = true; jtulach@1348: return false; jtulach@1348: } jtulach@1348: if (atom.match(matcher, i, seq)) { jtulach@1348: return next.match(matcher, matcher.last, seq); jtulach@1348: } jtulach@1348: i += countChars(seq, i, 1); jtulach@1348: matcher.first++; jtulach@1348: } jtulach@1348: } jtulach@1348: boolean study(TreeInfo info) { jtulach@1348: atom.study(info); jtulach@1348: info.maxValid = false; jtulach@1348: info.deterministic = false; jtulach@1348: return next.study(info); jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: static final class Conditional extends Node { jtulach@1348: Node cond, yes, not; jtulach@1348: Conditional(Node cond, Node yes, Node not) { jtulach@1348: this.cond = cond; jtulach@1348: this.yes = yes; jtulach@1348: this.not = not; jtulach@1348: } jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) { jtulach@1348: if (cond.match(matcher, i, seq)) { jtulach@1348: return yes.match(matcher, i, seq); jtulach@1348: } else { jtulach@1348: return not.match(matcher, i, seq); jtulach@1348: } jtulach@1348: } jtulach@1348: boolean study(TreeInfo info) { jtulach@1348: int minL = info.minLength; jtulach@1348: int maxL = info.maxLength; jtulach@1348: boolean maxV = info.maxValid; jtulach@1348: info.reset(); jtulach@1348: yes.study(info); jtulach@1348: jtulach@1348: int minL2 = info.minLength; jtulach@1348: int maxL2 = info.maxLength; jtulach@1348: boolean maxV2 = info.maxValid; jtulach@1348: info.reset(); jtulach@1348: not.study(info); jtulach@1348: jtulach@1348: info.minLength = minL + Math.min(minL2, info.minLength); jtulach@1348: info.maxLength = maxL + Math.max(maxL2, info.maxLength); jtulach@1348: info.maxValid = (maxV & maxV2 & info.maxValid); jtulach@1348: info.deterministic = false; jtulach@1348: return next.study(info); jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Zero width positive lookahead. jtulach@1348: */ jtulach@1348: static final class Pos extends Node { jtulach@1348: Node cond; jtulach@1348: Pos(Node cond) { jtulach@1348: this.cond = cond; jtulach@1348: } jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) { jtulach@1348: int savedTo = matcher.to; jtulach@1348: boolean conditionMatched = false; jtulach@1348: jtulach@1348: // Relax transparent region boundaries for lookahead jtulach@1348: if (matcher.transparentBounds) jtulach@1348: matcher.to = matcher.getTextLength(); jtulach@1348: try { jtulach@1348: conditionMatched = cond.match(matcher, i, seq); jtulach@1348: } finally { jtulach@1348: // Reinstate region boundaries jtulach@1348: matcher.to = savedTo; jtulach@1348: } jtulach@1348: return conditionMatched && next.match(matcher, i, seq); jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Zero width negative lookahead. jtulach@1348: */ jtulach@1348: static final class Neg extends Node { jtulach@1348: Node cond; jtulach@1348: Neg(Node cond) { jtulach@1348: this.cond = cond; jtulach@1348: } jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) { jtulach@1348: int savedTo = matcher.to; jtulach@1348: boolean conditionMatched = false; jtulach@1348: jtulach@1348: // Relax transparent region boundaries for lookahead jtulach@1348: if (matcher.transparentBounds) jtulach@1348: matcher.to = matcher.getTextLength(); jtulach@1348: try { jtulach@1348: if (i < matcher.to) { jtulach@1348: conditionMatched = !cond.match(matcher, i, seq); jtulach@1348: } else { jtulach@1348: // If a negative lookahead succeeds then more input jtulach@1348: // could cause it to fail! jtulach@1348: matcher.requireEnd = true; jtulach@1348: conditionMatched = !cond.match(matcher, i, seq); jtulach@1348: } jtulach@1348: } finally { jtulach@1348: // Reinstate region boundaries jtulach@1348: matcher.to = savedTo; jtulach@1348: } jtulach@1348: return conditionMatched && next.match(matcher, i, seq); jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * For use with lookbehinds; matches the position where the lookbehind jtulach@1348: * was encountered. jtulach@1348: */ jtulach@1348: static Node lookbehindEnd = new Node() { jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) { jtulach@1348: return i == matcher.lookbehindTo; jtulach@1348: } jtulach@1348: }; jtulach@1348: jtulach@1348: /** jtulach@1348: * Zero width positive lookbehind. jtulach@1348: */ jtulach@1348: static class Behind extends Node { jtulach@1348: Node cond; jtulach@1348: int rmax, rmin; jtulach@1348: Behind(Node cond, int rmax, int rmin) { jtulach@1348: this.cond = cond; jtulach@1348: this.rmax = rmax; jtulach@1348: this.rmin = rmin; jtulach@1348: } jtulach@1348: jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) { jtulach@1348: int savedFrom = matcher.from; jtulach@1348: boolean conditionMatched = false; jtulach@1348: int startIndex = (!matcher.transparentBounds) ? jtulach@1348: matcher.from : 0; jtulach@1348: int from = Math.max(i - rmax, startIndex); jtulach@1348: // Set end boundary jtulach@1348: int savedLBT = matcher.lookbehindTo; jtulach@1348: matcher.lookbehindTo = i; jtulach@1348: // Relax transparent region boundaries for lookbehind jtulach@1348: if (matcher.transparentBounds) jtulach@1348: matcher.from = 0; jtulach@1348: for (int j = i - rmin; !conditionMatched && j >= from; j--) { jtulach@1348: conditionMatched = cond.match(matcher, j, seq); jtulach@1348: } jtulach@1348: matcher.from = savedFrom; jtulach@1348: matcher.lookbehindTo = savedLBT; jtulach@1348: return conditionMatched && next.match(matcher, i, seq); jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Zero width positive lookbehind, including supplementary jtulach@1348: * characters or unpaired surrogates. jtulach@1348: */ jtulach@1348: static final class BehindS extends Behind { jtulach@1348: BehindS(Node cond, int rmax, int rmin) { jtulach@1348: super(cond, rmax, rmin); jtulach@1348: } jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) { jtulach@1348: int rmaxChars = countChars(seq, i, -rmax); jtulach@1348: int rminChars = countChars(seq, i, -rmin); jtulach@1348: int savedFrom = matcher.from; jtulach@1348: int startIndex = (!matcher.transparentBounds) ? jtulach@1348: matcher.from : 0; jtulach@1348: boolean conditionMatched = false; jtulach@1348: int from = Math.max(i - rmaxChars, startIndex); jtulach@1348: // Set end boundary jtulach@1348: int savedLBT = matcher.lookbehindTo; jtulach@1348: matcher.lookbehindTo = i; jtulach@1348: // Relax transparent region boundaries for lookbehind jtulach@1348: if (matcher.transparentBounds) jtulach@1348: matcher.from = 0; jtulach@1348: jtulach@1348: for (int j = i - rminChars; jtulach@1348: !conditionMatched && j >= from; jtulach@1348: j -= j>from ? countChars(seq, j, -1) : 1) { jtulach@1348: conditionMatched = cond.match(matcher, j, seq); jtulach@1348: } jtulach@1348: matcher.from = savedFrom; jtulach@1348: matcher.lookbehindTo = savedLBT; jtulach@1348: return conditionMatched && next.match(matcher, i, seq); jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Zero width negative lookbehind. jtulach@1348: */ jtulach@1348: static class NotBehind extends Node { jtulach@1348: Node cond; jtulach@1348: int rmax, rmin; jtulach@1348: NotBehind(Node cond, int rmax, int rmin) { jtulach@1348: this.cond = cond; jtulach@1348: this.rmax = rmax; jtulach@1348: this.rmin = rmin; jtulach@1348: } jtulach@1348: jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) { jtulach@1348: int savedLBT = matcher.lookbehindTo; jtulach@1348: int savedFrom = matcher.from; jtulach@1348: boolean conditionMatched = false; jtulach@1348: int startIndex = (!matcher.transparentBounds) ? jtulach@1348: matcher.from : 0; jtulach@1348: int from = Math.max(i - rmax, startIndex); jtulach@1348: matcher.lookbehindTo = i; jtulach@1348: // Relax transparent region boundaries for lookbehind jtulach@1348: if (matcher.transparentBounds) jtulach@1348: matcher.from = 0; jtulach@1348: for (int j = i - rmin; !conditionMatched && j >= from; j--) { jtulach@1348: conditionMatched = cond.match(matcher, j, seq); jtulach@1348: } jtulach@1348: // Reinstate region boundaries jtulach@1348: matcher.from = savedFrom; jtulach@1348: matcher.lookbehindTo = savedLBT; jtulach@1348: return !conditionMatched && next.match(matcher, i, seq); jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Zero width negative lookbehind, including supplementary jtulach@1348: * characters or unpaired surrogates. jtulach@1348: */ jtulach@1348: static final class NotBehindS extends NotBehind { jtulach@1348: NotBehindS(Node cond, int rmax, int rmin) { jtulach@1348: super(cond, rmax, rmin); jtulach@1348: } jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) { jtulach@1348: int rmaxChars = countChars(seq, i, -rmax); jtulach@1348: int rminChars = countChars(seq, i, -rmin); jtulach@1348: int savedFrom = matcher.from; jtulach@1348: int savedLBT = matcher.lookbehindTo; jtulach@1348: boolean conditionMatched = false; jtulach@1348: int startIndex = (!matcher.transparentBounds) ? jtulach@1348: matcher.from : 0; jtulach@1348: int from = Math.max(i - rmaxChars, startIndex); jtulach@1348: matcher.lookbehindTo = i; jtulach@1348: // Relax transparent region boundaries for lookbehind jtulach@1348: if (matcher.transparentBounds) jtulach@1348: matcher.from = 0; jtulach@1348: for (int j = i - rminChars; jtulach@1348: !conditionMatched && j >= from; jtulach@1348: j -= j>from ? countChars(seq, j, -1) : 1) { jtulach@1348: conditionMatched = cond.match(matcher, j, seq); jtulach@1348: } jtulach@1348: //Reinstate region boundaries jtulach@1348: matcher.from = savedFrom; jtulach@1348: matcher.lookbehindTo = savedLBT; jtulach@1348: return !conditionMatched && next.match(matcher, i, seq); jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Returns the set union of two CharProperty nodes. jtulach@1348: */ jtulach@1348: private static CharProperty union(final CharProperty lhs, jtulach@1348: final CharProperty rhs) { jtulach@1348: return new CharProperty() { jtulach@1348: boolean isSatisfiedBy(int ch) { jtulach@1348: return lhs.isSatisfiedBy(ch) || rhs.isSatisfiedBy(ch);}}; jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Returns the set intersection of two CharProperty nodes. jtulach@1348: */ jtulach@1348: private static CharProperty intersection(final CharProperty lhs, jtulach@1348: final CharProperty rhs) { jtulach@1348: return new CharProperty() { jtulach@1348: boolean isSatisfiedBy(int ch) { jtulach@1348: return lhs.isSatisfiedBy(ch) && rhs.isSatisfiedBy(ch);}}; jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Returns the set difference of two CharProperty nodes. jtulach@1348: */ jtulach@1348: private static CharProperty setDifference(final CharProperty lhs, jtulach@1348: final CharProperty rhs) { jtulach@1348: return new CharProperty() { jtulach@1348: boolean isSatisfiedBy(int ch) { jtulach@1348: return ! rhs.isSatisfiedBy(ch) && lhs.isSatisfiedBy(ch);}}; jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Handles word boundaries. Includes a field to allow this one class to jtulach@1348: * deal with the different types of word boundaries we can match. The word jtulach@1348: * characters include underscores, letters, and digits. Non spacing marks jtulach@1348: * can are also part of a word if they have a base character, otherwise jtulach@1348: * they are ignored for purposes of finding word boundaries. jtulach@1348: */ jtulach@1348: static final class Bound extends Node { jtulach@1348: static int LEFT = 0x1; jtulach@1348: static int RIGHT= 0x2; jtulach@1348: static int BOTH = 0x3; jtulach@1348: static int NONE = 0x4; jtulach@1348: int type; jtulach@1348: boolean useUWORD; jtulach@1348: Bound(int n, boolean useUWORD) { jtulach@1348: type = n; jtulach@1348: this.useUWORD = useUWORD; jtulach@1348: } jtulach@1348: jtulach@1348: boolean isWord(int ch) { jtulach@1348: return useUWORD ? UnicodeProp.WORD.is(ch) jtulach@1348: : (ch == '_' || Character.isLetterOrDigit(ch)); jtulach@1348: } jtulach@1348: jtulach@1348: int check(Matcher matcher, int i, CharSequence seq) { jtulach@1348: int ch; jtulach@1348: boolean left = false; jtulach@1348: int startIndex = matcher.from; jtulach@1348: int endIndex = matcher.to; jtulach@1348: if (matcher.transparentBounds) { jtulach@1348: startIndex = 0; jtulach@1348: endIndex = matcher.getTextLength(); jtulach@1348: } jtulach@1348: if (i > startIndex) { jtulach@1348: ch = Character.codePointBefore(seq, i); jtulach@1348: left = (isWord(ch) || jtulach@1348: ((Character.getType(ch) == Character.NON_SPACING_MARK) jtulach@1348: && hasBaseCharacter(matcher, i-1, seq))); jtulach@1348: } jtulach@1348: boolean right = false; jtulach@1348: if (i < endIndex) { jtulach@1348: ch = Character.codePointAt(seq, i); jtulach@1348: right = (isWord(ch) || jtulach@1348: ((Character.getType(ch) == Character.NON_SPACING_MARK) jtulach@1348: && hasBaseCharacter(matcher, i, seq))); jtulach@1348: } else { jtulach@1348: // Tried to access char past the end jtulach@1348: matcher.hitEnd = true; jtulach@1348: // The addition of another char could wreck a boundary jtulach@1348: matcher.requireEnd = true; jtulach@1348: } jtulach@1348: return ((left ^ right) ? (right ? LEFT : RIGHT) : NONE); jtulach@1348: } jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) { jtulach@1348: return (check(matcher, i, seq) & type) > 0 jtulach@1348: && next.match(matcher, i, seq); jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Non spacing marks only count as word characters in bounds calculations jtulach@1348: * if they have a base character. jtulach@1348: */ jtulach@1348: private static boolean hasBaseCharacter(Matcher matcher, int i, jtulach@1348: CharSequence seq) jtulach@1348: { jtulach@1348: int start = (!matcher.transparentBounds) ? jtulach@1348: matcher.from : 0; jtulach@1348: for (int x=i; x >= start; x--) { jtulach@1348: int ch = Character.codePointAt(seq, x); jtulach@1348: if (Character.isLetterOrDigit(ch)) jtulach@1348: return true; jtulach@1348: if (Character.getType(ch) == Character.NON_SPACING_MARK) jtulach@1348: continue; jtulach@1348: return false; jtulach@1348: } jtulach@1348: return false; jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Attempts to match a slice in the input using the Boyer-Moore string jtulach@1348: * matching algorithm. The algorithm is based on the idea that the jtulach@1348: * pattern can be shifted farther ahead in the search text if it is jtulach@1348: * matched right to left. jtulach@1348: *

jtulach@1348: * The pattern is compared to the input one character at a time, from jtulach@1348: * the rightmost character in the pattern to the left. If the characters jtulach@1348: * all match the pattern has been found. If a character does not match, jtulach@1348: * the pattern is shifted right a distance that is the maximum of two jtulach@1348: * functions, the bad character shift and the good suffix shift. This jtulach@1348: * shift moves the attempted match position through the input more jtulach@1348: * quickly than a naive one position at a time check. jtulach@1348: *

jtulach@1348: * The bad character shift is based on the character from the text that jtulach@1348: * did not match. If the character does not appear in the pattern, the jtulach@1348: * pattern can be shifted completely beyond the bad character. If the jtulach@1348: * character does occur in the pattern, the pattern can be shifted to jtulach@1348: * line the pattern up with the next occurrence of that character. jtulach@1348: *

jtulach@1348: * The good suffix shift is based on the idea that some subset on the right jtulach@1348: * side of the pattern has matched. When a bad character is found, the jtulach@1348: * pattern can be shifted right by the pattern length if the subset does jtulach@1348: * not occur again in pattern, or by the amount of distance to the jtulach@1348: * next occurrence of the subset in the pattern. jtulach@1348: * jtulach@1348: * Boyer-Moore search methods adapted from code by Amy Yu. jtulach@1348: */ jtulach@1348: static class BnM extends Node { jtulach@1348: int[] buffer; jtulach@1348: int[] lastOcc; jtulach@1348: int[] optoSft; jtulach@1348: jtulach@1348: /** jtulach@1348: * Pre calculates arrays needed to generate the bad character jtulach@1348: * shift and the good suffix shift. Only the last seven bits jtulach@1348: * are used to see if chars match; This keeps the tables small jtulach@1348: * and covers the heavily used ASCII range, but occasionally jtulach@1348: * results in an aliased match for the bad character shift. jtulach@1348: */ jtulach@1348: static Node optimize(Node node) { jtulach@1348: if (!(node instanceof Slice)) { jtulach@1348: return node; jtulach@1348: } jtulach@1348: jtulach@1348: int[] src = ((Slice) node).buffer; jtulach@1348: int patternLength = src.length; jtulach@1348: // The BM algorithm requires a bit of overhead; jtulach@1348: // If the pattern is short don't use it, since jtulach@1348: // a shift larger than the pattern length cannot jtulach@1348: // be used anyway. jtulach@1348: if (patternLength < 4) { jtulach@1348: return node; jtulach@1348: } jtulach@1348: int i, j, k; jtulach@1348: int[] lastOcc = new int[128]; jtulach@1348: int[] optoSft = new int[patternLength]; jtulach@1348: // Precalculate part of the bad character shift jtulach@1348: // It is a table for where in the pattern each jtulach@1348: // lower 7-bit value occurs jtulach@1348: for (i = 0; i < patternLength; i++) { jtulach@1348: lastOcc[src[i]&0x7F] = i + 1; jtulach@1348: } jtulach@1348: // Precalculate the good suffix shift jtulach@1348: // i is the shift amount being considered jtulach@1348: NEXT: for (i = patternLength; i > 0; i--) { jtulach@1348: // j is the beginning index of suffix being considered jtulach@1348: for (j = patternLength - 1; j >= i; j--) { jtulach@1348: // Testing for good suffix jtulach@1348: if (src[j] == src[j-i]) { jtulach@1348: // src[j..len] is a good suffix jtulach@1348: optoSft[j-1] = i; jtulach@1348: } else { jtulach@1348: // No match. The array has already been jtulach@1348: // filled up with correct values before. jtulach@1348: continue NEXT; jtulach@1348: } jtulach@1348: } jtulach@1348: // This fills up the remaining of optoSft jtulach@1348: // any suffix can not have larger shift amount jtulach@1348: // then its sub-suffix. Why??? jtulach@1348: while (j > 0) { jtulach@1348: optoSft[--j] = i; jtulach@1348: } jtulach@1348: } jtulach@1348: // Set the guard value because of unicode compression jtulach@1348: optoSft[patternLength-1] = 1; jtulach@1348: if (node instanceof SliceS) jtulach@1348: return new BnMS(src, lastOcc, optoSft, node.next); jtulach@1348: return new BnM(src, lastOcc, optoSft, node.next); jtulach@1348: } jtulach@1348: BnM(int[] src, int[] lastOcc, int[] optoSft, Node next) { jtulach@1348: this.buffer = src; jtulach@1348: this.lastOcc = lastOcc; jtulach@1348: this.optoSft = optoSft; jtulach@1348: this.next = next; jtulach@1348: } jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) { jtulach@1348: int[] src = buffer; jtulach@1348: int patternLength = src.length; jtulach@1348: int last = matcher.to - patternLength; jtulach@1348: jtulach@1348: // Loop over all possible match positions in text jtulach@1348: NEXT: while (i <= last) { jtulach@1348: // Loop over pattern from right to left jtulach@1348: for (int j = patternLength - 1; j >= 0; j--) { jtulach@1348: int ch = seq.charAt(i+j); jtulach@1348: if (ch != src[j]) { jtulach@1348: // Shift search to the right by the maximum of the jtulach@1348: // bad character shift and the good suffix shift jtulach@1348: i += Math.max(j + 1 - lastOcc[ch&0x7F], optoSft[j]); jtulach@1348: continue NEXT; jtulach@1348: } jtulach@1348: } jtulach@1348: // Entire pattern matched starting at i jtulach@1348: matcher.first = i; jtulach@1348: boolean ret = next.match(matcher, i + patternLength, seq); jtulach@1348: if (ret) { jtulach@1348: matcher.first = i; jtulach@1348: matcher.groups[0] = matcher.first; jtulach@1348: matcher.groups[1] = matcher.last; jtulach@1348: return true; jtulach@1348: } jtulach@1348: i++; jtulach@1348: } jtulach@1348: // BnM is only used as the leading node in the unanchored case, jtulach@1348: // and it replaced its Start() which always searches to the end jtulach@1348: // if it doesn't find what it's looking for, so hitEnd is true. jtulach@1348: matcher.hitEnd = true; jtulach@1348: return false; jtulach@1348: } jtulach@1348: boolean study(TreeInfo info) { jtulach@1348: info.minLength += buffer.length; jtulach@1348: info.maxValid = false; jtulach@1348: return next.study(info); jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: /** jtulach@1348: * Supplementary support version of BnM(). Unpaired surrogates are jtulach@1348: * also handled by this class. jtulach@1348: */ jtulach@1348: static final class BnMS extends BnM { jtulach@1348: int lengthInChars; jtulach@1348: jtulach@1348: BnMS(int[] src, int[] lastOcc, int[] optoSft, Node next) { jtulach@1348: super(src, lastOcc, optoSft, next); jtulach@1348: for (int x = 0; x < buffer.length; x++) { jtulach@1348: lengthInChars += Character.charCount(buffer[x]); jtulach@1348: } jtulach@1348: } jtulach@1348: boolean match(Matcher matcher, int i, CharSequence seq) { jtulach@1348: int[] src = buffer; jtulach@1348: int patternLength = src.length; jtulach@1348: int last = matcher.to - lengthInChars; jtulach@1348: jtulach@1348: // Loop over all possible match positions in text jtulach@1348: NEXT: while (i <= last) { jtulach@1348: // Loop over pattern from right to left jtulach@1348: int ch; jtulach@1348: for (int j = countChars(seq, i, patternLength), x = patternLength - 1; jtulach@1348: j > 0; j -= Character.charCount(ch), x--) { jtulach@1348: ch = Character.codePointBefore(seq, i+j); jtulach@1348: if (ch != src[x]) { jtulach@1348: // Shift search to the right by the maximum of the jtulach@1348: // bad character shift and the good suffix shift jtulach@1348: int n = Math.max(x + 1 - lastOcc[ch&0x7F], optoSft[x]); jtulach@1348: i += countChars(seq, i, n); jtulach@1348: continue NEXT; jtulach@1348: } jtulach@1348: } jtulach@1348: // Entire pattern matched starting at i jtulach@1348: matcher.first = i; jtulach@1348: boolean ret = next.match(matcher, i + lengthInChars, seq); jtulach@1348: if (ret) { jtulach@1348: matcher.first = i; jtulach@1348: matcher.groups[0] = matcher.first; jtulach@1348: matcher.groups[1] = matcher.last; jtulach@1348: return true; jtulach@1348: } jtulach@1348: i += countChars(seq, i, 1); jtulach@1348: } jtulach@1348: matcher.hitEnd = true; jtulach@1348: return false; jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: /////////////////////////////////////////////////////////////////////////////// jtulach@1348: /////////////////////////////////////////////////////////////////////////////// jtulach@1348: jtulach@1348: /** jtulach@1348: * This must be the very first initializer. jtulach@1348: */ jtulach@1348: static Node accept = new Node(); jtulach@1348: jtulach@1348: static Node lastAccept = new LastNode(); jtulach@1348: jtulach@1348: private static class CharPropertyNames { jtulach@1348: jtulach@1348: static CharProperty charPropertyFor(String name) { jtulach@1348: CharPropertyFactory m = map.get(name); jtulach@1348: return m == null ? null : m.make(); jtulach@1348: } jtulach@1348: jtulach@1348: private static abstract class CharPropertyFactory { jtulach@1348: abstract CharProperty make(); jtulach@1348: } jtulach@1348: jtulach@1348: private static void defCategory(String name, jtulach@1348: final int typeMask) { jtulach@1348: map.put(name, new CharPropertyFactory() { jtulach@1348: CharProperty make() { return new Category(typeMask);}}); jtulach@1348: } jtulach@1348: jtulach@1348: private static void defRange(String name, jtulach@1348: final int lower, final int upper) { jtulach@1348: map.put(name, new CharPropertyFactory() { jtulach@1348: CharProperty make() { return rangeFor(lower, upper);}}); jtulach@1348: } jtulach@1348: jtulach@1348: private static void defCtype(String name, jtulach@1348: final int ctype) { jtulach@1348: map.put(name, new CharPropertyFactory() { jtulach@1348: CharProperty make() { return new Ctype(ctype);}}); jtulach@1348: } jtulach@1348: jtulach@1348: private static abstract class CloneableProperty jtulach@1348: extends CharProperty implements Cloneable jtulach@1348: { jtulach@1348: public CloneableProperty clone() { jtulach@1348: try { jtulach@1348: return (CloneableProperty) super.clone(); jtulach@1348: } catch (CloneNotSupportedException e) { jtulach@1348: throw new AssertionError(e); jtulach@1348: } jtulach@1348: } jtulach@1348: } jtulach@1348: jtulach@1348: private static void defClone(String name, jtulach@1348: final CloneableProperty p) { jtulach@1348: map.put(name, new CharPropertyFactory() { jtulach@1348: CharProperty make() { return p.clone();}}); jtulach@1348: } jtulach@1348: jtulach@1348: private static final HashMap map jtulach@1348: = new HashMap<>(); jtulach@1348: jtulach@1348: static { jtulach@1348: // Unicode character property aliases, defined in jtulach@1348: // http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt jtulach@1348: defCategory("Cn", 1<