主要分2大块,
核心部分:就是一个NFA,只支持标准正则的操作,concatenation, union, iteration,限定上限的iteration,对应的meta character只有 (, ), |, *, ., {upper}
扩展部分:这部分是把扩展正则表达式转化为标准正则表达式,扩展正则表达式支持集合[a-z0-9], 重复次数上下界限制 {lower, upper},+, ?等
一:核心NFA部分
对于DFA来说,内部就是一张状态转移表table[state][next input char];对于NFA,内部就是一个epsilon状态转移有向图。DFA的状态转移必须由输入字符触发,而且转移是确定的。NFA可以不读输入进行转移(而且有多种转移),对应meta character,比如*,可以back reference之前的字符,而“(”和“)”也不需要consume输入,因为是meta character,不对应输入字符。NFA的不确定体现在,有多种转移,可以到达多种状态,那些状态又可以有多种转移,所以是一个Digraph的dfs问题,所以判断match的条件是从初始状态出发dfs能够到达终止状态,是一个路径搜索问题。
1 simulate(判断匹配)
1)从状态0 dfs,遍历epsilon转移,得出一个初始状态集
2)对于当前输入字符,
2).1 尝试从当前状态集的每个状态匹配,这个是consume输入的。转移到下一个状态,if (re.charAt(v) == c || re.charAt(v) == '.') states.add(v + 1)
2).2 从2).1得到的新的状态集出发多源dfs, 进行epsilon转移遍历,得出新的状态集,作为下一个字符对应的状态集
3)match判断,最终状态集里如果包含终止状态,说明终止状态是可达的。
2 epsilon转移Digraph的构建,读入标准正则表达式
1)对于(,)和* 添加一条到下一个字符的转移
2)对于iteration操作符*,添加*到back reference的实体的起始位置的双向转移,实体是单个字符或者子regex(被小括号括着)
3)对于Union操作符|,规定必须由小括号括着,即必须是(a|b),添加从(到|之后的实体的转移,以及|之后的实体到)的转移
4)对于有上限的iteration {upper}, 和*类似,添加从{到之前实体的双向转移,和从{到下一个字符(}之后)的转移。但是从{往前back reference的转移的边是带权的,权值就是重复次数 - 1。为了支持限定次数的iteration,Digraph是带权的,普通边权值设定为-1,带整数权值的边表示这条边只能走这么多次,相应的dfs算法也会做判断,每走一次decrease一次权值,直到0,0表示这条边不能走,相当于没有。
二:扩展的正则表达式支持
对原始输入的正则表达式进行处理,转化成标准正则表达式
[a-z0-9] 转化成((a|b)|c...
x+ 转化成 xx*
x?转化成x{1}
x{2, 3}转化成 xx{3-2}即xx{1}
[^abc] 先求差集charset - [abc],在转化成((a|b)|c)的形式
package excercise;
import java.util.*;
class Edge {
int from, to, weight;
public Edge(int from, int to, int w) {
this.from = from;
this.to = to;
this.weight = w;
}
}
class Digraph {
private List<Edge>[] adj;
private int V;
public Digraph(int V) {
this.V = V;
adj = new List[V];
for (int v = 0; v < V; ++v)
adj[v] = new ArrayList<Edge>();
}
public void addEdge(int v, int w, int weight) {
this.adj[v].add(new Edge(v, w, weight));
}
public void addEdge(int v, int w) { addEdge(v, w, -1);}
public int V() {return V;}
public Iterable<Edge> adj(int v) { return adj[v];}
}
class DirectedDFS {
private boolean[] marked;
private Digraph G;
public DirectedDFS(Digraph G, int v) {
marked = new boolean[G.V()];
this.G = G;
dfs(v);
}
public DirectedDFS(Digraph G, Iterable<Integer> s) {
marked = new boolean[G.V()];
this.G = G;
for (int v : s) dfs(v);
}
public boolean marked(int v) {return marked[v];}
private void dfs(int v) {
marked[v] = true;
for (Edge e : G.adj(v)) {
if (!marked[e.to]) {
if (e.weight < 0) dfs(e.to);
else if (e.weight > 0) {
e.weight--;
dfs(e.to);
}
}
}
}
}
class NFA {
private String re;
private Digraph G;
private int M;
public NFA(String regex) {
re = regex;
M = regex.length();
G = buildEpsilonTransitionGraph();
}
private Digraph buildEpsilonTransitionGraph() {
Digraph G = new Digraph(M + 1);
Stack<Integer> ops = new Stack<Integer>();
for (int i = 0; i < M; ++i) {
int lp = i;
if (re.charAt(i) == '(' || re.charAt(i) == '|') ops.push(i);
else if (re.charAt(i) == ')') {
int or = ops.pop();
if (re.charAt(or) == '|') {
lp = ops.pop();
G.addEdge(or, i);
G.addEdge(lp, or + 1);
}
else lp = or;
}
if (i < M - 1 && re.charAt(i + 1) == '*') {
G.addEdge(lp, i + 1);
G.addEdge(i + 1, lp);
}
if (i < M - 1 && re.charAt(i + 1) == '{') {
int rb = re.indexOf('}', i + 1);
int num = Integer.parseInt(re.substring(i + 2, rb));
G.addEdge(lp, i + 1);
G.addEdge(i + 1, lp, num - 1);
}
if (re.charAt(i) == '(' || re.charAt(i) == ')' || re.charAt(i) == '*' )
G.addEdge(i, i + 1);
else if (re.charAt(i) == '{') {
int rb = re.indexOf('}', i);
G.addEdge(i, rb + 1);
}
}
return G;
}
public boolean recognizes(String text) {
List<Integer> e_closure = new ArrayList<Integer>();
DirectedDFS dfs = new DirectedDFS(G, 0);
for (int v = 0; v < G.V(); ++v) //compute e-closure(0), epsilon closure of start state
if (dfs.marked(v)) e_closure.add(v);
for (int i = 0; i < text.length(); ++i) {
char c = text.charAt(i);
List<Integer> states = new ArrayList<Integer>();
for (int v : e_closure) { //compute Y = c(X), the state set Y that state set X could goto on c
if (v == M) continue;
if (re.charAt(v) == c || re.charAt(v) == '.') states.add(v + 1);
}
if (states.isEmpty()) return false;
dfs = new DirectedDFS(G, states);
e_closure = new ArrayList<Integer>(); //compute e-closure(Y)
for (int v = 0; v < G.V(); ++v)
if (dfs.marked(v)) e_closure.add(v);
if (e_closure.isEmpty()) return false;
}
for (int v : e_closure)
if (v == M) return true;
return false;
}
}
public class RE {
private NFA nfa;
private final Set<Character> metaChars;
public RE(String regex) {
char[] metaArray = {'*', '(', ')', '[', ']', '{', '}', '.', '|'};
metaChars = new HashSet<Character>();
for (char c : metaArray) metaChars.add(c);
String afterProcess = preProcess(regex);
// System.out.println(afterProcess);
nfa = new NFA(afterProcess);
}
public boolean recognizes(String text) {
return nfa.recognizes(text);
}
private String preProcess(String s) {
StringBuilder sb = new StringBuilder();
int lastREStart = -1;
int leftParenthese = -1;
int or = -1;
for (int i = 0; i < s.length(); ++i) {
if (s.charAt(i) == '[') {
int rb = s.indexOf(']', i + 1);
lastREStart = sb.length();
sb.append(handleBracket(s, i, rb));
i = rb;
}
else if (s.charAt(i) == '{') {
int end = sb.length();
int comma = s.indexOf(',', i + 1);
int num = Integer.parseInt(s.substring(i + 1, comma));
for (int j = 0; j < num; ++j)
sb.append(sb.substring(lastREStart, end));
int rb = s.indexOf('}', comma + 1);
int num2 = Integer.parseInt(s.substring(comma + 1, rb));
sb.append("{" + (num2 - num) + "}");
i = rb;
}
else if (s.charAt(i) == '+') {
sb.append(sb.substring(lastREStart, sb.length()));
sb.append('*');
}
else if (s.charAt(i) == '?') {
sb.append("{1}");
}
else if (s.charAt(i) == '|') {
if (or > 0) {
sb.insert(0, '(');
sb.append(')');
lastREStart = 0;
}
sb.append(s.charAt(i));
or = sb.length() - 1;
}
else {
if (leftParenthese == -1) lastREStart = sb.length();
sb.append(s.charAt(i));
if (s.charAt(i) == '(') leftParenthese = i;
else if (s.charAt(i) == ')') {
lastREStart = leftParenthese;
leftParenthese = -1;
}
}
}
if (or > 0) {
sb.insert(0, '(');
sb.append(')');
lastREStart = 0;
}
return sb.toString();
}
private String handleBracket(String s, int l, int r) {
boolean filter = true;
boolean[] include = new boolean[256];
if (s.charAt(l + 1) == '^') {
filter = false;
++l;
}
for (int i = l + 1; i < r; ++i) {
if (s.charAt(i) != '-')
include[s.charAt(i)] = true;
else {
for (char c = (char)(s.charAt(i - 1) + 1); c < s.charAt(i + 1); ++c)
include[c] = true;
}
}
ArrayDeque<Character> ad = new ArrayDeque<Character>();
for (char c = 0; c < 256; ++c) {
if (include[c] == filter && !metaChars.contains(c)) {
if (ad.isEmpty()) ad.addLast(c);
else {
ad.addLast('|');
ad.addLast(c);
ad.addFirst('(');
ad.addLast(')');
}
}
}
char[] a = new char[ad.size()];
for (int i = 0; i < a.length; ++i)
a[i] = ad.pollFirst();
return new String(a);
}
}