出于好奇,我决定尝试一下这个概念。
这是我的尝试。
Plan
让我们用函数调用来解析算术表达式。
现在,我们想要解析具有可能未知标识符的(部分)表达式。
如果表达式不完整,我们希望“暗示”最小标记来完成它(并可能继续解析)。
如果标识符未知,我们希望对域中的已知标识符(变量或函数)进行模糊匹配,并按概率递减的顺序对它们进行排名。
基本定义
首先,我们决定希望输入位于内存中,这样我们就可以通过使用来引用位置/子字符串string_view
s:
#include <boost/utility/string_view.hpp>
using Source = boost::string_view;
using Location = Source::const_iterator;
完成提示
除了 AST 之外,我们还希望解析器生成完成提示(隐式完成标记和建议)
namespace Completion {
using Candidates = std::vector<std::string>;
class Hints {
struct ByLocation {
template <typename T, typename U>
bool operator()(T const& a, U const& b) const { return loc(a) < loc(b); }
private:
static Location loc(Source const& s) { return s.begin(); }
static Location loc(Location const& l) { return l; }
};
public:
std::map<Location, std::string, ByLocation> incomplete;
std::map<Source, Candidates, ByLocation> suggestions;
/*explicit*/ operator bool() const { return incomplete.size() || suggestions.size(); }
};
此外,让我们编写一个快速而肮脏的模糊标识符匹配评分函数。
我选择了一个简单的同步比较
- 逐步对相应的字符运行进行评分,并且
- 与跳过输入中的字符相比,更倾向于跳过候选字符(这意味着
adj_diff
是一个很好的模糊匹配adjacent_difference
即使候选字符已被跳过,但是adj_qqq_diff
更糟糕的是因为qqq
输入无法匹配)
- 该算法是递归完成的并且没有分配
- 如果匹配的话第一个字符会得到提升(
rate=1
第一次调用时)
static int fuzzy_match(Source input, boost::string_view candidate, int rate = 1) { // start with first-letter boost
int score = 0;
while (!(input.empty() || candidate.empty())) {
if (input.front() != candidate.front()) {
return score + std::max(
fuzzy_match(input.substr(1), candidate, std::max(rate-2,0)), //penalty for ignoring an input char
fuzzy_match(input, candidate.substr(1), std::max(rate-1,0)));
}
input.remove_prefix(1);
candidate.remove_prefix(1);
score += ++rate;
}
return score;
}
} // namespace Completion
我们将看看它在语法中是如何使用的。
AST
一个普通的 AST,可以执行二进制表达式、字符串/数字文字、变量和(嵌套)函数调用:
#include <boost/variant.hpp>
namespace Ast {
using NumLiteral = double;
using StringLiteral = std::string; // de-escaped, not source view
struct Identifier : std::string {
using std::string::string;
using std::string::operator=;
};
struct BinaryExpression;
struct CallExpression;
using Expression = boost::variant<
NumLiteral,
StringLiteral,
Identifier,
boost::recursive_wrapper<BinaryExpression>,
boost::recursive_wrapper<CallExpression>
>;
struct BinaryExpression {
Expression lhs;
char op;
Expression rhs;
};
using ArgList = std::vector<Expression>;
struct CallExpression {
Identifier function;
ArgList args;
};
}
Grammar
Basics
语法一开始也非常“基本”:
namespace Parsing {
namespace qi = boost::spirit::qi;
namespace phx = boost::phoenix;
template <typename It>
struct Expression : qi::grammar<It, Ast::Expression()> {
Expression(Completion::Hints& hints) : Expression::base_type(start), _hints(hints) {
using namespace qi;
start = skip(space) [expression];
expression = term [_val = _1] >> *(char_("-+") >> expression) [_val = make_binary(_val, _1, _2)];
term = factor [_val = _1] >> *(char_("*/") >> term) [_val = make_binary(_val, _1, _2)];
factor = simple [_val = _1] >> *(char_("^") >> factor) [_val = make_binary(_val, _1, _2)];
simple = call | variable | compound | number | string;
到目前为止一切顺利:构造函数存储了对Completion::Hints&
被记录。所有这些规则均已声明为qi::rule<It, Expression(), qi::space_type>
.
隐含代币
现在它变得更有趣了,这里的一些规则是词素
number = double_;
identifier = raw[(alpha|'_') >> *(alnum|'_')];
有些产品可以容忍缺少结束标记:
// imply the closing quotes
string %= '"' >> *('\\' >> char_ | ~char_('"')) >> implied('"');
compound %= '(' >> expression >> implied(')');
The implied
magic 使得如果预期的结束标记丢失,它将被记录为提示,并且解析继续:
auto implied = [=](char ch) {
return copy(omit[lit(ch) | raw[eps][imply(_1, ch)]]);
};
当然,这引出了一个问题:imply(_1, ch)
确实如此,我们稍后会看到。
现在,观察我们所做的raw[eps]
获取(空)源iterator_range
构建一个Location
来自语义动作。
标识符查找
我们将“符号表”存储在Domain
会员_variables
and _functions
:
using Domain = qi::symbols<char>;
Domain _variables, _functions;
然后我们声明一些可以对它们中的任何一个进行查找的规则:
// domain identifier lookups
qi::_r1_type _domain;
qi::rule<It, Ast::Identifier(Domain const&)> maybe_known, known, unknown;
相应的声明将很快显示。
变量非常简单:
variable = maybe_known(phx::ref(_variables));
打电话比较棘手。如果一个名字是未知的,我们不想假设它暗示着一个function除非后面跟着一个'('
特点。然而,如果标识符是已知的函数名称,我们甚至想暗示(
(这使得用户体验看起来自动完成,当用户键入时sqrt
,它建议下一个字符是(
神奇地)。
// The heuristics:
// - an unknown identifier followed by (
// - an unclosed argument list implies )
call %= ( known(phx::ref(_functions)) // known -> imply the parens
| &(identifier >> '(') >> unknown(phx::ref(_functions))
) >> implied('(') >> -(expression % ',') >> implied(')');
这一切都建立在known
, unknown
and maybe_known
:
///////////////////////////////
// identifier loopkup, suggesting
{
maybe_known = known(_domain) | unknown(_domain);
// distinct to avoid partially-matching identifiers
using boost::spirit::repository::qi::distinct;
auto kw = distinct(copy(alnum | '_'));
known = raw[kw[lazy(_domain)]];
unknown = raw[identifier[_val=_1]] [suggest_for(_1, _domain)];
}
调试、预定义变量/函数
剩下的就是一些繁文缛节:
BOOST_SPIRIT_DEBUG_NODES(
(start)
(expression)(term)(factor)(simple)(compound)
(call)(variable)
(identifier)(number)(string)
//(maybe_known)(known)(unknown) // qi::symbols<> non-streamable
)
_variables += "foo", "bar", "qux";
_functions += "print", "sin", "tan", "sqrt", "frobnicate";
}
private:
Completion::Hints& _hints;
using Domain = qi::symbols<char>;
Domain _variables, _functions;
qi::rule<It, Ast::Expression()> start;
qi::rule<It, Ast::Expression(), qi::space_type> expression, term, factor, simple;
// completables
qi::rule<It, Ast::Expression(), qi::space_type> compound;
qi::rule<It, Ast::CallExpression(), qi::space_type> call;
// implicit lexemes
qi::rule<It, Ast::Identifier()> variable, identifier;
qi::rule<It, Ast::NumLiteral()> number;
qi::rule<It, Ast::StringLiteral()> string;
// domain identifier lookups
qi::_r1_type _domain;
qi::rule<It, Ast::Identifier(Domain const&)> maybe_known, known, unknown;
凤凰助手
让我们从构建二进制 AST 节点的常用助手开始:
///////////////////////////////
// binary expression factory
struct make_binary_f {
Ast::BinaryExpression operator()(Ast::Expression const& lhs, char op, Ast::Expression const& rhs) const {
return {lhs, op, rhs};
}
};
boost::phoenix::function<make_binary_f> make_binary;
类似地,我们可以有一个 Phoenix 函数对象来注册隐含字符:
///////////////////////////////
// auto-completing incomplete expressions
struct imply_f {
Completion::Hints& _hints;
void operator()(boost::iterator_range<It> where, char implied_char) const {
auto inserted =
_hints.incomplete.emplace(&*where.begin(), std::string(1, implied_char));
// add the implied char to existing completion
if (!inserted.second)
inserted.first->second += implied_char;
}
};
boost::phoenix::function<imply_f> imply { imply_f { _hints } };
请注意,它按位置排序(这使用户体验更容易)并检测先前隐含令牌何时位于同一位置,在这种情况下,我们只需附加到它即可。
到目前为止,为未知标识符生成相关建议也是一个凤凰演员就不足为奇了:
///////////////////////////////
// suggest_for
struct suggester {
Completion::Hints& _hints;
void operator()(boost::iterator_range<It> where, Domain const& symbols) const {
using namespace Completion;
Source id(&*where.begin(), where.size());
Candidates c;
symbols.for_each([&](std::string const& k, ...) { c.push_back(k); });
auto score = [id](Source v) { return fuzzy_match(id, v); };
auto byscore = [=](Source a, Source b) { return score(a) > score(b); };
sort(c.begin(), c.end(), byscore);
c.erase( remove_if(c.begin(), c.end(), [=](Source s) { return score(s) < 3; }), c.end());
_hints.suggestions.emplace(id, c);
}
};
boost::phoenix::function<suggester> suggest_for {suggester{_hints}};
就这样。如果它看起来更复杂,那是因为它做了更多的事情:它对所有候选者进行模糊评分,按分数降序对它们进行排序,并删除分数
};
}
BONUS
让我们稍微改变一下并允许','
隐含在参数列表中:
call %= ( known(phx::ref(_functions)) // known -> imply the parens
| &(identifier >> '(') >> unknown(phx::ref(_functions))
)
>> implied('(')
>> -(expression % ',')
>> implied(')')
;
我们来替换一下','
there:
>> -(expression % (',' | !(')'|eoi) >> implied(',')))
NOTE: 检测似乎更有意义&expression
断言后面有一个表达式,而不是断言尚未到达输入/参数列表的末尾。
但是,这样做会很糟糕,因为任何包含的表达式都可能会触发更多隐含标记作为副作用,从而导致重复的隐含标记。
这种(副作用语义操作)是我通常避免语义操作或使用它们仅在规则的(公开的)属性中存储效果的主要原因之一。看Boost Spirit:“语义行为是邪恶的”? https://stackoverflow.com/questions/8259440/boost-spirit-semantic-actions-are-evil
试驾员
如果没有一些好的测试用例,这篇文章就毫无意义。所以这里是:
int main() {
for (Source const input : {
"", // invalid
"(3", // incomplete, imply ')'
"3*(6+sqrt(9))^7 - 1e8", // completely valid
"(3*(((6+sqrt(9))^7 - 1e8", // incomplete, imply ")))"
"print(\"hello \\\"world!", // completes the string literal and the parameter list
"foo", // okay, known variable
"baz", // (suggest bar)
"baz(", // incomplete, imply ')', unknown function
"taz(", // incomplete, imply ')', unknown function
"san(", // 2 suggestions (sin/tan)
"print(1, 2, \"three\", complicated(san(78",
"(print sqrt sin 9) -0) \"bye",
})
{
std::cout << "-------------- '" << input << "'\n";
Location f = input.begin(), l = input.end();
Ast::Expression expr;
Completion::Hints hints;
bool ok = parse(f, l, Parsing::Expression<Location>{hints}, expr);
if (hints) {
std::cout << "Input: '" << input << "'\n";
}
for (auto& c : hints.incomplete) {
std::cout << "Missing " << std::setw(c.first - input.begin()) << "" << "^ implied: '" << c.second << "'\n";
}
for (auto& id : hints.suggestions) {
std::cout << "Unknown " << std::setw(id.first.begin() - input.begin()) << "" << std::string(id.first.size(), '^');
if (id.second.empty())
std::cout << " (no suggestions)\n";
else {
std::cout << " (did you mean ";
once_t first;
for (auto& s : id.second)
std::cout << (first?"":" or ") << "'" << s << "'";
std::cout << "?)\n";
}
}
if (ok) {
std::cout << "AST: " << expr << "\n";
} else {
std::cout << "Parse failed\n";
}
if (f!=l)
std::cout << "Remaining input: '" << std::string(f,l) << "'\n";
}
}
请注意,除了第一个输入 (""
)一切都被启发式地解析为some一种表达方式!最后一个旨在展示所有参数列表是如何隐含的,因为print
, sqrt
and sin
是已知函数。然后一些','
是隐含的,最后未闭合的字符串文字和剩余的括号被闭合。这是(非调试)输出:
-------------- ''
Parse failed
-------------- '(3'
Input: '(3'
Missing ^ implied: ')'
AST: 3
-------------- '3*(6+sqrt(9))^7 - 1e8'
AST: ((3 * ((6 + (sqrt (9))) ^ 7)) - 1e+08)
-------------- '(3*(((6+sqrt(9))^7 - 1e8'
Input: '(3*(((6+sqrt(9))^7 - 1e8'
Missing ^ implied: ')))'
AST: (3 * (((6 + (sqrt (9))) ^ 7) - 1e+08))
-------------- 'print("hello \"world!'
Input: 'print("hello \"world!'
Missing ^ implied: '")'
AST: (print (hello "world!))
-------------- 'foo'
AST: foo
-------------- 'baz'
Input: 'baz'
Unknown ^^^ (did you mean 'bar'?)
AST: baz
-------------- 'baz('
Input: 'baz('
Missing ^ implied: ')'
Unknown ^^^ (no suggestions)
AST: (baz ())
-------------- 'taz('
Input: 'taz('
Missing ^ implied: ')'
Unknown ^^^ (did you mean 'tan'?)
AST: (taz ())
-------------- 'san('
Input: 'san('
Missing ^ implied: ')'
Unknown ^^^ (did you mean 'sin' or 'tan'?)
AST: (san ())
-------------- 'print(1, 2, "three", complicated(san(78'
Input: 'print(1, 2, "three", complicated(san(78'
Missing ^ implied: ')))'
Unknown ^^^^^^^^^^^ (did you mean 'frobnicate' or 'print'?)
Unknown ^^^ (did you mean 'sin' or 'tan'?)
AST: (print (1, 2, three, (complicated ((san (78))))))
-------------- '(print sqrt sin 9) -0) "bye'
Input: '(print sqrt sin 9) -0) "bye'
Missing ^ implied: '('
Missing ^ implied: '('
Missing ^ implied: '('
Missing ^ implied: ','
Missing ^ implied: '"))'
AST: (print ((sqrt (((sin (9)) - 0))), bye))
完整列表/现场演示
Live On Coliru http://coliru.stacked-crooked.com/a/c1b0c5c573e9ec78
//#define BOOST_SPIRIT_DEBUG
#include <boost/utility/string_view.hpp>
using Source = boost::string_view;
using Location = Source::const_iterator;
#include <map>
#include <vector>
namespace Completion {
static int fuzzy_match(Source input, boost::string_view candidate, int rate = 1) { // start with first-letter boost
int score = 0;
while (!(input.empty() || candidate.empty())) {
if (input.front() != candidate.front()) {
return score + std::max(
fuzzy_match(input.substr(1), candidate, std::max(rate-2,0)), //penalty for ignoring an input char
fuzzy_match(input, candidate.substr(1), std::max(rate-1,0)));
}
input.remove_prefix(1);
candidate.remove_prefix(1);
score += ++rate;
}
return score;
}
using Candidates = std::vector<std::string>;
class Hints {
struct ByLocation {
template <typename T, typename U>
bool operator()(T const& a, U const& b) const { return loc(a) < loc(b); }
private:
static Location loc(Source const& s) { return s.begin(); }
static Location loc(Location const& l) { return l; }
};
public:
std::map<Location, std::string, ByLocation> incomplete;
std::map<Source, Candidates, ByLocation> suggestions;
/*explicit*/ operator bool() const { return incomplete.size() || suggestions.size(); }
};
}
#include <boost/variant.hpp>
namespace Ast {
using NumLiteral = double;
using StringLiteral = std::string; // de-escaped, not source view
struct Identifier : std::string {
using std::string::string;
using std::string::operator=;
};
struct BinaryExpression;
struct CallExpression;
using Expression = boost::variant<
NumLiteral,
StringLiteral,
Identifier,
boost::recursive_wrapper<BinaryExpression>,
boost::recursive_wrapper<CallExpression>
>;
struct BinaryExpression {
Expression lhs;
char op;
Expression rhs;
};
using ArgList = std::vector<Expression>;
struct CallExpression {
Identifier function;
ArgList args;
};
}
#include <boost/fusion/adapted/struct.hpp>
BOOST_FUSION_ADAPT_STRUCT(Ast::BinaryExpression, lhs, op, rhs)
BOOST_FUSION_ADAPT_STRUCT(Ast::CallExpression, function, args)
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>
#include <boost/spirit/repository/include/qi_distinct.hpp>
// for debug printing:
namespace {
struct once_t { // an auto-reset flag
operator bool() { bool v = flag; flag = false; return v; }
bool flag = true;
};
}
// for debug printing:
namespace Ast {
static inline std::ostream& operator<<(std::ostream& os, std::vector<Expression> const& args) {
os << "(";
once_t first;
for (auto& a : args) os << (first?"":", ") << a;
return os << ")";
}
static inline std::ostream& operator<<(std::ostream& os, BinaryExpression const& e) {
return os << boost::fusion::as_vector(e);
}
static inline std::ostream& operator<<(std::ostream& os, CallExpression const& e) {
return os << boost::fusion::as_vector(e);
}
}
namespace Parsing {
namespace qi = boost::spirit::qi;
namespace phx = boost::phoenix;
template <typename It>
struct Expression : qi::grammar<It, Ast::Expression()> {
Expression(Completion::Hints& hints) : Expression::base_type(start), _hints(hints) {
using namespace qi;
start = skip(space) [expression];
expression = term [_val = _1] >> *(char_("-+") >> expression) [_val = make_binary(_val, _1, _2)];
term = factor [_val = _1] >> *(char_("*/") >> term) [_val = make_binary(_val, _1, _2)];
factor = simple [_val = _1] >> *(char_("^") >> factor) [_val = make_binary(_val, _1, _2)];
simple = call | variable | compound | number | string;
auto implied = [=](char ch) {
return copy(omit[lit(ch) | raw[eps][imply(_1, ch)]]);
};
variable = maybe_known(phx::ref(_variables));
compound %= '(' >> expression >> implied(')');
// The heuristics:
// - an unknown identifier followed by (
// - an unclosed argument list implies )
call %= ( known(phx::ref(_functions)) // known -> imply the parens
| &(identifier >> '(') >> unknown(phx::ref(_functions))
)
>> implied('(')
>> -(expression % (',' | !(')'|eoi) >> implied(',')))
>> implied(')')
;
// lexemes, primitive rules
identifier = raw[(alpha|'_') >> *(alnum|'_')];
// imply the closing quotes
string %= '"' >> *('\\' >> char_ | ~char_('"')) >> implied('"'); // TODO more escapes
number = double_; // TODO integral arguments
///////////////////////////////
// identifier loopkup, suggesting
{
maybe_known = known(_domain) | unknown(_domain);
// distinct to avoid partially-matching identifiers
using boost::spirit::repository::qi::distinct;
auto kw = distinct(copy(alnum | '_'));
known = raw[kw[lazy(_domain)]];
unknown = raw[identifier[_val=_1]] [suggest_for(_1, _domain)];
}
BOOST_SPIRIT_DEBUG_NODES(
(start)
(expression)(term)(factor)(simple)(compound)
(call)(variable)
(identifier)(number)(string)
//(maybe_known)(known)(unknown) // qi::symbols<> non-streamable
)
_variables += "foo", "bar", "qux";
_functions += "print", "sin", "tan", "sqrt", "frobnicate";
}
private:
Completion::Hints& _hints;
using Domain = qi::symbols<char>;
Domain _variables, _functions;
qi::rule<It, Ast::Expression()> start;
qi::rule<It, Ast::Expression(), qi::space_type> expression, term, factor, simple;
// completables
qi::rule<It, Ast::Expression(), qi::space_type> compound;
qi::rule<It, Ast::CallExpression(), qi::space_type> call;
// implicit lexemes
qi::rule<It, Ast::Identifier()> variable, identifier;
qi::rule<It, Ast::NumLiteral()> number;
qi::rule<It, Ast::StringLiteral()> string;
// domain identifier lookups
qi::_r1_type _domain;
qi::rule<It, Ast::Identifier(Domain const&)> maybe_known, known, unknown;
///////////////////////////////
// binary expression factory
struct make_binary_f {
Ast::BinaryExpression operator()(Ast::Expression const& lhs, char op, Ast::Expression const& rhs) const {
return {lhs, op, rhs};
}
};
boost::phoenix::function<make_binary_f> make_binary;
///////////////////////////////
// auto-completing incomplete expressions
struct imply_f {
Completion::Hints& _hints;
void operator()(boost::iterator_range<It> where, char implied_char) const {
auto inserted =
_hints.incomplete.emplace(&*where.begin(), std::string(1, implied_char));
// add the implied char to existing completion
if (!inserted.second)
inserted.first->second += implied_char;
}
};
boost::phoenix::function<imply_f> imply { imply_f { _hints } };
///////////////////////////////
// suggest_for
struct suggester {
Completion::Hints& _hints;
void operator()(boost::iterator_range<It> where, Domain const& symbols) const {
using namespace Completion;
Source id(&*where.begin(), where.size());
Candidates c;
symbols.for_each([&](std::string const& k, ...) { c.push_back(k); });
auto score = [id](Source v) { return fuzzy_match(id, v); };
auto byscore = [=](Source a, Source b) { return score(a) > score(b); };
sort(c.begin(), c.end(), byscore);
c.erase( remove_if(c.begin(), c.end(), [=](Source s) { return score(s) < 3; }), c.end());
_hints.suggestions.emplace(id, c);
}
};
boost::phoenix::function<suggester> suggest_for {suggester{_hints}};
};
}
#include <iostream>
#include <iomanip>
int main() {
for (Source const input : {
"", // invalid
"(3", // incomplete, imply ')'
"3*(6+sqrt(9))^7 - 1e8", // completely valid
"(3*(((6+sqrt(9))^7 - 1e8", // incomplete, imply ")))"
"print(\"hello \\\"world!", // completes the string literal and the parameter list
"foo", // okay, known variable
"baz", // (suggest bar)
"baz(", // incomplete, imply ')', unknown function
"taz(", // incomplete, imply ')', unknown function
"san(", // 2 suggestions (sin/tan)
"print(1, 2, \"three\", complicated(san(78",
"(print sqrt sin 9) -0) \"bye",
})
{
std::cout << "-------------- '" << input << "'\n";
Location f = input.begin(), l = input.end();
Ast::Expression expr;
Completion::Hints hints;
bool ok = parse(f, l, Parsing::Expression<Location>{hints}, expr);
if (hints) {
std::cout << "Input: '" << input << "'\n";
}
for (auto& c : hints.incomplete) {
std::cout << "Missing " << std::setw(c.first - input.begin()) << "" << "^ implied: '" << c.second << "'\n";
}
for (auto& id : hints.suggestions) {
std::cout << "Unknown " << std::setw(id.first.begin() - input.begin()) << "" << std::string(id.first.size(), '^');
if (id.second.empty())
std::cout << " (no suggestions)\n";
else {
std::cout << " (did you mean ";
once_t first;
for (auto& s : id.second)
std::cout << (first?"":" or ") << "'" << s << "'";
std::cout << "?)\n";
}
}
if (ok) {
std::cout << "AST: " << expr << "\n";
} else {
std::cout << "Parse failed\n";
}
if (f!=l)
std::cout << "Remaining input: '" << std::string(f,l) << "'\n";
}
}
¹ 提升精神船长问题 https://stackoverflow.com/questions/17072987/boost-spirit-skipper-issues/17073965#17073965