Jump to content
Main menu
Main menu
move to sidebar
hide
Navigation
Main page
Recent changes
Random page
Help about MediaWiki
Humanipedia
Search
Search
Appearance
Create account
Log in
Personal tools
Create account
Log in
Pages for logged out editors
learn more
Contributions
Talk
Editing
Module:User:Cscott/llpeg
Module
Discussion
English
Read
Edit source
View history
Tools
Tools
move to sidebar
hide
Actions
Read
Edit source
View history
General
What links here
Related changes
Special pages
Page information
Appearance
move to sidebar
hide
Warning:
You are not logged in. Your IP address will be publicly visible if you make any edits. If you
log in
or
create an account
, your edits will be attributed to your username, along with other benefits.
Anti-spam check. Do
not
fill this in!
return (function() local builders = {} local function register(name, f) builders[name] = f end register('advent.compat', function() return require [[Module:User:Cscott/compat]] end) register('llpeg.types', function(myrequire) myrequire('strict') local compat = myrequire('advent.compat') local CHARMAX = 0x7F -- maximum codepoint for charsets -- metatable for pattern objects; will be filled in later local metareg = {} local enum = function(keys) local Enum = {} Enum.__index = Enum function Enum:__tostring() return self.name end function Enum:pairs() return keys end function Enum:type() return Enum end for name, value in pairs(keys) do Enum[name] = setmetatable({ name = name, value = value }, Enum) end return Enum end local CapKind = enum{ close = "close", -- not used in trees */ position = "position", const = "constant", -- ktable[key] is Lua constant backref = "backref", -- ktable[key] is "name" of group to get capture arg = "argument", -- 'key' is arg's number simple = "simple", -- next node is pattern table = "table", -- next node is pattern ["function"] = "function", -- ktable[key] is function; next node is pattern acc = "acc", -- ktable[key] is function; next node is pattern query = "query", -- ktable[key] is table; next node is pattern string = "string", -- ktable[key] is string; next node is pattern num = "num", -- numbered capture; 'key' is number of value to return subst = "substitution", -- substitution capture; next node is pattern fold = "fold", -- ktable[key] is function; next node is pattern runtime = "runtime", -- not used in trees (is uses another type for tree) group = "group", -- ktable[key] is group's "name" } local TTag = enum{ Char = "char", -- 'n' has unicode codepoint Set = "set", -- 'set' has sparse array codepoint->true for codepoint <=CHARMAX -- 'rest' indicates whether all codepoints > CHARMAX should be -- part of the set (true) or not (false) Any = "any", True = "true", False = "false", UTFR = "utf8.range", --[[ range of UTF-8 codepoints; 'from' has initial codepoint; 'to' has final codepoint ]]-- Rep = "rep", -- 'sib1' * Seq = "seq", -- 'sib1' 'sib2' Choice = "choice", -- 'sib1' / 'sib2' Not = "not", -- !'sib1' And = "and", -- &'sib1' Call = "call", -- 'sib2' is rule being called; otherwise same as TOpenCall OpenCall = "opencall", -- 'key' is rule name Rule = "rule", --[[ 'key' is rule name (but key == nil for unused rules); 'sib1' is rule's pattern pre-rule; 'sib2' is next rule; 'n' is rule's sequential number, 'name' is rule name (even for unused rules) ]]-- XInfo = "xinfo", -- extra info (not used) Grammar = "grammar", -- 'sib1' is initial (and first) rule, 'n' is # rules Behind = "behind", -- 'sib1' is pattern, 'n' is how much to go back Capture = "capture", --[[ captures: 'cap' is kind of capture (enum 'CapKind'); 'key' is Lua value associated with capture; 'sib1' is capture body ]]-- RunTime = "run-time", --[[ run-time capture: 'key' is Lua function; 'sib1' is capture body ]]-- Throw = "throw", -- labeled failure: 'key' is label's name, -- sib2 is associated recovery rule } local PE = enum{ nullable = "nullable", nofail = "nofail", } -- virtual machine instructions local Opcode = enum{ Any = "any", -- if no char, fail Char = "char", -- if char != aux, fail Set = "set", -- if char not in buff, fail TestAny = "testany", -- in no char, jump to 'offset' TestChar = "testchar", -- if char != aux, jump to 'offset' TestSet = "testset", -- if char not in buff, jump to 'offset' Span = "span", -- read a span of chars in buff UTFR = "utf-range", -- if codepoint not in range [offset, utf_to], fail Behind = "behind", -- walk back 'aux' characters (fail if not possible) Ret = "ret", -- return from a rule End = "end", -- end of pattern Choice = "choice", -- stack a choice; next fail will jump to 'offset' PredChoice = "pred_choice", -- labeled failure: stack a choice; changes label env next fail will jump to 'offset' Jmp = "jmp", -- jump to 'offset' Call = "call", -- call rule at 'offset' OpenCall = "open_call", -- call rule number 'key' (must be closed to a ICall) Commit = "commit", -- pop choice and jump to 'offset' PartialCommit = "partial_commit", -- update top choice to current position and jump BackCommit = "back_commit", -- backtrack like "fail" but jump to its own 'offset' FailTwice = "failtwice", -- pop one choice and then fail Fail = "fail", -- go back to saved state on choice and jump to saved offset Giveup = "giveup", -- internal use FullCapture = "fullcapture", -- complete capture of last 'off' chars OpenCapture = "opencapture", -- start a capture CloseCapture = "closecapture", CloseRunTime = "closeruntime", Throw = "throw", -- fails with a given label --labeled failure ThrowRec = "throw_rec", -- fails with a given label and call rule at 'offset' --labeled failure Empty = "--", -- to fill empty slots left by optimizations } -- helper for visitor pattern definitions function define(dispatch, which, f) for _,v in pairs(which) do assert(v ~= nil) -- catch typos dispatch[v] = f end end local numsiblings = {} define(numsiblings, { TTag.Char, TTag.Set, TTag.Any, TTag.True, TTag.False, TTag.UTFR, TTag.Call, TTag.OpenCall, TTag.Throw, }, 0) define(numsiblings, { TTag.Rep, TTag.Not, TTag.And, TTag.Grammar, TTag.Behind, TTag.Capture, TTag.RunTime, }, 1) define(numsiblings, { TTag.Seq, TTag.Choice, TTag.Rule, }, 2) -- more help for visitor functions local function_name_registry = {} function register_fname(name, f) assert(type(name) == "string") assert(type(f) == "function") function_name_registry[f] = name end function report_ferror(f, msg) local fname = function_name_registry[f] if fname ~= nil then msg = fname .. ": " .. msg end error(msg) end function define_type_visitor(tbl) local dispatch = {} for keys,func in pairs(tbl) do if type(keys) ~= "table" then keys = { keys } end define(dispatch, keys, func) end local visit visit = function(val, ...) local a = dispatch["assert"] if a ~= nil then a(val, ...) end -- assert preconditions local ty = type(val) if ty == 'table' and getmetatable(val) == metareg then ty = 'pattern' end local f = dispatch[ty] if f ~= nil then return f(val, ...) end f = dispatch.default if f == nil then report_ferror(visit, "no default for " .. ty) end return f(val, ...) end return visit end function define_tree_visitor(tbl, opt_name) local dispatch = {} for keys,func in pairs(tbl) do if type(keys) ~= "table" or getmetatable(keys) == TTag then keys = { keys } end define(dispatch, keys, func) end local visit visit = function(tree, ...) if tree == nil then report_ferror(visit, "nil tree") end local a = dispatch["assert"] if a ~= nil then a(tree, ...) end -- assert preconditions local f = dispatch[tree.tag] if f ~= nil then return f(tree, ...) end f = dispatch.default if f == nil then report_ferror(visit, "no default for " .. tree.tag) end return f(tree, ...) end return visit end function define_opcode_visitor(tbl) local dispatch = {} for keys,func in pairs(tbl) do if type(keys) ~= "table" or getmetatable(keys) == Opcode then keys = { keys } end define(dispatch, keys, func) end local visit visit = function(op, ...) if op == nil then report_ferror(visit, "nil op") end local a = dispatch["assert"] if a ~= nil then a(op, ...) end -- assert preconditions local f = dispatch[op.code] if f ~= nil then return f(op, ...) end f = dispatch.default if f == nil then report_ferror(visit, "no default for " .. op.code) end return f(op, ...) end return visit end -- helper for module imports function from(mod, list) local result = {} for _,v in ipairs(list) do table.insert(result, mod[v]) end return compat.unpack(result) end function newcharset() return setmetatable({ tag = TTag.Set, code = nil, rest = false, set = {} }, metareg) end local fullset = newcharset() for i = 0,CHARMAX do fullset.set[i] = true end fullset.rest = true -- make sure non-ascii unicode chars are included! assert(fullset.tag == TTag.Set) return { CHARMAX = CHARMAX, CapKind = CapKind, Opcode = Opcode, PE = PE, TTag = TTag, define = define, define_tree_visitor = define_tree_visitor, enum = enum, from = from, fullset = fullset, metareg = metareg, newcharset = newcharset, numsiblings = numsiblings, register_fname = register_fname, } end) register('llpeg.print', function(myrequire) myrequire('strict') local compat = myrequire('advent.compat') local from = myrequire('llpeg.types').from local CHARMAX, CapKind, Opcode, TTag, define, define_tree_visitor, numsiblings, _ = from(myrequire('llpeg.types'), { 'CHARMAX', 'CapKind', 'Opcode', 'TTag', 'define', 'define_tree_visitor', 'numsiblings', }) function printcharset(tree) local result = "[" local i = 0 while i <= CHARMAX do local first = i while tree.set[i] and i <= CHARMAX do i = i + 1 end if first == (i - 1) then -- unary range result = result .. string.format("(%02x)", first) elseif first < (i-1) then -- non-empty range result = result .. string.format("(%02x-%02x)", first, i - 1) end i = i + 1 end if tree.rest then result = result .. "(80-FFFF)" end return result .. "]" end function printjmp(op, pc) return "-> " .. op.target end local printinst_helper = define_opcode_visitor{ [Opcode.Char] = function(op, pc) return string.format("'%c' (%02x)", op.aux, op.aux) end, [Opcode.TestChar] = function(op, pc) return string.format("'%c' (%02x)", op.aux, op.aux) .. printjmp(op, pc) end, [Opcode.UTFR] = function(op, pc) return string.format("%d - %d", op.from, op.to) end, [Opcode.FullCapture] = function(op, pc) return string.format("%s (size = %s) (idx = %s)", op.cap.value, op.aux, op.key) end, [Opcode.OpenCapture] = function(op, pc) return string.format("%s (idx = %s)", op.cap.value, op.key) end, [Opcode.Set] = function(op, pc) return printcharset(op) end, [Opcode.TestSet] = function(op, pc) return printcharset(op) .. printjmp(op, pc) end, [Opcode.Span] = function(op, pc) return printcharset(op) end, [Opcode.OpenCall] = function(op, pc) return string.format("-> %d", op.target) -- rule number end, [Opcode.Behind] = function(op, pc) return string.format("%d", op.aux) end, [{Opcode.Jmp, Opcode.Call, Opcode.Commit, Opcode.Choice, Opcode.PartialCommit, Opcode.BackCommit, Opcode.TestAny, Opcode.PredChoice}] = function(op, pc) return printjmp(op, pc) end, [Opcode.Throw] = function(op, pc) -- labeled failure return string.format("(idx = %s)", op.key) end, [Opcode.ThrowRec] = function(op, pc) return printjmp(op, pc) .. string.format("(idx = %s)", op.key) end, default = function() return '' end, } function printinst(pc, op, accum) table.insert(accum, string.format("%02d: %s ", pc, op.code.value)) table.insert(accum, printinst_helper(op, pc)) table.insert(accum, "\n") return accum end function printpatt(code, accum) for pc,op in ipairs(code) do printinst(pc, op, accum) end return accum end function printcap(cap, indent) print(string.format("%s%s", string.rep(' ', indent), cap)) end function printcap2close(captures, ncaptures, i, indent) local head = captures[i] i = i + 1 printcap(head, indent) -- print head capture while i <= ncaptures and head:inside(captures[i]) do i = printcap2close(captures, ncaptures, i, indent + 2) -- print nested captures end if i <= ncaptures and head:isopencap() then assert(captures[i]:isclosecap()) printcap(captures[i], indent) -- print and skip close capture i = i + 1 end return i end function printcaplist(captures, ncaptures) -- for debugging, first print a raw list of captures if ncaptures == nil then ncaptures = #captures end for i=1,ncaptures do printcap(captures[i], 0) end print(">======"); local i=1 while i <= ncaptures and not captures[i]:isclosecap() do i = printcap2close(captures, ncaptures, i, 0) end if i > ncaptures then print("<unmatched>") end print("======="); end local printtree_helper = define_tree_visitor{ [TTag.Char] = function(tree) local c = compat.utf8char(tree.n) if c:find("%C") ~= nil then -- printable? return " '" .. c .. "'" else return string.format(" (%02X)", tree.n) end end, [TTag.Set] = function(tree) return printcharset(tree) end, [TTag.UTFR] = function(tree) return " " .. tree.from .. " - " .. tree.to end, [{TTag.OpenCall, TTag.Call}] = function(tree) local ret = string.format(" key: %s", tree.key) local rule = tree.sib2 if rule ~= nil then ret = ret .. " (rule: " .. rule.n .. ")" end return ret end, [TTag.Behind] = function(tree) return " " .. tree.n end, [TTag.Capture] = function(tree) return string.format(" kind: '%s' key: %s", tree.cap.value, tree.key) end, [TTag.Rule] = function(tree) return string.format(" key: %s", tree.key) end, [TTag.XInfo] = function(tree) return " n: " .. tree.n end, [TTag.Grammar] = function(tree) return " " .. tree.n -- number of rules end, [TTag.Throw] = function(tree) return string.format(" key: %s", tree.key) end, default = function(tree) return '' end } function printtree(tree, indent, accum) local sibs = numsiblings[tree.tag] table.insert(accum, string.rep(' ', indent)) table.insert(accum, tree.tag.value) table.insert(accum, printtree_helper(tree)) table.insert(accum, "\n") if tree.tag == TTag.Rule then sibs = 1 -- don't print sib2 elseif tree.tag == TTag.Grammar then local rule = tree.sib1 for i=1,tree.n do printtree(rule, indent + 2, accum) rule = rule.sib2 end sibs = 0 -- siblings already handled end if sibs >= 1 then printtree(tree.sib1, indent + 2, accum) if sibs >= 2 then printtree(tree.sib2, indent + 2, accum) end end return accum end local PREFIX = "" -- could also be "l." or "lpeg." etc local printrepl_helper printrepl_helper = define_tree_visitor{ [TTag.True] = function(tree, buf) table.insert(buf, PREFIX .. 'P""') end, [TTag.Any] = function(tree, buf) table.insert(buf, PREFIX .. 'P(1)') end, [TTag.Char] = function(tree, buf) table.insert(buf, PREFIX .. 'P"') local c = compat.utf8char(tree.n) if c:find("%C") ~= nil then -- printable? table.insert(buf, c) else table.insert(buf, string.format('\\%02X', tree.n)) end table.insert(buf, '"') end, [TTag.Set] = function(tree, buf) local nbuf = {} local insertchar = function(cp) local c = compat.utf8char(cp) if string.find(c, "^[^%w%p ]") ~= nil then table.insert(nbuf, string.format('\\x%02X', cp)) else table.insert(nbuf, c) end end local nargs = 0 local inserttwo = function(cp1, cp2) if nargs > 0 then table.insert(nbuf, ',') end nargs = nargs + 1 table.insert(nbuf, '"') insertchar(cp1) insertchar(cp2) table.insert(nbuf, '"') end local i = 0 while i <= CHARMAX do local first = i while tree.set[i] and i <= CHARMAX do i = i + 1 end if first == (i - 1) then -- unary range inserttwo(first, first) elseif first < (i-1) then -- non-empty range inserttwo(first, i-1) end i = i + 1 end local r = table.concat(nbuf) if nargs == 1 then r = PREFIX .. 'S' .. r else r = PREFIX .. 'S(' .. r .. ')' end if tree.rest then table.insert(buf, '(') table.insert(buf, r) table.insert(buf, ' + ') table.insert(buf, PREFIX) table.insert(buf, 'utfR(0x80, 0x10FFFF))') else table.insert(buf, r) end end, [TTag.UTFR] = function(tree, buf) table.insert(buf, string.format("%sutfR(0x%04X, 0x%04X)", PREFIX, tree.from, tree.to)) end, [{TTag.OpenCall, TTag.Call}] = function(tree, buf) table.insert(buf, string.format('%sV"%s"', PREFIX, tree.key)) end, [TTag.Not] = function(tree, buf) table.insert(buf, '-(') printrepl_helper(tree.sib1, buf) table.insert(buf, ')') end, [TTag.Seq] = function(tree, buf) table.insert(buf, "(") printrepl_helper(tree.sib1, buf) table.insert(buf, " * ") printrepl_helper(tree.sib2, buf) table.insert(buf, ")") end, [TTag.Choice] = function(tree, buf) table.insert(buf, "(") printrepl_helper(tree.sib1, buf) table.insert(buf, " + ") printrepl_helper(tree.sib2, buf) table.insert(buf, ")") end, [TTag.Rep] = function(tree, buf) printrepl_helper(tree.sib1, buf) table.insert(buf, "^0") end, --[[ [TTag.Behind] = function(tree) return " " .. tree.n end, ]]-- [TTag.Capture] = function(tree, buf) local repl = define_type_visitor{ string = function(v) return '"' .. v .. '"' -- xxx should handle escapes end, default = tostring, } local name = nil local show_patt = false local show_key = false if tree.cap == CapKind.simple then name = 'C' show_patt = true elseif tree.cap == CapKind.subst then name = 'Cs' show_patt = true elseif tree.cap == CapKind.table then name = 'Ct' show_patt = true elseif tree.cap == CapKind.pos then name = 'Cp' elseif tree.cap == CapKind.arg then name = 'Carg' show_key = true elseif tree.cap == CapKind.backref then name = 'Cb' show_key = true elseif tree.cap == CapKind.group then name = 'Cg' show_patt = true show_key = (tree.key ~= nil) end if name ~= nil then table.insert(buf, PREFIX) table.insert(buf, name) table.insert(buf, '(') if show_patt then printrepl_helper(tree.sib1, buf) if show_key then table.insert(buf, ', ') end end if show_key then table.insert(buf, repl(tree.key)) end table.insert(buf, ')') return end if tree.cap == CapKind.string or tree.cap == CapKind.num or tree.cap == CapKind.query or tree.cap == CapKind['function'] then printrepl_helper(tree.sib1, buf) table.insert(buf, ' / ') table.insert(buf, repl(tree.key)) return end -- fallback table.insert(buf, string.format("<pattern %s>", tostring(tree.tag))) end, [TTag.Rule] = function(tree, buf) local key = tree.name if type(key) == 'number' then key = string.format("[%d]", key) end table.insert(buf, key) table.insert(buf, " = ") printrepl_helper(tree.sib1, buf) end, [TTag.Grammar] = function(tree, buf) table.insert(buf, PREFIX .. "P{") local rule = tree.sib1 local r = {} local first_rule_name = rule.name r[first_rule_name] = rule rule = rule.sib2 local names = {} for i=2,tree.n do table.insert(names, rule.name) r[rule.name] = rule rule = rule.sib2 end -- sort rule names table.sort(names) table.insert(names, 1, first_rule_name) -- now print in order for _,name in ipairs(names) do printrepl_helper(r[name], buf) table.insert(buf, ", ") end table.insert(buf, "}") end, --[[ [TTag.Throw] = function(tree) return " key: " .. tree.key .. " (rule: " .. tree.sib2.cap .. ")" end, ]]-- default = function(tree, buf) table.insert(buf, string.format("<pattern %s>", tostring(tree.tag))) end, } function printrepl(tree) local buf = {} printrepl_helper(tree, buf) return table.concat(buf) end return { printcaplist = printcaplist, printcharset = printcharset, printinst = printinst, printpatt = printpatt, printrepl = printrepl, printtree = printtree, } end) register('llpeg.code', function(myrequire) myrequire('strict') local compat = myrequire('advent.compat') local from = myrequire('llpeg.types').from local CHARMAX, CapKind, Opcode, PE, TTag, define, define_tree_visitor, fullset, newcharset, numsiblings, register_fname, _ = from(myrequire('llpeg.types'), { 'CHARMAX', 'CapKind', 'Opcode', 'PE', 'TTag', 'define', 'define_tree_visitor', 'fullset', 'newcharset', 'numsiblings', 'register_fname', }) local printinst = myrequire('llpeg.print').printinst local TRACE_INSTRUCTIONS = false -- signals a "no-instruction" local NOINST = nil -- don't optimize captures longer than this local MAXOFF = 15 -- forward declarations local codegen local CompileState = {} CompileState.__index = CompileState --[[ ** {====================================================== ** Analysis and some optimizations ** ======================================================= ]]-- --[[ ** Check whether a charset is empty (returns IFail), singleton (IChar), ** full (IAny), or none of those (ISet). When singleton, '*c' returns ** which character it is. (When generic set, the set was the input, ** so there is no need to return it.) ]]-- function charsettype(cs) local count = 0 local candidate for i,_ in pairs(cs.set) do candidate = i count = count + 1 end if cs.rest then if count == (CHARMAX + 1) then return Opcode.Any -- full set end elseif count == 0 then return Opcode.Fail -- empty set elseif count == 1 then return Opcode.Char, candidate -- single char end return Opcode.Set -- neither full nor empty nor singleton end -- A few basic operations on charsets; returns new object function cs_clone(cs) local result = newcharset() for i,_ in pairs(cs.set) do result.set[i] = true end result.rest = cs.rest return result end function cs_complement(cs) local result = newcharset() for i=0,CHARMAX do if not cs.set[i] then result.set[i] = true end end result.rest = not cs.rest return result end function cs_intersection(a, b) local result = newcharset() for i,_ in pairs(a.set) do if a.set[i] and b.set[i] then result.set[i] = true end end result.rest = a.rest and b.rest return result end function cs_union(a, b) local result = newcharset() for i=0,CHARMAX do if a.set[i] or b.set[i] then result.set[i] = true end end result.rest = a.rest or b.rest return result end function cs_diff(a, b) local result = newcharset() for i=0,CHARMAX do if a.set[i] and not b.set[i] then result.set[i] = true end end result.rest = a.rest and not b.rest return result end function cs_disjoint(a, b) if a.rest == b.rest then return false end for i,_ in pairs(a.set) do if b.set[i] then return false end end for i,_ in pairs(b.set) do if a.set[i] then return false end end return true end function cs_equal(a, b) if a.rest ~= b.rest then return false end for i,_ in pairs(a.set) do if not b.set[i] then return false end end for i,_ in pairs(b.set) do if not a.set[i] then return false end end return true end --[[ ** If 'tree' is a 'char' pattern (TSet, TChar, TAny), convert it into a ** charset and return it; else return nil. ]]-- local tocharset = define_tree_visitor{ [TTag.Set] = function(v) return v -- copy set end, [TTag.Char] = function(v) -- only one char if v.n <= CHARMAX then local t = newcharset() t.set[v.n] = true return t else return nil end end, [TTag.Any] = function(v) return fullset end, [TTag.False] = function(v) return newcharset() end, default = function(v) return nil end, } register_fname("tocharset", tocharset) --[[ ** Visit a TCall node taking care to stop recursion. If node not yet ** visited, return 'f(rule for call)', otherwise return 'def' (default ** value) ]]-- function CompileState:callrecursive(tree, f, default_value, ...) if tree.tag ~= TTag.Call then error("unexpected tree tag") end local rule = self.grammar.ruletab[tree.key] if rule.tag ~= TTag.Rule then error("unexpected tree sibling") end if tree.seen == true then return default_value -- node already visited else -- first visit local oldseen = tree.seen tree.seen = true local result = f(rule, ...) tree.seen = oldseen -- restore tree return result end end --[[ ** Check whether a pattern tree has captures ]]-- local hascaptures hascaptures = define_tree_visitor{ [{TTag.Capture, TTag.RunTime}] = function(tree, cs) return true end, [TTag.Call] = function(tree, cs) assert(cs ~= nil) return cs:callrecursive(tree, hascaptures, false, cs) end, [TTag.Rule] = function(tree, cs) -- do not follow siblings return hascaptures(tree.sib1, cs) end, [TTag.OpenCall] = function(tree, cs) error("should not happen") end, [TTag.Grammar] = function(tree, cs) -- make a fake compile state to hold the grammar, if necessary if cs == nil then cs = CompileState:new(nil) end return cs:withGrammar(tree, hascaptures, tree.sib1, cs) end, default = function(tree, cs) local n = numsiblings[tree.tag] if n == 1 then return hascaptures(tree.sib1, cs) -- tail call elseif n == 2 then if hascaptures(tree.sib1, cs) then return true end return hascaptures(tree.sib2, cs) -- tail call elseif n == 0 then return false else error("how many siblings does this have?") end end, } function CompileState:hascaptures(t) return hascaptures(t, self) end register_fname("hascaptures", hascaptures) --[[ ** Checks how a pattern behaves regarding the empty string, ** in one of two different ways: ** A pattern is *nullable* if it can match without consuming any character; ** A pattern is *nofail* if it never fails for any string ** (including the empty string). ** The difference is only for predicates and run-time captures; ** for other patterns, the two properties are equivalent. ** (With predicates, &'a' is nullable but not nofail. Of course, ** nofail => nullable.) ** These functions are all convervative in the following way: ** p is nullable => nullable(p) ** nofail(p) => p cannot fail ** The function assumes that TOpenCall is not nullable; ** this will be checked again when the grammar is fixed. ** Run-time captures can do whatever they want, so the result ** is conservative. ]]-- local checkaux checkaux = define_tree_visitor{ [{ TTag.Char, TTag.Set, TTag.Any, TTag.UTFR, TTag.False, TTag.OpenCall, TTag.Throw, }] = function(tree, pred, cs) return false -- not nullable end, [{TTag.Rep,TTag.True}] = function(tree, pred, cs) return true -- no fail end, [{TTag.Not,TTag.Behind}] = function(tree, pred, cs) -- can match empty, but can fail if pred == PE.nofail then return false else return true end end, [TTag.And] = function(tree, pred, cs) -- can match empty; fail iff body does if pred == PE.nullable then return true end return checkaux(tree.sib1, pred, cs) -- tail call end, [TTag.RunTime] = function(tree, pred, cs) -- can fail; match empty iff body does if pred == PE.nofail then return false end return checkaux(tree.sib1, pred, cs) -- tail call end, [TTag.Seq] = function(tree, pred, cs) if not checkaux(tree.sib1, pred, cs) then return false end return checkaux(tree.sib2, pred, cs) -- tail call end, [TTag.Choice] = function(tree, pred, cs) if checkaux(tree.sib2, pred, cs) then return true end return checkaux(tree.sib1, pred, cs) -- tail call end, [{ TTag.Capture, TTag.Rule, TTag.XInfo, }] = function(tree, pred, cs) return checkaux(tree.sib1, pred, cs) end, [TTag.Grammar] = function(tree, pred, cs) -- make a fake compile state to hold the grammar, if necessary if cs == nil then cs = CompileState:new(nil) end return cs:withGrammar(tree, checkaux, tree.sib1, pred, cs) end, [TTag.Call] = function(tree, pred, cs) -- open calls are assumed not nullable; checked again after grammar -- is fixed if cs == nil then return false end return checkaux(cs.grammar.ruletab[tree.key], pred, cs) end, } register_fname("checkaux", checkaux) function nofail(t, cs) return checkaux(t, PE.nofail, cs) end function CompileState:nofail(t) return nofail(t, self) end function nullable(t, cs) return checkaux(t, PE.nullable, cs) end function CompileState:nullable(t) return nullable(t, self) end function nullable_with_grammar(t, grm) local cs = CompileState:new(nil) return cs:withGrammar(grm, nullable, t, cs) end -- Note that we are counting characters, not bytes local fixedlen, fixedlen_helper fixedlen_helper = define_tree_visitor{ [{TTag.Char, TTag.Set, TTag.Any, TTag.UTFR}] = function(tree, len) return len + 1 end, [{TTag.False, TTag.True, TTag.Not, TTag.And, TTag.Behind}] = function(tree, len) return len end, [{TTag.Rep, TTag.RunTime, TTag.OpenCall, TTag.Throw,}] = function(tree, len) return -1 -- variable end, [{TTag.Capture, TTag.Rule, TTag.XInfo,}] = function(tree, len, cs) return fixedlen_helper(tree.sib1, len, cs) end, [TTag.Grammar] = function(tree, len, cs) -- make a fake compile state to hold the grammar, if necessary if cs == nil then cs = CompileState:new(nil) end return cs:withGrammar(tree, fixedlen_helper, tree.sib1, len, cs) end, [TTag.Call] = function(tree, len, cs) -- If evaluating outside the context of a grammar, conservatively -- return "variable" if cs == nil then return -1 end -- otherwise, carefully recurse local n1 = cs:callrecursive(tree, fixedlen, -1, cs) if n1 < 0 then return -1 end -- variable return len + n1 end, [TTag.Seq] = function(tree, len, cs) local n1 = fixedlen_helper(tree.sib1, len, cs) if n1 < 0 then return -1 end -- variable return fixedlen_helper(tree.sib2, n1, cs) end, [TTag.Choice] = function(tree, len, cs) local n1 = fixedlen_helper(tree.sib1, len, cs) local n2 = fixedlen_helper(tree.sib2, len, cs) if n1 ~= n2 or n1 < 0 then return -1 else return n1 end end, } function fixedlen(tree, cs) return fixedlen_helper(tree, 0, cs) -- supply default 0 for 'len' end function CompileState:fixedlen(t) return fixedlen(t, self) end register_fname("fixedlen_helper", fixedlen_helper) --[[ ** Computes the 'first set' of a pattern. ** The result is a conservative aproximation: ** match p ax -> x (for some x) ==> a belongs to first(p) ** or ** a not in first(p) ==> match p ax -> fail (for all x) ** ** The set 'follow' is the first set of what follows the ** pattern (full set if nothing follows it). ** ** The function returns 0 when this resulting set can be used for ** test instructions that avoid the pattern altogether. ** A non-zero return can happen for two reasons: ** 1) match p '' -> '' ==> return has bit 1 set ** (tests cannot be used because they would always fail for an empty input); ** 2) there is a match-time capture ==> return has bit 2 set ** (optimizations should not bypass match-time captures). ]]-- local getfirst getfirst = define_tree_visitor{ [TTag.Char] = function(t, follow, cs) if t.n <= CHARMAX then return 0, tocharset(t) end -- conservative approximation! local s = newcharset() s.rest = true return 0, s end, [{ TTag.Set, TTag.Any, TTag.False }] = function(t, follow, cs) return 0, tocharset(t) end, [TTag.UTFR] = function(t, follow, cs) -- conservative approximation! local firstset = newcharset() if t.from <= CHARMAX then for i=t.from, math.min(CHARMAX, t.to) do firstset.set[i] = true end end if t.to > CHARMAX then -- conservative approximation of non-ascii unicode range firstset.rest = true end return 0, firstset end, [TTag.True] = function(t, follow, cs) return 1, follow -- 1 because this accepts the empty string end, [TTag.Throw] = function(t, follow, cs) -- labeled failure: must always throw the label return 1, fullset end, [TTag.Choice] = function(t, follow, cs) local firstset = newcharset() local e1,e1set = getfirst(t.sib1, follow, cs) local e2,e2set = getfirst(t.sib2, follow, cs) local firstset = cs_union(e1set, e2set) local ret = 0 -- awkward lua5.1 way to say "e1 | e2" if (e1 % 2) == 1 or (e2 % 2) == 1 then ret = ret + 1 end e1,e2 = compat.rshift(e1, 1), compat.rshift(e2, 1) if (e1 % 2) == 1 or (e2 % 2) == 1 then ret = ret + 2 end return ret, firstset end, [TTag.Seq] = function(t, follow, cs) if not nullable(t.sib1, cs) then -- when p1 is not nullable, p2 has nothing to contribute return getfirst(t.sib1, fullset, cs) -- tail call else -- FIRST(p1 p2, fl) = FIRST(p1, FIRST(p2, fl)) local e2,csaux = getfirst(t.sib2, follow, cs) local e1,firstset = getfirst(t.sib1, csaux, cs) if e1 == 0 then return 0, firstset -- 'e1' ensures that first can be used elseif compat.rshift(e1, 1) % 2 == 1 or compat.rshift(e2, 1) % 2 == 1 then -- one of the children has a matchtime? return 2, firstset -- pattern has a matchtime capture else return e2, firstset -- else depends on e2 end end end, [TTag.Rep] = function(t, follow, cs) local _,firstset = getfirst(t.sib1, follow, cs) return 1, cs_union(firstset, follow, cs) -- accepts the empty string end, [{ TTag.Capture,TTag.Rule,TTag.XInfo }] = function(t, follow, cs) return getfirst(t.sib1, follow, cs) -- tail call end, [TTag.Grammar] = function(t, follow, cs) return cs:withGrammar(t, getfirst, t.sib1, follow, cs) end, [TTag.RunTime] = function(t, follow, cs) -- function invalidates any follow info local e,firstset = getfirst(t.sib1, fullset, cs) if e ~= 0 then -- function is not "protected"? return 2,firstset else -- pattern inside capture ensures first can be used return 0,firstset end end, [TTag.Call] = function(t, follow, cs) local rule = cs.grammar.ruletab[t.key] return getfirst(rule, follow, cs) -- tail call end, [TTag.And] = function(t, follow, cs) local e,firstset = getfirst(t.sib1, follow, cs) return e, cs_intersection(firstset, follow, cs) end, [{ TTag.Not, TTag.Behind }] = function(t, follow, cs) if t.tag == TTag.Not then local firstset = tocharset(t.sib1) if firstset ~= nil then return 1,cs_complement(firstset) -- could match empty input end end -- the TNot or TBehind gives no new information -- call getfirst only to check for math-time captures local e,_ = getfirst(t.sib1, follow, cs) if e%2 == 0 then e = e + 1 end -- set the lsb; could match empty input return e, follow -- uses follow end, } function CompileState:getfirst(t, follow) return getfirst(t, follow, self) end register_fname("getfirst", getfirst) --[[ ** If 'headfail(tree)' true, then 'tree' can fail only depending on the ** next character of the subject. -- ie, a single character of lookahead is enough to evaluate the pattern -- rooted at this node ]]-- local headfail headfail = define_tree_visitor{ [{TTag.Char, TTag.Set, TTag.Any, TTag.False}] = function(t, cs) return true end, [{TTag.True, TTag.Rep, TTag.RunTime, TTag.Not, -- even though we are codepoint-based, we don't have a TestUTFR instruction -- so we can't use a Test instruction on the first character, which is -- implicitly what headfail is checking for. TTag.UTFR, TTag.Behind, TTag.Throw}] = function(t, cs) return false end, [{TTag.Capture, TTag.Rule, TTag.XInfo, TTag.And}] = function(t, cs) return headfail(t.sib1, cs) -- tail call end, [TTag.Grammar] = function(t, cs) return cs:withGrammar(t, headfail, t.sib1, cs) end, [TTag.Call] = function(t, cs) local rule = cs.grammar.ruletab[t.key] return headfail(rule, cs) -- tail call end, [TTag.Seq] = function(t, cs) if not nofail(t.sib2, cs) then -- if the second child could possibly fail, then we can't -- evaluate the entire seq based just on the first child return false end return headfail(t.sib1, cs) -- tail call end, [TTag.Choice] = function(t, cs) -- both children need to be headfail for this to be headfail if not headfail(t.sib1, cs) then return false end return headfail(t.sib2, cs) -- tail call end, } function CompileState:headfail(t) return headfail(t, self) end register_fname("headfail", headfail) --[[ ** Check whether the code generation for the given tree can benefit ** from a follow set (to avoid computing the follow set when it is ** not needed) ]]-- local needfollow needfollow = define_tree_visitor{ [{TTag.Char, TTag.Set, TTag.Any, TTag.UTFR, TTag.False, TTag.True, TTag.And, TTag.Not, TTag.RunTime, TTag.Grammar, TTag.Call, TTag.Behind, TTag.Throw, }] = function(tree) return false end, [{TTag.Choice, TTag.Rep}] = function(tree) return true end, [TTag.Capture] = function(tree) return needfollow(tree.sib1) end, [TTag.Seq] = function(tree) return needfollow(tree.sib2) end, } register_fname("needfollow", needfollow) --[[ ** ====================================================== ** Code generation ** ====================================================== ]]-- local Instruction = {} Instruction.__index = Instruction function Instruction:new(arg) local opcode = arg[1] if opcode == nil then error("no opcode") end -- target is rule # for open calls before correction, and absolute pc after local instr = { code = opcode, exec = opcode.exec, -- copy the exec function from the opcode! aux = arg.aux, -- used for the "primary argument" key = arg.key, -- used for string-valued arguments target = arg.target, -- used for jmp-like instructions from = arg.from, -- inclusive start, for ranges to = arg.to, -- inclusive end, for ranges set = arg.set, -- charset <= CHARMAX rest = arg.rest, -- include characters above CHARMAX? cap = arg.cap, -- used for "capture kind" } setmetatable(instr, self) instr:setCode(opcode) -- opportunity to add tracing logic! return instr end function Instruction:setCode(opcode) self.code = opcode local exec = opcode.exec if TRACE_INSTRUCTIONS then local str self.exec = function(self, state, ...) if str == nil then str = table.concat(printinst(0, self, { "Executing " })):gsub("\n","") end print(state.bytePos, state.codepoint, str) return exec(self, state, ...) end else self.exec = exec end end -- state for the compiler function CompileState:new(p) local cs = { p = p, } setmetatable(cs, self) return cs end function CompileState:withGrammar(g, f, ...) local lastGrammar = self.grammar self.grammar = g local result = compat.pack(f(...)) self.grammar = lastGrammar return compat.unpack(result) end function CompileState:codegen(tree, opt, tt, fl) assert(fl.tag == TTag.Set) -- just a little helper return codegen(tree, self, opt, tt, fl) end function CompileState:getinstr(i) return self.p.code[i] end function CompileState:addinstruction(arg) local code = self.p.code table.insert(code, Instruction:new(arg)) return #code end function CompileState:gethere() local code = self.p.code return 1 + #code end function CompileState:jumptothere(pc, where) if pc ~= NOINST then local code = self.p.code code[pc].target = where end end function CompileState:jumptohere(pc) self:jumptothere(pc, self:gethere()) end function codethrow(cs, throw) local rule = nil if cs.grammar ~= nil then -- we only lookup/match *string* rule names, not numeric indices rule = cs.grammar.ruletab[tostring(throw.key)] end if rule ~= nil then return cs:addinstruction{ Opcode.ThrowRec, key=throw.key, -- rule name / error label target=rule.n -- recovery rule number } else return cs:addinstruction{ Opcode.Throw, key=throw.key, -- rule name / error label -- no recovery rule } end end function codeutfr(cs, tree) return cs:addinstruction{ Opcode.UTFR, from = tree.from, to = tree.to, } end --[[ ** Code an IChar instruction, or IAny if there is an equivalent ** test dominating it ]]-- function codechar(cs, codepoint, tt) if tt ~= NOINST and cs:getinstr(tt).code == Opcode.TestChar and cs:getinstr(tt).aux == codepoint then cs:addinstruction{Opcode.Any} else cs:addinstruction{Opcode.Char, aux=codepoint,} end end --[[ ** Add a charset posfix to an instruction ]]-- function addcharset(cs, codepoint) --[[ static void addcharset (CompileState *compst, const byte *cs) { int p = gethere(compst); int i; for (i = 0; i < (int)CHARSETINSTSIZE - 1; i++) nextinstruction(compst); /* space for buffer */ /* fill buffer with charset */ loopset(j, getinstr(compst, p).buff[j] = cs[j]); ]]-- end --[[ ** code a char set, optimizing unit sets for IChar, "complete" ** sets for IAny, and empty sets for IFail; also use an IAny ** when instruction is dominated by an equivalent test. ]]-- function codecharset(cs, tree, tt) local op,codepoint = charsettype(tree) if op == Opcode.Char then return codechar(cs, codepoint, tt) elseif op == Opcode.Set then -- non-trivial set? if tt ~= NOINST and cs:getinstr(tt).code == Opcode.TestSet and cs_equal(tree, cs:getinstr(tt)) then return cs:addinstruction{Opcode.Any} else return cs:addinstruction{ Opcode.Set, set = tree.set, -- XXX ensure immutable rest = tree.rest, } end else return cs:addinstruction{op} -- Any or Fail end end --[[ ** code a test set, optimizing unit sets for ITestChar, "complete" ** sets for ITestAny, and empty sets for IJmp (always fails). ** 'e' is nonzero iff test should accept the empty string. (Test ** instructions in the current VM never accept the empty string.) ]]-- function codetestset(cs, tree, e) if e ~= 0 then return NOINST end local op,codepoint = charsettype(tree) if op == Opcode.Fail then return cs:addinstruction{Opcode.Jmp, target = NOINST} -- always jump elseif op == Opcode.Any then return cs:addinstruction{Opcode.TestAny, target = NOINST} elseif op == Opcode.Char then return cs:addinstruction{ Opcode.TestChar, target = NOINST, aux = codepoint, } elseif op == Opcode.Set then return cs:addinstruction{ Opcode.TestSet, target = NOINST, set = tree.set, -- XXX ensure immutable rest = tree.rest, } else error("unreachable") end end --[[ ** <behind(p)> == behind n; <p> (where n = fixedlen(p)) ]]-- function codebehind(cs, tree) if tree.n > 0 then cs:addinstruction{ Opcode.Behind, aux = tree.n } end return cs:codegen(tree.sib1, false, NOINST, fullset) end --[[ ** Choice; optimizations: ** - when p1 is headfail or when first(p1) and first(p2) are disjoint, ** than a character not in first(p1) cannot go to p1 and a character ** in first(p1) cannot go to p2, either because p1 will accept ** (headfail) or because it is not in first(p2) (disjoint). ** (The second case is not valid if p1 accepts the empty string, ** as then there is no character at all...) ** - when p2 is empty and opt is true; a IPartialCommit can reuse ** the Choice already active in the stack. ]]-- function codechoice(cs, p1, p2, opt, fl) local emptyp2 = (p2.tag == TTag.True) local e1, cs1 = cs:getfirst(p1, fullset) local headfailp1 = cs:headfail(p1) local e2, cs2 if not headfailp1 and e1 == 0 then e2, cs2 = cs:getfirst(p2, fl) -- avoid computing unless necessary end if headfailp1 or (e1 == 0 and cs_disjoint(cs1, cs2)) then -- <p1 / p2> == test (fail(p1)) -> L1 ; p1 ; jmp L2; L1: p2; L2: local test = codetestset(cs, cs1, 0) local jmp = NOINST cs:codegen(p1, false, test, fl) if not emptyp2 then jmp = cs:addinstruction{Opcode.Jmp, target = NOINST } end cs:jumptohere(test) cs:codegen(p2, opt, NOINST, fl) cs:jumptohere(jmp) elseif opt and emptyp2 then -- p1? == IPartialCommit; p1 cs:jumptohere(cs:addinstruction{Opcode.PartialCommit, target = NOINST}) cs:codegen(p1, true, NOINST, fullset) else -- <p1 / p2> == -- test(first(p1)) -> L1; choice L1; <p1>; commit L2; L1: <p2>; L2: local test = codetestset(cs, cs1, e1) local pchoice = cs:addinstruction{Opcode.Choice, target = NOINST} cs:codegen(p1, emptyp2, test, fullset) local pcommit = cs:addinstruction{Opcode.Commit, target = NOINST} cs:jumptohere(pchoice) cs:jumptohere(test) cs:codegen(p2, opt, NOINST, fl) cs:jumptohere(pcommit) end end --[[ ** And predicate ** optimization: fixedlen(p) = n ==> <&p> == <p>; behind n ** (valid only when 'p' has no captures) ]]-- function codeand(cs, tree, tt) --[[ labeled failure: optimization disabled because in case of a failure it does not report the expected error position (the current subject position when begin the matching of <&p>) ]]-- local pchoice = cs:addinstruction{Opcode.PredChoice, target = NOINST} cs:codegen(tree, false, tt, fullset) local pcommit = cs:addinstruction{Opcode.BackCommit, target = NOINST} cs:jumptohere(pchoice) cs:addinstruction{Opcode.Fail} cs:jumptohere(pcommit) end --[[ ** Captures: if pattern has fixed (and not too big) length, and it ** has no nested captures, use a single IFullCapture instruction ** after the match; otherwise, enclose the pattern with OpenCapture - ** CloseCapture. ]]-- function codecapture(cs, tree, tt, fl) local len = cs:fixedlen(tree.sib1) if len >= 0 and len <= MAXOFF and not cs:hascaptures(tree.sib1) then cs:codegen(tree.sib1, false, tt, fl) cs:addinstruction{ Opcode.FullCapture, cap = tree.cap, key = tree.key, -- capture name aux = len, } else assert(tree.cap ~= nil) cs:addinstruction({ Opcode.OpenCapture, cap = tree.cap, key = tree.key, -- capture name }) cs:codegen(tree.sib1, false, tt, fl) cs:addinstruction({ Opcode.CloseCapture, cap = CapKind.close, }) end end function coderuntime(cs, tree, tt) cs:addinstruction({ Opcode.OpenCapture, cap = CapKind.group, key = tree.key, -- capture *function* }) cs:codegen(tree.sib1, false, tt, fullset) cs:addinstruction({ Opcode.CloseRunTime, cap = CapKind.close, }) end --[[ ** Repetition; optimizations: ** When pattern is a charset, can use special instruction ISpan. ** When pattern is head fail, or if it starts with characters that ** are disjoint from what follows the repetions, a simple test ** is enough (a fail inside the repetition would backtrack to fail ** again in the following pattern, so there is no need for a choice). ** When 'opt' is true, the repetion can reuse the Choice already ** active in the stack. ]]-- function coderep(cs, tree, opt, fl) local st = tocharset(tree) if st ~= nil then return cs:addinstruction{ Opcode.Span, set = st.set, rest = st.rest, } end local e1,st = cs:getfirst(tree, fullset) if cs:headfail(tree) or (e1 == 0 and cs_disjoint(st, fl)) then -- L1: test (fail(p1)) -> L2; <p>; jmp L1; L2: local test = codetestset(cs, st, 0) cs:codegen(tree, false, test, fullset) local jmp = cs:addinstruction{Opcode.Jmp, target = NOINST} cs:jumptohere(test) cs:jumptothere(jmp, test) else -- test(fail(p1)) -> L2; choice L2; L1: <p>; partialcommit L1; L2: -- or (if 'opt'): partialcommit L1; L1: <p>; partialcommit L1; local test = codetestset(cs, st, e1) local pchoice = NOINST if opt then cs:jumptohere(cs:addinstruction{Opcode.PartialCommit, target = NOINST}) else pchoice = cs:addinstruction{Opcode.Choice, target = NOINST} end local l2 = cs:gethere() cs:codegen(tree, false, NOINST, fullset) local commit = cs:addinstruction{Opcode.PartialCommit, target = NOINST} cs:jumptothere(commit, l2) cs:jumptohere(pchoice) cs:jumptohere(test) end end --[[ ** Not predicate; optimizations: ** In any case, if first test fails, 'not' succeeds, so it can jump to ** the end. If pattern is headfail, that is all (it cannot fail ** in other parts); this case includes 'not' of simple sets. Otherwise, ** use the default code (a choice plus a failtwice). ]]-- function codenot(cs, tree) local e,st = cs:getfirst(tree, fullset) local test = codetestset(cs, st, e) if cs:headfail(tree) then -- test (fail(p1)) -> L1; fail; L1: cs:addinstruction{Opcode.Fail} else -- test(fail(p))-> L1; choice L1; <p>; failtwice; L1: local pchoice = cs:addinstruction{Opcode.PredChoice, target = NOINST } cs:codegen(tree, false, NOINST, fullset) cs:addinstruction{Opcode.FailTwice} cs:jumptohere(pchoice) end cs:jumptohere(test) end -- find the final destination of a sequence of jumps function finaltarget(code, pc) while code[pc].code == Opcode.Jmp do pc = code[pc].target end return pc end -- final label (after traversing any jumps) function finallabel(code, pc) return finaltarget(code, code[pc].target) end --[[ ** change open calls to calls, using list 'positions' to find ** correct offsets; also optimize tail calls ]]-- function correctcalls(cs, positions, from, to) local code = cs.p.code for i=from,(to-1) do local op = code[i] if op.code == Opcode.OpenCall or op.code == Opcode.ThrowRec then local n = op.target -- rule number local rule = positions[n] -- rule position if rule == from or code[rule - 1].code == Opcode.Ret then -- sanity check! ok! else error("bad rule position") end if op.code == Opcode.OpenCall then if code[finaltarget(code, i+1)].code == Opcode.Ret then -- call; ret => tail call op:setCode(Opcode.Jmp) else op:setCode(Opcode.Call) -- open call no more end end op.target = rule end end end --[[ ** Code for a grammar: ** call L1; jmp L2; L1: rule 1; ret; rule 2; ret; ...; L2: ]]-- function codegrammar(cs, tree) local firstcall = cs:addinstruction{Opcode.Call, target = NOINST} -- call initial rule local jumptoend = cs:addinstruction{Opcode.Jmp, target = NOINST} -- jump to the end local start = cs:gethere() -- here starts the initial rule cs:jumptohere(firstcall) local positions = {} local rule = tree.sib1 for i=1,tree.n do local pattern = rule.sib1 positions[i] = cs:gethere() -- save rule position cs:codegen(rule.sib1, false, NOINST, fullset) -- code rule cs:addinstruction{Opcode.Ret} rule = rule.sib2 end if rule.tag ~= TTag.True then error("impossible") end cs:jumptohere(jumptoend) correctcalls(cs, positions, start, cs:gethere()) end function codecall(cs, tree) local rule = cs.grammar.ruletab[tree.key] assert(rule ~= nil) assert(rule.n ~= nil) return cs:addinstruction{ Opcode.OpenCall, -- to be corrected later target = rule.n -- rule number } end --[[ ** Code first child of a sequence ** (second child is called in-place to allow tail call) ** Return 'tt' for second child ]]-- function codeseq1(cs, p1, p2, tt, fl) assert(fl.tag == TTag.Set) if needfollow(p1) then local _, fl1 = cs:getfirst(p2, fl) -- p1 follow is p2 first cs:codegen(p1, false, tt, fl1) else -- use fullset as follow cs:codegen(p1, false, tt, fullset) end if cs:fixedlen(p1) ~= 0 then -- can 'p1' consume anything? return NOINST -- invalidate test else return tt -- else 'tt' still protects sib2 end end --[[ ** Main code-generation function: dispatch to auxiliar functions ** according to kind of tree. ('needfollow' should return true ** only for consructions that use 'fl'.) ]]-- --[[ ** code generation is recursive; 'opt' indicates that the code is being ** generated as the last thing inside an optional pattern (so, if that ** code is optional too, it can reuse the 'IChoice' already in place for ** the outer pattern). 'tt' points to a previous test protecting this ** code (or NOINST). 'fl' is the follow set of the pattern. ]]-- codegen = define_tree_visitor{ [TTag.Char] = function(tree, cs, opt, tt, fl) return codechar(cs, tree.n, tt) end, [TTag.Any] = function(tree, cs, opt, tt, fl) return cs:addinstruction{Opcode.Any} end, [TTag.Set] = function(tree, cs, opt, tt, fl) return codecharset(cs, tree, tt) end, [TTag.True] = function(tree, cs, opt, tt, fl) return -- do nothing end, [TTag.False] = function(tree, cs, opt, tt, fl) return cs:addinstruction{Opcode.Fail} end, [TTag.UTFR] = function(tree, cs, opt, tt, fl) return codeutfr(cs, tree) end, [TTag.Choice] = function(tree, cs, opt, tt, fl) return codechoice(cs, tree.sib1, tree.sib2, opt, fl) end, [TTag.Rep] = function(tree, cs, opt, tt, fl) return coderep(cs, tree.sib1, opt, fl) end, [TTag.Behind] = function(tree, cs, opt, tt, fl) return codebehind(cs, tree) end, [TTag.Not] = function(tree, cs, opt, tt, fl) return codenot(cs, tree.sib1) end, [TTag.And] = function(tree, cs, opt, tt, fl) return codeand(cs, tree.sib1, tt) end, [TTag.Capture] = function(tree, cs, opt, tt, fl) return codecapture(cs, tree, tt, fl) end, [TTag.RunTime] = function(tree, cs, opt, tt, fl) return coderuntime(cs, tree, tt) end, [TTag.Grammar] = function(tree, cs, opt, tt, fl) return cs:withGrammar(tree, codegrammar, cs, tree) end, [TTag.Call] = function(tree, cs, opt, tt, fl) return codecall(cs, tree) end, [TTag.Seq] = function(tree, cs, opt, tt, fl) tt = codeseq1(cs, tree.sib1, tree.sib2, tt, fl) -- code 'p1' return cs:codegen(tree.sib2, opt, tt, fl) -- tail call end, [TTag.Throw] = function(tree, cs, opt, tt, fl) return codethrow(cs, tree) end, ["assert"] = function(tree, cs, opt, tt, fl) assert(fl.tag == TTag.Set) assert(opt ~= 0) end, } register_fname("codegen", codegen) --[[ ** Optimize jumps and other jump-like instructions. ** * Update labels of instructions with labels to their final ** destinations (e.g., choice L1; ... L1: jmp L2: becomes ** choice L2) ** * Jumps to other instructions that do jumps become those ** instructions (e.g., jump to return becomes a return; jump ** to commit becomes a commit) ]]-- function peephole(cs) local code = cs.p.code local jmpswitch local switch = define_opcode_visitor{ -- instructions with labels [{Opcode.Choice, Opcode.Call, Opcode.Commit, Opcode.PartialCommit, Opcode.BackCommit, Opcode.TestChar, Opcode.TestSet, Opcode.TestAny}] = function(op, i) cs:jumptothere(i, finallabel(code, i)) end, [Opcode.Jmp] = function(op, i) local ft = finaltarget(code, i) jmpswitch(code[ft], i, ft) -- jumping to what? end, default = function() end, } jmpswitch = define_opcode_visitor{ -- instructions with unconditional implicit jumps [{Opcode.Ret,Opcode.Fail,Opcode.FailTwice,Opcode.End}] = function(op, i, ft) code[i]:setCode(code[ft].code) -- jump becomes that instruction end, -- instructions with unconditional explicit jumps [{Opcode.Commit, Opcode.PartialCommit, Opcode.BackCommit}] = function(op, i, ft) local fft = finallabel(code, ft) code[i]:setCode(code[ft].code) -- jump becomes that instruction cs:jumptothere(i, fft) -- with an optimized target switch(code[i], i) -- reoptimize the label end, default = function(op, i, ft) cs:jumptothere(i, ft) -- optimize label end, } for i=1,#code do switch(code[i], i) end end -- thread the instructions to speed up dispatch during execution function thread(cs) local code = cs.p.code for i=1,#code-1 do code[i].next = code[i+1] if code[i].target ~= nil then code[i].branch = code[code[i].target] end end end function compile(p) local compst = CompileState:new(p) p.code = {} assert(fullset.tag == TTag.Set) compst:codegen(p, false, NOINST, fullset) compst:addinstruction{Opcode.End} peephole(compst) thread(compst) return p.code end return { Instruction = Instruction, compile = compile, cs_clone = cs_clone, cs_complement = cs_complement, cs_diff = cs_diff, cs_intersection = cs_intersection, cs_union = cs_union, fixedlen = fixedlen, hascaptures = hascaptures, nofail = nofail, nullable = nullable, nullable_with_grammar = nullable_with_grammar, tocharset = tocharset, } end) register('llpeg.utf8util', function(myrequire) myrequire('strict') local utf8util = {} function utf8util.codepointAt(s, pos) local c1 = string.byte(s, pos) if c1 <= 0x7F then return c1, 1 end local c2 = string.byte(s, pos + 1) if c1 <= 0xDF then return ((c1 % 0x20 ) * 0x40) + (c2 % 0x40), 2 end local c3 = string.byte(s, pos + 2) if c1 <= 0xEF then return (((c1 % 0x10) * 0x40) + (c2 % 0x40)) * 0x40 + (c3 % 0x40), 3 end local c4 = string.byte(s, pos + 3) if c1 <= 0xF7 then return ((((c1 % 0x08) * 0x40) + (c2 % 0x40)) * 0x40 + (c3 % 0x40)) * 0x40 + (c4 % 0x40), 4 end error( "bad utf8" ) end -- same as utf8.offset in Lua 5.3 standard library function utf8util.offset(s, n, i) if n > 0 then error("unimplemented") end while n < 0 do i = i - 1 if i < 1 then return nil end local c = string.byte(s, i) if c < 0x80 or c > 0xBF then n = n + 1 end end return i end return utf8util end) register('llpeg.list', function(myrequire) local List = {} List.__index = List function List:new() return setmetatable({ n = 0 }, self) end function List:__len() return self.n end function List:push(val) local n = self.n + 1 self[n] = val self.n = n end function List:pop() local n = self.n assert(n > 0) local old = self[n] self[n] = nil self.n = n - 1 return old end function List:insert(pos, val) for i=self.n,pos,-1 do self[i+1] = self[i] end self[pos] = val self.n = self.n + 1 end return List end) register('llpeg.cap', function(myrequire) myrequire('strict') local compat = myrequire('advent.compat') local from = myrequire('llpeg.types').from local CapKind, _ = from(myrequire('llpeg.types'), { 'CapKind', }) local printcaplist, _ = from(myrequire('llpeg.print'), { 'printcaplist', }) local List = myrequire('llpeg.list') local MAXRECLEVEL = 200 local Capture = {} Capture.__index = Capture -- kind is CapKind of the capture -- bytePos is the subject position (in bytes) -- byteLen is the length of the capture (in bytes) -- extra is extra info (group name, arg index, etc) function Capture:new(kind, bytePos, byteLen, extra) assert(getmetatable(kind) == CapKind) return setmetatable({ kind = kind, bytePos = bytePos, byteLen = byteLen, extra = extra, }, self) end function Capture:__tostring() return string.format("Capture{kind=%s, pos=%d, len=%s, extra=%s}", self.kind, self.bytePos, self.byteLen, self.extra) end function Capture:isopencap() return self.byteLen == nil end -- true if c2 is (any number of levels) inside self function Capture:inside(c2) if self:isopencap() then return not c2:isclosecap() else return c2.bytePos < (self.bytePos + self.byteLen) end end function Capture:isclosecap() return self.kind == CapKind.close end --[[ ** Return the size of capture 'cap'. If it is an open capture, 'close' ** must be its corresponding close. ]]-- function Capture:size(close) if self:isopencap() then assert(close:isclosecap()) return close.bytePos - self.bytePos else return self.byteLen end end function CapKind:newCapture(bytePos, byteLen, extra) return Capture:new(self, bytePos, byteLen, extra) end local CapState = {} CapState.__index = CapState -- Capture cap: current capture -- Capture ocap: (original) capture list -- int ptop: index of last argument to 'match' -- string s: original string -- int valuecached: value stored in cache slot -- int reclevel: recursion level function CapState:new(captures, source, extraArgs) return setmetatable({ captures = captures, index = 1, source = source, valuecached = {}, reclevel = 0, extraArgs = extraArgs, }, self) end function CapState:cap() -- helper return self.captures[self.index] end function CapState:advance() -- helper local i = self.index local cap = self.captures[i] self.index = i + 1 return cap, i end function CapState:substr(start, len) -- helper return string.sub(self.source, start, start + len - 1) end function CapState:skipclose(head) if head:isopencap() then assert(self.captures[self.index]:isclosecap()) self.index = self.index + 1 end end function CapState:closesize(head) return head:size(self:cap()) end --[[ ** Go to the next capture at the same level ]]-- function CapState:nextcap() local cap = self:cap() if cap:isopencap() then -- must look for a close local n = 0 -- number of opens waiting a close while true do -- look for corresponding close self.index = self.index + 1 cap = self:cap() if cap:isopencap() then n = n + 1 elseif cap:isclosecap() then if n == 0 then break end n = n - 1 end end self.index = self.index + 1 -- skip last close (or entire single capture) else self.index = self.index + 1 while cap:inside(self:cap()) do self.index = self.index + 1 -- skip captures inside the current one end end end --[[ ** Goes back in a list of captures looking for an open capture ** corresponding to a close ]]-- function CapState:findopen(i) -- captures[i] is the close that we want to match assert(self.captures[i]:isclosecap()) local n = 0 -- number of closes waiting an open while i > 1 do i = i - 1 local cap = self.captures[i] if cap:isclosecap() then n = n + 1 -- one more open to skip elseif cap:isopencap() then if n == 0 then return cap,i end n = n - 1 end end error("couldn't find open") end --[[ ** Checks whether group 'grp' is visible to 'ref', that is, 'grp' is ** not nested inside a full capture that does not contain 'ref'. (We ** only need to care for full captures because the search at 'findback' ** skips open-end blocks; so, if 'grp' is nested in a non-full capture, ** 'ref' is also inside it.) To check this, we search backward for the ** inner full capture enclosing 'grp'. A full capture cannot contain ** non-full captures, so a close capture means we cannot be inside a ** full capture anymore. ]]-- function CapState:capvisible(igrp, ref) local i = igrp local grp = self.captures[igrp] while i > 1 do i = i - 1 local cap = self.captures[i] if cap:isclosecap() then return true -- can stop the search elseif cap:inside(grp) then -- is 'grp' inside cap? return cap:inside(ref) -- ok iff cap also contains ref end end return true -- 'grp' is not inside any capture end --[[ ** Try to find a named group capture with the name given; ** goes backward from 'i'. ]]-- function CapState:findback(name, i) if i == nil then i = self.index end local ref = self.captures[i] while i > 1 do i = i - 1 local cap = self.captures[i] if cap:isclosecap() or not cap:inside(ref) then if cap:isclosecap() then cap,i = self:findopen(i) end if cap.kind == CapKind.group and self:capvisible(i, ref) then if cap.extra == name then return cap,i end end end end error("back reference '"..name.."' not found") end function CapState:getcaptures() local result = List:new() while not self:cap():isclosecap() do self:pushcapture(result) end return result end function CapState:pushcapture(result) self.reclevel = self.reclevel + 1 if self.reclevel > MAXRECLEVEL then error("subcapture nesting too deep") end local cap = self.captures[self.index] assert(cap.kind.push ~= nil) local res = cap.kind.push(self, cap, result) self.reclevel = self.reclevel - 1 return res end -- helper functions for pushcapture --[[ ** Push on the Lua stack all values generated by nested captures inside ** the current capture. Returns number of values pushed. 'addextra' ** makes it push the entire match after all captured values. The ** entire match is pushed also if there are no other nested values, ** so the function never returns zero. ]]-- function CapState:pushnestedvalues(result, addextra) local head = self:advance() local n = 0 -- number of pushed subvalues -- repeat for all nested patterns while head:inside(self:cap()) do n = n + self:pushcapture(result) end if addextra or n == 0 then -- need extra? result:push(self:substr(head.bytePos, self:closesize(head))) n = n + 1 end self:skipclose(head) return n end --[[ ** Push only the first value generated by nested captures ]]-- function CapState:pushonenestedvalue(result) local n = self:pushnestedvalues(result, false) if n == 0 then result:push(nil) -- ensure there's exactly one value return 1 end while n > 1 do result:pop() -- pop extra values n = n - 1 end return n end -- visitor patterns for pushcapture function CapKind.position.push(capstate, cap, result) result:push(cap.bytePos) capstate.index = capstate.index + 1 return 1 end function CapKind.const.push(capstate, cap, result) result:push(cap.extra) capstate.index = capstate.index + 1 return 1 end function CapKind.arg.push(capstate, cap, result) local n = cap.extra if n > capstate.extraArgs.n then error(string.format("reference to absent extra argument #%d", n)) end result:push(capstate.extraArgs[n]) capstate.index = capstate.index + 1 return 1 end function CapKind.simple.push(capstate, cap, result) local k = capstate:pushnestedvalues(result, true) -- reorder so that the whole match is the first result, not the last local last = result:pop() result:insert(2 + #result - k, last) return k end -- missing a bunch --[[ ** Table capture: creates a new table and populates it with nested ** captures. ]]-- function CapKind.table.push(capstate, cap, result) -- aka tablecap local t = {} result:push(t) local head = capstate:advance() local n = 0 while head:inside(capstate:cap()) do cap = capstate:cap() if cap.kind == CapKind.group and cap.extra ~= nil then -- named group? capstate:pushonenestedvalue(result) t[cap.extra] = result:pop() -- move it into table else -- not a named group local k = capstate:pushcapture(result) for i=k,1,-1 do t[n + i] = result:pop() -- move it into table (indexed) end n = n + k end end capstate:skipclose(head) return 1 -- number of values pushed (only the table) end --[[ ** Table-query capture ]]-- function CapKind.query.push(capstate, cap, result) -- aka querycap capstate:pushonenestedvalue(result) local key = result:pop() local tbl = cap.extra assert(type(tbl) == "table") local val = tbl[key] if val ~= nil then result:push(val) return 1 else return 0 end end --[[ ** Fold capture ]]-- function CapKind.fold.push(capstate, cap, result) -- aka foldcap local f = cap.extra assert(type(f) == "function") local head = capstate:advance() if capstate:cap():isclosecap() then -- no nested captures? (large subject) error("no initial value for fold capture") end local args = List:new() local n = capstate:pushcapture(args) if n == 0 then -- nested captures with no values? error("no initial value for fold capture") end local accum = args[1] -- leave only one result for accumulator while head:inside(capstate:cap()) do args = List:new() args:push( accum ) -- put accumulator first n = capstate:pushcapture(args) -- get next capture's values accum = f(compat.unpack(args)) end capstate:skipclose(head) -- only accumulator left in result result:push(accum) return 1 end --[[ ** Function capture ]]-- CapKind["function"].push = function(capstate, cap, result) local f = cap.extra assert(type(f) == "function") local args = List:new() local n = capstate:pushnestedvalues(args, false) local r = compat.pack(f(compat.unpack(args))) for i=1,r.n do result:push(r[i]) end return r.n end --[[ ** Accumulator capture ]]-- function CapKind.acc.push(capstate, cap, result) -- aka accumulatorcap if #result == 0 then error("no previous value for accumulator capture") end local f = cap.extra assert(type(f) == "function") local prev = #result local args = List:new() args:push(result[prev]) local n = capstate:pushnestedvalues(args, false) result[prev] = f(compat.unpack(args)) return 0 -- did not add any extra value end --[[ ** Select capture ]]-- function CapKind.num.push(capstate, cap, result) -- aka numcap local idx = cap.extra -- value to select if idx == 0 then -- no values? capstate:nextcap() -- skip entire capture return 0 -- no value produced else local vals = List:new() local n = capstate:pushnestedvalues(vals, false) if n < idx then -- invalid index? error("no capture '"..idx.."'") else result:push(vals[idx]) return 1 end end end function CapState:runtimecap(closePos) local close = self.captures[closePos] local open,openPos = self:findopen(closePos) -- get open group capture assert(open.kind == CapKind.group) self.index = openPos -- set state to the open capture local args = List:new() args:push( self.source) -- original subject args:push( close.bytePos ) -- current position local n = self:pushnestedvalues(args, false) -- push nested captures local func = open.extra local funcRet = compat.pack(func(compat.unpack(args))) local res = closePos - openPos -- number of captures to be removed return res, funcRet end function CapKind.runtime.push(capstate, cap, result) -- aka runtimecap result:push(cap.extra) capstate.index = capstate.index + 1 return 1 end local MAXSTRCAPS = 10 --[[ ** Collect values from current capture into array 'cps'. Current ** capture must be Cstring (first call) or Csimple (recursive calls). ** (In first call, fills %0 with whole match for Cstring.) ** Returns number of elements in the array that were filled. ]]-- function CapState:getstrcaps(cps, n) local k = n n = n + 1 cps[k] = { isstring = true, -- get string value bytePos = self:cap().bytePos, -- starts here } local head = self:advance() while head:inside(self:cap()) do if n > MAXSTRCAPS then -- too many captures? self:nextcap() -- skip extra captures (will not need them) elseif self:cap().kind == CapKind.Simple then -- string? n = self:getstrcaps(cps, n) -- put info into array else cps[n] = { isstring = false, -- not a string cap = self.index, -- keep original capture } self:nextcap() n = n + 1 end end cps[k].endPos = head.bytePos + self:closesize(head) self:skipclose(head) return n end function CapState:addonestring(buffer, what) local cap = self:cap() if cap.kind == CapKind.string then -- add capture directly to buffer return stringcap(self, buffer) elseif cap.kind == CapKind.subst then -- add capture directly to buffer return substcap(self, buffer) elseif cap.kind == CapKind.acc then error("invalid context for an accumulator capture") end local result = List:new() local n = self:pushcapture(result) if n == 0 then return 0 end -- no values to add local val = result[1] -- take only one result (the first) if type(val) == "number" then val = tostring(val) elseif type(val) ~= "string" then error("invalid "..what.." value (a "..type(val)..")") end table.insert(buffer, val) return 1 end --[[ ** String capture: add result to buffer 'b' (instead of pushing ** it into the stack) ]]-- function stringcap(capstate, buffer) local fmt = capstate:cap().extra local cps = {} local n = capstate:getstrcaps(cps, 1) - 1 -- collect nested captures local sawEscape = false for _,c in compat.utf8codes(fmt) do if sawEscape then sawEscape = false if c < 48 or c > 57 then -- not followed by a digit table.insert(buffer, compat.utf8char(c)) else local l = 1 + c - 48 -- capture index (1-based) if l > n then error("invalid capture index ("..(l-1)..")") elseif cps[l].isstring then table.insert(buffer, capstate:substr(cps[l].bytePos, cps[l].endPos - cps[l].bytePos)) else -- go back to evaluate that nested capture local curr = capstate.index capstate.index = cps[l].cap if capstate:addonestring(buffer, "capture") == 0 then error("no values in capture index "..l) end capstate.index = curr end end elseif c ~= 37 then -- not a % escape? table.insert(buffer, compat.utf8char(c)) else sawEscape = true end end return 1 end --[[ ** Substitution capture: add result to buffer 'b' ]]-- function substcap(capstate, buffer) local head = capstate:advance() local curr = head.bytePos while head:inside(capstate:cap()) do local cap = capstate:cap() local nextPos = cap.bytePos local s = capstate:substr(curr, nextPos - curr) table.insert(buffer, s) -- add text up to capture if capstate:addonestring(buffer, "replacement") == 0 then -- no capture value, keep original text in final result curr = nextPos else -- continue after match local lastCap = capstate.captures[capstate.index - 1] curr = nextPos + cap:size(lastCap) end end -- add last piece of text local s = capstate:substr(curr, head.bytePos + capstate:closesize(head) - curr) table.insert(buffer, s) capstate:skipclose(head) end function CapKind.subst.push(capstate, cap, result) -- aka substcap local buffer = {} substcap(capstate, buffer) result:push(table.concat(buffer)) return 1 end function CapKind.string.push(capstate, cap, result) -- aka stringcap local buffer = {} stringcap(capstate, buffer) result:push(table.concat(buffer)) return 1 end function CapKind.group.push(capstate, cap, result) if cap.extra == nil then -- anonymous group? return capstate:pushnestedvalues(result, false) -- add all nested values else -- named group: add no values capstate:nextcap() return 0 end end --[[ ** Back-reference capture. Return number of values pushed. ]]-- function CapKind.backref.push(capstate, cap, result) local curr = capstate.index local _,i = capstate:findback(cap.extra) capstate.index = i local n = capstate:pushnestedvalues(result, false) capstate.index = curr + 1 return n end return { CapState = CapState, Capture = Capture, } end) register('llpeg.vm', function(myrequire) myrequire('strict') local compat = myrequire('advent.compat') local utf8util = myrequire('llpeg.utf8util') local from = myrequire('llpeg.types').from local CHARMAX, CapKind, Opcode, enum, _ = from(myrequire('llpeg.types'), { 'CHARMAX', 'CapKind', 'Opcode', 'enum', }) local CapState, Capture, __ = from(myrequire('llpeg.cap'), { 'CapState', 'Capture', }) local Instruction, ___ = from(myrequire('llpeg.code'), { 'Instruction', }) local printcaplist, ___ = from(myrequire('llpeg.print'), { 'printcaplist', }) local LFAIL = {} local InsidePred = enum{ OUTPRED = 0, INPRED = 1, } local Stack = {} Stack.__index = Stack -- Stack prev: previous entry in the stack -- int bytePos: saved position, or NULL for calls -- Instruction pc: saved instruction -- int caplevel -- int labenv -- for labeled failure -- bool predchoice -- for labeled failure function Stack:new(prev, bytePos, pc, caplevel, labenv, predchoice) return setmetatable({ prev = prev, bytePos = bytePos, pc = pc, caplevel = caplevel, labenv = labenv, predchoice = predchoice, }, self) end function Stack:__tostring() return string.format( "Stack{ bytePos=%d, caplevel=%d, labenv=%s, predchoice=%s }", self.bytePos, self.caplevel, self.labenv, self.predchoice ) end function Stack:print() local s = self while s ~= nil do print("Stack", s) s = s.prev end end local MatchResult = {} MatchResult.__index = MatchResult function MatchResult:new() return setmetatable({ labelf = nil, -- failure label sfail = -1, -- farthest failure }, self) end local State = {} State.__index = State function State:new(source, bytePos, ...) local giveup = Instruction:new{Opcode.Giveup} local insidepred = InsidePred.OUTPRED -- label environment is off inside predicates local stack = Stack:new(nil, bytePos, giveup, 0, insidepred, nil) local cp,cpLen if bytePos <= #source then cp, cpLen = utf8util.codepointAt(source, bytePos) else cp, cpLen = nil, nil end assert(bytePos ~= nil) return setmetatable({ source = source, -- the source string bytePos = bytePos, -- current position in the string, in bytes codepoint = cp, -- the codepoint at 'bytePos' in 'source' codepointLen = cpLen, -- the length of the codepoint at 'bytePos' stack = stack, -- top of stack captures = {}, -- list of captures captop = 1, -- point to first empty slot in captures (1-based) extraArgs = compat.pack(...), -- labeled failures: insidepred = insidepred, labelf = nil, -- failure label sfail = -1, -- farthest failure }, self) end function State:advance() return self:resetPosTo(self.bytePos + self.codepointLen) end function State:resetPosTo(newPos) assert(newPos ~= nil) self.bytePos = newPos local source = self.source if newPos <= #source then local cp, cpLen = utf8util.codepointAt(source, newPos) self.codepoint = cp self.codepointLen = cpLen return cp else self.codepoint = nil self.codepointLen = nil return nil end end function State:backtrack(n) local off = utf8util.offset(self.source, -n, self.bytePos) if off == nil then return false end -- can't backtrack that far! self:resetPosTo(off) return true end function State:updatefarthest(label) self.labelf = label if self.bytePos > self.sfail then self.sfail = self.bytePos end end function State:pushcapture(cap) self.captures[self.captop] = cap self.captop = self.captop + 1 end function State:fail() -- pattern failed, try to backtrack local lastStack repeat lastStack = self.stack self.stack = lastStack.prev until lastStack.bytePos ~= nil self:resetPosTo(lastStack.bytePos) self.captop = lastStack.caplevel self.insidepred = lastStack.labenv -- labeled failure return lastStack.pc:exec(self) end function State:giveup() local r = nil local lab = "fail" local errpos = self.sfail if self.labelf ~= nil and self.labelf ~= LFAIL then lab = self.labelf end return r, lab, errpos end function State:getcaptures() local results = {} if self.captures[1].kind == CapKind.close then -- are there any captures? return results -- no captures end return CapState:new(self.captures, self.source, self.extraArgs):getcaptures() end function Opcode.End:exec(state) state:pushcapture(CapKind.close:newCapture(state.bytePos, 0)) -- trim table to proper length while #state.captures > state.captop - 1 do table.remove(state.captures) end -- printcaplist(state.captures, #state.captures) -- for debugging local results = state:getcaptures() if #results == 0 then -- no captured values? return state.bytePos -- return only end position else return compat.unpack(results) end end function Opcode.Giveup:exec(state) return state:giveup() end function Opcode.Ret:exec(state) local lastStack = state.stack state.stack = lastStack.prev return lastStack.pc:exec(state) end function Opcode.Any:exec(state) if state.codepoint ~= nil then state:advance() return self.next:exec(state) end state:updatefarthest(LFAIL) return state:fail() end function Opcode.TestAny:exec(state) if state.codepoint ~= nil then return self.next:exec(state) else return self.branch:exec(state) end end function Opcode.UTFR:exec(state) local cp = state.codepoint if cp ~= nil and self.from <= cp and cp <= self.to then state:advance() return self.next:exec(state) end state:updatefarthest(LFAIL) return state:fail() end function Opcode.Char:exec(state) if state.codepoint == self.aux then state:advance() return self.next:exec(state) end state:updatefarthest(LFAIL) return state:fail() end function Opcode.TestChar:exec(state) if state.codepoint == self.aux then return self.next:exec(state) else return self.branch:exec(state) end end function Opcode.Set:exec(state) local cp = state.codepoint if cp ~= nil then if cp <= CHARMAX then if self.set[cp] then state:advance() return self.next:exec(state) end else if self.rest then state:advance() return self.next:exec(state) end end end state:updatefarthest(LFAIL) return state:fail() end function Opcode.TestSet:exec(state) local cp = state.codepoint if cp ~= nil then if cp <= CHARMAX then if self.set[cp] then return self.next:exec(state) end elseif self.rest then return self.next:exec(state) end end return self.branch:exec(state) end function Opcode.Behind:exec(state) local n = self.aux -- XXX SLOW self.aux is in *characters* not *bytes* if state:backtrack(n) then return self.next:exec(state) end state:updatefarthest(LFAIL) return state:fail() end function Opcode.Span:exec(state) local cp = state.codepoint while true do local match = false if cp ~= nil then if cp <= CHARMAX then if self.set[cp] then match = true end else if self.rest then match = true end end end if not match then break end cp = state:advance() end return self.next:exec(state) end function Opcode.Jmp:exec(state) return self.branch:exec(state) end function Opcode.Choice:exec(state) state.stack = Stack:new( state.stack, state.bytePos, self.branch, state.captop, state.insidepred ) return self.next:exec(state) end function Opcode.PredChoice:exec(state) state.stack = Stack:new( state.stack, state.bytePos, self.branch, state.captop, state.insidepred, true -- predchoice ) state.insidepred = InsidePred.INPRED return self.next:exec(state) end function Opcode.Call:exec(state) state.stack = Stack:new( state.stack, nil, self.next ) return self.branch:exec(state) end function Opcode.Commit:exec(state) state.stack = state.stack.prev return self.branch:exec(state) end function Opcode.PartialCommit:exec(state) local stack = state.stack stack.bytePos = state.bytePos stack.caplevel = state.captop return self.branch:exec(state) end function Opcode.BackCommit:exec(state) local stack = state.stack state.stack = stack.prev -- pop the stack state:resetPosTo(stack.bytePos) -- but reset the position to that stored state.insidepred = stack.labenv -- labeled failure state.captop = stack.caplevel return self.branch:exec(state) end function Opcode.Throw:exec(state) if state.insidepred == InsidePred.OUTPRED then state.labelf = self.key -- pop entire stack while state.stack.prev ~= nil do state.stack = state.stack.prev end else state.labelf = LFAIL -- pop until you read a 'predchoice' state while not state.stack.predchoice do state.stack = state.stack.prev end end state.sfail = state.bytePos return state:fail() end function Opcode.ThrowRec:exec(state) -- with recovery state.sfail = state.bytePos if state.insidepred == InsidePred.OUTPRED then state.labelf = self.key state.stack = Stack:new( state.stack, nil, self.next, state.captop ) return self.branch:exec(state) else state.labelf = LFAIL -- pop until you read a 'predchoice' state while not state.stack.predchoice do state.stack = state.stack.prev end return state:fail() end end function Opcode.FailTwice:exec(state) state.stack = state.stack.prev state:updatefarthest(LFAIL) return state:fail() end function Opcode.Fail:exec(state) state:updatefarthest(LFAIL) return state:fail() end function Opcode.CloseRunTime:exec(state) -- close the group state:pushcapture(self.cap:newCapture(state.bytePos, 0)) -- trim table to proper length while #state.captures > state.captop - 1 do table.remove(state.captures) end local cs = CapState:new(state.captures, state.source, state.extraArgs) local n, funcRet = cs:runtimecap(state.captop - 1) state.captop = state.captop - n -- remove nested captures -- resdyncaptures: resolve returned values in `funcRet` -- first argument false=fail, true=keep current pos, number=next position local firstArg = funcRet[1] if funcRet.n == 0 then firstArg = false -- returning void means we'll fail end if not firstArg then -- if it is falsey, discard rest of returned vals & fail state:updatefarthest(LFAIL) return state:fail() -- tail call elseif type(firstArg) == "boolean" then -- keep current position, nothing needs to be done else local npos = tonumber(firstArg) if npos < state.bytePos or npos > (1 + #(state.source)) then error("invalid position returned by match-time capture") end state:resetPosTo(npos) end -- push the rest of the funcRet values as new captures local n = funcRet.n - 1 -- number of new captures if n == 0 then -- no new captures? state.captop = state.captop - 1 -- remove open group else -- new captures, keep original open group -- add new captures + close group to 'capture' list -- adddyncaptures: assert(state.captures[state.captop - 1].kind == CapKind.group) assert(state.captures[state.captop - 1]:isopencap()) -- make group capture an anonymous group (this used to hold match-time f) state.captures[state.captop - 1].extra = nil for i=2,funcRet.n do -- add runtime captures state:pushcapture(CapKind.runtime:newCapture(state.bytePos, 0, funcRet[i])) end -- close group state:pushcapture(CapKind.close:newCapture(state.bytePos, 0)) end return self.next:exec(state) end local MAXLOP = 20 function findopen(captures, i, currPos) i = i - 1 -- check last captop local cap = captures[i] if (not cap:isopencap()) and cap.bytePos == currPos then return nil -- current one cannot be a full capture end -- else, look for an 'open' capture for j=1,MAXLOP do if cap:isopencap() then -- open capture? return cap,i -- that's the one to be closed elseif cap.kind == CapKind.close then return nil -- a full capture should not nest a non-full one end i = i - 1 if i<1 then break end cap = captures[i] end return nil -- not found within allowed search limit end function Opcode.CloseCapture:exec(state) -- if possible, turn capture into a full capture assert(state.captop > 1) local open,_ = findopen(state.captures, state.captop, state.bytePos) if open ~= nil then -- if possible, turn capture into a full capture open.byteLen = state.bytePos - open.bytePos else -- non-nil length to mark entry as closed state:pushcapture(self.cap:newCapture(state.bytePos, 0)) end return self.next:exec(state) end function Opcode.OpenCapture:exec(state) state:pushcapture(self.cap:newCapture( -- byteLen = nil marks entry as open state.bytePos, nil, self.key )) return self.next:exec(state) end function Opcode.FullCapture:exec(state) -- XXX SLOW: self.aux is in *characters* not *bytes* local nPos = utf8util.offset(state.source, -self.aux, state.bytePos) state:pushcapture(self.cap:newCapture( nPos, state.bytePos - nPos, self.key )) return self.next:exec(state) end function match(s, init, code, ...) local state = State:new(s, init, ...) return code[1]:exec(state) end return { match = match, } end) register('llpeg', function(myrequire) local VERSION = '0.0.1' local MAXSTACK = 400 -- maximum backtracking local MAXBEHIND = 255 -- maximum look-behind local MAXRULES = 1000 -- maximum number of rules local LPEG_COMPAT = true myrequire('strict') local compat = myrequire('advent.compat') local from = myrequire('llpeg.types').from local CHARMAX, CapKind, TTag, define, define_tree_visitor, metareg, newcharset, numsiblings, _ = from(myrequire('llpeg.types'), { 'CHARMAX', 'CapKind', 'TTag', 'define', 'define_tree_visitor', 'metareg', 'newcharset', 'numsiblings', }) local compile, cs_diff, cs_union, fixedlen, hascaptures, nofail, nullable, nullable_with_grammar, tocharset, __ = from(myrequire('llpeg.code'), { 'compile', 'cs_diff', 'cs_union', 'fixedlen', 'hascaptures', 'nofail', 'nullable', 'nullable_with_grammar', 'tocharset', }) local match, ___ = from(myrequire('llpeg.vm'), { 'match', }) local printpatt, printrepl, printtree, ____ = from(myrequire('llpeg.print'), { 'printpatt', 'printrepl', 'printtree', }) function checkint(v) if type(v) == 'string' then v = tonumber(v) end if type(v) ~= "number" then error("not a number") end return math.floor(v) end local is_pattern = define_type_visitor{ pattern = function() return true end, default = function() return false end, } local ptype = define_type_visitor{ pattern = function() return "pattern" end, default = function(v) return type(v) end, } function val2str(v) if type(v) == 'number' then return tostring(v) end if type(v) == 'string' then return v end return string.format("(a %s)", ptype(v)) end --[[ lpltree.c ]]-- function newtree(tag) local t = { tag = tag, code = nil, } setmetatable(t, metareg) return t end function newleaf(tag, n) return setmetatable({ tag = tag, code = nil, n = n, }, metareg) end function newroot1sib(tag, sib1) return setmetatable({ tag = tag, code = nil, sib1 = sib1, }, metareg) end function newroot2sib(tag, sib1, sib2) return setmetatable({ tag = tag, code = nil, sib1 = sib1, sib2 = sib2, }, metareg) end --[[ Build a sequence of #s nodes from the array 's' with the tag 'tag' ]]-- function fillseq(tag, s) if type(s) == 'number' then local len = checkint(s) s = setmetatable({}, {__len = function() return len end}) end if #s == 0 then return newleaf(tag, 0) end local i = #s local result = newleaf(tag, s[i]) while i > 1 do i = i - 1 result = newroot2sib( TTag.Seq, newleaf(tag, s[i]), result ) end return result end --[[ Numbers as patterns: 0 == true (always match); n == TAny repeated 'n' times; -n == not (TAny repeated 'n' times) ]]-- function numtree(n) n = checkint(n) if n == 0 then return newleaf(TTag.True) elseif n > 0 then return fillseq(TTag.Any, n) -- sequence of 'n' anys else return newroot1sib(TTag.Not, fillseq(TTag.Any, -n)) end end -- Convert value v to a pattern local getpatt = define_type_visitor{ ["string"] = function(s) if #s == 0 then return newleaf(TTag.True) -- always match if string is empty end local cp = {} for _,c in compat.utf8codes(s) do table.insert(cp, c) end return fillseq(TTag.Char, cp) end, ["number"] = function(n) return numtree(n) end, ["boolean"] = function(b) if b then return newleaf(TTag.True) else return newleaf(TTag.False) end end, ["function"] = function(f) return setmetatable({ tag = TTag.RunTime, code = nil, key = f, sib1 = newleaf(TTag.True), }, metareg) end, ["pattern"] = function(v) return v end, ["table"] = function(v) return newgrammar(v) end, default = function(v) error("Not a pattern") end, } -- labeled failure begin function newthrowleaf(label) return setmetatable({ tag = TTag.Throw, code = nil, sib2 = nil, -- no recovery rule associated (yet) key = label, }, metareg) end -- labeled failure end function lp_P(v) return getpatt(v) end --[[ ** sequence operator; optimizations: ** false x => false, x true => x, true x => x ** (cannot do x . false => false because x may have runtime captures) ]]-- function lp_seq(tree1, tree2) tree1 = getpatt(tree1) tree2 = getpatt(tree2) if tree1.tag == TTag.False or tree2.tag == TTag.True then -- false . x = false, x . true = x return tree1 elseif tree1.tag == TTag.True then -- true . x = x return tree2 else return newroot2sib(TTag.Seq, tree1, tree2) end end --[[ ** choice operator; optimizations: ** charset / charset => charset ** true / x => true, x / false => x, false / x => x ** (x / true is not equivalent to true) ]]-- function lp_choice(t1, t2) t1 = getpatt(t1) t2 = getpatt(t2) local t1c = tocharset(t1) local t2c = tocharset(t2) if t1c ~= nil and t2c ~= nil then local t = cs_union(t1c, t2c) return t elseif nofail(t1) or t2.tag == TTag.False then -- true / x => true, x / false => x return t1 elseif t1.tag == TTag.False then -- false / x => x return t2 else return newroot2sib(TTag.Choice, t1, t2) end end --[[ p^n ]]-- function lp_star(p, n) local tree1 = getpatt(p) n = checkint(n) if n >= 0 then -- seq tree1 (seq tree1 ... (seq tree1 (rep tree1))) if nullable(tree1) then error("loop body may accept empty string") end local tree = newroot1sib(TTag.Rep, tree1) while n > 0 do tree = newroot2sib(TTag.Seq, tree1, tree) n = n - 1 end return tree else -- choice (seq tree1 ... choice tree1 true ...) true n = -n local tree = newroot2sib( -- at most 1 TTag.Choice, tree1, newleaf(TTag.True) ) while n > 1 do tree = newroot2sib( -- at most (n-1) TTag.Seq, tree1, tree ) tree = newroot2sib(TTag.Choice, tree, newleaf(TTag.True)) n = n - 1 end return tree end end --[[ ** #p == &p ]]-- function lp_and(v) return newroot1sib(TTag.And, getpatt(v)) end --[[ ** -p == !p ]]-- function lp_not(v) return newroot1sib(TTag.Not, getpatt(v)) end --[[ ** [t1 - t2] == Seq (Not t2) t1 ** If t1 and t2 are charsets, make their difference. ]]-- function lp_sub(t1, t2) t1 = getpatt(t1) t2 = getpatt(t2) local t1c = tocharset(t1) local t2c = tocharset(t2) if t1c ~= nil and t2c ~= nil then return cs_diff(t1c, t2c) else return newroot2sib( TTag.Seq, newroot1sib(TTag.Not, t2), t1 ) end end --[[ A set with the given characters ]]-- function lp_set(s) local t = newcharset() local extra = nil for _,c in compat.utf8codes(s) do if c > CHARMAX then -- non ascii, we can't use charset for these local one = newleaf(TTag.Char, c) if extra == nil then extra = one else extra = newroot2sib(TTag.Choice, extra, one) end else t.set[c] = true end end if extra == nil then return t else return newroot2sib(TTag.Choice, t, extra) end end function lp_range(...) local t = newcharset() local extra = nil for _,v in ipairs{...} do if type(v) ~= "string" then error("argument must be string") else local first, second for _,c in compat.utf8codes(v) do if first == nil then first = c elseif second == nil then second = c else error("range must have two characters") end end if first == nil or second == nil then error("range must have two characters") end if first > second then if LPEG_COMPAT then -- ignore, just silently create an empty range else error("empty range") end elseif second <= CHARMAX then -- ascii range for i = first, second do t.set[i] = true end else local r = lp_utfr(first, second) if extra == nil then extra = r else extra = newroot2sib(TTag.Choice, extra, one) end end end end if extra == nil then return t else return newroot2sib(TTag.Choice, t, extra) end end function lp_utfr(from, to) from = checkint(from) to = checkint(to) if from > to then error("empty range") end if to > 0x10FFFF then error("invalid code point") end if to <= CHARMAX then -- ascii range? local t = newcharset() -- code it as a regular charset for i = from, to do t.set[i] = true end return t end -- multibyte utf-8 range return setmetatable({ tag = TTag.UTFR, code = nil, from = from, to = to, }, metareg) end --[[ Look-behind predicate ]]-- function lp_behind(v) local tree1 = getpatt(v) local n = fixedlen(tree1) if n < 0 then error("pattern may not have fixed length") end if hascaptures(tree1) then error("pattern has captures") end if n > MAXBEHIND then error("pattern too long to look behind") end return setmetatable({ tag = TTag.Behind, code = nil, sib1 = tree1, n = n, }, metareg) end --[[ labeled failure begin ]]-- --[[ ** Throws a label ]]-- local lp_throw = define_type_visitor{ [{"string","number"}] = newthrowleaf, default = function() error("not a string") end, } --[[ labeled failure end ]]-- --[[ ** Create a non-terminal ]]-- function lp_V(v) if v == nil then error("non-nil value expected") end return setmetatable({ tag = TTag.Call, code = nil, key = v, }, metareg) end --[[ ** Create a tree for a non-empty capture, with a body and ** optionally with an associated Lua value (at index 'labelidx' in the ** stack) ]]-- function capture_aux(capkind, patt, val) local t = newroot1sib(TTag.Capture, getpatt(patt)) t.cap = capkind t.key = val return t end function newemptycap(capkind, val) return capture_aux(capkind, newleaf(TTag.True), val) end --[[ ** Captures with syntax p / v ** (function capture, query capture, string capture, or number capture) ]]-- local divcapture_helper = define_type_visitor{ ["function"] = function(v, p) return capture_aux(CapKind["function"], p, v) end, ["table"] = function(v, p) return capture_aux(CapKind.query, p, v) end, ["string"] = function(v, p) return capture_aux(CapKind.string, p, v) end, ["number"] = function(v, p) v = checkint(v) if v < 0 or v > 65536 then error("invalid number") end return capture_aux(CapKind.num, p, v) end, default = function(v) error("unexpected "..ptype(v).." as 2nd operand to LPeg '/'") end, } function lp_divcapture(p, v) return divcapture_helper(v, p) -- dispatch on v end function lp_acccapture(p, v) return capture_aux(CapKind.acc, p, v) end -- the match for patt with the values from nested captures replacing their -- matches function lp_substcapture(patt) return capture_aux(CapKind.subst, patt) end -- a table with all captures from patt function lp_tablecapture(patt) return capture_aux(CapKind.table, patt) end -- the values produced by patt, optionally tagged with key function lp_groupcapture(patt, key) -- key can be nil return capture_aux(CapKind.group, patt, key) end -- folding capture (deprecated) function lp_foldcapture(patt, func) if type(func) ~= "function" then error("Bad function argument") end return capture_aux(CapKind.fold, patt, func) end -- the match for patt plus all captures made by patt function lp_simplecapture(patt) return capture_aux(CapKind.simple, patt) end -- the current position (matches the empty string) function lp_poscapture() return newemptycap(CapKind.position) end -- the value of the nth extra argument to lpeg.match (matches the empty string) function lp_argcapture(n) n = checkint(n) if n <= 0 or n > 65536 then error("invalid argument index") end return newemptycap(CapKind.arg, n) end -- the value produced by the previous group capture named `key` -- (matches the empty string) function lp_backref(key) return newemptycap(CapKind.backref, key) end -- Constant capture (matches the empty string) function lp_constcapture(...) local arg = compat.pack(...) if arg.n == 0 then -- no values? return newleaf(TTag.True) -- no capture else local i = arg.n local tree = newemptycap(CapKind.const, arg[i]) while i > 1 do i = i - 1 tree = newroot2sib( TTag.Seq, newemptycap(CapKind.const, arg[i]), tree ) end if arg.n == 1 then -- single constant capture return tree else -- create a group capture with all values return lp_groupcapture( tree ) end end end -- the returns of function applied to the captures of patt; the application -- is done at match time function lp_matchtime(patt, func) if type(func) ~= 'function' then error('not a function') end return setmetatable({ tag = TTag.RunTime, code = nil, key = func, sib1 = getpatt(patt), }, metareg) end --[[======================================================]]-- --[[ ** ====================================================== ** Grammar - Tree generation ** ====================================================== ]]-- --[[ ** push on the stack the index and the pattern for the ** initial rule of grammar at index 'arg' in the stack; ** also add that index into position table. ]]-- function getfirstrule(tbl) local first_name, first_rule first_name = tbl[1] -- is this the name of an initial rule? if type(first_name) == 'number' or type(first_name) == 'string' then first_rule = tbl[first_name] -- get associated rule else first_name,first_rule = 1,first_name end if not is_pattern(first_rule) then if first_rule == nil then error("grammar has no initial rule") else error(string.format("initial rule '%s' is not a pattern", first_name)) end end -- rule position (after TGrammar) -- return map from name to position, and from position to name return { [first_name] = 1 }, { first_name } end --[[ ** traverse grammar at index 'arg', pushing all its keys and patterns ** into the stack. Create a new table (before all pairs key-pattern) to ** collect all keys and their associated positions in the final tree ** (the "position table"). ** Return the number of rules and (in 'totalsize') the total size ** for the new tree. ]]-- function collectrules(tbl) -- find the first rule and put in position table local postab, rpostab = getfirstrule(tbl) -- collect and sort rule names (for repeatability) local names = {} for k,v in pairs(tbl) do if k == 1 or postab[k] == 1 then -- initial rule? -- skip the initial rules, it's already in the position table else table.insert(names, k) end end table.sort(names, function(a,b) return tostring(a) < tostring(b) end) -- fill out rule, name, and position maps for _,k in ipairs(names) do local v = tbl[k] if not is_pattern(v) then error("rule '" .. val2str(k) .. "' is not a pattern") end table.insert(rpostab, k) postab[k] = #rpostab end return postab, rpostab end function buildgrammar(g, tbl, postab, rpostab) local trees = {} for i,name in ipairs(rpostab) do local rule = setmetatable({ tag = TTag.Rule, code = nil, key = nil, -- will be fixed when rule is used n = i, -- rule number name = name, sib1 = tbl[name], -- pattern sib2 = nil, }, metareg) table.insert(trees, rule) g.ruletab[name] = rule end -- link up siblings for i = 1, #trees-1 do trees[i].sib2 = trees[i+1] end trees[#trees].sib2 = newleaf(TTag.True) -- finish list of rules g.sib1 = trees[1] end --[[ ** Check whether a tree has potential infinite loops ]]-- function checkloops(grammar, tree) local n = numsiblings[tree.tag] if tree.tag == TTag.Rep and nullable_with_grammar(tree.sib1, grammar) then return true elseif tree.tag == TTag.Grammar then return false -- sub-grammars already checked elseif n == 1 then return checkloops(grammar, tree.sib1) -- tail call elseif n == 2 then if checkloops(grammar, tree.sib1) then return true else return checkloops(grammar, tree.sib2) -- tail call end elseif n == 0 then return false else error("surprising number of siblings") end end --[[ ** Give appropriate error message for 'verifyrule'. If a rule appears ** twice in 'passed', there is path from it back to itself without ** advancing the subject. ]]-- function verifyerror(grammar, passed, npassed) local i, j for i = npassed,1,-1 do -- search for a repetition for j = i-1,1,-1 do if passed[i] == passed[j] then error(string.format("rule '%s' may be left recursive", val2str(passed[i]))) end end end error("too many left calls in grammar") end --[[ ** Check whether a rule can be left recursive; raise an error in that ** case; otherwise return 1 iff pattern is nullable. ** The return value is used to check sequences, where the second pattern ** is only relevant if the first is nullable. ** Parameter 'nb' works as an accumulator, to allow tail calls in ** choices. ('nb' true makes function returns true.) ** Parameter 'passed' is a list of already visited rules, 'npassed' ** counts the elements in 'passed'. ** Assume ktable at the top of the stack. ]]-- local verifyrule verifyrule = define_tree_visitor{ [{ TTag.Char, TTag.Set, TTag.Any, TTag.False, TTag.UTFR, TTag.Throw, -- labeled failure }] = function(tree, g, passed, n, nb) return nb -- cannot pass from here end, [{ TTag.True, TTag.Behind, -- look-behind cannot have calls }] = function(tree, g, passed, n, nb) return true end, [{ TTag.Not, TTag.And, TTag.Rep, }] = function(tree, g, passed, n, nb) return verifyrule(tree.sib1, g, passed, n, true) -- tail call end, [{ TTag.Capture, TTag.RunTime, TTag.XInfo, }] = function(tree, g, passed, n, nb) return verifyrule(tree.sib1, g, passed, n, nb) -- tail call end, [ TTag.Call ] = function(tree, g, passed, n, nb) local rule = g.ruletab[tree.key] -- look up rule return verifyrule(rule, g, passed, n, nb) -- tail call end, [ TTag.Seq ] = function(tree, g, passed, n, nb) -- only check 2nd child if first is nb if not verifyrule(tree.sib1, g, passed, n, false) then return nb else -- note that we don't propagate new npassed from 1st child return verifyrule(tree.sib2, g, passed, n, nb) -- tail call end end, [ TTag.Choice ] = function(tree, g, passed, n, nb) -- must check both children nb = verifyrule(tree.sib1, g, passed, n, nb) -- note that we don't propagate new npassed from 1st child return verifyrule(tree.sib2, g, passed, n, nb) -- tail call end, [ TTag.Rule ] = function(tree, g, passed, n, nb) if n >= MAXRULES then -- too many steps? return verifyerror(g, passed, n) -- error else passed[n+1] = tree.key -- add rule to path return verifyrule(tree.sib1, g, passed, n + 1, nb) -- tail call end end, [ TTag.Grammar ] = function(tree, g, passed, n, nb) return nullable(tree) -- sub-grammar cannot be left recursive end, } function verifygrammar(grammar) local passed = {} -- check left-recursive rules local rule = grammar.sib1 while rule.tag == TTag.Rule do if rule.key ~= nil then -- skip unused rules verifyrule(rule.sib1, grammar, passed, 0, false) end rule = rule.sib2 end if rule.tag ~= TTag.True then error("assertion failure") end -- check infinite loops inside rules rule = grammar.sib1 while rule.tag == TTag.Rule do if rule.key ~= nil then -- skip unused rules if checkloops(grammar, rule.sib1) then error("empty loop in rule '" .. val2str(rule.name) .. "'") end end rule = rule.sib2 end if rule.tag ~= TTag.True then error("assertion failure") end end --[[ ** Fix a TOpenCall into a TCall node, using table 'postable' to ** translate a key to its rule address in the tree. Raises an ** error if key does not exist. ]]-- function fixonecall(g, t, postab) local name = t.key local rule = g.ruletab[name] if t.tag == TTag.Call then if rule == nil then error(string.format("rule '%s' undefined in given grammar", val2str(name))) end -- unlike our upstream, we don't clone patterns when we build a grammar -- so we can't mutate this tree w/o possibly breaking any other grammars -- which might hold an alias of this call. So we don't distinguish -- Call and OpenCall and we don't mutate the tag here and -- don't link it up. However, we can mutate the Rule -- as those are not shared rule.key = name -- mark this as used elseif rule ~= nil then -- TTag.Throw -- As before, we can't mutate the tree rule.key = name -- mark this as used end end --[[ ** Transform left associative constructions into right ** associative ones, for sequence and choice; that is: ** (t11 + t12) + t2 => t11 + (t12 + t2) ** (t11 * t12) * t2 => t11 * (t12 * t2) ** (that is, Op (Op t11 t12) t2 => Op t11 (Op t12 t2)) ]]-- function correctassociativity (tree) local tag = tree.tag if tag ~= TTag.Choice and tag ~= TTag.Seq then error("impossible") end local t1 = tree.sib1 while t1.tag == tree.tag do local t11, t12 = t1.sib1, t1.sib2 local t2 = tree.sib2 -- don't mutate t1 in place as others may be keeping copies of it; -- mutating 'tree' in place is okay as we're not changing its semantics tree.sib1 = t11 tree.sib2 = newroot2sib(tag, t12, t2) t1 = tree.sib1 end return tree end --[[ ** Make final adjustments in a tree. Fix open calls in tree 't', ** making them refer to their respective rules or raising appropriate ** errors (if not inside a grammar). Correct associativity of associative ** constructions (making them right associative). Assume that tree's ** ktable is at the top of the stack (for error messages). ]]-- local finalfix_helper = define_tree_visitor{ [TTag.Grammar] = function(t) return t -- subgrammars were already fixed end, [TTag.Call] = function(t, g, postab) if g == nil then error("rule '" .. val2str(t.key) .. "' used outside a grammar") else return fixonecall(g, t, postab) end end, [TTag.Throw] = function(t, g, postab) if g ~= nil then return fixonecall(g, t, postab) end end, [{TTag.Seq, TTag.Choice}] = function(t, g, postab) return correctassociativity(t) end, default = function(t) return t end, } function finalfix(g, t, postab) finalfix_helper(t, g, postab) if t.tag == TTag.Grammar then return end local n = numsiblings[t.tag] if n == 1 then return finalfix(g, t.sib1, postab) -- tail call elseif n == 2 then finalfix(g, t.sib1, postab) return finalfix(g, t.sib2, postab) -- tail call elseif n == 0 then return else error("strange number of siblings") end end --[[ ** Give a name for the initial rule if it is not referenced ]]-- function initialrulename(grammar) if grammar.sib1.key == nil then -- initial rule is not referenced? grammar.sib1.key = grammar.sib1.name end end function newgrammar(tbl) local postab, rpostab = collectrules(tbl) local g = setmetatable({ tag = TTag.Grammar, code = nil, sib1 = nil, -- will fill this in n = #rpostab, -- number of rules ruletab = {}, -- map rule names to rules }, metareg) buildgrammar(g, tbl, postab, rpostab) finalfix(g, g.sib1, postab) initialrulename(g) verifygrammar(g) return g end --[[ ====================================================== ]]-- function prepcompile(p) finalfix(nil, p, {}) -- correct associativity return compile(p) end function lp_printtree(patt, c) local tree = getpatt(patt) if c then finalfix(nil, tree, {}) -- correct associativity end print("[]") -- for compatibility, this is a fake 'ktable' io.write(table.concat(printtree(tree, 0, {}))) end function lp_printcode(patt) local p = getpatt(patt) if p.code == nil then prepcompile(p) end print("[]") -- for compatibility, this is a fake 'ktable' io.write(table.concat(printpatt(p.code, {}))) end --[[ ** Get the initial position for the match, interpreting negative ** values from the end of the subject. Result is 1-based. ]]-- function initposition(ii, len) if ii > 0 then -- positive index? if ii <= len then -- inside the string? return ii -- return it (no correction to 0-base) else return len + 1 -- crop at the end end else -- negative index if (-ii) <= len then -- inside the string? return len + 1 - (-ii) -- return position from the end else return 1 end end end -- Main match function function lp_match(pattern, subject, init, ...) local p = getpatt(pattern) if p.code == nil then prepcompile(p) end local code = p.code if type(subject) ~= 'string' then error("subject is not a string") end local i if init == nil then i = 1 else i = initposition(checkint(init), #subject) end return match(subject, i, code, ...) end --[[ ** ====================================================== ** Library creation and functions not related to matching ** ====================================================== ]]-- function lp_setmax(lim) lim = 0 + lim -- convert to integer if lim <= 0 then error("out of range") end MAXSTACK = lim end local lp_type = define_type_visitor{ pattern = function() return "pattern" end, default = function() return nil end, } function lp_gc(p) p._code = nil end function createcat(charspec) local t = newcharset() for i=0,CHARMAX do -- XXX not unicode safe local s = compat.utf8char(i) if s:find(charspec) ~= nil then t.set[i] = true end end return t end function lp_locale(tbl) if tbl == nil then tbl = {} end tbl.alnum = createcat("%w") tbl.alpha = createcat("%a") tbl.cntrl = createcat("%c") tbl.digit = createcat("%d") tbl.graph = createcat("[%p%w]") -- printable except space tbl.lower = createcat("%l") tbl.print = createcat("%C") -- printable = "not a control character"? tbl.punct = createcat("%p") -- "printable but not space or alnum tbl.space = createcat("%s") tbl.upper = createcat("%u") tbl.xdigit = createcat("%x") return tbl end --[[ lpltree.c ]]-- metareg.__mul = lp_seq metareg.__add = lp_choice metareg.__pow = lp_star metareg.__gc = lp_gc metareg.__len = lp_and metareg.__div = lp_divcapture metareg.__mod = lp_acccapture metareg.__unm = lp_not metareg.__sub = lp_sub metareg.__tostring = printrepl local pattreg = { ptree = lp_printtree, pcode = lp_printcode, match = lp_match, B = lp_behind, V = lp_V, C = lp_simplecapture, Cc = lp_constcapture, Cmt = lp_matchtime, Cb = lp_backref, Carg = lp_argcapture, Cp = lp_poscapture, Cs = lp_substcapture, Ct = lp_tablecapture, Cf = lp_foldcapture, Cg = lp_groupcapture, P = lp_P, S = lp_set, R = lp_range, utfR = lp_utfr, locale = lp_locale, version = "LLPegLabel " .. VERSION, setmaxstack = lp_setmax, type = lp_type, T = lp_throw, -- labeled failure throw } metareg.__index = pattreg return pattreg end) local modules = {} modules['bit32'] = require('bit32') modules['string'] = require('string') modules['strict'] = {} modules['table'] = require('table') local function myrequire(name) if modules[name] == nil then modules[name] = true modules[name] = (builders[name])(myrequire) end return modules[name] end return myrequire('llpeg') end)()
Summary:
Please note that all contributions to Humanipedia may be edited, altered, or removed by other contributors. If you do not want your writing to be edited mercilessly, then do not submit it here.
You are also promising us that you wrote this yourself, or copied it from a public domain or similar free resource (see
Humanipedia:Copyrights
for details).
Do not submit copyrighted work without permission!
Cancel
Editing help
(opens in new window)
Templates used on this page:
Template:Error
(
edit
)
Template:Module other
(
edit
)
Template:Module rating
(
edit
)
Template:Ombox
(
edit
)
Template:Pp
(
edit
)
Template:Protection padlock
(
edit
)
Template:Sandbox other
(
edit
)
Template:Template link
(
edit
)
Template:Tl
(
edit
)
Module:Error
(
edit
)
Module:File link
(
edit
)
Module:Message box
(
edit
)
Module:Protection banner
(
edit
)
Module:String
(
edit
)
Module:User:Cscott/llpeg/doc
(
edit
)