--[[--------------------------------------------------------------------
  llex.lua: Lua 5.1 lexical analyzer in Lua
  This file is part of LuaSrcDiet, based on Yueliang material.
  Copyright (c) 2008 Kein-Hong Man <khman@users.sf.net>
  The COPYRIGHT file describes the conditions
  under which this software may be distributed.
  See the ChangeLog for more information.
----------------------------------------------------------------------]]
--[[--------------------------------------------------------------------
-- NOTES:
-- * This is a version of the native 5.1.x lexer from Yueliang 0.4.0,
--   with significant modifications to handle LuaSrcDiet's needs:
--   (1) llex.error is an optional error function handler
--   (2) seminfo for strings includes their delimiters and no
--       translation operations are performed on them
-- * ADDED shbang handling to support executable scripts
-- * NO localized decimal point replacement magic
-- * NO limit to number of lines
-- * NO support for compatible long strings (LUA_COMPAT_LSTR)
-- * Please read technotes.txt for more technical details.
----------------------------------------------------------------------]]
local base = _G
local string = require "string"
module "llex"
local find = string.find
local match = string.match
local sub = string.sub
----------------------------------------------------------------------
-- initialize keyword list, variables
----------------------------------------------------------------------

local kw = {}
for v in string.gmatch([[
and break do else elseif end false for function if in
local nil not or repeat return then true until while]], "%S+") do
  kw[v] = true
end

-- NOTE: see init() for module variables (externally visible):
--       tok, seminfo, tokln

local z,        -- source stream
      sourceid, -- name of source
      I,        -- position of lexer
      buff,     -- buffer for strings
      ln        -- line number
----------------------------------------------------------------------
-- add information to token listing
----------------------------------------------------------------------

local function addtoken(token, info)
  local i = #tok + 1
  tok[i] = token
  seminfo[i] = info
  tokln[i] = ln
end
----------------------------------------------------------------------
-- handles line number incrementation and end-of-line characters
----------------------------------------------------------------------

local function inclinenumber(i, is_tok)
  local sub = sub
  local old = sub(z, i, i)
  i = i + 1  -- skip '\n' or '\r'
  local c = sub(z, i, i)
  if (c == "\n" or c == "\r") and (c ~= old) then
    i = i + 1  -- skip '\n\r' or '\r\n'
    old = old..c
  end
  if is_tok then addtoken("TK_EOL", old) end
  ln = ln + 1
  I = i
  return i
end
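-- note (added): a "\r\n" or "\n\r" pair is consumed as a single line
-- ending, so mixed-EOL sources still produce one TK_EOL token and one
-- line count increment per physical line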
----------------------------------------------------------------------
-- initialize lexer for given source _z and source name _sourceid
----------------------------------------------------------------------

function init(_z, _sourceid)
  z = _z                        -- source
  sourceid = _sourceid          -- name of source
  I = 1                         -- lexer's position in source
  ln = 1                        -- line number
  tok = {}                      -- lexed token list*
  seminfo = {}                  -- lexed semantic information list*
  tokln = {}                    -- line numbers for messages*
                                -- (*) externally visible thru' module
  --------------------------------------------------------------------
  -- initial processing (shbang handling)
  --------------------------------------------------------------------
  local p, _, q, r = find(z, "^(#[^\r\n]*)(\r?\n?)")
  if p then                     -- skip first line
    I = I + #q
    addtoken("TK_COMMENT", q)
    if #r > 0 then inclinenumber(I, true) end
  end
end
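----------------------------------------------------------------------
-- usage sketch (added illustration, not part of the original module):
-- a single init() + llex() pass fills the externally visible lists
-- tok, seminfo and tokln; the source text and the "@sample.lua" name
-- below are made up for this example
--
--   local llex = require "llex"
--   llex.init("local x = 1 -- note\n", "@sample.lua")
--   llex.llex()
--   for i = 1, #llex.tok do
--     print(llex.tok[i], llex.tokln[i], llex.seminfo[i])
--   end
----------------------------------------------------------------------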
----------------------------------------------------------------------
-- returns a chunk name or id, no truncation for long names
----------------------------------------------------------------------

function chunkid()
  if sourceid and match(sourceid, "^[=@]") then
    return sub(sourceid, 2)  -- remove first char
  end
  return "[string]"
end
----------------------------------------------------------------------
-- formats error message and throws error
-- * a simplified version, does not report what token was responsible
----------------------------------------------------------------------

function errorline(s, line)
  local e = error or base.error
  e(string.format("%s:%d: %s", chunkid(), line or ln, s))
end
local errorline = errorline
------------------------------------------------------------------------
-- count separators ("=") in a long string delimiter
------------------------------------------------------------------------

local function skip_sep(i)
  local sub = sub
  local s = sub(z, i, i)
  i = i + 1
  local count = #match(z, "=*", i)  -- note, take the length
  i = i + count
  I = i
  return (sub(z, i, i) == s) and count or (-count) - 1
end
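-- note (added): for the text starting at position i, the return value is
-- the long-bracket level for a valid opener/closer, e.g. "[[" or "]]"
-- give 0 and "[=[" gives 1, while a lone "[" gives -1 (plain bracket
-- token) and anything like "[=x" gives -2 or less (invalid delimiter)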
----------------------------------------------------------------------
-- reads a long string or long comment
----------------------------------------------------------------------

local function read_long_string(is_str, sep)
  local i = I + 1                 -- skip 2nd '['
  local sub = sub
  local c = sub(z, i, i)
  if c == "\r" or c == "\n" then  -- string starts with a newline?
    i = inclinenumber(i)          -- skip it
  end
  local j = i
  while true do
    local p, q, r = find(z, "([\r\n%]])", i)  -- (long range)
    if not p then
      errorline(is_str and "unfinished long string" or
                "unfinished long comment")
    end
    i = p
    if r == "]" then              -- delimiter test
      if skip_sep(i) == sep then
        buff = sub(z, buff, I)
        I = I + 1                 -- skip 2nd ']'
        return buff
      end
      i = I
    else                          -- newline
      buff = buff.."\n"
      i = inclinenumber(i)
    end
  end--while
end
----------------------------------------------------------------------
-- reads a string
----------------------------------------------------------------------

local function read_string(del)
  local i = I
  local find = find
  local sub = sub
  while true do
    local p, q, r = find(z, "([\n\r\\\"\'])", i)  -- (long range)
    if p then
      if r == "\n" or r == "\r" then
        errorline("unfinished string")
      end
      i = p
      if r == "\\" then                 -- handle escapes
        i = i + 1
        r = sub(z, i, i)
        if r == "" then break end       -- (EOZ error)
        p = find("abfnrtv\n\r", r, 1, true)
        ------------------------------------------------------
        if p then                       -- special escapes
          if p > 7 then
            i = inclinenumber(i)
          else
            i = i + 1
          end
        ------------------------------------------------------
        elseif find(r, "%D") then       -- other non-digits
          i = i + 1
        ------------------------------------------------------
        else                            -- \xxx sequence
          local p, q, s = find(z, "^(%d%d?%d?)", i)
          i = q + 1
          if s + 1 > 256 then           -- UCHAR_MAX
            errorline("escape sequence too large")
          end
        ------------------------------------------------------
        end--if p
      else
        i = i + 1
        if r == del then                -- ending delimiter
          I = i
          return sub(z, buff, i - 1)    -- return string
        end
      end--if r
    else
      break                             -- (error)
    end--if p
  end--while
  errorline("unfinished string")
end
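-- note (added): the returned seminfo is the raw source slice including
-- both quote characters, so a token such as "a\065b" is kept verbatim
-- with its escape untranslated, matching note (2) in the header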
------------------------------------------------------------------------
-- main lexer function
------------------------------------------------------------------------

function llex()
  local find = find
  local match = match
  while true do--outer
    local i = I
    -- inner loop allows break to be used to nicely section tests
    while true do--inner
      ----------------------------------------------------------------
      local p, _, r = find(z, "^([_%a][_%w]*)", i)
      if p then
        I = i + #r
        if kw[r] then
          addtoken("TK_KEYWORD", r)     -- reserved word (keyword)
        else
          addtoken("TK_NAME", r)        -- identifier
        end
        break -- (continue)
      end
      ----------------------------------------------------------------
      local p, _, r = find(z, "^(%.?)%d", i)
      if p then                         -- numeral
        if r == "." then i = i + 1 end
        local _, q, r = find(z, "^%d*[%.%d]*([eE]?)", i)
        i = q + 1
        if #r == 1 then                 -- optional exponent
          if match(z, "^[%+%-]", i) then -- optional sign
            i = i + 1
          end
        end
        local _, q = find(z, "^[_%w]*", i)
        I = q + 1
        local v = sub(z, p, q)          -- string equivalent
        if not base.tonumber(v) then    -- handles hex test also
          errorline("malformed number")
        end
        addtoken("TK_NUMBER", v)
        break -- (continue)
      end
      ----------------------------------------------------------------
      local p, q, r, t = find(z, "^((%s)[ \t\v\f]*)", i)
      if p then
        if t == "\n" or t == "\r" then  -- newline
          inclinenumber(i, true)
        else
          I = q + 1                     -- whitespace
          addtoken("TK_SPACE", r)
        end
        break -- (continue)
      end
      ----------------------------------------------------------------
      local r = match(z, "^%p", i)
      if r then
        buff = i
        local p = find("-[\"\'.=<>~", r, 1, true)
        if p then
          -- two-level if block for punctuation/symbols
          --------------------------------------------------------
          if p <= 2 then
            if p == 1 then              -- minus
              local c = match(z, "^%-%-(%[?)", i)
              if c then
                i = i + 2
                local sep = -1
                if c == "[" then
                  sep = skip_sep(i)
                end
                if sep >= 0 then        -- long comment
                  addtoken("TK_LCOMMENT", read_long_string(false, sep))
                else                    -- short comment
                  I = find(z, "[\n\r]", i) or (#z + 1)
                  addtoken("TK_COMMENT", sub(z, buff, I - 1))
                end
                break -- (continue)
              end
              -- (fall through for "-")
            else                        -- [ or long string
              local sep = skip_sep(i)
              if sep >= 0 then
                addtoken("TK_LSTRING", read_long_string(true, sep))
              elseif sep == -1 then
                addtoken("TK_OP", "[")
              else
                errorline("invalid long string delimiter")
              end
              break -- (continue)
            end
          --------------------------------------------------------
          elseif p <= 5 then
            if p < 5 then               -- strings
              I = i + 1
              addtoken("TK_STRING", read_string(r))
              break -- (continue)
            end
            r = match(z, "^%.%.?%.?", i) -- .|..|... dots
            -- (fall through)
          --------------------------------------------------------
          else                          -- relational
            r = match(z, "^%p=?", i)
            -- (fall through)
          end
        end
        I = i + #r
        addtoken("TK_OP", r)            -- for other symbols, fall through
        break -- (continue)
      end
      ----------------------------------------------------------------
      local r = sub(z, i, i)
      if r ~= "" then
        I = i + 1
        addtoken("TK_OP", r)            -- other single-char tokens
        break
      end
      addtoken("TK_EOS", "")            -- end of stream,
      return                            -- exit here
      ----------------------------------------------------------------
    end--while inner
  end--while outer
end
return base.getfenv()