--[[--------------------------------------------------------------------
  llex.lua: Lua 5.1 lexical analyzer in Lua
  This file is part of LuaSrcDiet, based on Yueliang material.
  Copyright (c) 2008 Kein-Hong Man <khman@users.sf.net>
  The COPYRIGHT file describes the conditions
  under which this software may be distributed.
  See the ChangeLog for more information.
----------------------------------------------------------------------]]

--[[--------------------------------------------------------------------
-- NOTES:
-- * This is a version of the native 5.1.x lexer from Yueliang 0.4.0,
--   with significant modifications to handle LuaSrcDiet's needs:
--   (1) llex.error is an optional error function handler
--   (2) seminfo for strings includes their delimiters, and no
--       translation operations are performed on them
-- * shbang handling has been added to support executable scripts
-- * NO localized decimal point replacement magic
-- * NO limit to number of lines
-- * NO support for compatible long strings (LUA_COMPAT_LSTR)
-- * Please read technotes.txt for more technical details.
----------------------------------------------------------------------]]
local base = _G
local string = require "string"

module "llex"

local find = string.find
local match = string.match
local sub = string.sub
----------------------------------------------------------------------
-- initialize keyword list, variables
----------------------------------------------------------------------

local kw = {}
for v in string.gmatch([[
and break do else elseif end false for function if in
local nil not or repeat return then true until while]], "%S+") do
  kw[v] = true
end

-- NOTE: see init() for module variables (externally visible):
--       tok, seminfo, tokln

local z,        -- source stream
      sourceid, -- name of source
      I,        -- position of lexer
      buff,     -- buffer for strings
      ln        -- line number
----------------------------------------------------------------------
-- add information to token listing
----------------------------------------------------------------------

local function addtoken(token, info)
  local i = #tok + 1
  tok[i] = token
  seminfo[i] = info
  tokln[i] = ln
end
----------------------------------------------------------------------
-- handles line number incrementation and end-of-line characters
----------------------------------------------------------------------

local function inclinenumber(i, is_tok)
  local sub = sub
  local old = sub(z, i, i)
  i = i + 1  -- skip '\n' or '\r'
  local c = sub(z, i, i)
  if (c == "\n" or c == "\r") and (c ~= old) then
    i = i + 1  -- skip '\n\r' or '\r\n'
    old = old..c
  end
  if is_tok then addtoken("TK_EOL", old) end
  ln = ln + 1
  I = i
  return i
end
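
-- Descriptive note: all four line-end conventions are handled above --
-- a lone "\n" or "\r" as well as the two-character pairs "\r\n" and
-- "\n\r". Each counts as exactly one line, and when is_tok is set the
-- verbatim terminator is preserved as the TK_EOL token's seminfo.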
----------------------------------------------------------------------
-- initialize lexer for given source _z and source name _sourceid
----------------------------------------------------------------------

function init(_z, _sourceid)
  z = _z                        -- source
  sourceid = _sourceid          -- name of source
  I = 1                         -- lexer's position in source
  ln = 1                        -- line number
  tok = {}                      -- lexed token list*
  seminfo = {}                  -- lexed semantic information list*
  tokln = {}                    -- line numbers for messages*
                                -- (*) externally visible thru' module
  --------------------------------------------------------------------
  -- initial processing (shbang handling)
  --------------------------------------------------------------------
  local p, _, q, r = find(z, "^(#[^\r\n]*)(\r?\n?)")
  if p then                             -- skip first line
    I = I + #q
    addtoken("TK_COMMENT", q)
    if #r > 0 then inclinenumber(I, true) end
  end
end
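
-- A minimal usage sketch (the source text and chunk name here are
-- illustrative only): after init() and llex(), the token stream is
-- exposed through the module tables tok, seminfo and tokln noted above.
--
--   local llex = require "llex"
--   llex.init("local x = 1 -- demo", "@demo.lua")
--   llex.llex()
--   for i = 1, #llex.tok do
--     print(llex.tok[i], llex.seminfo[i], llex.tokln[i])
--   end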
----------------------------------------------------------------------
-- returns a chunk name or id, no truncation for long names
----------------------------------------------------------------------

function chunkid()
  if sourceid and match(sourceid, "^[=@]") then
    return sub(sourceid, 2)  -- remove first char
  end
  return "[string]"
end

----------------------------------------------------------------------
-- formats error message and throws error
-- * a simplified version, does not report what token was responsible
----------------------------------------------------------------------

function errorline(s, line)
  local e = error or base.error  -- llex.error is an optional handler
  e(string.format("%s:%d: %s", chunkid(), line or ln, s))
end
local errorline = errorline
------------------------------------------------------------------------
-- count separators ("=") in a long string delimiter
------------------------------------------------------------------------

local function skip_sep(i)
  local sub = sub
  local s = sub(z, i, i)
  i = i + 1
  local count = #match(z, "=*", i)  -- note, take the length
  i = i + count
  I = i
  return (sub(z, i, i) == s) and count or (-count) - 1
end
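
-- Return value convention (descriptive note): a non-negative result is
-- the long-bracket level, e.g. "[[" gives 0 and "[==[" gives 2; -1
-- means a lone "[" (a plain operator); anything below -1 marks a
-- malformed delimiter such as "[=" with no matching second bracket.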
----------------------------------------------------------------------
-- reads a long string or long comment
----------------------------------------------------------------------

local function read_long_string(is_str, sep)
  local i = I + 1                 -- skip 2nd '['
  local sub = sub
  local c = sub(z, i, i)
  if c == "\r" or c == "\n" then  -- string starts with a newline?
    i = inclinenumber(i)          -- skip it
  end
  local j = i
  while true do
    local p, q, r = find(z, "([\r\n%]])", i) -- (long range)
    if not p then
      errorline(is_str and "unfinished long string" or
                "unfinished long comment")
    end
    i = p
    if r == "]" then                    -- delimiter test
      if skip_sep(i) == sep then
        buff = sub(z, buff, I)
        I = I + 1                       -- skip 2nd ']'
        return buff
      end
      i = I
    else                                -- newline
      buff = buff.."\n"
      i = inclinenumber(i)
    end
  end--while
end
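
-- Descriptive note: buff is set to the token's start position in llex()
-- before this function runs, so the returned text is a single sub()
-- spanning the whole token, delimiters included (see NOTES (2) above).
-- The buff.."\n" concatenation looks like a vestige of Yueliang's
-- buffer-based lexer; it is harmless here because string.sub coerces
-- the resulting numeric string back to a position.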
----------------------------------------------------------------------
-- reads a string
----------------------------------------------------------------------

local function read_string(del)
  local i = I
  local find = find
  local sub = sub
  while true do
    local p, q, r = find(z, "([\n\r\\\"\'])", i) -- (long range)
    if p then
      if r == "\n" or r == "\r" then
        errorline("unfinished string")
      end
      i = p
      if r == "\\" then                         -- handle escapes
        i = i + 1
        r = sub(z, i, i)
        if r == "" then break end -- (EOZ error)
        p = find("abfnrtv\n\r", r, 1, true)
        ------------------------------------------------------
        if p then                               -- special escapes
          if p > 7 then
            i = inclinenumber(i)
          else
            i = i + 1
          end
        ------------------------------------------------------
        elseif find(r, "%D") then               -- other non-digits
          i = i + 1
        ------------------------------------------------------
        else                                    -- \xxx sequence
          local p, q, s = find(z, "^(%d%d?%d?)", i)
          i = q + 1
          if s + 1 > 256 then                   -- UCHAR_MAX
            errorline("escape sequence too large")
          end
        ------------------------------------------------------
        end--if p
      else
        i = i + 1
        if r == del then                        -- ending delimiter
          I = i
          return sub(z, buff, i - 1)            -- return string
        end
      end--if r
    else
      break -- (error)
    end--if p
  end--while
  errorline("unfinished string")
end
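
-- Descriptive note: as with long strings, the returned token text is
-- the raw source slice including the surrounding quotes, and escape
-- sequences are validated but never translated; a "\n" in the source
-- stays as the two characters '\' and 'n' in seminfo.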
------------------------------------------------------------------------
-- main lexer function
------------------------------------------------------------------------

function llex()
  local find = find
  local match = match
  while true do--outer
    local i = I
    -- inner loop allows break to be used to nicely section tests
    while true do--inner
      ----------------------------------------------------------------
      local p, _, r = find(z, "^([_%a][_%w]*)", i)
      if p then
        I = i + #r
        if kw[r] then
          addtoken("TK_KEYWORD", r)     -- reserved word (keyword)
        else
          addtoken("TK_NAME", r)        -- identifier
        end
        break -- (continue)
      end
      ----------------------------------------------------------------
      local p, _, r = find(z, "^(%.?)%d", i)
      if p then                         -- numeral
        if r == "." then i = i + 1 end
        local _, q, r = find(z, "^%d*[%.%d]*([eE]?)", i)
        i = q + 1
        if #r == 1 then                 -- optional exponent
          if match(z, "^[%+%-]", i) then -- optional sign
            i = i + 1
          end
        end
        local _, q = find(z, "^[_%w]*", i)
        I = q + 1
        local v = sub(z, p, q)          -- string equivalent
        if not base.tonumber(v) then    -- handles hex test also
          errorline("malformed number")
        end
        addtoken("TK_NUMBER", v)
        break -- (continue)
      end
      ----------------------------------------------------------------
      local p, q, r, t = find(z, "^((%s)[ \t\v\f]*)", i)
      if p then
        if t == "\n" or t == "\r" then  -- newline
          inclinenumber(i, true)
        else
          I = q + 1                     -- whitespace
          addtoken("TK_SPACE", r)
        end
        break -- (continue)
      end
      ----------------------------------------------------------------
      local r = match(z, "^%p", i)
      if r then
        buff = i
        -- p indexes the dispatch string: 1 '-', 2 '[', 3..4 quotes,
        -- 5 '.', 6..9 relational/assignment characters
        local p = find("-[\"\'.=<>~", r, 1, true)
        if p then
          -- two-level if block for punctuation/symbols
          --------------------------------------------------------
          if p <= 2 then
            if p == 1 then              -- minus
              local c = match(z, "^%-%-(%[?)", i)
              if c then
                i = i + 2
                local sep = -1
                if c == "[" then
                  sep = skip_sep(i)
                end
                if sep >= 0 then        -- long comment
                  addtoken("TK_LCOMMENT", read_long_string(false, sep))
                else                    -- short comment
                  I = find(z, "[\n\r]", i) or (#z + 1)
                  addtoken("TK_COMMENT", sub(z, buff, I - 1))
                end
                break -- (continue)
              end
              -- (fall through for "-")
            else                        -- [ or long string
              local sep = skip_sep(i)
              if sep >= 0 then
                addtoken("TK_LSTRING", read_long_string(true, sep))
              elseif sep == -1 then
                addtoken("TK_OP", "[")
              else
                errorline("invalid long string delimiter")
              end
              break -- (continue)
            end
          --------------------------------------------------------
          elseif p <= 5 then
            if p < 5 then               -- strings
              I = i + 1
              addtoken("TK_STRING", read_string(r))
              break -- (continue)
            end
            r = match(z, "^%.%.?%.?", i) -- .|..|... dots
            -- (fall through)
          --------------------------------------------------------
          else                          -- relational
            r = match(z, "^%p=?", i)
            -- (fall through)
          end
        end
        I = i + #r
        addtoken("TK_OP", r)  -- for other symbols, fall through
        break -- (continue)
      end
      ----------------------------------------------------------------
      local r = sub(z, i, i)
      if r ~= "" then
        I = i + 1
        addtoken("TK_OP", r)            -- other single-char tokens
        break
      end
      addtoken("TK_EOS", "")            -- end of stream,
      return                            -- exit here
      ----------------------------------------------------------------
    end--while inner
  end--while outer
end
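
-- Token types produced by llex(), for reference: TK_KEYWORD, TK_NAME,
-- TK_NUMBER, TK_STRING, TK_LSTRING, TK_COMMENT, TK_LCOMMENT, TK_EOL,
-- TK_SPACE, TK_OP and TK_EOS. Unlike the stock Lua lexer, whitespace,
-- comments and line endings are kept as tokens, which is what allows
-- LuaSrcDiet to reconstruct or shrink the original source text.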
return base.getfenv()