--[[--------------------------------------------------------------------

  optlex.lua: does lexer-based optimizations
  This file is part of LuaSrcDiet.

  Copyright (c) 2008 Kein-Hong Man <khman@users.sf.net>
  The COPYRIGHT file describes the conditions
  under which this software may be distributed.

  See the ChangeLog for more information.

----------------------------------------------------------------------]]

--[[--------------------------------------------------------------------
-- NOTES:
-- * For more lexer-based optimization ideas, see the TODO items or
--   look at technotes.txt.
-- * TODO: general string delimiter conversion optimizer
-- * TODO: (numbers) warn if overly significant digit
----------------------------------------------------------------------]]
local base = _G
local string = require "string"
module "optlex"
local match = string.match
local sub = string.sub
local find = string.find
local rep = string.rep
local print

------------------------------------------------------------------------
-- variables and data structures
------------------------------------------------------------------------

-- error function, can override by setting own function into module
error = base.error

warn = {}                       -- table for warning flags

local stoks, sinfos, stoklns    -- source lists

local is_realtoken = {          -- significant (grammar) tokens
  TK_KEYWORD = true,
  TK_NAME = true,
  TK_NUMBER = true,
  TK_STRING = true,
  TK_LSTRING = true,
  TK_OP = true,
  TK_EOS = true,
}
local is_faketoken = {          -- whitespace (non-grammar) tokens
  TK_COMMENT = true,
  TK_LCOMMENT = true,
  TK_EOL = true,
  TK_SPACE = true,
}

local opt_details               -- for extra information

------------------------------------------------------------------------
-- true if current token is at the start of a line
-- * skips over deleted tokens via recursion
------------------------------------------------------------------------
local function atlinestart(i)
  local tok = stoks[i - 1]
  if i <= 1 or tok == "TK_EOL" then
    return true
  elseif tok == "" then
    return atlinestart(i - 1)
  end
  return false
end

------------------------------------------------------------------------
-- true if current token is at the end of a line
-- * skips over deleted tokens via recursion
------------------------------------------------------------------------
local function atlineend(i)
  local tok = stoks[i + 1]
  if i >= #stoks or tok == "TK_EOL" or tok == "TK_EOS" then
    return true
  elseif tok == "" then
    return atlineend(i + 1)
  end
  return false
end

------------------------------------------------------------------------
-- counts comment EOLs inside a long comment
-- * in order to keep line numbering, EOLs need to be reinserted
------------------------------------------------------------------------
local function commenteols(lcomment)
  local sep = #match(lcomment, "^%-%-%[=*%[")
  local z = sub(lcomment, sep + 1, -(sep - 1))  -- remove delims
  local i, c = 1, 0
  while true do
    local p, q, r, s = find(z, "([\r\n])([\r\n]?)", i)
    if not p then break end             -- if no matches, done
    i = p + 1
    c = c + 1
    if #s > 0 and r ~= s then           -- skip CRLF or LFCR
      i = i + 1
    end
  end
  return c
end
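
-- e.g. commenteols("--[[one\r\ntwo\nthree]]") returns 2: one count per
-- embedded EOL, with a CRLF (or LFCR) pair counted as a single EOL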

------------------------------------------------------------------------
-- compares two tokens (i, j) and returns the whitespace required
-- * important! see technotes.txt for more information
-- * only two grammar/real tokens are being considered
-- * if "", no separation is needed
-- * if " ", then at least one whitespace (or EOL) is required
------------------------------------------------------------------------
local function checkpair(i, j)
  local match = match
  local t1, t2 = stoks[i], stoks[j]
  --------------------------------------------------------------------
  if t1 == "TK_STRING" or t1 == "TK_LSTRING" or
     t2 == "TK_STRING" or t2 == "TK_LSTRING" then
    return ""
  --------------------------------------------------------------------
  elseif t1 == "TK_OP" or t2 == "TK_OP" then
    if (t1 == "TK_OP" and (t2 == "TK_KEYWORD" or t2 == "TK_NAME")) or
       (t2 == "TK_OP" and (t1 == "TK_KEYWORD" or t1 == "TK_NAME")) then
      return ""
    end
    if t1 == "TK_OP" and t2 == "TK_OP" then
      -- for TK_OP/TK_OP pairs, see notes in technotes.txt
      local op, op2 = sinfos[i], sinfos[j]
      if (match(op, "^%.%.?$") and match(op2, "^%.")) or
         (match(op, "^[~=<>]$") and op2 == "=") or
         (op == "[" and (op2 == "[" or op2 == "=")) then
        return " "
      end
      return ""
    end
    -- "TK_OP" + "TK_NUMBER" case
    local op = sinfos[i]
    if t2 == "TK_OP" then op = sinfos[j] end
    if match(op, "^%.%.?%.?$") then
      return " "
    end
    return ""
  --------------------------------------------------------------------
  else -- "TK_KEYWORD" | "TK_NAME" | "TK_NUMBER"
    return " "
  --------------------------------------------------------------------
  end
end
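
-- illustration of checkpair's decisions (token pair -> separation):
--   ")" "local"  -> ""   (an operator can abut a keyword or name)
--   ".." "."     -> " "  (otherwise they would lex as "...")
--   "<" "="      -> " "  (otherwise they would lex as "<=")
--   "end" "if"   -> " "  (keyword/name/number pairs never abut)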

------------------------------------------------------------------------
-- repack tokens, removing deletions caused by optimization process
------------------------------------------------------------------------
local function repack_tokens()
  local dtoks, dinfos, dtoklns = {}, {}, {}
  local j = 1
  for i = 1, #stoks do
    local tok = stoks[i]
    if tok ~= "" then
      dtoks[j], dinfos[j], dtoklns[j] = tok, sinfos[i], stoklns[i]
      j = j + 1
    end
  end
  stoks, sinfos, stoklns = dtoks, dinfos, dtoklns
end

------------------------------------------------------------------------
-- number optimization
-- * optimization using string formatting functions is one way of doing
--   this, but here, we consider all cases and handle them separately
--   (possibly an idiotic approach...)
-- * scientific notation being generated is not in canonical form, this
--   may or may not be a bad thing, feedback welcome
-- * note: intermediate portions need to fit into a normal number range
-- * optimizations can be divided based on number patterns:
-- * hexadecimal:
--   (1) no need to remove leading zeros, just skip to (2)
--   (2) convert to integer if size equal or smaller
--       * change if equal size -> lose the 'x' to reduce entropy
--   (3) number is then processed as an integer
--   (4) note: does not make 0[xX] consistent
-- * integer:
--   (1) note: includes anything with trailing ".", ".0", ...
--   (2) remove useless fractional part, if present, e.g. 123.000
--   (3) remove leading zeros, e.g. 000123
--   (4) switch to scientific if shorter, e.g. 123000 -> 123e3
-- * with fraction:
--   (1) split into digits dot digits
--   (2) if no integer portion, take as zero (can omit later)
--   (3) handle degenerate .000 case, after which the fractional part
--       must be non-zero (if zero, it's matched as an integer)
--   (4) remove trailing zeros for fractional portion
--   (5) p.q where p > 0 and q > 0 cannot be shortened any more
--   (6) otherwise p == 0 and the form is .q, e.g. .000123
--   (7) if scientific shorter, convert, e.g. .000123 -> 123e-6
-- * scientific:
--   (1) split into (digits dot digits) [eE] ([+-] digits)
--   (2) if significand has ".", shift it out so it becomes an integer
--   (3) if significand is zero, just use zero
--   (4) remove leading zeros for significand
--   (5) shift out trailing zeros for significand
--   (6) examine exponent and determine which format is best:
--       integer, with fraction, scientific
------------------------------------------------------------------------
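-- illustrative before -> after pairs, worked out by hand from the rules
-- above (not exhaustive):
--   0x0C     -> 12       (hex converted to a shorter decimal integer)
--   000123   -> 123      (leading zeros removed)
--   123000   -> 123e3    (scientific form is shorter)
--   123.000  -> 123      (useless fractional part removed)
--   .000123  -> 123e-6   (fraction converted to scientific)
--   1234e-5  -> .01234   (scientific converted to fraction)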
local function do_number(i)
  local before = sinfos[i]              -- 'before'
  local z = before                      -- working representation
  local y                               -- 'after', if better
  --------------------------------------------------------------------
  if match(z, "^0[xX]") then            -- hexadecimal number
    local v = base.tostring(base.tonumber(z))
    if #v <= #z then
      z = v                             -- change to integer, AND continue
    else
      return                            -- no change; stick to hex
    end
  end
  --------------------------------------------------------------------
  if match(z, "^%d+%.?0*$") then        -- integer or has useless frac
    z = match(z, "^(%d+)%.?0*$")        -- int portion only
    if z + 0 > 0 then
      z = match(z, "^0*([1-9]%d*)$")    -- remove leading zeros
      local v = #match(z, "0*$")
      local nv = base.tostring(v)
      if v > #nv + 1 then               -- scientific is shorter
        z = sub(z, 1, #z - v).."e"..nv
      end
      y = z
    else
      y = "0"                           -- basic zero
    end
  --------------------------------------------------------------------
  elseif not match(z, "[eE]") then      -- number with fraction part
    local p, q = match(z, "^(%d*)%.(%d+)$")  -- split
    if p == "" then p = 0 end           -- int part zero
    if q + 0 == 0 and p == 0 then
      y = "0"                           -- degenerate .000 case
    else
      -- now, q > 0 holds and p is a number
      local v = #match(q, "0*$")        -- remove trailing zeros
      if v > 0 then
        q = sub(q, 1, #q - v)
      end
      -- if p > 0, nothing else we can do to simplify p.q case
      if p + 0 > 0 then
        y = p.."."..q
      else
        y = "."..q                      -- tentative, e.g. .000123
        local v = #match(q, "^0*")      -- # leading zeros
        local w = #q - v                -- # significant digits
        local nv = base.tostring(#q)
        -- e.g. compare 123e-6 versus .000123
        if w + 2 + #nv < 1 + #q then
          y = sub(q, -w).."e-"..nv
        end
      end
    end
  --------------------------------------------------------------------
  else                                  -- scientific number
    local sig, ex = match(z, "^([^eE]+)[eE]([%+%-]?%d+)$")
    ex = base.tonumber(ex)
    -- if got ".", shift out fractional portion of significand
    local p, q = match(sig, "^(%d*)%.(%d*)$")
    if p then
      ex = ex - #q
      sig = p..q
    end
    if sig + 0 == 0 then
      y = "0"                           -- basic zero
    else
      local v = #match(sig, "^0*")      -- remove leading zeros
      sig = sub(sig, v + 1)
      v = #match(sig, "0*$")            -- shift out trailing zeros
      if v > 0 then
        sig = sub(sig, 1, #sig - v)
        ex = ex + v
      end
      -- examine exponent and determine which format is best
      local nex = base.tostring(ex)
      if ex == 0 then                   -- it's just an integer
        y = sig
      elseif ex > 0 and (ex <= 1 + #nex) then      -- a plain number
        y = sig..rep("0", ex)
      elseif ex < 0 and (ex >= -#sig) then         -- fraction, e.g. .123
        v = #sig + ex
        y = sub(sig, 1, v).."."..sub(sig, v + 1)
      elseif ex < 0 and (#nex >= -ex - #sig) then
        -- e.g. compare 1234e-5 versus .01234
        -- gives: #sig + 1 + #nex >= 1 + (-ex - #sig) + #sig
        --     -> #nex >= -ex - #sig
        v = -ex - #sig
        y = "."..rep("0", v)..sig
      else                              -- non-canonical scientific representation
        y = sig.."e"..ex
      end
    end--if sig
  end
  --------------------------------------------------------------------
  if y and y ~= sinfos[i] then
    if opt_details then
      print("<number> (line "..stoklns[i]..") "..sinfos[i].." -> "..y)
      opt_details = opt_details + 1
    end
    sinfos[i] = y
  end
end

------------------------------------------------------------------------
-- string optimization
-- * note: works on well-formed strings only!
-- * optimizations on characters can be summarized as follows:
--   \a\b\f\n\r\t\v -- no change
--   \\             -- no change
--   \"\'           -- depends on delim, other can remove \
--   \[\]           -- remove \
--   \<char>        -- general escape, remove \
--   \<eol>         -- normalize the EOL only
--   \ddd           -- if \a\b\f\n\r\t\v, change to latter
--                     if other < ascii 32, keep ddd but zap leading zeros
--                     if >= ascii 32, translate it into the literal,
--                     then also do escapes for \\,\",\' cases
--   <other>        -- no change
-- * switch delimiters if string becomes shorter
------------------------------------------------------------------------
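-- illustrative rewrites inside a double-quoted string, per the table
-- above:
--   \65  -> A     (printable \ddd becomes the literal character)
--   \9   -> \t    (control code with a named escape)
--   \q   -> q     (unrecognized escape, backslash dropped)
--   \'   -> '     (quote that is not the delimiter)
--   \"   stays    (quote that is the delimiter)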
local function do_string(I)
  local info = sinfos[I]
  local delim = sub(info, 1, 1)                 -- delimiter used
  local ndelim = (delim == "'") and '"' or "'"  -- opposite " <-> '
  local z = sub(info, 2, -2)                    -- actual string
  local i = 1
  local c_delim, c_ndelim = 0, 0                -- " / ' counts
  --------------------------------------------------------------------
  while i <= #z do
    local c = sub(z, i, i)
    ----------------------------------------------------------------
    if c == "\\" then                   -- escaped stuff
      local j = i + 1
      local d = sub(z, j, j)
      local p = find("abfnrtv\\\n\r\"\'0123456789", d, 1, true)
      ------------------------------------------------------------
      if not p then                     -- \<char> -- remove \
        z = sub(z, 1, i - 1)..sub(z, j)
        i = i + 1
      ------------------------------------------------------------
      elseif p <= 8 then                -- \a\b\f\n\r\t\v\\
        i = i + 2                       -- no change
      ------------------------------------------------------------
      elseif p <= 10 then               -- \<eol> -- normalize EOL
        local eol = sub(z, j, j + 1)
        if eol == "\r\n" or eol == "\n\r" then
          z = sub(z, 1, i).."\n"..sub(z, j + 2)
        elseif p == 10 then             -- \r case
          z = sub(z, 1, i).."\n"..sub(z, j + 1)
        end
        i = i + 2
      ------------------------------------------------------------
      elseif p <= 12 then               -- \"\' -- remove \ for ndelim
        if d == delim then
          c_delim = c_delim + 1
          i = i + 2
        else
          c_ndelim = c_ndelim + 1
          z = sub(z, 1, i - 1)..sub(z, j)
          i = i + 1
        end
      ------------------------------------------------------------
      else                              -- \ddd -- various steps
        local s = match(z, "^(%d%d?%d?)", j)
        j = i + 1 + #s                  -- skip to location
        local cv = s + 0
        local cc = string.char(cv)
        local p = find("\a\b\f\n\r\t\v", cc, 1, true)
        if p then                       -- special escapes
          s = "\\"..sub("abfnrtv", p, p)
        elseif cv < 32 then             -- normalized \ddd
          s = "\\"..cv
        elseif cc == delim then         -- \<delim>
          s = "\\"..cc
          c_delim = c_delim + 1
        elseif cc == "\\" then          -- \\
          s = "\\\\"
        else                            -- literal character
          s = cc
          if cc == ndelim then
            c_ndelim = c_ndelim + 1
          end
        end
        z = sub(z, 1, i - 1)..s..sub(z, j)
        i = i + #s
      ------------------------------------------------------------
      end--if p
    ----------------------------------------------------------------
    else -- c ~= "\\"                   -- <other> -- no change
      i = i + 1
      if c == ndelim then               -- count ndelim for delimiter switch
        c_ndelim = c_ndelim + 1
      end
    ----------------------------------------------------------------
    end--if c
  end--while
  --------------------------------------------------------------------
  -- switching delimiters, a long-winded derivation:
  -- (1) delim takes 2+2*c_delim bytes, ndelim takes c_ndelim bytes
  -- (2) delim becomes c_delim bytes, ndelim becomes 2+2*c_ndelim bytes
  -- simplifying the condition (1)>(2) --> c_delim > c_ndelim
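  -- e.g. 'don\'t won\'t' has c_delim = 2 and c_ndelim = 0, so it is
  -- rewritten as "don't won't", dropping both backslashes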
  if c_delim > c_ndelim then
    i = 1
    while i <= #z do
      local p, q, r = find(z, "([\'\"])", i)
      if not p then break end
      if r == delim then                -- \<delim> -> <delim>
        z = sub(z, 1, p - 2)..sub(z, p)
        i = p
      else -- r == ndelim               -- <ndelim> -> \<ndelim>
        z = sub(z, 1, p - 1).."\\"..sub(z, p)
        i = p + 2
      end
    end--while
    delim = ndelim                      -- actually change delimiters
  end
  --------------------------------------------------------------------
  z = delim..z..delim
  if z ~= sinfos[I] then
    if opt_details then
      print("<string> (line "..stoklns[I]..") "..sinfos[I].." -> "..z)
      opt_details = opt_details + 1
    end
    sinfos[I] = z
  end
end

------------------------------------------------------------------------
-- long string optimization
-- * note: warning flagged if trailing whitespace found, not trimmed
-- * remove first optional newline
-- * normalize embedded newlines
-- * reduce '=' separators in delimiters if possible
------------------------------------------------------------------------
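-- e.g. a long string that opens with a newline and whose body contains
-- no "]]", such as [==[<newline>hello]==], comes out as [[hello]]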
local function do_lstring(I)
  local info = sinfos[I]
  local delim1 = match(info, "^%[=*%[")         -- cut out delimiters
  local sep = #delim1
  local delim2 = sub(info, -sep, -1)
  local z = sub(info, sep + 1, -(sep + 1))      -- lstring without delims
  local y = ""
  local i = 1
  --------------------------------------------------------------------
  while true do
    local p, q, r, s = find(z, "([\r\n])([\r\n]?)", i)
    -- deal with a single line
    local ln
    if not p then
      ln = sub(z, i)
    elseif p >= i then
      ln = sub(z, i, p - 1)
    end
    if ln ~= "" then
      -- flag a warning if there are trailing spaces, won't optimize!
      if match(ln, "%s+$") then
        warn.lstring = "trailing whitespace in long string near line "..stoklns[I]
      end
      y = y..ln
    end
    if not p then                       -- done if no more EOLs
      break
    end
    -- deal with line endings, normalize them
    i = p + 1
    if p then
      if #s > 0 and r ~= s then         -- skip CRLF or LFCR
        i = i + 1
      end
      -- skip the first newline, which can be safely deleted (the EOL
      -- sits at the very start of the string body only when p == 1)
      if p > 1 then
        y = y.."\n"
      end
    end
  end--while
  --------------------------------------------------------------------
  -- handle possible deletion of one or more '=' separators
  if sep >= 3 then
    local chk, okay = sep - 1
    -- loop to test ending delimiter with less of '=' down to zero
    while chk >= 2 do
      local delim = "%]"..rep("=", chk - 2).."%]"
      if not match(y, delim) then okay = chk end
      chk = chk - 1
    end
    if okay then                        -- change delimiters
      sep = rep("=", okay - 2)
      delim1, delim2 = "["..sep.."[", "]"..sep.."]"
    end
  end
  --------------------------------------------------------------------
  sinfos[I] = delim1..y..delim2
end

------------------------------------------------------------------------
-- long comment optimization
-- * note: does not remove first optional newline
-- * trim trailing whitespace
-- * normalize embedded newlines
-- * reduce '=' separators in delimiters if possible
------------------------------------------------------------------------
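-- e.g. --[==[ keep me  ]==] (note the trailing spaces) comes out as
-- --[[ keep me]], since the trimmed body contains no "]]"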
local function do_lcomment(I)
  local info = sinfos[I]
  local delim1 = match(info, "^%-%-%[=*%[")     -- cut out delimiters
  local sep = #delim1
  local delim2 = sub(info, -(sep - 2), -1)      -- closing delim has no "--"
  local z = sub(info, sep + 1, -(sep - 1))      -- comment without delims
  local y = ""
  local i = 1
  --------------------------------------------------------------------
  while true do
    local p, q, r, s = find(z, "([\r\n])([\r\n]?)", i)
    -- deal with a single line, extract and check trailing whitespace
    local ln
    if not p then
      ln = sub(z, i)
    elseif p >= i then
      ln = sub(z, i, p - 1)
    end
    if ln ~= "" then
      -- trim trailing whitespace if non-empty line
      local ws = match(ln, "%s*$")
      if #ws > 0 then ln = sub(ln, 1, -(#ws + 1)) end
      y = y..ln
    end
    if not p then                       -- done if no more EOLs
      break
    end
    -- deal with line endings, normalize them
    i = p + 1
    if p then
      if #s > 0 and r ~= s then         -- skip CRLF or LFCR
        i = i + 1
      end
      y = y.."\n"
    end
  end--while
  --------------------------------------------------------------------
  -- handle possible deletion of one or more '=' separators
  sep = sep - 2
  if sep >= 3 then
    local chk, okay = sep - 1
    -- loop to test ending delimiter with less of '=' down to zero
    while chk >= 2 do
      local delim = "%]"..rep("=", chk - 2).."%]"
      if not match(y, delim) then okay = chk end
      chk = chk - 1
    end
    if okay then                        -- change delimiters
      sep = rep("=", okay - 2)
      delim1, delim2 = "--["..sep.."[", "]"..sep.."]"
    end
  end
  --------------------------------------------------------------------
  sinfos[I] = delim1..y..delim2
end

------------------------------------------------------------------------
-- short comment optimization
-- * trim trailing whitespace
------------------------------------------------------------------------
local function do_comment(i)
  local info = sinfos[i]
  local ws = match(info, "%s*$")        -- just look from end of string
  if #ws > 0 then
    info = sub(info, 1, -(#ws + 1))     -- trim trailing whitespace
  end
  sinfos[i] = info
end

------------------------------------------------------------------------
-- returns true if string found in long comment
-- * this is a feature to keep copyright or license texts
------------------------------------------------------------------------
local function keep_lcomment(opt_keep, info)
  if not opt_keep then return false end -- option not set
  local delim1 = match(info, "^%-%-%[=*%[")     -- cut out delimiters
  local sep = #delim1
  local delim2 = sub(info, -sep, -1)
  local z = sub(info, sep + 1, -(sep - 1))      -- comment without delims
  if find(z, opt_keep, 1, true) then            -- try to match
    return true
  end
end
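
-- e.g. with opt_keep set to "Copyright", any long comment whose body
-- contains the plain substring "Copyright" is preserved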

------------------------------------------------------------------------
-- main entry point
-- * currently, lexer processing has 2 passes
-- * processing is done on a line-oriented basis, which is easier to
--   grok due to the next point...
-- * since there are various options that can be enabled or disabled,
--   processing is a little messy or convoluted
------------------------------------------------------------------------
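-- usage sketch (illustrative only; assumes the companion llex module of
-- LuaSrcDiet, whose init()/llex() calls and tok/seminfo/tokln lists are
-- recalled here from memory and may differ in your copy):
--
--   local llex = require "llex"
--   llex.init(source_text, "@sample.lua")
--   llex.llex()
--   local toks, infos, lines = optimize(
--     { ["opt-whitespace"] = true, ["opt-numbers"] = true },
--     llex.tok, llex.seminfo, llex.tokln)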
function optimize(option, toklist, semlist, toklnlist)
  --------------------------------------------------------------------
  -- set option flags
  --------------------------------------------------------------------
  local opt_comments = option["opt-comments"]
  local opt_whitespace = option["opt-whitespace"]
  local opt_emptylines = option["opt-emptylines"]
  local opt_eols = option["opt-eols"]
  local opt_strings = option["opt-strings"]
  local opt_numbers = option["opt-numbers"]
  local opt_keep = option.KEEP
  opt_details = option.DETAILS and 0    -- upvalues for details display
  print = print or base.print
  if opt_eols then          -- forced settings, otherwise won't work properly
    opt_comments = true
    opt_whitespace = true
    opt_emptylines = true
  end
  --------------------------------------------------------------------
  -- variable initialization
  --------------------------------------------------------------------
  stoks, sinfos, stoklns                -- set source lists
    = toklist, semlist, toklnlist
  local i = 1                           -- token position
  local tok, info                       -- current token
  local prev                            -- position of last grammar token
                                        -- on same line (for TK_SPACE stuff)
  --------------------------------------------------------------------
  -- changes a token, info pair
  --------------------------------------------------------------------
  local function settoken(tok, info, I)
    I = I or i
    stoks[I] = tok or ""
    sinfos[I] = info or ""
  end
  --------------------------------------------------------------------
  -- processing loop (PASS 1)
  --------------------------------------------------------------------
  while true do
    tok, info = stoks[i], sinfos[i]
    ----------------------------------------------------------------
    local atstart = atlinestart(i)      -- set line begin flag
    if atstart then prev = nil end
    ----------------------------------------------------------------
    if tok == "TK_EOS" then             -- end of stream/pass
      break
    ----------------------------------------------------------------
    elseif tok == "TK_KEYWORD" or       -- keywords, identifiers,
           tok == "TK_NAME" or          -- operators
           tok == "TK_OP" then
      -- TK_KEYWORD and TK_OP can't be optimized without a big
      -- optimization framework; it would be more of an optimizing
      -- compiler, not a source code compressor
      -- TK_NAME tokens that are locals need the parser to analyze/optimize
      prev = i
    ----------------------------------------------------------------
    elseif tok == "TK_NUMBER" then      -- numbers
      if opt_numbers then
        do_number(i)                    -- optimize
      end
      prev = i
    ----------------------------------------------------------------
    elseif tok == "TK_STRING" or        -- strings, long strings
           tok == "TK_LSTRING" then
      if opt_strings then
        if tok == "TK_STRING" then
          do_string(i)                  -- optimize
        else
          do_lstring(i)                 -- optimize
        end
      end
      prev = i
    ----------------------------------------------------------------
    elseif tok == "TK_COMMENT" then     -- short comments
      if opt_comments then
        if i == 1 and sub(info, 1, 1) == "#" then
          -- keep shbang comment, trim whitespace
          do_comment(i)
        else
          -- safe to delete, as a TK_EOL (or TK_EOS) always follows
          settoken()                    -- remove entirely
        end
      elseif opt_whitespace then        -- trim whitespace only
        do_comment(i)
      end
    ----------------------------------------------------------------
    elseif tok == "TK_LCOMMENT" then    -- long comments
      if keep_lcomment(opt_keep, info) then
        ------------------------------------------------------------
        -- if --keep, we keep a long comment if <msg> is found;
        -- this is a feature to keep copyright or license texts
        if opt_whitespace then          -- trim whitespace only
          do_lcomment(i)
        end
        prev = i
      elseif opt_comments then
        local eols = commenteols(info)
        ------------------------------------------------------------
        -- prepare opt_emptylines case first; if a disposable token
        -- follows, the current one is safe to dump, else keep a space;
        -- it is implied that the operation is safe for '-', because
        -- current is a TK_LCOMMENT, and must be separate from a '-'
        if is_faketoken[stoks[i + 1]] then
          settoken()                    -- remove entirely
          tok = ""
        else
          settoken("TK_SPACE", " ")
        end
        ------------------------------------------------------------
        -- if there are embedded EOLs to keep and opt_emptylines is
        -- disabled, then switch the token into one or more EOLs
        if not opt_emptylines and eols > 0 then
          settoken("TK_EOL", rep("\n", eols))
        end
        ------------------------------------------------------------
        -- if optimizing whitespace, force reinterpretation of the
        -- token to give a chance for the space to be optimized away
        if opt_whitespace and tok ~= "" then
          i = i - 1                     -- to reinterpret
        end
        ------------------------------------------------------------
      else                              -- disabled case
        if opt_whitespace then          -- trim whitespace only
          do_lcomment(i)
        end
        prev = i
      end
    ----------------------------------------------------------------
    elseif tok == "TK_EOL" then         -- line endings
      if atstart and opt_emptylines then
        settoken()                      -- remove entirely
      elseif info == "\r\n" or info == "\n\r" then
        -- normalize the rest of the EOLs for CRLF/LFCR only
        -- (note that TK_LCOMMENT can change into several EOLs)
        settoken("TK_EOL", "\n")
      end
    ----------------------------------------------------------------
    elseif tok == "TK_SPACE" then       -- whitespace
      if opt_whitespace then
        if atstart or atlineend(i) then
          -- delete leading and trailing whitespace
          settoken()                    -- remove entirely
        else
          ------------------------------------------------------------
          -- at this point, since leading whitespace has been removed,
          -- there should be either a real token or a TK_LCOMMENT
          -- prior to this whitespace; the TK_LCOMMENT case only
          -- happens if opt_comments is disabled; so prev ~= nil
          local ptok = stoks[prev]
          if ptok == "TK_LCOMMENT" then
            -- previous TK_LCOMMENT can abut with anything
            settoken()                  -- remove entirely
          else
            -- prev must be a grammar token; consecutive TK_SPACE
            -- tokens are impossible when optimizing whitespace
            local ntok = stoks[i + 1]
            if is_faketoken[ntok] then
              -- handle the special case where a '-' cannot abut with
              -- either a short comment or a long comment
              if (ntok == "TK_COMMENT" or ntok == "TK_LCOMMENT") and
                 ptok == "TK_OP" and sinfos[prev] == "-" then
                -- keep token
              else
                settoken()              -- remove entirely
              end
            else -- is_realtoken
              -- check a pair of grammar tokens; if they can abut,
              -- delete the space entirely, otherwise keep one space
              local s = checkpair(prev, i + 1)
              if s == "" then
                settoken()              -- remove entirely
              else
                settoken("TK_SPACE", " ")
              end
            end
          end
          ------------------------------------------------------------
        end
      end
    ----------------------------------------------------------------
    else
      error("unidentified token encountered")
    end
    ----------------------------------------------------------------
    i = i + 1
  end--while
  repack_tokens()
  --------------------------------------------------------------------
  -- processing loop (PASS 2)
  --------------------------------------------------------------------
  if opt_eols then
    i = 1
    -- aggressive EOL removal only works with most non-grammar tokens
    -- optimized away, because it is a rather simple scheme -- basically
    -- it just checks 'real' token pairs around EOLs
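    -- e.g. an EOL between ")" and "local" can go (")local" still lexes
    -- correctly), but one between "end" and "local" must stay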
    if stoks[1] == "TK_COMMENT" then
      -- first comment still existing must be shbang, skip whole line
      i = 3
    end
    while true do
      tok, info = stoks[i], sinfos[i]
      --------------------------------------------------------------
      if tok == "TK_EOS" then           -- end of stream/pass
        break
      --------------------------------------------------------------
      elseif tok == "TK_EOL" then       -- consider each TK_EOL
        local t1, t2 = stoks[i - 1], stoks[i + 1]
        if is_realtoken[t1] and is_realtoken[t2] then  -- sanity check
          local s = checkpair(i - 1, i + 1)
          if s == "" then
            settoken()                  -- remove entirely
          end
        end
      end--if tok
      --------------------------------------------------------------
      i = i + 1
    end--while
    repack_tokens()
  end
  --------------------------------------------------------------------
  if opt_details and opt_details > 0 then print() end  -- spacing
  return stoks, sinfos, stoklns
end