lexer.c 9.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523
  1. /*
  2. * Copyright (C) 2013-2014 Jo-Philipp Wich <jo@mein.io>
  3. *
  4. * Permission to use, copy, modify, and/or distribute this software for any
  5. * purpose with or without fee is hereby granted, provided that the above
  6. * copyright notice and this permission notice appear in all copies.
  7. *
  8. * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  9. * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  10. * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
  11. * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  12. * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  13. * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
  14. * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  15. */
  16. #include <stdbool.h>
  17. #include <stdlib.h>
  18. #include <string.h>
  19. #include <ctype.h>
  20. #include <regex.h>
  21. #include "ast.h"
  22. #include "lexer.h"
  23. #include "parser.h"
  24. struct token {
  25. int type;
  26. const char *pat;
  27. int plen;
  28. int (*parse)(const char *buf, struct jp_opcode *op, struct jp_state *s);
  29. };
  30. #define dec(o) \
  31. ((o) - '0')
  32. #define hex(x) \
  33. (((x) >= 'a') ? (10 + (x) - 'a') : \
  34. (((x) >= 'A') ? (10 + (x) - 'A') : dec(x)))
  35. /*
  36. * Stores the given codepoint as a utf8 multibyte sequence into the given
  37. * output buffer and substracts the required amount of bytes from the given
  38. * length pointer.
  39. *
  40. * Returns false if the multibyte sequence would not fit into the buffer,
  41. * otherwise true.
  42. */
  43. static bool
  44. utf8enc(char **out, int *rem, int code)
  45. {
  46. if (code > 0 && code <= 0x7F)
  47. {
  48. if (*rem < 1)
  49. return false;
  50. *(*out)++ = code; (*rem)--;
  51. return true;
  52. }
  53. else if (code > 0 && code <= 0x7FF)
  54. {
  55. if (*rem < 2)
  56. return false;
  57. *(*out)++ = ((code >> 6) & 0x1F) | 0xC0; (*rem)--;
  58. *(*out)++ = ( code & 0x3F) | 0x80; (*rem)--;
  59. return true;
  60. }
  61. else if (code > 0 && code <= 0xFFFF)
  62. {
  63. if (*rem < 3)
  64. return false;
  65. *(*out)++ = ((code >> 12) & 0x0F) | 0xE0; (*rem)--;
  66. *(*out)++ = ((code >> 6) & 0x3F) | 0x80; (*rem)--;
  67. *(*out)++ = ( code & 0x3F) | 0x80; (*rem)--;
  68. return true;
  69. }
  70. else if (code > 0 && code <= 0x10FFFF)
  71. {
  72. if (*rem < 4)
  73. return false;
  74. *(*out)++ = ((code >> 18) & 0x07) | 0xF0; (*rem)--;
  75. *(*out)++ = ((code >> 12) & 0x3F) | 0x80; (*rem)--;
  76. *(*out)++ = ((code >> 6) & 0x3F) | 0x80; (*rem)--;
  77. *(*out)++ = ( code & 0x3F) | 0x80; (*rem)--;
  78. return true;
  79. }
  80. return true;
  81. }
  82. /*
  83. * Parses a string literal from the given buffer.
  84. *
  85. * Returns a negative value on error, otherwise the amount of consumed
  86. * characters from the given buffer.
  87. *
  88. * Error values:
  89. * -1 Unterminated string
  90. * -2 Invalid escape sequence
  91. * -3 String literal too long
  92. */
  93. static int
  94. parse_string(const char *buf, struct jp_opcode *op, struct jp_state *s)
  95. {
  96. char q = *(buf++);
  97. char str[128] = { 0 };
  98. char *out = str;
  99. const char *in = buf;
  100. bool esc = false;
  101. int rem = sizeof(str) - 1;
  102. int code;
  103. while (*in)
  104. {
  105. /* continuation of escape sequence */
  106. if (esc)
  107. {
  108. /* \uFFFF */
  109. if (in[0] == 'u')
  110. {
  111. if (isxdigit(in[1]) && isxdigit(in[2]) &&
  112. isxdigit(in[3]) && isxdigit(in[4]))
  113. {
  114. if (!utf8enc(&out, &rem,
  115. hex(in[1]) * 16 * 16 * 16 +
  116. hex(in[2]) * 16 * 16 +
  117. hex(in[3]) * 16 +
  118. hex(in[4])))
  119. {
  120. s->error_pos = s->off + (in - buf);
  121. return -3;
  122. }
  123. in += 5;
  124. }
  125. else
  126. {
  127. s->error_pos = s->off + (in - buf);
  128. return -2;
  129. }
  130. }
  131. /* \xFF */
  132. else if (in[0] == 'x')
  133. {
  134. if (isxdigit(in[1]) && isxdigit(in[2]))
  135. {
  136. if (!utf8enc(&out, &rem, hex(in[1]) * 16 + hex(in[2])))
  137. {
  138. s->error_pos = s->off + (in - buf);
  139. return -3;
  140. }
  141. in += 3;
  142. }
  143. else
  144. {
  145. s->error_pos = s->off + (in - buf);
  146. return -2;
  147. }
  148. }
  149. /* \377, \77 or \7 */
  150. else if (in[0] >= '0' && in[0] <= '7')
  151. {
  152. /* \377 */
  153. if (in[1] >= '0' && in[1] <= '7' &&
  154. in[2] >= '0' && in[2] <= '7')
  155. {
  156. code = dec(in[0]) * 8 * 8 +
  157. dec(in[1]) * 8 +
  158. dec(in[2]);
  159. if (code > 255)
  160. {
  161. s->error_pos = s->off + (in - buf);
  162. return -2;
  163. }
  164. if (!utf8enc(&out, &rem, code))
  165. {
  166. s->error_pos = s->off + (in - buf);
  167. return -3;
  168. }
  169. in += 3;
  170. }
  171. /* \77 */
  172. else if (in[1] >= '0' && in[1] <= '7')
  173. {
  174. if (!utf8enc(&out, &rem, dec(in[0]) * 8 + dec(in[1])))
  175. {
  176. s->error_pos = s->off + (in - buf);
  177. return -3;
  178. }
  179. in += 2;
  180. }
  181. /* \7 */
  182. else
  183. {
  184. if (!utf8enc(&out, &rem, dec(in[0])))
  185. {
  186. s->error_pos = s->off + (in - buf);
  187. return -3;
  188. }
  189. in += 1;
  190. }
  191. }
  192. /* single character escape */
  193. else
  194. {
  195. if (rem-- < 1)
  196. {
  197. s->error_pos = s->off + (in - buf);
  198. return -3;
  199. }
  200. switch (in[0])
  201. {
  202. case 'a': *out = '\a'; break;
  203. case 'b': *out = '\b'; break;
  204. case 'e': *out = '\e'; break;
  205. case 'f': *out = '\f'; break;
  206. case 'n': *out = '\n'; break;
  207. case 'r': *out = '\r'; break;
  208. case 't': *out = '\t'; break;
  209. case 'v': *out = '\v'; break;
  210. default:
  211. /* in regexp mode, retain backslash */
  212. if (q == '/')
  213. {
  214. if (rem-- < 1)
  215. {
  216. s->error_pos = s->off + (in - buf);
  217. return -3;
  218. }
  219. *out++ = '\\';
  220. }
  221. *out = *in;
  222. break;
  223. }
  224. in++;
  225. out++;
  226. }
  227. esc = false;
  228. }
  229. /* begin of escape sequence */
  230. else if (*in == '\\')
  231. {
  232. in++;
  233. esc = true;
  234. }
  235. /* terminating quote */
  236. else if (*in == q)
  237. {
  238. op->str = strdup(str);
  239. return (in - buf) + 2;
  240. }
  241. /* ordinary char */
  242. else
  243. {
  244. if (rem-- < 1)
  245. {
  246. s->error_pos = s->off + (in - buf);
  247. return -3;
  248. }
  249. *out++ = *in++;
  250. }
  251. }
  252. return -1;
  253. }
  254. /*
  255. * Parses a regexp literal from the given buffer.
  256. *
  257. * Returns a negative value on error, otherwise the amount of consumed
  258. * characters from the given buffer.
  259. *
  260. * Error values:
  261. * -1 Unterminated regexp
  262. * -2 Invalid escape sequence
  263. * -3 Regexp literal too long
  264. */
  265. static int
  266. parse_regexp(const char *buf, struct jp_opcode *op, struct jp_state *s)
  267. {
  268. int len = parse_string(buf, op, s);
  269. const char *p;
  270. if (len >= 2)
  271. {
  272. op->num = REG_NOSUB | REG_NEWLINE;
  273. for (p = buf + len; p; p++)
  274. {
  275. switch (*p)
  276. {
  277. case 'e':
  278. op->num |= REG_EXTENDED;
  279. len++;
  280. break;
  281. case 'i':
  282. op->num |= REG_ICASE;
  283. len++;
  284. break;
  285. case 's':
  286. op->num &= ~REG_NEWLINE;
  287. len++;
  288. break;
  289. default:
  290. return len;
  291. }
  292. }
  293. }
  294. return len;
  295. }
  296. /*
  297. * Parses a label from the given buffer.
  298. *
  299. * Returns a negative value on error, otherwise the amount of consumed
  300. * characters from the given buffer.
  301. *
  302. * Error values:
  303. * -3 Label too long
  304. */
  305. static int
  306. parse_label(const char *buf, struct jp_opcode *op, struct jp_state *s)
  307. {
  308. char str[128] = { 0 };
  309. char *out = str;
  310. const char *in = buf;
  311. int rem = sizeof(str) - 1;
  312. while (*in == '_' || isalnum(*in))
  313. {
  314. if (rem-- < 1)
  315. {
  316. s->error_pos = s->off + (in - buf);
  317. return -3;
  318. }
  319. *out++ = *in++;
  320. }
  321. if (!strcmp(str, "true") || !strcmp(str, "false"))
  322. {
  323. op->num = (str[0] == 't');
  324. op->type = T_BOOL;
  325. }
  326. else
  327. {
  328. op->str = strdup(str);
  329. }
  330. return (in - buf);
  331. }
  332. /*
  333. * Parses a number literal from the given buffer.
  334. *
  335. * Returns a negative value on error, otherwise the amount of consumed
  336. * characters from the given buffer.
  337. *
  338. * Error values:
  339. * -2 Invalid number character
  340. */
  341. static int
  342. parse_number(const char *buf, struct jp_opcode *op, struct jp_state *s)
  343. {
  344. char *e;
  345. int n = strtol(buf, &e, 10);
  346. if (e == buf)
  347. {
  348. s->error_pos = s->off;
  349. return -2;
  350. }
  351. op->num = n;
  352. return (e - buf);
  353. }
  354. static const struct token tokens[] = {
  355. { 0, " ", 1 },
  356. { 0, "\t", 1 },
  357. { 0, "\n", 1 },
  358. { T_LE, "<=", 2 },
  359. { T_GE, ">=", 2 },
  360. { T_NE, "!=", 2 },
  361. { T_AND, "&&", 2 },
  362. { T_OR, "||", 2 },
  363. { T_DOT, ".", 1 },
  364. { T_BROPEN, "[", 1 },
  365. { T_BRCLOSE, "]", 1 },
  366. { T_POPEN, "(", 1 },
  367. { T_PCLOSE, ")", 1 },
  368. { T_UNION, ",", 1 },
  369. { T_ROOT, "$", 1 },
  370. { T_THIS, "@", 1 },
  371. { T_LT, "<", 1 },
  372. { T_GT, ">", 1 },
  373. { T_EQ, "=", 1 },
  374. { T_MATCH, "~", 1 },
  375. { T_NOT, "!", 1 },
  376. { T_WILDCARD, "*", 1 },
  377. { T_REGEXP, "/", 1, parse_regexp },
  378. { T_STRING, "'", 1, parse_string },
  379. { T_STRING, "\"", 1, parse_string },
  380. { T_LABEL, "_", 1, parse_label },
  381. { T_LABEL, "az", 0, parse_label },
  382. { T_LABEL, "AZ", 0, parse_label },
  383. { T_NUMBER, "-", 1, parse_number },
  384. { T_NUMBER, "09", 0, parse_number },
  385. };
  386. const char *tokennames[25] = {
  387. [0] = "End of file",
  388. [T_AND] = "'&&'",
  389. [T_OR] = "'||'",
  390. [T_UNION] = "','",
  391. [T_EQ] = "'='",
  392. [T_NE] = "'!='",
  393. [T_GT] = "'>'",
  394. [T_GE] = "'>='",
  395. [T_LT] = "'<'",
  396. [T_LE] = "'<='",
  397. [T_MATCH] = "'~'",
  398. [T_NOT] = "'!'",
  399. [T_LABEL] = "Label",
  400. [T_ROOT] = "'$'",
  401. [T_THIS] = "'@'",
  402. [T_DOT] = "'.'",
  403. [T_WILDCARD] = "'*'",
  404. [T_REGEXP] = "/.../",
  405. [T_BROPEN] = "'['",
  406. [T_BRCLOSE] = "']'",
  407. [T_BOOL] = "Bool",
  408. [T_NUMBER] = "Number",
  409. [T_STRING] = "String",
  410. [T_POPEN] = "'('",
  411. [T_PCLOSE] = "')'",
  412. };
  413. static int
  414. match_token(const char *ptr, struct jp_opcode *op, struct jp_state *s)
  415. {
  416. int i;
  417. const struct token *tok;
  418. for (i = 0, tok = &tokens[0];
  419. i < sizeof(tokens) / sizeof(tokens[0]);
  420. i++, tok = &tokens[i])
  421. {
  422. if ((tok->plen > 0 && !strncmp(ptr, tok->pat, tok->plen)) ||
  423. (tok->plen == 0 && *ptr >= tok->pat[0] && *ptr <= tok->pat[1]))
  424. {
  425. op->type = tok->type;
  426. if (tok->parse)
  427. return tok->parse(ptr, op, s);
  428. return tok->plen;
  429. }
  430. }
  431. s->error_pos = s->off;
  432. return -4;
  433. }
  434. struct jp_opcode *
  435. jp_get_token(struct jp_state *s, const char *input, int *mlen)
  436. {
  437. struct jp_opcode op = { 0 };
  438. *mlen = match_token(input, &op, s);
  439. if (*mlen < 0)
  440. {
  441. s->error_code = *mlen;
  442. return NULL;
  443. }
  444. else if (op.type == 0)
  445. {
  446. return NULL;
  447. }
  448. return jp_alloc_op(s, op.type, op.num, op.str, NULL);
  449. }