lexer.c 8.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412
  1. /*
  2. * Copyright (C) 2013-2014 Jo-Philipp Wich <jow@openwrt.org>
  3. *
  4. * Permission to use, copy, modify, and/or distribute this software for any
  5. * purpose with or without fee is hereby granted, provided that the above
  6. * copyright notice and this permission notice appear in all copies.
  7. *
  8. * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  9. * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  10. * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
  11. * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  12. * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  13. * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
  14. * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  15. */
#include <ctype.h>
#include <errno.h>
#include <limits.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

#include "ast.h"
#include "lexer.h"
#include "parser.h"
  23. struct token {
  24. int type;
  25. const char *pat;
  26. int plen;
  27. int (*parse)(const char *buf, struct jp_opcode *op);
  28. };
  29. #define dec(o) \
  30. ((o) - '0')
  31. #define hex(x) \
  32. (((x) >= 'a') ? (10 + (x) - 'a') : \
  33. (((x) >= 'A') ? (10 + (x) - 'A') : dec(x)))
  34. /*
  35. * Stores the given codepoint as a utf8 multibyte sequence into the given
  36. * output buffer and substracts the required amount of bytes from the given
  37. * length pointer.
  38. *
  39. * Returns false if the multibyte sequence would not fit into the buffer,
  40. * otherwise true.
  41. */
  42. static bool
  43. utf8enc(char **out, int *rem, int code)
  44. {
  45. if (code > 0 && code <= 0x7F)
  46. {
  47. if (*rem < 1)
  48. return false;
  49. *(*out++) = code; (*rem)--;
  50. return true;
  51. }
  52. else if (code > 0 && code <= 0x7FF)
  53. {
  54. if (*rem < 2)
  55. return false;
  56. *(*out)++ = ((code >> 6) & 0x1F) | 0xC0; (*rem)--;
  57. *(*out)++ = ( code & 0x3F) | 0x80; (*rem)--;
  58. return true;
  59. }
  60. else if (code > 0 && code <= 0xFFFF)
  61. {
  62. if (*rem < 3)
  63. return false;
  64. *(*out)++ = ((code >> 12) & 0x0F) | 0xE0; (*rem)--;
  65. *(*out)++ = ((code >> 6) & 0x3F) | 0x80; (*rem)--;
  66. *(*out)++ = ( code & 0x3F) | 0x80; (*rem)--;
  67. return true;
  68. }
  69. else if (code > 0 && code <= 0x10FFFF)
  70. {
  71. if (*rem < 4)
  72. return false;
  73. *(*out)++ = ((code >> 18) & 0x07) | 0xF0; (*rem)--;
  74. *(*out)++ = ((code >> 12) & 0x3F) | 0x80; (*rem)--;
  75. *(*out)++ = ((code >> 6) & 0x3F) | 0x80; (*rem)--;
  76. *(*out)++ = ( code & 0x3F) | 0x80; (*rem)--;
  77. return true;
  78. }
  79. return true;
  80. }
  81. /*
  82. * Parses a string literal from the given buffer.
  83. *
  84. * Returns a negative value on error, otherwise the amount of consumed
  85. * characters from the given buffer.
  86. *
  87. * Error values:
  88. * -1 Unterminated string
  89. * -2 Invalid escape sequence
  90. * -3 String literal too long
  91. */
  92. static int
  93. parse_string(const char *buf, struct jp_opcode *op)
  94. {
  95. char q = *(buf++);
  96. char str[128] = { 0 };
  97. char *out = str;
  98. const char *in = buf;
  99. bool esc = false;
  100. int rem = sizeof(str) - 1;
  101. int code;
  102. while (*in)
  103. {
  104. /* continuation of escape sequence */
  105. if (esc)
  106. {
  107. /* \uFFFF */
  108. if (in[0] == 'u')
  109. {
  110. if (isxdigit(in[1]) && isxdigit(in[2]) &&
  111. isxdigit(in[3]) && isxdigit(in[4]))
  112. {
  113. if (!utf8enc(&out, &rem,
  114. hex(in[1]) * 16 * 16 * 16 +
  115. hex(in[2]) * 16 * 16 +
  116. hex(in[3]) * 16 +
  117. hex(in[4])))
  118. return -3;
  119. in += 5;
  120. }
  121. else
  122. {
  123. return -2;
  124. }
  125. }
  126. /* \xFF */
  127. else if (in[0] == 'x')
  128. {
  129. if (isxdigit(in[1]) && isxdigit(in[2]))
  130. {
  131. if (!utf8enc(&out, &rem, hex(in[1]) * 16 + hex(in[2])))
  132. return -3;
  133. in += 3;
  134. }
  135. else
  136. {
  137. return -2;
  138. }
  139. }
  140. /* \377, \77 or \7 */
  141. else if (in[0] >= '0' && in[0] <= '7')
  142. {
  143. /* \377 */
  144. if (in[1] >= '0' && in[1] <= '7' &&
  145. in[2] >= '0' && in[2] <= '7')
  146. {
  147. code = dec(in[0]) * 8 * 8 +
  148. dec(in[1]) * 8 +
  149. dec(in[2]);
  150. if (code > 255)
  151. return -2;
  152. if (!utf8enc(&out, &rem, code))
  153. return -3;
  154. in += 3;
  155. }
  156. /* \77 */
  157. else if (in[1] >= '0' && in[1] <= '7')
  158. {
  159. if (!utf8enc(&out, &rem, dec(in[0]) * 8 + dec(in[1])))
  160. return -3;
  161. in += 2;
  162. }
  163. /* \7 */
  164. else
  165. {
  166. if (!utf8enc(&out, &rem, dec(in[0])))
  167. return -3;
  168. in += 1;
  169. }
  170. }
  171. /* single character escape */
  172. else
  173. {
  174. if (rem-- < 1)
  175. return -3;
  176. switch (in[0])
  177. {
  178. case 'a': *out = '\a'; break;
  179. case 'b': *out = '\b'; break;
  180. case 'e': *out = '\e'; break;
  181. case 'f': *out = '\f'; break;
  182. case 'n': *out = '\n'; break;
  183. case 'r': *out = '\r'; break;
  184. case 't': *out = '\t'; break;
  185. case 'v': *out = '\v'; break;
  186. default: *out = *in; break;
  187. }
  188. in++;
  189. out++;
  190. }
  191. esc = false;
  192. }
  193. /* begin of escape sequence */
  194. else if (*in == '\\')
  195. {
  196. in++;
  197. esc = true;
  198. }
  199. /* terminating quote */
  200. else if (*in == q)
  201. {
  202. op->str = strdup(str);
  203. return (in - buf) + 2;
  204. }
  205. /* ordinary char */
  206. else
  207. {
  208. if (rem-- < 1)
  209. return -3;
  210. *out++ = *in++;
  211. }
  212. }
  213. return -1;
  214. }
  215. /*
  216. * Parses a label from the given buffer.
  217. *
  218. * Returns a negative value on error, otherwise the amount of consumed
  219. * characters from the given buffer.
  220. *
  221. * Error values:
  222. * -3 Label too long
  223. */
  224. static int
  225. parse_label(const char *buf, struct jp_opcode *op)
  226. {
  227. char str[128] = { 0 };
  228. char *out = str;
  229. const char *in = buf;
  230. int rem = sizeof(str) - 1;
  231. while (*in == '_' || isalnum(*in))
  232. {
  233. if (rem-- < 1)
  234. return -3;
  235. *out++ = *in++;
  236. }
  237. if (!strcmp(str, "true") || !strcmp(str, "false"))
  238. {
  239. op->num = (str[0] == 't');
  240. op->type = T_BOOL;
  241. }
  242. else
  243. {
  244. op->str = strdup(str);
  245. }
  246. return (in - buf);
  247. }
  248. /*
  249. * Parses a number literal from the given buffer.
  250. *
  251. * Returns a negative value on error, otherwise the amount of consumed
  252. * characters from the given buffer.
  253. *
  254. * Error values:
  255. * -2 Invalid number character
  256. */
  257. static int
  258. parse_number(const char *buf, struct jp_opcode *op)
  259. {
  260. char *e;
  261. int n = strtol(buf, &e, 10);
  262. if (e == buf)
  263. return -2;
  264. op->num = n;
  265. return (e - buf);
  266. }
  267. static const struct token tokens[] = {
  268. { 0, " ", 1 },
  269. { 0, "\t", 1 },
  270. { 0, "\n", 1 },
  271. { T_LE, "<=", 2 },
  272. { T_GE, ">=", 2 },
  273. { T_NE, "!=", 2 },
  274. { T_AND, "&&", 2 },
  275. { T_OR, "||", 2 },
  276. { T_DOT, ".", 1 },
  277. { T_BROPEN, "[", 1 },
  278. { T_BRCLOSE, "]", 1 },
  279. { T_POPEN, "(", 1 },
  280. { T_PCLOSE, ")", 1 },
  281. { T_UNION, ",", 1 },
  282. { T_ROOT, "$", 1 },
  283. { T_THIS, "@", 1 },
  284. { T_LT, "<", 1 },
  285. { T_GT, ">", 1 },
  286. { T_EQ, "=", 1 },
  287. { T_NOT, "!", 1 },
  288. { T_WILDCARD, "*", 1 },
  289. { T_STRING, "'", 1, parse_string },
  290. { T_STRING, "\"", 1, parse_string },
  291. { T_LABEL, "_", 1, parse_label },
  292. { T_LABEL, "az", 0, parse_label },
  293. { T_LABEL, "AZ", 0, parse_label },
  294. { T_NUMBER, "-", 1, parse_number },
  295. { T_NUMBER, "09", 0, parse_number },
  296. };
  297. const char *tokennames[23] = {
  298. [0] = "End of file",
  299. [T_AND] = "'&&'",
  300. [T_OR] = "'||'",
  301. [T_UNION] = "','",
  302. [T_EQ] = "'='",
  303. [T_NE] = "'!='",
  304. [T_GT] = "'>'",
  305. [T_GE] = "'>='",
  306. [T_LT] = "'<'",
  307. [T_LE] = "'<='",
  308. [T_NOT] = "'!'",
  309. [T_LABEL] = "Label",
  310. [T_ROOT] = "'$'",
  311. [T_THIS] = "'@'",
  312. [T_DOT] = "'.'",
  313. [T_WILDCARD] = "'*'",
  314. [T_BROPEN] = "'['",
  315. [T_BRCLOSE] = "']'",
  316. [T_BOOL] = "Bool",
  317. [T_NUMBER] = "Number",
  318. [T_STRING] = "String",
  319. [T_POPEN] = "'('",
  320. [T_PCLOSE] = "')'",
  321. };
  322. static int
  323. match_token(const char *ptr, struct jp_opcode *op)
  324. {
  325. int i;
  326. const struct token *tok;
  327. for (i = 0, tok = &tokens[0];
  328. i < sizeof(tokens) / sizeof(tokens[0]);
  329. i++, tok = &tokens[i])
  330. {
  331. if ((tok->plen > 0 && !strncmp(ptr, tok->pat, tok->plen)) ||
  332. (tok->plen == 0 && *ptr >= tok->pat[0] && *ptr <= tok->pat[1]))
  333. {
  334. op->type = tok->type;
  335. if (tok->parse)
  336. return tok->parse(ptr, op);
  337. return tok->plen;
  338. }
  339. }
  340. return -1;
  341. }
  342. struct jp_opcode *
  343. jp_get_token(struct jp_state *s, const char *input, int *mlen)
  344. {
  345. struct jp_opcode op = { 0 };
  346. *mlen = match_token(input, &op);
  347. if (*mlen < 0 || op.type == 0)
  348. return NULL;
  349. return jp_alloc_op(s, op.type, op.num, op.str, NULL);
  350. }