lex.c 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630
  1. /*
  2. * This file is part of the UCB release of Plan 9. It is subject to the license
  3. * terms in the LICENSE file found in the top-level directory of this
  4. * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
  5. * part of the UCB release of Plan 9, including this file, may be copied,
  6. * modified, propagated, or distributed except according to the terms contained
  7. * in the LICENSE file.
  8. */
  9. #include <u.h>
  10. #include <libc.h>
  11. #include <stdio.h>
  12. #include "cpp.h"
  13. /*
  14. * lexical FSM encoding
  15. * when in state state, and one of the characters
  16. * in ch arrives, enter nextstate.
  17. * States >= S_SELF are either final, or at least require special action.
  18. * In 'fsm' there is a line for each state X charset X nextstate.
  19. * List chars that overwrite previous entries later (e.g. C_ALPH
  20. * can be overridden by '_' by a later entry; and C_XX is the
  21. * the universal set, and should always be first.
  22. * States above S_SELF are represented in the big table as negative values.
  23. * S_SELF and S_SELFB encode the resulting token type in the upper bits.
  24. * These actions differ in that S_SELF doesn't have a lookahead char,
  25. * S_SELFB does.
  26. *
  27. * The encoding is blown out into a big table for time-efficiency.
  28. * Entries have
  29. * nextstate: 6 bits; ?\ marker: 1 bit; tokentype: 9 bits.
  30. */
  31. #define MAXSTATE 32
  32. #define ACT(tok,act) ((tok<<7)+act)
  33. #define QBSBIT 0100
  34. #define GETACT(st) (st>>7)&0x1ff
  35. #define UTF2(c) ((c)>=0xA0 && (c)<0xE0) /* 2-char UTF seq */
  36. #define UTF3(c) ((c)>=0xE0 && (c)<0xF0) /* 3-char UTF seq */
  37. /* character classes */
  38. #define C_WS 1
  39. #define C_ALPH 2
  40. #define C_NUM 3
  41. #define C_EOF 4
  42. #define C_XX 5
  43. enum state {
  44. START=0, NUM1, NUM2, NUM3, ID1, ST1, ST2, ST3, COM1, COM2, COM3, COM4,
  45. CC1, CC2, WS1, PLUS1, MINUS1, STAR1, SLASH1, PCT1, SHARP1,
  46. CIRC1, GT1, GT2, LT1, LT2, OR1, AND1, ASG1, NOT1, DOTS1,
  47. S_SELF=MAXSTATE, S_SELFB, S_EOF, S_NL, S_EOFSTR,
  48. S_STNL, S_COMNL, S_EOFCOM, S_COMMENT, S_EOB, S_WS, S_NAME
  49. };
  50. struct fsm {
  51. int state; /* if in this state */
  52. uint8_t ch[4]; /* and see one of these characters */
  53. int nextstate; /* enter this state if +ve */
  54. };
  55. /*const*/ struct fsm fsm[] = {
  56. /* start state */
  57. START, { C_XX }, ACT(UNCLASS,S_SELF),
  58. START, { ' ', '\t', '\v', '\r' }, WS1,
  59. START, { C_NUM }, NUM1,
  60. START, { '.' }, NUM3,
  61. START, { C_ALPH }, ID1,
  62. START, { 'L' }, ST1,
  63. START, { '"' }, ST2,
  64. START, { '\'' }, CC1,
  65. START, { '/' }, COM1,
  66. START, { EOFC }, S_EOF,
  67. START, { '\n' }, S_NL,
  68. START, { '-' }, MINUS1,
  69. START, { '+' }, PLUS1,
  70. START, { '<' }, LT1,
  71. START, { '>' }, GT1,
  72. START, { '=' }, ASG1,
  73. START, { '!' }, NOT1,
  74. START, { '&' }, AND1,
  75. START, { '|' }, OR1,
  76. START, { '#' }, SHARP1,
  77. START, { '%' }, PCT1,
  78. START, { '[' }, ACT(SBRA,S_SELF),
  79. START, { ']' }, ACT(SKET,S_SELF),
  80. START, { '(' }, ACT(LP,S_SELF),
  81. START, { ')' }, ACT(RP,S_SELF),
  82. START, { '*' }, STAR1,
  83. START, { ',' }, ACT(COMMA,S_SELF),
  84. START, { '?' }, ACT(QUEST,S_SELF),
  85. START, { ':' }, ACT(COLON,S_SELF),
  86. START, { ';' }, ACT(SEMIC,S_SELF),
  87. START, { '{' }, ACT(CBRA,S_SELF),
  88. START, { '}' }, ACT(CKET,S_SELF),
  89. START, { '~' }, ACT(TILDE,S_SELF),
  90. START, { '^' }, CIRC1,
  91. /* saw a digit */
  92. NUM1, { C_XX }, ACT(NUMBER,S_SELFB),
  93. NUM1, { C_NUM, C_ALPH, '.' }, NUM1,
  94. NUM1, { 'E', 'e' }, NUM2,
  95. NUM1, { '_' }, ACT(NUMBER,S_SELFB),
  96. /* saw possible start of exponent, digits-e */
  97. NUM2, { C_XX }, ACT(NUMBER,S_SELFB),
  98. NUM2, { '+', '-' }, NUM1,
  99. NUM2, { C_NUM, C_ALPH }, NUM1,
  100. NUM2, { '_' }, ACT(NUMBER,S_SELFB),
  101. /* saw a '.', which could be a number or an operator */
  102. NUM3, { C_XX }, ACT(DOT,S_SELFB),
  103. NUM3, { '.' }, DOTS1,
  104. NUM3, { C_NUM }, NUM1,
  105. DOTS1, { C_XX }, ACT(UNCLASS, S_SELFB),
  106. DOTS1, { C_NUM }, NUM1,
  107. DOTS1, { '.' }, ACT(ELLIPS, S_SELF),
  108. /* saw a letter or _ */
  109. ID1, { C_XX }, ACT(NAME,S_NAME),
  110. ID1, { C_ALPH, C_NUM }, ID1,
  111. /* saw L (start of wide string?) */
  112. ST1, { C_XX }, ACT(NAME,S_NAME),
  113. ST1, { C_ALPH, C_NUM }, ID1,
  114. ST1, { '"' }, ST2,
  115. ST1, { '\'' }, CC1,
  116. /* saw " beginning string */
  117. ST2, { C_XX }, ST2,
  118. ST2, { '"' }, ACT(STRING, S_SELF),
  119. ST2, { '\\' }, ST3,
  120. ST2, { '\n' }, S_STNL,
  121. ST2, { EOFC }, S_EOFSTR,
  122. /* saw \ in string */
  123. ST3, { C_XX }, ST2,
  124. ST3, { '\n' }, S_STNL,
  125. ST3, { EOFC }, S_EOFSTR,
  126. /* saw ' beginning character const */
  127. CC1, { C_XX }, CC1,
  128. CC1, { '\'' }, ACT(CCON, S_SELF),
  129. CC1, { '\\' }, CC2,
  130. CC1, { '\n' }, S_STNL,
  131. CC1, { EOFC }, S_EOFSTR,
  132. /* saw \ in ccon */
  133. CC2, { C_XX }, CC1,
  134. CC2, { '\n' }, S_STNL,
  135. CC2, { EOFC }, S_EOFSTR,
  136. /* saw /, perhaps start of comment */
  137. COM1, { C_XX }, ACT(SLASH, S_SELFB),
  138. COM1, { '=' }, ACT(ASSLASH, S_SELF),
  139. COM1, { '*' }, COM2,
  140. COM1, { '/' }, COM4,
  141. /* saw "/*", start of comment */
  142. COM2, { C_XX }, COM2,
  143. COM2, { '\n' }, S_COMNL,
  144. COM2, { '*' }, COM3,
  145. COM2, { EOFC }, S_EOFCOM,
  146. /* saw the * possibly ending a comment */
  147. COM3, { C_XX }, COM2,
  148. COM3, { '\n' }, S_COMNL,
  149. COM3, { '*' }, COM3,
  150. COM3, { '/' }, S_COMMENT,
  151. COM3, { EOFC }, S_EOFCOM,
  152. /* // comment */
  153. COM4, { C_XX }, COM4,
  154. COM4, { '\n' }, S_NL,
  155. COM4, { EOFC }, S_EOFCOM,
  156. /* saw white space, eat it up */
  157. WS1, { C_XX }, S_WS,
  158. WS1, { ' ', '\t', '\v', '\r'}, WS1,
  159. /* saw -, check --, -=, -> */
  160. MINUS1, { C_XX }, ACT(MINUS, S_SELFB),
  161. MINUS1, { '-' }, ACT(MMINUS, S_SELF),
  162. MINUS1, { '=' }, ACT(ASMINUS,S_SELF),
  163. MINUS1, { '>' }, ACT(ARROW,S_SELF),
  164. /* saw +, check ++, += */
  165. PLUS1, { C_XX }, ACT(PLUS, S_SELFB),
  166. PLUS1, { '+' }, ACT(PPLUS, S_SELF),
  167. PLUS1, { '=' }, ACT(ASPLUS, S_SELF),
  168. /* saw <, check <<, <<=, <= */
  169. LT1, { C_XX }, ACT(LT, S_SELFB),
  170. LT1, { '<' }, LT2,
  171. LT1, { '=' }, ACT(LEQ, S_SELF),
  172. LT2, { C_XX }, ACT(LSH, S_SELFB),
  173. LT2, { '=' }, ACT(ASLSH, S_SELF),
  174. /* saw >, check >>, >>=, >= */
  175. GT1, { C_XX }, ACT(GT, S_SELFB),
  176. GT1, { '>' }, GT2,
  177. GT1, { '=' }, ACT(GEQ, S_SELF),
  178. GT2, { C_XX }, ACT(RSH, S_SELFB),
  179. GT2, { '=' }, ACT(ASRSH, S_SELF),
  180. /* = */
  181. ASG1, { C_XX }, ACT(ASGN, S_SELFB),
  182. ASG1, { '=' }, ACT(EQ, S_SELF),
  183. /* ! */
  184. NOT1, { C_XX }, ACT(NOT, S_SELFB),
  185. NOT1, { '=' }, ACT(NEQ, S_SELF),
  186. /* & */
  187. AND1, { C_XX }, ACT(AND, S_SELFB),
  188. AND1, { '&' }, ACT(LAND, S_SELF),
  189. AND1, { '=' }, ACT(ASAND, S_SELF),
  190. /* | */
  191. OR1, { C_XX }, ACT(OR, S_SELFB),
  192. OR1, { '|' }, ACT(LOR, S_SELF),
  193. OR1, { '=' }, ACT(ASOR, S_SELF),
  194. /* # */
  195. SHARP1, { C_XX }, ACT(SHARP, S_SELFB),
  196. SHARP1, { '#' }, ACT(DSHARP, S_SELF),
  197. /* % */
  198. PCT1, { C_XX }, ACT(PCT, S_SELFB),
  199. PCT1, { '=' }, ACT(ASPCT, S_SELF),
  200. /* * */
  201. STAR1, { C_XX }, ACT(STAR, S_SELFB),
  202. STAR1, { '=' }, ACT(ASSTAR, S_SELF),
  203. /* ^ */
  204. CIRC1, { C_XX }, ACT(CIRC, S_SELFB),
  205. CIRC1, { '=' }, ACT(ASCIRC, S_SELF),
  206. -1
  207. };
  208. /* first index is char, second is state */
  209. /* increase #states to power of 2 to encourage use of shift */
  210. int16_t bigfsm[256][MAXSTATE];
  211. void
  212. expandlex(void)
  213. {
  214. /*const*/ struct fsm *fp;
  215. int i, j, nstate;
  216. for (fp = fsm; fp->state>=0; fp++) {
  217. for (i=0; fp->ch[i]; i++) {
  218. nstate = fp->nextstate;
  219. if (nstate >= S_SELF)
  220. nstate = ~nstate;
  221. switch (fp->ch[i]) {
  222. case C_XX: /* random characters */
  223. for (j=0; j<256; j++)
  224. bigfsm[j][fp->state] = nstate;
  225. continue;
  226. case C_ALPH:
  227. for (j=0; j<=256; j++)
  228. if ('a'<=j&&j<='z' || 'A'<=j&&j<='Z'
  229. || UTF2(j) || UTF3(j) || j=='_')
  230. bigfsm[j][fp->state] = nstate;
  231. continue;
  232. case C_NUM:
  233. for (j='0'; j<='9'; j++)
  234. bigfsm[j][fp->state] = nstate;
  235. continue;
  236. default:
  237. bigfsm[fp->ch[i]][fp->state] = nstate;
  238. }
  239. }
  240. }
  241. /* install special cases for ? (trigraphs), \ (splicing), runes, and EOB */
  242. for (i=0; i<MAXSTATE; i++) {
  243. for (j=0; j<0xFF; j++)
  244. if (j=='?' || j=='\\' || UTF2(j) || UTF3(j)) {
  245. if (bigfsm[j][i]>0)
  246. bigfsm[j][i] = ~bigfsm[j][i];
  247. bigfsm[j][i] &= ~QBSBIT;
  248. }
  249. bigfsm[EOB][i] = ~S_EOB;
  250. if (bigfsm[EOFC][i]>=0)
  251. bigfsm[EOFC][i] = ~S_EOF;
  252. }
  253. }
  254. void
  255. fixlex(void)
  256. {
  257. /* do C++ comments? */
  258. if (Cplusplus==0)
  259. bigfsm['/'][COM1] = bigfsm['x'][COM1];
  260. }
  261. /*
  262. * fill in a row of tokens from input, terminated by NL or END
  263. * First token is put at trp->lp.
  264. * Reset is non-zero when the input buffer can be "rewound."
  265. * The value is a flag indicating that possible macros have
  266. * been seen in the row.
  267. */
  268. int
  269. gettokens(Tokenrow *trp, int reset)
  270. {
  271. register int c, state, oldstate;
  272. register uint8_t *ip;
  273. register Token *tp, *maxp;
  274. int runelen;
  275. Source *s = cursource;
  276. int nmac = 0;
  277. extern char outbuf[];
  278. tp = trp->lp;
  279. ip = s->inp;
  280. if (reset) {
  281. s->lineinc = 0;
  282. if (ip>=s->inl) { /* nothing in buffer */
  283. s->inl = s->inb;
  284. fillbuf(s);
  285. ip = s->inp = s->inb;
  286. } else if (ip >= s->inb+(3*s->ins/4)) {
  287. memmove(s->inb, ip, 4+s->inl-ip);
  288. s->inl = s->inb+(s->inl-ip);
  289. ip = s->inp = s->inb;
  290. }
  291. }
  292. maxp = &trp->bp[trp->max];
  293. runelen = 1;
  294. for (;;) {
  295. continue2:
  296. if (tp>=maxp) {
  297. trp->lp = tp;
  298. tp = growtokenrow(trp);
  299. maxp = &trp->bp[trp->max];
  300. }
  301. tp->type = UNCLASS;
  302. tp->hideset = 0;
  303. tp->t = ip;
  304. tp->wslen = 0;
  305. tp->flag = 0;
  306. state = START;
  307. for (;;) {
  308. oldstate = state;
  309. c = *ip;
  310. if ((state = bigfsm[c][state]) >= 0) {
  311. ip += runelen;
  312. runelen = 1;
  313. continue;
  314. }
  315. state = ~state;
  316. reswitch:
  317. switch (state&0177) {
  318. case S_SELF:
  319. ip += runelen;
  320. runelen = 1;
  321. case S_SELFB:
  322. tp->type = GETACT(state);
  323. tp->len = ip - tp->t;
  324. tp++;
  325. goto continue2;
  326. case S_NAME: /* like S_SELFB but with nmac check */
  327. tp->type = NAME;
  328. tp->len = ip - tp->t;
  329. nmac |= quicklook(tp->t[0], tp->len>1?tp->t[1]:0);
  330. tp++;
  331. goto continue2;
  332. case S_WS:
  333. tp->wslen = ip - tp->t;
  334. tp->t = ip;
  335. state = START;
  336. continue;
  337. default:
  338. if ((state&QBSBIT)==0) {
  339. ip += runelen;
  340. runelen = 1;
  341. continue;
  342. }
  343. state &= ~QBSBIT;
  344. s->inp = ip;
  345. if (c=='?') { /* check trigraph */
  346. if (trigraph(s)) {
  347. state = oldstate;
  348. continue;
  349. }
  350. goto reswitch;
  351. }
  352. if (c=='\\') { /* line-folding */
  353. if (foldline(s)) {
  354. s->lineinc++;
  355. state = oldstate;
  356. continue;
  357. }
  358. goto reswitch;
  359. }
  360. if (UTF2(c)) {
  361. runelen = 2;
  362. goto reswitch;
  363. }
  364. if (UTF3(c)) {
  365. runelen = 3;
  366. goto reswitch;
  367. }
  368. error(WARNING, "Lexical botch in cpp");
  369. ip += runelen;
  370. runelen = 1;
  371. continue;
  372. case S_EOB:
  373. s->inp = ip;
  374. fillbuf(cursource);
  375. state = oldstate;
  376. continue;
  377. case S_EOF:
  378. tp->type = END;
  379. tp->len = 0;
  380. s->inp = ip;
  381. if (tp!=trp->bp && (tp-1)->type!=NL && cursource->fd!=-1)
  382. error(WARNING,"No newline at end of file");
  383. trp->lp = tp+1;
  384. return nmac;
  385. case S_STNL:
  386. error(ERROR, "Unterminated string or char const");
  387. case S_NL:
  388. tp->t = ip;
  389. tp->type = NL;
  390. tp->len = 1;
  391. tp->wslen = 0;
  392. s->lineinc++;
  393. s->inp = ip+1;
  394. trp->lp = tp+1;
  395. return nmac;
  396. case S_EOFSTR:
  397. error(FATAL, "EOF in string or char constant");
  398. break;
  399. case S_COMNL:
  400. s->lineinc++;
  401. state = COM2;
  402. ip += runelen;
  403. runelen = 1;
  404. if (ip >= s->inb+(7*s->ins/8)) { /* very long comment */
  405. memmove(tp->t, ip, 4+s->inl-ip);
  406. s->inl -= ip-tp->t;
  407. ip = tp->t+1;
  408. }
  409. continue;
  410. case S_EOFCOM:
  411. error(WARNING, "EOF inside comment");
  412. --ip;
  413. case S_COMMENT:
  414. ++ip;
  415. tp->t = ip;
  416. tp->t[-1] = ' ';
  417. tp->wslen = 1;
  418. state = START;
  419. continue;
  420. }
  421. break;
  422. }
  423. ip += runelen;
  424. runelen = 1;
  425. tp->len = ip - tp->t;
  426. tp++;
  427. }
  428. }
  429. /* have seen ?; handle the trigraph it starts (if any) else 0 */
  430. int
  431. trigraph(Source *s)
  432. {
  433. int c;
  434. while (s->inp+2 >= s->inl && fillbuf(s)!=EOF)
  435. ;
  436. if (s->inp[1]!='?')
  437. return 0;
  438. c = 0;
  439. switch(s->inp[2]) {
  440. case '=':
  441. c = '#'; break;
  442. case '(':
  443. c = '['; break;
  444. case '/':
  445. c = '\\'; break;
  446. case ')':
  447. c = ']'; break;
  448. case '\'':
  449. c = '^'; break;
  450. case '<':
  451. c = '{'; break;
  452. case '!':
  453. c = '|'; break;
  454. case '>':
  455. c = '}'; break;
  456. case '-':
  457. c = '~'; break;
  458. }
  459. if (c) {
  460. *s->inp = c;
  461. memmove(s->inp+1, s->inp+3, s->inl-s->inp+2);
  462. s->inl -= 2;
  463. }
  464. return c;
  465. }
  466. int
  467. foldline(Source *s)
  468. {
  469. int ncr = 0;
  470. recheck:
  471. while (s->inp+1 >= s->inl && fillbuf(s)!=EOF)
  472. ;
  473. if (s->inp[ncr+1] == '\r') { /* nonstandardly, ignore CR before line-folding */
  474. ncr++;
  475. goto recheck;
  476. }
  477. if (s->inp[ncr+1] == '\n') {
  478. memmove(s->inp, s->inp+2+ncr, s->inl-s->inp+3-ncr);
  479. s->inl -= 2+ncr;
  480. return 1;
  481. }
  482. return 0;
  483. }
  484. int
  485. fillbuf(Source *s)
  486. {
  487. int n;
  488. while((char *)s->inl+s->ins/8 > (char *)s->inb+s->ins) {
  489. int l = s->inl - s->inb;
  490. int p = s->inp - s->inb;
  491. if(l < 0)
  492. error(FATAL, "negative end of input!?");
  493. if(p < 0)
  494. error(FATAL, "negative input pointer!?");
  495. /* double the buffer size and try again */
  496. s->ins *= 2;
  497. s->inb = dorealloc(s->inb, s->ins);
  498. s->inl = s->inb + l;
  499. s->inp = s->inb + p;
  500. }
  501. if (s->fd<0 || (n=read(s->fd, (char *)s->inl, s->ins/8)) <= 0)
  502. n = 0;
  503. if ((*s->inp&0xff) == EOB) /* sentinel character appears in input */
  504. *s->inp = EOFC;
  505. s->inl += n;
  506. s->inl[0] = s->inl[1]= s->inl[2]= s->inl[3] = EOB;
  507. if (n==0) {
  508. s->inl[0] = s->inl[1]= s->inl[2]= s->inl[3] = EOFC;
  509. return EOF;
  510. }
  511. return 0;
  512. }
  513. /*
  514. * Push down to new source of characters.
  515. * If fd>0 and str==NULL, then from a file `name';
  516. * if fd==-1 and str, then from the string.
  517. */
  518. Source *
  519. setsource(char *name, int fd, char *str)
  520. {
  521. Source *s = new(Source);
  522. int len;
  523. s->line = 1;
  524. s->lineinc = 0;
  525. s->fd = fd;
  526. s->filename = name;
  527. s->next = cursource;
  528. s->ifdepth = 0;
  529. cursource = s;
  530. /* slop at right for EOB */
  531. if (str) {
  532. len = strlen(str);
  533. s->inb = domalloc(len+4);
  534. s->inp = s->inb;
  535. strncpy((char *)s->inp, str, len);
  536. } else {
  537. Dir *d;
  538. int junk;
  539. uint32_t length = 0;
  540. d = dirfstat(fd);
  541. if (d != nil) {
  542. length = d->length;
  543. free(d);
  544. }
  545. junk = length;
  546. if (junk<INS)
  547. junk = INS;
  548. s->inb = domalloc((junk)+4);
  549. s->inp = s->inb;
  550. len = 0;
  551. }
  552. s->ins = INS;
  553. s->inl = s->inp+len;
  554. s->inl[0] = s->inl[1] = EOB;
  555. return s;
  556. }
  557. void
  558. unsetsource(void)
  559. {
  560. Source *s = cursource;
  561. if (s->fd>=0) {
  562. close(s->fd);
  563. dofree(s->inb);
  564. }
  565. cursource = s->next;
  566. dofree(s);
  567. }