lex.c 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623
  1. #include <u.h>
  2. #include <libc.h>
  3. #include <stdio.h>
  4. #include "cpp.h"
  5. /*
  6. * lexical FSM encoding
  7. * when in state state, and one of the characters
  8. * in ch arrives, enter nextstate.
  9. * States >= S_SELF are either final, or at least require special action.
  10. * In 'fsm' there is a line for each state X charset X nextstate.
  11. * List chars that overwrite previous entries later (e.g. C_ALPH
  12. * can be overridden by '_' by a later entry; and C_XX is the
  13. * the universal set, and should always be first.
  14. * States above S_SELF are represented in the big table as negative values.
  15. * S_SELF and S_SELFB encode the resulting token type in the upper bits.
  16. * These actions differ in that S_SELF doesn't have a lookahead char,
  17. * S_SELFB does.
  18. *
  19. * The encoding is blown out into a big table for time-efficiency.
  20. * Entries have
  21. * nextstate: 6 bits; ?\ marker: 1 bit; tokentype: 9 bits.
  22. */
  23. #define MAXSTATE 32
  24. #define ACT(tok,act) ((tok<<7)+act)
  25. #define QBSBIT 0100
  26. #define GETACT(st) (st>>7)&0x1ff
  27. #define UTF2(c) ((c)>=0xA0 && (c)<0xE0) /* 2-char UTF seq */
  28. #define UTF3(c) ((c)>=0xE0 && (c)<0xF0) /* 3-char UTF seq */
  29. /* character classes */
  30. #define C_WS 1
  31. #define C_ALPH 2
  32. #define C_NUM 3
  33. #define C_EOF 4
  34. #define C_XX 5
  35. enum state {
  36. START=0, NUM1, NUM2, NUM3, ID1, ST1, ST2, ST3, COM1, COM2, COM3, COM4,
  37. CC1, CC2, WS1, PLUS1, MINUS1, STAR1, SLASH1, PCT1, SHARP1,
  38. CIRC1, GT1, GT2, LT1, LT2, OR1, AND1, ASG1, NOT1, DOTS1,
  39. S_SELF=MAXSTATE, S_SELFB, S_EOF, S_NL, S_EOFSTR,
  40. S_STNL, S_COMNL, S_EOFCOM, S_COMMENT, S_EOB, S_WS, S_NAME
  41. };
  42. int tottok;
  43. int tokkind[256];
  44. struct fsm {
  45. int state; /* if in this state */
  46. uchar ch[4]; /* and see one of these characters */
  47. int nextstate; /* enter this state if +ve */
  48. };
  49. /*const*/ struct fsm fsm[] = {
  50. /* start state */
  51. START, { C_XX }, ACT(UNCLASS,S_SELF),
  52. START, { ' ', '\t', '\v', '\r' }, WS1,
  53. START, { C_NUM }, NUM1,
  54. START, { '.' }, NUM3,
  55. START, { C_ALPH }, ID1,
  56. START, { 'L' }, ST1,
  57. START, { '"' }, ST2,
  58. START, { '\'' }, CC1,
  59. START, { '/' }, COM1,
  60. START, { EOFC }, S_EOF,
  61. START, { '\n' }, S_NL,
  62. START, { '-' }, MINUS1,
  63. START, { '+' }, PLUS1,
  64. START, { '<' }, LT1,
  65. START, { '>' }, GT1,
  66. START, { '=' }, ASG1,
  67. START, { '!' }, NOT1,
  68. START, { '&' }, AND1,
  69. START, { '|' }, OR1,
  70. START, { '#' }, SHARP1,
  71. START, { '%' }, PCT1,
  72. START, { '[' }, ACT(SBRA,S_SELF),
  73. START, { ']' }, ACT(SKET,S_SELF),
  74. START, { '(' }, ACT(LP,S_SELF),
  75. START, { ')' }, ACT(RP,S_SELF),
  76. START, { '*' }, STAR1,
  77. START, { ',' }, ACT(COMMA,S_SELF),
  78. START, { '?' }, ACT(QUEST,S_SELF),
  79. START, { ':' }, ACT(COLON,S_SELF),
  80. START, { ';' }, ACT(SEMIC,S_SELF),
  81. START, { '{' }, ACT(CBRA,S_SELF),
  82. START, { '}' }, ACT(CKET,S_SELF),
  83. START, { '~' }, ACT(TILDE,S_SELF),
  84. START, { '^' }, CIRC1,
  85. /* saw a digit */
  86. NUM1, { C_XX }, ACT(NUMBER,S_SELFB),
  87. NUM1, { C_NUM, C_ALPH, '.' }, NUM1,
  88. NUM1, { 'E', 'e' }, NUM2,
  89. NUM1, { '_' }, ACT(NUMBER,S_SELFB),
  90. /* saw possible start of exponent, digits-e */
  91. NUM2, { C_XX }, ACT(NUMBER,S_SELFB),
  92. NUM2, { '+', '-' }, NUM1,
  93. NUM2, { C_NUM, C_ALPH }, NUM1,
  94. NUM2, { '_' }, ACT(NUMBER,S_SELFB),
  95. /* saw a '.', which could be a number or an operator */
  96. NUM3, { C_XX }, ACT(DOT,S_SELFB),
  97. NUM3, { '.' }, DOTS1,
  98. NUM3, { C_NUM }, NUM1,
  99. DOTS1, { C_XX }, ACT(UNCLASS, S_SELFB),
  100. DOTS1, { C_NUM }, NUM1,
  101. DOTS1, { '.' }, ACT(ELLIPS, S_SELF),
  102. /* saw a letter or _ */
  103. ID1, { C_XX }, ACT(NAME,S_NAME),
  104. ID1, { C_ALPH, C_NUM }, ID1,
  105. /* saw L (start of wide string?) */
  106. ST1, { C_XX }, ACT(NAME,S_NAME),
  107. ST1, { C_ALPH, C_NUM }, ID1,
  108. ST1, { '"' }, ST2,
  109. ST1, { '\'' }, CC1,
  110. /* saw " beginning string */
  111. ST2, { C_XX }, ST2,
  112. ST2, { '"' }, ACT(STRING, S_SELF),
  113. ST2, { '\\' }, ST3,
  114. ST2, { '\n' }, S_STNL,
  115. ST2, { EOFC }, S_EOFSTR,
  116. /* saw \ in string */
  117. ST3, { C_XX }, ST2,
  118. ST3, { '\n' }, S_STNL,
  119. ST3, { EOFC }, S_EOFSTR,
  120. /* saw ' beginning character const */
  121. CC1, { C_XX }, CC1,
  122. CC1, { '\'' }, ACT(CCON, S_SELF),
  123. CC1, { '\\' }, CC2,
  124. CC1, { '\n' }, S_STNL,
  125. CC1, { EOFC }, S_EOFSTR,
  126. /* saw \ in ccon */
  127. CC2, { C_XX }, CC1,
  128. CC2, { '\n' }, S_STNL,
  129. CC2, { EOFC }, S_EOFSTR,
  130. /* saw /, perhaps start of comment */
  131. COM1, { C_XX }, ACT(SLASH, S_SELFB),
  132. COM1, { '=' }, ACT(ASSLASH, S_SELF),
  133. COM1, { '*' }, COM2,
  134. COM1, { '/' }, COM4,
  135. /* saw "/*", start of comment */
  136. COM2, { C_XX }, COM2,
  137. COM2, { '\n' }, S_COMNL,
  138. COM2, { '*' }, COM3,
  139. COM2, { EOFC }, S_EOFCOM,
  140. /* saw the * possibly ending a comment */
  141. COM3, { C_XX }, COM2,
  142. COM3, { '\n' }, S_COMNL,
  143. COM3, { '*' }, COM3,
  144. COM3, { '/' }, S_COMMENT,
  145. COM3, { EOFC }, S_EOFCOM,
  146. /* // comment */
  147. COM4, { C_XX }, COM4,
  148. COM4, { '\n' }, S_NL,
  149. COM4, { EOFC }, S_EOFCOM,
  150. /* saw white space, eat it up */
  151. WS1, { C_XX }, S_WS,
  152. WS1, { ' ', '\t', '\v', '\r'}, WS1,
  153. /* saw -, check --, -=, -> */
  154. MINUS1, { C_XX }, ACT(MINUS, S_SELFB),
  155. MINUS1, { '-' }, ACT(MMINUS, S_SELF),
  156. MINUS1, { '=' }, ACT(ASMINUS,S_SELF),
  157. MINUS1, { '>' }, ACT(ARROW,S_SELF),
  158. /* saw +, check ++, += */
  159. PLUS1, { C_XX }, ACT(PLUS, S_SELFB),
  160. PLUS1, { '+' }, ACT(PPLUS, S_SELF),
  161. PLUS1, { '=' }, ACT(ASPLUS, S_SELF),
  162. /* saw <, check <<, <<=, <= */
  163. LT1, { C_XX }, ACT(LT, S_SELFB),
  164. LT1, { '<' }, LT2,
  165. LT1, { '=' }, ACT(LEQ, S_SELF),
  166. LT2, { C_XX }, ACT(LSH, S_SELFB),
  167. LT2, { '=' }, ACT(ASLSH, S_SELF),
  168. /* saw >, check >>, >>=, >= */
  169. GT1, { C_XX }, ACT(GT, S_SELFB),
  170. GT1, { '>' }, GT2,
  171. GT1, { '=' }, ACT(GEQ, S_SELF),
  172. GT2, { C_XX }, ACT(RSH, S_SELFB),
  173. GT2, { '=' }, ACT(ASRSH, S_SELF),
  174. /* = */
  175. ASG1, { C_XX }, ACT(ASGN, S_SELFB),
  176. ASG1, { '=' }, ACT(EQ, S_SELF),
  177. /* ! */
  178. NOT1, { C_XX }, ACT(NOT, S_SELFB),
  179. NOT1, { '=' }, ACT(NEQ, S_SELF),
  180. /* & */
  181. AND1, { C_XX }, ACT(AND, S_SELFB),
  182. AND1, { '&' }, ACT(LAND, S_SELF),
  183. AND1, { '=' }, ACT(ASAND, S_SELF),
  184. /* | */
  185. OR1, { C_XX }, ACT(OR, S_SELFB),
  186. OR1, { '|' }, ACT(LOR, S_SELF),
  187. OR1, { '=' }, ACT(ASOR, S_SELF),
  188. /* # */
  189. SHARP1, { C_XX }, ACT(SHARP, S_SELFB),
  190. SHARP1, { '#' }, ACT(DSHARP, S_SELF),
  191. /* % */
  192. PCT1, { C_XX }, ACT(PCT, S_SELFB),
  193. PCT1, { '=' }, ACT(ASPCT, S_SELF),
  194. /* * */
  195. STAR1, { C_XX }, ACT(STAR, S_SELFB),
  196. STAR1, { '=' }, ACT(ASSTAR, S_SELF),
  197. /* ^ */
  198. CIRC1, { C_XX }, ACT(CIRC, S_SELFB),
  199. CIRC1, { '=' }, ACT(ASCIRC, S_SELF),
  200. -1
  201. };
  202. /* first index is char, second is state */
  203. /* increase #states to power of 2 to encourage use of shift */
  204. short bigfsm[256][MAXSTATE];
  205. void
  206. expandlex(void)
  207. {
  208. /*const*/ struct fsm *fp;
  209. int i, j, nstate;
  210. for (fp = fsm; fp->state>=0; fp++) {
  211. for (i=0; fp->ch[i]; i++) {
  212. nstate = fp->nextstate;
  213. if (nstate >= S_SELF)
  214. nstate = ~nstate;
  215. switch (fp->ch[i]) {
  216. case C_XX: /* random characters */
  217. for (j=0; j<256; j++)
  218. bigfsm[j][fp->state] = nstate;
  219. continue;
  220. case C_ALPH:
  221. for (j=0; j<=256; j++)
  222. if ('a'<=j&&j<='z' || 'A'<=j&&j<='Z'
  223. || UTF2(j) || UTF3(j) || j=='_')
  224. bigfsm[j][fp->state] = nstate;
  225. continue;
  226. case C_NUM:
  227. for (j='0'; j<='9'; j++)
  228. bigfsm[j][fp->state] = nstate;
  229. continue;
  230. default:
  231. bigfsm[fp->ch[i]][fp->state] = nstate;
  232. }
  233. }
  234. }
  235. /* install special cases for ? (trigraphs), \ (splicing), runes, and EOB */
  236. for (i=0; i<MAXSTATE; i++) {
  237. for (j=0; j<0xFF; j++)
  238. if (j=='?' || j=='\\' || UTF2(j) || UTF3(j)) {
  239. if (bigfsm[j][i]>0)
  240. bigfsm[j][i] = ~bigfsm[j][i];
  241. bigfsm[j][i] &= ~QBSBIT;
  242. }
  243. bigfsm[EOB][i] = ~S_EOB;
  244. if (bigfsm[EOFC][i]>=0)
  245. bigfsm[EOFC][i] = ~S_EOF;
  246. }
  247. }
  248. void
  249. fixlex(void)
  250. {
  251. /* do C++ comments? */
  252. if (Cplusplus==0)
  253. bigfsm['/'][COM1] = bigfsm['x'][COM1];
  254. }
  255. /*
  256. * fill in a row of tokens from input, terminated by NL or END
  257. * First token is put at trp->lp.
  258. * Reset is non-zero when the input buffer can be "rewound."
  259. * The value is a flag indicating that possible macros have
  260. * been seen in the row.
  261. */
  262. int
  263. gettokens(Tokenrow *trp, int reset)
  264. {
  265. register int c, state, oldstate;
  266. register uchar *ip;
  267. register Token *tp, *maxp;
  268. int runelen;
  269. Source *s = cursource;
  270. int nmac = 0;
  271. extern char outbuf[];
  272. tp = trp->lp;
  273. ip = s->inp;
  274. if (reset) {
  275. s->lineinc = 0;
  276. if (ip>=s->inl) { /* nothing in buffer */
  277. s->inl = s->inb;
  278. fillbuf(s);
  279. ip = s->inp = s->inb;
  280. } else if (ip >= s->inb+(3*s->ins/4)) {
  281. memmove(s->inb, ip, 4+s->inl-ip);
  282. s->inl = s->inb+(s->inl-ip);
  283. ip = s->inp = s->inb;
  284. }
  285. }
  286. maxp = &trp->bp[trp->max];
  287. runelen = 1;
  288. for (;;) {
  289. continue2:
  290. if (tp>=maxp) {
  291. trp->lp = tp;
  292. tp = growtokenrow(trp);
  293. maxp = &trp->bp[trp->max];
  294. }
  295. tp->type = UNCLASS;
  296. tp->hideset = 0;
  297. tp->t = ip;
  298. tp->wslen = 0;
  299. tp->flag = 0;
  300. state = START;
  301. for (;;) {
  302. oldstate = state;
  303. c = *ip;
  304. if ((state = bigfsm[c][state]) >= 0) {
  305. ip += runelen;
  306. runelen = 1;
  307. continue;
  308. }
  309. state = ~state;
  310. reswitch:
  311. switch (state&0177) {
  312. case S_SELF:
  313. ip += runelen;
  314. runelen = 1;
  315. case S_SELFB:
  316. tp->type = GETACT(state);
  317. tp->len = ip - tp->t;
  318. tp++;
  319. goto continue2;
  320. case S_NAME: /* like S_SELFB but with nmac check */
  321. tp->type = NAME;
  322. tp->len = ip - tp->t;
  323. nmac |= quicklook(tp->t[0], tp->len>1?tp->t[1]:0);
  324. tp++;
  325. goto continue2;
  326. case S_WS:
  327. tp->wslen = ip - tp->t;
  328. tp->t = ip;
  329. state = START;
  330. continue;
  331. default:
  332. if ((state&QBSBIT)==0) {
  333. ip += runelen;
  334. runelen = 1;
  335. continue;
  336. }
  337. state &= ~QBSBIT;
  338. s->inp = ip;
  339. if (c=='?') { /* check trigraph */
  340. if (trigraph(s)) {
  341. state = oldstate;
  342. continue;
  343. }
  344. goto reswitch;
  345. }
  346. if (c=='\\') { /* line-folding */
  347. if (foldline(s)) {
  348. s->lineinc++;
  349. state = oldstate;
  350. continue;
  351. }
  352. goto reswitch;
  353. }
  354. if (UTF2(c)) {
  355. runelen = 2;
  356. goto reswitch;
  357. }
  358. if (UTF3(c)) {
  359. runelen = 3;
  360. goto reswitch;
  361. }
  362. error(WARNING, "Lexical botch in cpp");
  363. ip += runelen;
  364. runelen = 1;
  365. continue;
  366. case S_EOB:
  367. s->inp = ip;
  368. fillbuf(cursource);
  369. state = oldstate;
  370. continue;
  371. case S_EOF:
  372. tp->type = END;
  373. tp->len = 0;
  374. s->inp = ip;
  375. if (tp!=trp->bp && (tp-1)->type!=NL && cursource->fd!=-1)
  376. error(WARNING,"No newline at end of file");
  377. trp->lp = tp+1;
  378. return nmac;
  379. case S_STNL:
  380. error(ERROR, "Unterminated string or char const");
  381. case S_NL:
  382. tp->t = ip;
  383. tp->type = NL;
  384. tp->len = 1;
  385. tp->wslen = 0;
  386. s->lineinc++;
  387. s->inp = ip+1;
  388. trp->lp = tp+1;
  389. return nmac;
  390. case S_EOFSTR:
  391. error(FATAL, "EOF in string or char constant");
  392. break;
  393. case S_COMNL:
  394. s->lineinc++;
  395. state = COM2;
  396. ip += runelen;
  397. runelen = 1;
  398. if (ip >= s->inb+(7*s->ins/8)) { /* very long comment */
  399. memmove(tp->t, ip, 4+s->inl-ip);
  400. s->inl -= ip-tp->t;
  401. ip = tp->t+1;
  402. }
  403. continue;
  404. case S_EOFCOM:
  405. error(WARNING, "EOF inside comment");
  406. --ip;
  407. case S_COMMENT:
  408. ++ip;
  409. tp->t = ip;
  410. tp->t[-1] = ' ';
  411. tp->wslen = 1;
  412. state = START;
  413. continue;
  414. }
  415. break;
  416. }
  417. ip += runelen;
  418. runelen = 1;
  419. tp->len = ip - tp->t;
  420. tp++;
  421. }
  422. }
  423. /* have seen ?; handle the trigraph it starts (if any) else 0 */
  424. int
  425. trigraph(Source *s)
  426. {
  427. int c;
  428. while (s->inp+2 >= s->inl && fillbuf(s)!=EOF)
  429. ;
  430. if (s->inp[1]!='?')
  431. return 0;
  432. c = 0;
  433. switch(s->inp[2]) {
  434. case '=':
  435. c = '#'; break;
  436. case '(':
  437. c = '['; break;
  438. case '/':
  439. c = '\\'; break;
  440. case ')':
  441. c = ']'; break;
  442. case '\'':
  443. c = '^'; break;
  444. case '<':
  445. c = '{'; break;
  446. case '!':
  447. c = '|'; break;
  448. case '>':
  449. c = '}'; break;
  450. case '-':
  451. c = '~'; break;
  452. }
  453. if (c) {
  454. *s->inp = c;
  455. memmove(s->inp+1, s->inp+3, s->inl-s->inp+2);
  456. s->inl -= 2;
  457. }
  458. return c;
  459. }
  460. int
  461. foldline(Source *s)
  462. {
  463. int ncr = 0;
  464. recheck:
  465. while (s->inp+1 >= s->inl && fillbuf(s)!=EOF)
  466. ;
  467. if (s->inp[ncr+1] == '\r') { /* nonstandardly, ignore CR before line-folding */
  468. ncr++;
  469. goto recheck;
  470. }
  471. if (s->inp[ncr+1] == '\n') {
  472. memmove(s->inp, s->inp+2+ncr, s->inl-s->inp+3-ncr);
  473. s->inl -= 2+ncr;
  474. return 1;
  475. }
  476. return 0;
  477. }
  478. int
  479. fillbuf(Source *s)
  480. {
  481. int n;
  482. while((char *)s->inl+s->ins/8 > (char *)s->inb+s->ins) {
  483. int l = s->inl - s->inb;
  484. int p = s->inp - s->inb;
  485. if(l < 0)
  486. error(FATAL, "negative end of input!?");
  487. if(p < 0)
  488. error(FATAL, "negative input pointer!?");
  489. /* double the buffer size and try again */
  490. s->ins *= 2;
  491. s->inb = dorealloc(s->inb, s->ins);
  492. s->inl = s->inb + l;
  493. s->inp = s->inb + p;
  494. }
  495. if (s->fd<0 || (n=read(s->fd, (char *)s->inl, s->ins/8)) <= 0)
  496. n = 0;
  497. if ((*s->inp&0xff) == EOB) /* sentinel character appears in input */
  498. *s->inp = EOFC;
  499. s->inl += n;
  500. s->inl[0] = s->inl[1]= s->inl[2]= s->inl[3] = EOB;
  501. if (n==0) {
  502. s->inl[0] = s->inl[1]= s->inl[2]= s->inl[3] = EOFC;
  503. return EOF;
  504. }
  505. return 0;
  506. }
  507. /*
  508. * Push down to new source of characters.
  509. * If fd>0 and str==NULL, then from a file `name';
  510. * if fd==-1 and str, then from the string.
  511. */
  512. Source *
  513. setsource(char *name, int fd, char *str)
  514. {
  515. Source *s = new(Source);
  516. int len;
  517. s->line = 1;
  518. s->lineinc = 0;
  519. s->fd = fd;
  520. s->filename = name;
  521. s->next = cursource;
  522. s->ifdepth = 0;
  523. cursource = s;
  524. /* slop at right for EOB */
  525. if (str) {
  526. len = strlen(str);
  527. s->inb = domalloc(len+4);
  528. s->inp = s->inb;
  529. strncpy((char *)s->inp, str, len);
  530. } else {
  531. Dir *d;
  532. int junk;
  533. ulong length = 0;
  534. d = dirfstat(fd);
  535. if (d != nil) {
  536. length = d->length;
  537. free(d);
  538. }
  539. junk = length;
  540. if (junk<INS)
  541. junk = INS;
  542. s->inb = domalloc((junk)+4);
  543. s->inp = s->inb;
  544. len = 0;
  545. }
  546. s->ins = INS;
  547. s->inl = s->inp+len;
  548. s->inl[0] = s->inl[1] = EOB;
  549. return s;
  550. }
  551. void
  552. unsetsource(void)
  553. {
  554. Source *s = cursource;
  555. if (s->fd>=0) {
  556. close(s->fd);
  557. dofree(s->inb);
  558. }
  559. cursource = s->next;
  560. dofree(s);
  561. }