field.c 7.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472
  1. #include <u.h>
  2. #include <libc.h>
  3. #include <bio.h>
  4. #include <ctype.h>
  5. #include <regexp.h>
  6. typedef struct Range Range;
  7. typedef struct Slice Slice;
  8. typedef struct Slices Slices;
  9. typedef struct Token Token;
  10. struct Range {
  11. int begin;
  12. int end;
  13. };
  14. struct Slice {
  15. char *begin;
  16. char *end;
  17. };
  18. #pragma varargck type "S" Slice
  19. struct Slices {
  20. uint len;
  21. uint size;
  22. Slice *slices;
  23. };
  24. struct Token {
  25. int type;
  26. Slice slice;
  27. };
  28. enum {
  29. NF = 0x7FFFFFFF
  30. };
  31. Biobuf bin;
  32. Biobuf bout;
  33. int guesscollapse(const char *sep);
  34. int Sfmt(Fmt *f);
  35. Slice lex(char **sp);
  36. Slice next(char **sp);
  37. Slice peek(void);
  38. void extend(Slice *slice, char **sp);
  39. int tiseof(Slice *tok);
  40. int tisdelim(Slice *tok);
  41. int tisspace(Slice *tok);
  42. int parseranges(char *src, Range **rv);
  43. Range parserange(char **sp);
  44. int stoi(Slice slice);
  45. int parsenum(char **s);
  46. void process(Biobuf *b, int rc, Range *rv, Reprog *delim, char *sep, int collapse);
  47. void pprefix(char *prefix);
  48. uint split(char *line, Reprog *delim, Slices *ss, int collapse);
  49. void reset(Slices *ss);
  50. void append(Slices *ss, char *begin, char *end);
  51. void usage(void);
  52. void
  53. main(int argc, char *argv[])
  54. {
  55. Range *rv;
  56. char *filename, *insep, *outsep;
  57. Reprog *delim;
  58. int rc, collapse, eflag, Eflag, oflag, zflag;
  59. insep = "[ \t\v\r]+";
  60. outsep = " ";
  61. Binit(&bin, 0, OREAD);
  62. Binit(&bout, 1, OWRITE);
  63. fmtinstall('S', Sfmt);
  64. zflag = 0;
  65. eflag = 0;
  66. Eflag = 0;
  67. oflag = 0;
  68. ARGBEGIN {
  69. case '0':
  70. outsep = "";
  71. zflag = 1;
  72. break;
  73. case 'e':
  74. eflag = 1;
  75. break;
  76. case 'E':
  77. Eflag = 1;
  78. break;
  79. case 'F':
  80. insep = EARGF(usage());
  81. break;
  82. case 'O':
  83. oflag = 1;
  84. outsep = EARGF(usage());
  85. break;
  86. default:
  87. usage();
  88. break;
  89. } ARGEND;
  90. if (eflag && Eflag) {
  91. fprint(2, "flag conflict: -e and -E are mutually exclusive\n");
  92. usage();
  93. }
  94. if (oflag && zflag) {
  95. fprint(2, "flag conflict: -0 and -O are mutually exclusive\n");
  96. usage();
  97. }
  98. if (argc <= 0)
  99. usage();
  100. delim = regcomp(insep);
  101. if (delim == nil)
  102. sysfatal("bad input separator regexp '%s': %r", insep);
  103. rv = nil;
  104. rc = parseranges(*argv++, &rv);
  105. if (rc < 0)
  106. sysfatal("parseranges failed");
  107. collapse = guesscollapse(insep);
  108. if (eflag)
  109. collapse = 0;
  110. if (Eflag)
  111. collapse = 1;
  112. if (*argv == nil) {
  113. process(&bin, rc, rv, delim, outsep, collapse);
  114. } else while ((filename = *argv++) != nil) {
  115. Biobuf *b;
  116. if (strcmp(filename, "-") == 0) {
  117. process(&bin, rc, rv, delim, outsep, collapse);
  118. continue;
  119. }
  120. b = Bopen(filename, OREAD);
  121. if (b == nil)
  122. sysfatal("failure opening '%s': %r", filename);
  123. process(b, rc, rv, delim, outsep, collapse);
  124. Bterm(b);
  125. }
  126. exits(0);
  127. }
  128. int
  129. guesscollapse(const char *sep)
  130. {
  131. int len = utflen(sep);
  132. return len > 1 && (len != 2 || *sep != '\\');
  133. }
  134. int
  135. Sfmt(Fmt *f)
  136. {
  137. Slice s = va_arg(f->args, Slice);
  138. if (s.begin == nil || s.end == nil)
  139. return 0;
  140. return fmtprint(f, "%.*s", s.end - s.begin, s.begin);
  141. }
  142. /*
  143. * The field selection syntax is:
  144. *
  145. * fields := range [[delim] fields]
  146. * range := field | NUM '-' [field]
  147. * field := NUM | 'NF'
  148. * delim := ws+ | '|' | ','
  149. * ws := c such that `isspace(c)` is true.
  150. */
  151. Slice
  152. lex(char **sp)
  153. {
  154. char *s;
  155. Slice slice;
  156. memset(&slice, 0, sizeof(slice));
  157. s = *sp;
  158. slice.begin = s;
  159. while (isspace(*s))
  160. s++;
  161. if (s == *sp) {
  162. switch (*s) {
  163. case '\0':
  164. slice.begin = nil;
  165. break;
  166. case '-':
  167. s++;
  168. break;
  169. case 'N':
  170. if (*++s == 'F')
  171. s++;
  172. break;
  173. case ',':
  174. case '|':
  175. s++;
  176. break;
  177. default:
  178. if (!isdigit(*s))
  179. sysfatal("lexical error, c = %c", *s);
  180. while (isdigit(*s))
  181. s++;
  182. break;
  183. }
  184. }
  185. slice.end = s;
  186. *sp = s;
  187. return slice;
  188. }
  189. Slice current;
  190. Slice
  191. peek()
  192. {
  193. return current;
  194. }
  195. Slice
  196. next(char **sp)
  197. {
  198. Slice tok = peek();
  199. current = lex(sp);
  200. return tok;
  201. }
  202. void
  203. extend(Slice *slice, char **sp)
  204. {
  205. Slice tok = next(sp);
  206. slice->end = tok.end;
  207. }
  208. int
  209. stoi(Slice slice)
  210. {
  211. char *s;
  212. int n = 0, sign = 1;
  213. s = slice.begin;
  214. if (*s == '-') {
  215. sign = -1;
  216. s++;
  217. }
  218. for (; s != slice.end; s++) {
  219. if (!isdigit(*s))
  220. sysfatal("stoi: bad number in '%S', c = %c", slice, *s);
  221. n = n * 10 + (*s - '0');
  222. }
  223. return sign * n;
  224. }
  225. int
  226. tiseof(Slice *tok)
  227. {
  228. return tok == nil || tok->begin == nil;
  229. }
  230. int
  231. tisdelim(Slice *tok)
  232. {
  233. return tiseof(tok) || tisspace(tok) || *tok->begin == ',' || *tok->begin == '|';
  234. }
  235. int
  236. tisspace(Slice *tok)
  237. {
  238. return !tiseof(tok) && isspace(*tok->begin);
  239. }
  240. int
  241. parseranges(char *src, Range **rv)
  242. {
  243. char *s;
  244. Range *rs, *t;
  245. int n, m;
  246. Slice tok;
  247. rs = nil;
  248. m = 0;
  249. n = 0;
  250. s = src;
  251. if (s == nil || *s == '\0')
  252. return -1;
  253. next(&s);
  254. do {
  255. tok = peek();
  256. while (tisspace(&tok))
  257. tok = next(&s);
  258. Range r = parserange(&s);
  259. if (n >= m) {
  260. m = 2*m;
  261. if (m == 0)
  262. m = 1;
  263. t = realloc(rs, sizeof(Range) * m);
  264. if (t == nil)
  265. sysfatal("realloc failed parsing ranges");
  266. rs = t;
  267. }
  268. rs[n++] = r;
  269. tok = next(&s);
  270. if (!tisdelim(&tok))
  271. sysfatal("syntax error in field list");
  272. } while (!tiseof(&tok));
  273. *rv = rs;
  274. return n;
  275. }
  276. int
  277. tokeq(Slice *tok, const char *s)
  278. {
  279. return !tiseof(tok) && !strncmp(tok->begin, s, tok->end - tok->begin);
  280. }
  281. Range
  282. parserange(char **sp)
  283. {
  284. Range range;
  285. Slice tok;
  286. range.begin = range.end = NF;
  287. tok = peek();
  288. if (tokeq(&tok, "NF")) {
  289. next(sp);
  290. return range;
  291. }
  292. range.begin = range.end = parsenum(sp);
  293. tok = peek();
  294. if (tokeq(&tok, "-")) {
  295. next(sp);
  296. range.end = NF;
  297. tok = peek();
  298. if (tokeq(&tok, "NF")) {
  299. next(sp);
  300. return range;
  301. }
  302. if (!tiseof(&tok) && !tisdelim(&tok))
  303. range.end = parsenum(sp);
  304. }
  305. return range;
  306. }
  307. int
  308. parsenum(char **sp)
  309. {
  310. Slice tok;
  311. tok = next(sp);
  312. if (tiseof(&tok))
  313. sysfatal("EOF in number parser");
  314. if (isdigit(*tok.begin))
  315. return stoi(tok);
  316. if (*tok.begin != '-')
  317. sysfatal("number parse error: unexpected '%S'", tok);
  318. extend(&tok, sp);
  319. if (!isdigit(*(tok.begin + 1)))
  320. sysfatal("negative number parse error: unspected '%S'", tok);
  321. return stoi(tok);
  322. }
  323. void
  324. process(Biobuf *b, int rc, Range *rv, Reprog *delim, char *outsep, int collapse)
  325. {
  326. char *line, *prefix;
  327. const int nulldelim = 1;
  328. Slice *s;
  329. Slices ss;
  330. memset(&ss, 0, sizeof(ss));
  331. while ((line = Brdstr(b, '\n', nulldelim)) != 0) {
  332. int printed = 0;
  333. uint nfields = split(line, delim, &ss, collapse);
  334. s = ss.slices;
  335. prefix = nil;
  336. for (int k = 0; k < rc; k++) {
  337. int begin = rv[k].begin;
  338. int end = rv[k].end;
  339. if (begin == 0) {
  340. pprefix(prefix);
  341. prefix = outsep;
  342. Bprint(&bout, "%s", line);
  343. printed = 1;
  344. begin = 1;
  345. }
  346. if (begin == NF)
  347. begin = nfields;
  348. if (begin < 0)
  349. begin += nfields + 1;
  350. begin--;
  351. if (end < 0)
  352. end += nfields + 1;
  353. if (begin < 0 || end < 0 || end < begin || nfields < begin)
  354. continue;
  355. for (int f = begin; f < end && f < nfields; f++) {
  356. pprefix(prefix);
  357. prefix = outsep;
  358. Bprint(&bout, "%S", s[f]);
  359. printed = 1;
  360. }
  361. }
  362. if (rc != 0 && (printed || !collapse))
  363. Bputc(&bout, '\n');
  364. free(line);
  365. }
  366. free(ss.slices);
  367. }
  368. void
  369. pprefix(char *prefix)
  370. {
  371. if (prefix == nil)
  372. return;
  373. if (*prefix == '\0')
  374. Bputc(&bout, '\0');
  375. else
  376. Bprint(&bout, "%s", prefix);
  377. }
  378. void
  379. reset(Slices *ss)
  380. {
  381. ss->len = 0;
  382. }
  383. uint
  384. split(char *line, Reprog *delim, Slices *ss, int collapse)
  385. {
  386. char *s, *b, *e;
  387. Resub match[1];
  388. memset(match, 0, sizeof(match));
  389. reset(ss);
  390. b = nil;
  391. e = nil;
  392. s = line;
  393. while (regexec(delim, s, match, nelem(match))) {
  394. b = s;
  395. e = match[0].sp;
  396. s = match[0].ep;
  397. memset(match, 0, sizeof(match));
  398. if (collapse && (e == line || b == e))
  399. continue;
  400. append(ss, b, e);
  401. }
  402. b = s;
  403. e = b + strlen(s);
  404. if (!collapse || b != e)
  405. append(ss, b, e);
  406. return ss->len;
  407. }
  408. void
  409. append(Slices *ss, char *begin, char *end)
  410. {
  411. if (ss->len >= ss->size) {
  412. Slice *s;
  413. ss->size *= 2;
  414. if (ss->size == 0)
  415. ss->size = 1;
  416. s = realloc(ss->slices, ss->size * sizeof(Slice));
  417. if (s == nil)
  418. sysfatal("malloc failed appending slice: %r");
  419. ss->slices = s;
  420. }
  421. ss->slices[ss->len].begin = begin;
  422. ss->slices[ss->len++].end = end;
  423. }
  424. void
  425. usage()
  426. {
  427. sysfatal("usage: field [ -E | -e ] [ -F regexp ] [ -0 | -O delimiter ] <field list> [file...]");
  428. }