field.c 7.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466
  1. #include <u.h>
  2. #include <libc.h>
  3. #include <bio.h>
  4. #include <ctype.h>
  5. #include <regexp.h>
  6. typedef struct Range Range;
  7. typedef struct Slice Slice;
  8. typedef struct Slices Slices;
  9. struct Range {
  10. int begin;
  11. int end;
  12. };
  13. struct Slice {
  14. char *begin;
  15. char *end;
  16. };
  17. struct Slices {
  18. uint len;
  19. uint size;
  20. Slice *slices;
  21. };
  22. enum {
  23. NF = 0x7FFFFFFF
  24. };
  25. Biobuf bin;
  26. Biobuf bout;
  27. int zflag;
  28. int guesscollapse(const char *sep);
  29. int Sfmt(Fmt *f);
  30. Slice lex(char **sp);
  31. Slice next(char **sp);
  32. Slice peek(void);
  33. void extend(Slice *slice, char **sp);
  34. int tiseof(Slice *tok);
  35. int tisdelim(Slice *tok);
  36. int tisspace(Slice *tok);
  37. int parseranges(char *src, Range **rv);
  38. Range parserange(char **sp);
  39. int stoi(Slice slice);
  40. int parsenum(char **s);
  41. void process(Biobuf *b, int rc, Range *rv, Reprog *delim, char *sep, int collapse);
  42. void pprefix(char *prefix);
  43. uint split(char *line, Reprog *delim, Slices *ss, int collapse);
  44. void reset(Slices *ss);
  45. void append(Slices *ss, char *begin, char *end);
  46. void usage(void);
  47. void
  48. main(int argc, char *argv[])
  49. {
  50. Range *rv;
  51. char *filename, *insep, *outsep;
  52. Reprog *delim;
  53. int rc, collapse, eflag, Eflag, oflag;
  54. insep = "[ \t\v\r]+";
  55. outsep = " ";
  56. Binit(&bin, 0, OREAD);
  57. Binit(&bout, 1, OWRITE);
  58. fmtinstall('S', Sfmt);
  59. zflag = 0;
  60. eflag = 0;
  61. Eflag = 0;
  62. oflag = 0;
  63. ARGBEGIN {
  64. case '0':
  65. outsep = "";
  66. zflag = 1;
  67. break;
  68. case 'e':
  69. eflag = 1;
  70. break;
  71. case 'E':
  72. Eflag = 1;
  73. break;
  74. case 'F':
  75. insep = EARGF(usage());
  76. break;
  77. case 'O':
  78. oflag = 1;
  79. outsep = EARGF(usage());
  80. break;
  81. default:
  82. usage();
  83. break;
  84. } ARGEND;
  85. if (eflag && Eflag) {
  86. fprint(2, "flag conflict: -e and -E are mutually exclusive\n");
  87. usage();
  88. }
  89. if (oflag && zflag) {
  90. fprint(2, "flag conflict: -0 and -O are mutually exclusive\n");
  91. usage();
  92. }
  93. if (argc <= 0)
  94. usage();
  95. delim = regcomp(insep);
  96. if (delim == nil)
  97. sysfatal("bad input separator regexp '%s': %r", insep);
  98. rv = nil;
  99. rc = parseranges(*argv++, &rv);
  100. if (rc < 0)
  101. sysfatal("parseranges failed");
  102. collapse = guesscollapse(insep);
  103. if (eflag)
  104. collapse = 0;
  105. if (Eflag)
  106. collapse = 1;
  107. if (*argv == nil) {
  108. process(&bin, rc, rv, delim, outsep, collapse);
  109. } else while ((filename = *argv++) != nil) {
  110. Biobuf *b;
  111. if (strcmp(filename, "-") == 0) {
  112. process(&bin, rc, rv, delim, outsep, collapse);
  113. continue;
  114. }
  115. b = Bopen(filename, OREAD);
  116. if (b == nil)
  117. sysfatal("failure opening '%s': %r", filename);
  118. process(b, rc, rv, delim, outsep, collapse);
  119. Bterm(b);
  120. }
  121. exits(0);
  122. }
  123. int
  124. guesscollapse(const char *sep)
  125. {
  126. int len = utflen(sep);
  127. return len > 1 && (len != 2 || *sep != '\\');
  128. }
  129. int
  130. Sfmt(Fmt *f)
  131. {
  132. Slice s = va_arg(f->args, Slice);
  133. if (s.begin == nil || s.end == nil)
  134. return 0;
  135. return fmtprint(f, "%.*s", s.end - s.begin, s.begin);
  136. }
  137. /*
  138. * The field selection syntax is:
  139. *
  140. * fields := range [[delim] fields]
  141. * range := field | NUM '-' [field]
  142. * field := NUM | 'NF'
  143. * delim := ws+ | '|' | ','
  144. * ws := c such that `isspace(c)` is true.
  145. */
  146. Slice
  147. lex(char **sp)
  148. {
  149. char *s;
  150. Slice slice;
  151. memset(&slice, 0, sizeof(slice));
  152. s = *sp;
  153. slice.begin = s;
  154. while (isspace(*s))
  155. s++;
  156. if (s == *sp) {
  157. switch (*s) {
  158. case '\0':
  159. slice.begin = nil;
  160. break;
  161. case '-':
  162. s++;
  163. break;
  164. case 'N':
  165. if (*++s == 'F')
  166. s++;
  167. break;
  168. case ',':
  169. case '|':
  170. s++;
  171. break;
  172. default:
  173. if (!isdigit(*s))
  174. sysfatal("lexical error, c = %c", *s);
  175. while (isdigit(*s))
  176. s++;
  177. break;
  178. }
  179. }
  180. slice.end = s;
  181. *sp = s;
  182. return slice;
  183. }
  184. Slice current;
  185. Slice
  186. peek()
  187. {
  188. return current;
  189. }
  190. Slice
  191. next(char **sp)
  192. {
  193. Slice tok = peek();
  194. current = lex(sp);
  195. return tok;
  196. }
  197. void
  198. extend(Slice *slice, char **sp)
  199. {
  200. Slice tok = next(sp);
  201. slice->end = tok.end;
  202. }
  203. int
  204. stoi(Slice slice)
  205. {
  206. char *s;
  207. int n = 0, sign = 1;
  208. s = slice.begin;
  209. if (*s == '-') {
  210. sign = -1;
  211. s++;
  212. }
  213. for (; s != slice.end; s++) {
  214. if (!isdigit(*s))
  215. sysfatal("stoi: bad number in '%S', c = %c", slice, *s);
  216. n = n * 10 + (*s - '0');
  217. }
  218. return sign * n;
  219. }
  220. int
  221. tiseof(Slice *tok)
  222. {
  223. return tok == nil || tok->begin == nil;
  224. }
  225. int
  226. tisdelim(Slice *tok)
  227. {
  228. return tiseof(tok) || tisspace(tok) || *tok->begin == ',' || *tok->begin == '|';
  229. }
  230. int
  231. tisspace(Slice *tok)
  232. {
  233. return !tiseof(tok) && isspace(*tok->begin);
  234. }
  235. int
  236. parseranges(char *src, Range **rv)
  237. {
  238. char *s;
  239. Range *rs, *t;
  240. int n, m;
  241. Slice tok;
  242. rs = nil;
  243. m = 0;
  244. n = 0;
  245. s = src;
  246. if (s == nil || *s == '\0')
  247. return -1;
  248. next(&s);
  249. do {
  250. tok = peek();
  251. while (tisspace(&tok))
  252. tok = next(&s);
  253. Range r = parserange(&s);
  254. if (n >= m) {
  255. m = 2*m;
  256. if (m == 0)
  257. m = 1;
  258. t = realloc(rs, sizeof(Range) * m);
  259. if (t == nil)
  260. sysfatal("realloc failed parsing ranges");
  261. rs = t;
  262. }
  263. rs[n++] = r;
  264. tok = next(&s);
  265. if (!tisdelim(&tok))
  266. sysfatal("syntax error in field list");
  267. } while (!tiseof(&tok));
  268. *rv = rs;
  269. return n;
  270. }
  271. int
  272. tokeq(Slice *tok, const char *s)
  273. {
  274. return !tiseof(tok) && !strncmp(tok->begin, s, tok->end - tok->begin);
  275. }
  276. Range
  277. parserange(char **sp)
  278. {
  279. Range range;
  280. Slice tok;
  281. range.begin = range.end = NF;
  282. tok = peek();
  283. if (tokeq(&tok, "NF")) {
  284. next(sp);
  285. return range;
  286. }
  287. range.begin = range.end = parsenum(sp);
  288. tok = peek();
  289. if (tokeq(&tok, "-")) {
  290. next(sp);
  291. range.end = NF;
  292. tok = peek();
  293. if (tokeq(&tok, "NF")) {
  294. next(sp);
  295. return range;
  296. }
  297. if (!tiseof(&tok) && !tisdelim(&tok))
  298. range.end = parsenum(sp);
  299. }
  300. return range;
  301. }
  302. int
  303. parsenum(char **sp)
  304. {
  305. Slice tok;
  306. tok = next(sp);
  307. if (tiseof(&tok))
  308. sysfatal("EOF in number parser");
  309. if (isdigit(*tok.begin))
  310. return stoi(tok);
  311. if (*tok.begin != '-')
  312. sysfatal("number parse error: unexpected '%S'", tok);
  313. extend(&tok, sp);
  314. if (!isdigit(*(tok.begin + 1)))
  315. sysfatal("negative number parse error: unspected '%S'", tok);
  316. return stoi(tok);
  317. }
  318. void
  319. process(Biobuf *b, int rc, Range *rv, Reprog *delim, char *outsep, int collapse)
  320. {
  321. char *line, *prefix;
  322. const int nulldelim = 1;
  323. Slice *s;
  324. Slices ss;
  325. memset(&ss, 0, sizeof(ss));
  326. while ((line = Brdstr(b, '\n', nulldelim)) != 0) {
  327. int printed = 0;
  328. uint nfields = split(line, delim, &ss, collapse);
  329. s = ss.slices;
  330. prefix = nil;
  331. for (int k = 0; k < rc; k++) {
  332. int begin = rv[k].begin;
  333. int end = rv[k].end;
  334. if (begin == 0) {
  335. pprefix(prefix);
  336. prefix = outsep;
  337. Bprint(&bout, "%s", line);
  338. printed = 1;
  339. begin = 1;
  340. }
  341. if (begin == NF)
  342. begin = nfields;
  343. if (begin < 0)
  344. begin += nfields + 1;
  345. begin--;
  346. if (end < 0)
  347. end += nfields + 1;
  348. if (begin < 0 || end < 0 || end < begin || nfields < begin)
  349. continue;
  350. for (int f = begin; f < end && f < nfields; f++) {
  351. pprefix(prefix);
  352. prefix = outsep;
  353. Bprint(&bout, "%S", s[f]);
  354. printed = 1;
  355. }
  356. }
  357. if (rc != 0 && (printed || !collapse))
  358. Bputc(&bout, '\n');
  359. free(line);
  360. }
  361. free(ss.slices);
  362. }
  363. void
  364. pprefix(char *prefix)
  365. {
  366. if (prefix == nil)
  367. return;
  368. if (zflag)
  369. Bputc(&bout, '\0');
  370. else
  371. Bprint(&bout, "%s", prefix);
  372. }
  373. void
  374. reset(Slices *ss)
  375. {
  376. ss->len = 0;
  377. }
  378. uint
  379. split(char *line, Reprog *delim, Slices *ss, int collapse)
  380. {
  381. char *s, *b, *e;
  382. Resub match[1];
  383. memset(match, 0, sizeof(match));
  384. reset(ss);
  385. b = nil;
  386. e = nil;
  387. s = line;
  388. while (regexec(delim, s, match, nelem(match))) {
  389. b = s;
  390. e = match[0].sp;
  391. s = match[0].ep;
  392. memset(match, 0, sizeof(match));
  393. if (collapse && (e == line || b == e))
  394. continue;
  395. append(ss, b, e);
  396. }
  397. b = s;
  398. e = b + strlen(s);
  399. if (!collapse || b != e)
  400. append(ss, b, e);
  401. return ss->len;
  402. }
  403. void
  404. append(Slices *ss, char *begin, char *end)
  405. {
  406. if (ss->len >= ss->size) {
  407. Slice *s;
  408. ss->size *= 2;
  409. if (ss->size == 0)
  410. ss->size = 1;
  411. s = realloc(ss->slices, ss->size * sizeof(Slice));
  412. if (s == nil)
  413. sysfatal("malloc failed appending slice: %r");
  414. ss->slices = s;
  415. }
  416. ss->slices[ss->len].begin = begin;
  417. ss->slices[ss->len++].end = end;
  418. }
  419. void
  420. usage()
  421. {
  422. sysfatal("usage: field [ -E | -e ] [ -F regexp ] [ -0 | -O delimiter ] <field list> [file...]");
  423. }