join.c 7.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385
  1. /*
  2. * This file is part of the UCB release of Plan 9. It is subject to the license
  3. * terms in the LICENSE file found in the top-level directory of this
  4. * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
  5. * part of the UCB release of Plan 9, including this file, may be copied,
  6. * modified, propagated, or distributed except according to the terms contained
  7. * in the LICENSE file.
  8. */
  9. /* join F1 F2 on stuff */
  10. #include <u.h>
  11. #include <libc.h>
  12. #include <bio.h>
  13. #include <ctype.h>
/*
 * F1 and F2 index the per-file arrays (f, buf, ppi); NIN is the number of
 * input files and is used as the array dimension.  F0 only ever appears in
 * olistf, where it selects the join field rather than a field of one file.
 */
enum {
F1,
F2,
NIN,
F0,
};
#define NFLD 100 /* max field per line */
/* compare the join fields of the current line of each file (j1, j2 are 0-origin by then) */
#define comp() runestrcmp(ppi[F1][j1], ppi[F2][j2])
/* Per-file input state, indexed by F1/F2. */
Biobuf *f[NIN];
Rune buf[NIN][Bsize]; /* input lines */
Rune *ppi[NIN][NFLD+1]; /* pointers to fields in lines */
/* input() splits on either separator; -t sets both to the same rune */
Rune sep1 = ' '; /* default field separator */
Rune sep2 = '\t';
/* j1 and j2 are 1-origin while options are parsed; main() makes them 0-origin */
int j1 = 1; /* join of this field of file 1 */
int j2 = 1; /* join of this field of file 2 */
/* non-zero (-a 1 / -a 2): join() also outputs lines that found no partner */
int a1;
int a2;
/* -o list, parallel arrays filled by main()/oparse() */
int olist[NIN*NFLD]; /* output these fields */
int olistf[NIN*NFLD]; /* from these files */
int no; /* number of entries in olist */
char *sepstr = " "; /* printed between output fields; replaced by -t */
int discard; /* count of truncated lines */
Rune null[Bsize] = { 0 }; /* -e string: output() prints it for missing or empty -o fields */
/* buffered file descriptors 0 and 1, set up at the start of main() */
Biobuf binbuf, boutbuf;
Biobuf *bin, *bout;
char *getoptarg(int*, char***);
int input(int);
void join(int);
void oparse(char*);
void output(int, int);
Rune *strtorune(Rune *, char *);
  45. void
  46. main(int argc, char **argv)
  47. {
  48. int i;
  49. int64_t off1, off2;
  50. bin = &binbuf;
  51. bout = &boutbuf;
  52. Binit(bin, 0, OREAD);
  53. Binit(bout, 1, OWRITE);
  54. argv0 = argv[0];
  55. while (argc > 1 && argv[1][0] == '-') {
  56. if (argv[1][1] == '\0')
  57. break;
  58. switch (argv[1][1]) {
  59. case '-':
  60. argc--;
  61. argv++;
  62. goto proceed;
  63. case 'a':
  64. switch(*getoptarg(&argc, &argv)) {
  65. case '1':
  66. a1++;
  67. break;
  68. case '2':
  69. a2++;
  70. break;
  71. default:
  72. sysfatal("incomplete option -a");
  73. }
  74. break;
  75. case 'e':
  76. strtorune(null, getoptarg(&argc, &argv));
  77. break;
  78. case 't':
  79. sepstr=getoptarg(&argc, &argv);
  80. chartorune(&sep1, sepstr);
  81. sep2 = sep1;
  82. break;
  83. case 'o':
  84. if(argv[1][2]!=0 ||
  85. (argc>2 && strchr(argv[2],',')!=0))
  86. oparse(getoptarg(&argc, &argv));
  87. else for (no = 0; no<2*NFLD && argc>2; no++){
  88. if (argv[2][0] == '1' && argv[2][1] == '.') {
  89. olistf[no] = F1;
  90. olist[no] = atoi(&argv[2][2]);
  91. } else if (argv[2][0] == '2' && argv[2][1] == '.') {
  92. olist[no] = atoi(&argv[2][2]);
  93. olistf[no] = F2;
  94. } else if (argv[2][0] == '0')
  95. olistf[no] = F0;
  96. else
  97. break;
  98. argc--;
  99. argv++;
  100. }
  101. break;
  102. case 'j':
  103. if(argc <= 2)
  104. break;
  105. if (argv[1][2] == '1')
  106. j1 = atoi(argv[2]);
  107. else if (argv[1][2] == '2')
  108. j2 = atoi(argv[2]);
  109. else
  110. j1 = j2 = atoi(argv[2]);
  111. argc--;
  112. argv++;
  113. break;
  114. case '1':
  115. j1 = atoi(getoptarg(&argc, &argv));
  116. break;
  117. case '2':
  118. j2 = atoi(getoptarg(&argc, &argv));
  119. break;
  120. }
  121. argc--;
  122. argv++;
  123. }
  124. proceed:
  125. for (i = 0; i < no; i++)
  126. if (olist[i]-- > NFLD) /* 0 origin */
  127. sysfatal("field number too big in -o");
  128. if (argc != 3) {
  129. fprint(2, "usage: join [-1 x -2 y] [-o list] file1 file2\n");
  130. exits("usage");
  131. }
  132. if (j1 < 1 || j2 < 1)
  133. sysfatal("invalid field indices");
  134. j1--;
  135. j2--; /* everyone else believes in 0 origin */
  136. if (strcmp(argv[1], "-") == 0)
  137. f[F1] = bin;
  138. else if ((f[F1] = Bopen(argv[1], OREAD)) == 0)
  139. sysfatal("can't open %s: %r", argv[1]);
  140. if(strcmp(argv[2], "-") == 0)
  141. f[F2] = bin;
  142. else if ((f[F2] = Bopen(argv[2], OREAD)) == 0)
  143. sysfatal("can't open %s: %r", argv[2]);
  144. off1 = Boffset(f[F1]);
  145. off2 = Boffset(f[F2]);
  146. if(Bseek(f[F2], 0, 2) >= 0){
  147. Bseek(f[F2], off2, 0);
  148. join(F2);
  149. }else if(Bseek(f[F1], 0, 2) >= 0){
  150. Bseek(f[F1], off1, 0);
  151. Bseek(f[F2], off2, 0);
  152. join(F1);
  153. }else
  154. sysfatal("neither file is randomly accessible");
  155. if (discard)
  156. sysfatal("some input line was truncated");
  157. exits("");
  158. }
  159. char *
  160. runetostr(char *buf, Rune *r)
  161. {
  162. char *s;
  163. for(s = buf; *r; r++)
  164. s += runetochar(s, r);
  165. *s = '\0';
  166. return buf;
  167. }
  168. Rune *
  169. strtorune(Rune *buf, char *s)
  170. {
  171. Rune *r;
  172. for (r = buf; *s; r++)
  173. s += chartorune(r, s);
  174. *r = '\0';
  175. return buf;
  176. }
  177. void
  178. readboth(int n[])
  179. {
  180. n[F1] = input(F1);
  181. n[F2] = input(F2);
  182. }
  183. void
  184. seekbotreadboth(int seekf, int64_t bot, int n[])
  185. {
  186. Bseek(f[seekf], bot, 0);
  187. readboth(n);
  188. }
  189. void
  190. join(int seekf)
  191. {
  192. int cmp, less;
  193. int n[NIN];
  194. int64_t top, bot;
  195. less = seekf == F2;
  196. top = 0;
  197. bot = Boffset(f[seekf]);
  198. readboth(n);
  199. while((n[F1]>0 && n[F2]>0) || ((a1||a2) && n[F1]+n[F2]>0)) {
  200. cmp = comp();
  201. if((n[F1]>0 && n[F2]>0 && cmp>0) || n[F1]==0) {
  202. if(a2)
  203. output(0, n[F2]);
  204. if (seekf == F2)
  205. bot = Boffset(f[seekf]);
  206. n[F2] = input(F2);
  207. } else if((n[F1]>0 && n[F2]>0 && cmp<0) || n[F2]==0) {
  208. if(a1)
  209. output(n[F1], 0);
  210. if (seekf == F1)
  211. bot = Boffset(f[seekf]);
  212. n[F1] = input(F1);
  213. } else {
  214. /* n[F1]>0 && n[F2]>0 && cmp==0 */
  215. while(n[F2]>0 && cmp==0) {
  216. output(n[F1], n[F2]);
  217. top = Boffset(f[seekf]);
  218. n[seekf] = input(seekf);
  219. cmp = comp();
  220. }
  221. seekbotreadboth(seekf, bot, n);
  222. for(;;) {
  223. cmp = comp();
  224. if(n[F1]>0 && n[F2]>0 && cmp==0) {
  225. output(n[F1], n[F2]);
  226. n[seekf] = input(seekf);
  227. } else if((n[F1]>0 && n[F2]>0 &&
  228. (less? cmp<0 :cmp>0)) || n[seekf]==0)
  229. seekbotreadboth(seekf, bot, n);
  230. else {
  231. /*
  232. * n[F1]>0 && n[F2]>0 &&
  233. * (less? cmp>0 :cmp<0) ||
  234. * n[seekf==F1? F2: F1]==0
  235. */
  236. Bseek(f[seekf], top, 0);
  237. bot = top;
  238. n[seekf] = input(seekf);
  239. break;
  240. }
  241. }
  242. }
  243. }
  244. }
  245. int
  246. input(int n) /* get input line and split into fields */
  247. {
  248. int c, i, len;
  249. char *line;
  250. Rune *bp;
  251. Rune **pp;
  252. bp = buf[n];
  253. pp = ppi[n];
  254. line = Brdline(f[n], '\n');
  255. if (line == nil)
  256. return(0);
  257. len = Blinelen(f[n]) - 1;
  258. c = line[len];
  259. line[len] = '\0';
  260. strtorune(bp, line);
  261. line[len] = c; /* restore delimiter */
  262. if (c != '\n')
  263. discard++;
  264. i = 0;
  265. do {
  266. i++;
  267. if (sep1 == ' ') /* strip multiples */
  268. while ((c = *bp) == sep1 || c == sep2)
  269. bp++; /* skip blanks */
  270. *pp++ = bp; /* record beginning */
  271. while ((c = *bp) != sep1 && c != sep2 && c != '\0')
  272. bp++;
  273. *bp++ = '\0'; /* mark end by overwriting blank */
  274. } while (c != '\0' && i < NFLD-1);
  275. *pp = 0;
  276. return(i);
  277. }
  278. void
  279. prfields(int f, int on, int jn)
  280. {
  281. int i;
  282. char buf[Bsize];
  283. for (i = 0; i < on; i++)
  284. if (i != jn)
  285. Bprint(bout, "%s%s", sepstr, runetostr(buf, ppi[f][i]));
  286. }
  287. void
  288. output(int on1, int on2) /* print items from olist */
  289. {
  290. int i;
  291. Rune *temp;
  292. char buf[Bsize];
  293. if (no <= 0) { /* default case */
  294. Bprint(bout, "%s", runetostr(buf, on1? ppi[F1][j1]: ppi[F2][j2]));
  295. prfields(F1, on1, j1);
  296. prfields(F2, on2, j2);
  297. Bputc(bout, '\n');
  298. } else {
  299. for (i = 0; i < no; i++) {
  300. if (olistf[i]==F0 && on1>j1)
  301. temp = ppi[F1][j1];
  302. else if (olistf[i]==F0 && on2>j2)
  303. temp = ppi[F2][j2];
  304. else {
  305. temp = ppi[olistf[i]][olist[i]];
  306. if((olistf[i]==F1 && on1<=olist[i]) ||
  307. (olistf[i]==F2 && on2<=olist[i]) ||
  308. *temp==0)
  309. temp = null;
  310. }
  311. Bprint(bout, "%s", runetostr(buf, temp));
  312. if (i == no - 1)
  313. Bputc(bout, '\n');
  314. else
  315. Bprint(bout, "%s", sepstr);
  316. }
  317. }
  318. }
  319. char *
  320. getoptarg(int *argcp, char ***argvp)
  321. {
  322. int argc = *argcp;
  323. char **argv = *argvp;
  324. if(argv[1][2] != 0)
  325. return &argv[1][2];
  326. if(argc<=2 || argv[2][0]=='-')
  327. sysfatal("incomplete option %s", argv[1]);
  328. *argcp = argc-1;
  329. *argvp = ++argv;
  330. return argv[1];
  331. }
  332. void
  333. oparse(char *s)
  334. {
  335. for (no = 0; no<2*NFLD && *s; no++, s++) {
  336. switch(*s) {
  337. case 0:
  338. return;
  339. case '0':
  340. olistf[no] = F0;
  341. break;
  342. case '1':
  343. case '2':
  344. if(s[1] == '.' && isdigit(s[2])) {
  345. olistf[no] = *s=='1'? F1: F2;
  346. olist[no] = atoi(s += 2);
  347. break;
  348. }
  349. /* fall thru */
  350. default:
  351. sysfatal("invalid -o list");
  352. }
  353. if(s[1] == ',')
  354. s++;
  355. }
  356. }