/* join.c: join F1 F2 on stuff */
  2. #include <u.h>
  3. #include <libc.h>
  4. #include <bio.h>
  5. #include <ctype.h>
  6. enum {
  7. F1,
  8. F2,
  9. NIN,
  10. F0,
  11. };
  12. #define NFLD 100 /* max field per line */
  13. #define comp() runestrcmp(ppi[F1][j1], ppi[F2][j2])
  14. Biobuf *f[NIN];
  15. Rune buf[NIN][Bsize]; /* input lines */
  16. Rune *ppi[NIN][NFLD+1]; /* pointers to fields in lines */
  17. Rune sep1 = ' '; /* default field separator */
  18. Rune sep2 = '\t';
  19. int j1 = 1; /* join of this field of file 1 */
  20. int j2 = 1; /* join of this field of file 2 */
  21. int a1;
  22. int a2;
  23. int olist[NIN*NFLD]; /* output these fields */
  24. int olistf[NIN*NFLD]; /* from these files */
  25. int no; /* number of entries in olist */
  26. char *sepstr = " ";
  27. int discard; /* count of truncated lines */
  28. Rune null[Bsize] = L"";
  29. Biobuf binbuf, boutbuf;
  30. Biobuf *bin, *bout;
  31. char *getoptarg(int*, char***);
  32. int input(int);
  33. void join(int);
  34. void oparse(char*);
  35. void output(int, int);
  36. Rune *strtorune(Rune *, char *);
  37. void
  38. main(int argc, char **argv)
  39. {
  40. int i;
  41. vlong off1, off2;
  42. bin = &binbuf;
  43. bout = &boutbuf;
  44. Binit(bin, 0, OREAD);
  45. Binit(bout, 1, OWRITE);
  46. argv0 = argv[0];
  47. while (argc > 1 && argv[1][0] == '-') {
  48. if (argv[1][1] == '\0')
  49. break;
  50. switch (argv[1][1]) {
  51. case '-':
  52. argc--;
  53. argv++;
  54. goto proceed;
  55. case 'a':
  56. switch(*getoptarg(&argc, &argv)) {
  57. case '1':
  58. a1++;
  59. break;
  60. case '2':
  61. a2++;
  62. break;
  63. default:
  64. sysfatal("incomplete option -a");
  65. }
  66. break;
  67. case 'e':
  68. strtorune(null, getoptarg(&argc, &argv));
  69. break;
  70. case 't':
  71. sepstr=getoptarg(&argc, &argv);
  72. chartorune(&sep1, sepstr);
  73. sep2 = sep1;
  74. break;
  75. case 'o':
  76. if(argv[1][2]!=0 ||
  77. argc>2 && strchr(argv[2],',')!=0)
  78. oparse(getoptarg(&argc, &argv));
  79. else for (no = 0; no<2*NFLD && argc>2; no++){
  80. if (argv[2][0] == '1' && argv[2][1] == '.') {
  81. olistf[no] = F1;
  82. olist[no] = atoi(&argv[2][2]);
  83. } else if (argv[2][0] == '2' && argv[2][1] == '.') {
  84. olist[no] = atoi(&argv[2][2]);
  85. olistf[no] = F2;
  86. } else if (argv[2][0] == '0')
  87. olistf[no] = F0;
  88. else
  89. break;
  90. argc--;
  91. argv++;
  92. }
  93. break;
  94. case 'j':
  95. if(argc <= 2)
  96. break;
  97. if (argv[1][2] == '1')
  98. j1 = atoi(argv[2]);
  99. else if (argv[1][2] == '2')
  100. j2 = atoi(argv[2]);
  101. else
  102. j1 = j2 = atoi(argv[2]);
  103. argc--;
  104. argv++;
  105. break;
  106. case '1':
  107. j1 = atoi(getoptarg(&argc, &argv));
  108. break;
  109. case '2':
  110. j2 = atoi(getoptarg(&argc, &argv));
  111. break;
  112. }
  113. argc--;
  114. argv++;
  115. }
  116. proceed:
  117. for (i = 0; i < no; i++)
  118. if (olist[i]-- > NFLD) /* 0 origin */
  119. sysfatal("field number too big in -o");
  120. if (argc != 3) {
  121. fprint(2, "usage: join [-1 x -2 y] [-o list] file1 file2\n");
  122. exits("usage");
  123. }
  124. if (j1 < 1 || j2 < 1)
  125. sysfatal("invalid field indices");
  126. j1--;
  127. j2--; /* everyone else believes in 0 origin */
  128. if (strcmp(argv[1], "-") == 0)
  129. f[F1] = bin;
  130. else if ((f[F1] = Bopen(argv[1], OREAD)) == 0)
  131. sysfatal("can't open %s: %r", argv[1]);
  132. if(strcmp(argv[2], "-") == 0)
  133. f[F2] = bin;
  134. else if ((f[F2] = Bopen(argv[2], OREAD)) == 0)
  135. sysfatal("can't open %s: %r", argv[2]);
  136. off1 = Boffset(f[F1]);
  137. off2 = Boffset(f[F2]);
  138. if(Bseek(f[F2], 0, 2) >= 0){
  139. Bseek(f[F2], off2, 0);
  140. join(F2);
  141. }else if(Bseek(f[F1], 0, 2) >= 0){
  142. Bseek(f[F1], off1, 0);
  143. Bseek(f[F2], off2, 0);
  144. join(F1);
  145. }else
  146. sysfatal("neither file is randomly accessible");
  147. if (discard)
  148. sysfatal("some input line was truncated");
  149. exits("");
  150. }
  151. char *
  152. runetostr(char *buf, Rune *r)
  153. {
  154. char *s;
  155. for(s = buf; *r; r++)
  156. s += runetochar(s, r);
  157. *s = '\0';
  158. return buf;
  159. }
  160. Rune *
  161. strtorune(Rune *buf, char *s)
  162. {
  163. Rune *r;
  164. for (r = buf; *s; r++)
  165. s += chartorune(r, s);
  166. *r = '\0';
  167. return buf;
  168. }
  169. void
  170. readboth(int n[])
  171. {
  172. n[F1] = input(F1);
  173. n[F2] = input(F2);
  174. }
  175. void
  176. seekbotreadboth(int seekf, vlong bot, int n[])
  177. {
  178. Bseek(f[seekf], bot, 0);
  179. readboth(n);
  180. }
  181. void
  182. join(int seekf)
  183. {
  184. int cmp, less;
  185. int n[NIN];
  186. vlong top, bot;
  187. less = seekf == F2;
  188. top = 0;
  189. bot = Boffset(f[seekf]);
  190. readboth(n);
  191. while(n[F1]>0 && n[F2]>0 || (a1||a2) && n[F1]+n[F2]>0) {
  192. cmp = comp();
  193. if(n[F1]>0 && n[F2]>0 && cmp>0 || n[F1]==0) {
  194. if(a2)
  195. output(0, n[F2]);
  196. if (seekf == F2)
  197. bot = Boffset(f[seekf]);
  198. n[F2] = input(F2);
  199. } else if(n[F1]>0 && n[F2]>0 && cmp<0 || n[F2]==0) {
  200. if(a1)
  201. output(n[F1], 0);
  202. if (seekf == F1)
  203. bot = Boffset(f[seekf]);
  204. n[F1] = input(F1);
  205. } else {
  206. /* n[F1]>0 && n[F2]>0 && cmp==0 */
  207. while(n[F2]>0 && cmp==0) {
  208. output(n[F1], n[F2]);
  209. top = Boffset(f[seekf]);
  210. n[seekf] = input(seekf);
  211. cmp = comp();
  212. }
  213. seekbotreadboth(seekf, bot, n);
  214. for(;;) {
  215. cmp = comp();
  216. if(n[F1]>0 && n[F2]>0 && cmp==0) {
  217. output(n[F1], n[F2]);
  218. n[seekf] = input(seekf);
  219. } else if(n[F1]>0 && n[F2]>0 &&
  220. (less? cmp<0 :cmp>0) || n[seekf]==0)
  221. seekbotreadboth(seekf, bot, n);
  222. else {
  223. /*
  224. * n[F1]>0 && n[F2]>0 &&
  225. * (less? cmp>0 :cmp<0) ||
  226. * n[seekf==F1? F2: F1]==0
  227. */
  228. Bseek(f[seekf], top, 0);
  229. bot = top;
  230. n[seekf] = input(seekf);
  231. break;
  232. }
  233. }
  234. }
  235. }
  236. }
  237. int
  238. input(int n) /* get input line and split into fields */
  239. {
  240. int c, i, len;
  241. char *line;
  242. Rune *bp;
  243. Rune **pp;
  244. bp = buf[n];
  245. pp = ppi[n];
  246. line = Brdline(f[n], '\n');
  247. if (line == nil)
  248. return(0);
  249. len = Blinelen(f[n]) - 1;
  250. c = line[len];
  251. line[len] = '\0';
  252. strtorune(bp, line);
  253. line[len] = c; /* restore delimiter */
  254. if (c != '\n')
  255. discard++;
  256. i = 0;
  257. do {
  258. i++;
  259. if (sep1 == ' ') /* strip multiples */
  260. while ((c = *bp) == sep1 || c == sep2)
  261. bp++; /* skip blanks */
  262. *pp++ = bp; /* record beginning */
  263. while ((c = *bp) != sep1 && c != sep2 && c != '\0')
  264. bp++;
  265. *bp++ = '\0'; /* mark end by overwriting blank */
  266. } while (c != '\0' && i < NFLD-1);
  267. *pp = 0;
  268. return(i);
  269. }
  270. void
  271. prfields(int f, int on, int jn)
  272. {
  273. int i;
  274. char buf[Bsize];
  275. for (i = 0; i < on; i++)
  276. if (i != jn)
  277. Bprint(bout, "%s%s", sepstr, runetostr(buf, ppi[f][i]));
  278. }
  279. void
  280. output(int on1, int on2) /* print items from olist */
  281. {
  282. int i;
  283. Rune *temp;
  284. char buf[Bsize];
  285. if (no <= 0) { /* default case */
  286. Bprint(bout, "%s", runetostr(buf, on1? ppi[F1][j1]: ppi[F2][j2]));
  287. prfields(F1, on1, j1);
  288. prfields(F2, on2, j2);
  289. Bputc(bout, '\n');
  290. } else {
  291. for (i = 0; i < no; i++) {
  292. if (olistf[i]==F0 && on1>j1)
  293. temp = ppi[F1][j1];
  294. else if (olistf[i]==F0 && on2>j2)
  295. temp = ppi[F2][j2];
  296. else {
  297. temp = ppi[olistf[i]][olist[i]];
  298. if(olistf[i]==F1 && on1<=olist[i] ||
  299. olistf[i]==F2 && on2<=olist[i] ||
  300. *temp==0)
  301. temp = null;
  302. }
  303. Bprint(bout, "%s", runetostr(buf, temp));
  304. if (i == no - 1)
  305. Bputc(bout, '\n');
  306. else
  307. Bprint(bout, "%s", sepstr);
  308. }
  309. }
  310. }
  311. char *
  312. getoptarg(int *argcp, char ***argvp)
  313. {
  314. int argc = *argcp;
  315. char **argv = *argvp;
  316. if(argv[1][2] != 0)
  317. return &argv[1][2];
  318. if(argc<=2 || argv[2][0]=='-')
  319. sysfatal("incomplete option %s", argv[1]);
  320. *argcp = argc-1;
  321. *argvp = ++argv;
  322. return argv[1];
  323. }
  324. void
  325. oparse(char *s)
  326. {
  327. for (no = 0; no<2*NFLD && *s; no++, s++) {
  328. switch(*s) {
  329. case 0:
  330. return;
  331. case '0':
  332. olistf[no] = F0;
  333. break;
  334. case '1':
  335. case '2':
  336. if(s[1] == '.' && isdigit(s[2])) {
  337. olistf[no] = *s=='1'? F1: F2;
  338. olist[no] = atoi(s += 2);
  339. break;
  340. }
  341. /* fall thru */
  342. default:
  343. sysfatal("invalid -o list");
  344. }
  345. if(s[1] == ',')
  346. s++;
  347. }
  348. }