join.c 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368
  1. /* join F1 F2 on stuff */
  2. #include <u.h>
  3. #include <libc.h>
  4. #include <stdio.h>
  5. #include <ctype.h>
  6. #define F1 0
  7. #define F2 1
  8. #define F0 3
  9. #define NFLD 100 /* max field per line */
  10. #define comp() runecmp(ppi[F1][j1],ppi[F2][j2])
  11. FILE *f[2];
  12. Rune buf[2][BUFSIZ]; /*input lines */
  13. Rune *ppi[2][NFLD+1]; /* pointers to fields in lines */
  14. Rune *s1,*s2;
  15. int j1 = 1; /* join of this field of file 1 */
  16. int j2 = 1; /* join of this field of file 2 */
  17. int olist[2*NFLD]; /* output these fields */
  18. int olistf[2*NFLD]; /* from these files */
  19. int no; /* number of entries in olist */
  20. Rune sep1 = ' '; /* default field separator */
  21. Rune sep2 = '\t';
  22. char *sepstr=" ";
  23. int discard; /* count of truncated lines */
  24. Rune null[BUFSIZ] = L"";
  25. int a1;
  26. int a2;
  27. char *getoptarg(int*, char***);
  28. void output(int, int);
  29. int input(int);
  30. void oparse(char*);
  31. void error(char*, char*);
  32. void seek1(void), seek2(void);
  33. Rune *strtorune(Rune *, char *);
  34. void
  35. main(int argc, char **argv)
  36. {
  37. int i;
  38. while (argc > 1 && argv[1][0] == '-') {
  39. if (argv[1][1] == '\0')
  40. break;
  41. switch (argv[1][1]) {
  42. case '-':
  43. argc--;
  44. argv++;
  45. goto proceed;
  46. case 'a':
  47. switch(*getoptarg(&argc, &argv)) {
  48. case '1':
  49. a1++;
  50. break;
  51. case '2':
  52. a2++;
  53. break;
  54. default:
  55. error("incomplete option -a","");
  56. }
  57. break;
  58. case 'e':
  59. strtorune(null, getoptarg(&argc, &argv));
  60. break;
  61. case 't':
  62. sepstr=getoptarg(&argc, &argv);
  63. chartorune(&sep1, sepstr);
  64. sep2 = sep1;
  65. break;
  66. case 'o':
  67. if(argv[1][2]!=0 ||
  68. argc>2 && strchr(argv[2],',')!=0)
  69. oparse(getoptarg(&argc, &argv));
  70. else for (no = 0; no<2*NFLD && argc>2; no++){
  71. if (argv[2][0] == '1' && argv[2][1] == '.') {
  72. olistf[no] = F1;
  73. olist[no] = atoi(&argv[2][2]);
  74. } else if (argv[2][0] == '2' && argv[2][1] == '.') {
  75. olist[no] = atoi(&argv[2][2]);
  76. olistf[no] = F2;
  77. } else if (argv[2][0] == '0')
  78. olistf[no] = F0;
  79. else
  80. break;
  81. argc--;
  82. argv++;
  83. }
  84. break;
  85. case 'j':
  86. if(argc <= 2)
  87. break;
  88. if (argv[1][2] == '1')
  89. j1 = atoi(argv[2]);
  90. else if (argv[1][2] == '2')
  91. j2 = atoi(argv[2]);
  92. else
  93. j1 = j2 = atoi(argv[2]);
  94. argc--;
  95. argv++;
  96. break;
  97. case '1':
  98. j1 = atoi(getoptarg(&argc, &argv));
  99. break;
  100. case '2':
  101. j2 = atoi(getoptarg(&argc, &argv));
  102. break;
  103. }
  104. argc--;
  105. argv++;
  106. }
  107. proceed:
  108. for (i = 0; i < no; i++)
  109. if (olist[i]-- > NFLD) /* 0 origin */
  110. error("field number too big in -o","");
  111. if (argc != 3)
  112. error("usage: join [-1 x -2 y] [-o list] file1 file2","");
  113. if (j1 < 1 || j2 < 1)
  114. error("invalid field indices", "");
  115. j1--;
  116. j2--; /* everyone else believes in 0 origin */
  117. s1 = ppi[F1][j1];
  118. s2 = ppi[F2][j2];
  119. if (strcmp(argv[1], "-") == 0)
  120. f[F1] = stdin;
  121. else if ((f[F1] = fopen(argv[1], "r")) == 0)
  122. error("can't open %s", argv[1]);
  123. if(strcmp(argv[2], "-") == 0) {
  124. f[F2] = stdin;
  125. } else if ((f[F2] = fopen(argv[2], "r")) == 0)
  126. error("can't open %s", argv[2]);
  127. if(ftell(f[F2]) >= 0)
  128. seek2();
  129. else if(ftell(f[F1]) >= 0)
  130. seek1();
  131. else
  132. error("neither file is randomly accessible","");
  133. if (discard)
  134. error("some input line was truncated", "");
  135. exits("");
  136. }
  137. int runecmp(Rune *a, Rune *b){
  138. while(*a==*b){
  139. if(*a=='\0') return 0;
  140. a++;
  141. b++;
  142. }
  143. if(*a<*b) return -1;
  144. return 1;
  145. }
  146. char *runetostr(char *buf, Rune *r){
  147. char *s;
  148. for(s=buf;*r;r++) s+=runetochar(s, r);
  149. *s='\0';
  150. return buf;
  151. }
  152. Rune *strtorune(Rune *buf, char *s){
  153. Rune *r;
  154. for(r=buf;*s;r++) s+=chartorune(r, s);
  155. *r='\0';
  156. return buf;
  157. }
  158. /* lazy. there ought to be a clean way to combine seek1 & seek2 */
  159. #define get1() n1=input(F1)
  160. #define get2() n2=input(F2)
  161. void
  162. seek2()
  163. {
  164. int n1, n2;
  165. int top2=0;
  166. int bot2 = ftell(f[F2]);
  167. get1();
  168. get2();
  169. while(n1>0 && n2>0 || (a1||a2) && n1+n2>0) {
  170. if(n1>0 && n2>0 && comp()>0 || n1==0) {
  171. if(a2) output(0, n2);
  172. bot2 = ftell(f[F2]);
  173. get2();
  174. } else if(n1>0 && n2>0 && comp()<0 || n2==0) {
  175. if(a1) output(n1, 0);
  176. get1();
  177. } else /*(n1>0 && n2>0 && comp()==0)*/ {
  178. while(n2>0 && comp()==0) {
  179. output(n1, n2);
  180. top2 = ftell(f[F2]);
  181. get2();
  182. }
  183. fseek(f[F2], bot2, 0);
  184. get2();
  185. get1();
  186. for(;;) {
  187. if(n1>0 && n2>0 && comp()==0) {
  188. output(n1, n2);
  189. get2();
  190. } else if(n1>0 && n2>0 && comp()<0 || n2==0) {
  191. fseek(f[F2], bot2, 0);
  192. get2();
  193. get1();
  194. } else /*(n1>0 && n2>0 && comp()>0 || n1==0)*/{
  195. fseek(f[F2], top2, 0);
  196. bot2 = top2;
  197. get2();
  198. break;
  199. }
  200. }
  201. }
  202. }
  203. }
  204. void
  205. seek1()
  206. {
  207. int n1, n2;
  208. int top1=0;
  209. int bot1 = ftell(f[F1]);
  210. get1();
  211. get2();
  212. while(n1>0 && n2>0 || (a1||a2) && n1+n2>0) {
  213. if(n1>0 && n2>0 && comp()>0 || n1==0) {
  214. if(a2) output(0, n2);
  215. get2();
  216. } else if(n1>0 && n2>0 && comp()<0 || n2==0) {
  217. if(a1) output(n1, 0);
  218. bot1 = ftell(f[F1]);
  219. get1();
  220. } else /*(n1>0 && n2>0 && comp()==0)*/ {
  221. while(n2>0 && comp()==0) {
  222. output(n1, n2);
  223. top1 = ftell(f[F1]);
  224. get1();
  225. }
  226. fseek(f[F1], bot1, 0);
  227. get2();
  228. get1();
  229. for(;;) {
  230. if(n1>0 && n2>0 && comp()==0) {
  231. output(n1, n2);
  232. get1();
  233. } else if(n1>0 && n2>0 && comp()>0 || n1==0) {
  234. fseek(f[F1], bot1, 0);
  235. get2();
  236. get1();
  237. } else /*(n1>0 && n2>0 && comp()<0 || n2==0)*/{
  238. fseek(f[F1], top1, 0);
  239. bot1 = top1;
  240. get1();
  241. break;
  242. }
  243. }
  244. }
  245. }
  246. }
  247. int
  248. input(int n) /* get input line and split into fields */
  249. {
  250. register int i, c;
  251. Rune *bp;
  252. Rune **pp;
  253. char line[BUFSIZ];
  254. bp = buf[n];
  255. pp = ppi[n];
  256. if (fgets(line, BUFSIZ, f[n]) == 0)
  257. return(0);
  258. strtorune(bp, line);
  259. i = 0;
  260. do {
  261. i++;
  262. if (sep1 == ' ') /* strip multiples */
  263. while ((c = *bp) == sep1 || c == sep2)
  264. bp++; /* skip blanks */
  265. *pp++ = bp; /* record beginning */
  266. while ((c = *bp) != sep1 && c != '\n' && c != sep2 && c != '\0')
  267. bp++;
  268. *bp++ = '\0'; /* mark end by overwriting blank */
  269. } while (c != '\n' && c != '\0' && i < NFLD-1);
  270. if (c != '\n')
  271. discard++;
  272. *pp = 0;
  273. return(i);
  274. }
  275. void
  276. output(int on1, int on2) /* print items from olist */
  277. {
  278. int i;
  279. Rune *temp;
  280. char buf[BUFSIZ];
  281. if (no <= 0) { /* default case */
  282. printf("%s", runetostr(buf, on1? ppi[F1][j1]: ppi[F2][j2]));
  283. for (i = 0; i < on1; i++)
  284. if (i != j1)
  285. printf("%s%s", sepstr, runetostr(buf, ppi[F1][i]));
  286. for (i = 0; i < on2; i++)
  287. if (i != j2)
  288. printf("%s%s", sepstr, runetostr(buf, ppi[F2][i]));
  289. printf("\n");
  290. } else {
  291. for (i = 0; i < no; i++) {
  292. if (olistf[i]==F0 && on1>j1)
  293. temp = ppi[F1][j1];
  294. else if (olistf[i]==F0 && on2>j2)
  295. temp = ppi[F2][j2];
  296. else {
  297. temp = ppi[olistf[i]][olist[i]];
  298. if(olistf[i]==F1 && on1<=olist[i] ||
  299. olistf[i]==F2 && on2<=olist[i] ||
  300. *temp==0)
  301. temp = null;
  302. }
  303. printf("%s", runetostr(buf, temp));
  304. if (i == no - 1)
  305. printf("\n");
  306. else
  307. printf("%s", sepstr);
  308. }
  309. }
  310. }
  /*
   * Report a fatal error on stderr as "join: <message>\n" and leave via
   * exits().  s1 is used as a printf-style format with s2 as its single
   * argument; the unformatted s1 is what gets passed to exits().
   */
  void
  error(char *s1, char *s2)
  {
      fputs("join: ", stderr);
      fprintf(stderr, s1, s2);
      fputc('\n', stderr);
      exits(s1);
  }
  /*
   * Fetch the argument of the option currently at argv[1].
   *
   * If text follows the option letter in the same word ("-tX"), that
   * text is returned and the caller's argc/argv are left alone.
   * Otherwise the next word is the argument: it must exist and must not
   * begin with '-', else error() is called.  In that case the caller's
   * argc/argv are advanced by one word so that the caller's usual
   * "skip one word" step also skips the argument.
   */
  char *
  getoptarg(int *argcp, char ***argvp)
  {
      char **av = *argvp;
      int ac = *argcp;

      if (av[1][2] != 0)
          return av[1] + 2;
      if (ac <= 2 || av[2][0] == '-')
          error("incomplete option %s", av[1]);
      *argcp = ac - 1;
      *argvp = av + 1;
      return av[2];
  }
  332. void
  333. oparse(char *s)
  334. {
  335. for (no = 0; no<2*NFLD && *s; no++, s++) {
  336. switch(*s) {
  337. case 0:
  338. return;
  339. case '0':
  340. olistf[no] = F0;
  341. break;
  342. case '1':
  343. case '2':
  344. if(s[1] == '.' && isdigit(s[2])) {
  345. olistf[no] = *s=='1'? F1: F2;
  346. olist[no] = atoi(s += 2);
  347. break;
  348. } /* fall thru */
  349. default:
  350. error("invalid -o list", "");
  351. }
  352. if(s[1] == ',')
  353. s++;
  354. }
  355. }