wc.c 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257
  1. /* vi: set sw=4 ts=4: */
  2. /*
  3. * wc implementation for busybox
  4. *
  5. * Copyright (C) 2003 Manuel Novoa III <mjn3@codepoet.org>
  6. *
  7. * Licensed under GPLv2 or later, see file LICENSE in this source tree.
  8. */
  9. /* Mar 16, 2003 Manuel Novoa III (mjn3@codepoet.org)
  10. *
  11. * Rewritten to fix a number of problems and do some size optimizations.
  12. * Problems in the previous busybox implementation (besides bloat) included:
  13. * 1) broken 'wc -c' optimization (read note below)
  14. * 2) broken handling of '-' args
  15. * 3) no checking of ferror on EOF returns
  16. * 4) isprint() wasn't considered when word counting.
  17. *
  18. * NOTES:
  19. *
  20. * The previous busybox wc attempted an optimization using stat for the
  21. * case of counting chars only. I omitted that because it was broken.
  22. * It didn't take into account the possibility of input coming from a
  23. * pipe, or input from a file with file pointer not at the beginning.
  24. *
  25. * To implement such a speed optimization correctly, not only do you
  26. * need the size, but also the file position. Note also that the
  27. * file position may be past the end of file. Consider the example
  28. * (adapted from example in gnu wc.c)
  29. *
  30. * echo hello > /tmp/testfile &&
  31. * (dd ibs=1k skip=1 count=0 &> /dev/null; wc -c) < /tmp/testfile
  32. *
  33. * for which 'wc -c' should output '0'.
  34. */
  35. //config:config WC
  36. //config: bool "wc (4.7 kb)"
  37. //config: default y
  38. //config: help
  39. //config: wc is used to print the number of bytes, words, and lines,
  40. //config: in specified files.
  41. //config:
  42. //config:config FEATURE_WC_LARGE
  43. //config: bool "Support very large counts"
  44. //config: default y
  45. //config: depends on WC
  46. //config: help
  47. //config: Use "unsigned long long" for counter variables.
  48. //applet:IF_WC(APPLET(wc, BB_DIR_USR_BIN, BB_SUID_DROP))
  49. //kbuild:lib-$(CONFIG_WC) += wc.o
  50. /* BB_AUDIT SUSv3 compliant. */
  51. /* http://www.opengroup.org/onlinepubs/007904975/utilities/wc.html */
  52. #include "libbb.h"
  53. #include "unicode.h"
  54. #if !ENABLE_LOCALE_SUPPORT
  55. # undef isprint
  56. # undef isspace
  57. # define isprint(c) ((unsigned)((c) - 0x20) <= (0x7e - 0x20))
  58. # define isspace(c) ((c) == ' ')
  59. #endif
  60. #if ENABLE_FEATURE_WC_LARGE
  61. # define COUNT_T unsigned long long
  62. # define COUNT_FMT "llu"
  63. #else
  64. # define COUNT_T unsigned
  65. # define COUNT_FMT "u"
  66. #endif
  67. /* We support -m even when UNICODE_SUPPORT is off,
  68. * we just don't advertise it in help text,
  69. * since it is the same as -c in this case.
  70. */
  71. //usage:#define wc_trivial_usage
  72. //usage: "[-c"IF_UNICODE_SUPPORT("m")"lwL] [FILE]..."
  73. //usage:
  74. //usage:#define wc_full_usage "\n\n"
  75. //usage: "Count lines, words, and bytes for FILEs (or stdin)\n"
  76. //usage: "\n -c Count bytes"
  77. //usage: IF_UNICODE_SUPPORT(
  78. //usage: "\n -m Count characters"
  79. //usage: )
  80. //usage: "\n -l Count newlines"
  81. //usage: "\n -w Count words"
  82. //usage: "\n -L Print longest line length"
  83. //usage:
  84. //usage:#define wc_example_usage
  85. //usage: "$ wc /etc/passwd\n"
  86. //usage: " 31 46 1365 /etc/passwd\n"
  87. /* Order is important if we want to be compatible with
  88. * column order in "wc -cmlwL" output:
  89. */
  90. enum {
  91. WC_LINES = 0, /* -l */
  92. WC_WORDS = 1, /* -w */
  93. WC_UNICHARS = 2, /* -m */
  94. WC_BYTES = 3, /* -c */
  95. WC_LENGTH = 4, /* -L */
  96. NUM_WCS = 5,
  97. };
  98. int wc_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE;
  99. int wc_main(int argc UNUSED_PARAM, char **argv)
  100. {
  101. const char *arg;
  102. const char *start_fmt = " %9"COUNT_FMT + 1;
  103. const char *fname_fmt = " %s\n";
  104. COUNT_T *pcounts;
  105. COUNT_T counts[NUM_WCS];
  106. COUNT_T totals[NUM_WCS];
  107. int num_files;
  108. smallint status = EXIT_SUCCESS;
  109. unsigned print_type;
  110. init_unicode();
  111. print_type = getopt32(argv, "lwmcL");
  112. if (print_type == 0) {
  113. print_type = (1 << WC_LINES) | (1 << WC_WORDS) | (1 << WC_BYTES);
  114. }
  115. argv += optind;
  116. if (!argv[0]) {
  117. *--argv = (char *) bb_msg_standard_input;
  118. fname_fmt = "\n";
  119. }
  120. if (!argv[1]) { /* zero or one filename? */
  121. if (!((print_type-1) & print_type)) /* exactly one option? */
  122. start_fmt = "%"COUNT_FMT;
  123. }
  124. memset(totals, 0, sizeof(totals));
  125. pcounts = counts;
  126. num_files = 0;
  127. while ((arg = *argv++) != NULL) {
  128. FILE *fp;
  129. const char *s;
  130. unsigned u;
  131. unsigned linepos;
  132. smallint in_word;
  133. ++num_files;
  134. fp = fopen_or_warn_stdin(arg);
  135. if (!fp) {
  136. status = EXIT_FAILURE;
  137. continue;
  138. }
  139. memset(counts, 0, sizeof(counts));
  140. linepos = 0;
  141. in_word = 0;
  142. while (1) {
  143. int c;
  144. /* Our -w doesn't match GNU wc exactly... oh well */
  145. c = getc(fp);
  146. if (c == EOF) {
  147. if (ferror(fp)) {
  148. bb_simple_perror_msg(arg);
  149. status = EXIT_FAILURE;
  150. }
  151. goto DO_EOF; /* Treat an EOF as '\r'. */
  152. }
  153. /* Cater for -c and -m */
  154. ++counts[WC_BYTES];
  155. if (unicode_status != UNICODE_ON /* every byte is a new char */
  156. || (c & 0xc0) != 0x80 /* it isn't a 2nd+ byte of a Unicode char */
  157. ) {
  158. ++counts[WC_UNICHARS];
  159. }
  160. if (isprint_asciionly(c)) { /* FIXME: not unicode-aware */
  161. ++linepos;
  162. if (!isspace(c)) {
  163. in_word = 1;
  164. continue;
  165. }
  166. } else if ((unsigned)(c - 9) <= 4) {
  167. /* \t 9
  168. * \n 10
  169. * \v 11
  170. * \f 12
  171. * \r 13
  172. */
  173. if (c == '\t') {
  174. linepos = (linepos | 7) + 1;
  175. } else { /* '\n', '\r', '\f', or '\v' */
  176. DO_EOF:
  177. if (linepos > counts[WC_LENGTH]) {
  178. counts[WC_LENGTH] = linepos;
  179. }
  180. if (c == '\n') {
  181. ++counts[WC_LINES];
  182. }
  183. if (c != '\v') {
  184. linepos = 0;
  185. }
  186. }
  187. } else {
  188. continue;
  189. }
  190. counts[WC_WORDS] += in_word;
  191. in_word = 0;
  192. if (c == EOF) {
  193. break;
  194. }
  195. }
  196. fclose_if_not_stdin(fp);
  197. if (totals[WC_LENGTH] < counts[WC_LENGTH]) {
  198. totals[WC_LENGTH] = counts[WC_LENGTH];
  199. }
  200. totals[WC_LENGTH] -= counts[WC_LENGTH];
  201. OUTPUT:
  202. /* coreutils wc tries hard to print pretty columns
  203. * (saves results for all files, finds max col len etc...)
  204. * we won't try that hard, it will bloat us too much */
  205. s = start_fmt;
  206. u = 0;
  207. do {
  208. if (print_type & (1 << u)) {
  209. printf(s, pcounts[u]);
  210. s = " %9"COUNT_FMT; /* Ok... restore the leading space. */
  211. }
  212. totals[u] += pcounts[u];
  213. } while (++u < NUM_WCS);
  214. printf(fname_fmt, arg);
  215. }
  216. /* If more than one file was processed, we want the totals. To save some
  217. * space, we set the pcounts ptr to the totals array. This has the side
  218. * effect of trashing the totals array after outputting it, but that's
  219. * irrelavent since we no longer need it. */
  220. if (num_files > 1) {
  221. num_files = 0; /* Make sure we don't get here again. */
  222. arg = "total";
  223. pcounts = totals;
  224. --argv;
  225. goto OUTPUT;
  226. }
  227. fflush_stdout_and_exit(status);
  228. }