uniq.c 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603
  1. /*++
  2. Copyright (c) 2013 Minoca Corp. All Rights Reserved
  3. Module Name:
  4. uniq.c
  5. Abstract:
  6. This module implements the uniq utility, which removes adjacent duplicate
  7. lines.
  8. Author:
  9. Evan Green 9-Sep-2013
  10. Environment:
  11. POSIX
  12. --*/
  13. //
  14. // ------------------------------------------------------------------- Includes
  15. //
  16. #include <minoca/lib/types.h>
  17. #include <assert.h>
  18. #include <ctype.h>
  19. #include <errno.h>
  20. #include <getopt.h>
  21. #include <stdlib.h>
  22. #include <string.h>
  23. #include <unistd.h>
  24. #include "swlib.h"
  25. //
  26. // ---------------------------------------------------------------- Definitions
  27. //
  //
  // Define the version reported by --version.
  //

  #define UNIQ_VERSION_MAJOR 1
  #define UNIQ_VERSION_MINOR 0

  //
  // Define the help text printed by --help. The final string deliberately has
  // no trailing line continuation, so the macro ends with the help text and
  // cannot absorb whatever line happens to follow it.
  //

  #define UNIQ_USAGE                                                             \
      "usage: uniq [-cdu] [-f fields] [-s char] [input_file [output_file]]\n"    \
      "The uniq utility reads an input file, comparing adjacent lines, and \n"   \
      "writes one unique copy of each input line to the output. The input and \n"\
      "output file operands are optional. If an input is not supplied or if \n"  \
      "it is -, then standard in will be used. Options are:\n"                   \
      " -c, --count -- Precede each output line with the number of "             \
      "occurrences.\n"                                                           \
      " -d, --repeated -- Suppress the writing of lines that are not \n"         \
      " repeated in the input.\n"                                                \
      " -D, --all-repeated=type -- Print repeated lines. Type can be none, \n"   \
      " prepend to print a delimiter before every repeated group, or \n"         \
      " separate to print a newline before every repeated group except \n"       \
      " the first.\n"                                                            \
      " -f, --skip-fields N -- Avoid comparing the first N fields. Fields are\n" \
      " separated by blanks.\n"                                                  \
      " -i, --ignore-case -- Ignore case when comparing.\n"                      \
      " -s, --skip-chars N -- Avoid comparing the first N characters.\n"         \
      " -u, --unique -- Suppress the writing of lines that are repeated in \n"   \
      " the input.\n"                                                            \
      " -w, --check-chars=N -- Only check the first N characters.\n"             \
      " -z, --zero-terminated -- Separate lines with zero bytes rather than "    \
      "newlines.\n"                                                              \
      " --help -- Show this help text and exit.\n"                               \
      " --version -- Show the application version and exit.\n"

  //
  // Define the short option string handed to getopt_long. A colon marks an
  // option that requires an argument (-f, -s and -w).
  //

  #define UNIQ_OPTIONS_STRING "cdDf:is:uw:zhV"
  //
  // Bits of the uniq options mask. Each value is a distinct single bit so the
  // flags can be OR'd together and tested independently.
  //

  #define UNIQ_OPTION_PRINT_COUNT       0x01 // Prefix each output line with its count.
  #define UNIQ_OPTION_SUPPRESS_UNIQUE   0x02 // Do not write lines that are not repeated.
  #define UNIQ_OPTION_IGNORE_CASE       0x04 // Compare lines without regard to case.
  #define UNIQ_OPTION_SUPPRESS_REPEATED 0x08 // Do not write lines that are repeated.
  #define UNIQ_OPTION_ALL_REPEATED      0x10 // Write every line of a repeated group.
  79. //
  80. // ------------------------------------------------------ Data Type Definitions
  81. //
  //
  // Describes how groups of repeated lines are delimited when all repeated
  // lines are being printed: not at all, with a delimiter before every group,
  // or with a delimiter before every group except the first.
  //

  enum _UNIQ_GROUPING {
      UniqGroupNone = 0,
      UniqGroupPrepend = 1,
      UniqGroupSeparate = 2
  };

  typedef enum _UNIQ_GROUPING UNIQ_GROUPING;
  typedef enum _UNIQ_GROUPING *PUNIQ_GROUPING;
  87. //
  88. // ----------------------------------------------- Internal Function Prototypes
  89. //
  //
  // Advances past the requested number of leading fields and then characters
  // of a line, returning a pointer into the same string.
  //

  PSTR UniqSkip (PSTR Input, ULONG FieldCount, ULONG CharacterCount);
  96. //
  97. // -------------------------------------------------------------------- Globals
  98. //
  //
  // Long option table handed to getopt_long. No entry uses a flag pointer, so
  // each long name resolves to the short option character in the last column.
  // The all-NULL/zero entry terminates the table.
  //

  struct option UniqLongOptions[] = {
      {"count",           no_argument,       NULL, 'c'},
      {"repeated",        no_argument,       NULL, 'd'},
      {"all-repeated",    optional_argument, NULL, 'D'},
      {"skip-fields",     required_argument, NULL, 'f'},
      {"ignore-case",     no_argument,       NULL, 'i'},
      {"skip-chars",      required_argument, NULL, 's'},
      {"unique",          no_argument,       NULL, 'u'},
      {"check-chars",     required_argument, NULL, 'w'},
      {"zero-terminated", no_argument,       NULL, 'z'},
      {"help",            no_argument,       NULL, 'h'},
      {"version",         no_argument,       NULL, 'V'},
      {NULL,              0,                 NULL, 0}
  };
  113. //
  114. // ------------------------------------------------------------------ Functions
  115. //
  116. INT
  117. UniqMain (
  118. INT ArgumentCount,
  119. CHAR **Arguments
  120. )
  121. /*++
  122. Routine Description:
  123. This routine is the main entry point for the cp utility.
  124. Arguments:
  125. ArgumentCount - Supplies the number of command line arguments the program
  126. was invoked with.
  127. Arguments - Supplies a tokenized array of command line arguments.
  128. Return Value:
  129. Returns an integer exit code. 0 for success, nonzero otherwise.
  130. --*/
  131. {
  132. PSTR AfterScan;
  133. PSTR Argument;
  134. ULONG ArgumentIndex;
  135. size_t CharacterCount;
  136. INT Comparison;
  137. BOOL FirstGroup;
  138. FILE *Input;
  139. PSTR InputName;
  140. PSTR Line;
  141. PSTR LineStart;
  142. INT Option;
  143. ULONG Options;
  144. FILE *Output;
  145. PSTR OutputName;
  146. PSTR PreviousLine;
  147. PSTR PreviousLineStart;
  148. BOOL PrintLine;
  149. ULONG RepeatCount;
  150. UNIQ_GROUPING RepeatGroup;
  151. CHAR Separator;
  152. LONG SkipCharacters;
  153. LONG SkipFields;
  154. int Status;
  155. CharacterCount = -1;
  156. FirstGroup = TRUE;
  157. Input = NULL;
  158. Line = NULL;
  159. PreviousLine = NULL;
  160. Options = 0;
  161. Output = NULL;
  162. RepeatGroup = UniqGroupNone;
  163. Separator = '\n';
  164. SkipCharacters = 0;
  165. SkipFields = 0;
  166. //
  167. // Process the control arguments.
  168. //
  169. while (TRUE) {
  170. Option = getopt_long(ArgumentCount,
  171. Arguments,
  172. UNIQ_OPTIONS_STRING,
  173. UniqLongOptions,
  174. NULL);
  175. if (Option == -1) {
  176. break;
  177. }
  178. if ((Option == '?') || (Option == ':')) {
  179. Status = 1;
  180. goto MainEnd;
  181. }
  182. switch (Option) {
  183. case 'c':
  184. Options |= UNIQ_OPTION_PRINT_COUNT;
  185. break;
  186. case 'd':
  187. Options |= UNIQ_OPTION_SUPPRESS_UNIQUE;
  188. break;
  189. case 'D':
  190. Options |= UNIQ_OPTION_ALL_REPEATED | UNIQ_OPTION_SUPPRESS_UNIQUE;
  191. if (optarg != NULL) {
  192. if (strcmp(optarg, "none") == 0) {
  193. RepeatGroup = UniqGroupNone;
  194. } else if (strcmp(optarg, "prepend") == 0) {
  195. RepeatGroup = UniqGroupPrepend;
  196. } else if (strcmp(optarg, "separate") == 0) {
  197. RepeatGroup = UniqGroupSeparate;
  198. } else {
  199. SwPrintError(0, optarg, "Unknown grouping type");
  200. Status = 1;
  201. goto MainEnd;
  202. }
  203. }
  204. break;
  205. case 'i':
  206. Options |= UNIQ_OPTION_IGNORE_CASE;
  207. break;
  208. case 'u':
  209. Options |= UNIQ_OPTION_SUPPRESS_REPEATED;
  210. break;
  211. case 'f':
  212. Argument = optarg;
  213. assert(Argument != NULL);
  214. SkipFields = strtol(Argument, &AfterScan, 10);
  215. if ((SkipFields < 0) || (AfterScan == Argument)) {
  216. SwPrintError(0, Argument, "Invalid field count");
  217. return 1;
  218. }
  219. break;
  220. case 's':
  221. Argument = optarg;
  222. assert(Argument != NULL);
  223. SkipCharacters = strtol(Argument, &AfterScan, 10);
  224. if ((SkipCharacters < 0) || (AfterScan == Argument)) {
  225. SwPrintError(0, Argument, "Invalid character count");
  226. return 1;
  227. }
  228. break;
  229. case 'w':
  230. Argument = optarg;
  231. CharacterCount = strtoul(Argument, &AfterScan, 10);
  232. if (AfterScan == Argument) {
  233. SwPrintError(0, Argument, "Invalid character count");
  234. return 1;
  235. }
  236. break;
  237. case 'z':
  238. Separator = '\0';
  239. break;
  240. case 'V':
  241. SwPrintVersion(UNIQ_VERSION_MAJOR, UNIQ_VERSION_MINOR);
  242. return 1;
  243. case 'h':
  244. printf(UNIQ_USAGE);
  245. return 1;
  246. default:
  247. assert(FALSE);
  248. Status = 1;
  249. goto MainEnd;
  250. }
  251. }
  252. if (((Options & UNIQ_OPTION_ALL_REPEATED) != 0) &&
  253. ((Options & UNIQ_OPTION_PRINT_COUNT) != 0)) {
  254. SwPrintError(0, NULL, "-D and -c together is invalid");
  255. Status = 1;
  256. goto MainEnd;
  257. }
  258. //
  259. // Get the optional input and output names.
  260. //
  261. ArgumentIndex = optind;
  262. if (ArgumentIndex > ArgumentCount) {
  263. ArgumentIndex = ArgumentCount;
  264. }
  265. InputName = NULL;
  266. OutputName = NULL;
  267. if (ArgumentIndex < ArgumentCount) {
  268. InputName = Arguments[ArgumentIndex];
  269. if (strcmp(InputName, "-") == 0) {
  270. InputName = NULL;
  271. }
  272. ArgumentIndex += 1;
  273. if (ArgumentIndex < ArgumentCount) {
  274. OutputName = Arguments[ArgumentIndex];
  275. ArgumentIndex += 1;
  276. if (ArgumentIndex < ArgumentCount) {
  277. SwPrintError(0, Arguments[ArgumentIndex], "Too many arguments");
  278. return 1;
  279. }
  280. }
  281. }
  282. if (InputName == NULL) {
  283. Input = stdin;
  284. } else {
  285. Input = fopen(InputName, "r");
  286. if (Input == NULL) {
  287. Status = errno;
  288. SwPrintError(Status, InputName, "Unable to open");
  289. goto MainEnd;
  290. }
  291. }
  292. if (OutputName == NULL) {
  293. Output = stdout;
  294. } else {
  295. Output = fopen(OutputName, "r");
  296. if (Output == NULL) {
  297. Status = errno;
  298. SwPrintError(Status, OutputName, "Unable to open");
  299. goto MainEnd;
  300. }
  301. }
  302. //
  303. // Loop processing the files.
  304. //
  305. Status = SwReadLine(Input, &PreviousLine);
  306. if ((Status != 0) || (PreviousLine == NULL)) {
  307. goto MainEnd;
  308. }
  309. RepeatCount = 1;
  310. while (TRUE) {
  311. if (feof(Input) != 0) {
  312. Line = NULL;
  313. Comparison = 1;
  314. } else {
  315. Status = SwReadLine(Input, &Line);
  316. if (Status != 0) {
  317. goto MainEnd;
  318. }
  319. if (Line == NULL) {
  320. Comparison = 1;
  321. } else {
  322. LineStart = UniqSkip(Line, SkipFields, SkipCharacters);
  323. PreviousLineStart = UniqSkip(PreviousLine,
  324. SkipFields,
  325. SkipCharacters);
  326. if ((Options & UNIQ_OPTION_IGNORE_CASE) != 0) {
  327. Comparison = strncasecmp(LineStart,
  328. PreviousLineStart,
  329. CharacterCount);
  330. } else {
  331. Comparison = strncmp(LineStart,
  332. PreviousLineStart,
  333. CharacterCount);
  334. }
  335. }
  336. }
  337. //
  338. // Handle the lines being equal (duplicate).
  339. //
  340. if (Comparison == 0) {
  341. //
  342. // Print if repeated lines are requested.
  343. //
  344. if ((Options & UNIQ_OPTION_ALL_REPEATED) != 0) {
  345. //
  346. // Separate groups of repeated lines if requested. The only
  347. // difference bewteen prepend and separate is that separate
  348. // doesn't print a delimiter before the first group.
  349. //
  350. if (RepeatCount == 1) {
  351. if ((RepeatGroup == UniqGroupPrepend) ||
  352. ((RepeatGroup == UniqGroupSeparate) &&
  353. (FirstGroup == FALSE))) {
  354. putchar(Separator);
  355. }
  356. FirstGroup = FALSE;
  357. }
  358. //
  359. // Print the line.
  360. //
  361. PrintLine = TRUE;
  362. RepeatCount += 1;
  363. //
  364. // Skip the repeated line normally.
  365. //
  366. } else {
  367. free(Line);
  368. Line = NULL;
  369. RepeatCount += 1;
  370. continue;
  371. }
  372. } else {
  373. //
  374. // They're not equal, so spit this line out.
  375. //
  376. PrintLine = TRUE;
  377. if (RepeatCount == 1) {
  378. if ((Options & UNIQ_OPTION_SUPPRESS_UNIQUE) != 0) {
  379. PrintLine = FALSE;
  380. }
  381. } else {
  382. if ((Options & UNIQ_OPTION_SUPPRESS_REPEATED) != 0) {
  383. PrintLine = FALSE;
  384. }
  385. }
  386. }
  387. if (PrintLine != FALSE) {
  388. if ((Options & UNIQ_OPTION_PRINT_COUNT) != 0) {
  389. printf("%7d %s%c", RepeatCount, PreviousLine, Separator);
  390. } else {
  391. printf("%s%c", PreviousLine, Separator);
  392. }
  393. }
  394. //
  395. // Move the current line to the previous line.
  396. //
  397. free(PreviousLine);
  398. PreviousLine = Line;
  399. if (Comparison != 0) {
  400. RepeatCount = 1;
  401. }
  402. if (Line == NULL) {
  403. break;
  404. }
  405. }
  406. MainEnd:
  407. if (Line != NULL) {
  408. free(Line);
  409. }
  410. if (PreviousLine != NULL) {
  411. free(PreviousLine);
  412. }
  413. if ((Input != NULL) && (Input != stdin)) {
  414. fclose(Input);
  415. }
  416. if ((Output != NULL) && (Output != stdout)) {
  417. fclose(Output);
  418. }
  419. return Status;
  420. }
  421. //
  422. // --------------------------------------------------------- Internal Functions
  423. //
  PSTR
  UniqSkip (
      PSTR Input,
      ULONG FieldCount,
      ULONG CharacterCount
      )

  /*++

  Routine Description:

      This routine skips a certain number of fields and/or characters, where a
      field is defined as any number of blank characters followed by any number
      of non-blank characters. Fields are skipped before characters. The
      routine never advances beyond the string's null terminator.

  Arguments:

      Input - Supplies the null terminated input to advance.

      FieldCount - Supplies the number of fields to skip.

      CharacterCount - Supplies the number of characters to skip.

  Return Value:

      Returns a pointer within the string advanced past the specified number of
      fields and/or characters. If the string is shorter than requested, a
      pointer to its null terminator is returned.

  --*/

  {

      ULONG FieldIndex;

      for (FieldIndex = 0; FieldIndex < FieldCount; FieldIndex += 1) {
          if (*Input == '\0') {
              break;
          }

          //
          // The ctype routines require a value representable as an unsigned
          // char, so cast before classifying.
          //

          while (isblank((unsigned char)*Input)) {
              Input += 1;
          }

          //
          // The terminator is not a blank, so it must be checked explicitly or
          // the scan would run off the end of the string.
          //

          while ((*Input != '\0') && (!isblank((unsigned char)*Input))) {
              Input += 1;
          }
      }

      while ((CharacterCount != 0) && (*Input != '\0')) {
          Input += 1;
          CharacterCount -= 1;
      }

      return Input;
  }