/*++ Copyright (c) 2013 Minoca Corp. All Rights Reserved Module Name: uniq.c Abstract: This module implements the uniq utility, which removes adjacent duplicate lines. Author: Evan Green 9-Sep-2013 Environment: POSIX --*/ // // ------------------------------------------------------------------- Includes // #include #include #include #include #include #include #include #include #include "swlib.h" // // ---------------------------------------------------------------- Definitions // #define UNIQ_VERSION_MAJOR 1 #define UNIQ_VERSION_MINOR 0 #define UNIQ_USAGE \ "usage: uniq [-cdu] [-f fields] [-s char] [input_file [output_file]]\n" \ "The uniq utility reads an input file, comparing adjacent lines, and \n" \ "writes one unique copy of each input line to the output. The input and \n"\ "output file operands are optional. If an input is not supplied or if \n" \ "it is -, then standard in will be used. Options are:\n" \ " -c, --count -- Precede each output line with the number of " \ "occurrences.\n" \ " -d, --repeated -- Suppress the writing of lines that are not \n" \ " repeated in the input.\n" \ " -D, --all-repeated=type -- Print repeated lines. Type can be none, \n" \ " prepend to print a delimiter before every repeated group, or \n" \ " separate to print a newline before every repeated group except \n" \ " the first.\n" \ " -f, --skip-fields N -- Avoid comparing the first N fields. Fields are\n"\ " separated by blanks.\n" \ " -i, --ignore-case -- Ignore case when comparing.\n" \ " -s, --skip-chars N -- Avoid comparing the first N characters.\n" \ " -u, --unique -- Suppress the writing of lines that are repeated in \n" \ " the input.\n" \ " -w, --check-chars=N -- Only check the first N characters.\n" \ " -z, --zero-terminated -- Separate lines with zero bytes rather than " \ "newlines.\n" \ " --help -- Show this help text and exit.\n" \ " --version -- Show the application version and exit.\n" \ #define UNIQ_OPTIONS_STRING "cdDf:is:uw:zhV" // // Define uniq options. // // // Set this option to precede each output line with a count. // #define UNIQ_OPTION_PRINT_COUNT 0x00000001 // // Set this flag to skip writing lines that are not repeated. // #define UNIQ_OPTION_SUPPRESS_UNIQUE 0x00000002 // // Set this flag to ignore case when comparing. // #define UNIQ_OPTION_IGNORE_CASE 0x00000004 // // Set this flag to suppress repeated lines in the input. // #define UNIQ_OPTION_SUPPRESS_REPEATED 0x00000008 // // Set this flag to print the second and subsequent repeated lines. // #define UNIQ_OPTION_ALL_REPEATED 0x00000010 // // ------------------------------------------------------ Data Type Definitions // typedef enum _UNIQ_GROUPING { UniqGroupNone, UniqGroupPrepend, UniqGroupSeparate } UNIQ_GROUPING, *PUNIQ_GROUPING; // // ----------------------------------------------- Internal Function Prototypes // PSTR UniqSkip ( PSTR Input, ULONG FieldCount, ULONG CharacterCount ); // // -------------------------------------------------------------------- Globals // struct option UniqLongOptions[] = { {"count", no_argument, 0, 'c'}, {"repeated", no_argument, 0, 'd'}, {"all-repeated", optional_argument, 0, 'D'}, {"skip-fields", required_argument, 0, 'f'}, {"ignore-case", no_argument, 0, 'i'}, {"skip-chars", required_argument, 0, 's'}, {"unique", no_argument, 0, 'u'}, {"check-chars", required_argument, 0, 'w'}, {"zero-terminated", no_argument, 0, 'z'}, {"help", no_argument, 0, 'h'}, {"version", no_argument, 0, 'V'}, {NULL, 0, 0, 0}, }; // // ------------------------------------------------------------------ Functions // INT UniqMain ( INT ArgumentCount, CHAR **Arguments ) /*++ Routine Description: This routine is the main entry point for the cp utility. Arguments: ArgumentCount - Supplies the number of command line arguments the program was invoked with. Arguments - Supplies a tokenized array of command line arguments. Return Value: Returns an integer exit code. 0 for success, nonzero otherwise. --*/ { PSTR AfterScan; PSTR Argument; ULONG ArgumentIndex; size_t CharacterCount; INT Comparison; BOOL FirstGroup; FILE *Input; PSTR InputName; PSTR Line; PSTR LineStart; INT Option; ULONG Options; FILE *Output; PSTR OutputName; PSTR PreviousLine; PSTR PreviousLineStart; BOOL PrintLine; ULONG RepeatCount; UNIQ_GROUPING RepeatGroup; CHAR Separator; LONG SkipCharacters; LONG SkipFields; int Status; CharacterCount = -1; FirstGroup = TRUE; Input = NULL; Line = NULL; PreviousLine = NULL; Options = 0; Output = NULL; RepeatGroup = UniqGroupNone; Separator = '\n'; SkipCharacters = 0; SkipFields = 0; // // Process the control arguments. // while (TRUE) { Option = getopt_long(ArgumentCount, Arguments, UNIQ_OPTIONS_STRING, UniqLongOptions, NULL); if (Option == -1) { break; } if ((Option == '?') || (Option == ':')) { Status = 1; goto MainEnd; } switch (Option) { case 'c': Options |= UNIQ_OPTION_PRINT_COUNT; break; case 'd': Options |= UNIQ_OPTION_SUPPRESS_UNIQUE; break; case 'D': Options |= UNIQ_OPTION_ALL_REPEATED | UNIQ_OPTION_SUPPRESS_UNIQUE; if (optarg != NULL) { if (strcmp(optarg, "none") == 0) { RepeatGroup = UniqGroupNone; } else if (strcmp(optarg, "prepend") == 0) { RepeatGroup = UniqGroupPrepend; } else if (strcmp(optarg, "separate") == 0) { RepeatGroup = UniqGroupSeparate; } else { SwPrintError(0, optarg, "Unknown grouping type"); Status = 1; goto MainEnd; } } break; case 'i': Options |= UNIQ_OPTION_IGNORE_CASE; break; case 'u': Options |= UNIQ_OPTION_SUPPRESS_REPEATED; break; case 'f': Argument = optarg; assert(Argument != NULL); SkipFields = strtol(Argument, &AfterScan, 10); if ((SkipFields < 0) || (AfterScan == Argument)) { SwPrintError(0, Argument, "Invalid field count"); return 1; } break; case 's': Argument = optarg; assert(Argument != NULL); SkipCharacters = strtol(Argument, &AfterScan, 10); if ((SkipCharacters < 0) || (AfterScan == Argument)) { SwPrintError(0, Argument, "Invalid character count"); return 1; } break; case 'w': Argument = optarg; CharacterCount = strtoul(Argument, &AfterScan, 10); if (AfterScan == Argument) { SwPrintError(0, Argument, "Invalid character count"); return 1; } break; case 'z': Separator = '\0'; break; case 'V': SwPrintVersion(UNIQ_VERSION_MAJOR, UNIQ_VERSION_MINOR); return 1; case 'h': printf(UNIQ_USAGE); return 1; default: assert(FALSE); Status = 1; goto MainEnd; } } if (((Options & UNIQ_OPTION_ALL_REPEATED) != 0) && ((Options & UNIQ_OPTION_PRINT_COUNT) != 0)) { SwPrintError(0, NULL, "-D and -c together is invalid"); Status = 1; goto MainEnd; } // // Get the optional input and output names. // ArgumentIndex = optind; if (ArgumentIndex > ArgumentCount) { ArgumentIndex = ArgumentCount; } InputName = NULL; OutputName = NULL; if (ArgumentIndex < ArgumentCount) { InputName = Arguments[ArgumentIndex]; if (strcmp(InputName, "-") == 0) { InputName = NULL; } ArgumentIndex += 1; if (ArgumentIndex < ArgumentCount) { OutputName = Arguments[ArgumentIndex]; ArgumentIndex += 1; if (ArgumentIndex < ArgumentCount) { SwPrintError(0, Arguments[ArgumentIndex], "Too many arguments"); return 1; } } } if (InputName == NULL) { Input = stdin; } else { Input = fopen(InputName, "r"); if (Input == NULL) { Status = errno; SwPrintError(Status, InputName, "Unable to open"); goto MainEnd; } } if (OutputName == NULL) { Output = stdout; } else { Output = fopen(OutputName, "r"); if (Output == NULL) { Status = errno; SwPrintError(Status, OutputName, "Unable to open"); goto MainEnd; } } // // Loop processing the files. // Status = SwReadLine(Input, &PreviousLine); if ((Status != 0) || (PreviousLine == NULL)) { goto MainEnd; } RepeatCount = 1; while (TRUE) { if (feof(Input) != 0) { Line = NULL; Comparison = 1; } else { Status = SwReadLine(Input, &Line); if (Status != 0) { goto MainEnd; } if (Line == NULL) { Comparison = 1; } else { LineStart = UniqSkip(Line, SkipFields, SkipCharacters); PreviousLineStart = UniqSkip(PreviousLine, SkipFields, SkipCharacters); if ((Options & UNIQ_OPTION_IGNORE_CASE) != 0) { Comparison = strncasecmp(LineStart, PreviousLineStart, CharacterCount); } else { Comparison = strncmp(LineStart, PreviousLineStart, CharacterCount); } } } // // Handle the lines being equal (duplicate). // if (Comparison == 0) { // // Print if repeated lines are requested. // if ((Options & UNIQ_OPTION_ALL_REPEATED) != 0) { // // Separate groups of repeated lines if requested. The only // difference bewteen prepend and separate is that separate // doesn't print a delimiter before the first group. // if (RepeatCount == 1) { if ((RepeatGroup == UniqGroupPrepend) || ((RepeatGroup == UniqGroupSeparate) && (FirstGroup == FALSE))) { putchar(Separator); } FirstGroup = FALSE; } // // Print the line. // PrintLine = TRUE; RepeatCount += 1; // // Skip the repeated line normally. // } else { free(Line); Line = NULL; RepeatCount += 1; continue; } } else { // // They're not equal, so spit this line out. // PrintLine = TRUE; if (RepeatCount == 1) { if ((Options & UNIQ_OPTION_SUPPRESS_UNIQUE) != 0) { PrintLine = FALSE; } } else { if ((Options & UNIQ_OPTION_SUPPRESS_REPEATED) != 0) { PrintLine = FALSE; } } } if (PrintLine != FALSE) { if ((Options & UNIQ_OPTION_PRINT_COUNT) != 0) { printf("%7d %s%c", RepeatCount, PreviousLine, Separator); } else { printf("%s%c", PreviousLine, Separator); } } // // Move the current line to the previous line. // free(PreviousLine); PreviousLine = Line; if (Comparison != 0) { RepeatCount = 1; } if (Line == NULL) { break; } } MainEnd: if (Line != NULL) { free(Line); } if (PreviousLine != NULL) { free(PreviousLine); } if ((Input != NULL) && (Input != stdin)) { fclose(Input); } if ((Output != NULL) && (Output != stdout)) { fclose(Output); } return Status; } // // --------------------------------------------------------- Internal Functions // PSTR UniqSkip ( PSTR Input, ULONG FieldCount, ULONG CharacterCount ) /*++ Routine Description: This routine skips a certain number of fields and/or characters, where a field is defined as any number of blank spaces followed by any number of non-blank spaces. Fields are skipped before characters. Arguments: Input - Supplies the input to advance. FieldCount - Supplies the number of fields to skip. CharacterCount - Supplies the number of characters to skip. Return Value: Returns a pointer within the string advanced past the specified number of fields and/or characters.. --*/ { ULONG FieldIndex; for (FieldIndex = 0; FieldIndex < FieldCount; FieldIndex += 1) { if (*Input == '\0') { break; } while (isblank(*Input)) { Input += 1; } while (!isblank(*Input)) { Input += 1; } } while ((CharacterCount != 0) && (*Input != '\0')) { Input += 1; CharacterCount -= 1; } return Input; }