Browse Source

Introduce a new command: field.

Field extracts fields from tabular data; it can be used as a
replacement for 'awk' for things where one just wants a numbered
field. It is, I claim, more functional than 'cut'.

Change-Id: I2257670b2fa58d24563415e1be0c41c53421eb3c
Signed-off-by: Dan Cross <cross@gajendra.net>
Dan Cross 8 years ago
parent
commit
9d898d9661
3 changed files with 561 additions and 0 deletions
  1. 83 0
      sys/man/1/field
  2. 1 0
      sys/src/cmd/cmds.json
  3. 477 0
      sys/src/cmd/field.c

+ 83 - 0
sys/man/1/field

@@ -0,0 +1,83 @@
+.TH FIELD 1
+.SH NAME
+field \- extract fields from tabular data
+.SH SYNOPSIS
+.B field
+[
+.B -E
+|
+.B -e
+]
+[
+.BI -F regexp
+]
+[
+.B -0
+|
+.BI -O separator
+]
+.I field list
+[
+.I file...
+]
+.SH DESCRIPTION
+.I Field
+extracts and reports fields from tabular data from files.  If no files are
+specified, data is read from the standard input.  The file name `-' also
+specifies the standard input.
+.PP
+The input field separator defaults to one or more whitespace
+characters and can be set by the
+.B -F
+option.  The output field separator defaults to a single space character
+and can be set to an arbitrary string by the
+.B -O
+option.  If the
+.B -0
+option is given, fields will be separated by a single NUL character
+(binary zero).
+.B -O
+and
+.B -0
+are mutually exclusive.
+.PP
+.B Field
+uses a heuristic to decide whether to include empty fields in the output:
+if the delimiter is a single character, or a single escaped character, empty
+fields will be included.  Otherwise, they will be collapsed and empty fields
+at the beginning and end of a line will be ignored.  This behavior can be
+overridden by the
+.B -e
+or
+.B -E
+options:
+.B -e
+will force inclusion of empty fields and
+.B -E
+will force collapsing of empty fields.
+.B -E
+and
+.B -e
+are mutually exclusive.
+.PP
+Fields are selected by number, origin 1. Field 0 is the entire line.  Negative
+fields count from the end of the line; thus, -1 is the last field.  The literal string
+`NF' can also be given to specify the last field on the line.  Inclusive ranges of
+fields can be given by joining the beginning and ending fields with a single `-'
+character; if the second number is omitted it is assumed to be NF.
+.PP
+Multiple fields can be given, separated by whitespace, commas, or the
+pipe character.
+.PP
+Fields can be repeated, and can be specified in arbitrary order.  For example:
+.PP
+.EX
+	field -F: 2,1,-2-NF /adm/users
+.EE
+.SH SOURCE
+.B /sys/src/cmd/field.c
+.SH "SEE ALSO"
+.IR awk (1),
+.IR cut (1)
+.SH BUGS
+The range syntax only permits ascending ranges.

+ 1 - 0
sys/src/cmd/cmds.json

@@ -59,6 +59,7 @@
 		"ed.c",
 		"factor.c",
 		"fcp.c",
+		"field.c",
 		"file.c",
 		"fmt.c",
 		"fortune.c",

+ 477 - 0
sys/src/cmd/field.c

@@ -0,0 +1,477 @@
+#include <u.h>
+#include <libc.h>
+#include <bio.h>
+#include <ctype.h>
+#include <regexp.h>
+
+typedef struct Range Range;
+typedef struct Slice Slice;
+typedef struct Slices Slices;
+typedef struct Token Token;
+
+struct Range {	/* one selection from the field list, as built by parserange */
+	int begin;	/* first field, origin 1; 0 is the whole line, negative counts from the end, or NF */
+	int end;	/* last field of the range, same encoding as begin */
+};
+
+struct Slice {	/* a run of bytes inside a larger string; printed with %S */
+	char *begin;	/* first byte; nil marks end of input for tokens */
+	char *end;	/* one past the last byte */
+};
+#pragma varargck type "S" Slice
+
+struct Slices {	/* growable array of Slice, filled by append */
+	uint len;	/* number of entries in use */
+	uint size;	/* number of entries allocated */
+	Slice *slices;	/* storage, grown with realloc */
+};
+
+struct Token {	/* not referenced elsewhere in the visible source */
+	int type;
+	Slice slice;
+};
+
+enum {
+	NF = 0x7FFFFFFF	/* sentinel field number standing for the last field of a line */
+};
+
+Biobuf bin;	/* buffered standard input, set up in main on fd 0 */
+Biobuf bout;	/* buffered standard output, set up in main on fd 1 */
+
+int guesscollapse(const char *sep);	/* default for collapsing empty fields, judged from the separator's length */
+int Sfmt(Fmt *f);	/* %S verb: prints a Slice */
+Slice lex(char **sp);	/* scan one token of the field list, advancing *sp */
+Slice next(char **sp);	/* return the lookahead token and lex a new one */
+Slice peek(void);	/* return the lookahead token without consuming it */
+void extend(Slice *slice, char **sp);	/* stretch *slice to the end of the next token */
+int tiseof(Slice *tok);	/* true for a nil token or one whose begin is nil */
+int tisdelim(Slice *tok);	/* true at end of input, whitespace, ',' or '|' */
+int tisspace(Slice *tok);	/* true if the token starts with whitespace */
+int parseranges(char *src, Range **rv);	/* parse a field list; returns the count, or -1 for an empty list */
+Range parserange(char **sp);	/* parse one field or range */
+int stoi(Slice slice);	/* convert a decimal token, optionally led by '-', to int */
+int parsenum(char **s);	/* parse a possibly negative number from the token stream */
+void process(Biobuf *b, int rc, Range *rv, Reprog *delim, char *sep, int collapse);	/* print the selected fields of every line read from b */
+void pprefix(char *prefix);	/* write the separator ahead of a field; nil writes nothing */
+uint split(char *line, Reprog *delim, Slices *ss, int collapse);	/* cut line into fields at delim matches; returns the count */
+void reset(Slices *ss);	/* empty ss, keeping its storage */
+void append(Slices *ss, char *begin, char *end);	/* add [begin, end) to ss, growing it as needed */
+void usage(void);	/* report the usage message through sysfatal */
+
+/*
+ * Parse the command line, compile the input separator, parse the
+ * field list (the first non-option argument), then run process()
+ * over every named file.  With no file arguments, or for the name
+ * `-', the standard input is read instead.
+ */
+void
+main(int argc, char *argv[])
+{
+	Range *ranges;
+	Reprog *delim;
+	Biobuf *in;
+	char *path, *insep, *outsep;
+	int nranges, collapse;
+	int eflag, Eflag, oflag, zflag;
+
+	eflag = Eflag = oflag = zflag = 0;
+	insep = "[ \t\v\r]+";
+	outsep = " ";
+	fmtinstall('S', Sfmt);
+	Binit(&bin, 0, OREAD);
+	Binit(&bout, 1, OWRITE);
+
+	ARGBEGIN {
+	case 'E':
+		Eflag = 1;
+		break;
+	case 'e':
+		eflag = 1;
+		break;
+	case 'F':
+		insep = EARGF(usage());
+		break;
+	case 'O':
+		oflag = 1;
+		outsep = EARGF(usage());
+		break;
+	case '0':
+		/* an empty separator is written as a NUL byte by pprefix */
+		outsep = "";
+		zflag = 1;
+		break;
+	default:
+		usage();
+		break;
+	} ARGEND;
+
+	if (eflag && Eflag) {
+		fprint(2, "flag conflict: -e and -E are mutually exclusive\n");
+		usage();
+	}
+	if (oflag && zflag) {
+		fprint(2, "flag conflict: -0 and -O are mutually exclusive\n");
+		usage();
+	}
+	if (argc <= 0)
+		usage();
+
+	delim = regcomp(insep);
+	if (delim == nil)
+		sysfatal("bad input separator regexp '%s': %r", insep);
+
+	ranges = nil;
+	nranges = parseranges(*argv++, &ranges);
+	if (nranges < 0)
+		sysfatal("parseranges failed");
+
+	/* an explicit -E or -e overrides the guess made from the separator */
+	if (Eflag)
+		collapse = 1;
+	else if (eflag)
+		collapse = 0;
+	else
+		collapse = guesscollapse(insep);
+
+	if (*argv == nil)
+		process(&bin, nranges, ranges, delim, outsep, collapse);
+	for (; (path = *argv) != nil; argv++) {
+		if (strcmp(path, "-") == 0) {
+			process(&bin, nranges, ranges, delim, outsep, collapse);
+			continue;
+		}
+		in = Bopen(path, OREAD);
+		if (in == nil)
+			sysfatal("failure opening '%s': %r", path);
+		process(in, nranges, ranges, delim, outsep, collapse);
+		Bterm(in);
+	}
+
+	exits(0);
+}
+
+/*
+ * Guess whether empty fields should be collapsed for the separator
+ * sep.  A separator of at most one character, or of exactly two
+ * characters the first of which is a backslash, yields 0 (keep
+ * empty fields); anything else yields 1.
+ */
+int
+guesscollapse(const char *sep)
+{
+	int nchars;
+
+	nchars = utflen(sep);
+	if (nchars <= 1)
+		return 0;
+	if (nchars == 2 && *sep == '\\')
+		return 0;
+	return 1;
+}
+
+/*
+ * Format routine for the %S verb: prints the bytes of a Slice, from
+ * begin up to but not including end.  A slice whose begin or end is
+ * nil prints nothing and 0 is returned; otherwise the result of
+ * fmtprint is returned.
+ */
+int
+Sfmt(Fmt *f)
+{
+	Slice s;
+
+	s = va_arg(f->args, Slice);
+	if (s.begin == nil || s.end == nil)
+		return 0;
+	/*
+	 * The `*' precision consumes an int argument; a raw pointer
+	 * difference can be wider than int, so convert it explicitly.
+	 */
+	return fmtprint(f, "%.*s", (int)(s.end - s.begin), s.begin);
+}
+
+/*
+ * The field selection syntax is:
+ *
+ * fields := range [[delim] fields]
+ * range := field | NUM '-' [field]
+ * field := NUM | 'NF'
+ * delim := ws+ | '|' | ','
+ * ws := c such that `isspace(c)` is true.
+ *
+ * Lex scans one token starting at *sp and advances *sp past it.
+ * Tokens are: a run of whitespace, a single '-', ',' or '|', the
+ * word NF, or a run of decimal digits.  At the end of the string
+ * the returned slice has a nil begin.  Any other character, or an
+ * 'N' that is not followed by 'F', is a fatal lexical error.
+ */
+Slice
+lex(char **sp)
+{
+	char *s;
+	Slice slice;
+
+	memset(&slice, 0, sizeof(slice));
+	s = *sp;
+	slice.begin = s;
+	while (isspace(*s))
+		s++;
+	/* s moved only if a whitespace token was scanned */
+	if (s == *sp) {
+		switch (*s) {
+		case '\0':
+			slice.begin = nil;
+			break;
+		case '-':
+		case ',':
+		case '|':
+			s++;
+			break;
+		case 'N':
+			/* the grammar has no bare 'N' token, only 'NF' */
+			if (s[1] != 'F')
+				sysfatal("lexical error, c = %c", *s);
+			s += 2;
+			break;
+		default:
+			if (!isdigit(*s))
+				sysfatal("lexical error, c = %c", *s);
+			while (isdigit(*s))
+				s++;
+			break;
+		}
+	}
+	slice.end = s;
+	*sp = s;
+
+	return slice;
+}
+
+/* One-token lookahead for the field list parser; refilled by next(). */
+Slice current;
+
+/*
+ * Return the lookahead token without consuming it.
+ */
+Slice
+peek(void)
+{
+	return current;
+}
+
+/*
+ * Consume the lookahead token: hand back the token that was current
+ * on entry and replace it with the next one lexed from *sp.
+ */
+Slice
+next(char **sp)
+{
+	Slice prev;
+
+	prev = current;
+	current = lex(sp);
+	return prev;
+}
+
+/*
+ * Consume one token and stretch *slice so that it ends where that
+ * token ends.
+ */
+void
+extend(Slice *slice, char **sp)
+{
+	Slice consumed;
+
+	consumed = next(sp);
+	slice->end = consumed.end;
+}
+
+/*
+ * Convert the decimal number in slice, optionally preceded by '-',
+ * to an int.  A non-digit inside the slice is fatal, as is a
+ * magnitude too large to represent: the largest magnitude accepted
+ * is one below 0x7FFFFFFF, which is the value of the NF sentinel,
+ * so a literal number can neither overflow nor be mistaken for NF.
+ */
+int
+stoi(Slice slice)
+{
+	enum { Maxmag = 0x7FFFFFFF - 1 };
+	char *s;
+	int n, sign, digit;
+
+	n = 0;
+	sign = 1;
+	s = slice.begin;
+	if (*s == '-') {
+		sign = -1;
+		s++;
+	}
+	for (; s != slice.end; s++) {
+		if (!isdigit(*s))
+			sysfatal("stoi: bad number in '%S', c = %c", slice, *s);
+		digit = *s - '0';
+		/* check before multiplying: signed overflow is undefined */
+		if (n > (Maxmag - digit) / 10)
+			sysfatal("stoi: number too large in '%S'", slice);
+		n = n * 10 + digit;
+	}
+
+	return sign * n;
+}
+
+/*
+ * Report whether tok marks the end of input: either no token at all
+ * or a token whose begin pointer is nil.
+ */
+int
+tiseof(Slice *tok)
+{
+	if (tok == nil)
+		return 1;
+	return tok->begin == nil;
+}
+
+/*
+ * Report whether tok can end a range: end of input, a whitespace
+ * token, ',' or '|'.
+ */
+int
+tisdelim(Slice *tok)
+{
+	if (tiseof(tok))
+		return 1;
+	if (tisspace(tok))
+		return 1;
+	switch (*tok->begin) {
+	case ',':
+	case '|':
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * Report whether tok is a whitespace token, judged by its first
+ * character.  The end-of-input token is not whitespace.
+ */
+int
+tisspace(Slice *tok)
+{
+	if (tiseof(tok))
+		return 0;
+	return isspace(*tok->begin) != 0;
+}
+
+/*
+ * Parse the field list in src into a freshly allocated array of
+ * ranges, stored through rv.  Returns the number of ranges, or -1
+ * when src is nil or empty.  Syntax errors and allocation failure
+ * are fatal.
+ */
+int
+parseranges(char *src, Range **rv)
+{
+	char *s;
+	Range r, *rs, *t;
+	int n, m;
+	Slice tok;
+
+	if (src == nil || *src == '\0')
+		return -1;
+	rs = nil;
+	m = 0;
+	n = 0;
+	s = src;
+	next(&s);	/* prime the lookahead */
+	do {
+		/*
+		 * Drop whitespace ahead of a range.  next() returns the token
+		 * it consumes, not the new lookahead, so the lookahead has to
+		 * be re-read with peek(); testing the return value of next()
+		 * would swallow the first token of the range as well.
+		 */
+		for (tok = peek(); tisspace(&tok); tok = peek())
+			next(&s);
+		r = parserange(&s);
+		if (n >= m) {
+			/* double the capacity, starting from one entry */
+			m = 2*m;
+			if (m == 0)
+				m = 1;
+			t = realloc(rs, sizeof(Range) * m);
+			if (t == nil)
+				sysfatal("realloc failed parsing ranges");
+			rs = t;
+		}
+		rs[n++] = r;
+		/* a range must be followed by a delimiter or the end of input */
+		tok = next(&s);
+		if (!tisdelim(&tok))
+			sysfatal("syntax error in field list");
+	} while (!tiseof(&tok));
+	*rv = rs;
+
+	return n;
+}
+
+/*
+ * Report whether the text of tok is exactly the string s.  The
+ * lengths must agree as well as the bytes: comparing only as many
+ * bytes as the token holds would let a token that is a proper
+ * prefix of s (such as "N" against "NF") compare equal.
+ */
+int
+tokeq(Slice *tok, const char *s)
+{
+	long len;
+
+	if (tiseof(tok))
+		return 0;
+	len = tok->end - tok->begin;
+	if (len != (long)strlen(s))
+		return 0;
+	return strncmp(tok->begin, s, len) == 0;
+}
+
+/*
+ * Parse one element of the field list from the token stream:
+ * `NF', a single number, or a number followed by '-' and an
+ * optional end (a number or `NF').  A single field yields a range
+ * whose begin and end are equal; an omitted end yields NF.
+ */
+Range
+parserange(char **sp)
+{
+	Range r;
+	Slice la;
+
+	la = peek();
+	if (tokeq(&la, "NF")) {
+		next(sp);
+		r.begin = NF;
+		r.end = NF;
+		return r;
+	}
+
+	r.begin = parsenum(sp);
+	r.end = r.begin;
+	la = peek();
+	if (!tokeq(&la, "-"))
+		return r;
+
+	/* an open-ended range runs to the last field unless an end follows */
+	next(sp);
+	r.end = NF;
+	la = peek();
+	if (tokeq(&la, "NF"))
+		next(sp);
+	else if (!tisdelim(&la))	/* tisdelim is also true at end of input */
+		r.end = parsenum(sp);
+	return r;
+}
+
+/*
+ * Consume one number from the token stream and return its value.
+ * The number is either a digit token or a '-' token immediately
+ * followed by a digit token.  End of input or any other token is
+ * fatal.
+ */
+int
+parsenum(char **sp)
+{
+	Slice tok;
+
+	tok = next(sp);
+	if (tiseof(&tok))
+		sysfatal("EOF in number parser");
+	if (isdigit(*tok.begin))
+		return stoi(tok);
+	if (*tok.begin != '-')
+		sysfatal("number parse error: unexpected '%S'", tok);
+	/* join the '-' with the token that follows it */
+	extend(&tok, sp);
+	if (!isdigit(*(tok.begin + 1)))
+		sysfatal("negative number parse error: unexpected '%S'", tok);
+	return stoi(tok);
+}
+
+/*
+ * Read b line by line, split each line into fields with delim, and
+ * write the fields selected by the rc ranges in rv to bout, putting
+ * outsep between consecutive items.  A newline ends each output
+ * line; when collapsing, a line that selected nothing produces no
+ * output at all.
+ */
+void
+process(Biobuf *b, int rc, Range *rv, Reprog *delim, char *outsep, int collapse)
+{
+	Slices fields;
+	Range *r;
+	char *line, *lead;
+	uint nf;
+	int k, i, first, last, wrote;
+
+	memset(&fields, 0, sizeof(fields));
+	for (;;) {
+		/* the final argument asks Brdstr to replace the newline with a NUL */
+		line = Brdstr(b, '\n', 1);
+		if (line == 0)
+			break;
+		nf = split(line, delim, &fields, collapse);
+		wrote = 0;
+		lead = nil;	/* nothing precedes the first item on a line */
+		for (k = 0; k < rc; k++) {
+			r = &rv[k];
+			first = r->begin;
+			last = r->end;
+			if (first == 0) {
+				/* field 0 is the whole line */
+				pprefix(lead);
+				lead = outsep;
+				Bprint(&bout, "%s", line);
+				wrote = 1;
+				first = 1;
+			}
+			if (first == NF)
+				first = nf;
+			if (first < 0)
+				first += nf + 1;	/* negative fields count from the end */
+			first--;	/* origin 1 to origin 0 */
+			if (last < 0)
+				last += nf + 1;
+			if (first < 0 || last < 0 || last < first || nf < first)
+				continue;
+			for (i = first; i < last && i < nf; i++) {
+				pprefix(lead);
+				lead = outsep;
+				Bprint(&bout, "%S", fields.slices[i]);
+				wrote = 1;
+			}
+		}
+		if (rc != 0 && (wrote || !collapse))
+			Bputc(&bout, '\n');
+		free(line);
+	}
+	free(fields.slices);
+}
+
+/*
+ * Write the separator that precedes an output item.  A nil prefix
+ * writes nothing; an empty prefix writes a single NUL byte; any
+ * other prefix is written as a string.
+ */
+void
+pprefix(char *prefix)
+{
+	if (prefix == nil)
+		return;
+	if (*prefix != '\0') {
+		Bprint(&bout, "%s", prefix);
+		return;
+	}
+	Bputc(&bout, '\0');
+}
+
+/*
+ * Empty ss.  Only the count is cleared; the allocated storage and
+ * its recorded size are left alone for reuse.
+ */
+void
+reset(Slices *ss)
+{
+	ss->len = 0;
+}
+
+/*
+ * Cut line into fields at each match of delim, storing the fields
+ * in ss (emptied first), and return how many were stored.  The
+ * slices point into line; nothing is copied.  When collapse is set,
+ * empty fields are dropped, including those at the start and end of
+ * the line.
+ *
+ * A match of zero length is not treated as a separator: the search
+ * resumes one rune further on.  Without this, a regexp that can
+ * match the empty string would never advance and the loop would not
+ * end.
+ */
+uint
+split(char *line, Reprog *delim, Slices *ss, int collapse)
+{
+	char *s, *b, *e;
+	Resub match[1];
+	Rune r;
+
+	reset(ss);
+	b = line;	/* start of the field being scanned */
+	s = line;	/* where the next search begins */
+	for (;;) {
+		memset(match, 0, sizeof(match));
+		if (regexec(delim, s, match, nelem(match)) <= 0)
+			break;
+		e = match[0].sp;
+		if (match[0].ep == e) {
+			/* empty match: no separator here */
+			if (*e == '\0')
+				break;
+			s = e + chartorune(&r, e);
+			continue;
+		}
+		s = match[0].ep;
+		if (!collapse || b != e)
+			append(ss, b, e);
+		b = s;
+	}
+	/* whatever follows the last separator is the final field */
+	e = b + strlen(b);
+	if (!collapse || b != e)
+		append(ss, b, e);
+
+	return ss->len;
+}
+
+/*
+ * Add the slice [begin, end) to the end of ss.  When the array is
+ * full its capacity is doubled (starting from one entry) with
+ * realloc; allocation failure is fatal.
+ */
+void
+append(Slices *ss, char *begin, char *end)
+{
+	Slice *grown, *slot;
+	uint cap;
+
+	if (ss->len >= ss->size) {
+		cap = ss->size * 2;
+		if (cap == 0)
+			cap = 1;
+		ss->size = cap;
+		grown = realloc(ss->slices, ss->size * sizeof(Slice));
+		if (grown == nil)
+			sysfatal("malloc failed appending slice: %r");
+		ss->slices = grown;
+	}
+	slot = &ss->slices[ss->len];
+	slot->begin = begin;
+	slot->end = end;
+	ss->len++;
+}
+
+/*
+ * Report the command's usage through sysfatal.
+ */
+void
+usage(void)
+{
+	sysfatal("usage: field [ -E | -e ] [ -F regexp ] [ -0 | -O delimiter ] <field list> [file...]");
+}