Browse Source

implement POSIX regexp support

Introduce a new operator `~` and new `/.../eis` regular expression syntax.

This allows filtering by regular expression, e.g.

   jsonfilter -s '[ "foo", "bar", "baz" ]' -e '$[@ ~ /^b/]'

... would yield the values `bar` and `baz`.

Possible regular expression modifiers are:

  - `e` ... enable extended POSIX regular expressions
  - `i` ... perform case-insensitive matches
  - `s` ... let `.` and non-matching bracket expressions (e.g. `[^a]`) match the newline character

A regular expression literal may occur on the left or the right side of
the `~` operator, but not on both.

In case neither side of the `~` operator is a regular expression literal, the
right side will be treated as a regular expression pattern. Non-string values
are converted to their string representation before matching is performed.

Signed-off-by: Jo-Philipp Wich <jo@mein.io>
Jo-Philipp Wich 6 years ago
parent
commit
c7e938d658
5 changed files with 176 additions and 4 deletions
  1. 73 2
      lexer.c
  2. 1 1
      lexer.h
  3. 97 0
      matcher.c
  4. 2 0
      matcher.h
  5. 3 1
      parser.y

+ 73 - 2
lexer.c

@@ -18,6 +18,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include <ctype.h>
+#include <regex.h>
 
 #include "ast.h"
 #include "lexer.h"
@@ -236,7 +237,21 @@ parse_string(const char *buf, struct jp_opcode *op, struct jp_state *s)
 				case 'r': *out = '\r'; break;
 				case 't': *out = '\t'; break;
 				case 'v': *out = '\v'; break;
-				default:  *out = *in; break;
+				default:
+					/* in regexp mode, retain backslash */
+					if (q == '/')
+					{
+						if (rem-- < 1)
+						{
+							s->error_pos = s->off + (in - buf);
+							return -3;
+						}
+
+						*out++ = '\\';
+					}
+
+					*out = *in;
+					break;
 				}
 
 				in++;
@@ -277,6 +292,58 @@ parse_string(const char *buf, struct jp_opcode *op, struct jp_state *s)
 }
 
 
+/*
+ * Parses a regexp literal from the given buffer.
+ *
+ * The pattern itself is read by parse_string(). When that succeeds, the
+ * characters following the pattern are scanned for modifiers and the
+ * resulting regcomp() flags are stored in op->num:
+ *
+ *  e	set REG_EXTENDED
+ *  i	set REG_ICASE
+ *  s	clear REG_NEWLINE
+ *
+ * REG_NOSUB and REG_NEWLINE are set by default. Scanning stops at the
+ * first character that is not one of the modifiers above.
+ *
+ * Returns a negative value on error, otherwise the amount of consumed
+ * characters from the given buffer, including any modifier characters.
+ *
+ * Error values:
+ *  -1	Unterminated regexp
+ *  -2	Invalid escape sequence
+ *  -3	Regexp literal too long
+ */
+
+static int
+parse_regexp(const char *buf, struct jp_opcode *op, struct jp_state *s)
+{
+	int len = parse_string(buf, op, s);
+	const char *p;
+
+	/* error or too short for a literal: pass parse_string()'s result on */
+	if (len < 2)
+		return len;
+
+	op->num = REG_NOSUB | REG_NEWLINE;
+
+	/* test the character, not the pointer, so the scan ends at the NUL */
+	for (p = buf + len; *p; p++)
+	{
+		switch (*p)
+		{
+		case 'e':
+			op->num |= REG_EXTENDED;
+			len++;
+			break;
+
+		case 'i':
+			op->num |= REG_ICASE;
+			len++;
+			break;
+
+		case 's':
+			op->num &= ~REG_NEWLINE;
+			len++;
+			break;
+
+		default:
+			/* first non-modifier character ends the literal */
+			return len;
+		}
+	}
+
+	return len;
+}
+
+
 /*
  * Parses a label from the given buffer.
  *
@@ -367,8 +434,10 @@ static const struct token tokens[] = {
 	{ T_LT,			"<",     1 },
 	{ T_GT,			">",     1 },
 	{ T_EQ,			"=",     1 },
+	{ T_MATCH,		"~",     1 },
 	{ T_NOT,		"!",     1 },
 	{ T_WILDCARD,	"*",     1 },
+	{ T_REGEXP,		"/",	 1, parse_regexp },
 	{ T_STRING,		"'",	 1, parse_string },
 	{ T_STRING,		"\"",	 1, parse_string },
 	{ T_LABEL,		"_",     1, parse_label  },
@@ -378,7 +447,7 @@ static const struct token tokens[] = {
 	{ T_NUMBER,		"09",    0, parse_number },
 };
 
-const char *tokennames[23] = {
+const char *tokennames[25] = {
 	[0]				= "End of file",
 	[T_AND]			= "'&&'",
 	[T_OR]			= "'||'",
@@ -389,12 +458,14 @@ const char *tokennames[23] = {
 	[T_GE]			= "'>='",
 	[T_LT]			= "'<'",
 	[T_LE]			= "'<='",
+	[T_MATCH]       = "'~'",
 	[T_NOT]			= "'!'",
 	[T_LABEL]		= "Label",
 	[T_ROOT]		= "'$'",
 	[T_THIS]		= "'@'",
 	[T_DOT]			= "'.'",
 	[T_WILDCARD]	= "'*'",
+	[T_REGEXP]      = "/.../",
 	[T_BROPEN]		= "'['",
 	[T_BRCLOSE]		= "']'",
 	[T_BOOL]		= "Bool",

+ 1 - 1
lexer.h

@@ -19,7 +19,7 @@
 
 #include "ast.h"
 
-extern const char *tokennames[23];
+extern const char *tokennames[25];
 
 struct jp_opcode *
 jp_get_token(struct jp_state *s, const char *input, int *mlen);

+ 97 - 0
matcher.c

@@ -17,6 +17,7 @@
 #include "parser.h"
 #include "matcher.h"
 
+
 static struct json_object *
 jp_match_next(struct jp_opcode *ptr,
               struct json_object *root, struct json_object *cur,
@@ -130,6 +131,99 @@ jp_cmp(struct jp_opcode *op, struct json_object *root, struct json_object *cur)
 	}
 }
 
+/*
+ * Returns the string form of a resolved operand for regexp matching.
+ *
+ * Booleans become "true" or "false", numbers are formatted in decimal
+ * into the caller supplied buffer and strings are returned as-is. Any
+ * other operand type yields NULL.
+ */
+static const char *
+jp_regmatch_str(struct jp_opcode *op, char *buf, size_t len)
+{
+	switch (op->type)
+	{
+	case T_BOOL:
+		return op->num ? "true" : "false";
+
+	case T_NUMBER:
+		snprintf(buf, len, "%d", op->num);
+		return buf;
+
+	case T_STRING:
+		return op->str;
+
+	default:
+		return NULL;
+	}
+}
+
+/*
+ * Evaluates a T_MATCH expression by matching one operand against the
+ * other using POSIX regular expressions.
+ *
+ * Both child operands of op are resolved first. If the left operand is
+ * a regexp literal, it supplies pattern and flags and the right operand
+ * is the subject. Otherwise the left operand is the subject and the
+ * right operand is the pattern; a right hand regexp literal contributes
+ * its own flags, any other value is compiled with the default flags
+ * REG_NOSUB | REG_NEWLINE.
+ *
+ * Returns true if the subject matches the pattern. Returns false if an
+ * operand cannot be resolved or has an unsupported type (including a
+ * regexp literal on both sides), if the pattern fails to compile or if
+ * there is no match.
+ */
+static bool
+jp_regmatch(struct jp_opcode *op, struct json_object *root, struct json_object *cur)
+{
+	struct jp_opcode left, right;
+	char lbuf[22], rbuf[22];
+	const char *subject, *pattern;
+	int rflags = REG_NOSUB | REG_NEWLINE;
+	regex_t preg;
+	bool matched;
+
+	if (!jp_resolve(root, cur, op->down, &left) ||
+	    !jp_resolve(root, cur, op->down->sibling, &right))
+		return false;
+
+	if (left.type == T_REGEXP)
+	{
+		/* literal on the left: the right operand is the subject */
+		subject = jp_regmatch_str(&right, lbuf, sizeof(lbuf));
+		pattern = left.str;
+		rflags = left.num;
+	}
+	else
+	{
+		subject = jp_regmatch_str(&left, lbuf, sizeof(lbuf));
+
+		if (right.type == T_REGEXP)
+		{
+			pattern = right.str;
+			rflags = right.num;
+		}
+		else
+		{
+			/* plain value on the right: use it as pattern as-is */
+			pattern = jp_regmatch_str(&right, rbuf, sizeof(rbuf));
+		}
+	}
+
+	/* unsupported operand type, or no string to hand to the regex API */
+	if (!subject || !pattern)
+		return false;
+
+	if (regcomp(&preg, pattern, rflags))
+		return false;
+
+	matched = (regexec(&preg, subject, 0, NULL, 0) == 0);
+
+	regfree(&preg);
+
+	return matched;
+}
+
 static bool
 jp_expr(struct jp_opcode *op, struct json_object *root, struct json_object *cur,
         int idx, const char *key, jp_match_cb_t cb, void *priv)
@@ -149,6 +243,9 @@ jp_expr(struct jp_opcode *op, struct json_object *root, struct json_object *cur,
 	case T_GE:
 		return jp_cmp(op, root, cur);
 
+	case T_MATCH:
+		return jp_regmatch(op, root, cur);
+
 	case T_ROOT:
 		return !!jp_match(op, root, NULL, NULL);
 

+ 2 - 0
matcher.h

@@ -19,6 +19,8 @@
 
 #include <string.h>
 #include <stdbool.h>
+#include <stdio.h>
+#include <regex.h>
 
 #ifdef JSONC
 	#include <json.h>

+ 3 - 1
parser.y

@@ -20,7 +20,7 @@
 %left T_AND.
 %left T_OR.
 %left T_UNION.
-%nonassoc T_EQ T_NE T_GT T_GE T_LT T_LE.
+%nonassoc T_EQ T_NE T_GT T_GE T_LT T_LE T_MATCH.
 %right T_NOT.
 
 %include {
@@ -87,11 +87,13 @@ cmp_exp(A) ::= unary_exp(B) T_GT unary_exp(C).		{ A = alloc_op(T_GT, 0, NULL, B,
 cmp_exp(A) ::= unary_exp(B) T_GE unary_exp(C).		{ A = alloc_op(T_GE, 0, NULL, B, C); }
 cmp_exp(A) ::= unary_exp(B) T_EQ unary_exp(C).		{ A = alloc_op(T_EQ, 0, NULL, B, C); }
 cmp_exp(A) ::= unary_exp(B) T_NE unary_exp(C).		{ A = alloc_op(T_NE, 0, NULL, B, C); }
+cmp_exp(A) ::= unary_exp(B) T_MATCH unary_exp(C).	{ A = alloc_op(T_MATCH, 0, NULL, B, C); }
 cmp_exp(A) ::= unary_exp(B).						{ A = B; }
 
 unary_exp(A) ::= T_BOOL(B).							{ A = B; }
 unary_exp(A) ::= T_NUMBER(B).						{ A = B; }
 unary_exp(A) ::= T_STRING(B).						{ A = B; }
+unary_exp(A) ::= T_REGEXP(B).						{ A = B; }
 unary_exp(A) ::= T_WILDCARD(B).						{ A = B; }
 unary_exp(A) ::= T_POPEN or_exps(B) T_PCLOSE.		{ A = B; }
 unary_exp(A) ::= T_NOT unary_exp(B).				{ A = alloc_op(T_NOT, 0, NULL, B); }