Browse Source

awk: fix backslash handling in sub() builtins

function                                             old     new   delta
awk_sub                                              559     544     -15

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
Denys Vlasenko 10 months ago
parent
commit
5f84c56336
2 changed files with 66 additions and 22 deletions
  1. 19 22
      editors/awk.c
  2. 47 0
      testsuite/awk.tests

+ 19 - 22
editors/awk.c

@@ -2492,7 +2492,7 @@ static char *awk_printf(node *n, size_t *len)
  * store result into (dest), return number of substitutions.
  * If nm = 0, replace all matches.
  * If src or dst is NULL, use $0.
- * If subexp != 0, enable subexpression matching (\1-\9).
+ * If subexp != 0, enable subexpression matching (\0-\9).
  */
 static int awk_sub(node *rn, const char *repl, int nm, var *src, var *dest, int subexp)
 {
@@ -2520,35 +2520,32 @@ static int awk_sub(node *rn, const char *repl, int nm, var *src, var *dest, int
 		residx += eo;
 		if (++match_no >= nm) {
 			const char *s;
-			int nbs;
+			int bslash;
 
 			/* replace */
 			residx -= (eo - so);
-			nbs = 0;
+			bslash = 0;
 			for (s = repl; *s; s++) {
-				char c = resbuf[residx++] = *s;
-				if (c == '\\') {
-					nbs++;
-					continue;
+				char c = *s;
+				if (c == '\\' && s[1]) {
+					bslash ^= 1;
+					if (bslash)
+						continue;
 				}
-				if (c == '&' || (subexp && c >= '0' && c <= '9')) {
-					int j;
-					residx -= ((nbs + 3) >> 1);
-					j = 0;
+				if ((!bslash && c == '&')
+				 || (subexp && bslash && c >= '0' && c <= '9')
+				) {
+					int n, j = 0;
 					if (c != '&') {
 						j = c - '0';
-						nbs++;
 					}
-					if (nbs % 2) {
-						resbuf[residx++] = c;
-					} else {
-						int n = pmatch[j].rm_eo - pmatch[j].rm_so;
-						resbuf = qrealloc(resbuf, residx + replen + n, &resbufsize);
-						memcpy(resbuf + residx, sp + pmatch[j].rm_so, n);
-						residx += n;
-					}
-				}
-				nbs = 0;
+					n = pmatch[j].rm_eo - pmatch[j].rm_so;
+					resbuf = qrealloc(resbuf, residx + replen + n, &resbufsize);
+					memcpy(resbuf + residx, sp + pmatch[j].rm_so, n);
+					residx += n;
+				} else
+					resbuf[residx++] = c;
+				bslash = 0;
 			}
 		}
 

+ 47 - 0
testsuite/awk.tests

@@ -552,4 +552,51 @@ testing "awk = has higher precedence than == (despite what gawk manpage claims)"
 	'0\n1\n2\n1\n3\n' \
 	'' ''
 
+sq="'"
+testing 'awk gensub backslashes \' \
+	'awk '$sq'BEGIN { s="\\"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \
+	's=\\
+\\|\\
+' \
+	'' ''
+testing 'awk gensub backslashes \\' \
+	'awk '$sq'BEGIN { s="\\\\"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \
+	's=\\\\
+\\|\\
+' \
+	'' ''
+# gawk 5.1.1 handles a trailing unpaired \ inconsistently.
+# If the replacement string is a single \, it is used verbatim,
+# but if it is \\\ (three backslashes), gawk uses "\<NUL>" (!!!), not "\\" as you would expect.
+testing 'awk gensub backslashes \\\' \
+	'awk '$sq'BEGIN { s="\\\\\\"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \
+	's=\\\\\\
+\\\\|\\\\
+' \
+	'' ''
+testing 'awk gensub backslashes \\\\' \
+	'awk '$sq'BEGIN { s="\\\\\\\\"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \
+	's=\\\\\\\\
+\\\\|\\\\
+' \
+	'' ''
+testing 'awk gensub backslashes \&' \
+	'awk '$sq'BEGIN { s="\\&"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \
+	's=\\&
+&|&
+' \
+	'' ''
+testing 'awk gensub backslashes \0' \
+	'awk '$sq'BEGIN { s="\\0"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \
+	's=\\0
+a|a
+' \
+	'' ''
+testing 'awk gensub backslashes \\0' \
+	'awk '$sq'BEGIN { s="\\\\0"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \
+	's=\\\\0
+\\0|\\0
+' \
+	'' ''
+
 exit $FAILCOUNT