MINOR: uri_normalizer: Add a percent-decode-unreserved normalizer

This normalizer decodes percent encoded characters within the RFC 3986 unreserved set. See GitHub Issue #714.
2025-11-09 04:51:01 +01:00 · 2021-04-21 21:20:36 +02:00 · 2021-04-21 21:20:36 +02:00 · 2e4a18e04a
commit 2e4a18e04a
parent d6d33deaea
6 changed files with 247 additions and 3 deletions
--- a/doc/configuration.txt
+++ b/doc/configuration.txt
@ -6029,6 +6029,7 @@ http-request normalize-uri <normalizer> [ { if | unless } <condition> ]
 http-request normalize-uri path-merge-slashes [ { if | unless } <condition> ]
 http-request normalize-uri path-strip-dot [ { if | unless } <condition> ]
 http-request normalize-uri path-strip-dotdot [ full ] [ { if | unless } <condition> ]
 http-request normalize-uri percent-decode-unreserved [ strict ] [ { if | unless } <condition> ]
 http-request normalize-uri percent-to-uppercase [ strict ] [ { if | unless } <condition> ]
 http-request normalize-uri query-sort-by-name [ { if | unless } <condition> ]
@ -6048,11 +6049,25 @@ http-request normalize-uri query-sort-by-name [ { if | unless } <condition> ]
  filesystem. However it might break routing of an API that expects a specific
  number of segments in the path.
  It is important to note that some normalizers might result in unsafe
  transformations for broken URIs. It might also be possible that a combination
  of normalizers that are safe by themselves results in unsafe transformations
  when improperly combined.
  As an example the "percent-decode-unreserved" normalizer might result in
  unexpected results when a broken URI includes bare percent characters. One
  such broken URI is "/%%36%36" which would be decoded to "/%66" which in
  turn is equivalent to "/f". By specifying the "strict" option requests to
  such a broken URI would safely be rejected.
  The following normalizers are available:
  - path-strip-dot: Removes "/./" segments within the "path" component
      (RFC 3986#6.2.2.3).
      Segments including percent encoded dots ("%2E") will not be detected. Use
      the "percent-decode-unreserved" normalizer first if this is undesired.
      Example:
      - /.            -> /
      - /./bar/       -> /bar/
@ -6063,8 +6078,13 @@ http-request normalize-uri query-sort-by-name [ { if | unless } <condition> ]
      (RFC 3986#6.2.2.3).
      This merges segments that attempt to access the parent directory with
-      their preceding segment. Empty segments do not receive special treatment.
+      their preceding segment.
-      Use the "path-merge-slashes" normalizer first if this is undesired.
+
      Empty segments do not receive special treatment. Use the
      "path-merge-slashes" normalizer first if this is undesired.
      Segments including percent encoded dots ("%2E") will not be detected. Use
      the "percent-decode-unreserved" normalizer first if this is undesired.
      Example:
      - /foo/../     -> /
@ -6073,6 +6093,7 @@ http-request normalize-uri query-sort-by-name [ { if | unless } <condition> ]
      - /../bar/     -> /../bar/
      - /bar/../../  -> /../
      - /foo//../    -> /foo/
      - /foo/%2E%2E/ -> /foo/%2E%2E/
      If the "full" option is specified then "../" at the beginning will be
      removed as well:
@ -6088,6 +6109,25 @@ http-request normalize-uri query-sort-by-name [ { if | unless } <condition> ]
      - //        -> /
      - /foo//bar -> /foo/bar
  - percent-decode-unreserved: Decodes unreserved percent encoded characters to
      their representation as a regular character (RFC 3986#6.2.2.2).
      The set of unreserved characters includes all letters, all digits, "-",
      ".", "_", and "~".
      Example:
      - /%61dmin       -> /admin
      - /foo%3Fbar=baz -> /foo%3Fbar=baz (no change)
      - /%%36%36       -> /%66           (unsafe)
      - /%ZZ           -> /%ZZ
      If the "strict" option is specified then invalid sequences will result
      in an HTTP 400 Bad Request being returned.
      Example:
      - /%%36%36 -> HTTP 400
      - /%ZZ     -> HTTP 400
  - percent-to-uppercase: Uppercases letters within percent-encoded sequences
      (RFC 3986#6.2.2.1).
--- a/include/haproxy/action-t.h
+++ b/include/haproxy/action-t.h
@ -109,6 +109,8 @@ enum act_normalize_uri {
 	ACT_NORMALIZE_URI_QUERY_SORT_BY_NAME,
 	ACT_NORMALIZE_URI_PERCENT_TO_UPPERCASE,
 	ACT_NORMALIZE_URI_PERCENT_TO_UPPERCASE_STRICT,
 	ACT_NORMALIZE_URI_PERCENT_DECODE_UNRESERVED,
 	ACT_NORMALIZE_URI_PERCENT_DECODE_UNRESERVED_STRICT,
 };
 /* NOTE: if <.action_ptr> is defined, the referenced function will always be
--- a/include/haproxy/uri_normalizer.h
+++ b/include/haproxy/uri_normalizer.h
@ -18,6 +18,7 @@
 #include <haproxy/uri_normalizer-t.h>
 enum uri_normalizer_err uri_normalizer_percent_decode_unreserved(const struct ist input, int strict, struct ist *dst);
 enum uri_normalizer_err uri_normalizer_percent_upper(const struct ist input, int strict, struct ist *dst);
 enum uri_normalizer_err uri_normalizer_path_dot(const struct ist path, struct ist *dst);
 enum uri_normalizer_err uri_normalizer_path_dotdot(const struct ist path, int full, struct ist *dst);
--- a/reg-tests/http-rules/normalize_uri.vtc
+++ b/reg-tests/http-rules/normalize_uri.vtc
@ -8,7 +8,7 @@ feature ignore_unknown_macro
 server s1 {
    rxreq
    txresp
-} -repeat 54 -start
+} -repeat 63 -start
 haproxy h1 -conf {
    defaults
@ -94,6 +94,30 @@ haproxy h1 -conf {
        default_backend be
    frontend fe_percent_decode_unreserved
        bind "fd@${fe_percent_decode_unreserved}"
        http-request set-var(txn.before) url
        http-request normalize-uri percent-decode-unreserved
        http-request set-var(txn.after) url
        http-response add-header before  %[var(txn.before)]
        http-response add-header after  %[var(txn.after)]
        default_backend be
    frontend fe_percent_decode_unreserved_strict
        bind "fd@${fe_percent_decode_unreserved_strict}"
        http-request set-var(txn.before) url
        http-request normalize-uri percent-decode-unreserved strict
        http-request set-var(txn.after) url
        http-response add-header before  %[var(txn.before)]
        http-response add-header after  %[var(txn.after)]
        default_backend be
    backend be
        server s1 ${s1_addr}:${s1_port}
@ -391,3 +415,52 @@ client c6 -connect ${h1_fe_dot_sock} {
    expect resp.http.before == "/?a=/./"
    expect resp.http.after == "/?a=/./"
 } -run
 client c7 -connect ${h1_fe_percent_decode_unreserved_sock} {
    txreq -url "/a?a=a"
    rxresp
    expect resp.http.before == "/a?a=a"
    expect resp.http.after == "/a?a=a"
    txreq -url "/%61?%61=%61"
    rxresp
    expect resp.http.before == "/%61?%61=%61"
    expect resp.http.after == "/a?a=a"
    txreq -url "/%3F?foo=bar"
    rxresp
    expect resp.http.before == "/%3F?foo=bar"
    expect resp.http.after == "/%3F?foo=bar"
    txreq -url "/%%36%36"
    rxresp
    expect resp.status == 200
    expect resp.http.before == "/%%36%36"
    expect resp.http.after == "/%66"
    txreq -req OPTIONS -url "*"
    rxresp
    expect resp.http.before == "*"
    expect resp.http.after == "*"
 } -run
 client c8 -connect ${h1_fe_percent_decode_unreserved_strict_sock} {
    txreq -url "/a?a=a"
    rxresp
    expect resp.http.before == "/a?a=a"
    expect resp.http.after == "/a?a=a"
    txreq -url "/%61?%61=%61"
    rxresp
    expect resp.http.before == "/%61?%61=%61"
    expect resp.http.after == "/a?a=a"
    txreq -url "/%3F?foo=bar"
    rxresp
    expect resp.http.before == "/%3F?foo=bar"
    expect resp.http.after == "/%3F?foo=bar"
    txreq -url "/%%36%36"
    rxresp
    expect resp.status == 400
 } -run
--- a/src/http_act.c
+++ b/src/http_act.c
@ -294,6 +294,24 @@ static enum act_return http_action_normalize_uri(struct act_rule *rule, struct p
 			err = uri_normalizer_percent_upper(path, rule->action == ACT_NORMALIZE_URI_PERCENT_TO_UPPERCASE_STRICT, &newpath);
 			if (err != URI_NORMALIZER_ERR_NONE)
 				break;
 			if (!http_replace_req_path(htx, newpath, 1))
 				goto fail_rewrite;
 			break;
 		}
 		case ACT_NORMALIZE_URI_PERCENT_DECODE_UNRESERVED:
 		case ACT_NORMALIZE_URI_PERCENT_DECODE_UNRESERVED_STRICT: {
 			const struct ist path = http_get_path(uri);
 			struct ist newpath = ist2(replace->area, replace->size);
 			if (!isttest(path))
 				goto leave;
 			err = uri_normalizer_percent_decode_unreserved(path, rule->action == ACT_NORMALIZE_URI_PERCENT_DECODE_UNRESERVED_STRICT, &newpath);
 			if (err != URI_NORMALIZER_ERR_NONE)
 				break;
@ -407,6 +425,21 @@ static enum act_parse_ret parse_http_normalize_uri(const char **args, int *orig_
 			return ACT_RET_PRS_ERR;
 		}
 	}
 	else if (strcmp(args[cur_arg], "percent-decode-unreserved") == 0) {
 		cur_arg++;
 		if (strcmp(args[cur_arg], "strict") == 0) {
 			cur_arg++;
 			rule->action = ACT_NORMALIZE_URI_PERCENT_DECODE_UNRESERVED_STRICT;
 		}
 		else if (!*args[cur_arg]) {
 			rule->action = ACT_NORMALIZE_URI_PERCENT_DECODE_UNRESERVED;
 		}
 		else if (strcmp(args[cur_arg], "if") != 0 && strcmp(args[cur_arg], "unless") != 0) {
 			memprintf(err, "unknown argument '%s' for 'percent-decode-unreserved' normalizer", args[cur_arg]);
 			return ACT_RET_PRS_ERR;
 		}
 	}
 	else {
 		memprintf(err, "unknown normalizer '%s'", args[cur_arg]);
 		return ACT_RET_PRS_ERR;
--- a/src/uri_normalizer.c
+++ b/src/uri_normalizer.c
@ -18,6 +18,101 @@
 #include <haproxy/tools.h>
 #include <haproxy/uri_normalizer.h>
 /* Tells whether <c> belongs to the 'unreserved' production of the RFC 3986
  * ABNF, that is ALPHA / DIGIT / "-" / "." / "_" / "~".
  * The result is 1 for a member of that set and 0 for any other octet.
  */
 static int is_unreserved_character(unsigned char c)
 {
 	/* ALPHA, upper case */
 	if (c >= 'A' && c <= 'Z')
 		return 1;

 	/* ALPHA, lower case */
 	if (c >= 'a' && c <= 'z')
 		return 1;

 	/* DIGIT */
 	if (c >= '0' && c <= '9')
 		return 1;

 	/* The four remaining punctuation characters of the set. */
 	if (c == '-' || c == '.' || c == '_' || c == '~')
 		return 1;

 	return 0;
 }
 /* Decodes percent encoded characters that are part of the 'unreserved' set.
 *
 * RFC 3986, section 2.3:
 * >  URIs that differ in the replacement of an unreserved character with
 * >  its corresponding percent-encoded US-ASCII octet are equivalent [...]
 * >  when found in a URI, should be decoded to their corresponding unreserved
 * >  characters by URI normalizers.
 *
 * If `strict` is set to 0 then percent characters that are not followed by a
 * hexadecimal digit are returned as-is without performing any decoding.
 * If `strict` is set to 1 then `URI_NORMALIZER_ERR_INVALID_INPUT` is returned
 * for invalid sequences.
 */
 enum uri_normalizer_err uri_normalizer_percent_decode_unreserved(const struct ist input, int strict, struct ist *dst)
 {
 	enum uri_normalizer_err err;
 	const size_t size = istclear(dst);
 	struct ist output = *dst;
 	struct ist scanner = input;
 	/* The output will either be shortened or have the same length. */
 	if (size < istlen(input)) {
 		err = URI_NORMALIZER_ERR_ALLOC;
 		goto fail;
 	}
 	while (istlen(scanner)) {
 		const char current = istshift(&scanner);
 		if (current == '%') {
 			if (istlen(scanner) >= 2) {
 				if (ishex(istptr(scanner)[0]) && ishex(istptr(scanner)[1])) {
 					char hex1, hex2, c;
 					hex1 = istshift(&scanner);
 					hex2 = istshift(&scanner);
 					c = (hex2i(hex1) << 4) + hex2i(hex2);
 					if (is_unreserved_character(c)) {
 						output = __istappend(output, c);
 					}
 					else {
 						output = __istappend(output, current);
 						output = __istappend(output, hex1);
 						output = __istappend(output, hex2);
 					}
 					continue;
 				}
 			}
 			if (strict) {
 				err = URI_NORMALIZER_ERR_INVALID_INPUT;
 				goto fail;
 			}
 			else {
 				output = __istappend(output, current);
 			}
 		}
 		else {
 			output = __istappend(output, current);
 		}
 	}
 	*dst = output;
 	return URI_NORMALIZER_ERR_NONE;
  fail:
 	return err;
 }
 /* Uppercases letters used in percent encoding.
 *
 * If `strict` is set to 0 then percent characters that are not followed by a