diff options
Diffstat (limited to 'src/core')
| -rw-r--r-- | src/core/url.c | 290 | ||||
| -rw-r--r-- | src/core/url.h | 21 |
2 files changed, 163 insertions, 148 deletions
diff --git a/src/core/url.c b/src/core/url.c index 1c3d6c83..a5b4fd06 100644 --- a/src/core/url.c +++ b/src/core/url.c @@ -11,6 +11,7 @@ #include "core/nng_impl.h" #include <ctype.h> +#include <stdbool.h> #include <stdio.h> #include <string.h> @@ -23,156 +24,184 @@ url_hexval(char c) return (c - '0'); } if ((c >= 'A') && (c <= 'F')) { - return (c - 'A'); + return ((c - 'A') + 10); } if ((c >= 'a') && (c <= 'f')) { - return (c - 'a'); + return ((c - 'a') + 10); } return (0); } +// This returns either 0, or NNG_EINVAL, if the supplied input string +// is malformed UTF-8. We consider UTF-8 malformed when the sequence +// is an invalid code point, not the shortest possible code point, or +// incomplete. static int -url_decode_buf(const char *in, char *out, int len) +url_utf8_validate(void *arg) { - int dlen; - const uint8_t *src; - uint8_t * dst; - int c; - - src = (const uint8_t *) in; - dst = (uint8_t *) out; - - dlen = 0; - while ((c = *src) != 0) { - switch (c) { - case '%': - if ((!isxdigit(src[1])) || (!isxdigit(src[2]))) { - return (-1); - } - c = (url_hexval(src[1]) * 16) + url_hexval(src[2]); - // We don't support encoded control characters. - if ((c < ' ') || (c == 0x7F)) { - return (-1); - } - src += 3; - break; - case '+': - src++; - c = ' '; - break; - default: - // Reject control characters and non-ASCII - if ((c >= 0x7F) || (c <= ' ')) { - return (-1); + uint8_t *s = arg; + uint32_t v, minv; + int nb; + + while (*s) { + if ((s[0] & 0x80) == 0) { + s++; + continue; + } + if ((s[0] & 0xe0) == 0xc0) { + // 0x80 thru 0x7ff + v = (s[0] & 0x1f); + minv = 0x80; + nb = 1; + } else if ((s[0] & 0xf0) == 0xe0) { + v = (s[0] & 0xf); + minv = 0x800; + nb = 2; + } else if ((s[0] & 0xf8) == 0xf0) { + v = (s[0] & 0x7); + minv = 0x10000; + nb = 3; + } else { + // invalid byte, either continuation, or too many + // leading 1 bits. + return (NNG_EINVAL); + } + s++; + for (int i = 0; i < nb; i++) { + if ((s[0] & 0xc0) != 0x80) { + return (NNG_EINVAL); // not continuation } - // Technically this will accept some "unsafe" - // characters as is. - src++; - break; + s++; + v <<= 6; + v += s[0] & 0x3f; } - - if (dlen < len) { - *dst++ = c; + if (v < minv) { + return (NNG_EINVAL); + } + if ((v >= 0xd800) && (v <= 0xdfff)) { + return (NNG_EINVAL); + } + if (v > 0x10ffff) { + return (NNG_EINVAL); } - dlen++; - } - if (dlen < len) { - *dst = '\0'; - } - dlen++; // for null terminator - return (dlen); -} - -int -nni_url_decode(char **out, const char *in) -{ - int len = 0; - char *dst; - - if ((len = url_decode_buf(in, NULL, 0)) < 1) { - return (NNG_EINVAL); - } - if ((dst = nni_alloc(len)) == NULL) { - return (NNG_ENOMEM); } - url_decode_buf(in, dst, len); - *out = dst; return (0); } -static const char *url_hexdigits = "0123456789ABCDEF"; -static const char *url_safe = "-_.~"; - static int -url_encode_buf(const char *in, char *out, int len, const char *specials) +url_canonify_uri(char **outp, const char *in) { - uint8_t * dst; - const uint8_t *src; - int dlen; - int c; - - dlen = 0; - src = (const uint8_t *) in; - dst = (uint8_t *) out; - - while ((c = *src) != 0) { - if ((c < ' ') || (c == 0x7F)) { - // No encoding of control characters - return (-1); - } - if ((c < 0x80) && - ((isalnum(c) || (strchr(specials, c) != NULL) || - (strchr(url_safe, c) != NULL)))) { - if (dlen < len) { - *dst++ = c; + char * out; + size_t src, dst, len; + int c; + int rv; + bool skip; + + // We know that the transform is strictly "reducing". + if ((out = nni_strdup(in)) == NULL) { + return (NNG_ENOMEM); + } + len = strlen(out); + + // First pass, convert '%xx' for safe characters to unescaped forms. + src = dst = 0; + while ((c = out[src]) != 0) { + if (c == '%') { + if ((!isxdigit(out[src + 1])) || + (!isxdigit(out[src + 2]))) { + nni_free(out, len); + return (NNG_EINVAL); } - dlen++; - src++; + c = url_hexval(out[src + 1]); + c *= 16; + c += url_hexval(out[src + 2]); + // If it's a safe character, decode, otherwise leave + // it alone. We also decode valid high-bytes for + // UTF-8, which will let us validate them and use + // those characters in file names later. + if (((c >= 'A') && (c <= 'Z')) || + ((c >= 'a') && (c <= 'z')) || + ((c >= '0') && (c <= '9')) || (c == '.') || + (c == '~') || (c == '_') || (c == '-') || + (c >= 0x80)) { + out[dst++] = (char) c; + } else { + out[dst++] = '%'; + out[dst++] = toupper(out[src + 1]); + out[dst++] = toupper(out[src + 2]); + } + src += 3; continue; + } else { + out[dst++] = out[src++]; } - - if (dlen < len) { - *dst++ = '%'; - } - dlen++; - if (dlen < len) { - *dst++ = url_hexdigits[((c & 0xf0) >> 4)]; + } + out[dst] = 0; + + // Second pass, eliminate redundant //. + src = dst = 0; + skip = false; + while ((c = out[src]) != 0) { + if ((c == '/') && (!skip)) { + out[dst++] = '/'; + while (out[src] == '/') { + src++; + } + continue; } - dlen++; - if (dlen < len) { - *dst++ = url_hexdigits[(c & 0xf)]; + if ((c == '?') || (c == '#')) { + skip = true; } - dlen++; + out[dst++] = c; src++; } - if (dlen < len) { - *dst = '\0'; + out[dst] = 0; + + // Second pass, reduce /. and /.. elements, but only in the path. + src = dst = 0; + skip = false; + while ((c = out[src]) != 0) { + if ((c == '/') && (!skip)) { + if ((strncmp(out + src, "/..", 3) == 0) && + (out[src + 3] == 0 || out[src + 3] == '#' || + out[src + 3] == '?' || out[src + 3] == '/')) { + + if (dst > 0) { + do { + dst--; + } while ((dst) && (out[dst] != '/')); + } + src += 3; + continue; + } + if ((strncmp(out + src, "/.", 2) == 0) && + (out[src + 2] == 0 || out[src + 2] == '#' || + out[src + 2] == '?' || out[src + 2] == '/')) { + src += 2; // just skip over it + continue; + } + out[dst++] = '/'; + src++; + } else { + if ((c == '?') || (c == '#')) { + skip = true; + } + out[dst++] = c; + src++; + } } - dlen++; - return (dlen); -} + out[dst] = 0; -int -nni_url_encode_ext(char **out, const char *in, const char *specials) -{ - int len; - char *dst; - - if ((len = url_encode_buf(in, NULL, 0, specials)) < 0) { - return (NNG_EINVAL); + // Finally lets make sure that the results are valid UTF-8. + // This guards against using UTF-8 redundancy to break security. + if ((rv = url_utf8_validate(out)) != 0) { + nni_free(out, len); + return (rv); } - if ((dst = nni_alloc(len)) == NULL) { - return (NNG_ENOMEM); - } - url_encode_buf(in, dst, len, specials); - *out = dst; - return (0); -} -int -nni_url_encode(char **out, const char *in) -{ - return (nni_url_encode_ext(out, in, "")); + *outp = nni_strdup(out); + nni_free(out, len); + return (*outp == NULL ? NNG_ENOMEM : 0); } static struct { @@ -226,6 +255,7 @@ nni_url_parse(nni_url **urlp, const char *raw) { nni_url * url; size_t len; + int outlen; const char *s; char c; int rv; @@ -255,7 +285,9 @@ nni_url_parse(nni_url **urlp, const char *raw) rv = NNG_ENOMEM; goto error; } - memcpy(url->u_scheme, s, len); + for (int i = 0; i < len; i++) { + url->u_scheme[i] = tolower(s[i]); + } url->u_scheme[len] = '\0'; // Look for host part (including colon). Will be terminated by @@ -301,14 +333,19 @@ nni_url_parse(nni_url **urlp, const char *raw) rv = NNG_ENOMEM; goto error; } - memcpy(url->u_host, s, len); + // Copy the host portion, but make it lower case (hostnames are + // case insensitive). + for (int i = 0; i < len; i++) { + url->u_host[i] = tolower(s[i]); + } url->u_host[len] = '\0'; s += len; - if ((url->u_rawpath = nni_strdup(s)) == NULL) { - rv = NNG_ENOMEM; + if ((rv = url_canonify_uri(&url->u_rawpath, s)) != 0) { goto error; } + + s = url->u_rawpath; for (len = 0; (c = s[len]) != '\0'; len++) { if ((c == '?') || (c == '#')) { break; @@ -319,7 +356,6 @@ nni_url_parse(nni_url **urlp, const char *raw) rv = NNG_ENOMEM; goto error; } - memcpy(url->u_path, s, len); url->u_path[len] = '\0'; diff --git a/src/core/url.h b/src/core/url.h index 91054dcb..f99d6eb4 100644 --- a/src/core/url.h +++ b/src/core/url.h @@ -29,25 +29,4 @@ struct nni_url { extern int nni_url_parse(nni_url **, const char *path); extern void nni_url_free(nni_url *); -// nni_url_decode decodes the string, converting escaped characters to their -// proper form. The newly allocated string is returned in the first argument -// and may be freed with nni_strfree(). Note that we return EINVAL in the -// presence of an encoding of a control character. (Most especially NUL -// would cause problems for C code, but the other control characters have -// no business inside a URL either.) -extern int nni_url_decode(char **, const char *); - -// nni_url_encode works like nni_url_decode, but does the opposite transform. -// "Reserved" special characters (such as "/" and "@") are encoded, so don't -// use this to encode the entire URL.) This is most useful when encoding -// individual components, such as a value for a query parameter. Note that -// this returns NNG_EINVAL if the input string contains control characters, -// as those have no business inside a URL. -extern int nni_url_encode(char **, const char *); - -// nni_url_encode_ext works like nni_url_encode, but passes the named -// special characters. For example, to URL encode all elements in a path -// while preserving director separators, use the string "/" for specials. -extern int nni_url_encode_ext(char **, const char *, const char *); - #endif // CORE_URL_H |
