/*
 * canonical_path() builds a canonicalized copy of 'path'.
 *
 * The path is split at every '/'. Each non-empty segment is URL decoded and
 * then URL encoded again, and the results are joined back together with the
 * slashes kept in their original positions. Empty segments (consecutive
 * slashes, or a leading/trailing slash) are preserved as-is.
 *
 * Returns a newly allocated string that the caller must release with
 * curl_free(), or NULL if any of the libcurl decode, encode or formatting
 * calls fails. An empty input yields an allocated empty string, so that a
 * NULL return always means failure.
 */
static char *canonical_path(const char *path)
{
  size_t len = strlen(path);
  const char *sl;
  char *dupe = NULL; /* the canonical path assembled so far */

  do {
    char *ndupe;
    size_t partlen;

    /* find the end of the current segment */
    sl = memchr(path, '/', len);
    partlen = sl ? (size_t)(sl - path) : len;

    if(partlen) {
      char *opath;
      char *npath;
      int olen;

      /* First URL decode the part */
      opath = curl_easy_unescape(NULL, path, (int)partlen, &olen);
      if(!opath)
        goto fail;

      /* Then URL encode it again. The decoded copy is not needed after this
         call, so release it before checking the result. */
      npath = curl_easy_escape(NULL, opath, olen);
      curl_free(opath);
      if(!npath)
        goto fail;

      ndupe = curl_maprintf("%s%s%s", dupe ? dupe : "", npath, sl ? "/" : "");
      curl_free(npath);
    }
    else if(sl) {
      /* zero length part but a slash */
      ndupe = curl_maprintf("%s/", dupe ? dupe : "");
    }
    else {
      /* no part, no slash */
      break;
    }

    /* the previous accumulation is now part of ndupe (or ndupe failed) */
    curl_free(dupe);
    dupe = ndupe;
    if(!dupe)
      return NULL;

    if(sl) {
      /* continue after the slash */
      path = sl + 1;
      len -= partlen + 1;
    }
  } while(sl);

  if(!dupe)
    /* only reachable for an empty input path: hand back an empty string
       rather than NULL, which callers treat as an error */
    dupe = curl_maprintf("%s", "");

  return dupe;

fail:
  /* drop whatever was assembled before the failing segment */
  curl_free(dupe);
  return NULL;
}