From 4e349e6c535344ca9ac91093d82f7c8e24b6f9a0 Mon Sep 17 00:00:00 2001
From: Daniel Stenberg <daniel@haxx.se>
Date: Tue, 27 Aug 2024 08:34:36 +0200
Subject: [PATCH] trurl: canonicalize the path

trurl now URL-decodes + URL-encodes the path so that %-sequences that
can be expressed as ASCII are shown as ASCII and %-sequences are unified
to lowercase hex etc.

Add test cases to verify

Fixes #329
Closes #331
---
 tests.json | 26 +++++++++++++++++++++
 trurl.c    | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 92 insertions(+)

diff --git a/tests.json b/tests.json
index b0a97576..1543b90f 100644
--- a/tests.json
+++ b/tests.json
@@ -2650,5 +2650,31 @@
         "stderr": "trurl note: URL decode error, most likely because of rubbish in the input (path)\n",
         "returncode": 0
     }
+  },
+  {
+      "input": {
+          "arguments": [
+              "https://example.com/one/t%61o/%2F%42/"
+          ]
+      },
+      "expected": {
+          "stdout": "https://example.com/one/tao/%2fB/\n",
+          "stderr": "",
+          "returncode": 0
+      }
+  },
+  {
+      "input": {
+          "arguments": [
+              "https://example.com/one/t%61o/%2F%42/",
+              "--append",
+              "path=%61"
+          ]
+      },
+      "expected": {
+          "stdout": "https://example.com/one/tao/%2fB/%2561\n",
+          "stderr": "",
+          "returncode": 0
+      }
   }
 ]
diff --git a/trurl.c b/trurl.c
index a69b89fa..0521a821 100644
--- a/trurl.c
+++ b/trurl.c
@@ -1573,6 +1573,59 @@ static CURLUcode seturl(struct option *o, CURLU *uh, const char *url)
                       CURLU_URLENCODE);
 }
 
+/* Returns a newly allocated copy of 'path' where each slash-separated part is
+   URL decoded and then URL encoded again, with the slashes kept as they are.
+   An empty 'path' gives an empty string. Returns NULL on failure, with nothing
+   left allocated. The caller frees the result with curl_free(). */
+static char *canonical_path(const char *path)
+{
+  size_t len = strlen(path);
+  char *sl;
+  char *dupe = NULL;
+
+  do {
+    char *opath;
+    char *npath;
+    char *ndupe = NULL;
+    int olen;
+    size_t partlen;
+    sl = memchr(path, '/', len);
+    partlen = sl ? (size_t)(sl - path) : len;
+
+    if(partlen) {
+      /* First URL decode the part */
+      opath = curl_easy_unescape(NULL, path, (int)partlen, &olen);
+
+      /* Then URL encode it again, unless the decode failed */
+      npath = opath ? curl_easy_escape(NULL, opath, olen) : NULL;
+      curl_free(opath);
+
+      /* a failed step leaves ndupe as NULL, which ends the loop below */
+      if(npath)
+        ndupe = curl_maprintf("%s%s%s", dupe ? dupe : "", npath,
+                              sl ? "/" : "");
+      curl_free(npath);
+    }
+    else if(sl || !dupe) {
+      /* zero length part: an empty segment, or an entirely empty path */
+      ndupe = curl_maprintf("%s%s", dupe ? dupe : "", sl ? "/" : "");
+    }
+    else {
+      /* nothing left after the final slash */
+      break;
+    }
+    /* the previous result is replaced, or released if this lap failed */
+    curl_free(dupe);
+    dupe = ndupe;
+    if(sl) {
+      path = sl + 1;
+      len -= partlen + 1;
+    }
+  } while(dupe && sl);
+
+  return dupe;
+}
+
 static void singleurl(struct option *o,
                       const char *url, /* might be NULL */
                       struct iterinfo *iinfo,
@@ -1687,6 +1740,7 @@ static void singleurl(struct option *o,
     if(first_lap) {
       /* extract the current path */
       char *opath;
+      char *cpath;
       bool path_is_modified = false;
       if(curl_url_get(uh, CURLUPART_PATH, &opath, 0))
         errorf(o, ERROR_ITER, "out of memory");
@@ -1709,6 +1763,18 @@ static void singleurl(struct option *o,
         opath = npath;
         path_is_modified = true;
       }
+      cpath = canonical_path(opath);
+      if(!cpath)
+        errorf(o, ERROR_MEM, "out of memory");
+
+      if(strcmp(cpath, opath)) {
+        /* updated */
+        path_is_modified = true;
+        curl_free(opath);
+        opath = cpath;
+      }
+      else
+        curl_free(cpath);
       if(path_is_modified) {
         /* set the new path */
         if(curl_url_set(uh, CURLUPART_PATH, opath, 0))