Skip to content

Commit

Permalink
Optimise cow_uri:urlencode
Browse files Browse the repository at this point in the history
See the previous commit [1] for details on the
optimisations involved. The optimisation provides
the same type of performance gains for encoding
and decoding.

[1] cdaac4c
  • Loading branch information
essen committed Jul 3, 2024
1 parent cdaac4c commit b64eb1f
Showing 1 changed file with 78 additions and 124 deletions.
202 changes: 78 additions & 124 deletions src/cow_uri.erl
Original file line number Diff line number Diff line change
Expand Up @@ -14,26 +14,10 @@

-module(cow_uri).

-include("cow_inline.hrl").

-export([urldecode/1]).
-export([urlencode/1]).

%% Decode a percent encoded string. (RFC3986 2.1)
%%
%% Inspiration for some of the optimisations done here comes
%% from the new `json` module as it was in mid-2024.
%%
%% Possible input includes:
%%
%% * nothing encoded (no % character):
%% We want to return the binary as-is to avoid an allocation.
%%
%% * small number of encoded characters:
%% We can "skip" words of text.
%%
%% * mostly encoded characters (non-ASCII languages):
%% We can decode characters in bulk.
-include("cow_inline.hrl").

-define(IS_PLAIN(C), (
(C =:= $!) orelse (C =:= $$) orelse (C =:= $&) orelse (C =:= $') orelse
Expand All @@ -58,6 +42,24 @@
(C =:= $y) orelse (C =:= $z) orelse (C =:= $~)
)).

%% Decode a percent encoded string. (RFC3986 2.1)
%%
%% Inspiration for some of the optimisations done here comes
%% from the new `json` module as it was in mid-2024.
%%
%% Possible input includes:
%%
%% * nothing encoded (no % character):
%% We want to return the binary as-is to avoid an allocation.
%%
%% * small number of encoded characters:
%% We can "skip" words of text.
%%
%% * mostly encoded characters (non-ASCII languages):
%% We can decode characters in bulk.

-spec urldecode(binary()) -> binary().

%% Entry point for decoding: hand the input to skip_dec/3 both as the
%% data still to be scanned and as the untouched original, with the
%% counter starting at zero.
urldecode(Encoded) ->
    skip_dec(Encoded, Encoded, 0).

Expand Down Expand Up @@ -97,8 +99,8 @@ dec(<<$%, H, L, Rest/bits>>, Acc, Orig, Skip, Len) ->
end;
%% This clause helps speed up decoding of barely encoded values.
dec(<<C1, C2, C3, C4, Rest/bits>>, Acc, Orig, Skip, Len)
when ?IS_PLAIN(C1) andalso ?IS_PLAIN(C2)
andalso ?IS_PLAIN(C3) andalso ?IS_PLAIN(C4) ->
when ?IS_PLAIN(C1) andalso ?IS_PLAIN(C2)
andalso ?IS_PLAIN(C3) andalso ?IS_PLAIN(C4) ->
dec(Rest, Acc, Orig, Skip, Len + 4);
dec(<<C, Rest/bits>>, Acc, Orig, Skip, Len) when ?IS_PLAIN(C) ->
dec(Rest, Acc, Orig, Skip, Len + 1);
Expand Down Expand Up @@ -175,116 +177,60 @@ horse_urldecode_worst_case_hex() ->
).
-endif.

%% @doc Percent encode a string. (RFC3986 2.1)
%% Percent encode a string. (RFC3986 2.1)
%%
%% This function is meant to be used for path components.

-spec urlencode(B) -> B when B::binary().
%% Start the byte-by-byte encoder with an empty output accumulator.
urlencode(Bin) ->
    urlencode(Bin, <<>>).
-spec urlencode(binary()) -> binary().

%% Walk the input one byte at a time, appending to the accumulator.
%%
%% The following bytes are copied unchanged:
%%
%%   ! $ & ' ( ) * + , - .   0-9 : ;   =   @ A-Z   _   a-z   ~
%%
%% Every other byte is written as "%" followed by the two hex digits
%% produced by hex/1 for its high and low nibbles.
urlencode(<<>>, Acc) ->
    Acc;
urlencode(<<C, Rest/bits>>, Acc)
        when (C =:= $!) orelse (C =:= $$)
            %% & ' ( ) * + , - . are contiguous (38..46).
            orelse (C >= $& andalso C =< $.)
            %% 0-9 : ; are contiguous (48..59).
            orelse (C >= $0 andalso C =< $;)
            orelse (C =:= $=)
            %% @ A-Z are contiguous (64..90).
            orelse (C >= $@ andalso C =< $Z)
            orelse (C =:= $_)
            orelse (C >= $a andalso C =< $z)
            orelse (C =:= $~) ->
    urlencode(Rest, <<Acc/bits, C>>);
urlencode(<<C, Rest/bits>>, Acc) ->
    Hi = hex(C bsr 4),
    Lo = hex(C band 16#0f),
    urlencode(Rest, <<Acc/bits, $%, Hi, Lo>>).
%% Entry point for encoding: hand the input to skip_enc/3 both as the
%% data still to be scanned and as the untouched original, with no
%% bytes skipped yet.
urlencode(Input) ->
    skip_enc(Input, Input, 0).

%% Skip over leading plain bytes four at a time, counting them in Len.
%% As soon as the next four bytes are not all plain (or fewer than four
%% remain), switch to enc/5 with an empty accumulator and a zero offset;
%% Len carries over the number of bytes skipped so far.
skip_enc(<<C1, C2, C3, C4, Rest/bits>>, Orig, Len)
        when ?IS_PLAIN(C1) andalso ?IS_PLAIN(C2)
            andalso ?IS_PLAIN(C3) andalso ?IS_PLAIN(C4) ->
    skip_enc(Rest, Orig, Len + 4);
skip_enc(Remaining, Orig, Len) ->
    enc(Remaining, [], Orig, 0, Len).

%% Map a nibble (an integer in 0..15) to its uppercase hexadecimal
%% digit character. Any other argument fails with function_clause.
hex(N) when is_integer(N), N >= 0, N =< 9 ->
    $0 + N;
hex(N) when is_integer(N), N >= 10, N =< 15 ->
    $A + (N - 10).
%% Encoding loop. Arguments, in order:
%%
%%  * the input that remains to be processed;
%%  * Acc, the output built so far. It is a (possibly improper) iolist
%%    that is only flattened by iolist_to_binary/1 in the final clauses;
%%  * Orig, the complete original input;
%%  * Skip, the offset in Orig of the first byte not yet represented in Acc;
%%  * Len, the number of plain bytes seen since Skip that have not been
%%    copied into Acc yet.
%%
%% Plain bytes are never copied one by one: they are only counted in Len
%% and later sliced out of Orig with binary_part/3.
%%
%% ?HEX is not defined in the part of the file visible here (presumably it
%% comes from cow_inline.hrl); it is assumed to expand to the two hex digit
%% segments of the byte -- TODO confirm.

%% Four plain bytes in a row: just extend the pending plain run.
enc(<<C1, C2, C3, C4, Rest/bits>>, Acc, Orig, Skip, Len)
when ?IS_PLAIN(C1) andalso ?IS_PLAIN(C2)
andalso ?IS_PLAIN(C3) andalso ?IS_PLAIN(C4) ->
enc(Rest, Acc, Orig, Skip, Len + 4);
%% A single plain byte: extend the pending plain run by one.
enc(<<C, Rest/bits>>, Acc, Orig, Skip, Len) when ?IS_PLAIN(C) ->
enc(Rest, Acc, Orig, Skip, Len + 1);
%% Four bytes that all need encoding. C1 is not tested in the guard
%% because the clause above has already failed for it, so it cannot
%% be plain when this clause is reached.
enc(<<C1, C2, C3, C4, Rest/bits>>, Acc, Orig, Skip, Len)
when (not ?IS_PLAIN(C2)) andalso (not ?IS_PLAIN(C3))
andalso (not ?IS_PLAIN(C4)) ->
Enc = <<$%, ?HEX(C1), $%, ?HEX(C2), $%, ?HEX(C3), $%, ?HEX(C4)>>,
case Len of
%% No pending plain bytes: append the encoded bytes directly.
0 ->
enc(Rest, [Acc|Enc], Orig, Skip + 4, 0);
%% Flush the pending plain run from Orig before the encoded bytes,
%% then move Skip past both the run and the four encoded bytes.
_ ->
Part = binary_part(Orig, Skip, Len),
enc(Rest, [Acc, Part|Enc], Orig, Skip + Len + 4, 0)
end;
%% A single byte that needs encoding; same handling as above for one byte.
enc(<<C, Rest/bits>>, Acc, Orig, Skip, Len) ->
Enc = <<$%, ?HEX(C)>>,
case Len of
0 ->
enc(Rest, [Acc|Enc], Orig, Skip + 1, 0);
_ ->
Part = binary_part(Orig, Skip, Len),
enc(Rest, [Acc, Part|Enc], Orig, Skip + Len + 1, 0)
end;
%% End of input with Skip still at 0: Skip only advances when a byte is
%% encoded, so nothing was encoded and the original is returned as-is.
enc(<<>>, _, Orig, 0, _) ->
Orig;
%% End of input with no pending plain bytes: flatten what was built.
enc(<<>>, Acc, _, _, 0) ->
iolist_to_binary(Acc);
%% End of input with a pending plain run: append it, then flatten.
enc(<<>>, Acc, Orig, Skip, Len) ->
Part = binary_part(Orig, Skip, Len),
iolist_to_binary([Acc|Part]);
%% Anything not matched above.
%% NOTE(review): the clauses above match the empty binary and any input
%% with at least one whole byte, so this looks reachable only when fewer
%% than 8 bits remain (non byte-aligned input). In that case binary:at/2
%% may itself fail before the invalid_byte error is built -- confirm.
enc(_, _, Orig, Skip, Len) ->
error({invalid_byte, binary:at(Orig, Skip + Len)}).

-ifdef(TEST).
urlencode_test_() ->
Expand Down Expand Up @@ -326,6 +272,14 @@ horse_urlencode_jp() ->
129,153,227,130,139,230,151,139,229,190,139,227,128,156>>)
).
%% Benchmark: calls urlencode/1 on a binary made of bytes above 127
%% with three ASCII digits ($1, $2, $3) embedded in the middle.
%% NOTE(review): horse:repeat/2 is not defined in this file; presumably
%% it evaluates the given expression 100000 times -- confirm.
horse_urlencode_jp_mixed() ->
horse:repeat(100000,
urlencode(<<227,131,132,227,130,164,227,131,179,227,130,189,227,
$1, $2, $3,
130,166,227,131,171,227,128,156,232,188,170,229,187,187,227,
129,153,227,130,139,230,151,139,229,190,139,227,128,156>>)
).
horse_urlencode_mix() ->
horse:repeat(100000,
urlencode(<<"Small, fast, modular HTTP server.">>)
Expand Down

0 comments on commit b64eb1f

Please sign in to comment.