From 28a8462b716b8a20359176ecfed46219f3e516dc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Hoguin?= <essen@ninenines.eu>
Date: Mon, 6 Jan 2025 16:13:55 +0100
Subject: [PATCH 1/5] Add debug logging for receiving of HTTP/2 frames

---
 src/cow_http2_machine.erl | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/src/cow_http2_machine.erl b/src/cow_http2_machine.erl
index 808c6cf..b08538b 100644
--- a/src/cow_http2_machine.erl
+++ b/src/cow_http2_machine.erl
@@ -293,17 +293,38 @@ init_upgrade_stream(Method, State=#http2_machine{mode=server, remote_streamid=0,
 	| {error, {stream_error, cow_http2:streamid(), cow_http2:error(), atom()}, State}
 	| {error, {connection_error, cow_http2:error(), atom()}, State}
 	when State::http2_machine().
+
+%-define(HTTP2_MACHINE_DEBUG, 1). %% Uncomment to enable the LOG_FRAME output below.
+-ifdef(HTTP2_MACHINE_DEBUG).
+-define(LOG_FRAME(Frame, State),
+	begin %% Note: Frame2 is bound in the scope of the function using the macro.
+		Frame2 = case Frame of
+			{data,_,_,_} -> setelement(4, Frame, {'BINARY-DATA', byte_size(element(4, Frame))});
+			{continuation,_,_,_} -> setelement(4, Frame, {'BINARY-DATA', byte_size(element(4, Frame))});
+			_ -> Frame %% Any other frame is printed unchanged.
+		end, %% data/continuation: the 4th element is replaced by its size in bytes.
+		io:format(user, "~p rcv: ~p~n", [State#http2_machine.mode, Frame2])
+	end
+).
+-else.
+-define(LOG_FRAME(Frame, State), _ = Frame). %% Debug disabled: nothing is printed.
+-endif.
+
 frame(Frame, State=#http2_machine{state=settings, preface_timer=TRef}) ->
+	?LOG_FRAME(Frame, State),
 	ok = case TRef of
 		undefined -> ok;
 		_ -> erlang:cancel_timer(TRef, [{async, true}, {info, false}])
 	end,
 	settings_frame(Frame, State#http2_machine{state=normal, preface_timer=undefined});
 frame(Frame, State=#http2_machine{state={continuation, _, _}}) ->
+	?LOG_FRAME(Frame, State),
 	maybe_discard_result(continuation_frame(Frame, State));
-frame(settings_ack, State=#http2_machine{state=normal}) ->
+frame(Frame = settings_ack, State=#http2_machine{state=normal}) ->
+	?LOG_FRAME(Frame, State),
 	settings_ack_frame(State);
 frame(Frame, State=#http2_machine{state=normal}) ->
+	?LOG_FRAME(Frame, State),
 	Result = case element(1, Frame) of
 		data -> data_frame(Frame, State);
 		headers -> headers_frame(Frame, State);

From 921b1ff6178bd6710331609f226789e9a763e1ae Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Hoguin?= <essen@ninenines.eu>
Date: Mon, 6 Jan 2025 16:14:47 +0100
Subject: [PATCH 2/5] Optimise Websocket UTF-8 validation

See comment within the commit for full details.

This results in improved Cowboy performance decoding text
messages of around 10 to 15% in all scenarios, except when
the message is small and it contains mostly non-ASCII (due
to the cost of trying ASCII first).

This brings decoding of text frames closer to decoding of
binary frames, with ASCII data taking approximately 1.1 times
longer to decode, mixed data 1.25 times longer and mostly
non-ASCII data 1.35 times longer to decode.
---
 src/cow_ws.erl | 184 +++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 154 insertions(+), 30 deletions(-)

diff --git a/src/cow_ws.erl b/src/cow_ws.erl
index 27c7c87..d5c40d0 100644
--- a/src/cow_ws.erl
+++ b/src/cow_ws.erl
@@ -72,6 +72,8 @@
 -type utf8_state() :: 0..8 | undefined.
 -export_type([utf8_state/0]).
 
+-compile({inline, [utf8_class/0]}).
+
 %% @doc Generate a key for the Websocket handshake request.
 
 -spec key() -> binary().
@@ -559,14 +561,14 @@ validate_payload(Payload, Rest, undefined, _, _, _, true) ->
 	{ok, Payload, undefined, Rest};
 %% Text frames and close control frames MUST have a payload that is valid UTF-8.
 validate_payload(Payload, Rest, Utf8State, _, Type, _, Eof) when Type =:= text; Type =:= close ->
-	case validate_utf8(Payload, Utf8State) of
+	case validate_text(Payload, Utf8State) of
 		1 -> {error, badencoding};
 		Utf8State2 when not Eof -> {more, Payload, Utf8State2};
 		0 when Eof -> {ok, Payload, 0, Rest};
 		_ -> {error, badencoding}
 	end;
 validate_payload(Payload, Rest, Utf8State, _, fragment, {Fin, text, _}, Eof) ->
-	case validate_utf8(Payload, Utf8State) of
+	case validate_text(Payload, Utf8State) of
 		1 -> {error, badencoding};
 		0 when Eof -> {ok, Payload, 0, Rest};
 		Utf8State2 when Eof, Fin =:= nofin -> {ok, Payload, Utf8State2, Rest};
@@ -581,36 +583,158 @@ validate_payload(Payload, Rest, Utf8State, _, _, _, true) ->
 %% Based on the Flexible and Economical UTF-8 Decoder algorithm by
 %% Bjoern Hoehrmann <bjoern@hoehrmann.de> (http://bjoern.hoehrmann.de/utf-8/decoder/dfa/).
 %%
-%% The original algorithm has been unrolled into all combinations of values for C and State
-%% each with a clause. The common clauses were then grouped together.
+%% The original algorithm has been reworked to better adapt to
+%% the current Erlang VM (at the time of writing).
+%%
+%% We keep the character class table to quickly find which class
+%% a character is. The transition table was removed in favor of
+%% a separate Erlang function per state as that proved more
+%% efficient.
+%%
+%% We store the character class table in a tuple returned by
+%% an inline function.
+%%
+%% We handle ASCII characters specially because when ASCII
+%% characters are present we are highly likely to have mostly
+%% or only ASCII characters. We process them 4 at a time when
+%% possible.
+%%
+%% When a non-ASCII character is encountered, we switch to
+%% the UTF-8 decoder. When in the UTF-8 decoder we have to
+%% process characters one at a time. When we are in the UTF-8
+%% decoder we expect there to be additional UTF-8 characters
+%% so we check for them instead of reverting back to ASCII
+%% every time. This greatly speeds up decoding of Japanese
+%% and other non-ASCII text.
+%%
+%% Our UTF-8 decoder functions consist of looking up the
+%% character class of the current byte and then using a
+%% case clause to determine which state we are switching to.
+%%
+%% We order clauses based on the likelihood of the character class.
+%% Order is determined by the number of occurrences of the class in
+%% the table. The order (and number of occurrences) is as follows:
+%% 7 (32), 2 (30), 1 and 9 (16), 3 (14), 8 (13), 6 (3), 4, 5, 10 and 11.
 %%
 %% This function returns 0 on success, 1 on error, and 2..8 on incomplete data.
-validate_utf8(<<>>, State) -> State;
-validate_utf8(<< C, Rest/bits >>, 0) when C < 128 -> validate_utf8(Rest, 0);
-validate_utf8(<< C, Rest/bits >>, 2) when C >= 128, C < 144 -> validate_utf8(Rest, 0);
-validate_utf8(<< C, Rest/bits >>, 3) when C >= 128, C < 144 -> validate_utf8(Rest, 2);
-validate_utf8(<< C, Rest/bits >>, 5) when C >= 128, C < 144 -> validate_utf8(Rest, 2);
-validate_utf8(<< C, Rest/bits >>, 7) when C >= 128, C < 144 -> validate_utf8(Rest, 3);
-validate_utf8(<< C, Rest/bits >>, 8) when C >= 128, C < 144 -> validate_utf8(Rest, 3);
-validate_utf8(<< C, Rest/bits >>, 2) when C >= 144, C < 160 -> validate_utf8(Rest, 0);
-validate_utf8(<< C, Rest/bits >>, 3) when C >= 144, C < 160 -> validate_utf8(Rest, 2);
-validate_utf8(<< C, Rest/bits >>, 5) when C >= 144, C < 160 -> validate_utf8(Rest, 2);
-validate_utf8(<< C, Rest/bits >>, 6) when C >= 144, C < 160 -> validate_utf8(Rest, 3);
-validate_utf8(<< C, Rest/bits >>, 7) when C >= 144, C < 160 -> validate_utf8(Rest, 3);
-validate_utf8(<< C, Rest/bits >>, 2) when C >= 160, C < 192 -> validate_utf8(Rest, 0);
-validate_utf8(<< C, Rest/bits >>, 3) when C >= 160, C < 192 -> validate_utf8(Rest, 2);
-validate_utf8(<< C, Rest/bits >>, 4) when C >= 160, C < 192 -> validate_utf8(Rest, 2);
-validate_utf8(<< C, Rest/bits >>, 6) when C >= 160, C < 192 -> validate_utf8(Rest, 3);
-validate_utf8(<< C, Rest/bits >>, 7) when C >= 160, C < 192 -> validate_utf8(Rest, 3);
-validate_utf8(<< C, Rest/bits >>, 0) when C >= 194, C < 224 -> validate_utf8(Rest, 2);
-validate_utf8(<< 224, Rest/bits >>, 0) -> validate_utf8(Rest, 4);
-validate_utf8(<< C, Rest/bits >>, 0) when C >= 225, C < 237 -> validate_utf8(Rest, 3);
-validate_utf8(<< 237, Rest/bits >>, 0) -> validate_utf8(Rest, 5);
-validate_utf8(<< C, Rest/bits >>, 0) when C =:= 238; C =:= 239 -> validate_utf8(Rest, 3);
-validate_utf8(<< 240, Rest/bits >>, 0) -> validate_utf8(Rest, 6);
-validate_utf8(<< C, Rest/bits >>, 0) when C =:= 241; C =:= 242; C =:= 243 -> validate_utf8(Rest, 7);
-validate_utf8(<< 244, Rest/bits >>, 0) -> validate_utf8(Rest, 8);
-validate_utf8(_, _) -> 1.
+%% It expects a starting state value of 0. It can be called again
+%% to stream parse large amounts of text as long as the returned
+%% 2..8 state is provided when it is called back.
+
+validate_text(Text, 0) -> validate_ascii(Text); %% 0: not inside a multi-byte sequence.
+validate_text(Text, 2) -> validate_s2(Text); %% 2..8: resume in the state returned by
+validate_text(Text, 3) -> validate_s3(Text); %% a previous call that ran out of input.
+validate_text(Text, 4) -> validate_s4(Text);
+validate_text(Text, 5) -> validate_s5(Text);
+validate_text(Text, 6) -> validate_s6(Text);
+validate_text(Text, 7) -> validate_s7(Text);
+validate_text(Text, 8) -> validate_s8(Text). %% Any other state has no clause and crashes.
+
+validate_ascii(<<>>) -> 0; %% All input consumed outside of any multi-byte sequence.
+validate_ascii(<<C1,C2,C3,C4,R/bits>>) when C1 < 128, C2 < 128, C3 < 128, C4 < 128 -> validate_ascii(R);
+validate_ascii(<<C1,R/bits>>) when C1 < 128 -> validate_ascii(R); %% The 4-at-a-time clause did not match.
+validate_ascii(Text) -> validate_s0(Text). %% For binary input: the first byte is 128 or above.
+
+%% Instead of switching back to ASCII we first have this
+%% function attempt to find a non-ASCII character to
+%% greatly speed up decoding of Japanese and other languages.
+validate_s0(<<C,R/bits>>) when C >= 128 ->
+	Class = element(C - 127, utf8_class()), %% Tuple index 1 is byte 128.
+	case Class of
+		2 -> validate_s2(R); %% 16#C2..16#DF
+		3 -> validate_s3(R); %% 16#E1..16#EC, 16#EE, 16#EF
+		6 -> validate_s7(R); %% 16#F1..16#F3
+		4 -> validate_s5(R); %% 16#ED
+		5 -> validate_s8(R); %% 16#F4
+		10 -> validate_s4(R); %% 16#E0
+		11 -> validate_s6(R); %% 16#F0
+		_ -> 1 %% Classes 1, 7, 8 and 9 are an error in this state.
+	end;
+validate_s0(Text) -> %% Input is empty or its first byte is below 128.
+	validate_ascii(Text).
+
+%% State 2: one byte in 16#80..16#BF (classes 1, 9, 7) completes the sequence.
+validate_s2(<<C,R/bits>>) when C >= 128 ->
+	case element(C - 127, utf8_class()) of
+		7 -> validate_s0(R);
+		1 -> validate_s0(R);
+		9 -> validate_s0(R);
+		_ -> 1
+	end;
+validate_s2(<<>>) -> 2; %% Out of input: return the state to resume from.
+validate_s2(_) -> 1. %% Byte below 128: return the error state, not a badarg crash.
+
+%% State 3: a byte in 16#80..16#BF (classes 1, 9, 7) is required, then state 2.
+validate_s3(<<C,R/bits>>) when C >= 128 ->
+	case element(C - 127, utf8_class()) of
+		7 -> validate_s2(R);
+		1 -> validate_s2(R);
+		9 -> validate_s2(R);
+		_ -> 1
+	end;
+validate_s3(<<>>) -> 3; %% Out of input: return the state to resume from.
+validate_s3(_) -> 1. %% Byte below 128: return the error state, not a badarg crash.
+
+%% State 4 (after 16#E0): only 16#A0..16#BF (class 7) is accepted, then state 2.
+validate_s4(<<C,R/bits>>) when C >= 128 ->
+	case element(C - 127, utf8_class()) of
+		7 -> validate_s2(R);
+		_ -> 1
+	end;
+validate_s4(<<>>) -> 4; %% Out of input: return the state to resume from.
+validate_s4(_) -> 1. %% Byte below 128: return the error state, not a badarg crash.
+
+%% State 5 (after 16#ED): only 16#80..16#9F (classes 1, 9) is accepted, then state 2.
+validate_s5(<<C,R/bits>>) when C >= 128 ->
+	case element(C - 127, utf8_class()) of
+		1 -> validate_s2(R);
+		9 -> validate_s2(R);
+		_ -> 1
+	end;
+validate_s5(<<>>) -> 5; %% Out of input: return the state to resume from.
+validate_s5(_) -> 1. %% Byte below 128: return the error state, not a badarg crash.
+
+%% State 6 (after 16#F0): only 16#90..16#BF (classes 9, 7) is accepted, then state 3.
+validate_s6(<<C,R/bits>>) when C >= 128 ->
+	case element(C - 127, utf8_class()) of
+		7 -> validate_s3(R);
+		9 -> validate_s3(R);
+		_ -> 1
+	end;
+validate_s6(<<>>) -> 6; %% Out of input: return the state to resume from.
+validate_s6(_) -> 1. %% Byte below 128: return the error state, not a badarg crash.
+
+%% State 7 (after 16#F1..16#F3): any of 16#80..16#BF is accepted, then state 3.
+validate_s7(<<C,R/bits>>) when C >= 128 ->
+	case element(C - 127, utf8_class()) of
+		7 -> validate_s3(R);
+		1 -> validate_s3(R);
+		9 -> validate_s3(R);
+		_ -> 1
+	end;
+validate_s7(<<>>) -> 7; %% Out of input: return the state to resume from.
+validate_s7(_) -> 1. %% Byte below 128: return the error state, not a badarg crash.
+
+%% State 8 (after 16#F4): only 16#80..16#8F (class 1) is accepted, then state 3.
+validate_s8(<<C,R/bits>>) when C >= 128 ->
+	case element(C - 127, utf8_class()) of
+		1 -> validate_s3(R);
+		_ -> 1
+	end;
+validate_s8(<<>>) -> 8; %% Out of input: return the state to resume from.
+validate_s8(_) -> 1. %% Byte below 128: return the error state, not a badarg crash.
+
+utf8_class() -> %% Character class of bytes 128..255; element N is byte N + 127.
+	{
+		1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, %% 16#80..16#8F
+		9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, %% 16#90..16#9F
+		7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, %% 16#A0..16#AF
+		7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, %% 16#B0..16#BF
+		8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, %% 16#C0..16#CF
+		2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, %% 16#D0..16#DF
+		10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, %% 16#E0..16#EF
+		11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8 %% 16#F0..16#FF
+	}.
 
 %% @doc Return a frame tuple from parsed state and data.
 

From cdb769a9d33bf9c860f835c44ef445c52d21faae Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Hoguin?= <essen@ninenines.eu>
Date: Tue, 7 Jan 2025 12:38:58 +0100
Subject: [PATCH 3/5] Optimise Websocket (un)masking

By (un)masking 16 bytes at a time when possible
(instead of 4) we process frames roughly 10% faster
for all frame types.
---
 src/cow_ws.erl | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/src/cow_ws.erl b/src/cow_ws.erl
index d5c40d0..08790ca 100644
--- a/src/cow_ws.erl
+++ b/src/cow_ws.erl
@@ -525,8 +525,12 @@ unmask(Data, MaskKey, UnmaskedLen) ->
 	MaskKey2 = (MaskKey bsl (Left * 8)) + (MaskKey bsr (Right * 8)),
 	mask(Data, MaskKey2, <<>>).
 
-mask(<<>>, _, Unmasked) ->
-	Unmasked;
+mask(<< O1:32, O2:32, O3:32, O4:32, Rest/bits >>, MaskKey, Acc) ->
+	T1 = O1 bxor MaskKey,
+	T2 = O2 bxor MaskKey,
+	T3 = O3 bxor MaskKey,
+	T4 = O4 bxor MaskKey,
+	mask(Rest, MaskKey, << Acc/binary, T1:32, T2:32, T3:32, T4:32 >>);
 mask(<< O:32, Rest/bits >>, MaskKey, Acc) ->
 	T = O bxor MaskKey,
 	mask(Rest, MaskKey, << Acc/binary, T:32 >>);
@@ -541,7 +545,9 @@ mask(<< O:16 >>, MaskKey, Acc) ->
 mask(<< O:8 >>, MaskKey, Acc) ->
 	<< MaskKey2:8, _:24 >> = << MaskKey:32 >>,
 	T = O bxor MaskKey2,
-	<< Acc/binary, T:8 >>.
+	<< Acc/binary, T:8 >>;
+mask(<<>>, _, Unmasked) ->
+	Unmasked.
 
 inflate_frame(Data, Inflate, TakeOver, FragState, true)
 		when FragState =:= undefined; element(1, FragState) =:= fin ->

From 4594ab106e15ab9dd000233a620da31ff8da5326 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Hoguin?= <essen@ninenines.eu>
Date: Mon, 13 Jan 2025 13:23:05 +0100
Subject: [PATCH 4/5] Avoid an extra multiplication before unmasking

This has no real impact on performance
but simplifies the code a little.
---
 src/cow_ws.erl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/cow_ws.erl b/src/cow_ws.erl
index 08790ca..47c9e89 100644
--- a/src/cow_ws.erl
+++ b/src/cow_ws.erl
@@ -520,9 +520,9 @@ unmask(Data, MaskKey, 0) ->
 	mask(Data, MaskKey, <<>>);
 %% We unmask on the fly so we need to continue from the right mask byte.
 unmask(Data, MaskKey, UnmaskedLen) ->
-	Left = UnmaskedLen rem 4,
-	Right = 4 - Left,
-	MaskKey2 = (MaskKey bsl (Left * 8)) + (MaskKey bsr (Right * 8)),
+	Left = (UnmaskedLen rem 4) * 8,
+	Right = 32 - Left,
+	MaskKey2 = (MaskKey bsl Left) + (MaskKey bsr Right),
 	mask(Data, MaskKey2, <<>>).
 
 mask(<< O1:32, O2:32, O3:32, O4:32, Rest/bits >>, MaskKey, Acc) ->

From 109ca9ba9f6e9ab67f84363bf613e74f4bd9eff6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Hoguin?= <essen@ninenines.eu>
Date: Tue, 14 Jan 2025 18:23:37 +0100
Subject: [PATCH 5/5] Fix OTP-27 Dialyzer warnings

Disable improper lists warnings in cow_qs/cow_uri since
we intentionally create them.
---
 src/cow_qs.erl  | 1 +
 src/cow_uri.erl | 1 +
 2 files changed, 2 insertions(+)

diff --git a/src/cow_qs.erl b/src/cow_qs.erl
index 2eb52b8..dff840a 100644
--- a/src/cow_qs.erl
+++ b/src/cow_qs.erl
@@ -13,6 +13,7 @@
 %% OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 
 -module(cow_qs).
+-dialyzer(no_improper_lists).
 
 -export([parse_qs/1]).
 -export([qs/1]).
diff --git a/src/cow_uri.erl b/src/cow_uri.erl
index 790d7de..1864139 100644
--- a/src/cow_uri.erl
+++ b/src/cow_uri.erl
@@ -13,6 +13,7 @@
 %% OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 
 -module(cow_uri).
+-dialyzer(no_improper_lists).
 
 -export([urldecode/1]).
 -export([urlencode/1]).