Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

z_html_sanitize: sanitize data url with svg, export more relaxed uri sanitize #103

Merged
merged 8 commits into from
Dec 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion src/z_dateformat.erl
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@ format(FormatString, Options) ->
-spec format( datetime() | calendar:date(), string(), list() ) -> binary() | undefined.
format({{9999,_,_},_}, _FormatString, _Options) ->
undefined;
format({{0,0,0},{0,0,0}}, _FormatString, _Options) ->
undefined;
format({{_,_,_} = Date,{_,_,_} = Time}, FormatString, Options) ->
iolist_to_binary(replace_tags(Date, Time, FormatString, Options));

Expand Down Expand Up @@ -411,10 +413,19 @@ to_utc(LTime, Options) ->
UTC
end.

tzoffset({{Y, _, _}, _}, _Options) when Y < 1916->
0;
tzoffset(LTime, Options) ->
case proplists:get_value(utc, Options) of
undefined ->
tzoffset_1(LTime, erlang:localtime_to_universaltime(LTime));
UTime = case LTime of
{{Y, M, D}, T} when Y =< 1 ->
{{Y1, M1, D1}, T1} = erlang:localtime_to_universaltime({{10, M, D}, T}),
{{Y1 - 10 + Y, M1, D1}, T1};
_ ->
erlang:localtime_to_universaltime(LTime)
end,
tzoffset_1(LTime, UTime);
UTime ->
tzoffset_1(LTime, UTime)
end.
Expand Down
117 changes: 71 additions & 46 deletions src/z_html.erl
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
%% @author Marc Worrell <[email protected]>
%% @copyright 2009-2022 Marc Worrell
%% @copyright 2009-2024 Marc Worrell
%% @doc Utility functions for html processing. Also used for property filtering (by m_rsc_update).
%% @end

%% Copyright 2009-2022 Marc Worrell
%% Copyright 2009-2024 Marc Worrell
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -38,6 +38,7 @@
noscript/1,
noscript/2,
sanitize_uri/1,
sanitize_uri/2,
escape_link/1,
nl2br/1,
br2nl/1,
Expand Down Expand Up @@ -525,7 +526,7 @@ escape_link(Text) ->
make_link(B) when is_binary(B) ->
escape(B);
make_link({link, Link}) ->
NoScript = noscript(Link, true),
NoScript = noscript(Link, false),
LinkText = escape(NoScript),
LinkUrl = escape(ensure_protocol(NoScript)),
<<
Expand Down Expand Up @@ -571,33 +572,50 @@ ensure_protocol(Link) when is_binary(Link) ->

%% @doc Ensure that an uri is (quite) harmless by removing any script reference
-spec sanitize_uri( maybe_iodata() ) -> maybe_binary().
sanitize_uri(undefined) ->
sanitize_uri(MaybeUrl) ->
sanitize_uri(MaybeUrl, true).

%% @doc Ensure that an uri is (quite) harmless by removing any script reference. Option
%% to allow 'data:' urls. Data urls are only passed if their mime type denotes an image,
%% video, or plain-text. SVGs will be sanitized.
-spec sanitize_uri(MaybeUrl, IsAllowData) -> maybe_binary() when
MaybeUrl :: maybe_iodata(),
IsAllowData :: boolean().
sanitize_uri(undefined, _IsAllowData) ->
undefined;
sanitize_uri(<<>>) ->
sanitize_uri(<<>>, _IsAllowData) ->
<<>>;
sanitize_uri([]) ->
sanitize_uri([], _IsAllowData) ->
<<>>;
sanitize_uri(Uri) ->
B = iolist_to_binary(ensure_protocol(noscript(z_string:trim(Uri), true))),
cleanup_uri_chars(B, <<>>).
sanitize_uri(Uri, IsAllowData) ->
case iolist_to_binary(ensure_protocol(noscript(z_string:trim(Uri), IsAllowData))) of
<<"data:", _/binary>> = Uri1 ->
cleanup_uri_chars(Uri1, data, <<>>);
Uri1 ->
cleanup_uri_chars(Uri1, url, <<>>)
end.


cleanup_uri_chars(<<>>, Acc) ->
cleanup_uri_chars(<<>>, _Mode, Acc) ->
Acc;
cleanup_uri_chars(<<$%, A, B, C/binary>>, Acc)
cleanup_uri_chars(<<$%, A, B, C/binary>>, Mode, Acc)
when ((A >= $0 andalso A =< $9) orelse (A >= $A andalso A =< $Z))
andalso ((B >= $0 andalso B =< $9) orelse (B >= $A andalso B =< $Z)) ->
cleanup_uri_chars(C, <<Acc/binary, $%, A, B>>);
cleanup_uri_chars(<<C, B/binary>>, Acc)
when C =:= $.; C =:= $&; C =:= $:; C =:= $/;
cleanup_uri_chars(C, Mode, <<Acc/binary, $%, A, B>>);
cleanup_uri_chars(<<C, B/binary>>, Mode, Acc)
when C =:= $.; C =:= $&; C =:= $:; C =:= $/;
C =:= $=; C =:= $?; C =:= $#; C =:= $+ ->
cleanup_uri_chars(B, <<Acc/binary, C>>);
cleanup_uri_chars(<<C, B/binary>>, Acc) ->
cleanup_uri_chars(B, Mode, <<Acc/binary, C>>);
cleanup_uri_chars(<<C, B/binary>>, data, Acc)
when C =:= $,; C =:= $; ->
cleanup_uri_chars(B, data, <<Acc/binary, C>>);
cleanup_uri_chars(<<C, B/binary>>, Mode, Acc) ->
case z_url:url_unreserved_char(C) of
false ->
C1 = iolist_to_binary(z_url:hex_encode([C])),
cleanup_uri_chars(B, <<Acc/binary, $%, C1/binary>>);
cleanup_uri_chars(B, Mode, <<Acc/binary, $%, C1/binary>>);
true ->
cleanup_uri_chars(B, <<Acc/binary, C>>)
cleanup_uri_chars(B, Mode, <<Acc/binary, C>>)
end.

%% @doc Strip all html elements from the text. Simple parsing is applied to find the elements.
Expand Down Expand Up @@ -951,10 +969,10 @@ sanitize_attr_value(<<"class">>, V) ->
% Remove all do_xxxx widget manager classes
filter_widget_class(V);
sanitize_attr_value(<<"href">>, V) ->
noscript(V, true);
noscript(V, false);
sanitize_attr_value(Attr, V) ->
case is_url_attr(Attr) of
true -> noscript(V, false);
true -> noscript(V, true);
false -> V
end.

Expand Down Expand Up @@ -1153,25 +1171,25 @@ filter_widget_class(Class) ->
Url :: string() | binary(),
SafeUrl :: binary().
noscript(Url) ->
noscript(Url, true).
noscript(Url, false).

%% @doc Filter an url. Unless data urls are allowed, "data:" urls are also removed (as data can be text/html).
-spec noscript(Url, IsStrict) -> SafeUrl when
-spec noscript(Url, IsAllowData) -> SafeUrl when
Url :: string() | binary(),
IsStrict :: boolean(),
IsAllowData :: boolean(),
SafeUrl :: binary().
noscript(Url0, IsStrict) ->
noscript(Url0, IsAllowData) ->
Url = z_string:trim(z_string:sanitize_utf8(z_convert:to_binary(Url0))),
case nows(Url, <<>>) of
case nows_protocol_split(Url, <<>>) of
{<<"javascript">>, _} -> <<"#script-removed">>;
{<<"script">>, _} -> <<"#script-removed">>;
{<<"vbscript">>, _} -> <<"#script-removed">>;
{<<"data">>, _} when IsStrict -> <<>>;
{<<"data">>, Data} ->
{<<"data">>, Data} when IsAllowData ->
case noscript_data(Data) of
<<>> -> <<>>;
Data1 -> <<"data:", Data1/binary>>
end;
{<<"data">>, _} -> <<>>;
{<<"mailto">>, Rest} -> <<"mailto:", (z_string:trim(Rest))/binary>>;
{Protocol, Rest} when is_binary(Protocol) -> <<Protocol/binary, $:, Rest/binary>>;
{undefined, <<>>} -> <<>>;
Expand All @@ -1180,35 +1198,42 @@ noscript(Url0, IsStrict) ->

%% @doc Remove whitespace and make lowercase till we find a colon, slash or pound-sign. Also
%% deletes all invalid utf8 characters.
-spec nows( binary(), binary() ) -> {binary()|undefined, binary()}.
nows(<<>>, Acc) -> {undefined, Acc};
nows(<<$:, Rest/binary>>, Acc) -> {Acc, Rest};
nows(<<$/, Rest/binary>>, Acc) -> {undefined, <<Acc/binary, $/, Rest/binary>>};
nows(<<$#, Rest/binary>>, Acc) -> {undefined, <<Acc/binary, $#, Rest/binary>>};
nows(<<$\\, Rest/binary>>, Acc) -> nows(Rest, Acc);
nows(<<$%, A, B, Rest/binary>>, Acc) ->
-spec nows_protocol_split( binary(), binary() ) -> {binary()|undefined, binary()}.
nows_protocol_split(<<>>, Acc) -> {undefined, Acc};
nows_protocol_split(<<$:, Rest/binary>>, Acc) -> {Acc, Rest};
nows_protocol_split(<<$/, Rest/binary>>, Acc) -> {undefined, <<Acc/binary, $/, Rest/binary>>};
nows_protocol_split(<<$#, Rest/binary>>, Acc) -> {undefined, <<Acc/binary, $#, Rest/binary>>};
nows_protocol_split(<<$\\, Rest/binary>>, Acc) -> nows_protocol_split(Rest, Acc);
nows_protocol_split(<<$%, A, B, Rest/binary>>, Acc) ->
case catch erlang:binary_to_integer(<<A, B>>, 16) of
V when is_integer(V) -> nows(<<V, Rest/binary>>, Acc);
V when is_integer(V) -> nows_protocol_split(<<V, Rest/binary>>, Acc);
_ -> {undefined, <<>>}
end;
nows(<<$%, _/binary>>, _Acc) ->
nows_protocol_split(<<$%, _/binary>>, _Acc) ->
% Illegal: not enough characters left for escape sequence
{undefined, <<>>};
nows(<<C, Rest/binary>>, Acc) when C =< 32 ->
nows_protocol_split(<<C, Rest/binary>>, Acc) when C =< 32 ->
% Discard control characters
nows(Rest, Acc);
nows(<<C, Rest/binary>>, Acc) when C >= $A, C =< $Z ->
nows_protocol_split(Rest, Acc);
nows_protocol_split(<<C, Rest/binary>>, Acc) when C >= $A, C =< $Z ->
% Ensure lowercase a-z
nows(Rest, <<Acc/binary, (C+32)>>);
nows(<<C/utf8, Rest/binary>>, Acc) ->
nows(Rest, <<Acc/binary, C/utf8>>);
nows(<<_, Rest/binary>>, Acc) ->
nows_protocol_split(Rest, <<Acc/binary, (C+32)>>);
nows_protocol_split(<<C/utf8, Rest/binary>>, Acc) ->
nows_protocol_split(Rest, <<Acc/binary, C/utf8>>);
nows_protocol_split(<<_, Rest/binary>>, Acc) ->
% Discard non utf8 characters
nows(Rest, Acc).
nows_protocol_split(Rest, Acc).

%% @doc Sanitize the data link, drop anything suspected to be a script, or that could contain a script.
%% @todo Parse SVG with the svg sanitizer
noscript_data(<<"image/svg", _/binary>>) -> <<>>;
noscript_data(<<"image/svg", _/binary>> = Url) ->
Url1 = <<"data:", Url/binary>>,
case z_url:decode_data_url(Url1) of
{ok, Mime, _Charset, Decoded} ->
Sanitized = z_svg:sanitize(Decoded),
<<Mime/binary, ";base64,", (base64:encode(Sanitized))/binary>>;
{error, _} ->
<<>>
end;
noscript_data(<<"image/", _/binary>> = Data) -> Data;
noscript_data(<<"audio/", _/binary>> = Data) -> Data;
noscript_data(<<"video/", _/binary>> = Data) -> Data;
Expand Down
111 changes: 78 additions & 33 deletions src/z_url.erl
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
%% @author Marc Worrell
%% @copyright 2012-2022 Marc Worrell
%% @copyright 2012-2024 Marc Worrell
%% @doc Misc utility URL functions for zotonic
%% @end

%% Copyright 2012-2022 Marc Worrell
%% Copyright 2012-2024 Marc Worrell
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -36,7 +37,8 @@
location/1,
abs_link/2,
split_base_host/1,
decode_data_url/1
decode_data_url/1,
encode_data_url/3
]).


Expand All @@ -49,15 +51,19 @@

%%% URL ENCODE %%%

-spec url_encode( string() | atom() | float() | integer() | binary() | iodata() ) -> binary().
%% @doc URL encode a value. The value is first converted to a binary and then
%% encoded with cow_qs:urlencode/1; every "+" in the encoded result is
%% replaced with "%20".
-spec url_encode(Value) -> Encoded when
    Value :: string() | atom() | float() | integer() | binary() | iodata(),
    Encoded :: binary().
url_encode(Value) ->
    Bin = z_convert:to_binary(Value),
    QsEncoded = cow_qs:urlencode(Bin),
    %% binary:replace/4 returns the subject unchanged when there is no match,
    %% so no separate check for the presence of "+" is needed.
    binary:replace(QsEncoded, <<"+">>, <<"%20">>, [global]).

-spec url_decode( string() | binary() | iodata() ) -> binary().
%% @doc URL decode a value. The input is converted to a binary before it is
%% handed to cow_qs:urldecode/1.
-spec url_decode(Encoded) -> Data when
    Encoded :: iodata(),
    Data :: binary().
url_decode(Encoded) ->
    Bin = z_convert:to_binary(Encoded),
    cow_qs:urldecode(Bin).

Expand Down Expand Up @@ -131,6 +137,8 @@ remove_protocol(<<>>) -> <<>>.

%% VALID URL CHARACTERS
%% RFC 3986
%% @doc Check if a character is either a reserved or an unreserved url character.
-spec url_valid_char(Char) -> boolean() when
    Char :: non_neg_integer().
url_valid_char(C) ->
    case url_reserved_char(C) of
        true -> true;
        false -> url_unreserved_char(C)
    end.

Expand Down Expand Up @@ -250,7 +258,10 @@ split_base_host(Base) ->


%% @doc Given a relative URL and a base URL, calculate the absolute URL.
-spec abs_link(string()|binary(), string()|binary()) -> binary().
-spec abs_link(Url, BaseUrl) -> AbsUrl when
    Url :: string() | binary(),
    BaseUrl :: string() | binary(),
    AbsUrl :: binary().
abs_link(RelativeUrl, BaseUrl) ->
    %% Split the base url into its host part and its host+directory part.
    {Host, HostDir} = z_url:split_base_host(BaseUrl),
    RelativeBin = z_convert:to_binary(RelativeUrl),
    %% make_abs_link/3 returns iodata, flatten it before ensuring the protocol.
    AbsIoData = make_abs_link(RelativeBin, Host, HostDir),
    ensure_protocol(iolist_to_binary(AbsIoData)).
Expand Down Expand Up @@ -284,39 +295,73 @@ make_abs_link(Url, _Host, HostDir) ->
[HostDir, Url].


%% @doc Decode a "data:" url to its parts.
%% Crashes if the url doesn't have a "data:" protocol.
-spec decode_data_url(binary()) -> {ok, Mime::binary(), Charset::binary(), Data::binary()} | {error, unknown_encoding}.
%% @doc Decode a "data:" url to its parts. If the charset is not defined in the data
%% then it is returned as "US-ASCII". The mime type defaults to "text/plain".
-spec decode_data_url(DataUrl) -> {ok, Mime, Charset, Data} | {error, Reason} when
DataUrl :: binary(),
Mime :: binary(),
Charset :: binary(),
Data :: binary(),
Reason :: unknown_encoding | nodata.
decode_data_url(<<"data:", Data/binary>>) ->
Parts = binary:split(Data, <<";">>, [global]),
[Encoded|Args] = lists:reverse(Parts),
case decode_url_data(Encoded) of
{ok, Decoded} ->
{Mime, Charset} = decode_data_url_args(Args),
{ok, Mime, Charset, Decoded};
{error, _} = Error ->
Error
end.
case binary:split(Data, <<",">>) of
[ MimeData, EncodedData ] ->
MimeParts = binary:split(MimeData, <<";">>, [global]),
Mime = find_mime(MimeParts),
Charset = find_charset(MimeParts),
DecodedData = case last(MimeParts) of
<<"base64">> -> decode_base64(EncodedData);
<<"utf8">> -> EncodedData;
_ -> z_url:url_decode(EncodedData)
end,
{ok, Mime, Charset, DecodedData};
[ _ ] ->
{error, unknown_encoding}
end;
decode_data_url(Url) when is_binary(Url) ->
{error, nodata}.

decode_url_data(<<"base64,", Data/binary>>) ->
%% Return the last element of a list, or 'undefined' if the list is empty.
last(List) ->
    case List of
        [] -> undefined;
        _ -> lists:last(List)
    end.

decode_base64(Data) ->
Data1 = << <<case C of $- -> $+; $_ -> $/; _ -> C end>> || <<C>> <= Data >>,
Data2 = case byte_size(Data1) rem 4 of
0 -> Data1;
2 -> <<Data1/binary, "==">>;
3 -> <<Data1/binary, "=">>
end,
{ok, base64:decode(Data2)};
decode_url_data(<<",", Data/binary>>) ->
{ok, Data};
decode_url_data(_) ->
{error, unknown_encoding}.

decode_data_url_args(Args) ->
lists:foldl(fun(<<"charset=", Charset/binary>>, {Mime,_Charset}) ->
{Mime,Charset};
(Mime, {_Mime,Charset}) ->
{Mime,Charset}
end,
{<<"text/plain">>, <<"US-ASCII">>},
Args).
base64:decode(Data2).

%% Select the mime type from the parts of a data url that precede the data.
%% Only the first part is considered: if it is missing, empty, or contains
%% a "=" (so it is an argument, not a mime type) then "text/plain" is returned.
find_mime([First | _]) when First =/= <<>> ->
    case binary:match(First, <<"=">>) of
        {_Pos, _Len} -> <<"text/plain">>;
        nomatch -> First
    end;
find_mime([<<>> | _]) ->
    <<"text/plain">>;
find_mime([]) ->
    <<"text/plain">>.

%% Find the first "charset=..." part and return its value. If there is no
%% such part then the charset is "US-ASCII".
find_charset(Parts) ->
    IsNotCharset = fun
        (<<"charset=", _/binary>>) -> false;
        (_) -> true
    end,
    case lists:dropwhile(IsNotCharset, Parts) of
        [<<"charset=", Charset/binary>> | _] -> Charset;
        [] -> <<"US-ASCII">>
    end.


%% @doc Encode a data URL. The charset is omitted from the URL if it is undefined,
%% empty or "US-ASCII". Data with mime type "text/plain" and such an omitted charset
%% is URL encoded (and the mime type is left out of the URL), all other data is
%% base64 encoded.
-spec encode_data_url(Mime, Charset, Data) -> Encoded when
    Mime :: binary(),
    Charset :: binary() | undefined,
    Data :: binary(),
    Encoded :: binary().
encode_data_url(Mime, Charset, Data) ->
    IsDefaultCharset = Charset =:= undefined
        orelse Charset =:= <<>>
        orelse Charset =:= <<"US-ASCII">>,
    case {Mime, IsDefaultCharset} of
        {<<"text/plain">>, true} ->
            <<"data:,", (url_encode(Data))/binary>>;
        {_, true} ->
            <<"data:", Mime/binary, ";base64,", (base64:encode(Data))/binary>>;
        {_, false} ->
            <<"data:", Mime/binary, ";charset=", Charset/binary, ";base64,", (base64:encode(Data))/binary>>
    end.
Loading
Loading