UTF-8 encode and decode

Here are two test cases for utf8_codes//1 (source). One converts a Unicode code point to UTF-8 (encode) and the other converts a UTF-8 encoding to a Unicode code point (decode). Two further tests below do the same for whole strings.

:- begin_tests(utf8).

% unicode_utf8_test_case_generator(?Code_point, ?UTF8_bytes)
%
% Table of test cases pairing a single Unicode code point with the list of
% bytes of its UTF-8 encoding. The tests enumerate the rows through the
% forall/1 test option. The rows sit on the edges of each encoded length.

% One byte: U+0000 .. U+007F.
unicode_utf8_test_case_generator(0x00    ,[               0x00]).
unicode_utf8_test_case_generator(0'a     ,[               0'a ]).
unicode_utf8_test_case_generator(0'z     ,[               0'z ]).
unicode_utf8_test_case_generator(0'A     ,[               0'A ]).
unicode_utf8_test_case_generator(0'Z     ,[               0'Z ]).
unicode_utf8_test_case_generator(0'0     ,[               0'0 ]).
unicode_utf8_test_case_generator(0'9     ,[               0'9 ]).
unicode_utf8_test_case_generator(0x7F    ,[               0x7F]).
% Two bytes: U+0080 .. U+07FF.
unicode_utf8_test_case_generator(0x0080  ,[          0xC2,0x80]).
unicode_utf8_test_case_generator(0xC0    ,[          0xC3,0x80]).
unicode_utf8_test_case_generator(0x07FF  ,[          0xDF,0xBF]).
% Three bytes: U+0800 .. U+FFFF.
unicode_utf8_test_case_generator(0x0800  ,[     0xE0,0xA0,0x80]).
unicode_utf8_test_case_generator(0xFFFF  ,[     0xEF,0xBF,0xBF]).
% Four bytes: U+10000 .. U+10FFFF.
unicode_utf8_test_case_generator(0x010000,[0xF0,0x90,0x80,0x80]).
unicode_utf8_test_case_generator(0x10FFFF,[0xF4,0x8F,0xBF,0xBF]).

% Encode direction: the code point is bound and the byte list is unbound, so
% utf8_codes//1 produces the UTF-8 bytes for each row of the table.
test(unicode_to_utf8,
     [ forall(unicode_utf8_test_case_generator(Code_point, Expected_bytes))
     ]) :-
    phrase(utf8_codes([Code_point]), Bytes, Rest),

    % The test expects the remainder to still be unbound after encoding;
    % it is closed afterwards so the bytes compare as a proper list.
    assertion( var(Rest) ),
    Rest = [],
    assertion( Bytes == Expected_bytes ).

% Decode direction: the byte list is bound and the code point is unbound, so
% utf8_codes//1 decodes the UTF-8 bytes of each row of the table.
%
% The decoded list is the closed one-element list [Unicode_code_point] and
% the unparsed remainder of the input is a separate variable. Sharing one
% variable for both the tail of the decoded list and the remainder would only
% pass because both happen to be [].
test(utf8_to_unicode,
     [ forall(unicode_utf8_test_case_generator(Expected_unicode_code_point, UTF8_encoded))
     ]) :-
    phrase(utf8_codes([Unicode_code_point]), UTF8_encoded, Rest),

    % Every byte of the encoding must have been consumed.
    assertion( Rest == [] ),
    assertion( Unicode_code_point == Expected_unicode_code_point ).

% unicode_string_utf8_test_case_generator(?String, ?UTF8_bytes)
%
% Table of test cases pairing a string with the list of bytes of its UTF-8
% encoding. The rows below all describe the string "a" (code 97); each one
% writes the expected byte in a different number syntax.
%
% 2.17.1.4 Syntax for non-decimal numbers - https://www.swi-prolog.org/pldoc/man?section=nondecsyntax
unicode_string_utf8_test_case_generator("a",[        97]).
unicode_string_utf8_test_case_generator("a",[       0'a]).
unicode_string_utf8_test_case_generator("a",[      0x61]).
unicode_string_utf8_test_case_generator("a",[     16'61]).   % ' rebalances the single tick for the formatter.
unicode_string_utf8_test_case_generator("a",[0b01100001]).
unicode_string_utf8_test_case_generator("a",[2'01100001]). % '
unicode_string_utf8_test_case_generator("a",[     0o141]).
unicode_string_utf8_test_case_generator("a",[     8'141]). % '

% Character Escape Syntax - https://www.swi-prolog.org/pldoc/man?section=charescapes
%
% Each row writes a one-character string with one of the escape forms
% \x (hex digits), \u (4 hex digits), \U (8 hex digits) or \NNN (octal), and
% pairs it with the UTF-8 bytes of that character. The characters sit on the
% edges of the one-, two- and three-byte encodings.
unicode_string_utf8_test_case_generator("\x61"      ,[          0'a ]).
unicode_string_utf8_test_case_generator("\u0061"    ,[          0'a ]).
unicode_string_utf8_test_case_generator("\U00000061",[          0'a ]).
unicode_string_utf8_test_case_generator("\141"      ,[          0'a ]).
unicode_string_utf8_test_case_generator("\x00"      ,[          0x00]).
unicode_string_utf8_test_case_generator("\u0000"    ,[          0x00]).
unicode_string_utf8_test_case_generator("\U00000000",[          0x00]).
unicode_string_utf8_test_case_generator("\x7F"      ,[          0x7F]).
unicode_string_utf8_test_case_generator("\u007F"    ,[          0x7F]).
unicode_string_utf8_test_case_generator("\U0000007F",[          0x7F]).
unicode_string_utf8_test_case_generator("\x80"      ,[     0xC2,0x80]).
unicode_string_utf8_test_case_generator("\u0080"    ,[     0xC2,0x80]).
unicode_string_utf8_test_case_generator("\U00000080",[     0xC2,0x80]).
unicode_string_utf8_test_case_generator("\x07FF"    ,[     0xDF,0xBF]).
unicode_string_utf8_test_case_generator("\u07FF"    ,[     0xDF,0xBF]).
unicode_string_utf8_test_case_generator("\U000007FF",[     0xDF,0xBF]).
unicode_string_utf8_test_case_generator("\x0800"    ,[0xE0,0xA0,0x80]).
unicode_string_utf8_test_case_generator("\u0800"    ,[0xE0,0xA0,0x80]).
unicode_string_utf8_test_case_generator("\U00000800",[0xE0,0xA0,0x80]).
unicode_string_utf8_test_case_generator("\xFFFF"    ,[0xEF,0xBF,0xBF]).
unicode_string_utf8_test_case_generator("\uFFFF"    ,[0xEF,0xBF,0xBF]).
unicode_string_utf8_test_case_generator("\U0000FFFF",[0xEF,0xBF,0xBF]).

% Note: SWI-Prolog currently cannot support Unicode beyond U+FFFF because internally it uses wchar_t arrays.
% NOTE(review): presumably this only applies where wchar_t is 16 bits wide;
% confirm for your platform and version before enabling these rows.
% unicode_string_utf8_test_case_generator("\x010000",[0xF0,0x90,0x80,0x80]).
% unicode_string_utf8_test_case_generator("\U00010000",[0xF0,0x90,0x80,0x80]).

% Multi-character string built from escapes: one-, two- and three-byte
% encodings back to back.
unicode_string_utf8_test_case_generator("\u0000azAZ09\u007F\u0080\u07FF\u0800\uFFFF",[0x00,0'a,0'z,0'A,0'Z,0'0,0'9,0x7F,0xC2,0x80,0xDF,0xBF,0xE0,0xA0,0x80,0xEF,0xBF,0xBF]).

% Multi-character string written with literal non-ASCII characters; the same
% expected bytes are given once in decimal and once in hexadecimal.
%   U+00A1 -> C2 A1,  U+00FF -> C3 BF,  U+0100 -> C4 80,  U+26F0 -> E2 9B B0
% This file must be read as UTF-8 for the literals to denote these characters.
% If it is read in a single-byte encoding, each non-ASCII character turns into
% several characters and the encoder then yields double-encoded bytes
% (for example C3 82 C2 A1 instead of C2 A1).
unicode_string_utf8_test_case_generator(" azAZ09~¡ÿĀ⛰",[  32,  97, 122,  65,  90,  48,  57, 126, 194, 161, 195, 191, 196, 128, 226, 155, 176]).
unicode_string_utf8_test_case_generator(" azAZ09~¡ÿĀ⛰",[0x20,0x61,0x7A,0x41,0x5A,0x30,0x39,0x7E,0xC2,0xA1,0xC3,0xBF,0xC4,0x80,0xE2,0x9B,0xB0]).

% Encode direction for whole strings: the string is turned into its list of
% code points, which utf8_codes//1 then encodes into UTF-8 bytes.
test(unicode_string_to_utf8,
     [ forall(unicode_string_utf8_test_case_generator(String, Expected_bytes))
     ]) :-
    string_codes(String, Code_points),
    phrase(utf8_codes(Code_points), Bytes, Rest),

    % The test expects the remainder to still be unbound after encoding;
    % it is closed afterwards so the bytes compare as a proper list.
    assertion( var(Rest) ),
    Rest = [],
    assertion( Bytes == Expected_bytes ).

% Decode direction for whole strings: utf8_codes//1 decodes the UTF-8 bytes
% into code points, which are then packed into a string for comparison.
test(utf8_to_unicode_string,
     [ forall(unicode_string_utf8_test_case_generator(Expected_string, Bytes))
     ]) :-
    phrase(utf8_codes(Code_points), Bytes, Rest),

    % Every byte of the encoding must have been consumed.
    assertion( Rest == [] ),
    string_codes(String, Code_points),
    assertion( String == Expected_string ).

:- end_tests(utf8).

For those who like to remove the excess:

% Compact form: the generator binds both the code point and the byte list, so
% the grammar only has to agree that the two match, with no bytes left over.
test(utf8_to_unicode,
     [ forall(unicode_utf8_test_case_generator(Code_point, Bytes))
     ]) :-
  phrase(utf8_codes([Code_point]), Bytes, []).
2 Likes