The idea is to prepare input text for a DCG. I found that tokenising with DCGs directly from a stream of characters overcomplicated the “thinking in terms of formal grammars” part of the exercise. Since I don’t need escaped quotes for my project anyway, and stringing punctuation characters together caused all kinds of unforeseen problems (such as seeing “],” as one token instead of two), I decided to simplify the tokeniser I intend to use, for now, down to this:
:- module(tokeniser,
          [ string_tokens/2,        % +String, -Tokens
            read_file_to_tokens/2   % +FileName, -Tokens
          ]).
string_tokens(String, Tokens) :-
    open_string(String, Stream),
    read_tokens(Stream, Tokens).
read_file_to_tokens(SrcDest, Tokens) :-
    setup_call_cleanup(open(SrcDest, read, Stream, []),
                       read_tokens(Stream, Tokens),
                       close(Stream)).
% read_tokens(+Stream, -Tokens)
read_tokens(Stream, Tokens) :-
    get_char(Stream, Char),
    read_tokens(Char, Stream, [], Tokens).
% read_tokens(+Char, +Stream, +Acc, -Tokens)
% At end of file, reverse the accumulator to give the tokens in order.
read_tokens(end_of_file, _Stream, ReversedList, Tokens) :-
    !,
    reverse(ReversedList, Tokens).
% Whitespace is skipped.
read_tokens(InChar, Stream, Tokens, Tail) :-
    char_type(InChar, space), !,
    get_char(Stream, Char),
    read_tokens(Char, Stream, Tokens, Tail).
% An alphanumeric character starts a word.
read_tokens(InChar, Stream, Tokens, Tail) :-
    char_type(InChar, alnum), !,
    read_word(InChar, Stream, [], Word, LeftOver),
    read_tokens(LeftOver, Stream, [Word|Tokens], Tail).
% A quote character starts a quoted string, kept as a single token.
read_tokens(QuoteChar, Stream, Tokens, Tail) :-
    char_type(QuoteChar, quote), !,
    get_char(Stream, InChar),
    read_quote(InChar, Stream, QuoteChar, [QuoteChar], Quote, LeftOver),
    read_tokens(LeftOver, Stream, [Quote|Tokens], Tail).
% Any other punctuation character becomes a one-character token.
read_tokens(InChar, Stream, Tokens, Tail) :-
    char_type(InChar, punct), !,
    get_char(Stream, OutChar),
    string_chars(Token, [InChar]), % convert the char to a string, so punctuation tokens match the word and quote tokens
    read_tokens(OutChar, Stream, [Token|Tokens], Tail).
% read_word(+Char, +Stream, +Acc, -Word, -LeftOver)
% A non-alphanumeric character ends the word; it is handed back as
% LeftOver for the caller to dispatch on.
read_word(LeftOver, _Stream, ReversedList, Word, LeftOver) :-
    \+ char_type(LeftOver, alnum), !,
    reverse(ReversedList, Chars),
    string_chars(Word, Chars).
read_word(InChar, Stream, Chars, Tail, LeftOver) :-
    % char_type(InChar, alnum) is implied by the cut in the clause above
    get_char(Stream, OutChar),
    read_word(OutChar, Stream, [InChar|Chars], Tail, LeftOver).
% read_quote(+Char, +Stream, +QuoteChar, +Acc, -Quote, -LeftOver)
% Accumulate characters up to the closing quote. Both arguments are
% always ground here, so \== does what dif/2 plus a cut did; the
% end_of_file guard makes an unterminated quote fail instead of
% looping forever on repeated end_of_file reads.
read_quote(InChar, Stream, QuoteChar, Chars, Tail, LeftOver) :-
    InChar \== QuoteChar,
    InChar \== end_of_file, !,
    get_char(Stream, OutChar),
    read_quote(OutChar, Stream, QuoteChar, [InChar|Chars], Tail, LeftOver).
% The closing quote ends the token; the quote characters are kept.
read_quote(QuoteChar, Stream, QuoteChar, ReversedList, Quote, LeftOver) :-
    get_char(Stream, LeftOver),
    reverse([QuoteChar|ReversedList], Chars),
    string_chars(Quote, Chars).
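Loaded at the top level, the module should behave like this (the second query shows the “],” case from above now coming out as two tokens):

?- string_tokens("count := 1;", Tokens).
Tokens = ["count", ":", "=", "1", ";"].

?- string_tokens("[1], [2]", Tokens).
Tokens = ["[", "1", "]", ",", "[", "2", "]"].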
My thinking (at the moment) is that, to handle negative numbers, floats, and exponents delimited with ^, e, ** or who knows what, it’s best to just leave those as separate symbols in the token list, storing the digits as strings, and let the parser, probably written as a DCG, reassemble them; there is a sketch of that after the test file. Here is the test file I’m using:
:- begin_tests(tokeniser).
:- use_module(['/home/roblaing/libs/swipl/tokeniser.pl']).
% Example taken from http://www.learnprolognow.org/lpnpage.php?pagetype=html&pageid=lpn-htmlse56
test(lpn_eg) :-
    string_tokens("The cow under the table shoots.", Tokens),
    Tokens = ["The", "cow", "under", "the", "table", "shoots", "."].
% Example taken from Chapter 24 "A Compiler" from The Art of Prolog
test(pascal_eg) :-
    Code = "program factorial;
            begin
                read value;
                count := 1;
                result := 1;
                while count < value do
                    begin
                        count := count + 1;
                        result := result * count
                    end;
                write result
            end",
    string_tokens(Code, Tokens),
    Tokens = ["program", "factorial", ";", "begin", "read", "value", ";",
              "count", ":", "=", "1", ";", "result", ":", "=", "1", ";",
              "while", "count", "<", "value", "do", "begin", "count",
              ":", "=", "count", "+", "1", ";", "result", ":", "=",
              "result", "*", "count", "end", ";", "write", "result",
              "end"].
test(json) :-
Code = '{
"control": "red",
"step": 1,
"piece_count": [
["black", 12],
["red", 12]
],
"cell": [
["a", 1, "b"],
["a", 2, "wp"],
["a", 3, "b"],
["a", 4, "b"],
["a", 5, "b"],
["a", 6, "bp"],
["a", 7, "b"],
["a", 8, "bp"],
["b", 1, "wp"],
["b", 2, "b"],
["b", 3, "wp"],
["b", 4, "b"],
["b", 5, "b"],
["b", 6, "b"],
["b", 7, "bp"],
["b", 8, "b"],
["c", 1, "b"],
["c", 2, "wp"],
["c", 3, "b"],
["c", 4, "b"],
["c", 5, "b"],
["c", 6, "bp"],
["c", 7, "b"],
["c", 8, "bp"],
["d", 1, "wp"],
["d", 2, "b"],
["d", 3, "wp"],
["d", 4, "b"],
["d", 5, "b"],
["d", 6, "b"],
["d", 7, "bp"],
["d", 8, "b"],
["e", 1, "b"],
["e", 2, "wp"],
["e", 3, "b"],
["e", 4, "b"],
["e", 5, "b"],
["e", 6, "bp"],
["e", 7, "b"],
["e", 8, "bp"],
["f", 1, "wp"],
["f", 2, "b"],
["f", 3, "wp"],
["f", 4, "b"],
["f", 5, "b"],
["f", 6, "b"],
["f", 7, "bp"],
["f", 8, "b"],
["g", 1, "b"],
["g", 2, "wp"],
["g", 3, "b"],
["g", 4, "b"],
["g", 5, "b"],
["g", 6, "bp"],
["g", 7, "b"],
["g", 8, "bp"],
["h", 1, "wp"],
["h", 2, "b"],
["h", 3, "wp"],
["h", 4, "b"],
["h", 5, "b"],
["h", 6, "b"],
["h", 7, "bp"],
["h", 8, "b"]
]
}',
string_tokens(Code, Tokens),
Tokens = ["{","\"control\"",":","\"red\"",",",
"\"step\"",":","1",",",
"\"piece_count\"",":","[","[","\"black\"",",","12","]",",",
"[","\"red\"",",","12","]","]",",",
"\"cell\"",":","[","[","\"a\"",",","1",",","\"b\"","]",",",
"[","\"a\"",",","2",",","\"wp\"","]",",",
"[","\"a\"",",","3",",","\"b\"","]",",",
"[","\"a\"",",","4",",","\"b\"","]",",",
"[","\"a\"",",","5",",","\"b\"","]",",",
"[","\"a\"",",","6",",","\"bp\"","]",",",
"[","\"a\"",",","7",",","\"b\"","]",",",
"[","\"a\"",",","8",",","\"bp\"","]",",",
"[","\"b\"",",","1",",","\"wp\"","]",",",
"[","\"b\"",",","2",",","\"b\"","]",",",
"[","\"b\"",",","3",",","\"wp\"","]",",",
"[","\"b\"",",","4",",","\"b\"","]",",",
"[","\"b\"",",","5",",","\"b\"","]",",",
"[","\"b\"",",","6",",","\"b\"","]",",",
"[","\"b\"",",","7",",","\"bp\"","]",",",
"[","\"b\"",",","8",",","\"b\"","]",",",
"[","\"c\"",",","1",",","\"b\"","]",",",
"[","\"c\"",",","2",",","\"wp\"","]",",",
"[","\"c\"",",","3",",","\"b\"","]",",",
"[","\"c\"",",","4",",","\"b\"","]",",",
"[","\"c\"",",","5",",","\"b\"","]",",",
"[","\"c\"",",","6",",","\"bp\"","]",",",
"[","\"c\"",",","7",",","\"b\"","]",",",
"[","\"c\"",",","8",",","\"bp\"","]",",",
"[","\"d\"",",","1",",","\"wp\"","]",",",
"[","\"d\"",",","2",",","\"b\"","]",",",
"[","\"d\"",",","3",",","\"wp\"","]",",",
"[","\"d\"",",","4",",","\"b\"","]",",",
"[","\"d\"",",","5",",","\"b\"","]",",",
"[","\"d\"",",","6",",","\"b\"","]",",",
"[","\"d\"",",","7",",","\"bp\"","]",",",
"[","\"d\"",",","8",",","\"b\"","]",",",
"[","\"e\"",",","1",",","\"b\"","]",",",
"[","\"e\"",",","2",",","\"wp\"","]",",",
"[","\"e\"",",","3",",","\"b\"","]",",",
"[","\"e\"",",","4",",","\"b\"","]",",",
"[","\"e\"",",","5",",","\"b\"","]",",",
"[","\"e\"",",","6",",","\"bp\"","]",",",
"[","\"e\"",",","7",",","\"b\"","]",",",
"[","\"e\"",",","8",",","\"bp\"","]",",",
"[","\"f\"",",","1",",","\"wp\"","]",",",
"[","\"f\"",",","2",",","\"b\"","]",",",
"[","\"f\"",",","3",",","\"wp\"","]",",",
"[","\"f\"",",","4",",","\"b\"","]",",",
"[","\"f\"",",","5",",","\"b\"","]",",",
"[","\"f\"",",","6",",","\"b\"","]",",",
"[","\"f\"",",","7",",","\"bp\"","]",",",
"[","\"f\"",",","8",",","\"b\"","]",",",
"[","\"g\"",",","1",",","\"b\"","]",",",
"[","\"g\"",",","2",",","\"wp\"","]",",",
"[","\"g\"",",","3",",","\"b\"","]",",",
"[","\"g\"",",","4",",","\"b\"","]",",",
"[","\"g\"",",","5",",","\"b\"","]",",",
"[","\"g\"",",","6",",","\"bp\"","]",",",
"[","\"g\"",",","7",",","\"b\"","]",",",
"[","\"g\"",",","8",",","\"bp\"","]",",",
"[","\"h\"",",","1",",","\"wp\"","]",",",
"[","\"h\"",",","2",",","\"b\"","]",",",
"[","\"h\"",",","3",",","\"wp\"","]",",",
"[","\"h\"",",","4",",","\"b\"","]",",",
"[","\"h\"",",","5",",","\"b\"","]",",",
"[","\"h\"",",","6",",","\"b\"","]",",",
"[","\"h\"",",","7",",","\"bp\"","]",",",
"[","\"h\"",",","8",",","\"b\"","]","]","}"].
test(quoted) :-
    string_tokens('Preserve "Hello Text!" as a string', Tokens),
    Tokens = ["Preserve", "\"Hello Text!\"", "as", "a", "string"].
/* Disabled until escaped quotes are supported:
test(escaped_quote) :-
    string_tokens('Preserve "Hello\\" Text!" as a string', Tokens),
    Tokens = ["Preserve", "\"Hello\" Text!\"", "as", "\"a\"", "string"].
*/
:- end_tests(tokeniser).
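After loading the test file, the suite runs with ?- run_tests(tokeniser).

As for the numbers, here is a minimal sketch of the parser-side idea. The names signed_number//1 and unsigned_number//1 are mine, not part of anything above, and it only covers signed integers and simple decimals:

% Hypothetical DCG over the token list, reassembling numbers the
% tokeniser leaves split, e.g. ["-", "3", ".", "14"] for -3.14.
signed_number(N) --> ["-"], unsigned_number(U), { N is -U }.
signed_number(N) --> unsigned_number(N).

unsigned_number(F) -->              % integer part, ".", fraction part
    [Int, ".", Frac],
    { atomics_to_string([Int, ".", Frac], S),
      number_string(F, S) }.
unsigned_number(N) -->              % a plain integer such as ["42"]
    [S],
    { number_string(N, S) }.

?- phrase(signed_number(N), ["-", "3", ".", "14"]).
N = -3.14.

One wrinkle to keep in mind: because e is alphanumeric, "1.5e3" tokenises as ["1", ".", "5e3"], so an e-delimited exponent arrives glued to the last digits rather than as a separate symbol, while ^ and * are punctuation and do come through on their own (** as two "*" tokens).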