I’ve been playing around with talking to a local LLM (in this case via ollama) using some proof-of-concept code pasted further down so I can have an assistant at the REPL.
FYI I’m using the latest develop version of SWI-Prolog (9.3.24).
If you want to play/test, you will need to install ollama and pull a suitable model, e.g.
brew install ollama
ollama serve&
ollama pull codellama:latest
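Once ollama is serving, a quick sanity check from the SWI-Prolog toplevel (it hits the same /api/version endpoint that ol_version/1 below uses; adjust host/port if you changed the defaults):
?- use_module(library(http/http_open)),
   use_module(library(http/json)),
   http_open('http://localhost:11434/api/version', In, []),
   json_read_dict(In, Version),
   close(In),
   get_dict(version, Version, V),
   format("Ollama version ~w~n", [V]).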
In another terminal session, run swipl with the module code loaded and ask the LLM questions, e.g.
?- ol_chat("Tell me about yourself.").
% _7036{messages:[_7022{content:Tell me about yourself.,role:user}],model:codellama,stream:true}
% > POST /api/chat HTTP/1.1
% > Host: localhost:11434
% > User-Agent: SWI-Prolog
% > Connection: close
% > Content-Type: application/json
Content-Length: 114
Warning: Ambiguous operation on stream pair <stream>(0x600002841400,0x600002841600)
Hello! I am LLaMA, an AI assistant developed by Meta AI that can understand and respond to human input in a conversational manner. I am trained on a massive dataset of text from the internet and can answer questions, provide information, and even generate creative content such as stories or dialogue. I can understand natural language and can communicate with you in a way that feels natural and similar to human conversation. I can be used to create chatbots, virtual assistants, and other applications where natural language input and output is desired.
% LLM duration: 26.903 seconds, Evals: 114, EPS=5.85768
true.
It all works fine except for the warning, which I cannot figure out how to remove. I believe it appears because the local LLM sends its reply with chunked transfer encoding and, because I could not get the library to handle the chunked transfer for me (see the failing version at the end), I am calling http_chunked_open/3 directly. Since http_open/3 hands back a stream pair, perhaps at that point the predicate cannot tell whether it is being given a read or a write stream. Is there any way to get rid of the warning?
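One thing I have not tried yet is pulling the read side out of the pair with stream_pair/3 before handing it to http_chunked_open/3, i.e. a variant of my dump_stream/3 along these lines (untested sketch):
% Untested idea: give http_chunked_open/3 only the input half of the stream
% pair that http_open/3 returned, extracted via stream_pair/3.
dump_stream(Stream, MStream, CreatedAt) :-
    stream_pair(Stream, ReadSide, _WriteSide),
    http_chunked_open(ReadSide, ChunkData, []),
    dump_part(ChunkData, MStream, CreatedAt).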
Reading the docs, I shouldn’t need to use http_chunked_open/3 at all: library(http/http_stream) is supposed to decode the chunked traffic transparently, so the response stream from http_open/3 should simply deliver JSON objects. However, reading the stream it produces fails (see the section after the code).
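Stripped of the module plumbing, this is roughly the minimal loop I expected to be enough (sketch only, no error handling; chat_stream/1 is just a throwaway name, and it assumes the default host and codellama model used in the code below):
:- use_module(library(http/http_open)).
:- use_module(library(http/http_json)).   % provides the post(json(Term)) encoding
:- use_module(library(http/json)).

% What I expected: http_open/3 removes the chunked transfer encoding, so the
% response stream is just a sequence of JSON objects to read in a loop.
chat_stream(Prompt) :-
    Message = _{model:codellama,
                messages:[_{role:user, content:Prompt}],
                stream:true},
    http_open('http://localhost:11434/api/chat', Response,
              [post(json(Message))]),
    repeat,
        json_read_dict(Response, Part),
        write(Part.message.content),
        _{done:true} :< Part,            % the final object carries done:true
    !,
    close(Response).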
Working code below:
:-module(ollama, [
    ol_chat/1,
    ol_chat/2,
    ol_clear/0,
    ol_history/0,
    ol_regenerate/0,
    ol_version/0,
    ol_version/1]).
:-use_module(library(macros)).
:-use_module(library(http/http_stream)).
:-use_module(library(http/http_json)).
:-use_module(library(http/http_open)).
:-use_module(library(http/http_client)).
:-use_module(library(http/json)).
:-use_module(library(memfile)).
:-use_module(library(predicate_options)).
:-debug(ollama).
:-debug(http(send_request)).
/*********** Default Model Settings **************/
%#define(llm_model, 'deepseek-coder-v2:latest').
#define(llm_model,'codellama').
#define(rag_model,'mxbai-embed-large:latest').
#define(ollama_host, 'localhost:11434').
#define(stream, true).
/*************************************************/
:-dynamic ol_history/3.
:-predicate_options(ol_version/1, 1, [host(text)]).
:-predicate_options(ol_chat/2, 2, [host(text),
                                   role(oneof([user, tool, assistant])),
                                   model(text),
                                   stream(boolean)]).
ol_clear :- retractall(ol_history(_, _, _)).
% Drop the most recent exchange (latest timestamp) and re-ask the question.
ol_regenerate :-
    ol_history(D, Question, _),
    \+ (ol_history(P, _, _), P@>D),
    once(retract(ol_history(D, _, _))),
    ol_chat(Question).
ol_version :- ol_version([]).
ol_version(Options) :-
    option(host(Host), Options, #ollama_host),
    atom_concat('http://', Host, Server),
    atom_concat(Server, "/api/version", URL),
    http_open(URL, Reply, [method(get), json_object(dict)]),
    json_read_dict(Reply, Data, []),
    close(Reply),
    print_message(information, format('~w', Data.version)).
ol_chat(Data) :- ol_chat(Data, []).
ol_chat(Data, Options) :-
    option(host(Host), Options, #ollama_host),
    option(role(Role), Options, 'user'),
    option(model(Model), Options, #llm_model),
    option(stream(Stream), Options, #stream),
    atom_concat('http://', Host, Server),
    atom_concat(Server, "/api/chat", URL),
    % Rebuild the conversation so far and append the new question.
    findall([_{role:user, content:Me}, _{role:assistant, content:You}],
            ol_history(_, Me, You), History),
    flatten(History, FHistory),
    append(FHistory, [_{role:Role, content:Data}], ChatHistory),
    Message = _{model:Model, messages:ChatHistory, stream:Stream},
    debug(ollama, '~w', Message),
    http_open(URL, Response,
              [headers(ResponseHeaders), post(json(Message)), raw_encoding(chunked)]),
    (   Stream == true
    ->  % Streaming: collect the reply chunk by chunk into a memory file.
        setup_call_cleanup(
            ( new_memory_file(MFile),
              open_memory_file(MFile, write, MStream)
            ),
            ( dump_stream(Response, MStream, CreatedAt),
              flush_output(MStream),
              close(MStream),
              memory_file_to_string(MFile, Content),
              assert(ol_history(CreatedAt, Data, Content))
            ),
            free_memory_file(MFile))
    ;   % Non-streaming: a single JSON reply.  Was it chunked?
        (   memberchk(transfer_encoding(chunked), ResponseHeaders)
        ->  http_chunked_open(Response, ChunkData, []),
            json_read_dict(ChunkData, Reply, [])
        ;   json_read_dict(Response, Reply, [])
        ),
        print_message(informational,
                      format('LLM duration: ~3f seconds, Evals: ~I, EPS=~5f',
                             [ Reply.total_duration/10^9,
                               Reply.eval_count,
                               Reply.eval_count/Reply.eval_duration * 10^9
                             ])),
        write(Reply.message.content),
        assert(ol_history(Reply.created_at, Data, Reply.message.content))
    ).
ol_history:-
ol_history(D, M, Y), \+format('-------------------------------------------------------------------------------~w--~nMe : ~w~nYou:~w~n', [D, M, Y]).
ol_history:-
format('-----------------------------------------End of Transcript -------------------------------------------------~n').
% Decode the chunked body ourselves and copy each streamed part to the memory file.
dump_stream(Stream, MStream, CreatedAt) :-
    http_chunked_open(Stream, ChunkData, []),
    dump_part(ChunkData, MStream, CreatedAt).
dump_part(ChunkData, MStream, CreatedAt) :-
    json_read_dict(ChunkData, Part),
    write(Part.message.content),
    flush_output,
    format(MStream, '~w', Part.message.content),
    (   _{done:true} :< Part
    ->  print_message(informational,
                      format('LLM duration: ~3f seconds, Evals: ~I, EPS=~5f',
                             [ Part.total_duration/10^9,
                               Part.eval_count,
                               Part.eval_count/Part.eval_duration * 10^9
                             ])),
        CreatedAt = Part.created_at
    ;   dump_part(ChunkData, MStream, CreatedAt)
    ).
The version that leaves the chunked transfer to the library (the failing code listed at the end) produces the following chat output instead:
?- ol_chat("Tell me about yourself.").
% _9496{messages:[_9240{content:Tell me about yourself.,role:user}],model:codellama,stream:true}
% > POST /api/chat HTTP/1.1
% > Host: localhost:11434
% > User-Agent: SWI-Prolog
% > Connection: close
% > Content-Type: application/json
Content-Length: 114
ERROR: key `message' does not exist in _59116{error:"error reading llm response: context canceled"}
If the chunked transfer encoding is supposed to be handled transparently, I would expect the code below to work.
Failing code below:
:-module(ollama, [
    ol_chat/1,
    ol_chat/2,
    ol_clear/0,
    ol_history/0,
    ol_regenerate/0,
    ol_version/0,
    ol_version/1]).
:-use_module(library(macros)).
:-use_module(library(http/http_stream)).
:-use_module(library(http/http_json)).
:-use_module(library(http/http_open)).
:-use_module(library(http/http_client)).
:-use_module(library(http/json)).
:-use_module(library(memfile)).
:-use_module(library(predicate_options)).
:-debug(ollama).
:-debug(http(send_request)).
/*********** Default Model Settings **************/
%#define(llm_model, 'deepseek-coder-v2:latest').
#define(llm_model,'codellama').
#define(rag_model,'mxbai-embed-large:latest').
#define(ollama_host, 'localhost:11434').
#define(stream, true).
/*************************************************/
:-dynamic ol_history/3.
:-predicate_options(ol_version/1, 1, [host(text)]).
:-predicate_options(ol_chat/2, 2, [host(text),
                                   role(oneof([user, tool, assistant])),
                                   model(text),
                                   stream(boolean)]).
ol_clear :- retractall(ol_history(_, _, _)).
% Drop the most recent exchange (latest timestamp) and re-ask the question.
ol_regenerate :-
    ol_history(D, Question, _),
    \+ (ol_history(P, _, _), P@>D),
    once(retract(ol_history(D, _, _))),
    ol_chat(Question).
ol_version :- ol_version([]).
ol_version(Options) :-
    option(host(Host), Options, #ollama_host),
    atom_concat('http://', Host, Server),
    atom_concat(Server, "/api/version", URL),
    http_open(URL, Reply, [method(get), json_object(dict)]),
    json_read_dict(Reply, Data, []),
    close(Reply),
    print_message(information, format('~w', Data.version)).
ol_chat(Data) :- ol_chat(Data, []).
ol_chat(Data, Options) :-
    option(host(Host), Options, #ollama_host),
    option(role(Role), Options, 'user'),
    option(model(Model), Options, #llm_model),
    option(stream(Stream), Options, #stream),
    atom_concat('http://', Host, Server),
    atom_concat(Server, "/api/chat", URL),
    findall([_{role:user, content:Me}, _{role:assistant, content:You}],
            ol_history(_, Me, You), History),
    flatten(History, FHistory),
    append(FHistory, [_{role:Role, content:Data}], ChatHistory),
    Message = _{model:Model, messages:ChatHistory, stream:Stream},
    debug(ollama, '~w', Message),
    % Difference from the working version: no raw_encoding(chunked), so the
    % library is expected to decode the chunked transfer itself.
    http_open(URL, Response, [post(json(Message))]),
    (   Stream == true
    ->  setup_call_cleanup(
            ( new_memory_file(MFile),
              open_memory_file(MFile, write, MStream)
            ),
            ( dump_stream(Response, MStream, CreatedAt),
              flush_output(MStream),
              close(MStream),
              memory_file_to_string(MFile, Content),
              assert(ol_history(CreatedAt, Data, Content))
            ),
            free_memory_file(MFile))
    ;   json_read_dict(Response, Reply, []),
        print_message(informational,
                      format('LLM duration: ~3f seconds, Evals: ~I, EPS=~5f',
                             [ Reply.total_duration/10^9,
                               Reply.eval_count,
                               Reply.eval_count/Reply.eval_duration * 10^9
                             ])),
        write(Reply.message.content),
        assert(ol_history(Reply.created_at, Data, Reply.message.content))
    ).
ol_history:-
ol_history(D, M, Y), \+format('-------------------------------------------------------------------------------~w--~nMe : ~w~nYou:~w~n', [D, M, Y]).
ol_history:-
format('-----------------------------------------End of Transcript -------------------------------------------------~n').
% Difference from the working version: no http_chunked_open/3; read the JSON
% objects straight off the stream returned by http_open/3.
dump_stream(Stream, MStream, CreatedAt) :-
    dump_part(Stream, MStream, CreatedAt).
dump_part(Stream, MStream, CreatedAt) :-
    json_read_dict(Stream, Part),
    write(Part.message.content),
    flush_output,
    format(MStream, '~w', Part.message.content),
    (   _{done:true} :< Part
    ->  print_message(informational,
                      format('LLM duration: ~3f seconds, Evals: ~I, EPS=~5f',
                             [ Part.total_duration/10^9,
                               Part.eval_count,
                               Part.eval_count/Part.eval_duration * 10^9
                             ])),
        CreatedAt = Part.created_at
    ;   dump_part(Stream, MStream, CreatedAt)
    ).