132 lines
3.4 KiB
Prolog
132 lines
3.4 KiB
Prolog
:-module(shannon_entropy, [shannon_entropy/2]).
|
|
|
|
%! shannon_entropy(+String, -Entropy) is det.
|
|
%
|
|
% Calculate the Shannon Entropy of String.
|
|
%
|
|
% Example query:
|
|
% ==
|
|
% ?- shannon_entropy(1223334444, H).
|
|
% H = 1.8464393446710154.
|
|
% ==
|
|
%
|
|
shannon_entropy(String, Entropy):-
|
|
atom_chars(String, Cs)
|
|
,relative_frequencies(Cs, Frequencies)
|
|
,findall(CI
|
|
,(member(_C-F, Frequencies)
|
|
,log2(F, L)
|
|
,CI is F * L
|
|
)
|
|
,CIs)
|
|
,foldl(sum, CIs, 0, E)
|
|
,Entropy is -E.
|
|
|
|
%! frequencies(+Characters,-Frequencies) is det.
|
|
%
|
|
% Calculates the relative frequencies of elements in the list of
|
|
% Characters.
|
|
%
|
|
% Frequencies is a key-value list with elements of the form:
|
|
% C-F, where C a character in the list and F its relative
|
|
% frequency in the list.
|
|
%
|
|
% Example query:
|
|
% ==
|
|
% ?- relative_frequencies([a,a,a,b,b,b,b,b,b,c,c,c,a,a,f], Fs).
|
|
% Fs = [a-0.3333333333333333, b-0.4, c-0.2,f-0.06666666666666667].
|
|
% ==
|
|
%
|
|
relative_frequencies(List, Frequencies):-
|
|
run_length_encoding(List, Rle)
|
|
% Sort Run-length encoded list and aggregate lengths by element
|
|
,keysort(Rle, Sorted_Rle)
|
|
,group_pairs_by_key(Sorted_Rle, Elements_Run_lengths)
|
|
,length(List, Elements_in_list)
|
|
,findall(E-Frequency_of_E
|
|
,(member(E-RLs, Elements_Run_lengths)
|
|
% Sum the list of lengths of runs of E
|
|
,foldl(plus, RLs, 0, Occurences_of_E)
|
|
,Frequency_of_E is Occurences_of_E / Elements_in_list
|
|
)
|
|
,Frequencies).
|
|
|
|
|
|
%! run_length_encoding(+List, -Run_length_encoding) is det.
|
|
%
|
|
% Converts a list to its run-length encoded form where each "run"
|
|
% of contiguous repeats of the same element is replaced by that
|
|
% element and the length of the run.
|
|
%
|
|
% Run_length_encoding is a key-value list, where each element is a
|
|
% term:
|
|
%
|
|
% Element:term-Repetitions:number.
|
|
%
|
|
% Example query:
|
|
% ==
|
|
% ?- run_length_encoding([a,a,a,b,b,b,b,b,b,c,c,c,a,a,f], RLE).
|
|
% RLE = [a-3, b-6, c-3, a-2, f-1].
|
|
% ==
|
|
%
|
|
run_length_encoding([], []-0):-
|
|
!. % No more results needed.
|
|
|
|
run_length_encoding([Head|List], Run_length_encoded_list):-
|
|
run_length_encoding(List, [Head-1], Reversed_list)
|
|
% The resulting list is in reverse order due to the head-to-tail processing
|
|
,reverse(Reversed_list, Run_length_encoded_list).
|
|
|
|
%! run_length_encoding(+List,+Initialiser,-Accumulator) is det.
|
|
%
|
|
% Business end of run_length_encoding/3. Calculates the run-length
|
|
% encoded form of a list and binds the result to the Accumulator.
|
|
% Initialiser is a list [H-1] where H is the first element of the
|
|
% input list.
|
|
%
|
|
run_length_encoding([], Fs, Fs).
|
|
|
|
% Run of F consecutive occurrences of C
|
|
run_length_encoding([C|Cs],[C-F|Fs], Acc):-
|
|
% Backtracking would produce successive counts
|
|
% of runs of C at different indices in the list.
|
|
!
|
|
,F_ is F + 1
|
|
,run_length_encoding(Cs, [C-F_| Fs], Acc).
|
|
|
|
% End of a run of consecutive identical elements.
|
|
run_length_encoding([C|Cs], Fs, Acc):-
|
|
run_length_encoding(Cs,[C-1|Fs], Acc).
|
|
|
|
|
|
/* Arithmetic helper predicates */
|
|
|
|
%! log2(N, L2_N) is det.
|
|
%
|
|
% L2_N is the logarithm with base 2 of N.
|
|
%
|
|
log2(N, L2_N):-
|
|
L_10 is log10(N)
|
|
,L_2 is log10(2)
|
|
,L2_N is L_10 / L_2.
|
|
|
|
%! sum(+A,+B,?Sum) is det.
|
|
%
|
|
% True when Sum is the sum of numbers A and B.
|
|
%
|
|
% Helper predicate to allow foldl/4 to do addition. The following
|
|
% call will raise an error (because there is no predicate +/3):
|
|
% ==
|
|
% foldl(+, [1,2,3], 0, Result).
|
|
% ==
|
|
%
|
|
% This will not raise an error:
|
|
% ==
|
|
% foldl(sum, [1,2,3], 0, Result).
|
|
% ==
|
|
%
|
|
sum(A, B, Sum):-
|
|
must_be(number, A)
|
|
,must_be(number, B)
|
|
,Sum is A + B.
|