import math
from numpy.typing import NDArray
from typing import Any, Iterable
[docs]
def lempel_ziv_complexity(
X: Iterable[Any], return_dictionary: bool = False
) -> float | tuple[float, set]:
"""
Uses the classic Lempel-Ziv compression algorithm to estimate the entropy rate of a one-dimensional array with :math:`N` samples.
If each element in X is a multi-dimensional tuple, then the result is equivalent to the joint entropy rate.
The extension to multivariate Lempel-Ziv is straightforward and involves representing the joint state of each element at time t as a tuple (X(t), Y(t)) and treating the two sources as a single joint source.
Here, the dictionary length :math:`|D|` is normalized:
.. math::
\\textnormal{Complexity}(X) = \\frac{|D|\\log|D|}{N}
Parameters
----------
X : list
A discrete array. Can contain digits 0-9 and/or
single-character strings ("A", "a", "B", "b", etc.)..
return_substrings : bool, optional
DESCRIPTION. The default is False.
Returns
-------
float
The estimated Lempel-Ziv complexity.
set
The dictionary (only returned if return_dictionary == True)
References
----------
Schartner, M. M., Carhart-Harris, R. L., Barrett, A. B., Seth, A. K., & Muthukumaraswamy, S. D. (2017).
Increased spontaneous MEG signal diversity for psychoactive doses of ketamine, LSD and psilocybin.
Scientific Reports, 7, 46421.
https://doi.org/10.1038/srep46421
Blanc, J.-L., Schmidt, N., Bonnier, L., Pezard, L., & Lesne, A. (2008).
Quantifying Neural Correlations Using Lempel-Ziv Complexity.
Deuxième conférence française de Neurosciences Computationnelles,
Marseille, France.
https://hal.science/hal-00331599/document
Zozor, S., Ravier, P., & Buttelli, O. (2005).
On Lempel–Ziv complexity for multidimensional data analysis.
Physica A: Statistical Mechanics and Its Applications, 345(1), 285–302.
https://doi.org/10.1016/j.physa.2004.07.025
"""
tuples: tuple = tuple((i,) for i in X)
N: int = len(tuples)
i = 0
# dictionary = {tuples[0]} # Store unique substrings
dictionary = set()
while i < N:
j = i + 1
while j <= N and tuples[i:j] in dictionary:
j += 1
dictionary.add(tuples[i:j])
i = j # Move index to next position
c = len(dictionary)
complexity = (c * math.log2(c)) / N # The +1 is removed for consistency w/ Coutinho
if return_dictionary is True:
return complexity, dictionary
else:
return complexity
[docs]
def conditional_lempel_ziv_complexity(X: Iterable[Any], Y: Iterable[Any]) -> float:
"""
The conditional entropy rate estimated with the Lempel-Ziv algorithm.
.. math::
LZ(X|Y) = LZ(X,Y) - LZ(Y)
Parameters
----------
X : Iterable[Any]
An iterable of discrete RVs. Can contain digits 0-9 and/or
single-character strings ("A", "a", "B", "b", etc.)..
Y : Iterable[Any]
An iterable of dicrete RVs. Can contain digits 0-9 and/or
single-character strings ("A", "a", "B", "b", etc.)..
Returns
-------
float
The estimated conditional entropy rate.
References
----------
Zozor, S., Ravier, P., & Buttelli, O. (2005).
On Lempel–Ziv complexity for multidimensional data analysis.
Physica A: Statistical Mechanics and Its Applications, 345(1), 285–302.
https://doi.org/10.1016/j.physa.2004.07.025
"""
joint: list = list(zip(X, Y))
return lempel_ziv_complexity(joint) - lempel_ziv_complexity(Y)
[docs]
def lempel_ziv_total_correlation(data: NDArray[Any]) -> float:
"""
A straightforward generalization of the mutual information rate given by
Zozor et al., and Blanc et al.,
.. math::
TC_{LZ}(X) = \\sum_{i=1}^{N} LZ(X_i) - LZ(X)
Parameters
----------
data : NDArray[Any]
A multi-dimensional discrete array, assumed to be in
channels x time format.
Returns
-------
float
The estimated total correlation rate.
"""
N0: int
N1: int
N0, N1 = data.shape
sum_marginals: float = 0.0
for i in range(N0):
sum_marginals += lempel_ziv_complexity(data[i, :])
joint = [tuple(data[:, i]) for i in range(N1)]
return sum_marginals - lempel_ziv_complexity(joint)
[docs]
def cross_lempel_ziv_complexity(X: list, Y: list) -> int:
"""
Computes the Lempel-Ziv complexity of a string X
using a pre-constructed dictionary optimized on Y. The relative
Lempel-Ziv complexity is the extra patterns that appear in X but
not in the compressed Y.
This code uses the optimized dictionary for Y, rather than searching all
substrings of Y. Using all possible substrings is very impractical for
long time series. As a result, however, it is very sensitive to the
particular temporal ordering of fluctuations in X and Y.
Should only be used when X and Y were recorded at the same time, such
as channels in an EEG/fMRI/MEG recording.
MATHEMATICAL GARUNTEES GIVEN IN ZIV & MERHAV 1993 ARE NOT PRESERVED
BY THIS METHOD. THIS FUNCTION REMAINS EXPERIMENTAL
Parameters
----------
X : NDArray[Any]
A discrete array. Can contain digits 0-9 and/or
single-character strings ("A", "a", "B", "b", etc.).
Y : NDArray[Any]
A discrete array. Can contain digits 0-9 and/or
single-character strings ("A", "a", "B", "b", etc.).
Returns
-------
complexity : int
The Lempel-ziv relative complexity.
References
----------
Ziv, J., & Merhav, N. (1993).
A Measure of Relative Entropy between Individual Sequences with
Application to Universal Classification.
Proceedings. IEEE International Symposium on Information Theory, 352–352.
https://doi.org/10.1109/ISIT.1993.748668
"""
X_tuples = tuple((i,) for i in X)
N_X = len(X_tuples)
# Step 1: Build LZ dictionary from Y (sequential parsing, no full substring storage)
_, dictionary_Y = lempel_ziv_complexity(Y, return_dictionary=True)
# Step 2: Parse X with dictionary_Y
i, c = 0, 0
while i < N_X - 1:
j = i + 1
while j <= N_X and X_tuples[i : j + 1] in dictionary_Y:
j += 1
dictionary_Y.add(X_tuples[i : j + 1]) # Store new substrings
c += 1 # A new additions
i = j # Move forward
complexity = c * math.log2(c) / N_X
return complexity