Coverage for mchammer/data_analysis.py: 100%

1from typing import Optional

3import numpy as np

4import pandas as pd

5from scipy import stats

8def analyze_data(data: np.ndarray, max_lag: int = None) -> dict:

9 """Carries out an extensive analysis of the data series and returns a

10 dictionary containing the mean, standard deviation,

11 correlation length and a 95% error estimate.

13 Parameters

14 ----------

15 data

16 Data series for which to compute autocorrelation function.

17 max_lag

18 Maximum lag between two data points used for computing autocorrelation.

19 """

20 summary = dict(mean=data.mean(),

21 std=data.std())

22 acf = get_autocorrelation_function(data, max_lag)

23 correlation_length = _estimate_correlation_length_from_acf(acf)

24 if correlation_length is not None:

25 error_estimate = _estimate_error(data, correlation_length, confidence=0.95)

26 summary['correlation_length'] = correlation_length

27 summary['error_estimate'] = error_estimate

28 else:

29 summary['correlation_length'] = np.nan

30 summary['error_estimate'] = np.nan

31 return summary

34def get_autocorrelation_function(data: np.ndarray, max_lag: int = None) -> np.ndarray:

35 """ Returns autocorrelation function.

37 The autocorrelation function is computed using :func:`pandas.Series.autocorr

38 <https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.autocorr.html>`.

40 Parameters

41 ----------

42 data

43 Data series for which to compute autocorrelation function.

44 max_lag

45 Maximum lag between two data points.

46 """

47 if max_lag is None:

48 max_lag = len(data) - 1

49 if max_lag < 1 or max_lag >= len(data):

50 raise ValueError('max_lag should be between 1 and len(data)-1.')

51 series = pd.Series(data)

52 acf = [series.autocorr(lag) for lag in range(0, max_lag)]

53 return np.array(acf)

56def get_correlation_length(data: np.ndarray) -> Optional[int]:

57 r"""Returns estimate of the correlation length of data.

59 The correlation length is taken as the first point where the

60 autocorrelation functions is less than :math:`\exp(-2)`. If the

61 correlation function never drops below :math:`\exp(-2)` ``np.nan`` is

62 returned.

64 If the correlation length cannot be computed since the

65 autocorrelation function is unconverged the function returns

66 ``None``.

68 Parameters

69 ----------

70 data

71 Data series for which to the compute autocorrelation function.

72 """

74 acf = get_autocorrelation_function(data)

75 correlation_length = _estimate_correlation_length_from_acf(acf)

76 if correlation_length is None:

77 return None

78 return correlation_length

81def get_error_estimate(data: np.ndarray, confidence: float = 0.95) -> Optional[float]:

82 r"""Returns estimate of standard error :math:`\mathrm{error}`

83 with confidence interval via

85 .. math::

87 \mathrm{error} = t_\mathrm{factor} * \mathrm{std}(\mathrm{data}) / \sqrt{N_s}

89 where :math:`t_\mathrm{factor}` is the factor corresponding to the confidence

90 interval and :math:`N_s` is the number of independent measurements

91 (with correlation taken into account).

93 If the correlation length cannot be computed since the

94 autocorrelation function is unconverged the function returns

95 ``None``.

97 Parameters

98 ----------

99 data

100 Eata series for which to estimate the error.

101

102 """

103 correlation_length = get_correlation_length(data)

104 if correlation_length is None:

105 return None

106 error_estimate = _estimate_error(data, correlation_length, confidence)

107 return error_estimate

108

109

110def _estimate_correlation_length_from_acf(acf: np.ndarray) -> Optional[int]:

111 """Estimates correlation length from :attr:`acf`. Returns ``None`` if

112 the autocorrelation function is uncoverged.

113 """

114 for i, a in enumerate(acf):

115 if a < np.exp(-2):

116 return i

117 return None # np.nan

118

119

120def _estimate_error(data: np.ndarray,

121 correlation_length: int,

122 confidence: float) -> float:

123 """ Estimates error using correlation length. """

124 t_factor: float = stats.t.ppf((1 + confidence) / 2, len(data) - 1)

125 error: float = t_factor * np.std(data) / np.sqrt(len(data) / correlation_length)

126 return error