Coverage for mchammer/data_analysis.py: 100%
44 statements
« prev ^ index » next coverage.py v7.5.0, created at 2024-12-26 04:12 +0000
« prev ^ index » next coverage.py v7.5.0, created at 2024-12-26 04:12 +0000
1from typing import Optional
3import numpy as np
4import pandas as pd
5from scipy import stats
8def analyze_data(data: np.ndarray, max_lag: int = None) -> dict:
9 """Carries out an extensive analysis of the data series and returns a
10 dictionary containing the mean, standard deviation,
11 correlation length and a 95% error estimate.
13 Parameters
14 ----------
15 data
16 Data series for which to compute autocorrelation function.
17 max_lag
18 Maximum lag between two data points used for computing autocorrelation.
19 """
20 summary = dict(mean=data.mean(),
21 std=data.std())
22 acf = get_autocorrelation_function(data, max_lag)
23 correlation_length = _estimate_correlation_length_from_acf(acf)
24 if correlation_length is not None:
25 error_estimate = _estimate_error(data, correlation_length, confidence=0.95)
26 summary['correlation_length'] = correlation_length
27 summary['error_estimate'] = error_estimate
28 else:
29 summary['correlation_length'] = np.nan
30 summary['error_estimate'] = np.nan
31 return summary
34def get_autocorrelation_function(data: np.ndarray, max_lag: int = None) -> np.ndarray:
35 """ Returns autocorrelation function.
37 The autocorrelation function is computed using :func:`pandas.Series.autocorr
38 <https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.autocorr.html>`.
40 Parameters
41 ----------
42 data
43 Data series for which to compute autocorrelation function.
44 max_lag
45 Maximum lag between two data points.
46 """
47 if max_lag is None:
48 max_lag = len(data) - 1
49 if max_lag < 1 or max_lag >= len(data):
50 raise ValueError('max_lag should be between 1 and len(data)-1.')
51 series = pd.Series(data)
52 acf = [series.autocorr(lag) for lag in range(0, max_lag)]
53 return np.array(acf)
56def get_correlation_length(data: np.ndarray) -> Optional[int]:
57 r"""Returns estimate of the correlation length of data.
59 The correlation length is taken as the first point where the
60 autocorrelation functions is less than :math:`\exp(-2)`. If the
61 correlation function never drops below :math:`\exp(-2)` ``np.nan`` is
62 returned.
64 If the correlation length cannot be computed since the
65 autocorrelation function is unconverged the function returns
66 ``None``.
68 Parameters
69 ----------
70 data
71 Data series for which to the compute autocorrelation function.
72 """
74 acf = get_autocorrelation_function(data)
75 correlation_length = _estimate_correlation_length_from_acf(acf)
76 if correlation_length is None:
77 return None
78 return correlation_length
81def get_error_estimate(data: np.ndarray, confidence: float = 0.95) -> Optional[float]:
82 r"""Returns estimate of standard error :math:`\mathrm{error}`
83 with confidence interval via
85 .. math::
87 \mathrm{error} = t_\mathrm{factor} * \mathrm{std}(\mathrm{data}) / \sqrt{N_s}
89 where :math:`t_\mathrm{factor}` is the factor corresponding to the confidence
90 interval and :math:`N_s` is the number of independent measurements
91 (with correlation taken into account).
93 If the correlation length cannot be computed since the
94 autocorrelation function is unconverged the function returns
95 ``None``.
97 Parameters
98 ----------
99 data
100 Eata series for which to estimate the error.
102 """
103 correlation_length = get_correlation_length(data)
104 if correlation_length is None:
105 return None
106 error_estimate = _estimate_error(data, correlation_length, confidence)
107 return error_estimate
110def _estimate_correlation_length_from_acf(acf: np.ndarray) -> Optional[int]:
111 """Estimates correlation length from :attr:`acf`. Returns ``None`` if
112 the autocorrelation function is uncoverged.
113 """
114 for i, a in enumerate(acf):
115 if a < np.exp(-2):
116 return i
117 return None # np.nan
120def _estimate_error(data: np.ndarray,
121 correlation_length: int,
122 confidence: float) -> float:
123 """ Estimates error using correlation length. """
124 t_factor: float = stats.t.ppf((1 + confidence) / 2, len(data) - 1)
125 error: float = t_factor * np.std(data) / np.sqrt(len(data) / correlation_length)
126 return error