Coverage for mchammer/data_analysis.py: 100%

44 statements  

« prev     ^ index     » next       coverage.py v7.5.0, created at 2024-05-06 04:14 +0000

1from typing import Optional 

2 

3import numpy as np 

4import pandas as pd 

5from scipy import stats 

6 

7 

8def analyze_data(data: np.ndarray, max_lag: int = None) -> dict: 

9 """Carries out an extensive analysis of the data series and returns a 

10 dictionary containing the mean, standard deviation, 

11 correlation length and a 95% error estimate. 

12 

13 Parameters 

14 ---------- 

15 data 

16 Data series for which to compute autocorrelation function. 

17 max_lag 

18 Maximum lag between two data points used for computing autocorrelation. 

19 """ 

20 summary = dict(mean=data.mean(), 

21 std=data.std()) 

22 acf = get_autocorrelation_function(data, max_lag) 

23 correlation_length = _estimate_correlation_length_from_acf(acf) 

24 if correlation_length is not None: 

25 error_estimate = _estimate_error(data, correlation_length, confidence=0.95) 

26 summary['correlation_length'] = correlation_length 

27 summary['error_estimate'] = error_estimate 

28 else: 

29 summary['correlation_length'] = np.nan 

30 summary['error_estimate'] = np.nan 

31 return summary 

32 

33 

34def get_autocorrelation_function(data: np.ndarray, max_lag: int = None) -> np.ndarray: 

35 """ Returns autocorrelation function. 

36 

37 The autocorrelation function is computed using :func:`pandas.Series.autocorr 

38 <https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.autocorr.html>`. 

39 

40 Parameters 

41 ---------- 

42 data 

43 Data series for which to compute autocorrelation function. 

44 max_lag 

45 Maximum lag between two data points. 

46 """ 

47 if max_lag is None: 

48 max_lag = len(data) - 1 

49 if max_lag < 1 or max_lag >= len(data): 

50 raise ValueError('max_lag should be between 1 and len(data)-1.') 

51 series = pd.Series(data) 

52 acf = [series.autocorr(lag) for lag in range(0, max_lag)] 

53 return np.array(acf) 

54 

55 

56def get_correlation_length(data: np.ndarray) -> Optional[int]: 

57 r"""Returns estimate of the correlation length of data. 

58 

59 The correlation length is taken as the first point where the 

60 autocorrelation functions is less than :math:`\exp(-2)`. If the 

61 correlation function never drops below :math:`\exp(-2)` ``np.nan`` is 

62 returned. 

63 

64 If the correlation length cannot be computed since the 

65 autocorrelation function is unconverged the function returns 

66 ``None``. 

67 

68 Parameters 

69 ---------- 

70 data 

71 Data series for which to the compute autocorrelation function. 

72 """ 

73 

74 acf = get_autocorrelation_function(data) 

75 correlation_length = _estimate_correlation_length_from_acf(acf) 

76 if correlation_length is None: 

77 return None 

78 return correlation_length 

79 

80 

81def get_error_estimate(data: np.ndarray, confidence: float = 0.95) -> Optional[float]: 

82 r"""Returns estimate of standard error :math:`\mathrm{error}` 

83 with confidence interval via 

84 

85 .. math:: 

86 

87 \mathrm{error} = t_\mathrm{factor} * \mathrm{std}(\mathrm{data}) / \sqrt{N_s} 

88 

89 where :math:`t_\mathrm{factor}` is the factor corresponding to the confidence 

90 interval and :math:`N_s` is the number of independent measurements 

91 (with correlation taken into account). 

92 

93 If the correlation length cannot be computed since the 

94 autocorrelation function is unconverged the function returns 

95 ``None``. 

96 

97 Parameters 

98 ---------- 

99 data 

100 Eata series for which to estimate the error. 

101 

102 """ 

103 correlation_length = get_correlation_length(data) 

104 if correlation_length is None: 

105 return None 

106 error_estimate = _estimate_error(data, correlation_length, confidence) 

107 return error_estimate 

108 

109 

110def _estimate_correlation_length_from_acf(acf: np.ndarray) -> Optional[int]: 

111 """Estimates correlation length from :attr:`acf`. Returns ``None`` if 

112 the autocorrelation function is uncoverged. 

113 """ 

114 for i, a in enumerate(acf): 

115 if a < np.exp(-2): 

116 return i 

117 return None # np.nan 

118 

119 

120def _estimate_error(data: np.ndarray, 

121 correlation_length: int, 

122 confidence: float) -> float: 

123 """ Estimates error using correlation length. """ 

124 t_factor: float = stats.t.ppf((1 + confidence) / 2, len(data) - 1) 

125 error: float = t_factor * np.std(data) / np.sqrt(len(data) / correlation_length) 

126 return error