1""" 

2Ensemble Optimizer 

3 

4https://en.wikipedia.org/wiki/Bootstrap_aggregating 

5http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingRegressor.html # NOQA 

6""" 

7 

8import numpy as np 

9from typing import Any, Dict, List, Tuple, Union 

10from .base_optimizer import BaseOptimizer 

11from .optimizer import Optimizer 

12 

13 

class EnsembleOptimizer(BaseOptimizer):
    """
    The ensemble optimizer carries out a series of single optimization runs
    using the :class:`Optimizer` class in order to solve the linear
    :math:`\\boldsymbol{A}\\boldsymbol{x} = \\boldsymbol{y}` problem.
    Subsequently, it provides access to various ensemble averaged
    quantities such as errors and parameters.

    Warning
    -------
    Repeatedly setting up an EnsembleOptimizer and training *without*
    changing the seed for the random number generator will yield identical
    or correlated results. To avoid this, specify a different seed when
    setting up multiple EnsembleOptimizer instances.

    Parameters
    ----------
    fit_data : tuple(numpy.ndarray, numpy.ndarray)
        the first element of the tuple represents the fit matrix `A`
        (`N, M` array) while the second element represents the vector
        of target values `y` (`N` array); here `N` (=rows of `A`,
        elements of `y`) equals the number of target values and `M`
        (=columns of `A`) equals the number of parameters
    fit_method : str
        method to be used for training; possible choices are
        "least-squares", "lasso", "elasticnet", "bayesian-ridge", "ardr",
        "rfe", "split-bregman"
    standardize : bool
        if True the fit matrix and target values are standardized before
        fitting, meaning columns in the fit matrix and the target values
        are rescaled to have a standard deviation of 1.0
    ensemble_size : int
        number of fits in the ensemble
    train_size : float or int
        if float, represents the fraction of `fit_data` (rows) to be used
        for training; if int, represents the absolute number of rows to
        be used for training
    bootstrap : bool
        if True sampling will be carried out with replacement
    check_condition : bool
        if True the condition number will be checked
        (this can be slightly more time consuming for larger matrices)
    seed : int
        seed for pseudo random number generator
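
    Example
    -------
    A minimal usage sketch; the data shapes and hyperparameters below are
    illustrative assumptions rather than recommended values::

        >>> import numpy as np
        >>> A = np.random.normal(size=(50, 5))
        >>> y = A.dot(np.arange(5)) + np.random.normal(scale=0.1, size=50)
        >>> opt = EnsembleOptimizer((A, y), ensemble_size=10, seed=0)
        >>> opt.train()
        >>> y_pred, y_std = opt.predict(A, return_std=True)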

59 """ 

60 

    def __init__(self,
                 fit_data: Tuple[np.ndarray, np.ndarray],
                 fit_method: str = 'least-squares',
                 standardize: bool = True,
                 ensemble_size: int = 50,
                 train_size: Union[int, float] = 1.0,
                 bootstrap: bool = True,
                 check_condition: bool = True,
                 seed: int = 42,
                 **kwargs) -> None:

        super().__init__(fit_data, fit_method, standardize, check_condition,
                         seed)

        # set training size
        if isinstance(train_size, float):
            self._train_size = int(np.round(train_size * self.n_target_values))
        elif isinstance(train_size, int):
            self._train_size = train_size
        else:
            raise TypeError('Training size must be int or float')

        self._ensemble_size = ensemble_size
        self._bootstrap = bootstrap
        self._kwargs = kwargs
        self._train_set_list = None
        self._test_set_list = None
        self._parameter_vectors = None
        self._parameters_std = None
        self._rmse_train_ensemble = None
        self._rmse_test_ensemble = None

    def train(self) -> None:
        """
        Carries out ensemble training and constructs the final model by
        averaging over all models in the ensemble.
        """
        self._run_ensemble()
        self._construct_final_model()

    def _run_ensemble(self) -> None:
        """ Constructs an ensemble of models. """

        rs = np.random.RandomState(self.seed)
        optimizers = []
        for _ in range(self.ensemble_size):
            # construct training and test sets; with bootstrapping the
            # training set is drawn with replacement and the rows never
            # drawn form the test set
            train_set = rs.choice(np.arange(self.n_target_values),
                                  self.train_size, replace=self.bootstrap)
            test_set = np.setdiff1d(
                range(self.n_target_values), train_set)

            # train
            opt = Optimizer((self._A, self._y), self.fit_method,
                            standardize=self.standardize,
                            train_set=train_set, test_set=test_set,
                            check_condition=self._check_condition,
                            **self._kwargs)
            opt.train()
            optimizers.append(opt)

        # collect data from each fit
        self._parameter_vectors = np.array(
            [opt.parameters for opt in optimizers])
        self._train_set_list = [opt.train_set for opt in optimizers]
        self._test_set_list = [opt.test_set for opt in optimizers]
        self._rmse_train_ensemble = np.array(
            [opt.rmse_train for opt in optimizers])
        self._rmse_test_ensemble = np.array(
            [opt.rmse_test for opt in optimizers])

    def _construct_final_model(self) -> None:
        """
        Constructs final model by averaging over all models in the ensemble.
        """
        self._fit_results['parameters'] = np.mean(
            self.parameter_vectors, axis=0)
        self._parameters_std = np.std(self.parameter_vectors, axis=0)

    def predict(self,
                A: np.ndarray,
                return_std: bool = False) \
            -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]:
        """
        Predicts data given an input matrix :math:`\\boldsymbol{A}`,
        i.e., :math:`\\boldsymbol{A}\\boldsymbol{x}`, where
        :math:`\\boldsymbol{x}` is the vector of the fitted parameters.
        The method returns the vector of predicted values and optionally
        also the vector of standard deviations.

        By using all parameter vectors in the ensemble a standard deviation
        of the prediction can be obtained.

        Parameters
        ----------
        A
            fit matrix where `N` (=rows of `A`, elements of `y`) equals
            the number of target values and `M` (=columns of `A`) equals
            the number of parameters
        return_std
            whether or not to return the standard deviation of the
            prediction
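
        Example
        -------
        A sketch of the two call modes, assuming an already trained
        optimizer `opt` and a compatible fit matrix `A`::

            >>> y_pred = opt.predict(A)
            >>> y_pred, y_std = opt.predict(A, return_std=True)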

163 """ 

164 prediction = np.dot(A, self.parameters) 

165 if return_std: 

166 predictions = np.dot(A, self.parameter_vectors.T) 

167 if len(predictions.shape) == 1: # shape is (N, ) 

168 std = np.std(predictions) 

169 else: # shape is (N, M) 

170 std = np.std(predictions, axis=1) 

171 return prediction, std 

172 else: 

173 return prediction 

174 

    @property
    def error_matrix(self) -> np.ndarray:
        """
        matrix of fit errors where `N` is the number of target values and
        `M` is the number of fits (i.e., the size of the ensemble)
        """
        if self.parameter_vectors is None:
            return None
        error_matrix = np.zeros((self._n_rows, self.ensemble_size))
        for i, parameters in enumerate(self.parameter_vectors):
            error_matrix[:, i] = np.dot(self._A, parameters) - self._y
        return error_matrix

    @property
    def summary(self) -> Dict[str, Any]:
        """ comprehensive information about the optimizer """
        info = super().summary

        # add class specific data
        info['parameters_std'] = self.parameters_std
        info['ensemble_size'] = self.ensemble_size
        info['rmse_train'] = self.rmse_train
        info['rmse_train_ensemble'] = self.rmse_train_ensemble
        info['rmse_test'] = self.rmse_test
        info['rmse_test_ensemble'] = self.rmse_test_ensemble
        info['train_size'] = self.train_size
        info['bootstrap'] = self.bootstrap

        # add kwargs used for fitting
        info = {**info, **self._kwargs}
        return info

    def __repr__(self) -> str:
        kwargs = dict()
        kwargs['fit_method'] = self.fit_method
        kwargs['ensemble_size'] = self.ensemble_size
        kwargs['train_size'] = self.train_size
        kwargs['bootstrap'] = self.bootstrap
        kwargs['seed'] = self.seed
        kwargs = {**kwargs, **self._kwargs}
        return 'EnsembleOptimizer((A, y), {})'.format(
            ', '.join('{}={}'.format(*kwarg) for kwarg in kwargs.items()))

    @property
    def parameters_std(self) -> np.ndarray:
        """ standard deviation for each parameter """
        return self._parameters_std

    @property
    def parameter_vectors(self) -> List[np.ndarray]:
        """ all parameter vectors in the ensemble """
        return self._parameter_vectors

    @property
    def ensemble_size(self) -> int:
        """ number of fits (training rounds) in the ensemble """
        return self._ensemble_size

    @property
    def rmse_train(self) -> float:
        """
        ensemble average of the root mean squared error over train sets
        """
        if self.rmse_train_ensemble is None:
            return None
        return np.sqrt(np.mean(self.rmse_train_ensemble**2))

    @property
    def rmse_train_ensemble(self) -> np.ndarray:
        """
        root mean squared train errors obtained for each fit in the
        ensemble
        """
        return self._rmse_train_ensemble

    @property
    def rmse_test(self) -> float:
        """
        ensemble average of the root mean squared error over test sets
        """
        if self.rmse_test_ensemble is None:
            return None
        return np.sqrt(np.mean(self.rmse_test_ensemble**2))

    @property
    def rmse_test_ensemble(self) -> np.ndarray:
        """
        root mean squared test errors obtained for each fit in the
        ensemble
        """
        return self._rmse_test_ensemble

    @property
    def train_size(self) -> int:
        """
        number of rows included in train sets; note that this will differ
        from the number of unique rows when bootstrapping
        """
        return self._train_size

    @property
    def train_fraction(self) -> float:
        """
        fraction of input data used for training; this value can differ
        slightly from the value set during initialization due to rounding
        """
        return self.train_size / self._n_rows

    @property
    def bootstrap(self) -> bool:
        """ True if sampling is carried out with replacement """
        return self._bootstrap
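

if __name__ == '__main__':
    # Minimal usage sketch on synthetic data. Everything below is
    # illustrative: the problem sizes, noise level, and hyperparameters are
    # arbitrary choices, and running this block assumes the module is
    # importable as part of its package (see the relative imports above).
    rng = np.random.RandomState(0)
    n_rows, n_params = 200, 10
    A = rng.normal(size=(n_rows, n_params))
    x_true = rng.normal(size=n_params)
    y = A.dot(x_true) + rng.normal(scale=0.1, size=n_rows)

    ensopt = EnsembleOptimizer((A, y), fit_method='least-squares',
                               ensemble_size=20, train_size=0.75, seed=7)
    ensopt.train()

    # ensemble-averaged errors and per-sample uncertainty estimates
    print('rmse_train: {:.4f}'.format(ensopt.rmse_train))
    print('rmse_test: {:.4f}'.format(ensopt.rmse_test))
    y_pred, y_std = ensopt.predict(A, return_std=True)
    print('max prediction std: {:.4f}'.format(y_std.max()))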