 r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1"""

2Ensemble Optimizer

4https://en.wikipedia.org/wiki/Bootstrap_aggregating

5http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingRegressor.html # NOQA

6"""

8import numpy as np

9from typing import Any, Dict, List, Tuple, Union

10from .base_optimizer import BaseOptimizer

11from .optimizer import Optimizer

class EnsembleOptimizer(BaseOptimizer):
    """
    The ensemble optimizer carries out a series of single optimization runs
    using the :class:`Optimizer` class in order to solve the linear
    :math:`\\boldsymbol{A}\\boldsymbol{x} = \\boldsymbol{y}` problem.
    Subsequently, it provides access to ensemble averaged
    quantities such as errors and parameters.

    Warning
    -------
    Repeatedly setting up an EnsembleOptimizer and training
    *without* changing the seed for the random number generator will yield
    identical or correlated results; to avoid this please specify a different
    seed when setting up multiple EnsembleOptimizer instances.

    Parameters
    ----------
    fit_data : tuple(numpy.ndarray, numpy.ndarray)
        the first element of the tuple represents the fit matrix `A`
        (`N, M` array) while the second element represents the vector
        of target values `y` (`N` array); here `N` (=rows of `A`,
        elements of `y`) equals the number of target values and `M`
        (=columns of `A`) equals the number of parameters
    fit_method : str
        method to be used for training; possible choices are
        "least-squares", "lasso", "elasticnet", "bayesian-ridge", "ardr",
        "rfe", "split-bregman"
    standardize : bool
        if True the fit matrix and target values are standardized before
        fitting, meaning columns in the fit matrix and the target values are
        rescaled to have a standard deviation of 1.0.
    ensemble_size : int
        number of fits in the ensemble
    train_size : float or int
        if float represents the fraction of `fit_data` (rows) to be used for
        training; if int, represents the absolute number of rows to be used
        for training
    bootstrap : bool
        if True sampling will be carried out with replacement
    check_condition : bool
        if True the condition number will be checked
        (this can be slightly more time consuming for larger matrices)
    seed : int
        seed for pseudo random number generator
    """

    def __init__(self,
                 fit_data: Tuple[np.ndarray, np.ndarray],
                 fit_method: str = 'least-squares',
                 standardize: bool = True,
                 ensemble_size: int = 50,
                 train_size: Union[int, float] = 1.0,
                 bootstrap: bool = True,
                 check_condition: bool = True,
                 seed: int = 42,
                 **kwargs) -> None:

        super().__init__(fit_data, fit_method, standardize, check_condition,
                         seed)

        # interpret train_size: a float is a fraction of the available rows,
        # an int is an absolute row count
        if isinstance(train_size, float):
            self._train_size = int(
                np.round(train_size * self.n_target_values))
        elif isinstance(train_size, int):
            self._train_size = train_size
        else:
            raise TypeError('Training size must be int or float')

        self._ensemble_size = ensemble_size
        self._bootstrap = bootstrap
        self._kwargs = kwargs

        # the attributes below are populated by train()
        self._train_set_list = None
        self._test_set_list = None
        self._parameter_vectors = None
        self._parameters_std = None
        self._rmse_train_ensemble = None
        self._rmse_test_ensemble = None

93 def train(self) -> None:

94 """

95 Carries out ensemble training and construct the final model by

96 averaging over all models in the ensemble.

97 """

98 self._run_ensemble()

99 self._construct_final_model()

101 def _run_ensemble(self) -> None:

102 """ Constructs an ensemble of models. """

104 rs = np.random.RandomState(self.seed)

105 optimizers = []

106 for _ in range(self.ensemble_size):

107 # construct training and test sets

108 train_set = rs.choice(np.arange(self.n_target_values),

109 self.train_size, replace=self.bootstrap)

110 test_set = np.setdiff1d(

111 range(self.n_target_values), train_set)

113 # train

114 opt = Optimizer((self._A, self._y), self.fit_method,

115 standardize=self.standardize,

116 train_set=train_set, test_set=test_set,

117 check_condition=self._check_condition,

118 **self._kwargs)

119 opt.train()

120 optimizers.append(opt)

122 # collect data from each fit

124 self._parameter_vectors = np.array(

125 [opt.parameters for opt in optimizers])

126 self._train_set_list = [opt.train_set for opt in optimizers]

127 self._test_set_list = [opt.test_set for opt in optimizers]

128 self._rmse_train_ensemble = np.array(

129 [opt.rmse_train for opt in optimizers])

130 self._rmse_test_ensemble = np.array(

131 [opt.rmse_test for opt in optimizers])

133 def _construct_final_model(self) -> None:

134 """

135 Constructs final model by averaging over all models in the ensemble.

136 """

137 self._fit_results['parameters'] = np.mean(

138 self.parameter_vectors, axis=0)

139 self._parameters_std = np.std(self.parameter_vectors, axis=0)

141 def predict(self,

142 A: np.ndarray,

143 return_std: bool = False) \

144 -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]:

145 """

146 Predicts data given an input matrix :math:`\boldsymbol{A}`,

147 i.e., :math:`\\boldsymbol{A}\\boldsymbol{x}`, where

148 :math:`\\boldsymbol{x}` is the vector of the fitted parameters.

149 The method returns the vector of predicted values and optionally also

150 the vector of standard deviations.

152 By using all parameter vectors in the ensemble a standard deviation of

153 the prediction can be obtained.

155 Parameters

156 ----------

157 A

158 fit matrix where `N` (=rows of `A`, elements of `y`) equals the

159 number of target values and `M` (=columns of `A`) equals the number

160 of parameters

161 return_std

162 whether or not to return the standard deviation of the prediction

163 """

164 prediction = np.dot(A, self.parameters)

165 if return_std:

166 predictions = np.dot(A, self.parameter_vectors.T)

167 if len(predictions.shape) == 1: # shape is (N, )

168 std = np.std(predictions)

169 else: # shape is (N, M)

170 std = np.std(predictions, axis=1)

171 return prediction, std

172 else:

173 return prediction

175 @property

176 def error_matrix(self) -> np.ndarray:

177 """

178 matrix of fit errors where `N` is the number of target values and

179 `M` is the number of fits (i.e., the size of the ensemble)

180 """

181 if self.parameter_vectors is None: 181 ↛ 182line 181 didn't jump to line 182, because the condition on line 181 was never true

182 return None

183 error_matrix = np.zeros((self._n_rows, self.ensemble_size))

184 for i, parameters in enumerate(self.parameter_vectors):

185 error_matrix[:, i] = np.dot(self._A, parameters) - self._y

186 return error_matrix

188 @property

189 def summary(self) -> Dict[str, Any]:

190 """ comprehensive information about the optimizer """

191 info = super().summary

193 # Add class specific data

194 info['parameters_std'] = self.parameters_std

195 info['ensemble_size'] = self.ensemble_size

196 info['rmse_train'] = self.rmse_train

197 info['rmse_train_ensemble'] = self.rmse_train_ensemble

198 info['rmse_test'] = self.rmse_test

199 info['rmse_test_ensemble'] = self.rmse_test_ensemble

200 info['train_size'] = self.train_size

201 info['bootstrap'] = self.bootstrap

203 # add kwargs used for fitting

204 info = {**info, **self._kwargs}

205 return info

207 def __repr__(self) -> str:

208 kwargs = dict()

209 kwargs['fit_method'] = self.fit_method

210 kwargs['ensemble_size'] = self.ensemble_size

211 kwargs['train_size'] = self.train_size

212 kwargs['bootstrap'] = self.bootstrap

213 kwargs['seed'] = self.seed

214 kwargs = {**kwargs, **self._kwargs}

215 return 'EnsembleOptimizer((A, y), {})'.format(

216 ', '.join('{}={}'.format(*kwarg) for kwarg in kwargs.items()))

218 @property

219 def parameters_std(self) -> np.ndarray:

220 """ standard deviation for each parameter """

221 return self._parameters_std

223 @property

224 def parameter_vectors(self) -> List[np.ndarray]:

225 """ all parameter vectors in the ensemble """

226 return self._parameter_vectors

228 @property

229 def ensemble_size(self) -> int:

230 """ number of train rounds """

231 return self._ensemble_size

233 @property

234 def rmse_train(self) -> float:

235 """

236 ensemble average of root mean squared error over train sets

237 """

238 if self.rmse_train_ensemble is None:

239 return None

240 return np.sqrt(np.mean((self.rmse_train_ensemble)**2))

242 @property

243 def rmse_train_ensemble(self) -> np.ndarray:

244 """

245 root mean squared train errors obtained during for

246 each fit in ensemble

247 """

248 return self._rmse_train_ensemble

250 @property

251 def rmse_test(self) -> float:

252 """

253 ensemble average of root mean squared error over test sets

254 """

255 if self.rmse_test_ensemble is None:

256 return None

257 return np.sqrt(np.mean((self.rmse_test_ensemble)**2))

259 @property

260 def rmse_test_ensemble(self) -> np.ndarray:

261 """

262 root mean squared test errors obtained during for

263 each fit in ensemble

264 """

265 return self._rmse_test_ensemble

267 @property

268 def train_size(self) -> int:

269 """

270 number of rows included in train sets; note that this will

271 be different from the number of unique rows if boostrapping

272 """

273 return self._train_size

275 @property

276 def train_fraction(self) -> float:

277 """

278 fraction of input data used for training; this value can differ

279 slightly from the value set during initialization due to

280 rounding

281 """

282 return self.train_set_size / self._n_rows

284 @property

285 def bootstrap(self) -> bool:

286 """ True if sampling is carried out with replacement """

287 return self._bootstrap