1""" 

2scikit-learn is an excellent library for training linear models and provides a 

3large number of useful tools. 

4 

5This module provides simplified interfaces for vaiours linear model regression 

6methods. These methods are set up in a way that work out of the box for typical 

7problems in cluster expansion and force constant potential construction. This 

8includes slight adjustments scitkit-learn default values. 

9 

10If you would like more flexibility or extended functionality or ability to 

11fine-tune parameters that are not included in this interface, it is of course 

12possible to use scikit-learn directly. 

13More information about the sklearn linear models can be found at 

14http://scikit-learn.org/stable/modules/linear_model.html 
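
A minimal usage sketch (assuming `X` and `y` are NumPy arrays of
compatible shape; the fit method shown is illustrative)::

    results = fit(X, y, fit_method='least-squares')
    parameters = results['parameters']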

15""" 

16 

17import numpy as np 

18from collections import OrderedDict 

19from sklearn.linear_model import (Lasso, 

20 LassoCV, 

21 Ridge, 

22 RidgeCV, 

23 ElasticNet, 

24 ElasticNetCV, 

25 BayesianRidge, 

26 ARDRegression) 

27from sklearn.model_selection import ShuffleSplit 

28from sklearn.feature_selection import RFE, RFECV 

29from sklearn.preprocessing import StandardScaler 

30from typing import Any, Dict, List, Union 

31from ..input_output.logging_tools import logger 

32from .split_bregman import fit_split_bregman 

33 

34 

35logger = logger.getChild('fit_methods') 

36 

37 



def fit(X: np.ndarray,
        y: np.ndarray,
        fit_method: str,
        standardize: bool = True,
        check_condition: bool = True,
        **kwargs) -> Dict[str, Any]:
    """
    Wrapper function for all available fit methods. The function
    returns parameters and other pertinent information in the form of
    a dictionary.

    Parameters
    ----------
    X
        fit matrix
    y
        target array
    fit_method
        method to be used for training; possible choices are
        "least-squares", "lasso", "ridge", "elasticnet",
        "bayesian-ridge", "ardr", "rfe", "split-bregman"
    standardize
        if True the fit matrix is standardized before fitting
    check_condition
        if True the condition number will be checked
        (this can be slightly more time consuming for larger
        matrices)
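
    Example
    -------
    A minimal sketch (`X` and `y` are assumed to be compatible NumPy
    arrays; the keyword argument shown is illustrative and is forwarded
    to the underlying fit method)::

        results = fit(X, y, fit_method='ardr', threshold_lambda=1e4)
        parameters = results['parameters']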

65 """ 

66 

67 if fit_method not in available_fit_methods: 

68 msg = ['Fit method not available'] 

69 msg += ['Please choose one of the following:'] 

70 for key in available_fit_methods: 

71 msg += [' * ' + key] 

72 raise ValueError('\n'.join(msg)) 

73 

74 if check_condition and X.shape[0] >= X.shape[1]: 

75 cond = np.linalg.cond(X) 

76 if cond > 1e10: 

77 logger.warning('Condition number is large, {}'.format(cond)) 

78 

79 if standardize: 

80 

81 # standardize fit matrix, column wise std -> 1.0 

82 ss = StandardScaler(copy=False, with_mean=False, with_std=True) 

83 ss.fit_transform(X) # change in place 

84 

85 # standardize target values, std(y) -> 1.0 

86 y_scale = 1.0/np.std(y) 

87 y_rescaled = y * y_scale 

88 

89 # train 

90 results = fit_methods[fit_method](X, y_rescaled, **kwargs) 

91 

92 # inverse standardization 

93 parameters = results['parameters'] / y_scale 

94 ss.inverse_transform(X) # change in place 

95 ss.transform(parameters.reshape(1, -1)).reshape(-1,) 

96 results['parameters'] = parameters 

97 

98 else: 

99 results = fit_methods[fit_method](X, y, **kwargs) 

100 return results 



def _fit_least_squares(X: np.ndarray, y: np.ndarray) -> Dict[str, Any]:
    """
    Returns the least-squares solution `a` to the linear problem
    `Xa=y` in the form of a dictionary with a key named `parameters`.

    This function is a wrapper to the `linalg.lstsq` function in NumPy.

    Parameters
    ----------
    X
        fit matrix
    y
        target array
    """
    results = dict()
    results['parameters'] = np.linalg.lstsq(X, y, rcond=-1)[0]
    return results



def _fit_lasso(X: np.ndarray, y: np.ndarray,
               alpha: float = None, fit_intercept: bool = False,
               **kwargs) -> Dict[str, Any]:
    """
    Returns the solution `a` to the linear problem `Xa=y` obtained by
    using the LASSO method as implemented in scikit-learn in the form
    of a dictionary with a key named `parameters`.

    LASSO optimizes the following problem::

        (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1

    If `alpha` is `None` this function will call `_fit_lassoCV`, which
    attempts to find the optimal alpha via sklearn's `LassoCV` class.

    Parameters
    ----------
    X
        fit matrix
    y
        target array
    alpha
        alpha value
    fit_intercept
        center data or not, forwarded to sklearn
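
    Example
    -------
    A minimal sketch with an explicit (illustrative) regularization
    strength; with `alpha=None` the cross-validated variant below is
    used instead::

        results = _fit_lasso(X, y, alpha=1e-4)
        parameters = results['parameters']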

147 """ 

148 if alpha is None: 

149 return _fit_lassoCV(X, y, fit_intercept=fit_intercept, **kwargs) 

150 else: 

151 lasso = Lasso(alpha=alpha, fit_intercept=fit_intercept, **kwargs) 

152 lasso.fit(X, y) 

153 results = dict() 

154 results['parameters'] = lasso.coef_ 

155 return results 



def _fit_lassoCV(X: np.ndarray,
                 y: np.ndarray,
                 alphas: List[float] = None,
                 fit_intercept: bool = False,
                 cv: int = 10,
                 n_jobs: int = -1,
                 **kwargs) -> Dict[str, Any]:
    """
    Returns the solution `a` to the linear problem `Xa=y` obtained by
    using the LassoCV method as implemented in scikit-learn in the
    form of a dictionary with a key named `parameters`.

    The dictionary will also contain the key `alpha_optimal` (the alpha
    value that yields the lowest validation RMSE).

    Parameters
    ----------
    X
        fit matrix
    y
        target array
    alphas
        list of alpha values to be evaluated during regularization path
    fit_intercept
        center data or not, forwarded to sklearn
    cv
        how many folds to carry out in cross-validation
    n_jobs
        number of cores to use during the cross validation.
        None means 1 unless in a joblib.parallel_backend context.
        -1 means using all processors.
        See sklearn's glossary for more details.
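
    Example
    -------
    A minimal sketch with a custom (illustrative) alpha grid::

        results = _fit_lassoCV(X, y, alphas=np.logspace(-6, -1, 50))
        optimal_alpha = results['alpha_optimal']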

192 """ 

193 if alphas is None: 193 ↛ 196line 193 didn't jump to line 196, because the condition on line 193 was never false

194 alphas = np.logspace(-8, -0.3, 100) 

195 

196 lassoCV = LassoCV(alphas=alphas, fit_intercept=fit_intercept, cv=cv, n_jobs=n_jobs, **kwargs) 

197 lassoCV.fit(X, y) 

198 results = dict() 

199 results['parameters'] = lassoCV.coef_ 

200 results['alpha_optimal'] = lassoCV.alpha_ 

201 return results 



def _fit_ridge(X: np.ndarray, y: np.ndarray,
               alpha: float = None, fit_intercept: bool = False,
               **kwargs) -> Dict[str, Any]:
    """
    Returns the solution `a` to the linear problem `Xa=y` obtained by
    using ridge regression as implemented in scikit-learn in the form
    of a dictionary with a key named `parameters`. If `alpha` is `None`,
    sklearn's `RidgeCV` class is used to find the optimal alpha, which
    is returned under the key `alpha_optimal`.
    """
    results = dict()
    if alpha is None:
        if 'alphas' not in kwargs:
            kwargs['alphas'] = np.logspace(-6, 3, 100)
        ridge = RidgeCV(fit_intercept=fit_intercept, **kwargs)
        ridge.fit(X, y)
        results['alpha_optimal'] = ridge.alpha_
    else:
        ridge = Ridge(alpha=alpha, fit_intercept=fit_intercept, **kwargs)
        ridge.fit(X, y)
    results['parameters'] = ridge.coef_
    return results



def _fit_bayesian_ridge(X: np.ndarray, y: np.ndarray,
                        fit_intercept: bool = False,
                        **kwargs) -> Dict[str, Any]:
    """
    Returns the solution `a` to the linear problem `Xa=y` obtained by using
    Bayesian ridge regression as implemented in scikit-learn in the
    form of a dictionary with a key named `parameters`.

    Parameters
    ----------
    X
        fit matrix
    y
        target array
    fit_intercept
        center data or not, forwarded to sklearn
    """
    brr = BayesianRidge(fit_intercept=fit_intercept, **kwargs)
    brr.fit(X, y)
    results = dict()
    results['parameters'] = brr.coef_
    return results



def _fit_elasticnet(X: np.ndarray, y: np.ndarray,
                    alpha: float = None, fit_intercept: bool = False,
                    **kwargs) -> Dict[str, Any]:
    """
    Returns the solution `a` to the linear problem `Xa=y` obtained by using
    the ElasticNet method as implemented in scikit-learn in the
    form of a dictionary with a key named `parameters`.

    If `alpha` is `None` this function will call `_fit_elasticnetCV`, which
    attempts to find the optimal alpha via sklearn's `ElasticNetCV` class.

    Parameters
    ----------
    X
        fit matrix
    y
        target array
    alpha
        alpha value
    fit_intercept
        center data or not, forwarded to sklearn
    """
    if alpha is None:
        return _fit_elasticnetCV(X, y, fit_intercept=fit_intercept, **kwargs)
    else:
        elasticnet = ElasticNet(alpha=alpha, fit_intercept=fit_intercept,
                                **kwargs)
        elasticnet.fit(X, y)
        results = dict()
        results['parameters'] = elasticnet.coef_
        return results



def _fit_elasticnetCV(X: np.ndarray,
                      y: np.ndarray,
                      alphas: List[float] = None,
                      l1_ratio: Union[float, List[float]] = None,
                      fit_intercept: bool = False,
                      cv: int = 10,
                      n_jobs: int = -1,
                      **kwargs) -> Dict[str, Any]:
    """
    Returns the solution `a` to the linear problem `Xa=y` obtained by using
    the ElasticNetCV method as implemented in scikit-learn in the
    form of a dictionary with a key named `parameters`.

    The dictionary returned by this function will also contain the
    fields `alpha_optimal` (the alpha value that yields the lowest
    validation RMSE) and `l1_ratio_optimal` (the l1_ratio value that
    yields the lowest validation RMSE).

    Parameters
    ----------
    X
        fit matrix
    y
        target array
    alphas
        list of alpha values to be evaluated during regularization path
    l1_ratio
        l1_ratio values to be evaluated during regularization path
    fit_intercept
        center data or not, forwarded to sklearn
    cv
        how many folds to carry out in cross-validation
    n_jobs
        number of cores to use during the cross validation.
        None means 1 unless in a joblib.parallel_backend context.
        -1 means using all processors.
        See sklearn's glossary for more details.
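
    Example
    -------
    A minimal sketch with a single (illustrative) l1_ratio::

        results = _fit_elasticnetCV(X, y, l1_ratio=0.9)
        optimal = (results['alpha_optimal'], results['l1_ratio_optimal'])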

314 """ 

315 

316 if alphas is None: 316 ↛ 318line 316 didn't jump to line 318, because the condition on line 316 was never false

317 alphas = np.logspace(-8, -0.3, 100) 

318 if l1_ratio is None: 318 ↛ 322line 318 didn't jump to line 322, because the condition on line 318 was never false

319 l1_ratio = [1.0, 0.995, 0.99, 0.98, 0.97, 0.95, 0.925, 0.9, 0.85, 

320 0.8, 0.75, 0.65, 0.5, 0.4, 0.25, 0.1] 

321 

322 elasticnetCV = ElasticNetCV(alphas=alphas, l1_ratio=l1_ratio, cv=cv, 

323 fit_intercept=fit_intercept, n_jobs=n_jobs, **kwargs) 

324 elasticnetCV.fit(X, y) 

325 results = dict() 

326 results['parameters'] = elasticnetCV.coef_ 

327 results['alpha_optimal'] = elasticnetCV.alpha_ 

328 results['l1_ratio_optimal'] = elasticnetCV.l1_ratio_ 

329 return results 



def _fit_ardr(X: np.ndarray,
              y: np.ndarray,
              threshold_lambda: float = None,
              line_scan: bool = False,
              fit_intercept: bool = False,
              **kwargs) -> Dict[str, Any]:
    """
    Returns the solution `a` to the linear problem `Xa=y` obtained by
    using the automatic relevance determination regression (ARDR)
    method as implemented in scikit-learn in the form of a dictionary
    with a key named `parameters`.

    Parameters
    ----------
    X
        fit matrix
    y
        target array
    threshold_lambda
        threshold lambda parameter forwarded to sklearn
    line_scan
        whether or not to perform a line-scan in order to find the
        optimal threshold-lambda
    fit_intercept
        center data or not, forwarded to sklearn
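
    Example
    -------
    A minimal sketch with an explicit (illustrative) threshold-lambda;
    with `line_scan=True` the value is instead determined by
    cross-validation::

        results = _fit_ardr(X, y, threshold_lambda=1e4)
        parameters = results['parameters']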

357 """ 

358 

359 if threshold_lambda is not None and line_scan: 359 ↛ 360line 359 didn't jump to line 360, because the condition on line 359 was never true

360 raise ValueError('Specify threshold_lambda or set line_scan=True, not both') 

361 

362 if threshold_lambda is None: 

363 threshold_lambda = 1e4 

364 

365 if line_scan: 

366 return _fit_ardr_line_scan(X, y, fit_intercept=fit_intercept, **kwargs) 

367 else: 

368 ardr = ARDRegression(threshold_lambda=threshold_lambda, fit_intercept=fit_intercept, 

369 **kwargs) 

370 ardr.fit(X, y) 

371 results = dict() 

372 results['parameters'] = ardr.coef_ 

373 return results 



def _fit_ardr_line_scan(X: np.ndarray,
                        y: np.ndarray,
                        cv_splits: int = 1,
                        threshold_lambdas: List[float] = None,
                        **kwargs) -> Dict[str, Any]:
    """ ARDR with line-scan for the optimal threshold-lambda.

    Parameters
    ----------
    X
        fit matrix
    y
        target array
    cv_splits
        how many CV splits to carry out when evaluating each lambda value
    threshold_lambdas
        list of threshold-lambda values to be evaluated. The optimal
        lambda value found will be used in the final fit.
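
    Example
    -------
    A minimal sketch scanning a custom (illustrative) lambda grid::

        results = _fit_ardr_line_scan(X, y, cv_splits=5,
                                      threshold_lambdas=[1e3, 1e4, 1e5])
        best_lambda = results['threshold_lambda_optimal']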

394 """ 

395 

396 from .cross_validation import CrossValidationEstimator 

397 

398 # default lambda values to scan 

399 if threshold_lambdas is None: 

400 threshold_lambdas = np.logspace(3, 6, 15) 

401 

402 # run lin-scan of lambda values 

403 cv_data = [] 

404 for lam in threshold_lambdas: 

405 cve = CrossValidationEstimator((X, y), fit_method='ardr', validation_method='shuffle-split', 

406 threshold_lambda=lam, test_size=0.1, train_size=0.9, 

407 n_splits=cv_splits, **kwargs) 

408 cve.validate() 

409 cv_data.append([lam, cve.rmse_validation]) 

410 

411 # select best lambda 

412 cv_data = np.array(cv_data) 

413 optimal_ind = cv_data[:, 1].argmin() 

414 lambda_optimal = cv_data[optimal_ind, 0] 

415 

416 # final fit with optimal lambda 

417 results = _fit_ardr(X, y, threshold_lambda=lambda_optimal, **kwargs) 

418 results['threshold_lambda_optimal'] = lambda_optimal 

419 return results 



class _Estimator:
    """ Estimator class making it possible to use all fit methods
    as estimators in RFE """
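
    # A minimal sketch of the estimator interface that sklearn's RFE
    # relies on: fit() populating coef_, plus predict() and get_params();
    # e.g. _Estimator('least-squares') is instantiated in _fit_rfe below.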


    def __init__(self, fit_method, **kwargs):
        if fit_method == 'rfe':
            raise ValueError('rfe cannot be used as estimator within rfe')
        self.fit_method = fit_method
        self.kwargs = kwargs
        self.coef_ = None

    def fit(self, X, y):
        fit_func = fit_methods[self.fit_method]
        results = fit_func(X, y, **self.kwargs)
        self.coef_ = results['parameters']

    def get_params(self, deep=True):
        params = {k: v for k, v in self.kwargs.items()}
        params['fit_method'] = self.fit_method
        return params

    def predict(self, A):
        return np.dot(A, self.coef_)

    def _get_tags(self):
        from sklearn.base import _DEFAULT_TAGS
        return _DEFAULT_TAGS



def _fit_rfe(X: np.ndarray,
             y: np.ndarray,
             n_features: int = None,
             step: Union[int, float] = 0.04,
             estimator: str = 'least-squares',
             final_estimator: str = None,
             estimator_kwargs: dict = {},
             final_estimator_kwargs: dict = {},
             cv_splits: int = 5,
             n_jobs: int = -1,
             **rfe_kwargs) -> Dict[str, Any]:
    """
    Returns the solution `a` to the linear problem `Xa=y` obtained by
    recursive feature elimination (RFE).

    Parameters
    ----------
    X
        fit matrix
    y
        target array
    n_features
        number of features to select; if None sklearn.feature_selection.RFECV
        will be used to determine the optimal number of features
    step
        if given as an integer, corresponds to the number of parameters to
        eliminate in each iteration; if given as a float, corresponds to
        the fraction of parameters to remove in each iteration
    estimator
        fit method to use during the RFE algorithm
    final_estimator
        fit method to be used in the final fit;
        if None will default to whichever estimator is being used
    estimator_kwargs
        keyword arguments forwarded to the estimator
    final_estimator_kwargs
        keyword arguments forwarded to the final estimator
    cv_splits
        number of CV splits to carry out if finding the optimal n_features
    n_jobs
        number of cores to use during the cross validation.
        -1 means using all processors.
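
    Example
    -------
    A minimal sketch selecting a fixed (illustrative) number of features
    with least-squares as both the RFE estimator and the final estimator::

        results = _fit_rfe(X, y, n_features=20)
        selected = results['features']    # boolean mask over columns of X
        parameters = results['parameters']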

489 """ 

490 

491 # handle kwargs 

492 if final_estimator is None: 

493 final_estimator = estimator 

494 if len(final_estimator_kwargs) == 0: 494 ↛ 497line 494 didn't jump to line 497, because the condition on line 494 was never false

495 final_estimator_kwargs = estimator_kwargs 

496 

497 estimator_obj = _Estimator(estimator, **estimator_kwargs) 

498 if n_features is None: 

499 if 'scoring' not in rfe_kwargs: 499 ↛ 501line 499 didn't jump to line 501, because the condition on line 499 was never false

500 rfe_kwargs['scoring'] = 'neg_mean_squared_error' 

501 cv = ShuffleSplit(train_size=0.9, test_size=0.1, n_splits=cv_splits) 

502 rfe = RFECV(estimator_obj, step=step, cv=cv, n_jobs=n_jobs, **rfe_kwargs) 

503 else: 

504 rfe = RFE(estimator_obj, n_features_to_select=n_features, step=step, **rfe_kwargs) 

505 

506 # Carry out RFE 

507 rfe.fit(X, y) 

508 features = rfe.support_ 

509 ranking = rfe.ranking_ 

510 

511 # carry out final fit 

512 n_params = X.shape[1] 

513 results = fit(X[:, features], y, fit_method=final_estimator, **final_estimator_kwargs) 

514 params = np.zeros(n_params) 

515 params[features] = results['parameters'] 

516 results['parameters'] = params 

517 

518 # finish up 

519 results['features'] = features 

520 results['ranking'] = ranking 

521 return results 



fit_methods = OrderedDict([
    ('least-squares', _fit_least_squares),
    ('lasso', _fit_lasso),
    ('ridge', _fit_ridge),
    ('bayesian-ridge', _fit_bayesian_ridge),
    ('elasticnet', _fit_elasticnet),
    ('split-bregman', fit_split_bregman),
    ('ardr', _fit_ardr),
    ('rfe', _fit_rfe),
    ])
available_fit_methods = sorted(fit_methods.keys())