-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathGA_ensemble.py
483 lines (360 loc) · 17.3 KB
/
GA_ensemble.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
# -*- coding: utf-8 -*-
"""GA_ensemble.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1ZWFMrt_ST5ZMGQJ_I8TNWXene37q0-B8
"""
!pip install tpot
!pip install deap
!pip install auto-sklearn
#@title private funcs
def get_models_from_tpot(tpot):
    """Collect every successfully evaluated TPOT pipeline as a named estimator.

    Parameters
    ----------
    tpot : fitted TPOT object
        Must expose ``evaluated_individuals_`` plus the private ``_pset`` and
        ``_toolbox`` attributes that are read here.

    Returns
    -------
    list of (str, estimator)
        One ``("<FinalStepClassName>_<index>", pipeline)`` tuple per
        individual whose ``internal_cv_score`` is not ``-inf``. The index is
        the position of the tuple in the returned list.
    """
    collected = []
    for expr_string, info in tpot.evaluated_individuals_.items():
        # Individuals scored -inf are skipped entirely.
        if info['internal_cv_score'] == float('-inf'):
            continue
        individual = creator.Individual.from_string(expr_string, tpot._pset)
        pipeline = tpot._toolbox.compile(expr=individual)
        final_step_type = pipeline.steps[-1][1].__class__.__name__
        # Pin random_state on every step that accepts one.
        set_param_recursive(pipeline.steps, 'random_state', 42)
        collected.append((final_step_type + '_' + str(len(collected)), pipeline))
    return collected
def get_models_from_autosklearn(automl):
    """Collect the scikit-learn estimators found by a fitted auto-sklearn run.

    Parameters
    ----------
    automl : fitted auto-sklearn estimator
        Must provide ``show_models()`` returning a mapping of model id to a
        dict that holds the estimator under ``'sklearn_classifier'`` or
        ``'sklearn_regressor'``.

    Returns
    -------
    list of (str, estimator)
        One ``("<EstimatorName>_<index>", estimator)`` tuple per model. When
        the estimator exposes ``base_estimator_``, that inner estimator is
        returned instead of the wrapper.

    Raises
    ------
    KeyError
        If a model entry holds neither of the two expected keys.
    """
    # show_models() is called once and reused instead of once per model.
    models = automl.show_models()
    result_list = []
    for counter, (model_id, info) in enumerate(models.items()):
        # Classification runs store the estimator under 'sklearn_classifier',
        # regression runs under 'sklearn_regressor'; accept both so that the
        # regression builder can reuse this helper.
        for key in ('sklearn_classifier', 'sklearn_regressor'):
            if key in info:
                model = info[key]
                break
        else:
            raise KeyError(
                "auto-sklearn model %r has neither 'sklearn_classifier' nor "
                "'sklearn_regressor'" % (model_id,)
            )
        if hasattr(model, 'base_estimator_'):
            model = model.base_estimator_
        # The estimator name is the part of its repr before the first '('.
        result_list.append((str(model).split('(')[0] + '_' + str(counter), model))
    return result_list
# Model selection
def select_models(sklearn_models, count=None, isUniqModelsOnly=False):
    """Pick a subset of ``(name, estimator)`` tuples.

    Parameters
    ----------
    sklearn_models : list of (str, estimator)
        Candidate models; each name is expected to look like
        ``"<ModelType>_<index>"``.
    count : int or None, default=None
        Maximum number of models to return. ``None`` means no limit.
    isUniqModelsOnly : bool, default=False
        If True, keep only the first model of every model type, where the
        type is the part of the name before the first underscore.

    Returns
    -------
    list of (str, estimator)
        The selected models in their original order. When no filtering and no
        limit are requested, the input list itself is returned.
    """
    if not isUniqModelsOnly:
        return sklearn_models[:count] if count is not None else sklearn_models

    # A set gives O(1) membership tests instead of rebuilding a list of the
    # already-selected type names for every candidate.
    seen_types = set()
    unique_models = []
    for entry in sklearn_models:
        model_type = entry[0].split('_')[0]
        if model_type not in seen_types:
            seen_types.add(model_type)
            unique_models.append(entry)
    return unique_models[:count] if count is not None else unique_models
def eaSimpleCustom(population, toolbox, cxpb, mutpb, ngen, n_iter_no_change, stats=None,
                   halloffame=None, verbose=__debug__, greater_is_better=True):
    """Run a simple generational evolutionary loop with optional early stopping.

    Each generation selects ``len(population)`` individuals, varies them with
    ``varAnd``, evaluates those whose fitness is invalid and replaces the
    population with the offspring.

    Parameters
    ----------
    population : list
        Individuals to evolve. The list is updated in place.
    toolbox : object
        Must provide ``map``, ``evaluate`` and ``select`` (used here) plus the
        operators required by ``varAnd``.
    cxpb : float
        Crossover probability, forwarded to ``varAnd``.
    mutpb : float
        Mutation probability, forwarded to ``varAnd``.
    ngen : int
        Maximum number of generations.
    n_iter_no_change : int
        Early-stopping window passed to ``earlyStoppingMax`` or
        ``earlyStoppingMin``. A falsy value (0 or None) disables early
        stopping.
    stats : statistics object or None, default=None
        If given, compiled for every generation and recorded in the logbook.
        Early stopping needs it to record a ``'max'`` field (maximising) or a
        ``'min'`` field (minimising); otherwise early stopping is skipped.
    halloffame : hall-of-fame object or None, default=None
        If given, updated with the initial population and every offspring
        generation.
    verbose : bool, default=__debug__
        Print the logbook stream after every generation.
    greater_is_better : bool, default=True
        Selects which early-stopping check is used.

    Returns
    -------
    tuple
        ``(population, logbook)``.
    """
    logbook = tools.Logbook()
    logbook.header = ['gen', 'nevals'] + (stats.fields if stats else [])

    def evaluate_pending(individuals):
        # Only individuals whose fitness was invalidated need evaluation.
        pending = [ind for ind in individuals if not ind.fitness.valid]
        for ind, fit in zip(pending, toolbox.map(toolbox.evaluate, pending)):
            ind.fitness.values = fit
        return len(pending)

    def log_generation(gen, nevals):
        record = stats.compile(population) if stats else {}
        logbook.record(gen=gen, nevals=nevals, **record)
        if verbose:
            print(logbook.stream)

    # Without the tracked statistic the logbook holds only None for it, and
    # the early-stopping helpers could not compare the values.
    tracked_field = 'max' if greater_is_better else 'min'
    should_stop = earlyStoppingMax if greater_is_better else earlyStoppingMin
    early_stopping_enabled = (
        bool(n_iter_no_change)
        and stats is not None
        and tracked_field in stats.fields
    )

    nevals = evaluate_pending(population)
    if halloffame is not None:
        halloffame.update(population)
    log_generation(0, nevals)

    for gen in range(1, ngen + 1):
        # Select the next generation, then apply crossover and mutation.
        offspring = toolbox.select(population, len(population))
        offspring = varAnd(offspring, toolbox, cxpb, mutpb)

        nevals = evaluate_pending(offspring)
        if halloffame is not None:
            halloffame.update(offspring)

        # Slice assignment keeps the caller's list object.
        population[:] = offspring
        log_generation(gen, nevals)

        if early_stopping_enabled and should_stop(logbook, n_iter_no_change):
            break

    return population, logbook
def earlyStoppingMax(logbook, n_iter_no_change):
    """Tell whether a maximising run has stopped improving.

    Reads the ``'max'`` column of ``logbook``. Returns True when the first
    occurrence of the overall best value lies before the last
    ``n_iter_no_change`` entries, and False when fewer than
    ``n_iter_no_change`` entries exist or the best value falls inside that
    trailing window.
    """
    _, history = logbook.select('gen', 'max')
    total = len(history)
    if total < n_iter_no_change:
        return False
    first_best = history.index(max(history))
    # False means the algorithm should keep running.
    return first_best < total - n_iter_no_change
def earlyStoppingMin(logbook, n_iter_no_change):
    """Tell whether a minimising run has stopped improving.

    Reads the ``'min'`` column of ``logbook``. Returns True when the first
    occurrence of the overall lowest value lies before the last
    ``n_iter_no_change`` entries, and False when fewer than
    ``n_iter_no_change`` entries exist or the lowest value falls inside that
    trailing window.
    """
    _, history = logbook.select('gen', 'min')
    total = len(history)
    if total < n_iter_no_change:
        return False
    first_best = history.index(min(history))
    # False means the algorithm should keep running.
    return first_best < total - n_iter_no_change
def varAnd(population, toolbox, cxpb, mutpb):
    """Return a varied copy of ``population`` (crossover, then mutation).

    Every individual is cloned with ``toolbox.clone``. Adjacent pairs
    (0-1, 2-3, ...) are mated with probability ``cxpb``; afterwards each
    individual is mutated with probability ``mutpb``. The fitness values of
    every individual touched by an operator are deleted.
    """
    children = list(map(toolbox.clone, population))

    # Crossover over consecutive pairs; a trailing odd individual is left out.
    for right in range(1, len(children), 2):
        left = right - 1
        if random.random() >= cxpb:
            continue
        children[left], children[right] = toolbox.mate(children[left], children[right])
        del children[left].fitness.values, children[right].fitness.values

    # Mutation; toolbox.mutate is expected to return a one-element tuple.
    for idx in range(len(children)):
        if random.random() < mutpb:
            (mutant,) = toolbox.mutate(children[idx])
            children[idx] = mutant
            del mutant.fitness.values

    return children
#@title imports
import os
import random

import autosklearn.classification
import autosklearn.regression
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.metrics
from deap import base, creator, tools, algorithms
from sklearn.ensemble import StackingClassifier, StackingRegressor
from sklearn.ensemble import VotingClassifier, VotingRegressor
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score, r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from tpot import TPOTClassifier, TPOTRegressor
from tpot.export_utils import generate_pipeline_code, expr_to_tree, set_param_recursive
def gen_ensemble_builder_clf(X_train, X_test, y_train, y_test,
                             scoring=accuracy_score, greater_is_better=True,
                             use_tpot=True, use_autosklearn=True,
                             TPOT_object=None, autosklearn_object=None,
                             use_unique_type_models=False, models_count=None,
                             population_size=50, cxpb=0.6, mutpb=0.3,
                             indpb=0.4, ngen=500, verbose=False,
                             cx_operator=tools.cxTwoPoint, mut_operator=tools.mutUniformInt,
                             n_iter_no_change=20,
                             sel_operator=lambda individuals, k: tools.selTournament(individuals, k, tournsize=3)):
    """Build a soft-voting classifier ensemble selected by a genetic algorithm.

    TPOT and/or auto-sklearn are fitted on the training data to produce
    candidate models. A genetic algorithm then searches over binary masks
    (one bit per candidate); each mask is scored by fitting a
    ``VotingClassifier`` of the selected candidates on the training data and
    evaluating ``scoring`` on the test data.

    Parameters
    ----------
    X_train, y_train : array-like
        Data used to fit the AutoML tools, every candidate ensemble and the
        final ensemble.
    X_test, y_test : array-like
        Hold-out data used only to score candidate ensembles.
    scoring : callable, default=accuracy_score
        Metric called as ``scoring(y_true, y_pred)`` and returning a number.
    greater_is_better : bool, default=True
        Whether ``scoring`` is maximised (True) or minimised (False).
    use_tpot : bool, default=True
        Whether to use TPOT as a source of candidate models.
    use_autosklearn : bool, default=True
        Whether to use auto-sklearn as a source of candidate models.
        At least one of ``use_tpot`` / ``use_autosklearn`` must be True.
    TPOT_object : TPOTClassifier or None, default=None
        TPOT instance to fit; a default ``TPOTClassifier()`` is created when
        None.
    autosklearn_object : AutoSklearnClassifier or None, default=None
        auto-sklearn instance to fit; a default one is created when None.
    use_unique_type_models : bool, default=False
        Keep only the first candidate of every model type.
    models_count : int or None, default=None
        Maximum number of candidates; None keeps all of them.
    population_size : int, default=50
        Number of individuals in the population.
    cxpb : float, default=0.6
        Crossover probability for each pair of individuals.
    mutpb : float, default=0.3
        Mutation probability for each individual.
    indpb : float, default=0.4
        Per-attribute mutation probability passed to ``mut_operator``.
    ngen : int, default=500
        Maximum number of generations.
    verbose : bool, default=False
        Print progress messages.
    cx_operator : callable, default=tools.cxTwoPoint
        Crossover operator.
    mut_operator : callable, default=tools.mutUniformInt
        Mutation operator; it is registered with ``low=0, up=1, indpb=indpb``
        and must accept those keyword arguments.
    n_iter_no_change : int, default=20
        Early-stopping window in generations; 0 disables early stopping.
    sel_operator : callable, default=tournament selection of size 3
        Selection operator called as ``sel_operator(individuals, k)``.

    Returns
    -------
    dict
        ``"result ensemble"``: the fitted ``VotingClassifier``;
        ``"TPOT"``: the TPOT object (None if unused and not supplied);
        ``"autosklearn"``: the auto-sklearn object (None if unused and not
        supplied).

    Raises
    ------
    ValueError
        If both AutoML tools are disabled, if no candidate models are
        available, or if the best individual selects no model.
    TypeError
        If ``TPOT_object`` or ``autosklearn_object`` has the wrong type.

    Notes
    -----
    Soft voting requires every selected candidate to support
    ``predict_proba``.
    """
    if verbose:
        print('i am working!')

    if not use_tpot and not use_autosklearn:
        raise ValueError("At least one of use_tpot or use_autosklearn must be True.")
    if TPOT_object is not None and not isinstance(TPOT_object, TPOTClassifier):
        raise TypeError("TPOT_object should be of type TPOTClassifier")
    if autosklearn_object is not None and not isinstance(
            autosklearn_object, autosklearn.classification.AutoSklearnClassifier):
        raise TypeError("autosklearn_object should be of type AutoSklearnClassifier")

    # Gather candidate models from the enabled AutoML tools.
    base_models = []
    if use_tpot:
        if TPOT_object is None:
            TPOT_object = TPOTClassifier()
        TPOT_object.fit(X_train, y_train)
        base_models += get_models_from_tpot(TPOT_object)
    if use_autosklearn:
        if autosklearn_object is None:
            autosklearn_object = autosklearn.classification.AutoSklearnClassifier()
        autosklearn_object.fit(X_train, y_train)
        base_models += get_models_from_autosklearn(autosklearn_object)
    base_models = select_models(base_models, models_count, use_unique_type_models)
    if not base_models:
        raise ValueError("No base models are available to build an ensemble from.")

    # An empty selection must never win, whichever direction is optimised.
    worst_fitness = float('-inf') if greater_is_better else float('inf')

    def selected_estimators(individual):
        return [model for flag, model in zip(individual, base_models) if flag == 1]

    def fitness_func(individual):
        estimators = selected_estimators(individual)
        if not estimators:
            return (worst_fitness,)
        ensemble = VotingClassifier(estimators=estimators, voting='soft')
        ensemble.fit(X_train, y_train)
        # The raw metric is used; the fitness weights encode the direction.
        return (scoring(y_test, ensemble.predict(X_test)),)

    # Dedicated creator class names: TPOT stores its own class as
    # creator.Individual, which get_models_from_tpot relies on, so it must
    # not be overwritten here.
    direction = 'Max' if greater_is_better else 'Min'
    fitness_name = 'GAEnsembleFitness' + direction
    individual_name = 'GAEnsembleIndividual' + direction
    if not hasattr(creator, fitness_name):
        creator.create(fitness_name, base.Fitness,
                       weights=(1.0 if greater_is_better else -1.0,))
    if not hasattr(creator, individual_name):
        creator.create(individual_name, list, fitness=getattr(creator, fitness_name))
    individual_cls = getattr(creator, individual_name)

    hof = tools.HallOfFame(1)
    stats = tools.Statistics(lambda ind: ind.fitness.values[0])
    stats.register("avg", np.mean)
    stats.register("min", np.min)
    stats.register("max", np.max)

    toolbox = base.Toolbox()
    # Each gene is 0 or 1: whether the matching base model is in the ensemble.
    toolbox.register("model_selector", random.randrange, 2)
    toolbox.register("individual", tools.initRepeat, individual_cls,
                     toolbox.model_selector, len(base_models))
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)
    toolbox.register("evaluate", fitness_func)
    toolbox.register("mate", cx_operator)
    toolbox.register("mutate", mut_operator, low=0, up=1, indpb=indpb)
    toolbox.register("select", sel_operator)

    population = toolbox.population(n=population_size)
    eaSimpleCustom(population, toolbox, cxpb=cxpb, mutpb=mutpb, ngen=ngen,
                   n_iter_no_change=n_iter_no_change, verbose=verbose,
                   halloffame=hof, stats=stats,
                   greater_is_better=greater_is_better)

    estimators = selected_estimators(hof[0])
    if not estimators:
        raise ValueError("The genetic algorithm did not select any model.")
    # The final ensemble is fitted on the training data, not the hold-out set.
    result_ensemble = VotingClassifier(estimators=estimators, voting='soft').fit(X_train, y_train)

    return {
        "result ensemble": result_ensemble,
        "TPOT": TPOT_object,
        "autosklearn": autosklearn_object
    }
def gen_ensemble_builder_regr(X_train, X_test, y_train, y_test,
                              scoring=r2_score, greater_is_better=True,
                              use_tpot=True, use_autosklearn=True,
                              TPOT_object=None, autosklearn_object=None,
                              use_unique_type_models=False, models_count=None,
                              population_size=50, cxpb=0.6, mutpb=0.5,
                              indpb=0.5, ngen=500, verbose=False,
                              cx_operator=tools.cxTwoPoint, mut_operator=tools.mutUniformInt,
                              n_iter_no_change=20,
                              sel_operator=lambda individuals, k: tools.selTournament(individuals, k, tournsize=3)):
    """Build a voting regressor ensemble selected by a genetic algorithm.

    TPOT and/or auto-sklearn are fitted on the training data to produce
    candidate models. A genetic algorithm then searches over binary masks
    (one bit per candidate); each mask is scored by fitting a
    ``VotingRegressor`` of the selected candidates on the training data and
    evaluating ``scoring`` on the test data.

    Parameters
    ----------
    X_train : array-like or sparse matrix of shape = [n_samples, n_features]
        The training input samples.
    X_test : array-like or sparse matrix of shape = [n_samples, n_features]
        Hold-out input samples used only to score candidate ensembles.
    y_train : array-like, shape = [n_samples] or [n_samples, n_outputs]
        The target values for training.
    y_test : array-like, shape = [n_samples] or [n_samples, n_outputs]
        Hold-out target values used only to score candidate ensembles.
    scoring : callable, default=r2_score
        Metric called as ``scoring(y_true, y_pred)`` and returning a float.
    greater_is_better : bool, default=True
        Whether the scoring function is maximised (True) or minimised (False).
    use_tpot : bool, default=True
        Whether to use TPOT for model optimization.
        NOTE: at least one of use_tpot or use_autosklearn must be True.
    use_autosklearn : bool, default=True
        Whether to use auto-sklearn for model optimization.
        NOTE: at least one of use_tpot or use_autosklearn must be True.
    TPOT_object : TPOTRegressor object, default=None
        A TPOTRegressor object to use for TPOT optimization. If None, a new
        TPOTRegressor will be created.
    autosklearn_object : AutoSklearnRegressor object, default=None
        An AutoSklearnRegressor object to use for auto-sklearn optimization.
        If None, a new AutoSklearnRegressor will be created.
    use_unique_type_models : bool, default=False
        Whether to keep only the first candidate of every model type.
    models_count : int, default=None
        Maximum number of candidate models. If None, all models are kept.
    population_size : int, default=50
        The number of individuals in the population.
    cxpb : float, default=0.6
        The probability of crossover for each pair of individuals.
    mutpb : float, default=0.5
        The probability of mutation for each individual.
    indpb : float, default=0.5
        The probability of each attribute to mutate for each individual.
    ngen : int, default=500
        The maximum number of generations.
    verbose : bool, default=False
        Whether to print progress messages.
    cx_operator : function, default=tools.cxTwoPoint
        The crossover operator to use.
    mut_operator : function, default=tools.mutUniformInt
        The mutation operator to use; it is registered with
        ``low=0, up=1, indpb=indpb`` and must accept those keyword arguments.
    n_iter_no_change : int, default=20
        Number of generations without improvement after which the search
        stops. Set to 0 to run all generations.
    sel_operator : function, default=tournament selection of size 3
        Selection operator called as ``sel_operator(individuals, k)``.

    Returns
    -------
    result : dict
        A dictionary containing the following items:
        - "result ensemble": the fitted ``VotingRegressor``.
        - "TPOT": the TPOTRegressor object (None if unused and not supplied).
        - "autosklearn": the AutoSklearnRegressor object (None if unused and
          not supplied).

    Raises
    ------
    ValueError
        If both AutoML tools are disabled, if no candidate models are
        available, or if the best individual selects no model.
    TypeError
        If ``TPOT_object`` or ``autosklearn_object`` has the wrong type.
    """
    if not use_tpot and not use_autosklearn:
        raise ValueError("At least one of use_tpot or use_autosklearn must be True.")
    if TPOT_object is not None and not isinstance(TPOT_object, TPOTRegressor):
        raise TypeError("TPOT_object should be of type TPOTRegressor")
    if autosklearn_object is not None and not isinstance(
            autosklearn_object, autosklearn.regression.AutoSklearnRegressor):
        raise TypeError("autosklearn_object should be of type AutoSklearnRegressor")

    # Gather candidate models from the enabled AutoML tools.
    base_models = []
    if use_tpot:
        if TPOT_object is None:
            TPOT_object = TPOTRegressor()
        TPOT_object.fit(X_train, y_train)
        base_models += get_models_from_tpot(TPOT_object)
    if use_autosklearn:
        if autosklearn_object is None:
            autosklearn_object = autosklearn.regression.AutoSklearnRegressor()
        autosklearn_object.fit(X_train, y_train)
        base_models += get_models_from_autosklearn(autosklearn_object)
    base_models = select_models(base_models, models_count, use_unique_type_models)
    if not base_models:
        raise ValueError("No base models are available to build an ensemble from.")

    # An empty selection must never win, whichever direction is optimised.
    worst_fitness = float('-inf') if greater_is_better else float('inf')

    def selected_estimators(individual):
        return [model for flag, model in zip(individual, base_models) if flag == 1]

    def fitness_func(individual):
        estimators = selected_estimators(individual)
        if not estimators:
            return (worst_fitness,)
        ensemble = VotingRegressor(estimators=estimators)
        ensemble.fit(X_train, y_train)
        # The raw metric is used; the fitness weights encode the direction.
        return (scoring(y_test, ensemble.predict(X_test)),)

    # Dedicated creator class names: TPOT stores its own class as
    # creator.Individual, which get_models_from_tpot relies on, so it must
    # not be overwritten here.
    direction = 'Max' if greater_is_better else 'Min'
    fitness_name = 'GAEnsembleFitness' + direction
    individual_name = 'GAEnsembleIndividual' + direction
    if not hasattr(creator, fitness_name):
        creator.create(fitness_name, base.Fitness,
                       weights=(1.0 if greater_is_better else -1.0,))
    if not hasattr(creator, individual_name):
        creator.create(individual_name, list, fitness=getattr(creator, fitness_name))
    individual_cls = getattr(creator, individual_name)

    hof = tools.HallOfFame(1)
    stats = tools.Statistics(lambda ind: ind.fitness.values[0])
    stats.register("avg", np.mean)
    stats.register("min", np.min)
    stats.register("max", np.max)

    toolbox = base.Toolbox()
    # Each gene is 0 or 1: whether the matching base model is in the ensemble.
    toolbox.register("model_selector", random.randrange, 2)
    toolbox.register("individual", tools.initRepeat, individual_cls,
                     toolbox.model_selector, len(base_models))
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)
    toolbox.register("evaluate", fitness_func)
    toolbox.register("mate", cx_operator)
    toolbox.register("mutate", mut_operator, low=0, up=1, indpb=indpb)
    toolbox.register("select", sel_operator)

    population = toolbox.population(n=population_size)
    eaSimpleCustom(population, toolbox, cxpb=cxpb, mutpb=mutpb, ngen=ngen,
                   n_iter_no_change=n_iter_no_change, verbose=verbose,
                   halloffame=hof, stats=stats,
                   greater_is_better=greater_is_better)

    estimators = selected_estimators(hof[0])
    if not estimators:
        raise ValueError("The genetic algorithm did not select any model.")
    # The final ensemble is fitted on the training data, not the hold-out set.
    result_ensemble = VotingRegressor(estimators=estimators).fit(X_train, y_train)

    return {
        "result ensemble": result_ensemble,
        "TPOT": TPOT_object,
        "autosklearn": autosklearn_object
    }