# Predicting the median price of homes in a given Boston suburb in the mid-1970s, given certain data points.
# Data:
# * 506 data points, split into 404 training samples and 102 test samples.
# * Features have different scales.
# * Prices are in USD thousands.
import keras
import numpy as np
from loguru import logger
import matplotlib.pyplot as plt


def load_data():
    """
    Load the Keras "Boston Housing" dataset.
    Columns (features):
    1. Per capita crime rate.
    2. Proportion of residential land zoned for lots over 25,000 square feet.
    3. Proportion of non-retail business acres per town.
    4. Charles River dummy variable (= 1 if tract bounds river; 0 otherwise).
    5. Nitric oxides concentration (parts per 10 million).
    6. Average number of rooms per dwelling.
    7. Proportion of owner-occupied units built prior to 1940.
    8. Weighted distances to five Boston employment centres.
    9. Index of accessibility to radial highways.
    10. Full-value property-tax rate per $10,000.
    11. Pupil-teacher ratio by town.
    12. 1000 * (Bk - 0.63) ** 2 where Bk is the proportion of Black people by town.
    13. % lower status of the population.
    :return: Training data, training targets, test data, test targets.
    """
    (train_d, train_t), (test_d, test_t) = keras.datasets.boston_housing.load_data()
    logger.info(f'Shape of training data: {train_d.shape}')
    return train_d, train_t, test_d, test_t
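

# Usage sketch (illustrative; not executed at import time): the standard Keras
# split yields 404 training and 102 test samples of the 13 features listed above:
#   train_d, train_t, test_d, test_t = load_data()
#   crime_rate = train_d[:, 0]  # column 0 holds the per capita crime rate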


def normalize_data(train_d, test_d):
    """
    Feature-wise normalization: subtract the mean and divide by the standard deviation.
    :param train_d: Training data; normalized in place.
    :param test_d: Test data; normalized in place using the training statistics.
    :return: None.
    """
    # Gives a mean of 0 and a standard deviation of 1 per feature.
    # Mean and standard deviation come from the training data set only,
    # so no information from the test set leaks into training.
    mean = train_d.mean(axis=0)
    std = train_d.std(axis=0)
    train_d -= mean
    train_d /= std
    test_d -= mean
    test_d /= std
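

# Sanity-check sketch (illustrative): after normalize_data(train_d, test_d),
# each training feature should have mean ~0 and standard deviation ~1:
#   normalize_data(train_d, test_d)
#   assert np.allclose(train_d.mean(axis=0), 0.0, atol=1e-6)
#   assert np.allclose(train_d.std(axis=0), 1.0, atol=1e-6)
# Test features will only be approximately centered, since the statistics
# come from the training set.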


def build_model(input_dim: int):
    """
    Define the model with layers and functions.
    :param input_dim: Number of input features.
    :return: Compiled Keras model.
    """
    # Build network.
    # Small data set --> we use a small network so we don't overfit too much.
    # Scalar regression with an unconstrained output activation
    # (the network can learn to predict values in any range).
    model = keras.models.Sequential()
    # Two hidden layers.
    model.add(keras.layers.Dense(64, activation='relu', input_shape=(input_dim,)))
    model.add(keras.layers.Dense(64, activation='relu'))
    # Single output (scalar), no activation function.
    model.add(keras.layers.Dense(1))
    # Configure the learning process.
    # Optimizer: rmsprop (divides the gradient by a running average of its recent magnitude).
    # Loss function: mse (mean squared error).
    # Metric: mae (mean absolute error), the absolute value of the difference
    # between the predictions and the targets.
    # Example: MAE = 0.5 means that predictions are off by 0.5 * 1000 = 500 USD on average.
    model.compile(optimizer='rmsprop', loss='mse', metrics=['mae'])
    return model
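

# Usage sketch (illustrative): for the 13 Boston Housing features,
#   model = build_model(13)
#   model.summary()  # prints the two 64-unit hidden layers and the scalar output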


def k_fold_cross_valid(k: int, num_epochs: int, batch_size: int, train_data, train_targets):
    """
    Train the model with K-fold cross-validation.
    We have a small sample, so splitting the data into fixed training and
    validation sets is not optimal. We use K-fold cross-validation instead:
    * Split the data into K partitions.
    * Instantiate K identical models and train each on K-1 partitions.
    * Evaluate on the remaining partition.
    :param k: Number of partitions.
    :param num_epochs: Number of epochs to train the model.
    :param batch_size: Number of samples per gradient update.
    :param train_data: Our training data (already normalized).
    :param train_targets: Targets for the training data.
    :return: Average validation MAE per epoch, for plotting.
    """
    num_val_samples = len(train_data) // k
    all_mae_histories = []
    for i in range(k):
        logger.info(f'Processing fold #{i}')
        # Validation data for partition i.
        valid_data = train_data[i * num_val_samples: (i + 1) * num_val_samples]
        valid_targets = train_targets[i * num_val_samples: (i + 1) * num_val_samples]
        # Training data: everything except partition i.
        partial_train_data = np.concatenate(
            [train_data[:i * num_val_samples],
             train_data[(i + 1) * num_val_samples:]],
            axis=0)
        partial_train_targets = np.concatenate(
            [train_targets[:i * num_val_samples],
             train_targets[(i + 1) * num_val_samples:]],
            axis=0)
        model = build_model(train_data.shape[1])
        # Train the model silently (verbose=0).
        history = model.fit(partial_train_data, partial_train_targets,
                            validation_data=(valid_data, valid_targets),
                            epochs=num_epochs, batch_size=batch_size, verbose=0)
        # Recent Keras versions record the metric as 'val_mae';
        # older versions used 'val_mean_absolute_error'.
        mae_history = history.history.get('val_mae', history.history.get('val_mean_absolute_error'))
        all_mae_histories.append(mae_history)
    # Average the per-epoch validation MAE across all K folds.
    average_mae_history = [np.mean([fold[epoch] for fold in all_mae_histories])
                           for epoch in range(num_epochs)]
    return average_mae_history
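

# Fold arithmetic sketch (illustrative): with k=4 and 404 training samples,
# num_val_samples = 404 // 4 = 101, so fold i=1 validates on samples [101:202]
# and trains on the concatenation of samples [0:101] and [202:404].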


def plot_valid_scores(avg_maes: np.ndarray):
    """
    Plot the average validation MAE for every epoch.
    :param avg_maes: Array of per-epoch MAEs.
    :return: None.
    """
    plt.plot(range(1, len(avg_maes) + 1), avg_maes)
    plt.xlabel('Epochs')
    plt.ylabel('Validation score: Mean Absolute Error (MAE)')
    plt.show()


if __name__ == '__main__':
    train_data, train_targets, test_data, test_targets = load_data()
    # Normalize once, with statistics computed from the training data.
    normalize_data(train_data, test_data)
    k = 4
    num_epochs = 500
    batch_size = 1
    avg_maes = k_fold_cross_valid(k, num_epochs, batch_size, train_data, train_targets)
    plot_valid_scores(avg_maes)
    # Fresh, compiled model, trained on all training data with the settings
    # suggested by the validation curves.
    model = build_model(train_data.shape[1])
    model.fit(train_data, train_targets, epochs=80, batch_size=16, verbose=0)
    test_mse_score, test_mae_score = model.evaluate(test_data, test_targets)
    logger.info(f'MAE score: {test_mae_score}')
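
    # Illustrative follow-up: model.predict returns an array of shape (n, 1)
    # with prices in USD thousands; log the prediction for the first test sample.
    predictions = model.predict(test_data)
    logger.info(f'First predicted price (USD thousands): {predictions[0][0]:.1f}')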