-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlinear_regression_01.py
104 lines (96 loc) · 4.4 KB
/
linear_regression_01.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#!/usr/bin/env python3
#
# Linear Regression for a dataset read from a csv file. We applied closed form
# equations to determine theta1 and theta0 (James et al., 2021). We have two
# datasets: CASF-2016 Ki (de Azevedo et al., 2024) and Amsterdam Apartments
# dataset (Wolf, 2022).
# This code is for linear models as expressed by the following equation:
# y = theta0 + theta1*X, where theta0 is the fit intercept and theta1 the
# coefficient; theta0 and theta1 are the model’s parameters (Géron, 2023).
#
# References
# de Azevedo WF Jr, Quiroga R, Villarreal MA, da Silveira NJF,
# Bitencourt-Ferreira G, da Silva AD, Veit-Acosta M, Oliveira PR, Tutone M,
# Biziukova N, Poroikov V, Tarasova O, Baud S. SAnDReS 2.0: Development of
# machine-learning models to explore the scoring function space. J Comput Chem.
# 2024 Jun 20. doi: 10.1002/jcc.27449. Epub ahead of print. PMID: 38900052.
#
# Géron, Aurélien. 2023. Hands-on Machine Learning with Scikit-Learn, Keras,
# and TensorFlow: Concepts, Tools, and Techniques to Build Intelligent Systems.
# 3rd ed. CA 95472: O’Reilly.
#
# James, G., Witten, D., Hastie, T., & Tibshirani, R. 2021. An introduction to
# statistical learning: With applications in R (2nd ed.). Springer.
#
# Wolf, Andrew. 2022. The Machine Learning Simplified: A Gentle Introduction
# to Supervised Learning. Andrew Wolf. Kindle Edition.
#
################################################################################
# Dr. Walter F. de Azevedo, Jr. #
# https://github.com/azevedolab #
# July 20, 2024 #
################################################################################
#
# Import section
import numpy as np
import reg2D_model
import csv
from scipy.stats import pearsonr, spearmanr
# Read csv file
# CASF-2016 dataset
# de Azevedo WF Jr, Quiroga R, Villarreal MA, da Silveira NJF,
# Bitencourt-Ferreira G, da Silva AD, Veit-Acosta M, Oliveira PR, Tutone M,
# Biziukova N, Poroikov V, Tarasova O, Baud S. SAnDReS 2.0: Development of
# machine-learning models to explore the scoring function space. J Comput Chem.
# 2024 Jun 20. doi: 10.1002/jcc.27449. Epub ahead of print. PMID: 38900052.
file_in = "CASF-2016_Ki_training.csv"
# Amsterdam Apartments dataset (Amsterdam)
# Wolf, Andrew. The Machine Learning Simplified: A Gentle Introduction to
# Supervised Learning. Andrew Wolf. Kindle Edition.
#file_in = "Amsterdam_apartments_training_set.csv"
# Locate the target (y) and feature (X) columns by name in the csv header.
# Only the first row is read; the resulting positions (i_y, i_X) are used
# further down to slice the numeric array loaded from the same file.
# The context manager guarantees the file is closed even when a column name
# is missing and list.index() raises ValueError. newline="" is the mode the
# csv module recommends for files handed to csv.reader.
with open(file_in, "r", newline="") as fo:
    header = next(csv.reader(fo), None)
if header is None:
    # An empty file has no header row; fail here with a clear message.
    raise ValueError("No header row found in csv file: " + file_in)
i_y = header.index("pKi")
i_X = header.index("Gauss 2")
# Column names for the Amsterdam Apartments dataset:
#i_y = header.index("Price")
#i_X = header.index("Area")
# Load the csv body as a float array (the header row is skipped), then pull
# out the target and feature columns at the positions found in the header.
data = np.genfromtxt(file_in, delimiter=",", skip_header=1)
y = data[:, i_y]
X = data[:, i_X]
# Closed-form least-squares estimates for y = theta0 + theta1*X
# (James, G., Witten, D., Hastie, T., & Tibshirani, R. 2021. An introduction
# to statistical learning: With applications in R (2nd ed.). Springer.)
#   theta1 = sum((X - X_bar)*(y - y_bar)) / sum((X - X_bar)**2)
#   theta0 = y_bar - theta1*X_bar
X_bar = np.mean(X)
y_bar = np.mean(y)
dev_X = X - X_bar                  # deviations of the feature from its mean
dev_y = y - y_bar                  # deviations of the target from its mean
theta1 = np.sum(dev_X*dev_y)/np.sum(dev_X**2)
theta0 = y_bar - theta1*X_bar
# Evaluate the fitted line at the smallest and largest observed X only; the
# two resulting points (held as a 2x1 column) are passed to the plotting call
# as the regression line.
x_lo, x_hi = X.min(), X.max()
X_new = np.array([[x_lo], [x_hi]])
y_predict = theta1*X_new + theta0
# Plot the training points and the fitted line with reg2D_model.plotting().
# The legend label shows the fitted equation with 8 decimal places.
theta0_str = "{:.8f}".format(theta0)
theta1_str = "{:.8f}".format(theta1)
# The title must match the dataset selected in file_in. The CASF-2016 file,
# axis labels and axis range are the active ones, so the CASF-2016 title is
# used (the Amsterdam title was previously left active by mistake).
title_str = "CASF-2016 K$_i$ (Training Set)"
#title_str = "Amsterdam Apartments (Training Set)"
label_str = "Predictions ( y = "+theta0_str+" + "+theta1_str+" x )"
# NOTE(review): reg2D_model is a project module not visible here. The
# positional arguments appear to be: title, data x/y and marker style, axis
# limits, line x/y and line style, legend label, x/y axis labels, legend
# location, a boolean flag, output file name and dpi -- confirm against
# reg2D_model.plotting before reordering anything.
reg2D_model.plotting(
    title_str,
    # Arguments for the Amsterdam Apartments dataset:
    #X, y, "b.", [0, 120, 0, 120],
    #X_new, y_predict, "r-", label_str, "Area(m$^2$)", "Price(10K*Euros)",
    X, y, "b.", [-200, 3200, 0, 15],
    X_new, y_predict, "r-", label_str, "Gauss 2", "pK$_i$",
    "upper left", True, file_in.replace(".csv", ".png"), 600)
# Report how well the fitted line reproduces the training targets:
# residual sum of squares (RSS) plus Pearson (r) and Spearman (rho)
# correlations between observed and predicted y, each with its p-value.
msg_out = "\nPredictive performance"
print(msg_out)
y_full_predictions = theta1*X + theta0
residuals = y_full_predictions - y
rss = np.square(residuals).sum()
p_corr = pearsonr(y, y_full_predictions)
s_corr = spearmanr(y, y_full_predictions)
pearson_r, pearson_p = p_corr[0], p_corr[1]
spearman_rho, spearman_p = s_corr[0], s_corr[1]
print("RSS = {:.5f}".format(rss))
print("r = {:.5f}".format(pearson_r), "\tp-value = {:.5e}".format(pearson_p))
print("rho = {:.5f}".format(spearman_rho), "\tp-value = {:.5e}".format(spearman_p))