Skip to content

Bharat-github6/Solubility-prediction-XlogP-

Folders and files

NameName
Last commit message
Last commit date

Latest commit

 

History

2 Commits
 
 

Repository files navigation

Solubility-prediction-XlogP

import all the libraries

import matplotlib.pyplot as plt
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit.Chem.Crippen import MolLogP
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

Upload the data. The data can be found here: https://drive.google.com/file/d/1_rpECl9WVkY0sjOfEOsBZE2wdNHfIUy9/view?usp=share_link

# Open the SDF file as a molecule supplier. It is bound to `suppl` because
# that is the name the extraction loop iterates over; binding it to `data`
# would leave `suppl` undefined and be overwritten by the results list that
# is created right before that loop.
suppl = Chem.SDMolSupplier('a.sdf')

Use RDKit to extract the information from the data. The data was downloaded from PubChem in SDF format; the extracted values are then converted to a pandas DataFrame.

# Build one descriptor record (a dict) per molecule read from the supplier.
data = []
for mol in suppl:
    # SDMolSupplier yields None for records RDKit cannot parse; skip those
    # instead of letting the descriptor calls below fail on a None molecule.
    if mol is None:
        continue

    # Add the molecule's record to the list. Key order is kept as-is because
    # it determines the DataFrame column order.
    data.append({
        # logP as computed by RDKit's Crippen MolLogP (stored as 'XLogP')
        'XLogP': MolLogP(mol),
        # Exact molecular weight
        'Molecular Weight': Chem.rdMolDescriptors.CalcExactMolWt(mol),
        # SMILES string of the molecule
        'smiles': Chem.MolToSmiles(mol),
        # Number of H-bond donors
        'H-Bond Donors': Descriptors.NumHDonors(mol),
        # Number of H-bond acceptors
        'H-Bond Acceptors': Descriptors.NumHAcceptors(mol),
        # Formal charge of the molecule
        'charge': Chem.rdmolops.GetFormalCharge(mol),
        # Number of rotatable bonds
        'rotatable_bonds': Chem.rdMolDescriptors.CalcNumRotatableBonds(mol),
        # Topological polar surface area (TPSA)
        'polar surface area': Chem.rdMolDescriptors.CalcTPSA(mol),
    })

# Create the pandas DataFrame: one row per successfully parsed molecule.
df = pd.DataFrame(data)

Split and train the model


# Descriptor columns used as model inputs; the target is the 'XLogP' column.
feature_columns = [
    'Molecular Weight',
    'H-Bond Donors',
    'H-Bond Acceptors',
    'charge',
    'rotatable_bonds',
    'polar surface area',
]
X = df[feature_columns]
y = df['XLogP']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

# Fit a 100-tree random forest on the training rows (fit() returns the
# estimator itself), then predict the held-out rows.
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred = rf.predict(X_test)

Calculate the mean squared error and the R-squared value, then plot the predicted values against the actual values.

# Score the predictions on the held-out rows.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Scatter of actual (x) against predicted (y) values.
plt.scatter(y_test, y_pred, color='tab:blue', alpha=0.5)
plt.title("Actual vs Predicted Values")
plt.xlabel("Actual Values")
plt.ylabel("Predicted Values")

# Dashed black diagonal spanning the range of the actual values; points on
# this line are predictions equal to the actual value.
actual_range = [min(y_test), max(y_test)]
plt.plot(actual_range, actual_range, 'k--', lw=2)

# Report the metrics, then display the figure.
print("Mean Squared Error:", mse)
print("R-squared value:", r2)
plt.show()

About

No description, website, or topics provided.

Resources

Stars

Watchers

Forks

Releases

No releases published

Packages

No packages published