Skip to main content

This project aims to train neural networks on compound-protein interactions and provides interpretation of the learned model by interactively showing the transformed chemical landscape and visualized SAR for chemicals of interest.

Project description

VISAR Tutorial

This project aims to train neural networks on compound-protein interactions and provides interpretation of the learned model by interactively showing the transformed chemical landscape and visualized SAR for chemicals of interest.

In this notebook, we will show a typical workflow of using VISAR for training neural network QSAR models and analyzing the trained model.

model training

With VISAR, we can specify candidate sets of hyperparameters for screening and then manually pick the best combination. The training is set to be carried out 3 times as repeats, and the training results at various learning steps are saved in the log directory for further analysis.

import os
from Model_training_utils import ST_model_hyperparam_screen, ST_model_training
os.environ['CUDA_VISIBLE_DEVICES']='1'
# initialize parameters
task_names = ['T51'] # see dataset table in data directory for all available tasks
MT_dat_name = './data/MT_data_clean_Feb28.csv'
FP_type = 'Circular_2048'

params_dict = {
    "n_tasks": [1],
    "n_features": [2048], ## need modification given FP types
    "activation": ['relu'],
    "momentum": [.9],
    "batch_size": [128],
    "init": ['glorot_uniform'],
    "learning_rate": [0.01],
    "decay": [1e-6],
    "nb_epoch": [30],
    "dropouts": [.2, .4],
    "nb_layers": [1],
    "batchnorm": [False],
    #"layer_sizes": [(100, 20), (64, 24)],
    "layer_sizes": [(1024, 512),(1024,128) ,(512, 128),(512,64),(128,64),(64,32), 
                    (1024,512,128), (512,128,64), (128,64,32)],
    "penalty": [0.1]
}
# initialize model setup
import random
import time
random_seed = random.randint(0,1000)
local_time = time.localtime(time.time())
log_path = './logs/'
RUN_KEY = 'ST_%d_%d_%d_%d' % (local_time.tm_year, local_time.tm_mon, 
                              local_time.tm_mday, random_seed)
os.system('mkdir %s%s' % (log_path, RUN_KEY))
print(RUN_KEY)
# hyperparam screening using deepchem
RUN_KEY = 'ST_2019_4_23_136'
log_output = ST_model_hyperparam_screen(MT_dat_name, task_names, FP_type, params_dict, 
                                        log_path = './logs/'+RUN_KEY)
# manually pick the training parameters
best_hyperparams = {'T51': [(512, 128,1), 0.2]
                   }
# model training
RUN_KEY = 'ST_2019_4_23_136'
output_df = ST_model_training(MT_dat_name, FP_type, 
                              best_hyperparams, result_path = './logs/'+RUN_KEY)

build landscape and display interactive plot

Once the task name and the name of the trained model are specified, the 'landscape_building' function carries out the analysis, and the result can be displayed interactively with the 'interactive_plot' function.

from Model_landscape_utils import landscape_building, interactive_plot
from Model_training_utils import prepare_dataset, extract_clean_dataset
from keras import backend as K
import os
os.environ['CUDA_VISIBLE_DEVICES']='0'
import pandas as pd
from bokeh.plotting import output_notebook, show
output_notebook()
Using TensorFlow backend.
/root/anaconda3/envs/deepchem/lib/python3.5/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters




<div class="bk-root">
    <a href="https://bokeh.pydata.org" target="_blank" class="bk-logo bk-logo-small bk-logo-notebook"></a>
    <span id="ffb4a7ea-b82e-4f57-8174-d5cb03eebed1">Loading BokehJS ...</span>
</div>
# analysis set-up
task_name = 'T51'
db_name = './data/MT_data_clean_Feb28.csv'
FP_type = 'Circular_2048'
log_path = './logs/ST_2019_4_23_136/'
prev_model = './logs/ST_2019_4_23_136/T51_rep2_50.hdf5'
n_layer = 1
SAR_result_dir = log_path
output_sdf_name = log_path + 'T51_chemical_landscape.sdf'
plot_df = landscape_building(task_name, db_name, log_path, FP_type,
                       prev_model, n_layer, 
                       SAR_result_dir, output_sdf_name)
plot_df.head()
==== preparing dataset ... ====
Extracted dataset shape: (3202, 3)
Loading raw samples now.
shard_size: 8192
About to start loading CSV from ./logs/ST_2019_4_23_136//temp.csv
Loading shard 1 of size 8192.
Featurizing sample 0
Featurizing sample 1000
Featurizing sample 2000
Featurizing sample 3000
TIMING: featurizing shard 0 took 14.154 s
TIMING: dataset construction took 14.381 s
Loading dataset from disk.
==== calculating transfer values ... ====
WARNING:tensorflow:From /root/anaconda3/envs/deepchem/lib/python3.5/site-packages/keras/backend/tensorflow_backend.py:1108: calling reduce_mean (from tensorflow.python.ops.math_ops) with keep_dims is deprecated and will be removed in a future version.
Instructions for updating:
keep_dims is deprecated, use keepdims instead
==== rendering SAR for chemicals on the landscape ... ====
==== packing sdf file ... ====


/root/anaconda3/envs/deepchem/lib/python3.5/site-packages/rdkit/Chem/PandasTools.py:410: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  if np.issubdtype(type(cell_value), float):
T51 molregno salt_removed_smi coord1 coord2 Label imgs pred
54 1.346777 262348 Cc1nn2c(C#N)cccc2c1CN1CCN(c2ccc(Cl)cc2)CC1 -61.390247 28.508858 78 ./logs/ST_2019_4_23_136/262348_img.png 1.431429
55 1.657567 262350 N#Cc1cccc2c(CN3CCN(c4ccc(Cl)cc4)CC3)cnn12 -57.633320 30.381008 99 ./logs/ST_2019_4_23_136/262350_img.png 1.712855
72 0.999990 322 COc1ccc2c(c1)[nH]c1c(C)nccc12 -62.837112 18.570761 63 ./logs/ST_2019_4_23_136/322_img.png 1.038729
84 0.999990 351 COc1ccc2c3c([nH]c2c1)CNCC3 -66.213913 19.925594 102 ./logs/ST_2019_4_23_136/351_img.png 1.037158
104 4.892833 262631 N[C@@H](Cc1c[nH]c2ccc(O)cc12)C(=O)O 38.830685 -5.267498 9 ./logs/ST_2019_4_23_136/262631_img.png 4.937616
# make interactive plot
show(interactive_plot(plot_df, x_column = 'coord1', y_column = 'coord2', color_column = task_name, 
                      id_field = 'molregno', value_field = task_name, label_field = 'Label', 
                      pred_field = 'pred', size = 6))
# a helping function of loading dataframe from previously generated sdf file
from Model_landscape_utils import sdf2df
plot_df = sdf2df('./logs/ST_2019_4_23_136/T51_chemical_landscape.sdf')
plot_df['Label'] = [int(x) for x in plot_df['Label'].tolist()]
plot_df.head()
molregno imgs salt_removed_smi coord1 coord2 T51 pred Label
0 262348 ./logs/ST_2019_4_23_136/262348_img.png Cc1nn2c(C#N)cccc2c1CN1CCN(c2ccc(Cl)cc2)CC1 59.324326 45.583145 1.346777 1.431429 100
1 262350 ./logs/ST_2019_4_23_136/262350_img.png N#Cc1cccc2c(CN3CCN(c4ccc(Cl)cc4)CC3)cnn12 59.331421 40.36301 1.657567 1.712855 21
2 322 ./logs/ST_2019_4_23_136/322_img.png COc1ccc2c(c1)[nH]c1c(C)nccc12 51.826027 50.381916 0.99999 1.038729 88
3 351 ./logs/ST_2019_4_23_136/351_img.png COc1ccc2c3c([nH]c2c1)CNCC3 43.928574 52.7785 0.99999 1.037158 60
4 262631 ./logs/ST_2019_4_23_136/262631_img.png N[C@@H](Cc1c[nH]c2ccc(O)cc12)C(=O)O -34.64777 13.794934 4.892833 4.937616 78
# pick clusters of interest and pack them as an sdf for pharmacophore modeling
from Model_landscape_utils import df2sdf
import numpy as np
output_sdf_name = log_path + 'T51_Label13_chemical_landscape.sdf'
smiles_field = 'salt_removed_smi'
id_field = 'molregno'
filter_label = np.array([x == 21 for x in plot_df['Label'].tolist()])
custom_df = plot_df.loc[filter_label]
df2sdf(custom_df, output_sdf_name, smiles_field, id_field)
/root/anaconda3/envs/deepchem/lib/python3.5/site-packages/rdkit/Chem/PandasTools.py:296: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  frame[molCol] = frame[smilesCol].map(Chem.MolFromSmiles)
/root/anaconda3/envs/deepchem/lib/python3.5/site-packages/rdkit/Chem/PandasTools.py:410: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  if np.issubdtype(type(cell_value), float):

Pharmacophore modeling using Align-it

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
raw_sdf_file = './T51_Label13_chemical_landscape.sdf'
ms = [x for x in Chem.SDMolSupplier(raw_sdf_file)]
len(ms)
# set home directory for following analysis
home_dir = './Result/'
os.chdir(home_dir)

# prepare ligand conformations
from rdkit import Chem
from rdkit.Chem import AllChem

raw_sdf_file = 'Label_7.sdf'
sdf_file = home_dir + 'Label7_rdkit_conf.sdf'
ms = [x for x in Chem.SDMolSupplier(raw_sdf_file)]
n_conf = 5
w = Chem.SDWriter(sdf_file)
for i in range(n_conf):
    ms_addH = [Chem.AddHs(m) for m in ms]
    for m in ms_addH:
        AllChem.EmbedMolecule(m)
        AllChem.MMFFOptimizeMoleculeConfs(m)
        w.write(m)

# process pharmacophores
result_dir = home_dir + 'Label7_rdkit_phars/'
output_name = 'Cluster7_'
proceed_pharmacophore(home_dir, sdf_file, result_dir, output_name)

The resulting pharmacophore models (saved as .phar files under home_dir) can be visualized in PyMOL with the Align-it plugin.

analysis of custom chemicals

For a new set of chemicals of interest, users can prepare a separate .csv file with columns specifying the SMILES, the ID, and a dummy field for biological activity; the 'landscape_positioning' function takes this custom file and generates the analysis results.

from Model_landscape_utils import landscape_positioning, interactive_plot
import os
os.environ['CUDA_VISIBLE_DEVICES']='0'
import pandas as pd
from bokeh.plotting import output_notebook, show
output_notebook()
<div class="bk-root">
    <a href="https://bokeh.pydata.org" target="_blank" class="bk-logo bk-logo-small bk-logo-notebook"></a>
    <span id="75c98913-bdfd-4a0d-a58c-a8aeeeee0c08">Loading BokehJS ...</span>
</div>
# set custom file
custom_file = './Result/custom_df.csv'
custom_smi_field = "smiles"
custom_id_field = 'molname'
custom_task_field = 'dummy'

# set the landscape to compare to
landscape_sdf = './logs/ST_2019_4_23_136/T51_chemical_landscape.sdf'
task_name = 'T51'
db_name = './data/MT_data_clean_Feb28.csv'
FP_type = 'Circular_2048'
log_path = './logs/'
prev_model = './logs/ST_2019_4_23_136/T51_rep2_50.hdf5'
n_layer = 1
custom_SAR_result_dir = log_path
custom_sdf_name = log_path + 'custom_chemicals_on_T51_landscape.sdf'
plot_df = landscape_positioning(custom_file, custom_smi_field, custom_id_field, custom_task_field,
                        landscape_sdf, task_name, db_name, FP_type, log_path,
                        prev_model, n_layer, custom_SAR_result_dir, custom_sdf_name)
==== preparing dataset ... ====
Extracted dataset shape: (3202, 3)
Loading raw samples now.
shard_size: 8192
About to start loading CSV from ./logs//temp.csv
Loading shard 1 of size 8192.
Featurizing sample 0
Featurizing sample 1000
Featurizing sample 2000
Featurizing sample 3000
TIMING: featurizing shard 0 took 14.038 s
TIMING: dataset construction took 14.269 s
Loading dataset from disk.
Extracted dataset shape: (2, 3)
Loading raw samples now.
shard_size: 8192
About to start loading CSV from ./Result/custom_df.csv
Loading shard 1 of size 8192.
Featurizing sample 0
TIMING: featurizing shard 0 took 0.011 s
TIMING: dataset construction took 0.021 s
Loading dataset from disk.
==== calculating transfer values ... ====
==== rendering SAR for chemicals on the landscape ... ====
==== packing sdf file ... ====


/root/anaconda3/envs/deepchem/lib/python3.5/site-packages/rdkit/Chem/PandasTools.py:410: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  if np.issubdtype(type(cell_value), float):
plot_df.head()
coord1 coord2 molregno T51 Label salt_removed_smi imgs pred group
0 5.507763 1.298865 RIT 0.000000 65 CC1=C(CCN2CCC(=C(c3ccc(F)cc3)c4ccc(F)cc4)CC2)C(=O)N5C=CSC5=N1 ./logs/RIT_img.png 3.344871 15
1 17.061337 53.975483 ERG 0.000000 59 CN1C[C@@H](C=C2[C@H]1Cc3c[nH]c4cccc2c34)C(=O)N[C@]5(C)O[C@@]6(O)[C@@H]7CCCN7C(=O)[C@H](Cc8ccccc8)N6C5=O ./logs/ERG_img.png 4.034642 15
2 -38.608498 -23.622177 262348 1.346777 104 Cc1nn2c(C#N)cccc2c1CN1CCN(c2ccc(Cl)cc2)CC1 ./logs/ST_2019_4_23_136/262348_img.png 2.174304 6
3 -47.691116 -16.576090 262350 1.657567 58 N#Cc1cccc2c(CN3CCN(c4ccc(Cl)cc4)CC3)cnn12 ./logs/ST_2019_4_23_136/262350_img.png 2.069756 6
4 -90.054886 -12.611736 322 0.999990 13 COc1ccc2c(c1)[nH]c1c(C)nccc12 ./logs/ST_2019_4_23_136/322_img.png 1.043407 6
show(interactive_plot(plot_df, 'coord1', 'coord2', color_column = 'group', id_field = 'molregno', 
                      value_field = task_name, label_field = 'Label',pred_field = 'pred', size = 'group'))
from Model_landscape_utils import sdf2df
plot_df = sdf2df('./logs/custom_chemicals_on_T51_landscape.sdf')
plot_df['Label'] = [int(x) for x in plot_df['Label'].tolist()]
plot_df.head()
molregno T51 imgs Label coord1 coord2 pred salt_removed_smi
0 RIT 0.0 ./logs/RIT_img.png 65 5.507763 1.298865 3.344871 CC1=C(CCN2CCC(=C(c3ccc(F)cc3)c4ccc(F)cc4)CC2)C(=O)N5C=CSC5=N1
1 ERG 0.0 ./logs/ERG_img.png 59 17.061337 53.975483 4.034642 CN1C[C@@H](C=C2[C@H]1Cc3c[nH]c4cccc2c34)C(=O)N[C@]5(C)O[C@@]6(O)[C@@H]7CCCN7C(=O)[C@H](Cc8ccccc8)N6C5=O
2 262348 1.346777 ./logs/ST_2019_4_23_136/262348_img.png 104 -38.608498 -23.622177 2.174304 Cc1nn2c(C#N)cccc2c1CN1CCN(c2ccc(Cl)cc2)CC1
3 262350 1.657567 ./logs/ST_2019_4_23_136/262350_img.png 58 -47.691116 -16.57609 2.069756 N#Cc1cccc2c(CN3CCN(c4ccc(Cl)cc4)CC3)cnn12
4 322 0.99999 ./logs/ST_2019_4_23_136/322_img.png 13 -90.054886 -12.611736 1.043407 COc1ccc2c(c1)[nH]c1c(C)nccc12
# pick clusters of interest and pack them as an sdf for pharmacophore modeling
custom_filter = landscape_df['Label'] == 7
df2sdf(df, output_sdf_name, smiles_field, id_field, custom_filter = None)

Project details


Download files

Download the file for your platform. If you're not sure which to choose, learn more about installing packages.

Source Distribution

visar-0.2.0.tar.gz (23.7 kB view hashes)

Uploaded Source

Supported by

AWS AWS Cloud computing and Security Sponsor Datadog Datadog Monitoring Fastly Fastly CDN Google Google Download Analytics Microsoft Microsoft PSF Sponsor Pingdom Pingdom Monitoring Sentry Sentry Error logging StatusPage StatusPage Status page