化合物の溶解度予測（基本編）

2020年12月13日
2021年8月9日
実践
QSPR, 回帰, 実践

オープンになっているデータセット、具体的には1290化合物の水溶解度のデータで構造物性相関解析（いわゆるQSPR）を行う。

目的

分子の水溶解度（LogS）を予測すること

諸条件

データ

1290分子の水溶解度（LogS）データ（sdfファイル形式）
（金子研究室HPのデータセットを利用）

変数作成

RDkit 記述子を使用（参考HP）

前処理

以下の変数を削除
- 分散が0の変数
- 互いに相関係数が高い（0.95以上）変数ペアの内、片方の変数
  （金子研モジュール使用）
- 50%以上の分子(645分子）で同じ値をとる変数
  (テストデータの割合を50%に設定しているため）
変数は全て標準化（平均：0, 分散：1）

解析手法

テストデータの割合：1/2（50%）
予測器：ランダムフォレスト
ハイパーパラメータ最適化：なし（省略のため）

前準備

データの取得（金子研究室HPのデータセットを利用）
モジュールのインストール

conda install -c conda-forge rdkit
pip install dcekit

コード

モジュールのインポート

import os
import pandas as pd
import numpy as np
import math
import matplotlib.figure as figure
import matplotlib.pyplot as plt
from rdkit import Chem
from rdkit.ML.Descriptors import MoleculeDescriptors
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor as rf
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import r2_score, mean_absolute_error,mean_squared_error
from dcekit.variable_selection import search_high_rate_of_same_values, search_highly_correlated_variables

データの読み込みと変数作成

#データの読み込み
suppl = Chem.SDMolSupplier('logSdataset1290_2d.sdf')
mols = [mol for mol in suppl if mol is not None]

#SDFファイル中のプロパティの取得
property_list = list(mols[0].GetPropNames())
print(property_list)
# =>['CAS_Number', 'logS']

#RDkit記述子の計算
descriptor_names = [descriptor_name[0] for descriptor_name in Chem.Descriptors._descList]
descriptor_calculation = MoleculeDescriptors.MolecularDescriptorCalculator(descriptor_names)
RDkit = [descriptor_calculation.CalcDescriptors(mol) for mol in mols]

#データフレーム作成
df = pd.DataFrame(RDkit, columns = descriptor_names)

#溶解度(logS)の値を取得
df['logS'] =  [mol.GetProp('logS') if 'logS' in mol.GetPropNames() else 'None' for mol in mols]

変数のスクリーニング

df = df.astype(float)

#欠損のある変数を削除
df.dropna(axis=1,inplace=True)

#分散が０の変数削除
del_num1 = np.where(df.var() == 0)
df = df.drop(df.columns[del_num1], axis=1)

#変数選択（互いに相関関係にある変数の内一方を削除）  
threshold_of_r = 0.95 #変数選択するときの相関係数の絶対値の閾値
corr_var = search_highly_correlated_variables(df, threshold_of_r)
df.drop(df.columns[corr_var], axis=1, inplace=True)

#同じ値を多くもつ変数の削除
inner_fold_number = 2 #CVでの分割数（予定）
rate_of_same_value = []
threshold_of_rate_of_same_value = (inner_fold_number-1)/inner_fold_number
for col in df.columns:
    same_value_number = df[col].value_counts()
    rate_of_same_value.append(float(same_value_number[same_value_number.index[0]] / df.shape[0]))
del_var_num = np.where(np.array(rate_of_same_value) >= threshold_of_rate_of_same_value)
df.drop(df.columns[del_var_num], axis=1, inplace=True)
print(df.shape)
# => (1290, 55)

訓練・テストデータの準備と標準化

df_X = df.drop(['logS'],axis=1)
df_y = df.loc[:,['logS']]
X = df_X.values
y = df_y.values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

#StandardScalerを使って標準化
#(テストデータは訓練データの平均と分散を使って標準化する)
scaler_X = StandardScaler()
scaler_y = StandardScaler()
scaled_X_train = scaler_X.fit_transform(X_train)
scaled_y_train = scaler_y.fit_transform(y_train)
scaled_X_test = scaler_X.transform(X_test)
scaled_y_test = scaler_y.transform(y_test)

学習と予測

model = rf()
model.fit(scaled_X_train, scaled_y_train)
scaled_pred_y_train = np.ndarray.flatten(model.predict(scaled_X_train))
scaled_pred_y_test = np.ndarray.flatten(model.predict(scaled_X_test))
pred_y_train = scaler_y.inverse_transform(scaled_pred_y_train)
pred_y_test = scaler_y.inverse_transform(scaled_pred_y_test)

予測精度の計算と可視化

print('r2_train:{}'.format(round(r2_score(y_train, pred_y_train),3)))
print('r2_test:{}'.format(round(r2_score(y_test, pred_y_test),3)))
print('MAE_train:{}'.format(round(mean_absolute_error(y_train, pred_y_train),3)))
print('MAE_test:{}'.format(round(mean_absolute_error(y_test, pred_y_test),3)))
print('RMSE_train:{}'.format(round(np.sqrt(mean_squared_error(y_train, pred_y_train)),3)))
print('RMSE_test:{}'.format(round(np.sqrt(mean_squared_error(y_test, pred_y_test)),3)))
"""
(output)
r2_train:0.985
r2_test:0.914
MAE_train:0.182
MAE_test:0.457
RMSE_train:0.244
RMSE_test:0.597
"""
plt.figure(figsize=(6,6))
plt.scatter(y_train, pred_y_train,color='blue',alpha=0.3,label='training data')
plt.scatter(y_test, pred_y_test, color='red',alpha=0.3,label='test data')

y_max_ = max(y_train.max(), pred_y_train.max(),y_test.max(), pred_y_test.max())
y_min_ = min(y_train.min(), pred_y_train.min(),y_test.min(), pred_y_test.min())
y_max = y_max_ + (y_max_ - y_min_) * 0.1
y_min = y_min_ - (y_max_ - y_min_) * 0.1

plt.plot([y_min , y_max],[y_min, y_max], 'k-')
plt.ylim(y_min, y_max)
plt.xlim(y_min, y_max)
plt.xlabel('logS(observed)',fontsize=20)
plt.ylabel('logS(predicted)',fontsize=20)
plt.legend(loc='best',fontsize=15)
plt.title('yyplot(logS)',fontsize=20)
plt.show()

おまけ）変数重要度の可視化

#重要度のpandas DataFrameの作成
df_fi =  pd.DataFrame({'feature_importance':model.feature_importances_}, index=df_X.columns)
df_fi.sort_values(by='feature_importance', ascending=False,inplace=True)
df_fi_top10 = df_fi.iloc[:5,:]

#グラフの作成
x_pos = np.arange(len(df_fi_top10))
x_label = df_fi_top10.index
fig = plt.figure(figsize=(6,6))
ax1 = fig.add_subplot(1, 1, 1)
ax1.barh(x_pos, df_fi_top10['feature_importance'], color='b')
ax1.set_title('feature_importance',fontsize=18)
ax1.set_yticks(x_pos)
ax1.set_yticks(np.arange(-1,len(x_label))+0.5, minor=True)
ax1.set_yticklabels(x_label, fontsize=14)
ax1.set_xticks(np.arange(-10,11,2)/10)
ax1.set_xticklabels(np.arange(-10,11,2)/10,fontsize=12)
ax1.grid(which='minor',axis='y',color='black',linestyle='-', linewidth=1)
ax1.grid(which='major',axis='x',linestyle='--', linewidth=1)
plt.show()

ちなみに、最も重要度の高い「MolLogP」は、水/オクタノール分配係数（水溶性・脂溶性の指標）なので、上位にランクインするのは妥当な結果と言える。