I am using an SVM to make predictions on bacterial genomes. The output of the program is a set of short DNA sequences that best predict a given class.
I am using this function to get the best model parameters:
def _select_rows(labels, positions):
    """Return the entries of ``labels`` at integer ``positions``.

    pandas objects are indexed with ``.iloc`` so the selection is positional,
    matching how the feature rows are selected with ``X.iloc``; plain
    ``labels[positions]`` on a Series would look the integers up as index
    labels instead.
    """
    if hasattr(labels, "iloc"):
        return labels.iloc[positions]
    return labels[positions]


def fitmodel(X, pheno, estimator, parameters):
    """Grid-search ``estimator`` inside each fold of a 5-fold split and print metrics.

    For every outer fold a new ``GridSearchCV`` (inner 5-fold CV, scored with
    balanced accuracy, all cores) is fitted on that fold's training rows.
    Confusion matrix, classification report and ROC AUC are printed for the
    training rows and for the held-out rows of the fold.

    Args:
        X: feature table; rows are selected positionally with ``X.iloc``.
        pheno: class labels, aligned row by row with ``X``.
        estimator: estimator handed to ``GridSearchCV``; it must provide
            ``decision_function`` because the AUC is computed from it.
        parameters: parameter grid handed to ``GridSearchCV``.

    Returns:
        The ``GridSearchCV`` object fitted in the LAST fold only. With the
        default ``refit=True`` its ``best_estimator_`` is trained on that
        fold's training rows, not on all of ``X``, so an estimator with the
        same hyperparameters fitted on the full data will generally have
        different coefficients.
    """
    kfold = KFold(n_splits=5)
    for nk, (train_index, test_index) in enumerate(kfold.split(X, pheno), start=1):
        print(f"Running {nk} fold...")
        X_train = X.iloc[train_index]
        y_train = _select_rows(pheno, train_index)
        X_test = X.iloc[test_index]
        y_test = _select_rows(pheno, test_index)

        print("Performing GRID search...")
        gs_clf = GridSearchCV(
            estimator=estimator,
            param_grid=parameters,
            cv=5,
            n_jobs=-1,
            scoring='balanced_accuracy',
        )
        print("Fitting model...")
        gs_clf.fit(X_train, y_train)

        print("Predicting-Train...")
        y_pred_train = gs_clf.predict(X_train)
        # NOTE(review): this thresholding leaves 0/1 labels unchanged;
        # presumably a leftover from a predictor with continuous output.
        y_pred_train[y_pred_train < 0.5] = 0
        y_pred_train[y_pred_train > 0.5] = 1
        print(f"Confusion matrix train for the fold {nk}")
        print(confusion_matrix(y_train, y_pred_train))
        train_report = classification_report(y_train, y_pred_train)
        print(f"Metrics report of training for the fold {nk}: {train_report}")
        # The AUC uses the signed distance to the hyperplane, not the labels.
        y_pr = gs_clf.decision_function(X_train)
        auc = roc_auc_score(y_train, y_pr)
        print(f"AUC: {auc:.3f}")

        print("Predicting-Test...")
        y_pred = gs_clf.predict(X_test)
        y_pred[y_pred < 0.5] = 0
        y_pred[y_pred > 0.5] = 1
        print(f"Best hyperparameters for the fold {nk}")
        print(gs_clf.best_params_)
        print(f"Confusion matrix test for the fold {nk}")
        print(confusion_matrix(y_test, y_pred))
        test_report = classification_report(y_test, y_pred)
        print(f"Metrics report of testing for the fold {nk}: {test_report}")
        y_pr_test = gs_clf.decision_function(X_test)
        aucTest = roc_auc_score(y_test, y_pr_test)
        print(f"AUC: {aucTest:.3f}")

    # Only the search object of the final fold survives the loop.
    return gs_clf
I would like to know why the estimator returned by the function "fitmodel" gives me one result, while a new estimator that I create with the same parameters as the returned one gives me a different result.
This is the main function:
# Base classifier: an SVC with class_weight='balanced'.
svm = SVC(class_weight='balanced')

# Hyperparameter grid explored by the grid search inside fitmodel().
# NOTE(review): scikit-learn documents gamma as the coefficient of the
# 'rbf', 'poly' and 'sigmoid' kernels, so with kernel='linear' both gamma
# candidates presumably yield the same model -- confirm.
svm_params = dict(
    C=[0.01],
    gamma=[1e-06, 1e-05],
    kernel=['linear'],
)

# fitmodel() hands back a fitted GridSearchCV object, not a bare SVC.
svm_model = fitmodel(X, pheno, svm, svm_params)
To find out the parameters of the estimator returned by the "fitmodel()" function, I used this:
# Dump every parameter of the best estimator found by the grid search
# (deep=True is get_params' default, spelled out here).
print(svm_model.best_estimator_.get_params(deep=True))
This was the result:
{'C': 0.01, 'break_ties': False, 'cache_size': 200, 'class_weight': 'balanced', 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 1e-06, 'kernel': 'linear', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}
Then I created a new SVM estimator using these parameters:
# Build a fresh SVC with the hyperparameters printed above and fit it on the
# complete dataset (all rows of X, not a cross-validation training split).
FinalModel = SVC(C=0.01, kernel='linear', gamma=1e-06, class_weight='balanced')
# The name must match the assignment above; `FinalModell` is undefined.
FinalModel.fit(X, pheno)
FinalModel.predict(X)
But the results are not the same if I run this:
# Rank the coefficients of the estimator kept inside the grid-search object.
feature_names_top, top_coefficients = plot_coefficients(
    classifier=svm_model,
    feature_names=X.columns,
)
# The positive-side block is stored last, so the tail holds the top positives.
print("Top positive predictors SVN: ", feature_names_top[-100:], sep=" ")
print("Top positive predictors weigth SVN: ", top_coefficients[-100:], sep=" ")
Or this:
# Rank the coefficients of the SVC that was fitted directly on the full data.
feature_names_top, top_coefficients = plot_coefficients_final(
    classifier=FinalModel,
    feature_names=X.columns,
)
# The positive-side block is stored last, so the tail holds the top positives.
print("Top positive predictors SVN: ", feature_names_top[-100:], sep=" ")
print("Top positive predictors weigth SVN: ", top_coefficients[-100:], sep=" ")
Using these functions (which are identical except for their first statement):
def plot_coefficients_final(classifier, feature_names, top_features=100):
    """Return the most negative and most positive coefficients of a fitted model.

    Despite its name, this function draws nothing; it only ranks the
    coefficients and looks up the matching feature names.

    Args:
        classifier: fitted estimator exposing ``coef_``; the coefficients are
            flattened with ``ravel()``.
        feature_names: names indexable by integer position, one per
            coefficient (e.g. ``X.columns``).
        top_features: number of coefficients taken from each end of the
            ranking. Values above the number of coefficients are clamped.

    Returns:
        ``(feature_names_top, coef_top_coefficients)``: two object-dtype
        arrays of equal length. The first half holds the ``top_features``
        smallest coefficients in ascending order, the second half the
        ``top_features`` largest ones, also in ascending order.

    Raises:
        ValueError: if ``top_features`` is negative.
    """
    if top_features < 0:
        raise ValueError("top_features must be non-negative")

    coef = classifier.coef_.ravel()
    # Asking for more entries than exist must not pad the result with None.
    count = min(top_features, coef.size)
    ranking = np.argsort(coef)
    # Slice the tail from an explicit start: `ranking[-0:]` would return the
    # whole ranking instead of nothing when count is 0.
    selected = np.hstack([ranking[:count], ranking[coef.size - count:]])

    feature_names_top = np.empty(selected.size, dtype=object)
    coef_top_coefficients = np.empty(selected.size, dtype=object)
    # Filling both arrays in one pass keeps names and weights aligned.
    for slot, position in enumerate(selected):
        feature_names_top[slot] = feature_names[position]
        coef_top_coefficients[slot] = coef[position]
    return feature_names_top, coef_top_coefficients
and
def plot_coefficients(classifier, feature_names, top_features=100):
    """Return the extreme coefficients of a search object's best estimator.

    Despite its name, this function draws nothing; it only ranks the
    coefficients of ``classifier.best_estimator_`` and looks up the matching
    feature names.

    Args:
        classifier: fitted search object exposing ``best_estimator_``, whose
            ``coef_`` is flattened with ``ravel()``.
        feature_names: names indexable by integer position, one per
            coefficient (e.g. ``X.columns``).
        top_features: number of coefficients taken from each end of the
            ranking. Values above the number of coefficients are clamped.

    Returns:
        ``(feature_names_top, coef_top_coefficients)``: two object-dtype
        arrays of equal length. The first half holds the ``top_features``
        smallest coefficients in ascending order, the second half the
        ``top_features`` largest ones, also in ascending order.

    Raises:
        ValueError: if ``top_features`` is negative.
    """
    if top_features < 0:
        raise ValueError("top_features must be non-negative")

    coef = classifier.best_estimator_.coef_.ravel()
    # Asking for more entries than exist must not pad the result with None.
    count = min(top_features, coef.size)
    ranking = np.argsort(coef)
    # Slice the tail from an explicit start: `ranking[-0:]` would return the
    # whole ranking instead of nothing when count is 0.
    selected = np.hstack([ranking[:count], ranking[coef.size - count:]])

    feature_names_top = np.empty(selected.size, dtype=object)
    coef_top_coefficients = np.empty(selected.size, dtype=object)
    # Filling both arrays in one pass keeps names and weights aligned.
    for slot, position in enumerate(selected):
        feature_names_top[slot] = feature_names[position]
        coef_top_coefficients[slot] = coef[position]
    return feature_names_top, coef_top_coefficients
Could someone please tell me why?
Thank you very much.
Mensur, thanks for your answer.
I think that maybe my question was misunderstood.
My question is: the "svm_model" predictor and the "FinalModel" predictor supposedly have the same parameters, so why do I get different results when I pass each of them, together with the full dataset (X), to its corresponding plot_coefficients function?