本笔记来源于B站Up主: 有Li 的影像组学系列教学视频
本节(23)主要讲解: 主成分分析(PCA), 以及影像组学中降维与特征筛选的区别
0. PCA(Principal component analysis)的数学原理
1. 导入包
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split, cross_val_score,KFold,RepeatedKFold,GridSearchCV
from sklearn import svm
2. 导入及准备数据
# Load the two feature tables; each Excel file holds the rows of one class.
xlsx1_filePath = 'C:/Users/RONG/Desktop/PCA/data_A.xlsx'
xlsx2_filePath = 'C:/Users/RONG/Desktop/PCA/data_B.xlsx'
data_1 = pd.read_excel(xlsx1_filePath)
data_2 = pd.read_excel(xlsx2_filePath)
rows_1 = data_1.shape[0]
rows_2 = data_2.shape[0]

# Prepend the class label as column 0: 0 for data_A rows, 1 for data_B rows.
data_1.insert(0, 'label', [0] * rows_1)
data_2.insert(0, 'label', [1] * rows_2)

# ignore_index=True gives every row a unique label; without it both tables
# keep their own 0..n-1 labels and the combined frame has duplicates.
data = pd.concat([data_1, data_2], ignore_index=True)
# Fixed seed so the shuffled row order is the same on every run.
data = shuffle(data, random_state=0)
# Missing values are replaced with 0.
data = data.fillna(0)

# Everything after the 'label' column is treated as a feature.
X = data[data.columns[1:]]
y = data['label']
colNames = X.columns

X = X.astype(np.float64)
# NOTE(review): the scaler is fitted on every row, i.e. before any train/test
# split, so rows that are later held out contribute to the mean/std used here.
# Rebuild the frame with y's index so X and y stay aligned by label as well as
# by position (fit_transform returns a bare array and drops the index).
X = pd.DataFrame(
    StandardScaler().fit_transform(X),
    index=y.index,
    columns=colNames,
)
3. PCA 降维
from sklearn.decomposition import PCA

# Hold out 30% of the rows for testing.
# stratify=y keeps the 0/1 label proportions similar in both partitions, which
# matters on a small dataset where a plain random split can leave one class
# almost absent from the test rows.
# random_state makes the split deterministic for a given row order.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=0
)

# X arrives standardised with statistics computed over ALL rows, so the test
# rows have influenced the scaling. Re-fit the scaler on the training rows only
# and apply it to both partitions. Standardisation is an affine map, so for
# columns that vary within the training rows this has the same effect as
# scaling the raw values with training-only mean/std.
scaler = StandardScaler().fit(X_train)
X_train = pd.DataFrame(
    scaler.transform(X_train), index=X_train.index, columns=X_train.columns
)
X_test = pd.DataFrame(
    scaler.transform(X_test), index=X_test.index, columns=X_test.columns
)

# A float n_components in (0, 1) asks PCA to keep as many components as are
# needed to explain that fraction of the variance (here 99%).
model_pca = PCA(n_components=0.99)
model_pca.fit(X_train)  # fitted on the training rows only
print(model_pca.explained_variance_)
print(model_pca.explained_variance_ratio_)
Output:
# > [69.49194245 43.88043848 21.39962158 9.81580993 4.56301384 3.26179038]
# > [0.45226538 0.28558136 0.13927238 0.06388296 0.02969687 0.02122829]
4. 将降维后的数据进行训练
# Train an RBF-kernel SVM on the PCA-reduced features.
# Both partitions are projected with the PCA model fitted earlier.
X_train_pca, X_test_pca = (
    model_pca.transform(part) for part in (X_train, X_test)
)
print(X_train_pca.shape, X_test_pca.shape)

# Build the classifier first, then fit it; fit() returns the estimator itself,
# so model_svm is the same fitted object as with the chained call.
model_svm = svm.SVC(kernel='rbf', gamma="auto", probability=True)
model_svm.fit(X_train_pca, y_train)

# score() reports the mean accuracy on the held-out rows.
score_svm = model_svm.score(X_test_pca, y_test)
print(score_svm)
Output:
# > (11, 6) (6, 6)
# > 0.8333333333333334