Getting Started with Machine Learning in Python: A Hands-On Scikit-learn Guide
Machine learning is one of the core technologies of artificial intelligence, and Scikit-learn is one of the most popular machine learning libraries for Python. This article takes you from zero to a systematic grasp of machine learning fundamentals and practical skills.
🤖 Machine Learning Basics
1. Types of Machine Learning
python
# Machine learning falls into three broad categories
# 1. Supervised Learning
#    - Labeled data
#    - Predicts continuous values (regression) or discrete classes (classification)
#    Examples: house price prediction, email classification
# 2. Unsupervised Learning
#    - Unlabeled data
#    - Discovers patterns or structure in data
#    Examples: customer clustering, anomaly detection
# 3. Reinforcement Learning
#    - Learns by interacting with an environment
#    - Optimizes a policy based on rewards and penalties
#    Examples: game AI, robot control
2. The Machine Learning Workflow
python
# A typical machine learning workflow
"""
1. Data collection
   ↓
2. Data preprocessing
   ↓
3. Feature engineering
   ↓
4. Model selection
   ↓
5. Model training
   ↓
6. Model evaluation
   ↓
7. Model tuning
   ↓
8. Model deployment
"""📊 数据准备
📊 Data Preparation
1. Data Loading
python
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris  # load_boston was removed in scikit-learn 1.2
from sklearn.model_selection import train_test_split
# Use a built-in dataset
# Iris dataset (classification)
iris = load_iris()
X_iris, y_iris = iris.data, iris.target
print(f"Iris dataset: {X_iris.shape}, labels: {np.unique(y_iris)}")
# Create synthetic datasets
# Classification problem
from sklearn.datasets import make_classification
X_class, y_class = make_classification(
    n_samples=1000,      # number of samples
    n_features=20,       # number of features
    n_informative=10,    # number of informative features
    n_classes=2,         # number of classes
    random_state=42
)
# Regression problem
from sklearn.datasets import make_regression
X_reg, y_reg = make_regression(
    n_samples=1000,
    n_features=10,
    n_informative=5,
    noise=0.1,
    random_state=42
)
# Clustering problem
from sklearn.datasets import make_blobs
X_cluster, y_cluster = make_blobs(
    n_samples=500,
    centers=3,
    n_features=2,
    random_state=42
)
# Load from a CSV file
# df = pd.read_csv('data.csv')
# X = df.drop('target', axis=1)
# y = df['target']
2. Data Splitting
python
# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X_iris, y_iris,
    test_size=0.2,       # hold out 20% for testing
    random_state=42,     # random seed for reproducibility
    stratify=y_iris      # stratified sampling preserves class proportions
)
# Three-way split (train / validation / test)
X_train, X_temp, y_train, y_temp = train_test_split(
    X_iris, y_iris, test_size=0.4, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)
print(f"Training set: {X_train.shape}")
print(f"Validation set: {X_val.shape}")
print(f"Test set: {X_test.shape}")
3. Data Preprocessing
python
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.impute import SimpleImputer
# Handling missing values
# Create data with missing values
X_with_nan = X_class.copy()
X_with_nan[np.random.rand(*X_with_nan.shape) < 0.1] = np.nan
# Mean imputation
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X_with_nan)
# Median imputation
imputer_median = SimpleImputer(strategy='median')
X_imputed_median = imputer_median.fit_transform(X_with_nan)
# Feature scaling
# Standardization (mean 0, standard deviation 1)
scaler = StandardScaler()
X_standardized = scaler.fit_transform(X_class)
print(f"Before standardization: mean={X_class.mean():.2f}, std={X_class.std():.2f}")
print(f"After standardization: mean={X_standardized.mean():.2f}, std={X_standardized.std():.2f}")
# Min-max normalization (range [0, 1])
minmax_scaler = MinMaxScaler()
X_normalized = minmax_scaler.fit_transform(X_class)
# Label encoding (convert string labels to integers)
# y_cat = ['cat', 'dog', 'cat', 'bird']
# encoder = LabelEncoder()
# y_encoded = encoder.fit_transform(y_cat)
# print(y_encoded)  # [1, 2, 1, 0] (classes sorted alphabetically: bird=0, cat=1, dog=2)
# One-hot encoding (for categorical features)
from sklearn.preprocessing import OneHotEncoder
# categorical_features = [['red'], ['green'], ['blue']]
# encoder = OneHotEncoder()
# one_hot = encoder.fit_transform(categorical_features).toarray()
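In a real project these steps are usually chained, so that exactly the transformations fitted on the training data are reapplied at prediction time. Below is a minimal sketch combining imputation, scaling, and one-hot encoding with ColumnTransformer; the column names ('age', 'income', 'color') are hypothetical placeholders.
python
# A sketch of a combined preprocessing pipeline (column names are hypothetical)
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

numeric_cols = ['age', 'income']
categorical_cols = ['color']
preprocess = ColumnTransformer([
    ('num', Pipeline([
        ('impute', SimpleImputer(strategy='median')),  # fill gaps first
        ('scale', StandardScaler()),                   # then standardize
    ]), numeric_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
])
df_demo = pd.DataFrame({
    'age': [25, None, 40],
    'income': [50000, 60000, None],
    'color': ['red', 'green', 'blue'],
})
print(preprocess.fit_transform(df_demo))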
4. Feature Selection
python
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
# Select the K best features
selector = SelectKBest(score_func=f_classif, k=10)
X_selected = selector.fit_transform(X_class, y_class)
# Inspect the feature scores
feature_scores = pd.DataFrame({
    'feature': range(X_class.shape[1]),
    'score': selector.scores_
}).sort_values('score', ascending=False)
print("Features ranked by score:")
print(feature_scores.head(10))
# Recursive feature elimination (RFE)
estimator = RandomForestClassifier(n_estimators=100, random_state=42)
rfe = RFE(estimator, n_features_to_select=10)
X_rfe = rfe.fit_transform(X_class, y_class)
# Inspect the selected features
selected_features = np.where(rfe.support_)[0]
print(f"被选中的特征索引: {selected_features}")🎯 监督学习:分类
🎯 Supervised Learning: Classification
1. Basic Classification Algorithms
python
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Prepare the data
X_train, X_test, y_train, y_test = train_test_split(
    X_class, y_class, test_size=0.2, random_state=42
)
# Standardize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Define the models
models = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'SVM': SVC(kernel='rbf', random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'Naive Bayes': GaussianNB()
}
# Train and evaluate each model
results = {}
for name, model in models.items():
    # Scale-sensitive models get standardized inputs
    if name in ['SVM', 'KNN', 'Logistic Regression']:
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy
    print(f"\n{name}:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Classification report:\n{classification_report(y_test, y_pred)}")
# Compare model performance
results_df = pd.DataFrame(list(results.items()), columns=['Model', 'Accuracy'])
results_df = results_df.sort_values('Accuracy', ascending=False)
print("\nModels ranked by accuracy:")
print(results_df)
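A note on the scaled/unscaled branching above: a common alternative, sketched here for the SVM, is to wrap each scale-sensitive model in a Pipeline, so the scaler is fitted inside the model object and a single fit/predict interface works for every entry:
python
# Bundling the scaler into the model avoids manual scaled/unscaled branching
from sklearn.pipeline import make_pipeline

svm_pipe = make_pipeline(StandardScaler(), SVC(kernel='rbf', random_state=42))
svm_pipe.fit(X_train, y_train)  # the scaler is fitted on X_train internally
print(f"SVM pipeline accuracy: {svm_pipe.score(X_test, y_test):.4f}")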
2. Model Evaluation
python
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, confusion_matrix
)
import matplotlib.pyplot as plt
# Use the best model
best_model = RandomForestClassifier(n_estimators=100, random_state=42)
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]
# Compute the main metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"AUC-ROC: {roc_auc:.4f}")
# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion matrix:")
print(cm)
# Visualize the confusion matrix
import seaborn as sns
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Confusion Matrix')
plt.show()
# ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, linewidth=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--', linewidth=1, label='Random classifier')
plt.xlabel('False positive rate (FPR)')
plt.ylabel('True positive rate (TPR)')
plt.title('ROC Curve')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
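For imbalanced classes, a precision-recall curve is often more informative than ROC. A short sketch reusing the same predicted probabilities:
python
# Precision-recall curve (complements ROC when classes are imbalanced)
from sklearn.metrics import precision_recall_curve, average_precision_score

prec, rec, _ = precision_recall_curve(y_test, y_pred_proba)
ap = average_precision_score(y_test, y_pred_proba)
plt.figure(figsize=(8, 6))
plt.plot(rec, prec, linewidth=2, label=f'PR curve (AP = {ap:.2f})')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()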
3. Hyperparameter Tuning
python
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_score
# Grid search
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)
grid_search.fit(X_train, y_train)
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validated accuracy: {grid_search.best_score_:.4f}")
# Use the best model
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test)
print(f"Test-set accuracy: {accuracy_score(y_test, y_pred):.4f}")
# Randomized search (more efficient for large parameter spaces)
from scipy.stats import randint
param_dist = {
    'n_estimators': randint(50, 200),
    'max_depth': [None] + list(range(10, 31)),
    'min_samples_split': randint(2, 11),
    'min_samples_leaf': randint(1, 5)
}
random_search = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_dist,
    n_iter=20,           # try 20 random parameter combinations
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42
)
random_search.fit(X_train, y_train)
print(f"\nRandomized search best parameters: {random_search.best_params_}")
print(f"Randomized search best accuracy: {random_search.best_score_:.4f}")
# Cross-validation
cv_scores = cross_val_score(
    RandomForestClassifier(n_estimators=100, random_state=42),
    X_train, y_train,
    cv=5,
    scoring='accuracy'
)
print(f"\nCross-validation scores: {cv_scores}")
print(f"Mean accuracy: {cv_scores.mean():.4f}")
print(f"标准差: {cv_scores.std():.4f}")📈 监督学习:回归
📈 Supervised Learning: Regression
1. Basic Regression Algorithms
python
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# Prepare the regression data
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)
# Standardize the data
scaler_reg = StandardScaler()
X_train_reg_scaled = scaler_reg.fit_transform(X_train_reg)
X_test_reg_scaled = scaler_reg.transform(X_test_reg)
# Define the regression models
regression_models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(alpha=1.0),
    'Lasso': Lasso(alpha=1.0),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'SVR': SVR(kernel='rbf')
}
# Train and evaluate each regression model
regression_results = {}
for name, model in regression_models.items():
    # Scale-sensitive models get standardized inputs
    if name in ['Ridge', 'Lasso', 'SVR']:
        model.fit(X_train_reg_scaled, y_train_reg)
        y_pred_reg = model.predict(X_test_reg_scaled)
    else:
        model.fit(X_train_reg, y_train_reg)
        y_pred_reg = model.predict(X_test_reg)
    # Evaluate the model
    mse = mean_squared_error(y_test_reg, y_pred_reg)
    mae = mean_absolute_error(y_test_reg, y_pred_reg)
    r2 = r2_score(y_test_reg, y_pred_reg)
    regression_results[name] = {
        'MSE': mse,
        'MAE': mae,
        'R2': r2
    }
    print(f"\n{name}:")
    print(f"Mean squared error (MSE): {mse:.4f}")
    print(f"Mean absolute error (MAE): {mae:.4f}")
    print(f"R² score: {r2:.4f}")
# Compare model performance
regression_results_df = pd.DataFrame(regression_results).T
regression_results_df = regression_results_df.sort_values('R2', ascending=False)
print("\nRegression models ranked by R²:")
print(regression_results_df)
2. Visualizing Regression Results
python
# Use the best regression model
best_reg_model = RandomForestRegressor(n_estimators=100, random_state=42)
best_reg_model.fit(X_train_reg, y_train_reg)
y_pred_reg = best_reg_model.predict(X_test_reg)
# Visualize the predictions
plt.figure(figsize=(12, 5))
# Predicted vs. actual values
plt.subplot(1, 2, 1)
plt.scatter(y_test_reg, y_pred_reg, alpha=0.6)
plt.plot([y_test_reg.min(), y_test_reg.max()],
         [y_test_reg.min(), y_test_reg.max()],
         'r--', linewidth=2)
plt.xlabel('Actual values')
plt.ylabel('Predicted values')
plt.title('Predicted vs. Actual')
plt.grid(True, alpha=0.3)
# Residual plot
residuals = y_test_reg - y_pred_reg
plt.subplot(1, 2, 2)
plt.scatter(y_pred_reg, residuals, alpha=0.6)
plt.axhline(y=0, color='r', linestyle='--', linewidth=2)
plt.xlabel('Predicted values')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
# Feature importances
if hasattr(best_reg_model, 'feature_importances_'):
    feature_importance = pd.DataFrame({
        'feature': range(X_reg.shape[1]),
        'importance': best_reg_model.feature_importances_
    }).sort_values('importance', ascending=False)
    plt.figure(figsize=(10, 6))
    plt.bar(range(len(feature_importance)), feature_importance['importance'])
    plt.xlabel('Feature')
    plt.ylabel('Importance')
    plt.title('Feature Importances')
    plt.xticks(range(len(feature_importance)), feature_importance['feature'])
    plt.grid(True, alpha=0.3, axis='y')
    plt.show()
🔍 Unsupervised Learning
1. Clustering Analysis
python
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
# Prepare the clustering data (split shown for completeness; the code below clusters the full X_cluster)
X_train_cluster, X_test_cluster = train_test_split(
    X_cluster, test_size=0.2, random_state=42
)
# K-means clustering
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans_labels = kmeans.fit_predict(X_cluster)
# Compute the silhouette score
silhouette_avg = silhouette_score(X_cluster, kmeans_labels)
print(f"K-means silhouette score: {silhouette_avg:.4f}")
# Visualize the K-means result
plt.figure(figsize=(12, 4))
plt.subplot(1, 3, 1)
plt.scatter(X_cluster[:, 0], X_cluster[:, 1], c=kmeans_labels,
            cmap='viridis', alpha=0.6)
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
            c='red', marker='x', s=200, linewidths=3)
plt.title('K-means Clustering')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
# DBSCAN clustering
dbscan = DBSCAN(eps=1.5, min_samples=5)
dbscan_labels = dbscan.fit_predict(X_cluster)
plt.subplot(1, 3, 2)
plt.scatter(X_cluster[:, 0], X_cluster[:, 1], c=dbscan_labels,
            cmap='viridis', alpha=0.6)
plt.title('DBSCAN Clustering')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
# Hierarchical clustering
agg_cluster = AgglomerativeClustering(n_clusters=3)
agg_labels = agg_cluster.fit_predict(X_cluster)
plt.subplot(1, 3, 3)
plt.scatter(X_cluster[:, 0], X_cluster[:, 1], c=agg_labels,
            cmap='viridis', alpha=0.6)
plt.title('Hierarchical Clustering')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.tight_layout()
plt.show()
# Choosing the number of clusters (elbow method)
inertia = []
K_range = range(1, 11)
for k in K_range:
    kmeans_temp = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans_temp.fit(X_cluster)
    inertia.append(kmeans_temp.inertia_)
plt.figure(figsize=(8, 5))
plt.plot(K_range, inertia, 'bo-')
plt.xlabel('Number of clusters (K)')
plt.ylabel('Inertia')
plt.title('Elbow Method for Choosing K')
plt.grid(True, alpha=0.3)
plt.show()
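The elbow is not always sharp; computing the silhouette score across candidate K values (defined only for K ≥ 2) gives a complementary criterion:
python
# Complementing the elbow method with silhouette scores per K
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

for k in range(2, 7):
    labels_k = KMeans(n_clusters=k, n_init=10, random_state=42).fit_predict(X_cluster)
    print(f"K={k}: silhouette = {silhouette_score(X_cluster, labels_k):.4f}")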
2. Dimensionality Reduction
python
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
# Dimensionality reduction on the iris dataset
X_iris_scaled = StandardScaler().fit_transform(X_iris)
# PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_iris_scaled)
# Visualize the PCA result
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y_iris,
                      cmap='viridis', alpha=0.6)
plt.xlabel('Principal component 1')
plt.ylabel('Principal component 2')
plt.title(f'PCA (explained variance: {pca.explained_variance_ratio_.sum():.2f})')
plt.colorbar(scatter, label='Class')
# t-SNE
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X_iris_scaled)
plt.subplot(1, 2, 2)
scatter = plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y_iris,
                      cmap='viridis', alpha=0.6)
plt.xlabel('t-SNE 1')
plt.ylabel('t-SNE 2')
plt.title('t-SNE')
plt.colorbar(scatter, label='Class')
plt.tight_layout()
plt.show()
# Explained variance per principal component
print("Explained variance ratio per component:")
for i, ratio in enumerate(pca.explained_variance_ratio_):
    print(f"PC{i+1}: {ratio:.4f}")
print(f"\n累计解释方差: {pca.explained_variance_ratio_.cumsum()}")🎯 实战案例:客户流失预测
🎯 Hands-On Case Study: Customer Churn Prediction
python
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import matplotlib.pyplot as plt
# 1. Create simulated customer data
np.random.seed(42)
n_customers = 5000
data = {
    'age': np.random.randint(18, 80, n_customers),
    'monthly_bill': np.random.uniform(30, 200, n_customers),
    'usage_minutes': np.random.uniform(100, 2000, n_customers),
    'customer_service_calls': np.random.poisson(2, n_customers),
    'account_tenure': np.random.randint(1, 120, n_customers),
    'has_internet': np.random.choice([0, 1], n_customers, p=[0.3, 0.7]),
    'has_phone': np.random.choice([0, 1], n_customers, p=[0.2, 0.8]),
    'has_streaming': np.random.choice([0, 1], n_customers, p=[0.4, 0.6]),
}
# Create the churn label
# Customers with high bills, many service calls, or short tenure are more likely to churn
churn_prob = (
    0.3 * (data['monthly_bill'] > 100) +
    0.4 * (data['customer_service_calls'] > 3) +
    0.3 * (data['account_tenure'] < 12) +
    np.random.normal(0, 0.2, n_customers)
)
data['churn'] = (churn_prob > 0.5).astype(int)
df = pd.DataFrame(data)
# 2. Explore the data
print("Data overview:")
print(df.info())
print("\nSummary statistics:")
print(df.describe())
# Churn rate
churn_rate = df['churn'].mean()
print(f"\nChurn rate: {churn_rate:.2%}")
# 3. Feature engineering
# Create new features
df['bill_per_minute'] = df['monthly_bill'] / (df['usage_minutes'] + 1)
df['calls_per_month'] = df['customer_service_calls'] / (df['account_tenure'] / 30 + 1)
# 4. Prepare the training data
X = df.drop('churn', axis=1)
y = df['churn']
# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# 5. Train and compare models
models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42)
}
results = {}
for name, model in models.items():
    # Logistic regression is scale-sensitive, so it gets standardized inputs
    if name == 'Logistic Regression':
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
        y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_pred_proba = model.predict_proba(X_test)[:, 1]
    results[name] = {
        'model': model,
        'accuracy': model.score(X_test_scaled if name == 'Logistic Regression' else X_test, y_test),
        'roc_auc': roc_auc_score(y_test, y_pred_proba)
    }
    print(f"\n{name}:")
    print(classification_report(y_test, y_pred))
# 6. Select the best model
best_model_name = max(results, key=lambda x: results[x]['roc_auc'])
best_model = results[best_model_name]['model']
print(f"\nBest model: {best_model_name}")
print(f"ROC-AUC score: {results[best_model_name]['roc_auc']:.4f}")
# 7. Analyze feature importances
if hasattr(best_model, 'feature_importances_'):
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': best_model.feature_importances_
    }).sort_values('importance', ascending=False)
    plt.figure(figsize=(10, 6))
    plt.barh(range(len(feature_importance)), feature_importance['importance'])
    plt.yticks(range(len(feature_importance)), feature_importance['feature'])
    plt.xlabel('Importance')
    plt.title('Feature Importances')
    plt.grid(True, alpha=0.3, axis='x')
    plt.gca().invert_yaxis()
    plt.show()
# 8. Deployment recommendations
print("\n=== Deployment recommendations ===")
print(f"1. Model performance: ROC-AUC of {results[best_model_name]['roc_auc']:.2%}")
if hasattr(best_model, 'feature_importances_'):
    # feature_importance is only defined for tree-based models (see step 7)
    print(f"2. Key features: {feature_importance['feature'].tolist()[:3]}")
print("3. At-risk customers: contact customers with predicted churn probability > 0.7 first")
print("4. Continuous monitoring: retrain the model periodically as the data shifts")
📚 Best Practices
1. Data Preprocessing Checklist
python
# A complete data preprocessing workflow
"""
1. Data cleaning
   - Handle missing values
   - Handle outliers
   - Handle duplicates
2. Feature encoding
   - Label encoding
   - One-hot encoding
   - Feature hashing (for large-scale data)
3. Feature scaling
   - Standardization (StandardScaler)
   - Min-max normalization (MinMaxScaler)
   - Robust scaling (RobustScaler)
4. Feature selection
   - Filter methods (SelectKBest)
   - Wrapper methods (RFE)
   - Embedded methods (feature importances)
5. Feature engineering
   - Interaction features
   - Polynomial features
   - Time-series features
"""2. 模型选择指南
2. Model Selection Guide
python
# Choosing a model by problem type
"""
Classification:
- Linearly separable: Logistic Regression, SVM
- Nonlinear: Random Forest, Gradient Boosting, Neural Networks
- High-dimensional data: SVM, Logistic Regression
- Interpretability matters: Decision Tree, Logistic Regression
Regression:
- Linear relationships: Linear Regression, Ridge, Lasso
- Nonlinear: Random Forest, Gradient Boosting, SVR
- Time series: ARIMA, Prophet, LSTM
Clustering:
- Spherical clusters: K-means
- Arbitrary shapes: DBSCAN, Spectral Clustering
- Hierarchical structure: Agglomerative Clustering
"""
3. Key Evaluation Metrics
python
# Key evaluation metrics
"""
Classification:
- Accuracy: overall fraction of correct predictions
- Precision: of the samples predicted positive, the fraction that are actually positive
- Recall: of the actually positive samples, the fraction predicted positive
- F1 score: harmonic mean of precision and recall
- AUC-ROC: the model's ability to separate positive and negative samples
Regression:
- MSE: mean squared error
- MAE: mean absolute error
- RMSE: root mean squared error
- R²: coefficient of determination
"""🎓 学习路径
🎓 Learning Path
Machine Learning Basics (2 weeks)
- Machine learning concepts and types
- Data preprocessing
- Feature engineering
Supervised Learning (3-4 weeks)
- Classification algorithms
- Regression algorithms
- Model evaluation
Unsupervised Learning (2-3 weeks)
- Clustering algorithms
- Dimensionality reduction
- Anomaly detection
Model Optimization (2-3 weeks)
- Hyperparameter tuning
- Ensemble learning
- Model stacking
Hands-On Projects (4-6 weeks)
- Customer churn prediction
- Recommender systems
- Image recognition
📖 Recommended Resources
Official documentation
- Scikit-learn user guide: https://scikit-learn.org/stable/
Recommended books
- "Hands-On Machine Learning" by Aurélien Géron
- "Python Machine Learning" by Sebastian Raschka
- "An Introduction to Statistical Learning" by Gareth James
Online courses
- Coursera - Machine Learning
- Kaggle Learn - Machine Learning
- fast.ai - Practical Deep Learning
Master machine learning and begin your journey into artificial intelligence!

