Announcement

👇Official Account👇

图片

Welcome to join the group & private message

Article first/tail QR code

Skip to content

Python 机器学习入门:Scikit-learn 实战指南

机器学习是人工智能的核心技术之一,而 Scikit-learn 是 Python 最流行的机器学习库。本文将带你从零开始,系统地掌握机器学习的基础知识和实战技能。

🤖 机器学习基础

1. 机器学习类型

python
# 机器学习主要分为三大类

# 1. 监督学习(Supervised Learning)
# - 有标签数据
# - 预测连续值(回归)或离散类别(分类)
# 示例:房价预测、邮件分类

# 2. 无监督学习(Unsupervised Learning)
# - 无标签数据
# - 发现数据中的模式或结构
# 示例:客户聚类、异常检测

# 3. 强化学习(Reinforcement Learning)
# - 通过与环境交互学习
# - 基于奖励和惩罚优化策略
# 示例:游戏 AI、机器人控制

2. 机器学习工作流程

python
# 典型的机器学习工作流程
"""
1. 数据收集

2. 数据预处理

3. 特征工程

4. 模型选择

5. 模型训练

6. 模型评估

7. 模型优化

8. 模型部署
"""

📊 数据准备

1. 数据加载

python
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris, load_boston
from sklearn.model_selection import train_test_split

# 使用内置数据集
# 鸢尾花数据集(分类)
iris = load_iris()
X_iris, y_iris = iris.data, iris.target
print(f"鸢尾花数据集: {X_iris.shape}, 标签: {np.unique(y_iris)}")

# 创建示例数据集
# 分类问题
from sklearn.datasets import make_classification
X_class, y_class = make_classification(
    n_samples=1000,    # 样本数
    n_features=20,     # 特征数
    n_informative=10,  # 有用特征数
    n_classes=2,      # 类别数
    random_state=42
)

# 回归问题
from sklearn.datasets import make_regression
X_reg, y_reg = make_regression(
    n_samples=1000,
    n_features=10,
    n_informative=5,
    noise=0.1,
    random_state=42
)

# 聚类问题
from sklearn.datasets import make_blobs
X_cluster, y_cluster = make_blobs(
    n_samples=500,
    centers=3,
    n_features=2,
    random_state=42
)

# 从 CSV 文件加载
# df = pd.read_csv('data.csv')
# X = df.drop('target', axis=1)
# y = df['target']

2. 数据分割

python
# 分割数据集为训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(
    X_iris, y_iris, 
    test_size=0.2,        # 20% 作为测试集
    random_state=42,      # 随机种子
    stratify=y_iris       # 分层抽样,保持类别比例
)

# 三分割(训练集、验证集、测试集)
X_train, X_temp, y_train, y_temp = train_test_split(
    X_iris, y_iris, test_size=0.4, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

print(f"训练集: {X_train.shape}")
print(f"验证集: {X_val.shape}")
print(f"测试集: {X_test.shape}")

3. 数据预处理

python
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.impute import SimpleImputer

# 缺失值处理
# 创建带缺失值的数据
X_with_nan = X_class.copy()
X_with_nan[np.random.rand(*X_with_nan.shape) < 0.1] = np.nan

# 均值填充
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X_with_nan)

# 中位数填充
imputer_median = SimpleImputer(strategy='median')
X_imputed_median = imputer_median.fit_transform(X_with_nan)

# 特征缩放
# 标准化(均值为0,标准差为1)
scaler = StandardScaler()
X_standardized = scaler.fit_transform(X_class)

print(f"标准化前: 均值={X_class.mean():.2f}, 标准差={X_class.std():.2f}")
print(f"标准化后: 均值={X_standardized.mean():.2f}, 标准差={X_standardized.std():.2f}")

# 归一化(值域[0, 1])
minmax_scaler = MinMaxScaler()
X_normalized = minmax_scaler.fit_transform(X_class)

# 标签编码(将字符串标签转换为数字)
# y_cat = ['cat', 'dog', 'cat', 'bird']
# encoder = LabelEncoder()
# y_encoded = encoder.fit_transform(y_cat)
# print(y_encoded)  # [0, 1, 0, 2]

# One-Hot 编码(用于分类特征)
from sklearn.preprocessing import OneHotEncoder
# categorical_features = [['red'], ['green'], ['blue']]
# encoder = OneHotEncoder()
# one_hot = encoder.fit_transform(categorical_features).toarray()

4. 特征选择

python
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

# 选择最佳 K 个特征
selector = SelectKBest(score_func=f_classif, k=10)
X_selected = selector.fit_transform(X_class, y_class)

# 查看特征得分
feature_scores = pd.DataFrame({
    'feature': range(X_class.shape[1]),
    'score': selector.scores_
}).sort_values('score', ascending=False)

print("特征重要性排序:")
print(feature_scores.head(10))

# 递归特征消除(RFE)
estimator = RandomForestClassifier(n_estimators=100, random_state=42)
rfe = RFE(estimator, n_features_to_select=10)
X_rfe = rfe.fit_transform(X_class, y_class)

# 查看被选中的特征
selected_features = np.where(rfe.support_)[0]
print(f"被选中的特征索引: {selected_features}")

🎯 监督学习:分类

1. 基础分类算法

python
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# 准备数据
X_train, X_test, y_train, y_test = train_test_split(
    X_class, y_class, test_size=0.2, random_state=42
)

# 标准化数据
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 定义模型
models = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'SVM': SVC(kernel='rbf', random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'Naive Bayes': GaussianNB()
}

# 训练和评估模型
results = {}
for name, model in models.items():
    # 训练模型
    if name in ['SVM', 'KNN', 'Logistic Regression']:
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
    
    # 评估模型
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy
    
    print(f"\n{name}:")
    print(f"准确率: {accuracy:.4f}")
    print(f"分类报告:\n{classification_report(y_test, y_pred)}")

# 比较模型性能
results_df = pd.DataFrame(list(results.items()), columns=['Model', 'Accuracy'])
results_df = results_df.sort_values('Accuracy', ascending=False)
print("\n模型性能排名:")
print(results_df)

2. 模型评估

python
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, confusion_matrix
)
import matplotlib.pyplot as plt

# 使用最佳模型
best_model = RandomForestClassifier(n_estimators=100, random_state=42)
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

# 计算各种指标
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

print(f"准确率 (Accuracy): {accuracy:.4f}")
print(f"精确率 (Precision): {precision:.4f}")
print(f"召回率 (Recall): {recall:.4f}")
print(f"F1 分数 (F1-Score): {f1:.4f}")
print(f"AUC-ROC: {roc_auc:.4f}")

# 混淆矩阵
cm = confusion_matrix(y_test, y_pred)
print("\n混淆矩阵:")
print(cm)

# 可视化混淆矩阵
plt.figure(figsize=(8, 6))
import seaborn as sns
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('预测标签')
plt.ylabel('真实标签')
plt.title('混淆矩阵')
plt.show()

# ROC 曲线
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, linewidth=2, label=f'ROC 曲线 (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--', linewidth=1, label='随机分类器')
plt.xlabel('假正例率 (FPR)')
plt.ylabel('真正例率 (TPR)')
plt.title('ROC 曲线')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

3. 超参数调优

python
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_score

# 网格搜索
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

print(f"最佳参数: {grid_search.best_params_}")
print(f"最佳准确率: {grid_search.best_score_:.4f}")

# 使用最佳模型
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test)
print(f"测试集准确率: {accuracy_score(y_test, y_pred):.4f}")

# 随机搜索(参数空间大时更高效)
from scipy.stats import randint

param_dist = {
    'n_estimators': randint(50, 200),
    'max_depth': [None] + list(range(10, 31)),
    'min_samples_split': randint(2, 11),
    'min_samples_leaf': randint(1, 5)
}

random_search = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_dist,
    n_iter=20,  # 随机尝试20组参数
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42
)

random_search.fit(X_train, y_train)

print(f"\n随机搜索最佳参数: {random_search.best_params_}")
print(f"随机搜索最佳准确率: {random_search.best_score_:.4f}")

# 交叉验证
cv_scores = cross_val_score(
    RandomForestClassifier(n_estimators=100, random_state=42),
    X_train, y_train,
    cv=5,
    scoring='accuracy'
)

print(f"\n交叉验证分数: {cv_scores}")
print(f"平均准确率: {cv_scores.mean():.4f}")
print(f"标准差: {cv_scores.std():.4f}")

📈 监督学习:回归

1. 基础回归算法

python
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# 准备回归数据
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)

# 标准化数据
scaler_reg = StandardScaler()
X_train_reg_scaled = scaler_reg.fit_transform(X_train_reg)
X_test_reg_scaled = scaler_reg.transform(X_test_reg)

# 定义回归模型
regression_models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(alpha=1.0),
    'Lasso': Lasso(alpha=1.0),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'SVR': SVR(kernel='rbf')
}

# 训练和评估回归模型
regression_results = {}
for name, model in regression_models.items():
    # 训练模型
    if name in ['Ridge', 'Lasso', 'SVR']:
        model.fit(X_train_reg_scaled, y_train_reg)
        y_pred_reg = model.predict(X_test_reg_scaled)
    else:
        model.fit(X_train_reg, y_train_reg)
        y_pred_reg = model.predict(X_test_reg)
    
    # 评估模型
    mse = mean_squared_error(y_test_reg, y_pred_reg)
    mae = mean_absolute_error(y_test_reg, y_pred_reg)
    r2 = r2_score(y_test_reg, y_pred_reg)
    
    regression_results[name] = {
        'MSE': mse,
        'MAE': mae,
        'R2': r2
    }
    
    print(f"\n{name}:")
    print(f"均方误差 (MSE): {mse:.4f}")
    print(f"平均绝对误差 (MAE): {mae:.4f}")
    print(f"R² 分数: {r2:.4f}")

# 比较模型性能
regression_results_df = pd.DataFrame(regression_results).T
regression_results_df = regression_results_df.sort_values('R2', ascending=False)
print("\n回归模型性能排名:")
print(regression_results_df)

2. 回归结果可视化

python
# 使用最佳回归模型
best_reg_model = RandomForestRegressor(n_estimators=100, random_state=42)
best_reg_model.fit(X_train_reg, y_train_reg)
y_pred_reg = best_reg_model.predict(X_test_reg)

# 可视化预测结果
plt.figure(figsize=(12, 5))

# 预测值 vs 真实值
plt.subplot(1, 2, 1)
plt.scatter(y_test_reg, y_pred_reg, alpha=0.6)
plt.plot([y_test_reg.min(), y_test_reg.max()], 
         [y_test_reg.min(), y_test_reg.max()], 
         'r--', linewidth=2)
plt.xlabel('真实值')
plt.ylabel('预测值')
plt.title('预测值 vs 真实值')
plt.grid(True, alpha=0.3)

# 残差图
residuals = y_test_reg - y_pred_reg
plt.subplot(1, 2, 2)
plt.scatter(y_pred_reg, residuals, alpha=0.6)
plt.axhline(y=0, color='r', linestyle='--', linewidth=2)
plt.xlabel('预测值')
plt.ylabel('残差')
plt.title('残差图')
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# 特征重要性
if hasattr(best_reg_model, 'feature_importances_'):
    feature_importance = pd.DataFrame({
        'feature': range(X_reg.shape[1]),
        'importance': best_reg_model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    plt.figure(figsize=(10, 6))
    plt.bar(range(len(feature_importance)), feature_importance['importance'])
    plt.xlabel('特征')
    plt.ylabel('重要性')
    plt.title('特征重要性')
    plt.xticks(range(len(feature_importance)), feature_importance['feature'])
    plt.grid(True, alpha=0.3, axis='y')
    plt.show()

🔍 无监督学习

1. 聚类分析

python
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt

# 准备聚类数据
X_train_cluster, X_test_cluster = train_test_split(
    X_cluster, test_size=0.2, random_state=42
)

# K-means 聚类
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans_labels = kmeans.fit_predict(X_cluster)

# 计算 Silhouette 分数
silhouette_avg = silhouette_score(X_cluster, kmeans_labels)
print(f"K-means Silhouette 分数: {silhouette_avg:.4f}")

# 可视化 K-means 聚类结果
plt.figure(figsize=(12, 4))

plt.subplot(1, 3, 1)
plt.scatter(X_cluster[:, 0], X_cluster[:, 1], c=kmeans_labels, 
            cmap='viridis', alpha=0.6)
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], 
            c='red', marker='x', s=200, linewidths=3)
plt.title('K-means 聚类')
plt.xlabel('特征 1')
plt.ylabel('特征 2')

# DBSCAN 聚类
dbscan = DBSCAN(eps=1.5, min_samples=5)
dbscan_labels = dbscan.fit_predict(X_cluster)

plt.subplot(1, 3, 2)
plt.scatter(X_cluster[:, 0], X_cluster[:, 1], c=dbscan_labels, 
            cmap='viridis', alpha=0.6)
plt.title('DBSCAN 聚类')
plt.xlabel('特征 1')
plt.ylabel('特征 2')

# 层次聚类
agg_cluster = AgglomerativeClustering(n_clusters=3)
agg_labels = agg_cluster.fit_predict(X_cluster)

plt.subplot(1, 3, 3)
plt.scatter(X_cluster[:, 0], X_cluster[:, 1], c=agg_labels, 
            cmap='viridis', alpha=0.6)
plt.title('层次聚类')
plt.xlabel('特征 1')
plt.ylabel('特征 2')

plt.tight_layout()
plt.show()

# 确定最佳聚类数(肘部法则)
inertia = []
K_range = range(1, 11)
for k in K_range:
    kmeans_temp = KMeans(n_clusters=k, random_state=42)
    kmeans_temp.fit(X_cluster)
    inertia.append(kmeans_temp.inertia_)

plt.figure(figsize=(8, 5))
plt.plot(K_range, inertia, 'bo-')
plt.xlabel('聚类数 (K)')
plt.ylabel('惯性')
plt.title('肘部法则确定最佳聚类数')
plt.grid(True, alpha=0.3)
plt.show()

2. 降维技术

python
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# 使用鸢尾花数据集进行降维
X_iris_scaled = StandardScaler().fit_transform(X_iris)

# PCA 降维
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_iris_scaled)

# 可视化 PCA 结果
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y_iris, 
                      cmap='viridis', alpha=0.6)
plt.xlabel('主成分 1')
plt.ylabel('主成分 2')
plt.title(f'PCA 降维 (解释方差: {pca.explained_variance_ratio_.sum():.2f})')
plt.colorbar(scatter, label='类别')

# t-SNE 降维
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X_iris_scaled)

plt.subplot(1, 2, 2)
scatter = plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y_iris, 
                      cmap='viridis', alpha=0.6)
plt.xlabel('t-SNE 1')
plt.ylabel('t-SNE 2')
plt.title('t-SNE 降维')
plt.colorbar(scatter, label='类别')

plt.tight_layout()
plt.show()

# 查看主成分解释的方差
print("各主成分解释的方差比:")
for i, ratio in enumerate(pca.explained_variance_ratio_):
    print(f"PC{i+1}: {ratio:.4f}")

print(f"\n累计解释方差: {pca.explained_variance_ratio_.cumsum()}")

🎯 实战案例:客户流失预测

python
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import matplotlib.pyplot as plt

# 1. 创建模拟客户数据
np.random.seed(42)
n_customers = 5000

data = {
    'age': np.random.randint(18, 80, n_customers),
    'monthly_bill': np.random.uniform(30, 200, n_customers),
    'usage_minutes': np.random.uniform(100, 2000, n_customers),
    'customer_service_calls': np.random.poisson(2, n_customers),
    'account_tenure': np.random.randint(1, 120, n_customers),
    'has_internet': np.random.choice([0, 1], n_customers, p=[0.3, 0.7]),
    'has_phone': np.random.choice([0, 1], n_customers, p=[0.2, 0.8]),
    'has_streaming': np.random.choice([0, 1], n_customers, p=[0.4, 0.6]),
}

# 创建标签(流失与否)
# 高账单、高服务呼叫次数、短账户期限的客户更可能流失
churn_prob = (
    0.3 * (data['monthly_bill'] > 100) +
    0.4 * (data['customer_service_calls'] > 3) +
    0.3 * (data['account_tenure'] < 12) +
    np.random.normal(0, 0.2, n_customers)
)
data['churn'] = (churn_prob > 0.5).astype(int)

df = pd.DataFrame(data)

# 2. 数据探索
print("数据概览:")
print(df.info())
print("\n数据统计:")
print(df.describe())

# 流失率
churn_rate = df['churn'].mean()
print(f"\n流失率: {churn_rate:.2%}")

# 3. 特征工程
# 创建新特征
df['bill_per_minute'] = df['monthly_bill'] / (df['usage_minutes'] + 1)
df['calls_per_month'] = df['customer_service_calls'] / (df['account_tenure'] / 30 + 1)

# 4. 准备训练数据
X = df.drop('churn', axis=1)
y = df['churn']

# 分割数据
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 标准化特征
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 5. 模型训练与比较
models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42)
}

results = {}
for name, model in models.items():
    if name == 'Logistic Regression':
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
        y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_pred_proba = model.predict_proba(X_test)[:, 1]
    
    results[name] = {
        'model': model,
        'accuracy': model.score(X_test if name != 'Logistic Regression' else X_test_scaled, y_test),
        'roc_auc': roc_auc_score(y_test, y_pred_proba)
    }
    
    print(f"\n{name}:")
    print(classification_report(y_test, y_pred))

# 6. 选择最佳模型
best_model_name = max(results, key=lambda x: results[x]['roc_auc'])
best_model = results[best_model_name]['model']

print(f"\n最佳模型: {best_model_name}")
print(f"ROC-AUC 分数: {results[best_model_name]['roc_auc']:.4f}")

# 7. 特征重要性分析
if hasattr(best_model, 'feature_importances_'):
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': best_model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    plt.figure(figsize=(10, 6))
    plt.barh(range(len(feature_importance)), feature_importance['importance'])
    plt.yticks(range(len(feature_importance)), feature_importance['feature'])
    plt.xlabel('重要性')
    plt.title('特征重要性')
    plt.grid(True, alpha=0.3, axis='x')
    plt.gca().invert_yaxis()
    plt.show()

# 8. 模型部署建议
print("\n=== 模型部署建议 ===")
print(f"1. 模型性能: {results[best_model_name]['roc_auc']:.2%} 的 AUC 分数")
print(f"2. 关键特征: {feature_importance['feature'].tolist()[:3]}")
print(f"3. 风险客户识别: 预测流失概率 > 0.7 的客户应优先联系")
print(f"4. 持续监控: 定期重新训练模型以适应数据变化")

📚 最佳实践

1. 数据预处理清单

python
# 完整的数据预处理流程
"""
1. 数据清洗
   - 处理缺失值
   - 处理异常值
   - 处理重复值

2. 特征编码
   - 标签编码
   - One-Hot 编码
   - 特征哈希(大数据)

3. 特征缩放
   - 标准化(StandardScaler)
   - 归一化(MinMaxScaler)
   - 稳健缩放(RobustScaler)

4. 特征选择
   - 过滤法(SelectKBest)
   - 包装法(RFE)
   - 嵌入法(特征重要性)

5. 特征工程
   - 创建交互特征
   - 创建多项式特征
   - 时间序列特征
"""

2. 模型选择指南

python
# 根据问题类型选择模型
"""
分类问题:
- 线性可分: Logistic Regression, SVM
- 非线性: Random Forest, Gradient Boosting, Neural Networks
- 高维数据: SVM, Logistic Regression
- 解释性重要: Decision Tree, Logistic Regression

回归问题:
- 线性关系: Linear Regression, Ridge, Lasso
- 非线性: Random Forest, Gradient Boosting, SVR
- 时间序列: ARIMA, Prophet, LSTM

聚类问题:
- 球形聚类: K-means
- 任意形状: DBSCAN, Spectral Clustering
- 层次结构: Agglomerative Clustering
"""

3. 模型评估要点

python
# 关键评估指标
"""
分类:
- 准确率(Accuracy):整体正确率
- 精确率(Precision):预测为正的样本中实际为正的比例
- 召回率(Recall):实际为正的样本中被预测为正的比例
- F1 分数:精确率和召回率的调和平均
- AUC-ROC:模型区分正负样本的能力

回归:
- MSE:均方误差
- MAE:平均绝对误差
- RMSE:均方根误差
- R²:决定系数
"""

🎓 学习路径

  1. 机器学习基础(2周)

    • 机器学习概念和类型
    • 数据预处理
    • 特征工程
  2. 监督学习(3-4周)

    • 分类算法
    • 回归算法
    • 模型评估
  3. 无监督学习(2-3周)

    • 聚类算法
    • 降维技术
    • 异常检测
  4. 模型优化(2-3周)

    • 超参数调优
    • 集成学习
    • 模型融合
  5. 实战项目(4-6周)

    • 客户流失预测
    • 推荐系统
    • 图像识别

📖 推荐资源

  • 官方文档

  • 推荐书籍

    • 《Hands-On Machine Learning》- Aurélien Géron
    • 《Python Machine Learning》- Sebastian Raschka
    • 《Introduction to Statistical Learning》- Gareth James
  • 在线课程

    • Coursera - Machine Learning
    • Kaggle Learn - Machine Learning
    • fast.ai - Practical Deep Learning

掌握机器学习,开启人工智能之旅!

上次更新于: