For a data-mining project, this article walks through modeling and parameter tuning: how to build a model starting from a simple one, how to run cross-validation, and how to tune parameters to improve the results.
Typical causes of underfitting:
- The model does not have a good or sufficiently large training set.
- The features used for training are too simple.

Typical causes of overfitting:
- The model does not have a good or sufficiently large training set.
- The training data and the test data are distributed differently.
- The model is over-trained and too complex, and fails to learn the main features.
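A quick way to tell the two cases apart is to compare training error with validation error. The sketch below is a minimal illustration on synthetic data (make_regression and the two tree depths are assumptions chosen for the example, not part of the original walkthrough): a very shallow tree underfits, with both errors high, while an unconstrained tree overfits, with a training error far below its validation error.

```python
import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error

# Synthetic regression data, purely for illustration
X, y = make_regression(n_samples=2000, n_features=20, noise=10.0, random_state=0)
X_tr, X_va, y_tr, y_va = train_test_split(X, y, test_size=0.2, random_state=0)

for depth in (2, None):  # shallow tree vs. fully grown tree
    tree = DecisionTreeRegressor(max_depth=depth, random_state=0).fit(X_tr, y_tr)
    tr_mae = mean_absolute_error(y_tr, tree.predict(X_tr))
    va_mae = mean_absolute_error(y_va, tree.predict(X_va))
    # Underfitting: both MAEs high; overfitting: train MAE far below validation MAE
    print(f"max_depth={depth}: train MAE={tr_mae:.2f}, val MAE={va_mae:.2f}")
```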
sklearn.linear_model.LinearRegression(fit_intercept=True, normalize=False, copy_X=True, n_jobs=1)
model = LinearRegression(normalize=True)
model.fit(data_x, data_y)
model.intercept_, model.coef_
'intercept:'+ str(model.intercept_)
sorted(dict(zip(continuous_feature_names, model.coef_)).items(), key=lambda x: x[1], reverse=True)
## output
[sorted (feature, coefficient) pairs of the fitted model]
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_absolute_error

# Log-transform the target, then run 5-fold cross-validation
data_y = np.log(data_y + 1)
scores = cross_val_score(LinearRegression(normalize=True), X=data_x, y=data_y,
                         cv=5, scoring=make_scorer(mean_absolute_error))
np.mean(scores)
import datetime

sample_feature = sample_feature.reset_index(drop=True)

# Hold out the last 1/5 of the samples as a validation set
split_point = len(sample_feature) // 5 * 4
train = sample_feature.loc[:split_point].dropna()
val = sample_feature.loc[split_point:].dropna()

# Features and log-transformed target for training and validation
train_X = train[continuous_feature_names]
train_y_ln = np.log(train['price'] + 1)
val_X = val[continuous_feature_names]
val_y_ln = np.log(val['price'] + 1)

model = model.fit(train_X, train_y_ln)
fill_between() arguments:
- train_sizes: the first argument, the x-axis range to fill
- train_scores_mean - train_scores_std: the second argument, the lower bound of the filled region
- train_scores_mean + train_scores_std: the third argument, the upper bound of the filled region
- color: the color of the filled region
- alpha: the transparency of the filled region, in [0, 1]; larger values are more opaque
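In context, these fill_between calls usually appear inside a learning-curve plot. The sketch below follows the standard sklearn learning_curve recipe and reuses train_X / train_y_ln from the split above; the estimator choice and plot styling are assumptions for illustration, not necessarily the author's exact figure.

```python
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve
from sklearn.linear_model import LinearRegression
from sklearn.metrics import make_scorer, mean_absolute_error

# Train/validation MAE for increasing amounts of training data
train_sizes, train_scores, test_scores = learning_curve(
    LinearRegression(), train_X, train_y_ln, cv=5,
    scoring=make_scorer(mean_absolute_error),
    train_sizes=np.linspace(0.1, 1.0, 5))

train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

# Shade one standard deviation around each curve with fill_between
plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.1, color='r')
plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.1, color='g')
plt.plot(train_sizes, train_scores_mean, 'o-', color='r', label='Training score')
plt.plot(train_sizes, test_scores_mean, 'o-', color='g', label='Cross-validation score')
plt.xlabel('Training examples')
plt.ylabel('MAE')
plt.legend(loc='best')
plt.show()
```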
mean_absolute_error(val_y_ln, model.predict(val_X))
0.19443858353490887
models = [LinearRegression(),
          Ridge(),
          Lasso()]

result = dict()
for model in models:
    model_name = str(model).split('(')[0]
    scores = cross_val_score(model, X=train_X, y=train_y_ln, verbose=0, cv=5,
                             scoring=make_scorer(mean_absolute_error))
    result[model_name] = scores
    print(model_name + ' is finished')

result = pd.DataFrame(result)
result.index = ['cv' + str(x) for x in range(1, 6)]
result
SVR: for regression problems, where the target is continuous.
SVC: for classification problems, where the labels are discrete classes.
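As a minimal illustration of the difference, the sketch below fits both models on sklearn's built-in toy datasets (load_diabetes and load_iris are assumptions chosen for the example, not data from this tutorial):

```python
from sklearn.datasets import load_diabetes, load_iris
from sklearn.svm import SVR, SVC

# SVR: regression on a continuous target
X_reg, y_reg = load_diabetes(return_X_y=True)
reg = SVR().fit(X_reg, y_reg)
print(reg.predict(X_reg[:3]))   # continuous predictions

# SVC: classification on discrete labels
X_clf, y_clf = load_iris(return_X_y=True)
clf = SVC().fit(X_clf, y_clf)
print(clf.predict(X_clf[:3]))   # predicted class labels
```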
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from xgboost.sklearn import XGBRegressor
from lightgbm.sklearn import LGBMRegressor
models = [LinearRegression(),
          DecisionTreeRegressor(),
          RandomForestRegressor(),
          GradientBoostingRegressor(),
          MLPRegressor(solver='lbfgs', max_iter=100),
          XGBRegressor(n_estimators=100, objective='reg:squarederror'),
          LGBMRegressor(n_estimators=100)]

result = dict()
for model in models:
    model_name = str(model).split('(')[0]
    scores = cross_val_score(model, X=train_X, y=train_y_ln, verbose=0, cv=5,
                             scoring=make_scorer(mean_absolute_error))
    result[model_name] = scores
    print(model_name + ' is finished')

result = pd.DataFrame(result)
result.index = ['cv' + str(x) for x in range(1, 6)]
result
# Candidate values for a greedy, one-parameter-at-a-time search
objective = ['regression', 'regression_l1', 'mape', 'huber', 'fair']
num_leaves = [3, 5, 10, 15, 20, 40, 55]
max_depth = [1, 3, 5, 10, 15]
reg_lambda = [0.1, 1, 2, 3, 4]   # L2-regularization candidates (not searched below)
best_obj = dict()
for obj in objective:
    model = LGBMRegressor(objective=obj)
    score = np.mean(cross_val_score(model, X=train_X, y=train_y_ln, verbose=0, cv=5,
                                    scoring=make_scorer(mean_absolute_error)))
    best_obj[obj] = score

best_leaves = dict()
for leaves in num_leaves:
    model = LGBMRegressor(objective=min(best_obj.items(), key=lambda x: x[1])[0],
                          num_leaves=leaves)
    score = np.mean(cross_val_score(model, X=train_X, y=train_y_ln, verbose=0, cv=5,
                                    scoring=make_scorer(mean_absolute_error)))
    best_leaves[leaves] = score

best_depth = dict()
for depth in max_depth:
    model = LGBMRegressor(objective=min(best_obj.items(), key=lambda x: x[1])[0],
                          num_leaves=min(best_leaves.items(), key=lambda x: x[1])[0],
                          max_depth=depth)
    score = np.mean(cross_val_score(model, X=train_X, y=train_y_ln, verbose=0, cv=5,
                                    scoring=make_scorer(mean_absolute_error)))
    best_depth[depth] = score
parameters = {'objective': objective, 'num_leaves': num_leaves, 'max_depth': max_depth}
model = LGBMRegressor()
clf = GridSearchCV(model, parameters, cv=5)
clf = clf.fit(train_X, train_y_ln)
clf.best_params_
model = LGBMRegressor(objective='regression',
num_leaves=55,
max_depth=15)
np.mean(cross_val_score(model, X=train_X, y=train_y_ln, verbose=0, cv = 5, scoring=make_scorer(mean_absolute_error)))
0.13626164479243302
from bayes_opt import BayesianOptimization

def rf_cv(num_leaves, max_depth, subsample, min_child_samples):
    val = cross_val_score(
        LGBMRegressor(objective='regression_l1',
                      num_leaves=int(num_leaves),
                      max_depth=int(max_depth),
                      subsample=subsample,
                      min_child_samples=int(min_child_samples)),
        X=train_X, y=train_y_ln, verbose=0, cv=5,
        scoring=make_scorer(mean_absolute_error)
    ).mean()
    # BayesianOptimization maximizes its objective, so return 1 - MAE
    return 1 - val
rf_bo = BayesianOptimization(
    rf_cv,
    {
        'num_leaves': (2, 100),
        'max_depth': (2, 100),
        'subsample': (0.1, 1),
        'min_child_samples': (2, 100)
    }
)
rf_bo.maximize()
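Once maximize() finishes, bayes_opt keeps the best trial in rf_bo.max. Since rf_cv returns 1 - MAE, the best MAE can be recovered by inverting it, roughly as below (a small sketch, assuming the rf_bo object defined above):

```python
# Best score found by the search; rf_cv returned 1 - MAE, so invert it
print(1 - rf_bo.max['target'])

# Corresponding hyperparameters (integer-valued ones still need int() rounding)
print(rf_bo.max['params'])
```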