# Basic Python packages to use
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Download the data (song_data.csv / song_info.csv)
!gdown --id "1tFSeDdJYrOcVzQnxHQdJmV2Bp60rJ9O-"
!gdown --id "1NMpQ2nanbUUGOUo1PLBWgbxy59O69SZs"
Downloading...
From: https://drive.google.com/uc?id=1tFSeDdJYrOcVzQnxHQdJmV2Bp60rJ9O-
To: /content/song_data.csv
2.22MB [00:00, 67.9MB/s]
Downloading...
From: https://drive.google.com/uc?id=1NMpQ2nanbUUGOUo1PLBWgbxy59O69SZs
To: /content/song_info.csv
100% 1.24M/1.24M [00:00<00:00, 67.9MB/s]
# Load the CSV files with pandas
# Make sure the CSV files are available (they were downloaded above with gdown)
spotify_song_data = pd.read_csv("song_data.csv")
spotify_song_info = pd.read_csv("song_info.csv")
song_info = spotify_song_info.copy()
song_data = spotify_song_data.copy()
song_info.head()
|   | song_name | artist_name | album_names | playlist |
|---|---|---|---|---|
| 0 | Boulevard of Broken Dreams | Green Day | Greatest Hits: God's Favorite Band | 00s Rock Anthems |
| 1 | In The End | Linkin Park | Hybrid Theory | 00s Rock Anthems |
| 2 | Seven Nation Army | The White Stripes | Elephant | 00s Rock Anthems |
| 3 | By The Way | Red Hot Chili Peppers | By The Way (Deluxe Version) | 00s Rock Anthems |
| 4 | How You Remind Me | Nickelback | Silver Side Up | 00s Rock Anthems |
song_data.head()
|   | song_name | song_popularity | song_duration_ms | acousticness | danceability | energy | instrumentalness | key | liveness | loudness | audio_mode | speechiness | tempo | time_signature | audio_valence |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Boulevard of Broken Dreams | 73 | 262333 | 0.005520 | 0.496 | 0.682 | 0.000029 | 8 | 0.0589 | -4.095 | 1 | 0.0294 | 167.060 | 4 | 0.474 |
| 1 | In The End | 66 | 216933 | 0.010300 | 0.542 | 0.853 | 0.000000 | 3 | 0.1080 | -6.407 | 0 | 0.0498 | 105.256 | 4 | 0.370 |
| 2 | Seven Nation Army | 76 | 231733 | 0.008170 | 0.737 | 0.463 | 0.447000 | 0 | 0.2550 | -7.828 | 1 | 0.0792 | 123.881 | 4 | 0.324 |
| 3 | By The Way | 74 | 216933 | 0.026400 | 0.451 | 0.970 | 0.003550 | 0 | 0.1020 | -4.938 | 1 | 0.1070 | 122.444 | 4 | 0.198 |
| 4 | How You Remind Me | 56 | 223826 | 0.000954 | 0.447 | 0.766 | 0.000000 | 10 | 0.1130 | -5.065 | 1 | 0.0313 | 172.011 | 4 | 0.574 |
song_data.isnull().sum()
song_name 0
song_popularity 0
song_duration_ms 0
acousticness 0
danceability 0
energy 0
instrumentalness 0
key 0
liveness 0
loudness 0
audio_mode 0
speechiness 0
tempo 0
time_signature 0
audio_valence 0
dtype: int64
song_data.set_index("song_name",drop=True, inplace=True)
song_data.head(3)
| song_name | song_popularity | song_duration_ms | acousticness | danceability | energy | instrumentalness | key | liveness | loudness | audio_mode | speechiness | tempo | time_signature | audio_valence |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Boulevard of Broken Dreams | 73 | 262333 | 0.00552 | 0.496 | 0.682 | 0.000029 | 8 | 0.0589 | -4.095 | 1 | 0.0294 | 167.060 | 4 | 0.474 |
| In The End | 66 | 216933 | 0.01030 | 0.542 | 0.853 | 0.000000 | 3 | 0.1080 | -6.407 | 0 | 0.0498 | 105.256 | 4 | 0.370 |
| Seven Nation Army | 76 | 231733 | 0.00817 | 0.737 | 0.463 | 0.447000 | 0 | 0.2550 | -7.828 | 1 | 0.0792 | 123.881 | 4 | 0.324 |
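With song_name as the index, individual tracks can now be looked up directly by title. A minimal sketch (a title may map to more than one row if it repeats in the data):
# Look up a song's audio features by title; returns multiple rows if the title repeats
song_data.loc["In The End"]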
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(song_data, test_size=0.2, random_state=42)
X_train = train_set.drop("song_popularity", axis=1)
y_train = train_set["song_popularity"].copy()
X_test = test_set.drop("song_popularity", axis=1)
y_test = test_set["song_popularity"].copy()
X_train.shape, y_train.shape, X_test.shape, y_test.shape
((15068, 13), (15068,), (3767, 13), (3767,))
from sklearn.preprocessing import StandardScaler
std_scaler = StandardScaler()
X_train_scaled = std_scaler.fit_transform(X_train)
X_test_scaled = std_scaler.transform(X_test)
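As a quick sanity check, the scaler is fit only on the training set, so the scaled training features should have roughly zero mean and unit variance; the test set is transformed with the training statistics. A minimal sketch:
# Scaled training features should be ~0 mean and ~1 std per column
print(X_train_scaled.mean(axis=0).round(3))
print(X_train_scaled.std(axis=0).round(3))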
Fitting the linear model
from sklearn.linear_model import LinearRegression
lin_model = LinearRegression()
lin_model.fit(X_train_scaled, y_train)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
Mean RMSE over 3-fold CV
from sklearn.model_selection import cross_val_score
scores_lin = cross_val_score(lin_model, X_train_scaled, y_train,
scoring="neg_mean_squared_error", cv=3)
np.mean(np.sqrt(-scores_lin))
21.38831943908056
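To see which features the fitted linear model weighs most on the standardized scale, its coefficients can be paired with the column names. A minimal sketch (output omitted):
# Standardized coefficients, sorted by absolute magnitude
coef = pd.Series(lin_model.coef_, index=X_train.columns)
print(coef.reindex(coef.abs().sort_values(ascending=False).index))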
Fitting the SVM model
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
svm_model = SVR()
param_svm = [{'kernel':['rbf'], 'C':[0.1, 1]}]
grid_svm = GridSearchCV(svm_model, param_svm, cv=3,
scoring="neg_mean_squared_error",
return_train_score=True)
grid_svm.fit(X_train_scaled, y_train)
GridSearchCV(cv=3, error_score=nan,
estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
epsilon=0.1, gamma='scale', kernel='rbf',
max_iter=-1, shrinking=True, tol=0.001,
verbose=False),
iid='deprecated', n_jobs=None,
param_grid=[{'C': [0.1, 1], 'kernel': ['rbf']}],
pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
scoring='neg_mean_squared_error', verbose=0)
Grid search over C (3-fold CV)
cvres = grid_svm.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
print(np.sqrt(-mean_score), params)
21.516582484945115 {'C': 0.1, 'kernel': 'rbf'}
21.18512933550173 {'C': 1, 'kernel': 'rbf'}
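The same result can be read directly from the fitted grid search: best_params_ holds the winning C and best_score_ the (negative) MSE, which converts back to RMSE. A minimal sketch:
# Best hyperparameters and the corresponding CV RMSE
print(grid_svm.best_params_)
print(np.sqrt(-grid_svm.best_score_))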
Fitting the decision tree model
from sklearn.tree import DecisionTreeRegressor
tree_model = DecisionTreeRegressor(max_depth=2)
tree_model.fit(X_train_scaled, y_train)
DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=2,
max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, presort='deprecated',
random_state=None, splitter='best')
Mean RMSE over 3-fold CV
scores_tree = cross_val_score(tree_model, X_train_scaled, y_train,
scoring="neg_mean_squared_error", cv=3)
np.mean(np.sqrt(-scores_tree))
21.418182150412523
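Since the tree is only two levels deep, its splits are easy to inspect visually. A minimal sketch using sklearn's plot_tree (feature names are taken from X_train, because the model was fit on the scaled NumPy array):
from sklearn.tree import plot_tree
# Draw the depth-2 regression tree; columns follow X_train's order
plt.figure(figsize=(12, 6))
plot_tree(tree_model, feature_names=list(X_train.columns), filled=True)
plt.show()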
Fitting the random forest model
from sklearn.ensemble import RandomForestRegressor
param_rf = [{'n_estimators':[300], 'max_features':["sqrt", "log2"]}]
rf_model = RandomForestRegressor(n_jobs=-1)
grid_rf = GridSearchCV(rf_model, param_rf, cv=3,
scoring="neg_mean_squared_error",
return_train_score=True)
grid_rf.fit(X_train_scaled, y_train)
GridSearchCV(cv=3, error_score=nan,
estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
criterion='mse', max_depth=None,
max_features='auto',
max_leaf_nodes=None,
max_samples=None,
min_impurity_decrease=0.0,
min_impurity_split=None,
min_samples_leaf=1,
min_samples_split=2,
min_weight_fraction_leaf=0.0,
n_estimators=100, n_jobs=-1,
oob_score=False, random_state=None,
verbose=0, warm_start=False),
iid='deprecated', n_jobs=None,
param_grid=[{'max_features': ['sqrt', 'log2'],
'n_estimators': [300]}],
pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
scoring='neg_mean_squared_error', verbose=0)
RMSE by max_features (grid search)
cvres = grid_rf.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
print(np.sqrt(-mean_score), params)
17.643624793579466 {'max_features': 'sqrt', 'n_estimators': 300}
17.645751050659012 {'max_features': 'log2', 'n_estimators': 300}
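The best forest also exposes feature importances, which give a rough picture of which audio features the model relies on. A minimal sketch (output omitted):
# Feature importances of the best random forest, largest first
importances = pd.Series(grid_rf.best_estimator_.feature_importances_,
                        index=X_train.columns).sort_values(ascending=False)
print(importances)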
Reflecting the model tuning and evaluation results from step 5 above, we finally predict on the test set and evaluate with RMSE.
from sklearn.metrics import mean_squared_error
final_svm = grid_svm.best_estimator_
final_rf = grid_rf.best_estimator_
y_pred_linear = lin_model.predict(X_test_scaled)
y_pred_svm = final_svm.predict(X_test_scaled)
y_pred_tree = tree_model.predict(X_test_scaled)
y_pred_rf = final_rf.predict(X_test_scaled)
print(f"선형모델 RMSE : {np.sqrt(mean_squared_error(y_test, y_pred_linear))}")
print(f"svm RMSE : {np.sqrt(mean_squared_error(y_test, y_pred_svm_rbf))}")
print(f"결정트리 RMSE : {np.sqrt(mean_squared_error(y_test, y_pred_tree))}")
print(f"랜덤포레스트 RMSE : {np.sqrt(mean_squared_error(y_test, y_pred_rf))}")
Linear model RMSE : 21.48576103236179
SVM RMSE : 21.28565325225383
Decision tree RMSE : 21.586728996253697
Random forest RMSE : 17.043021157240226
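For a quick visual comparison, the four test-set RMSE values above can be drawn as a bar chart with matplotlib, which is already imported. A minimal sketch:
# Bar chart comparing test-set RMSE across the four models
rmse = {
    "Linear": np.sqrt(mean_squared_error(y_test, y_pred_linear)),
    "SVM": np.sqrt(mean_squared_error(y_test, y_pred_svm)),
    "Decision tree": np.sqrt(mean_squared_error(y_test, y_pred_tree)),
    "Random forest": np.sqrt(mean_squared_error(y_test, y_pred_rf)),
}
plt.bar(list(rmse.keys()), list(rmse.values()))
plt.ylabel("RMSE")
plt.title("Test-set RMSE by model")
plt.show()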