Spotify Popularity Prediction-ML Practice

# 기본적으로 사용할 파이썬 패키지
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

1. 프로젝트 목표 및 목적

2. 관련 데이터 수집

# 데이터 다운로드(song_data.csv / song_info.csv)
!gdown --id "1tFSeDdJYrOcVzQnxHQdJmV2Bp60rJ9O-"
!gdown --id "1NMpQ2nanbUUGOUo1PLBWgbxy59O69SZs"
Downloading...
From: https://drive.google.com/uc?id=1tFSeDdJYrOcVzQnxHQdJmV2Bp60rJ9O-
To: /content/song_data.csv
2.22MB [00:00, 67.9MB/s]
Downloading...
From: https://drive.google.com/uc?id=1NMpQ2nanbUUGOUo1PLBWgbxy59O69SZs
To: /content/song_info.csv
100% 1.24M/1.24M [00:00<00:00, 67.9MB/s]
# Load the two CSV files with pandas.
# (Upload the CSV files first if they are not in the working directory.)
spotify_song_data = pd.read_csv("song_data.csv")
spotify_song_info = pd.read_csv("song_info.csv")

# Work on copies so the frames as originally loaded stay untouched.
song_data = spotify_song_data.copy()
song_info = spotify_song_info.copy()

# Preview the song metadata table.
song_info.head()
song_name artist_name album_names playlist
0 Boulevard of Broken Dreams Green Day Greatest Hits: God's Favorite Band 00s Rock Anthems
1 In The End Linkin Park Hybrid Theory 00s Rock Anthems
2 Seven Nation Army The White Stripes Elephant 00s Rock Anthems
3 By The Way Red Hot Chili Peppers By The Way (Deluxe Version) 00s Rock Anthems
4 How You Remind Me Nickelback Silver Side Up 00s Rock Anthems
# Preview the first rows of the audio-feature table (includes the target `song_popularity`).
song_data.head()
song_name song_popularity song_duration_ms acousticness danceability energy instrumentalness key liveness loudness audio_mode speechiness tempo time_signature audio_valence
0 Boulevard of Broken Dreams 73 262333 0.005520 0.496 0.682 0.000029 8 0.0589 -4.095 1 0.0294 167.060 4 0.474
1 In The End 66 216933 0.010300 0.542 0.853 0.000000 3 0.1080 -6.407 0 0.0498 105.256 4 0.370
2 Seven Nation Army 76 231733 0.008170 0.737 0.463 0.447000 0 0.2550 -7.828 1 0.0792 123.881 4 0.324
3 By The Way 74 216933 0.026400 0.451 0.970 0.003550 0 0.1020 -4.938 1 0.1070 122.444 4 0.198
4 How You Remind Me 56 223826 0.000954 0.447 0.766 0.000000 10 0.1130 -5.065 1 0.0313 172.011 4 0.574
# Count the missing values in each column.
song_data.isnull().sum()
song_name           0
song_popularity     0
song_duration_ms    0
acousticness        0
danceability        0
energy              0
instrumentalness    0
key                 0
liveness            0
loudness            0
audio_mode          0
speechiness         0
tempo               0
time_signature      0
audio_valence       0
dtype: int64

3. 데이터 전처리

4. 머신러닝을 위한 데이터 전처리

# Move the song title out of the feature columns and use it as the row index,
# so that only numeric columns remain for modelling.
song_data = song_data.set_index("song_name")
song_data.head(3)
song_popularity song_duration_ms acousticness danceability energy instrumentalness key liveness loudness audio_mode speechiness tempo time_signature audio_valence
song_name
Boulevard of Broken Dreams 73 262333 0.00552 0.496 0.682 0.000029 8 0.0589 -4.095 1 0.0294 167.060 4 0.474
In The End 66 216933 0.01030 0.542 0.853 0.000000 3 0.1080 -6.407 0 0.0498 105.256 4 0.370
Seven Nation Army 76 231733 0.00817 0.737 0.463 0.447000 0 0.2550 -7.828 1 0.0792 123.881 4 0.324
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
train_set, test_set = train_test_split(
    song_data,
    test_size=0.2,
    random_state=42,
)

# Separate the predictors from the target column (`song_popularity`) in each split.
X_train = train_set.drop(columns="song_popularity")
X_test = test_set.drop(columns="song_popularity")
y_train = train_set["song_popularity"].copy()
y_test = test_set["song_popularity"].copy()

# Show the resulting shapes as a sanity check.
X_train.shape, y_train.shape, X_test.shape, y_test.shape
((15068, 13), (15068,), (3767, 13), (3767,))
from sklearn.preprocessing import StandardScaler

# Standardize the features. The scaler is fitted on the training set only and
# the same fitted scaler is then applied to the test set.
std_scaler = StandardScaler()
X_train_scaled = std_scaler.fit_transform(X_train)
# Assign only `X_test_scaled` here so that `std_scaler` keeps referring to the
# fitted scaler (a chained assignment would rebind it to the transformed array
# and make the scaler unusable for any later transform).
X_test_scaled = std_scaler.transform(X_test)

5. 학습

5.1 선형 모델

선형 모델 적합

from sklearn.linear_model import LinearRegression

# Fit a linear regression model on the standardized training features.
lin_model = LinearRegression()
lin_model.fit(X_train_scaled, y_train)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

3-fold CV RMSE 평균

from sklearn.model_selection import cross_val_score

# 3-fold cross-validation of the linear model. The scorer returns the negated
# MSE, so flip the sign and take the square root to get one RMSE per fold,
# then average the fold RMSEs.
scores_lin = cross_val_score(
    lin_model,
    X_train_scaled,
    y_train,
    cv=3,
    scoring="neg_mean_squared_error",
)
np.sqrt(-scores_lin).mean()
21.38831943908056

5.2 SVM

모델 적합

from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV

# Candidate settings: RBF kernel with two values of the regularization strength C.
param_svm = [{'kernel': ['rbf'], 'C': [0.1, 1]}]

svm_model = SVR()

# 3-fold grid search scored by negated MSE; training scores are recorded as well.
grid_svm = GridSearchCV(
    estimator=svm_model,
    param_grid=param_svm,
    scoring="neg_mean_squared_error",
    cv=3,
    return_train_score=True,
)

grid_svm.fit(X_train_scaled, y_train)
GridSearchCV(cv=3, error_score=nan,
             estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                           epsilon=0.1, gamma='scale', kernel='rbf',
                           max_iter=-1, shrinking=True, tol=0.001,
                           verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid=[{'C': [0.1, 1], 'kernel': ['rbf']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring='neg_mean_squared_error', verbose=0)

C값에 대한 그리드 탐색(3-fold)

# Print the cross-validated RMSE of every SVR candidate next to its parameters.
cvres = grid_svm.cv_results_
# Mean test scores are negated MSEs: flip the sign and take the root in one step.
rmse_per_candidate = np.sqrt(-cvres["mean_test_score"])
for rmse, candidate in zip(rmse_per_candidate, cvres["params"]):
    print(rmse, candidate)
21.516582484945115 {'C': 0.1, 'kernel': 'rbf'}
21.18512933550173 {'C': 1, 'kernel': 'rbf'}

5.3 결정 트리

모델 적합

from sklearn.tree import DecisionTreeRegressor

# Fit a shallow regression tree (depth capped at 2) on the standardized training features.
tree_model = DecisionTreeRegressor(max_depth=2)
tree_model.fit(X_train_scaled, y_train)
DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=2,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

3-fold CV RMSE 평균

# 3-fold cross-validation of the decision tree: convert the negated MSE of each
# fold into an RMSE and average over the folds.
scores_tree = cross_val_score(
    tree_model,
    X_train_scaled,
    y_train,
    cv=3,
    scoring="neg_mean_squared_error",
)
np.sqrt(-scores_tree).mean()
21.418182150412523

5.4 랜덤 포레스트

모델 적합

from sklearn.ensemble import RandomForestRegressor

# Candidate settings: 300 trees, comparing two rules for the number of
# features considered at each split.
param_rf = [{'n_estimators': [300], 'max_features': ["sqrt", "log2"]}]

# n_jobs=-1 lets the forest use all available CPU cores.
rf_model = RandomForestRegressor(n_jobs=-1)

# 3-fold grid search scored by negated MSE; training scores are recorded as well.
grid_rf = GridSearchCV(
    estimator=rf_model,
    param_grid=param_rf,
    scoring="neg_mean_squared_error",
    cv=3,
    return_train_score=True,
)
grid_rf.fit(X_train_scaled, y_train)
GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=-1,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid=[{'max_features': ['sqrt', 'log2'],
                          'n_estimators': [300]}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring='neg_mean_squared_error', verbose=0)

max_features에 따른 RMSE (그리드 탐색)

# Print the cross-validated RMSE of every random-forest candidate next to its parameters.
cvres = grid_rf.cv_results_
# Mean test scores are negated MSEs: flip the sign and take the root in one step.
rmse_per_candidate = np.sqrt(-cvres["mean_test_score"])
for rmse, candidate in zip(rmse_per_candidate, cvres["params"]):
    print(rmse, candidate)
17.643624793579466 {'max_features': 'sqrt', 'n_estimators': 300}
17.645751050659012 {'max_features': 'log2', 'n_estimators': 300}

6. 평가 및 튜닝

앞서 5장에서 수행한 여러 모델의 튜닝 및 평가 결과를 반영하여, 최종적으로 테스트 세트를 예측한 뒤 RMSE로 평가한다.

from sklearn.metrics import mean_squared_error

# Use the best estimators found by the grid searches; with refit enabled they
# have already been refitted on the whole training set.
final_svm = grid_svm.best_estimator_
final_rf = grid_rf.best_estimator_

# Predict the held-out test set with every model.
y_pred_linear = lin_model.predict(X_test_scaled)
y_pred_svm = final_svm.predict(X_test_scaled)
y_pred_tree = tree_model.predict(X_test_scaled)
y_pred_rf = final_rf.predict(X_test_scaled)

# Report the test RMSE of each model. The SVM line uses `y_pred_svm`, the name
# the prediction is actually stored under above.
print(f"선형모델 RMSE : {np.sqrt(mean_squared_error(y_test, y_pred_linear))}")
print(f"svm RMSE : {np.sqrt(mean_squared_error(y_test, y_pred_svm))}")
print(f"결정트리 RMSE : {np.sqrt(mean_squared_error(y_test, y_pred_tree))}")
print(f"랜덤포레스트 RMSE : {np.sqrt(mean_squared_error(y_test, y_pred_rf))}")
선형모델 RMSE : 21.48576103236179
svm RMSE : 21.28565325225383
결정트리 RMSE : 21.586728996253697
랜덤포레스트 RMSE : 17.043021157240226