Spotify Popularity Prediction-ML Practice

# 기본적으로 사용할 파이썬 패키지
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

1. 프로젝트 목표 및 목적

노래의 인기를 예측해보자!

2. 관련 데이터 수집

# 데이터 다운로드(song_data.csv / song_info.csv)
!gdown --id "1tFSeDdJYrOcVzQnxHQdJmV2Bp60rJ9O-"
!gdown --id "1NMpQ2nanbUUGOUo1PLBWgbxy59O69SZs"

Downloading...
From: https://drive.google.com/uc?id=1tFSeDdJYrOcVzQnxHQdJmV2Bp60rJ9O-
To: /content/song_data.csv
2.22MB [00:00, 67.9MB/s]
Downloading...
From: https://drive.google.com/uc?id=1NMpQ2nanbUUGOUo1PLBWgbxy59O69SZs
To: /content/song_info.csv
100% 1.24M/1.24M [00:00<00:00, 67.9MB/s]

# Read both CSV files with pandas.
# (Upload the CSV files first if they are not already in the working directory.)
spotify_song_info = pd.read_csv("song_info.csv")
spotify_song_data = pd.read_csv("song_data.csv")

# Work on copies so the frames as loaded from disk stay untouched.
song_data = spotify_song_data.copy()
song_info = spotify_song_info.copy()

# Preview the first rows of the song info table (name, artist, album, playlist).
song_info.head()

	song_name	artist_name	album_names	playlist
0	Boulevard of Broken Dreams	Green Day	Greatest Hits: God's Favorite Band	00s Rock Anthems
1	In The End	Linkin Park	Hybrid Theory	00s Rock Anthems
2	Seven Nation Army	The White Stripes	Elephant	00s Rock Anthems
3	By The Way	Red Hot Chili Peppers	By The Way (Deluxe Version)	00s Rock Anthems
4	How You Remind Me	Nickelback	Silver Side Up	00s Rock Anthems

# Preview the first rows of the table holding popularity and audio features.
song_data.head()

	song_name	song_popularity	song_duration_ms	acousticness	danceability	energy	instrumentalness	key	liveness	loudness	audio_mode	speechiness	tempo	time_signature	audio_valence
0	Boulevard of Broken Dreams	73	262333	0.005520	0.496	0.682	0.000029	8	0.0589	-4.095	1	0.0294	167.060	4	0.474
1	In The End	66	216933	0.010300	0.542	0.853	0.000000	3	0.1080	-6.407	0	0.0498	105.256	4	0.370
2	Seven Nation Army	76	231733	0.008170	0.737	0.463	0.447000	0	0.2550	-7.828	1	0.0792	123.881	4	0.324
3	By The Way	74	216933	0.026400	0.451	0.970	0.003550	0	0.1020	-4.938	1	0.1070	122.444	4	0.198
4	How You Remind Me	56	223826	0.000954	0.447	0.766	0.000000	10	0.1130	-5.065	1	0.0313	172.011	4	0.574

# Count the missing values in each column.
song_data.isnull().sum()

song_name           0
song_popularity     0
song_duration_ms    0
acousticness        0
danceability        0
energy              0
instrumentalness    0
key                 0
liveness            0
loudness            0
audio_mode          0
speechiness         0
tempo               0
time_signature      0
audio_valence       0
dtype: int64

3. 데이터 전처리

생략

4. 머신러닝을 위한 데이터 전처리

# Move the song title into the row index so that only numeric columns remain
# (set_index drops the source column by default).
song_data = song_data.set_index("song_name")
song_data.head(3)

	song_popularity	song_duration_ms	acousticness	danceability	energy	instrumentalness	key	liveness	loudness	audio_mode	speechiness	tempo	time_signature	audio_valence
song_name
Boulevard of Broken Dreams	73	262333	0.00552	0.496	0.682	0.000029	8	0.0589	-4.095	1	0.0294	167.060	4	0.474
In The End	66	216933	0.01030	0.542	0.853	0.000000	3	0.1080	-6.407	0	0.0498	105.256	4	0.370
Seven Nation Army	76	231733	0.00817	0.737	0.463	0.447000	0	0.2550	-7.828	1	0.0792	123.881	4	0.324

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible between runs.
train_set, test_set = train_test_split(
    song_data,
    test_size=0.2,
    random_state=42,
)

# Separate the feature columns from the target (song_popularity) per split.
X_train, y_train = (
    train_set.drop(columns="song_popularity"),
    train_set["song_popularity"].copy(),
)
X_test, y_test = (
    test_set.drop(columns="song_popularity"),
    test_set["song_popularity"].copy(),
)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((15068, 13), (15068,), (3767, 13), (3767,))

from sklearn.preprocessing import StandardScaler

# Standardize the features. The scaler is fitted on the training set only and
# those same fitted statistics are then applied to the test set.
std_scaler = StandardScaler()
X_train_scaled = std_scaler.fit_transform(X_train)
# Assign only the transformed array here: `std_scaler` must stay bound to the
# fitted scaler so it can still transform other data later on.
X_test_scaled = std_scaler.transform(X_test)

5. 학습

5.1 선형 모델

선형 모델 적합

from sklearn.linear_model import LinearRegression

# Fit a linear regression on the standardized training features.
# fit() returns the estimator itself, so construction and fitting can be chained.
lin_model = LinearRegression().fit(X_train_scaled, y_train)
lin_model

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

3-fold CV RMSE 평균

from sklearn.model_selection import cross_val_score

# 3-fold cross-validation. The scorer yields negated MSE, so flip the sign and
# take the square root to get one RMSE per fold, then average the folds.
scores_lin = cross_val_score(
    lin_model,
    X_train_scaled,
    y_train,
    scoring="neg_mean_squared_error",
    cv=3,
)
np.sqrt(-scores_lin).mean()

21.38831943908056

5.2 SVM

모델 적합

from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV

# Candidate settings: RBF kernel with two values of the penalty parameter C.
param_svm = [{"kernel": ["rbf"], "C": [0.1, 1]}]
svm_model = SVR()

# Score every candidate with 3-fold negated MSE, keeping the train scores too.
grid_svm = GridSearchCV(
    estimator=svm_model,
    param_grid=param_svm,
    scoring="neg_mean_squared_error",
    cv=3,
    return_train_score=True,
)

grid_svm.fit(X_train_scaled, y_train)

GridSearchCV(cv=3, error_score=nan,
             estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                           epsilon=0.1, gamma='scale', kernel='rbf',
                           max_iter=-1, shrinking=True, tol=0.001,
                           verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid=[{'C': [0.1, 1], 'kernel': ['rbf']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring='neg_mean_squared_error', verbose=0)

C값에 대한 그리드 탐색(3-fold)

# Print the cross-validated RMSE next to the parameters of each candidate.
cvres = grid_svm.cv_results_
for idx, params in enumerate(cvres["params"]):
    # Mean test scores are negated MSE values; convert each to an RMSE.
    print(np.sqrt(-cvres["mean_test_score"][idx]), params)

21.516582484945115 {'C': 0.1, 'kernel': 'rbf'}
21.18512933550173 {'C': 1, 'kernel': 'rbf'}

5.3 결정 트리

모델 적합

from sklearn.tree import DecisionTreeRegressor

# Fit a shallow regression tree (depth capped at 2).
# fit() returns the estimator itself, so construction and fitting can be chained.
tree_model = DecisionTreeRegressor(max_depth=2).fit(X_train_scaled, y_train)
tree_model

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=2,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

3-fold CV RMSE 평균

# 3-fold cross-validation for the tree: negated MSE per fold -> RMSE per fold
# -> average over the folds.
scores_tree = cross_val_score(
    tree_model,
    X_train_scaled,
    y_train,
    scoring="neg_mean_squared_error",
    cv=3,
)
np.sqrt(-scores_tree).mean()

21.418182150412523

5.4 랜덤 포레스트

모델 적합

from sklearn.ensemble import RandomForestRegressor

# Candidate settings: 300 trees, comparing the "sqrt" and "log2" rules for the
# number of features considered at each split.
param_rf = [{"n_estimators": [300], "max_features": ["sqrt", "log2"]}]

# n_jobs=-1 lets the forest train its trees on all available cores.
rf_model = RandomForestRegressor(n_jobs=-1)

# Score every candidate with 3-fold negated MSE, keeping the train scores too.
grid_rf = GridSearchCV(
    estimator=rf_model,
    param_grid=param_rf,
    scoring="neg_mean_squared_error",
    cv=3,
    return_train_score=True,
)

grid_rf.fit(X_train_scaled, y_train)

GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=-1,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid=[{'max_features': ['sqrt', 'log2'],
                          'n_estimators': [300]}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring='neg_mean_squared_error', verbose=0)

max_features에 따른 RMSE (그리드 탐색)

# Print the cross-validated RMSE next to the parameters of each candidate.
cvres = grid_rf.cv_results_
neg_mse_means = cvres["mean_test_score"]
candidates = cvres["params"]
for position in range(len(candidates)):
    # Scores are negated MSE values; negate back and take the root for RMSE.
    print(np.sqrt(-neg_mse_means[position]), candidates[position])

17.643624793579466 {'max_features': 'sqrt', 'n_estimators': 300}
17.645751050659012 {'max_features': 'log2', 'n_estimators': 300}

3-fold CV 결과, 랜덤 포레스트의 RMSE가 가장 낮다. (max_features 값에 따른 차이는 거의 없다.)

6. 평가 및 튜닝

선형 모델, SVM, 결정 트리, 랜덤 포레스트의 평가 값을 확인

앞서 5장에서 수행한 모델별 튜닝 및 평가 결과를 반영하여, 최종적으로 테스트 세트를 예측한 뒤 RMSE로 평가한다.

from sklearn.metrics import mean_squared_error

# Best candidates from the two grid searches (GridSearchCV refits the winning
# configuration on the whole training set when refit=True, the default).
final_svm = grid_svm.best_estimator_
final_rf = grid_rf.best_estimator_

# Predict the held-out test set with every model.
y_pred_linear = lin_model.predict(X_test_scaled)
y_pred_svm = final_svm.predict(X_test_scaled)
y_pred_tree = tree_model.predict(X_test_scaled)
y_pred_rf = final_rf.predict(X_test_scaled)

# Report the test RMSE per model. Pairing each label with its prediction array
# in one table avoids referencing a prediction variable that does not exist
# (the SVM line used the undefined name `y_pred_svm_rbf`).
predictions_by_label = (
    ("선형모델", y_pred_linear),
    ("svm", y_pred_svm),
    ("결정트리", y_pred_tree),
    ("랜덤포레스트", y_pred_rf),
)
for label, y_pred in predictions_by_label:
    print(f"{label} RMSE : {np.sqrt(mean_squared_error(y_test, y_pred))}")

선형모델 RMSE : 21.48576103236179
svm RMSE : 21.28565325225383
결정트리 RMSE : 21.586728996253697
랜덤포레스트 RMSE : 17.043021157240226