Competition Overview
Using 7 days of data (Day 0 through Day 6) as input, predict the power output (TARGET) at 30-minute intervals for the following 2 days (Day 7 through Day 8): 48 timesteps per day, 96 timesteps in total.
Data Description
Hour - hour of day
Minute - minute
DHI - Diffuse Horizontal Irradiance (W/m2)
DNI - Direct Normal Irradiance (W/m2)
WS - Wind Speed (m/s)
RH - Relative Humidity (%)
T - Temperature (degrees C)
TARGET - solar power output (kW)
Modeling Process
For this competition we produced forecasts with both LightGBM and a CNN. Since the evaluation metric is the pinball (quantile) loss, both approaches were trained with the quantile loss as their objective function to minimize it directly.
In addition, each of the two forecast days was modeled as its own target, so the final predictions came from 18 training runs in total: 2 days x 9 quantiles (0.1 through 0.9).
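For reference, the pinball loss for quantile q penalizes under-prediction with weight q and over-prediction with weight (1 - q), so low quantiles tolerate overshooting and high quantiles tolerate undershooting. A minimal NumPy sketch of the metric (our own helper, not part of the competition code):

import numpy as np

def pinball_loss(y_true, y_pred, q):
    # Under-prediction (diff > 0) costs q per unit; over-prediction costs (1 - q)
    diff = np.asarray(y_true) - np.asarray(y_pred)
    return np.mean(np.maximum(q * diff, (q - 1) * diff))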
LightGBM
In this dataset, timesteps where the target (solar power output) is 0, i.e. hours when the sun is not up, account for a large share of the rows.
To obtain precise predictions for the daylight timesteps, we first ran a binary classification (0: no generation / 1: generation), which reached over 99% validation accuracy, and then fit an additional regression model only on the rows predicted as 1. As derived features we added, among others, sunrise/sunset times and the means of the base variables at the same time of day over a window of preceding days.
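Conceptually, the two stages combine at prediction time as sketched below; clf and reg are illustrative placeholders, not objects from the notebook:

import numpy as np

def two_stage_predict(clf, reg, X):
    # Stage 1: classify each timestep as generating (1) or not (0)
    mask = clf.predict(X) == 1
    # Stage 2: regress only on timesteps predicted as generating; the rest stay 0
    preds = np.zeros(len(X))
    preds[mask] = reg.predict(X[mask])
    return preds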
GitHub: https://github.com/hoonsnote/Dacon/blob/8b574e16b9deacb30cdf95569a46d5b8efea2307/Solar%20power%20generation%20forecast/1.LightGBM.ipynb
- Import libraries and load data
import warnings
warnings.filterwarnings(action='ignore')

import time
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder, RobustScaler, MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score, recall_score, roc_curve, precision_score, f1_score, auc, mean_absolute_error

# Load the training data and the submission template (the 81 test files are loaded further below)
filepath = '/content/drive/MyDrive/dacon/solar/'
train = pd.read_csv(filepath + 'train/train.csv')
submission = pd.read_csv(filepath + 'sample_submission.csv')
# Add each day's maximum TARGET value as a column
def max_feature(data):
    day_max = data.groupby('Day').max()[['TARGET']].reset_index()
    data = pd.merge(data, day_max, on='Day', how='left')
    data.rename({'TARGET_x': 'TARGET', 'TARGET_y': 'max_target'}, axis=1, inplace=True)
    return data
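The same column can also be built without the merge and the _x/_y renaming by broadcasting each day's maximum with groupby/transform; a sketch under the same column names:

# Per-day maximum of TARGET broadcast back to every row of that day
train['max_target'] = train.groupby('Day')['TARGET'].transform('max')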
# Derive sunrise/sunset time features
def suntime_feature(data):
    # 30-minute slot index within the day: 0 (00:00) through 47 (23:30)
    data['day_time'] = data['Hour'] * 2 + data['Minute'] * 1/30
    no_0 = data[data['TARGET'] != 0]
    # Sunrise: the first slot of each day with nonzero output
    rise = no_0.groupby('Day').min()[['day_time']].reset_index()
    data = pd.merge(data, rise, on='Day', how='left')
    data.rename({'day_time_x': 'day_time', 'day_time_y': 'rising_time'}, axis=1, inplace=True)
    # Sunset: the last slot of each day with nonzero output
    sunset = no_0.groupby('Day').max()[['day_time']].reset_index()
    data = pd.merge(data, sunset, on='Day', how='left')
    data.rename({'day_time_x': 'day_time', 'day_time_y': 'set_time'}, axis=1, inplace=True)
    return data
# Rolling means of TARGET, DHI, DNI, T over the previous 4 and 7 days at the same time of day
step = [4, 7]
date_time = np.arange(0, 48, 1)

def mean_feature(data, feature):
    for i in tqdm(step):
        parts = []
        for j in date_time:
            # Rolling mean over the i most recent days for time-of-day slot j
            parts.append(data[data['day_time'] == j][[feature]].rolling(window=i).mean())
        tmp_df = pd.concat(parts)
        # Restore the original row order before writing the new column
        data.loc[:, 'mean_{}'.format(feature) + str(i)] = tmp_df.sort_index().values
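Since the loop above is just a rolling mean within each 30-minute slot, an equivalent groupby/transform formulation is possible; a sketch, assuming day_time has already been created by suntime_feature:

# Rolling mean over the w most recent days, computed per time-of-day slot
for w in (4, 7):
    for feat in ['TARGET', 'DHI', 'DNI', 'T']:
        train['mean_{}{}'.format(feat, w)] = (
            train.groupby('day_time')[feat]
                 .transform(lambda s, w=w: s.rolling(window=w).mean()))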
# Number days sequentially from 0, incrementing whenever Day changes between rows
def new_day(data):
    data.reset_index(inplace=True)
    data['new_day'] = None
    data.loc[0, 'new_day'] = 0
    for i in tqdm(range(1, len(data))):
        if data.loc[i, 'Day'] == data.loc[i-1, 'Day']:
            data.loc[i, 'new_day'] = data.loc[i-1, 'new_day']
        else:
            data.loc[i, 'new_day'] = data.loc[i-1, 'new_day'] + 1
    return data
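Because new_day only increments when Day changes between consecutive rows, the loop above can also be replaced by a vectorized one-liner; a sketch assuming the same row ordering:

# A new group starts whenever Day differs from the previous row; groups are numbered from 0
data['new_day'] = (data['Day'] != data['Day'].shift()).cumsum() - 1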
# Create the derived features
train = max_feature(train)
train = suntime_feature(train)
feature = ['TARGET', 'DHI', 'DNI', 'T']
for feat in feature:
    mean_feature(train, feat)
train = new_day(train)
# Create the training target columns
train['target1'] = train['TARGET'].shift(-48)  # TARGET shifted back by 1 day (48 steps): the Day 7 target
train['target2'] = train['TARGET'].shift(-96)  # TARGET shifted back by 2 days (96 steps): the Day 8 target
- Binary Classification Modeling
# Drop rows containing nulls introduced during preprocessing (rolling/shift edges)
train = train.dropna()
train.reset_index(drop=True, inplace=True)
train_copy = train.copy()
features = train.columns
features = features.drop(['target1', 'target2', 'Minute', 'Day', 'Hour'])
# Create the binary labels: 1 if there is any generation, 0 otherwise
train['target1_c'] = train['target1'].apply(lambda x: 1 if x > 0 else 0)
train['target2_c'] = train['target2'].apply(lambda x: 1 if x > 0 else 0)
X_train_1, X_valid_1, Y_train_1, Y_valid_1 = train_test_split(train[features], train['target1_c'], test_size=0.2, random_state=0)
X_train_2, X_valid_2, Y_train_2, Y_valid_2 = train_test_split(train[features], train['target2_c'], test_size=0.2, random_state=0)
# Fit one random forest classifier per forecast day
start = time.time()
RF1 = RandomForestClassifier(random_state=0)
RF1.fit(X_train_1, Y_train_1)
print(time.time() - start)
RF2 = RandomForestClassifier(random_state=0)
RF2.fit(X_train_2, Y_train_2)
print(time.time() - start)
print(accuracy_score(Y_valid_1, RF1.predict(X_valid_1)))
print(accuracy_score(Y_valid_2, RF2.predict(X_valid_2)))
# Validation accuracy is above 99% for both classifiers
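Since nighttime zeros make up roughly half of the rows, accuracy alone can look inflated; checking recall and precision on the generating class is a worthwhile extra (our own addition, using metrics already imported above):

# Recall/precision on the positive (generating) class for the Day 7 classifier
print(recall_score(Y_valid_1, RF1.predict(X_valid_1)))
print(precision_score(Y_valid_1, RF1.predict(X_valid_1)))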
# Rebuild the derived features for the test set, keyed on new_day
# (Day restarts at 0 in each of the 81 concatenated test files)
def make_features(train):
    train.reset_index(inplace=True)
    train['new_day'] = None
    train.loc[0, 'new_day'] = 0
    for i in tqdm(range(1, len(train))):
        if train.loc[i, 'Day'] == train.loc[i-1, 'Day']:
            train.loc[i, 'new_day'] = train.loc[i-1, 'new_day']
        else:
            train.loc[i, 'new_day'] = train.loc[i-1, 'new_day'] + 1
    day_max = train.groupby('new_day').max()[['TARGET']].reset_index()
    train = pd.merge(train, day_max, on='new_day', how='left')
    train.rename({'TARGET_x': 'TARGET', 'TARGET_y': 'max_target'}, axis=1, inplace=True)
    train['day_time'] = train['Hour'] * 2 + train['Minute'] * 1/30
    no_0 = train[train['TARGET'] != 0]
    rise = no_0.groupby('new_day').min()[['day_time']].reset_index()
    train = pd.merge(train, rise, on='new_day', how='left')
    train.rename({'day_time_x': 'day_time', 'day_time_y': 'rising_time'}, axis=1, inplace=True)
    sunset = no_0.groupby('new_day').max()[['day_time']].reset_index()
    train = pd.merge(train, sunset, on='new_day', how='left')
    train.rename({'day_time_x': 'day_time', 'day_time_y': 'set_time'}, axis=1, inplace=True)
    return train
# Load and concatenate the 81 test files
test_parts = []
for i in tqdm(range(81)):
    file_path = '/content/drive/MyDrive/dacon/solar/test/' + str(i) + '.csv'
    test_parts.append(pd.read_csv(file_path))
test = pd.concat(test_parts)
test = make_features(test)
feature = ['TARGET', 'DHI', 'DNI', 'T']
for feat in feature:
    mean_feature(test, feat)
# Keep only the last input day (Day 6) of each file; its rows carry the features used for prediction
test = test[test['Day'] == 6]
test.reset_index(drop=True, inplace=True)
# Classify the test set as 0/1, then extract the indices of rows predicted as 1
preds1 = RF1.predict(test[features])
preds2 = RF2.predict(test[features])
test['pred_c_1'] = preds1
test['pred_c_2'] = preds2
reg1 = test[test['pred_c_1'] == 1]
reg2 = test[test['pred_c_2'] == 1]
ind1 = reg1.index
ind2 = reg2.index
- LightGBM Modeling (regression with predicted-zero rows excluded)
from lightgbm import LGBMRegressor

quantiles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

# Train one quantile regressor for quantile q and predict on the test rows
def LGBM(q, X_train, y_train, X_valid, y_valid, X_test):
    # bagging_fraction is an alias of subsample, so only one of the two needs to be set
    model = LGBMRegressor(objective='quantile', alpha=q,
                          n_estimators=500, learning_rate=0.027, subsample=0.7)
    # Note: early_stopping_rounds/verbose as fit() arguments require lightgbm < 4;
    # newer versions expect the early_stopping/log_evaluation callbacks instead
    model.fit(X_train, y_train, eval_metric=['quantile'],
              eval_set=[(X_valid, y_valid)], early_stopping_rounds=300, verbose=500)
    pred = pd.Series(model.predict(X_test).round(2))
    return pred, model
# Predict the target for all nine quantiles
def prediction(X_train, y_train, X_valid, y_valid, X_test):
    LGBM_models = []
    LGBM_actual_pred = pd.DataFrame()
    for q in quantiles:
        print(q)
        pred, model = LGBM(q, X_train, y_train, X_valid, y_valid, X_test)
        LGBM_models.append(model)
        LGBM_actual_pred = pd.concat([LGBM_actual_pred, pred], axis=1)
    LGBM_actual_pred.columns = quantiles
    return LGBM_models, LGBM_actual_pred
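Independently trained quantile models can occasionally produce crossing predictions (e.g. the 0.4 forecast dipping below the 0.3 one). A common optional fix, not part of the original pipeline, is to sort each row's nine predictions into non-decreasing order; apply it before the 'index' column is attached:

# Sort each row's quantile predictions so they are monotone in q
def fix_quantile_crossing(pred_df):
    sorted_vals = np.sort(pred_df.values, axis=1)
    return pd.DataFrame(sorted_vals, index=pred_df.index, columns=pred_df.columns)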
# Time-based split: the first 42000 rows for training, the rest for validation;
# only rows with a positive target (daylight hours) are used for regression
a = train.iloc[:42000]
b = train.iloc[42000:]
X_train_1 = np.array(a[a['target1'] > 0][features])
y_train_1 = np.array(a[a['target1'] > 0]['target1'])
X_valid_1 = np.array(b[b['target1'] > 0][features])
y_valid_1 = np.array(b[b['target1'] > 0]['target1'])
X_test = np.array(test[features].iloc[ind1])
start = time.time()
models_1, results_1 = prediction(X_train_1, y_train_1, X_valid_1, y_valid_1, X_test)
results_1.sort_index()[:48]
print(time.time() - start)
# Repeat for Day 8 (target2) on the rows the second classifier marked as generating
X_train_2 = np.array(a[a['target2'] > 0][features])
y_train_2 = np.array(a[a['target2'] > 0]['target2'])
X_valid_2 = np.array(b[b['target2'] > 0][features])
y_valid_2 = np.array(b[b['target2'] > 0]['target2'])
X_test_2 = np.array(test[features].iloc[ind2])
models_2, results_2 = prediction(X_train_2, y_train_2, X_valid_2, y_valid_2, X_test_2)
# Attach the test-row indices so the predictions can be merged back
results_1['index'] = ind1
results_2['index'] = ind2
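As a sanity check (our own addition, reusing the pinball_loss helper sketched earlier), the validation pinball loss for Day 7 can be averaged over the nine trained models:

# Mean pinball loss over the nine quantiles on the Day 7 validation rows
val_losses = [pinball_loss(y_valid_1, m.predict(X_valid_1), q)
              for m, q in zip(models_1, quantiles)]
print(np.mean(val_losses))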
# Merge the quantile predictions back into the test frame by row index;
# pandas suffixes the overlapping quantile columns with _x (Day 7) and _y (Day 8)
del test['index']
test.reset_index(inplace=True)
test = pd.merge(test, results_1, on='index', how='left')
test = pd.merge(test, results_2, on='index', how='left')
first_cols = ['0.1_x', '0.2_x', '0.3_x', '0.4_x', '0.5_x', '0.6_x', '0.7_x', '0.8_x', '0.9_x']
second_cols = ['0.1_y', '0.2_y', '0.3_y', '0.4_y', '0.5_y', '0.6_y', '0.7_y', '0.8_y', '0.9_y']
cols = submission.columns.tolist()[1:]
# Interleave the Day 7 and Day 8 predictions per test file (48 rows each) to match the submission order
preds = []
for i in range(81):
    preds.extend(test.iloc[i*48:i*48+48][first_cols].values)
    preds.extend(test.iloc[i*48:i*48+48][second_cols].values)
submission[cols] = preds
submission.fillna(0, inplace=True)
submission.set_index('id', inplace=True)
submission[submission <= 0] = 0  # clip negative predictions to 0
submission[submission >= 100] = 99.9  # cap values of 100 or more at 99.9
submission.reset_index(inplace=True)
submission.iloc[10:30]