import gc
import glob
import os
import sys
import time
import traceback
from contextlib import contextmanager
from enum import Enum
from typing import List, Optional
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb
from joblib import Parallel, delayed
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.manifold import TSNE
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import minmax_scale
from tqdm import tqdm_notebook as tqdm
%matplotlib inline
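# configuration: which models to use (CNN/MLP/GBDT/TabNet), ensemble sizes, learning rates, and 1st-stage shortcuts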
DATA_DIR = '../input'
USE_PRECOMPUTE_FEATURES = True
PREDICT_CNN = True
PREDICT_MLP = True
PREDICT_GBDT = True
PREDICT_TABNET = False
GBDT_NUM_MODELS = 5 #3
GBDT_LR = 0.02 # 0.1
NN_VALID_TH = 0.185
NN_MODEL_TOP_N = 5
TAB_MODEL_TOP_N = 3
ENSEMBLE_METHOD = 'mean'
NN_NUM_MODELS = 10
TABNET_NUM_MODELS = 5
IS_1ST_STAGE = False
SHORTCUT_NN_IN_1ST_STAGE = True # early-stop training to save GPU quota
SHORTCUT_GBDT_IN_1ST_STAGE = True
MEMORY_TEST_MODE = False
@contextmanager
def timer(name: str):
s = time.time()
yield
elapsed = time.time() - s
print(f'[{name}] {elapsed: .3f}sec')
def print_trace(name: str = ''):
print(f'ERROR RAISED IN {name or "anonymous"}')
print(traceback.format_exc())
!pip -q install ../input/pytorchtabnet/pytorch_tabnet-2.0.1-py3-none-any.whl
train = pd.read_csv(os.path.join(DATA_DIR, 'optiver-realized-volatility-prediction', 'train.csv'))
stock_ids = set(train['stock_id'])
class DataBlock(Enum):
TRAIN = 1
TEST = 2
BOTH = 3
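# loaders for the per-stock book/trade parquet partitions (train, test, or both)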
def load_stock_data(stock_id: int, directory: str) -> pd.DataFrame:
return pd.read_parquet(os.path.join(DATA_DIR, 'optiver-realized-volatility-prediction', directory, f'stock_id={stock_id}'))
def load_data(stock_id: int, stem: str, block: DataBlock) -> pd.DataFrame:
if block == DataBlock.TRAIN:
return load_stock_data(stock_id, f'{stem}_train.parquet')
elif block == DataBlock.TEST:
return load_stock_data(stock_id, f'{stem}_test.parquet')
else:
return pd.concat([
load_data(stock_id, stem, DataBlock.TRAIN),
load_data(stock_id, stem, DataBlock.TEST)
]).reset_index(drop=True)
def load_book(stock_id: int, block: DataBlock=DataBlock.TRAIN) -> pd.DataFrame:
return load_data(stock_id, 'book', block)
def load_trade(stock_id: int, block=DataBlock.TRAIN) -> pd.DataFrame:
return load_data(stock_id, 'trade', block)
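# raw feature helpers: weighted average prices (WAP), realized volatility and log returns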
def calc_wap1(df: pd.DataFrame) -> pd.Series:
wap = (df['bid_price1'] * df['ask_size1'] + df['ask_price1'] * df['bid_size1']) / (df['bid_size1'] + df['ask_size1'])
return wap
def calc_wap2(df: pd.DataFrame) -> pd.Series:
wap = (df['bid_price2'] * df['ask_size2'] + df['ask_price2'] * df['bid_size2']) / (df['bid_size2'] + df['ask_size2'])
return wap
def realized_volatility(series):
return np.sqrt(np.sum(series**2))
def log_return(series: pd.Series) -> pd.Series:
    return np.log(series).diff()
def log_return_df2(series: pd.Series) -> pd.Series:
    return np.log(series).diff(2)
def flatten_name(prefix, src_names):
ret = []
for c in src_names:
if c[0] in ['time_id', 'stock_id']:
ret.append(c[0])
else:
ret.append('.'.join([prefix] + list(c)))
return ret
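# per-stock book features: WAP, log-return, spread and volume stats aggregated per time_id over the full bucket and over seconds_in_bucket >= 450/300/150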
def make_book_feature(stock_id, block = DataBlock.TRAIN):
book = load_book(stock_id, block)
book['wap1'] = calc_wap1(book)
book['wap2'] = calc_wap2(book)
book['log_return1'] = book.groupby(['time_id'])['wap1'].apply(log_return)
book['log_return2'] = book.groupby(['time_id'])['wap2'].apply(log_return)
book['log_return_ask1'] = book.groupby(['time_id'])['ask_price1'].apply(log_return)
book['log_return_ask2'] = book.groupby(['time_id'])['ask_price2'].apply(log_return)
book['log_return_bid1'] = book.groupby(['time_id'])['bid_price1'].apply(log_return)
book['log_return_bid2'] = book.groupby(['time_id'])['bid_price2'].apply(log_return)
book['wap_balance'] = abs(book['wap1'] - book['wap2'])
book['price_spread'] = (book['ask_price1'] - book['bid_price1']) / ((book['ask_price1'] + book['bid_price1']) / 2)
book['bid_spread'] = book['bid_price1'] - book['bid_price2']
book['ask_spread'] = book['ask_price1'] - book['ask_price2']
book['total_volume'] = (book['ask_size1'] + book['ask_size2']) + (book['bid_size1'] + book['bid_size2'])
book['volume_imbalance'] = abs((book['ask_size1'] + book['ask_size2']) - (book['bid_size1'] + book['bid_size2']))
features = {
'seconds_in_bucket': ['count'],
'wap1': [np.sum, np.mean, np.std],
'wap2': [np.sum, np.mean, np.std],
'log_return1': [np.sum, realized_volatility, np.mean, np.std],
'log_return2': [np.sum, realized_volatility, np.mean, np.std],
'log_return_ask1': [np.sum, realized_volatility, np.mean, np.std],
'log_return_ask2': [np.sum, realized_volatility, np.mean, np.std],
'log_return_bid1': [np.sum, realized_volatility, np.mean, np.std],
'log_return_bid2': [np.sum, realized_volatility, np.mean, np.std],
'wap_balance': [np.sum, np.mean, np.std],
'price_spread':[np.sum, np.mean, np.std],
'bid_spread':[np.sum, np.mean, np.std],
'ask_spread':[np.sum, np.mean, np.std],
'total_volume':[np.sum, np.mean, np.std],
'volume_imbalance':[np.sum, np.mean, np.std]
}
agg = book.groupby('time_id').agg(features).reset_index(drop=False)
agg.columns = flatten_name('book', agg.columns)
agg['stock_id'] = stock_id
for time in [450, 300, 150]:
d = book[book['seconds_in_bucket'] >= time].groupby('time_id').agg(features).reset_index(drop=False)
d.columns = flatten_name(f'book_{time}', d.columns)
agg = pd.merge(agg, d, on='time_id', how='left')
return agg
def make_trade_feature(stock_id, block = DataBlock.TRAIN):
trade = load_trade(stock_id, block)
trade['log_return'] = trade.groupby('time_id')['price'].apply(log_return)
features = {
'log_return':[realized_volatility],
'seconds_in_bucket':['count'],
'size':[np.sum],
'order_count':[np.mean],
}
agg = trade.groupby('time_id').agg(features).reset_index()
agg.columns = flatten_name('trade', agg.columns)
agg['stock_id'] = stock_id
for time in [450, 300, 150]:
d = trade[trade['seconds_in_bucket'] >= time].groupby('time_id').agg(features).reset_index(drop=False)
d.columns = flatten_name(f'trade_{time}', d.columns)
agg = pd.merge(agg, d, on='time_id', how='left')
return agg
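# v2 book feature: tick size per time_id, estimated as the smallest gap between distinct quoted price levels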
def make_book_feature_v2(stock_id, block = DataBlock.TRAIN):
book = load_book(stock_id, block)
prices = book.set_index('time_id')[['bid_price1', 'ask_price1', 'bid_price2', 'ask_price2']]
time_ids = list(set(prices.index))
ticks = {}
for tid in time_ids:
try:
price_list = prices.loc[tid].values.flatten()
price_diff = sorted(np.diff(sorted(set(price_list))))
ticks[tid] = price_diff[0]
except Exception:
print_trace(f'tid={tid}')
ticks[tid] = np.nan
dst = pd.DataFrame()
dst['time_id'] = np.unique(book['time_id'])
dst['stock_id'] = stock_id
dst['tick_size'] = dst['time_id'].map(ticks)
return dst
def make_features(base, block):
stock_ids = set(base['stock_id'])
with timer('books'):
books = Parallel(n_jobs=-1)(delayed(make_book_feature)(i, block) for i in stock_ids)
book = pd.concat(books)
with timer('trades'):
trades = Parallel(n_jobs=-1)(delayed(make_trade_feature)(i, block) for i in stock_ids)
trade = pd.concat(trades)
with timer('extra features'):
df = pd.merge(base, book, on=['stock_id', 'time_id'], how='left')
df = pd.merge(df, trade, on=['stock_id', 'time_id'], how='left')
#df = make_extra_features(df)
return df
def make_features_v2(base, block):
stock_ids = set(base['stock_id'])
with timer('books(v2)'):
books = Parallel(n_jobs=-1)(delayed(make_book_feature_v2)(i, block) for i in stock_ids)
book_v2 = pd.concat(books)
d = pd.merge(base, book_v2, on=['stock_id', 'time_id'], how='left')
return d
if USE_PRECOMPUTE_FEATURES:
with timer('load feather'):
df = pd.read_feather(os.path.join(DATA_DIR, 'optiver-df2', 'features_v2.f'))
else:
df = make_features(train, DataBlock.TRAIN)
# v2
df = make_features_v2(df, DataBlock.TRAIN)
df.to_feather('features_v2.f') # save cache
test = pd.read_csv(os.path.join(DATA_DIR, 'optiver-realized-volatility-prediction', 'test.csv'))
if len(test) == 3:
print('is 1st stage')
IS_1ST_STAGE = True
if IS_1ST_STAGE and MEMORY_TEST_MODE:
    print('use a copy of the training data as test data to imitate 2nd-stage RAM usage.')
test_df = df.iloc[:170000].copy()
test_df['time_id'] += 32767
test_df['row_id'] = ''
else:
test_df = make_features(test, DataBlock.TEST)
test_df = make_features_v2(test_df, DataBlock.TEST)
print(df.shape)
print(test_df.shape)
df = pd.concat([df, test_df.drop('row_id', axis=1)]).reset_index(drop=True)
N_NEIGHBORS_MAX = 80
def make_neighbors(df, k_neighbors, feature_col, n=5):
feature_pivot = df.pivot('time_id', 'stock_id', feature_col)
feature_pivot = feature_pivot.fillna(feature_pivot.mean())
neighbors = np.zeros((n, *feature_pivot.shape))
for i in range(n):
neighbors[i, :, :] += feature_pivot.values[k_neighbors[:, i], :]
return feature_pivot, neighbors
def make_neighbors_stock(df, k_neighbors, feature_col, n=5):
feature_pivot = df.pivot('time_id', 'stock_id', feature_col)
feature_pivot = feature_pivot.fillna(feature_pivot.mean())
neighbors = np.zeros((n, *feature_pivot.shape))
for i in range(n):
neighbors[i, :, :] += feature_pivot.values[:, k_neighbors[:, i]]
return feature_pivot, neighbors
def make_nn_feature(df, neighbors, columns, index, feature_col,
n=5, agg=np.mean, postfix='',
exclude_self=False, exact=False):
start = 1 if exclude_self else 0
if exact:
pivot_aggs = pd.DataFrame(neighbors[n-1,:,:], columns=columns, index=index)
else:
pivot_aggs = pd.DataFrame(agg(neighbors[start:n,:,:], axis=0), columns=columns, index=index)
dst = pivot_aggs.unstack().reset_index()
dst.columns = ['stock_id', 'time_id', f'{feature_col}_nn{n}{postfix}_{agg.__name__}']
return dst
class Neighbors:
def __init__(self, pivot, p, metric='minkowski', metric_params=None):
nn = NearestNeighbors(n_neighbors=N_NEIGHBORS_MAX, p=p, metric=metric, metric_params=metric_params)
nn.fit(pivot)
self.distances, self.neighbors = nn.kneighbors(pivot, return_distance=True)
# tau itself is meaningless for GBDT, but it is useful as an input to aggregate in the nearest-neighbor features
df['trade.tau'] = np.sqrt(1 / df['trade.seconds_in_bucket.count'])
df['trade_150.tau'] = np.sqrt(1 / df['trade_150.seconds_in_bucket.count'])
df['book.tau'] = np.sqrt(1 / df['book.seconds_in_bucket.count'])
df['real_price'] = 0.01 / df['tick_size']
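# fit NearestNeighbors on time_id x stock_id pivots of normalized price / volatility / volume with several metrics;
# the neighbor indices are reused below for time-id and stock-id aggregations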
with timer('knn fit'):
df_pv = df[['stock_id', 'time_id']].copy()
df_pv['price'] = 0.01 / df['tick_size']
df_pv['vol'] = df['book.log_return1.realized_volatility']
df_pv['trade.tau'] = df['trade.tau']
    df_pv['trade.size.sum'] = df['book.total_volume.sum']  # note: despite the name, this column holds book.total_volume.sum
pivot = df_pv.pivot('time_id', 'stock_id', 'price')
pivot = pivot.fillna(pivot.mean())
pivot = pd.DataFrame(minmax_scale(pivot))
k_neighbors_time_price_c = Neighbors(pivot, 2, metric='canberra')
k_neighbors_time_price_m = Neighbors(pivot, 2, metric='mahalanobis', metric_params={'V':np.cov(pivot.values.T)})
k_neighbors_stock_price = Neighbors(minmax_scale(pivot.transpose()), 1)
pivot = df_pv.pivot('time_id', 'stock_id', 'vol')
pivot = pivot.fillna(pivot.mean())
pivot = pd.DataFrame(minmax_scale(pivot))
k_neighbors_time_vol = Neighbors(pivot, 1)
k_neighbors_stock_vol = Neighbors(minmax_scale(pivot.transpose()), 1)
pivot = df_pv.pivot('time_id', 'stock_id', 'trade.size.sum')
pivot = pivot.fillna(pivot.mean())
pivot = pd.DataFrame(minmax_scale(pivot))
k_neighbors_time_size_m = Neighbors(pivot, 2, metric='mahalanobis', metric_params={'V':np.cov(pivot.values.T)})
k_neighbors_time_size_c = Neighbors(pivot, 2, metric='canberra')
k_neighbors_stock_size = Neighbors(minmax_scale(pivot.transpose()), 1)
# features with large changes over time are converted to relative ranks within time-id
df['trade.order_count.mean'] = df.groupby('time_id')['trade.order_count.mean'].rank()
df['book.total_volume.sum'] = df.groupby('time_id')['book.total_volume.sum'].rank()
df['book.total_volume.mean'] = df.groupby('time_id')['book.total_volume.mean'].rank()
df['book.total_volume.std'] = df.groupby('time_id')['book.total_volume.std'].rank()
df['trade.tau'] = df.groupby('time_id')['trade.tau'].rank()
for dt in [150, 300, 450]:
df[f'book_{dt}.total_volume.sum'] = df.groupby('time_id')[f'book_{dt}.total_volume.sum'].rank()
df[f'book_{dt}.total_volume.mean'] = df.groupby('time_id')[f'book_{dt}.total_volume.mean'].rank()
df[f'book_{dt}.total_volume.std'] = df.groupby('time_id')[f'book_{dt}.total_volume.std'].rank()
df[f'trade_{dt}.order_count.mean'] = df.groupby('time_id')[f'trade_{dt}.order_count.mean'].rank()
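# aggregate each base feature over the nearest stock_ids (price/vol similarity) and the nearest time_ids (price/vol/size similarity) found above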
def make_nearest_neighbor_feature(df: pd.DataFrame) -> pd.DataFrame:
df2 = df.copy()
print(df2.shape)
feature_cols_stock = {
'book.log_return1.realized_volatility': [np.mean, np.min, np.max, np.std],
'trade.seconds_in_bucket.count': [np.mean],
'trade.tau': [np.mean],
'trade_150.tau': [np.mean],
'book.tau': [np.mean],
'trade.size.sum': [np.mean],
'book.seconds_in_bucket.count': [np.mean],
}
feature_cols = {
'book.log_return1.realized_volatility': [np.mean, np.min, np.max, np.std],
'real_price': [np.max, np.mean, np.min],
'trade.seconds_in_bucket.count': [np.mean],
'trade.tau': [np.mean],
'trade.size.sum': [np.mean],
'book.seconds_in_bucket.count': [np.mean],
        'trade_150.tau_nn20_sv_mean': [np.mean],  # mean over nearby time_ids of "the average tau over the last 300 sec among the 20 stocks with similar volatility"
'trade.size.sum_nn20_sv_mean': [np.mean],
}
time_id_neigbor_sizes = [3, 5, 10, 20, 40]
time_id_neigbor_sizes_vol = [2, 3, 5, 10, 20, 40]
stock_id_neighbor_sizes = [10, 20, 40]
ndf = None
cols = []
def _add_ndf(ndf, dst):
if ndf is None:
return dst
else:
ndf[dst.columns[-1]] = dst[dst.columns[-1]].astype(np.float32)
return ndf
# neighbor stock_id
for feature_col in feature_cols_stock.keys():
try:
feature_pivot, neighbors_stock_price = make_neighbors_stock(df2, k_neighbors_stock_price.neighbors, feature_col, n=N_NEIGHBORS_MAX)
_, neighbors_stock_vol = make_neighbors_stock(df2, k_neighbors_stock_vol.neighbors, feature_col, n=N_NEIGHBORS_MAX)
#_, neighbors_stock_size = make_neighbors_stock(df2, k_neighbors_stock_size.neighbors, feature_col, n=N_NEIGHBORS_MAX)
columns = feature_pivot.columns
index = feature_pivot.index
for agg in feature_cols_stock[feature_col]:
for n in stock_id_neighbor_sizes:
exclude_self = True
exact = False
try:
dst = make_nn_feature(df2, neighbors_stock_price, columns, index, feature_col, n=n, agg=agg, postfix='_s',
exclude_self=exclude_self, exact=exact)
ndf = _add_ndf(ndf, dst)
dst = make_nn_feature(df2, neighbors_stock_vol, columns, index, feature_col, n=n, agg=agg, postfix='_sv',
exclude_self=exclude_self, exact=exact)
ndf = _add_ndf(ndf, dst)
except Exception:
print_trace('stock-id nn')
pass
del feature_pivot, neighbors_stock_price, neighbors_stock_vol
except Exception:
print_trace('stock-id nn')
pass
df2 = pd.merge(df2, ndf, on=['time_id', 'stock_id'], how='left')
ndf = None
print(df2.shape)
# neighbor time_id
for feature_col in feature_cols.keys():
try:
feature_pivot, neighbors_price_c = make_neighbors(df2, k_neighbors_time_price_c.neighbors, feature_col, n=N_NEIGHBORS_MAX)
_, neighbors_price_m = make_neighbors(df2, k_neighbors_time_price_m.neighbors, feature_col, n=N_NEIGHBORS_MAX)
_, neighbors_vol = make_neighbors(df2, k_neighbors_time_vol.neighbors, feature_col, n=N_NEIGHBORS_MAX)
_, neighbors_size_m = make_neighbors(df2, k_neighbors_time_size_m.neighbors, feature_col, n=N_NEIGHBORS_MAX)
_, neighbors_size_c = make_neighbors(df2, k_neighbors_time_size_c.neighbors, feature_col, n=N_NEIGHBORS_MAX)
columns = feature_pivot.columns
index = feature_pivot.index
if 'volatility' in feature_col:
time_id_ns = time_id_neigbor_sizes_vol
else:
time_id_ns = time_id_neigbor_sizes
for agg in feature_cols[feature_col]:
for n in time_id_ns:
exclude_self = True #n >= 10
exclude_self2 = False
exact = False
try:
dst = make_nn_feature(df2, neighbors_price_c, columns, index, feature_col, n=n, agg=agg, postfix='_price_c',
exclude_self=exclude_self, exact=exact)
ndf = _add_ndf(ndf, dst)
dst = make_nn_feature(df2, neighbors_price_m, columns, index, feature_col, n=n, agg=agg, postfix='_price_m',
exclude_self=exclude_self2, exact=exact)
ndf = _add_ndf(ndf, dst)
dst = make_nn_feature(df2, neighbors_vol, columns, index, feature_col, n=n, agg=agg, postfix='_v',
exclude_self=exclude_self2, exact=exact)
ndf = _add_ndf(ndf, dst)
dst = make_nn_feature(df2, neighbors_size_m, columns, index, feature_col, n=n, agg=agg, postfix='_size_m',
exclude_self=exclude_self2, exact=exact)
ndf = _add_ndf(ndf, dst)
dst = make_nn_feature(df2, neighbors_size_c, columns, index, feature_col, n=n, agg=agg, postfix='_size_c',
exclude_self=exclude_self2, exact=exact)
ndf = _add_ndf(ndf, dst)
except Exception:
print_trace('time-id nn')
cols.append(dst.columns[-1])
del feature_pivot, neighbors_price_c, neighbors_price_m, neighbors_vol, neighbors_size_m, neighbors_size_c
except Exception:
print_trace('time-id nn')
df2 = pd.merge(df2, ndf, on=['time_id', 'stock_id'], how='left')
# features further derived from nearest neighbor features
try:
for sz in time_id_neigbor_sizes:
df2[f'real_price_rankmin_{sz}'] = df2['real_price'] / df2[f"real_price_nn{sz}_price_c_amin"]
df2[f'real_price_rankmax_{sz}'] = df2['real_price'] / df2[f"real_price_nn{sz}_price_c_amax"]
df2[f'real_price_rankmean_{sz}'] = df2['real_price'] / df2[f"real_price_nn{sz}_price_c_mean"]
for sz in time_id_neigbor_sizes_vol:
df2[f'vol_rankmin_{sz}'] = df2['book.log_return1.realized_volatility'] / df2[f"book.log_return1.realized_volatility_nn{sz}_price_c_amin"]
df2[f'vol_rankmax_{sz}'] = df2['book.log_return1.realized_volatility'] / df2[f"book.log_return1.realized_volatility_nn{sz}_price_c_amax"]
price_cols = [c for c in df2.columns if 'real_price' in c and 'rank' not in c]
for c in price_cols:
del df2[c]
for sz in time_id_neigbor_sizes_vol:
tgt = f'book.log_return1.realized_volatility_nn{sz}_price_m_mean'
df2[f'{tgt}_rank'] = df2.groupby('time_id')[tgt].rank()
except Exception:
print_trace('nn features')
return df2
gc.collect()
with timer('make nearest neighbor feature'):
df2 = make_nearest_neighbor_feature(df)
print(df2.shape)
df2.reset_index(drop=True).to_feather('optiver_df2.f')
gc.collect()
# skew correction for NN
cols_to_log = [
'trade.size.sum',
'trade_150.size.sum',
'trade_300.size.sum',
'trade_450.size.sum',
'volume_imbalance'
]
for c in df2.columns:
for check in cols_to_log:
try:
if check in c:
df2[c] = np.log(df2[c]+1)
break
except Exception:
print_trace('log1p')
# Rolling average of RV for similar trading volume
try:
df2.sort_values(by=['stock_id', 'book.total_volume.sum'], inplace=True)
df2.reset_index(drop=True, inplace=True)
df2['realized_volatility_roll3_by_book.total_volume.mean'] = df2.groupby('stock_id')['book.log_return1.realized_volatility'].rolling(3, center=True, min_periods=1).mean().reset_index().sort_values(by=['level_1'])['book.log_return1.realized_volatility'].values
df2['realized_volatility_roll10_by_book.total_volume.mean'] = df2.groupby('stock_id')['book.log_return1.realized_volatility'].rolling(10, center=True, min_periods=1).mean().reset_index().sort_values(by=['level_1'])['book.log_return1.realized_volatility'].values
except Exception:
print_trace('mean RV')
# stock-id embedding (helps little)
try:
lda_n = 3
lda = LatentDirichletAllocation(n_components=lda_n, random_state=0)
stock_id_emb = pd.DataFrame(lda.fit_transform(pivot.transpose()), index=df_pv.pivot('time_id', 'stock_id', 'vol').columns)
for i in range(lda_n):
df2[f'stock_id_emb{i}'] = df2['stock_id'].map(stock_id_emb[i])
except Exception:
print_trace('LDA')
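# split the combined frame back into train and test rows by the presence of the target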
df_train = df2[~df2.target.isnull()].copy()
df_test = df2[df2.target.isnull()].copy()
del df2, df_pv
gc.collect()
def calc_price2(df):
tick = sorted(np.diff(sorted(np.unique(df.values.flatten()))))[0]
return 0.01 / tick
def calc_prices(r):
df = pd.read_parquet(r.book_path, columns=['time_id', 'ask_price1', 'ask_price2', 'bid_price1', 'bid_price2'])
df = df.set_index('time_id')
df = df.groupby(level='time_id').apply(calc_price2).to_frame('price').reset_index()
df['stock_id'] = r.stock_id
return df
def sort_manifold(df, clf):
df_ = df.set_index('time_id')
df_ = pd.DataFrame(minmax_scale(df_.fillna(df_.mean())))
X_compoents = clf.fit_transform(df_)
return np.argsort(X_compoents[:, 0]), X_compoents
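# recover an ordering of time_ids by embedding the tick-size-implied price levels with 1-d t-SNE, then orient the result using stock_id=61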
def reconstruct_time_id_order():
with timer('load files'):
df_files = pd.DataFrame(
{'book_path': glob.glob('/kaggle/input/optiver-realized-volatility-prediction/book_train.parquet/**/*.parquet')}) \
.eval('stock_id = book_path.str.extract("stock_id=(\d+)").astype("int")', engine='python')
df_target = pd.read_csv('/kaggle/input/optiver-realized-volatility-prediction/train.csv')
df_target = df_target.groupby('time_id').target.mean()
with timer('calc prices'):
df_prices = pd.concat(Parallel(n_jobs=4, verbose=51)(delayed(calc_prices)(r) for _, r in df_files.iterrows()))
df_prices = df_prices.pivot('time_id', 'stock_id', 'price')
df_prices.columns = [f'stock_id={i}' for i in df_prices.columns]
df_prices = df_prices.reset_index(drop=False)
with timer('t-SNE(400) -> 50'):
clf = TSNE(n_components=1, perplexity=400, random_state=0, n_iter=2000)
order, X_compoents = sort_manifold(df_prices, clf)
clf = TSNE(n_components=1, perplexity=50, random_state=0, init=X_compoents, n_iter=2000, method='exact')
order, X_compoents = sort_manifold(df_prices, clf)
df_ordered = df_prices.reindex(order).reset_index(drop=True)
if df_ordered['stock_id=61'].iloc[0] > df_ordered['stock_id=61'].iloc[-1]:
df_ordered = df_ordered.reindex(df_ordered.index[::-1]).reset_index(drop=True)
# AMZN
plt.plot(df_ordered['stock_id=61'])
return df_ordered[['time_id']]
with timer('calculate order of time-id'):
if USE_PRECOMPUTE_FEATURES:
timeid_order = pd.read_csv(os.path.join(DATA_DIR, 'optiver-time-id-ordered', 'time_id_order.csv'))
else:
timeid_order = reconstruct_time_id_order()
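# 4 time-ordered folds: each validation block is the next 383 time_ids after its training range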
with timer('make folds'):
timeid_order['time_id_order'] = np.arange(len(timeid_order))
df_train['time_id_order'] = df_train['time_id'].map(timeid_order.set_index('time_id')['time_id_order'])
df_train = df_train.sort_values(['time_id_order', 'stock_id']).reset_index(drop=True)
folds_border = [3830 - 383*4, 3830 - 383*3, 3830 - 383*2, 3830 - 383*1]
time_id_orders = df_train['time_id_order']
folds = []
for i, border in enumerate(folds_border):
idx_train = np.where(time_id_orders < border)[0]
idx_valid = np.where((border <= time_id_orders) & (time_id_orders < border + 383))[0]
folds.append((idx_train, idx_valid))
print(f"folds{i}: train={len(idx_train)}, valid={len(idx_valid)}")
del df_train['time_id_order']
def rmspe(y_true, y_pred):
return (np.sqrt(np.mean(np.square((y_true - y_pred) / y_true))))
def feval_RMSPE(preds, train_data):
labels = train_data.get_label()
return 'RMSPE', round(rmspe(y_true = labels, y_pred = preds),5), False
# from: https://blog.amedama.jp/entry/lightgbm-cv-feature-importance
def plot_importance(cvbooster, figsize=(10, 10)):
raw_importances = cvbooster.feature_importance(importance_type='gain')
feature_name = cvbooster.boosters[0].feature_name()
importance_df = pd.DataFrame(data=raw_importances,
columns=feature_name)
    # sort features by mean importance
sorted_indices = importance_df.mean(axis=0).sort_values(ascending=False).index
sorted_importance_df = importance_df.loc[:, sorted_indices]
    # plot the top features
PLOT_TOP_N = 80
plot_cols = sorted_importance_df.columns[:PLOT_TOP_N]
_, ax = plt.subplots(figsize=figsize)
ax.grid()
ax.set_xscale('log')
ax.set_ylabel('Feature')
ax.set_xlabel('Importance')
sns.boxplot(data=sorted_importance_df[plot_cols],
orient='h',
ax=ax)
plt.show()
def get_X(df_src):
cols = [c for c in df_src.columns if c not in ['time_id', 'target', 'tick_size']]
return df_src[cols]
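# averages the predictions of several LightGBM boosters (optionally weighted)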
class EnsembleModel:
def __init__(self, models: List[lgb.Booster], weights: Optional[List[float]] = None):
self.models = models
self.weights = weights
features = list(self.models[0].feature_name())
for m in self.models[1:]:
assert features == list(m.feature_name())
def predict(self, x):
predicted = np.zeros((len(x), len(self.models)))
for i, m in enumerate(self.models):
w = self.weights[i] if self.weights is not None else 1
predicted[:, i] = w * m.predict(x)
ttl = np.sum(self.weights) if self.weights is not None else len(self.models)
return np.sum(predicted, axis=1) / ttl
def feature_name(self) -> List[str]:
return self.models[0].feature_name()
lr = GBDT_LR
if SHORTCUT_GBDT_IN_1ST_STAGE and IS_1ST_STAGE:
# to save GPU quota
lr = 0.3
params = {
'objective': 'regression',
'verbose': 0,
'metric': '',
'reg_alpha': 5,
'reg_lambda': 5,
'min_data_in_leaf': 1000,
'max_depth': -1,
'num_leaves': 128,
'colsample_bytree': 0.3,
'learning_rate': lr
}
X = get_X(df_train)
y = df_train['target']
X.to_feather('X.f')
df_train[['target']].to_feather('y.f')
gc.collect()
print(X.shape)
if PREDICT_GBDT:
ds = lgb.Dataset(X, y, weight=1/np.power(y, 2))
with timer('lgb.cv'):
ret = lgb.cv(params, ds, num_boost_round=8000, folds=folds, #cv,
feval=feval_RMSPE, stratified=False,
return_cvbooster=True, verbose_eval=20,
early_stopping_rounds=int(40*0.1/lr))
print(f"# overall RMSPE: {ret['RMSPE-mean'][-1]}")
best_iteration = len(ret['RMSPE-mean'])
for i in range(len(folds)):
y_pred = ret['cvbooster'].boosters[i].predict(X.iloc[folds[i][1]], num_iteration=best_iteration)
y_true = y.iloc[folds[i][1]]
print(f"# fold{i} RMSPE: {rmspe(y_true, y_pred)}")
if i == len(folds) - 1:
np.save('pred_gbdt.npy', y_pred)
plot_importance(ret['cvbooster'], figsize=(10, 20))
boosters = []
with timer('retraining'):
for i in range(GBDT_NUM_MODELS):
params['seed'] = i
boosters.append(lgb.train(params, ds, num_boost_round=int(1.1*best_iteration)))
booster = EnsembleModel(boosters)
del ret
del ds
gc.collect()
import gc
import os
import pickle
import random
from typing import List, Tuple, Optional, Union
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler, QuantileTransformer
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from joblib import Parallel, delayed
from scipy.interpolate import interp1d
from scipy.special import erf, erfinv
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import FLOAT_DTYPES, check_array, check_is_fitted
from sklearn.decomposition import PCA
from pytorch_tabnet.metrics import Metric
from pytorch_tabnet.tab_model import TabNetRegressor
from torch.optim.lr_scheduler import ReduceLROnPlateau, CosineAnnealingWarmRestarts
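# realized-volatility columns whose missingness is turned into *_isnull indicator features in preprocess_nn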
null_check_cols = [
'book.log_return1.realized_volatility',
'book_150.log_return1.realized_volatility',
'book_300.log_return1.realized_volatility',
'book_450.log_return1.realized_volatility',
'trade.log_return.realized_volatility',
'trade_150.log_return.realized_volatility',
'trade_300.log_return.realized_volatility',
'trade_450.log_return.realized_volatility'
]
def seed_everything(seed=42):
random.seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
def rmspe_metric(y_true, y_pred):
rmspe = np.sqrt(np.mean(np.square((y_true - y_pred) / y_true)))
return rmspe
def rmspe_loss(y_true, y_pred):
rmspe = torch.sqrt(torch.mean(torch.square((y_true - y_pred) / y_true)))
return rmspe
class RMSPE(Metric):
def __init__(self):
self._name = "rmspe"
self._maximize = False
def __call__(self, y_true, y_score):
return np.sqrt(np.mean(np.square((y_true - y_score) / y_true)))
def RMSPELoss_Tabnet(y_pred, y_true):
return torch.sqrt(torch.mean( ((y_true - y_pred) / y_true) ** 2 )).clone()
class AverageMeter:
"""Computes and stores the average and current value"""
    def __init__(self):
        self.reset()
def reset(self):
self.val = 0
self.avg = 0
self.sum = 0
self.count = 0
def update(self, val, n=1):
self.val = val
self.sum += val * n
self.count += n
self.avg = self.sum / self.count
class TabularDataset(Dataset):
def __init__(self, x_num: np.ndarray, x_cat: np.ndarray, y: Optional[np.ndarray]):
super().__init__()
self.x_num = x_num
self.x_cat = x_cat
self.y = y
def __len__(self):
return len(self.x_num)
def __getitem__(self, idx):
if self.y is None:
return self.x_num[idx], torch.LongTensor(self.x_cat[idx])
else:
return self.x_num[idx], torch.LongTensor(self.x_cat[idx]), self.y[idx]
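# MLP head: numeric features concatenated with categorical (stock_id) embeddings, two hidden layers with optional batch norm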
class MLP(nn.Module):
def __init__(self,
src_num_dim: int,
n_categories: List[int],
dropout: float = 0.0,
hidden: int = 50,
emb_dim: int = 10,
dropout_cat: float = 0.2,
bn: bool = False):
super().__init__()
self.embs = nn.ModuleList([
nn.Embedding(x, emb_dim) for x in n_categories])
self.cat_dim = emb_dim * len(n_categories)
self.dropout_cat = nn.Dropout(dropout_cat)
if bn:
self.sequence = nn.Sequential(
nn.Linear(src_num_dim + self.cat_dim, hidden),
nn.Dropout(dropout),
nn.BatchNorm1d(hidden),
nn.ReLU(),
nn.Linear(hidden, hidden),
nn.Dropout(dropout),
nn.BatchNorm1d(hidden),
nn.ReLU(),
nn.Linear(hidden, 1)
)
else:
self.sequence = nn.Sequential(
nn.Linear(src_num_dim + self.cat_dim, hidden),
nn.Dropout(dropout),
nn.ReLU(),
nn.Linear(hidden, hidden),
nn.Dropout(dropout),
nn.ReLU(),
nn.Linear(hidden, 1)
)
def forward(self, x_num, x_cat):
embs = [embedding(x_cat[:, i]) for i, embedding in enumerate(self.embs)]
x_cat_emb = self.dropout_cat(torch.cat(embs, 1))
x_all = torch.cat([x_num, x_cat_emb], 1)
x = self.sequence(x_all)
return torch.squeeze(x)
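# 1d-CNN head: expand to hidden_size, reshape into channel_1 channels, conv blocks (optionally a second, gated stage), pool, flatten and project to a single target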
class CNN(nn.Module):
def __init__(self,
num_features: int,
hidden_size: int,
n_categories: List[int],
emb_dim: int = 10,
dropout_cat: float = 0.2,
channel_1: int = 256,
channel_2: int = 512,
channel_3: int = 512,
dropout_top: float = 0.1,
dropout_mid: float = 0.3,
dropout_bottom: float = 0.2,
weight_norm: bool = True,
two_stage: bool = True,
celu: bool = True,
kernel1: int = 5,
leaky_relu: bool = False):
super().__init__()
num_targets = 1
cha_1_reshape = int(hidden_size / channel_1)
cha_po_1 = int(hidden_size / channel_1 / 2)
cha_po_2 = int(hidden_size / channel_1 / 2 / 2) * channel_3
self.cat_dim = emb_dim * len(n_categories)
self.cha_1 = channel_1
self.cha_2 = channel_2
self.cha_3 = channel_3
self.cha_1_reshape = cha_1_reshape
self.cha_po_1 = cha_po_1
self.cha_po_2 = cha_po_2
self.two_stage = two_stage
self.expand = nn.Sequential(
nn.BatchNorm1d(num_features + self.cat_dim),
nn.Dropout(dropout_top),
nn.utils.weight_norm(nn.Linear(num_features + self.cat_dim, hidden_size), dim=None),
nn.CELU(0.06) if celu else nn.ReLU()
)
def _norm(layer, dim=None):
return nn.utils.weight_norm(layer, dim=dim) if weight_norm else layer
self.conv1 = nn.Sequential(
nn.BatchNorm1d(channel_1),
nn.Dropout(dropout_top),
_norm(nn.Conv1d(channel_1, channel_2, kernel_size=kernel1, stride=1, padding=kernel1 // 2, bias=False)),
nn.ReLU(),
nn.AdaptiveAvgPool1d(output_size=cha_po_1),
nn.BatchNorm1d(channel_2),
nn.Dropout(dropout_top),
_norm(nn.Conv1d(channel_2, channel_2, kernel_size=3, stride=1, padding=1, bias=True)),
nn.ReLU()
)
if self.two_stage:
self.conv2 = nn.Sequential(
nn.BatchNorm1d(channel_2),
nn.Dropout(dropout_mid),
_norm(nn.Conv1d(channel_2, channel_2, kernel_size=3, stride=1, padding=1, bias=True)),
nn.ReLU(),
nn.BatchNorm1d(channel_2),
nn.Dropout(dropout_bottom),
_norm(nn.Conv1d(channel_2, channel_3, kernel_size=5, stride=1, padding=2, bias=True)),
nn.ReLU()
)
self.max_po_c2 = nn.MaxPool1d(kernel_size=4, stride=2, padding=1)
self.flt = nn.Flatten()
if leaky_relu:
self.dense = nn.Sequential(
nn.BatchNorm1d(cha_po_2),
nn.Dropout(dropout_bottom),
_norm(nn.Linear(cha_po_2, num_targets), dim=0),
nn.LeakyReLU()
)
else:
self.dense = nn.Sequential(
nn.BatchNorm1d(cha_po_2),
nn.Dropout(dropout_bottom),
_norm(nn.Linear(cha_po_2, num_targets), dim=0)
)
self.embs = nn.ModuleList([nn.Embedding(x, emb_dim) for x in n_categories])
self.dropout_cat = nn.Dropout(dropout_cat)
def forward(self, x_num, x_cat):
embs = [embedding(x_cat[:, i]) for i, embedding in enumerate(self.embs)]
x_cat_emb = self.dropout_cat(torch.cat(embs, 1))
x = torch.cat([x_num, x_cat_emb], 1)
x = self.expand(x)
x = x.reshape(x.shape[0], self.cha_1, self.cha_1_reshape)
x = self.conv1(x)
if self.two_stage:
x = self.conv2(x) * x
x = self.max_po_c2(x)
x = self.flt(x)
x = self.dense(x)
return torch.squeeze(x)
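# NN preprocessing: add *_isnull indicators, split numeric/categorical columns, standardize numerics (NaN/inf -> 0), optional PCA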
def preprocess_nn(
X: pd.DataFrame,
scaler: Optional[StandardScaler] = None,
scaler_type: str = 'standard',
n_pca: int = -1,
na_cols: bool = True):
if na_cols:
#for c in X.columns:
for c in null_check_cols:
if c in X.columns:
X[f"{c}_isnull"] = X[c].isnull().astype(int)
cat_cols = [c for c in X.columns if c in ['time_id', 'stock_id']]
num_cols = [c for c in X.columns if c not in cat_cols]
X_num = X[num_cols].values.astype(np.float32)
X_cat = np.nan_to_num(X[cat_cols].values.astype(np.int32))
    def _pca(X_num_):
        if n_pca > 0:
            pca = PCA(n_components=n_pca, random_state=0)
            return pca.fit_transform(X_num_)
        return X_num_
if scaler is None:
scaler = StandardScaler()
X_num = scaler.fit_transform(X_num)
X_num = np.nan_to_num(X_num, posinf=0, neginf=0)
return _pca(X_num), X_cat, cat_cols, scaler
else:
        X_num = scaler.transform(X_num)  # TODO: is this OK even when values are inf?
X_num = np.nan_to_num(X_num, posinf=0, neginf=0)
return _pca(X_num), X_cat, cat_cols
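# one training epoch with RMSPE loss, gradient clipping and an optional per-step scheduler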
def train_epoch(data_loader: DataLoader,
model: nn.Module,
optimizer,
scheduler,
device,
clip_grad: float = 1.5):
model.train()
losses = AverageMeter()
step = 0
for x_num, x_cat, y in tqdm(data_loader, position=0, leave=True, desc='Training'):
batch_size = x_num.size(0)
x_num = x_num.to(device, dtype=torch.float)
x_cat = x_cat.to(device)
y = y.to(device, dtype=torch.float)
loss = rmspe_loss(y, model(x_num, x_cat))
losses.update(loss.detach().cpu().numpy(), batch_size)
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad)
optimizer.step()
optimizer.zero_grad()
if scheduler is not None:
scheduler.step()
step += 1
return losses.avg
def evaluate(data_loader: DataLoader, model, device):
model.eval()
losses = AverageMeter()
final_targets = []
final_outputs = []
with torch.no_grad():
for x_num, x_cat, y in tqdm(data_loader, position=0, leave=True, desc='Evaluating'):
batch_size = x_num.size(0)
x_num = x_num.to(device, dtype=torch.float)
x_cat = x_cat.to(device)
y = y.to(device, dtype=torch.float)
with torch.no_grad():
output = model(x_num, x_cat)
loss = rmspe_loss(y, output)
# record loss
losses.update(loss.detach().cpu().numpy(), batch_size)
targets = y.detach().cpu().numpy()
output = output.detach().cpu().numpy()
final_targets.append(targets)
final_outputs.append(output)
final_targets = np.concatenate(final_targets)
final_outputs = np.concatenate(final_outputs)
try:
metric = rmspe_metric(final_targets, final_outputs)
    except Exception:
metric = None
return final_outputs, final_targets, losses.avg, metric
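# batched inference for a list of MLP/CNN models; per-batch predictions are combined by mean or median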
def predict_nn(X: pd.DataFrame,
model: Union[List[MLP], MLP],
scaler: StandardScaler,
device,
ensemble_method='mean'):
if not isinstance(model, list):
model = [model]
for m in model:
m.eval()
X_num, X_cat, cat_cols = preprocess_nn(X.copy(), scaler=scaler)
valid_dataset = TabularDataset(X_num, X_cat, None)
valid_loader = torch.utils.data.DataLoader(valid_dataset,
batch_size=512,
shuffle=False,
num_workers=4)
final_outputs = []
with torch.no_grad():
for x_num, x_cat in tqdm(valid_loader, position=0, leave=True, desc='Evaluating'):
x_num = x_num.to(device, dtype=torch.float)
x_cat = x_cat.to(device)
outputs = []
with torch.no_grad():
for m in model:
output = m(x_num, x_cat)
outputs.append(output.detach().cpu().numpy())
if ensemble_method == 'median':
pred = np.nanmedian(np.array(outputs), axis=0)
else:
pred = np.array(outputs).mean(axis=0)
final_outputs.append(pred)
final_outputs = np.concatenate(final_outputs)
return final_outputs
def predict_tabnet(X: pd.DataFrame,
model: Union[List[TabNetRegressor], TabNetRegressor],
scaler: StandardScaler,
ensemble_method='mean'):
if not isinstance(model, list):
model = [model]
X_num, X_cat, cat_cols = preprocess_nn(X.copy(), scaler=scaler)
X_processed = np.concatenate([X_cat, X_num], axis=1)
predicted = []
for m in model:
predicted.append(m.predict(X_processed))
if ensemble_method == 'median':
pred = np.nanmedian(np.array(predicted), axis=0)
else:
pred = np.array(predicted).mean(axis=0)
return pred
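# train one TabNetRegressor per fold with RMSPE as loss and eval metric; returns validation scores, predictions, the scaler and the last model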
def train_tabnet(X: pd.DataFrame,
y: pd.DataFrame,
folds: List[Tuple],
batch_size: int = 1024,
lr: float = 1e-3,
model_path: str = 'fold_{}.pth',
scaler_type: str = 'standard',
output_dir: str = 'artifacts',
epochs: int = 250,
seed: int = 42,
n_pca: int = -1,
na_cols: bool = True,
patience: int = 10,
factor: float = 0.5,
gamma: float = 2.0,
lambda_sparse: float = 8.0,
n_steps: int = 2,
scheduler_type: str = 'cosine',
n_a: int = 16):
seed_everything(seed)
os.makedirs(output_dir, exist_ok=True)
y = y.values.astype(np.float32)
X_num, X_cat, cat_cols, scaler = preprocess_nn(X.copy(), scaler_type=scaler_type, n_pca=n_pca, na_cols=na_cols)
best_losses = []
best_predictions = []
for cv_idx, (train_idx, valid_idx) in enumerate(folds):
X_tr, X_va = X_num[train_idx], X_num[valid_idx]
X_tr_cat, X_va_cat = X_cat[train_idx], X_cat[valid_idx]
y_tr, y_va = y[train_idx], y[valid_idx]
y_tr = y_tr.reshape(-1,1)
y_va = y_va.reshape(-1,1)
X_tr = np.concatenate([X_tr_cat, X_tr], axis=1)
X_va = np.concatenate([X_va_cat, X_va], axis=1)
cat_idxs = [0]
cat_dims = [128]
if scheduler_type == 'cosine':
scheduler_params = dict(T_0=200, T_mult=1, eta_min=1e-4, last_epoch=-1, verbose=False)
scheduler_fn = CosineAnnealingWarmRestarts
else:
scheduler_params = {'mode': 'min', 'min_lr': 1e-7, 'patience': patience, 'factor': factor, 'verbose': True}
scheduler_fn = torch.optim.lr_scheduler.ReduceLROnPlateau
model = TabNetRegressor(
cat_idxs=cat_idxs,
cat_dims=cat_dims,
cat_emb_dim=1,
n_d=n_a,
n_a=n_a,
n_steps=n_steps,
gamma=gamma,
n_independent=2,
n_shared=2,
lambda_sparse=lambda_sparse,
optimizer_fn=torch.optim.Adam,
optimizer_params={'lr': lr},
mask_type="entmax",
scheduler_fn=scheduler_fn,
scheduler_params=scheduler_params,
seed=seed,
verbose=10
#device_name=device,
#clip_value=1.5
)
model.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], max_epochs=epochs, patience=50, batch_size=1024*20,
virtual_batch_size=batch_size, num_workers=4, drop_last=False, eval_metric=[RMSPE], loss_fn=RMSPELoss_Tabnet)
path = os.path.join(output_dir, model_path.format(cv_idx))
model.save_model(path)
predicted = model.predict(X_va)
rmspe = rmspe_metric(y_va, predicted)
best_losses.append(rmspe)
best_predictions.append(predicted)
return best_losses, best_predictions, scaler, model
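# train an MLP or CNN per fold: preprocess once, build the model and optimizer/scheduler, keep the checkpoint with the best validation RMSPE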
def train_nn(X: pd.DataFrame,
y: pd.DataFrame,
folds: List[Tuple],
device,
emb_dim: int = 25,
batch_size: int = 1024,
model_type: str = 'mlp',
mlp_dropout: float = 0.0,
mlp_hidden: int = 64,
mlp_bn: bool = False,
cnn_hidden: int = 64,
cnn_channel1: int = 32,
cnn_channel2: int = 32,
cnn_channel3: int = 32,
cnn_kernel1: int = 5,
cnn_celu: bool = False,
cnn_weight_norm: bool = False,
             dropout_emb: float = 0.0,
lr: float = 1e-3,
weight_decay: float = 0.0,
model_path: str = 'fold_{}.pth',
scaler_type: str = 'standard',
output_dir: str = 'artifacts',
scheduler_type: str = 'onecycle',
optimizer_type: str = 'adam',
max_lr: float = 0.01,
epochs: int = 30,
seed: int = 42,
n_pca: int = -1,
batch_double_freq: int = 50,
cnn_dropout: float = 0.1,
na_cols: bool = True,
cnn_leaky_relu: bool = False,
patience: int = 8,
factor: float = 0.5):
seed_everything(seed)
os.makedirs(output_dir, exist_ok=True)
y = y.values.astype(np.float32)
X_num, X_cat, cat_cols, scaler = preprocess_nn(X.copy(), scaler_type=scaler_type, n_pca=n_pca, na_cols=na_cols)
best_losses = []
best_predictions = []
for cv_idx, (train_idx, valid_idx) in enumerate(folds):
X_tr, X_va = X_num[train_idx], X_num[valid_idx]
X_tr_cat, X_va_cat = X_cat[train_idx], X_cat[valid_idx]
y_tr, y_va = y[train_idx], y[valid_idx]
cur_batch = batch_size
best_loss = 1e10
best_prediction = None
print(f"fold {cv_idx} train: {X_tr.shape}, valid: {X_va.shape}")
train_dataset = TabularDataset(X_tr, X_tr_cat, y_tr)
valid_dataset = TabularDataset(X_va, X_va_cat, y_va)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=cur_batch, shuffle=True,
num_workers=4)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=cur_batch, shuffle=False,
num_workers=4)
if model_type == 'mlp':
model = MLP(X_tr.shape[1],
n_categories=[128],
dropout=mlp_dropout, hidden=mlp_hidden, emb_dim=emb_dim,
dropout_cat=dropout_emb, bn=mlp_bn)
elif model_type == 'cnn':
model = CNN(X_tr.shape[1],
hidden_size=cnn_hidden,
n_categories=[128],
emb_dim=emb_dim,
dropout_cat=dropout_emb,
channel_1=cnn_channel1,
channel_2=cnn_channel2,
channel_3=cnn_channel3,
two_stage=False,
kernel1=cnn_kernel1,
celu=cnn_celu,
dropout_top=cnn_dropout,
dropout_mid=cnn_dropout,
dropout_bottom=cnn_dropout,
weight_norm=cnn_weight_norm,
leaky_relu=cnn_leaky_relu)
else:
raise NotImplementedError()
model = model.to(device)
if optimizer_type == 'adamw':
opt = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
elif optimizer_type == 'adam':
opt = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
else:
raise NotImplementedError()
scheduler = epoch_scheduler = None
if scheduler_type == 'onecycle':
scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer=opt, pct_start=0.1, div_factor=1e3,
max_lr=max_lr, epochs=epochs,
steps_per_epoch=len(train_loader))
elif scheduler_type == 'reduce':
epoch_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer=opt,
mode='min',
min_lr=1e-7,
patience=patience,
verbose=True,
factor=factor)
for epoch in range(epochs):
if epoch > 0 and epoch % batch_double_freq == 0:
cur_batch = cur_batch * 2
print(f'batch: {cur_batch}')
train_loader = torch.utils.data.DataLoader(train_dataset,
batch_size=cur_batch,
shuffle=True,
num_workers=4)
train_loss = train_epoch(train_loader, model, opt, scheduler, device)
predictions, valid_targets, valid_loss, rmspe = evaluate(valid_loader, model, device=device)
print(f"epoch {epoch}, train loss: {train_loss:.3f}, valid rmspe: {rmspe:.3f}")
if epoch_scheduler is not None:
epoch_scheduler.step(rmspe)
if rmspe < best_loss:
print(f'new best:{rmspe}')
best_loss = rmspe
best_prediction = predictions
torch.save(model, os.path.join(output_dir, model_path.format(cv_idx)))
best_predictions.append(best_prediction)
best_losses.append(best_loss)
del model, train_dataset, valid_dataset, train_loader, valid_loader, X_tr, X_va, X_tr_cat, X_va_cat, y_tr, y_va, opt
if scheduler is not None:
del scheduler
gc.collect()
return best_losses, best_predictions, scaler
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
del df, df_train
gc.collect()
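# keep only the top_n models by validation score (lower is better)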
def get_top_n_models(models, scores, top_n):
if len(models) <= top_n:
        print('number of models is less than top_n; all models will be used')
return models
sorted_ = [(y, x) for y, x in sorted(zip(scores, models), key=lambda pair: pair[0])]
print(f'scores(sorted): {[y for y, _ in sorted_]}')
return [x for _, x in sorted_][:top_n]
if PREDICT_MLP:
model_paths = []
scores = []
if SHORTCUT_NN_IN_1ST_STAGE and IS_1ST_STAGE:
print('shortcut to save quota...')
epochs = 3
valid_th = 100
else:
epochs = 30
valid_th = NN_VALID_TH
for i in range(NN_NUM_MODELS):
# MLP
nn_losses, nn_preds, scaler = train_nn(X, y,
[folds[-1]],
device=device,
batch_size=512,
mlp_bn=True,
mlp_hidden=256,
mlp_dropout=0.0,
emb_dim=30,
epochs=epochs,
lr=0.002,
max_lr=0.0055,
weight_decay=1e-7,
model_path='mlp_fold_{}' + f"_seed{i}.pth",
seed=i)
        if nn_losses[0] < valid_th:
print(f'model of seed {i} added.')
scores.append(nn_losses[0])
model_paths.append(f'artifacts/mlp_fold_0_seed{i}.pth')
np.save(f'pred_mlp_seed{i}.npy', nn_preds[0])
model_paths = get_top_n_models(model_paths, scores, NN_MODEL_TOP_N)
mlp_model = [torch.load(path, device) for path in model_paths]
print(f'total {len(mlp_model)} models will be used.')
if PREDICT_CNN:
model_paths = []
scores = []
if SHORTCUT_NN_IN_1ST_STAGE and IS_1ST_STAGE:
print('shortcut to save quota...')
epochs = 3
valid_th = 100
else:
epochs = 50
valid_th = NN_VALID_TH
for i in range(NN_NUM_MODELS):
nn_losses, nn_preds, scaler = train_nn(X, y,
[folds[-1]],
device=device,
cnn_hidden=8*128,
batch_size=1280,
model_type='cnn',
emb_dim=30,
epochs=epochs, #epochs,
cnn_channel1=128,
cnn_channel2=3*128,
cnn_channel3=3*128,
lr=0.00038, #0.0011,
max_lr=0.0013,
weight_decay=6.5e-6,
optimizer_type='adam',
scheduler_type='reduce',
model_path='cnn_fold_{}' + f"_seed{i}.pth",
seed=i,
cnn_dropout=0.0,
cnn_weight_norm=True,
cnn_leaky_relu=False,
patience=8,
factor=0.3)
if nn_losses[0] < valid_th:
model_paths.append(f'artifacts/cnn_fold_0_seed{i}.pth')
scores.append(nn_losses[0])
np.save(f'pred_cnn_seed{i}.npy', nn_preds[0])
model_paths = get_top_n_models(model_paths, scores, NN_MODEL_TOP_N)
cnn_model = [torch.load(path, device) for path in model_paths]
print(f'total {len(cnn_model)} models will be used.')
if PREDICT_TABNET:
tab_model = []
scores = []
if SHORTCUT_NN_IN_1ST_STAGE and IS_1ST_STAGE:
print('shortcut to save quota...')
epochs = 10
valid_th = 1000
else:
print('train full')
epochs = 250
valid_th = NN_VALID_TH
for i in range(TABNET_NUM_MODELS):
nn_losses, nn_preds, scaler, model = train_tabnet(X, y,
[folds[-1]],
batch_size=1280,
epochs=epochs, #epochs,
lr=0.04,
patience=50,
factor=0.5,
gamma=1.6,
lambda_sparse=3.55e-6,
seed=i,
n_a=36)
if nn_losses[0] < valid_th:
tab_model.append(model)
scores.append(nn_losses[0])
np.save(f'pred_tab_seed{i}.npy', nn_preds[0])
model.save_model(f'artifacts/tabnet_fold_0_seed{i}')
tab_model = get_top_n_models(tab_model, scores, TAB_MODEL_TOP_N)
print(f'total {len(tab_model)} models will be used.')
del X, y
gc.collect()
X_test = get_X(df_test)
print(X_test.shape)
df_pred = pd.DataFrame()
df_pred['row_id'] = df_test['stock_id'].astype(str) + '-' + df_test['time_id'].astype(str)
predictions = {}
prediction_weights = {}
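# blend the available predictions with fixed weights (GBDT=4, CNN=4, MLP=1, TabNet=1), normalized by the total weight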
if PREDICT_GBDT:
gbdt_preds = booster.predict(X_test)
predictions['gbdt'] = gbdt_preds
prediction_weights['gbdt'] = 4
if PREDICT_MLP and mlp_model:
try:
mlp_preds = predict_nn(X_test, mlp_model, scaler, device, ensemble_method=ENSEMBLE_METHOD)
print(f'mlp: {mlp_preds.shape}')
predictions['mlp'] = mlp_preds
prediction_weights['mlp'] = 1
except:
print(f'failed to predict mlp: {traceback.format_exc()}')
if PREDICT_CNN and cnn_model:
try:
cnn_preds = predict_nn(X_test, cnn_model, scaler, device, ensemble_method=ENSEMBLE_METHOD)
print(f'cnn: {cnn_preds.shape}')
predictions['cnn'] = cnn_preds
prediction_weights['cnn'] = 4
except:
print(f'failed to predict cnn: {traceback.format_exc()}')
if PREDICT_TABNET and tab_model:
try:
tab_preds = predict_tabnet(X_test, tab_model, scaler, ensemble_method=ENSEMBLE_METHOD).flatten()
print(f'tab: {tab_preds.shape}')
predictions['tab'] = tab_preds
prediction_weights['tab'] = 1
except:
print(f'failed to predict tab: {traceback.format_exc()}')
overall_preds = None
overall_weight = np.sum(list(prediction_weights.values()))
print(f'prediction will be made by: {list(prediction_weights.keys())}')
for name, preds in predictions.items():
w = prediction_weights[name] / overall_weight
if overall_preds is None:
overall_preds = preds * w
else:
overall_preds += preds * w
df_pred['target'] = np.clip(overall_preds, 0, None)
sub = pd.read_csv(os.path.join(DATA_DIR, 'optiver-realized-volatility-prediction', 'sample_submission.csv'))
submission = pd.merge(sub[['row_id']], df_pred[['row_id', 'target']], how='left')
submission['target'] = submission['target'].fillna(0)
submission.to_csv('submission.csv', index=False)