In [1]:
import gc
import glob
import os
import sys
import time
import traceback
from contextlib import contextmanager
from enum import Enum
from typing import List, Optional

import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb

from joblib import Parallel, delayed
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.manifold import TSNE
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import minmax_scale
from tqdm import tqdm_notebook as tqdm


%matplotlib inline

DATA_DIR = '../input'

USE_PRECOMPUTE_FEATURES = True
PREDICT_CNN = True
PREDICT_MLP = True
PREDICT_GBDT = True
PREDICT_TABNET = False

GBDT_NUM_MODELS = 5 #3
GBDT_LR = 0.02  # 0.1

NN_VALID_TH = 0.185
NN_MODEL_TOP_N = 5
TAB_MODEL_TOP_N = 3
ENSEMBLE_METHOD = 'mean'
NN_NUM_MODELS = 10
TABNET_NUM_MODELS = 5

IS_1ST_STAGE = False
SHORTCUT_NN_IN_1ST_STAGE = True  # early-stop training to save GPU quota
SHORTCUT_GBDT_IN_1ST_STAGE = True
MEMORY_TEST_MODE = False

@contextmanager
def timer(name: str):
    s = time.time()
    yield
    elapsed = time.time() - s
    print(f'[{name}] {elapsed: .3f}sec')
    
def print_trace(name: str = ''):
    print(f'ERROR RAISED IN {name or "anonymous"}')
    print(traceback.format_exc())
In [2]:
!pip -q install ../input/pytorchtabnet/pytorch_tabnet-2.0.1-py3-none-any.whl
WARNING: Running pip as root will break packages and permissions. You should install packages reliably by using venv: https://pip.pypa.io/warnings/venv
In [3]:
train = pd.read_csv(os.path.join(DATA_DIR, 'optiver-realized-volatility-prediction', 'train.csv'))
stock_ids = set(train['stock_id'])

Feature Engineering

Base Features

In [4]:
class DataBlock(Enum):
    TRAIN = 1
    TEST = 2
    BOTH = 3


def load_stock_data(stock_id: int, directory: str) -> pd.DataFrame:
    return pd.read_parquet(os.path.join(DATA_DIR, 'optiver-realized-volatility-prediction', directory, f'stock_id={stock_id}'))


def load_data(stock_id: int, stem: str, block: DataBlock) -> pd.DataFrame:
    if block == DataBlock.TRAIN:
        return load_stock_data(stock_id, f'{stem}_train.parquet')
    elif block == DataBlock.TEST:
        return load_stock_data(stock_id, f'{stem}_test.parquet')
    else:
        return pd.concat([
            load_data(stock_id, stem, DataBlock.TRAIN),
            load_data(stock_id, stem, DataBlock.TEST)
        ]).reset_index(drop=True)

def load_book(stock_id: int, block: DataBlock=DataBlock.TRAIN) -> pd.DataFrame:
    return load_data(stock_id, 'book', block)


def load_trade(stock_id: int, block=DataBlock.TRAIN) -> pd.DataFrame:
    return load_data(stock_id, 'trade', block)


def calc_wap1(df: pd.DataFrame) -> pd.Series:
    wap = (df['bid_price1'] * df['ask_size1'] + df['ask_price1'] * df['bid_size1']) / (df['bid_size1'] + df['ask_size1'])
    return wap


def calc_wap2(df: pd.DataFrame) -> pd.Series:
    wap = (df['bid_price2'] * df['ask_size2'] + df['ask_price2'] * df['bid_size2']) / (df['bid_size2'] + df['ask_size2'])
    return wap


def realized_volatility(series):
    return np.sqrt(np.sum(series**2))


def log_return(series: np.ndarray):
    return np.log(series).diff()


def log_return_df2(series: np.ndarray):
    return np.log(series).diff(2)


def flatten_name(prefix, src_names):
    ret = []
    for c in src_names:
        if c[0] in ['time_id', 'stock_id']:
            ret.append(c[0])
        else:
            ret.append('.'.join([prefix] + list(c)))
    return ret


def make_book_feature(stock_id, block = DataBlock.TRAIN):
    book = load_book(stock_id, block)

    book['wap1'] = calc_wap1(book)
    book['wap2'] = calc_wap2(book)
    book['log_return1'] = book.groupby(['time_id'])['wap1'].apply(log_return)
    book['log_return2'] = book.groupby(['time_id'])['wap2'].apply(log_return)
    book['log_return_ask1'] = book.groupby(['time_id'])['ask_price1'].apply(log_return)
    book['log_return_ask2'] = book.groupby(['time_id'])['ask_price2'].apply(log_return)
    book['log_return_bid1'] = book.groupby(['time_id'])['bid_price1'].apply(log_return)
    book['log_return_bid2'] = book.groupby(['time_id'])['bid_price2'].apply(log_return)

    book['wap_balance'] = abs(book['wap1'] - book['wap2'])
    book['price_spread'] = (book['ask_price1'] - book['bid_price1']) / ((book['ask_price1'] + book['bid_price1']) / 2)
    book['bid_spread'] = book['bid_price1'] - book['bid_price2']
    book['ask_spread'] = book['ask_price1'] - book['ask_price2']
    book['total_volume'] = (book['ask_size1'] + book['ask_size2']) + (book['bid_size1'] + book['bid_size2'])
    book['volume_imbalance'] = abs((book['ask_size1'] + book['ask_size2']) - (book['bid_size1'] + book['bid_size2']))
    
    features = {
        'seconds_in_bucket': ['count'],
        'wap1': [np.sum, np.mean, np.std],
        'wap2': [np.sum, np.mean, np.std],
        'log_return1': [np.sum, realized_volatility, np.mean, np.std],
        'log_return2': [np.sum, realized_volatility, np.mean, np.std],
        'log_return_ask1': [np.sum, realized_volatility, np.mean, np.std],
        'log_return_ask2': [np.sum, realized_volatility, np.mean, np.std],
        'log_return_bid1': [np.sum, realized_volatility, np.mean, np.std],
        'log_return_bid2': [np.sum, realized_volatility, np.mean, np.std],
        'wap_balance': [np.sum, np.mean, np.std],
        'price_spread':[np.sum, np.mean, np.std],
        'bid_spread':[np.sum, np.mean, np.std],
        'ask_spread':[np.sum, np.mean, np.std],
        'total_volume':[np.sum, np.mean, np.std],
        'volume_imbalance':[np.sum, np.mean, np.std]
    }
    
    agg = book.groupby('time_id').agg(features).reset_index(drop=False)
    agg.columns = flatten_name('book', agg.columns)
    agg['stock_id'] = stock_id
    
    for time in [450, 300, 150]:
        d = book[book['seconds_in_bucket'] >= time].groupby('time_id').agg(features).reset_index(drop=False)
        d.columns = flatten_name(f'book_{time}', d.columns)
        agg = pd.merge(agg, d, on='time_id', how='left')
    return agg


def make_trade_feature(stock_id, block = DataBlock.TRAIN):
    trade = load_trade(stock_id, block)
    trade['log_return'] = trade.groupby('time_id')['price'].apply(log_return)

    features = {
        'log_return':[realized_volatility],
        'seconds_in_bucket':['count'],
        'size':[np.sum],
        'order_count':[np.mean],
    }

    agg = trade.groupby('time_id').agg(features).reset_index()
    agg.columns = flatten_name('trade', agg.columns)
    agg['stock_id'] = stock_id
        
    for time in [450, 300, 150]:
        d = trade[trade['seconds_in_bucket'] >= time].groupby('time_id').agg(features).reset_index(drop=False)
        d.columns = flatten_name(f'trade_{time}', d.columns)
        agg = pd.merge(agg, d, on='time_id', how='left')
    return agg


def make_book_feature_v2(stock_id, block = DataBlock.TRAIN):
    book = load_book(stock_id, block)

    prices = book.set_index('time_id')[['bid_price1', 'ask_price1', 'bid_price2', 'ask_price2']]
    time_ids = list(set(prices.index))

    ticks = {}
    for tid in time_ids:
        try:
            price_list = prices.loc[tid].values.flatten()
            price_diff = sorted(np.diff(sorted(set(price_list))))
            ticks[tid] = price_diff[0]
        except Exception:
            print_trace(f'tid={tid}')
            ticks[tid] = np.nan
        
    dst = pd.DataFrame()
    dst['time_id'] = np.unique(book['time_id'])
    dst['stock_id'] = stock_id
    dst['tick_size'] = dst['time_id'].map(ticks)

    return dst


def make_features(base, block):
    stock_ids = set(base['stock_id'])
    with timer('books'):
        books = Parallel(n_jobs=-1)(delayed(make_book_feature)(i, block) for i in stock_ids)
        book = pd.concat(books)

    with timer('trades'):
        trades = Parallel(n_jobs=-1)(delayed(make_trade_feature)(i, block) for i in stock_ids)
        trade = pd.concat(trades)

    with timer('extra features'):
        df = pd.merge(base, book, on=['stock_id', 'time_id'], how='left')
        df = pd.merge(df, trade, on=['stock_id', 'time_id'], how='left')
        #df = make_extra_features(df)

    return df


def make_features_v2(base, block):
    stock_ids = set(base['stock_id'])
    with timer('books(v2)'):
        books = Parallel(n_jobs=-1)(delayed(make_book_feature_v2)(i, block) for i in stock_ids)
        book_v2 = pd.concat(books)

    d = pd.merge(base, book_v2, on=['stock_id', 'time_id'], how='left')
    return d
In [5]:
if USE_PRECOMPUTE_FEATURES:
    with timer('load feather'):
        df = pd.read_feather(os.path.join(DATA_DIR, 'optiver-df2', 'features_v2.f'))
else:
    df = make_features(train, DataBlock.TRAIN)
    # v2
    df = make_features_v2(df, DataBlock.TRAIN)

df.to_feather('features_v2.f')  # save cache

test = pd.read_csv(os.path.join(DATA_DIR, 'optiver-realized-volatility-prediction', 'test.csv'))
if len(test) == 3:
    print('is 1st stage')
    IS_1ST_STAGE = True

if IS_1ST_STAGE and MEMORY_TEST_MODE:
    print('use copy of training data as test data to immitate 2nd stage RAM usage.')
    test_df = df.iloc[:170000].copy()
    test_df['time_id'] += 32767
    test_df['row_id'] = ''
else:
    test_df = make_features(test, DataBlock.TEST)
    test_df = make_features_v2(test_df, DataBlock.TEST)

print(df.shape)
print(test_df.shape)
df = pd.concat([df, test_df.drop('row_id', axis=1)]).reset_index(drop=True)
[load feather]  6.360sec
is 1st stage
[books]  1.118sec
[trades]  0.320sec
[extra features]  0.038sec
[books(v2)]  0.022sec
(428932, 216)
(3, 216)

Nearest-Neighbor Features

In [6]:
N_NEIGHBORS_MAX = 80

def make_neighbors(df, k_neighbors, feature_col, n=5):
    feature_pivot = df.pivot('time_id', 'stock_id', feature_col)
    feature_pivot = feature_pivot.fillna(feature_pivot.mean())
    feature_pivot.head()
    
    neighbors = np.zeros((n, *feature_pivot.shape))

    for i in range(n):
        neighbors[i, :, :] += feature_pivot.values[k_neighbors[:, i], :]
        
    return feature_pivot, neighbors

def make_neighbors_stock(df, k_neighbors, feature_col, n=5):
    feature_pivot = df.pivot('time_id', 'stock_id', feature_col)
    feature_pivot = feature_pivot.fillna(feature_pivot.mean())

    neighbors = np.zeros((n, *feature_pivot.shape))

    for i in range(n):
        neighbors[i, :, :] += feature_pivot.values[:, k_neighbors[:, i]]
        
    return feature_pivot, neighbors

def make_nn_feature(df, neighbors, columns, index, feature_col,
                    n=5, agg=np.mean, postfix='', 
                    exclude_self=False, exact=False):
    start = 1 if exclude_self else 0
    
    if exact:
        pivot_aggs = pd.DataFrame(neighbors[n-1,:,:], columns=columns, index=index)
    else:
        pivot_aggs = pd.DataFrame(agg(neighbors[start:n,:,:], axis=0), columns=columns, index=index)
    dst = pivot_aggs.unstack().reset_index()
    dst.columns = ['stock_id', 'time_id', f'{feature_col}_nn{n}{postfix}_{agg.__name__}']
    return dst

class Neighbors:
    def __init__(self, pivot, p, metric='minkowski', metric_params=None):
        nn = NearestNeighbors(n_neighbors=N_NEIGHBORS_MAX, p=p, metric=metric, metric_params=metric_params)
        nn.fit(pivot)
        self.distances, self.neighbors = nn.kneighbors(pivot, return_distance=True)
In [7]:
# the tau itself is meaningless for GBDT, but useful as input to aggregate in Nearest Neighbor features
df['trade.tau'] = np.sqrt(1 / df['trade.seconds_in_bucket.count'])
df['trade_150.tau'] = np.sqrt(1 / df['trade_150.seconds_in_bucket.count'])
df['book.tau'] = np.sqrt(1 / df['book.seconds_in_bucket.count'])
df['real_price'] = 0.01 / df['tick_size']
In [8]:
with timer('knn fit'):
    df_pv = df[['stock_id', 'time_id']].copy()
    df_pv['price'] = 0.01 / df['tick_size']
    df_pv['vol'] = df['book.log_return1.realized_volatility']
    df_pv['trade.tau'] = df['trade.tau']
    df_pv['trade.size.sum'] = df['book.total_volume.sum']

    pivot = df_pv.pivot('time_id', 'stock_id', 'price')
    pivot = pivot.fillna(pivot.mean())
    pivot = pd.DataFrame(minmax_scale(pivot))
    
    k_neighbors_time_price_c = Neighbors(pivot, 2, metric='canberra')
    k_neighbors_time_price_m = Neighbors(pivot, 2, metric='mahalanobis', metric_params={'V':np.cov(pivot.values.T)})
    k_neighbors_stock_price = Neighbors(minmax_scale(pivot.transpose()), 1)

    pivot = df_pv.pivot('time_id', 'stock_id', 'vol')
    pivot = pivot.fillna(pivot.mean())
    pivot = pd.DataFrame(minmax_scale(pivot))
    
    k_neighbors_time_vol = Neighbors(pivot, 1)
    k_neighbors_stock_vol = Neighbors(minmax_scale(pivot.transpose()), 1)
    
    pivot = df_pv.pivot('time_id', 'stock_id', 'trade.size.sum')
    pivot = pivot.fillna(pivot.mean())
    pivot = pd.DataFrame(minmax_scale(pivot))

    k_neighbors_time_size_m = Neighbors(pivot, 2, metric='mahalanobis', metric_params={'V':np.cov(pivot.values.T)})
    k_neighbors_time_size_c = Neighbors(pivot, 2, metric='canberra')
    k_neighbors_stock_size = Neighbors(minmax_scale(pivot.transpose()), 1)
[knn fit]  412.128sec
In [9]:
# features with large changes over time are converted to relative ranks within time-id
df['trade.order_count.mean'] = df.groupby('time_id')['trade.order_count.mean'].rank()
df['book.total_volume.sum'] = df.groupby('time_id')['book.total_volume.sum'].rank()
df['book.total_volume.mean'] = df.groupby('time_id')['book.total_volume.mean'].rank()
df['book.total_volume.std'] = df.groupby('time_id')['book.total_volume.std'].rank()

df['trade.tau'] = df.groupby('time_id')['trade.tau'].rank()

for dt in [150, 300, 450]:
    df[f'book_{dt}.total_volume.sum'] = df.groupby('time_id')[f'book_{dt}.total_volume.sum'].rank()
    df[f'book_{dt}.total_volume.mean'] = df.groupby('time_id')[f'book_{dt}.total_volume.mean'].rank()
    df[f'book_{dt}.total_volume.std'] = df.groupby('time_id')[f'book_{dt}.total_volume.std'].rank()
    df[f'trade_{dt}.order_count.mean'] = df.groupby('time_id')[f'trade_{dt}.order_count.mean'].rank()
In [10]:
def make_nearest_neighbor_feature(df: pd.DataFrame) -> pd.DataFrame:
    df2 = df.copy()
    print(df2.shape)

    feature_cols_stock = {
        'book.log_return1.realized_volatility': [np.mean, np.min, np.max, np.std],
        'trade.seconds_in_bucket.count': [np.mean],
        'trade.tau': [np.mean],
        'trade_150.tau': [np.mean],
        'book.tau': [np.mean],
        'trade.size.sum': [np.mean],
        'book.seconds_in_bucket.count': [np.mean],
    }
    
    feature_cols = {
        'book.log_return1.realized_volatility': [np.mean, np.min, np.max, np.std],
        'real_price': [np.max, np.mean, np.min],
        'trade.seconds_in_bucket.count': [np.mean],
        'trade.tau': [np.mean],
        'trade.size.sum': [np.mean],
        'book.seconds_in_bucket.count': [np.mean],
        'trade_150.tau_nn20_sv_mean': [np.mean],  # "volatilityの傾向が似ている20銘柄での直前300secの平均tau" の、近い時刻での平均
        'trade.size.sum_nn20_sv_mean': [np.mean],
    }

    time_id_neigbor_sizes = [3, 5, 10, 20, 40]
    time_id_neigbor_sizes_vol = [2, 3, 5, 10, 20, 40]
    stock_id_neighbor_sizes = [10, 20, 40]

    ndf = None

    cols = []

    def _add_ndf(ndf, dst):
        if ndf is None:
            return dst
        else:
            ndf[dst.columns[-1]] = dst[dst.columns[-1]].astype(np.float32)
            return ndf

    # neighbor stock_id
    for feature_col in feature_cols_stock.keys():
        try:
            feature_pivot, neighbors_stock_price = make_neighbors_stock(df2, k_neighbors_stock_price.neighbors, feature_col, n=N_NEIGHBORS_MAX)
            _, neighbors_stock_vol = make_neighbors_stock(df2, k_neighbors_stock_vol.neighbors, feature_col, n=N_NEIGHBORS_MAX)
            #_, neighbors_stock_size = make_neighbors_stock(df2, k_neighbors_stock_size.neighbors, feature_col, n=N_NEIGHBORS_MAX)

            columns = feature_pivot.columns
            index = feature_pivot.index

            for agg in feature_cols_stock[feature_col]:
                for n in stock_id_neighbor_sizes:
                    exclude_self = True
                    exact = False
                    try:
                        dst = make_nn_feature(df2, neighbors_stock_price, columns, index, feature_col, n=n, agg=agg, postfix='_s',
                                             exclude_self=exclude_self, exact=exact)
                        ndf = _add_ndf(ndf, dst)
                        dst = make_nn_feature(df2, neighbors_stock_vol, columns, index, feature_col, n=n, agg=agg, postfix='_sv',
                                             exclude_self=exclude_self, exact=exact)
                        ndf = _add_ndf(ndf, dst)
                    except Exception:
                        print_trace('stock-id nn')
                        pass
            del feature_pivot, neighbors_stock_price, neighbors_stock_vol
        except Exception:
            print_trace('stock-id nn')
            pass

    df2 = pd.merge(df2, ndf, on=['time_id', 'stock_id'], how='left')
    ndf = None

    print(df2.shape)

    # neighbor time_id
    for feature_col in feature_cols.keys():
        try:
            feature_pivot, neighbors_price_c = make_neighbors(df2, k_neighbors_time_price_c.neighbors, feature_col, n=N_NEIGHBORS_MAX)
            _, neighbors_price_m = make_neighbors(df2, k_neighbors_time_price_m.neighbors, feature_col, n=N_NEIGHBORS_MAX)
            _, neighbors_vol = make_neighbors(df2, k_neighbors_time_vol.neighbors, feature_col, n=N_NEIGHBORS_MAX)
            _, neighbors_size_m = make_neighbors(df2, k_neighbors_time_size_m.neighbors, feature_col, n=N_NEIGHBORS_MAX)
            _, neighbors_size_c = make_neighbors(df2, k_neighbors_time_size_c.neighbors, feature_col, n=N_NEIGHBORS_MAX)

            columns = feature_pivot.columns
            index = feature_pivot.index

            if 'volatility' in feature_col:
                time_id_ns = time_id_neigbor_sizes_vol
            else:
                time_id_ns = time_id_neigbor_sizes

            for agg in feature_cols[feature_col]:
                for n in time_id_ns:
                    exclude_self = True #n >= 10
                    exclude_self2 = False
                    exact = False

                    try:
                        dst = make_nn_feature(df2, neighbors_price_c, columns, index, feature_col, n=n, agg=agg, postfix='_price_c',
                                              exclude_self=exclude_self, exact=exact)
                        ndf = _add_ndf(ndf, dst)
                        dst = make_nn_feature(df2, neighbors_price_m, columns, index, feature_col, n=n, agg=agg, postfix='_price_m',
                                             exclude_self=exclude_self2, exact=exact)
                        ndf = _add_ndf(ndf, dst)

                        dst = make_nn_feature(df2, neighbors_vol, columns, index, feature_col, n=n, agg=agg, postfix='_v',
                                             exclude_self=exclude_self2, exact=exact)
                        ndf = _add_ndf(ndf, dst)
                        dst = make_nn_feature(df2, neighbors_size_m, columns, index, feature_col, n=n, agg=agg, postfix='_size_m',
                                             exclude_self=exclude_self2, exact=exact)
                        ndf = _add_ndf(ndf, dst)
                        dst = make_nn_feature(df2, neighbors_size_c, columns, index, feature_col, n=n, agg=agg, postfix='_size_c',
                                             exclude_self=exclude_self2, exact=exact)
                        ndf = _add_ndf(ndf, dst)
                    except Exception:
                        print_trace('time-id nn')

                    cols.append(dst.columns[-1])

            del feature_pivot, neighbors_price_c, neighbors_price_m, neighbors_vol, neighbors_size_m, neighbors_size_c
        except Exception:
            print_trace('time-id nn')

    df2 = pd.merge(df2, ndf, on=['time_id', 'stock_id'], how='left')

    # features further derived from nearest neighbor features
    try:
        for sz in time_id_neigbor_sizes:
            df2[f'real_price_rankmin_{sz}'] = df2['real_price'] / df2[f"real_price_nn{sz}_price_c_amin"]
            df2[f'real_price_rankmax_{sz}'] = df2['real_price'] / df2[f"real_price_nn{sz}_price_c_amax"]
            df2[f'real_price_rankmean_{sz}'] = df2['real_price'] / df2[f"real_price_nn{sz}_price_c_mean"]

        for sz in time_id_neigbor_sizes_vol:
            df2[f'vol_rankmin_{sz}'] = df2['book.log_return1.realized_volatility'] / df2[f"book.log_return1.realized_volatility_nn{sz}_price_c_amin"]
            df2[f'vol_rankmax_{sz}'] = df2['book.log_return1.realized_volatility'] / df2[f"book.log_return1.realized_volatility_nn{sz}_price_c_amax"]

        price_cols = [c for c in df2.columns if 'real_price' in c and 'rank' not in c]
        for c in price_cols:
            del df2[c]

        for sz in time_id_neigbor_sizes_vol:
            tgt = f'book.log_return1.realized_volatility_nn{sz}_price_m_mean'
            df2[f'{tgt}_rank'] = df2.groupby('time_id')[tgt].rank()
    except Exception:
        print_trace('nn features')

    return df2
In [11]:
gc.collect()

with timer('make nearest neighbor feature'):
    df2 = make_nearest_neighbor_feature(df)

print(df2.shape)
df2.reset_index(drop=True).to_feather('optiver_df2.f')

gc.collect()
(428935, 220)
(428935, 280)
[make nearest neighbor feature]  70.993sec
(428935, 582)
Out[11]:
0

Misc Features

In [12]:
# skew correction for NN
cols_to_log = [
    'trade.size.sum',
    'trade_150.size.sum',
    'trade_300.size.sum',
    'trade_450.size.sum',
    'volume_imbalance'
]
for c in df2.columns:
    for check in cols_to_log:
        try:
            if check in c:
                df2[c] = np.log(df2[c]+1)
                break
        except Exception:
            print_trace('log1p')
In [13]:
# Rolling average of RV for similar trading volume
try:
    df2.sort_values(by=['stock_id', 'book.total_volume.sum'], inplace=True)
    df2.reset_index(drop=True, inplace=True)
    df2['realized_volatility_roll3_by_book.total_volume.mean'] = df2.groupby('stock_id')['book.log_return1.realized_volatility'].rolling(3, center=True, min_periods=1).mean().reset_index().sort_values(by=['level_1'])['book.log_return1.realized_volatility'].values
    df2['realized_volatility_roll10_by_book.total_volume.mean'] = df2.groupby('stock_id')['book.log_return1.realized_volatility'].rolling(10, center=True, min_periods=1).mean().reset_index().sort_values(by=['level_1'])['book.log_return1.realized_volatility'].values
except Exception:
    print_trace('mean RV')
In [14]:
# stock-id embedding (helps little)
try:
    lda_n = 3
    lda = LatentDirichletAllocation(n_components=lda_n, random_state=0)
    stock_id_emb = pd.DataFrame(lda.fit_transform(pivot.transpose()), index=df_pv.pivot('time_id', 'stock_id', 'vol').columns)

    for i in range(lda_n):
        df2[f'stock_id_emb{i}'] = df2['stock_id'].map(stock_id_emb[i])
except Exception:
    print_trace('LDA')
In [15]:
df_train = df2[~df2.target.isnull()].copy()
df_test = df2[df2.target.isnull()].copy()
del df2, df_pv
gc.collect()
Out[15]:
88

Reverse Engineering time-id Order & Make CV Split

In [16]:
%matplotlib inline

@contextmanager
def timer(name):
    s = time.time()
    yield
    e = time.time() - s
    print(f"[{name}] {e:.3f}sec")
    

def calc_price2(df):
    tick = sorted(np.diff(sorted(np.unique(df.values.flatten()))))[0]
    return 0.01 / tick


def calc_prices(r):
    df = pd.read_parquet(r.book_path, columns=['time_id', 'ask_price1', 'ask_price2', 'bid_price1', 'bid_price2'])
    df = df.set_index('time_id')
    df = df.groupby(level='time_id').apply(calc_price2).to_frame('price').reset_index()
    df['stock_id'] = r.stock_id
    return df


def sort_manifold(df, clf):
    df_ = df.set_index('time_id')
    df_ = pd.DataFrame(minmax_scale(df_.fillna(df_.mean())))

    X_compoents = clf.fit_transform(df_)

    dft = df.reindex(np.argsort(X_compoents[:,0])).reset_index(drop=True)
    return np.argsort(X_compoents[:, 0]), X_compoents


def reconstruct_time_id_order():
    with timer('load files'):
        df_files = pd.DataFrame(
            {'book_path': glob.glob('/kaggle/input/optiver-realized-volatility-prediction/book_train.parquet/**/*.parquet')}) \
            .eval('stock_id = book_path.str.extract("stock_id=(\d+)").astype("int")', engine='python')
        df_target = pd.read_csv('/kaggle/input/optiver-realized-volatility-prediction/train.csv')
        df_target = df_target.groupby('time_id').target.mean()

    with timer('calc prices'):
        df_prices = pd.concat(Parallel(n_jobs=4, verbose=51)(delayed(calc_prices)(r) for _, r in df_files.iterrows()))
        df_prices = df_prices.pivot('time_id', 'stock_id', 'price')
        df_prices.columns = [f'stock_id={i}' for i in df_prices.columns]
        df_prices = df_prices.reset_index(drop=False)

    with timer('t-SNE(400) -> 50'):
        clf = TSNE(n_components=1, perplexity=400, random_state=0, n_iter=2000)
        order, X_compoents = sort_manifold(df_prices, clf)
        clf = TSNE(n_components=1, perplexity=50, random_state=0, init=X_compoents, n_iter=2000, method='exact')
        order, X_compoents = sort_manifold(df_prices, clf)

        df_ordered = df_prices.reindex(order).reset_index(drop=True)
        if df_ordered['stock_id=61'].iloc[0] > df_ordered['stock_id=61'].iloc[-1]:
            df_ordered = df_ordered.reindex(df_ordered.index[::-1]).reset_index(drop=True)

    # AMZN
    plt.plot(df_ordered['stock_id=61'])
    
    return df_ordered[['time_id']]
In [17]:
with timer('calculate order of time-id'):
    if USE_PRECOMPUTE_FEATURES:
        timeid_order = pd.read_csv(os.path.join(DATA_DIR, 'optiver-time-id-ordered', 'time_id_order.csv'))
    else:
        timeid_order = reconstruct_time_id_order()

with timer('make folds'):
    timeid_order['time_id_order'] = np.arange(len(timeid_order))
    df_train['time_id_order'] = df_train['time_id'].map(timeid_order.set_index('time_id')['time_id_order'])
    df_train = df_train.sort_values(['time_id_order', 'stock_id']).reset_index(drop=True)

    folds_border = [3830 - 383*4, 3830 - 383*3, 3830 - 383*2, 3830 - 383*1]
    time_id_orders = df_train['time_id_order']

    folds = []
    for i, border in enumerate(folds_border):
        idx_train = np.where(time_id_orders < border)[0]
        idx_valid = np.where((border <= time_id_orders) & (time_id_orders < border + 383))[0]
        folds.append((idx_train, idx_valid))
        
        print(f"folds{i}: train={len(idx_train)}, valid={len(idx_valid)}")
        
del df_train['time_id_order']
[calculate order of time-id] 0.011sec
folds0: train=257362, valid=42882
folds1: train=300244, valid=42896
folds2: train=343140, valid=42896
folds3: train=386036, valid=42896
[make folds] 1.865sec

LightGBM Training

In [18]:
def rmspe(y_true, y_pred):
    return  (np.sqrt(np.mean(np.square((y_true - y_pred) / y_true))))


def feval_RMSPE(preds, train_data):
    labels = train_data.get_label()
    return 'RMSPE', round(rmspe(y_true = labels, y_pred = preds),5), False


# from: https://blog.amedama.jp/entry/lightgbm-cv-feature-importance
def plot_importance(cvbooster, figsize=(10, 10)):
    raw_importances = cvbooster.feature_importance(importance_type='gain')
    feature_name = cvbooster.boosters[0].feature_name()
    importance_df = pd.DataFrame(data=raw_importances,
                                 columns=feature_name)
    # 平均値でソートする
    sorted_indices = importance_df.mean(axis=0).sort_values(ascending=False).index
    sorted_importance_df = importance_df.loc[:, sorted_indices]
    # 上位をプロットする
    PLOT_TOP_N = 80
    plot_cols = sorted_importance_df.columns[:PLOT_TOP_N]
    _, ax = plt.subplots(figsize=figsize)
    ax.grid()
    ax.set_xscale('log')
    ax.set_ylabel('Feature')
    ax.set_xlabel('Importance')
    sns.boxplot(data=sorted_importance_df[plot_cols],
                orient='h',
                ax=ax)
    plt.show()


def get_X(df_src):
    cols = [c for c in df_src.columns if c not in ['time_id', 'target', 'tick_size']]
    return df_src[cols]


class EnsembleModel:
    def __init__(self, models: List[lgb.Booster], weights: Optional[List[float]] = None):
        self.models = models
        self.weights = weights

        features = list(self.models[0].feature_name())

        for m in self.models[1:]:
            assert features == list(m.feature_name())

    def predict(self, x):
        predicted = np.zeros((len(x), len(self.models)))

        for i, m in enumerate(self.models):
            w = self.weights[i] if self.weights is not None else 1
            predicted[:, i] = w * m.predict(x)

        ttl = np.sum(self.weights) if self.weights is not None else len(self.models)
        return np.sum(predicted, axis=1) / ttl

    def feature_name(self) -> List[str]:
        return self.models[0].feature_name()
In [19]:
lr = GBDT_LR
if SHORTCUT_GBDT_IN_1ST_STAGE and IS_1ST_STAGE:
    # to save GPU quota
    lr = 0.3

params = {
    'objective': 'regression',
    'verbose': 0,
    'metric': '',
    'reg_alpha': 5,
    'reg_lambda': 5,
    'min_data_in_leaf': 1000,
    'max_depth': -1,
    'num_leaves': 128,
    'colsample_bytree': 0.3,
    'learning_rate': lr
}

X = get_X(df_train)
y = df_train['target']
X.to_feather('X.f')
df_train[['target']].to_feather('y.f')

gc.collect()

print(X.shape)

if PREDICT_GBDT:
    ds = lgb.Dataset(X, y, weight=1/np.power(y, 2))

    with timer('lgb.cv'):
        ret = lgb.cv(params, ds, num_boost_round=8000, folds=folds, #cv,
                     feval=feval_RMSPE, stratified=False, 
                     return_cvbooster=True, verbose_eval=20,
                     early_stopping_rounds=int(40*0.1/lr))

        print(f"# overall RMSPE: {ret['RMSPE-mean'][-1]}")

    best_iteration = len(ret['RMSPE-mean'])
    for i in range(len(folds)):
        y_pred = ret['cvbooster'].boosters[i].predict(X.iloc[folds[i][1]], num_iteration=best_iteration)
        y_true = y.iloc[folds[i][1]]
        print(f"# fold{i} RMSPE: {rmspe(y_true, y_pred)}")
        
        if i == len(folds) - 1:
            np.save('pred_gbdt.npy', y_pred)

    plot_importance(ret['cvbooster'], figsize=(10, 20))

    boosters = []
    with timer('retraining'):
        for i in range(GBDT_NUM_MODELS):
            params['seed'] = i
            boosters.append(lgb.train(params, ds, num_boost_round=int(1.1*best_iteration)))

    booster = EnsembleModel(boosters)
    del ret
    del ds

gc.collect()
(428932, 584)
[LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 1.628674 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 1.880322 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 3.672728 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 4.181389 seconds.
You can set `force_col_wise=true` to remove the overhead.
[20]	cv_agg's l2: 2.09546e-07 + 2.73032e-08	cv_agg's RMSPE: 0.204387 + 0.0112926
[40]	cv_agg's l2: 2.09533e-07 + 2.78101e-08	cv_agg's RMSPE: 0.204347 + 0.0112653
# overall RMSPE: 0.20423249999999998
[lgb.cv] 120.691sec
# fold0 RMSPE: 0.20786273785605855
# fold1 RMSPE: 0.21328112405368008
# fold2 RMSPE: 0.21076523954130466
# fold3 RMSPE: 0.18501984354386528
[LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 5.129431 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 5.154393 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 5.093701 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 5.133777 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 4.848139 seconds.
You can set `force_col_wise=true` to remove the overhead.
[retraining] 107.926sec
Out[19]:
62700

NN Training

In [20]:
import gc
import os
import pickle
import random
from typing import List, Tuple, Optional, Union

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler, QuantileTransformer
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

import numpy as np
from joblib import Parallel, delayed
from scipy.interpolate import interp1d
from scipy.special import erf, erfinv
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import FLOAT_DTYPES, check_array, check_is_fitted
from sklearn.decomposition import PCA
from pytorch_tabnet.metrics import Metric
from pytorch_tabnet.tab_model import TabNetRegressor
from torch.optim.lr_scheduler import ReduceLROnPlateau, CosineAnnealingWarmRestarts


null_check_cols = [
    'book.log_return1.realized_volatility',
    'book_150.log_return1.realized_volatility',
    'book_300.log_return1.realized_volatility',
    'book_450.log_return1.realized_volatility',
    'trade.log_return.realized_volatility',
    'trade_150.log_return.realized_volatility',
    'trade_300.log_return.realized_volatility',
    'trade_450.log_return.realized_volatility'
]


def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True


def rmspe_metric(y_true, y_pred):
    rmspe = np.sqrt(np.mean(np.square((y_true - y_pred) / y_true)))
    return rmspe


def rmspe_loss(y_true, y_pred):
    rmspe = torch.sqrt(torch.mean(torch.square((y_true - y_pred) / y_true)))
    return rmspe


class RMSPE(Metric):
    def __init__(self):
        self._name = "rmspe"
        self._maximize = False

    def __call__(self, y_true, y_score):
        return np.sqrt(np.mean(np.square((y_true - y_score) / y_true)))

def RMSPELoss_Tabnet(y_pred, y_true):
    return torch.sqrt(torch.mean( ((y_true - y_pred) / y_true) ** 2 )).clone()


class AverageMeter:
    """Computes and stores the average and current value"""

    def __init__(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


class TabularDataset(Dataset):
    def __init__(self, x_num: np.ndarray, x_cat: np.ndarray, y: Optional[np.ndarray]):
        super().__init__()
        self.x_num = x_num
        self.x_cat = x_cat
        self.y = y

    def __len__(self):
        return len(self.x_num)

    def __getitem__(self, idx):
        if self.y is None:
            return self.x_num[idx], torch.LongTensor(self.x_cat[idx])
        else:
            return self.x_num[idx], torch.LongTensor(self.x_cat[idx]), self.y[idx]


class MLP(nn.Module):
    def __init__(self,
                 src_num_dim: int,
                 n_categories: List[int],
                 dropout: float = 0.0,
                 hidden: int = 50,
                 emb_dim: int = 10,
                 dropout_cat: float = 0.2,
                 bn: bool = False):
        super().__init__()

        self.embs = nn.ModuleList([
            nn.Embedding(x, emb_dim) for x in n_categories])
        self.cat_dim = emb_dim * len(n_categories)
        self.dropout_cat = nn.Dropout(dropout_cat)

        if bn:
            self.sequence = nn.Sequential(
                nn.Linear(src_num_dim + self.cat_dim, hidden),
                nn.Dropout(dropout),
                nn.BatchNorm1d(hidden),
                nn.ReLU(),
                nn.Linear(hidden, hidden),
                nn.Dropout(dropout),
                nn.BatchNorm1d(hidden),
                nn.ReLU(),
                nn.Linear(hidden, 1)
            )
        else:
            self.sequence = nn.Sequential(
                nn.Linear(src_num_dim + self.cat_dim, hidden),
                nn.Dropout(dropout),
                nn.ReLU(),
                nn.Linear(hidden, hidden),
                nn.Dropout(dropout),
                nn.ReLU(),
                nn.Linear(hidden, 1)
            )

    def forward(self, x_num, x_cat):
        embs = [embedding(x_cat[:, i]) for i, embedding in enumerate(self.embs)]
        x_cat_emb = self.dropout_cat(torch.cat(embs, 1))
        x_all = torch.cat([x_num, x_cat_emb], 1)
        x = self.sequence(x_all)
        return torch.squeeze(x)


class CNN(nn.Module):
    def __init__(self,
                 num_features: int,
                 hidden_size: int,
                 n_categories: List[int],
                 emb_dim: int = 10,
                 dropout_cat: float = 0.2,
                 channel_1: int = 256,
                 channel_2: int = 512,
                 channel_3: int = 512,
                 dropout_top: float = 0.1,
                 dropout_mid: float = 0.3,
                 dropout_bottom: float = 0.2,
                 weight_norm: bool = True,
                 two_stage: bool = True,
                 celu: bool = True,
                 kernel1: int = 5,
                 leaky_relu: bool = False):
        super().__init__()

        num_targets = 1

        cha_1_reshape = int(hidden_size / channel_1)
        cha_po_1 = int(hidden_size / channel_1 / 2)
        cha_po_2 = int(hidden_size / channel_1 / 2 / 2) * channel_3

        self.cat_dim = emb_dim * len(n_categories)
        self.cha_1 = channel_1
        self.cha_2 = channel_2
        self.cha_3 = channel_3
        self.cha_1_reshape = cha_1_reshape
        self.cha_po_1 = cha_po_1
        self.cha_po_2 = cha_po_2
        self.two_stage = two_stage

        self.expand = nn.Sequential(
            nn.BatchNorm1d(num_features + self.cat_dim),
            nn.Dropout(dropout_top),
            nn.utils.weight_norm(nn.Linear(num_features + self.cat_dim, hidden_size), dim=None),
            nn.CELU(0.06) if celu else nn.ReLU()
        )

        def _norm(layer, dim=None):
            return nn.utils.weight_norm(layer, dim=dim) if weight_norm else layer

        self.conv1 = nn.Sequential(
            nn.BatchNorm1d(channel_1),
            nn.Dropout(dropout_top),
            _norm(nn.Conv1d(channel_1, channel_2, kernel_size=kernel1, stride=1, padding=kernel1 // 2, bias=False)),
            nn.ReLU(),
            nn.AdaptiveAvgPool1d(output_size=cha_po_1),
            nn.BatchNorm1d(channel_2),
            nn.Dropout(dropout_top),
            _norm(nn.Conv1d(channel_2, channel_2, kernel_size=3, stride=1, padding=1, bias=True)),
            nn.ReLU()
        )

        if self.two_stage:
            self.conv2 = nn.Sequential(
                nn.BatchNorm1d(channel_2),
                nn.Dropout(dropout_mid),
                _norm(nn.Conv1d(channel_2, channel_2, kernel_size=3, stride=1, padding=1, bias=True)),
                nn.ReLU(),
                nn.BatchNorm1d(channel_2),
                nn.Dropout(dropout_bottom),
                _norm(nn.Conv1d(channel_2, channel_3, kernel_size=5, stride=1, padding=2, bias=True)),
                nn.ReLU()
            )

        self.max_po_c2 = nn.MaxPool1d(kernel_size=4, stride=2, padding=1)

        self.flt = nn.Flatten()

        if leaky_relu:
            self.dense = nn.Sequential(
                nn.BatchNorm1d(cha_po_2),
                nn.Dropout(dropout_bottom),
                _norm(nn.Linear(cha_po_2, num_targets), dim=0),
                nn.LeakyReLU()
            )
        else:
            self.dense = nn.Sequential(
                nn.BatchNorm1d(cha_po_2),
                nn.Dropout(dropout_bottom),
                _norm(nn.Linear(cha_po_2, num_targets), dim=0)
            )

        self.embs = nn.ModuleList([nn.Embedding(x, emb_dim) for x in n_categories])
        self.cat_dim = emb_dim * len(n_categories)
        self.dropout_cat = nn.Dropout(dropout_cat)

    def forward(self, x_num, x_cat):
        embs = [embedding(x_cat[:, i]) for i, embedding in enumerate(self.embs)]
        x_cat_emb = self.dropout_cat(torch.cat(embs, 1))
        x = torch.cat([x_num, x_cat_emb], 1)

        x = self.expand(x)

        x = x.reshape(x.shape[0], self.cha_1, self.cha_1_reshape)

        x = self.conv1(x)

        if self.two_stage:
            x = self.conv2(x) * x

        x = self.max_po_c2(x)
        x = self.flt(x)
        x = self.dense(x)

        return torch.squeeze(x)


def preprocess_nn(
        X: pd.DataFrame,
        scaler: Optional[StandardScaler] = None,
        scaler_type: str = 'standard',
        n_pca: int = -1,
        na_cols: bool = True):
    if na_cols:
        #for c in X.columns:
        for c in null_check_cols:
            if c in X.columns:
                X[f"{c}_isnull"] = X[c].isnull().astype(int)

    cat_cols = [c for c in X.columns if c in ['time_id', 'stock_id']]
    num_cols = [c for c in X.columns if c not in cat_cols]

    X_num = X[num_cols].values.astype(np.float32)
    X_cat = np.nan_to_num(X[cat_cols].values.astype(np.int32))

    def _pca(X_num_):
        if n_pca > 0:
            pca = PCA(n_components=n_pca, random_state=0)
            return pca.fit_transform(X_num)
        return X_num

    if scaler is None:
        scaler = StandardScaler()
        X_num = scaler.fit_transform(X_num)
        X_num = np.nan_to_num(X_num, posinf=0, neginf=0)
        return _pca(X_num), X_cat, cat_cols, scaler
    else:
        X_num = scaler.transform(X_num) #TODO: infでも大丈夫?
        X_num = np.nan_to_num(X_num, posinf=0, neginf=0)
        return _pca(X_num), X_cat, cat_cols


def train_epoch(data_loader: DataLoader,
                model: nn.Module,
                optimizer,
                scheduler,
                device,
                clip_grad: float = 1.5):
    model.train()
    losses = AverageMeter()
    step = 0

    for x_num, x_cat, y in tqdm(data_loader, position=0, leave=True, desc='Training'):
        batch_size = x_num.size(0)
        x_num = x_num.to(device, dtype=torch.float)
        x_cat = x_cat.to(device)
        y = y.to(device, dtype=torch.float)

        loss = rmspe_loss(y, model(x_num, x_cat))
        losses.update(loss.detach().cpu().numpy(), batch_size)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad)
        optimizer.step()
        optimizer.zero_grad()

        if scheduler is not None:
            scheduler.step()

        step += 1

    return losses.avg


def evaluate(data_loader: DataLoader, model, device):
    model.eval()

    losses = AverageMeter()

    final_targets = []
    final_outputs = []

    with torch.no_grad():
        for x_num, x_cat, y in tqdm(data_loader, position=0, leave=True, desc='Evaluating'):
            batch_size = x_num.size(0)
            x_num = x_num.to(device, dtype=torch.float)
            x_cat = x_cat.to(device)
            y = y.to(device, dtype=torch.float)

            with torch.no_grad():
                output = model(x_num, x_cat)

            loss = rmspe_loss(y, output)
            # record loss
            losses.update(loss.detach().cpu().numpy(), batch_size)

            targets = y.detach().cpu().numpy()
            output = output.detach().cpu().numpy()

            final_targets.append(targets)
            final_outputs.append(output)

    final_targets = np.concatenate(final_targets)
    final_outputs = np.concatenate(final_outputs)

    try:
        metric = rmspe_metric(final_targets, final_outputs)
    except:
        metric = None

    return final_outputs, final_targets, losses.avg, metric


def predict_nn(X: pd.DataFrame,
               model: Union[List[MLP], MLP],
               scaler: StandardScaler,
               device,
               ensemble_method='mean'):
    if not isinstance(model, list):
        model = [model]

    for m in model:
        m.eval()
    X_num, X_cat, cat_cols = preprocess_nn(X.copy(), scaler=scaler)
    valid_dataset = TabularDataset(X_num, X_cat, None)
    valid_loader = torch.utils.data.DataLoader(valid_dataset,
                                               batch_size=512,
                                               shuffle=False,
                                               num_workers=4)

    final_outputs = []

    with torch.no_grad():
        for x_num, x_cat in tqdm(valid_loader, position=0, leave=True, desc='Evaluating'):
            x_num = x_num.to(device, dtype=torch.float)
            x_cat = x_cat.to(device)

            outputs = []
            with torch.no_grad():
                for m in model:
                    output = m(x_num, x_cat)
                    outputs.append(output.detach().cpu().numpy())

            if ensemble_method == 'median':
                pred = np.nanmedian(np.array(outputs), axis=0)
            else:
                pred = np.array(outputs).mean(axis=0)
            final_outputs.append(pred)

    final_outputs = np.concatenate(final_outputs)
    return final_outputs


def predict_tabnet(X: pd.DataFrame,
                   model: Union[List[TabNetRegressor], TabNetRegressor],
                   scaler: StandardScaler,
                   ensemble_method='mean'):
    if not isinstance(model, list):
        model = [model]

    X_num, X_cat, cat_cols = preprocess_nn(X.copy(), scaler=scaler)
    X_processed = np.concatenate([X_cat, X_num], axis=1)

    predicted = []
    for m in model:
        predicted.append(m.predict(X_processed))

    if ensemble_method == 'median':
        pred = np.nanmedian(np.array(predicted), axis=0)
    else:
        pred = np.array(predicted).mean(axis=0)

    return pred


def train_tabnet(X: pd.DataFrame,
                 y: pd.DataFrame,
                 folds: List[Tuple],
                 batch_size: int = 1024,
                 lr: float = 1e-3,
                 model_path: str = 'fold_{}.pth',
                 scaler_type: str = 'standard',
                 output_dir: str = 'artifacts',
                 epochs: int = 250,
                 seed: int = 42,
                 n_pca: int = -1,
                 na_cols: bool = True,
                 patience: int = 10,
                 factor: float = 0.5,
                 gamma: float = 2.0,
                 lambda_sparse: float = 8.0,
                 n_steps: int = 2,
                 scheduler_type: str = 'cosine',
                 n_a: int = 16):
    seed_everything(seed)

    os.makedirs(output_dir, exist_ok=True)

    y = y.values.astype(np.float32)
    X_num, X_cat, cat_cols, scaler = preprocess_nn(X.copy(), scaler_type=scaler_type, n_pca=n_pca, na_cols=na_cols)

    best_losses = []
    best_predictions = []

    for cv_idx, (train_idx, valid_idx) in enumerate(folds):
        X_tr, X_va = X_num[train_idx], X_num[valid_idx]
        X_tr_cat, X_va_cat = X_cat[train_idx], X_cat[valid_idx]
        y_tr, y_va = y[train_idx], y[valid_idx]
        y_tr = y_tr.reshape(-1,1)
        y_va = y_va.reshape(-1,1)
        X_tr = np.concatenate([X_tr_cat, X_tr], axis=1)
        X_va = np.concatenate([X_va_cat, X_va], axis=1)

        cat_idxs = [0]
        cat_dims = [128]

        if scheduler_type == 'cosine':
            scheduler_params = dict(T_0=200, T_mult=1, eta_min=1e-4, last_epoch=-1, verbose=False)
            scheduler_fn = CosineAnnealingWarmRestarts
        else:
            scheduler_params = {'mode': 'min', 'min_lr': 1e-7, 'patience': patience, 'factor': factor, 'verbose': True}
            scheduler_fn = torch.optim.lr_scheduler.ReduceLROnPlateau

        model = TabNetRegressor(
            cat_idxs=cat_idxs,
            cat_dims=cat_dims,
            cat_emb_dim=1,
            n_d=n_a,
            n_a=n_a,
            n_steps=n_steps,
            gamma=gamma,
            n_independent=2,
            n_shared=2,
            lambda_sparse=lambda_sparse,
            optimizer_fn=torch.optim.Adam,
            optimizer_params={'lr': lr},
            mask_type="entmax",
            scheduler_fn=scheduler_fn,
            scheduler_params=scheduler_params,
            seed=seed,
            verbose=10
            #device_name=device,
            #clip_value=1.5
        )

        model.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], max_epochs=epochs, patience=50, batch_size=1024*20,
                  virtual_batch_size=batch_size, num_workers=4, drop_last=False, eval_metric=[RMSPE], loss_fn=RMSPELoss_Tabnet)

        path = os.path.join(output_dir, model_path.format(cv_idx))
        model.save_model(path)

        predicted = model.predict(X_va)

        rmspe = rmspe_metric(y_va, predicted)
        best_losses.append(rmspe)
        best_predictions.append(predicted)

    return best_losses, best_predictions, scaler, model


def train_nn(X: pd.DataFrame,
             y: pd.DataFrame,
             folds: List[Tuple],
             device,
             emb_dim: int = 25,
             batch_size: int = 1024,
             model_type: str = 'mlp',
             mlp_dropout: float = 0.0,
             mlp_hidden: int = 64,
             mlp_bn: bool = False,
             cnn_hidden: int = 64,
             cnn_channel1: int = 32,
             cnn_channel2: int = 32,
             cnn_channel3: int = 32,
             cnn_kernel1: int = 5,
             cnn_celu: bool = False,
             cnn_weight_norm: bool = False,
             dropout_emb: bool = 0.0,
             lr: float = 1e-3,
             weight_decay: float = 0.0,
             model_path: str = 'fold_{}.pth',
             scaler_type: str = 'standard',
             output_dir: str = 'artifacts',
             scheduler_type: str = 'onecycle',
             optimizer_type: str = 'adam',
             max_lr: float = 0.01,
             epochs: int = 30,
             seed: int = 42,
             n_pca: int = -1,
             batch_double_freq: int = 50,
             cnn_dropout: float = 0.1,
             na_cols: bool = True,
             cnn_leaky_relu: bool = False,
             patience: int = 8,
             factor: float = 0.5):
    seed_everything(seed)

    os.makedirs(output_dir, exist_ok=True)

    y = y.values.astype(np.float32)
    X_num, X_cat, cat_cols, scaler = preprocess_nn(X.copy(), scaler_type=scaler_type, n_pca=n_pca, na_cols=na_cols)

    best_losses = []
    best_predictions = []

    for cv_idx, (train_idx, valid_idx) in enumerate(folds):
        X_tr, X_va = X_num[train_idx], X_num[valid_idx]
        X_tr_cat, X_va_cat = X_cat[train_idx], X_cat[valid_idx]
        y_tr, y_va = y[train_idx], y[valid_idx]

        cur_batch = batch_size
        best_loss = 1e10
        best_prediction = None

        print(f"fold {cv_idx} train: {X_tr.shape}, valid: {X_va.shape}")

        train_dataset = TabularDataset(X_tr, X_tr_cat, y_tr)
        valid_dataset = TabularDataset(X_va, X_va_cat, y_va)
        train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=cur_batch, shuffle=True,
                                                   num_workers=4)
        valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=cur_batch, shuffle=False,
                                                   num_workers=4)

        if model_type == 'mlp':
            model = MLP(X_tr.shape[1],
                        n_categories=[128],
                        dropout=mlp_dropout, hidden=mlp_hidden, emb_dim=emb_dim,
                        dropout_cat=dropout_emb, bn=mlp_bn)
        elif model_type == 'cnn':
            model = CNN(X_tr.shape[1],
                        hidden_size=cnn_hidden,
                        n_categories=[128],
                        emb_dim=emb_dim,
                        dropout_cat=dropout_emb,
                        channel_1=cnn_channel1,
                        channel_2=cnn_channel2,
                        channel_3=cnn_channel3,
                        two_stage=False,
                        kernel1=cnn_kernel1,
                        celu=cnn_celu,
                        dropout_top=cnn_dropout,
                        dropout_mid=cnn_dropout,
                        dropout_bottom=cnn_dropout,
                        weight_norm=cnn_weight_norm,
                        leaky_relu=cnn_leaky_relu)
        else:
            raise NotImplementedError()
        model = model.to(device)

        if optimizer_type == 'adamw':
            opt = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
        elif optimizer_type == 'adam':
            opt = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
        else:
            raise NotImplementedError()

        scheduler = epoch_scheduler = None
        if scheduler_type == 'onecycle':
            scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer=opt, pct_start=0.1, div_factor=1e3,
                                                            max_lr=max_lr, epochs=epochs,
                                                            steps_per_epoch=len(train_loader))
        elif scheduler_type == 'reduce':
            epoch_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer=opt,
                                                                         mode='min',
                                                                         min_lr=1e-7,
                                                                         patience=patience,
                                                                         verbose=True,
                                                                         factor=factor)

        for epoch in range(epochs):
            if epoch > 0 and epoch % batch_double_freq == 0:
                cur_batch = cur_batch * 2
                print(f'batch: {cur_batch}')
                train_loader = torch.utils.data.DataLoader(train_dataset,
                                                           batch_size=cur_batch,
                                                           shuffle=True,
                                                           num_workers=4)
            train_loss = train_epoch(train_loader, model, opt, scheduler, device)
            predictions, valid_targets, valid_loss, rmspe = evaluate(valid_loader, model, device=device)
            print(f"epoch {epoch}, train loss: {train_loss:.3f}, valid rmspe: {rmspe:.3f}")

            if epoch_scheduler is not None:
                epoch_scheduler.step(rmspe)

            if rmspe < best_loss:
                print(f'new best:{rmspe}')
                best_loss = rmspe
                best_prediction = predictions
                torch.save(model, os.path.join(output_dir, model_path.format(cv_idx)))

        best_predictions.append(best_prediction)
        best_losses.append(best_loss)
        del model, train_dataset, valid_dataset, train_loader, valid_loader, X_tr, X_va, X_tr_cat, X_va_cat, y_tr, y_va, opt
        if scheduler is not None:
            del scheduler
        gc.collect()

    return best_losses, best_predictions, scaler
In [21]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

del df, df_train
gc.collect()

def get_top_n_models(models, scores, top_n):
    if len(models) <= top_n:
        print('number of models are less than top_n. all models will be used')
        return models
    sorted_ = [(y, x) for y, x in sorted(zip(scores, models), key=lambda pair: pair[0])]
    print(f'scores(sorted): {[y for y, _ in sorted_]}')
    return [x for _, x in sorted_][:top_n]


if PREDICT_MLP:
    model_paths = []
    scores = []
    
    if SHORTCUT_NN_IN_1ST_STAGE and IS_1ST_STAGE:
        print('shortcut to save quota...')
        epochs = 3
        valid_th = 100
    else:
        epochs = 30
        valid_th = NN_VALID_TH
    
    for i in range(NN_NUM_MODELS):
        # MLP
        nn_losses, nn_preds, scaler = train_nn(X, y, 
                                               [folds[-1]], 
                                               device=device, 
                                               batch_size=512,
                                               mlp_bn=True,
                                               mlp_hidden=256,
                                               mlp_dropout=0.0,
                                               emb_dim=30,
                                               epochs=epochs,
                                               lr=0.002,
                                               max_lr=0.0055,
                                               weight_decay=1e-7,
                                               model_path='mlp_fold_{}' + f"_seed{i}.pth",
                                               seed=i)
        if nn_losses[0] < NN_VALID_TH:
            print(f'model of seed {i} added.')
            scores.append(nn_losses[0])
            model_paths.append(f'artifacts/mlp_fold_0_seed{i}.pth')
            np.save(f'pred_mlp_seed{i}.npy', nn_preds[0])

    model_paths = get_top_n_models(model_paths, scores, NN_MODEL_TOP_N)
    mlp_model = [torch.load(path, device) for path in model_paths]
    print(f'total {len(mlp_model)} models will be used.')
if PREDICT_CNN:
    model_paths = []
    scores = []
        
    if SHORTCUT_NN_IN_1ST_STAGE and IS_1ST_STAGE:
        print('shortcut to save quota...')
        epochs = 3
        valid_th = 100
    else:
        epochs = 50
        valid_th = NN_VALID_TH

    for i in range(NN_NUM_MODELS):
        nn_losses, nn_preds, scaler = train_nn(X, y, 
                                               [folds[-1]], 
                                               device=device, 
                                               cnn_hidden=8*128,
                                               batch_size=1280,
                                               model_type='cnn',
                                               emb_dim=30,
                                               epochs=epochs, #epochs,
                                               cnn_channel1=128,
                                               cnn_channel2=3*128,
                                               cnn_channel3=3*128,
                                               lr=0.00038, #0.0011,
                                               max_lr=0.0013,
                                               weight_decay=6.5e-6,
                                               optimizer_type='adam',
                                               scheduler_type='reduce',
                                               model_path='cnn_fold_{}' + f"_seed{i}.pth",
                                               seed=i,
                                               cnn_dropout=0.0,
                                               cnn_weight_norm=True,
                                               cnn_leaky_relu=False,
                                               patience=8,
                                               factor=0.3)
        if nn_losses[0] < valid_th:
            model_paths.append(f'artifacts/cnn_fold_0_seed{i}.pth')
            scores.append(nn_losses[0])
            np.save(f'pred_cnn_seed{i}.npy', nn_preds[0])
            
    model_paths = get_top_n_models(model_paths, scores, NN_MODEL_TOP_N)
    cnn_model = [torch.load(path, device) for path in model_paths]
    print(f'total {len(cnn_model)} models will be used.')
    
if PREDICT_TABNET:
    tab_model = []
    scores = []
        
    if SHORTCUT_NN_IN_1ST_STAGE and IS_1ST_STAGE:
        print('shortcut to save quota...')
        epochs = 10
        valid_th = 1000
    else:
        print('train full')
        epochs = 250
        valid_th = NN_VALID_TH

    for i in range(TABNET_NUM_MODELS):
        nn_losses, nn_preds, scaler, model = train_tabnet(X, y,  
                                                          [folds[-1]], 
                                                          batch_size=1280,
                                                          epochs=epochs, #epochs,
                                                          lr=0.04,
                                                          patience=50,
                                                          factor=0.5,
                                                          gamma=1.6,
                                                          lambda_sparse=3.55e-6,
                                                          seed=i,
                                                          n_a=36)
        if nn_losses[0] < valid_th:
            tab_model.append(model)
            scores.append(nn_losses[0])
            np.save(f'pred_tab_seed{i}.npy', nn_preds[0])
            model.save_model(f'artifacts/tabnet_fold_0_seed{i}')
            
    tab_model = get_top_n_models(tab_model, scores, TAB_MODEL_TOP_N)
    print(f'total {len(tab_model)} models will be used.')
cuda
shortcut to save quota...
fold 0 train: (386036, 591), valid: (42896, 591)
Training: 100%|██████████| 754/754 [00:14<00:00, 52.54it/s]
Evaluating: 100%|██████████| 84/84 [00:01<00:00, 64.26it/s]
Training:   0%|          | 0/754 [00:00<?, ?it/s]
epoch 0, train loss: 13.423, valid rmspe: 2.135
new best:2.134801149368286
Training: 100%|██████████| 754/754 [00:12<00:00, 58.23it/s]
Evaluating: 100%|██████████| 84/84 [00:01<00:00, 61.72it/s]
Training:   0%|          | 0/754 [00:00<?, ?it/s]
epoch 1, train loss: 1.578, valid rmspe: 0.372
new best:0.3721766769886017
Training: 100%|██████████| 754/754 [00:13<00:00, 56.84it/s]
Evaluating: 100%|██████████| 84/84 [00:01<00:00, 62.81it/s]
epoch 2, train loss: 0.374, valid rmspe: 0.196
new best:0.19614197313785553
Training:   0%|          | 0/754 [00:00<?, ?it/s]
fold 0 train: (386036, 591), valid: (42896, 591)
Training: 100%|██████████| 754/754 [00:12<00:00, 59.13it/s]
Evaluating: 100%|██████████| 84/84 [00:01<00:00, 47.64it/s]
Training:   0%|          | 0/754 [00:00<?, ?it/s]
epoch 0, train loss: 14.878, valid rmspe: 1.592
new best:1.5919514894485474
Training: 100%|██████████| 754/754 [00:13<00:00, 55.61it/s]
Evaluating: 100%|██████████| 84/84 [00:01<00:00, 62.93it/s]
Training:   0%|          | 0/754 [00:00<?, ?it/s]
epoch 1, train loss: 1.399, valid rmspe: 0.411
new best:0.411074161529541
Training: 100%|██████████| 754/754 [00:13<00:00, 57.61it/s]
Evaluating: 100%|██████████| 84/84 [00:01<00:00, 57.46it/s]
epoch 2, train loss: 0.293, valid rmspe: 0.191
new best:0.19130554795265198
Training:   0%|          | 0/754 [00:00<?, ?it/s]
fold 0 train: (386036, 591), valid: (42896, 591)
Training: 100%|██████████| 754/754 [00:13<00:00, 57.68it/s]
Evaluating: 100%|██████████| 84/84 [00:01<00:00, 62.60it/s]
Training:   0%|          | 0/754 [00:00<?, ?it/s]
epoch 0, train loss: 13.788, valid rmspe: 4.318
new best:4.317901611328125
Training: 100%|██████████| 754/754 [00:13<00:00, 56.21it/s]
Evaluating: 100%|██████████| 84/84 [00:01<00:00, 45.41it/s]
Training:   0%|          | 0/754 [00:00<?, ?it/s]
epoch 1, train loss: 1.798, valid rmspe: 1.173
new best:1.1728029251098633
Training: 100%|██████████| 754/754 [00:13<00:00, 57.15it/s]
Evaluating: 100%|██████████| 84/84 [00:01<00:00, 61.82it/s]
epoch 2, train loss: 0.472, valid rmspe: 0.197
new best:0.19664736092090607
Training:   0%|          | 0/754 [00:00<?, ?it/s]
fold 0 train: (386036, 591), valid: (42896, 591)
Training: 100%|██████████| 754/754 [00:13<00:00, 55.77it/s]
Evaluating: 100%|██████████| 84/84 [00:01<00:00, 63.51it/s]
Training:   0%|          | 0/754 [00:00<?, ?it/s]
epoch 0, train loss: 15.260, valid rmspe: 2.225
new best:2.2247672080993652
Training: 100%|██████████| 754/754 [00:13<00:00, 56.30it/s]
Evaluating: 100%|██████████| 84/84 [00:01<00:00, 57.01it/s]
Training:   0%|          | 0/754 [00:00<?, ?it/s]
epoch 1, train loss: 1.615, valid rmspe: 0.433
new best:0.43293842673301697
Training: 100%|██████████| 754/754 [00:12<00:00, 58.52it/s]
Evaluating: 100%|██████████| 84/84 [00:01<00:00, 45.02it/s]
epoch 2, train loss: 0.389, valid rmspe: 0.196
new best:0.19590497016906738
Training:   0%|          | 0/754 [00:00<?, ?it/s]
fold 0 train: (386036, 591), valid: (42896, 591)
Training: 100%|██████████| 754/754 [00:13<00:00, 57.26it/s]
Evaluating: 100%|██████████| 84/84 [00:01<00:00, 54.34it/s]
Training:   0%|          | 0/754 [00:00<?, ?it/s]
epoch 0, train loss: 15.195, valid rmspe: 1.503
new best:1.5032968521118164
Training: 100%|██████████| 754/754 [00:13<00:00, 56.18it/s]
Evaluating: 100%|██████████| 84/84 [00:01<00:00, 62.39it/s]
Training:   0%|          | 0/754 [00:00<?, ?it/s]
epoch 1, train loss: 1.535, valid rmspe: 0.512
new best:0.511673092842102
Training: 100%|██████████| 754/754 [00:13<00:00, 55.82it/s]
Evaluating: 100%|██████████| 84/84 [00:01<00:00, 55.29it/s]
epoch 2, train loss: 0.336, valid rmspe: 0.195
new best:0.19460316002368927
Training:   0%|          | 0/754 [00:00<?, ?it/s]
fold 0 train: (386036, 591), valid: (42896, 591)
Training: 100%|██████████| 754/754 [00:14<00:00, 53.57it/s]
Evaluating: 100%|██████████| 84/84 [00:01<00:00, 62.33it/s]
Training:   0%|          | 0/754 [00:00<?, ?it/s]
epoch 0, train loss: 15.910, valid rmspe: 1.726
new best:1.726242184638977
Training: 100%|██████████| 754/754 [00:12<00:00, 60.33it/s]
Evaluating: 100%|██████████| 84/84 [00:01<00:00, 64.39it/s]
Training:   0%|          | 0/754 [00:00<?, ?it/s]
epoch 1, train loss: 1.908, valid rmspe: 0.649
new best:0.648505449295044
Training: 100%|██████████| 754/754 [00:13<00:00, 53.95it/s]
Evaluating: 100%|██████████| 84/84 [00:01<00:00, 57.04it/s]
epoch 2, train loss: 0.386, valid rmspe: 0.200
new best:0.2001459002494812
Training:   0%|          | 0/754 [00:00<?, ?it/s]
fold 0 train: (386036, 591), valid: (42896, 591)
Training: 100%|██████████| 754/754 [00:13<00:00, 56.19it/s]
Evaluating: 100%|██████████| 84/84 [00:01<00:00, 64.16it/s]
Training:   0%|          | 0/754 [00:00<?, ?it/s]
epoch 0, train loss: 13.716, valid rmspe: 2.127
new best:2.127309799194336
Training: 100%|██████████| 754/754 [00:13<00:00, 55.76it/s]
Evaluating: 100%|██████████| 84/84 [00:01<00:00, 63.20it/s]
Training:   0%|          | 0/754 [00:00<?, ?it/s]
epoch 1, train loss: 1.638, valid rmspe: 0.931
new best:0.9309726357460022
Training: 100%|██████████| 754/754 [00:12<00:00, 60.08it/s]
Evaluating: 100%|██████████| 84/84 [00:01<00:00, 54.69it/s]
epoch 2, train loss: 0.358, valid rmspe: 0.194
new best:0.19419467449188232
Training:   0%|          | 0/754 [00:00<?, ?it/s]
fold 0 train: (386036, 591), valid: (42896, 591)
Training: 100%|██████████| 754/754 [00:13<00:00, 56.06it/s]
Evaluating: 100%|██████████| 84/84 [00:01<00:00, 63.05it/s]
Training:   0%|          | 0/754 [00:00<?, ?it/s]
epoch 0, train loss: 14.441, valid rmspe: 1.609
new best:1.609104871749878
Training: 100%|██████████| 754/754 [00:13<00:00, 57.29it/s]
Evaluating: 100%|██████████| 84/84 [00:01<00:00, 63.66it/s]
Training:   0%|          | 0/754 [00:00<?, ?it/s]
epoch 1, train loss: 1.492, valid rmspe: 0.368
new best:0.36837005615234375
Training: 100%|██████████| 754/754 [00:13<00:00, 56.87it/s]
Evaluating: 100%|██████████| 84/84 [00:01<00:00, 63.39it/s]
epoch 2, train loss: 0.312, valid rmspe: 0.196
new best:0.19634036719799042
Training:   0%|          | 0/754 [00:00<?, ?it/s]
fold 0 train: (386036, 591), valid: (42896, 591)
Training: 100%|██████████| 754/754 [00:13<00:00, 54.20it/s]
Evaluating: 100%|██████████| 84/84 [00:01<00:00, 56.14it/s]
Training:   0%|          | 0/754 [00:00<?, ?it/s]
epoch 0, train loss: 15.310, valid rmspe: 2.719
new best:2.7193262577056885
Training: 100%|██████████| 754/754 [00:12<00:00, 59.99it/s]
Evaluating: 100%|██████████| 84/84 [00:01<00:00, 64.10it/s]
Training:   0%|          | 0/754 [00:00<?, ?it/s]
epoch 1, train loss: 1.696, valid rmspe: 0.640
new best:0.6401618123054504
Training: 100%|██████████| 754/754 [00:13<00:00, 56.66it/s]
Evaluating: 100%|██████████| 84/84 [00:01<00:00, 59.41it/s]
epoch 2, train loss: 0.385, valid rmspe: 0.195
new best:0.19467251002788544
Training:   0%|          | 0/754 [00:00<?, ?it/s]
fold 0 train: (386036, 591), valid: (42896, 591)
Training: 100%|██████████| 754/754 [00:12<00:00, 60.26it/s]
Evaluating: 100%|██████████| 84/84 [00:01<00:00, 60.93it/s]
Training:   0%|          | 0/754 [00:00<?, ?it/s]
epoch 0, train loss: 14.523, valid rmspe: 1.424
new best:1.4244930744171143
Training: 100%|██████████| 754/754 [00:14<00:00, 53.35it/s]
Evaluating: 100%|██████████| 84/84 [00:01<00:00, 59.69it/s]
Training:   0%|          | 0/754 [00:00<?, ?it/s]
epoch 1, train loss: 1.499, valid rmspe: 0.358
new best:0.35776957869529724
Training: 100%|██████████| 754/754 [00:12<00:00, 59.74it/s]
Evaluating: 100%|██████████| 84/84 [00:01<00:00, 64.18it/s]
epoch 2, train loss: 0.313, valid rmspe: 0.193
new best:0.19274553656578064
number of models are less than top_n. all models will be used
total 0 models will be used.
shortcut to save quota...
Training:   0%|          | 0/302 [00:00<?, ?it/s]
fold 0 train: (386036, 591), valid: (42896, 591)
Training: 100%|██████████| 302/302 [00:13<00:00, 21.87it/s]
Evaluating: 100%|██████████| 34/34 [00:01<00:00, 26.78it/s]
Training:   0%|          | 0/302 [00:00<?, ?it/s]
epoch 0, train loss: 36.847, valid rmspe: 13.479
new best:13.479052543640137
Training: 100%|██████████| 302/302 [00:12<00:00, 23.55it/s]
Evaluating: 100%|██████████| 34/34 [00:01<00:00, 18.71it/s]
Training:   0%|          | 0/302 [00:00<?, ?it/s]
epoch 1, train loss: 6.999, valid rmspe: 11.602
new best:11.602226257324219
Training: 100%|██████████| 302/302 [00:14<00:00, 21.45it/s]
Evaluating: 100%|██████████| 34/34 [00:01<00:00, 26.03it/s]
epoch 2, train loss: 6.338, valid rmspe: 3.183
new best:3.1833648681640625
Training:   0%|          | 0/302 [00:00<?, ?it/s]
fold 0 train: (386036, 591), valid: (42896, 591)
Training: 100%|██████████| 302/302 [00:13<00:00, 22.47it/s]
Evaluating: 100%|██████████| 34/34 [00:01<00:00, 27.63it/s]
Training:   0%|          | 0/302 [00:00<?, ?it/s]
epoch 0, train loss: 32.680, valid rmspe: 13.425
new best:13.425479888916016
Training: 100%|██████████| 302/302 [00:13<00:00, 22.07it/s]
Evaluating: 100%|██████████| 34/34 [00:01<00:00, 26.89it/s]
Training:   0%|          | 0/302 [00:00<?, ?it/s]
epoch 1, train loss: 9.076, valid rmspe: 5.088
new best:5.088461875915527
Training: 100%|██████████| 302/302 [00:13<00:00, 21.97it/s]
Evaluating: 100%|██████████| 34/34 [00:01<00:00, 26.27it/s]
epoch 2, train loss: 6.327, valid rmspe: 2.818
new best:2.8181731700897217
Training:   0%|          | 0/302 [00:00<?, ?it/s]
fold 0 train: (386036, 591), valid: (42896, 591)
Training: 100%|██████████| 302/302 [00:13<00:00, 21.80it/s]
Evaluating: 100%|██████████| 34/34 [00:01<00:00, 27.05it/s]
Training:   0%|          | 0/302 [00:00<?, ?it/s]
epoch 0, train loss: 26.282, valid rmspe: 7.520
new best:7.5200018882751465
Training: 100%|██████████| 302/302 [00:13<00:00, 22.44it/s]
Evaluating: 100%|██████████| 34/34 [00:01<00:00, 27.06it/s]
Training:   0%|          | 0/302 [00:00<?, ?it/s]
epoch 1, train loss: 5.948, valid rmspe: 2.820
new best:2.819704294204712
Training: 100%|██████████| 302/302 [00:14<00:00, 21.37it/s]
Evaluating: 100%|██████████| 34/34 [00:01<00:00, 27.10it/s]
epoch 2, train loss: 4.182, valid rmspe: 8.928
Training:   0%|          | 0/302 [00:00<?, ?it/s]
fold 0 train: (386036, 591), valid: (42896, 591)
Training: 100%|██████████| 302/302 [00:12<00:00, 23.34it/s]
Evaluating: 100%|██████████| 34/34 [00:01<00:00, 21.61it/s]
Training:   0%|          | 0/302 [00:00<?, ?it/s]
epoch 0, train loss: 30.133, valid rmspe: 6.609
new best:6.609215259552002
Training: 100%|██████████| 302/302 [00:13<00:00, 22.35it/s]
Evaluating: 100%|██████████| 34/34 [00:01<00:00, 18.61it/s]
Training:   0%|          | 0/302 [00:00<?, ?it/s]
epoch 1, train loss: 5.277, valid rmspe: 4.785
new best:4.785083770751953
Training: 100%|██████████| 302/302 [00:13<00:00, 23.15it/s]
Evaluating: 100%|██████████| 34/34 [00:01<00:00, 22.25it/s]
epoch 2, train loss: 4.854, valid rmspe: 3.869
new best:3.869030237197876
Training:   0%|          | 0/302 [00:00<?, ?it/s]
fold 0 train: (386036, 591), valid: (42896, 591)
Training: 100%|██████████| 302/302 [00:14<00:00, 20.47it/s]
Evaluating: 100%|██████████| 34/34 [00:01<00:00, 27.21it/s]
Training:   0%|          | 0/302 [00:00<?, ?it/s]
epoch 0, train loss: 33.445, valid rmspe: 13.552
new best:13.552135467529297
Training: 100%|██████████| 302/302 [00:12<00:00, 23.51it/s]
Evaluating: 100%|██████████| 34/34 [00:01<00:00, 27.56it/s]
Training:   0%|          | 0/302 [00:00<?, ?it/s]
epoch 1, train loss: 8.429, valid rmspe: 4.714
new best:4.714221000671387
Training: 100%|██████████| 302/302 [00:14<00:00, 21.04it/s]
Evaluating: 100%|██████████| 34/34 [00:01<00:00, 21.20it/s]
epoch 2, train loss: 5.189, valid rmspe: 1.880
new best:1.8802149295806885
Training:   0%|          | 0/302 [00:00<?, ?it/s]
fold 0 train: (386036, 591), valid: (42896, 591)
Training: 100%|██████████| 302/302 [00:13<00:00, 22.87it/s]
Evaluating: 100%|██████████| 34/34 [00:01<00:00, 26.25it/s]
Training:   0%|          | 0/302 [00:00<?, ?it/s]
epoch 0, train loss: 31.151, valid rmspe: 6.718
new best:6.717589378356934
Training: 100%|██████████| 302/302 [00:14<00:00, 20.82it/s]
Evaluating: 100%|██████████| 34/34 [00:01<00:00, 27.14it/s]
Training:   0%|          | 0/302 [00:00<?, ?it/s]
epoch 1, train loss: 8.873, valid rmspe: 9.657
Training: 100%|██████████| 302/302 [00:12<00:00, 23.39it/s]
Evaluating: 100%|██████████| 34/34 [00:01<00:00, 27.16it/s]
epoch 2, train loss: 7.172, valid rmspe: 2.419
new best:2.4192187786102295
Training:   0%|          | 0/302 [00:00<?, ?it/s]
fold 0 train: (386036, 591), valid: (42896, 591)
Training: 100%|██████████| 302/302 [00:14<00:00, 20.95it/s]
Evaluating: 100%|██████████| 34/34 [00:01<00:00, 27.31it/s]
Training:   0%|          | 0/302 [00:00<?, ?it/s]
epoch 0, train loss: 31.107, valid rmspe: 7.231
new best:7.230500221252441
Training: 100%|██████████| 302/302 [00:12<00:00, 23.58it/s]
Evaluating: 100%|██████████| 34/34 [00:01<00:00, 26.90it/s]
Training:   0%|          | 0/302 [00:00<?, ?it/s]
epoch 1, train loss: 8.856, valid rmspe: 6.785
new best:6.785274982452393
Training: 100%|██████████| 302/302 [00:14<00:00, 20.35it/s]
Evaluating: 100%|██████████| 34/34 [00:01<00:00, 25.35it/s]
epoch 2, train loss: 5.506, valid rmspe: 1.750
new best:1.7495239973068237
Training:   0%|          | 0/302 [00:00<?, ?it/s]
fold 0 train: (386036, 591), valid: (42896, 591)
Training: 100%|██████████| 302/302 [00:13<00:00, 23.11it/s]
Evaluating: 100%|██████████| 34/34 [00:01<00:00, 19.41it/s]
Training:   0%|          | 0/302 [00:00<?, ?it/s]
epoch 0, train loss: 32.531, valid rmspe: 10.042
new best:10.041739463806152
Training: 100%|██████████| 302/302 [00:14<00:00, 21.16it/s]
Evaluating: 100%|██████████| 34/34 [00:01<00:00, 27.62it/s]
Training:   0%|          | 0/302 [00:00<?, ?it/s]
epoch 1, train loss: 5.759, valid rmspe: 5.931
new best:5.930790901184082
Training: 100%|██████████| 302/302 [00:12<00:00, 23.52it/s]
Evaluating: 100%|██████████| 34/34 [00:01<00:00, 27.30it/s]
epoch 2, train loss: 6.079, valid rmspe: 4.910
new best:4.910189628601074
Training:   0%|          | 0/302 [00:00<?, ?it/s]
fold 0 train: (386036, 591), valid: (42896, 591)
Training: 100%|██████████| 302/302 [00:14<00:00, 20.92it/s]
Evaluating: 100%|██████████| 34/34 [00:01<00:00, 26.44it/s]
Training:   0%|          | 0/302 [00:00<?, ?it/s]
epoch 0, train loss: 32.629, valid rmspe: 12.124
new best:12.124483108520508
Training: 100%|██████████| 302/302 [00:13<00:00, 22.55it/s]
Evaluating: 100%|██████████| 34/34 [00:01<00:00, 27.21it/s]
Training:   0%|          | 0/302 [00:00<?, ?it/s]
epoch 1, train loss: 7.210, valid rmspe: 6.183
new best:6.182739734649658
Training: 100%|██████████| 302/302 [00:14<00:00, 21.14it/s]
Evaluating: 100%|██████████| 34/34 [00:01<00:00, 27.32it/s]
epoch 2, train loss: 5.386, valid rmspe: 2.422
new best:2.422102451324463
Training:   0%|          | 0/302 [00:00<?, ?it/s]
fold 0 train: (386036, 591), valid: (42896, 591)
Training: 100%|██████████| 302/302 [00:13<00:00, 22.08it/s]
Evaluating: 100%|██████████| 34/34 [00:01<00:00, 22.36it/s]
Training:   0%|          | 0/302 [00:00<?, ?it/s]
epoch 0, train loss: 39.311, valid rmspe: 8.752
new best:8.75162124633789
Training: 100%|██████████| 302/302 [00:14<00:00, 21.09it/s]
Evaluating: 100%|██████████| 34/34 [00:01<00:00, 26.81it/s]
Training:   0%|          | 0/302 [00:00<?, ?it/s]
epoch 1, train loss: 7.640, valid rmspe: 2.339
new best:2.339479684829712
Training: 100%|██████████| 302/302 [00:13<00:00, 22.45it/s]
Evaluating: 100%|██████████| 34/34 [00:01<00:00, 27.50it/s]
epoch 2, train loss: 5.291, valid rmspe: 2.516
scores(sorted): [1.749524, 1.8802149, 2.3394797, 2.4192188, 2.4221025, 2.8181732, 2.8197043, 3.1833649, 3.8690302, 4.9101896]
total 5 models will be used.
In [22]:
del X, y
gc.collect()
Out[22]:
20

Inference

In [23]:
X_test = get_X(df_test)
print(X_test.shape)
(3, 584)
In [24]:
df_pred = pd.DataFrame()
df_pred['row_id'] = df_test['stock_id'].astype(str) + '-' + df_test['time_id'].astype(str)

predictions = {}

prediction_weights = {}

if PREDICT_GBDT:
    gbdt_preds = booster.predict(X_test)
    predictions['gbdt'] = gbdt_preds
    prediction_weights['gbdt'] = 4


if PREDICT_MLP and mlp_model:
    try:
        mlp_preds = predict_nn(X_test, mlp_model, scaler, device, ensemble_method=ENSEMBLE_METHOD)
        print(f'mlp: {mlp_preds.shape}')
        predictions['mlp'] = mlp_preds
        prediction_weights['mlp'] = 1
    except:
        print(f'failed to predict mlp: {traceback.format_exc()}')


if PREDICT_CNN and cnn_model:
    try:
        cnn_preds = predict_nn(X_test, cnn_model, scaler, device, ensemble_method=ENSEMBLE_METHOD)
        print(f'cnn: {cnn_preds.shape}')
        predictions['cnn'] = cnn_preds
        prediction_weights['cnn'] = 4
    except:
        print(f'failed to predict cnn: {traceback.format_exc()}')


if PREDICT_TABNET and tab_model:
    try:
        tab_preds = predict_tabnet(X_test, tab_model, scaler, ensemble_method=ENSEMBLE_METHOD).flatten()
        print(f'tab: {tab_preds.shape}')
        predictions['tab'] = tab_preds
        prediction_weights['tab'] = 1
    except:
        print(f'failed to predict tab: {traceback.format_exc()}')

        
overall_preds = None
overall_weight = np.sum(list(prediction_weights.values()))

print(f'prediction will be made by: {list(prediction_weights.keys())}')

for name, preds in predictions.items():
    w = prediction_weights[name] / overall_weight
    if overall_preds is None:
        overall_preds = preds * w
    else:
        overall_preds += preds * w
        
df_pred['target'] = np.clip(overall_preds, 0, None)
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  3.57it/s]
cnn: (3,)
prediction will be made by: ['gbdt', 'cnn']

In [25]:
sub = pd.read_csv(os.path.join(DATA_DIR, 'optiver-realized-volatility-prediction', 'sample_submission.csv'))
submission = pd.merge(sub[['row_id']], df_pred[['row_id', 'target']], how='left')
submission['target'] = submission['target'].fillna(0)
submission.to_csv('submission.csv', index=False)