import numpy as np
import pandas as pd
import itertools
import time
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn import preprocessing
from matplotlib.ticker import FormatStrFormatter
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
##Suppress Warning##
def warn(*args, **kwargs):
pass
import warnings
warnings.warn = warn
def data_split(X_full, Y_full, train_size):
'''
Sample n=200 data from the original dataset to be used as training data
X_full: Full X data matrix (for communities/meps/blog)
Y_full: Full Y data matrix (for communities/meps/blog)
Return: A list of data matrices of train/test(or validation) pairs
'''
n_tot = len(Y_full)
idx = np.random.choice(n_tot, train_size, replace=False)
not_idx = [i not in idx for i in range(n_tot)]
X_train = X_full[idx, :]
X_test = X_full[not_idx, :]
Y_train = Y_full[idx, ]
Y_test = Y_full[not_idx, ]
return([X_train, X_test, Y_train, Y_test])
def generate_bootstrap_samples(n, m, B):
'''
Return: B-by-m matrix, where row b gives the indices for b-th bootstrap sample
'''
samples_idx = np.zeros((B, m),dtype=int)
for b in range(B):
sample_idx = np.random.choice(n, m)
samples_idx[b, :] = sample_idx
return(samples_idx)
def fit_bootstrap_models(X_train, Y_train, X_predict, fit_muh_fun, n, m, B):
'''
Train B bootstrap estimators and calculate predictions on X_predict
Return: list of matrices [M,P]
samples_idx = B-by-m matrix, row b = indices of b-th bootstrap sample
predictions = B-by-n1 matrix, row b = predictions from b-th bootstrap sample
(n1=len(X_predict))
'''
samples_idx = generate_bootstrap_samples(n, m, B)
n1 = len(X_predict)
# P holds the predictions from individual bootstrap estimators
predictions = np.zeros((B, n1), dtype=float)
for b in range(B):
predictions[b] = fit_muh_fun(X_train[samples_idx[b], :],\
Y_train[samples_idx[b], ], X_predict)
return([samples_idx, predictions])
# Define Baseline Regressor to be used later
def ridge2(X,Y,X1,ridge_mult=0.001):
# named ridge2 to distinguish it from the ridge regressor in sklearn.
lam = ridge_mult * np.linalg.svd(X,compute_uv=False).max()**2
betahat = np.linalg.solve(\
np.c_[np.ones(X.shape[0]),X].T.dot(np.c_[np.ones(X.shape[0]),X]) \
+ lam * np.diag(np.r_[0,np.ones(X.shape[1])]),
np.c_[np.ones(X.shape[0]),X].T.dot(Y))
return betahat[0] + X1.dot(betahat[1:])
def neural_net(X,Y,X1):
nnet = MLPRegressor(solver='lbfgs',activation='logistic').fit(X,Y)
return nnet.predict(X1)
def random_forest_nB(X,Y,X1,ntree=20):
# when bootstrap=False, it means each tree is trained on all rows of X and only
# subsamples its columns (which are features).
rf = RandomForestRegressor(n_estimators=ntree,criterion='mae',bootstrap=False,n_jobs=-1).fit(X,Y)
return rf.predict(X1)
Include mean, median, and trimmed mean aggregation
def compute_PIs_jacknife_plus_after_bootstrap(X,Y,X1,alpha,fit_muh_fun,B,m,aggre):
'''
Using mean aggregation
'''
n=len(X)
n1 = len(X1)
[boot_samples_idx,boot_predictions] = \
fit_bootstrap_models(X, Y, np.vstack((X,X1)), fit_muh_fun, n, m, B)
in_boot_sample = np.zeros((B,n),dtype=bool)
for b in range(len(in_boot_sample)):
in_boot_sample[b,boot_samples_idx[b]] = True
resids_LOO = np.zeros(n)
muh_LOO_vals_testpoint = np.zeros((n,n1))
for i in range(n):
b_keep = np.argwhere(~(in_boot_sample[:,i])).reshape(-1)
if(len(b_keep)>0):
if aggre=='mean':
resids_LOO[i] = np.abs(Y[i] - boot_predictions[b_keep,i].mean())
muh_LOO_vals_testpoint[i] = boot_predictions[b_keep,n:].mean(0)
elif aggre=='median':
resids_LOO[i] = np.abs(Y[i] - np.median(boot_predictions[b_keep,i]))
muh_LOO_vals_testpoint[i] = np.median(boot_predictions[b_keep,n:],axis=0)
elif aggre=='tmean':
resids_LOO[i] = np.abs(Y[i] - stats.trim_mean(boot_predictions[b_keep,i],0.25))
muh_LOO_vals_testpoint[i] = stats.trim_mean(boot_predictions[b_keep,n:],0.25,axis=0)
else: # if aggregating an empty set of models, predict zero everywhere
resids_LOO[i] = np.abs(Y[i])
muh_LOO_vals_testpoint[i] = np.zeros(n1)
ind_q = (np.ceil((1-alpha)*(n+1))).astype(int)
###############################
# construct prediction intervals
###############################
return pd.DataFrame(\
np.c_[np.sort(muh_LOO_vals_testpoint.T - resids_LOO,axis=1).T[-ind_q], \
np.sort(muh_LOO_vals_testpoint.T + resids_LOO,axis=1).T[ind_q-1]],\
columns = ['lower','upper'])
Include mean, median, and trimmed mean aggregation
def compute_PIs_jacknife_plus_naive(X,Y,X1,alpha,fit_muh_fun,B,m,aggre):
'''
Using mean aggregation
'''
n=len(X)
n1 = len(X1)
resids_LOO = np.zeros(n)
muh_LOO_vals_testpoint = np.zeros((n,n1))
for i in range(n):
boot_predictions = fit_bootstrap_models(np.delete(X,i,0), Y, np.vstack((X,X1)), fit_muh_fun, n-1, m, B)[1]
if aggre=='mean':
resids_LOO[i] = np.abs(Y[i] - np.mean(boot_predictions[:,i]))
muh_LOO_vals_testpoint[i] = np.mean(boot_predictions[:,n:],axis=0)
elif aggre=='median':
resids_LOO[i] = np.abs(Y[i] - np.median(boot_predictions[:,i]))
muh_LOO_vals_testpoint[i] = np.median(boot_predictions[:,n:],axis=0)
elif aggre=='tmean':
resids_LOO[i] = np.abs(Y[i] - stats.trim_mean(boot_predictions[:,i],0.25))
muh_LOO_vals_testpoint[i] = stats.trim_mean(boot_predictions[:,n:],0.25,axis=0)
ind_q = (np.ceil((1-alpha)*(n+1))).astype(int)
###############################
# construct prediction intervals
###############################
return pd.DataFrame(\
np.c_[np.sort(muh_LOO_vals_testpoint.T - resids_LOO,axis=1).T[-ind_q], \
np.sort(muh_LOO_vals_testpoint.T + resids_LOO,axis=1).T[ind_q-1]],\
columns = ['lower','upper'])
def compute_PIs_jacknife_plus(X,Y,X1,alpha,fit_muh_fun):
n = len(X)
n1 = len(X1)
muh_vals = fit_muh_fun(X,Y,np.r_[X,X1])
resids_naive = np.abs(Y-muh_vals[:n])
muh_vals_testpoint = muh_vals[n:]
resids_LOO = np.zeros(n)
muh_LOO_vals_testpoint = np.zeros((n,n1))
for i in range(n):
muh_vals_LOO = fit_muh_fun(np.delete(X,i,0),np.delete(Y,i),\
np.r_[X[i].reshape((1,-1)),X1])
resids_LOO[i] = np.abs(Y[i] - muh_vals_LOO[0])
muh_LOO_vals_testpoint[i] = muh_vals_LOO[1:]
ind_q = (np.ceil((1-alpha)*(n+1))).astype(int)
###############################
# construct prediction intervals
###############################
return pd.DataFrame(\
np.c_[np.sort(muh_LOO_vals_testpoint.T - resids_LOO,axis=1).T[-ind_q], \
np.sort(muh_LOO_vals_testpoint.T + resids_LOO,axis=1).T[ind_q-1]],\
columns = ['lower','upper'])
# UCI Communities and Crime Data Set
# http://archive.ics.uci.edu/ml/datasets/communities+and+crime
communities_data = np.loadtxt('communities_data.csv',delimiter=',',dtype=str)
# remove categorical predictors
communities_data = np.delete(communities_data, np.arange(5), 1)
# remove predictors with missing values
communities_data = np.delete(communities_data, np.argwhere(
(communities_data == '?').sum(0) > 0).reshape(-1), 1)
communities_data = communities_data.astype(float)
X_communities = communities_data[:, :-1]
Y_communities = communities_data[:, -1]
# UCI BlogFeedback data set
# https://archive.ics.uci.edu/ml/datasets/BlogFeedback
blog_data = np.loadtxt('blogData_train.csv',delimiter=',')
X_blog = blog_data[:, :-1]
Y_blog = np.log(1+blog_data[:, -1])
# MEPS data set
# data downloaded (in .ssp format) from:
# https://meps.ahrq.gov/mepsweb/data_files/pufs/h192ssp.zip
# then run get_meps_data.ipynb script
# to perform feature selection & data cleaning, & store to .txt file
meps_data = np.loadtxt('meps_data.txt')
X_meps = meps_data[:, :-1]
Y_meps = meps_data[:, -1]
Compare J+aB, J+ Ensemble, and J+ Non-ensemble under the same parameter choices
# Initialize Parameters
alpha= 0.1
tot_trial = 10
train_size = 40 # training size n
m_vals = np.round(train_size*np.linspace(0.2,1,num=5)).astype(int)
B_ = 20 # number of bootstrap samples in J+aB is drawn as B ~ Binomial(int(B_/(1-1/(n+1))^m),(1-1/(n+1))^m)
# Run Mean, Median, and Trimmed Mean
fit_funcs = [ridge2,neural_net,random_forest_nB]
Data_name = ['meps','communities','blog']
for agg in ['mean','median','tmean']:
results = pd.DataFrame(columns = ['itrial','dataset','muh_fun','method','coverage','width','time'])
for data_name in Data_name:
X = eval('X_'+data_name)
Y = eval('Y_'+data_name)
for ifunc in range(len(fit_funcs)):
fit_func = fit_funcs[ifunc]
print(f'Under {agg}: running '+fit_func.__name__+' on '+data_name)
for itrial in range(tot_trial):
print(f'At trial # {itrial}')
np.random.seed(98765+itrial)
[X_train, X_test, Y_train, Y_test] = data_split(X, Y, train_size)
# Run J+ Non-ensemble
print('On J+ Non-ensemble')
start_time=time.time()
PIs = compute_PIs_jacknife_plus(X_train,Y_train,X_test,alpha,fit_func)
coverage = ((PIs['lower'] <= Y_test)&(PIs['upper'] >= Y_test)).mean()
width = (PIs['upper'] - PIs['lower']).mean()
results.loc[len(results)]=\
[itrial,data_name,fit_func.__name__,'jack+',coverage,width,time.time()-start_time]
# Run J+ Ensemble and J+aB across all m values at train size = 40
for m in m_vals:
# Naive J+
print('On J+ Ensemble')
start_time=time.time()
PIs = compute_PIs_jacknife_plus_naive(\
X_train,Y_train,X_test,alpha,fit_func,B_,m,agg)
coverage = ((PIs['lower'] <= Y_test)&(PIs['upper'] >= Y_test)).mean()
width = (PIs['upper'] - PIs['lower']).mean()
results.loc[len(results)]=\
[itrial,data_name,fit_func.__name__,'jack+-naive (m='+str(m)+')',\
coverage,width,time.time()-start_time]
# J+aB
start_time=time.time()
print('On J+aB')
B = int(np.random.binomial(int(B_/((1-1./(1+train_size))**m*(1-1./train_size)**m)),(1-1./(1+train_size))**m,size=1))
PIs = compute_PIs_jacknife_plus_after_bootstrap(\
X_train,Y_train,X_test,alpha,fit_func,B,m,agg)
coverage = ((PIs['lower'] <= Y_test)&(PIs['upper'] >= Y_test)).mean()
width = (PIs['upper'] - PIs['lower']).mean()
results.loc[len(results)]=\
[itrial,data_name,fit_func.__name__,'jack+aB (m='+str(m)+')',\
coverage,width,time.time()-start_time]
results.to_csv(f'{data_name}-main-results-{agg}.csv',index=False)
Compare J+aB with Random $B$ and with Fixed $B'$, making sure $E(B)=B'$
# Initialize Parameters
alpha= 0.1
tot_trial = 10
train_size = 200 # training size n
m_vals = np.round(train_size*np.linspace(0.1,1,num=10)).astype(int)
B_ = 50 # number of bootstrap samples in J+aB is drawn as B ~ Binomial(int(B_/(1-1/(n+1))^m),(1-1/(n+1))^m)
# Run Mean, Random vs. Fixed J+aB
fit_funcs = [ridge2,neural_net,random_forest_nB]
Data_name = ['meps','communities','blog']
for agg in ['mean','median','tmean']:
results = pd.DataFrame(columns = ['itrial','dataset','muh_fun','method','coverage','width','time'])
for data_name in Data_name:
X = eval('X_'+data_name)
Y = eval('Y_'+data_name)
for ifunc in range(len(fit_funcs)):
fit_func = fit_funcs[ifunc]
print(f'Under {agg}: running '+fit_func.__name__+' on '+data_name)
for itrial in range(tot_trial):
print(f'trial # {itrial}')
np.random.seed(98765+itrial)
[X_train, X_test, Y_train, Y_test] = data_split(X, Y, train_size)
# Run jackknife+aB, random B & non-random B
for m in m_vals:
# J+aB, random B
start_time=time.time()
B = int(np.random.binomial(int(B_/(1-1./(1+train_size))**m),(1-1./(1+train_size))**m,size=1))
PIs = compute_PIs_jacknife_plus_after_bootstrap(\
X_train,Y_train,X_test,alpha,fit_func,B,m,agg)
coverage = ((PIs['lower'] <= Y_test)&(PIs['upper'] >= Y_test)).mean()
width = (PIs['upper'] - PIs['lower']).mean()
results.loc[len(results)]=\
[itrial,data_name,fit_func.__name__,'random-jack+aB (m='+str(m)+')',\
coverage,width,time.time()-start_time]
# J+aB, fixed B
start_time=time.time()
PIs = compute_PIs_jacknife_plus_after_bootstrap(\
X_train,Y_train,X_test,alpha,fit_func,B_,m,agg)
coverage = ((PIs['lower'] <= Y_test)&(PIs['upper'] >= Y_test)).mean()
width = (PIs['upper'] - PIs['lower']).mean()
results.loc[len(results)]=\
[itrial,data_name,fit_func.__name__,'fixed-jack+aB (m='+str(m)+')',\
coverage,width,time.time()-start_time]
results.to_csv(f'fixed-random-B-{agg}.csv',index=False)
# For comparing J+aB, J+ Ensemble, and J+ Non-ensemble under different aggregation method
def grouped_box_agg(agg, type):
size = 6
if type == 'coverage':
f, ax = plt.subplots(3, 3, figsize=(size, size), sharex=True, sharey=True)
else:
f, ax = plt.subplots(3, 3, figsize=(size, size), sharex=True, sharey='row')
f.tight_layout(pad=-0.3)
i = 0 # row, denote dataset
regrs = ['ridge2', 'random_forest_nB', 'neural_net']
regrs_label = {'ridge2': 'RIDGE', 'random_forest_nB': "RF", 'neural_net': "NN"}
# titles = {'mean': 'Mean', 'median': "Median", 'tmean': 'Trimmed Mean'}
titles = {'meps': 'MEPS', 'blog': "BLOG", 'communities': "COMMUNITIES"}
font_size = 12
label_size = 14
for dataname in ['meps', 'blog', 'communities']:
j = 0 # column, denote regressor
# for agg in ['mean', 'median', 'tmean']:
for regr in regrs:
results = pd.read_csv(f'{dataname}-main-results-{agg}.csv')
results.loc[results.method == 'jack+', 'method'] = 'jack+ (m=0)' # for plotting
m_vals = np.array([int(mtd.split('=')[1].split(')')[0])
for mtd in results.method]) # get m value
method_name = np.array([mtd.split(' ')[0] for mtd in results.method])
results['method'] = method_name
# For proper legend
results['method'].replace('jack+-naive', 'J+ ENSEMBLE', inplace=True)
results['method'].replace('jack+aB', 'J+aB', inplace=True)
results['method'].replace('jack+', 'J+ NON-ENSEMBLE', inplace=True)
results['ratio'] = m_vals/max(m_vals)
mtd = ['J+ ENSEMBLE', 'J+aB', 'J+ NON-ENSEMBLE']
mtd_colors = ['orange', 'royalblue', 'green']
color_dict = dict(zip(mtd, mtd_colors)) # specify colors for each box
# Start plotting
sns.boxplot(y=type, x='ratio',
data=results[results.muh_fun == regr],
palette=color_dict,
hue='method',
hue_order=['J+ NON-ENSEMBLE', 'J+ ENSEMBLE', 'J+aB'],
ax=ax[i, j])
# Remove tick for J+ Non-ensemble
xtick = ax[i, j].xaxis.get_major_ticks()
xtick[0].set_visible(False)
ax[i, j].tick_params(axis='both', which='major', labelsize=font_size)
ax[i, j].get_legend().remove()
if type == 'width':
ax[i, j].yaxis.set_major_formatter(FormatStrFormatter('%.1f')) # no decimal
# control y-range
result_regr = results
ax[i, j].set(ylim=(max(min(result_regr.width)-0.1, 0),
max(result_regr.width)+0.1))
if type == 'coverage':
ax[i, j].axhline(y=0.9, color='black')
if j == 0:
# Y-label on
ax[i, 0].set_ylabel(titles[dataname], fontsize=label_size)
if i == 2:
# X-label on
ax[2, 0].set_xlabel('m/n', fontsize=label_size)
ax[2, j].tick_params(axis='x', rotation=45)
else:
# X-label off
x_axis = ax[i, j].axes.get_xaxis()
x_axis.set_visible(False)
else:
# Y-label off but keep ticks
ax[i, j].set_ylabel('')
if i == 2:
# X-label on
ax[2, j].set_xlabel('m/n', fontsize=label_size)
ax[2, j].tick_params(axis='x', rotation=45)
else:
# X-label off
x_axis = ax[i, j].axes.get_xaxis()
x_axis.set_visible(False)
# Control Title
if i == 0:
ax[0, j].set_title(regrs_label[regr], fontsize=label_size)
if j == 1 and type == 'coverage':
ax[0, j].set_title(f'Coverage \n {regrs_label[regr]}', fontsize=label_size)
if j == 1 and type == 'width':
ax[0, j].set_title(f'Width \n {regrs_label[regr]}', fontsize=label_size)
j += 1
i += 1
# Legend lastly
# Assign to top middle
ax[2, 1].legend(loc='upper center',
bbox_to_anchor=(0.4, -0.4), ncol=3, fontsize=font_size)
# it would of course be better with a nicer handle to the middle-bottom axis object, but since I know it is the second last one in my 3 x 3 grid...
plt.savefig(f'Boxplots_{agg}_{type}.pdf', dpi=300, bbox_inches='tight',
pad_inches=0)
for agg in ['mean', 'median', 'tmean']:
grouped_box_agg(agg, 'coverage')
grouped_box_agg(agg, 'width')
# For comparing random vs. fixed B on J+aB
def grouped_box_rand_fix(agg, type):
size = 6
if type == 'coverage':
f, ax = plt.subplots(3, 3, figsize=(size, size), sharex=True, sharey=True)
else:
# all plots in same row share y-axis
f, ax = plt.subplots(3, 3, figsize=(size, size), sharex=True, sharey='row')
f.tight_layout(pad=-0.3)
i = 0 # row, denote regressor
regrs = ['ridge2', 'random_forest_nB', 'neural_net']
regrs_label = {'ridge2': 'RIDGE', 'random_forest_nB': "RF", 'neural_net': "NN"}
# titles = {'mean': 'Mean', 'median': "Median", 'tmean': 'Trimmed Mean'}
titles = {'meps': 'MEPS', 'blog': "BLOG", 'communities': "COMMUNITIES"}
font_size = 12
label_size=14
for dataname in ['meps', 'blog', 'communities']:
j = 0 # column, denote aggregator
# for agg in ['mean', 'median', 'tmean']:
for regr in regrs:
results = pd.read_csv(f'fixed-random-B-{agg}.csv')
results = results[results.dataset == dataname]
m_vals = np.array([int(mtd.split('=')[1].split(')')[0])
for mtd in results.method]) # get m value
method_name = np.array([mtd.split(' ')[0] for mtd in results.method])
results['method'] = method_name
# For proper legend
results['method'].replace('random-jack+aB', 'J+aB Random B', inplace=True)
results['method'].replace('fixed-jack+aB', 'J+aB Fixed B', inplace=True)
results['ratio'] = m_vals/max(m_vals)
# which_idx = np.array([rate in np.array([0.2, 0.4, 0.6, 0.8, 1.0]) for rate in results.ratio])
results = results.loc[results['ratio'].isin([0.2, 0.4, 0.6, 0.8, 1.0])]
mtd = ['J+aB Random B', 'J+aB Fixed B']
mtd_colors = ['orange', 'royalblue']
color_dict = dict(zip(mtd, mtd_colors)) # specify colors for each box
# Start plotting
sns.boxplot(y=type, x='ratio',
data=results[results.muh_fun == regr],
palette=color_dict,
hue='method',
hue_order=mtd,
ax=ax[i, j])
if type == 'coverage':
ax[i, j].axhline(y=0.9, color='black')
else:
ax[i, j].yaxis.set_major_formatter(FormatStrFormatter('%.1f')) # 1 decimal
# control y-range
result_regr = results
ax[i, j].set(ylim=(max(min(result_regr.width)-0.1, 0),
max(result_regr.width)+0.1))
ax[i, j].tick_params(axis='both', which='major', labelsize=14)
ax[i, j].get_legend().remove()
# # Control legend
# if i == 0 and j == 2:
# ax[i, j].legend(fontsize=font_size)
# else:
# ax[i, j].get_legend().remove()
# Control y and x-label
if j == 0:
# Y-label on
ax[i, 0].set_ylabel(titles[dataname], fontsize=label_size)
if i == 2:
# X-label on
ax[2, 0].set_xlabel('m/n', fontsize=label_size)
ax[2, j].tick_params(axis='x', rotation=45)
else:
# X-label off
x_axis = ax[i, j].axes.get_xaxis()
x_axis.set_visible(False)
else:
# Y-label off but keep ticks
# y_axis = ax[i, j].axes.get_yaxis()
# y_axis.set_visible(False)
ax[i, j].set_ylabel('')
if i == 2:
# X-label on
ax[2, j].set_xlabel('m/n', fontsize=label_size)
ax[2, j].tick_params(axis='x', rotation=45)
else:
# X-label off
x_axis = ax[i, j].axes.get_xaxis()
x_axis.set_visible(False)
# Control Title
if i == 0:
ax[0, j].set_title(regrs_label[regr], fontsize=label_size)
if j == 1 and type == 'coverage':
ax[0, j].set_title(f'Coverage \n {regrs_label[regr]}', fontsize=label_size)
if j == 1 and type == 'width':
ax[0, j].set_title(f'Width \n {regrs_label[regr]}', fontsize=label_size)
j += 1
i += 1
# Legend lastly
ax[2, 1].legend(loc='upper center',
bbox_to_anchor=(0.5, -0.4), ncol=2, fontsize=font_size)
# it would of course be better with a nicer handle to the middle-bottom axis object, but since I know it is the second last one in my 3 x 3 grid...
plt.savefig(f'Boxplots_rand_fixed_{agg}_{type}.pdf', dpi=300, bbox_inches='tight',
pad_inches=0)
for agg in ['mean', 'median', 'tmean']:
grouped_box_rand_fix(agg, 'coverage')
grouped_box_rand_fix(agg, 'width')
nrow = 9
ncol = 3
row_names = ['Ridge mean', 'Ridge median', 'Ridge tmean', 'RF mean',
'RF median', 'RF tmean', 'NN mean', 'NN median', 'NN tmean']
col_names = ['J+aB', 'J+ ENSEMBLE', 'J+ NON-ENSEMBLE']
meps_run_time = np.zeros((nrow, ncol))
blog_run_time = np.zeros((nrow, ncol))
communities_run_time = np.zeros((nrow, ncol))
for dname in ['meps', 'blog', 'communities']:
np.set_printoptions(suppress=True) # no scientific notation
i = 0 # denote which rows are saturated
for agg in ['mean', 'median', 'tmean']:
results = pd.read_csv(f'{dname}-main-results-{agg}.csv')
results.loc[results.method == 'jack+', 'method'] = 'jack+ (m=0)' # for plotting
m_vals = np.array([int(mtd.split('=')[1].split(')')[0])
for mtd in results.method]) # get m value
method_name = np.array([mtd.split(' ')[0] for mtd in results.method])
results['method'] = method_name
# For proper legend
results['method'].replace('jack+-naive', 'J+ ENSEMBLE', inplace=True)
results['method'].replace('jack+aB', 'J+aB', inplace=True)
results['method'].replace('jack+', 'J+ NON-ENSEMBLE', inplace=True)
results['ratio'] = m_vals/max(m_vals)
results = results[(results.method == 'J+ NON-ENSEMBLE') | (results.ratio == 0.6)]
time = results.groupby(by=['muh_fun', 'method'])['time'].mean()
time = time.reindex(['J+aB', 'J+ ENSEMBLE', 'J+ NON-ENSEMBLE'], level='method')
time = time.reindex(['ridge2', 'random_forest_nB', 'neural_net'], level='muh_fun')
time = list(time)
for regr in ['ridge2', 'random_forest_nB', 'neural_net']:
eval(f'{dname}_run_time')[i, :] = time[:3] # By ridge
eval(f'{dname}_run_time')[i+3, :] = time[3:6] # By RF
eval(f'{dname}_run_time')[i+6, :] = time[6:9] # By NN
i += 1
# Average J+aB results, since it does not aggregate
for j in range(3):
eval(f'{dname}_run_time')[3*j, -1] = np.mean(eval(f'{dname}_run_time')[3*j:3*j+3, -1])
eval(f'{dname}_run_time')[3*j+1:3*j+3, -1] = 0
time_table = eval(f'{dname}_run_time')
# Save to one decimal place
exec(f'{dname}_run_time=np.round(time_table, 1)')
# Correspond to current Table S1 format
meps_run_time = pd.DataFrame(meps_run_time, index=row_names, columns=col_names)
blog_run_time = pd.DataFrame(blog_run_time, index=row_names, columns=col_names)
communities_run_time = pd.DataFrame(communities_run_time, index=row_names, columns=col_names)
print(meps_run_time)
print(blog_run_time)
print(communities_run_time)