# Imports
import math
import seaborn as sns
import statsmodels.api as sm
import sys
sys.path.append('/home/ubuntu/workplace/python_notebook/cost_model')
import experiment as exp
from config import DatabaseConfig
from database_instance import DatabaseInstance
database_config = DatabaseConfig(connection_string='mongodb://localhost',
                                 database_name='abt_calibration', dump_path='',
                                 restore_from_dump=False, dump_on_exit=False)
database = DatabaseInstance(database_config)
df = await exp.load_calibration_data(database, 'calibrationDataBonsai')
num_of_query = df.groupby('run_id')
num_of_query.count()  # Number of queries per run.
| run_id | _id | collection | pipeline | explain | query_parameters | sbe | abt | total_execution_time |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 63174d294d9a8dbc9dfaefef | 40 | 40 | 40 | 40 | 40 | 40 | 40 | 40 |
| 63174d294d9a8dbc9dfaeff0 | 40 | 40 | 40 | 40 | 40 | 40 | 40 | 40 |
| 63174d294d9a8dbc9dfaeff1 | 40 | 40 | 40 | 40 | 40 | 40 | 40 | 40 |
| 63174d294d9a8dbc9dfaeff2 | 40 | 40 | 40 | 40 | 40 | 40 | 40 | 40 |
| 63174d2a4d9a8dbc9dfaeff3 | 40 | 40 | 40 | 40 | 40 | 40 | 40 | 40 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 6317793a870b4c95d45f99ad | 40 | 40 | 40 | 40 | 40 | 40 | 40 | 40 |
| 631779cd870b4c95d45f99ae | 40 | 40 | 40 | 40 | 40 | 40 | 40 | 40 |
| 63177aa6870b4c95d45f99af | 40 | 40 | 40 | 40 | 40 | 40 | 40 | 40 |
| 63177ab2870b4c95d45f99b0 | 40 | 40 | 40 | 40 | 40 | 40 | 40 | 40 |
| 63177acd870b4c95d45f99b1 | 40 | 40 | 40 | 40 | 40 | 40 | 40 | 40 |

414 rows × 8 columns
df.count()  # Total number of results.

_id                     16560
run_id                  16560
collection              16560
pipeline                16560
explain                 16560
query_parameters        16560
sbe                     16560
abt                     16560
total_execution_time    16560
dtype: int64
abt_df = exp.extract_abt_nodes(df)  # Extract ABT nodes from the explain results.
abt_df.count()

abt_type                          71760
execution_time                    71760
n_returned                        71760
n_processed                       71760
keys_length_in_bytes              71760
average_document_size_in_bytes    71760
run_id                            71760
pipeline                          71760
dtype: int64
abt_df.groupby(['abt_type']).count()
| abt_type | execution_time | n_returned | n_processed | keys_length_in_bytes | average_document_size_in_bytes | run_id | pipeline |
| --- | --- | --- | --- | --- | --- | --- | --- |
| BinaryJoin | 11040 | 11040 | 11040 | 11040 | 11040 | 11040 | 11040 |
| Filter | 5520 | 5520 | 5520 | 5520 | 5520 | 5520 | 5520 |
| IndexScan | 11040 | 11040 | 11040 | 11040 | 11040 | 11040 | 11040 |
| LimitSkip | 11040 | 11040 | 11040 | 11040 | 11040 | 11040 | 11040 |
| PhysicalScan | 5520 | 5520 | 5520 | 5520 | 5520 | 5520 | 5520 |
| Root | 16560 | 16560 | 16560 | 16560 | 16560 | 16560 | 16560 |
| Seek | 11040 | 11040 | 11040 | 11040 | 11040 | 11040 | 11040 |
ixscan_df = abt_df[abt_df['abt_type'] == 'IndexScan']
ixscan_df.describe()
| | execution_time | n_returned | n_processed | keys_length_in_bytes | average_document_size_in_bytes |
| --- | --- | --- | --- | --- | --- |
| count | 11040.000000 | 1.104000e+04 | 1.104000e+04 | 11040.000000 | 11040.000000 |
| mean | 361.064583 | 1.060934e+05 | 1.060944e+05 | 9.333333 | 132.913043 |
| std | 906.640286 | 2.946677e+05 | 2.946677e+05 | 2.054898 | 0.281784 |
| min | 12.000000 | 0.000000e+00 | 1.000000e+00 | 7.000000 | 132.000000 |
| 25% | 16.000000 | 1.185000e+02 | 1.195000e+02 | 7.000000 | 133.000000 |
| 50% | 31.000000 | 2.235500e+03 | 2.236500e+03 | 9.000000 | 133.000000 |
| 75% | 195.000000 | 4.770900e+04 | 4.771000e+04 | 12.000000 | 133.000000 |
| max | 6108.000000 | 1.787009e+06 | 1.787010e+06 | 12.000000 | 133.000000 |
abt_df.groupby(['abt_type', 'run_id']).mean()
| abt_type | run_id | execution_time | n_returned | n_processed | keys_length_in_bytes | average_document_size_in_bytes |
| --- | --- | --- | --- | --- | --- | --- |
| BinaryJoin | 63174d294d9a8dbc9dfaefef | 8.750 | 5.0 | 10.0 | 9.0 | 133.0 |
| | 63174d294d9a8dbc9dfaeff0 | 7.625 | 4.0 | 8.0 | 9.0 | 133.0 |
| | 63174d294d9a8dbc9dfaeff2 | 13.000 | 10.0 | 20.0 | 9.0 | 133.0 |
| | 63174d2a4d9a8dbc9dfaeff3 | 10.475 | 7.0 | 14.0 | 9.0 | 133.0 |
| | 63174d2a4d9a8dbc9dfaeff5 | 19.900 | 16.0 | 32.0 | 9.0 | 133.0 |
| ... | ... | ... | ... | ... | ... | ... |
| Seek | 631777c7870b4c95d45f99aa | 65848.100 | 107410.0 | 107410.0 | 12.0 | 133.0 |
| | 631778af870b4c95d45f99ac | 90141.900 | 1784547.0 | 1784547.0 | 7.0 | 133.0 |
| | 6317793a870b4c95d45f99ad | 89804.825 | 1787009.0 | 1787009.0 | 7.0 | 133.0 |
| | 63177aa6870b4c95d45f99af | 71699.450 | 119110.0 | 119110.0 | 12.0 | 133.0 |
| | 63177ab2870b4c95d45f99b0 | 71605.100 | 118546.0 | 118546.0 | 12.0 | 133.0 |

1794 rows × 5 columns
abt_df_mean = abt_df.groupby(['abt_type', 'run_id']).mean().execution_time
abt_df_n_processed = abt_df.groupby(['abt_type', 'run_id']).mean().n_processed
sns.scatterplot(x=abt_df_mean, y=abt_df_n_processed)
# The mean of execution times over 40 runs is correlated with the number of processed rows.
<AxesSubplot:xlabel='execution_time', ylabel='n_processed'>
abt_df_median = abt_df.groupby(['abt_type', 'run_id']).median().execution_time
sns.scatterplot(x=abt_df_median, y=abt_df_n_processed)
# The median of execution times over 40 runs is also correlated with the number of processed rows.
<AxesSubplot:xlabel='execution_time', ylabel='n_processed'>
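As a quick numeric check of the median relationship discussed above (a sketch reusing abt_df_median and abt_df_n_processed from the previous cells):

median_coef = abt_df_median.corr(abt_df_n_processed)  # Pearson correlation of the per-run medians with n_processed.
print(median_coef)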
ixscan_df = abt_df[abt_df['abt_type'] == 'IndexScan']
ixscan_df_std = ixscan_df.groupby(['abt_type', 'run_id']).std().execution_time
ixscan_df_n_processed = ixscan_df.groupby(['abt_type', 'run_id']).mean().n_processed
sns.scatterplot(x=ixscan_df_std, y=ixscan_df_n_processed)
# The stddev of execution times over 40 runs for 'IndexScan'. Replace 'IndexScan' and re-run to see the results for other ABT types.
<AxesSubplot:xlabel='execution_time', ylabel='n_processed'>
ixscan_df = abt_df[abt_df['abt_type'] == 'IndexScan']
ixscan_df_mean = ixscan_df.groupby(['abt_type', 'run_id']).mean().execution_time
mean_coef = ixscan_df_mean.corr(ixscan_df_n_processed)
std_coef = ixscan_df_std.corr(ixscan_df_n_processed)
# stddev is also correlated with the number of processed rows, but not as strongly as the 'mean' and 'median' correlations with 'n_processed'.
print(mean_coef, std_coef)
0.9975969235577371 0.9424998942544887
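Given how linear the mean relationship looks, a simple least-squares fit is a natural next step. This is only a sketch using the statsmodels import from the top of the notebook, not part of the original calibration flow:

# Sketch: fit execution_time ~ n_processed for 'IndexScan' with OLS.
# The intercept approximates a fixed startup cost, the slope a per-row cost.
X = sm.add_constant(ixscan_df_n_processed)
ols_fit = sm.OLS(ixscan_df_mean, X).fit()
print(ols_fit.params)
print(ols_fit.rsquared)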
from scipy.stats import zscore
import scipy
def kstest_with_abt_type(abt_type: str):
    res = []
    abt_groups = abt_df.groupby(['abt_type', 'run_id', 'pipeline'])
    for query in abt_groups:
        if query[0][0] != abt_type:  # Keep only groups of the requested ABT type.
            continue
        frame = query[1].execution_time.to_frame()
        # Z-score the execution times so they can be compared against the standard normal distribution.
        zs = frame.apply(zscore)
        zs_list = zs.squeeze().values.tolist()
        pval = scipy.stats.kstest(zs_list, 'norm')
        # pval = scipy.stats.kstest(zs_list, 't', [120])
        res.append(pval.pvalue)
        # if pval.pvalue < 0.05:
        #     print(query[0][2])  # Output the pipelines with p-value below 0.05. No obvious pattern. :(
    cnt = 0
    for p in res:
        if p > 0.05:
            cnt += 1
    # Output the percentage of queries whose execution times are normally distributed.
    print(str(cnt / len(res) * 100) + "%")
kstest_with_abt_type('IndexScan')
58.333333333333336%
kstest_with_abt_type('Seek')
64.13043478260869%
kstest_with_abt_type('Filter')
53.62318840579711%
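The K-S percentages only give a pass/fail summary; a Q-Q plot of one group's z-scores is a quick visual complement (a sketch; the group choice is arbitrary, and sm comes from the imports at the top of the notebook):

# Sketch: Q-Q plot of z-scored execution times for one (abt_type, run_id, pipeline) group.
group = next(iter(abt_df.groupby(['abt_type', 'run_id', 'pipeline'])))[1]
zs = zscore(group.execution_time)
sm.qqplot(zs, line='45')  # Points close to the 45-degree line suggest normality.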
from scipy import stats
from scipy.stats import zscore

# Assumption: abt_df_id is abt_df with the ObjectId run_ids re-mapped to integer
# ids 0..413 so they can be iterated with range(); one possible construction:
abt_df_id = abt_df.copy()
abt_df_id['run_id'] = abt_df_id['run_id'].astype('category').cat.codes

degrees_res = []
for degree_of_freedom in range(1, 40):
    res_ls = []
    for run_id in range(414):
        idx_run_id = abt_df_id[(abt_df_id['run_id'] == run_id)
                               & (abt_df_id['abt_type'] == 'IndexScan')].execution_time
        # Calculate the z-score of each individual execution time of the query
        # (execution times of about 36 runs).
        if idx_run_id.count() > 0:
            frame = idx_run_id.to_frame()
            zs = frame.apply(zscore)
            # Calculate the p-value of the K-S test.
            zs_list = zs.squeeze().values.tolist()
            pval = stats.kstest(zs_list, 't', [degree_of_freedom])  # Specify the distribution here, e.g., 'norm'.
            res_ls.append(pval.pvalue)
    cnt = 0
    for p in res_ls:
        if p > 0.05:
            cnt += 1
    degrees_res.append((cnt / len(res_ls), degree_of_freedom))
degrees_res.sort()
for val in degrees_res:
    print(val)
# Output the fraction of queries whose execution times follow a Student's t
# distribution, for degrees of freedom from 1 to 39.
(0.5833333333333334, 1)
(0.6268115942028986, 21)
(0.6268115942028986, 22)
(0.6268115942028986, 23)
(0.6268115942028986, 24)
(0.6268115942028986, 25)
(0.6268115942028986, 26)
(0.6268115942028986, 27)
(0.6268115942028986, 28)
(0.6268115942028986, 29)
(0.6268115942028986, 30)
(0.6268115942028986, 31)
(0.6268115942028986, 32)
(0.6268115942028986, 33)
(0.6268115942028986, 34)
(0.6268115942028986, 35)
(0.6268115942028986, 36)
(0.6268115942028986, 37)
(0.6268115942028986, 38)
(0.6268115942028986, 39)
(0.6304347826086957, 19)
(0.6304347826086957, 20)
(0.6340579710144928, 4)
(0.6340579710144928, 5)
(0.6340579710144928, 6)
(0.6340579710144928, 7)
(0.6340579710144928, 8)
(0.6340579710144928, 9)
(0.6340579710144928, 17)
(0.6340579710144928, 18)
(0.6376811594202898, 10)
(0.6376811594202898, 11)
(0.6376811594202898, 12)
(0.6376811594202898, 13)
(0.6376811594202898, 14)
(0.6376811594202898, 15)
(0.6376811594202898, 16)
(0.644927536231884, 3)
(0.6485507246376812, 2)
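For convenience, the best-fitting degrees of freedom can be read off programmatically (a sketch; degrees_res holds (fraction, degree_of_freedom) pairs from the sweep above):

best_fraction, best_dof = max(degrees_res)  # Highest fraction of queries passing the K-S test.
print(best_dof, best_fraction)  # The sweep above points to 2 degrees of freedom at about 64.9%.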