# Imports
import math
import seaborn as sns
import statsmodels.api as sm
import sys
sys.path.append('/home/ubuntu/workplace/python_notebook/cost_model')
import experiment as exp
from config import DatabaseConfig
from database_instance import DatabaseInstance
database_config = DatabaseConfig(connection_string='mongodb://localhost',
                                 database_name='abt_calibration', dump_path='',
                                 restore_from_dump=False, dump_on_exit=False)
database = DatabaseInstance(database_config)
df = await exp.load_calibration_data(database, 'calibrationDataBonsai')
num_of_query = df.groupby('run_id')
num_of_query.count()  # Number of queries per run.
| run_id | _id | collection | pipeline | explain | query_parameters | sbe | abt | total_execution_time |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 63174d294d9a8dbc9dfaefef | 40 | 40 | 40 | 40 | 40 | 40 | 40 | 40 |
| 63174d294d9a8dbc9dfaeff0 | 40 | 40 | 40 | 40 | 40 | 40 | 40 | 40 |
| 63174d294d9a8dbc9dfaeff1 | 40 | 40 | 40 | 40 | 40 | 40 | 40 | 40 |
| 63174d294d9a8dbc9dfaeff2 | 40 | 40 | 40 | 40 | 40 | 40 | 40 | 40 |
| 63174d2a4d9a8dbc9dfaeff3 | 40 | 40 | 40 | 40 | 40 | 40 | 40 | 40 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 6317793a870b4c95d45f99ad | 40 | 40 | 40 | 40 | 40 | 40 | 40 | 40 |
| 631779cd870b4c95d45f99ae | 40 | 40 | 40 | 40 | 40 | 40 | 40 | 40 |
| 63177aa6870b4c95d45f99af | 40 | 40 | 40 | 40 | 40 | 40 | 40 | 40 |
| 63177ab2870b4c95d45f99b0 | 40 | 40 | 40 | 40 | 40 | 40 | 40 | 40 |
| 63177acd870b4c95d45f99b1 | 40 | 40 | 40 | 40 | 40 | 40 | 40 | 40 |

414 rows × 8 columns
df.count()  # Total number of results.

_id                     16560
run_id                  16560
collection              16560
pipeline                16560
explain                 16560
query_parameters        16560
sbe                     16560
abt                     16560
total_execution_time    16560
dtype: int64
abt_df = exp.extract_abt_nodes(df)  # Extract ABT nodes from the explain results.
abt_df.count()

abt_type                          71760
execution_time                    71760
n_returned                        71760
n_processed                       71760
keys_length_in_bytes              71760
average_document_size_in_bytes    71760
run_id                            71760
pipeline                          71760
dtype: int64
abt_df.groupby(['abt_type']).count()
| abt_type | execution_time | n_returned | n_processed | keys_length_in_bytes | average_document_size_in_bytes | run_id | pipeline |
| --- | --- | --- | --- | --- | --- | --- | --- |
| BinaryJoin | 11040 | 11040 | 11040 | 11040 | 11040 | 11040 | 11040 |
| Filter | 5520 | 5520 | 5520 | 5520 | 5520 | 5520 | 5520 |
| IndexScan | 11040 | 11040 | 11040 | 11040 | 11040 | 11040 | 11040 |
| LimitSkip | 11040 | 11040 | 11040 | 11040 | 11040 | 11040 | 11040 |
| PhysicalScan | 5520 | 5520 | 5520 | 5520 | 5520 | 5520 | 5520 |
| Root | 16560 | 16560 | 16560 | 16560 | 16560 | 16560 | 16560 |
| Seek | 11040 | 11040 | 11040 | 11040 | 11040 | 11040 | 11040 |
ixscan_df = abt_df[abt_df['abt_type'] == 'IndexScan']
ixscan_df.describe()
| | execution_time | n_returned | n_processed | keys_length_in_bytes | average_document_size_in_bytes |
| --- | --- | --- | --- | --- | --- |
| count | 11040.000000 | 1.104000e+04 | 1.104000e+04 | 11040.000000 | 11040.000000 |
| mean | 361.064583 | 1.060934e+05 | 1.060944e+05 | 9.333333 | 132.913043 |
| std | 906.640286 | 2.946677e+05 | 2.946677e+05 | 2.054898 | 0.281784 |
| min | 12.000000 | 0.000000e+00 | 1.000000e+00 | 7.000000 | 132.000000 |
| 25% | 16.000000 | 1.185000e+02 | 1.195000e+02 | 7.000000 | 133.000000 |
| 50% | 31.000000 | 2.235500e+03 | 2.236500e+03 | 9.000000 | 133.000000 |
| 75% | 195.000000 | 4.770900e+04 | 4.771000e+04 | 12.000000 | 133.000000 |
| max | 6108.000000 | 1.787009e+06 | 1.787010e+06 | 12.000000 | 133.000000 |
abt_df.groupby(['abt_type', 'run_id']).mean()
| abt_type | run_id | execution_time | n_returned | n_processed | keys_length_in_bytes | average_document_size_in_bytes |
| --- | --- | --- | --- | --- | --- | --- |
| BinaryJoin | 63174d294d9a8dbc9dfaefef | 8.750 | 5.0 | 10.0 | 9.0 | 133.0 |
| | 63174d294d9a8dbc9dfaeff0 | 7.625 | 4.0 | 8.0 | 9.0 | 133.0 |
| | 63174d294d9a8dbc9dfaeff2 | 13.000 | 10.0 | 20.0 | 9.0 | 133.0 |
| | 63174d2a4d9a8dbc9dfaeff3 | 10.475 | 7.0 | 14.0 | 9.0 | 133.0 |
| | 63174d2a4d9a8dbc9dfaeff5 | 19.900 | 16.0 | 32.0 | 9.0 | 133.0 |
| ... | ... | ... | ... | ... | ... | ... |
| Seek | 631777c7870b4c95d45f99aa | 65848.100 | 107410.0 | 107410.0 | 12.0 | 133.0 |
| | 631778af870b4c95d45f99ac | 90141.900 | 1784547.0 | 1784547.0 | 7.0 | 133.0 |
| | 6317793a870b4c95d45f99ad | 89804.825 | 1787009.0 | 1787009.0 | 7.0 | 133.0 |
| | 63177aa6870b4c95d45f99af | 71699.450 | 119110.0 | 119110.0 | 12.0 | 133.0 |
| | 63177ab2870b4c95d45f99b0 | 71605.100 | 118546.0 | 118546.0 | 12.0 | 133.0 |

1794 rows × 5 columns
abt_df_mean = abt_df.groupby(['abt_type', 'run_id']).mean().execution_time
abt_df_n_processed = abt_df.groupby(['abt_type', 'run_id']).mean().n_processed
sns.scatterplot(x=abt_df_mean, y=abt_df_n_processed)
# The mean of execution times over 40 runs is correlated with the number of processed rows.
<AxesSubplot:xlabel='execution_time', ylabel='n_processed'>
abt_df_median = abt_df.groupby(['abt_type', 'run_id']).median().execution_time
sns.scatterplot(x=abt_df_median, y=abt_df_n_processed)
# The median of execution times over 40 runs is also correlated with the number of processed rows.
<AxesSubplot:xlabel='execution_time', ylabel='n_processed'>
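As a quick numeric check of the median relationship discussed above (a sketch reusing abt_df_median and abt_df_n_processed from the previous cells):

median_coef = abt_df_median.corr(abt_df_n_processed)  # Pearson correlation of the per-run medians with n_processed.
print(median_coef)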
ixscan_df = abt_df[abt_df['abt_type'] == 'IndexScan']
ixscan_df_std = ixscan_df.groupby(['abt_type', 'run_id']).std().execution_time
ixscan_df_n_processed = ixscan_df.groupby(['abt_type', 'run_id']).mean().n_processed
sns.scatterplot(x=ixscan_df_std, y=ixscan_df_n_processed)
# The stddev of execution times over 40 runs for 'IndexScan'. Replace 'IndexScan' and re-run to see the results for other ABT types.
<AxesSubplot:xlabel='execution_time', ylabel='n_processed'>
ixscan_df = abt_df[abt_df['abt_type'] == 'IndexScan']
ixscan_df_mean = ixscan_df.groupby(['abt_type', 'run_id']).mean().execution_time
mean_coef = ixscan_df_mean.corr(ixscan_df_n_processed)
std_coef = ixscan_df_std.corr(ixscan_df_n_processed)
# stddev is also correlated with the number of processed rows, but not as strongly as the 'mean' and 'median' correlations with 'n_processed'.
print(mean_coef, std_coef)
0.9975969235577371 0.9424998942544887
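Given how linear the mean relationship looks, a simple least-squares fit is a natural next step. This is only a sketch using the statsmodels import from the top of the notebook, not part of the original calibration flow:

# Sketch: fit execution_time ~ n_processed for 'IndexScan' with OLS.
# The intercept approximates a fixed startup cost, the slope a per-row cost.
X = sm.add_constant(ixscan_df_n_processed)
ols_fit = sm.OLS(ixscan_df_mean, X).fit()
print(ols_fit.params)
print(ols_fit.rsquared)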
from scipy.stats import zscore
import scipy
def kstest_with_abt_type(abt_type: str):
    res = []
    abt_groups = abt_df.groupby(['abt_type', 'run_id', 'pipeline'])
    for query in abt_groups:
        if query[0][0] != abt_type:  # Keep only groups of the requested ABT type.
            continue
        frame = query[1].execution_time.to_frame()
        # Z-score the execution times so they can be compared against the standard normal distribution.
        zs = frame.apply(zscore)
        zs_list = zs.squeeze().values.tolist()
        pval = scipy.stats.kstest(zs_list, 'norm')
        # pval = scipy.stats.kstest(zs_list, 't', [120])
        res.append(pval.pvalue)
        # if pval.pvalue < 0.05:
        #     print(query[0][2])  # Output the pipelines with p-value below 0.05. No obvious pattern. :(
    cnt = 0
    for p in res:
        if p > 0.05:
            cnt += 1
    # Output the percentage of queries whose execution times are normally distributed.
    print(str(cnt / len(res) * 100) + "%")
kstest_with_abt_type('IndexScan')
58.333333333333336%
kstest_with_abt_type('Seek')
64.13043478260869%
kstest_with_abt_type('Filter')
53.62318840579711%
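The K-S percentages only give a pass/fail summary; a Q-Q plot of one group's z-scores is a quick visual complement (a sketch; the group choice is arbitrary, and sm comes from the imports at the top of the notebook):

# Sketch: Q-Q plot of z-scored execution times for one (abt_type, run_id, pipeline) group.
group = next(iter(abt_df.groupby(['abt_type', 'run_id', 'pipeline'])))[1]
zs = zscore(group.execution_time)
sm.qqplot(zs, line='45')  # Points close to the 45-degree line suggest normality.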
from scipy import stats
from scipy.stats import zscore

# Assumption: abt_df_id is abt_df with the ObjectId run_ids re-mapped to integer
# ids 0..413 so they can be iterated with range(); one possible construction:
abt_df_id = abt_df.copy()
abt_df_id['run_id'] = abt_df_id['run_id'].astype('category').cat.codes

degrees_res = []
for degree_of_freedom in range(1, 40):
    res_ls = []
    for run_id in range(414):
        idx_run_id = abt_df_id[(abt_df_id['run_id'] == run_id)
                               & (abt_df_id['abt_type'] == 'IndexScan')].execution_time
        # Calculate the z-score of each individual execution time of the query
        # (execution times of about 36 runs).
        if idx_run_id.count() > 0:
            frame = idx_run_id.to_frame()
            zs = frame.apply(zscore)
            # Calculate the p-value of the K-S test.
            zs_list = zs.squeeze().values.tolist()
            pval = stats.kstest(zs_list, 't', [degree_of_freedom])  # Specify the distribution here, e.g., 'norm'.
            res_ls.append(pval.pvalue)
    cnt = 0
    for p in res_ls:
        if p > 0.05:
            cnt += 1
    degrees_res.append((cnt / len(res_ls), degree_of_freedom))
degrees_res.sort()
for val in degrees_res:
    print(val)
# Output the fraction of queries whose execution times follow a Student's t
# distribution, for degrees of freedom from 1 to 39.
(0.5833333333333334, 1)
(0.6268115942028986, 21)
(0.6268115942028986, 22)
(0.6268115942028986, 23)
(0.6268115942028986, 24)
(0.6268115942028986, 25)
(0.6268115942028986, 26)
(0.6268115942028986, 27)
(0.6268115942028986, 28)
(0.6268115942028986, 29)
(0.6268115942028986, 30)
(0.6268115942028986, 31)
(0.6268115942028986, 32)
(0.6268115942028986, 33)
(0.6268115942028986, 34)
(0.6268115942028986, 35)
(0.6268115942028986, 36)
(0.6268115942028986, 37)
(0.6268115942028986, 38)
(0.6268115942028986, 39)
(0.6304347826086957, 19)
(0.6304347826086957, 20)
(0.6340579710144928, 4)
(0.6340579710144928, 5)
(0.6340579710144928, 6)
(0.6340579710144928, 7)
(0.6340579710144928, 8)
(0.6340579710144928, 9)
(0.6340579710144928, 17)
(0.6340579710144928, 18)
(0.6376811594202898, 10)
(0.6376811594202898, 11)
(0.6376811594202898, 12)
(0.6376811594202898, 13)
(0.6376811594202898, 14)
(0.6376811594202898, 15)
(0.6376811594202898, 16)
(0.644927536231884, 3)
(0.6485507246376812, 2)
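For convenience, the best-fitting degrees of freedom can be read off programmatically (a sketch; degrees_res holds (fraction, degree_of_freedom) pairs from the sweep above):

best_fraction, best_dof = max(degrees_res)  # Highest fraction of queries passing the K-S test.
print(best_dof, best_fraction)  # The sweep above points to 2 degrees of freedom at about 64.9%.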