import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_theme()
# Seed both random number generators so results are reproducible.
# The simulations in this notebook draw from `np.random`, which is NOT
# affected by `random.seed`, so NumPy's global generator is seeded as well.
random.seed(42)
np.random.seed(42)


# Load the raw A/B test log and preview its first rows.
df = pd.read_csv('Data/ab_data.csv')
df.head(5)


# Number of rows in the raw dataset.
len(df)

294478


# Number of distinct users in the raw dataset.
df['user_id'].nunique()

290584


# Share of converted rows; the mean is a proportion because the column is 0/1.
df['converted'].mean()

0.11965919355605512


# # Proportion not converted - this works for 0s and 1s
# 1 - df.converted.mean()


# The same proportion derived from explicit row counts. This form also works
# when the column holds strings or booleans rather than 0/1.
# `count()` counts rows; `nunique()` would count distinct users instead.
total_rows = df.shape[0]
converted = df.query('converted == 1').user_id.count()
not_converted = df.query('converted == 0').user_id.count()
proportion_converted = converted / total_rows
not_converted, converted, proportion_converted

(259241, 35237, 0.11965919355605512)


# Rows where the assigned group and the page actually shown disagree.
treatment_on_old = df.query('group == "treatment" and landing_page == "old_page"')
control_on_new = df.query('group == "control" and landing_page == "new_page"')
no_aligment1 = treatment_on_old['timestamp'].count()
no_aligment2 = control_on_new['timestamp'].count()

# Rows where the treatment group saw the new page, i.e. correctly aligned.
treatment_on_new = df.query('group == "treatment" and landing_page == "new_page"')
aligment3 = treatment_on_new['timestamp'].count()

# Total number of misaligned rows.
no_aligment = no_aligment1 + no_aligment2
no_aligment

3893


# Count of missing values per column.
df.isna().sum()

user_id         0
timestamp       0
group           0
landing_page    0
converted       0
dtype: int64


# Remove rows where group and landing page disagree, one mismatch type at a time.
treatment_saw_old = (df['group'] == 'treatment') & (df['landing_page'] == 'old_page')
df_drop1 = df.drop(df[treatment_saw_old].index)

control_saw_new = (df_drop1['group'] == 'control') & (df_drop1['landing_page'] == 'new_page')
df2 = df_drop1.drop(df_drop1[control_saw_new].index)


# Sanity check: number of mismatched rows still present - this should be 0
is_treatment = df2['group'] == 'treatment'
is_new_page = df2['landing_page'] == 'new_page'
df2[is_treatment != is_new_page].shape[0]

0


# Number of distinct users in the cleaned dataset.
df2['user_id'].nunique()

290584


# Number of rows whose user_id already appeared in an earlier row.
df2['user_id'].duplicated().sum()

1


# Show every row that shares its user_id with another row.
repeated_ids = df2['user_id'].duplicated(keep=False)
df2[repeated_ids]


# Show only the repeated occurrence. keep='first' is the default of
# `duplicated()`: it flags every occurrence except the first one
# (keep='last' would flag every occurrence except the last one).
duplicate = df2[df2['user_id'].duplicated(keep='first')]
duplicate


# Shape before dropping the duplicate row
df2.shape

(290585, 5)


# Drop repeated user_ids, keeping each user's first row. This removes the
# duplicate found above without hard-coding its index label, and the cell can
# be re-run safely (dropping a fixed label twice would raise a KeyError).
df2 = df2.drop_duplicates(subset='user_id', keep='first')


# Check that the drop was successful: one row fewer than before
df2.shape

(290584, 5)


# Probability of converting regardless of the page received. Computed on the
# cleaned dataset `df2` (misaligned rows and the duplicate user removed), the
# same data the per-group probabilities below are computed from.
converting_prob = df2.converted.mean()
converting_prob

0.11965919355605512


# Conversion rate within the control group.
control_rows = df2[df2['group'] == 'control']
control_con_prob = control_rows['converted'].mean()
control_con_prob

0.1203863045004612


# The same rate from explicit user counts. Spells out what the mean above
# computes, and also works if `converted` were boolean or non-numeric.
control_users = df2.query('group == "control"').user_id.nunique()
control_converted_users = df2.query('group == "control" & converted == 1 ').user_id.nunique()
control_con_prob2 = control_converted_users / control_users
control_con_prob2

0.1203863045004612


# Conversion rate within the treatment group.
treatment_rows = df2[df2['group'] == 'treatment']
treat_con_prob = treatment_rows['converted'].mean()
treat_con_prob

0.11880806551510564


# Share of rows in the cleaned data that received the new page.
new_page_rows = df2.query('landing_page == "new_page"')
new_page_prob = new_page_rows['user_id'].count() / df2.shape[0]
new_page_prob

0.5000619442226688


# Conversion rate used for the new page in the simulations below: the overall
# rate, irrespective of page.
# NOTE(review): computed on the raw `df`, not the cleaned `df2` - confirm intended.
p_new = df['converted'].mean()
p_new

0.11965919355605512


# Conversion rate used for the old page in the simulations below: the overall
# rate, irrespective of page.
# NOTE(review): computed on the raw `df`, not the cleaned `df2` - confirm intended.
p_old = df['converted'].mean()
p_old

0.11965919355605512


# Number of rows in the cleaned data that received the new page.
n_new = len(df2[df2['landing_page'] == 'new_page'])
n_new

145310


# Number of rows in the cleaned data that received the old page.
n_old = len(df2[df2['landing_page'] == 'old_page'])
n_old

145274


# One simulated 0/1 conversion outcome per new-page row, each with probability p_new.
new_page_converted = np.random.binomial(n=1, p=p_new, size=n_new)
new_page_converted

array([0, 0, 0, ..., 0, 0, 1])


# One simulated 0/1 conversion outcome per old-page row, each with probability p_old.
old_page_converted = np.random.binomial(n=1, p=p_old, size=n_old)
old_page_converted

array([0, 0, 0, ..., 0, 0, 0])


# Simulated difference in conversion rate (new page minus old page) under the
# null hypothesis, from the single simulation above.
simulated_new_rate = new_page_converted.mean()
simulated_old_rate = old_page_converted.mean()
p_diffs1 = simulated_new_rate - simulated_old_rate
p_diffs1

-0.0013168229053714814


# Observed difference in conversion rate in the actual data (treatment minus control).
observed_treatment_rate = df2[df2['group'] == 'treatment']['converted'].mean()
observed_control_rate = df2[df2['group'] == 'control']['converted'].mean()
pdiff_actual = observed_treatment_rate - observed_control_rate
pdiff_actual

-0.0015782389853555567


# Simulate 10,000 differences in conversion rate under the null hypothesis.
# One vectorized binomial draw per page replaces a slow Python loop: each
# draw is the number of conversions among n rows, and dividing by n turns it
# into a conversion rate.
# Note: this rebinds `new_page_converted` / `old_page_converted` to arrays of
# 10,000 simulated rates.
new_page_converted = np.random.binomial(n_new, p_new, 10000) / n_new
old_page_converted = np.random.binomial(n_old, p_old, 10000) / n_old
p_diffs = new_page_converted - old_page_converted
p_diffs

array([ 0.00183579,  0.0021796 ,  0.00181484, ...,  0.00096808,
        0.0001561 , -0.00122045])


# Mean of the simulated differences under the null hypothesis.
p_diffs_mean = np.mean(p_diffs)
p_diffs_mean

2.370172572845339e-05


# Distribution of the simulated new-page conversion rates, i.e. of
# new_page_converted = np.random.binomial(n_new, p_new, 10000) / n_new
sns.histplot(new_page_converted, kde=True);


# Same simulation with only 50 draws, for comparison.
new_page_converted1 = np.random.binomial(n=n_new, p=p_new, size=50) / n_new
sns.histplot(new_page_converted1, kde=True);


# Bounds of the central 95% interval of the simulated differences:
# 2.5% of the values lie below `low` and 2.5% lie above `upper`.
low, upper = np.percentile(p_diffs, [2.5, 97.5])


# Histogram of the simulated differences with reference lines for the
# observed difference, the simulated mean and the percentile bounds.
plt.hist(p_diffs);  # distribution of the simulated differences under the null hypothesis
plt.axvline(pdiff_actual , color='blue', linewidth=2, linestyle='dashed', label='actual mean');  # observed difference
plt.axvline(p_diffs_mean, color='darkgray', linewidth=2, linestyle='dashed', label='null mean');  # mean of the simulated differences
plt.axvline(low,  color='red', linewidth=2, label='lower boundry');  # lower percentile bound (`low`)
plt.axvline(upper,  color='red', linewidth=2, label='upper boundry');  # upper percentile bound (`upper`)
plt.title('Distribution of differences');
plt.xlabel('differences');
plt.ylabel('number of occurrence');  # trailing ';' suppresses the Text(...) echo in the notebook
plt.legend();  # one call is enough; the legend was previously built twice


# Build a null distribution: normal, centred on 0, with the spread of the
# simulated differences and the same number of draws.
p_diffs = np.array(p_diffs)
null_spread = p_diffs.std()
null_value = np.random.normal(loc=0, scale=null_spread, size=p_diffs.size)

# p-value: share of null draws that exceed the observed difference
exceeds_observed = null_value > pdiff_actual
p_value = exceeds_observed.mean()
p_value

0.9061


# Number of conversions per group and number of rows per page.
is_converted = df2['converted'] == 1
convert_old = df2[(df2['group'] == 'control') & is_converted]['user_id'].count()
convert_new = df2[(df2['group'] == 'treatment') & is_converted]['user_id'].count()
n_old = len(df2[df2['landing_page'] == 'old_page'])
n_new = len(df2[df2['landing_page'] == 'new_page'])


import statsmodels.api as sm

# Two-sample z-test for proportions with alternative='larger'; the new page is
# listed first in both the counts and the sample sizes.
conversion_counts = [convert_new, convert_old]
sample_sizes = [n_new, n_old]
z_test, p_value = sm.stats.proportions_ztest(conversion_counts, sample_sizes, alternative='larger')
z_test, p_value

(-1.3109241984234394, 0.9050583127590245)


# Critical value of the standard normal distribution
from scipy.stats import norm

# Cumulative probability at which to evaluate the quantile (upper bound of 95%)
p = 0.95
# Inverse CDF (percent-point function) of the standard normal at p
standard_normal = norm()
cval = standard_normal.ppf(p)
cval

1.6448536269514722


# Show the reference illustration of the critical value, 500 px wide.
from IPython import display
critical_value_figure = display.Image("Resources/criticalvalue.png", width=500)
critical_value_figure


# Plot the simulated differences with bands at 1, 2 and 3 standard deviations
# around 0, the `upper` bound computed earlier, and two shaded regions.

# Standard deviation of the simulated differences, and the points 1, 2 and 3
# standard deviations below and above 0
standart_deviation = np.std(p_diffs)
std1_low = 0 - standart_deviation*1
std1_high = 0 + standart_deviation*1
std2_low = 0 - standart_deviation*2
std2_high = 0 + standart_deviation*2
std3_low = 0 - standart_deviation*3
std3_high = 0 + standart_deviation*3

# Visualization: histogram with a KDE curve, then one pair of vertical lines
# per standard-deviation multiple (only the lower line of each pair is
# labelled, so each pair appears once in the legend)
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(p_diffs, bins=50, color="skyblue", kde=True);
plt.axvline(x=std1_low, color='blue', label='1 std');
plt.axvline(x=std1_high, color='blue');
plt.axvline(x=std2_low, color='green', label='2 std');
plt.axvline(x=std2_high, color='green');
plt.axvline(x=std3_low, color='orange', label='3 std');
plt.axvline(x=std3_high, color='orange');
plt.axvline(upper,  color='black', linewidth=2, linestyle='dashed',label='upper boundry 95%'); # `upper` is the upper percentile bound computed earlier
plt.title('Distribution of simulation');
plt.xlabel('differences');
plt.ylabel('count')

# Shade the area under the curve to the right of `upper` (the critical region).
# ax.lines[0] is the first line added to the axes - presumably the KDE curve
# drawn by histplot, since it is plotted before any axvline.
kde_x, kde_y = ax.lines[0].get_data()
ax.fill_between(kde_x, kde_y, where=(kde_x>upper), 
                interpolate=True, alpha=1, color='red', label='critical value')

# Shade the band between 1 and 2 standard deviations below 0, labelled as the
# region where the z-score lies
ax.axvspan(std1_low, std2_low, alpha=0.5, color='gray', label='z-score')

# This would shade the whole area from `upper` to the right edge of the chart -
# not used in this chart, but kept for reference.
#ax.axvspan(upper, xlim[1], alpha=0.3, color='red', label='critical value')
# Get x-axis limit to shade the area
#xlim = ax.get_xlim()
#ax.margins(x=0)

plt.legend();


import statsmodels.api as sm

# Constant column so the logistic regression can fit an intercept.
df2['intercept'] = 1
# 1 if the row received the new page, 0 otherwise. Built from the cleaned
# `df2` itself rather than the raw `df` (no reliance on index alignment), and
# cast to int so statsmodels always receives a numeric column - newer pandas
# versions return boolean dummies from `pd.get_dummies`.
df2['ab_page'] = (df2['landing_page'] == 'new_page').astype(int)


# Preview the data with the new regression columns.
df2.head(5)


# Logistic regression of conversion on the page indicator (plus intercept).
page_predictors = df2[['intercept', 'ab_page']]
log_mod = sm.Logit(df2['converted'], page_predictors)
results = log_mod.fit()

Optimization terminated successfully.
         Current function value: 0.366118
         Iterations 6


# Full regression table for the page-only model.
results.summary()


# Exponentiate the coefficients: each resulting value is the multiplicative
# change in the odds of converting.
page_odds_ratios = np.exp(results.params)
page_odds_ratios

intercept    0.136863
ab_page      0.985123
dtype: float64


# Reciprocal of the odds ratios - easier to read for values below 1.
# Computed explicitly from the model instead of IPython's `_` (last output),
# so the result does not depend on which cell happened to run last.
1 / np.exp(results.params)

intercept    7.306593
ab_page      1.015102
dtype: float64


# Attach each user's country: inner join of the country table and the cleaned
# A/B data on user_id.
countries_df = pd.read_csv('Data/countries.csv')
countries_by_user = countries_df.set_index('user_id')
ab_by_user = df2.set_index('user_id')
df_new = countries_by_user.join(ab_by_user, how='inner')
df_new.tail()


# Country dummies - check which values are present and how often
df_new['country'].value_counts()

US    203619
UK     72466
CA     14499
Name: country, dtype: int64


### Create the necessary dummy variables
# One 0/1 indicator column per country. Columns are selected by name, so the
# assignment does not depend on the order `pd.get_dummies` returns them in,
# and dtype=int keeps them numeric for statsmodels (newer pandas versions
# default to boolean dummies).
country_dummies = pd.get_dummies(df_new['country'], dtype=int)
df_new[['CA', 'UK', 'US']] = country_dummies[['CA', 'UK', 'US']]
df_new.tail()


# Drop one dummy column (US) so the design matrix is full rank.
df_new = df_new.drop(columns='US')


### Fit the logistic regression on country and obtain the results
df_new['intercept'] = 1
country_predictors = df_new[['intercept', 'CA', 'UK']]
log_mod = sm.Logit(df_new['converted'], country_predictors)
results = log_mod.fit()
results.summary()

Optimization terminated successfully.
         Current function value: 0.366116
         Iterations 6


# Exponentiate the coefficients: each resulting value is the multiplicative
# change in the odds of converting.
country_odds_ratios = np.exp(results.params)
country_odds_ratios

intercept    0.135779
CA           0.960018
UK           1.009966
dtype: float64


# Reciprocal of the odds ratios - easier to read for values below 1.
# Computed explicitly from the model instead of IPython's `_` (last output),
# so the result does not depend on which cell happened to run last.
1 / np.exp(results.params)

intercept    7.364925
CA           1.041647
UK           0.990133
dtype: float64


### Fit the logistic regression on page and country and obtain the results
df_new['intercept'] = 1
page_country_predictors = df_new[['intercept', 'ab_page', 'CA', 'UK']]
log_mod2 = sm.Logit(df_new['converted'], page_country_predictors)
results2 = log_mod2.fit()
results2.summary()

Optimization terminated successfully.
         Current function value: 0.366113
         Iterations 6


# Exponentiate the coefficients: each resulting value is the multiplicative
# change in the odds of converting.
page_country_odds_ratios = np.exp(results2.params)
page_country_odds_ratios

intercept    0.136795
ab_page      0.985168
CA           0.960062
UK           1.009932
dtype: float64


# Reciprocal of the odds ratios - easier to read for values below 1.
# Computed explicitly from the model instead of IPython's `_` (last output),
# so the result does not depend on which cell happened to run last.
1 / np.exp(results2.params)

intercept    7.310207
ab_page      1.015056
CA           1.041599
UK           0.990165
dtype: float64


# Interaction terms between country and page: each is 1 only for rows from
# that country that also received the new page.
df_new['CA_abpage'] = df_new['CA'] * df_new['ab_page']
df_new['UK_abpage'] = df_new['UK'] * df_new['ab_page']


### Fit the logistic regression with interaction terms and obtain the results
df_new['intercept'] = 1
interaction_columns = ['intercept', 'CA', 'UK','ab_page', 'CA_abpage', 'UK_abpage']
log_mod_int = sm.Logit(df_new['converted'], df_new[interaction_columns])
results_int = log_mod_int.fit()
results_int.summary()

Optimization terminated successfully.
         Current function value: 0.366109
         Iterations 6

Dep. Variable:	converted	No. Observations:	290584
Model:	Logit	Df Residuals:	290582
Method:	MLE	Df Model:	1
Date:	Tue, 04 May 2021	Pseudo R-squ.:	8.077e-06
Time:	16:54:33	Log-Likelihood:	-1.0639e+05
converged:	True	LL-Null:	-1.0639e+05
Covariance Type:	nonrobust	LLR p-value:	0.1899

	coef	std err	z	P>\|z\|	[0.025	0.975]
intercept	-1.9888	0.008	-246.669	0.000	-2.005	-1.973
ab_page	-0.0150	0.011	-1.311	0.190	-0.037	0.007

Dep. Variable:	converted	No. Observations:	290584
Model:	Logit	Df Residuals:	290581
Method:	MLE	Df Model:	2
Date:	Tue, 04 May 2021	Pseudo R-squ.:	1.521e-05
Time:	16:54:35	Log-Likelihood:	-1.0639e+05
converged:	True	LL-Null:	-1.0639e+05
Covariance Type:	nonrobust	LLR p-value:	0.1984

	coef	std err	z	P>\|z\|	[0.025	0.975]
intercept	-1.9967	0.007	-292.314	0.000	-2.010	-1.983
CA	-0.0408	0.027	-1.518	0.129	-0.093	0.012
UK	0.0099	0.013	0.746	0.456	-0.016	0.036

Dep. Variable:	converted	No. Observations:	290584
Model:	Logit	Df Residuals:	290580
Method:	MLE	Df Model:	3
Date:	Tue, 04 May 2021	Pseudo R-squ.:	2.323e-05
Time:	16:54:37	Log-Likelihood:	-1.0639e+05
converged:	True	LL-Null:	-1.0639e+05
Covariance Type:	nonrobust	LLR p-value:	0.1760

Analyze A/B Test Results

Table of Contents

Introduction

Part I - Probability

Part II - A/B Test

Part III - A regression approach

Conclusions

Gather Submission Materials

Submit the Project

	user_id	timestamp	group	landing_page	converted
0	851104	2017-01-21 22:11:48.556739	control	old_page	0
1	804228	2017-01-12 08:01:45.159739	control	old_page	0
2	661590	2017-01-11 16:55:06.154213	treatment	new_page	0
3	853541	2017-01-08 18:28:03.143765	treatment	new_page	0
4	864975	2017-01-21 01:52:26.210827	control	old_page	1

	user_id	timestamp	group	landing_page	converted
1899	773192	2017-01-09 05:37:58.781806	treatment	new_page	0
2893	773192	2017-01-14 02:55:59.590927	treatment	new_page	0

	country	timestamp	group	landing_page	converted	intercept	ab_page
user_id
653118	US	2017-01-09 03:12:31.034796	control	old_page	0	1	0
878226	UK	2017-01-05 15:02:50.334962	control	old_page	0	1	0
799368	UK	2017-01-09 18:07:34.253935	control	old_page	0	1	0
655535	CA	2017-01-09 13:30:47.524512	treatment	new_page	0	1	1
934996	UK	2017-01-09 00:30:08.377677	control	old_page	0	1	0

	coef	std err	z	P>\|z\|	[0.025	0.975]
intercept	-1.9865	0.010	-206.344	0.000	-2.005	-1.968
CA	-0.0175	0.038	-0.465	0.642	-0.091	0.056
UK	-0.0057	0.019	-0.306	0.760	-0.043	0.031
ab_page	-0.0206	0.014	-1.505	0.132	-0.047	0.006
CA_abpage	-0.0469	0.054	-0.872	0.383	-0.152	0.059
UK_abpage	0.0314	0.027	1.181	0.238	-0.021	0.084