#Analyzing and Manipulating Data
import numpy as np #importing numpy libary for manipulating arrays
import pandas as pd #importing pandas library for manipulating datasets

#Data Visualization
import seaborn as sns #importing seaborn library for data visualization
import matplotlib.pyplot as plt #importing matplotlib.pyplot for data visualization

#Statistics/Probability
import scipy.stats as stats #importing scipy.stats library for statistical/probability functions


#loading abtest.csv (read from the current working directory) into the users dataframe
users = pd.read_csv(filepath_or_buffer='abtest.csv')


#previewing the top of the users dataframe (first 5 rows)
users.head(n=5)


#previewing the bottom of the users dataframe (last 5 rows)
users.tail(n=5)


#dimensions of the users dataframe as a (rows, columns) tuple
users.shape

(100, 6)


#viewing the statistical summary of the numerical variables in the users dataframe
#NOTE: describe() summarizes every numeric column, so user_id (int64) appears next to time_spent_on_the_page
#user_id is only an identifier, so time_spent_on_the_page is the only column whose statistics are meaningful
users.describe()


#viewing further information about the columns: non-null counts, datatypes and memory usage
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   user_id                 100 non-null    int64  
 1   group                   100 non-null    object 
 2   landing_page            100 non-null    object 
 3   time_spent_on_the_page  100 non-null    float64
 4   converted               100 non-null    object 
 5   language_preferred      100 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 4.8+ KB


#counting the missing values in every column of the users dataframe
#isna() flags each missing cell and sum() totals the flags column by column
users.isna().sum()

user_id                   0
group                     0
landing_page              0
time_spent_on_the_page    0
converted                 0
language_preferred        0
dtype: int64


#counting fully duplicated rows in the users dataframe
#a row is flagged only when all of its columns repeat an earlier row (the first occurrence is not flagged)
users.duplicated(subset=None, keep='first').sum()

0


#checking that user_id values are unique: a unique id shows up as a bar of height 1
#bars are drawn in light steel blue
ax = sns.countplot(x='user_id', data=users, color='lightsteelblue')
#x tick labels are rotated 90 degrees and shrunk to size 3 to reduce clutter
plt.setp(ax.get_xticklabels(), rotation=90, size=3)
#title and axis labels are set on the axes returned by seaborn
ax.set(title='Basic Distribution of User ID', xlabel='User ID', ylabel='Number of Users')
plt.show()


#checking that each group contains the same number of users
#one bar per group, coloured from the pastel palette
ax = sns.countplot(x='group', data=users, palette='pastel')
#title and axis labels are set on the axes returned by seaborn
ax.set(title='Number of Users in Each Group', xlabel='Group', ylabel='Number of Users')
plt.show()


#checking that each landing page was viewed by the same number of users
#one bar per landing page, coloured from the pastel palette
ax = sns.countplot(x='landing_page', data=users, palette='pastel')
#title and axis labels are set on the axes returned by seaborn
ax.set(title='Number of Users Who Viewed Each Landing Page', xlabel='Landing Page', ylabel='Number of Users')
plt.show()


#horizontal boxplot of the time users spent on the landing page
#the plot has a single box and no hue variable, so the colour is passed explicitly:
#seaborn 0.13+ deprecates palette without hue, and the first pastel colour is what the palette gave this box
ax = sns.boxplot(data=users, x='time_spent_on_the_page', color=sns.color_palette('pastel')[0])
#title and x-axis label are set on the axes returned by seaborn
ax.set(title='Time Spent on Landing Page', xlabel='Time Spent')
plt.show() #displaying boxplot


#bar graph of how many users converted and how many did not
#one bar per conversion outcome, coloured from the pastel palette
ax = sns.countplot(x='converted', data=users, palette='pastel')
#title and axis labels are set on the axes returned by seaborn
ax.set(title='Users based on Conversion', xlabel='Converted', ylabel='Number of Users')
plt.show()


#bar graph of the language preference of users
#one bar per preferred language, coloured from the pastel palette
ax = sns.countplot(x='language_preferred', data=users, palette='pastel')
#title and axis labels are set on the axes returned by seaborn
ax.set(title='Language Preference of Users', xlabel='Language Preferred', ylabel='Number of Users')
plt.show()


#group (categorical) vs. landing_page (categorical)
#grouped bar graph: group on the x-axis, bar colour (pastel palette) encodes landing_page
ax = sns.countplot(x='group', hue='landing_page', data=users, palette='pastel')
#title and axis labels are set on the axes returned by seaborn
ax.set(title='Group and Landing Page', xlabel='Group', ylabel='Number of Users')
#giving the legend a readable title
ax.legend(title='Landing Page')
plt.show()


#time_spent_on_the_page (numerical) vs. group (categorical)
#one thistle-coloured box per group: group on the x-axis, time_spent_on_the_page on the y-axis
ax = sns.boxplot(x='group', y='time_spent_on_the_page', data=users, color='thistle')
#title and axis labels are set on the axes returned by seaborn
ax.set(title='Time Spent on Landing Page vs. Group', xlabel='Group', ylabel='Time Spent on Landing Page')
plt.show()


#group (categorical) vs. converted (categorical)
#grouped bar graph: group on the x-axis, bar colour (pastel palette) encodes converted
ax = sns.countplot(x='group', hue='converted', data=users, palette='pastel')
#title and axis labels are set on the axes returned by seaborn
ax.set(title='Group and Conversion', xlabel='Group', ylabel='Number of Users')
#giving the legend a readable title
ax.legend(title='Conversion')
plt.show()


#group (categorical) vs. language_preferred (categorical)
#grouped bar graph: group on the x-axis, bar colour (pastel palette) encodes language_preferred
ax = sns.countplot(x='group', hue='language_preferred', data=users, palette='pastel')
#title and axis labels are set on the axes returned by seaborn
ax.set(title='Group and Language Preference', xlabel='Group', ylabel='Number of Users')
#giving the legend a readable title
ax.legend(title='Language Preference')
plt.show()


#time_spent_on_the_page (numerical) vs. landing_page (categorical)
#one thistle-coloured box per landing page: landing_page on the x-axis, time_spent_on_the_page on the y-axis
ax = sns.boxplot(x='landing_page', y='time_spent_on_the_page', data=users, color='thistle')
#title and axis labels are set on the axes returned by seaborn
ax.set(title='Time Spent on Landing Page vs. Landing Page', xlabel='Landing Page', ylabel='Time Spent on Landing Page')
plt.show()


#landing_page (categorical) vs. converted (categorical)
#grouped bar graph: landing_page on the x-axis, bar colour (pastel palette) encodes converted
ax = sns.countplot(x='landing_page', hue='converted', data=users, palette='pastel')
#title and axis labels are set on the axes returned by seaborn
ax.set(title='Landing Page and Conversion', xlabel='Landing Page', ylabel='Number of Users')
#giving the legend a readable title
ax.legend(title='Conversion')
plt.show()


#landing_page (categorical) vs. language_preferred (categorical)
#grouped bar graph: landing_page on the x-axis, bar colour (pastel palette) encodes language_preferred
ax = sns.countplot(x='landing_page', hue='language_preferred', data=users, palette='pastel')
#title and axis labels are set on the axes returned by seaborn
ax.set(title='Landing Page and language Preference', xlabel='Landing Page', ylabel='Number of Users')
#giving the legend a readable title
ax.legend(title='Language Preference')
plt.show()


#time_spent_on_the_page (numerical) vs. converted (categorical)
#one thistle-coloured box per conversion outcome: converted on the x-axis, time_spent_on_the_page on the y-axis
ax = sns.boxplot(x='converted', y='time_spent_on_the_page', data=users, color='thistle')
#title and axis labels are set on the axes returned by seaborn
ax.set(title='Time Spent on Landing Page vs. Conversion', xlabel='Conversion', ylabel='Time Spent on Landing Page')
plt.show()


#time_spent_on_the_page (numerical) vs. language_preferred (categorical)
#one thistle-coloured box per language: language_preferred on the x-axis, time_spent_on_the_page on the y-axis
ax = sns.boxplot(x='language_preferred', y='time_spent_on_the_page', data=users, color='thistle')
#title and axis labels are set on the axes returned by seaborn
ax.set(title='Time Spent on Landing Page vs. Language Preference', xlabel='Language Preference', ylabel='Time Spent on Landing Page')
plt.show()


#converted (categorical) vs. language_preferred (categorical)
#grouped bar graph: converted on the x-axis, bar colour (pastel palette) encodes language_preferred
ax = sns.countplot(x='converted', hue='language_preferred', data=users, palette='pastel')
#title and axis labels are set on the axes returned by seaborn
ax.set(title='Conversion and Language Preference', xlabel='Conversion', ylabel='Number of Users')
#giving the legend a readable title
ax.legend(title='Language Preference')
plt.show()


#visual comparison of time_spent_on_the_page (numerical) across landing_page (categorical)
#one thistle-coloured box per landing page: landing_page on the x-axis, time_spent_on_the_page on the y-axis
ax = sns.boxplot(x='landing_page', y='time_spent_on_the_page', data=users, color='thistle')
#title and axis labels are set on the axes returned by seaborn
ax.set(title='Time Spent on Landing Page vs. Landing Page', xlabel='Landing Page', ylabel='Time Spent on Landing Page')
plt.show()


#comparing the spread of time spent on each landing page
#pandas .std() returns the sample standard deviation (ddof=1 by default)

#rows of the users dataframe for users who viewed the new landing page
new_page = users[users['landing_page'] == 'new']
#standard deviation of the time_spent_on_the_page column of the new_page dataframe
new_std_deviation = new_page['time_spent_on_the_page'].std()

#rows of the users dataframe for users who viewed the old landing page
old_page = users[users['landing_page'] == 'old']
#standard deviation of the time_spent_on_the_page column of the old_page dataframe
old_std_deviation = old_page['time_spent_on_the_page'].std()

#printing both standard deviations (label typo 'fot' corrected to 'for')
print('Standard Deviation for New Page:', new_std_deviation)
print('Standard Deviation for Old Page:', old_std_deviation)

Standard Deviation for New Page: 1.8170310387878263
Standard Deviation fot Old Page: 2.581974849306046


#isolating the time_spent_on_the_page column of each page as a pandas Series
#these two Series are the samples compared by the t-test
time_spent_on_new_page = new_page.loc[:, 'time_spent_on_the_page']
time_spent_on_old_page = old_page.loc[:, 'time_spent_on_the_page']


#importing the independent two-sample t-test function
from scipy.stats import ttest_ind

#running the t-test on the two time-spent samples
#equal_var=False: the two sample standard deviations differ, so equal variances are not assumed
#alternative='greater': the alternative hypothesis is that time spent on the new page (a) exceeds time spent on the old page (b)
test_stat, p_value = ttest_ind(
    a=time_spent_on_new_page,
    b=time_spent_on_old_page,
    equal_var=False,
    alternative='greater',
)

#reporting the p-value of the t-test
print('p-value:', p_value)

p-value: 0.0001392381225166549


#significance level chosen for the test
alpha = 0.05
#printing the significance level next to the p-value of the preceding t-test so the two can be compared
print('α =', alpha)
print('p-value =', p_value)

α = 0.05
p-value = 0.0001392381225166549


#visual analysis of conversion by landing page
#grouped bar graph: landing_page on the x-axis, bar colour (pastel palette) encodes converted
ax = sns.countplot(x='landing_page', hue='converted', data=users, palette='pastel')
#title and axis labels are set on the axes returned by seaborn
ax.set(title='Landing Page and Conversion', xlabel='Landing Page', ylabel='Number of Users')
#giving the legend a readable title
ax.legend(title='Conversion')
plt.show()


#splitting the users of each landing page by conversion outcome

#new landing page: all viewers, then those who converted and those who did not
new = users.loc[users['landing_page'].eq('new')]
new_converted = new.loc[new['converted'].eq('yes')]
new_not_converted = new.loc[new['converted'].eq('no')]

#old landing page: all viewers, then those who converted and those who did not
old = users.loc[users['landing_page'].eq('old')]
old_converted = old.loc[old['converted'].eq('yes')]
old_not_converted = old.loc[old['converted'].eq('no')]

#reporting the size of each subset (count() tallies the non-null entries of the converted column)
print('Users that converted in the new page:', new_converted['converted'].count())
print('Users that did not convert in the new page:', new_not_converted['converted'].count())
print('Users that converted in the old page:', old_converted['converted'].count())
print('Users that did not convert in the old page:', old_not_converted['converted'].count())

#It is already known that 50 users viewed the new page and 50 users viewed the old page

Users that converted in the new page: 33
Users that did not convert in the new page: 17
Users that converted in the old page: 21
Users that did not convert in the old page: 29


#importing the proportions z-test function
from statsmodels.stats.proportion import proportions_ztest

#boolean masks over the users dataframe: which page each user viewed and whether the user converted
viewed_new_page = users['landing_page'] == 'new'
viewed_old_page = users['landing_page'] == 'old'
has_converted = users['converted'] == 'yes'

#number of converted users on the new page and on the old page, in that order
#the counts are derived from the data instead of being typed in by hand, so they cannot drift from abtest.csv
users_converted = np.array([(viewed_new_page & has_converted).sum(),
                            (viewed_old_page & has_converted).sum()])

#total number of users who viewed the new page and the old page, in the same order as users_converted
total_observations = np.array([viewed_new_page.sum(), viewed_old_page.sum()])

#running the two-proportion z-test
#alternative='larger': the alternative hypothesis is that the first proportion (new page) is larger than the second (old page)
test_stat, p_value = proportions_ztest(users_converted, total_observations, alternative='larger')

print('p-value:', p_value) #printing the p-value

p-value: 0.008026308204056278


#significance level chosen for the test
alpha = 0.05
#printing the significance level next to the p-value of the preceding proportions z-test so the two can be compared
print('α =', alpha)
print('p-value =', p_value)

α = 0.05
p-value = 0.008026308204056278


#visual analysis of conversion by language preference
#grouped bar graph: language_preferred on the x-axis, bar colour (pastel palette) encodes converted
ax = sns.countplot(x='language_preferred', hue='converted', data=users, palette='pastel')
#title and axis labels are set on the axes returned by seaborn
ax.set(title='Language Preference and Conversion', xlabel='Language Preference', ylabel='Number of Users')
#giving the legend a readable title
ax.legend(title='Conversion')
plt.show()


#building the contingency table of observed counts: converted as rows, language_preferred as columns
conversion_languages = pd.crosstab(index=users['converted'], columns=users['language_preferred'])

#displaying the contingency table
print(conversion_languages)

language_preferred  English  French  Spanish
converted                                   
no                       11      19       16
yes                      21      15       18


#importing the chi-square test of independence function
from scipy.stats import chi2_contingency

#running the test on the converted x language_preferred contingency table
#the result unpacks into the test statistic, the p-value, the degrees of freedom and the expected frequencies
chi, p_value, dof, expected = chi2_contingency(observed=conversion_languages)

#reporting the p-value of the chi-square test
print('p-value:', p_value)

p-value: 0.21298887487543447


#significance level chosen for the test
alpha = 0.05
#printing the significance level next to the p-value of the preceding chi-square test so the two can be compared
print('α =', alpha)
print('p-value =', p_value)

α = 0.05
p-value = 0.21298887487543447


#keeping only the rows of the users dataframe for users who viewed the new page
new_users = users.loc[users['landing_page'].eq('new')]

#visual analysis of time_spent_on_the_page (numerical) vs. language_preferred (categorical) for these users
#one thistle-coloured box per language: language_preferred on the x-axis, time_spent_on_the_page on the y-axis
ax = sns.boxplot(x='language_preferred', y='time_spent_on_the_page', data=new_users, color='thistle')
#title and axis labels are set on the axes returned by seaborn
ax.set(title='Time Spent on Landing Page vs. Language Preference', xlabel='Language Preference', ylabel='Time Spent on Landing Page')
plt.show()


#Shapiro-Wilk's test: checking whether time spent on the new page follows a normal distribution
#the stats module provides the shapiro() function
from scipy import stats

#the test is run on the whole time_spent_on_the_page column of new_users (all languages pooled)
#shapiro() returns the W statistic and the p-value
w, p_value = stats.shapiro(x=new_users['time_spent_on_the_page'])

#reporting the p-value of Shapiro-Wilk's test
print("Shapiro-Wilk's Test p-value:", p_value)

Shapiro-Wilk's Test p-value: 0.8040016293525696


#Levene's test: checking whether time spent on the new page has equal variance across the language groups
#importing the levene() function
from scipy.stats import levene

#one time_spent_on_the_page sample per preferred language, in the order Spanish, English, French
times_by_language = [
    new_users.loc[new_users['language_preferred'] == language, 'time_spent_on_the_page']
    for language in ('Spanish', 'English', 'French')
]

#levene() takes each sample as a separate positional argument and returns the statistic and the p-value
statistic, p_value = levene(*times_by_language)

#reporting the p-value of Levene's test
print("Levene's Test p-value:", p_value)

Levene's Test p-value: 0.46711357711340173


#one-way ANOVA F-test: comparing mean time spent on the new page across the language groups
#importing the f_oneway() function
from scipy.stats import f_oneway

#grouping time_spent_on_the_page of the new_users dataframe by preferred language
time_by_language = new_users.groupby('language_preferred')['time_spent_on_the_page']

#passing one sample per language (Spanish, English, French) to f_oneway(), which returns the F statistic and the p-value
test_stat, p_value = f_oneway(time_by_language.get_group('Spanish'),
                              time_by_language.get_group('English'),
                              time_by_language.get_group('French'))

#reporting the p-value of the ANOVA F-test
print('p-value:', p_value)

p-value: 0.43204138694325955


#significance level chosen for the test
alpha = 0.05
#printing the significance level next to the p-value of the preceding ANOVA F-test so the two can be compared
print('α =', alpha)
print('p-value =', p_value)

α = 0.05
p-value = 0.43204138694325955

	user_id	time_spent_on_the_page
count	100.000000	100.000000
mean	546517.000000	5.377800
std	52.295779	2.378166
min	546443.000000	0.190000
25%	546467.750000	3.880000
50%	546492.500000	5.415000
75%	546567.250000	7.022500
max	546592.000000	10.710000

	user_id	group	landing_page	time_spent_on_the_page	converted	language_preferred
0	546592	control	old	3.48	no	Spanish
1	546468	treatment	new	7.13	yes	English
2	546462	treatment	new	4.40	no	Spanish
3	546567	control	old	3.02	no	French
4	546459	treatment	new	4.75	yes	Spanish

	user_id	group	landing_page	time_spent_on_the_page	converted	language_preferred
95	546446	treatment	new	5.15	no	Spanish
96	546544	control	old	6.52	yes	English
97	546472	treatment	new	7.07	yes	Spanish
98	546481	treatment	new	6.20	yes	Spanish
99	546483	treatment	new	5.86	yes	English

Project Business Statistics: E-news Express¶

Define Problem Statement and Objectives¶

Problem Statement¶

Objectives¶

Data Dictionary¶

Import all the necessary libraries¶

Reading the Data into a DataFrame¶

Explore the dataset and extract insights using Exploratory Data Analysis¶

Observations:¶

Observations:¶

Observations:¶

Observations:¶

Observations:¶

Observations:¶

Univariate Analysis¶

Observations:¶

Observations:¶

Observations:¶

Observations:¶

Observations:¶

Observations:¶

Bivariate Analysis¶

Observations:¶

Observations:¶

Observations:¶

Observations:¶

Observations:¶

Observations:¶

Observations:¶

Observations:¶

Observations:¶

Observations:¶

1. Do the users spend more time on the new landing page than the existing landing page?¶

Visual Analysis¶

Observations:¶

Null and Alternate Hypotheses¶

Appropriate Test¶

Significance Level¶

Collecting and Preparing Data¶

Calculating the p-value¶

Comparing the p-value with $\alpha$¶

Drawing Inferences¶

2. Is the conversion rate (the proportion of users who visit the landing page and get converted) for the new page greater than the conversion rate for the old page?¶

Visual Analysis¶

Observations:¶

Null and Alternate Hypotheses¶

Appropriate Test¶

Significance Level¶

Calculating the p-value¶

Comparing the p-value with $\alpha$¶

Drawing Inferences¶

3. Are conversion and preferred language independent or related?¶

Visual Analysis¶

Observations:¶

Null and Alternate Hypotheses¶

Appropriate Test¶

Significance Level¶

Collecting and Preparing Data¶

Calculating the p-value¶

Comparing the p-value with $\alpha$¶

Drawing Inferences¶

4. Is the time spent on the new page the same for users with different preferred languages?¶

Visual Analysis¶

Observations:¶

Null and Alternate Hypotheses¶

Appropriate Test¶

Using Shapiro-Wilk's Test to Examine Normal Distribution¶

Using Levene's Test to Examine Equal Variance¶

Significance Level¶

Calculating the p-value¶

Comparing the p-value with $\alpha$¶

Drawing Inferences¶

Conclusion and Business Recommendations¶

Conclusion:¶

Business Recommendations:¶