import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
data = pd.read_csv('marketing_campaign.csv', sep = ';')
print("Head of the data:")
data.head()
Head of the data:
ID | Year_Birth | Education | Marital_Status | Income | Kidhome | Teenhome | Dt_Customer | Recency | MntWines | ... | NumWebVisitsMonth | AcceptedCmp3 | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Complain | Z_CostContact | Z_Revenue | Response | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 5524 | 1957 | Graduation | Single | 58138.0 | 0 | 0 | 2012-09-04 | 58 | 635 | ... | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 1 |
1 | 2174 | 1954 | Graduation | Single | 46344.0 | 1 | 1 | 2014-03-08 | 38 | 11 | ... | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
2 | 4141 | 1965 | Graduation | Together | 71613.0 | 0 | 0 | 2013-08-21 | 26 | 426 | ... | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
3 | 6182 | 1984 | Graduation | Together | 26646.0 | 1 | 0 | 2014-02-10 | 26 | 11 | ... | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
4 | 5324 | 1981 | PhD | Married | 58293.0 | 1 | 0 | 2014-01-19 | 94 | 173 | ... | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
5 rows × 29 columns
print("Tail of the data: ")
data.tail()
Tail of the data:
ID | Year_Birth | Education | Marital_Status | Income | Kidhome | Teenhome | Dt_Customer | Recency | MntWines | ... | NumWebVisitsMonth | AcceptedCmp3 | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Complain | Z_CostContact | Z_Revenue | Response | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2235 | 10870 | 1967 | Graduation | Married | 61223.0 | 0 | 1 | 2013-06-13 | 46 | 709 | ... | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
2236 | 4001 | 1946 | PhD | Together | 64014.0 | 2 | 1 | 2014-06-10 | 56 | 406 | ... | 7 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 11 | 0 |
2237 | 7270 | 1981 | Graduation | Divorced | 56981.0 | 0 | 0 | 2014-01-25 | 91 | 908 | ... | 6 | 0 | 1 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
2238 | 8235 | 1956 | Master | Together | 69245.0 | 0 | 1 | 2014-01-24 | 8 | 428 | ... | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
2239 | 9405 | 1954 | PhD | Married | 52869.0 | 1 | 1 | 2012-10-15 | 40 | 84 | ... | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 1 |
5 rows × 29 columns
print("Data types of each column: ")
data.info()
Data types of each column: <class 'pandas.core.frame.DataFrame'> RangeIndex: 2240 entries, 0 to 2239 Data columns (total 29 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ID 2240 non-null int64 1 Year_Birth 2240 non-null int64 2 Education 2240 non-null object 3 Marital_Status 2240 non-null object 4 Income 2216 non-null float64 5 Kidhome 2240 non-null int64 6 Teenhome 2240 non-null int64 7 Dt_Customer 2240 non-null object 8 Recency 2240 non-null int64 9 MntWines 2240 non-null int64 10 MntFruits 2240 non-null int64 11 MntMeatProducts 2240 non-null int64 12 MntFishProducts 2240 non-null int64 13 MntSweetProducts 2240 non-null int64 14 MntGoldProds 2240 non-null int64 15 NumDealsPurchases 2240 non-null int64 16 NumWebPurchases 2240 non-null int64 17 NumCatalogPurchases 2240 non-null int64 18 NumStorePurchases 2240 non-null int64 19 NumWebVisitsMonth 2240 non-null int64 20 AcceptedCmp3 2240 non-null int64 21 AcceptedCmp4 2240 non-null int64 22 AcceptedCmp5 2240 non-null int64 23 AcceptedCmp1 2240 non-null int64 24 AcceptedCmp2 2240 non-null int64 25 Complain 2240 non-null int64 26 Z_CostContact 2240 non-null int64 27 Z_Revenue 2240 non-null int64 28 Response 2240 non-null int64 dtypes: float64(1), int64(25), object(3) memory usage: 507.6+ KB
print("Summary statics of dataset: ")
data.describe()
Summary statics of dataset:
ID | Year_Birth | Income | Kidhome | Teenhome | Recency | MntWines | MntFruits | MntMeatProducts | MntFishProducts | ... | NumWebVisitsMonth | AcceptedCmp3 | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Complain | Z_CostContact | Z_Revenue | Response | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 2240.000000 | 2240.000000 | 2216.000000 | 2240.000000 | 2240.000000 | 2240.000000 | 2240.000000 | 2240.000000 | 2240.000000 | 2240.000000 | ... | 2240.000000 | 2240.000000 | 2240.000000 | 2240.000000 | 2240.000000 | 2240.000000 | 2240.000000 | 2240.0 | 2240.0 | 2240.000000 |
mean | 5592.159821 | 1968.805804 | 52247.251354 | 0.444196 | 0.506250 | 49.109375 | 303.935714 | 26.302232 | 166.950000 | 37.525446 | ... | 5.316518 | 0.072768 | 0.074554 | 0.072768 | 0.064286 | 0.013393 | 0.009375 | 3.0 | 11.0 | 0.149107 |
std | 3246.662198 | 11.984069 | 25173.076661 | 0.538398 | 0.544538 | 28.962453 | 336.597393 | 39.773434 | 225.715373 | 54.628979 | ... | 2.426645 | 0.259813 | 0.262728 | 0.259813 | 0.245316 | 0.114976 | 0.096391 | 0.0 | 0.0 | 0.356274 |
min | 0.000000 | 1893.000000 | 1730.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 3.0 | 11.0 | 0.000000 |
25% | 2828.250000 | 1959.000000 | 35303.000000 | 0.000000 | 0.000000 | 24.000000 | 23.750000 | 1.000000 | 16.000000 | 3.000000 | ... | 3.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 3.0 | 11.0 | 0.000000 |
50% | 5458.500000 | 1970.000000 | 51381.500000 | 0.000000 | 0.000000 | 49.000000 | 173.500000 | 8.000000 | 67.000000 | 12.000000 | ... | 6.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 3.0 | 11.0 | 0.000000 |
75% | 8427.750000 | 1977.000000 | 68522.000000 | 1.000000 | 1.000000 | 74.000000 | 504.250000 | 33.000000 | 232.000000 | 50.000000 | ... | 7.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 3.0 | 11.0 | 0.000000 |
max | 11191.000000 | 1996.000000 | 666666.000000 | 2.000000 | 2.000000 | 99.000000 | 1493.000000 | 199.000000 | 1725.000000 | 259.000000 | ... | 20.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 3.0 | 11.0 | 1.000000 |
8 rows × 26 columns
data["total_spending"] = data['MntFishProducts'] + data['MntFruits'] + data['MntGoldProds'] + data['MntMeatProducts'] + data['MntSweetProducts'] + data['MntWines']
data["total_kids"] = data['Kidhome'] + data['Teenhome']
data.isnull().sum()
ID 0 Year_Birth 0 Education 0 Marital_Status 0 Income 24 Kidhome 0 Teenhome 0 Dt_Customer 0 Recency 0 MntWines 0 MntFruits 0 MntMeatProducts 0 MntFishProducts 0 MntSweetProducts 0 MntGoldProds 0 NumDealsPurchases 0 NumWebPurchases 0 NumCatalogPurchases 0 NumStorePurchases 0 NumWebVisitsMonth 0 AcceptedCmp3 0 AcceptedCmp4 0 AcceptedCmp5 0 AcceptedCmp1 0 AcceptedCmp2 0 Complain 0 Z_CostContact 0 Z_Revenue 0 Response 0 total_spending 0 total_kids 0 dtype: int64
data = data.dropna()
data.isnull().sum()
ID 0 Year_Birth 0 Education 0 Marital_Status 0 Income 0 Kidhome 0 Teenhome 0 Dt_Customer 0 Recency 0 MntWines 0 MntFruits 0 MntMeatProducts 0 MntFishProducts 0 MntSweetProducts 0 MntGoldProds 0 NumDealsPurchases 0 NumWebPurchases 0 NumCatalogPurchases 0 NumStorePurchases 0 NumWebVisitsMonth 0 AcceptedCmp3 0 AcceptedCmp4 0 AcceptedCmp5 0 AcceptedCmp1 0 AcceptedCmp2 0 Complain 0 Z_CostContact 0 Z_Revenue 0 Response 0 total_spending 0 total_kids 0 dtype: int64
data.boxplot(figsize=(15,10))
plt.title('Boxplots for Each Column')
plt.xticks(rotation = 45)
plt.xlabel('Columns')
plt.ylabel('Values')
plt.show()
plt.figure(figsize = (8,6))
data.boxplot(column=['Income'])
plt.xlabel('Columns')
plt.ylabel('Values')
plt.show()
Q1 = data['Income'].quantile(0.25)
Q3 = data["Income"].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
data_filtered = data[(data['Income'] >= lower_bound) & (data['Income'] <= upper_bound)]
outliers = data.shape[0] - data_filtered.shape[0]
print("Number of Outliers Removed: " + str(outliers))
Number of Outliers Removed: 8
plt.figure(figsize=(8,6))
plt.hist(data_filtered['Income'], bins = 20, edgecolor = 'black')
plt.title('Distribution of Income')
plt.xlabel('Income')
plt.ylabel('Count')
plt.show()
marital_status_counts = data_filtered["Marital_Status"].value_counts()
plt.figure(figsize = (8,6))
plt.bar(marital_status_counts.index, marital_status_counts.values, edgecolor = 'black')
plt.title('Marital Status Distribution')
plt.xlabel('Marital Status')
plt.ylabel('Count')
plt.show()
average_income = data_filtered.groupby('Education')['Income'].mean().sort_index()
plt.figure(figsize = (8,6))
average_income.plot(kind = 'bar')
plt.title('Average Income by Education')
plt.xticks(rotation = 0)
plt.xlabel('Education Level')
plt.ylabel('Avg Income')
plt.show()
import matplotlib.colors as mcolors
income_bins = [0, 20000, 40000, 60000, 80000, 100000]
income_labels = ['<20k', '20k-40k', '40k-60k', '60k-80k', '80k-100k']
data_filtered['Income_Bracket'] = pd.cut(data_filtered['Income'], bins = income_bins, labels = income_labels)
pivot_table = data_filtered.pivot_table(index='Education', columns='Income_Bracket', aggfunc='size', fill_value=0)
colors = plt.get_cmap('tab10').colors
plt.figure(figsize = (8,6))
pivot_table.plot(kind = 'bar', stacked = True, color = colors)
plt.title('Income Distribution by Education Level')
plt.xlabel('Education Level')
plt.ylabel('Number of People')
plt.xticks(rotation=45)
plt.legend(title='Income Bracket')
plt.show()
<Figure size 800x600 with 0 Axes>
import plotly.express as px
fig = px.scatter_matrix(data_filtered,
dimensions = ['Income', 'NumStorePurchases', 'NumWebPurchases'])
fig.update_layout(
title = 'Income/Online Purchases/In Person Purchases Matrix',
width = 800,
height = 600
)
data_processed = data_filtered
data_processed['Marital_Status'] = pd.factorize(data_processed['Marital_Status'])[0]
data_processed['Education'] = pd.factorize(data_processed['Education'])[0]
data_processed.pop('Dt_Customer')
data_processed.pop('Income_Bracket')
0 40k-60k 1 40k-60k 2 60k-80k 3 20k-40k 4 40k-60k ... 2235 60k-80k 2236 60k-80k 2237 40k-60k 2238 60k-80k 2239 40k-60k Name: Income_Bracket, Length: 2208, dtype: category Categories (5, object): ['<20k' < '20k-40k' < '40k-60k' < '60k-80k' < '80k-100k']
from sklearn.cluster import KMeans
# Elbow method to determine optimal number of clusters
inertia = []
K = range(1, 11)
for k in K:
kmeans = KMeans(n_clusters=k, random_state=42)
kmeans.fit(data_processed)
inertia.append(kmeans.inertia_)
plt.figure(figsize=(8, 4))
plt.plot(K, inertia, 'bx-')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.title('Elbow Method For Optimal k')
plt.show()
optimal_k = 3
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
data_processed['cluster'] = kmeans.fit_predict(data_processed)
cluster_sizes = data_processed['cluster'].value_counts()
for cluster in range(3):
print(f"Group {cluster} contains {cluster_sizes[cluster]} customers.")
Group 0 contains 732 customers. Group 1 contains 715 customers. Group 2 contains 761 customers.
cluster_means = data_processed.groupby('cluster').mean()
cluster_means
ID | Year_Birth | Education | Marital_Status | Income | Kidhome | Teenhome | Recency | MntWines | MntFruits | ... | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Complain | Z_CostContact | Z_Revenue | Response | total_spending | total_kids | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
cluster | |||||||||||||||||||||
0 | 5685.519126 | 1973.090164 | 1.132514 | 1.401639 | 28202.706284 | 0.811475 | 0.308743 | 48.405738 | 29.898907 | 5.915301 | ... | 0.004098 | 0.000000 | 0.001366 | 0.000000 | 0.016393 | 3.0 | 11.0 | 0.116120 | 93.849727 | 1.120219 |
1 | 5739.041958 | 1967.293706 | 0.855944 | 1.535664 | 75544.142657 | 0.092308 | 0.377622 | 49.058741 | 623.208392 | 56.720280 | ... | 0.135664 | 0.222378 | 0.179021 | 0.026573 | 0.006993 | 3.0 | 11.0 | 0.234965 | 1278.710490 | 0.469930 |
2 | 5349.095926 | 1966.109067 | 0.954008 | 1.565046 | 51706.478318 | 0.415243 | 0.817346 | 49.557162 | 273.838371 | 17.718791 | ... | 0.084100 | 0.003942 | 0.017083 | 0.014455 | 0.005256 | 3.0 | 11.0 | 0.105125 | 469.127464 | 1.232589 |
3 rows × 30 columns
plt.subplots(figsize=(10, 7))
legend = []
plt.xlabel('Year_Birth')
plt.ylabel('Income')
plt.title('Cluster Figure A')
for c, rows in data_processed.groupby('cluster'):
plt.scatter(rows['Year_Birth'], rows['Income'], s = 100)
legend.append("cluster %s" % c)
plt.legend(legend, loc="upper left")
plt.show()
plt.subplots(figsize=(10, 7))
legend = []
plt.xlabel('Year_Birth')
plt.ylabel('Total_Spending')
plt.title('Cluster Figure B')
for c, rows in data_processed.groupby('cluster'):
plt.scatter(rows['Year_Birth'], rows['total_spending'], s = 100)
legend.append("cluster %s" % c)
plt.legend(legend, loc="upper left")
plt.show()