# -----------------------------------------------------------
# Importing
# -----------------------------------------------------------
# In this project, we use Python for customer segmentation,
# statistical testing, and exploratory analysis.
# The libraries below cover data handling, visualization,
# and basic hypothesis testing.
# -----------------------------------------------------------
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind, t
from statsmodels.graphics.mosaicplot import mosaic
from matplotlib.ticker import FuncFormatter
# Raise pandas' row-display cap so long inspections (value_counts,
# word-frequency tables, full-frame prints) are not truncated.
pd.set_option('display.max_rows', 300)
# -----------------------------------------------------------
# Load Customer and Transaction Data
# -----------------------------------------------------------
# We load the two datasets used throughout this analysis:
# - Purchase Behavior: customer attributes and segments
# - Transaction Data: item-level purchase records
# These will later be merged to study purchasing patterns.
# -----------------------------------------------------------
# NOTE(review): absolute Windows paths tie this notebook to one machine —
# consider a relative data/ directory or pathlib for portability.
pb_df = pd.read_csv(
    r"C:\Users\rageg\Downloads\Quantium Forage\Purchase Behavior.csv"
)
td_df = pd.read_excel(
    r"C:\Users\rageg\Downloads\Quantium Forage\Transaction Data.xlsx"
)
# Preview the first transaction rows (notebook display)
td_df.head()
| DATE | STORE_NBR | LYLTY_CARD_NBR | TXN_ID | PROD_NBR | PROD_NAME | PROD_QTY | TOT_SALES | |
|---|---|---|---|---|---|---|---|---|
| 0 | 43390 | 1 | 1000 | 1 | 5 | Natural Chip Compny SeaSalt175g | 2 | 6.0 |
| 1 | 43599 | 1 | 1307 | 348 | 66 | CCs Nacho Cheese 175g | 3 | 6.3 |
| 2 | 43605 | 1 | 1343 | 383 | 61 | Smiths Crinkle Cut Chips Chicken 170g | 2 | 2.9 |
| 3 | 43329 | 2 | 2373 | 974 | 69 | Smiths Chip Thinly S/Cream&Onion 175g | 5 | 15.0 |
| 4 | 43330 | 2 | 2426 | 1038 | 108 | Kettle Tortilla ChpsHny&Jlpno Chili 150g | 3 | 13.8 |
# Preview the first customer rows (notebook display)
pb_df.head()
| LYLTY_CARD_NBR | LIFESTAGE | PREMIUM_CUSTOMER | |
|---|---|---|---|
| 0 | 1000 | YOUNG SINGLES/COUPLES | Premium |
| 1 | 1002 | YOUNG SINGLES/COUPLES | Mainstream |
| 2 | 1003 | YOUNG FAMILIES | Budget |
| 3 | 1004 | OLDER SINGLES/COUPLES | Mainstream |
| 4 | 1005 | MIDAGE SINGLES/COUPLES | Mainstream |
# -----------------------------------------------------------
# Process Purchase Behavior Data
# -----------------------------------------------------------
# We clean the customer dataset by:
# - Creating a working copy
# - Removing missing values
# - Dropping duplicate loyalty card records
# This ensures each customer is represented once.
# -----------------------------------------------------------
# Work on a copy so the raw customer load stays untouched
pb_df_cleaned = pb_df.copy()
# Check column data types (notebook display)
pb_df_cleaned.dtypes
# Drop rows with missing values, then keep one row per loyalty card
pb_df_cleaned = pb_df_cleaned.dropna().drop_duplicates(subset="LYLTY_CARD_NBR")
# Print the original and cleaned datasets — the pandas repr includes the
# row counts, so this doubles as a before/after size check
print("Original dataset:\n", pb_df)
print("Cleaned dataset:\n", pb_df_cleaned)
# Show the distinct levels of the two categorical segmentation variables
print("Customer lifestage levels:\n", pb_df_cleaned["LIFESTAGE"].unique())
print("\nCustomer membership levels:\n", pb_df_cleaned["PREMIUM_CUSTOMER"].unique())
Original dataset:
LYLTY_CARD_NBR LIFESTAGE PREMIUM_CUSTOMER
0 1000 YOUNG SINGLES/COUPLES Premium
1 1002 YOUNG SINGLES/COUPLES Mainstream
2 1003 YOUNG FAMILIES Budget
3 1004 OLDER SINGLES/COUPLES Mainstream
4 1005 MIDAGE SINGLES/COUPLES Mainstream
... ... ... ...
72632 2370651 MIDAGE SINGLES/COUPLES Mainstream
72633 2370701 YOUNG FAMILIES Mainstream
72634 2370751 YOUNG FAMILIES Premium
72635 2370961 OLDER FAMILIES Budget
72636 2373711 YOUNG SINGLES/COUPLES Mainstream
[72637 rows x 3 columns]
Cleaned dataset:
LYLTY_CARD_NBR LIFESTAGE PREMIUM_CUSTOMER
0 1000 YOUNG SINGLES/COUPLES Premium
1 1002 YOUNG SINGLES/COUPLES Mainstream
2 1003 YOUNG FAMILIES Budget
3 1004 OLDER SINGLES/COUPLES Mainstream
4 1005 MIDAGE SINGLES/COUPLES Mainstream
... ... ... ...
72632 2370651 MIDAGE SINGLES/COUPLES Mainstream
72633 2370701 YOUNG FAMILIES Mainstream
72634 2370751 YOUNG FAMILIES Premium
72635 2370961 OLDER FAMILIES Budget
72636 2373711 YOUNG SINGLES/COUPLES Mainstream
[72637 rows x 3 columns]
Customer lifestage levels:
['YOUNG SINGLES/COUPLES' 'YOUNG FAMILIES' 'OLDER SINGLES/COUPLES'
'MIDAGE SINGLES/COUPLES' 'NEW FAMILIES' 'OLDER FAMILIES' 'RETIREES']
Customer membership levels:
['Premium' 'Mainstream' 'Budget']
There were no missing values in the purchase behavior dataset, and the duplicate check on loyalty card numbers left the printed row count unchanged (72,637 rows before and after cleaning), so the customer file was already effectively one row per card. All column data types are correctly formatted. The cleaned dataset contains three customer membership levels (Premium, Mainstream, Budget) and seven distinct lifestage groups, which will be used to segment customers in the analysis.
# -----------------------------------------------------------
# Processing Transaction Data
# -----------------------------------------------------------
# Here we explore the product name field by splitting each
# name into individual words. This helps identify uncommon
# or unique terms that may interfere with grouping or analysis.
# -----------------------------------------------------------
# Work on a copy so the raw transaction load stays untouched
td_df_cleaned = td_df.copy()
# Distinct product descriptions, as a Series for vectorized string ops
unique_prod_names = pd.Series(td_df_cleaned["PROD_NAME"].unique())
# Tokenize every product name: any run of non-letters (digits,
# punctuation, symbols) acts as a separator
word_list = []
for name in unique_prod_names:
    word_list.extend(re.split(r"[^A-Za-z]+", name))
# Keep only purely alphabetic tokens of three or more characters
words = pd.Series(word_list, name="words")
words = words[words.str.fullmatch(r"[A-Za-z]{3,}")].reset_index(drop=True)
# Frequency table: one row per distinct word with its count N
word_counts = words.value_counts().reset_index()
word_counts.columns = ["words", "N"]
print("WORD COUNTS")
# print(word_counts)
# Words appearing exactly once are candidates for non-chip products
one_time_word = word_counts[word_counts["N"] == 1].reset_index(drop=True)
# Notebook display of the rarest words
one_time_word.head()
WORD COUNTS
| words | N | |
|---|---|---|
| 0 | Smoked | 1 |
| 1 | ChpsBtroot | 1 |
| 2 | Chipotle | 1 |
| 3 | Barbeque | 1 |
| 4 | Mystery | 1 |
Upon reviewing the product names, several outliers such as Smoked, Bolognese, Seasoned Chicken, and Rib appear to describe grocery or meal items rather than chips. These products sit outside the core snack category and would introduce noise into the analysis. To keep the dataset focused and relevant, we will filter out these non-chip items before moving forward.
# ------------------------------------------------------------
# Checking whether rare words actually appear in chip products
# ------------------------------------------------------------
# Escape each rare word and anchor it with word boundaries, so short
# tokens cannot match inside longer words, then OR them all together
combined_pattern = "|".join(
    fr"\b{re.escape(word)}\b" for word in one_time_word["words"]
)
print("Regex pattern:", combined_pattern)
# Flag every distinct product name that mentions any rare word
rare_hit_mask = unique_prod_names.str.contains(combined_pattern, case=False, regex=True)
matched_products = unique_prod_names[rare_hit_mask]
# Review the first 100 matches to judge whether they really are chips
matched_products.head(100)
Regex pattern: \bSmoked\b|\bChpsBtroot\b|\bChipotle\b|\bBarbeque\b|\bMystery\b|\bFlavour\b|\bChed\b|\bTasty\b|\bRicotta\b|\bSeaSalt\b|\bSplash\b|\bPuffs\b|\bWhlgrn\b|\bBarbecue\b|\bChpsHny\b|\bCutSalt\b|\bStacked\b|\bCrips\b|\bStyle\b|\bMaple\b|\bBag\b|\bBig\b|\bOrgnl\b|\bSnbts\b|\bMstrd\b|\bCheddr\b|\bSunbites\b|\bBelly\b|\bPork\b|\bRst\b|\bSlow\b|\bAioli\b|\bChikn\b|\bPepper\b|\bOnionStacked\b|\bSauce\b|\bSnag\b|\bOnin\b|\bFrch\b|\bWhlegrn\b|\bChimuchurri\b|\bCrm\b|\bSteak\b|\bPapadums\b|\bChutny\b|\bMango\b|\bCompny\b|\bChpsFeta\b|\bStrws\b|\bVeg\b|\bHerbs\b|\bVingar\b|\bSlt\b|\bBalls\b|\bBacon\b|\bCrnkle\b|\bJlpno\b|\bSpicy\b|\bSouthern\b|\bCamembert\b|\bPotatoMix\b|\bSweetChili\b|\bPesto\b|\bBasil\b|\bMozzarella\b|\bRoast\b|\bBox\b|\bChp\b|\bDorito\b|\bChckn\b|\bInfzns\b|\bHony\b|\bCrn\b|\bOnionDip\b|\bCrnchers\b|\bMexicana\b|\bChli\b|\bGcamole\b|\bTruffle\b|\bsaltd\b|\bOriginl\b|\bMzzrlla\b|\bSpcy\b|\bFriedChicken\b|\bFig\b|\bMac\b|\bHot\b|\bGarden\b|\bCoconut\b|\bSthrn\b|\bMed\b|\bChili\b|\bFries\b|\bSpce\b|\bHrb\b|\bTmato\b|\bChlli\b|\bMexican\b|\bJalapeno\b|\bTom\b|\bChnky\b|\bSiracha\b|\bPot\b|\bLight\b|\bNCC\b|\bJam\b|\bBtroot\b|\bPlus\b|\bGrnWves\b|\bCrackers\b|\bPrawn\b|\bRib\b|\bSeasonedchicken\b|\bBolognese\b
0 Natural Chip Compny SeaSalt175g 4 Kettle Tortilla ChpsHny&Jlpno Chili 150g 8 Doritos Corn Chip Mexican Jalapeno 150g 10 Kettle Sensations Siracha Lime 150g 13 Thins Chips Light& Tangy 175g 16 NCC Sour Cream & Garden Chives 175g 17 Doritos Corn Chip Southern Chicken 150g 18 Cheezels Cheese Box 125g 20 Infzns Crn Crnchers Tangy Gcamole 110g 25 Pringles Sthrn FriedChicken 134g 26 Pringles Sweet&Spcy BBQ 134g 27 Red Rock Deli SR Salsa & Mzzrlla 150g 28 Thins Chips Originl saltd 175g 29 Red Rock Deli Sp Salt & Truffle 150G 30 Smiths Thinly Swt Chli&S/Cream175G 32 Doritos Mexicana 170g 33 Smiths Crinkle Cut French OnionDip 150g 34 Natural ChipCo Hony Soy Chckn175g 35 Dorito Corn Chp Supreme 380g 37 Smiths Thinly Cut Roast Chicken 175g 39 Kettle Mozzarella Basil & Pesto 175g 40 Infuzions Thai SweetChili PotatoMix 110g 41 Kettle Sensations Camembert & Fig 150g 42 Smith Crinkle Cut Mac N Cheese 150g 44 Thins Chips Seasonedchicken 175g 46 Infuzions BBQ Rib Prawn Crackers 110g 47 GrnWves Plus Btroot & Chilli Jam 180g 51 Kettle 135g Swt Pot Sea Salt 55 Old El Paso Salsa Dip Chnky Tom Ht300g 56 Cobs Popd Swt/Chlli &Sr/Cream Chips 110g 58 Natural Chip Co Tmato Hrb&Spce 175g 62 French Fries Potato Chips 175g 63 Old El Paso Salsa Dip Tomato Med 300g 66 RRD Chilli& Coconut 150g 68 Thins Potato Chips Hot & Spicy 175g 69 Cobs Popd Sour Crm &Chives Chips 110g 70 Smiths Crnkle Chip Orgnl Big Bag 380g 72 Kettle Sensations BBQ&Maple 150g 73 WW D/Style Chip Sea Salt 200g 74 Pringles Chicken Salt Crips 134g 75 WW Original Stacked Chips 160g 76 Smiths Chip Thinly CutSalt/Vinegr175g 80 Smiths Crinkle Cut Chips Barbecue 170g 81 Cheetos Puffs 165g 84 Tostitos Splash Of Lime 175g 86 Kettle Tortilla ChpsBtroot&Ricotta 150g 87 CCs Tasty Cheese 175g 89 Tostitos Smoked Chipotle 175g 90 Pringles Barbeque 134g 92 Pringles Mystery Flavour 134g 93 Tyrrells Crisps Ched & Chives 165g 94 Snbts Whlgrn Crisps Cheddr&Mstrd 90g 95 Cheetos Chs & Bacon Balls 190g 96 Pringles Slt Vingar 134g 97 Infuzions 
SourCream&Herbs Veg Strws 110g 98 Kettle Tortilla ChpsFeta&Garlic 150g 99 Infuzions Mango Chutny Papadums 70g 100 RRD Steak & Chimuchurri 150g 102 Sunbites Whlegrn Crisps Frch/Onin 90g 105 Smiths Crinkle Cut Snag&Sauce 150g 106 WW Sour Cream &OnionStacked Chips 160g 107 RRD Lime & Pepper 165g 109 Red Rock Deli Chikn&Garlic Aioli 150g 110 RRD SR Slow Rst Pork Belly 150g 112 Smith Crinkle Cut Bolognese 150g dtype: object
The sampled rare words all appear to be genuine chip-related terms, so they do not indicate contamination from unrelated product categories.
# ------------------------------------------------------------
# Extracting brand names from product descriptions
# ------------------------------------------------------------
# The leading token of each product description is the brand; we
# uppercase it so spelling variants collapse consistently later.
first_tokens = td_df_cleaned["PROD_NAME"].str.split()
td_df_cleaned["BRAND"] = first_tokens.str.get(0).str.upper()
# Frequency check to confirm the extraction looks sensible
td_df_cleaned["BRAND"].value_counts()
BRAND KETTLE 41288 SMITHS 28860 PRINGLES 25102 DORITOS 24962 THINS 14075 RRD 11894 INFUZIONS 11057 WW 10320 COBS 9693 TOSTITOS 9471 TWISTIES 9454 OLD 9324 TYRRELLS 6442 GRAIN 6272 NATURAL 6050 RED 5885 CHEEZELS 4603 CCS 4551 WOOLWORTHS 4437 DORITO 3185 INFZNS 3144 SMITH 2963 CHEETOS 2927 SNBTS 1576 BURGER 1564 GRNWVES 1468 SUNBITES 1432 NCC 1419 FRENCH 1418 Name: count, dtype: int64
I created a BRAND column using the first word of each product name. However, several brands appear under multiple spellings or abbreviations, so the next step is to map these variations to a single standardized brand name to ensure the brand distribution is counted correctly.
# -----------------------------------------------------------
# Standardizing Brand Names
# -----------------------------------------------------------
# This mapping groups alternate spellings and abbreviations
# under a single consistent brand label so brand-level analysis
# reflects true customer preferences.
# -----------------------------------------------------------
# Map every observed alias/abbreviation onto one canonical brand label.
# Defined canonical-first for readability, then inverted into the
# alias -> canonical lookup that Series.replace expects.
_aliases_by_brand = {
    "SUNBITES": ("SUNBITES", "SNBTS", "GRAIN", "GRNWVES"),  # incl. Grain Waves
    "SMITHS": ("SMITHS", "SMITH"),
    "INFUZIONS": ("INFUZIONS", "INFZNS"),
    "RED_ROCK_DELI": ("RRD", "RED"),
    "NATURAL_CHIP_CO": ("NATURAL", "NCC"),
    "OLD_EL_PASO": ("OLD",),
    "FRENCH_FRIES": ("FRENCH",),
    "DORITOS": ("DORITO", "DORITOS"),
    "WOOLWORTHS": ("WOOLWORTHS", "WOOLSWORTHS", "WW"),
    "BURGER RINGS": ("BURGER",),
}
brand_map = {
    alias: brand
    for brand, aliases in _aliases_by_brand.items()
    for alias in aliases
}
# -----------------------------------------------------------
# Applying the Brand Mapping
# -----------------------------------------------------------
# Rewrite each BRAND value through the alias lookup; anything not
# present in the map (already-canonical brands) passes through
# unchanged.
# -----------------------------------------------------------
td_df_cleaned["BRAND"] = td_df_cleaned["BRAND"].map(lambda b: brand_map.get(b, b))
td_df_cleaned["BRAND"].value_counts()
BRAND KETTLE 41288 SMITHS 31823 DORITOS 28147 PRINGLES 25102 RED_ROCK_DELI 17779 WOOLWORTHS 14757 INFUZIONS 14201 THINS 14075 SUNBITES 10748 COBS 9693 TOSTITOS 9471 TWISTIES 9454 OLD_EL_PASO 9324 NATURAL_CHIP_CO 7469 TYRRELLS 6442 CHEEZELS 4603 CCS 4551 CHEETOS 2927 BURGER RINGS 1564 FRENCH_FRIES 1418 Name: count, dtype: int64
All brand aliases have now been mapped to a single standardized name, ensuring that the brand-level sale counts are accurate.
# Inspect schema and summary statistics for every column:
# info() shows dtypes and non-null counts; describe(include="all")
# adds unique/top/freq for the object columns alongside numeric stats.
td_df_cleaned.info()
td_df_cleaned.describe(include= "all")
<class 'pandas.core.frame.DataFrame'> RangeIndex: 264836 entries, 0 to 264835 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 DATE 264836 non-null int64 1 STORE_NBR 264836 non-null int64 2 LYLTY_CARD_NBR 264836 non-null int64 3 TXN_ID 264836 non-null int64 4 PROD_NBR 264836 non-null int64 5 PROD_NAME 264836 non-null object 6 PROD_QTY 264836 non-null int64 7 TOT_SALES 264836 non-null float64 8 BRAND 264836 non-null object dtypes: float64(1), int64(6), object(2) memory usage: 18.2+ MB
| DATE | STORE_NBR | LYLTY_CARD_NBR | TXN_ID | PROD_NBR | PROD_NAME | PROD_QTY | TOT_SALES | BRAND | |
|---|---|---|---|---|---|---|---|---|---|
| count | 264836.000000 | 264836.00000 | 2.648360e+05 | 2.648360e+05 | 264836.000000 | 264836 | 264836.000000 | 264836.000000 | 264836 |
| unique | NaN | NaN | NaN | NaN | NaN | 114 | NaN | NaN | 20 |
| top | NaN | NaN | NaN | NaN | NaN | Kettle Mozzarella Basil & Pesto 175g | NaN | NaN | KETTLE |
| freq | NaN | NaN | NaN | NaN | NaN | 3304 | NaN | NaN | 41288 |
| mean | 43464.036260 | 135.08011 | 1.355495e+05 | 1.351583e+05 | 56.583157 | NaN | 1.907309 | 7.304200 | NaN |
| std | 105.389282 | 76.78418 | 8.057998e+04 | 7.813303e+04 | 32.826638 | NaN | 0.643654 | 3.083226 | NaN |
| min | 43282.000000 | 1.00000 | 1.000000e+03 | 1.000000e+00 | 1.000000 | NaN | 1.000000 | 1.500000 | NaN |
| 25% | 43373.000000 | 70.00000 | 7.002100e+04 | 6.760150e+04 | 28.000000 | NaN | 2.000000 | 5.400000 | NaN |
| 50% | 43464.000000 | 130.00000 | 1.303575e+05 | 1.351375e+05 | 56.000000 | NaN | 2.000000 | 7.400000 | NaN |
| 75% | 43555.000000 | 203.00000 | 2.030942e+05 | 2.027012e+05 | 85.000000 | NaN | 2.000000 | 9.200000 | NaN |
| max | 43646.000000 | 272.00000 | 2.373711e+06 | 2.415841e+06 | 114.000000 | NaN | 200.000000 | 650.000000 | NaN |
We can see that our most purchased item is the Kettle Mozzarella Basil & Pesto 175g, present in 3,304 transactions, from our top brand KETTLE with 41,288 transactions. The summary above also reveals two outliers in PROD_QTY and TOT_SALES: a maximum product quantity of 200 against a mean of 1.9 and a standard deviation of 0.64 indicates that this 200-unit purchase is an extreme value. Also note that the DATE and category columns are not stored in the most appropriate datatypes; below, I will convert PROD_NAME and BRAND from object to string and parse the DATE column into a proper datetime.
# -----------------------------------------------------------
# Removing Extreme Outliers in PROD_QTY
# -----------------------------------------------------------
# describe() showed a max quantity of 200 against a mean of ~1.9
# (std ~0.64): two transactions record an implausible 200-unit
# purchase, so we drop them to keep totals and averages realistic.
# -----------------------------------------------------------
# Inspect the largest quantities (notebook display; the 200-unit
# pair sits at the top of this view)
td_df_cleaned.sort_values(by="PROD_QTY", ascending=False)
# Filter on the quantity itself rather than hard-coded row labels:
# drop(index=[69762, 69763]) silently depends on the current index
# and raises KeyError if the cell is re-run after the drop.
td_df_cleaned = td_df_cleaned[td_df_cleaned["PROD_QTY"] < 200]
# -----------------------------------------------------------
# Fixing Date Format and Ensuring Proper Data Types
# -----------------------------------------------------------
# DATE arrives as an Excel serial day number; origin 1899-12-30 is
# the standard Excel epoch, so unit='D' yields true timestamps.
# PROD_NAME and BRAND become pandas 'string' dtype for consistency.
# -----------------------------------------------------------
td_df_cleaned['DATE'] = pd.to_datetime(td_df_cleaned['DATE'], unit='D', origin='1899-12-30')
td_df_cleaned = td_df_cleaned.astype({"PROD_NAME": "string", "BRAND": "string"})
# Confirm the new dtypes and the two-row reduction
td_df_cleaned.info()
<class 'pandas.core.frame.DataFrame'> Index: 264834 entries, 0 to 264835 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 DATE 264834 non-null datetime64[ns] 1 STORE_NBR 264834 non-null int64 2 LYLTY_CARD_NBR 264834 non-null int64 3 TXN_ID 264834 non-null int64 4 PROD_NBR 264834 non-null int64 5 PROD_NAME 264834 non-null string 6 PROD_QTY 264834 non-null int64 7 TOT_SALES 264834 non-null float64 8 BRAND 264834 non-null string dtypes: datetime64[ns](1), float64(1), int64(5), string(2) memory usage: 20.2 MB
The transaction dataset above has been cleaned: all outliers have been removed and column data types have been converted for consistency and efficiency. Now we will merge both processed dataframes on the customer ID and proceed with the analysis.
# -----------------------------------------------------------
# Merging Transaction + Customer Data
# -----------------------------------------------------------
# Join both datasets using the loyalty card number, check for
# merge-created duplicates, and remove them to keep the data clean.
# -----------------------------------------------------------
# Join customer attributes onto each transaction via the loyalty card number
merged_data = td_df_cleaned.merge(pb_df_cleaned, how="left", on="LYLTY_CARD_NBR")
# Surface any fully duplicated rows introduced by the merge (notebook display)
merged_data[merged_data.duplicated()]
# Keep the first occurrence of each duplicated row
merged_data = merged_data.drop_duplicates()
# Re-check: this display should now come back empty
merged_data[merged_data.duplicated()]
| DATE | STORE_NBR | LYLTY_CARD_NBR | TXN_ID | PROD_NBR | PROD_NAME | PROD_QTY | TOT_SALES | BRAND | LIFESTAGE | PREMIUM_CUSTOMER |
|---|
During data preparation, I identified a duplicate transaction with Transaction ID 108462 that appeared during the merging stage. This duplicate was removed to maintain data integrity. With that, the dataset is now cleaned and ready for exploratory data analysis.
Next, I will extract package sizes from the dataset to enrich our analysis.
# -----------------------------------------------------------
# Extracting Package Sizes and Validating Units
# -----------------------------------------------------------
# Pull numeric pack sizes (in grams) from product names, check for
# any malformed entries, and review basic stats to ensure values
# look reasonable.
# -----------------------------------------------------------
# Extract the first run of digits immediately followed by "g"/"G"
# (case-insensitive lookahead) as the pack weight in grams; rows
# without such a pattern become NaN.
merged_data["PACKAGE SIZE"] = merged_data["PROD_NAME"].str.extract(
    r"(?i)(\d+)(?=g)", expand=False
).astype(float)
# Subset of products whose names contain any digits at all
numbered = merged_data[merged_data["PROD_NAME"].str.contains(r"\d+", regex=True, na=False)]
# Of those, find names where no number is followed by a "g" —
# i.e. a size in some other unit, or a stray number
non_g_products = numbered[
    ~numbered["PROD_NAME"].str.contains(r"(?i)\d+g", regex=True, na=False)
]
# An empty frame here means every numbered name encodes grams
print(non_g_products[["PROD_NAME"]])
# Missing-value audit, including the new PACKAGE SIZE column
print(merged_data.isna().sum())
# Sanity-check the size distribution (output shows 70g-380g)
print(merged_data["PACKAGE SIZE"].describe())
Empty DataFrame Columns: [PROD_NAME] Index: [] DATE 0 STORE_NBR 0 LYLTY_CARD_NBR 0 TXN_ID 0 PROD_NBR 0 PROD_NAME 0 PROD_QTY 0 TOT_SALES 0 BRAND 0 LIFESTAGE 0 PREMIUM_CUSTOMER 0 PACKAGE SIZE 0 dtype: int64 count 264833.000000 mean 182.425540 std 64.325268 min 70.000000 25% 150.000000 50% 170.000000 75% 175.000000 max 380.000000 Name: PACKAGE SIZE, dtype: float64
Now that I've created the package size column and validated our units, the next step is to verify that there are no missing dates in our transaction log, that is to ensure the transactions are continuous.
# -----------------------------------------------------------
# Checking for Missing Transaction Dates + Daily Volume Plot
# -----------------------------------------------------------
# Count transactions per day, detect missing calendar dates,
# and visualize daily transaction volume.
# -----------------------------------------------------------
# Transactions per calendar day
daily_order_count = (
    merged_data
    .groupby("DATE")
    .size()
    .reset_index(name="count")
)
print(daily_order_count)
# Every calendar date between the first and last observed transaction
full_range = pd.date_range(
    start=daily_order_count["DATE"].min(),
    end=daily_order_count["DATE"].max()
)
# Dates in the full range with no transactions at all
observed_dates = set(daily_order_count["DATE"])
missing_dates = [day for day in full_range if day not in observed_dates]
print(missing_dates)
# Visualize daily transaction volume over the whole period
plt.plot(daily_order_count["DATE"], daily_order_count["count"])
plt.title("Transactions per Day")
plt.xlabel("Date")
plt.ylabel("Number of Transactions")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
DATE count
0 2018-07-01 724
1 2018-07-02 711
2 2018-07-03 722
3 2018-07-04 714
4 2018-07-05 712
.. ... ...
359 2019-06-26 723
360 2019-06-27 709
361 2019-06-28 730
362 2019-06-29 745
363 2019-06-30 744
[364 rows x 2 columns]
[Timestamp('2018-12-25 00:00:00')]
# -----------------------------------------------------------
# 🎄 December Daily Transaction Trend
# -----------------------------------------------------------
# Focus on December to spot seasonal spikes or dips in activity.
# A line + scatter combo helps show both the trend and daily detail.
# -----------------------------------------------------------
# Restrict the daily counts to December
in_december = daily_order_count["DATE"].dt.month == 12
december_data = daily_order_count[in_december]
# Line shows the trend; red scatter marks each individual day
plt.plot(december_data["DATE"], december_data["count"], label="Transaction Line")
plt.scatter(december_data["DATE"], december_data["count"], color='red', s=10, label="Daily Points")
plt.title("Transactions per Day in December")
plt.xlabel("Date")
plt.ylabel("Transaction Count")
plt.xticks(rotation=45, fontsize="x-small")
plt.legend()
plt.tight_layout()
plt.show()
We observe a rise in sales during the middle and toward the end of December, followed by a return to normal trends. There is a clear spike in transactions just before Christmas, reflecting the holiday season’s impact on consumer behavior. December 25th is missing from the dataset, which makes sense because stores are closed that day, confirming that the gap is expected rather than an error.
With the transaction pattern validated and no inconsistencies remaining, we can now move on to visualizing customer counts and sales proportions across our segments and lifestages. We begin with a holistic view to understand how each group contributes to total sales and total customers. This foundation will guide the selection of which segment and lifestage should be prioritized in our next marketing campaign.
# -----------------------------------------------------------
# Preparing Membership and Sales Distribution Charts
# -----------------------------------------------------------
# These charts show how customers and total sales are distributed
# across the three membership groups. Comma formatting is added
# to the y axis to make the values easier to read.
# -----------------------------------------------------------
# Tick formatters for matplotlib's FuncFormatter. `pos` is the tick
# position required by the callback signature and is unused here.
def comma_formatter(x, pos):
    """Render a tick value with thousands separators and no decimals."""
    return format(x, ",.0f")

def dollar_formatter(x, pos):
    """Render a tick value as whole dollars with thousands separators."""
    return "$" + format(x, ",.0f")
# Bar/segment display order shared by all membership charts
category_order = ["Mainstream", "Budget", "Premium"]
# One fixed color per membership tier so every chart stays comparable
color_map = {
    "Mainstream": "#1f77b4",
    "Premium": "#ff7f0e",
    "Budget": "#2ca02c"
}
# Colors resolved in display order
colors = list(map(color_map.__getitem__, category_order))
# --------------------------------------------------------
# 1️⃣ Membership Status Distribution (Comma-Formatted Axis)
# --------------------------------------------------------
# Customer counts per membership tier, in the fixed display order
membership_status = (
    pb_df_cleaned["PREMIUM_CUSTOMER"]
    .value_counts()
    .reindex(category_order)
)
fig, ax = plt.subplots(figsize=(7, 5))
ax.bar(category_order, membership_status.values, color=colors)
ax.set_title("Membership Status Distribution")
ax.set_xlabel("Membership Status")
ax.set_ylabel("Customers")
# Thousands separators on the y-axis
ax.yaxis.set_major_formatter(FuncFormatter(comma_formatter))
# Formatted count just above each bar
for pos, count in enumerate(membership_status.values):
    ax.text(pos, count + 300, f"{count:,}", ha='center', fontsize='small')
fig.tight_layout()
plt.show()
# --------------------------------------------------------
# 2️⃣ Total Sales by Membership Status (Comma-Formatted Axis)
# --------------------------------------------------------
# Total sales per membership tier, in the fixed display order
sales_per_stat = (
    merged_data
    .groupby("PREMIUM_CUSTOMER")["TOT_SALES"]
    .sum()
    .reindex(category_order)
)
fig, ax = plt.subplots(figsize=(7, 5))
ax.bar(category_order, sales_per_stat.values, color=colors)
ax.set_title("Total Sales by Membership Status")
ax.set_xlabel("Membership Status")
ax.set_ylabel("Total Sales")
# Dollar formatting with thousands separators on the y-axis
ax.yaxis.set_major_formatter(FuncFormatter(dollar_formatter))
# Formatted dollar amount just above each bar
for pos, amount in enumerate(sales_per_stat.values):
    ax.text(pos, amount + 10000, f"${amount:,.2f}", ha='center', fontsize='small')
fig.tight_layout()
plt.show()
Note: Most sales come from Mainstream customers, followed by Budget and then Premium, which aligns with how each group contributes to the overall customer base. Next, we will break this down by customer lifestage to see which lifestage groups drive the most footfall and which generate the highest sales.
# -----------------------------------------------------------
# Cleaning and abbreviating customer life stage labels
# -----------------------------------------------------------
# This helper function standardizes and shortens long lifestage
# names to make plot labels cleaner and more readable.
# The underlying logic remains unchanged.
# -----------------------------------------------------------
def auto_abbrev(name):
    """Shorten a LIFESTAGE label (e.g. 'YOUNG SINGLES/COUPLES' -> 'Young S/C')."""
    label = name.upper().strip()  # normalize before substituting
    # Literal substitutions, applied in this order, then Title Case
    for long_form, short_form in (
        ("SINGLES/COUPLES", "S/C"),
        ("FAMILIES", "fam"),
        ("YOUNG", "young"),
        ("OLDER", "old"),
        ("MIDAGE", "Mid"),
    ):
        label = label.replace(long_form, short_form)
    return label.title()
# -----------------------------------------------------------
# Visualizing sales contribution using a mosaic plot
# -----------------------------------------------------------
# This block summarizes total sales and customer counts by
# premium customer segment and life stage. Labels are abbreviated
# for clarity, and a mosaic chart is used to show the proportion
# of sales coming from each segment and life stage combination.
# -----------------------------------------------------------
# Group the data by Premium Customer segment and Life Stage, summing total sales
life_stages_count = merged_data.groupby(["PREMIUM_CUSTOMER", "LIFESTAGE"])["TOT_SALES"].sum().unstack()
# Convert the grouped sales data into a dictionary with keys as (segment, lifestage)
sales_dict = {
(col, index): row[col]
for index, row in life_stages_count.iterrows()
for col in life_stages_count.columns
}
# Apply abbreviation to segment names for cleaner labels
sales_dict = {(auto_abbrev(key[0]), key[1]): value for key, value in sales_dict.items()}
# Total sales for percentage label calculations
total_sales = sum(sales_dict.values())
# Label function for sales mosaic plot
def labelizer(key):
share = sales_dict[key] / total_sales
return f"{share:.1%}"
# Label function for customer count mosaic plot
def labelizer2(key):
value = customer_dict[key] / total_cust
return f"{value:.1%}"
# Tile coloring: the second key element is the Premium Customer segment
def props(key):
    """Return facecolor properties for a mosaic tile based on its segment."""
    segment = key[1]
    return {'facecolor': color_map.get(segment, '#cccccc')}
# Count unique customers (distinct loyalty cards) by segment and life stage.
# After unstack(): rows = PREMIUM_CUSTOMER, columns = LIFESTAGE.
customer_count = merged_data.groupby(["PREMIUM_CUSTOMER", "LIFESTAGE"])["LYLTY_CARD_NBR"].nunique().unstack()
# Total unique customers across the cleaned behavior table.
# NOTE(review): this denominator comes from pb_df_cleaned (all customers),
# while the counts above come from merged_data (customers with transactions);
# tile shares may not sum to exactly 100% if some customers never
# transacted — confirm against the data.
total_cust = pb_df_cleaned["LYLTY_CARD_NBR"].nunique()
# Flatten into a dict keyed (LIFESTAGE, PREMIUM_CUSTOMER), as for sales
customer_dict = {
    (col, index): row[col]
    for index, row in customer_count.iterrows()
    for col in customer_count.columns
}
# Abbreviate the lifestage part of each key for cleaner plot labels
customer_dict = {(auto_abbrev(key[0]), key[1]): value for key, value in customer_dict.items()}
# Render the sales mosaic: tile area reflects each combination's share
# of total sales; labelizer prints the percentage on each tile.
fig, ax = plt.subplots(figsize=(12, 8))
mosaic(
    sales_dict,
    ax=ax,
    gap=0.005,
    properties=props,
    labelizer=labelizer,
    title="Contribution to Total Sales by Lifestage within Each Customer Segment",
)
plt.show()
As illustrated above, Budget Old Families contribute the largest share of total sales at 8.7 percent. This is followed closely by Mainstream Young Singles and Couples at 8.2 percent, with Mainstream Retirees right behind at 8.1 percent. Premium Old Singles and Couples also play a notable role, accounting for 6.8 percent of total sales. One key observation is that high sales contribution does not always reflect group size. For example, Premium customers are a smaller segment overall, yet Premium Old Singles and Couples overperform relative to their population share. On the other hand, some larger groups, such as Mainstream Young Families, contribute proportionally less.
# -----------------------------------------------------------
# Mosaic Plot for Customer Contribution
# -----------------------------------------------------------
# Companion chart to the sales mosaic: tile area now reflects each
# lifestage/segment combination's share of the customer base rather
# than its share of sales, making over/under-performers easy to spot.
# -----------------------------------------------------------
fig, ax = plt.subplots(figsize=(12, 8))
mosaic(
    data=customer_dict,
    ax=ax,
    gap=0.005,
    properties=props,
    labelizer=labelizer2,
    title="Contribution to Total Customers by Lifestage within Each Customer Segment",
)
plt.show()
The customer distribution reveals that Mainstream Young Singles and Couples make up the largest share of the customer base at 11 percent, yet they do not translate that size into proportionate sales. Meanwhile, Budget Older Families stand out because they overperform relative to their size. They represent just over 6 percent of customers but deliver nearly 9 percent of total sales, showing stronger purchasing intensity than larger groups.
Premium Older Singles and Couples also appear prominently in the customer base at 6.5 percent, but unlike Budget Older Families, their sales contribution does not scale at the same rate. This mismatch across groups highlights that sales performance is not simply a function of customer volume, but rather driven by behaviors such as how often customers buy, how many units they purchase, and which products they gravitate toward.
# -----------------------------------------------------------
# Average Units per Transaction by Lifestage and Segment
# This computes how many chip units each group buys per trip
# and visualizes differences across lifestages and segments.
# -----------------------------------------------------------
# Mean PROD_QTY per transaction row: rows = LIFESTAGE, columns = PREMIUM_CUSTOMER
avg_units = merged_data.groupby(["LIFESTAGE", "PREMIUM_CUSTOMER"])["PROD_QTY"].mean().unstack()
# Bar colors follow the segment color_map (gray fallback). After unstack()
# the columns are segment names only, so the previous `col != 'LIFESTAGE'`
# filter was dead code and has been removed.
bar_colors = [color_map.get(col, '#cccccc') for col in avg_units.columns]
# Plot grouped bar chart (one group per lifestage, one bar per segment)
ax = avg_units.plot(kind='bar', color=bar_colors, figsize=(12, 6))
# Add value labels on top of each bar — same bar_label style as the
# unit-price chart, replacing the equivalent manual annotate loop
for container in ax.containers:
    ax.bar_label(container, fmt='%.2f', padding=3, fontsize=8)
# Add plot title and axis labels
plt.title('Average Units per Transaction by Lifestage and Customer Segment')
plt.xlabel('Lifestage')
plt.ylabel('Average Units')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
Based on the chart above, Mainstream Young Singles and Couples purchase more units per transaction than their Budget and Premium counterparts, suggesting a stronger inclination toward larger basket sizes. A similar trend appears for Midage Singles and Couples, though the gap between segments is smaller.
To understand whether pricing behavior also contributes to these sales patterns, we now examine the average price paid per unit across customer segments below
# -----------------------------------------------------------
# Average Price per Unit by Lifestage and Customer Segment
# This block calculates the price per unit and visualizes
# how much each lifestage and segment pays on average.
# -----------------------------------------------------------
# Derive unit price per transaction row (side effect: adds a PRICE
# column to merged_data that persists for later cells).
merged_data["PRICE"] = merged_data["TOT_SALES"] / merged_data["PROD_QTY"]
# Mean unit price: rows = LIFESTAGE, columns = PREMIUM_CUSTOMER
avg_price_count = merged_data.groupby(["LIFESTAGE", "PREMIUM_CUSTOMER"])["PRICE"].mean().unstack()
# Segment-consistent bar colors (gray fallback for unknown segments)
bar_colors = [color_map.get(col, '#cccccc') for col in avg_price_count.columns]
# Plot the average unit price as a grouped bar chart
ax = avg_price_count.plot(
    kind="bar",
    figsize=(12, 6),
    color=bar_colors,
    title="Average Price per Unit by Lifestage and Customer Segment"
)
# Axis labels
ax.set_xlabel("Lifestage")
ax.set_ylabel("Average Unit Price")
# Rotate x-axis labels for readability
plt.xticks(rotation=45)
# Annotate each bar with its value (3-point padding above the bar)
for container in ax.containers:
    ax.bar_label(container, fmt='%.2f', padding=3, fontsize=8)
plt.tight_layout()
plt.show()
It appears that Mainstream Young Singles and Couples, as well as Midage Singles and Couples, are willing to pay more per unit than their Premium and Budget counterparts. This points to a degree of price insensitivity within these Mainstream groups, who also purchase more units per transaction. Premium customers, on the other hand, likely buy chips less frequently and may gravitate toward specialty or healthier options, which could explain their lower volumes and lower average unit prices. These patterns are consistent with the smaller customer counts we observe among Premium Young and Midage Singles and Couples. To confirm whether these price differences are meaningful, we next perform an independent t-test comparing Mainstream unit prices against the rest of the population.
After confirming the price difference, we will examine brand affinity to determine which brands are most preferred by Mainstream vs. Non-Mainstream customers. This will guide inventory planning and marketing strategies.
# -----------------------------------------------------------
# One-tailed Independent T-Test for Price Sensitivity
# -----------------------------------------------------------
# This block compares whether Mainstream Singles/Couples pay
# more per unit than their non-Mainstream counterparts.
# The test is restricted to Young and Midage Singles/Couples
# to match patterns observed earlier in the analysis.
# H0: mean unit price (Mainstream) <= mean unit price (Non-Mainstream)
# H1: mean unit price (Mainstream) >  mean unit price (Non-Mainstream)
# -----------------------------------------------------------
# Price per unit for each transaction row (same formula as the PRICE
# column computed earlier; side effect: adds price_per_unit to merged_data)
merged_data["price_per_unit"] = merged_data["TOT_SALES"] / merged_data["PROD_QTY"]
# Restrict to the two target lifestages
ym_sg_cp = merged_data[
    merged_data["LIFESTAGE"].isin(["YOUNG SINGLES/COUPLES", "MIDAGE SINGLES/COUPLES"])
].copy()
# Label Mainstream vs Non-Mainstream (the .copy() above lets us add a
# column without a SettingWithCopy warning)
ym_sg_cp["GROUP"] = np.where(
    ym_sg_cp["PREMIUM_CUSTOMER"] == "Mainstream", "MS", "Not MS"
)
# Split into the two comparison groups
ms_cust = ym_sg_cp[ym_sg_cp["GROUP"] == "MS"]
nms_cust = ym_sg_cp[ym_sg_cp["GROUP"] == "Not MS"]
# Extract the price-per-unit series for each group
pp_unit_ms = ms_cust["price_per_unit"]
pp_unit_nms = nms_cust["price_per_unit"]
# Welch t-test (unequal variances), one-tailed: Mainstream > Non-Mainstream
t_stat, p_value = ttest_ind(
    pp_unit_ms, pp_unit_nms,
    alternative='greater',
    equal_var=False
)
# Group means
mean_ms = pp_unit_ms.mean()
mean_nms = pp_unit_nms.mean()
# Sample sizes and unbiased (ddof=1) sample variances
n1, n2 = len(pp_unit_ms), len(pp_unit_nms)
var1 = np.var(pp_unit_ms, ddof=1)
var2 = np.var(pp_unit_nms, ddof=1)
# Welch-Satterthwaite approximation of the degrees of freedom
df = (var1/n1 + var2/n2)**2 / (
    (var1**2)/(n1**2 * (n1 - 1)) +
    (var2**2)/(n2**2 * (n2 - 1))
)
# Standard error of the difference in means
se = np.sqrt(var1/n1 + var2/n2)
# One-sided 95% CI for the mean difference: [diff - t_crit*se, +inf)
t_crit = t.ppf(0.95, df)
mean_diff = mean_ms - mean_nms
ci_lower = mean_diff - t_crit * se
ci_upper = np.inf
# Report results (the 2.2e-16 floor mirrors R's conventional display limit)
print(f"T-statistic: {t_stat:.2f}")
print(f"Degrees of freedom: {df:.0f}")
if p_value < 2.2e-16:
    print("P-value: < 2.2e-16")
else:
    print(f"P-value: {p_value:.20e}")
print(f"Means: Mainstream = {mean_ms:.6f}, Not Mainstream = {mean_nms:.6f}")
print(f"95% Confidence Interval (one-tailed): [{ci_lower:.6f}, ∞)")
T-statistic: 40.61 Degrees of freedom: 58792 P-value: < 2.2e-16 Means: Mainstream = 4.045586, Not Mainstream = 3.688165 95% Confidence Interval (one-tailed): [0.342943, ∞)
The t-test returned a p-value < 2.2e-16, confirming that Mainstream Young and Midage Singles/Couples pay significantly higher unit prices than their Budget and Premium peers within the same lifestage groups.
Despite being willing to pay more per unit, Mainstream Young Singles and Couples purchase the fewest units per trip. This presents a clear opportunity to drive growth by increasing basket size rather than raising prices. Tactics such as small multi-buy promotions and better shelf visibility for their preferred brands and pack sizes could meaningfully lift sales, while ensuring strong availability of premium options remains important given their higher willingness to pay. With this in mind, we now examine their brand and size preferences to guide inventory planning.
# -----------------------------------------------------------
# Select Mainstream Young Singles/Couples and all other customers
# -----------------------------------------------------------
# Build the target-group mask once and reuse it for both splits,
# so the two selections are guaranteed to be exact complements.
_target_mask = (
    (merged_data["LIFESTAGE"] == "YOUNG SINGLES/COUPLES")
    & (merged_data["PREMIUM_CUSTOMER"] == "Mainstream")
)
# 1. Target group: Mainstream Young Singles/Couples
ms_yg_sc = merged_data[_target_mask]
# 2. Comparison group: everyone else
nms_n_ygsc = merged_data[~_target_mask]
# -----------------------------------------------------------
# Brand Affinity Analysis Comparing Mainstream vs Others
# -----------------------------------------------------------
# Affinity = (brand's share of target-group quantity)
#          / (brand's share of everyone-else quantity).
# Scores above 1 mean the target group over-indexes on the brand.
# Total quantity purchased by each group (denominators for the shares)
qty_ms = ms_yg_sc["PROD_QTY"].sum()
qty_nms = nms_n_ygsc["PROD_QTY"].sum()
# Brand purchase share within Mainstream Young Singles/Couples
ms_brand_shares = (
    ms_yg_sc.groupby("BRAND")["PROD_QTY"].sum()
    .div(qty_ms)
    .reset_index(name="Mainstream")
)
# Brand purchase share within all other customers
nms_brand_shares = (
    nms_n_ygsc.groupby("BRAND")["PROD_QTY"].sum()
    .div(qty_nms)
    .reset_index(name="Other")
)
# Align brands across both groups.
# NOTE(review): the left join keeps only brands bought by the target group;
# a brand purchased solely by other customers is dropped — confirm this
# is intended before relying on the full brand list.
brand_proportions = pd.merge(
    ms_brand_shares, nms_brand_shares, how="left", on="BRAND"
)
# Affinity score shows relative preference for each brand
brand_proportions["affinity"] = (
    brand_proportions["Mainstream"] / brand_proportions["Other"]
)
# Sorted view for inspection (unassigned: notebook display only;
# brand_proportions itself keeps its original row order)
brand_proportions.sort_values(by="affinity", ascending=False)
| BRAND | Mainstream | Other | affinity | |
|---|---|---|---|---|
| 18 | TYRRELLS | 0.029587 | 0.023933 | 1.236230 |
| 17 | TWISTIES | 0.043306 | 0.035283 | 1.227396 |
| 8 | KETTLE | 0.185649 | 0.154217 | 1.203818 |
| 16 | TOSTITOS | 0.042581 | 0.035377 | 1.203633 |
| 10 | OLD_EL_PASO | 0.041598 | 0.034753 | 1.196953 |
| 11 | PRINGLES | 0.111980 | 0.093744 | 1.194531 |
| 5 | DORITOS | 0.122877 | 0.105278 | 1.167171 |
| 4 | COBS | 0.041856 | 0.036375 | 1.150696 |
| 7 | INFUZIONS | 0.060649 | 0.053157 | 1.140942 |
| 15 | THINS | 0.056611 | 0.053084 | 1.066440 |
| 3 | CHEEZELS | 0.016851 | 0.017370 | 0.970137 |
| 14 | SUNBITES | 0.036628 | 0.040771 | 0.898373 |
| 13 | SMITHS | 0.093420 | 0.121710 | 0.767559 |
| 6 | FRENCH_FRIES | 0.003702 | 0.005364 | 0.690110 |
| 2 | CHEETOS | 0.007533 | 0.011240 | 0.670143 |
| 12 | RED_ROCK_DELI | 0.045377 | 0.068427 | 0.663146 |
| 9 | NATURAL_CHIP_CO | 0.018379 | 0.028741 | 0.639449 |
| 1 | CCS | 0.010484 | 0.017602 | 0.595596 |
| 19 | WOOLWORTHS | 0.028189 | 0.057429 | 0.490852 |
| 0 | BURGER RINGS | 0.002744 | 0.006145 | 0.446535 |
# -----------------------------------------------------------
# Visualizing Brand Affinity Scores
# -----------------------------------------------------------
# Horizontal bar chart of brand affinity for Mainstream Young
# Singles/Couples. Green bars (affinity > 1) mark brands this group
# over-indexes on; gray bars mark under-indexed brands. A dashed
# line at 1 gives the neutral reference point.
# Ascending sort so the strongest affinities render at the top
plot_brand_df = brand_proportions.sort_values("affinity", ascending=True)
# Green when over-indexed, gray otherwise
colors = ["#2ca02c" if score > 1 else "gray" for score in plot_brand_df["affinity"]]
plt.figure(figsize=(10, 8))
plt.barh(plot_brand_df["BRAND"], plot_brand_df["affinity"], color=colors)
# Numeric labels just past the end of each bar
for pos, score in enumerate(plot_brand_df["affinity"]):
    plt.text(score + 0.02, pos, f"{score:.2f}", va="center", fontsize=9)
# Neutral reference line at affinity = 1
plt.axvline(1, color="black", linestyle="--", alpha=0.6)
plt.title("Brand Affinity – Mainstream Young S/C vs Other Customers")
plt.xlabel("Affinity Score")
plt.ylabel("Brand")
plt.grid(axis="x", linestyle="--", alpha=0.3)
plt.tight_layout()
plt.show()
Our Mainstream customers are 24%, 23%, and 20% more likely to purchase Tyrrells, Twisties, and Kettle, respectively.
# -----------------------------------------------------------
# Calculating Pack Size Affinity
# -----------------------------------------------------------
# Same affinity construction as the brand analysis, but grouped by
# PACKAGE SIZE: target-group share of quantity divided by the
# everyone-else share. Scores above 1 indicate sizes the target
# group buys disproportionately often.
# Pack size share of quantity within Mainstream Young Singles/Couples
ms_pack_size = (
    ms_yg_sc.groupby("PACKAGE SIZE")["PROD_QTY"]
    .sum()
    .div(qty_ms)
    .reset_index(name="ms_packsize_prop")
    .sort_values(by="ms_packsize_prop", ascending=False)
)
# Pack size share of quantity within all other customers
nms_pack_size = (
    nms_n_ygsc.groupby("PACKAGE SIZE")["PROD_QTY"]
    .sum()
    .div(qty_nms)
    .reset_index(name="nms_packsize_other")
    .sort_values(by="nms_packsize_other")
)
# Align the two share tables and compute the affinity ratio.
# NOTE(review): left join keeps only sizes bought by the target group,
# mirroring the brand-affinity merge — confirm that is intended.
size_affinity = pd.merge(ms_pack_size, nms_pack_size, how="left", on="PACKAGE SIZE")
size_affinity["size_affinity"] = size_affinity["ms_packsize_prop"] / size_affinity["nms_packsize_other"]
# Strongest affinities first
size_affinity = size_affinity.sort_values(by="size_affinity", ascending=False)
size_affinity
| PACKAGE SIZE | ms_packsize_prop | nms_packsize_other | size_affinity | |
|---|---|---|---|---|
| 9 | 270.0 | 0.029846 | 0.023377 | 1.276688 |
| 8 | 380.0 | 0.030156 | 0.023832 | 1.265356 |
| 5 | 330.0 | 0.057465 | 0.046727 | 1.229809 |
| 2 | 134.0 | 0.111980 | 0.093744 | 1.194531 |
| 3 | 110.0 | 0.099658 | 0.083643 | 1.191477 |
| 10 | 210.0 | 0.027309 | 0.023401 | 1.166997 |
| 11 | 135.0 | 0.013849 | 0.012180 | 1.136992 |
| 12 | 250.0 | 0.013460 | 0.011905 | 1.130606 |
| 4 | 170.0 | 0.075740 | 0.075440 | 1.003976 |
| 6 | 300.0 | 0.054954 | 0.057264 | 0.959675 |
| 0 | 175.0 | 0.239102 | 0.251514 | 0.950653 |
| 1 | 150.0 | 0.155130 | 0.163447 | 0.949118 |
| 7 | 165.0 | 0.052185 | 0.058004 | 0.899677 |
| 14 | 190.0 | 0.007015 | 0.011590 | 0.605253 |
| 17 | 180.0 | 0.003365 | 0.005651 | 0.595457 |
| 15 | 160.0 | 0.006005 | 0.011526 | 0.521044 |
| 16 | 90.0 | 0.005954 | 0.011719 | 0.508041 |
| 19 | 125.0 | 0.002821 | 0.005623 | 0.501744 |
| 13 | 200.0 | 0.008413 | 0.017379 | 0.484084 |
| 18 | 70.0 | 0.002847 | 0.005889 | 0.483474 |
| 20 | 220.0 | 0.002744 | 0.006145 | 0.446535 |
# -----------------------------------------------------------
# Plotting Pack Size Affinity for Mainstream Young S/C
# -----------------------------------------------------------
# Horizontal bar chart of pack-size affinity. Green bars (>1) are
# sizes the target group buys more often than everyone else; gray
# bars are under-indexed sizes. Dashed line marks neutral affinity.
# Ascending sort so the highest-affinity sizes render at the top
plot_size_df = size_affinity.sort_values("size_affinity", ascending=True)
# Green when over-indexed, gray otherwise
colors = ["#2ca02c" if score > 1 else "gray" for score in plot_size_df["size_affinity"]]
plt.figure(figsize=(10, 8))
# String labels keep the y-axis categorical rather than numeric
y_labels = plot_size_df["PACKAGE SIZE"].astype(str)
plt.barh(y_labels, plot_size_df["size_affinity"], color=colors)
# Numeric labels just past the end of each bar
for pos, score in enumerate(plot_size_df["size_affinity"]):
    plt.text(score + 0.02, pos, f"{score:.2f}", va="center", fontsize=9)
# Neutral reference line at affinity = 1
plt.axvline(1, color="black", linestyle="--", alpha=0.6)
plt.title("Size Affinity – Mainstream Young S/C vs Other Customers")
plt.xlabel("Affinity Score")
plt.ylabel("Chip Size (g)")
plt.grid(axis="x", linestyle="--", alpha=0.3)
plt.tight_layout()
plt.show()
Based on the information above, our Mainstream customers are 28%, 27%, and 23% more likely to purchase chip bags that are 270g, 380g, and 330g, respectively.
Looking at how our customers shop, it’s clear that Mainstream young singles and couples play a big role in chip sales. They’re noticeably more likely to pick up Tyrrells (24%), Twisties (23%), and Kettle (20%), so these brands should definitely be higher on our procurement priority list.
We see a similar pattern with bag sizes too. Packs around 270g, 380g, and 330g are bought much more often — 28%, 27%, and 23% more likely, respectively. These are the sizes this group gravitates toward the most, which suggests they’re ideal for restocking and maintaining good shelf availability.
Focusing our inventory on these brand-and-size combinations will help us meet demand more consistently, avoid unnecessary stockouts, and keep sales growing. It may also be worth negotiating better terms with suppliers for these high-demand items since they move so quickly.
As always, customer habits can shift over time, so it’s important that we keep monitoring sales and adjust our procurement approach whenever we see changes in what people are buying.