# -----------------------------------------------------------
# Importing
# -----------------------------------------------------------
# In this project, we use Python for customer segmentation,
# statistical testing, and exploratory analysis.
# The libraries below cover data handling, visualization,
# and basic hypothesis testing.
# -----------------------------------------------------------
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind, t
from statsmodels.graphics.mosaicplot import mosaic
from matplotlib.ticker import FuncFormatter
# Raise pandas' row-display cap so long inspections (value_counts,
# word-frequency tables, full-frame prints) are not truncated.
pd.set_option('display.max_rows', 300)
# -----------------------------------------------------------
# Load Customer and Transaction Data
# -----------------------------------------------------------
# We load the two datasets used throughout this analysis:
# - Purchase Behavior: customer attributes and segments
# - Transaction Data: item-level purchase records
# These will later be merged to study purchasing patterns.
# -----------------------------------------------------------
# NOTE(review): absolute Windows paths tie this notebook to one machine —
# consider a relative data/ directory or pathlib for portability.
pb_df = pd.read_csv(
    r"C:\Users\rageg\Downloads\Quantium Forage\Purchase Behavior.csv"
)
td_df = pd.read_excel(
    r"C:\Users\rageg\Downloads\Quantium Forage\Transaction Data.xlsx"
)
# Preview the first transaction rows (notebook display)
td_df.head()
| DATE | STORE_NBR | LYLTY_CARD_NBR | TXN_ID | PROD_NBR | PROD_NAME | PROD_QTY | TOT_SALES | |
|---|---|---|---|---|---|---|---|---|
| 0 | 43390 | 1 | 1000 | 1 | 5 | Natural Chip Compny SeaSalt175g | 2 | 6.0 |
| 1 | 43599 | 1 | 1307 | 348 | 66 | CCs Nacho Cheese 175g | 3 | 6.3 |
| 2 | 43605 | 1 | 1343 | 383 | 61 | Smiths Crinkle Cut Chips Chicken 170g | 2 | 2.9 |
| 3 | 43329 | 2 | 2373 | 974 | 69 | Smiths Chip Thinly S/Cream&Onion 175g | 5 | 15.0 |
| 4 | 43330 | 2 | 2426 | 1038 | 108 | Kettle Tortilla ChpsHny&Jlpno Chili 150g | 3 | 13.8 |
# Preview the first customer rows (notebook display)
pb_df.head()
| LYLTY_CARD_NBR | LIFESTAGE | PREMIUM_CUSTOMER | |
|---|---|---|---|
| 0 | 1000 | YOUNG SINGLES/COUPLES | Premium |
| 1 | 1002 | YOUNG SINGLES/COUPLES | Mainstream |
| 2 | 1003 | YOUNG FAMILIES | Budget |
| 3 | 1004 | OLDER SINGLES/COUPLES | Mainstream |
| 4 | 1005 | MIDAGE SINGLES/COUPLES | Mainstream |
# -----------------------------------------------------------
# Process Purchase Behavior Data
# -----------------------------------------------------------
# We clean the customer dataset by:
# - Creating a working copy
# - Removing missing values
# - Dropping duplicate loyalty card records
# This ensures each customer is represented once.
# -----------------------------------------------------------
# Work on a copy so the raw customer load stays untouched
pb_df_cleaned = pb_df.copy()
# Check column data types (notebook display)
pb_df_cleaned.dtypes
# Drop rows with missing values, then keep one row per loyalty card
pb_df_cleaned = pb_df_cleaned.dropna().drop_duplicates(subset="LYLTY_CARD_NBR")
# Print the original and cleaned datasets — the pandas repr includes the
# row counts, so this doubles as a before/after size check
print("Original dataset:\n", pb_df)
print("Cleaned dataset:\n", pb_df_cleaned)
# Show the distinct levels of the two categorical segmentation variables
print("Customer lifestage levels:\n", pb_df_cleaned["LIFESTAGE"].unique())
print("\nCustomer membership levels:\n", pb_df_cleaned["PREMIUM_CUSTOMER"].unique())
Original dataset:
LYLTY_CARD_NBR LIFESTAGE PREMIUM_CUSTOMER
0 1000 YOUNG SINGLES/COUPLES Premium
1 1002 YOUNG SINGLES/COUPLES Mainstream
2 1003 YOUNG FAMILIES Budget
3 1004 OLDER SINGLES/COUPLES Mainstream
4 1005 MIDAGE SINGLES/COUPLES Mainstream
... ... ... ...
72632 2370651 MIDAGE SINGLES/COUPLES Mainstream
72633 2370701 YOUNG FAMILIES Mainstream
72634 2370751 YOUNG FAMILIES Premium
72635 2370961 OLDER FAMILIES Budget
72636 2373711 YOUNG SINGLES/COUPLES Mainstream
[72637 rows x 3 columns]
Cleaned dataset:
LYLTY_CARD_NBR LIFESTAGE PREMIUM_CUSTOMER
0 1000 YOUNG SINGLES/COUPLES Premium
1 1002 YOUNG SINGLES/COUPLES Mainstream
2 1003 YOUNG FAMILIES Budget
3 1004 OLDER SINGLES/COUPLES Mainstream
4 1005 MIDAGE SINGLES/COUPLES Mainstream
... ... ... ...
72632 2370651 MIDAGE SINGLES/COUPLES Mainstream
72633 2370701 YOUNG FAMILIES Mainstream
72634 2370751 YOUNG FAMILIES Premium
72635 2370961 OLDER FAMILIES Budget
72636 2373711 YOUNG SINGLES/COUPLES Mainstream
[72637 rows x 3 columns]
Customer lifestage levels:
['YOUNG SINGLES/COUPLES' 'YOUNG FAMILIES' 'OLDER SINGLES/COUPLES'
'MIDAGE SINGLES/COUPLES' 'NEW FAMILIES' 'OLDER FAMILIES' 'RETIREES']
Customer membership levels:
['Premium' 'Mainstream' 'Budget']
There were no missing values in the purchase behavior dataset, and the duplicate check on loyalty card numbers left the printed row count unchanged (72,637 rows before and after cleaning), so the customer file was already effectively one row per card. All column data types are correctly formatted. The cleaned dataset contains three customer membership levels (Premium, Mainstream, Budget) and seven distinct lifestage groups, which will be used to segment customers in the analysis.
# -----------------------------------------------------------
# Processing Transaction Data
# -----------------------------------------------------------
# Here we explore the product name field by splitting each
# name into individual words. This helps identify uncommon
# or unique terms that may interfere with grouping or analysis.
# -----------------------------------------------------------
# Work on a copy so the raw transaction load stays untouched
td_df_cleaned = td_df.copy()
# Distinct product descriptions, as a Series for vectorized string ops
unique_prod_names = pd.Series(td_df_cleaned["PROD_NAME"].unique())
# Tokenize every product name: any run of non-letters (digits,
# punctuation, symbols) acts as a separator
word_list = []
for name in unique_prod_names:
    word_list.extend(re.split(r"[^A-Za-z]+", name))
# Keep only purely alphabetic tokens of three or more characters
words = pd.Series(word_list, name="words")
words = words[words.str.fullmatch(r"[A-Za-z]{3,}")].reset_index(drop=True)
# Frequency table: one row per distinct word with its count N
word_counts = words.value_counts().reset_index()
word_counts.columns = ["words", "N"]
print("WORD COUNTS")
# print(word_counts)
# Words appearing exactly once are candidates for non-chip products
one_time_word = word_counts[word_counts["N"] == 1].reset_index(drop=True)
# Notebook display of the rarest words
one_time_word.head()
WORD COUNTS
| words | N | |
|---|---|---|
| 0 | Smoked | 1 |
| 1 | ChpsBtroot | 1 |
| 2 | Chipotle | 1 |
| 3 | Barbeque | 1 |
| 4 | Mystery | 1 |
Upon reviewing the product names, several outliers such as Smoked, Bolognese, Seasoned Chicken, and Rib appear to describe grocery or meal items rather than chips. These products sit outside the core snack category and would introduce noise into the analysis. To keep the dataset focused and relevant, we will filter out these non-chip items before moving forward.
# ------------------------------------------------------------
# Checking whether rare words actually appear in chip products
# ------------------------------------------------------------
# Escape each rare word and anchor it with word boundaries, so short
# tokens cannot match inside longer words, then OR them all together
combined_pattern = "|".join(
    fr"\b{re.escape(word)}\b" for word in one_time_word["words"]
)
print("Regex pattern:", combined_pattern)
# Flag every distinct product name that mentions any rare word
rare_hit_mask = unique_prod_names.str.contains(combined_pattern, case=False, regex=True)
matched_products = unique_prod_names[rare_hit_mask]
# Review the first 100 matches to judge whether they really are chips
matched_products.head(100)
Regex pattern: \bSmoked\b|\bChpsBtroot\b|\bChipotle\b|\bBarbeque\b|\bMystery\b|\bFlavour\b|\bChed\b|\bTasty\b|\bRicotta\b|\bSeaSalt\b|\bSplash\b|\bPuffs\b|\bWhlgrn\b|\bBarbecue\b|\bChpsHny\b|\bCutSalt\b|\bStacked\b|\bCrips\b|\bStyle\b|\bMaple\b|\bBag\b|\bBig\b|\bOrgnl\b|\bSnbts\b|\bMstrd\b|\bCheddr\b|\bSunbites\b|\bBelly\b|\bPork\b|\bRst\b|\bSlow\b|\bAioli\b|\bChikn\b|\bPepper\b|\bOnionStacked\b|\bSauce\b|\bSnag\b|\bOnin\b|\bFrch\b|\bWhlegrn\b|\bChimuchurri\b|\bCrm\b|\bSteak\b|\bPapadums\b|\bChutny\b|\bMango\b|\bCompny\b|\bChpsFeta\b|\bStrws\b|\bVeg\b|\bHerbs\b|\bVingar\b|\bSlt\b|\bBalls\b|\bBacon\b|\bCrnkle\b|\bJlpno\b|\bSpicy\b|\bSouthern\b|\bCamembert\b|\bPotatoMix\b|\bSweetChili\b|\bPesto\b|\bBasil\b|\bMozzarella\b|\bRoast\b|\bBox\b|\bChp\b|\bDorito\b|\bChckn\b|\bInfzns\b|\bHony\b|\bCrn\b|\bOnionDip\b|\bCrnchers\b|\bMexicana\b|\bChli\b|\bGcamole\b|\bTruffle\b|\bsaltd\b|\bOriginl\b|\bMzzrlla\b|\bSpcy\b|\bFriedChicken\b|\bFig\b|\bMac\b|\bHot\b|\bGarden\b|\bCoconut\b|\bSthrn\b|\bMed\b|\bChili\b|\bFries\b|\bSpce\b|\bHrb\b|\bTmato\b|\bChlli\b|\bMexican\b|\bJalapeno\b|\bTom\b|\bChnky\b|\bSiracha\b|\bPot\b|\bLight\b|\bNCC\b|\bJam\b|\bBtroot\b|\bPlus\b|\bGrnWves\b|\bCrackers\b|\bPrawn\b|\bRib\b|\bSeasonedchicken\b|\bBolognese\b
0 Natural Chip Compny SeaSalt175g 4 Kettle Tortilla ChpsHny&Jlpno Chili 150g 8 Doritos Corn Chip Mexican Jalapeno 150g 10 Kettle Sensations Siracha Lime 150g 13 Thins Chips Light& Tangy 175g 16 NCC Sour Cream & Garden Chives 175g 17 Doritos Corn Chip Southern Chicken 150g 18 Cheezels Cheese Box 125g 20 Infzns Crn Crnchers Tangy Gcamole 110g 25 Pringles Sthrn FriedChicken 134g 26 Pringles Sweet&Spcy BBQ 134g 27 Red Rock Deli SR Salsa & Mzzrlla 150g 28 Thins Chips Originl saltd 175g 29 Red Rock Deli Sp Salt & Truffle 150G 30 Smiths Thinly Swt Chli&S/Cream175G 32 Doritos Mexicana 170g 33 Smiths Crinkle Cut French OnionDip 150g 34 Natural ChipCo Hony Soy Chckn175g 35 Dorito Corn Chp Supreme 380g 37 Smiths Thinly Cut Roast Chicken 175g 39 Kettle Mozzarella Basil & Pesto 175g 40 Infuzions Thai SweetChili PotatoMix 110g 41 Kettle Sensations Camembert & Fig 150g 42 Smith Crinkle Cut Mac N Cheese 150g 44 Thins Chips Seasonedchicken 175g 46 Infuzions BBQ Rib Prawn Crackers 110g 47 GrnWves Plus Btroot & Chilli Jam 180g 51 Kettle 135g Swt Pot Sea Salt 55 Old El Paso Salsa Dip Chnky Tom Ht300g 56 Cobs Popd Swt/Chlli &Sr/Cream Chips 110g 58 Natural Chip Co Tmato Hrb&Spce 175g 62 French Fries Potato Chips 175g 63 Old El Paso Salsa Dip Tomato Med 300g 66 RRD Chilli& Coconut 150g 68 Thins Potato Chips Hot & Spicy 175g 69 Cobs Popd Sour Crm &Chives Chips 110g 70 Smiths Crnkle Chip Orgnl Big Bag 380g 72 Kettle Sensations BBQ&Maple 150g 73 WW D/Style Chip Sea Salt 200g 74 Pringles Chicken Salt Crips 134g 75 WW Original Stacked Chips 160g 76 Smiths Chip Thinly CutSalt/Vinegr175g 80 Smiths Crinkle Cut Chips Barbecue 170g 81 Cheetos Puffs 165g 84 Tostitos Splash Of Lime 175g 86 Kettle Tortilla ChpsBtroot&Ricotta 150g 87 CCs Tasty Cheese 175g 89 Tostitos Smoked Chipotle 175g 90 Pringles Barbeque 134g 92 Pringles Mystery Flavour 134g 93 Tyrrells Crisps Ched & Chives 165g 94 Snbts Whlgrn Crisps Cheddr&Mstrd 90g 95 Cheetos Chs & Bacon Balls 190g 96 Pringles Slt Vingar 134g 97 Infuzions 
SourCream&Herbs Veg Strws 110g 98 Kettle Tortilla ChpsFeta&Garlic 150g 99 Infuzions Mango Chutny Papadums 70g 100 RRD Steak & Chimuchurri 150g 102 Sunbites Whlegrn Crisps Frch/Onin 90g 105 Smiths Crinkle Cut Snag&Sauce 150g 106 WW Sour Cream &OnionStacked Chips 160g 107 RRD Lime & Pepper 165g 109 Red Rock Deli Chikn&Garlic Aioli 150g 110 RRD SR Slow Rst Pork Belly 150g 112 Smith Crinkle Cut Bolognese 150g dtype: object
The sampled rare words all appear to be genuine chip-related terms, so they do not indicate contamination from unrelated product categories.
# ------------------------------------------------------------
# Extracting brand names from product descriptions
# ------------------------------------------------------------
# The leading token of each product description is the brand; we
# uppercase it so spelling variants collapse consistently later.
first_tokens = td_df_cleaned["PROD_NAME"].str.split()
td_df_cleaned["BRAND"] = first_tokens.str.get(0).str.upper()
# Frequency check to confirm the extraction looks sensible
td_df_cleaned["BRAND"].value_counts()
BRAND KETTLE 41288 SMITHS 28860 PRINGLES 25102 DORITOS 24962 THINS 14075 RRD 11894 INFUZIONS 11057 WW 10320 COBS 9693 TOSTITOS 9471 TWISTIES 9454 OLD 9324 TYRRELLS 6442 GRAIN 6272 NATURAL 6050 RED 5885 CHEEZELS 4603 CCS 4551 WOOLWORTHS 4437 DORITO 3185 INFZNS 3144 SMITH 2963 CHEETOS 2927 SNBTS 1576 BURGER 1564 GRNWVES 1468 SUNBITES 1432 NCC 1419 FRENCH 1418 Name: count, dtype: int64
I created a BRAND column using the first word of each product name. However, several brands appear under multiple spellings or abbreviations, so the next step is to map these variations to a single standardized brand name to ensure the brand distribution is counted correctly.
# -----------------------------------------------------------
# Standardizing Brand Names
# -----------------------------------------------------------
# This mapping groups alternate spellings and abbreviations
# under a single consistent brand label so brand-level analysis
# reflects true customer preferences.
# -----------------------------------------------------------
# Map every observed alias/abbreviation onto one canonical brand label.
# Defined canonical-first for readability, then inverted into the
# alias -> canonical lookup that Series.replace expects.
_aliases_by_brand = {
    "SUNBITES": ("SUNBITES", "SNBTS", "GRAIN", "GRNWVES"),  # incl. Grain Waves
    "SMITHS": ("SMITHS", "SMITH"),
    "INFUZIONS": ("INFUZIONS", "INFZNS"),
    "RED_ROCK_DELI": ("RRD", "RED"),
    "NATURAL_CHIP_CO": ("NATURAL", "NCC"),
    "OLD_EL_PASO": ("OLD",),
    "FRENCH_FRIES": ("FRENCH",),
    "DORITOS": ("DORITO", "DORITOS"),
    "WOOLWORTHS": ("WOOLWORTHS", "WOOLSWORTHS", "WW"),
    "BURGER RINGS": ("BURGER",),
}
brand_map = {
    alias: brand
    for brand, aliases in _aliases_by_brand.items()
    for alias in aliases
}
# -----------------------------------------------------------
# Applying the Brand Mapping
# -----------------------------------------------------------
# Rewrite each BRAND value through the alias lookup; anything not
# present in the map (already-canonical brands) passes through
# unchanged.
# -----------------------------------------------------------
td_df_cleaned["BRAND"] = td_df_cleaned["BRAND"].map(lambda b: brand_map.get(b, b))
td_df_cleaned["BRAND"].value_counts()
BRAND KETTLE 41288 SMITHS 31823 DORITOS 28147 PRINGLES 25102 RED_ROCK_DELI 17779 WOOLWORTHS 14757 INFUZIONS 14201 THINS 14075 SUNBITES 10748 COBS 9693 TOSTITOS 9471 TWISTIES 9454 OLD_EL_PASO 9324 NATURAL_CHIP_CO 7469 TYRRELLS 6442 CHEEZELS 4603 CCS 4551 CHEETOS 2927 BURGER RINGS 1564 FRENCH_FRIES 1418 Name: count, dtype: int64
All brand aliases have now been mapped to a single standardized name, ensuring that the brand-level sale counts are accurate.
# Inspect schema and summary statistics for every column:
# info() shows dtypes and non-null counts; describe(include="all")
# adds unique/top/freq for the object columns alongside numeric stats.
td_df_cleaned.info()
td_df_cleaned.describe(include= "all")
<class 'pandas.core.frame.DataFrame'> RangeIndex: 264836 entries, 0 to 264835 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 DATE 264836 non-null int64 1 STORE_NBR 264836 non-null int64 2 LYLTY_CARD_NBR 264836 non-null int64 3 TXN_ID 264836 non-null int64 4 PROD_NBR 264836 non-null int64 5 PROD_NAME 264836 non-null object 6 PROD_QTY 264836 non-null int64 7 TOT_SALES 264836 non-null float64 8 BRAND 264836 non-null object dtypes: float64(1), int64(6), object(2) memory usage: 18.2+ MB
| DATE | STORE_NBR | LYLTY_CARD_NBR | TXN_ID | PROD_NBR | PROD_NAME | PROD_QTY | TOT_SALES | BRAND | |
|---|---|---|---|---|---|---|---|---|---|
| count | 264836.000000 | 264836.00000 | 2.648360e+05 | 2.648360e+05 | 264836.000000 | 264836 | 264836.000000 | 264836.000000 | 264836 |
| unique | NaN | NaN | NaN | NaN | NaN | 114 | NaN | NaN | 20 |
| top | NaN | NaN | NaN | NaN | NaN | Kettle Mozzarella Basil & Pesto 175g | NaN | NaN | KETTLE |
| freq | NaN | NaN | NaN | NaN | NaN | 3304 | NaN | NaN | 41288 |
| mean | 43464.036260 | 135.08011 | 1.355495e+05 | 1.351583e+05 | 56.583157 | NaN | 1.907309 | 7.304200 | NaN |
| std | 105.389282 | 76.78418 | 8.057998e+04 | 7.813303e+04 | 32.826638 | NaN | 0.643654 | 3.083226 | NaN |
| min | 43282.000000 | 1.00000 | 1.000000e+03 | 1.000000e+00 | 1.000000 | NaN | 1.000000 | 1.500000 | NaN |
| 25% | 43373.000000 | 70.00000 | 7.002100e+04 | 6.760150e+04 | 28.000000 | NaN | 2.000000 | 5.400000 | NaN |
| 50% | 43464.000000 | 130.00000 | 1.303575e+05 | 1.351375e+05 | 56.000000 | NaN | 2.000000 | 7.400000 | NaN |
| 75% | 43555.000000 | 203.00000 | 2.030942e+05 | 2.027012e+05 | 85.000000 | NaN | 2.000000 | 9.200000 | NaN |
| max | 43646.000000 | 272.00000 | 2.373711e+06 | 2.415841e+06 | 114.000000 | NaN | 200.000000 | 650.000000 | NaN |
We can see that our most purchased item is the Kettle Mozzarella Basil & Pesto 175g, present in 3,304 transactions, from our top brand KETTLE with 41,288 transactions. The summary above also reveals two outliers in PROD_QTY and TOT_SALES: a maximum product quantity of 200 against a mean of 1.9 and a standard deviation of 0.64 indicates that this 200-unit purchase is an extreme value. Also note that the DATE and category columns are not stored in the most appropriate datatypes; below, I will convert PROD_NAME and BRAND from object to string and parse the DATE column into a proper datetime.
# -----------------------------------------------------------
# Removing Extreme Outliers in PROD_QTY
# -----------------------------------------------------------
# describe() showed a max quantity of 200 against a mean of ~1.9
# (std ~0.64): two transactions record an implausible 200-unit
# purchase, so we drop them to keep totals and averages realistic.
# -----------------------------------------------------------
# Inspect the largest quantities (notebook display; the 200-unit
# pair sits at the top of this view)
td_df_cleaned.sort_values(by="PROD_QTY", ascending=False)
# Filter on the quantity itself rather than hard-coded row labels:
# drop(index=[69762, 69763]) silently depends on the current index
# and raises KeyError if the cell is re-run after the drop.
td_df_cleaned = td_df_cleaned[td_df_cleaned["PROD_QTY"] < 200]
# -----------------------------------------------------------
# Fixing Date Format and Ensuring Proper Data Types
# -----------------------------------------------------------
# DATE arrives as an Excel serial day number; origin 1899-12-30 is
# the standard Excel epoch, so unit='D' yields true timestamps.
# PROD_NAME and BRAND become pandas 'string' dtype for consistency.
# -----------------------------------------------------------
td_df_cleaned['DATE'] = pd.to_datetime(td_df_cleaned['DATE'], unit='D', origin='1899-12-30')
td_df_cleaned = td_df_cleaned.astype({"PROD_NAME": "string", "BRAND": "string"})
# Confirm the new dtypes and the two-row reduction
td_df_cleaned.info()
<class 'pandas.core.frame.DataFrame'> Index: 264834 entries, 0 to 264835 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 DATE 264834 non-null datetime64[ns] 1 STORE_NBR 264834 non-null int64 2 LYLTY_CARD_NBR 264834 non-null int64 3 TXN_ID 264834 non-null int64 4 PROD_NBR 264834 non-null int64 5 PROD_NAME 264834 non-null string 6 PROD_QTY 264834 non-null int64 7 TOT_SALES 264834 non-null float64 8 BRAND 264834 non-null string dtypes: datetime64[ns](1), float64(1), int64(5), string(2) memory usage: 20.2 MB
The transaction dataset above has been cleaned: all outliers have been removed and column data types have been converted for consistency and efficiency. Now we will merge both processed dataframes on the customer ID and proceed with the analysis.
# -----------------------------------------------------------
# Merging Transaction + Customer Data
# -----------------------------------------------------------
# Join both datasets using the loyalty card number, check for
# merge-created duplicates, and remove them to keep the data clean.
# -----------------------------------------------------------
# Join customer attributes onto each transaction via the loyalty card number
merged_data = td_df_cleaned.merge(pb_df_cleaned, how="left", on="LYLTY_CARD_NBR")
# Surface any fully duplicated rows introduced by the merge (notebook display)
merged_data[merged_data.duplicated()]
# Keep the first occurrence of each duplicated row
merged_data = merged_data.drop_duplicates()
# Re-check: this display should now come back empty
merged_data[merged_data.duplicated()]
| DATE | STORE_NBR | LYLTY_CARD_NBR | TXN_ID | PROD_NBR | PROD_NAME | PROD_QTY | TOT_SALES | BRAND | LIFESTAGE | PREMIUM_CUSTOMER |
|---|
During data preparation, I identified a duplicate transaction with Transaction ID 108462 that appeared during the merging stage. This duplicate was removed to maintain data integrity. With that, the dataset is now cleaned and ready for exploratory data analysis.
Next, I will extract package sizes from the dataset to enrich our analysis.
# -----------------------------------------------------------
# Extracting Package Sizes and Validating Units
# -----------------------------------------------------------
# Pull numeric pack sizes (in grams) from product names, check for
# any malformed entries, and review basic stats to ensure values
# look reasonable.
# -----------------------------------------------------------
# Extract the first run of digits immediately followed by "g"/"G"
# (case-insensitive lookahead) as the pack weight in grams; rows
# without such a pattern become NaN.
merged_data["PACKAGE SIZE"] = merged_data["PROD_NAME"].str.extract(
    r"(?i)(\d+)(?=g)", expand=False
).astype(float)
# Subset of products whose names contain any digits at all
numbered = merged_data[merged_data["PROD_NAME"].str.contains(r"\d+", regex=True, na=False)]
# Of those, find names where no number is followed by a "g" —
# i.e. a size in some other unit, or a stray number
non_g_products = numbered[
    ~numbered["PROD_NAME"].str.contains(r"(?i)\d+g", regex=True, na=False)
]
# An empty frame here means every numbered name encodes grams
print(non_g_products[["PROD_NAME"]])
# Missing-value audit, including the new PACKAGE SIZE column
print(merged_data.isna().sum())
# Sanity-check the size distribution (output shows 70g-380g)
print(merged_data["PACKAGE SIZE"].describe())
Empty DataFrame Columns: [PROD_NAME] Index: [] DATE 0 STORE_NBR 0 LYLTY_CARD_NBR 0 TXN_ID 0 PROD_NBR 0 PROD_NAME 0 PROD_QTY 0 TOT_SALES 0 BRAND 0 LIFESTAGE 0 PREMIUM_CUSTOMER 0 PACKAGE SIZE 0 dtype: int64 count 264833.000000 mean 182.425540 std 64.325268 min 70.000000 25% 150.000000 50% 170.000000 75% 175.000000 max 380.000000 Name: PACKAGE SIZE, dtype: float64
Now that I've created the package size column and validated our units, the next step is to verify that there are no missing dates in our transaction log, that is to ensure the transactions are continuous.
# -----------------------------------------------------------
# Checking for Missing Transaction Dates + Daily Volume Plot
# -----------------------------------------------------------
# Count transactions per day, detect missing calendar dates,
# and visualize daily transaction volume.
# -----------------------------------------------------------
# Transactions per calendar day
daily_order_count = (
    merged_data
    .groupby("DATE")
    .size()
    .reset_index(name="count")
)
print(daily_order_count)
# Every calendar date between the first and last observed transaction
full_range = pd.date_range(
    start=daily_order_count["DATE"].min(),
    end=daily_order_count["DATE"].max()
)
# Dates in the full range with no transactions at all
observed_dates = set(daily_order_count["DATE"])
missing_dates = [day for day in full_range if day not in observed_dates]
print(missing_dates)
# Visualize daily transaction volume over the whole period
plt.plot(daily_order_count["DATE"], daily_order_count["count"])
plt.title("Transactions per Day")
plt.xlabel("Date")
plt.ylabel("Number of Transactions")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
DATE count
0 2018-07-01 724
1 2018-07-02 711
2 2018-07-03 722
3 2018-07-04 714
4 2018-07-05 712
.. ... ...
359 2019-06-26 723
360 2019-06-27 709
361 2019-06-28 730
362 2019-06-29 745
363 2019-06-30 744
[364 rows x 2 columns]
[Timestamp('2018-12-25 00:00:00')]
# -----------------------------------------------------------
# 🎄 December Daily Transaction Trend
# -----------------------------------------------------------
# Focus on December to spot seasonal spikes or dips in activity.
# A line + scatter combo helps show both the trend and daily detail.
# -----------------------------------------------------------
# Restrict the daily counts to December
in_december = daily_order_count["DATE"].dt.month == 12
december_data = daily_order_count[in_december]
# Line shows the trend; red scatter marks each individual day
plt.plot(december_data["DATE"], december_data["count"], label="Transaction Line")
plt.scatter(december_data["DATE"], december_data["count"], color='red', s=10, label="Daily Points")
plt.title("Transactions per Day in December")
plt.xlabel("Date")
plt.ylabel("Transaction Count")
plt.xticks(rotation=45, fontsize="x-small")
plt.legend()
plt.tight_layout()
plt.show()
We observe a rise in sales during the middle and toward the end of December, followed by a return to normal trends. There is a clear spike in transactions just before Christmas, reflecting the holiday season’s impact on consumer behavior. December 25th is missing from the dataset, which makes sense because stores are closed that day, confirming that the gap is expected rather than an error.
With the transaction pattern validated and no inconsistencies remaining, we can now move on to visualizing customer counts and sales proportions across our segments and lifestages. We begin with a holistic view to understand how each group contributes to total sales and total customers. This foundation will guide the selection of which segment and lifestage should be prioritized in our next marketing campaign.
# -----------------------------------------------------------
# Preparing Membership and Sales Distribution Charts
# -----------------------------------------------------------
# These charts show how customers and total sales are distributed
# across the three membership groups. Comma formatting is added
# to the y axis to make the values easier to read.
# -----------------------------------------------------------
# Tick formatters for matplotlib's FuncFormatter. `pos` is the tick
# position required by the callback signature and is unused here.
def comma_formatter(x, pos):
    """Render a tick value with thousands separators and no decimals."""
    return format(x, ",.0f")

def dollar_formatter(x, pos):
    """Render a tick value as whole dollars with thousands separators."""
    return "$" + format(x, ",.0f")
# Bar/segment display order shared by all membership charts
category_order = ["Mainstream", "Budget", "Premium"]
# One fixed color per membership tier so every chart stays comparable
color_map = {
    "Mainstream": "#1f77b4",
    "Premium": "#ff7f0e",
    "Budget": "#2ca02c"
}
# Colors resolved in display order
colors = list(map(color_map.__getitem__, category_order))
# --------------------------------------------------------
# 1️⃣ Membership Status Distribution (Comma-Formatted Axis)
# --------------------------------------------------------
# Customer counts per membership tier, in the fixed display order
membership_status = (
    pb_df_cleaned["PREMIUM_CUSTOMER"]
    .value_counts()
    .reindex(category_order)
)
fig, ax = plt.subplots(figsize=(7, 5))
ax.bar(category_order, membership_status.values, color=colors)
ax.set_title("Membership Status Distribution")
ax.set_xlabel("Membership Status")
ax.set_ylabel("Customers")
# Thousands separators on the y-axis
ax.yaxis.set_major_formatter(FuncFormatter(comma_formatter))
# Formatted count just above each bar
for pos, count in enumerate(membership_status.values):
    ax.text(pos, count + 300, f"{count:,}", ha='center', fontsize='small')
fig.tight_layout()
plt.show()
# --------------------------------------------------------
# 2️⃣ Total Sales by Membership Status (Comma-Formatted Axis)
# --------------------------------------------------------
# Total sales per membership tier, in the fixed display order
sales_per_stat = (
    merged_data
    .groupby("PREMIUM_CUSTOMER")["TOT_SALES"]
    .sum()
    .reindex(category_order)
)
fig, ax = plt.subplots(figsize=(7, 5))
ax.bar(category_order, sales_per_stat.values, color=colors)
ax.set_title("Total Sales by Membership Status")
ax.set_xlabel("Membership Status")
ax.set_ylabel("Total Sales")
# Dollar formatting with thousands separators on the y-axis
ax.yaxis.set_major_formatter(FuncFormatter(dollar_formatter))
# Formatted dollar amount just above each bar
for pos, amount in enumerate(sales_per_stat.values):
    ax.text(pos, amount + 10000, f"${amount:,.2f}", ha='center', fontsize='small')
fig.tight_layout()
plt.show()
Note: Most sales come from Mainstream customers, followed by Budget and then Premium, which aligns with how each group contributes to the overall customer base. Next, we will break this down by customer lifestage to see which lifestage groups drive the most footfall and which generate the highest sales.
# -----------------------------------------------------------
# Cleaning and abbreviating customer life stage labels
# -----------------------------------------------------------
# This helper function standardizes and shortens long lifestage
# names to make plot labels cleaner and more readable.
# The underlying logic remains unchanged.
# -----------------------------------------------------------
def auto_abbrev(name):
    """Shorten a LIFESTAGE label (e.g. 'YOUNG SINGLES/COUPLES' -> 'Young S/C')."""
    label = name.upper().strip()  # normalize before substituting
    # Literal substitutions, applied in this order, then Title Case
    for long_form, short_form in (
        ("SINGLES/COUPLES", "S/C"),
        ("FAMILIES", "fam"),
        ("YOUNG", "young"),
        ("OLDER", "old"),
        ("MIDAGE", "Mid"),
    ):
        label = label.replace(long_form, short_form)
    return label.title()
# -----------------------------------------------------------
# Visualizing sales contribution using a mosaic plot
# -----------------------------------------------------------
# This block summarizes total sales and customer counts by
# premium customer segment and life stage. Labels are abbreviated
# for clarity, and a mosaic chart is used to show the proportion
# of sales coming from each segment and life stage combination.
# -----------------------------------------------------------
# Group the data by Premium Customer segment and Life Stage, summing total sales
life_stages_count = merged_data.groupby(["PREMIUM_CUSTOMER", "LIFESTAGE"])["TOT_SALES"].sum().unstack()
# Convert the grouped sales data into a dictionary with keys as (segment, lifestage)
sales_dict = {
(col, index): row[col]
for index, row in life_stages_count.iterrows()
for col in life_stages_count.columns
}
# Apply abbreviation to segment names for cleaner labels
sales_dict = {(auto_abbrev(key[0]), key[1]): value for key, value in sales_dict.items()}
# Total sales for percentage label calculations
total_sales = sum(sales_dict.values())
# Label function for sales mosaic plot
def labelizer(key):
share = sales_dict[key] / total_sales
return f"{share:.1%}"
# Label function for customer count mosaic plot
def labelizer2(key):
value = customer_dict[key] / total_cust
return f"{value:.1%}"
# Tile coloring: the second key element is the Premium Customer segment
def props(key):
    """Return facecolor properties for a mosaic tile based on its segment."""
    segment = key[1]
    return {'facecolor': color_map.get(segment, '#cccccc')}
# Count unique customers (distinct loyalty cards) by segment and life stage.
# After unstack(): rows = PREMIUM_CUSTOMER, columns = LIFESTAGE.
customer_count = merged_data.groupby(["PREMIUM_CUSTOMER", "LIFESTAGE"])["LYLTY_CARD_NBR"].nunique().unstack()
# Total unique customers across the cleaned behavior table.
# NOTE(review): this denominator comes from pb_df_cleaned (all customers),
# while the counts above come from merged_data (customers with transactions);
# tile shares may not sum to exactly 100% if some customers never
# transacted — confirm against the data.
total_cust = pb_df_cleaned["LYLTY_CARD_NBR"].nunique()
# Flatten into a dict keyed (LIFESTAGE, PREMIUM_CUSTOMER), as for sales
customer_dict = {
    (col, index): row[col]
    for index, row in customer_count.iterrows()
    for col in customer_count.columns
}
# Abbreviate the lifestage part of each key for cleaner plot labels
customer_dict = {(auto_abbrev(key[0]), key[1]): value for key, value in customer_dict.items()}
# Render the sales mosaic: tile area reflects each combination's share
# of total sales; labelizer prints the percentage on each tile.
fig, ax = plt.subplots(figsize=(12, 8))
mosaic(
    sales_dict,
    ax=ax,
    gap=0.005,
    properties=props,
    labelizer=labelizer,
    title="Contribution to Total Sales by Lifestage within Each Customer Segment",
)
plt.show()
As illustrated above, Budget Old Families contribute the largest share of total sales at 8.7 percent. This is followed closely by Mainstream Young Singles and Couples at 8.2 percent, with Mainstream Retirees right behind at 8.1 percent. Premium Old Singles and Couples also play a notable role, accounting for 6.8 percent of total sales. One key observation is that high sales contribution does not always reflect group size. For example, Premium customers are a smaller segment overall, yet Premium Old Singles and Couples overperform relative to their population share. On the other hand, some larger groups, such as Mainstream Young Families, contribute proportionally less.
# -----------------------------------------------------------
# Mosaic Plot for Customer Contribution
# -----------------------------------------------------------
# Companion chart to the sales mosaic: tile area now reflects each
# lifestage/segment combination's share of the customer base rather
# than its share of sales, making over/under-performers easy to spot.
# -----------------------------------------------------------
fig, ax = plt.subplots(figsize=(12, 8))
mosaic(
    data=customer_dict,
    ax=ax,
    gap=0.005,
    properties=props,
    labelizer=labelizer2,
    title="Contribution to Total Customers by Lifestage within Each Customer Segment",
)
plt.show()
The customer distribution reveals that Mainstream Young Singles and Couples make up the largest share of the customer base at 11 percent, yet they do not translate that size into proportionate sales. Meanwhile, Budget Older Families stand out because they overperform relative to their size. They represent just over 6 percent of customers but deliver nearly 9 percent of total sales, showing stronger purchasing intensity than larger groups.
Premium Older Singles and Couples also appear prominently in the customer base at 6.5 percent, but unlike Budget Older Families, their sales contribution does not scale at the same rate. This mismatch across groups highlights that sales performance is not simply a function of customer volume, but rather driven by behaviors such as how often customers buy, how many units they purchase, and which products they gravitate toward.
# -----------------------------------------------------------
# Average Units per Transaction by Lifestage and Segment
# This computes how many chip units each group buys per trip
# and visualizes differences across lifestages and segments.
# -----------------------------------------------------------
# Mean PROD_QTY per transaction row: rows = LIFESTAGE, columns = PREMIUM_CUSTOMER
avg_units = merged_data.groupby(["LIFESTAGE", "PREMIUM_CUSTOMER"])["PROD_QTY"].mean().unstack()
# Bar colors follow the segment color_map (gray fallback). After unstack()
# the columns are segment names only, so the previous `col != 'LIFESTAGE'`
# filter was dead code and has been removed.
bar_colors = [color_map.get(col, '#cccccc') for col in avg_units.columns]
# Plot grouped bar chart (one group per lifestage, one bar per segment)
ax = avg_units.plot(kind='bar', color=bar_colors, figsize=(12, 6))
# Add value labels on top of each bar — same bar_label style as the
# unit-price chart, replacing the equivalent manual annotate loop
for container in ax.containers:
    ax.bar_label(container, fmt='%.2f', padding=3, fontsize=8)
# Add plot title and axis labels
plt.title('Average Units per Transaction by Lifestage and Customer Segment')
plt.xlabel('Lifestage')
plt.ylabel('Average Units')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
Based on the chart above, Mainstream Young Singles and Couples purchase more units per transaction than their Budget and Premium counterparts, suggesting a stronger inclination toward larger basket sizes. A similar trend appears for Midage Singles and Couples, though the gap between segments is smaller.
To understand whether pricing behavior also contributes to these sales patterns, we now examine the average price paid per unit across customer segments below
# -----------------------------------------------------------
# Average Price per Unit by Lifestage and Customer Segment
# This block calculates the price per unit and visualizes
# how much each lifestage and segment pays on average.
# -----------------------------------------------------------
# Derive unit price per transaction row (side effect: adds a PRICE
# column to merged_data that persists for later cells).
merged_data["PRICE"] = merged_data["TOT_SALES"] / merged_data["PROD_QTY"]
# Mean unit price: rows = LIFESTAGE, columns = PREMIUM_CUSTOMER
avg_price_count = merged_data.groupby(["LIFESTAGE", "PREMIUM_CUSTOMER"])["PRICE"].mean().unstack()
# Segment-consistent bar colors (gray fallback for unknown segments)
bar_colors = [color_map.get(col, '#cccccc') for col in avg_price_count.columns]
# Plot the average unit price as a grouped bar chart
ax = avg_price_count.plot(
    kind="bar",
    figsize=(12, 6),
    color=bar_colors,
    title="Average Price per Unit by Lifestage and Customer Segment"
)
# Axis labels
ax.set_xlabel("Lifestage")
ax.set_ylabel("Average Unit Price")
# Rotate x-axis labels for readability
plt.xticks(rotation=45)
# Annotate each bar with its value (3-point padding above the bar)
for container in ax.containers:
    ax.bar_label(container, fmt='%.2f', padding=3, fontsize=8)
plt.tight_layout()
plt.show()
It appears that Mainstream Young Singles and Couples, as well as Midage Singles and Couples, are willing to pay more per unit than their Premium and Budget counterparts. This points to a degree of price insensitivity within these Mainstream groups, who also purchase more units per transaction. Premium customers, on the other hand, likely buy chips less frequently and may gravitate toward specialty or healthier options, which could explain their lower volumes and lower average unit prices. These patterns are consistent with the smaller customer counts we observe among Premium Young and Midage Singles and Couples. To confirm whether these price differences are meaningful, we next perform an independent t-test comparing Mainstream unit prices against the rest of the population.
After confirming the price difference, we will examine brand affinity to determine which brands are most preferred by Mainstream vs. Non-Mainstream customers. This will guide inventory planning and marketing strategies.
# -----------------------------------------------------------
# One-tailed Independent T-Test for Price Sensitivity
# -----------------------------------------------------------
# This block compares whether Mainstream Singles/Couples pay
# more per unit than their non-Mainstream counterparts.
# The test is restricted to Young and Midage Singles/Couples
# to match patterns observed earlier in the analysis.
# H0: mean unit price (Mainstream) <= mean unit price (Non-Mainstream)
# H1: mean unit price (Mainstream) >  mean unit price (Non-Mainstream)
# -----------------------------------------------------------
# Price per unit for each transaction row (same formula as the PRICE
# column computed earlier; side effect: adds price_per_unit to merged_data)
merged_data["price_per_unit"] = merged_data["TOT_SALES"] / merged_data["PROD_QTY"]
# Restrict to the two target lifestages
ym_sg_cp = merged_data[
    merged_data["LIFESTAGE"].isin(["YOUNG SINGLES/COUPLES", "MIDAGE SINGLES/COUPLES"])
].copy()
# Label Mainstream vs Non-Mainstream (the .copy() above lets us add a
# column without a SettingWithCopy warning)
ym_sg_cp["GROUP"] = np.where(
    ym_sg_cp["PREMIUM_CUSTOMER"] == "Mainstream", "MS", "Not MS"
)
# Split into the two comparison groups
ms_cust = ym_sg_cp[ym_sg_cp["GROUP"] == "MS"]
nms_cust = ym_sg_cp[ym_sg_cp["GROUP"] == "Not MS"]
# Extract the price-per-unit series for each group
pp_unit_ms = ms_cust["price_per_unit"]
pp_unit_nms = nms_cust["price_per_unit"]
# Welch t-test (unequal variances), one-tailed: Mainstream > Non-Mainstream
t_stat, p_value = ttest_ind(
    pp_unit_ms, pp_unit_nms,
    alternative='greater',
    equal_var=False
)
# Group means
mean_ms = pp_unit_ms.mean()
mean_nms = pp_unit_nms.mean()
# Sample sizes and unbiased (ddof=1) sample variances
n1, n2 = len(pp_unit_ms), len(pp_unit_nms)
var1 = np.var(pp_unit_ms, ddof=1)
var2 = np.var(pp_unit_nms, ddof=1)
# Welch-Satterthwaite approximation of the degrees of freedom
df = (var1/n1 + var2/n2)**2 / (
    (var1**2)/(n1**2 * (n1 - 1)) +
    (var2**2)/(n2**2 * (n2 - 1))
)
# Standard error of the difference in means
se = np.sqrt(var1/n1 + var2/n2)
# One-sided 95% CI for the mean difference: [diff - t_crit*se, +inf)
t_crit = t.ppf(0.95, df)
mean_diff = mean_ms - mean_nms
ci_lower = mean_diff - t_crit * se
ci_upper = np.inf
# Report results (the 2.2e-16 floor mirrors R's conventional display limit)
print(f"T-statistic: {t_stat:.2f}")
print(f"Degrees of freedom: {df:.0f}")
if p_value < 2.2e-16:
    print("P-value: < 2.2e-16")
else:
    print(f"P-value: {p_value:.20e}")
print(f"Means: Mainstream = {mean_ms:.6f}, Not Mainstream = {mean_nms:.6f}")
print(f"95% Confidence Interval (one-tailed): [{ci_lower:.6f}, ∞)")
T-statistic: 40.61 Degrees of freedom: 58792 P-value: < 2.2e-16 Means: Mainstream = 4.045586, Not Mainstream = 3.688165 95% Confidence Interval (one-tailed): [0.342943, ∞)
The t-test returned a p-value < 2.2e-16, confirming that Mainstream Young and Midage Singles/Couples pay significantly higher unit prices than their Budget and Premium peers within the same lifestage groups.
Despite being willing to pay more per unit, Mainstream Young Singles and Couples purchase the fewest units per trip. This presents a clear opportunity to drive growth by increasing basket size rather than raising prices. Tactics such as small multi-buy promotions and better shelf visibility for their preferred brands and pack sizes could meaningfully lift sales, while ensuring strong availability of premium options remains important given their higher willingness to pay. With this in mind, we now examine their brand and size preferences to guide inventory planning.
# -----------------------------------------------------------
# Select Mainstream Young Singles/Couples and all other customers
# -----------------------------------------------------------
# Build the target-group mask once and reuse it for both splits,
# so the two selections are guaranteed to be exact complements.
_target_mask = (
    (merged_data["LIFESTAGE"] == "YOUNG SINGLES/COUPLES")
    & (merged_data["PREMIUM_CUSTOMER"] == "Mainstream")
)
# 1. Target group: Mainstream Young Singles/Couples
ms_yg_sc = merged_data[_target_mask]
# 2. Comparison group: everyone else
nms_n_ygsc = merged_data[~_target_mask]
# -----------------------------------------------------------
# Brand Affinity Analysis Comparing Mainstream vs Others
# -----------------------------------------------------------
# Affinity = (brand's share of target-group quantity)
#          / (brand's share of everyone-else quantity).
# Scores above 1 mean the target group over-indexes on the brand.
# Total quantity purchased by each group (denominators for the shares)
qty_ms = ms_yg_sc["PROD_QTY"].sum()
qty_nms = nms_n_ygsc["PROD_QTY"].sum()
# Brand purchase share within Mainstream Young Singles/Couples
ms_brand_shares = (
    ms_yg_sc.groupby("BRAND")["PROD_QTY"].sum()
    .div(qty_ms)
    .reset_index(name="Mainstream")
)
# Brand purchase share within all other customers
nms_brand_shares = (
    nms_n_ygsc.groupby("BRAND")["PROD_QTY"].sum()
    .div(qty_nms)
    .reset_index(name="Other")
)
# Align brands across both groups.
# NOTE(review): the left join keeps only brands bought by the target group;
# a brand purchased solely by other customers is dropped — confirm this
# is intended before relying on the full brand list.
brand_proportions = pd.merge(
    ms_brand_shares, nms_brand_shares, how="left", on="BRAND"
)
# Affinity score shows relative preference for each brand
brand_proportions["affinity"] = (
    brand_proportions["Mainstream"] / brand_proportions["Other"]
)
# Sorted view for inspection (unassigned: notebook display only;
# brand_proportions itself keeps its original row order)
brand_proportions.sort_values(by="affinity", ascending=False)
| BRAND | Mainstream | Other | affinity | |
|---|---|---|---|---|
| 18 | TYRRELLS | 0.029587 | 0.023933 | 1.236230 |
| 17 | TWISTIES | 0.043306 | 0.035283 | 1.227396 |
| 8 | KETTLE | 0.185649 | 0.154217 | 1.203818 |
| 16 | TOSTITOS | 0.042581 | 0.035377 | 1.203633 |
| 10 | OLD_EL_PASO | 0.041598 | 0.034753 | 1.196953 |
| 11 | PRINGLES | 0.111980 | 0.093744 | 1.194531 |
| 5 | DORITOS | 0.122877 | 0.105278 | 1.167171 |
| 4 | COBS | 0.041856 | 0.036375 | 1.150696 |
| 7 | INFUZIONS | 0.060649 | 0.053157 | 1.140942 |
| 15 | THINS | 0.056611 | 0.053084 | 1.066440 |
| 3 | CHEEZELS | 0.016851 | 0.017370 | 0.970137 |
| 14 | SUNBITES | 0.036628 | 0.040771 | 0.898373 |
| 13 | SMITHS | 0.093420 | 0.121710 | 0.767559 |
| 6 | FRENCH_FRIES | 0.003702 | 0.005364 | 0.690110 |
| 2 | CHEETOS | 0.007533 | 0.011240 | 0.670143 |
| 12 | RED_ROCK_DELI | 0.045377 | 0.068427 | 0.663146 |
| 9 | NATURAL_CHIP_CO | 0.018379 | 0.028741 | 0.639449 |
| 1 | CCS | 0.010484 | 0.017602 | 0.595596 |
| 19 | WOOLWORTHS | 0.028189 | 0.057429 | 0.490852 |
| 0 | BURGER RINGS | 0.002744 | 0.006145 | 0.446535 |
# -----------------------------------------------------------
# Visualizing Brand Affinity Scores
# -----------------------------------------------------------
# Horizontal bar chart of brand affinity for Mainstream Young
# Singles/Couples. Green bars (affinity > 1) mark brands this group
# over-indexes on; gray bars mark under-indexed brands. A dashed
# line at 1 gives the neutral reference point.
# Ascending sort so the strongest affinities render at the top
plot_brand_df = brand_proportions.sort_values("affinity", ascending=True)
# Green when over-indexed, gray otherwise
colors = ["#2ca02c" if score > 1 else "gray" for score in plot_brand_df["affinity"]]
plt.figure(figsize=(10, 8))
plt.barh(plot_brand_df["BRAND"], plot_brand_df["affinity"], color=colors)
# Numeric labels just past the end of each bar
for pos, score in enumerate(plot_brand_df["affinity"]):
    plt.text(score + 0.02, pos, f"{score:.2f}", va="center", fontsize=9)
# Neutral reference line at affinity = 1
plt.axvline(1, color="black", linestyle="--", alpha=0.6)
plt.title("Brand Affinity – Mainstream Young S/C vs Other Customers")
plt.xlabel("Affinity Score")
plt.ylabel("Brand")
plt.grid(axis="x", linestyle="--", alpha=0.3)
plt.tight_layout()
plt.show()
Our Mainstream customers are 24%, 23%, and 20% more likely to purchase Tyrrells, Twisties, and Kettle, respectively.
# -----------------------------------------------------------
# Calculating Pack Size Affinity
# -----------------------------------------------------------
# Same affinity construction as the brand analysis, but grouped by
# PACKAGE SIZE: target-group share of quantity divided by the
# everyone-else share. Scores above 1 indicate sizes the target
# group buys disproportionately often.
# Pack size share of quantity within Mainstream Young Singles/Couples
ms_pack_size = (
    ms_yg_sc.groupby("PACKAGE SIZE")["PROD_QTY"]
    .sum()
    .div(qty_ms)
    .reset_index(name="ms_packsize_prop")
    .sort_values(by="ms_packsize_prop", ascending=False)
)
# Pack size share of quantity within all other customers
nms_pack_size = (
    nms_n_ygsc.groupby("PACKAGE SIZE")["PROD_QTY"]
    .sum()
    .div(qty_nms)
    .reset_index(name="nms_packsize_other")
    .sort_values(by="nms_packsize_other")
)
# Align the two share tables and compute the affinity ratio.
# NOTE(review): left join keeps only sizes bought by the target group,
# mirroring the brand-affinity merge — confirm that is intended.
size_affinity = pd.merge(ms_pack_size, nms_pack_size, how="left", on="PACKAGE SIZE")
size_affinity["size_affinity"] = size_affinity["ms_packsize_prop"] / size_affinity["nms_packsize_other"]
# Strongest affinities first
size_affinity = size_affinity.sort_values(by="size_affinity", ascending=False)
size_affinity
| PACKAGE SIZE | ms_packsize_prop | nms_packsize_other | size_affinity | |
|---|---|---|---|---|
| 9 | 270.0 | 0.029846 | 0.023377 | 1.276688 |
| 8 | 380.0 | 0.030156 | 0.023832 | 1.265356 |
| 5 | 330.0 | 0.057465 | 0.046727 | 1.229809 |
| 2 | 134.0 | 0.111980 | 0.093744 | 1.194531 |
| 3 | 110.0 | 0.099658 | 0.083643 | 1.191477 |
| 10 | 210.0 | 0.027309 | 0.023401 | 1.166997 |
| 11 | 135.0 | 0.013849 | 0.012180 | 1.136992 |
| 12 | 250.0 | 0.013460 | 0.011905 | 1.130606 |
| 4 | 170.0 | 0.075740 | 0.075440 | 1.003976 |
| 6 | 300.0 | 0.054954 | 0.057264 | 0.959675 |
| 0 | 175.0 | 0.239102 | 0.251514 | 0.950653 |
| 1 | 150.0 | 0.155130 | 0.163447 | 0.949118 |
| 7 | 165.0 | 0.052185 | 0.058004 | 0.899677 |
| 14 | 190.0 | 0.007015 | 0.011590 | 0.605253 |
| 17 | 180.0 | 0.003365 | 0.005651 | 0.595457 |
| 15 | 160.0 | 0.006005 | 0.011526 | 0.521044 |
| 16 | 90.0 | 0.005954 | 0.011719 | 0.508041 |
| 19 | 125.0 | 0.002821 | 0.005623 | 0.501744 |
| 13 | 200.0 | 0.008413 | 0.017379 | 0.484084 |
| 18 | 70.0 | 0.002847 | 0.005889 | 0.483474 |
| 20 | 220.0 | 0.002744 | 0.006145 | 0.446535 |
# -----------------------------------------------------------
# Plotting Pack Size Affinity for Mainstream Young S/C
# -----------------------------------------------------------
# Horizontal bar chart of pack-size affinity. Green bars (>1) are
# sizes the target group buys more often than everyone else; gray
# bars are under-indexed sizes. Dashed line marks neutral affinity.
# Ascending sort so the highest-affinity sizes render at the top
plot_size_df = size_affinity.sort_values("size_affinity", ascending=True)
# Green when over-indexed, gray otherwise
colors = ["#2ca02c" if score > 1 else "gray" for score in plot_size_df["size_affinity"]]
plt.figure(figsize=(10, 8))
# String labels keep the y-axis categorical rather than numeric
y_labels = plot_size_df["PACKAGE SIZE"].astype(str)
plt.barh(y_labels, plot_size_df["size_affinity"], color=colors)
# Numeric labels just past the end of each bar
for pos, score in enumerate(plot_size_df["size_affinity"]):
    plt.text(score + 0.02, pos, f"{score:.2f}", va="center", fontsize=9)
# Neutral reference line at affinity = 1
plt.axvline(1, color="black", linestyle="--", alpha=0.6)
plt.title("Size Affinity – Mainstream Young S/C vs Other Customers")
plt.xlabel("Affinity Score")
plt.ylabel("Chip Size (g)")
plt.grid(axis="x", linestyle="--", alpha=0.3)
plt.tight_layout()
plt.show()
Based on the information above, our Mainstream customers are 28%, 27%, and 23% more likely to purchase chip bags that are 270g, 380g, and 330g, respectively.
Looking at how our customers shop, it’s clear that Mainstream young singles and couples play a big role in chip sales. They’re noticeably more likely to pick up Tyrrells (24%), Twisties (23%), and Kettle (20%), so these brands should definitely be higher on our procurement priority list.
We see a similar pattern with bag sizes too. Packs around 270g, 380g, and 330g are bought much more often — 28%, 27%, and 23% more likely, respectively. These are the sizes this group gravitates toward the most, which suggests they’re ideal for restocking and maintaining good shelf availability.
Focusing our inventory on these brand-and-size combinations will help us meet demand more consistently, avoid unnecessary stockouts, and keep sales growing. It may also be worth negotiating better terms with suppliers for these high-demand items since they move so quickly.
As always, customer habits can shift over time, so it’s important that we keep monitoring sales and adjust our procurement approach whenever we see changes in what people are buying.