In [1]:
# --------------------------------------
# Start of statistical analysis
# --------------------------------------
# Import the required libraries
import matplotlib.pyplot as plt
import statsmodels as Stats
import pandas as pd
import numpy as np
from scipy.stats import t
In [2]:
# --------------------------------------
# Load the dataset
# --------------------------------------
# Read CSV file
qvi_data = pd.read_csv("C:\\Users\\rageg\\Downloads\\QVI_data.csv")
# Check the initial structure
qvi_data.info()
# Convert DATE column to datetime
qvi_data["DATE"] = pd.to_datetime(qvi_data["DATE"])
# Check structure after conversion
qvi_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 263214 entries, 0 to 263213 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 LYLTY_CARD_NBR 263214 non-null int64 1 DATE 263214 non-null object 2 STORE_NBR 263214 non-null int64 3 TXN_ID 263214 non-null int64 4 PROD_NBR 263214 non-null int64 5 PROD_NAME 263214 non-null object 6 PROD_QTY 263214 non-null int64 7 TOT_SALES 263214 non-null float64 8 PACK_SIZE 263214 non-null int64 9 BRAND 263214 non-null object 10 LIFESTAGE 263214 non-null object 11 PREMIUM_CUSTOMER 263214 non-null object dtypes: float64(1), int64(6), object(5) memory usage: 24.1+ MB <class 'pandas.core.frame.DataFrame'> RangeIndex: 263214 entries, 0 to 263213 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 LYLTY_CARD_NBR 263214 non-null int64 1 DATE 263214 non-null datetime64[ns] 2 STORE_NBR 263214 non-null int64 3 TXN_ID 263214 non-null int64 4 PROD_NBR 263214 non-null int64 5 PROD_NAME 263214 non-null object 6 PROD_QTY 263214 non-null int64 7 TOT_SALES 263214 non-null float64 8 PACK_SIZE 263214 non-null int64 9 BRAND 263214 non-null object 10 LIFESTAGE 263214 non-null object 11 PREMIUM_CUSTOMER 263214 non-null object dtypes: datetime64[ns](1), float64(1), int64(6), object(4) memory usage: 24.1+ MB
In [3]:
# --------------------------------------
# Create monthly identifiers and unit price
# --------------------------------------
# Convert DATE to MONTH_ID formatted as YYYYMM
qvi_data["MONTH_ID"] = qvi_data["DATE"].dt.strftime("%Y%m").astype(int)
# Compute unit price
qvi_data["UNIT_PRICE"] = qvi_data["TOT_SALES"] / qvi_data["PROD_QTY"]
qvi_data
# --------------------------------------
# Group by store and month to create monthly metrics
# --------------------------------------
qvi_grouped = qvi_data.groupby(
["STORE_NBR", "MONTH_ID"]
).agg(
TOTAL_SALES=("TOT_SALES", "sum"),
CUSTOMERS=("LYLTY_CARD_NBR", "nunique"),
TRANSACTIONS=("TXN_ID", "nunique"),
AVG_PRICE_PER_UNIT=("UNIT_PRICE", "mean"),
CHIPS_SOLD=("PROD_QTY", "sum")
).reset_index()
# Compute additional derived metrics
qvi_grouped["CHIPS_PER_CUSTOMER"] = qvi_grouped["CHIPS_SOLD"] / qvi_grouped["CUSTOMERS"]
qvi_grouped["TXN_PER_CUSTOMER"] = qvi_grouped["TRANSACTIONS"] / qvi_grouped["CUSTOMERS"]
In [4]:
# --------------------------------------
# Filter to pre-trial period and full observation stores
# --------------------------------------
# Copy grouped data
df_full = qvi_grouped.copy()
# Keep pre-trial months only
df_pre = df_full[df_full["MONTH_ID"] < 201902]
# Count months per store
full_obs_counts = df_full["STORE_NBR"].value_counts()
# Identify stores with all twelve required observations
stores_with_12 = full_obs_counts[full_obs_counts == 12].index
# Keep only pre-trial months for stores with full observations
qvi_grouped_full_obs = df_pre[df_pre["STORE_NBR"].isin(stores_with_12)]
In [5]:
# --------------------------------------
# Define trial stores and metrics
# --------------------------------------
trials = [77, 86, 88]
# Metrics used for correlation and magnitude distance
metrics = ["TOTAL_SALES", "CUSTOMERS"]
# Identify control stores
controls = [
store for store in qvi_grouped_full_obs["STORE_NBR"].unique()
if store not in trials
]
# --------------------------------------
# Magnitude distance function
# --------------------------------------
def mag_dist(df, trial_stores, mets):
# Split trial and control data
trials_df = df[df["STORE_NBR"].isin(trial_stores)]
controls_df = df[~df["STORE_NBR"].isin(trial_stores)]
# Merge on month to compare trial and control stores
new_df = pd.merge(
controls_df,
trials_df,
on="MONTH_ID",
suffixes=("_CNTRL", "_TRIAL")
)
# Initialize comparison table
comparison_df = new_df[["STORE_NBR_CNTRL", "STORE_NBR_TRIAL"]].copy()
# Compute absolute distance for each metric
for metric in mets:
comparison_df[metric + "_abs_dist"] = (
new_df[metric + "_CNTRL"] - new_df[metric + "_TRIAL"]
).abs()
# Standardize distances using min-max scaling (flipped so higher is more similar)
for metric in mets:
col = metric + "_abs_dist"
min_val = comparison_df[col].min()
max_val = comparison_df[col].max()
if max_val > min_val:
scaled = (comparison_df[col] - min_val) / (max_val - min_val)
flipped = 1 - scaled
comparison_df[col.replace("_abs_dist", "_std_dist")] = flipped
else:
comparison_df[col.replace("_abs_dist", "_std_dist")] = 1.0
# Select standardized distance columns
std_list = [metric + "_std_dist" for metric in mets]
std_dist = comparison_df[
["STORE_NBR_CNTRL", "STORE_NBR_TRIAL"] + std_list
]
# Average similarity across months
mean_distances = (
std_dist.groupby(["STORE_NBR_CNTRL", "STORE_NBR_TRIAL"])
.mean()
.reset_index()
)
return mean_distances
# --------------------------------------
# Correlation function
# --------------------------------------
def p_corr_wide(df, trial_stores, mets):
# Identify control stores
controls = [
store for store in df["STORE_NBR"].unique()
if store not in trial_stores
]
final_df = None
for metric in mets:
# Pivot to compute correlations across stores
pivoted = df.pivot(index="MONTH_ID", columns="STORE_NBR", values=metric)
# Store-to-store correlations
corr_df = pivoted.corr().loc[controls, trial_stores]
# Convert correlation matrix to long format
corr_long = corr_df.reset_index().melt(
id_vars="STORE_NBR",
var_name="STORE_NBR_TRIAL",
value_name=f"{metric}_corr"
).rename(columns={"STORE_NBR": "STORE_NBR_CNTRL"})
# Merge correlations for all metrics
if final_df is None:
final_df = corr_long
else:
final_df = final_df.merge(
corr_long,
on=["STORE_NBR_CNTRL", "STORE_NBR_TRIAL"],
how="inner"
)
return final_df
In [6]:
# --------------------------------------
# Prepare trial store identifiers
# --------------------------------------
# Copy grouped data
df = qvi_grouped.copy()
# Define trial store numbers
trial_store77 = 77
trial_store86 = 86
trial_store88 = 88
In [7]:
# --------------------------------------
# Compute correlation and magnitude distance
# --------------------------------------
# Correlation scores between control and trial stores
store_corr = p_corr_wide(qvi_grouped_full_obs, trials, metrics)
# Magnitude distance scores between control and trial stores
store_mag = mag_dist(qvi_grouped_full_obs, trials, metrics)
store_mag
Out[7]:
| STORE_NBR_CNTRL | STORE_NBR_TRIAL | TOTAL_SALES_std_dist | CUSTOMERS_std_dist | |
|---|---|---|---|---|
| 0 | 1 | 77 | 0.957554 | 0.955703 |
| 1 | 1 | 86 | 0.520879 | 0.579181 |
| 2 | 1 | 88 | 0.190486 | 0.376523 |
| 3 | 2 | 77 | 0.942248 | 0.943522 |
| 4 | 2 | 86 | 0.494721 | 0.529347 |
| ... | ... | ... | ... | ... |
| 763 | 271 | 86 | 0.950847 | 0.950166 |
| 764 | 271 | 88 | 0.633696 | 0.747508 |
| 765 | 272 | 77 | 0.894178 | 0.961240 |
| 766 | 272 | 86 | 0.658295 | 0.562569 |
| 767 | 272 | 88 | 0.327902 | 0.359911 |
768 rows × 4 columns
In [8]:
# --------------------------------------
# Select the best control store for a trial store
# --------------------------------------
def select_control_store(store_corr, store_mag, trial_store, corr_weight=0.5, mag_weight=0.5):
"""
Compute the best control store match for a given trial store.
store_corr is the correlation table from p_corr_wide
store_mag is the magnitude distance table from mag_dist
trial_store is the store being evaluated
corr_weight controls how much correlation contributes to the score
mag_weight controls how much magnitude distance contributes to the score
Returns the best control store and the table of all scores
"""
# --------------------------------------
# Combine correlation and magnitude for TOTAL_SALES
# --------------------------------------
corr_sales = store_corr[store_corr["STORE_NBR_TRIAL"] == trial_store][
["STORE_NBR_CNTRL", "STORE_NBR_TRIAL", "TOTAL_SALES_corr"]
]
mag_sales = store_mag[store_mag["STORE_NBR_TRIAL"] == trial_store][
["STORE_NBR_CNTRL", "STORE_NBR_TRIAL", "TOTAL_SALES_std_dist"]
]
sales_df = corr_sales.merge(mag_sales, on=["STORE_NBR_CNTRL", "STORE_NBR_TRIAL"])
sales_df["Final_score_sales"] = (
sales_df["TOTAL_SALES_corr"] * corr_weight +
sales_df["TOTAL_SALES_std_dist"] * mag_weight
)
# --------------------------------------
# Combine correlation and magnitude for CUSTOMERS
# --------------------------------------
corr_cust = store_corr[store_corr["STORE_NBR_TRIAL"] == trial_store][
["STORE_NBR_CNTRL", "STORE_NBR_TRIAL", "CUSTOMERS_corr"]
]
mag_cust = store_mag[store_mag["STORE_NBR_TRIAL"] == trial_store][
["STORE_NBR_CNTRL", "STORE_NBR_TRIAL", "CUSTOMERS_std_dist"]
]
custo_df = corr_cust.merge(mag_cust, on=["STORE_NBR_CNTRL", "STORE_NBR_TRIAL"])
custo_df["Final_score_custo"] = (
custo_df["CUSTOMERS_corr"] * corr_weight +
custo_df["CUSTOMERS_std_dist"] * mag_weight
)
# --------------------------------------
# Merge both drivers into one table
# --------------------------------------
final_df = sales_df.merge(
custo_df,
on=["STORE_NBR_CNTRL", "STORE_NBR_TRIAL"]
)[[
"STORE_NBR_CNTRL",
"STORE_NBR_TRIAL",
"Final_score_sales",
"Final_score_custo"
]]
# --------------------------------------
# Compute composite score
# --------------------------------------
final_df["SCORE"] = (
final_df["Final_score_sales"] * 0.5 +
final_df["Final_score_custo"] * 0.5
)
# Sort stores by composite score
final_df = final_df.sort_values("SCORE", ascending=False)
# Select the highest ranked control store
best_control = int(final_df.iloc[0]["STORE_NBR_CNTRL"])
return best_control, final_df
In [9]:
# --------------------------------------
# Plot pre-trial sales and customers
# --------------------------------------
def plot_pretrial_sales_and_customers(df, trial_store, control_store):
"""
Create side-by-side panels comparing pre-trial sales and customers
for a trial store versus its matched control store.
"""
temp = df.copy()
# Label store types
temp["Store_type"] = temp["STORE_NBR"].apply(
lambda x: "Trial" if x == trial_store
else "Control" if x == control_store
else "Other stores"
)
# --------------------------------------
# Convert MONTH_ID to a datetime month
# --------------------------------------
def add_month_date(x):
return pd.to_datetime(
x.astype(str).str[:4] + "-" +
x.astype(str).str[4:] + "-01"
)
# --------------------------------------
# Pre-trial sales
# --------------------------------------
sales = (
temp.groupby(["MONTH_ID", "Store_type"])["TOTAL_SALES"]
.mean()
.reset_index()
)
sales["TransactionMonth"] = add_month_date(sales["MONTH_ID"])
sales = sales[sales["MONTH_ID"] < 201903]
# --------------------------------------
# Pre-trial customers
# --------------------------------------
cust = (
temp.groupby(["MONTH_ID", "Store_type"])["CUSTOMERS"]
.mean()
.reset_index(name="numberCustomers")
)
cust["TransactionMonth"] = add_month_date(cust["MONTH_ID"])
cust = cust[cust["MONTH_ID"] < 201903]
# --------------------------------------
# Create side-by-side comparison plot
# --------------------------------------
fig, axes = plt.subplots(1, 2, figsize=(14, 5), sharex=True)
# Left panel: Sales
for stype, sub in sales.groupby("Store_type"):
axes[0].plot(sub["TransactionMonth"], sub["TOTAL_SALES"], label=stype, linewidth=2)
axes[0].set_title(f"Store {trial_store} Pre-trial Sales (Control {control_store})")
axes[0].set_xlabel("Month")
axes[0].set_ylabel("Total Sales")
axes[0].grid(alpha=0.25)
# Right panel: Customers
for stype, sub in cust.groupby("Store_type"):
axes[1].plot(sub["TransactionMonth"], sub["numberCustomers"], label=stype, linewidth=2)
axes[1].set_title(f"Store {trial_store} Pre-trial Customers (Control {control_store})")
axes[1].set_xlabel("Month")
axes[1].set_ylabel("Total Customers")
axes[1].grid(alpha=0.25)
# Shared legend
handles, labels = axes[0].get_legend_handles_labels()
fig.legend(handles, labels, loc="upper center", ncol=3)
fig.tight_layout(rect=[0, 0, 1, 0.93])
plt.show()
In [10]:
# --------------------------------------
# Select control stores for each trial store
# --------------------------------------
control_store77 = select_control_store(store_corr, store_mag, trial_store77)[0]
control_store86 = select_control_store(store_corr, store_mag, trial_store86)[0]
control_store88 = select_control_store(store_corr, store_mag, trial_store88)[0]
In [11]:
# --------------------------------------
# Plot pre-trial comparisons for all trial stores
# --------------------------------------
plot_pretrial_sales_and_customers(qvi_grouped, trial_store77, control_store77)
plot_pretrial_sales_and_customers(qvi_grouped, trial_store86, control_store86)
plot_pretrial_sales_and_customers(qvi_grouped, trial_store88, control_store88)
In [12]:
# --------------------------------------
# Scaling function for pre-trial metrics
# --------------------------------------
metric_cols = ["TOTAL_SALES", "CUSTOMERS"]
def scale_pretrial_metric(df, trial_store, control_store, metric_col, cutoff=201902):
df_copy = df.copy()
# Filter to pre-trial months
full_pre = df_copy[df_copy["MONTH_ID"] < cutoff]
# Compute total values for scaling
trial_pre_total = full_pre[full_pre["STORE_NBR"] == trial_store][metric_col].sum()
control_pre_total = full_pre[full_pre["STORE_NBR"] == control_store][metric_col].sum()
# Compute scaling factor, avoid division by zero
if control_pre_total == 0:
scaling_factor = np.nan
else:
scaling_factor = trial_pre_total / control_pre_total
# Name of the scaled column
new_col = f"scaled_{metric_col}"
# Apply scaling to the control store only
df_copy[new_col] = np.where(
df_copy["STORE_NBR"] == control_store,
df_copy[metric_col] * scaling_factor,
np.nan
)
return df_copy, scaling_factor
# --------------------------------------
# Compute percentage difference and t-values
# --------------------------------------
def compute_percentage_diff_and_tvalues(df_scaled, trial_store, control_store, metric,
pre_cutoff=201902,
trial_start=201902,
trial_end=201904):
"""
Compute percentage difference between trial and scaled control,
compute pre-trial standard deviation,
compute t-values,
compute critical t-value for significance.
"""
scaled_col = f"scaled_{metric}"
# Extract scaled control values
control_scaled = df_scaled[df_scaled["STORE_NBR"] == control_store][
["MONTH_ID", scaled_col]
].rename(columns={scaled_col: "scaledControl"})
# Extract trial values
trial_vals = df_scaled[df_scaled["STORE_NBR"] == trial_store][
["MONTH_ID", metric]
].rename(columns={metric: "trialValue"})
# Merge trial and control values
percentageDiff = control_scaled.merge(trial_vals, on="MONTH_ID")
# Compute percentage difference
percentageDiff["percentageDiff"] = (
(percentageDiff["trialValue"] - percentageDiff["scaledControl"]).abs()
/ percentageDiff["scaledControl"]
)
# Compute pre-trial standard deviation
stdDev = percentageDiff[percentageDiff["MONTH_ID"] < pre_cutoff][
"percentageDiff"
].std()
# Compute t-values
percentageDiff["tValue"] = percentageDiff["percentageDiff"] / stdDev
# Convert MONTH_ID to datetime
percentageDiff["TransactionMonth"] = pd.to_datetime(
percentageDiff["MONTH_ID"].astype(str) + "01",
format="%Y%m%d"
)
# Extract trial-period t-values
trial_tvalues = percentageDiff[
(percentageDiff["MONTH_ID"] >= trial_start) &
(percentageDiff["MONTH_ID"] <= trial_end)
][["TransactionMonth", "tValue"]]
# Compute critical t-value (df = 7)
degrees_of_freedom = 7
critical_t = t.ppf(0.95, df=degrees_of_freedom)
return trial_tvalues, percentageDiff, stdDev, critical_t
In [14]:
# --------------------------------------
# Scale sales and customer metrics for each trial store
# --------------------------------------
# Scaled sales data
scaled_sales_store77 = scale_pretrial_metric(df_full, trial_store77, control_store77, metric_cols[0])[0]
scaled_sales_store86 = scale_pretrial_metric(df_full, trial_store86, control_store86, metric_cols[0])[0]
scaled_sales_store88 = scale_pretrial_metric(df_full, trial_store88, control_store88, metric_cols[0])[0]
# Scaled customer data
scaled_cust_store77 = scale_pretrial_metric(df_full, trial_store77, control_store77, metric_cols[1])[0]
scaled_cust_store86 = scale_pretrial_metric(df_full, trial_store86, control_store86, metric_cols[1])[0]
scaled_cust_store88 = scale_pretrial_metric(df_full, trial_store88, control_store88, metric_cols[1])[0]
In [15]:
# --------------------------------------
# Compute percentage differences and t-values for Store 77
# --------------------------------------
tvals_77_sales, diff_77_sales, sd_77_sales, crit_77_sales = compute_percentage_diff_and_tvalues(
scaled_sales_store77,
trial_store77,
control_store77,
metric_cols[0]
)
tvals_77_cust, diff_77_cust, sd_77_cust, crit_77_cust = compute_percentage_diff_and_tvalues(
scaled_cust_store77,
trial_store77,
control_store77,
metric_cols[1]
)
# --------------------------------------
# Compute percentage differences and t-values for Store 86
# --------------------------------------
tvals_86_sales, diff_86_sales, sd_86_sales, crit_86_sales = compute_percentage_diff_and_tvalues(
scaled_sales_store86,
trial_store86,
control_store86,
metric_cols[0]
)
tvals_86_cust, diff_86_cust, sd_86_cust, crit_86_cust = compute_percentage_diff_and_tvalues(
scaled_cust_store86,
trial_store86,
control_store86,
metric_cols[1]
)
# --------------------------------------
# Compute percentage differences and t-values for Store 88
# --------------------------------------
tvals_88_sales, diff_88_sales, sd_88_sales, crit_88_sales = compute_percentage_diff_and_tvalues(
scaled_sales_store88,
trial_store88,
control_store88,
metric_cols[0]
)
tvals_88_cust, diff_88_cust, sd_88_cust, crit_88_cust = compute_percentage_diff_and_tvalues(
scaled_cust_store88,
trial_store88,
control_store88,
metric_cols[1]
)
# Display difference table for Store 77 sales
diff_77_sales
Out[15]:
| MONTH_ID | scaledControl | trialValue | percentageDiff | tValue | TransactionMonth | |
|---|---|---|---|---|---|---|
| 0 | 201807 | 297.565550 | 296.8 | 0.002573 | 0.051515 | 2018-07-01 |
| 1 | 201808 | 292.652187 | 255.5 | 0.126950 | 2.542011 | 2018-08-01 |
| 2 | 201809 | 233.998916 | 225.2 | 0.037602 | 0.752940 | 2018-09-01 |
| 3 | 201810 | 190.085733 | 204.5 | 0.075830 | 1.518406 | 2018-10-01 |
| 4 | 201811 | 216.597421 | 245.3 | 0.132516 | 2.653459 | 2018-11-01 |
| 5 | 201812 | 286.408121 | 267.3 | 0.066716 | 1.335911 | 2018-12-01 |
| 6 | 201901 | 181.692071 | 204.4 | 0.124980 | 2.502571 | 2019-01-01 |
| 7 | 201902 | 249.762622 | 235.0 | 0.059107 | 1.183534 | 2019-02-01 |
| 8 | 201903 | 203.802205 | 278.5 | 0.366521 | 7.339116 | 2019-03-01 |
| 9 | 201904 | 162.345704 | 263.5 | 0.623080 | 12.476373 | 2019-04-01 |
| 10 | 201905 | 352.533799 | 299.3 | 0.151003 | 3.023650 | 2019-05-01 |
| 11 | 201906 | 226.219424 | 264.7 | 0.170103 | 3.406093 | 2019-06-01 |
In [16]:
# --------------------------------------
# Highlight trial period rows in styled tables
# --------------------------------------
def highlight_trial_rows(row):
month = row["MONTH_ID"]
if 201902 <= month <= 201904:
return ["background-color: gray"] * len(row)
else:
return [""] * len(row)
(
diff_77_cust
.style
.apply(highlight_trial_rows, axis=1)
.set_caption("Percentage Difference and T-Values for Store 77 (Customers)")
)
Out[16]:
| Â | MONTH_ID | scaledControl | trialValue | percentageDiff | tValue | TransactionMonth |
|---|---|---|---|---|---|---|
| 0 | 201807 | 51.171141 | 51 | 0.003344 | 0.183352 | 2018-07-01 00:00:00 |
| 1 | 201808 | 48.161074 | 47 | 0.024108 | 1.321664 | 2018-08-01 00:00:00 |
| 2 | 201809 | 42.140940 | 42 | 0.003344 | 0.183352 | 2018-09-01 00:00:00 |
| 3 | 201810 | 35.117450 | 37 | 0.053607 | 2.938874 | 2018-10-01 00:00:00 |
| 4 | 201811 | 40.134228 | 41 | 0.021572 | 1.182622 | 2018-11-01 00:00:00 |
| 5 | 201812 | 47.157718 | 46 | 0.024550 | 1.345883 | 2018-12-01 00:00:00 |
| 6 | 201901 | 35.117450 | 35 | 0.003344 | 0.183352 | 2019-01-01 00:00:00 |
| 7 | 201902 | 45.151007 | 45 | 0.003344 | 0.183352 | 2019-02-01 00:00:00 |
| 8 | 201903 | 40.134228 | 50 | 0.245819 | 13.476388 | 2019-03-01 00:00:00 |
| 9 | 201904 | 30.100671 | 47 | 0.561427 | 30.778725 | 2019-04-01 00:00:00 |
| 10 | 201905 | 57.191275 | 55 | 0.038315 | 2.100509 | 2019-05-01 00:00:00 |
| 11 | 201906 | 41.137584 | 41 | 0.003344 | 0.183352 | 2019-06-01 00:00:00 |
In [17]:
# --------------------------------------
# Plot uplift panels for sales and customers
# --------------------------------------
def plot_uplift_panels(df_full,
scaled_sales_df, scaled_cust_df,
trial_store, control_store,
sd_sales, sd_cust):
"""
Produce a panel of two plots showing sales uplift and customer uplift
for a trial store with corresponding 95 percent confidence intervals.
"""
# --------------------------------------
# Helper function to generate CI tables
# --------------------------------------
def build_uplift_table(df_full, scaled_df, trial_store, control_store, metric_col, stdDev):
df_plot = df_full.copy()
# Label store types
df_plot["Store_type"] = df_plot["STORE_NBR"].apply(
lambda x: "Trial" if x == trial_store
else "Control" if x == control_store
else "Other stores"
)
# Aggregate base metric
agg = (
df_plot.groupby(["MONTH_ID", "Store_type"])[metric_col]
.mean()
.reset_index(name="metric")
)
agg["TransactionMonth"] = pd.to_datetime(
agg["MONTH_ID"].astype(str) + "01",
format="%Y%m%d"
)
# Keep only Trial and Control lines
agg = agg[agg["Store_type"].isin(["Trial", "Control"])]
# Compute confidence interval bands
control_vals = agg[agg["Store_type"] == "Control"].copy()
upper = control_vals.copy()
upper["metric"] = upper["metric"] * (1 + stdDev * 2)
upper["Store_type"] = "Control 95 percent CI"
lower = control_vals.copy()
lower["metric"] = lower["metric"] * (1 - stdDev * 2)
lower["Store_type"] = "Control 5 percent CI"
final = pd.concat([agg, upper, lower])
return final
# --------------------------------------
# Build uplift datasets for both metrics
# --------------------------------------
sales_uplift = build_uplift_table(
df_full, scaled_sales_df, trial_store, control_store,
"TOTAL_SALES", sd_sales
)
cust_uplift = build_uplift_table(
df_full, scaled_cust_df, trial_store, control_store,
"CUSTOMERS", sd_cust
)
# --------------------------------------
# Create the figure layout
# --------------------------------------
fig, axes = plt.subplots(1, 2, figsize=(15, 5), sharex=True)
shade_start = pd.to_datetime("2019-02")
shade_end = pd.to_datetime("2019-04")
# --------------------------------------
# Sales uplift panel
# --------------------------------------
axes[0].axvspan(shade_start, shade_end, color="lightgray", alpha=0.3)
for stype, sub in sales_uplift.groupby("Store_type"):
axes[0].plot(sub["TransactionMonth"], sub["metric"], label=stype, linewidth=2)
axes[0].set_title(f"Store {trial_store} Sales Uplift with Confidence Bands")
axes[0].set_ylabel("Total Sales")
axes[0].grid(alpha=0.25)
# --------------------------------------
# Customers uplift panel
# --------------------------------------
axes[1].axvspan(shade_start, shade_end, color="lightgray", alpha=0.3)
for stype, sub in cust_uplift.groupby("Store_type"):
axes[1].plot(sub["TransactionMonth"], sub["metric"], label=stype, linewidth=2)
axes[1].set_title(f"Store {trial_store} Customer Uplift with Confidence Bands")
axes[1].set_ylabel("Total Customers")
axes[1].grid(alpha=0.25)
# Shared legend
handles, labels = axes[0].get_legend_handles_labels()
fig.legend(handles, labels, loc="upper center", ncol=3)
plt.tight_layout(rect=[0, 0, 1, 0.90])
plt.show()
In [18]:
# --------------------------------------
# Plot uplift panels for each trial store
# --------------------------------------
# Store 77 uplift panels
plot_uplift_panels(
df_full,
scaled_sales_store77,
scaled_cust_store77,
trial_store77,
control_store77,
sd_77_sales,
sd_77_cust
)
# Store 86 uplift panels
plot_uplift_panels(
df_full,
scaled_sales_store86,
scaled_cust_store86,
trial_store86,
control_store86,
sd_86_sales,
sd_86_cust
)
# Store 88 uplift panels
plot_uplift_panels(
df_full,
scaled_sales_store88,
scaled_cust_store88,
trial_store88,
control_store88,
sd_88_sales,
sd_88_cust
)
In [20]:
# --------------------------------------
# Plot percentage uplift for trial months
# --------------------------------------
def plot_uplift(df, trial_store, control_store, metric, title_suffix="Sales"):
"""
Plot percentage uplift for the trial period based on:
uplift = (trial - scaled_control) / scaled_control
"""
# Trial months used in the analysis
trial_months = [201902, 201903, 201904]
# Filter to trial months only
trial_df = df[df["MONTH_ID"].isin(trial_months)]
# Extract trial and corresponding scaled control values
trial_vals = trial_df[trial_df["STORE_NBR"] == trial_store][metric].values
control_vals = trial_df[trial_df["STORE_NBR"] == control_store][f"scaled_{metric}"].values
# Compute uplift percentage
uplift = (trial_vals - control_vals) / control_vals * 100
# Format month labels as YYYY-MM
month_labels = [f"{str(m)[:4]}-{str(m)[4:]}" for m in trial_months]
# --------------------------------------
# Create bar chart
# --------------------------------------
plt.figure(figsize=(8, 5))
plt.bar(month_labels, uplift, color="#2ca02c")
# Annotate bar values
for i, v in enumerate(uplift):
offset = 0.02 * v if v != 0 else 1
plt.text(i, v + offset, f"{v:.1f} percent", ha="center", fontsize=10)
plt.axhline(0, color="black", linewidth=1)
# Title including trial and control stores
plt.title(
f"Percentage Uplift for Store {trial_store} (Control {control_store}) — {title_suffix}"
)
plt.xlabel("Month")
plt.ylabel("Uplift (percent)")
plt.tight_layout()
plt.show()
In [26]:
# --------------------------------------
# Uplift plots for Store 77
# --------------------------------------
plot_uplift(
df=scaled_sales_store77,
trial_store=trial_store77,
control_store=control_store77,
metric="TOTAL_SALES",
title_suffix="Sales"
)
plot_uplift(
df=scaled_cust_store77,
trial_store=trial_store77,
control_store=control_store77,
metric="CUSTOMERS",
title_suffix="Customers"
)
# --------------------------------------
# Uplift plots for Store 86
# --------------------------------------
plot_uplift(
df=scaled_sales_store86,
trial_store=trial_store86,
control_store=control_store86,
metric="TOTAL_SALES",
title_suffix="Sales"
)
plot_uplift(
df=scaled_cust_store86,
trial_store=trial_store86,
control_store=control_store86,
metric="CUSTOMERS",
title_suffix="Customers"
)
# --------------------------------------
# Uplift plots for Store 88
# --------------------------------------
plot_uplift(
df=scaled_sales_store88,
trial_store=trial_store88,
control_store=control_store88,
metric="TOTAL_SALES",
title_suffix="Sales"
)
plot_uplift(
df=scaled_cust_store88,
trial_store=trial_store88,
control_store=control_store88,
metric="CUSTOMERS",
title_suffix="Customers"
)