In [1]:
# --------------------------------------
# Start of statistical analysis
# --------------------------------------

# Import the required libraries
import matplotlib.pyplot as plt
import statsmodels as Stats
import pandas as pd
import numpy as np
from scipy.stats import t
In [2]:
# --------------------------------------
# Load the dataset
# --------------------------------------

# Read CSV file
qvi_data = pd.read_csv("C:\\Users\\rageg\\Downloads\\QVI_data.csv")

# Check the initial structure
qvi_data.info()

# Convert DATE column to datetime
qvi_data["DATE"] = pd.to_datetime(qvi_data["DATE"])

# Check structure after conversion
qvi_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 263214 entries, 0 to 263213
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   LYLTY_CARD_NBR    263214 non-null  int64  
 1   DATE              263214 non-null  object 
 2   STORE_NBR         263214 non-null  int64  
 3   TXN_ID            263214 non-null  int64  
 4   PROD_NBR          263214 non-null  int64  
 5   PROD_NAME         263214 non-null  object 
 6   PROD_QTY          263214 non-null  int64  
 7   TOT_SALES         263214 non-null  float64
 8   PACK_SIZE         263214 non-null  int64  
 9   BRAND             263214 non-null  object 
 10  LIFESTAGE         263214 non-null  object 
 11  PREMIUM_CUSTOMER  263214 non-null  object 
dtypes: float64(1), int64(6), object(5)
memory usage: 24.1+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 263214 entries, 0 to 263213
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   LYLTY_CARD_NBR    263214 non-null  int64         
 1   DATE              263214 non-null  datetime64[ns]
 2   STORE_NBR         263214 non-null  int64         
 3   TXN_ID            263214 non-null  int64         
 4   PROD_NBR          263214 non-null  int64         
 5   PROD_NAME         263214 non-null  object        
 6   PROD_QTY          263214 non-null  int64         
 7   TOT_SALES         263214 non-null  float64       
 8   PACK_SIZE         263214 non-null  int64         
 9   BRAND             263214 non-null  object        
 10  LIFESTAGE         263214 non-null  object        
 11  PREMIUM_CUSTOMER  263214 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(6), object(4)
memory usage: 24.1+ MB
In [3]:
# --------------------------------------
# Create monthly identifiers and unit price
# --------------------------------------

# Convert DATE to MONTH_ID formatted as YYYYMM
qvi_data["MONTH_ID"] = qvi_data["DATE"].dt.strftime("%Y%m").astype(int)

# Compute unit price
qvi_data["UNIT_PRICE"] = qvi_data["TOT_SALES"] / qvi_data["PROD_QTY"]

qvi_data


# --------------------------------------
# Group by store and month to create monthly metrics
# --------------------------------------

qvi_grouped = qvi_data.groupby(
    ["STORE_NBR", "MONTH_ID"]
).agg(
    TOTAL_SALES=("TOT_SALES", "sum"),
    CUSTOMERS=("LYLTY_CARD_NBR", "nunique"),
    TRANSACTIONS=("TXN_ID", "nunique"),
    AVG_PRICE_PER_UNIT=("UNIT_PRICE", "mean"),
    CHIPS_SOLD=("PROD_QTY", "sum")
).reset_index()

# Compute additional derived metrics
qvi_grouped["CHIPS_PER_CUSTOMER"] = qvi_grouped["CHIPS_SOLD"] / qvi_grouped["CUSTOMERS"]
qvi_grouped["TXN_PER_CUSTOMER"] = qvi_grouped["TRANSACTIONS"] / qvi_grouped["CUSTOMERS"]
In [4]:
# --------------------------------------
# Filter to pre-trial period and full observation stores
# --------------------------------------

# Copy grouped data
df_full = qvi_grouped.copy()

# Keep pre-trial months only
df_pre = df_full[df_full["MONTH_ID"] < 201902]

# Count months per store
full_obs_counts = df_full["STORE_NBR"].value_counts()

# Identify stores with all twelve required observations
stores_with_12 = full_obs_counts[full_obs_counts == 12].index

# Keep only pre-trial months for stores with full observations
qvi_grouped_full_obs = df_pre[df_pre["STORE_NBR"].isin(stores_with_12)]
In [5]:
# --------------------------------------
# Define trial stores and metrics
# --------------------------------------

trials = [77, 86, 88]

# Metrics used for correlation and magnitude distance
metrics = ["TOTAL_SALES", "CUSTOMERS"]

# Identify control stores
controls = [
    store for store in qvi_grouped_full_obs["STORE_NBR"].unique()
    if store not in trials
]


# --------------------------------------
# Magnitude distance function
# --------------------------------------

def mag_dist(df, trial_stores, mets):

    # Split trial and control data
    trials_df = df[df["STORE_NBR"].isin(trial_stores)]
    controls_df = df[~df["STORE_NBR"].isin(trial_stores)]
    
    # Merge on month to compare trial and control stores
    new_df = pd.merge(
        controls_df,
        trials_df,
        on="MONTH_ID",
        suffixes=("_CNTRL", "_TRIAL")
    )
    
    # Initialize comparison table
    comparison_df = new_df[["STORE_NBR_CNTRL", "STORE_NBR_TRIAL"]].copy()
    
    # Compute absolute distance for each metric
    for metric in mets:
        comparison_df[metric + "_abs_dist"] = (
            new_df[metric + "_CNTRL"] - new_df[metric + "_TRIAL"]
        ).abs()
    
    # Standardize distances using min-max scaling (flipped so higher is more similar)
    for metric in mets:
        col = metric + "_abs_dist"
        min_val = comparison_df[col].min()
        max_val = comparison_df[col].max()
        
        if max_val > min_val:
            scaled = (comparison_df[col] - min_val) / (max_val - min_val)
            flipped = 1 - scaled
            comparison_df[col.replace("_abs_dist", "_std_dist")] = flipped
        else:
            comparison_df[col.replace("_abs_dist", "_std_dist")] = 1.0
    
    # Select standardized distance columns
    std_list = [metric + "_std_dist" for metric in mets]
    std_dist = comparison_df[
        ["STORE_NBR_CNTRL", "STORE_NBR_TRIAL"] + std_list
    ]
    
    # Average similarity across months
    mean_distances = (
        std_dist.groupby(["STORE_NBR_CNTRL", "STORE_NBR_TRIAL"])
        .mean()
        .reset_index()
    )
    
    return mean_distances


# --------------------------------------
# Correlation function
# --------------------------------------

def p_corr_wide(df, trial_stores, mets):

    # Identify control stores
    controls = [
        store for store in df["STORE_NBR"].unique()
        if store not in trial_stores
    ]
    
    final_df = None

    for metric in mets:

        # Pivot to compute correlations across stores
        pivoted = df.pivot(index="MONTH_ID", columns="STORE_NBR", values=metric)
        
        # Store-to-store correlations
        corr_df = pivoted.corr().loc[controls, trial_stores]
        
        # Convert correlation matrix to long format
        corr_long = corr_df.reset_index().melt(
            id_vars="STORE_NBR",
            var_name="STORE_NBR_TRIAL",
            value_name=f"{metric}_corr"
        ).rename(columns={"STORE_NBR": "STORE_NBR_CNTRL"})
        
        # Merge correlations for all metrics
        if final_df is None:
            final_df = corr_long
        else:
            final_df = final_df.merge(
                corr_long,
                on=["STORE_NBR_CNTRL", "STORE_NBR_TRIAL"],
                how="inner"
            )

    return final_df
In [6]:
# --------------------------------------
# Prepare trial store identifiers
# --------------------------------------

# Copy grouped data
df = qvi_grouped.copy()

# Define trial store numbers
trial_store77 = 77
trial_store86 = 86
trial_store88 = 88
In [7]:
# --------------------------------------
# Compute correlation and magnitude distance
# --------------------------------------

# Correlation scores between control and trial stores
store_corr = p_corr_wide(qvi_grouped_full_obs, trials, metrics)

# Magnitude distance scores between control and trial stores
store_mag = mag_dist(qvi_grouped_full_obs, trials, metrics)

store_mag
Out[7]:
STORE_NBR_CNTRL STORE_NBR_TRIAL TOTAL_SALES_std_dist CUSTOMERS_std_dist
0 1 77 0.957554 0.955703
1 1 86 0.520879 0.579181
2 1 88 0.190486 0.376523
3 2 77 0.942248 0.943522
4 2 86 0.494721 0.529347
... ... ... ... ...
763 271 86 0.950847 0.950166
764 271 88 0.633696 0.747508
765 272 77 0.894178 0.961240
766 272 86 0.658295 0.562569
767 272 88 0.327902 0.359911

768 rows × 4 columns

In [8]:
# --------------------------------------
# Select the best control store for a trial store
# --------------------------------------

def select_control_store(store_corr, store_mag, trial_store, corr_weight=0.5, mag_weight=0.5):
    """
    Compute the best control store match for a given trial store.
    store_corr is the correlation table from p_corr_wide
    store_mag is the magnitude distance table from mag_dist
    trial_store is the store being evaluated
    corr_weight controls how much correlation contributes to the score
    mag_weight controls how much magnitude distance contributes to the score
    Returns the best control store and the table of all scores
    """

    # --------------------------------------
    # Combine correlation and magnitude for TOTAL_SALES
    # --------------------------------------
    
    corr_sales = store_corr[store_corr["STORE_NBR_TRIAL"] == trial_store][
        ["STORE_NBR_CNTRL", "STORE_NBR_TRIAL", "TOTAL_SALES_corr"]
    ]

    mag_sales = store_mag[store_mag["STORE_NBR_TRIAL"] == trial_store][
        ["STORE_NBR_CNTRL", "STORE_NBR_TRIAL", "TOTAL_SALES_std_dist"]
    ]

    sales_df = corr_sales.merge(mag_sales, on=["STORE_NBR_CNTRL", "STORE_NBR_TRIAL"])
    sales_df["Final_score_sales"] = (
        sales_df["TOTAL_SALES_corr"] * corr_weight +
        sales_df["TOTAL_SALES_std_dist"] * mag_weight
    )

    # --------------------------------------
    # Combine correlation and magnitude for CUSTOMERS
    # --------------------------------------
    
    corr_cust = store_corr[store_corr["STORE_NBR_TRIAL"] == trial_store][
        ["STORE_NBR_CNTRL", "STORE_NBR_TRIAL", "CUSTOMERS_corr"]
    ]

    mag_cust = store_mag[store_mag["STORE_NBR_TRIAL"] == trial_store][
        ["STORE_NBR_CNTRL", "STORE_NBR_TRIAL", "CUSTOMERS_std_dist"]
    ]

    custo_df = corr_cust.merge(mag_cust, on=["STORE_NBR_CNTRL", "STORE_NBR_TRIAL"])
    custo_df["Final_score_custo"] = (
        custo_df["CUSTOMERS_corr"] * corr_weight +
        custo_df["CUSTOMERS_std_dist"] * mag_weight
    )

    # --------------------------------------
    # Merge both drivers into one table
    # --------------------------------------
    
    final_df = sales_df.merge(
        custo_df,
        on=["STORE_NBR_CNTRL", "STORE_NBR_TRIAL"]
    )[[
        "STORE_NBR_CNTRL",
        "STORE_NBR_TRIAL",
        "Final_score_sales",
        "Final_score_custo"
    ]]

    # --------------------------------------
    # Compute composite score
    # --------------------------------------
    
    final_df["SCORE"] = (
        final_df["Final_score_sales"] * 0.5 +
        final_df["Final_score_custo"] * 0.5
    )

    # Sort stores by composite score
    final_df = final_df.sort_values("SCORE", ascending=False)

    # Select the highest ranked control store
    best_control = int(final_df.iloc[0]["STORE_NBR_CNTRL"])

    return best_control, final_df
In [9]:
# --------------------------------------
# Plot pre-trial sales and customers
# --------------------------------------

def plot_pretrial_sales_and_customers(df, trial_store, control_store):
    """
    Create side-by-side panels comparing pre-trial sales and customers
    for a trial store versus its matched control store.
    """

    temp = df.copy()

    # Label store types
    temp["Store_type"] = temp["STORE_NBR"].apply(
        lambda x: "Trial" if x == trial_store
        else "Control" if x == control_store
        else "Other stores"
    )

    # --------------------------------------
    # Convert MONTH_ID to a datetime month
    # --------------------------------------
    def add_month_date(x):
        return pd.to_datetime(
            x.astype(str).str[:4] + "-" +
            x.astype(str).str[4:] + "-01"
        )

    # --------------------------------------
    # Pre-trial sales
    # --------------------------------------
    sales = (
        temp.groupby(["MONTH_ID", "Store_type"])["TOTAL_SALES"]
            .mean()
            .reset_index()
    )
    sales["TransactionMonth"] = add_month_date(sales["MONTH_ID"])
    sales = sales[sales["MONTH_ID"] < 201903]

    # --------------------------------------
    # Pre-trial customers
    # --------------------------------------
    cust = (
        temp.groupby(["MONTH_ID", "Store_type"])["CUSTOMERS"]
            .mean()
            .reset_index(name="numberCustomers")
    )
    cust["TransactionMonth"] = add_month_date(cust["MONTH_ID"])
    cust = cust[cust["MONTH_ID"] < 201903]

    # --------------------------------------
    # Create side-by-side comparison plot
    # --------------------------------------
    fig, axes = plt.subplots(1, 2, figsize=(14, 5), sharex=True)

    # Left panel: Sales
    for stype, sub in sales.groupby("Store_type"):
        axes[0].plot(sub["TransactionMonth"], sub["TOTAL_SALES"], label=stype, linewidth=2)
    axes[0].set_title(f"Store {trial_store} Pre-trial Sales (Control {control_store})")
    axes[0].set_xlabel("Month")
    axes[0].set_ylabel("Total Sales")
    axes[0].grid(alpha=0.25)

    # Right panel: Customers
    for stype, sub in cust.groupby("Store_type"):
        axes[1].plot(sub["TransactionMonth"], sub["numberCustomers"], label=stype, linewidth=2)
    axes[1].set_title(f"Store {trial_store} Pre-trial Customers (Control {control_store})")
    axes[1].set_xlabel("Month")
    axes[1].set_ylabel("Total Customers")
    axes[1].grid(alpha=0.25)

    # Shared legend
    handles, labels = axes[0].get_legend_handles_labels()
    fig.legend(handles, labels, loc="upper center", ncol=3)

    fig.tight_layout(rect=[0, 0, 1, 0.93])
    plt.show()
In [10]:
# --------------------------------------
# Select control stores for each trial store
# --------------------------------------

control_store77 = select_control_store(store_corr, store_mag, trial_store77)[0]
control_store86 = select_control_store(store_corr, store_mag, trial_store86)[0]
control_store88 = select_control_store(store_corr, store_mag, trial_store88)[0]
In [11]:
# --------------------------------------
# Plot pre-trial comparisons for all trial stores
# --------------------------------------

plot_pretrial_sales_and_customers(qvi_grouped, trial_store77, control_store77)
plot_pretrial_sales_and_customers(qvi_grouped, trial_store86, control_store86)
plot_pretrial_sales_and_customers(qvi_grouped, trial_store88, control_store88)
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [12]:
# --------------------------------------
# Scaling function for pre-trial metrics
# --------------------------------------

metric_cols = ["TOTAL_SALES", "CUSTOMERS"]

def scale_pretrial_metric(df, trial_store, control_store, metric_col, cutoff=201902):

    df_copy = df.copy()

    # Filter to pre-trial months
    full_pre = df_copy[df_copy["MONTH_ID"] < cutoff]

    # Compute total values for scaling
    trial_pre_total = full_pre[full_pre["STORE_NBR"] == trial_store][metric_col].sum()
    control_pre_total = full_pre[full_pre["STORE_NBR"] == control_store][metric_col].sum()

    # Compute scaling factor, avoid division by zero
    if control_pre_total == 0:
        scaling_factor = np.nan
    else:
        scaling_factor = trial_pre_total / control_pre_total

    # Name of the scaled column
    new_col = f"scaled_{metric_col}"

    # Apply scaling to the control store only
    df_copy[new_col] = np.where(
        df_copy["STORE_NBR"] == control_store,
        df_copy[metric_col] * scaling_factor,
        np.nan
    )

    return df_copy, scaling_factor


# --------------------------------------
# Compute percentage difference and t-values
# --------------------------------------

def compute_percentage_diff_and_tvalues(df_scaled, trial_store, control_store, metric,
                                        pre_cutoff=201902,
                                        trial_start=201902,
                                        trial_end=201904):
    """
    Compute percentage difference between trial and scaled control,
    compute pre-trial standard deviation,
    compute t-values,
    compute critical t-value for significance.
    """

    scaled_col = f"scaled_{metric}"

    # Extract scaled control values
    control_scaled = df_scaled[df_scaled["STORE_NBR"] == control_store][
        ["MONTH_ID", scaled_col]
    ].rename(columns={scaled_col: "scaledControl"})

    # Extract trial values
    trial_vals = df_scaled[df_scaled["STORE_NBR"] == trial_store][
        ["MONTH_ID", metric]
    ].rename(columns={metric: "trialValue"})

    # Merge trial and control values
    percentageDiff = control_scaled.merge(trial_vals, on="MONTH_ID")

    # Compute percentage difference
    percentageDiff["percentageDiff"] = (
        (percentageDiff["trialValue"] - percentageDiff["scaledControl"]).abs()
        / percentageDiff["scaledControl"]
    )

    # Compute pre-trial standard deviation
    stdDev = percentageDiff[percentageDiff["MONTH_ID"] < pre_cutoff][
        "percentageDiff"
    ].std()

    # Compute t-values
    percentageDiff["tValue"] = percentageDiff["percentageDiff"] / stdDev

    # Convert MONTH_ID to datetime
    percentageDiff["TransactionMonth"] = pd.to_datetime(
        percentageDiff["MONTH_ID"].astype(str) + "01",
        format="%Y%m%d"
    )

    # Extract trial-period t-values
    trial_tvalues = percentageDiff[
        (percentageDiff["MONTH_ID"] >= trial_start) &
        (percentageDiff["MONTH_ID"] <= trial_end)
    ][["TransactionMonth", "tValue"]]

    # Compute critical t-value (df = 7)
    degrees_of_freedom = 7
    critical_t = t.ppf(0.95, df=degrees_of_freedom)

    return trial_tvalues, percentageDiff, stdDev, critical_t
In [14]:
# --------------------------------------
# Scale sales and customer metrics for each trial store
# --------------------------------------

# Scaled sales data
scaled_sales_store77 = scale_pretrial_metric(df_full, trial_store77, control_store77, metric_cols[0])[0]
scaled_sales_store86 = scale_pretrial_metric(df_full, trial_store86, control_store86, metric_cols[0])[0]
scaled_sales_store88 = scale_pretrial_metric(df_full, trial_store88, control_store88, metric_cols[0])[0]

# Scaled customer data
scaled_cust_store77 = scale_pretrial_metric(df_full, trial_store77, control_store77, metric_cols[1])[0]
scaled_cust_store86 = scale_pretrial_metric(df_full, trial_store86, control_store86, metric_cols[1])[0]
scaled_cust_store88 = scale_pretrial_metric(df_full, trial_store88, control_store88, metric_cols[1])[0]
In [15]:
# --------------------------------------
# Compute percentage differences and t-values for Store 77
# --------------------------------------

tvals_77_sales, diff_77_sales, sd_77_sales, crit_77_sales = compute_percentage_diff_and_tvalues(
    scaled_sales_store77,
    trial_store77,
    control_store77,
    metric_cols[0]
)

tvals_77_cust, diff_77_cust, sd_77_cust, crit_77_cust = compute_percentage_diff_and_tvalues(
    scaled_cust_store77,
    trial_store77,
    control_store77,
    metric_cols[1]
)


# --------------------------------------
# Compute percentage differences and t-values for Store 86
# --------------------------------------

tvals_86_sales, diff_86_sales, sd_86_sales, crit_86_sales = compute_percentage_diff_and_tvalues(
    scaled_sales_store86,
    trial_store86,
    control_store86,
    metric_cols[0]
)

tvals_86_cust, diff_86_cust, sd_86_cust, crit_86_cust = compute_percentage_diff_and_tvalues(
    scaled_cust_store86,
    trial_store86,
    control_store86,
    metric_cols[1]
)


# --------------------------------------
# Compute percentage differences and t-values for Store 88
# --------------------------------------

tvals_88_sales, diff_88_sales, sd_88_sales, crit_88_sales = compute_percentage_diff_and_tvalues(
    scaled_sales_store88,
    trial_store88,
    control_store88,
    metric_cols[0]
)

tvals_88_cust, diff_88_cust, sd_88_cust, crit_88_cust = compute_percentage_diff_and_tvalues(
    scaled_cust_store88,
    trial_store88,
    control_store88,
    metric_cols[1]
)

# Display difference table for Store 77 sales
diff_77_sales
Out[15]:
MONTH_ID scaledControl trialValue percentageDiff tValue TransactionMonth
0 201807 297.565550 296.8 0.002573 0.051515 2018-07-01
1 201808 292.652187 255.5 0.126950 2.542011 2018-08-01
2 201809 233.998916 225.2 0.037602 0.752940 2018-09-01
3 201810 190.085733 204.5 0.075830 1.518406 2018-10-01
4 201811 216.597421 245.3 0.132516 2.653459 2018-11-01
5 201812 286.408121 267.3 0.066716 1.335911 2018-12-01
6 201901 181.692071 204.4 0.124980 2.502571 2019-01-01
7 201902 249.762622 235.0 0.059107 1.183534 2019-02-01
8 201903 203.802205 278.5 0.366521 7.339116 2019-03-01
9 201904 162.345704 263.5 0.623080 12.476373 2019-04-01
10 201905 352.533799 299.3 0.151003 3.023650 2019-05-01
11 201906 226.219424 264.7 0.170103 3.406093 2019-06-01
In [16]:
# --------------------------------------
# Highlight trial period rows in styled tables
# --------------------------------------

def highlight_trial_rows(row):
    month = row["MONTH_ID"]
    if 201902 <= month <= 201904:
        return ["background-color: gray"] * len(row)
    else:
        return [""] * len(row)

(
    diff_77_cust
    .style
    .apply(highlight_trial_rows, axis=1)
    .set_caption("Percentage Difference and T-Values for Store 77 (Customers)")
)
Out[16]:
Percentage Difference and T-Values for Store 77 (Customers)
  MONTH_ID scaledControl trialValue percentageDiff tValue TransactionMonth
0 201807 51.171141 51 0.003344 0.183352 2018-07-01 00:00:00
1 201808 48.161074 47 0.024108 1.321664 2018-08-01 00:00:00
2 201809 42.140940 42 0.003344 0.183352 2018-09-01 00:00:00
3 201810 35.117450 37 0.053607 2.938874 2018-10-01 00:00:00
4 201811 40.134228 41 0.021572 1.182622 2018-11-01 00:00:00
5 201812 47.157718 46 0.024550 1.345883 2018-12-01 00:00:00
6 201901 35.117450 35 0.003344 0.183352 2019-01-01 00:00:00
7 201902 45.151007 45 0.003344 0.183352 2019-02-01 00:00:00
8 201903 40.134228 50 0.245819 13.476388 2019-03-01 00:00:00
9 201904 30.100671 47 0.561427 30.778725 2019-04-01 00:00:00
10 201905 57.191275 55 0.038315 2.100509 2019-05-01 00:00:00
11 201906 41.137584 41 0.003344 0.183352 2019-06-01 00:00:00
In [17]:
# --------------------------------------
# Plot uplift panels for sales and customers
# --------------------------------------

def plot_uplift_panels(df_full,
                       scaled_sales_df, scaled_cust_df,
                       trial_store, control_store,
                       sd_sales, sd_cust):
    """
    Produce a panel of two plots showing sales uplift and customer uplift
    for a trial store with corresponding 95 percent confidence intervals.
    """

    # --------------------------------------
    # Helper function to generate CI tables
    # --------------------------------------
    def build_uplift_table(df_full, scaled_df, trial_store, control_store, metric_col, stdDev):

        df_plot = df_full.copy()

        # Label store types
        df_plot["Store_type"] = df_plot["STORE_NBR"].apply(
            lambda x: "Trial" if x == trial_store
            else "Control" if x == control_store
            else "Other stores"
        )

        # Aggregate base metric
        agg = (
            df_plot.groupby(["MONTH_ID", "Store_type"])[metric_col]
            .mean()
            .reset_index(name="metric")
        )

        agg["TransactionMonth"] = pd.to_datetime(
            agg["MONTH_ID"].astype(str) + "01",
            format="%Y%m%d"
        )

        # Keep only Trial and Control lines
        agg = agg[agg["Store_type"].isin(["Trial", "Control"])]

        # Compute confidence interval bands
        control_vals = agg[agg["Store_type"] == "Control"].copy()

        upper = control_vals.copy()
        upper["metric"] = upper["metric"] * (1 + stdDev * 2)
        upper["Store_type"] = "Control 95 percent CI"

        lower = control_vals.copy()
        lower["metric"] = lower["metric"] * (1 - stdDev * 2)
        lower["Store_type"] = "Control 5 percent CI"

        final = pd.concat([agg, upper, lower])
        return final

    # --------------------------------------
    # Build uplift datasets for both metrics
    # --------------------------------------
    sales_uplift = build_uplift_table(
        df_full, scaled_sales_df, trial_store, control_store,
        "TOTAL_SALES", sd_sales
    )

    cust_uplift = build_uplift_table(
        df_full, scaled_cust_df, trial_store, control_store,
        "CUSTOMERS", sd_cust
    )

    # --------------------------------------
    # Create the figure layout
    # --------------------------------------
    fig, axes = plt.subplots(1, 2, figsize=(15, 5), sharex=True)

    shade_start = pd.to_datetime("2019-02")
    shade_end = pd.to_datetime("2019-04")

    # --------------------------------------
    # Sales uplift panel
    # --------------------------------------
    axes[0].axvspan(shade_start, shade_end, color="lightgray", alpha=0.3)

    for stype, sub in sales_uplift.groupby("Store_type"):
        axes[0].plot(sub["TransactionMonth"], sub["metric"], label=stype, linewidth=2)

    axes[0].set_title(f"Store {trial_store} Sales Uplift with Confidence Bands")
    axes[0].set_ylabel("Total Sales")
    axes[0].grid(alpha=0.25)

    # --------------------------------------
    # Customers uplift panel
    # --------------------------------------
    axes[1].axvspan(shade_start, shade_end, color="lightgray", alpha=0.3)

    for stype, sub in cust_uplift.groupby("Store_type"):
        axes[1].plot(sub["TransactionMonth"], sub["metric"], label=stype, linewidth=2)

    axes[1].set_title(f"Store {trial_store} Customer Uplift with Confidence Bands")
    axes[1].set_ylabel("Total Customers")
    axes[1].grid(alpha=0.25)

    # Shared legend
    handles, labels = axes[0].get_legend_handles_labels()
    fig.legend(handles, labels, loc="upper center", ncol=3)

    plt.tight_layout(rect=[0, 0, 1, 0.90])
    plt.show()
In [18]:
# --------------------------------------
# Plot uplift panels for each trial store
# --------------------------------------

# Store 77 uplift panels
plot_uplift_panels(
    df_full,
    scaled_sales_store77,
    scaled_cust_store77,
    trial_store77,
    control_store77,
    sd_77_sales,
    sd_77_cust
)

# Store 86 uplift panels
plot_uplift_panels(
    df_full,
    scaled_sales_store86,
    scaled_cust_store86,
    trial_store86,
    control_store86,
    sd_86_sales,
    sd_86_cust
)

# Store 88 uplift panels
plot_uplift_panels(
    df_full,
    scaled_sales_store88,
    scaled_cust_store88,
    trial_store88,
    control_store88,
    sd_88_sales,
    sd_88_cust
)
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [20]:
# --------------------------------------
# Plot percentage uplift for trial months
# --------------------------------------

def plot_uplift(df, trial_store, control_store, metric, title_suffix="Sales"):
    """
    Plot percentage uplift for the trial period based on:
    uplift = (trial - scaled_control) / scaled_control
    """

    # Trial months used in the analysis
    trial_months = [201902, 201903, 201904]

    # Filter to trial months only
    trial_df = df[df["MONTH_ID"].isin(trial_months)]

    # Extract trial and corresponding scaled control values
    trial_vals = trial_df[trial_df["STORE_NBR"] == trial_store][metric].values
    control_vals = trial_df[trial_df["STORE_NBR"] == control_store][f"scaled_{metric}"].values

    # Compute uplift percentage
    uplift = (trial_vals - control_vals) / control_vals * 100

    # Format month labels as YYYY-MM
    month_labels = [f"{str(m)[:4]}-{str(m)[4:]}" for m in trial_months]

    # --------------------------------------
    # Create bar chart
    # --------------------------------------
    plt.figure(figsize=(8, 5))
    plt.bar(month_labels, uplift, color="#2ca02c")

    # Annotate bar values
    for i, v in enumerate(uplift):
        offset = 0.02 * v if v != 0 else 1
        plt.text(i, v + offset, f"{v:.1f} percent", ha="center", fontsize=10)

    plt.axhline(0, color="black", linewidth=1)

    # Title including trial and control stores
    plt.title(
        f"Percentage Uplift for Store {trial_store} (Control {control_store}) — {title_suffix}"
    )

    plt.xlabel("Month")
    plt.ylabel("Uplift (percent)")
    plt.tight_layout()
    plt.show()
In [26]:
# --------------------------------------
# Uplift plots for Store 77
# --------------------------------------

plot_uplift(
    df=scaled_sales_store77,
    trial_store=trial_store77,
    control_store=control_store77,
    metric="TOTAL_SALES",
    title_suffix="Sales"
)

plot_uplift(
    df=scaled_cust_store77,
    trial_store=trial_store77,
    control_store=control_store77,
    metric="CUSTOMERS",
    title_suffix="Customers"
)


# --------------------------------------
# Uplift plots for Store 86
# --------------------------------------

plot_uplift(
    df=scaled_sales_store86,
    trial_store=trial_store86,
    control_store=control_store86,
    metric="TOTAL_SALES",
    title_suffix="Sales"
)

plot_uplift(
    df=scaled_cust_store86,
    trial_store=trial_store86,
    control_store=control_store86,
    metric="CUSTOMERS",
    title_suffix="Customers"
)


# --------------------------------------
# Uplift plots for Store 88
# --------------------------------------

plot_uplift(
    df=scaled_sales_store88,
    trial_store=trial_store88,
    control_store=control_store88,
    metric="TOTAL_SALES",
    title_suffix="Sales"
)

plot_uplift(
    df=scaled_cust_store88,
    trial_store=trial_store88,
    control_store=control_store88,
    metric="CUSTOMERS",
    title_suffix="Customers"
)
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image