From b6c088a7f31777c25bf60cb7e84cc495e0672ad1 Mon Sep 17 00:00:00 2001 From: alex-l-kong Date: Wed, 14 Feb 2024 13:45:31 -0800 Subject: [PATCH 01/14] Initial commit of occupancy statistic visualizations --- python_files/generate_supplementary_plots.py | 18 +++- python_files/supplementary_plot_helpers.py | 93 ++++++++++++++++++++ 2 files changed, 110 insertions(+), 1 deletion(-) diff --git a/python_files/generate_supplementary_plots.py b/python_files/generate_supplementary_plots.py index fa7bf23..dcf3134 100644 --- a/python_files/generate_supplementary_plots.py +++ b/python_files/generate_supplementary_plots.py @@ -297,7 +297,7 @@ cell_table = pd.read_csv( os.path.join(ANALYSIS_DIR, "combined_cell_table_normalized_cell_labels_updated.csv") ) -functional_marker_viz_dir = os.path.join(SUPPLEMENTARY_FIG_DIR, "functional_marker_dist_thresholds_test") +functional_marker_viz_dir = os.path.join(SUPPLEMENTARY_FIG_DIR, "functional_marker_dist_thresholds") if not os.path.exists(functional_marker_viz_dir): os.makedirs(functional_marker_viz_dir) @@ -431,3 +431,19 @@ # Feature extraction +# Occupancy statistics +# TODO: make a constant in supplementary_plot_helpers +cell_table = pd.read_csv( + os.path.join(ANALYSIS_DIR, "combined_cell_table_normalized_cell_labels_updated.csv") +) +occupancy_stats_viz_dir = os.path.join(SUPPLEMENTARY_FIG_DIR, "occupancy_stats") +if not os.path.exists(occupancy_stats_viz_dir): + os.makedirs(occupancy_stats_viz_dir) + +# massive GridSearch +for tiles_per_row_col in [2, 4, 8, 16, 32, 64, 128]: + for positive_threshold in np.arange(1, 26): + supplementary_plot_helpers.visualize_occupancy_statistics( + cell_table, occupancy_stats_viz_dir, tiles_per_row_col=tiles_per_row_col, + positive_threshold=positive_threshold, + ) diff --git a/python_files/supplementary_plot_helpers.py b/python_files/supplementary_plot_helpers.py index 34f2b11..5eb12cb 100644 --- a/python_files/supplementary_plot_helpers.py +++ b/python_files/supplementary_plot_helpers.py @@ -1,4 +1,6 @@ +import itertools import math +import matplotlib.colors as mcolors import matplotlib.pyplot as plt import natsort as ns import numpy as np @@ -337,3 +339,94 @@ def stitch_before_after_norm( pre_norm_tiled.save(pre_norm_stitched_path) post_norm_tiled.save(post_norm_stitched_path) + + +# occupancy statistic helpers +def visualize_occupancy_statistics( + cell_table: pd.DataFrame, save_dir: Union[str, pathlib.Path]max_image_size: int = 2048, + tiles_per_row_col: int = 8, positive_threshold: int = 5 +): + """Define occupancy statistics over a cohort, and visualize distribution of positive + tile counts across FOV. 
+ + Args: + cell_table (pd.DataFrame): + The cell table associated with the cohort, need to contain all FOVs + save_dir (Union[str, pathlib.Path]): + Directory to save the visualizations in + max_image_size (int): + The max image size to load in + tiles_per_row_col (int): + The row/col dims of tiles to define over each image + positive_threshold (int): + The cell count in a tile required for positivity + """ + # get all the FOV names + fov_names: np.ndarray = cell_table["fov"].unique() + + # define the tile size in pixels + tile_size: int = max_image_size // tiles_per_row_col + + # define the occupancy_stats array + occupancy_stats: xr.DataArray = xr.DataArray( + np.zeros((len(fov_names), tile_size, tile_size), dtype=np.int16), + coords=[fov_names, np.zeros(tile_size), np.zeros(tile_size)], + dims=["fov", "x", "y"] + ) + + # for each FOV, update the corresponding tile count + for fov in fov_names: + cell_table_fov: pd.DataFrame = cell_table[cell_table["fov"] == fov].copy() + cell_table_fov["tile_row"] = cell_table_fov["centroid-1"] // tile_size + cell_table_row["tile_col"] = cell_table_fov["centroid-0"] // tile_size + fov_occupancy_table: pd.DataFrame = cell_table_fov.groupby( + ["tile_row", "tile_col"] + ).size().reset_index(name="occupancy") + occupancy_stats.loc[ + fov, fov_occupancy_table["tile_row"].values, fov_occupancy_table["tile_col"].values + ] += fov_occupancy_table["occupancy"].values + + # define the tiles that are positive based on threshold + occupancy_stats_positivity: xr.DataArray = occupancy_stats > positive_threshold + + # count number of positive tiles per FOV + fov_positive_tiles: Dict[str, int] = occupancy_stats_positivity.sum( + dim=["tile_row", "tile_col"] + ).to_dict()["data"] + + # 1. visualize histogram of positive tile counts across cohort + fig, axs = plt.subplots( + 3, + 1, + figsize=(10, 10) + ) + axs[0].hist(list(fov_positive_tiles.values())) + + # 2. visualize heatmap of average positivity of tiles across FOVs + cmap_positivity = mcolors.LinearSegmentedColormap.from_list("", ["white", "red"]) + c_positivity = axs[1].imshow( + occupancy_stats_positivity.mean(dim="fov").values, + cmap=cmap_positivity, + aspect="auto", + origin="lower" + ) + for x, y in itertools.pairwise(occupancy_stats_positivity.tile_row.values) + axs[1].text(y, x, f"R{x}C{y}", ha="center", va="center", color="black") + fig.colorbar(c_positivity, ax=axs[1]) + + # 3. 
visualize heatmap of average number of counts across FOVs + cmap_counts = mcolors.LinearSegmentedColormap.from_list("", ["white", "red"]) + c_counts = axs[2].imshow( + occupancy_stats.mean(dim="fov").values, + cmap=cmap_counts, + aspect="auto" + ) + for x, y in itertools.pairwise(occupancy_stats.tile_row.values) + axs[2].text(y, x, f"R{x}C{y}", ha="center", va="center", color="black") + fig.colorbar(c_positivity, ax=axs[2]) + + # save the figure to save_dir + fig.savefig( + pathlib.Path(save_dir) / f"occupancy_stats_ts{tiles_per_row_col}_pt{positive_threshold}.png" + dpi=300 + ) From 73e9692587ec066a3e6ebc6d461055e9c75377d0 Mon Sep 17 00:00:00 2001 From: alex-l-kong Date: Wed, 14 Feb 2024 13:50:50 -0800 Subject: [PATCH 02/14] Missed a comma --- python_files/supplementary_plot_helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python_files/supplementary_plot_helpers.py b/python_files/supplementary_plot_helpers.py index 5eb12cb..d9add96 100644 --- a/python_files/supplementary_plot_helpers.py +++ b/python_files/supplementary_plot_helpers.py @@ -343,7 +343,7 @@ def stitch_before_after_norm( # occupancy statistic helpers def visualize_occupancy_statistics( - cell_table: pd.DataFrame, save_dir: Union[str, pathlib.Path]max_image_size: int = 2048, + cell_table: pd.DataFrame, save_dir: Union[str, pathlib.Path], max_image_size: int = 2048, tiles_per_row_col: int = 8, positive_threshold: int = 5 ): """Define occupancy statistics over a cohort, and visualize distribution of positive From 8400c3c9a99faf4374e46c89bc0e6f987534a197 Mon Sep 17 00:00:00 2001 From: alex-l-kong Date: Wed, 14 Feb 2024 13:51:09 -0800 Subject: [PATCH 03/14] Missed a few colons --- python_files/supplementary_plot_helpers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python_files/supplementary_plot_helpers.py b/python_files/supplementary_plot_helpers.py index d9add96..a3d1b63 100644 --- a/python_files/supplementary_plot_helpers.py +++ b/python_files/supplementary_plot_helpers.py @@ -410,7 +410,7 @@ def visualize_occupancy_statistics( aspect="auto", origin="lower" ) - for x, y in itertools.pairwise(occupancy_stats_positivity.tile_row.values) + for x, y in itertools.pairwise(occupancy_stats_positivity.tile_row.values): axs[1].text(y, x, f"R{x}C{y}", ha="center", va="center", color="black") fig.colorbar(c_positivity, ax=axs[1]) @@ -421,7 +421,7 @@ def visualize_occupancy_statistics( cmap=cmap_counts, aspect="auto" ) - for x, y in itertools.pairwise(occupancy_stats.tile_row.values) + for x, y in itertools.pairwise(occupancy_stats.tile_row.values): axs[2].text(y, x, f"R{x}C{y}", ha="center", va="center", color="black") fig.colorbar(c_positivity, ax=axs[2]) From fab6b08d741fc9a6f367d9945d309a0925397a2b Mon Sep 17 00:00:00 2001 From: alex-l-kong Date: Wed, 14 Feb 2024 13:52:09 -0800 Subject: [PATCH 04/14] Forgot a comma --- python_files/supplementary_plot_helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python_files/supplementary_plot_helpers.py b/python_files/supplementary_plot_helpers.py index a3d1b63..40644f5 100644 --- a/python_files/supplementary_plot_helpers.py +++ b/python_files/supplementary_plot_helpers.py @@ -427,6 +427,6 @@ def visualize_occupancy_statistics( # save the figure to save_dir fig.savefig( - pathlib.Path(save_dir) / f"occupancy_stats_ts{tiles_per_row_col}_pt{positive_threshold}.png" + pathlib.Path(save_dir) / f"occupancy_stats_{tiles_per_row_col}_{positive_threshold}.png", dpi=300 ) From dd271fc53e3f9bdc9f2e4f5ef46e4b6e217da0b2 Mon Sep 
17 00:00:00 2001 From: alex-l-kong Date: Thu, 15 Feb 2024 13:11:34 -0800 Subject: [PATCH 05/14] More efficient occupancy statistic calculations, and don't overlay tile names on heatmap --- python_files/generate_supplementary_plots.py | 821 ++++++++++--------- python_files/supplementary_plot_helpers.py | 69 +- 2 files changed, 455 insertions(+), 435 deletions(-) diff --git a/python_files/generate_supplementary_plots.py b/python_files/generate_supplementary_plots.py index dcf3134..1215dc1 100644 --- a/python_files/generate_supplementary_plots.py +++ b/python_files/generate_supplementary_plots.py @@ -6,7 +6,7 @@ import pandas as pd import matplotlib from ark.utils.plot_utils import cohort_cluster_plot -from toffy import qc_comp, qc_metrics_plots +# from toffy import qc_comp, qc_metrics_plots from alpineer import io_utils @@ -24,411 +24,411 @@ SUPPLEMENTARY_FIG_DIR = "/Volumes/Shared/Noah Greenwald/TONIC_Cohort/supplementary_figs" -# Panel validation - - -# ROI selection - - -# QC -qc_metrics = ["Non-zero mean intensity"] -channel_exclude = ["chan_39", "chan_45", "CD11c_nuc_exclude", "CD11c_nuc_exclude_update", - "FOXP3_nuc_include", "FOXP3_nuc_include_update", "CK17_smoothed", - "FOXP3_nuc_exclude_update", "chan_48", "chan_141", "chan_115", "LAG3"] - -## FOV spatial location -cohort_path = "/Volumes/Shared/Noah Greenwald/TONIC_Cohort/image_data/samples" -qc_tma_metrics_dir = "/Volumes/Shared/Noah Greenwald/TONIC_Acquisition/qc_metrics/qc_tma_metrics" -if not os.path.exists(qc_tma_metrics_dir): - os.makedirs(qc_tma_metrics_dir) - -fovs = io_utils.list_folders(cohort_path) -tmas = list(set([fov.split('_R')[0] for fov in fovs])) - -qc_tmas = qc_comp.QCTMA( - qc_metrics=qc_metrics, - cohort_path=cohort_path, - metrics_dir=qc_tma_metrics_dir, -) - -qc_tmas.compute_qc_tma_metrics(tmas=tmas) -qc_tmas.qc_tma_metrics_zscore(tmas=tmas, channel_exclude=channel_exclude) -qc_metrics_plots.qc_tmas_metrics_plot(qc_tmas=qc_tmas, tmas=tmas, save_figure=True, dpi=300) - -## longitudinal controls -control_path = "/Volumes/Shared/Noah Greenwald/TONIC_Cohort/image_data/controls" -qc_control_metrics_dir = "/Volumes/Shared/Noah Greenwald/TONIC_Acquisition/qc_metrics/qc_longitudinal_control" -if not os.path.exists(qc_control_metrics_dir): - os.makedirs(qc_control_metrics_dir) - -folders = io_utils.list_folders(control_path, "TMA3_") -control_substrs = [name.split("_")[2] + '_' + name.split("_")[3] if len(name.split("_")) == 4 - else name.split("_")[2] + '_' + name.split("_")[3]+'_' + name.split("_")[4] - for name in folders] - -all_folders = io_utils.list_folders(control_path) -for i, control in enumerate(control_substrs): - control_sample_name = control - print(control) - if control == 'tonsil_bottom': - fovs = [folder for folder in all_folders if control in folder and len(folder) <= 25] - else: - fovs = [folder for folder in all_folders if control in folder] - - qc_control = qc_comp.QCControlMetrics( - qc_metrics=qc_metrics, - cohort_path=control_path, - metrics_dir=qc_control_metrics_dir, - ) - - qc_control.compute_control_qc_metrics( - control_sample_name=control_sample_name, - fovs=fovs, - channel_exclude=channel_exclude, - channel_include=None, - ) - - qc_metrics_plots.longitudinal_control_heatmap( - qc_control=qc_control, control_sample_name=control_sample_name, save_figure=True, dpi=300 - ) - -dfs = [] -for control in control_substrs: - df = pd.read_csv(os.path.join(qc_control_metrics_dir, f"{control}_combined_nonzero_mean_stats.csv")) - df['fov'] = [i.replace('_' + control, '') for i in list(df['fov'])] - 
log2_norm_df: pd.DataFrame = df.pivot( - index="channel", columns="fov", values="Non-zero mean intensity" - ).transform(func=lambda row: np.log2(row / row.mean()), axis=1) - if control != 'tonsil_bottom_duplicate1': - dup_col = [col for col in log2_norm_df.columns if 'duplicate1' in col] - log2_norm_df = log2_norm_df.drop(columns=dup_col) if dup_col else log2_norm_df - - mean_t_df: pd.DataFrame = ( - log2_norm_df.mean(axis=0) - .to_frame(name="mean") - .transpose() - .sort_values(by="mean", axis=1) - ) - transformed_df: pd.DataFrame = pd.concat( - objs=[log2_norm_df, mean_t_df] - ).sort_values(by="mean", axis=1, inplace=False) - transformed_df.rename_axis("channel", axis=0, inplace=True) - transformed_df.rename_axis("fov", axis=1, inplace=True) - - dfs.append(transformed_df) -all_data = pd.concat(dfs).replace([np.inf, -np.inf], 0, inplace=True) -all_data = all_data.groupby(['channel']).mean() -all_data = all_data.sort_values(by="mean", axis=1, inplace=False).round(2) - - -fig = plt.figure(figsize=(12,12), dpi=300) -fig.set_layout_engine(layout="constrained") -gs = gridspec.GridSpec(nrows=2, ncols=1, figure=fig, height_ratios=[len(all_data.index) - 1, 1]) -_norm = Normalize(vmin=-1, vmax=1) -_cmap = sns.color_palette("vlag", as_cmap=True) -fig.suptitle(f"Average per TMA - QC: Non-zero Mean Intensity ") - -annotation_kws = { - "horizontalalignment": "center", - "verticalalignment": "center", - "fontsize": 8, -} - -ax_heatmap = fig.add_subplot(gs[0, 0]) -sns.heatmap( - data=all_data[~all_data.index.isin(["mean"])], - ax=ax_heatmap, - linewidths=1, - linecolor="black", - cbar_kws={"shrink": 0.5}, - annot=True, - annot_kws=annotation_kws, - xticklabels=False, - norm=_norm, - cmap=_cmap, -) - -ax_heatmap.collections[0].colorbar.ax.set_title(r"$\log_2(QC)$") -ax_heatmap.set_yticks( - ticks=ax_heatmap.get_yticks(), - labels=ax_heatmap.get_yticklabels(), - rotation=0, -) -ax_heatmap.set_xlabel(None) - -ax_avg = fig.add_subplot(gs[1, 0]) -sns.heatmap( - data=all_data[all_data.index.isin(["mean"])], - ax=ax_avg, - linewidths=1, - linecolor="black", - annot=True, - annot_kws=annotation_kws, - fmt=".2f", - cmap=ListedColormap(["white"]), - cbar=False, -) -ax_avg.set_yticks( - ticks=ax_avg.get_yticks(), - labels=["Mean"], - rotation=0, -) -ax_avg.set_xticks( - ticks=ax_avg.get_xticks(), - labels=ax_avg.get_xticklabels(), - rotation=45, - ha="right", - rotation_mode="anchor", -) -ax_heatmap.set_ylabel("Channel") -ax_avg.set_xlabel("FOV") - -fig.savefig(fname=os.path.join(qc_control_metrics_dir, "figures/log2_avgs.png"), dpi=300, - bbox_inches="tight") - - -# Image processing -## show a run with images stitched in acquisition order pre- and post-normalization -acquisition_order_viz_dir = os.path.join(SUPPLEMENTARY_FIG_DIR, "acquisition_order") -if not os.path.exists(acquisition_order_viz_dir): - os.makedirs(acquisition_order_viz_dir) - -run_name = "2022-01-14_TONIC_TMA2_run1" -pre_norm_dir = "/Volumes/Shared/Noah Greenwald/TONIC_Acquisition/rosetta" -post_norm_dir = "/Volumes/Shared/Noah Greenwald/TONIC_Acquisition/normalized" -save_dir = "/Volumes/Shared/Noah Greenwald/TONIC_Cohort/supplementary_figs" - -# NOTE: image not scaled up, this happens in Photoshop -supplementary_plot_helpers.stitch_before_after_norm( - pre_norm_dir, post_norm_dir, acquisition_order_viz_dir, run_name, - "H3K9ac", pre_norm_subdir="normalized", padding=0, step=1 -) - - -# Cell identification and classification -cell_table = pd.read_csv('/Volumes/Shared/Noah Greenwald/TONIC_Cohort/analysis_files/cell_table_clusters.csv') 
-cluster_order = {'Cancer': 0, 'Cancer_EMT': 1, 'Cancer_Other': 2, 'CD4T': 3, 'CD8T': 4, 'Treg': 5, - 'T_Other': 6, 'B': 7, 'NK': 8, 'M1_Mac': 9, 'M2_Mac': 10, 'Mac_Other': 11, - 'Monocyte': 12, 'APC': 13, 'Mast': 14, 'Neutrophil': 15, 'Fibroblast': 16, - 'Stroma': 17, 'Endothelium': 18, 'Other': 19, 'Immune_Other': 20} -cell_table = cell_table.sort_values(by=['cell_cluster'], key=lambda x: x.map(cluster_order)) - -save_dir = '/Volumes/Shared/Noah Greenwald/TONIC_Cohort/supplementary_figs' - -## cell cluster counts -sns.histplot(data=cell_table, x="cell_cluster") -sns.despine() -plt.title("Cell Cluster Counts") -plt.xlabel("Cell Cluster") -plt.xticks(rotation=75) -plt.tight_layout() -plt.savefig(os.path.join(save_dir, "cells_per_cluster.png"), dpi=300) - -## fov cell counts -cluster_counts = np.unique(cell_table.fov, return_counts=True)[1] -plt.figure(figsize=(8, 6)) -g = sns.histplot(data=cluster_counts, kde=True) -sns.despine() -plt.title("Histogram of Cell Counts per Image") -plt.xlabel("Number of Cells in an Image") -plt.tight_layout() -plt.savefig(os.path.join(save_dir, "cells_per_fov.png"), dpi=300) - -## cell type composition by tissue location of met -meta_data = pd.read_csv('/Volumes/Shared/Noah Greenwald/TONIC_Cohort/analysis_files/harmonized_metadata.csv') -meta_data = meta_data[['fov', 'Patient_ID', 'Timepoint', 'Localization']] - -all_data = cell_table.merge(meta_data, on=['fov'], how='left') -base_data = all_data[all_data.Timepoint == 'baseline'] - -all_locals = np.unique(base_data.Localization) -dfs = [] -for region in all_locals: - localization_data = base_data[base_data.Localization == region] - - df = localization_data.groupby("cell_cluster_broad").count().reset_index() - df = df.set_index('cell_cluster_broad').transpose() - sub_df = df.iloc[:1].reset_index(drop=True) - sub_df.insert(0, "Localization", [region]) - sub_df['Localization'] = sub_df['Localization'].map(str) - sub_df = sub_df.set_index('Localization') - - dfs.append(sub_df) -prop_data = pd.concat(dfs).transform(func=lambda row: row / row.sum(), axis=1) - -color_map = {'cell_cluster_broad': ['Cancer', 'Stroma', 'Mono_Mac', 'T','Other', 'Granulocyte', 'NK', 'B'], - 'color': ['dimgrey', 'darksalmon', 'red', 'navajowhite', 'yellowgreen', 'aqua', 'dodgerblue', 'darkviolet']} -prop_data = prop_data[color_map['cell_cluster_broad']] - -colors = color_map['color'] -prop_data.plot(kind='bar', stacked=True, color=colors) -plt.ticklabel_format(style='plain', useOffset=False, axis='y') -plt.gca().set_ylabel("Cell Proportions") -plt.gca().set_xlabel("Tissue Location") -plt.xticks(rotation=30) -plt.title("Cell Type Composition by Tissue Location") -plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.) 
-plt.tight_layout() -plt.savefig(os.path.join(save_dir, "cell_props_by_tissue_loc.png"), dpi=300) - -## colored cell cluster masks from random subset of 20 FOVs -random.seed(13) -seg_dir = '/Volumes/Shared/Noah Greenwald/TONIC_Cohort/segmentation_data/deepcell_output' - -all_fovs = list(cell_table['fov'].unique()) -fovs = random.sample(all_fovs, 20) -cell_table_subset = cell_table[cell_table.fov.isin(fovs)] - -cohort_cluster_plot( - fovs=fovs, - seg_dir=seg_dir, - save_dir=save_dir, - cell_data=cell_table_subset, - erode=True, - fov_col='fov', - label_col='label', - cluster_col='cell_cluster_broad', - seg_suffix="_whole_cell.tiff", - cmap=color_map, - display_fig=False, -) - -# Functional marker thresholding -cell_table = pd.read_csv( - os.path.join(ANALYSIS_DIR, "combined_cell_table_normalized_cell_labels_updated.csv") -) -functional_marker_viz_dir = os.path.join(SUPPLEMENTARY_FIG_DIR, "functional_marker_dist_thresholds") -if not os.path.exists(functional_marker_viz_dir): - os.makedirs(functional_marker_viz_dir) - -marker_info = { - "Ki67": { - "populations": ["Cancer", "Mast"], - "threshold": 0.002, - "x_range": (0, 0.012), - "x_ticks": np.array([0, 0.004, 0.008, 0.012]), - "x_tick_labels": np.array([0, 0.004, 0.008, 0.012]), - }, - "CD38": { - "populations": ["Endothelium", "Cancer_EMT"], - "threshold": 0.004, - "x_range": (0, 0.02), - "x_ticks": np.array([0, 0.005, 0.01, 0.015, 0.02]), - "x_tick_labels": np.array([0, 0.005, 0.01, 0.015, 0.02]), - }, - "CD45RB": { - "populations": ["CD4T", "Stroma"], - "threshold": 0.001, - "x_range": (0, 0.015), - "x_ticks": np.array([0, 0.005, 0.010, 0.015]), - "x_tick_labels": np.array([0, 0.005, 0.010, 0.015]) - }, - "CD45RO": { - "populations": ["CD4T", "Fibroblast"], - "threshold": 0.002, - "x_range": (0, 0.02), - "x_ticks": np.array([0, 0.005, 0.01, 0.015, 0.02]), - "x_tick_labels": np.array([0, 0.005, 0.01, 0.015, 0.02]) - }, - "CD57": { - "populations": ["CD8T", "B"], - "threshold": 0.002, - "x_range": (0, 0.006), - "x_ticks": np.array([0, 0.002, 0.004, 0.006]), - "x_tick_labels": np.array([0, 0.002, 0.004, 0.006]) - }, - "CD69": { - "populations": ["Treg", "Cancer"], - "threshold": 0.002, - "x_range": (0, 0.008), - "x_ticks": np.array([0, 0.002, 0.004, 0.006, 0.008]), - "x_tick_labels": np.array([0, 0.002, 0.004, 0.006, 0.008]) - }, - "GLUT1": { - "populations": ["Cancer_EMT", "M2_Mac"], - "threshold": 0.002, - "x_range": (0, 0.02), - "x_ticks": np.array([0, 0.005, 0.01, 0.015, 0.02]), - "x_tick_labels": np.array([0, 0.005, 0.01, 0.015, 0.02]) - }, - "IDO": { - "populations": ["APC", "M1_Mac"], - "threshold": 0.001, - "x_range": (0, 0.003), - "x_ticks": np.array([0, 0.001, 0.002, 0.003]), - "x_tick_labels": np.array([0, 0.001, 0.002, 0.003]) - }, - "PD1": { - "populations": ["CD8T", "Stroma"], - "threshold": 0.0005, - "x_range": (0, 0.002), - "x_ticks": np.array([0, 0.0005, 0.001, 0.0015, 0.002]), - "x_tick_labels": np.array([0, 0.0005, 0.001, 0.0015, 0.002]) - }, - "PDL1": { - "populations": ["Cancer", "Stroma"], - "threshold": 0.001, - "x_range": (0, 0.003), - "x_ticks": np.array([0, 0.001, 0.002, 0.003]), - "x_tick_labels": np.array([0, 0.001, 0.002, 0.003]), - }, - "HLA1": { - "populations": ["APC", "Stroma"], - "threshold": 0.001, - "x_range": (0, 0.025), - "x_ticks": np.array([0, 0.0125, 0.025]), - "x_tick_labels": np.array([0, 0.0125, 0.025]) - }, - "HLADR": { - "populations": ["APC", "Neutrophil"], - "threshold": 0.001, - "x_range": (0, 0.025), - "x_ticks": np.array([0, 0.0125, 0.025]), - "x_tick_labels": np.array([0, 0.0125, 0.025]) - 
}, - "TBET": { - "populations": ["NK", "B"], - "threshold": 0.0015, - "x_range": (0, 0.0045), - "x_ticks": np.array([0, 0.0015, 0.003, 0.0045]), - "x_tick_labels": np.array([0, 0.0015, 0.003, 0.0045]) - }, - "TCF1": { - "populations": ["CD4T", "M1_Mac"], - "threshold": 0.001, - "x_range": (0, 0.003), - "x_ticks": np.array([0, 0.001, 0.002, 0.003]), - "x_tick_labels": np.array([0, 0.001, 0.002, 0.003]) - }, - "TIM3": { - "populations": ["Monocyte", "Endothelium"], - "threshold": 0.001, - "x_range": (0, 0.004), - "x_ticks": np.array([0, 0.001, 0.002, 0.003, 0.004]), - "x_tick_labels": np.array([0, 0.001, 0.002, 0.003, 0.004]) - }, - "Vim": { - "populations": ["Endothelium", "B"], - "threshold": 0.002, - "x_range": (0, 0.06), - "x_ticks": np.array([0, 0.02, 0.04, 0.06]), - "x_tick_labels": np.array([0, 0.02, 0.04, 0.06]) - }, - "Fe": { - "populations": ["Fibroblast", "Cancer"], - "threshold": 0.1, - "x_range": (0, 0.3), - "x_ticks": np.array([0, 0.1, 0.2, 0.3]), - "x_tick_labels": np.array([0, 0.1, 0.2, 0.3]), - } -} -supplementary_plot_helpers.functional_marker_thresholding( - cell_table, functional_marker_viz_dir, marker_info=marker_info, - figsize=(20, 40) -) - - -# Feature extraction +# # Panel validation + + +# # ROI selection + + +# # QC +# qc_metrics = ["Non-zero mean intensity"] +# channel_exclude = ["chan_39", "chan_45", "CD11c_nuc_exclude", "CD11c_nuc_exclude_update", +# "FOXP3_nuc_include", "FOXP3_nuc_include_update", "CK17_smoothed", +# "FOXP3_nuc_exclude_update", "chan_48", "chan_141", "chan_115", "LAG3"] + +# ## FOV spatial location +# cohort_path = "/Volumes/Shared/Noah Greenwald/TONIC_Cohort/image_data/samples" +# qc_tma_metrics_dir = "/Volumes/Shared/Noah Greenwald/TONIC_Acquisition/qc_metrics/qc_tma_metrics" +# if not os.path.exists(qc_tma_metrics_dir): +# os.makedirs(qc_tma_metrics_dir) + +# fovs = io_utils.list_folders(cohort_path) +# tmas = list(set([fov.split('_R')[0] for fov in fovs])) + +# qc_tmas = qc_comp.QCTMA( +# qc_metrics=qc_metrics, +# cohort_path=cohort_path, +# metrics_dir=qc_tma_metrics_dir, +# ) + +# qc_tmas.compute_qc_tma_metrics(tmas=tmas) +# qc_tmas.qc_tma_metrics_zscore(tmas=tmas, channel_exclude=channel_exclude) +# qc_metrics_plots.qc_tmas_metrics_plot(qc_tmas=qc_tmas, tmas=tmas, save_figure=True, dpi=300) + +# ## longitudinal controls +# control_path = "/Volumes/Shared/Noah Greenwald/TONIC_Cohort/image_data/controls" +# qc_control_metrics_dir = "/Volumes/Shared/Noah Greenwald/TONIC_Acquisition/qc_metrics/qc_longitudinal_control" +# if not os.path.exists(qc_control_metrics_dir): +# os.makedirs(qc_control_metrics_dir) + +# folders = io_utils.list_folders(control_path, "TMA3_") +# control_substrs = [name.split("_")[2] + '_' + name.split("_")[3] if len(name.split("_")) == 4 +# else name.split("_")[2] + '_' + name.split("_")[3]+'_' + name.split("_")[4] +# for name in folders] + +# all_folders = io_utils.list_folders(control_path) +# for i, control in enumerate(control_substrs): +# control_sample_name = control +# print(control) +# if control == 'tonsil_bottom': +# fovs = [folder for folder in all_folders if control in folder and len(folder) <= 25] +# else: +# fovs = [folder for folder in all_folders if control in folder] + +# qc_control = qc_comp.QCControlMetrics( +# qc_metrics=qc_metrics, +# cohort_path=control_path, +# metrics_dir=qc_control_metrics_dir, +# ) + +# qc_control.compute_control_qc_metrics( +# control_sample_name=control_sample_name, +# fovs=fovs, +# channel_exclude=channel_exclude, +# channel_include=None, +# ) + +# 
qc_metrics_plots.longitudinal_control_heatmap( +# qc_control=qc_control, control_sample_name=control_sample_name, save_figure=True, dpi=300 +# ) + +# dfs = [] +# for control in control_substrs: +# df = pd.read_csv(os.path.join(qc_control_metrics_dir, f"{control}_combined_nonzero_mean_stats.csv")) +# df['fov'] = [i.replace('_' + control, '') for i in list(df['fov'])] +# log2_norm_df: pd.DataFrame = df.pivot( +# index="channel", columns="fov", values="Non-zero mean intensity" +# ).transform(func=lambda row: np.log2(row / row.mean()), axis=1) +# if control != 'tonsil_bottom_duplicate1': +# dup_col = [col for col in log2_norm_df.columns if 'duplicate1' in col] +# log2_norm_df = log2_norm_df.drop(columns=dup_col) if dup_col else log2_norm_df + +# mean_t_df: pd.DataFrame = ( +# log2_norm_df.mean(axis=0) +# .to_frame(name="mean") +# .transpose() +# .sort_values(by="mean", axis=1) +# ) +# transformed_df: pd.DataFrame = pd.concat( +# objs=[log2_norm_df, mean_t_df] +# ).sort_values(by="mean", axis=1, inplace=False) +# transformed_df.rename_axis("channel", axis=0, inplace=True) +# transformed_df.rename_axis("fov", axis=1, inplace=True) + +# dfs.append(transformed_df) +# all_data = pd.concat(dfs).replace([np.inf, -np.inf], 0, inplace=True) +# all_data = all_data.groupby(['channel']).mean() +# all_data = all_data.sort_values(by="mean", axis=1, inplace=False).round(2) + + +# fig = plt.figure(figsize=(12,12), dpi=300) +# fig.set_layout_engine(layout="constrained") +# gs = gridspec.GridSpec(nrows=2, ncols=1, figure=fig, height_ratios=[len(all_data.index) - 1, 1]) +# _norm = Normalize(vmin=-1, vmax=1) +# _cmap = sns.color_palette("vlag", as_cmap=True) +# fig.suptitle(f"Average per TMA - QC: Non-zero Mean Intensity ") + +# annotation_kws = { +# "horizontalalignment": "center", +# "verticalalignment": "center", +# "fontsize": 8, +# } + +# ax_heatmap = fig.add_subplot(gs[0, 0]) +# sns.heatmap( +# data=all_data[~all_data.index.isin(["mean"])], +# ax=ax_heatmap, +# linewidths=1, +# linecolor="black", +# cbar_kws={"shrink": 0.5}, +# annot=True, +# annot_kws=annotation_kws, +# xticklabels=False, +# norm=_norm, +# cmap=_cmap, +# ) + +# ax_heatmap.collections[0].colorbar.ax.set_title(r"$\log_2(QC)$") +# ax_heatmap.set_yticks( +# ticks=ax_heatmap.get_yticks(), +# labels=ax_heatmap.get_yticklabels(), +# rotation=0, +# ) +# ax_heatmap.set_xlabel(None) + +# ax_avg = fig.add_subplot(gs[1, 0]) +# sns.heatmap( +# data=all_data[all_data.index.isin(["mean"])], +# ax=ax_avg, +# linewidths=1, +# linecolor="black", +# annot=True, +# annot_kws=annotation_kws, +# fmt=".2f", +# cmap=ListedColormap(["white"]), +# cbar=False, +# ) +# ax_avg.set_yticks( +# ticks=ax_avg.get_yticks(), +# labels=["Mean"], +# rotation=0, +# ) +# ax_avg.set_xticks( +# ticks=ax_avg.get_xticks(), +# labels=ax_avg.get_xticklabels(), +# rotation=45, +# ha="right", +# rotation_mode="anchor", +# ) +# ax_heatmap.set_ylabel("Channel") +# ax_avg.set_xlabel("FOV") + +# fig.savefig(fname=os.path.join(qc_control_metrics_dir, "figures/log2_avgs.png"), dpi=300, +# bbox_inches="tight") + + +# # Image processing +# ## show a run with images stitched in acquisition order pre- and post-normalization +# acquisition_order_viz_dir = os.path.join(SUPPLEMENTARY_FIG_DIR, "acquisition_order") +# if not os.path.exists(acquisition_order_viz_dir): +# os.makedirs(acquisition_order_viz_dir) + +# run_name = "2022-01-14_TONIC_TMA2_run1" +# pre_norm_dir = "/Volumes/Shared/Noah Greenwald/TONIC_Acquisition/rosetta" +# post_norm_dir = "/Volumes/Shared/Noah 
Greenwald/TONIC_Acquisition/normalized" +# save_dir = "/Volumes/Shared/Noah Greenwald/TONIC_Cohort/supplementary_figs" + +# # NOTE: image not scaled up, this happens in Photoshop +# supplementary_plot_helpers.stitch_before_after_norm( +# pre_norm_dir, post_norm_dir, acquisition_order_viz_dir, run_name, +# "H3K9ac", pre_norm_subdir="normalized", padding=0, step=1 +# ) + + +# # Cell identification and classification +# cell_table = pd.read_csv('/Volumes/Shared/Noah Greenwald/TONIC_Cohort/analysis_files/cell_table_clusters.csv') +# cluster_order = {'Cancer': 0, 'Cancer_EMT': 1, 'Cancer_Other': 2, 'CD4T': 3, 'CD8T': 4, 'Treg': 5, +# 'T_Other': 6, 'B': 7, 'NK': 8, 'M1_Mac': 9, 'M2_Mac': 10, 'Mac_Other': 11, +# 'Monocyte': 12, 'APC': 13, 'Mast': 14, 'Neutrophil': 15, 'Fibroblast': 16, +# 'Stroma': 17, 'Endothelium': 18, 'Other': 19, 'Immune_Other': 20} +# cell_table = cell_table.sort_values(by=['cell_cluster'], key=lambda x: x.map(cluster_order)) + +# save_dir = '/Volumes/Shared/Noah Greenwald/TONIC_Cohort/supplementary_figs' + +# ## cell cluster counts +# sns.histplot(data=cell_table, x="cell_cluster") +# sns.despine() +# plt.title("Cell Cluster Counts") +# plt.xlabel("Cell Cluster") +# plt.xticks(rotation=75) +# plt.tight_layout() +# plt.savefig(os.path.join(save_dir, "cells_per_cluster.png"), dpi=300) + +# ## fov cell counts +# cluster_counts = np.unique(cell_table.fov, return_counts=True)[1] +# plt.figure(figsize=(8, 6)) +# g = sns.histplot(data=cluster_counts, kde=True) +# sns.despine() +# plt.title("Histogram of Cell Counts per Image") +# plt.xlabel("Number of Cells in an Image") +# plt.tight_layout() +# plt.savefig(os.path.join(save_dir, "cells_per_fov.png"), dpi=300) + +# ## cell type composition by tissue location of met +# meta_data = pd.read_csv('/Volumes/Shared/Noah Greenwald/TONIC_Cohort/analysis_files/harmonized_metadata.csv') +# meta_data = meta_data[['fov', 'Patient_ID', 'Timepoint', 'Localization']] + +# all_data = cell_table.merge(meta_data, on=['fov'], how='left') +# base_data = all_data[all_data.Timepoint == 'baseline'] + +# all_locals = np.unique(base_data.Localization) +# dfs = [] +# for region in all_locals: +# localization_data = base_data[base_data.Localization == region] + +# df = localization_data.groupby("cell_cluster_broad").count().reset_index() +# df = df.set_index('cell_cluster_broad').transpose() +# sub_df = df.iloc[:1].reset_index(drop=True) +# sub_df.insert(0, "Localization", [region]) +# sub_df['Localization'] = sub_df['Localization'].map(str) +# sub_df = sub_df.set_index('Localization') + +# dfs.append(sub_df) +# prop_data = pd.concat(dfs).transform(func=lambda row: row / row.sum(), axis=1) + +# color_map = {'cell_cluster_broad': ['Cancer', 'Stroma', 'Mono_Mac', 'T','Other', 'Granulocyte', 'NK', 'B'], +# 'color': ['dimgrey', 'darksalmon', 'red', 'navajowhite', 'yellowgreen', 'aqua', 'dodgerblue', 'darkviolet']} +# prop_data = prop_data[color_map['cell_cluster_broad']] + +# colors = color_map['color'] +# prop_data.plot(kind='bar', stacked=True, color=colors) +# plt.ticklabel_format(style='plain', useOffset=False, axis='y') +# plt.gca().set_ylabel("Cell Proportions") +# plt.gca().set_xlabel("Tissue Location") +# plt.xticks(rotation=30) +# plt.title("Cell Type Composition by Tissue Location") +# plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.) 
+# plt.tight_layout() +# plt.savefig(os.path.join(save_dir, "cell_props_by_tissue_loc.png"), dpi=300) + +# ## colored cell cluster masks from random subset of 20 FOVs +# random.seed(13) +# seg_dir = '/Volumes/Shared/Noah Greenwald/TONIC_Cohort/segmentation_data/deepcell_output' + +# all_fovs = list(cell_table['fov'].unique()) +# fovs = random.sample(all_fovs, 20) +# cell_table_subset = cell_table[cell_table.fov.isin(fovs)] + +# cohort_cluster_plot( +# fovs=fovs, +# seg_dir=seg_dir, +# save_dir=save_dir, +# cell_data=cell_table_subset, +# erode=True, +# fov_col='fov', +# label_col='label', +# cluster_col='cell_cluster_broad', +# seg_suffix="_whole_cell.tiff", +# cmap=color_map, +# display_fig=False, +# ) + +# # Functional marker thresholding +# cell_table = pd.read_csv( +# os.path.join(ANALYSIS_DIR, "combined_cell_table_normalized_cell_labels_updated.csv") +# ) +# functional_marker_viz_dir = os.path.join(SUPPLEMENTARY_FIG_DIR, "functional_marker_dist_thresholds") +# if not os.path.exists(functional_marker_viz_dir): +# os.makedirs(functional_marker_viz_dir) + +# marker_info = { +# "Ki67": { +# "populations": ["Cancer", "Mast"], +# "threshold": 0.002, +# "x_range": (0, 0.012), +# "x_ticks": np.array([0, 0.004, 0.008, 0.012]), +# "x_tick_labels": np.array([0, 0.004, 0.008, 0.012]), +# }, +# "CD38": { +# "populations": ["Endothelium", "Cancer_EMT"], +# "threshold": 0.004, +# "x_range": (0, 0.02), +# "x_ticks": np.array([0, 0.005, 0.01, 0.015, 0.02]), +# "x_tick_labels": np.array([0, 0.005, 0.01, 0.015, 0.02]), +# }, +# "CD45RB": { +# "populations": ["CD4T", "Stroma"], +# "threshold": 0.001, +# "x_range": (0, 0.015), +# "x_ticks": np.array([0, 0.005, 0.010, 0.015]), +# "x_tick_labels": np.array([0, 0.005, 0.010, 0.015]) +# }, +# "CD45RO": { +# "populations": ["CD4T", "Fibroblast"], +# "threshold": 0.002, +# "x_range": (0, 0.02), +# "x_ticks": np.array([0, 0.005, 0.01, 0.015, 0.02]), +# "x_tick_labels": np.array([0, 0.005, 0.01, 0.015, 0.02]) +# }, +# "CD57": { +# "populations": ["CD8T", "B"], +# "threshold": 0.002, +# "x_range": (0, 0.006), +# "x_ticks": np.array([0, 0.002, 0.004, 0.006]), +# "x_tick_labels": np.array([0, 0.002, 0.004, 0.006]) +# }, +# "CD69": { +# "populations": ["Treg", "Cancer"], +# "threshold": 0.002, +# "x_range": (0, 0.008), +# "x_ticks": np.array([0, 0.002, 0.004, 0.006, 0.008]), +# "x_tick_labels": np.array([0, 0.002, 0.004, 0.006, 0.008]) +# }, +# "GLUT1": { +# "populations": ["Cancer_EMT", "M2_Mac"], +# "threshold": 0.002, +# "x_range": (0, 0.02), +# "x_ticks": np.array([0, 0.005, 0.01, 0.015, 0.02]), +# "x_tick_labels": np.array([0, 0.005, 0.01, 0.015, 0.02]) +# }, +# "IDO": { +# "populations": ["APC", "M1_Mac"], +# "threshold": 0.001, +# "x_range": (0, 0.003), +# "x_ticks": np.array([0, 0.001, 0.002, 0.003]), +# "x_tick_labels": np.array([0, 0.001, 0.002, 0.003]) +# }, +# "PD1": { +# "populations": ["CD8T", "Stroma"], +# "threshold": 0.0005, +# "x_range": (0, 0.002), +# "x_ticks": np.array([0, 0.0005, 0.001, 0.0015, 0.002]), +# "x_tick_labels": np.array([0, 0.0005, 0.001, 0.0015, 0.002]) +# }, +# "PDL1": { +# "populations": ["Cancer", "Stroma"], +# "threshold": 0.001, +# "x_range": (0, 0.003), +# "x_ticks": np.array([0, 0.001, 0.002, 0.003]), +# "x_tick_labels": np.array([0, 0.001, 0.002, 0.003]), +# }, +# "HLA1": { +# "populations": ["APC", "Stroma"], +# "threshold": 0.001, +# "x_range": (0, 0.025), +# "x_ticks": np.array([0, 0.0125, 0.025]), +# "x_tick_labels": np.array([0, 0.0125, 0.025]) +# }, +# "HLADR": { +# "populations": ["APC", "Neutrophil"], +# "threshold": 
0.001, +# "x_range": (0, 0.025), +# "x_ticks": np.array([0, 0.0125, 0.025]), +# "x_tick_labels": np.array([0, 0.0125, 0.025]) +# }, +# "TBET": { +# "populations": ["NK", "B"], +# "threshold": 0.0015, +# "x_range": (0, 0.0045), +# "x_ticks": np.array([0, 0.0015, 0.003, 0.0045]), +# "x_tick_labels": np.array([0, 0.0015, 0.003, 0.0045]) +# }, +# "TCF1": { +# "populations": ["CD4T", "M1_Mac"], +# "threshold": 0.001, +# "x_range": (0, 0.003), +# "x_ticks": np.array([0, 0.001, 0.002, 0.003]), +# "x_tick_labels": np.array([0, 0.001, 0.002, 0.003]) +# }, +# "TIM3": { +# "populations": ["Monocyte", "Endothelium"], +# "threshold": 0.001, +# "x_range": (0, 0.004), +# "x_ticks": np.array([0, 0.001, 0.002, 0.003, 0.004]), +# "x_tick_labels": np.array([0, 0.001, 0.002, 0.003, 0.004]) +# }, +# "Vim": { +# "populations": ["Endothelium", "B"], +# "threshold": 0.002, +# "x_range": (0, 0.06), +# "x_ticks": np.array([0, 0.02, 0.04, 0.06]), +# "x_tick_labels": np.array([0, 0.02, 0.04, 0.06]) +# }, +# "Fe": { +# "populations": ["Fibroblast", "Cancer"], +# "threshold": 0.1, +# "x_range": (0, 0.3), +# "x_ticks": np.array([0, 0.1, 0.2, 0.3]), +# "x_tick_labels": np.array([0, 0.1, 0.2, 0.3]), +# } +# } +# supplementary_plot_helpers.functional_marker_thresholding( +# cell_table, functional_marker_viz_dir, marker_info=marker_info, +# figsize=(20, 40) +# ) + + +# # Feature extraction # Occupancy statistics @@ -440,10 +440,15 @@ if not os.path.exists(occupancy_stats_viz_dir): os.makedirs(occupancy_stats_viz_dir) +# supplementary_plot_helpers.visualize_occupancy_statistics( +# cell_table, occupancy_stats_viz_dir +# ) + # massive GridSearch -for tiles_per_row_col in [2, 4, 8, 16, 32, 64, 128]: - for positive_threshold in np.arange(1, 26): +for tiles_per_row_col in [32, 64, 128, 256, 512]: + for positive_threshold in [1, 2, 5, 10]: supplementary_plot_helpers.visualize_occupancy_statistics( cell_table, occupancy_stats_viz_dir, tiles_per_row_col=tiles_per_row_col, positive_threshold=positive_threshold, ) + print(f"Generated plot for # tiles {tiles_per_row_col} and threshold {positive_threshold}") diff --git a/python_files/supplementary_plot_helpers.py b/python_files/supplementary_plot_helpers.py index 40644f5..ce8ed62 100644 --- a/python_files/supplementary_plot_helpers.py +++ b/python_files/supplementary_plot_helpers.py @@ -344,7 +344,7 @@ def stitch_before_after_norm( # occupancy statistic helpers def visualize_occupancy_statistics( cell_table: pd.DataFrame, save_dir: Union[str, pathlib.Path], max_image_size: int = 2048, - tiles_per_row_col: int = 8, positive_threshold: int = 5 + tiles_per_row_col: int = 8, positive_threshold: int = 20 ): """Define occupancy statistics over a cohort, and visualize distribution of positive tile counts across FOV. 
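A note on the second hunk below: the rewritten fill still walks occupancy_counts row by row with iterrows, even though its comment says "without explicit Python loops". A minimal sketch of a genuinely vectorized fill, assuming occupancy_counts, fov_names, and tiles_per_row_col exactly as this patch defines them (fill_occupancy_array is a hypothetical helper, not part of the PR):

import numpy as np
import pandas as pd

def fill_occupancy_array(
    occupancy_counts: pd.DataFrame, fov_names: np.ndarray, tiles_per_row_col: int
) -> np.ndarray:
    # map each FOV name to its row index in the output array
    fov_index = {fov: i for i, fov in enumerate(fov_names)}
    counts = np.zeros(
        (len(fov_names), tiles_per_row_col, tiles_per_row_col), dtype=np.int16
    )
    # scatter every (fov, tile_row, tile_col) count in one advanced-indexing
    # assignment; assumes tile_row/tile_col already fall inside the grid
    counts[
        occupancy_counts["fov"].map(fov_index).to_numpy(),
        occupancy_counts["tile_row"].to_numpy(),
        occupancy_counts["tile_col"].to_numpy(),
    ] = occupancy_counts["occupancy"].to_numpy()
    return counts

The resulting array can be handed to the same xr.DataArray constructor the hunk uses, so only the iterrows loop changes.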
@@ -367,32 +367,34 @@ def visualize_occupancy_statistics( # define the tile size in pixels tile_size: int = max_image_size // tiles_per_row_col + cell_table["tile_row"] = (cell_table["centroid-1"] // tile_size).astype(int) + cell_table["tile_col"] = (cell_table["centroid-0"] // tile_size).astype(int) + + # Group by FOV and tile positions, then count occupancy + occupancy_counts: pd.DataFrame = cell_table.groupby( + ["fov", "tile_row", "tile_col"] + ).size().reset_index(name="occupancy") + # define the occupancy_stats array - occupancy_stats: xr.DataArray = xr.DataArray( - np.zeros((len(fov_names), tile_size, tile_size), dtype=np.int16), - coords=[fov_names, np.zeros(tile_size), np.zeros(tile_size)], + occupancy_stats = xr.DataArray( + np.zeros((len(fov_names), tiles_per_row_col, tiles_per_row_col), dtype=np.int16), + coords=[fov_names, np.arange(tiles_per_row_col), np.arange(tiles_per_row_col)], dims=["fov", "x", "y"] ) - # for each FOV, update the corresponding tile count - for fov in fov_names: - cell_table_fov: pd.DataFrame = cell_table[cell_table["fov"] == fov].copy() - cell_table_fov["tile_row"] = cell_table_fov["centroid-1"] // tile_size - cell_table_row["tile_col"] = cell_table_fov["centroid-0"] // tile_size - fov_occupancy_table: pd.DataFrame = cell_table_fov.groupby( - ["tile_row", "tile_col"] - ).size().reset_index(name="occupancy") - occupancy_stats.loc[ - fov, fov_occupancy_table["tile_row"].values, fov_occupancy_table["tile_col"].values - ] += fov_occupancy_table["occupancy"].values + # Update the DataArray based on occupancy_counts without explicit Python loops + for index, row in occupancy_counts.iterrows(): + occupancy_stats.loc[row["fov"], row["tile_row"], row["tile_col"]] = row["occupancy"] # define the tiles that are positive based on threshold occupancy_stats_positivity: xr.DataArray = occupancy_stats > positive_threshold # count number of positive tiles per FOV - fov_positive_tiles: Dict[str, int] = occupancy_stats_positivity.sum( - dim=["tile_row", "tile_col"] - ).to_dict()["data"] + fov_positive_tiles: xr.DataArray = occupancy_stats_positivity.sum(dim=["x", "y"]) + print(fov_positive_tiles) + fov_positive_tile_counts: Dict[str, int] = dict( + zip(fov_positive_tiles.fov.values, fov_positive_tiles.values) + ) # 1. visualize histogram of positive tile counts across cohort fig, axs = plt.subplots( @@ -400,7 +402,9 @@ def visualize_occupancy_statistics( 1, figsize=(10, 10) ) - axs[0].hist(list(fov_positive_tiles.values())) + print(list(fov_positive_tile_counts.values())) + axs[0].hist(list(fov_positive_tile_counts.values())) + axs[0].set_title("Distribution of tile positivity") # 2. visualize heatmap of average positivity of tiles across FOVs cmap_positivity = mcolors.LinearSegmentedColormap.from_list("", ["white", "red"]) @@ -408,22 +412,33 @@ def visualize_occupancy_statistics( occupancy_stats_positivity.mean(dim="fov").values, cmap=cmap_positivity, aspect="auto", - origin="lower" + vmin=0, + vmax=1 ) - for x, y in itertools.pairwise(occupancy_stats_positivity.tile_row.values): - axs[1].text(y, x, f"R{x}C{y}", ha="center", va="center", color="black") - fig.colorbar(c_positivity, ax=axs[1]) + axs[1].set_xticks([]) + axs[1].set_yticks([]) + axs[1].set_title("Mean positivity per tile") + fig.colorbar(c_positivity, ax=axs[1], ticks=[0, 0.25, 0.5, 0.75, 1]) # 3. 
visualize heatmap of average number of counts across FOVs cmap_counts = mcolors.LinearSegmentedColormap.from_list("", ["white", "red"]) + max_occupancy_stat = np.round(np.max(occupancy_stats.mean(dim="fov").values), 2) c_counts = axs[2].imshow( occupancy_stats.mean(dim="fov").values, cmap=cmap_counts, - aspect="auto" + aspect="auto", + vmin=0, + vmax=max_occupancy_stat + ) + axs[2].set_xticks([]) + axs[2].set_yticks([]) + axs[2].set_title("Mean num cells per tile") + fig.colorbar(c_counts, ax=axs[2], ticks=np.round(np.linspace(0, max_occupancy_stat, 4), 2)) + + fig.suptitle( + f"Stats for # tiles: {tiles_per_row_col}, positive threshold: {positive_threshold}", + fontsize=24 ) - for x, y in itertools.pairwise(occupancy_stats.tile_row.values): - axs[2].text(y, x, f"R{x}C{y}", ha="center", va="center", color="black") - fig.colorbar(c_positivity, ax=axs[2]) # save the figure to save_dir fig.savefig( From fc56d8eed020f3c4aec8540a0410d295ba69ce63 Mon Sep 17 00:00:00 2001 From: alex-l-kong Date: Tue, 27 Feb 2024 11:14:25 -0800 Subject: [PATCH 06/14] Update grid sizes to grid search over: don't need to be too granular --- python_files/generate_supplementary_plots.py | 4 ++-- python_files/supplementary_plot_helpers.py | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/python_files/generate_supplementary_plots.py b/python_files/generate_supplementary_plots.py index 1215dc1..4822c14 100644 --- a/python_files/generate_supplementary_plots.py +++ b/python_files/generate_supplementary_plots.py @@ -445,8 +445,8 @@ # ) # massive GridSearch -for tiles_per_row_col in [32, 64, 128, 256, 512]: - for positive_threshold in [1, 2, 5, 10]: +for tiles_per_row_col in [4, 8, 16]: + for positive_threshold in [10, 25, 50, 100, 200, 500]: supplementary_plot_helpers.visualize_occupancy_statistics( cell_table, occupancy_stats_viz_dir, tiles_per_row_col=tiles_per_row_col, positive_threshold=positive_threshold, diff --git a/python_files/supplementary_plot_helpers.py b/python_files/supplementary_plot_helpers.py index ce8ed62..ba77dac 100644 --- a/python_files/supplementary_plot_helpers.py +++ b/python_files/supplementary_plot_helpers.py @@ -391,7 +391,6 @@ def visualize_occupancy_statistics( # count number of positive tiles per FOV fov_positive_tiles: xr.DataArray = occupancy_stats_positivity.sum(dim=["x", "y"]) - print(fov_positive_tiles) fov_positive_tile_counts: Dict[str, int] = dict( zip(fov_positive_tiles.fov.values, fov_positive_tiles.values) ) From be8083df0b284aee4d4748f47b87033b605b5488 Mon Sep 17 00:00:00 2001 From: alex-l-kong Date: Fri, 1 Mar 2024 12:07:28 -0800 Subject: [PATCH 07/14] Save the occupancy statistics for faster loading --- python_files/generate_supplementary_plots.py | 55 +++++++++++--- python_files/supplementary_plot_helpers.py | 75 ++++++++++++++++++++ 2 files changed, 122 insertions(+), 8 deletions(-) diff --git a/python_files/generate_supplementary_plots.py b/python_files/generate_supplementary_plots.py index 4822c14..4ec8be4 100644 --- a/python_files/generate_supplementary_plots.py +++ b/python_files/generate_supplementary_plots.py @@ -8,6 +8,7 @@ from ark.utils.plot_utils import cohort_cluster_plot # from toffy import qc_comp, qc_metrics_plots from alpineer import io_utils +from alpineer.load_utils import load_imgs_from_tree matplotlib.rcParams['pdf.fonttype'] = 42 @@ -21,6 +22,7 @@ import supplementary_plot_helpers ANALYSIS_DIR = "/Volumes/Shared/Noah Greenwald/TONIC_Cohort/analysis_files" +IMAGE_DIR = "/Volumes/Shared/Noah Greenwald/TONIC_Cohort/image_data/samples" 
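# A sketch outside this patch: the hunk further down sizes every FOV by loading a
# full CD20 channel image through load_imgs_from_tree, which is why the merged cell
# table is then cached to CSV. If tifffile is available (an assumption; this PR does
# not import it), reading just the TIFF header recovers the same per-FOV pixel size
# without decoding the image:
import os
import tifffile

def fov_pixel_size(image_dir: str, fov: str, channel: str = "CD20") -> int:
    # pages[0].shape is (rows, cols); rows matches the .shape[1] lookup used below
    with tifffile.TiffFile(os.path.join(image_dir, fov, f"{channel}.tiff")) as tif:
        return tif.pages[0].shape[0]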
SUPPLEMENTARY_FIG_DIR = "/Volumes/Shared/Noah Greenwald/TONIC_Cohort/supplementary_figs" @@ -436,19 +438,56 @@ cell_table = pd.read_csv( os.path.join(ANALYSIS_DIR, "combined_cell_table_normalized_cell_labels_updated.csv") ) + occupancy_stats_viz_dir = os.path.join(SUPPLEMENTARY_FIG_DIR, "occupancy_stats") if not os.path.exists(occupancy_stats_viz_dir): os.makedirs(occupancy_stats_viz_dir) -# supplementary_plot_helpers.visualize_occupancy_statistics( -# cell_table, occupancy_stats_viz_dir -# ) +# because not all of the images are of the same size, make sure to append that as a feature +# NOTE: this is expensive to create, load in a previously-generated version if it exists +if os.path.exists(os.path.join(occupancy_stats_viz_dir, "cell_table_with_pixel_size.csv")): + cell_table = pd.read_csv( + os.path.join(occupancy_stats_viz_dir, "cell_table_with_pixel_size.csv") + ) +else: + cell_table = pd.read_csv( + os.path.join(ANALYSIS_DIR, "combined_cell_table_normalized_cell_labels_updated.csv") + ) + fov_sizes = { + fov: load_imgs_from_tree( + IMAGE_DIR, img_sub_folder=None, fovs=[fov], channels=["CD20"] + ).shape[1] + for fov in list(cell_table["fov"].unique()) + } + fov_sizes = pd.DataFrame( + {"fov": list(fov_sizes.keys()), "fov_pixel_size": list(fov_sizes.values())} + ) + + cell_table = cell_table.merge(fov_sizes, on="fov") + cell_table.to_csv( + os.path.join(occupancy_stats_viz_dir, "cell_table_with_pixel_size.csv"), + index=False + ) # massive GridSearch +total_occupancy_stats_df = pd.DataFrame() for tiles_per_row_col in [4, 8, 16]: - for positive_threshold in [10, 25, 50, 100, 200, 500]: - supplementary_plot_helpers.visualize_occupancy_statistics( - cell_table, occupancy_stats_viz_dir, tiles_per_row_col=tiles_per_row_col, - positive_threshold=positive_threshold, + for positive_threshold in [5, 10, 15, 20]: + occupancy_stats = supplementary_plot_helpers.compute_occupancy_statistics( + cell_table, pop_subset=["CD4T", "CD8T", "Treg", "T_Other"], + tiles_per_row_col=tiles_per_row_col, positive_threshold=positive_threshold + ) + occupancy_stats_df = pd.DataFrame( + { + "fov": list(occupancy_stats.keys()), + "percent_positive_tiles": list(occupancy_stats.values()) + } ) - print(f"Generated plot for # tiles {tiles_per_row_col} and threshold {positive_threshold}") + occupancy_stats_df["num_tiles"] = tiles_per_row_col ** 2 + occupancy_stats_df["positive_threshold"] = positive_threshold + + total_occupancy_stats_df = pd.concat([total_occupancy_stats_df, occupancy_stats_df]) + +total_occupancy_stats_df.to_csv( + os.path.join(occupancy_stats_viz_dir, "occupancy_stats_trials_t_cells_updated.csv"), index=False +) diff --git a/python_files/supplementary_plot_helpers.py b/python_files/supplementary_plot_helpers.py index ba77dac..7868c90 100644 --- a/python_files/supplementary_plot_helpers.py +++ b/python_files/supplementary_plot_helpers.py @@ -341,6 +341,81 @@ def stitch_before_after_norm( post_norm_tiled.save(post_norm_stitched_path) +def compute_occupancy_statistics( + cell_table: pd.DataFrame, pop_col: str = "cell_cluster", + pop_subset: Optional[List[str]] = None, tiles_per_row_col: int = 8, + positive_threshold: int = 20 +): + """Compute the occupancy statistics over a cohort. + + Args: + cell_table (pd.DataFrame): + The cell table associated with the cohort, need to contain all FOVs as well as + a column indicating the size of the FOV. + pop_col (str): + Column containing the names of the cell populations + pop_subset (Optional[List[str]]): + Which populations, if any, to subset on. 
If None, use all populations + tiles_per_row_col (int): + The row/col dims of tiles to define over each image + positive_threshold (int): + The cell count in a tile required for positivity + + Returns: + Dict[str, float]: + A dictionary mapping each FOV to the percentage of tiles above the positive threshold + for number of cells. + """ + # get all the FOV names + fov_names: np.ndarray = cell_table["fov"].unique() + + # if a population subset is specified, first validate then truncate the cell table accordingly + if pop_subset: + verify_in_list( + specified_populations=pop_subset, + valid_populations=cell_table[pop_col].unique() + ) + + cell_table = cell_table[cell_table[pop_col].isin(pop_subset)].copy() + + # define the tile size in pixels for each FOV + cell_table["tile_size"] = cell_table["fov_pixel_size"] // tiles_per_row_col + + # define the tile for each cell + cell_table["tile_row"] = (cell_table["centroid-1"] // cell_table["tile_size"]).astype(int) + cell_table["tile_col"] = (cell_table["centroid-0"] // cell_table["tile_size"]).astype(int) + + # Group by FOV and tile positions, then count occupancy + occupancy_counts: pd.DataFrame = cell_table.groupby( + ["fov", "tile_row", "tile_col"] + ).size().reset_index(name="occupancy") + + # define the occupancy_stats array + occupancy_stats = xr.DataArray( + np.zeros((len(fov_names), tiles_per_row_col, tiles_per_row_col), dtype=np.int16), + coords=[fov_names, np.arange(tiles_per_row_col), np.arange(tiles_per_row_col)], + dims=["fov", "x", "y"] + ) + + # Update the DataArray based on occupancy_counts without explicit Python loops + for index, row in occupancy_counts.iterrows(): + occupancy_stats.loc[row["fov"], row["tile_row"], row["tile_col"]] = row["occupancy"] + + # define the tiles that are positive based on threshold + occupancy_stats_positivity: xr.DataArray = occupancy_stats > positive_threshold + + # compute the percentage of positive tiles for each FOV + occupancy_stats_sum: xr.DataArray = occupancy_stats_positivity.sum( + dim=['x', 'y'] + ) / (tiles_per_row_col ** 2) + + # convert to dict + occupancy_stats_dict: Dict[str, float] = occupancy_stats_sum.to_series().to_dict() + occupancy_stats_dict = {fov: percentage for fov, percentage in occupancy_stats_dict.items()} + + return occupancy_stats_dict + + # occupancy statistic helpers def visualize_occupancy_statistics( cell_table: pd.DataFrame, save_dir: Union[str, pathlib.Path], max_image_size: int = 2048, From e26583eb805a40aa8395766497cc20e741ca676b Mon Sep 17 00:00:00 2001 From: alex-l-kong Date: Tue, 5 Mar 2024 11:39:29 -0800 Subject: [PATCH 08/14] File naming updates --- python_files/generate_supplementary_plots.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python_files/generate_supplementary_plots.py b/python_files/generate_supplementary_plots.py index 4ec8be4..0fe0b9f 100644 --- a/python_files/generate_supplementary_plots.py +++ b/python_files/generate_supplementary_plots.py @@ -489,5 +489,5 @@ total_occupancy_stats_df = pd.concat([total_occupancy_stats_df, occupancy_stats_df]) total_occupancy_stats_df.to_csv( - os.path.join(occupancy_stats_viz_dir, "occupancy_stats_trials_t_cells_updated.csv"), index=False + os.path.join(occupancy_stats_viz_dir, "occupancy_stats_trials_t_cells.csv"), index=False ) From dbd50c7da29fa9bb55d3eebc3964b4080c4ad188 Mon Sep 17 00:00:00 2001 From: alex-l-kong Date: Wed, 6 Mar 2024 11:44:43 -0800 Subject: [PATCH 09/14] Updated plotting --- python_files/supplementary_plot_helpers.py | 71 +++++++++++++++++++++- 1 file changed, 70 
insertions(+), 1 deletion(-) diff --git a/python_files/supplementary_plot_helpers.py b/python_files/supplementary_plot_helpers.py index 8ebbeef..d02f78e 100644 --- a/python_files/supplementary_plot_helpers.py +++ b/python_files/supplementary_plot_helpers.py @@ -410,6 +410,7 @@ def stitch_before_after_norm( post_norm_tiled.save(post_norm_stitched_path) +# occupancy statistic helpers def compute_occupancy_statistics( cell_table: pd.DataFrame, pop_col: str = "cell_cluster", pop_subset: Optional[List[str]] = None, tiles_per_row_col: int = 8, @@ -485,8 +486,76 @@ def compute_occupancy_statistics( return occupancy_stats_dict -# occupancy statistic helpers def visualize_occupancy_statistics( + occupancy_stats_table: pd.DataFrame, save_dir: Union[str, pathlib.Path], + pop_subset: Optional[List[str]] = None, figsize: Optional[Tuple[float, float]] = (10, 10) +): + """Visualize the distribution of the percentage of tiles above a positive threshold. + + Data needs to be generated by compute_occupancy_statistics. + + Args: + occupancy_stats_table (pd.DataFrame): + The table generated by compute_occupancy_statistics, lists the percentage of positive + tiles for each image of the cohort at different positivity thresholds and grid sizes + save_dir (Union[str, pathlib.Path]): + Directory to save the visualizations in + pop_subset (Optional[List[str]]): + Which populations, if any, to subset on. If None, use all populations + fig_size (Optional[Tuple[float, float]]): + The figure size to use for the image. + """ + # generate each unique test pair + tiles_threshold_trials = occupancy_stats_table[ + ["num_tiles", "positive_threshold"] + ].drop_duplicates().reset_index(drop=True) + + # define a separate plot on the same grid for each test pair + fig, axs = plt.subplots( + tiles_threshold_trials.shape[0], 1, figsize=figsize + ) + populations_str = f"{', '.join(pop_subset)} populations" if pop_subset else "all populations" + # fig.suptitle(f"Percentage positive tile distributions for {populations_str}", fontsize=24) + # fig.subplots_adjust(top=1.50) + + # define an index to iterate through the subplots + subplot_i = 0 + + for _, trial in tiles_threshold_trials.iterrows(): + grid_size = int(np.sqrt(trial["num_tiles"])) + positive_threshold = trial["positive_threshold"] + + # subset data obtained just for the specified trial + trial_data = occupancy_stats_table[ + (occupancy_stats_table["num_tiles"] == trial["num_tiles"]) & + (occupancy_stats_table["positive_threshold"] == trial["positive_threshold"]) + ] + + # visualize the distribution using a histogram + positive_tile_percentages = trial_data["percent_positive_tiles"].values + axs[subplot_i].hist( + positive_tile_percentages, + facecolor="g", + bins=20, + alpha=0.75 + ) + axs[subplot_i].set_title( + f"Grid size: {grid_size}x{grid_size}, Positive threshold: {positive_threshold}" + ) + + # update the subplot index + subplot_i += 1 + + plt.tight_layout() + + # save the figure to save_dir + fig.savefig( + pathlib.Path(save_dir) / f"occupancy_statistic_distributions.png", + dpi=300 + ) + + +def visualize_occupancy_statistics_old( cell_table: pd.DataFrame, save_dir: Union[str, pathlib.Path], max_image_size: int = 2048, tiles_per_row_col: int = 8, positive_threshold: int = 20 ): From c82a02f7877ab794177cb2b773b1d0f91e479fe5 Mon Sep 17 00:00:00 2001 From: alex-l-kong Date: Wed, 6 Mar 2024 11:44:56 -0800 Subject: [PATCH 10/14] Comment out code that doesn't need to be run --- python_files/generate_supplementary_plots.py | 1109 +++++++++--------- 1 file changed, 559 
insertions(+), 550 deletions(-) diff --git a/python_files/generate_supplementary_plots.py b/python_files/generate_supplementary_plots.py index 97855ff..fdcf24a 100644 --- a/python_files/generate_supplementary_plots.py +++ b/python_files/generate_supplementary_plots.py @@ -26,535 +26,535 @@ SUPPLEMENTARY_FIG_DIR = "/Volumes/Shared/Noah Greenwald/TONIC_Cohort/supplementary_figs" -# Panel validation -panel_validation_viz_dir = os.path.join(SUPPLEMENTARY_FIG_DIR, "panel_validation") -if not os.path.exists(panel_validation_viz_dir): - os.makedirs(panel_validation_viz_dir) - -controls_dir = "/Volumes/Shared/Noah Greenwald/TONIC_Cohort/image_data/controls" -controls_fov = "TONIC_TMA1_colon_bottom" -supplementary_plot_helpers.validate_panel( - controls_dir, controls_fov, panel_validation_viz_dir, font_size=180 -) - -samples_dir = "/Volumes/Shared/Noah Greenwald/TONIC_Cohort/image_data/samples" -samples_fov = "TONIC_TMA2_R5C4" -samples_channels = sorted(io_utils.remove_file_extensions( - io_utils.list_files(os.path.join(samples_dir, samples_fov), substrs=".tiff") -)) -exclude_chans = ["CD11c_nuc_exclude", "CK17_smoothed", "ECAD_smoothed", "FOXP3_nuc_include", - "chan_39", "chan_45", "chan_48", "chan_115", "chan_141"] -for ec in exclude_chans: - if ec in samples_channels: - samples_channels.remove(ec) -supplementary_plot_helpers.validate_panel( - samples_dir, samples_fov, panel_validation_viz_dir, channels=samples_channels, font_size=320 -) - - -# ROI selection - - -# QC -qc_metrics = ["Non-zero mean intensity"] -channel_exclude = ["chan_39", "chan_45", "CD11c_nuc_exclude", "CD11c_nuc_exclude_update", - "FOXP3_nuc_include", "FOXP3_nuc_include_update", "CK17_smoothed", - "FOXP3_nuc_exclude_update", "chan_48", "chan_141", "chan_115", "LAG3"] - -## FOV spatial location -cohort_path = "/Volumes/Shared/Noah Greenwald/TONIC_Cohort/image_data/samples" -qc_tma_metrics_dir = "/Volumes/Shared/Noah Greenwald/TONIC_Acquisition/qc_metrics/qc_tma_metrics" -if not os.path.exists(qc_tma_metrics_dir): - os.makedirs(qc_tma_metrics_dir) - -fovs = io_utils.list_folders(cohort_path) -tmas = list(set([fov.split('_R')[0] for fov in fovs])) - -qc_tmas = qc_comp.QCTMA( - qc_metrics=qc_metrics, - cohort_path=cohort_path, - metrics_dir=qc_tma_metrics_dir, -) - -qc_tmas.compute_qc_tma_metrics(tmas=tmas) -qc_tmas.qc_tma_metrics_zscore(tmas=tmas, channel_exclude=channel_exclude) -qc_metrics_plots.qc_tmas_metrics_plot(qc_tmas=qc_tmas, tmas=tmas, save_figure=True, dpi=300) - -## longitudinal controls -control_path = "/Volumes/Shared/Noah Greenwald/TONIC_Cohort/image_data/controls" -qc_control_metrics_dir = "/Volumes/Shared/Noah Greenwald/TONIC_Acquisition/qc_metrics/qc_longitudinal_control" -if not os.path.exists(qc_control_metrics_dir): - os.makedirs(qc_control_metrics_dir) - -folders = io_utils.list_folders(control_path, "TMA3_") -control_substrs = [name.split("_")[2] + '_' + name.split("_")[3] if len(name.split("_")) == 4 - else name.split("_")[2] + '_' + name.split("_")[3]+'_' + name.split("_")[4] - for name in folders] - -all_folders = io_utils.list_folders(control_path) -for i, control in enumerate(control_substrs): - control_sample_name = control - print(control) - if control == 'tonsil_bottom': - fovs = [folder for folder in all_folders if control in folder and len(folder) <= 25] - else: - fovs = [folder for folder in all_folders if control in folder] - - qc_control = qc_comp.QCControlMetrics( - qc_metrics=qc_metrics, - cohort_path=control_path, - metrics_dir=qc_control_metrics_dir, - ) - - 
qc_control.compute_control_qc_metrics( - control_sample_name=control_sample_name, - fovs=fovs, - channel_exclude=channel_exclude, - channel_include=None, - ) - - qc_metrics_plots.longitudinal_control_heatmap( - qc_control=qc_control, control_sample_name=control_sample_name, save_figure=True, dpi=300 - ) - -dfs = [] -for control in control_substrs: - df = pd.read_csv(os.path.join(qc_control_metrics_dir, f"{control}_combined_nonzero_mean_stats.csv")) - df['fov'] = [i.replace('_' + control, '') for i in list(df['fov'])] - log2_norm_df: pd.DataFrame = df.pivot( - index="channel", columns="fov", values="Non-zero mean intensity" - ).transform(func=lambda row: np.log2(row / row.mean()), axis=1) - if control != 'tonsil_bottom_duplicate1': - dup_col = [col for col in log2_norm_df.columns if 'duplicate1' in col] - log2_norm_df = log2_norm_df.drop(columns=dup_col) if dup_col else log2_norm_df - - mean_t_df: pd.DataFrame = ( - log2_norm_df.mean(axis=0) - .to_frame(name="mean") - .transpose() - .sort_values(by="mean", axis=1) - ) - transformed_df: pd.DataFrame = pd.concat( - objs=[log2_norm_df, mean_t_df] - ).sort_values(by="mean", axis=1, inplace=False) - transformed_df.rename_axis("channel", axis=0, inplace=True) - transformed_df.rename_axis("fov", axis=1, inplace=True) - - dfs.append(transformed_df) -all_data = pd.concat(dfs).replace([np.inf, -np.inf], 0, inplace=True) -all_data = all_data.groupby(['channel']).mean() -all_data = all_data.sort_values(by="mean", axis=1, inplace=False).round(2) - - -fig = plt.figure(figsize=(12,12), dpi=300) -fig.set_layout_engine(layout="constrained") -gs = gridspec.GridSpec(nrows=2, ncols=1, figure=fig, height_ratios=[len(all_data.index) - 1, 1]) -_norm = Normalize(vmin=-1, vmax=1) -_cmap = sns.color_palette("vlag", as_cmap=True) -fig.suptitle(f"Average per TMA - QC: Non-zero Mean Intensity ") - -annotation_kws = { - "horizontalalignment": "center", - "verticalalignment": "center", - "fontsize": 8, -} - -ax_heatmap = fig.add_subplot(gs[0, 0]) -sns.heatmap( - data=all_data[~all_data.index.isin(["mean"])], - ax=ax_heatmap, - linewidths=1, - linecolor="black", - cbar_kws={"shrink": 0.5}, - annot=True, - annot_kws=annotation_kws, - xticklabels=False, - norm=_norm, - cmap=_cmap, -) - -ax_heatmap.collections[0].colorbar.ax.set_title(r"$\log_2(QC)$") -ax_heatmap.set_yticks( - ticks=ax_heatmap.get_yticks(), - labels=ax_heatmap.get_yticklabels(), - rotation=0, -) -ax_heatmap.set_xlabel(None) - -ax_avg = fig.add_subplot(gs[1, 0]) -sns.heatmap( - data=all_data[all_data.index.isin(["mean"])], - ax=ax_avg, - linewidths=1, - linecolor="black", - annot=True, - annot_kws=annotation_kws, - fmt=".2f", - cmap=ListedColormap(["white"]), - cbar=False, -) -ax_avg.set_yticks( - ticks=ax_avg.get_yticks(), - labels=["Mean"], - rotation=0, -) -ax_avg.set_xticks( - ticks=ax_avg.get_xticks(), - labels=ax_avg.get_xticklabels(), - rotation=45, - ha="right", - rotation_mode="anchor", -) -ax_heatmap.set_ylabel("Channel") -ax_avg.set_xlabel("FOV") - -fig.savefig(fname=os.path.join(qc_control_metrics_dir, "figures/log2_avgs.png"), dpi=300, - bbox_inches="tight") - - -# Image processing -## show a run with images stitched in acquisition order pre- and post-normalization -acquisition_order_viz_dir = os.path.join(SUPPLEMENTARY_FIG_DIR, "acquisition_order") -if not os.path.exists(acquisition_order_viz_dir): - os.makedirs(acquisition_order_viz_dir) - -run_name = "2022-01-14_TONIC_TMA2_run1" -pre_norm_dir = "/Volumes/Shared/Noah Greenwald/TONIC_Acquisition/rosetta" -post_norm_dir = "/Volumes/Shared/Noah 
Greenwald/TONIC_Acquisition/normalized" -save_dir = "/Volumes/Shared/Noah Greenwald/TONIC_Cohort/supplementary_figs" - -# NOTE: image not scaled up, this happens in Photoshop -supplementary_plot_helpers.stitch_before_after_norm( - pre_norm_dir, post_norm_dir, acquisition_order_viz_dir, run_name, - "H3K9ac", pre_norm_subdir="normalized", padding=0, step=1 -) - - -# Cell identification and classification -cell_table = pd.read_csv('/Volumes/Shared/Noah Greenwald/TONIC_Cohort/analysis_files/cell_table_clusters.csv') -cluster_order = {'Cancer': 0, 'Cancer_EMT': 1, 'Cancer_Other': 2, 'CD4T': 3, 'CD8T': 4, 'Treg': 5, - 'T_Other': 6, 'B': 7, 'NK': 8, 'M1_Mac': 9, 'M2_Mac': 10, 'Mac_Other': 11, - 'Monocyte': 12, 'APC': 13, 'Mast': 14, 'Neutrophil': 15, 'Fibroblast': 16, - 'Stroma': 17, 'Endothelium': 18, 'Other': 19, 'Immune_Other': 20} -cell_table = cell_table.sort_values(by=['cell_cluster'], key=lambda x: x.map(cluster_order)) - -save_dir = '/Volumes/Shared/Noah Greenwald/TONIC_Cohort/supplementary_figs' - -## cell cluster counts -sns.histplot(data=cell_table, x="cell_cluster") -sns.despine() -plt.title("Cell Cluster Counts") -plt.xlabel("Cell Cluster") -plt.xticks(rotation=75) -plt.tight_layout() -plt.savefig(os.path.join(save_dir, "cells_per_cluster.png"), dpi=300) - -## fov cell counts -cluster_counts = np.unique(cell_table.fov, return_counts=True)[1] -plt.figure(figsize=(8, 6)) -g = sns.histplot(data=cluster_counts, kde=True) -sns.despine() -plt.title("Histogram of Cell Counts per Image") -plt.xlabel("Number of Cells in an Image") -plt.tight_layout() -plt.savefig(os.path.join(save_dir, "cells_per_fov.png"), dpi=300) - -## cell type composition by tissue location of met and timepoint -meta_data = pd.read_csv('/Volumes/Shared/Noah Greenwald/TONIC_Cohort/analysis_files/harmonized_metadata.csv') -meta_data = meta_data[['fov', 'Patient_ID', 'Timepoint', 'Localization']] - -all_data = cell_table.merge(meta_data, on=['fov'], how='left') - -for metric in ['Localization', 'Timepoint']: - data = all_data[all_data.Timepoint == 'baseline'] if metric == 'Localization' else all_data - - groups = np.unique(data.Localization) if metric == 'Localization' else \ - ['primary', 'baseline', 'post_induction', 'on_nivo'] - dfs = [] - for group in groups: - sub_data = data[data[metric] == group] - - df = sub_data.groupby("cell_cluster_broad").count().reset_index() - df = df.set_index('cell_cluster_broad').transpose() - sub_df = df.iloc[:1].reset_index(drop=True) - sub_df.insert(0, metric, [group]) - sub_df[metric] = sub_df[metric].map(str) - sub_df = sub_df.set_index(metric) - - dfs.append(sub_df) - prop_data = pd.concat(dfs).transform(func=lambda row: row / row.sum(), axis=1) - - color_map = {'Cancer': 'dimgrey', 'Stroma': 'darksalmon', 'T': 'navajowhite', - 'Mono_Mac': 'red', 'B': 'darkviolet', 'Other': 'yellowgreen', - 'Granulocyte': 'aqua', 'NK': 'dodgerblue'} - - means = prop_data.mean(axis=0).reset_index() - means = means.sort_values(by=[0], ascending=False) - prop_data = prop_data[means.cell_cluster_broad] - - colors = [color_map[cluster] for cluster in means.cell_cluster_broad] - prop_data.plot(kind='bar', stacked=True, color=colors) - sns.despine() - plt.ticklabel_format(style='plain', useOffset=False, axis='y') - plt.gca().set_ylabel("Cell Proportions") - xlabel = "Tissue Location" if metric == 'Localization' else "Timepoint" - plt.gca().set_xlabel(xlabel) - plt.xticks(rotation=30) - plt.title(f"Cell Type Composition by {xlabel}") - handles, labels = plt.gca().get_legend_handles_labels() - 
plt.legend(handles[::-1], labels[::-1], - bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0) - plt.tight_layout() - plot_name = "cell_props_by_tissue_loc.png" if metric == 'Localization' else "cell_props_by_timepoint.png" - plt.savefig(os.path.join(save_dir, plot_name), dpi=300) - -## colored cell cluster masks from random subset of 20 FOVs -random.seed(13) -seg_dir = '/Volumes/Shared/Noah Greenwald/TONIC_Cohort/segmentation_data/deepcell_output' - -all_fovs = list(cell_table['fov'].unique()) -fovs = random.sample(all_fovs, 20) -cell_table_subset = cell_table[cell_table.fov.isin(fovs)] - -cohort_cluster_plot( - fovs=fovs, - seg_dir=seg_dir, - save_dir=save_dir, - cell_data=cell_table_subset, - erode=True, - fov_col='fov', - label_col='label', - cluster_col='cell_cluster_broad', - seg_suffix="_whole_cell.tiff", - cmap=color_map, - display_fig=False, -) - - -# Cell identification and classification -cell_table = pd.read_csv('/Volumes/Shared/Noah Greenwald/TONIC_Cohort/analysis_files/cell_table_clusters.csv') -cluster_order = {'Cancer': 0, 'Cancer_EMT': 1, 'Cancer_Other': 2, 'CD4T': 3, 'CD8T': 4, 'Treg': 5, - 'T_Other': 6, 'B': 7, 'NK': 8, 'M1_Mac': 9, 'M2_Mac': 10, 'Mac_Other': 11, - 'Monocyte': 12, 'APC': 13, 'Mast': 14, 'Neutrophil': 15, 'Fibroblast': 16, - 'Stroma': 17, 'Endothelium': 18, 'Other': 19, 'Immune_Other': 20} -cell_table = cell_table.sort_values(by=['cell_cluster'], key=lambda x: x.map(cluster_order)) - -save_dir = '/Volumes/Shared/Noah Greenwald/TONIC_Cohort/supplementary_figs' - -## cell cluster counts -sns.histplot(data=cell_table, x="cell_cluster") -sns.despine() -plt.title("Cell Cluster Counts") -plt.xlabel("Cell Cluster") -plt.xticks(rotation=75) -plt.tight_layout() -plt.savefig(os.path.join(save_dir, "cells_per_cluster.png"), dpi=300) - -## fov cell counts -cluster_counts = np.unique(cell_table.fov, return_counts=True)[1] -plt.figure(figsize=(8, 6)) -g = sns.histplot(data=cluster_counts, kde=True) -sns.despine() -plt.title("Histogram of Cell Counts per Image") -plt.xlabel("Number of Cells in an Image") -plt.tight_layout() -plt.savefig(os.path.join(save_dir, "cells_per_fov.png"), dpi=300) - -## cell type composition by tissue location of met -meta_data = pd.read_csv('/Volumes/Shared/Noah Greenwald/TONIC_Cohort/analysis_files/harmonized_metadata.csv') -meta_data = meta_data[['fov', 'Patient_ID', 'Timepoint', 'Localization']] - -all_data = cell_table.merge(meta_data, on=['fov'], how='left') -base_data = all_data[all_data.Timepoint == 'baseline'] - -all_locals = np.unique(base_data.Localization) -dfs = [] -for region in all_locals: - localization_data = base_data[base_data.Localization == region] - - df = localization_data.groupby("cell_cluster_broad").count().reset_index() - df = df.set_index('cell_cluster_broad').transpose() - sub_df = df.iloc[:1].reset_index(drop=True) - sub_df.insert(0, "Localization", [region]) - sub_df['Localization'] = sub_df['Localization'].map(str) - sub_df = sub_df.set_index('Localization') - - dfs.append(sub_df) -prop_data = pd.concat(dfs).transform(func=lambda row: row / row.sum(), axis=1) - -color_map = {'cell_cluster_broad': ['Cancer', 'Stroma', 'Mono_Mac', 'T','Other', 'Granulocyte', 'NK', 'B'], - 'color': ['dimgrey', 'darksalmon', 'red', 'navajowhite', 'yellowgreen', 'aqua', 'dodgerblue', 'darkviolet']} -prop_data = prop_data[color_map['cell_cluster_broad']] - -colors = color_map['color'] -prop_data.plot(kind='bar', stacked=True, color=colors) -plt.ticklabel_format(style='plain', useOffset=False, axis='y') -plt.gca().set_ylabel("Cell 
Proportions") -plt.gca().set_xlabel("Tissue Location") -plt.xticks(rotation=30) -plt.title("Cell Type Composition by Tissue Location") -plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.) -plt.tight_layout() -plt.savefig(os.path.join(save_dir, "cell_props_by_tissue_loc.png"), dpi=300) - -## colored cell cluster masks from random subset of 20 FOVs -random.seed(13) -seg_dir = '/Volumes/Shared/Noah Greenwald/TONIC_Cohort/segmentation_data/deepcell_output' - -all_fovs = list(cell_table['fov'].unique()) -fovs = random.sample(all_fovs, 20) -cell_table_subset = cell_table[cell_table.fov.isin(fovs)] - -cohort_cluster_plot( - fovs=fovs, - seg_dir=seg_dir, - save_dir=save_dir, - cell_data=cell_table_subset, - erode=True, - fov_col='fov', - label_col='label', - cluster_col='cell_cluster_broad', - seg_suffix="_whole_cell.tiff", - cmap=color_map, - display_fig=False, -) - -# Functional marker thresholding -cell_table = pd.read_csv( - os.path.join(ANALYSIS_DIR, "combined_cell_table_normalized_cell_labels_updated.csv") -) -functional_marker_viz_dir = os.path.join(SUPPLEMENTARY_FIG_DIR, "functional_marker_dist_thresholds") -if not os.path.exists(functional_marker_viz_dir): - os.makedirs(functional_marker_viz_dir) - -marker_info = { - "Ki67": { - "populations": ["Cancer", "Mast"], - "threshold": 0.002, - "x_range": (0, 0.012), - "x_ticks": np.array([0, 0.004, 0.008, 0.012]), - "x_tick_labels": np.array([0, 0.004, 0.008, 0.012]), - }, - "CD38": { - "populations": ["Endothelium", "Cancer_EMT"], - "threshold": 0.004, - "x_range": (0, 0.02), - "x_ticks": np.array([0, 0.005, 0.01, 0.015, 0.02]), - "x_tick_labels": np.array([0, 0.005, 0.01, 0.015, 0.02]), - }, - "CD45RB": { - "populations": ["CD4T", "Stroma"], - "threshold": 0.001, - "x_range": (0, 0.015), - "x_ticks": np.array([0, 0.005, 0.010, 0.015]), - "x_tick_labels": np.array([0, 0.005, 0.010, 0.015]) - }, - "CD45RO": { - "populations": ["CD4T", "Fibroblast"], - "threshold": 0.002, - "x_range": (0, 0.02), - "x_ticks": np.array([0, 0.005, 0.01, 0.015, 0.02]), - "x_tick_labels": np.array([0, 0.005, 0.01, 0.015, 0.02]) - }, - "CD57": { - "populations": ["CD8T", "B"], - "threshold": 0.002, - "x_range": (0, 0.006), - "x_ticks": np.array([0, 0.002, 0.004, 0.006]), - "x_tick_labels": np.array([0, 0.002, 0.004, 0.006]) - }, - "CD69": { - "populations": ["Treg", "Cancer"], - "threshold": 0.002, - "x_range": (0, 0.008), - "x_ticks": np.array([0, 0.002, 0.004, 0.006, 0.008]), - "x_tick_labels": np.array([0, 0.002, 0.004, 0.006, 0.008]) - }, - "GLUT1": { - "populations": ["Cancer_EMT", "M2_Mac"], - "threshold": 0.002, - "x_range": (0, 0.02), - "x_ticks": np.array([0, 0.005, 0.01, 0.015, 0.02]), - "x_tick_labels": np.array([0, 0.005, 0.01, 0.015, 0.02]) - }, - "IDO": { - "populations": ["APC", "M1_Mac"], - "threshold": 0.001, - "x_range": (0, 0.003), - "x_ticks": np.array([0, 0.001, 0.002, 0.003]), - "x_tick_labels": np.array([0, 0.001, 0.002, 0.003]) - }, - "PD1": { - "populations": ["CD8T", "Stroma"], - "threshold": 0.0005, - "x_range": (0, 0.002), - "x_ticks": np.array([0, 0.0005, 0.001, 0.0015, 0.002]), - "x_tick_labels": np.array([0, 0.0005, 0.001, 0.0015, 0.002]) - }, - "PDL1": { - "populations": ["Cancer", "Stroma"], - "threshold": 0.001, - "x_range": (0, 0.003), - "x_ticks": np.array([0, 0.001, 0.002, 0.003]), - "x_tick_labels": np.array([0, 0.001, 0.002, 0.003]), - }, - "HLA1": { - "populations": ["APC", "Stroma"], - "threshold": 0.001, - "x_range": (0, 0.025), - "x_ticks": np.array([0, 0.0125, 0.025]), - "x_tick_labels": np.array([0, 0.0125, 0.025]) 
- }, - "HLADR": { - "populations": ["APC", "Neutrophil"], - "threshold": 0.001, - "x_range": (0, 0.025), - "x_ticks": np.array([0, 0.0125, 0.025]), - "x_tick_labels": np.array([0, 0.0125, 0.025]) - }, - "TBET": { - "populations": ["NK", "B"], - "threshold": 0.0015, - "x_range": (0, 0.0045), - "x_ticks": np.array([0, 0.0015, 0.003, 0.0045]), - "x_tick_labels": np.array([0, 0.0015, 0.003, 0.0045]) - }, - "TCF1": { - "populations": ["CD4T", "M1_Mac"], - "threshold": 0.001, - "x_range": (0, 0.003), - "x_ticks": np.array([0, 0.001, 0.002, 0.003]), - "x_tick_labels": np.array([0, 0.001, 0.002, 0.003]) - }, - "TIM3": { - "populations": ["Monocyte", "Endothelium"], - "threshold": 0.001, - "x_range": (0, 0.004), - "x_ticks": np.array([0, 0.001, 0.002, 0.003, 0.004]), - "x_tick_labels": np.array([0, 0.001, 0.002, 0.003, 0.004]) - }, - "Vim": { - "populations": ["Endothelium", "B"], - "threshold": 0.002, - "x_range": (0, 0.06), - "x_ticks": np.array([0, 0.02, 0.04, 0.06]), - "x_tick_labels": np.array([0, 0.02, 0.04, 0.06]) - }, - "Fe": { - "populations": ["Fibroblast", "Cancer"], - "threshold": 0.1, - "x_range": (0, 0.3), - "x_ticks": np.array([0, 0.1, 0.2, 0.3]), - "x_tick_labels": np.array([0, 0.1, 0.2, 0.3]), - } -} -supplementary_plot_helpers.functional_marker_thresholding( - cell_table, functional_marker_viz_dir, marker_info=marker_info, - figsize=(20, 40) -) - - -# Feature extraction +# # Panel validation +# panel_validation_viz_dir = os.path.join(SUPPLEMENTARY_FIG_DIR, "panel_validation") +# if not os.path.exists(panel_validation_viz_dir): +# os.makedirs(panel_validation_viz_dir) + +# controls_dir = "/Volumes/Shared/Noah Greenwald/TONIC_Cohort/image_data/controls" +# controls_fov = "TONIC_TMA1_colon_bottom" +# supplementary_plot_helpers.validate_panel( +# controls_dir, controls_fov, panel_validation_viz_dir, font_size=180 +# ) + +# samples_dir = "/Volumes/Shared/Noah Greenwald/TONIC_Cohort/image_data/samples" +# samples_fov = "TONIC_TMA2_R5C4" +# samples_channels = sorted(io_utils.remove_file_extensions( +# io_utils.list_files(os.path.join(samples_dir, samples_fov), substrs=".tiff") +# )) +# exclude_chans = ["CD11c_nuc_exclude", "CK17_smoothed", "ECAD_smoothed", "FOXP3_nuc_include", +# "chan_39", "chan_45", "chan_48", "chan_115", "chan_141"] +# for ec in exclude_chans: +# if ec in samples_channels: +# samples_channels.remove(ec) +# supplementary_plot_helpers.validate_panel( +# samples_dir, samples_fov, panel_validation_viz_dir, channels=samples_channels, font_size=320 +# ) + + +# # ROI selection + + +# # QC +# qc_metrics = ["Non-zero mean intensity"] +# channel_exclude = ["chan_39", "chan_45", "CD11c_nuc_exclude", "CD11c_nuc_exclude_update", +# "FOXP3_nuc_include", "FOXP3_nuc_include_update", "CK17_smoothed", +# "FOXP3_nuc_exclude_update", "chan_48", "chan_141", "chan_115", "LAG3"] + +# ## FOV spatial location +# cohort_path = "/Volumes/Shared/Noah Greenwald/TONIC_Cohort/image_data/samples" +# qc_tma_metrics_dir = "/Volumes/Shared/Noah Greenwald/TONIC_Acquisition/qc_metrics/qc_tma_metrics" +# if not os.path.exists(qc_tma_metrics_dir): +# os.makedirs(qc_tma_metrics_dir) + +# fovs = io_utils.list_folders(cohort_path) +# tmas = list(set([fov.split('_R')[0] for fov in fovs])) + +# qc_tmas = qc_comp.QCTMA( +# qc_metrics=qc_metrics, +# cohort_path=cohort_path, +# metrics_dir=qc_tma_metrics_dir, +# ) + +# qc_tmas.compute_qc_tma_metrics(tmas=tmas) +# qc_tmas.qc_tma_metrics_zscore(tmas=tmas, channel_exclude=channel_exclude) +# qc_metrics_plots.qc_tmas_metrics_plot(qc_tmas=qc_tmas, tmas=tmas, 
save_figure=True, dpi=300) + +# ## longitudinal controls +# control_path = "/Volumes/Shared/Noah Greenwald/TONIC_Cohort/image_data/controls" +# qc_control_metrics_dir = "/Volumes/Shared/Noah Greenwald/TONIC_Acquisition/qc_metrics/qc_longitudinal_control" +# if not os.path.exists(qc_control_metrics_dir): +# os.makedirs(qc_control_metrics_dir) + +# folders = io_utils.list_folders(control_path, "TMA3_") +# control_substrs = [name.split("_")[2] + '_' + name.split("_")[3] if len(name.split("_")) == 4 +# else name.split("_")[2] + '_' + name.split("_")[3]+'_' + name.split("_")[4] +# for name in folders] + +# all_folders = io_utils.list_folders(control_path) +# for i, control in enumerate(control_substrs): +# control_sample_name = control +# print(control) +# if control == 'tonsil_bottom': +# fovs = [folder for folder in all_folders if control in folder and len(folder) <= 25] +# else: +# fovs = [folder for folder in all_folders if control in folder] + +# qc_control = qc_comp.QCControlMetrics( +# qc_metrics=qc_metrics, +# cohort_path=control_path, +# metrics_dir=qc_control_metrics_dir, +# ) + +# qc_control.compute_control_qc_metrics( +# control_sample_name=control_sample_name, +# fovs=fovs, +# channel_exclude=channel_exclude, +# channel_include=None, +# ) + +# qc_metrics_plots.longitudinal_control_heatmap( +# qc_control=qc_control, control_sample_name=control_sample_name, save_figure=True, dpi=300 +# ) + +# dfs = [] +# for control in control_substrs: +# df = pd.read_csv(os.path.join(qc_control_metrics_dir, f"{control}_combined_nonzero_mean_stats.csv")) +# df['fov'] = [i.replace('_' + control, '') for i in list(df['fov'])] +# log2_norm_df: pd.DataFrame = df.pivot( +# index="channel", columns="fov", values="Non-zero mean intensity" +# ).transform(func=lambda row: np.log2(row / row.mean()), axis=1) +# if control != 'tonsil_bottom_duplicate1': +# dup_col = [col for col in log2_norm_df.columns if 'duplicate1' in col] +# log2_norm_df = log2_norm_df.drop(columns=dup_col) if dup_col else log2_norm_df + +# mean_t_df: pd.DataFrame = ( +# log2_norm_df.mean(axis=0) +# .to_frame(name="mean") +# .transpose() +# .sort_values(by="mean", axis=1) +# ) +# transformed_df: pd.DataFrame = pd.concat( +# objs=[log2_norm_df, mean_t_df] +# ).sort_values(by="mean", axis=1, inplace=False) +# transformed_df.rename_axis("channel", axis=0, inplace=True) +# transformed_df.rename_axis("fov", axis=1, inplace=True) + +# dfs.append(transformed_df) +# all_data = pd.concat(dfs).replace([np.inf, -np.inf], 0, inplace=True) +# all_data = all_data.groupby(['channel']).mean() +# all_data = all_data.sort_values(by="mean", axis=1, inplace=False).round(2) + + +# fig = plt.figure(figsize=(12,12), dpi=300) +# fig.set_layout_engine(layout="constrained") +# gs = gridspec.GridSpec(nrows=2, ncols=1, figure=fig, height_ratios=[len(all_data.index) - 1, 1]) +# _norm = Normalize(vmin=-1, vmax=1) +# _cmap = sns.color_palette("vlag", as_cmap=True) +# fig.suptitle(f"Average per TMA - QC: Non-zero Mean Intensity ") + +# annotation_kws = { +# "horizontalalignment": "center", +# "verticalalignment": "center", +# "fontsize": 8, +# } + +# ax_heatmap = fig.add_subplot(gs[0, 0]) +# sns.heatmap( +# data=all_data[~all_data.index.isin(["mean"])], +# ax=ax_heatmap, +# linewidths=1, +# linecolor="black", +# cbar_kws={"shrink": 0.5}, +# annot=True, +# annot_kws=annotation_kws, +# xticklabels=False, +# norm=_norm, +# cmap=_cmap, +# ) + +# ax_heatmap.collections[0].colorbar.ax.set_title(r"$\log_2(QC)$") +# ax_heatmap.set_yticks( +# ticks=ax_heatmap.get_yticks(), +# 
labels=ax_heatmap.get_yticklabels(), +# rotation=0, +# ) +# ax_heatmap.set_xlabel(None) + +# ax_avg = fig.add_subplot(gs[1, 0]) +# sns.heatmap( +# data=all_data[all_data.index.isin(["mean"])], +# ax=ax_avg, +# linewidths=1, +# linecolor="black", +# annot=True, +# annot_kws=annotation_kws, +# fmt=".2f", +# cmap=ListedColormap(["white"]), +# cbar=False, +# ) +# ax_avg.set_yticks( +# ticks=ax_avg.get_yticks(), +# labels=["Mean"], +# rotation=0, +# ) +# ax_avg.set_xticks( +# ticks=ax_avg.get_xticks(), +# labels=ax_avg.get_xticklabels(), +# rotation=45, +# ha="right", +# rotation_mode="anchor", +# ) +# ax_heatmap.set_ylabel("Channel") +# ax_avg.set_xlabel("FOV") + +# fig.savefig(fname=os.path.join(qc_control_metrics_dir, "figures/log2_avgs.png"), dpi=300, +# bbox_inches="tight") + + +# # Image processing +# ## show a run with images stitched in acquisition order pre- and post-normalization +# acquisition_order_viz_dir = os.path.join(SUPPLEMENTARY_FIG_DIR, "acquisition_order") +# if not os.path.exists(acquisition_order_viz_dir): +# os.makedirs(acquisition_order_viz_dir) + +# run_name = "2022-01-14_TONIC_TMA2_run1" +# pre_norm_dir = "/Volumes/Shared/Noah Greenwald/TONIC_Acquisition/rosetta" +# post_norm_dir = "/Volumes/Shared/Noah Greenwald/TONIC_Acquisition/normalized" +# save_dir = "/Volumes/Shared/Noah Greenwald/TONIC_Cohort/supplementary_figs" + +# # NOTE: image not scaled up, this happens in Photoshop +# supplementary_plot_helpers.stitch_before_after_norm( +# pre_norm_dir, post_norm_dir, acquisition_order_viz_dir, run_name, +# "H3K9ac", pre_norm_subdir="normalized", padding=0, step=1 +# ) + + +# # Cell identification and classification +# cell_table = pd.read_csv('/Volumes/Shared/Noah Greenwald/TONIC_Cohort/analysis_files/cell_table_clusters.csv') +# cluster_order = {'Cancer': 0, 'Cancer_EMT': 1, 'Cancer_Other': 2, 'CD4T': 3, 'CD8T': 4, 'Treg': 5, +# 'T_Other': 6, 'B': 7, 'NK': 8, 'M1_Mac': 9, 'M2_Mac': 10, 'Mac_Other': 11, +# 'Monocyte': 12, 'APC': 13, 'Mast': 14, 'Neutrophil': 15, 'Fibroblast': 16, +# 'Stroma': 17, 'Endothelium': 18, 'Other': 19, 'Immune_Other': 20} +# cell_table = cell_table.sort_values(by=['cell_cluster'], key=lambda x: x.map(cluster_order)) + +# save_dir = '/Volumes/Shared/Noah Greenwald/TONIC_Cohort/supplementary_figs' + +# ## cell cluster counts +# sns.histplot(data=cell_table, x="cell_cluster") +# sns.despine() +# plt.title("Cell Cluster Counts") +# plt.xlabel("Cell Cluster") +# plt.xticks(rotation=75) +# plt.tight_layout() +# plt.savefig(os.path.join(save_dir, "cells_per_cluster.png"), dpi=300) + +# ## fov cell counts +# cluster_counts = np.unique(cell_table.fov, return_counts=True)[1] +# plt.figure(figsize=(8, 6)) +# g = sns.histplot(data=cluster_counts, kde=True) +# sns.despine() +# plt.title("Histogram of Cell Counts per Image") +# plt.xlabel("Number of Cells in an Image") +# plt.tight_layout() +# plt.savefig(os.path.join(save_dir, "cells_per_fov.png"), dpi=300) + +# ## cell type composition by tissue location of met and timepoint +# meta_data = pd.read_csv('/Volumes/Shared/Noah Greenwald/TONIC_Cohort/analysis_files/harmonized_metadata.csv') +# meta_data = meta_data[['fov', 'Patient_ID', 'Timepoint', 'Localization']] + +# all_data = cell_table.merge(meta_data, on=['fov'], how='left') + +# for metric in ['Localization', 'Timepoint']: +# data = all_data[all_data.Timepoint == 'baseline'] if metric == 'Localization' else all_data + +# groups = np.unique(data.Localization) if metric == 'Localization' else \ +# ['primary', 'baseline', 'post_induction', 'on_nivo'] +# 
dfs = [] +# for group in groups: +# sub_data = data[data[metric] == group] + +# df = sub_data.groupby("cell_cluster_broad").count().reset_index() +# df = df.set_index('cell_cluster_broad').transpose() +# sub_df = df.iloc[:1].reset_index(drop=True) +# sub_df.insert(0, metric, [group]) +# sub_df[metric] = sub_df[metric].map(str) +# sub_df = sub_df.set_index(metric) + +# dfs.append(sub_df) +# prop_data = pd.concat(dfs).transform(func=lambda row: row / row.sum(), axis=1) + +# color_map = {'Cancer': 'dimgrey', 'Stroma': 'darksalmon', 'T': 'navajowhite', +# 'Mono_Mac': 'red', 'B': 'darkviolet', 'Other': 'yellowgreen', +# 'Granulocyte': 'aqua', 'NK': 'dodgerblue'} + +# means = prop_data.mean(axis=0).reset_index() +# means = means.sort_values(by=[0], ascending=False) +# prop_data = prop_data[means.cell_cluster_broad] + +# colors = [color_map[cluster] for cluster in means.cell_cluster_broad] +# prop_data.plot(kind='bar', stacked=True, color=colors) +# sns.despine() +# plt.ticklabel_format(style='plain', useOffset=False, axis='y') +# plt.gca().set_ylabel("Cell Proportions") +# xlabel = "Tissue Location" if metric == 'Localization' else "Timepoint" +# plt.gca().set_xlabel(xlabel) +# plt.xticks(rotation=30) +# plt.title(f"Cell Type Composition by {xlabel}") +# handles, labels = plt.gca().get_legend_handles_labels() +# plt.legend(handles[::-1], labels[::-1], +# bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0) +# plt.tight_layout() +# plot_name = "cell_props_by_tissue_loc.png" if metric == 'Localization' else "cell_props_by_timepoint.png" +# plt.savefig(os.path.join(save_dir, plot_name), dpi=300) + +# ## colored cell cluster masks from random subset of 20 FOVs +# random.seed(13) +# seg_dir = '/Volumes/Shared/Noah Greenwald/TONIC_Cohort/segmentation_data/deepcell_output' + +# all_fovs = list(cell_table['fov'].unique()) +# fovs = random.sample(all_fovs, 20) +# cell_table_subset = cell_table[cell_table.fov.isin(fovs)] + +# cohort_cluster_plot( +# fovs=fovs, +# seg_dir=seg_dir, +# save_dir=save_dir, +# cell_data=cell_table_subset, +# erode=True, +# fov_col='fov', +# label_col='label', +# cluster_col='cell_cluster_broad', +# seg_suffix="_whole_cell.tiff", +# cmap=color_map, +# display_fig=False, +# ) + + +# # Cell identification and classification +# cell_table = pd.read_csv('/Volumes/Shared/Noah Greenwald/TONIC_Cohort/analysis_files/cell_table_clusters.csv') +# cluster_order = {'Cancer': 0, 'Cancer_EMT': 1, 'Cancer_Other': 2, 'CD4T': 3, 'CD8T': 4, 'Treg': 5, +# 'T_Other': 6, 'B': 7, 'NK': 8, 'M1_Mac': 9, 'M2_Mac': 10, 'Mac_Other': 11, +# 'Monocyte': 12, 'APC': 13, 'Mast': 14, 'Neutrophil': 15, 'Fibroblast': 16, +# 'Stroma': 17, 'Endothelium': 18, 'Other': 19, 'Immune_Other': 20} +# cell_table = cell_table.sort_values(by=['cell_cluster'], key=lambda x: x.map(cluster_order)) + +# save_dir = '/Volumes/Shared/Noah Greenwald/TONIC_Cohort/supplementary_figs' + +# ## cell cluster counts +# sns.histplot(data=cell_table, x="cell_cluster") +# sns.despine() +# plt.title("Cell Cluster Counts") +# plt.xlabel("Cell Cluster") +# plt.xticks(rotation=75) +# plt.tight_layout() +# plt.savefig(os.path.join(save_dir, "cells_per_cluster.png"), dpi=300) + +# ## fov cell counts +# cluster_counts = np.unique(cell_table.fov, return_counts=True)[1] +# plt.figure(figsize=(8, 6)) +# g = sns.histplot(data=cluster_counts, kde=True) +# sns.despine() +# plt.title("Histogram of Cell Counts per Image") +# plt.xlabel("Number of Cells in an Image") +# plt.tight_layout() +# plt.savefig(os.path.join(save_dir, "cells_per_fov.png"), dpi=300) + +# 
## cell type composition by tissue location of met +# meta_data = pd.read_csv('/Volumes/Shared/Noah Greenwald/TONIC_Cohort/analysis_files/harmonized_metadata.csv') +# meta_data = meta_data[['fov', 'Patient_ID', 'Timepoint', 'Localization']] + +# all_data = cell_table.merge(meta_data, on=['fov'], how='left') +# base_data = all_data[all_data.Timepoint == 'baseline'] + +# all_locals = np.unique(base_data.Localization) +# dfs = [] +# for region in all_locals: +# localization_data = base_data[base_data.Localization == region] + +# df = localization_data.groupby("cell_cluster_broad").count().reset_index() +# df = df.set_index('cell_cluster_broad').transpose() +# sub_df = df.iloc[:1].reset_index(drop=True) +# sub_df.insert(0, "Localization", [region]) +# sub_df['Localization'] = sub_df['Localization'].map(str) +# sub_df = sub_df.set_index('Localization') + +# dfs.append(sub_df) +# prop_data = pd.concat(dfs).transform(func=lambda row: row / row.sum(), axis=1) + +# color_map = {'cell_cluster_broad': ['Cancer', 'Stroma', 'Mono_Mac', 'T','Other', 'Granulocyte', 'NK', 'B'], +# 'color': ['dimgrey', 'darksalmon', 'red', 'navajowhite', 'yellowgreen', 'aqua', 'dodgerblue', 'darkviolet']} +# prop_data = prop_data[color_map['cell_cluster_broad']] + +# colors = color_map['color'] +# prop_data.plot(kind='bar', stacked=True, color=colors) +# plt.ticklabel_format(style='plain', useOffset=False, axis='y') +# plt.gca().set_ylabel("Cell Proportions") +# plt.gca().set_xlabel("Tissue Location") +# plt.xticks(rotation=30) +# plt.title("Cell Type Composition by Tissue Location") +# plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.) +# plt.tight_layout() +# plt.savefig(os.path.join(save_dir, "cell_props_by_tissue_loc.png"), dpi=300) + +# ## colored cell cluster masks from random subset of 20 FOVs +# random.seed(13) +# seg_dir = '/Volumes/Shared/Noah Greenwald/TONIC_Cohort/segmentation_data/deepcell_output' + +# all_fovs = list(cell_table['fov'].unique()) +# fovs = random.sample(all_fovs, 20) +# cell_table_subset = cell_table[cell_table.fov.isin(fovs)] + +# cohort_cluster_plot( +# fovs=fovs, +# seg_dir=seg_dir, +# save_dir=save_dir, +# cell_data=cell_table_subset, +# erode=True, +# fov_col='fov', +# label_col='label', +# cluster_col='cell_cluster_broad', +# seg_suffix="_whole_cell.tiff", +# cmap=color_map, +# display_fig=False, +# ) + +# # Functional marker thresholding +# cell_table = pd.read_csv( +# os.path.join(ANALYSIS_DIR, "combined_cell_table_normalized_cell_labels_updated.csv") +# ) +# functional_marker_viz_dir = os.path.join(SUPPLEMENTARY_FIG_DIR, "functional_marker_dist_thresholds") +# if not os.path.exists(functional_marker_viz_dir): +# os.makedirs(functional_marker_viz_dir) + +# marker_info = { +# "Ki67": { +# "populations": ["Cancer", "Mast"], +# "threshold": 0.002, +# "x_range": (0, 0.012), +# "x_ticks": np.array([0, 0.004, 0.008, 0.012]), +# "x_tick_labels": np.array([0, 0.004, 0.008, 0.012]), +# }, +# "CD38": { +# "populations": ["Endothelium", "Cancer_EMT"], +# "threshold": 0.004, +# "x_range": (0, 0.02), +# "x_ticks": np.array([0, 0.005, 0.01, 0.015, 0.02]), +# "x_tick_labels": np.array([0, 0.005, 0.01, 0.015, 0.02]), +# }, +# "CD45RB": { +# "populations": ["CD4T", "Stroma"], +# "threshold": 0.001, +# "x_range": (0, 0.015), +# "x_ticks": np.array([0, 0.005, 0.010, 0.015]), +# "x_tick_labels": np.array([0, 0.005, 0.010, 0.015]) +# }, +# "CD45RO": { +# "populations": ["CD4T", "Fibroblast"], +# "threshold": 0.002, +# "x_range": (0, 0.02), +# "x_ticks": np.array([0, 0.005, 0.01, 0.015, 0.02]), +# 
"x_tick_labels": np.array([0, 0.005, 0.01, 0.015, 0.02]) +# }, +# "CD57": { +# "populations": ["CD8T", "B"], +# "threshold": 0.002, +# "x_range": (0, 0.006), +# "x_ticks": np.array([0, 0.002, 0.004, 0.006]), +# "x_tick_labels": np.array([0, 0.002, 0.004, 0.006]) +# }, +# "CD69": { +# "populations": ["Treg", "Cancer"], +# "threshold": 0.002, +# "x_range": (0, 0.008), +# "x_ticks": np.array([0, 0.002, 0.004, 0.006, 0.008]), +# "x_tick_labels": np.array([0, 0.002, 0.004, 0.006, 0.008]) +# }, +# "GLUT1": { +# "populations": ["Cancer_EMT", "M2_Mac"], +# "threshold": 0.002, +# "x_range": (0, 0.02), +# "x_ticks": np.array([0, 0.005, 0.01, 0.015, 0.02]), +# "x_tick_labels": np.array([0, 0.005, 0.01, 0.015, 0.02]) +# }, +# "IDO": { +# "populations": ["APC", "M1_Mac"], +# "threshold": 0.001, +# "x_range": (0, 0.003), +# "x_ticks": np.array([0, 0.001, 0.002, 0.003]), +# "x_tick_labels": np.array([0, 0.001, 0.002, 0.003]) +# }, +# "PD1": { +# "populations": ["CD8T", "Stroma"], +# "threshold": 0.0005, +# "x_range": (0, 0.002), +# "x_ticks": np.array([0, 0.0005, 0.001, 0.0015, 0.002]), +# "x_tick_labels": np.array([0, 0.0005, 0.001, 0.0015, 0.002]) +# }, +# "PDL1": { +# "populations": ["Cancer", "Stroma"], +# "threshold": 0.001, +# "x_range": (0, 0.003), +# "x_ticks": np.array([0, 0.001, 0.002, 0.003]), +# "x_tick_labels": np.array([0, 0.001, 0.002, 0.003]), +# }, +# "HLA1": { +# "populations": ["APC", "Stroma"], +# "threshold": 0.001, +# "x_range": (0, 0.025), +# "x_ticks": np.array([0, 0.0125, 0.025]), +# "x_tick_labels": np.array([0, 0.0125, 0.025]) +# }, +# "HLADR": { +# "populations": ["APC", "Neutrophil"], +# "threshold": 0.001, +# "x_range": (0, 0.025), +# "x_ticks": np.array([0, 0.0125, 0.025]), +# "x_tick_labels": np.array([0, 0.0125, 0.025]) +# }, +# "TBET": { +# "populations": ["NK", "B"], +# "threshold": 0.0015, +# "x_range": (0, 0.0045), +# "x_ticks": np.array([0, 0.0015, 0.003, 0.0045]), +# "x_tick_labels": np.array([0, 0.0015, 0.003, 0.0045]) +# }, +# "TCF1": { +# "populations": ["CD4T", "M1_Mac"], +# "threshold": 0.001, +# "x_range": (0, 0.003), +# "x_ticks": np.array([0, 0.001, 0.002, 0.003]), +# "x_tick_labels": np.array([0, 0.001, 0.002, 0.003]) +# }, +# "TIM3": { +# "populations": ["Monocyte", "Endothelium"], +# "threshold": 0.001, +# "x_range": (0, 0.004), +# "x_ticks": np.array([0, 0.001, 0.002, 0.003, 0.004]), +# "x_tick_labels": np.array([0, 0.001, 0.002, 0.003, 0.004]) +# }, +# "Vim": { +# "populations": ["Endothelium", "B"], +# "threshold": 0.002, +# "x_range": (0, 0.06), +# "x_ticks": np.array([0, 0.02, 0.04, 0.06]), +# "x_tick_labels": np.array([0, 0.02, 0.04, 0.06]) +# }, +# "Fe": { +# "populations": ["Fibroblast", "Cancer"], +# "threshold": 0.1, +# "x_range": (0, 0.3), +# "x_ticks": np.array([0, 0.1, 0.2, 0.3]), +# "x_tick_labels": np.array([0, 0.1, 0.2, 0.3]), +# } +# } +# supplementary_plot_helpers.functional_marker_thresholding( +# cell_table, functional_marker_viz_dir, marker_info=marker_info, +# figsize=(20, 40) +# ) + + +# # Feature extraction # Occupancy statistics @@ -593,26 +593,35 @@ index=False ) - # massive GridSearch -total_occupancy_stats_df = pd.DataFrame() -for tiles_per_row_col in [4, 8, 16]: - for positive_threshold in [5, 10, 15, 20]: - occupancy_stats = supplementary_plot_helpers.compute_occupancy_statistics( - cell_table, pop_subset=["CD4T", "CD8T", "Treg", "T_Other"], - tiles_per_row_col=tiles_per_row_col, positive_threshold=positive_threshold - ) - occupancy_stats_df = pd.DataFrame( - { - "fov": list(occupancy_stats.keys()), - 
"percent_positive_tiles": list(occupancy_stats.values()) - } - ) - occupancy_stats_df["num_tiles"] = tiles_per_row_col ** 2 - occupancy_stats_df["positive_threshold"] = positive_threshold - - total_occupancy_stats_df = pd.concat([total_occupancy_stats_df, occupancy_stats_df]) - -total_occupancy_stats_df.to_csv( - os.path.join(occupancy_stats_viz_dir, "occupancy_stats_trials_t_cells.csv"), index=False +if os.path.exists(os.path.join(occupancy_stats_viz_dir, "occupancy_stats_trials_t_cells.csv")): + total_occupancy_stats_df = pd.read_csv( + os.path.join(occupancy_stats_viz_dir, "occupancy_stats_trials_t_cells.csv") + ) +else: + total_occupancy_stats_df = pd.DataFrame() + for tiles_per_row_col in [4, 8, 16]: + for positive_threshold in [5, 10, 15, 20]: + occupancy_stats = supplementary_plot_helpers.compute_occupancy_statistics( + cell_table, pop_subset=["CD4T", "CD8T", "Treg", "T_Other"], + tiles_per_row_col=tiles_per_row_col, positive_threshold=positive_threshold + ) + occupancy_stats_df = pd.DataFrame( + { + "fov": list(occupancy_stats.keys()), + "percent_positive_tiles": list(occupancy_stats.values()) + } + ) + occupancy_stats_df["num_tiles"] = tiles_per_row_col ** 2 + occupancy_stats_df["positive_threshold"] = positive_threshold + + total_occupancy_stats_df = pd.concat([total_occupancy_stats_df, occupancy_stats_df]) + + total_occupancy_stats_df.to_csv( + os.path.join(occupancy_stats_viz_dir, "occupancy_stats_trials_t_cells.csv"), index=False + ) + +# visualize histograms for each trial in the occupancy stats table +supplementary_plot_helpers.visualize_occupancy_statistics( + total_occupancy_stats_df, occupancy_stats_viz_dir, pop_subset=["T cells"], figsize=(20, 40) ) From e9aaa0c2c208398ed8af7fae63d2ad54145f4415 Mon Sep 17 00:00:00 2001 From: alex-l-kong Date: Thu, 9 May 2024 15:16:34 -0700 Subject: [PATCH 11/14] Update to explicitly pass max_size_image param --- python_files/generate_supplementary_plots.py | 10 ++++++---- python_files/supplementary_plot_helpers.py | 7 +++++-- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/python_files/generate_supplementary_plots.py b/python_files/generate_supplementary_plots.py index fdcf24a..b0e1af3 100644 --- a/python_files/generate_supplementary_plots.py +++ b/python_files/generate_supplementary_plots.py @@ -594,9 +594,9 @@ ) # massive GridSearch -if os.path.exists(os.path.join(occupancy_stats_viz_dir, "occupancy_stats_trials_t_cells.csv")): +if os.path.exists(os.path.join(occupancy_stats_viz_dir, "occupancy_stats_trials_t_cells_tile_size_update.csv")): total_occupancy_stats_df = pd.read_csv( - os.path.join(occupancy_stats_viz_dir, "occupancy_stats_trials_t_cells.csv") + os.path.join(occupancy_stats_viz_dir, "occupancy_stats_trials_t_cells_tile_size_update.csv") ) else: total_occupancy_stats_df = pd.DataFrame() @@ -604,7 +604,9 @@ for positive_threshold in [5, 10, 15, 20]: occupancy_stats = supplementary_plot_helpers.compute_occupancy_statistics( cell_table, pop_subset=["CD4T", "CD8T", "Treg", "T_Other"], - tiles_per_row_col=tiles_per_row_col, positive_threshold=positive_threshold + tiles_per_row_col=tiles_per_row_col, + max_image_size=cell_table["fov_pixel_size"].max(), + positive_threshold=positive_threshold ) occupancy_stats_df = pd.DataFrame( { @@ -618,7 +620,7 @@ total_occupancy_stats_df = pd.concat([total_occupancy_stats_df, occupancy_stats_df]) total_occupancy_stats_df.to_csv( - os.path.join(occupancy_stats_viz_dir, "occupancy_stats_trials_t_cells.csv"), index=False + os.path.join(occupancy_stats_viz_dir, 
"occupancy_stats_trials_t_cells_tile_size_update.csv"), index=False ) # visualize histograms for each trial in the occupancy stats table diff --git a/python_files/supplementary_plot_helpers.py b/python_files/supplementary_plot_helpers.py index d02f78e..f79df4e 100644 --- a/python_files/supplementary_plot_helpers.py +++ b/python_files/supplementary_plot_helpers.py @@ -414,7 +414,7 @@ def stitch_before_after_norm( def compute_occupancy_statistics( cell_table: pd.DataFrame, pop_col: str = "cell_cluster", pop_subset: Optional[List[str]] = None, tiles_per_row_col: int = 8, - positive_threshold: int = 20 + max_image_size: int = 2048, positive_threshold: int = 20 ): """Compute the occupancy statistics over a cohort. @@ -428,6 +428,9 @@ def compute_occupancy_statistics( Which populations, if any, to subset on. If None, use all populations tiles_per_row_col (int): The row/col dims of tiles to define over each image + max_image_size (int): + Maximum size of an image passed in, assuming square images + NOTE: all images should have an image size that is a factor of max_image_size positive_threshold (int): The cell count in a tile required for positivity @@ -449,7 +452,7 @@ def compute_occupancy_statistics( cell_table = cell_table[cell_table[pop_col].isin(pop_subset)].copy() # define the tile size in pixels for each FOV - cell_table["tile_size"] = cell_table["fov_pixel_size"] // tiles_per_row_col + cell_table["tile_size"] = max_image_size // tiles_per_row_col # define the tile for each cell cell_table["tile_row"] = (cell_table["centroid-1"] // cell_table["tile_size"]).astype(int) From 9128e2c5bd07896176302d0efe40e58b234a12b9 Mon Sep 17 00:00:00 2001 From: alex-l-kong Date: Thu, 16 May 2024 10:58:49 -0700 Subject: [PATCH 12/14] Update occupancy stat workflow --- python_files/generate_supplementary_plots.py | 45 ++-- python_files/supplementary_plot_helpers.py | 232 +++++++++++++------ 2 files changed, 182 insertions(+), 95 deletions(-) diff --git a/python_files/generate_supplementary_plots.py b/python_files/generate_supplementary_plots.py index b0e1af3..58d4006 100644 --- a/python_files/generate_supplementary_plots.py +++ b/python_files/generate_supplementary_plots.py @@ -558,11 +558,6 @@ # Occupancy statistics -# TODO: make a constant in supplementary_plot_helpers -cell_table = pd.read_csv( - os.path.join(ANALYSIS_DIR, "combined_cell_table_normalized_cell_labels_updated.csv") -) - occupancy_stats_viz_dir = os.path.join(SUPPLEMENTARY_FIG_DIR, "occupancy_stats") if not os.path.exists(occupancy_stats_viz_dir): os.makedirs(occupancy_stats_viz_dir) @@ -574,6 +569,7 @@ os.path.join(occupancy_stats_viz_dir, "cell_table_with_pixel_size.csv") ) else: + # TODO: make constant earlier in this script cell_table = pd.read_csv( os.path.join(ANALYSIS_DIR, "combined_cell_table_normalized_cell_labels_updated.csv") ) @@ -594,36 +590,41 @@ ) # massive GridSearch -if os.path.exists(os.path.join(occupancy_stats_viz_dir, "occupancy_stats_trials_t_cells_tile_size_update.csv")): +if os.path.exists(os.path.join(occupancy_stats_viz_dir, "occupancy_stats_trials_cell_cluster_broad.csv")): total_occupancy_stats_df = pd.read_csv( - os.path.join(occupancy_stats_viz_dir, "occupancy_stats_trials_t_cells_tile_size_update.csv") + os.path.join(occupancy_stats_viz_dir, "occupancy_stats_trials_cell_cluster_broad.csv") ) else: total_occupancy_stats_df = pd.DataFrame() + total_occupancy_stats_grouped_df = pd.DataFrame() for tiles_per_row_col in [4, 8, 16]: for positive_threshold in [5, 10, 15, 20]: - occupancy_stats = 
From 9128e2c5bd07896176302d0efe40e58b234a12b9 Mon Sep 17 00:00:00 2001
From: alex-l-kong
Date: Thu, 16 May 2024 10:58:49 -0700
Subject: [PATCH 12/14] Update occupancy stat workflow

---
 python_files/generate_supplementary_plots.py |  45 ++--
 python_files/supplementary_plot_helpers.py   | 232 +++++++++++++------
 2 files changed, 182 insertions(+), 95 deletions(-)

diff --git a/python_files/generate_supplementary_plots.py b/python_files/generate_supplementary_plots.py
index b0e1af3..58d4006 100644
--- a/python_files/generate_supplementary_plots.py
+++ b/python_files/generate_supplementary_plots.py
@@ -558,11 +558,6 @@

 # Occupancy statistics
-# TODO: make a constant in supplementary_plot_helpers
-cell_table = pd.read_csv(
-    os.path.join(ANALYSIS_DIR, "combined_cell_table_normalized_cell_labels_updated.csv")
-)
-
 occupancy_stats_viz_dir = os.path.join(SUPPLEMENTARY_FIG_DIR, "occupancy_stats")
 if not os.path.exists(occupancy_stats_viz_dir):
     os.makedirs(occupancy_stats_viz_dir)
@@ -574,6 +569,7 @@
         os.path.join(occupancy_stats_viz_dir, "cell_table_with_pixel_size.csv")
     )
 else:
+    # TODO: make constant earlier in this script
     cell_table = pd.read_csv(
         os.path.join(ANALYSIS_DIR, "combined_cell_table_normalized_cell_labels_updated.csv")
     )
@@ -594,36 +590,41 @@
 )

 # massive GridSearch
-if os.path.exists(os.path.join(occupancy_stats_viz_dir, "occupancy_stats_trials_t_cells_tile_size_update.csv")):
+if os.path.exists(os.path.join(occupancy_stats_viz_dir, "occupancy_stats_trials_cell_cluster_broad.csv")):
     total_occupancy_stats_df = pd.read_csv(
-        os.path.join(occupancy_stats_viz_dir, "occupancy_stats_trials_t_cells_tile_size_update.csv")
+        os.path.join(occupancy_stats_viz_dir, "occupancy_stats_trials_cell_cluster_broad.csv")
     )
 else:
     total_occupancy_stats_df = pd.DataFrame()
+    total_occupancy_stats_grouped_df = pd.DataFrame()
     for tiles_per_row_col in [4, 8, 16]:
         for positive_threshold in [5, 10, 15, 20]:
-            occupancy_stats = supplementary_plot_helpers.compute_occupancy_statistics(
-                cell_table, pop_subset=["CD4T", "CD8T", "Treg", "T_Other"],
+            occupancy_stats_df, occupancy_stats_grouped_df = supplementary_plot_helpers.compute_occupancy_statistics(
+                cell_table,
                 tiles_per_row_col=tiles_per_row_col,
                 max_image_size=cell_table["fov_pixel_size"].max(),
                 positive_threshold=positive_threshold
             )
-            occupancy_stats_df = pd.DataFrame(
-                {
-                    "fov": list(occupancy_stats.keys()),
-                    "percent_positive_tiles": list(occupancy_stats.values())
-                }
-            )
+            # occupancy_stats_df = pd.DataFrame(
+            #     {
+            #         "fov": list(occupancy_stats.keys()),
+            #         "percent_positive_tiles": list(occupancy_stats.values())
+            #     }
+            # )
+            occupancy_stats_df["num_tiles"] = tiles_per_row_col ** 2
+            occupancy_stats_df["positive_threshold"] = positive_threshold
             occupancy_stats_grouped_df["num_tiles"] = tiles_per_row_col ** 2
             occupancy_stats_grouped_df["positive_threshold"] = positive_threshold

             total_occupancy_stats_df = pd.concat([total_occupancy_stats_df, occupancy_stats_df])
+            total_occupancy_stats_grouped_df = pd.concat([total_occupancy_stats_grouped_df, occupancy_stats_grouped_df])

     total_occupancy_stats_df.to_csv(
-        os.path.join(occupancy_stats_viz_dir, "occupancy_stats_trials_t_cells_tile_size_update.csv"), index=False
+        os.path.join(occupancy_stats_viz_dir, "occupancy_stats_trials_cell_cluster_broad.csv"), index=False
+    )
+    total_occupancy_stats_grouped_df.to_csv(
+        os.path.join(occupancy_stats_viz_dir, "occupancy_stats_trials_cell_cluster_broad_grouped.csv"), index=False
     )

-# visualize histograms for each trial in the occupancy stats table
-supplementary_plot_helpers.visualize_occupancy_statistics(
-    total_occupancy_stats_df, occupancy_stats_viz_dir, pop_subset=["T cells"], figsize=(20, 40)
-)
+# # visualize histograms for each trial in the occupancy stats table
+# supplementary_plot_helpers.visualize_occupancy_statistics(
+#     total_occupancy_stats_df, occupancy_stats_viz_dir, figsize=(20, 40)
+# )

diff --git a/python_files/supplementary_plot_helpers.py b/python_files/supplementary_plot_helpers.py
index f79df4e..5426185 100644
--- a/python_files/supplementary_plot_helpers.py
+++ b/python_files/supplementary_plot_helpers.py
@@ -410,9 +410,35 @@ def stitch_before_after_norm(
     post_norm_tiled.save(post_norm_stitched_path)


+def occupancy_division_factor(occupancy_group: pd.DataFrame,
+                              tiles_per_row_col: int,
+                              max_image_size: int,
+                              cell_table: pd.DataFrame):
+    """Compute the denominator to ensure the proper percentage is computed for occupancy stats
+
+    Args:
+        occupancy_group (pd.DataFrame):
+            The per-group data for one FOV and cell type, as handed over by a groupby apply
+        tiles_per_row_col (int):
+            The row/col dims of tiles to define over each image
+        max_image_size (int):
+            Maximum size of an image passed in, assuming square images
+            NOTE: all images should have an image size that is a factor of max_image_size
+        cell_table (pd.DataFrame):
+            The cell table associated with the cohort
+
+    Returns:
+        float:
+            The factor to divide the occupancy stat positive counts by
+    """
+    fov_name = occupancy_group["fov"].values[0]
+    image_size = cell_table.loc[cell_table["fov"] == fov_name, "fov_pixel_size"].values[0]
+    # the number of tiles a FOV actually contains scales with the squared
+    # (area) ratio of its size to max_image_size, not the linear ratio
+    return (tiles_per_row_col * image_size / max_image_size) ** 2
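A quick numeric check of the factor above, with illustrative sizes rather than cohort values: a 1024 px FOV tiled at 2048 // 8 = 256 px contains 4 x 4 = 16 tiles, which matches the squared linear ratio; an unsquared ratio would wrongly give 32.

max_image_size, tiles_per_row_col, image_size = 2048, 8, 1024
tile_size = max_image_size // tiles_per_row_col
assert (image_size // tile_size) ** 2 == (tiles_per_row_col * image_size / max_image_size) ** 2 == 16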


 # occupancy statistic helpers
 def compute_occupancy_statistics(
-    cell_table: pd.DataFrame, pop_col: str = "cell_cluster",
+    cell_table: pd.DataFrame, pop_col: str = "cell_cluster_broad",
     pop_subset: Optional[List[str]] = None, tiles_per_row_col: int = 8,
     max_image_size: int = 2048, positive_threshold: int = 20
 ):
@@ -448,8 +474,9 @@ def compute_occupancy_statistics(
             specified_populations=pop_subset, valid_populations=cell_table[pop_col].unique()
         )
-
-        cell_table = cell_table[cell_table[pop_col].isin(pop_subset)].copy()
+    # otherwise, use all possible subsets as defined by pop_col
+    else:
+        pop_subset = cell_table[pop_col].unique()

     # define the tile size in pixels for each FOV
     cell_table["tile_size"] = max_image_size // tiles_per_row_col
@@ -458,40 +485,86 @@ def compute_occupancy_statistics(
     cell_table["tile_row"] = (cell_table["centroid-1"] // cell_table["tile_size"]).astype(int)
     cell_table["tile_col"] = (cell_table["centroid-0"] // cell_table["tile_size"]).astype(int)

-    # Group by FOV and tile positions, then count occupancy
+    # Create a DataFrame with all combinations of fov, tile_row, tile_col, and pop_col
+    all_combos = pd.MultiIndex.from_product(
+        [
+            cell_table["fov"].unique(),
+            cell_table["tile_row"].unique(),
+            cell_table["tile_col"].unique(),
+            pop_subset
+        ],
+        names=["fov", "tile_row", "tile_col", pop_col]
+    ).to_frame(index=False)
+
     occupancy_counts: pd.DataFrame = cell_table.groupby(
-        ["fov", "tile_row", "tile_col"]
+        ["fov", "tile_row", "tile_col", pop_col]
     ).size().reset_index(name="occupancy")
+    occupancy_counts = pd.merge(
+        all_combos, occupancy_counts,
+        on=["fov", "tile_row", "tile_col", pop_col],
+        how="left"
+    )
+    occupancy_counts["occupancy"] = occupancy_counts["occupancy"].fillna(0)
+
+    occupancy_counts["is_positive"] = occupancy_counts["occupancy"] > positive_threshold
+    # occupancy_counts_grouped: pd.DataFrame = occupancy_counts.groupby(["fov", pop_col]).apply(
+    #     lambda group: group["is_positive"].sum() / occupancy_division_factor(
+    #         group, tiles_per_row_col, max_image_size, cell_table
+    #     ) * 100
+    # ).reset_index(name="percent_positive")
+    occupancy_counts_grouped: pd.DataFrame = occupancy_counts.groupby(["fov", pop_col]).apply(
+        lambda group: group["is_positive"].sum()
+    ).reset_index(name="total_positive")
+
+    return occupancy_counts, occupancy_counts_grouped

-    # define the occupancy_stats array
-    occupancy_stats = xr.DataArray(
-        np.zeros((len(fov_names), tiles_per_row_col, tiles_per_row_col), dtype=np.int16),
-        coords=[fov_names, np.arange(tiles_per_row_col), np.arange(tiles_per_row_col)],
-        dims=["fov", "x", "y"]
-    )
-
-    # Update the DataArray based on occupancy_counts without explicit Python loops
-    for index, row in occupancy_counts.iterrows():
-        occupancy_stats.loc[row["fov"], row["tile_row"], row["tile_col"]] = row["occupancy"]
-
-    # define the tiles that are positive based on threshold
-    occupancy_stats_positivity: xr.DataArray = occupancy_stats > positive_threshold
-
-    # compute the percentage of positive tiles for each FOV
-    occupancy_stats_sum: xr.DataArray = occupancy_stats_positivity.sum(
-        dim=['x', 'y']
-    ) / (tiles_per_row_col ** 2)
-
-    # convert to dict
-    occupancy_stats_dict: Dict[str, float] = occupancy_stats_sum.to_series().to_dict()
-    occupancy_stats_dict = {fov: percentage for fov, percentage in occupancy_stats_dict.items()}
-
-    return occupancy_stats_dict
+    # # define the occupancy_stats array
+    # occupancy_stats = xr.DataArray(
+    #     np.zeros((len(fov_names), tiles_per_row_col, tiles_per_row_col), dtype=np.int16),
+    #     coords=[fov_names, np.arange(tiles_per_row_col), np.arange(tiles_per_row_col)],
+    #     dims=["fov", "x", "y"]
+    # )

+    # # iterate over each population
+    # for pop in pop_subset:
+    #     cell_table_pop = cell_table[cell_table[pop_col] == pop].copy()
+
+    #     # Group by FOV and tile positions, then count occupancy
+    #     occupancy_counts: pd.DataFrame = cell_table_pop.groupby(
+    #         ["fov", "tile_row", "tile_col"]
+    #     ).size().reset_index(name="occupancy")
+
+    #     # define the occupancy_stats array
+    #     occupancy_stats = xr.DataArray(
+    #         np.zeros((len(fov_names), tiles_per_row_col, tiles_per_row_col), dtype=np.int16),
+    #         coords=[fov_names, np.arange(tiles_per_row_col), np.arange(tiles_per_row_col)],
+    #         dims=["fov", "x", "y"]
+    #     )
+
+    #     # Update the DataArray based on occupancy_counts without explicit Python loops
+    #     for index, row in occupancy_counts.iterrows():
+    #         occupancy_stats.loc[row["fov"], row["tile_row"], row["tile_col"]] = row["occupancy"]
+
+    #     # define the tiles that are positive based on threshold
+    #     occupancy_stats_positivity: xr.DataArray = occupancy_stats > positive_threshold
+
+    #     # compute the percentage of positive tiles for each FOV
+    #     occupancy_stats_sum: xr.DataArray = occupancy_stats_positivity.sum(
+    #         dim=['x', 'y']
+    #     ) / (tiles_per_row_col ** 2)
+
+    #     # convert to dict
+    #     occupancy_stats_dict: Dict[str, float] = occupancy_stats_sum.to_series().to_dict()
+    #     occupancy_stats_dict = {fov: percentage for fov, percentage in occupancy_stats_dict.items()}
+
+    #     return occupancy_stats_dict
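A hypothetical call against a toy table (the FOV, populations, and import path are assumptions for illustration, not cohort data) showing the two frames the rewritten function returns: long-format per-tile counts, and per-FOV, per-population positive-tile totals.

import numpy as np
import pandas as pd
from python_files import supplementary_plot_helpers  # assumed import path

rng = np.random.default_rng(0)
toy = pd.DataFrame({
    "fov": ["fov1"] * 400,
    "centroid-0": rng.uniform(0, 2048, 400),
    "centroid-1": rng.uniform(0, 2048, 400),
    "cell_cluster_broad": rng.choice(["T", "Cancer"], 400),
})
counts, grouped = supplementary_plot_helpers.compute_occupancy_statistics(
    toy, tiles_per_row_col=4, max_image_size=2048, positive_threshold=10
)
print(counts.columns.tolist())   # fov, tile_row, tile_col, cell_cluster_broad, occupancy, is_positive
print(grouped.columns.tolist())  # fov, cell_cluster_broad, total_positive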
 def visualize_occupancy_statistics(
     occupancy_stats_table: pd.DataFrame, save_dir: Union[str, pathlib.Path],
-    pop_subset: Optional[List[str]] = None, figsize: Optional[Tuple[float, float]] = (10, 10)
+    pop_col: str = "cell_cluster_broad", pop_subset: Optional[List[str]] = None,
+    figsize: Optional[Tuple[float, float]] = (10, 10)
 ):
     """Visualize the distribution of the percentage of tiles above a positive threshold.
@@ -503,59 +576,72 @@ def visualize_occupancy_statistics(
             positive tiles for each image of the cohort at different positivity thresholds and
             grid sizes
         save_dir (Union[str, pathlib.Path]):
             Directory to save the visualizations in
+        pop_col (str):
+            Column containing the names of the cell populations
         pop_subset (Optional[List[str]]):
            Which populations, if any, to subset on. If None, use all populations
         figsize (Optional[Tuple[float, float]]):
            The figure size to use for the image.
""" - # generate each unique test pair - tiles_threshold_trials = occupancy_stats_table[ - ["num_tiles", "positive_threshold"] - ].drop_duplicates().reset_index(drop=True) - - # define a separate plot on the same grid for each test pair - fig, axs = plt.subplots( - tiles_threshold_trials.shape[0], 1, figsize=figsize - ) - populations_str = f"{', '.join(pop_subset)} populations" if pop_subset else "all populations" - # fig.suptitle(f"Percentage positive tile distributions for {populations_str}", fontsize=24) - # fig.subplots_adjust(top=1.50) - - # define an index to iterate through the subplots - subplot_i = 0 - - for _, trial in tiles_threshold_trials.iterrows(): - grid_size = int(np.sqrt(trial["num_tiles"])) - positive_threshold = trial["positive_threshold"] - - # subset data obtained just for the specified trial - trial_data = occupancy_stats_table[ - (occupancy_stats_table["num_tiles"] == trial["num_tiles"]) & - (occupancy_stats_table["positive_threshold"] == trial["positive_threshold"]) - ] - - # visualize the distribution using a histogram - positive_tile_percentages = trial_data["percent_positive_tiles"].values - axs[subplot_i].hist( - positive_tile_percentages, - facecolor="g", - bins=20, - alpha=0.75 + if pop_subset is not None: + verify_in_list( + specified_populations=pop_subset, + valid_populations=occupancy_stats_table[pop_col].unique() ) - axs[subplot_i].set_title( - f"Grid size: {grid_size}x{grid_size}, Positive threshold: {positive_threshold}" + else: + pop_subset = occupancy_stats_table[pop_col].unique() + + for pop in pop_subset: + occupancy_stats_table_sub = occupancy_stats_table[occupancy_stats_table[pop_col] == pop] + + # generate each unique test pair + tiles_threshold_trials = occupancy_stats_table_sub[ + ["num_tiles", "positive_threshold"] + ].drop_duplicates().reset_index(drop=True) + + # define a separate plot on the same grid for each test pair + fig, axs = plt.subplots( + tiles_threshold_trials.shape[0], 1, figsize=figsize ) + populations_str = f"{pop} cell populations" + # fig.suptitle(f"Percentage positive tile distributions for {populations_str}", fontsize=24) + # fig.subplots_adjust(top=1.50) + + # define an index to iterate through the subplots + subplot_i = 0 + + for _, trial in tiles_threshold_trials.iterrows(): + grid_size = int(np.sqrt(trial["num_tiles"])) + positive_threshold = trial["positive_threshold"] + + # subset data obtained just for the specified trial + trial_data = occupancy_stats_table_sub[ + (occupancy_stats_table_sub["num_tiles"] == trial["num_tiles"]) & + (occupancy_stats_table_sub["positive_threshold"] == trial["positive_threshold"]) + ] + + # visualize the distribution using a histogram + positive_tile_percentages = trial_data["percent_positive"].values + axs[subplot_i].hist( + positive_tile_percentages, + facecolor="g", + bins=20, + alpha=0.75 + ) + axs[subplot_i].set_title( + f"Grid size: {grid_size}x{grid_size}, Positive threshold: {positive_threshold}" + ) - # update the subplot index - subplot_i += 1 + # update the subplot index + subplot_i += 1 - plt.tight_layout() + plt.tight_layout() - # save the figure to save_dir - fig.savefig( - pathlib.Path(save_dir) / f"occupancy_statistic_distributions.png", - dpi=300 - ) + # save the figure to save_dir + fig.savefig( + pathlib.Path(save_dir) / f"occupancy_statistic_distributions_{pop}.png", + dpi=300 + ) def visualize_occupancy_statistics_old( From 3b50d3480468fdca48c24386c59463989e23e33d Mon Sep 17 00:00:00 2001 From: alex-l-kong Date: Wed, 4 Sep 2024 13:56:44 -0700 Subject: [PATCH 13/14] 
From 3b50d3480468fdca48c24386c59463989e23e33d Mon Sep 17 00:00:00 2001
From: alex-l-kong
Date: Wed, 4 Sep 2024 13:56:44 -0700
Subject: [PATCH 13/14] Integrate occupancy stat generation into DF per core

---
 .../postprocessing/5_create_dfs_per_core.py  | 226 +++++++++++++++++-
 python_files/supplementary_plot_helpers.py   |  60 ++++-
 python_files/utils.py                        | 160 +++++++++++++
 3 files changed, 436 insertions(+), 10 deletions(-)

diff --git a/python_files/postprocessing/5_create_dfs_per_core.py b/python_files/postprocessing/5_create_dfs_per_core.py
index a911cc8..7d6e905 100644
--- a/python_files/postprocessing/5_create_dfs_per_core.py
+++ b/python_files/postprocessing/5_create_dfs_per_core.py
@@ -5,7 +5,7 @@
 import numpy as np
 from scipy.stats import spearmanr

-from python_files.utils import create_long_df_by_functional, create_long_df_by_cluster
+from python_files.utils import create_long_df_by_functional, create_long_df_by_cluster, create_long_df_by_occupancy


 # # This file creates plotting-ready data structures to enumerate the frequency, count, and density
@@ -36,6 +36,9 @@
 cell_table_morph = pd.read_csv(os.path.join(analysis_dir, 'cell_table_morph.csv'))
 cell_table_diversity = pd.read_csv(os.path.join(intermediate_dir, 'spatial_analysis/cell_neighbor_analysis/neighborhood_diversity_radius50.csv'))
 cell_table_distances_broad = pd.read_csv(os.path.join(intermediate_dir, 'spatial_analysis/cell_neighbor_analysis/cell_cluster_broad_avg_dists-nearest_1.csv'))
+cell_table_occupancy = pd.read_csv(os.path.join(analysis_dir, 'combined_cell_table_normalized_cell_labels_updated.csv'), usecols=[
+    "fov", "label", "centroid-0", "centroid-1", "cell_meta_cluster", "cell_cluster", "cell_cluster_broad"
+])
 area_df = pd.read_csv(os.path.join(intermediate_dir, 'mask_dir/individual_masks-no_tagg_tls', 'fov_annotation_mask_area.csv'))
 annotations_by_mask = pd.read_csv(os.path.join(intermediate_dir, 'mask_dir/individual_masks-no_tagg_tls', 'cell_annotation_mask.csv'))
 fiber_df = pd.read_csv(os.path.join(intermediate_dir, 'fiber_segmentation_processed_data', 'fiber_object_table.csv'))
@@ -51,6 +54,7 @@
 cell_table_morph = cell_table_morph.merge(harmonized_annotations, on=['fov', 'label'], how='left')
 cell_table_diversity = cell_table_diversity.merge(harmonized_annotations, on=['fov', 'label'], how='left')
 cell_table_distances_broad = cell_table_distances_broad.merge(harmonized_annotations, on=['fov', 'label'], how='left')
+cell_table_occupancy = cell_table_occupancy.merge(harmonized_annotations, on=['fov', 'label'], how='left')

 # check for FOVs present in imaged data that aren't in core metadata
 missing_fovs = cell_table_clusters.loc[~cell_table_clusters.fov.isin(core_metadata.fov), 'fov'].unique()
@@ -61,6 +65,7 @@
 cell_table_morph = cell_table_morph.loc[~cell_table_morph.fov.isin(missing_fovs), :]
 cell_table_diversity = cell_table_diversity.loc[~cell_table_diversity.fov.isin(missing_fovs), :]
 cell_table_distances_broad = cell_table_distances_broad.loc[~cell_table_distances_broad.fov.isin(missing_fovs), :]
+cell_table_occupancy = cell_table_occupancy.loc[~cell_table_occupancy.fov.isin(missing_fovs), :]
 fiber_df = fiber_df.loc[~fiber_df.fov.isin(missing_fovs), :]
 fiber_tile_df = fiber_tile_df.loc[~fiber_tile_df.fov.isin(missing_fovs), :]
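For orientation before the occupancy hunk below: the filtering it performs assumes long-format tables with fov, cell_type, value, metric, and subset columns (a schema inferred from this patch, not taken from the real utils module). A toy version of the min-cells rule the hunk applies:

import pandas as pd

counts = pd.DataFrame({
    "fov": ["f1", "f1", "f2", "f2"],
    "cell_type": ["B", "T", "B", "T"],
    "value": [2, 40, 12, 33],
    "metric": "cluster_broad_count",
    "subset": "all",
})
freqs = counts.assign(metric="cluster_broad_freq", value=[0.05, 0.95, 0.27, 0.73])

min_cells = 5
kept = []
for cell_type in counts.cell_type.unique():
    # keep a FOV's frequency only if it holds enough cells of that type
    keep_fovs = counts.loc[(counts.cell_type == cell_type) & (counts.value >= min_cells), "fov"]
    kept.append(freqs[(freqs.cell_type == cell_type) & (freqs.fov.isin(keep_fovs))])
print(pd.concat(kept))  # the f1 / B row (2 cells) is dropped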
specified for the df
+    drop_cols = ['cell_meta_cluster', 'cell_cluster', 'cell_cluster_broad', 'label']
+    drop_cols.remove(cluster_col_name)
+
+    # create df
+    occupancy_dfs.append(create_long_df_by_occupancy(cell_table=cell_table_occupancy,
+                                                     result_name=result_name,
+                                                     cluster_col_name=cluster_col_name,
+                                                     drop_cols=drop_cols,
+                                                     positive_threshold=0.0))
+
+# create combined df
+total_df_occupancy = pd.concat(occupancy_dfs, axis=0)
+total_df_occupancy = total_df_occupancy.merge(harmonized_metadata, on='fov', how='inner')
+total_df_occupancy = total_df_occupancy.rename(columns={'functional_marker': 'occupancy_feature'})
+
+# save df
+total_df_occupancy.to_csv(os.path.join(output_dir, 'occupancy_df_per_core.csv'), index=False)
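For orientation, the rows written to occupancy_df_per_core.csv above look roughly like this (a hypothetical sketch; the FOV name and values are toy, and occupancy_feature comes from renaming the melted functional_marker column):

    import pandas as pd

    # hypothetical per-core occupancy rows (toy values, not real cohort data)
    occupancy_rows = pd.DataFrame({
        "fov": ["TONIC_TMA2_R1C1", "TONIC_TMA2_R1C1"],
        "cell_type": ["B", "T"],
        "occupancy_feature": ["percent_positive_tiles"] * 2,
        "value": [0.1875, 0.4375],  # fraction of a FOV's tiles above the z-score threshold
        "metric": ["cluster_broad_freq"] * 2,
        "subset": ["all"] * 2,
    })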
+# filter occupancy scores to only include FOVs with at least the specified number of cells
+total_df = pd.read_csv(os.path.join(output_dir, 'cluster_df_per_core.csv'))
+min_cells = 5
+
+filtered_dfs = []
+metrics = [['cluster_broad_count', 'cluster_broad_freq'],
+           ['cluster_count', 'cluster_freq'],
+           ['meta_cluster_count', 'meta_cluster_freq']]
+for metric in metrics:
+    # subset count df to include cells at the relevant clustering resolution
+    for compartment in ['all']:
+        count_df = total_df[total_df.metric == metric[0]]
+        count_df = count_df[count_df.subset == compartment]
+
+        # subset occupancy df to only include occupancy metrics at this resolution
+        occupancy_df = total_df_occupancy[total_df_occupancy.metric == metric[1]]
+        occupancy_df = occupancy_df[occupancy_df.subset == compartment]
+
+        # for each cell type, determine which FOVs have high enough counts to be included
+        for cell_type in count_df.cell_type.unique():
+            keep_df = count_df[count_df.cell_type == cell_type]
+            keep_df = keep_df[keep_df.value >= min_cells]
+            keep_fovs = keep_df.fov.unique()
+
+            # subset occupancy df to only include FOVs with high enough counts
+            keep_features = occupancy_df[occupancy_df.cell_type == cell_type]
+            keep_features = keep_features[keep_features.fov.isin(keep_fovs)]
+
+            # append to list of filtered dfs
+            filtered_dfs.append(keep_features)
+
+filtered_occupancy_df = pd.concat(filtered_dfs)
+
+# save filtered df
+filtered_occupancy_df.to_csv(os.path.join(output_dir, 'occupancy_df_per_core_filtered.csv'), index=False)
+
+# remove occupancy features that are correlated with abundance of the same cell type
+
+if not os.path.exists(os.path.join(intermediate_dir, 'post_processing', 'occupancy_df_keep_features.csv')):
+    density_df = total_df.loc[(total_df.metric == 'cluster_broad_density') & (total_df.subset == 'all')]
+
+    # NOTE: this is purely for consistency, the table when created only includes the cluster_broad_freq metric and all compartments
+    filtered_occupancy_df = filtered_occupancy_df.loc[(filtered_occupancy_df.metric == 'cluster_broad_freq') & (filtered_occupancy_df.subset == 'all')]
+
+    # remove negative lymph node images
+    density_df = density_df.loc[density_df.Timepoint != 'lymphnode_neg', :]
+    filtered_occupancy_df = filtered_occupancy_df.loc[filtered_occupancy_df.fov.isin(density_df.fov.unique())]
+    cell_types = filtered_occupancy_df.cell_type.unique()
+
+    # calculate which pairings to keep
+    keep_cells, keep_features = [], []
+
+    for cell_type in cell_types:
+        density_subset = density_df.loc[density_df.cell_type == cell_type]
+        occupancy_subset = filtered_occupancy_df.loc[filtered_occupancy_df.cell_type == cell_type]
+        occupancy_wide = occupancy_subset.pivot(index='fov', columns='occupancy_feature', values='value')
+        occupancy_wide.reset_index(inplace=True)
+        occupancy_wide = pd.merge(occupancy_wide, density_subset[['fov', 'value']], on='fov', how='inner')
+
+        # get correlations
+        corr_df = occupancy_wide.corr(method='spearman')
+
+        # determine which features to keep
+        corr_vals = corr_df.loc['value', :].abs()
+        corr_vals = corr_vals[corr_vals < 0.7]
+
+        # add to list of features to keep
+        keep_cells.extend(corr_vals.index)
+        keep_features.extend([cell_type] * len(corr_vals.index))
+
+    # NOTE: keep_cells holds the retained feature names, keep_features the matching cell types
+    keep_df = pd.DataFrame({'cell_type': keep_features, 'feature_name': keep_cells})
+
+    keep_df.to_csv(os.path.join(intermediate_dir, 'post_processing', 'occupancy_df_keep_features.csv'), index=False)
+else:
+    keep_df = pd.read_csv(os.path.join(intermediate_dir, 'post_processing', 'occupancy_df_keep_features.csv'))
+
+deduped_dfs = []
+for cell_type in keep_df.cell_type.unique():
+    keep_features = keep_df.loc[keep_df.cell_type == cell_type, 'feature_name'].unique()
+    if len(keep_features) > 0:
+        keep_df_subset = filtered_occupancy_df.loc[filtered_occupancy_df.cell_type == cell_type]
+        keep_df_subset = keep_df_subset.loc[keep_df_subset.occupancy_feature.isin(keep_features)]
+        deduped_dfs.append(keep_df_subset)
+
+deduped_occupancy_df = pd.concat(deduped_dfs)
+
+# save deduped df
+deduped_occupancy_df.to_csv(os.path.join(output_dir, 'occupancy_df_per_core_deduped.csv'), index=False)
+
+# create version aggregated by timepoint
+deduped_occupancy_df_grouped = deduped_occupancy_df.groupby(['Tissue_ID', 'cell_type', 'occupancy_feature', 'metric', 'subset'])
+deduped_occupancy_df_timepoint = deduped_occupancy_df_grouped['value'].agg([np.mean, np.std])
+deduped_occupancy_df_timepoint.reset_index(inplace=True)
+deduped_occupancy_df_timepoint = deduped_occupancy_df_timepoint.merge(harmonized_metadata.drop(['fov', 'MIBI_data_generated'], axis=1).drop_duplicates(), on='Tissue_ID')
+
+# save timepoint df
+deduped_occupancy_df_timepoint.to_csv(os.path.join(output_dir, 'occupancy_df_per_timepoint_deduped.csv'), index=False)
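The correlation screen above follows the same pattern as the distance-feature dedup; a minimal sketch of the idea on toy values (the 0.7 Spearman cutoff matches the script):

    import pandas as pd

    # one occupancy feature alongside the density ('value') of the same cell type (toy values)
    occupancy_wide = pd.DataFrame({
        "percent_positive_tiles": [0.10, 0.30, 0.50, 0.70],
        "value": [5.0, 9.0, 16.0, 20.0],  # monotonic with the feature, so Spearman rho = 1.0
    })

    corr_vals = occupancy_wide.corr(method="spearman").loc["value", :].abs()
    keep = corr_vals[corr_vals < 0.7].index.tolist()
    print(keep)  # [] here: the feature is dropped because it just tracks cell density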
+
+
+# create manual df with total morphology marker average across all cells in an image
+morph_table_small = cell_table_morph.loc[:, ~cell_table_morph.columns.isin(['cell_cluster', 'cell_cluster_broad', 'cell_meta_cluster', 'label', 'tumor_region'])]
+
+# group by specified columns
+grouped_table = morph_table_small.groupby('fov')
+transformed = grouped_table.agg(np.mean)
+transformed.reset_index(inplace=True)
+
+# reshape to long df
+long_df = pd.melt(transformed, id_vars=['fov'], var_name='morphology_feature')
+long_df['metric'] = 'total_freq'
+long_df['cell_type'] = 'all'
+long_df['subset'] = 'all'
+
+long_df = long_df.merge(harmonized_metadata, on='fov', how='inner')
+
+long_df.to_csv(os.path.join(output_dir, 'total_morph_per_core.csv'), index=False)
+
+# filter morphology markers to only include FOVs with at least the specified number of cells
+total_df = pd.read_csv(os.path.join(output_dir, 'cluster_df_per_core.csv'))
+min_cells = 5
+
+filtered_dfs = []
+metrics = [['cluster_broad_count', 'cluster_broad_freq'],
+           ['cluster_count', 'cluster_freq'],
+           ['meta_cluster_count', 'meta_cluster_freq']]
+for metric in metrics:
+    # subset count df to include cells at the relevant clustering resolution
+    for compartment in ['cancer_core', 'cancer_border', 'stroma_core', 'stroma_border',
+                        'tls', 'tagg', 'all']:
+        count_df = total_df[total_df.metric == metric[0]]
+        count_df = count_df[count_df.subset == compartment]
+
+        # subset morphology df to only include morphology metrics at this resolution
+        morph_df = total_df_morph[total_df_morph.metric == metric[1]]
+        morph_df = morph_df[morph_df.subset == compartment]
+
+        # for each cell type, determine which FOVs have high enough counts to be included
+        for cell_type in count_df.cell_type.unique():
+            keep_df = count_df[count_df.cell_type == cell_type]
+            keep_df = keep_df[keep_df.value >= min_cells]
+            keep_fovs = keep_df.fov.unique()
+
+            # subset morphology df to only include FOVs with high enough counts
+            keep_features = morph_df[morph_df.cell_type == cell_type]
+            keep_features = keep_features[keep_features.fov.isin(keep_fovs)]
+
+            # append to list of filtered dfs
+            filtered_dfs.append(keep_features)
+
+filtered_dfs.append(long_df)
+filtered_morph_df = pd.concat(filtered_dfs)
+
+# save filtered df
+filtered_morph_df.to_csv(os.path.join(output_dir, 'morph_df_per_core_filtered.csv'), index=False)
+
+# create version aggregated by timepoint
+filtered_morph_df_grouped = filtered_morph_df.groupby(['Tissue_ID', 'cell_type', 'morphology_feature', 'metric', 'subset'])
+filtered_morph_df_timepoint = filtered_morph_df_grouped['value'].agg([np.mean, np.std])
+filtered_morph_df_timepoint.reset_index(inplace=True)
+filtered_morph_df_timepoint = filtered_morph_df_timepoint.merge(harmonized_metadata.drop(['fov', 'MIBI_data_generated'], axis=1).drop_duplicates(), on='Tissue_ID')
+
+# save timepoint df
+filtered_morph_df_timepoint.to_csv(os.path.join(output_dir, 'morph_df_per_timepoint_filtered.csv'), index=False)
+
+# remove redundant morphology features
+block1 = ['area', 'major_axis_length', 'minor_axis_length', 'perimeter', 'convex_area', 'equivalent_diameter']
+
+block2 = ['area_nuclear', 'major_axis_length_nuclear', 'minor_axis_length_nuclear', 'perimeter_nuclear', 'convex_area_nuclear', 'equivalent_diameter_nuclear']
+
+block3 = ['eccentricity', 'major_axis_equiv_diam_ratio']
+
+block4 = ['eccentricity_nuclear', 'major_axis_equiv_diam_ratio_nuclear', 'perim_square_over_area_nuclear']
+
+deduped_morph_df = filtered_morph_df.loc[~filtered_morph_df.morphology_feature.isin(block1[1:] + block2[1:] + block3[1:] + block4[1:]), :]
+
+# only keep complex morphology features for cancer cells, remove everything except area and nc_ratio for others
+cancer_clusters = ['Cancer', 'Cancer_EMT', 'Cancer_Other', 'Cancer_CD56', 'Cancer_CK17',
+                   'Cancer_Ecad', 'Cancer_Mono', 'Cancer_SMA', 'Cancer_Vim']
+basic_morph_features = ['area', 'area_nuclear', 'nc_ratio']
+
+deduped_morph_df = deduped_morph_df.loc[deduped_morph_df.cell_type.isin(cancer_clusters) | deduped_morph_df.morphology_feature.isin(basic_morph_features), :]
+
+# save deduped df
+deduped_morph_df.to_csv(os.path.join(output_dir, 'morph_df_per_core_filtered_deduped.csv'), index=False)
+
+# same for timepoints
+deduped_morph_df_timepoint = filtered_morph_df_timepoint.loc[~filtered_morph_df_timepoint.morphology_feature.isin(block1[1:] + block2[1:] + block3[1:] + block4[1:]), :]
+deduped_morph_df_timepoint = deduped_morph_df_timepoint.loc[deduped_morph_df_timepoint.cell_type.isin(cancer_clusters) | deduped_morph_df_timepoint.morphology_feature.isin(basic_morph_features), :]
+deduped_morph_df_timepoint.to_csv(os.path.join(output_dir, 'morph_df_per_timepoint_filtered_deduped.csv'), index=False)
+
+
+
 # fiber analysis
 fiber_df = fiber_df.loc[:, ~fiber_df.columns.isin(['label', 'centroid-0', 'centroid-1'])]
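The min-cells filter applied twice above (occupancy, then morphology) keeps a (cell type, FOV) pair only when that FOV contains at least min_cells cells of the type; a self-contained sketch with toy tables:

    import pandas as pd

    # toy count and feature tables mirroring the long format of cluster_df_per_core.csv
    count_df = pd.DataFrame({
        "fov": ["fov1", "fov1", "fov2"],
        "cell_type": ["B", "T", "B"],
        "value": [12, 3, 50],
    })
    feature_df = pd.DataFrame({
        "fov": ["fov1", "fov1", "fov2"],
        "cell_type": ["B", "T", "B"],
        "value": [0.25, 0.10, 0.40],
    })

    min_cells = 5
    filtered = []
    for cell_type in count_df.cell_type.unique():
        # FOVs with enough cells of this type
        keep_fovs = count_df.loc[
            (count_df.cell_type == cell_type) & (count_df.value >= min_cells), "fov"
        ].unique()
        filtered.append(feature_df[
            (feature_df.cell_type == cell_type) & (feature_df.fov.isin(keep_fovs))
        ])

    filtered_df = pd.concat(filtered)  # the fov1/T row is dropped: only 3 T cells there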
diff --git a/python_files/supplementary_plot_helpers.py b/python_files/supplementary_plot_helpers.py
index 5426185..7cc2900 100644
--- a/python_files/supplementary_plot_helpers.py
+++ b/python_files/supplementary_plot_helpers.py
@@ -439,10 +439,11 @@ def occupancy_division_factor(occupancy_group: pd.api.typing.DataFrameGroupBy,
 # occupancy statistic helpers
 def compute_occupancy_statistics(
     cell_table: pd.DataFrame, pop_col: str = "cell_cluster_broad",
-    pop_subset: Optional[List[str]] = None, tiles_per_row_col: int = 8,
-    max_image_size: int = 2048, positive_threshold: int = 20
+    pop_subset: Optional[List[str]] = None, tiles_per_row_col: int = 4,
+    max_image_size: int = 2048, positive_threshold: float = 0.0,
+    sample_dir: Optional[Union[str, pathlib.Path]] = None
 ):
-    """Compute the occupancy statistics over a cohort.
+    """Compute the occupancy statistics over a cohort, based on z-scored cell counts per tile
+    across the cohort.
 
     Args:
         cell_table (pd.DataFrame):
@@ -456,9 +457,9 @@ def compute_occupancy_statistics(
             The row/col dims of tiles to define over each image
         max_image_size (int):
             Maximum size of an image passed in, assuming square images
-            NOTE: all images should have an image size that is a factor of max_image_size
-        positive_threshold (int):
-            The cell count in a tile required for positivity
+            NOTE: all images should have an image size that is a factor of `max_image_size`
+        positive_threshold (float):
+            The z-scored cell count in a tile required for positivity
+        sample_dir (Optional[Union[str, pathlib.Path]]):
+            The directory containing the image data for each FOV, used to look up the
+            per-FOV image size
 
     Returns:
-        Dict[str, float]:
+        pd.DataFrame:
+            The positive tile percentages per FOV and cell type
@@ -496,6 +497,8 @@ def compute_occupancy_statistics(
         names=["fov", "tile_row", "tile_col", pop_col]
     ).to_frame(index=False)
 
+    # compute the total occupancy of each tile for each population type
+    # NOTE: need to merge this way to account for tiles with 0 counts
     occupancy_counts: pd.DataFrame = cell_table.groupby(
         ["fov", "tile_row", "tile_col", pop_col]
     ).size().reset_index(name="occupancy")
@@ -505,9 +508,21 @@ def compute_occupancy_statistics(
         how="left"
     )
     occupancy_counts["occupancy"].fillna(0, inplace=True)
-    print(occupancy_counts[["tile_row", "tile_col"]].drop_duplicates())
+    # print(occupancy_counts[["tile_row", "tile_col"]].drop_duplicates())
+
+    # zscore the tile counts across the cohort grouped by the cell types
+    occupancy_counts["zscore_occupancy"] = occupancy_counts.groupby(
+        [pop_col]
+    )["occupancy"].transform(lambda x: zscore(x, ddof=0))
 
-    occupancy_counts["is_positive"] = occupancy_counts["occupancy"] > positive_threshold
+    # mark tiles as positive depending on the positive_threshold value
+    occupancy_counts["is_positive"] = occupancy_counts["zscore_occupancy"] > positive_threshold
+
+    # total up the positive tiles per FOV and cell type
     # occupancy_counts_grouped: pd.DataFrame = occupancy_counts.groupby(["fov", pop_col]).apply(
     #     lambda row: row["is_positive"].sum() / occupancy_division_factor(
     #         row, tiles_per_row_col, max_image_size, cell_table
     #     )
     # ).reset_index(name="percent_positive")
     occupancy_counts_grouped: pd.DataFrame = occupancy_counts.groupby(["fov", pop_col]).apply(
         lambda row: row["is_positive"].sum()
-    ).reset_index(name="total_positive")
+    ).reset_index(name="total_positive_tiles")
+
+    # get the per-FOV image size, this determines how many tiles there are when finding
+    # percentages; if no sample_dir is given, assume every FOV is max_image_size square
+    if sample_dir is None:
+        occupancy_counts_grouped["image_size"] = max_image_size
+    else:
+        occupancy_counts_grouped["image_size"] = occupancy_counts_grouped.apply(
+            lambda row: io.imread(
+                os.path.join(sample_dir, row["fov"], "Calprotectin.tiff")
+            ).shape[0],
+            axis=1
+        )
+    occupancy_counts_grouped["max_tiles"] = occupancy_counts_grouped.apply(
+        lambda row: (tiles_per_row_col * row["image_size"] / max_image_size) ** 2,
+        axis=1
+    )
+
+    # determine the percent positivity of tiles
+    occupancy_counts_grouped["percent_positive_tiles"] = \
+        occupancy_counts_grouped["total_positive_tiles"] / occupancy_counts_grouped["max_tiles"]
+
+    return occupancy_counts_grouped
 
-    return occupancy_counts, occupancy_counts_grouped
+    # return occupancy_counts, occupancy_counts_grouped
 
     # # define the occupancy_stats array
     # occupancy_stats = xr.DataArray(
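A quick sanity check of the per-FOV tile accounting in compute_occupancy_statistics: the tile size in pixels is fixed cohort-wide, so a smaller square image simply holds proportionally fewer tiles per row. Assuming image sides divide max_image_size, as the docstring requires:

    tiles_per_row_col = 4
    max_image_size = 2048
    tile_size = max_image_size // tiles_per_row_col  # 512 px for every FOV

    for image_size in [2048, 1024, 512]:
        tiles_per_row = image_size // tile_size
        max_tiles = (tiles_per_row_col * image_size / max_image_size) ** 2
        assert max_tiles == tiles_per_row ** 2
        print(image_size, int(max_tiles))  # 2048 -> 16, 1024 -> 4, 512 -> 1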
diff --git a/python_files/utils.py b/python_files/utils.py
index 7bfe0fa..24a319c 100644
--- a/python_files/utils.py
+++ b/python_files/utils.py
@@ -4,6 +4,7 @@
 import pandas as pd
 from skimage import morphology
 from scipy.ndimage import gaussian_filter
+from scipy.stats import zscore
 from skimage.measure import label
 import skimage.io as io
 
@@ -221,6 +222,165 @@ def create_long_df_by_functional(func_table, cluster_col_name, drop_cols, result
     return long_df_all
 
 
+def occupancy_df_helper(cell_table, cluster_col_name, drop_cols, result_name, subset_col=None,
+                        tiles_per_row_col=4, max_image_size=2048, positive_threshold=0,
+                        sample_dir=None):
+    """Helper function for creating the long-format occupancy table
+
+    Args:
+        cell_table (pd.DataFrame): the dataframe containing information on each cell
+        cluster_col_name (str): the column name in cell_table that contains the cluster information
+        drop_cols (list): list of columns to drop from cell_table
+        result_name (str): the name of this statistic in the returned df
+        subset_col (str): the name of the subset col to group by, if not provided then skip
+        tiles_per_row_col (int): the row/col dims of tiles to define over each image
+        max_image_size (int): maximum size of an image passed in, assuming square images
+        positive_threshold (float): the z-scored cell count in a tile required for positivity
+        sample_dir (str): directory containing the image data for each FOV, used to look up
+            the per-FOV image size
+
+    Returns:
+        pd.DataFrame: occupancy stats in long format"""
+
+    verify_in_list(cell_type_col=cluster_col_name, cell_table_columns=cell_table.columns)
+    verify_in_list(drop_cols=drop_cols, cell_table_columns=cell_table.columns)
+    if subset_col is not None:
+        verify_in_list(subset_col=subset_col, cell_table_columns=cell_table.columns)
+
+    # remove the drop columns
+    cell_table_small = cell_table.loc[:, ~cell_table.columns.isin(drop_cols)]
+
+    # get all the FOV names
+    fov_names = cell_table_small["fov"].unique()
+
+    # if a population subset is specified, first validate then truncate the cell table accordingly
+    if pop_subset:
+        verify_in_list(
+            specified_populations=pop_subset,
+            valid_populations=cell_table_small[cluster_col_name].unique()
+        )
+    # otherwise, use all possible subsets as defined by pop_col
+    else:
+        pop_subset = cell_table_small[cluster_col_name].unique()
+
+    # define the tile size in pixels for each FOV
+    tile_size = max_image_size // tiles_per_row_col
+
+    # define the tile for each cell
+    cell_table_small["tile_row"] = (cell_table_small["centroid-1"] // tile_size).astype(int)
+    cell_table_small["tile_col"] = (cell_table_small["centroid-0"] // tile_size).astype(int)
+
+    # Create a DataFrame with all combinations of fov, tile_row, tile_col, and cluster_col_name
+    all_combos: pd.DataFrame = pd.MultiIndex.from_product(
+        [
+            cell_table_small["fov"].unique(),
+            cell_table_small["tile_row"].unique(),
+            cell_table_small["tile_col"].unique(),
+            cell_table_small[cluster_col_name].unique()
+        ],
+        names=["fov", "tile_row", "tile_col", cluster_col_name]
+    ).to_frame(index=False)
+
+    occupancy_count_groupby = ["fov", "tile_row", "tile_col", cluster_col_name]
+    if subset_col is not None:
+        occupancy_count_groupby.append(subset_col)
+
+    # compute the total occupancy of each tile for each population type
+    # NOTE: need to merge this way to account for tiles with 0 counts
+    occupancy_counts = cell_table_small.groupby(
+        occupancy_count_groupby
+    ).size().reset_index(name="occupancy")
+    occupancy_counts = pd.merge(
+        all_combos, occupancy_counts,
+        on=occupancy_count_groupby,
+        how="left"
+    )
+    occupancy_counts["occupancy"].fillna(0, inplace=True)
+    # print(occupancy_counts[["tile_row", "tile_col"]].drop_duplicates())
+
+    # zscore the tile counts across the cohort grouped by the cell types
+    occupancy_counts["zscore_occupancy"] = occupancy_counts.groupby(
+        [cluster_col_name]
+    )["occupancy"].transform(lambda x: zscore(x, ddof=0))
+
+    # mark tiles as positive depending on the positive_threshold value
+    occupancy_counts["is_positive"] = occupancy_counts["zscore_occupancy"] > positive_threshold
+
+    # total up the positive tiles per FOV and cell type
+    occupancy_zscore_groupby = ["fov", cluster_col_name]
+    if subset_col is not None:
+        occupancy_zscore_groupby.append(subset_col)
+    occupancy_counts_grouped: pd.DataFrame = occupancy_counts.groupby(occupancy_zscore_groupby).apply(
+        lambda row: row["is_positive"].sum()
+    ).reset_index(name="total_positive_tiles")
+
+    # get the per-FOV image size, this determines how many tiles there are when finding
+    # percentages; if no sample_dir is given, assume every FOV is max_image_size square
+    if sample_dir is None:
+        occupancy_counts_grouped["image_size"] = max_image_size
+    else:
+        occupancy_counts_grouped["image_size"] = occupancy_counts_grouped.apply(
+            lambda row: io.imread(
+                os.path.join(sample_dir, row["fov"], "Calprotectin.tiff")
+            ).shape[0],
+            axis=1
+        )
+    occupancy_counts_grouped["max_tiles"] = occupancy_counts_grouped.apply(
+        lambda row: (tiles_per_row_col * row["image_size"] / max_image_size) ** 2,
+        axis=1
+    )
+
+    # determine the percent positivity of tiles
+    occupancy_counts_grouped["percent_positive_tiles"] = \
+        occupancy_counts_grouped["total_positive_tiles"] / occupancy_counts_grouped["max_tiles"]
+
+    # reshape to long df
+    occupancy_long_df = pd.melt(
+        occupancy_counts_grouped[occupancy_zscore_groupby + ["percent_positive_tiles"]],
+        id_vars=occupancy_zscore_groupby,
+        var_name='functional_marker'
+    )
+    occupancy_long_df['metric'] = result_name
+    occupancy_long_df = occupancy_long_df.rename(columns={cluster_col_name: "cell_type"})
+    if subset_col is not None:
+        occupancy_long_df = occupancy_long_df.rename(columns={subset_col: "subset"})
+
+    return occupancy_long_df
+
+
+def create_long_df_by_occupancy(cell_table, cluster_col_name, drop_cols, result_name,
+                                subset_col=None, tiles_per_row_col=4, max_image_size=2048,
+                                positive_threshold=0, sample_dir=None):
+    """Compute the occupancy statistics over a cohort, based on z-scored cell counts per tile
+    across the cohort.
+
+    Args:
+        cell_table (pd.DataFrame): the dataframe containing information on each cell
+        cluster_col_name (str): the column name in cell_table that contains the cluster information
+        drop_cols (list): list of columns to drop from cell_table
+        result_name (str): the name of this statistic in the returned df
+        subset_col (str): the column name in cell_table to subset by
+        tiles_per_row_col (int): the row/col dims of tiles to define over each image
+        max_image_size (int): maximum size of an image passed in, assuming square images
+        positive_threshold (float): the z-scored cell count in a tile required for positivity
+        sample_dir (str): directory containing the image data for each FOV, used to look up
+            the per-FOV image size
+
+    Returns:
+        pd.DataFrame: long format dataframe containing the summarized occupancy data"""
+
+    # first generate df without subsetting
+    drop_cols_all = drop_cols.copy()
+    if subset_col is not None:
+        drop_cols_all = drop_cols + [subset_col]
+
+    occupancy_long_all_compartments = occupancy_df_helper(
+        cell_table, cluster_col_name, drop_cols_all, result_name, None,
+        tiles_per_row_col, max_image_size, positive_threshold, sample_dir
+    )
+    occupancy_long_all_compartments["subset"] = "all"
+
+    # if no subset column is given, the all-compartments df is the final result
+    occupancy_long_comb = occupancy_long_all_compartments
+
+    if subset_col is not None:
+        occupancy_long_per_compartment = occupancy_df_helper(
+            cell_table, cluster_col_name, drop_cols, result_name, subset_col,
+            tiles_per_row_col, max_image_size, positive_threshold, sample_dir
+        )
+        occupancy_long_comb = pd.concat(
+            [occupancy_long_all_compartments, occupancy_long_per_compartment]
+        )
+
+    return occupancy_long_comb
+
+
 def create_channel_mask(img, intensity_thresh, sigma, min_mask_size=0, max_hole_size=100000):
     """Generates a binary mask from a single channel image
 
From 3ed1549029bc0585808c823da6f2a93e9009be9a Mon Sep 17 00:00:00 2001
From: alex-l-kong
Date: Wed, 4 Sep 2024 15:05:09 -0700
Subject: [PATCH 14/14] Remove unnecessary pop_subset

---
 .../postprocessing/5_create_dfs_per_core.py  | 2040 +++++++++--------
 python_files/utils.py                        |   18 +-
 2 files changed, 1026 insertions(+), 1032 deletions(-)

diff --git a/python_files/postprocessing/5_create_dfs_per_core.py b/python_files/postprocessing/5_create_dfs_per_core.py
index 7d6e905..01b0970 100644
--- a/python_files/postprocessing/5_create_dfs_per_core.py
+++ b/python_files/postprocessing/5_create_dfs_per_core.py
@@ -5,7 +5,11 @@
 import numpy as np
 from scipy.stats import spearmanr
 
-from python_files.utils import create_long_df_by_functional, create_long_df_by_cluster, create_long_df_by_occupancy
+# from python_files.utils import create_long_df_by_functional, create_long_df_by_cluster, create_long_df_by_occupancy
+import sys
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+# from python_files.utils import create_long_df_by_functional, create_long_df_by_cluster, create_long_df_by_occupancy
+from utils import create_long_df_by_functional, create_long_df_by_cluster, create_long_df_by_occupancy
 
 
 # # This file creates plotting-ready data structures to enumerate the frequency, count, and density
@@ -29,1001 +33,1001 @@
 # load relevant tables
 core_metadata = pd.read_csv(os.path.join(intermediate_dir, 'metadata', 'TONIC_data_per_core.csv'))
-timepoint_metadata = pd.read_csv(os.path.join(intermediate_dir, 'metadata', 'TONIC_data_per_timepoint.csv'))
+# timepoint_metadata = pd.read_csv(os.path.join(intermediate_dir, 'metadata', 'TONIC_data_per_timepoint.csv'))
 harmonized_metadata = pd.read_csv(os.path.join(analysis_dir, 'harmonized_metadata.csv'))
 cell_table_clusters = pd.read_csv(os.path.join(analysis_dir, 'cell_table_clusters.csv'))
-cell_table_func = pd.read_csv(os.path.join(analysis_dir, 'cell_table_func_all.csv'))
-cell_table_morph = pd.read_csv(os.path.join(analysis_dir, 'cell_table_morph.csv')) -cell_table_diversity = pd.read_csv(os.path.join(intermediate_dir, 'spatial_analysis/cell_neighbor_analysis/neighborhood_diversity_radius50.csv')) -cell_table_distances_broad = pd.read_csv(os.path.join(intermediate_dir, 'spatial_analysis/cell_neighbor_analysis/cell_cluster_broad_avg_dists-nearest_1.csv')) -cell_table_occupancy = pd.read_csv(os.path.join(analysis_dir, 'combined_cell_table_normalized_cell_labels_updated'), usecols=[ +# cell_table_func = pd.read_csv(os.path.join(analysis_dir, 'cell_table_func_all.csv')) +# cell_table_morph = pd.read_csv(os.path.join(analysis_dir, 'cell_table_morph.csv')) +# cell_table_diversity = pd.read_csv(os.path.join(intermediate_dir, 'spatial_analysis/cell_neighbor_analysis/neighborhood_diversity_radius50.csv')) +# cell_table_distances_broad = pd.read_csv(os.path.join(intermediate_dir, 'spatial_analysis/cell_neighbor_analysis/cell_cluster_broad_avg_dists-nearest_1.csv')) +cell_table_occupancy = pd.read_csv(os.path.join(analysis_dir, 'combined_cell_table_normalized_cell_labels_updated.csv'), usecols=[ "fov", "label", "centroid-0", "centroid-1", "cell_meta_cluster", "cell_cluster", "cell_cluster_broad" ]) -area_df = pd.read_csv(os.path.join(intermediate_dir, 'mask_dir/individual_masks-no_tagg_tls', 'fov_annotation_mask_area.csv')) -annotations_by_mask = pd.read_csv(os.path.join(intermediate_dir, 'mask_dir/individual_masks-no_tagg_tls', 'cell_annotation_mask.csv')) -fiber_df = pd.read_csv(os.path.join(intermediate_dir, 'fiber_segmentation_processed_data', 'fiber_object_table.csv')) -fiber_tile_df = pd.read_csv(os.path.join(intermediate_dir, 'fiber_segmentation_processed_data/tile_stats_512', 'fiber_stats_table-tile_512.csv')) +# area_df = pd.read_csv(os.path.join(intermediate_dir, 'mask_dir/individual_masks-no_tagg_tls', 'fov_annotation_mask_area.csv')) +annotations_by_mask = pd.read_csv(os.path.join(intermediate_dir, 'mask_dir', 'cell_annotation_mask.csv')) +# fiber_df = pd.read_csv(os.path.join(intermediate_dir, 'fiber_segmentation_processed_data', 'fiber_object_table.csv')) +# fiber_tile_df = pd.read_csv(os.path.join(intermediate_dir, 'fiber_segmentation_processed_data/tile_stats_512', 'fiber_stats_table-tile_512.csv')) # merge cell-level annotations harmonized_annotations = annotations_by_mask harmonized_annotations = harmonized_annotations.rename(columns={'mask_name': 'tumor_region'}) assert len(harmonized_annotations) == len(cell_table_clusters) -cell_table_clusters = cell_table_clusters.merge(harmonized_annotations, on=['fov', 'label'], how='left') -cell_table_func = cell_table_func.merge(harmonized_annotations, on=['fov', 'label'], how='left') -cell_table_morph = cell_table_morph.merge(harmonized_annotations, on=['fov', 'label'], how='left') -cell_table_diversity = cell_table_diversity.merge(harmonized_annotations, on=['fov', 'label'], how='left') -cell_table_distances_broad = cell_table_distances_broad.merge(harmonized_annotations, on=['fov', 'label'], how='left') +# cell_table_clusters = cell_table_clusters.merge(harmonized_annotations, on=['fov', 'label'], how='left') +# cell_table_func = cell_table_func.merge(harmonized_annotations, on=['fov', 'label'], how='left') +# cell_table_morph = cell_table_morph.merge(harmonized_annotations, on=['fov', 'label'], how='left') +# cell_table_diversity = cell_table_diversity.merge(harmonized_annotations, on=['fov', 'label'], how='left') +# cell_table_distances_broad = cell_table_distances_broad.merge(harmonized_annotations, 
on=['fov', 'label'], how='left') cell_table_occupancy = cell_table_occupancy.merge(harmonized_annotations, on=['fov', 'label'], how='left') # check for FOVs present in imaged data that aren't in core metadata missing_fovs = cell_table_clusters.loc[~cell_table_clusters.fov.isin(core_metadata.fov), 'fov'].unique() -# TONIC_TMA24_R11C2 and TONIC_TMA24_R11C3 are wrong tissue -cell_table_clusters = cell_table_clusters.loc[~cell_table_clusters.fov.isin(missing_fovs), :] -cell_table_func = cell_table_func.loc[~cell_table_func.fov.isin(missing_fovs), :] -cell_table_morph = cell_table_morph.loc[~cell_table_morph.fov.isin(missing_fovs), :] -cell_table_diversity = cell_table_diversity.loc[~cell_table_diversity.fov.isin(missing_fovs), :] -cell_table_distances_broad = cell_table_distances_broad.loc[~cell_table_distances_broad.fov.isin(missing_fovs), :] +# # TONIC_TMA24_R11C2 and TONIC_TMA24_R11C3 are wrong tissue +# cell_table_clusters = cell_table_clusters.loc[~cell_table_clusters.fov.isin(missing_fovs), :] +# cell_table_func = cell_table_func.loc[~cell_table_func.fov.isin(missing_fovs), :] +# cell_table_morph = cell_table_morph.loc[~cell_table_morph.fov.isin(missing_fovs), :] +# cell_table_diversity = cell_table_diversity.loc[~cell_table_diversity.fov.isin(missing_fovs), :] +# cell_table_distances_broad = cell_table_distances_broad.loc[~cell_table_distances_broad.fov.isin(missing_fovs), :] cell_table_occupancy = cell_table_occupancy.loc[~cell_table_occupancy.fov.isin(missing_fovs), :] -fiber_df = fiber_df.loc[~fiber_df.fov.isin(missing_fovs), :] -fiber_tile_df = fiber_tile_df.loc[~fiber_tile_df.fov.isin(missing_fovs), :] +# fiber_df = fiber_df.loc[~fiber_df.fov.isin(missing_fovs), :] +# fiber_tile_df = fiber_tile_df.loc[~fiber_tile_df.fov.isin(missing_fovs), :] + +# # +# # Generate counts and proportions of cell clusters per FOV +# # + +# # Specify the types of cluster dfs to produce. Each row corresponds to a different way of +# # summarizing the data. 
The first item in each list is the name for the df, the second is the +# # name of the column to use to for the cluster labels, and the third is a boolean flag controlling +# # whether normalized frequencies or total counts will be returned + +# cluster_df_params = [['cluster_broad_freq', 'cell_cluster_broad', True], +# ['cluster_broad_count', 'cell_cluster_broad', False], +# ['cluster_freq', 'cell_cluster', True], +# ['cluster_count', 'cell_cluster', False], +# ['meta_cluster_freq', 'cell_meta_cluster', True], +# ['meta_cluster_count', 'cell_meta_cluster', False]] + +# cluster_dfs = [] +# for result_name, cluster_col_name, normalize in cluster_df_params: +# cluster_dfs.append(create_long_df_by_cluster(cell_table=cell_table_clusters, +# result_name=result_name, +# cluster_col_name=cluster_col_name, +# subset_col='tumor_region', +# normalize=normalize)) + + +# # create masks for dfs looking at only a subset of cells + +# # proportion of T cell subsets +# tcell_mask = cell_table_clusters['cell_cluster'].isin(['Treg', 'CD8T', 'CD4T', 'T_Other']) + +# # proportion of immune cell subsets +# immune_mask = cell_table_clusters['cell_cluster_broad'].isin(['Mono_Mac', 'T', +# 'Granulocyte', 'NK', 'B']) +# immune_mask_2 = cell_table_clusters.cell_cluster == 'Immune_Other' +# immune_mask = np.logical_or(immune_mask, immune_mask_2) + +# # proportion of stromal subsets +# stroma_mask = cell_table_clusters['cell_cluster_broad'].isin(['Stroma']) + +# # proportion of cancer subsets +# cancer_mask = cell_table_clusters['cell_cluster_broad'].isin(['Cancer']) + +# cluster_mask_params = [['tcell_freq', 'cell_cluster', True, tcell_mask], +# ['immune_freq', 'cell_cluster', True, immune_mask], +# ['stroma_freq', 'cell_meta_cluster', True, stroma_mask], +# ['cancer_freq', 'cell_meta_cluster', True, cancer_mask]] + +# for result_name, cluster_col_name, normalize, mask in cluster_mask_params: +# cluster_dfs.append(create_long_df_by_cluster(cell_table=cell_table_clusters.loc[mask, :], +# result_name=result_name, +# cluster_col_name=cluster_col_name, +# subset_col='tumor_region', +# normalize=normalize)) + +# # calculate total number of cells per image +# grouped_cell_counts = cell_table_clusters[['fov']].groupby('fov').value_counts() +# grouped_cell_counts = pd.DataFrame(grouped_cell_counts) +# grouped_cell_counts.columns = ['value'] +# grouped_cell_counts.reset_index(inplace=True) +# grouped_cell_counts['metric'] = 'total_cell_count' +# grouped_cell_counts['cell_type'] = 'all' +# grouped_cell_counts['subset'] = 'all' + +# # + +# # calculate total number of cells per region per image +# grouped_cell_counts_region = cell_table_clusters[['fov', 'tumor_region']].groupby(['fov', 'tumor_region']).value_counts() +# grouped_cell_counts_region = pd.DataFrame(grouped_cell_counts_region) +# grouped_cell_counts_region.columns = ['value'] +# grouped_cell_counts_region.reset_index(inplace=True) +# grouped_cell_counts_region['metric'] = 'total_cell_count' +# grouped_cell_counts_region.rename(columns={'tumor_region': 'subset'}, inplace=True) +# grouped_cell_counts_region['cell_type'] = 'all' + +# # calculate proportions of cells per region per image +# grouped_cell_freq_region = cell_table_clusters[['fov', 'tumor_region']].groupby(['fov']) +# grouped_cell_freq_region = grouped_cell_freq_region['tumor_region'].value_counts(normalize=True) +# grouped_cell_freq_region = pd.DataFrame(grouped_cell_freq_region) +# grouped_cell_freq_region.columns = ['value'] +# grouped_cell_freq_region.reset_index(inplace=True) +# 
grouped_cell_freq_region['metric'] = 'total_cell_freq' +# grouped_cell_freq_region.rename(columns={'tumor_region': 'subset'}, inplace=True) +# grouped_cell_freq_region['cell_type'] = 'all' + +# # add manually defined dfs to overall list +# cluster_dfs.extend([grouped_cell_counts, +# grouped_cell_counts_region, +# grouped_cell_freq_region]) + +# # create single df with appropriate metadata +# total_df = pd.concat(cluster_dfs, axis=0) + +# # compute density of cells for counts-based metrics +# count_metrics = total_df.metric.unique() +# count_metrics = [x for x in count_metrics if 'count' in x] + +# count_df = total_df.loc[total_df.metric.isin(count_metrics), :] +# area_df = area_df.rename(columns={'compartment': 'subset'}) +# count_df = count_df.merge(area_df, on=['fov', 'subset'], how='left') +# count_df['value'] = count_df['value'] / count_df['area'] +# count_df['value'] = count_df['value'] * 1000 + +# # rename metric from count to density +# count_df['metric'] = count_df['metric'].str.replace('count', 'density') +# count_df = count_df.drop(columns=['area']) +# total_df = pd.concat([total_df, count_df], axis=0) + +# # check that all metadata from core_metadata succesfully transferred over +# len_total_df = len(total_df) +# total_df = total_df.merge(harmonized_metadata, on='fov', how='left') +# assert len_total_df == len(total_df) + + +# # save annotated cluster counts +# total_df.to_csv(os.path.join(output_dir, 'cluster_df_per_core.csv'), index=False) + +# # create version aggregated by timepoint +# total_df_grouped = total_df.groupby(['Tissue_ID', 'cell_type', 'metric', 'subset']) +# total_df_timepoint = total_df_grouped['value'].agg([np.mean, np.std]) +# total_df_timepoint.reset_index(inplace=True) +# total_df_timepoint = total_df_timepoint.merge(harmonized_metadata.drop(['fov', 'MIBI_data_generated'], axis=1).drop_duplicates(), on='Tissue_ID') + +# # save timepoint df +# total_df_timepoint.to_csv(os.path.join(output_dir, 'cluster_df_per_timepoint.csv'), index=False) + + +# # +# # Create summary dataframe with proportions and counts of different functional marker populations +# # + + +# # Columns which are not thresholded (such as ratios between markers) can only be calculated for +# # dfs looking at normalized expression, and need to be dropped when calculating counts +# count_drop_cols = ['H3K9ac_H3K27me3_ratio', 'CD45RO_CD45RB_ratio'] + +# # Create list to hold parameters for each df that will be produced +# func_df_params = [['cluster_broad_count', 'cell_cluster_broad', False], +# ['cluster_broad_freq', 'cell_cluster_broad', True], +# ['cluster_count', 'cell_cluster', False], +# ['cluster_freq', 'cell_cluster', True], +# ['meta_cluster_count', 'cell_meta_cluster', False], +# ['meta_cluster_freq', 'cell_meta_cluster', True]] + + +# func_dfs = [] +# for result_name, cluster_col_name, normalize in func_df_params: +# # columns which are not functional markers need to be dropped from the df +# drop_cols = ['label'] +# if not normalize: +# drop_cols.extend(count_drop_cols) + +# # remove cluster_names except for the one specified for the df +# cluster_names = ['cell_meta_cluster', 'cell_cluster', 'cell_cluster_broad'] # , 'kmeans_labels'] +# cluster_names.remove(cluster_col_name) +# drop_cols.extend(cluster_names) + +# # create df +# func_dfs.append(create_long_df_by_functional(func_table=cell_table_func, +# result_name=result_name, +# cluster_col_name=cluster_col_name, +# drop_cols=drop_cols, +# normalize=normalize, +# subset_col='tumor_region')) + +# # create combined df +# total_df_func 
= pd.concat(func_dfs, axis=0) +# total_df_func = total_df_func.merge(harmonized_metadata, on='fov', how='inner') + +# # save combined df +# total_df_func.to_csv(os.path.join(output_dir, 'functional_df_per_core.csv'), index=False) + +# # create version aggregated by timepoint +# total_df_grouped_func = total_df_func.groupby(['Tissue_ID', 'cell_type', 'functional_marker', 'metric', 'subset']) +# total_df_timepoint_func = total_df_grouped_func['value'].agg([np.mean, np.std]) +# total_df_timepoint_func.reset_index(inplace=True) +# total_df_timepoint_func = total_df_timepoint_func.merge(harmonized_metadata.drop(['fov', 'MIBI_data_generated'], axis=1).drop_duplicates(), on='Tissue_ID') + +# # save timepoint df +# total_df_timepoint_func.to_csv(os.path.join(output_dir, 'functional_df_per_timepoint.csv'), index=False) + +# # +# # Filter functional markers +# # -# -# Generate counts and proportions of cell clusters per FOV -# - -# Specify the types of cluster dfs to produce. Each row corresponds to a different way of -# summarizing the data. The first item in each list is the name for the df, the second is the -# name of the column to use to for the cluster labels, and the third is a boolean flag controlling -# whether normalized frequencies or total counts will be returned - -cluster_df_params = [['cluster_broad_freq', 'cell_cluster_broad', True], - ['cluster_broad_count', 'cell_cluster_broad', False], - ['cluster_freq', 'cell_cluster', True], - ['cluster_count', 'cell_cluster', False], - ['meta_cluster_freq', 'cell_meta_cluster', True], - ['meta_cluster_count', 'cell_meta_cluster', False]] - -cluster_dfs = [] -for result_name, cluster_col_name, normalize in cluster_df_params: - cluster_dfs.append(create_long_df_by_cluster(cell_table=cell_table_clusters, - result_name=result_name, - cluster_col_name=cluster_col_name, - subset_col='tumor_region', - normalize=normalize)) - - -# create masks for dfs looking at only a subset of cells - -# proportion of T cell subsets -tcell_mask = cell_table_clusters['cell_cluster'].isin(['Treg', 'CD8T', 'CD4T', 'T_Other']) - -# proportion of immune cell subsets -immune_mask = cell_table_clusters['cell_cluster_broad'].isin(['Mono_Mac', 'T', - 'Granulocyte', 'NK', 'B']) -immune_mask_2 = cell_table_clusters.cell_cluster == 'Immune_Other' -immune_mask = np.logical_or(immune_mask, immune_mask_2) - -# proportion of stromal subsets -stroma_mask = cell_table_clusters['cell_cluster_broad'].isin(['Stroma']) - -# proportion of cancer subsets -cancer_mask = cell_table_clusters['cell_cluster_broad'].isin(['Cancer']) - -cluster_mask_params = [['tcell_freq', 'cell_cluster', True, tcell_mask], - ['immune_freq', 'cell_cluster', True, immune_mask], - ['stroma_freq', 'cell_meta_cluster', True, stroma_mask], - ['cancer_freq', 'cell_meta_cluster', True, cancer_mask]] - -for result_name, cluster_col_name, normalize, mask in cluster_mask_params: - cluster_dfs.append(create_long_df_by_cluster(cell_table=cell_table_clusters.loc[mask, :], - result_name=result_name, - cluster_col_name=cluster_col_name, - subset_col='tumor_region', - normalize=normalize)) - -# calculate total number of cells per image -grouped_cell_counts = cell_table_clusters[['fov']].groupby('fov').value_counts() -grouped_cell_counts = pd.DataFrame(grouped_cell_counts) -grouped_cell_counts.columns = ['value'] -grouped_cell_counts.reset_index(inplace=True) -grouped_cell_counts['metric'] = 'total_cell_count' -grouped_cell_counts['cell_type'] = 'all' -grouped_cell_counts['subset'] = 'all' - -# - -# calculate total number of 
cells per region per image -grouped_cell_counts_region = cell_table_clusters[['fov', 'tumor_region']].groupby(['fov', 'tumor_region']).value_counts() -grouped_cell_counts_region = pd.DataFrame(grouped_cell_counts_region) -grouped_cell_counts_region.columns = ['value'] -grouped_cell_counts_region.reset_index(inplace=True) -grouped_cell_counts_region['metric'] = 'total_cell_count' -grouped_cell_counts_region.rename(columns={'tumor_region': 'subset'}, inplace=True) -grouped_cell_counts_region['cell_type'] = 'all' - -# calculate proportions of cells per region per image -grouped_cell_freq_region = cell_table_clusters[['fov', 'tumor_region']].groupby(['fov']) -grouped_cell_freq_region = grouped_cell_freq_region['tumor_region'].value_counts(normalize=True) -grouped_cell_freq_region = pd.DataFrame(grouped_cell_freq_region) -grouped_cell_freq_region.columns = ['value'] -grouped_cell_freq_region.reset_index(inplace=True) -grouped_cell_freq_region['metric'] = 'total_cell_freq' -grouped_cell_freq_region.rename(columns={'tumor_region': 'subset'}, inplace=True) -grouped_cell_freq_region['cell_type'] = 'all' - -# add manually defined dfs to overall list -cluster_dfs.extend([grouped_cell_counts, - grouped_cell_counts_region, - grouped_cell_freq_region]) - -# create single df with appropriate metadata -total_df = pd.concat(cluster_dfs, axis=0) - -# compute density of cells for counts-based metrics -count_metrics = total_df.metric.unique() -count_metrics = [x for x in count_metrics if 'count' in x] - -count_df = total_df.loc[total_df.metric.isin(count_metrics), :] -area_df = area_df.rename(columns={'compartment': 'subset'}) -count_df = count_df.merge(area_df, on=['fov', 'subset'], how='left') -count_df['value'] = count_df['value'] / count_df['area'] -count_df['value'] = count_df['value'] * 1000 - -# rename metric from count to density -count_df['metric'] = count_df['metric'].str.replace('count', 'density') -count_df = count_df.drop(columns=['area']) -total_df = pd.concat([total_df, count_df], axis=0) - -# check that all metadata from core_metadata succesfully transferred over -len_total_df = len(total_df) -total_df = total_df.merge(harmonized_metadata, on='fov', how='left') -assert len_total_df == len(total_df) - - -# save annotated cluster counts -total_df.to_csv(os.path.join(output_dir, 'cluster_df_per_core.csv'), index=False) - -# create version aggregated by timepoint -total_df_grouped = total_df.groupby(['Tissue_ID', 'cell_type', 'metric', 'subset']) -total_df_timepoint = total_df_grouped['value'].agg([np.mean, np.std]) -total_df_timepoint.reset_index(inplace=True) -total_df_timepoint = total_df_timepoint.merge(harmonized_metadata.drop(['fov', 'MIBI_data_generated'], axis=1).drop_duplicates(), on='Tissue_ID') - -# save timepoint df -total_df_timepoint.to_csv(os.path.join(output_dir, 'cluster_df_per_timepoint.csv'), index=False) - - -# -# Create summary dataframe with proportions and counts of different functional marker populations -# - - -# Columns which are not thresholded (such as ratios between markers) can only be calculated for -# dfs looking at normalized expression, and need to be dropped when calculating counts -count_drop_cols = ['H3K9ac_H3K27me3_ratio', 'CD45RO_CD45RB_ratio'] - -# Create list to hold parameters for each df that will be produced -func_df_params = [['cluster_broad_count', 'cell_cluster_broad', False], - ['cluster_broad_freq', 'cell_cluster_broad', True], - ['cluster_count', 'cell_cluster', False], - ['cluster_freq', 'cell_cluster', True], - ['meta_cluster_count', 
'cell_meta_cluster', False], - ['meta_cluster_freq', 'cell_meta_cluster', True]] - - -func_dfs = [] -for result_name, cluster_col_name, normalize in func_df_params: - # columns which are not functional markers need to be dropped from the df - drop_cols = ['label'] - if not normalize: - drop_cols.extend(count_drop_cols) - - # remove cluster_names except for the one specified for the df - cluster_names = ['cell_meta_cluster', 'cell_cluster', 'cell_cluster_broad'] # , 'kmeans_labels'] - cluster_names.remove(cluster_col_name) - drop_cols.extend(cluster_names) - - # create df - func_dfs.append(create_long_df_by_functional(func_table=cell_table_func, - result_name=result_name, - cluster_col_name=cluster_col_name, - drop_cols=drop_cols, - normalize=normalize, - subset_col='tumor_region')) - -# create combined df -total_df_func = pd.concat(func_dfs, axis=0) -total_df_func = total_df_func.merge(harmonized_metadata, on='fov', how='inner') - -# save combined df -total_df_func.to_csv(os.path.join(output_dir, 'functional_df_per_core.csv'), index=False) - -# create version aggregated by timepoint -total_df_grouped_func = total_df_func.groupby(['Tissue_ID', 'cell_type', 'functional_marker', 'metric', 'subset']) -total_df_timepoint_func = total_df_grouped_func['value'].agg([np.mean, np.std]) -total_df_timepoint_func.reset_index(inplace=True) -total_df_timepoint_func = total_df_timepoint_func.merge(harmonized_metadata.drop(['fov', 'MIBI_data_generated'], axis=1).drop_duplicates(), on='Tissue_ID') - -# save timepoint df -total_df_timepoint_func.to_csv(os.path.join(output_dir, 'functional_df_per_timepoint.csv'), index=False) - -# -# Filter functional markers -# - -# filter functional markers to only include FOVs with at least the specified number of cells -total_df = pd.read_csv(os.path.join(output_dir, 'cluster_df_per_core.csv')) -min_cells = 5 - -filtered_dfs = [] -metrics = [['cluster_broad_count', 'cluster_broad_freq'], - ['cluster_count', 'cluster_freq'], - ['meta_cluster_count', 'meta_cluster_freq']] - -for metric in metrics: - # subset count df to include cells at the relevant clustering resolution - for compartment in ['cancer_core', 'cancer_border', 'stroma_core', 'stroma_border', - 'tls', 'tagg', 'all']: - count_df = total_df[total_df.metric == metric[0]] - count_df = count_df[count_df.subset == compartment] - - # subset functional df to only include functional markers at this resolution - func_df = total_df_func[total_df_func.metric.isin(metric)] - func_df = func_df[func_df.subset == compartment] - - # for each cell type, determine which FOVs have high enough counts to be included - for cell_type in count_df.cell_type.unique(): - keep_df = count_df[count_df.cell_type == cell_type] - keep_df = keep_df[keep_df.value >= min_cells] - keep_fovs = keep_df.fov.unique() - - # subset functional df to only include FOVs with high enough counts - keep_markers = func_df[func_df.cell_type == cell_type] - keep_markers = keep_markers[keep_markers.fov.isin(keep_fovs)] - - # append to list of filtered dfs - filtered_dfs.append(keep_markers) - -filtered_func_df = pd.concat(filtered_dfs) - -# take subset for plotting average functional marker expression -filtered_func_df_plot = filtered_func_df.loc[filtered_func_df.subset == 'all', :] -filtered_func_df_plot = filtered_func_df_plot.loc[filtered_func_df_plot.metric.isin(['cluster_broad_freq', 'cluster_freq', 'meta_cluster_freq']), :] -filtered_func_df_plot = filtered_func_df_plot.loc[filtered_func_df_plot.functional_marker.isin(sp_markers), :] - -# save filtered df 
-filtered_func_df_plot.to_csv(os.path.join(output_dir, 'functional_df_per_core_filtered.csv'), index=False) - -# # identify combinations of markers and cell types to include in analysis based on threshold -# mean_percent_positive = 0.05 -# sp_markers = [x for x in filtered_func_df.functional_marker.unique() if '__' not in x] -# broad_df = filtered_func_df[filtered_func_df.metric == 'cluster_broad_freq'] -# broad_df = broad_df[broad_df.functional_marker.isin(sp_markers)] -# broad_df = broad_df[broad_df.subset == 'all'] -# broad_df = broad_df[broad_df.Timepoint.isin(['primary_untreated', 'baseline', 'post_induction', 'on_nivo'])] -# broad_df_agg = broad_df[['fov', 'functional_marker', 'cell_type', 'value']].groupby(['cell_type', 'functional_marker']).agg(np.mean) -# -# broad_df_agg = broad_df_agg.reset_index() -# broad_df = broad_df_agg.pivot(index='cell_type', columns='functional_marker', values='value') -# broad_df_include = broad_df > mean_percent_positive -# -# # include for all cells -# general_markers = ['Ki67', 'HLA1', 'Vim', 'H3K9ac_H3K27me3_ratio'] -# broad_df_include[[general_markers]] = True -# -# # CD45 isoform ratios -# double_pos = np.logical_and(broad_df_include['CD45RO'], broad_df_include['CD45RB']) -# broad_df_include['CD45RO_CD45RB_ratio'] = double_pos -# -# # Cancer expression -# broad_df_include.loc['Cancer', ['HLADR', 'CD57']] = True -# -# broad_df_include.to_csv(os.path.join(intermediate_dir, 'post_processing', 'inclusion_matrix_broad.csv')) -# -# # apply thresholds to medium level clustering -# assignment_dict_med = {'Cancer': ['Cancer', 'Cancer_EMT', 'Cancer_Other'], -# 'Mono_Mac': ['M1_Mac', 'M2_Mac', 'Mac_Other', 'Monocyte', 'APC'], -# 'B': ['B'], -# 'T': ['CD4T', 'CD8T', 'Treg', 'T_Other', 'Immune_Other'], -# 'Granulocyte': ['Neutrophil', 'Mast'], -# 'Stroma': ['Endothelium', 'Fibroblast', 'Stroma'], -# 'NK': ['NK'], -# 'Other': ['Other']} -# -# # get a list of all cell types -# cell_types = np.concatenate([assignment_dict_med[key] for key in assignment_dict_med.keys()]) -# cell_types.sort() -# -# med_df_include = pd.DataFrame(index=cell_types, columns=broad_df.columns) -# -# for key in assignment_dict_med.keys(): -# values = assignment_dict_med[key] -# med_df_include.loc[values] = broad_df_include.loc[key].values -# -# # check if assignment makes sense -# med_df = filtered_func_df[filtered_func_df.metric == 'cluster_freq'] -# med_df = med_df[med_df.functional_marker.isin(sp_markers)] -# med_df = med_df[med_df.subset == 'all'] -# med_df = med_df[med_df.Timepoint.isin(['primary_untreated', 'baseline', 'post_induction', 'on_nivo'])] -# med_df_agg = med_df[['fov', 'functional_marker', 'cell_type', 'value']].groupby(['cell_type', 'functional_marker']).agg(np.mean) -# -# med_df_agg.reset_index(inplace=True) -# med_df = med_df_agg.pivot(index='cell_type', columns='functional_marker', values='value') -# med_df_bin = med_df > mean_percent_positive -# -# # add CD38 signal -# med_df_include.loc[['Endothelium', 'Immune_Other'], 'CD38'] = True -# -# # add IDO signal -# med_df_include.loc[['APC'], 'IDO'] = True -# -# # compare to see where assignments disagree, to see if any others need to be added -# new_includes = (med_df_bin == True) & (med_df_include == False) -# -# med_df_include.to_csv(os.path.join(intermediate_dir, 'post_processing', 'inclusion_matrix_med.csv')) -# -# # do the same for the fine-grained clustering -# assignment_dict_meta = {'Cancer': ['Cancer_CD56', 'Cancer_CK17', 'Cancer_Ecad'], -# 'Cancer_EMT': ['Cancer_SMA', 'Cancer_Vim'], -# 'Cancer_Other': 
['Cancer_Other', 'Cancer_Mono'], -# 'M1_Mac': ['CD68'], -# 'M2_Mac': ['CD163'], -# 'Mac_Other': ['CD68_CD163_DP'], -# 'Monocyte': ['CD4_Mono', 'CD14'], -# 'APC': ['CD11c_HLADR'], -# 'B': ['CD20'], -# 'Endothelium': ['CD31', 'CD31_VIM'], -# 'Fibroblast': ['FAP', 'FAP_SMA', 'SMA'], -# 'Stroma': ['Stroma_Collagen', 'Stroma_Fibronectin', 'VIM'], -# 'NK': ['CD56'], -# 'Neutrophil': ['Neutrophil'], -# 'Mast': ['Mast'], -# 'CD4T': ['CD4T','CD4T_HLADR'], -# 'CD8T': ['CD8T'], -# 'Treg': ['Treg'], -# 'T_Other': ['CD3_DN','CD4T_CD8T_DP'], -# 'Immune_Other': ['Immune_Other'], -# 'Other': ['Other']} -# -# # get a list of all cell types -# cell_types = np.concatenate([assignment_dict_meta[key] for key in assignment_dict_meta.keys()]) -# cell_types.sort() -# -# meta_df_include = pd.DataFrame(index=cell_types, columns=broad_df.columns) -# -# for key in assignment_dict_meta.keys(): -# values = assignment_dict_meta[key] -# meta_df_include.loc[values] = med_df_include.loc[key].values -# -# # check if assignment makes sense -# meta_df = filtered_func_df[filtered_func_df.metric == 'meta_cluster_freq'] -# meta_df = meta_df[meta_df.functional_marker.isin(sp_markers)] -# meta_df = meta_df[meta_df.subset == 'all'] -# meta_df = meta_df[meta_df.Timepoint.isin(['primary_untreated', 'baseline', 'post_induction', 'on_nivo'])] -# meta_df_agg = meta_df[['fov', 'functional_marker', 'cell_type', 'value']].groupby(['cell_type', 'functional_marker']).agg(np.mean) -# -# meta_df_agg.reset_index(inplace=True) -# meta_df = meta_df_agg.pivot(index='cell_type', columns='functional_marker', values='value') -# meta_df_bin = meta_df > mean_percent_positive -# -# # compare to see where assignments disagree -# new_includes = (meta_df_bin == True) & (meta_df_include == False) -# -# meta_df_include.to_csv(os.path.join(intermediate_dir, 'post_processing', 'inclusion_matrix_meta.csv')) - -# process functional data so that only the specified cell type/marker combos are included - -# load matrices -broad_df_include = pd.read_csv(os.path.join(intermediate_dir, 'post_processing', 'inclusion_matrix_broad.csv'), index_col=0) -med_df_include = pd.read_csv(os.path.join(intermediate_dir, 'post_processing', 'inclusion_matrix_med.csv'), index_col=0) -meta_df_include = pd.read_csv(os.path.join(intermediate_dir, 'post_processing', 'inclusion_matrix_meta.csv'), index_col=0) - -# identify metrics and dfs that will be filtered -filtering = [['cluster_broad_count', 'cluster_broad_freq', broad_df_include], - ['cluster_count', 'cluster_freq', med_df_include], - ['meta_cluster_count', 'meta_cluster_freq', meta_df_include]] - -combo_dfs = [] - -for filters in filtering: - # get variables - metric_names = filters[:2] - metric_df = filters[2] - - # subset functional df to only include functional markers at this resolution - func_df = filtered_func_df[filtered_func_df.metric.isin(metric_names)] - - # loop over each cell type, and get the corresponding markers - for cell_type in metric_df.index: - markers = metric_df.columns[metric_df.loc[cell_type] == True] - - # subset functional df to only include this cell type - func_df_cell = func_df[func_df.cell_type == cell_type] - - # subset functional df to only include markers for this cell type - func_df_cell = func_df_cell[func_df_cell.functional_marker.isin(markers)] - - # append to list of dfs - combo_dfs.append(func_df_cell) - -# do the same thing for double positive cells - -# # identify combinations of markers and cell types to include in analysis based on threshold -# dp_markers = [x for x in 
filtered_func_df.functional_marker.unique() if '__' in x] -# mean_percent_positive_dp = 0.05 -# broad_df_dp = filtered_func_df[filtered_func_df.metric == 'cluster_broad_freq'] -# broad_df_dp = broad_df_dp[broad_df_dp.subset == 'all'] -# broad_df_dp = broad_df_dp[broad_df_dp.Timepoint.isin(['primary_untreated', 'baseline', 'post_induction', 'on_nivo'])] -# broad_df_dp = broad_df_dp[broad_df_dp.functional_marker.isin(dp_markers)] -# broad_df_dp_agg = broad_df_dp[['fov', 'functional_marker', 'cell_type', 'value']].groupby(['cell_type', 'functional_marker']).agg(np.mean) -# -# broad_df_dp_agg = broad_df_dp_agg.reset_index() -# broad_df_dp = broad_df_dp_agg.pivot(index='cell_type', columns='functional_marker', values='value') -# broad_df_dp_include = broad_df_dp > mean_percent_positive_dp -# -# broad_df_dp_include.to_csv(os.path.join(intermediate_dir, 'post_processing', 'inclusion_matrix_broad_dp.csv')) -# -# # do the same for medium-level clustering -# med_df_dp = filtered_func_df[filtered_func_df.metric == 'cluster_freq'] -# med_df_dp = med_df_dp[med_df_dp.subset == 'all'] -# med_df_dp = med_df_dp[med_df_dp.Timepoint.isin(['primary_untreated', 'baseline', 'post_induction', 'on_nivo'])] -# med_df_dp = med_df_dp[med_df_dp.functional_marker.isin(dp_markers)] -# med_df_dp_agg = med_df_dp[['fov', 'functional_marker', 'cell_type', 'value']].groupby(['cell_type', 'functional_marker']).agg(np.mean) -# -# # create matrix of cell types and markers -# med_df_dp_agg = med_df_dp_agg.reset_index() -# med_df_dp = med_df_dp_agg.pivot(index='cell_type', columns='functional_marker', values='value') -# med_df_dp_include = med_df_dp > mean_percent_positive_dp -# -# med_df_dp_include.to_csv(os.path.join(intermediate_dir, 'post_processing', 'inclusion_matrix_med_dp.csv')) -# -# -# # do the same for finest-level clustering -# meta_df_dp = filtered_func_df[filtered_func_df.metric == 'meta_cluster_freq'] -# meta_df_dp = meta_df_dp[meta_df_dp.subset == 'all'] -# meta_df_dp = meta_df_dp[meta_df_dp.Timepoint.isin(['primary_untreated', 'baseline', 'post_induction', 'on_nivo'])] -# meta_df_dp = meta_df_dp[meta_df_dp.functional_marker.isin(dp_markers)] -# meta_df_dp_agg = meta_df_dp[['fov', 'functional_marker', 'cell_type', 'value']].groupby(['cell_type', 'functional_marker']).agg(np.mean) -# -# # create matrix of cell types and markers -# meta_df_dp_agg = meta_df_dp_agg.reset_index() -# meta_df_dp = meta_df_dp_agg.pivot(index='cell_type', columns='functional_marker', values='value') -# meta_df_dp_include = meta_df_dp > mean_percent_positive_dp -# -# meta_df_dp_include.to_csv(os.path.join(intermediate_dir, 'post_processing', 'inclusion_matrix_meta_dp.csv')) - -# load inclusion matrices -broad_df_include_dp = pd.read_csv(os.path.join(intermediate_dir, 'post_processing', 'inclusion_matrix_broad_dp.csv'), index_col=0) -med_df_include_dp = pd.read_csv(os.path.join(intermediate_dir, 'post_processing', 'inclusion_matrix_med_dp.csv'), index_col=0) -meta_df_include_dp = pd.read_csv(os.path.join(intermediate_dir, 'post_processing', 'inclusion_matrix_meta_dp.csv'), index_col=0) - -# identify metrics and dfs that will be filtered -filtering = [['cluster_broad_count', 'cluster_broad_freq', broad_df_include_dp], - ['cluster_count', 'cluster_freq', med_df_include_dp], - ['meta_cluster_count', 'meta_cluster_freq', meta_df_include_dp]] - -for filters in filtering: - # get variables - metric_names = filters[:2] - metric_df = filters[2] - - # subset functional df to only include functional markers at this resolution - func_df = 
filtered_func_df[filtered_func_df.metric.isin(metric_names)] - - # loop over each cell type, and get the corresponding markers - for cell_type in metric_df.index: - markers = metric_df.columns[metric_df.loc[cell_type] == True] - - # subset functional df to only include this cell type - func_df_cell = func_df[func_df.cell_type == cell_type] - - # subset functional df to only include markers for this cell type - func_df_cell = func_df_cell[func_df_cell.functional_marker.isin(markers)] - - # append to list of dfs - combo_dfs.append(func_df_cell) - - -# create manual df with total functional marker positivity across all cells in an image -# dp_markers = [x for x in filtered_func_df.functional_marker.unique() if '__' in x] -# func_table_small = cell_table_func.loc[:, ~cell_table_func.columns.isin(['cell_cluster', 'cell_cluster_broad', 'cell_meta_cluster', 'label', 'tumor_region'])] -# func_table_small = func_table_small.loc[:, ~func_table_small.columns.isin(dp_markers)] -# -# # group by specified columns -# grouped_table = func_table_small.groupby('fov') -# transformed = grouped_table.agg(np.mean) -# transformed.reset_index(inplace=True) -# -# # reshape to long df -# long_df = pd.melt(transformed, id_vars=['fov'], var_name='functional_marker') -# long_df['metric'] = 'total_freq' -# long_df['cell_type'] = 'all' -# long_df['subset'] = 'all' -# -# long_df = long_df.merge(harmonized_metadata, on='fov', how='inner') -# -# long_df.to_csv(os.path.join(output_dir, 'total_func_per_core.csv'), index=False) - -# append to list of dfs -long_df = pd.read_csv(os.path.join(output_dir, 'total_func_per_core.csv')) -combo_dfs.append(long_df) - -# combine -combo_df = pd.concat(combo_dfs) -combo_df.to_csv(os.path.join(output_dir, 'functional_df_per_core_filtered.csv'), index=False) - -# create version of filtered df aggregated by timepoint -combo_df_grouped_func = combo_df.groupby(['Tissue_ID', 'cell_type', 'functional_marker', 'metric', 'subset']) -combo_df_timepoint_func = combo_df_grouped_func['value'].agg([np.mean, np.std]) -combo_df_timepoint_func.reset_index(inplace=True) -combo_df_timepoint_func = combo_df_timepoint_func.merge(harmonized_metadata.drop(['fov', 'MIBI_data_generated'], axis=1).drop_duplicates(), on='Tissue_ID') - -# save timepoint df -combo_df_timepoint_func.to_csv(os.path.join(output_dir, 'functional_df_per_timepoint_filtered.csv'), index=False) +# # filter functional markers to only include FOVs with at least the specified number of cells +# total_df = pd.read_csv(os.path.join(output_dir, 'cluster_df_per_core.csv')) +# min_cells = 5 + +# filtered_dfs = [] +# metrics = [['cluster_broad_count', 'cluster_broad_freq'], +# ['cluster_count', 'cluster_freq'], +# ['meta_cluster_count', 'meta_cluster_freq']] + +# for metric in metrics: +# # subset count df to include cells at the relevant clustering resolution +# for compartment in ['cancer_core', 'cancer_border', 'stroma_core', 'stroma_border', +# 'tls', 'tagg', 'all']: +# count_df = total_df[total_df.metric == metric[0]] +# count_df = count_df[count_df.subset == compartment] + +# # subset functional df to only include functional markers at this resolution +# func_df = total_df_func[total_df_func.metric.isin(metric)] +# func_df = func_df[func_df.subset == compartment] + +# # for each cell type, determine which FOVs have high enough counts to be included +# for cell_type in count_df.cell_type.unique(): +# keep_df = count_df[count_df.cell_type == cell_type] +# keep_df = keep_df[keep_df.value >= min_cells] +# keep_fovs = keep_df.fov.unique() + +# # 
subset functional df to only include FOVs with high enough counts +# keep_markers = func_df[func_df.cell_type == cell_type] +# keep_markers = keep_markers[keep_markers.fov.isin(keep_fovs)] + +# # append to list of filtered dfs +# filtered_dfs.append(keep_markers) + +# filtered_func_df = pd.concat(filtered_dfs) + +# # take subset for plotting average functional marker expression +# filtered_func_df_plot = filtered_func_df.loc[filtered_func_df.subset == 'all', :] +# filtered_func_df_plot = filtered_func_df_plot.loc[filtered_func_df_plot.metric.isin(['cluster_broad_freq', 'cluster_freq', 'meta_cluster_freq']), :] +# filtered_func_df_plot = filtered_func_df_plot.loc[filtered_func_df_plot.functional_marker.isin(sp_markers), :] + +# # save filtered df +# filtered_func_df_plot.to_csv(os.path.join(output_dir, 'functional_df_per_core_filtered.csv'), index=False) + +# # # identify combinations of markers and cell types to include in analysis based on threshold +# # mean_percent_positive = 0.05 +# # sp_markers = [x for x in filtered_func_df.functional_marker.unique() if '__' not in x] +# # broad_df = filtered_func_df[filtered_func_df.metric == 'cluster_broad_freq'] +# # broad_df = broad_df[broad_df.functional_marker.isin(sp_markers)] +# # broad_df = broad_df[broad_df.subset == 'all'] +# # broad_df = broad_df[broad_df.Timepoint.isin(['primary_untreated', 'baseline', 'post_induction', 'on_nivo'])] +# # broad_df_agg = broad_df[['fov', 'functional_marker', 'cell_type', 'value']].groupby(['cell_type', 'functional_marker']).agg(np.mean) +# # +# # broad_df_agg = broad_df_agg.reset_index() +# # broad_df = broad_df_agg.pivot(index='cell_type', columns='functional_marker', values='value') +# # broad_df_include = broad_df > mean_percent_positive +# # +# # # include for all cells +# # general_markers = ['Ki67', 'HLA1', 'Vim', 'H3K9ac_H3K27me3_ratio'] +# # broad_df_include[[general_markers]] = True +# # +# # # CD45 isoform ratios +# # double_pos = np.logical_and(broad_df_include['CD45RO'], broad_df_include['CD45RB']) +# # broad_df_include['CD45RO_CD45RB_ratio'] = double_pos +# # +# # # Cancer expression +# # broad_df_include.loc['Cancer', ['HLADR', 'CD57']] = True +# # +# # broad_df_include.to_csv(os.path.join(intermediate_dir, 'post_processing', 'inclusion_matrix_broad.csv')) +# # +# # # apply thresholds to medium level clustering +# # assignment_dict_med = {'Cancer': ['Cancer', 'Cancer_EMT', 'Cancer_Other'], +# # 'Mono_Mac': ['M1_Mac', 'M2_Mac', 'Mac_Other', 'Monocyte', 'APC'], +# # 'B': ['B'], +# # 'T': ['CD4T', 'CD8T', 'Treg', 'T_Other', 'Immune_Other'], +# # 'Granulocyte': ['Neutrophil', 'Mast'], +# # 'Stroma': ['Endothelium', 'Fibroblast', 'Stroma'], +# # 'NK': ['NK'], +# # 'Other': ['Other']} +# # +# # # get a list of all cell types +# # cell_types = np.concatenate([assignment_dict_med[key] for key in assignment_dict_med.keys()]) +# # cell_types.sort() +# # +# # med_df_include = pd.DataFrame(index=cell_types, columns=broad_df.columns) +# # +# # for key in assignment_dict_med.keys(): +# # values = assignment_dict_med[key] +# # med_df_include.loc[values] = broad_df_include.loc[key].values +# # +# # # check if assignment makes sense +# # med_df = filtered_func_df[filtered_func_df.metric == 'cluster_freq'] +# # med_df = med_df[med_df.functional_marker.isin(sp_markers)] +# # med_df = med_df[med_df.subset == 'all'] +# # med_df = med_df[med_df.Timepoint.isin(['primary_untreated', 'baseline', 'post_induction', 'on_nivo'])] +# # med_df_agg = med_df[['fov', 'functional_marker', 'cell_type', 
'value']].groupby(['cell_type', 'functional_marker']).agg(np.mean) +# # +# # med_df_agg.reset_index(inplace=True) +# # med_df = med_df_agg.pivot(index='cell_type', columns='functional_marker', values='value') +# # med_df_bin = med_df > mean_percent_positive +# # +# # # add CD38 signal +# # med_df_include.loc[['Endothelium', 'Immune_Other'], 'CD38'] = True +# # +# # # add IDO signal +# # med_df_include.loc[['APC'], 'IDO'] = True +# # +# # # compare to see where assignments disagree, to see if any others need to be added +# # new_includes = (med_df_bin == True) & (med_df_include == False) +# # +# # med_df_include.to_csv(os.path.join(intermediate_dir, 'post_processing', 'inclusion_matrix_med.csv')) +# # +# # # do the same for the fine-grained clustering +# # assignment_dict_meta = {'Cancer': ['Cancer_CD56', 'Cancer_CK17', 'Cancer_Ecad'], +# # 'Cancer_EMT': ['Cancer_SMA', 'Cancer_Vim'], +# # 'Cancer_Other': ['Cancer_Other', 'Cancer_Mono'], +# # 'M1_Mac': ['CD68'], +# # 'M2_Mac': ['CD163'], +# # 'Mac_Other': ['CD68_CD163_DP'], +# # 'Monocyte': ['CD4_Mono', 'CD14'], +# # 'APC': ['CD11c_HLADR'], +# # 'B': ['CD20'], +# # 'Endothelium': ['CD31', 'CD31_VIM'], +# # 'Fibroblast': ['FAP', 'FAP_SMA', 'SMA'], +# # 'Stroma': ['Stroma_Collagen', 'Stroma_Fibronectin', 'VIM'], +# # 'NK': ['CD56'], +# # 'Neutrophil': ['Neutrophil'], +# # 'Mast': ['Mast'], +# # 'CD4T': ['CD4T','CD4T_HLADR'], +# # 'CD8T': ['CD8T'], +# # 'Treg': ['Treg'], +# # 'T_Other': ['CD3_DN','CD4T_CD8T_DP'], +# # 'Immune_Other': ['Immune_Other'], +# # 'Other': ['Other']} +# # +# # # get a list of all cell types +# # cell_types = np.concatenate([assignment_dict_meta[key] for key in assignment_dict_meta.keys()]) +# # cell_types.sort() +# # +# # meta_df_include = pd.DataFrame(index=cell_types, columns=broad_df.columns) +# # +# # for key in assignment_dict_meta.keys(): +# # values = assignment_dict_meta[key] +# # meta_df_include.loc[values] = med_df_include.loc[key].values +# # +# # # check if assignment makes sense +# # meta_df = filtered_func_df[filtered_func_df.metric == 'meta_cluster_freq'] +# # meta_df = meta_df[meta_df.functional_marker.isin(sp_markers)] +# # meta_df = meta_df[meta_df.subset == 'all'] +# # meta_df = meta_df[meta_df.Timepoint.isin(['primary_untreated', 'baseline', 'post_induction', 'on_nivo'])] +# # meta_df_agg = meta_df[['fov', 'functional_marker', 'cell_type', 'value']].groupby(['cell_type', 'functional_marker']).agg(np.mean) +# # +# # meta_df_agg.reset_index(inplace=True) +# # meta_df = meta_df_agg.pivot(index='cell_type', columns='functional_marker', values='value') +# # meta_df_bin = meta_df > mean_percent_positive +# # +# # # compare to see where assignments disagree +# # new_includes = (meta_df_bin == True) & (meta_df_include == False) +# # +# # meta_df_include.to_csv(os.path.join(intermediate_dir, 'post_processing', 'inclusion_matrix_meta.csv')) + +# # process functional data so that only the specified cell type/marker combos are included + +# # load matrices +# broad_df_include = pd.read_csv(os.path.join(intermediate_dir, 'post_processing', 'inclusion_matrix_broad.csv'), index_col=0) +# med_df_include = pd.read_csv(os.path.join(intermediate_dir, 'post_processing', 'inclusion_matrix_med.csv'), index_col=0) +# meta_df_include = pd.read_csv(os.path.join(intermediate_dir, 'post_processing', 'inclusion_matrix_meta.csv'), index_col=0) + +# # identify metrics and dfs that will be filtered +# filtering = [['cluster_broad_count', 'cluster_broad_freq', broad_df_include], +# ['cluster_count', 'cluster_freq', 
med_df_include], +# ['meta_cluster_count', 'meta_cluster_freq', meta_df_include]] + +# combo_dfs = [] + +# for filters in filtering: +# # get variables +# metric_names = filters[:2] +# metric_df = filters[2] +# # subset functional df to only include functional markers at this resolution +# func_df = filtered_func_df[filtered_func_df.metric.isin(metric_names)] + +# # loop over each cell type, and get the corresponding markers +# for cell_type in metric_df.index: +# markers = metric_df.columns[metric_df.loc[cell_type] == True] + +# # subset functional df to only include this cell type +# func_df_cell = func_df[func_df.cell_type == cell_type] + +# # subset functional df to only include markers for this cell type +# func_df_cell = func_df_cell[func_df_cell.functional_marker.isin(markers)] + +# # append to list of dfs +# combo_dfs.append(func_df_cell) + +# # do the same thing for double positive cells + +# # # identify combinations of markers and cell types to include in analysis based on threshold +# # dp_markers = [x for x in filtered_func_df.functional_marker.unique() if '__' in x] +# # mean_percent_positive_dp = 0.05 +# # broad_df_dp = filtered_func_df[filtered_func_df.metric == 'cluster_broad_freq'] +# # broad_df_dp = broad_df_dp[broad_df_dp.subset == 'all'] +# # broad_df_dp = broad_df_dp[broad_df_dp.Timepoint.isin(['primary_untreated', 'baseline', 'post_induction', 'on_nivo'])] +# # broad_df_dp = broad_df_dp[broad_df_dp.functional_marker.isin(dp_markers)] +# # broad_df_dp_agg = broad_df_dp[['fov', 'functional_marker', 'cell_type', 'value']].groupby(['cell_type', 'functional_marker']).agg(np.mean) +# # +# # broad_df_dp_agg = broad_df_dp_agg.reset_index() +# # broad_df_dp = broad_df_dp_agg.pivot(index='cell_type', columns='functional_marker', values='value') +# # broad_df_dp_include = broad_df_dp > mean_percent_positive_dp +# # +# # broad_df_dp_include.to_csv(os.path.join(intermediate_dir, 'post_processing', 'inclusion_matrix_broad_dp.csv')) +# # +# # # do the same for medium-level clustering +# # med_df_dp = filtered_func_df[filtered_func_df.metric == 'cluster_freq'] +# # med_df_dp = med_df_dp[med_df_dp.subset == 'all'] +# # med_df_dp = med_df_dp[med_df_dp.Timepoint.isin(['primary_untreated', 'baseline', 'post_induction', 'on_nivo'])] +# # med_df_dp = med_df_dp[med_df_dp.functional_marker.isin(dp_markers)] +# # med_df_dp_agg = med_df_dp[['fov', 'functional_marker', 'cell_type', 'value']].groupby(['cell_type', 'functional_marker']).agg(np.mean) +# # +# # # create matrix of cell types and markers +# # med_df_dp_agg = med_df_dp_agg.reset_index() +# # med_df_dp = med_df_dp_agg.pivot(index='cell_type', columns='functional_marker', values='value') +# # med_df_dp_include = med_df_dp > mean_percent_positive_dp +# # +# # med_df_dp_include.to_csv(os.path.join(intermediate_dir, 'post_processing', 'inclusion_matrix_med_dp.csv')) +# # +# # +# # # do the same for finest-level clustering +# # meta_df_dp = filtered_func_df[filtered_func_df.metric == 'meta_cluster_freq'] +# # meta_df_dp = meta_df_dp[meta_df_dp.subset == 'all'] +# # meta_df_dp = meta_df_dp[meta_df_dp.Timepoint.isin(['primary_untreated', 'baseline', 'post_induction', 'on_nivo'])] +# # meta_df_dp = meta_df_dp[meta_df_dp.functional_marker.isin(dp_markers)] +# # meta_df_dp_agg = meta_df_dp[['fov', 'functional_marker', 'cell_type', 'value']].groupby(['cell_type', 'functional_marker']).agg(np.mean) +# # +# # # create matrix of cell types and markers +# # meta_df_dp_agg = meta_df_dp_agg.reset_index() +# # meta_df_dp = 
meta_df_dp_agg.pivot(index='cell_type', columns='functional_marker', values='value') +# # meta_df_dp_include = meta_df_dp > mean_percent_positive_dp +# # +# # meta_df_dp_include.to_csv(os.path.join(intermediate_dir, 'post_processing', 'inclusion_matrix_meta_dp.csv')) + +# # load inclusion matrices +# broad_df_include_dp = pd.read_csv(os.path.join(intermediate_dir, 'post_processing', 'inclusion_matrix_broad_dp.csv'), index_col=0) +# med_df_include_dp = pd.read_csv(os.path.join(intermediate_dir, 'post_processing', 'inclusion_matrix_med_dp.csv'), index_col=0) +# meta_df_include_dp = pd.read_csv(os.path.join(intermediate_dir, 'post_processing', 'inclusion_matrix_meta_dp.csv'), index_col=0) + +# # identify metrics and dfs that will be filtered +# filtering = [['cluster_broad_count', 'cluster_broad_freq', broad_df_include_dp], +# ['cluster_count', 'cluster_freq', med_df_include_dp], +# ['meta_cluster_count', 'meta_cluster_freq', meta_df_include_dp]] + +# for filters in filtering: +# # get variables +# metric_names = filters[:2] +# metric_df = filters[2] -# -# Remove double positive functional markers that are highly correlated with single positive scores -# +# # subset functional df to only include functional markers at this resolution +# func_df = filtered_func_df[filtered_func_df.metric.isin(metric_names)] + +# # loop over each cell type, and get the corresponding markers +# for cell_type in metric_df.index: +# markers = metric_df.columns[metric_df.loc[cell_type] == True] + +# # subset functional df to only include this cell type +# func_df_cell = func_df[func_df.cell_type == cell_type] + +# # subset functional df to only include markers for this cell type +# func_df_cell = func_df_cell[func_df_cell.functional_marker.isin(markers)] + +# # append to list of dfs +# combo_dfs.append(func_df_cell) + + +# # create manual df with total functional marker positivity across all cells in an image +# # dp_markers = [x for x in filtered_func_df.functional_marker.unique() if '__' in x] +# # func_table_small = cell_table_func.loc[:, ~cell_table_func.columns.isin(['cell_cluster', 'cell_cluster_broad', 'cell_meta_cluster', 'label', 'tumor_region'])] +# # func_table_small = func_table_small.loc[:, ~func_table_small.columns.isin(dp_markers)] +# # +# # # group by specified columns +# # grouped_table = func_table_small.groupby('fov') +# # transformed = grouped_table.agg(np.mean) +# # transformed.reset_index(inplace=True) +# # +# # # reshape to long df +# # long_df = pd.melt(transformed, id_vars=['fov'], var_name='functional_marker') +# # long_df['metric'] = 'total_freq' +# # long_df['cell_type'] = 'all' +# # long_df['subset'] = 'all' +# # +# # long_df = long_df.merge(harmonized_metadata, on='fov', how='inner') +# # +# # long_df.to_csv(os.path.join(output_dir, 'total_func_per_core.csv'), index=False) + +# # append to list of dfs +# long_df = pd.read_csv(os.path.join(output_dir, 'total_func_per_core.csv')) +# combo_dfs.append(long_df) + +# # combine +# combo_df = pd.concat(combo_dfs) +# combo_df.to_csv(os.path.join(output_dir, 'functional_df_per_core_filtered.csv'), index=False) + +# # create version of filtered df aggregated by timepoint +# combo_df_grouped_func = combo_df.groupby(['Tissue_ID', 'cell_type', 'functional_marker', 'metric', 'subset']) +# combo_df_timepoint_func = combo_df_grouped_func['value'].agg([np.mean, np.std]) +# combo_df_timepoint_func.reset_index(inplace=True) +# combo_df_timepoint_func = combo_df_timepoint_func.merge(harmonized_metadata.drop(['fov', 'MIBI_data_generated'], 
axis=1).drop_duplicates(), on='Tissue_ID') + +# # save timepoint df +# combo_df_timepoint_func.to_csv(os.path.join(output_dir, 'functional_df_per_timepoint_filtered.csv'), index=False) + + +# # +# # Remove double positive functional markers that are highly correlated with single positive scores +# # + +# # cluster_resolution = [['cluster_broad_freq', 'cluster_broad_count'], +# # ['cluster_freq', 'cluster_count'], +# # ['meta_cluster_freq', 'meta_cluster_count']] +# # +# # all_markers = combo_df.functional_marker.unique() +# # dp_markers = [x for x in all_markers if '__' in x] +# # +# # exclude_lists = [] +# # for cluster in cluster_resolution: +# # +# # # subset functional df to only include functional markers at this resolution +# # func_df = combo_df[combo_df.metric.isin(cluster)] +# # +# # # add unique identifier for cell + marker combo +# # func_df['feature_name'] = func_df['cell_type'] + '__' + func_df['functional_marker'] +# # +# # # subset the df further to look at just frequency and just one compartment +# # func_df_subset = func_df[func_df.metric == cluster[0]] +# # func_df_subset = func_df_subset[func_df_subset.subset == 'all'] +# # +# # # loop over each cell type, and each double positive functional marker +# # exclude_markers = [] +# # cell_types = func_df_subset.cell_type.unique() +# # for cell_type in cell_types: +# # for marker in dp_markers: +# # # get the two markers that make up the double positive marker +# # marker_1, marker_2 = marker.split('__') +# # +# # # subset to only include this cell type and these markers +# # current_df = func_df_subset.loc[func_df_subset.cell_type == cell_type, :] +# # current_df = current_df.loc[current_df.functional_marker.isin([marker, marker_1, marker_2]), :] +# # +# # # these markers are not present in this cell type +# # if len(current_df) == 0: +# # continue +# # +# # # this double positive marker is not present in this cell type +# # if marker not in current_df.functional_marker.unique(): +# # continue +# # +# # current_df_wide = current_df.pivot(index='fov', columns='functional_marker', values='value') +# # +# # # the double positive marker is present, but both single positives are not; exclude it +# # if len(current_df_wide.columns) != 3: +# # exclude_markers.append(marker) +# # continue +# # +# # corr_1, _ = spearmanr(current_df_wide[marker_1].values, current_df_wide[marker].values) +# # corr_2, _ = spearmanr(current_df_wide[marker_2].values, current_df_wide[marker].values) +# # +# # if (corr_1 > 0.7) | (corr_2 > 0.7): +# # exclude_markers.append(cell_type + '__' + marker) +# # +# # # add to list +# # exclude_lists.append(exclude_markers) +# # +# # # construct df to hold list of exlcuded cells +# # exclude_df_cluster = ['cluster_broad_freq'] * len(exclude_lists[0]) + ['cluster_freq'] * len(exclude_lists[1]) + ['meta_cluster_freq'] * len(exclude_lists[2]) +# # exclude_df_name = exclude_lists[0] + exclude_lists[1] + exclude_lists[2] +# # +# # exclude_df = pd.DataFrame({'metric': exclude_df_cluster, 'feature_name': exclude_df_name}) +# # exclude_df.to_csv(os.path.join(intermediate_dir, 'post_processing', 'exclude_double_positive_markers.csv'), index=False) + +# # use previously generated exclude list +# exclude_df = pd.read_csv(os.path.join(intermediate_dir, 'post_processing', 'exclude_double_positive_markers.csv')) + +# dedup_dfs = [] # cluster_resolution = [['cluster_broad_freq', 'cluster_broad_count'], # ['cluster_freq', 'cluster_count'], # ['meta_cluster_freq', 'meta_cluster_count']] -# -# all_markers = 
combo_df.functional_marker.unique() -# dp_markers = [x for x in all_markers if '__' in x] -# -# exclude_lists = [] + # for cluster in cluster_resolution: -# # # subset functional df to only include functional markers at this resolution # func_df = combo_df[combo_df.metric.isin(cluster)] -# + # # add unique identifier for cell + marker combo # func_df['feature_name'] = func_df['cell_type'] + '__' + func_df['functional_marker'] -# -# # subset the df further to look at just frequency and just one compartment -# func_df_subset = func_df[func_df.metric == cluster[0]] -# func_df_subset = func_df_subset[func_df_subset.subset == 'all'] -# -# # loop over each cell type, and each double positive functional marker -# exclude_markers = [] -# cell_types = func_df_subset.cell_type.unique() -# for cell_type in cell_types: -# for marker in dp_markers: -# # get the two markers that make up the double positive marker -# marker_1, marker_2 = marker.split('__') -# -# # subset to only include this cell type and these markers -# current_df = func_df_subset.loc[func_df_subset.cell_type == cell_type, :] -# current_df = current_df.loc[current_df.functional_marker.isin([marker, marker_1, marker_2]), :] -# -# # these markers are not present in this cell type -# if len(current_df) == 0: -# continue -# -# # this double positive marker is not present in this cell type -# if marker not in current_df.functional_marker.unique(): -# continue -# -# current_df_wide = current_df.pivot(index='fov', columns='functional_marker', values='value') -# -# # the double positive marker is present, but both single positives are not; exclude it -# if len(current_df_wide.columns) != 3: -# exclude_markers.append(marker) -# continue -# -# corr_1, _ = spearmanr(current_df_wide[marker_1].values, current_df_wide[marker].values) -# corr_2, _ = spearmanr(current_df_wide[marker_2].values, current_df_wide[marker].values) -# -# if (corr_1 > 0.7) | (corr_2 > 0.7): -# exclude_markers.append(cell_type + '__' + marker) -# -# # add to list -# exclude_lists.append(exclude_markers) -# -# # construct df to hold list of exlcuded cells -# exclude_df_cluster = ['cluster_broad_freq'] * len(exclude_lists[0]) + ['cluster_freq'] * len(exclude_lists[1]) + ['meta_cluster_freq'] * len(exclude_lists[2]) -# exclude_df_name = exclude_lists[0] + exclude_lists[1] + exclude_lists[2] -# -# exclude_df = pd.DataFrame({'metric': exclude_df_cluster, 'feature_name': exclude_df_name}) -# exclude_df.to_csv(os.path.join(intermediate_dir, 'post_processing', 'exclude_double_positive_markers.csv'), index=False) - -# use previously generated exclude list -exclude_df = pd.read_csv(os.path.join(intermediate_dir, 'post_processing', 'exclude_double_positive_markers.csv')) - -dedup_dfs = [] - -cluster_resolution = [['cluster_broad_freq', 'cluster_broad_count'], - ['cluster_freq', 'cluster_count'], - ['meta_cluster_freq', 'meta_cluster_count']] - -for cluster in cluster_resolution: - # subset functional df to only include functional markers at this resolution - func_df = combo_df[combo_df.metric.isin(cluster)] - - # add unique identifier for cell + marker combo - func_df['feature_name'] = func_df['cell_type'] + '__' + func_df['functional_marker'] - - exclude_names = exclude_df.loc[exclude_df.metric == cluster[0], 'feature_name'].values - - # remove double positive markers that are highly correlated with single positive scores - func_df = func_df[~func_df.feature_name.isin(exclude_names)] - dedup_dfs.append(func_df) - - -dedup_dfs.append(long_df) -deduped_df = pd.concat(dedup_dfs) 
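Editor's note: the deduplication being retired above keeps a double-positive feature only when it carries signal beyond its parent single positives. For each cell type, the per-FOV frequency of a marker pair `A__B` is correlated (Spearman) against each constituent, and the pair is dropped when either correlation exceeds 0.7, or when a constituent was never scored for that cell type. The sketch below restates that rule in a self-contained form; `find_redundant_dp_features`, its argument names, and the assumption of one row per (fov, functional_marker) pair (i.e., a single metric and compartment already selected) are illustrative, not the repository's API, and unlike the original it records the combined `cell_type__A__B` name in both exclusion branches.

import pandas as pd
from scipy.stats import spearmanr

def find_redundant_dp_features(func_df: pd.DataFrame, cutoff: float = 0.7) -> list:
    """Flag 'cell_type__A__B' features whose values track a single positive."""
    exclude = []
    dp_markers = [m for m in func_df.functional_marker.unique() if '__' in m]
    for cell_type in func_df.cell_type.unique():
        sub = func_df[func_df.cell_type == cell_type]
        for dp in dp_markers:
            m1, m2 = dp.split('__')
            # wide frame: one row per FOV, one column per marker of interest
            wide = sub[sub.functional_marker.isin([dp, m1, m2])].pivot(
                index='fov', columns='functional_marker', values='value')
            if dp not in wide.columns:
                continue  # pair never scored for this cell type
            if wide.shape[1] != 3:
                # DP present but a single positive is missing: nothing to compare
                exclude.append(cell_type + '__' + dp)
                continue
            corr_1, _ = spearmanr(wide[m1].values, wide[dp].values)
            corr_2, _ = spearmanr(wide[m2].values, wide[dp].values)
            if (corr_1 > cutoff) or (corr_2 > cutoff):
                exclude.append(cell_type + '__' + dp)
    return exclude

Everything flagged this way is then removable with a single mask, mirroring the surviving `func_df[~func_df.feature_name.isin(exclude_names)]` lines in this hunk.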
-deduped_df = deduped_df.drop('feature_name', axis=1) - -# save deduped df -deduped_df.to_csv(os.path.join(output_dir, 'functional_df_per_core_filtered_deduped.csv'), index=False) - -# create version aggregated by timepoint -deduped_df_grouped = deduped_df.groupby(['Tissue_ID', 'cell_type', 'functional_marker', 'metric', 'subset']) -deduped_df_timepoint = deduped_df_grouped['value'].agg([np.mean, np.std]) -deduped_df_timepoint.reset_index(inplace=True) -deduped_df_timepoint = deduped_df_timepoint.merge(harmonized_metadata.drop(['fov', 'MIBI_data_generated'], axis=1).drop_duplicates(), on='Tissue_ID') - -# save timepoint df -deduped_df_timepoint.to_csv(os.path.join(output_dir, 'functional_df_per_timepoint_filtered_deduped.csv'), index=False) - - -# morphology metric summary - -# Create list to hold parameters for each df that will be produced -morph_df_params = [['cluster_broad_freq', 'cell_cluster_broad'], - ['cluster_freq', 'cell_cluster'], - ['meta_cluster_freq', 'cell_meta_cluster']] -morph_dfs = [] -for result_name, cluster_col_name in morph_df_params: +# exclude_names = exclude_df.loc[exclude_df.metric == cluster[0], 'feature_name'].values - # remove cluster_names except for the one specified for the df - drop_cols = ['cell_meta_cluster', 'cell_cluster', 'cell_cluster_broad', 'label'] - drop_cols.remove(cluster_col_name) - - # create df - morph_dfs.append(create_long_df_by_functional(func_table=cell_table_morph, - result_name=result_name, - cluster_col_name=cluster_col_name, - drop_cols=drop_cols, - normalize=True, - subset_col='tumor_region')) - -# create combined df -total_df_morph = pd.concat(morph_dfs, axis=0) -total_df_morph = total_df_morph.merge(harmonized_metadata, on='fov', how='inner') -total_df_morph = total_df_morph.rename(columns={'functional_marker': 'morphology_feature'}) - -# save df -total_df_morph.to_csv(os.path.join(output_dir, 'morph_df_per_core.csv'), index=False) - - -# create manual df with total morphology marker average across all cells in an image -morph_table_small = cell_table_morph.loc[:, ~cell_table_morph.columns.isin(['cell_cluster', 'cell_cluster_broad', 'cell_meta_cluster', 'label', 'tumor_region'])] - -# group by specified columns -grouped_table = morph_table_small.groupby('fov') -transformed = grouped_table.agg(np.mean) -transformed.reset_index(inplace=True) - -# reshape to long df -long_df = pd.melt(transformed, id_vars=['fov'], var_name='morphology_feature') -long_df['metric'] = 'total_freq' -long_df['cell_type'] = 'all' -long_df['subset'] = 'all' - -long_df = long_df.merge(harmonized_metadata, on='fov', how='inner') - -long_df.to_csv(os.path.join(output_dir, 'total_morph_per_core.csv'), index=False) - -# filter morphology markers to only include FOVs with at least the specified number of cells -total_df = pd.read_csv(os.path.join(output_dir, 'cluster_df_per_core.csv')) -min_cells = 5 - -filtered_dfs = [] -metrics = [['cluster_broad_count', 'cluster_broad_freq'], - ['cluster_count', 'cluster_freq'], - ['meta_cluster_count', 'meta_cluster_freq']] -for metric in metrics: - # subset count df to include cells at the relevant clustering resolution - for compartment in ['cancer_core', 'cancer_border', 'stroma_core', 'stroma_border', - 'tls', 'tagg', 'all']: - count_df = total_df[total_df.metric == metric[0]] - count_df = count_df[count_df.subset == compartment] +# # remove double positive markers that are highly correlated with single positive scores +# func_df = func_df[~func_df.feature_name.isin(exclude_names)] +# dedup_dfs.append(func_df) - # subset 
morphology df to only include morphology metrics at this resolution - morph_df = total_df_morph[total_df_morph.metric == metric[1]] - morph_df = morph_df[morph_df.subset == compartment] - # for each cell type, determine which FOVs have high enough counts to be included - for cell_type in count_df.cell_type.unique(): - keep_df = count_df[count_df.cell_type == cell_type] - keep_df = keep_df[keep_df.value >= min_cells] - keep_fovs = keep_df.fov.unique() +# dedup_dfs.append(long_df) +# deduped_df = pd.concat(dedup_dfs) +# deduped_df = deduped_df.drop('feature_name', axis=1) - # subset morphology df to only include FOVs with high enough counts - keep_features = morph_df[morph_df.cell_type == cell_type] - keep_features = keep_features[keep_features.fov.isin(keep_fovs)] +# # save deduped df +# deduped_df.to_csv(os.path.join(output_dir, 'functional_df_per_core_filtered_deduped.csv'), index=False) - # append to list of filtered dfs - filtered_dfs.append(keep_features) +# # create version aggregated by timepoint +# deduped_df_grouped = deduped_df.groupby(['Tissue_ID', 'cell_type', 'functional_marker', 'metric', 'subset']) +# deduped_df_timepoint = deduped_df_grouped['value'].agg([np.mean, np.std]) +# deduped_df_timepoint.reset_index(inplace=True) +# deduped_df_timepoint = deduped_df_timepoint.merge(harmonized_metadata.drop(['fov', 'MIBI_data_generated'], axis=1).drop_duplicates(), on='Tissue_ID') -filtered_dfs.append(long_df) -filtered_morph_df = pd.concat(filtered_dfs) +# # save timepoint df +# deduped_df_timepoint.to_csv(os.path.join(output_dir, 'functional_df_per_timepoint_filtered_deduped.csv'), index=False) -# save filtered df -filtered_morph_df.to_csv(os.path.join(output_dir, 'morph_df_per_core_filtered.csv'), index=False) -# create version aggregated by timepoint -filtered_morph_df_grouped = filtered_morph_df.groupby(['Tissue_ID', 'cell_type', 'morphology_feature', 'metric', 'subset']) -filtered_morph_df_timepoint = filtered_morph_df_grouped['value'].agg([np.mean, np.std]) -filtered_morph_df_timepoint.reset_index(inplace=True) -filtered_morph_df_timepoint = filtered_morph_df_timepoint.merge(harmonized_metadata.drop(['fov', 'MIBI_data_generated'], axis=1).drop_duplicates(), on='Tissue_ID') +# # morphology metric summary -# save timepoint df -filtered_morph_df_timepoint.to_csv(os.path.join(output_dir, 'morph_df_per_timepoint_filtered.csv'), index=False) +# # Create list to hold parameters for each df that will be produced +# morph_df_params = [['cluster_broad_freq', 'cell_cluster_broad'], +# ['cluster_freq', 'cell_cluster'], +# ['meta_cluster_freq', 'cell_meta_cluster']] -# remove redundant morphology features -block1 = ['area', 'major_axis_length', 'minor_axis_length', 'perimeter', 'convex_area', 'equivalent_diameter'] +# morph_dfs = [] +# for result_name, cluster_col_name in morph_df_params: -block2 = ['area_nuclear', 'major_axis_length_nuclear', 'minor_axis_length_nuclear', 'perimeter_nuclear', 'convex_area_nuclear', 'equivalent_diameter_nuclear'] +# # remove cluster_names except for the one specified for the df +# drop_cols = ['cell_meta_cluster', 'cell_cluster', 'cell_cluster_broad', 'label'] +# drop_cols.remove(cluster_col_name) -block3 = ['eccentricity', 'major_axis_equiv_diam_ratio'] +# # create df +# morph_dfs.append(create_long_df_by_functional(func_table=cell_table_morph, +# result_name=result_name, +# cluster_col_name=cluster_col_name, +# drop_cols=drop_cols, +# normalize=True, +# subset_col='tumor_region')) -block4 = ['eccentricity_nuclear', 'major_axis_equiv_diam_ratio_nuclear', 
'perim_square_over_area_nuclear'] +# # create combined df +# total_df_morph = pd.concat(morph_dfs, axis=0) +# total_df_morph = total_df_morph.merge(harmonized_metadata, on='fov', how='inner') +# total_df_morph = total_df_morph.rename(columns={'functional_marker': 'morphology_feature'}) -deduped_morph_df = filtered_morph_df.loc[~filtered_morph_df.morphology_feature.isin(block1[1:] + block2[1:] + block3[1:] + block4[1:]), :] +# # save df +# total_df_morph.to_csv(os.path.join(output_dir, 'morph_df_per_core.csv'), index=False) -# only keep complex morphology features for cancer cells, remove everything except area and nc for others -cancer_clusters = ['Cancer', 'Cancer_EMT', 'Cancer_Other', 'Cancer_CD56', 'Cancer_CK17', - 'Cancer_Ecad', 'Cancer_Mono', 'Cancer_SMA', 'Cancer_Vim'] -basic_morph_features = ['area', 'area_nuclear', 'nc_ratio'] -deduped_morph_df = deduped_morph_df.loc[~(~(deduped_morph_df.cell_type.isin(cancer_clusters)) & ~(deduped_morph_df.morphology_feature.isin(basic_morph_features))), :] +# # create manual df with total morphology marker average across all cells in an image +# morph_table_small = cell_table_morph.loc[:, ~cell_table_morph.columns.isin(['cell_cluster', 'cell_cluster_broad', 'cell_meta_cluster', 'label', 'tumor_region'])] -# saved deduped -deduped_morph_df.to_csv(os.path.join(output_dir, 'morph_df_per_core_filtered_deduped.csv'), index=False) +# # group by specified columns +# grouped_table = morph_table_small.groupby('fov') +# transformed = grouped_table.agg(np.mean) +# transformed.reset_index(inplace=True) -# same for timepoints -deduped_morph_df_timepoint = filtered_morph_df_timepoint.loc[~filtered_morph_df_timepoint.morphology_feature.isin(block1[1:] + block2[1:] + block3[1:] + block4[1:]), :] -deduped_morph_df_timepoint = deduped_morph_df_timepoint.loc[~(~(deduped_morph_df_timepoint.cell_type.isin(cancer_clusters)) & ~(deduped_morph_df_timepoint.morphology_feature.isin(basic_morph_features))), :] -deduped_morph_df_timepoint.to_csv(os.path.join(output_dir, 'morph_df_per_timepoint_filtered_deduped.csv'), index=False) +# # reshape to long df +# long_df = pd.melt(transformed, id_vars=['fov'], var_name='morphology_feature') +# long_df['metric'] = 'total_freq' +# long_df['cell_type'] = 'all' +# long_df['subset'] = 'all' -# -# spatial features -# +# long_df = long_df.merge(harmonized_metadata, on='fov', how='inner') -# format mixing scores -mixing_scores = pd.read_csv(os.path.join(intermediate_dir, 'spatial_analysis/mixing_score/cell_cluster_broad/homogeneous_mixing_scores.csv')) -cols = mixing_scores.columns.tolist() -keep_cols = [col for col in cols if 'mixing_score' in col] -mixing_scores = mixing_scores[['fov'] + keep_cols] +# long_df.to_csv(os.path.join(output_dir, 'total_morph_per_core.csv'), index=False) -mixing_scores = pd.melt(mixing_scores, id_vars=['fov'], var_name='mixing_score', value_name='value') -mixing_scores.to_csv(os.path.join(output_dir, 'formatted_mixing_scores.csv'), index=False) +# # filter morphology markers to only include FOVs with at least the specified number of cells +# total_df = pd.read_csv(os.path.join(output_dir, 'cluster_df_per_core.csv')) +# min_cells = 5 -# compute local diversity scores per image +# filtered_dfs = [] +# metrics = [['cluster_broad_count', 'cluster_broad_freq'], +# ['cluster_count', 'cluster_freq'], +# ['meta_cluster_count', 'meta_cluster_freq']] +# for metric in metrics: +# # subset count df to include cells at the relevant clustering resolution +# for compartment in ['cancer_core', 'cancer_border', 'stroma_core', 
'stroma_border', +# 'tls', 'tagg', 'all']: +# count_df = total_df[total_df.metric == metric[0]] +# count_df = count_df[count_df.subset == compartment] -# Create list to hold parameters for each df that will be produced -diversity_df_params = [['cluster_broad_freq', 'cell_cluster_broad'], - ['cluster_freq', 'cell_cluster'], - ['meta_cluster_freq', 'cell_meta_cluster']] +# # subset morphology df to only include morphology metrics at this resolution +# morph_df = total_df_morph[total_df_morph.metric == metric[1]] +# morph_df = morph_df[morph_df.subset == compartment] -diversity_dfs = [] -for result_name, cluster_col_name in diversity_df_params: +# # for each cell type, determine which FOVs have high enough counts to be included +# for cell_type in count_df.cell_type.unique(): +# keep_df = count_df[count_df.cell_type == cell_type] +# keep_df = keep_df[keep_df.value >= min_cells] +# keep_fovs = keep_df.fov.unique() - # remove cluster_names except for the one specified for the df - drop_cols = ['cell_meta_cluster', 'cell_cluster', 'cell_cluster_broad', 'label'] - drop_cols.remove(cluster_col_name) +# # subset morphology df to only include FOVs with high enough counts +# keep_features = morph_df[morph_df.cell_type == cell_type] +# keep_features = keep_features[keep_features.fov.isin(keep_fovs)] - # create df - diversity_dfs.append(create_long_df_by_functional(func_table=cell_table_diversity, - result_name=result_name, - cluster_col_name=cluster_col_name, - drop_cols=drop_cols, - normalize=True, - subset_col='tumor_region')) +# # append to list of filtered dfs +# filtered_dfs.append(keep_features) -# create combined df -total_df_diversity = pd.concat(diversity_dfs, axis=0) -total_df_diversity = total_df_diversity.merge(harmonized_metadata, on='fov', how='inner') -total_df_diversity = total_df_diversity.rename(columns={'functional_marker': 'diversity_feature'}) +# filtered_dfs.append(long_df) +# filtered_morph_df = pd.concat(filtered_dfs) -# save df -total_df_diversity.to_csv(os.path.join(output_dir, 'diversity_df_per_core.csv'), index=False) +# # save filtered df +# filtered_morph_df.to_csv(os.path.join(output_dir, 'morph_df_per_core_filtered.csv'), index=False) -# filter diversity scores to only include FOVs with at least the specified number of cells -total_df = pd.read_csv(os.path.join(output_dir, 'cluster_df_per_core.csv')) -min_cells = 5 +# # create version aggregated by timepoint +# filtered_morph_df_grouped = filtered_morph_df.groupby(['Tissue_ID', 'cell_type', 'morphology_feature', 'metric', 'subset']) +# filtered_morph_df_timepoint = filtered_morph_df_grouped['value'].agg([np.mean, np.std]) +# filtered_morph_df_timepoint.reset_index(inplace=True) +# filtered_morph_df_timepoint = filtered_morph_df_timepoint.merge(harmonized_metadata.drop(['fov', 'MIBI_data_generated'], axis=1).drop_duplicates(), on='Tissue_ID') -filtered_dfs = [] -metrics = [['cluster_broad_count', 'cluster_broad_freq'], - ['cluster_count', 'cluster_freq'], - ['meta_cluster_count', 'meta_cluster_freq']] -for metric in metrics: - # subset count df to include cells at the relevant clustering resolution - for compartment in ['cancer_core', 'cancer_border', 'stroma_core', - 'stroma_border', 'tagg', 'tls', 'all']: - count_df = total_df[total_df.metric == metric[0]] - count_df = count_df[count_df.subset == compartment] +# # save timepoint df +# filtered_morph_df_timepoint.to_csv(os.path.join(output_dir, 'morph_df_per_timepoint_filtered.csv'), index=False) - # subset diversity df to only include diversity metrics at this 
resolution - diversity_df = total_df_diversity[total_df_diversity.metric == metric[1]] - diversity_df = diversity_df[diversity_df.subset == compartment] +# # remove redundant morphology features +# block1 = ['area', 'major_axis_length', 'minor_axis_length', 'perimeter', 'convex_area', 'equivalent_diameter'] - # for each cell type, determine which FOVs have high enough counts to be included - for cell_type in count_df.cell_type.unique(): - keep_df = count_df[count_df.cell_type == cell_type] - keep_df = keep_df[keep_df.value >= min_cells] - keep_fovs = keep_df.fov.unique() +# block2 = ['area_nuclear', 'major_axis_length_nuclear', 'minor_axis_length_nuclear', 'perimeter_nuclear', 'convex_area_nuclear', 'equivalent_diameter_nuclear'] - # subset morphology df to only include FOVs with high enough counts - keep_features = diversity_df[diversity_df.cell_type == cell_type] - keep_features = keep_features[keep_features.fov.isin(keep_fovs)] +# block3 = ['eccentricity', 'major_axis_equiv_diam_ratio'] - # append to list of filtered dfs - filtered_dfs.append(keep_features) +# block4 = ['eccentricity_nuclear', 'major_axis_equiv_diam_ratio_nuclear', 'perim_square_over_area_nuclear'] -filtered_diversity_df = pd.concat(filtered_dfs) +# deduped_morph_df = filtered_morph_df.loc[~filtered_morph_df.morphology_feature.isin(block1[1:] + block2[1:] + block3[1:] + block4[1:]), :] -# save filtered df -filtered_diversity_df.to_csv(os.path.join(output_dir, 'diversity_df_per_core_filtered.csv'), index=False) +# # only keep complex morphology features for cancer cells, remove everything except area and nc for others +# cancer_clusters = ['Cancer', 'Cancer_EMT', 'Cancer_Other', 'Cancer_CD56', 'Cancer_CK17', +# 'Cancer_Ecad', 'Cancer_Mono', 'Cancer_SMA', 'Cancer_Vim'] +# basic_morph_features = ['area', 'area_nuclear', 'nc_ratio'] -# create version aggregated by timepoint -filtered_diversity_df_grouped = filtered_diversity_df.groupby(['Tissue_ID', 'cell_type', 'diversity_feature', 'metric', 'subset']) -filtered_diversity_df_timepoint = filtered_diversity_df_grouped['value'].agg([np.mean, np.std]) -filtered_diversity_df_timepoint.reset_index(inplace=True) -filtered_diversity_df_timepoint = filtered_diversity_df_timepoint.merge(harmonized_metadata.drop(['fov', 'MIBI_data_generated'], axis=1).drop_duplicates(), on='Tissue_ID') +# deduped_morph_df = deduped_morph_df.loc[~(~(deduped_morph_df.cell_type.isin(cancer_clusters)) & ~(deduped_morph_df.morphology_feature.isin(basic_morph_features))), :] -# save timepoint df -filtered_diversity_df_timepoint.to_csv(os.path.join(output_dir, 'diversity_df_per_timepoint_filtered.csv'), index=False) +# # saved deduped +# deduped_morph_df.to_csv(os.path.join(output_dir, 'morph_df_per_core_filtered_deduped.csv'), index=False) +# # same for timepoints +# deduped_morph_df_timepoint = filtered_morph_df_timepoint.loc[~filtered_morph_df_timepoint.morphology_feature.isin(block1[1:] + block2[1:] + block3[1:] + block4[1:]), :] +# deduped_morph_df_timepoint = deduped_morph_df_timepoint.loc[~(~(deduped_morph_df_timepoint.cell_type.isin(cancer_clusters)) & ~(deduped_morph_df_timepoint.morphology_feature.isin(basic_morph_features))), :] +# deduped_morph_df_timepoint.to_csv(os.path.join(output_dir, 'morph_df_per_timepoint_filtered_deduped.csv'), index=False) -# investigate correlation between diversity scores -fov_data = filtered_diversity_df.copy() -fov_data['feature_name_unique'] = fov_data['cell_type'] + '_' + fov_data['diversity_feature'] -fov_data = fov_data.loc[(fov_data.subset == 'all') & 
(fov_data.metric == 'cluster_freq')] -fov_data = fov_data.loc[fov_data.diversity_feature != 'diversity_cell_meta_cluster'] -fov_data_wide = fov_data.pivot(index='fov', columns='feature_name_unique', values='value') +# # +# # spatial features +# # -corr_df = fov_data_wide.corr(method='spearman') +# # format mixing scores +# mixing_scores = pd.read_csv(os.path.join(intermediate_dir, 'spatial_analysis/mixing_score/cell_cluster_broad/homogeneous_mixing_scores.csv')) +# cols = mixing_scores.columns.tolist() +# keep_cols = [col for col in cols if 'mixing_score' in col] +# mixing_scores = mixing_scores[['fov'] + keep_cols] -# replace Nans -corr_df = corr_df.fillna(0) -clustergrid = sns.clustermap(corr_df, cmap='vlag', vmin=-1, vmax=1, figsize=(20, 20)) +# mixing_scores = pd.melt(mixing_scores, id_vars=['fov'], var_name='mixing_score', value_name='value') +# mixing_scores.to_csv(os.path.join(output_dir, 'formatted_mixing_scores.csv'), index=False) -# save deduped df that excludes cell meta cluster -deduped_diversity_df = filtered_diversity_df.loc[filtered_diversity_df.diversity_feature != 'diversity_cell_meta_cluster'] -deduped_diversity_df.to_csv(os.path.join(output_dir, 'diversity_df_per_core_filtered_deduped.csv'), index=False) +# # compute local diversity scores per image -# create version aggregated by timepoint -deduped_diversity_df_grouped = deduped_diversity_df.groupby(['Tissue_ID', 'cell_type', 'diversity_feature', 'metric', 'subset']) -deduped_diversity_df_timepoint = deduped_diversity_df_grouped['value'].agg([np.mean, np.std]) -deduped_diversity_df_timepoint.reset_index(inplace=True) -deduped_diversity_df_timepoint = deduped_diversity_df_timepoint.merge(harmonized_metadata.drop(['fov', 'MIBI_data_generated'], axis=1).drop_duplicates(), on='Tissue_ID') +# # Create list to hold parameters for each df that will be produced +# diversity_df_params = [['cluster_broad_freq', 'cell_cluster_broad'], +# ['cluster_freq', 'cell_cluster'], +# ['meta_cluster_freq', 'cell_meta_cluster']] -# save timepoint df -deduped_diversity_df_timepoint.to_csv(os.path.join(output_dir, 'diversity_df_per_timepoint_filtered_deduped.csv'), index=False) +# diversity_dfs = [] +# for result_name, cluster_col_name in diversity_df_params: +# # remove cluster_names except for the one specified for the df +# drop_cols = ['cell_meta_cluster', 'cell_cluster', 'cell_cluster_broad', 'label'] +# drop_cols.remove(cluster_col_name) -# process linear distance dfs -total_df_distance = create_long_df_by_functional(func_table=cell_table_distances_broad, - result_name='cluster_broad_freq', - cluster_col_name='cell_cluster_broad', - drop_cols=['label'], - normalize=True, - subset_col='tumor_region') +# # create df +# diversity_dfs.append(create_long_df_by_functional(func_table=cell_table_diversity, +# result_name=result_name, +# cluster_col_name=cluster_col_name, +# drop_cols=drop_cols, +# normalize=True, +# subset_col='tumor_region')) -# create combined df -total_df_distance = total_df_distance.merge(harmonized_metadata, on='fov', how='inner') -total_df_distance = total_df_distance.rename(columns={'functional_marker': 'linear_distance'}) -total_df_distance.dropna(inplace=True) +# # create combined df +# total_df_diversity = pd.concat(diversity_dfs, axis=0) +# total_df_diversity = total_df_diversity.merge(harmonized_metadata, on='fov', how='inner') +# total_df_diversity = total_df_diversity.rename(columns={'functional_marker': 'diversity_feature'}) -# save df -total_df_distance.to_csv(os.path.join(output_dir, 'distance_df_per_core.csv'), 
index=False) +# # save df +# total_df_diversity.to_csv(os.path.join(output_dir, 'diversity_df_per_core.csv'), index=False) -# filter distance scores to only include FOVs with at least the specified number of cells -total_df = pd.read_csv(os.path.join(output_dir, 'cluster_df_per_core.csv')) -min_cells = 5 - -filtered_dfs = [] -metrics = [['cluster_broad_count', 'cluster_broad_freq']] - -for metric in metrics: - # subset count df to include cells at the relevant clustering resolution - for compartment in ['cancer_core', 'cancer_border', 'stroma_core', - 'stroma_border', 'tagg', 'tls', 'all']: - count_df = total_df[total_df.metric == metric[0]] - count_df = count_df[count_df.subset == compartment] - - # subset distance df to only include distance metrics at this resolution - distance_df = total_df_distance[total_df_distance.metric == metric[1]] - distance_df = distance_df[distance_df.subset == compartment] - - # for each cell type, determine which FOVs have high enough counts to be included - for cell_type in count_df.cell_type.unique(): - keep_df = count_df[count_df.cell_type == cell_type] - keep_df = keep_df[keep_df.value >= min_cells] - keep_fovs = keep_df.fov.unique() - - # subset morphology df to only include FOVs with high enough counts - keep_features = distance_df[distance_df.cell_type == cell_type] - keep_features = keep_features[keep_features.fov.isin(keep_fovs)] - - # append to list of filtered dfs - filtered_dfs.append(keep_features) - -filtered_distance_df = pd.concat(filtered_dfs) - -# save filtered df -filtered_distance_df.to_csv(os.path.join(output_dir, 'distance_df_per_core_filtered.csv'), index=False) - -# remove distances that are correlated with abundance of cell type - -# load data +# # filter diversity scores to only include FOVs with at least the specified number of cells # total_df = pd.read_csv(os.path.join(output_dir, 'cluster_df_per_core.csv')) -# density_df = total_df.loc[(total_df.metric == 'cluster_broad_density') & (total_df.subset == 'all')] -# filtered_distance_df = filtered_distance_df.loc[(filtered_distance_df.metric == 'cluster_broad_freq') & (filtered_distance_df.subset == 'all')] -# -# # remove images without tumor cells -# density_df = density_df.loc[density_df.Timepoint != 'lymphnode_neg', :] -# filtered_distance_df = filtered_distance_df.loc[filtered_distance_df.fov.isin(density_df.fov.unique())] -# cell_types = filtered_distance_df.cell_type.unique() -# -# # calculate which pairings to keep -# keep_cells, keep_features = [], [] -# -# for cell_type in cell_types: -# density_subset = density_df.loc[density_df.cell_type == cell_type] -# distance_subset = filtered_distance_df.loc[filtered_distance_df.linear_distance == cell_type] -# distance_wide = distance_subset.pivot(index='fov', columns='cell_type', values='value') -# distance_wide.reset_index(inplace=True) -# distance_wide = pd.merge(distance_wide, density_subset[['fov', 'value']], on='fov', how='inner') -# -# # get correlations -# corr_df = distance_wide.corr(method='spearman') -# -# # determine which features to keep -# corr_vals = corr_df.loc['value', :].abs() -# corr_vals = corr_vals[corr_vals < 0.7] -# -# # add to list of features to keep -# keep_cells.extend(corr_vals.index) -# keep_features.extend([cell_type] * len(corr_vals.index)) -# -# keep_df = pd.DataFrame({'cell_type': keep_cells, 'feature_name': keep_features}) -# -# keep_df.to_csv(os.path.join(intermediate_dir, 'post_processing', 'distance_df_keep_features.csv'), index=False) - -# filter distance df to only include features with low 
correlation with abundance -keep_df = pd.read_csv(os.path.join(intermediate_dir, 'post_processing', 'distance_df_keep_features.csv')) - - -deduped_dfs = [] -for cell_type in keep_df.cell_type.unique(): - keep_features = keep_df.loc[keep_df.cell_type == cell_type, 'feature_name'].unique() - if len(keep_features) > 0: - keep_df_subset = filtered_distance_df.loc[filtered_distance_df.cell_type == cell_type] - keep_df_subset = keep_df_subset.loc[keep_df_subset.linear_distance.isin(keep_features)] - deduped_dfs.append(keep_df_subset) - -deduped_distance_df = pd.concat(deduped_dfs) - -# save filtered df -deduped_distance_df.to_csv(os.path.join(output_dir, 'distance_df_per_core_deduped.csv'), index=False) - -# create version aggregated by timepoint -deduped_distance_df_grouped = deduped_distance_df.groupby(['Tissue_ID', 'cell_type', 'linear_distance', 'metric', 'subset']) -deduped_distance_df_timepoint = deduped_distance_df_grouped['value'].agg([np.mean, np.std]) -deduped_distance_df_timepoint.reset_index(inplace=True) -deduped_distance_df_timepoint = deduped_distance_df_timepoint.merge(harmonized_metadata.drop(['fov', 'MIBI_data_generated'], axis=1).drop_duplicates(), on='Tissue_ID') - -# save timepoint df -deduped_distance_df_timepoint.to_csv(os.path.join(output_dir, 'distance_df_per_timepoint_deduped.csv'), index=False) +# min_cells = 5 + +# filtered_dfs = [] +# metrics = [['cluster_broad_count', 'cluster_broad_freq'], +# ['cluster_count', 'cluster_freq'], +# ['meta_cluster_count', 'meta_cluster_freq']] +# for metric in metrics: +# # subset count df to include cells at the relevant clustering resolution +# for compartment in ['cancer_core', 'cancer_border', 'stroma_core', +# 'stroma_border', 'tagg', 'tls', 'all']: +# count_df = total_df[total_df.metric == metric[0]] +# count_df = count_df[count_df.subset == compartment] + +# # subset diversity df to only include diversity metrics at this resolution +# diversity_df = total_df_diversity[total_df_diversity.metric == metric[1]] +# diversity_df = diversity_df[diversity_df.subset == compartment] + +# # for each cell type, determine which FOVs have high enough counts to be included +# for cell_type in count_df.cell_type.unique(): +# keep_df = count_df[count_df.cell_type == cell_type] +# keep_df = keep_df[keep_df.value >= min_cells] +# keep_fovs = keep_df.fov.unique() + +# # subset morphology df to only include FOVs with high enough counts +# keep_features = diversity_df[diversity_df.cell_type == cell_type] +# keep_features = keep_features[keep_features.fov.isin(keep_fovs)] + +# # append to list of filtered dfs +# filtered_dfs.append(keep_features) + +# filtered_diversity_df = pd.concat(filtered_dfs) + +# # save filtered df +# filtered_diversity_df.to_csv(os.path.join(output_dir, 'diversity_df_per_core_filtered.csv'), index=False) + +# # create version aggregated by timepoint +# filtered_diversity_df_grouped = filtered_diversity_df.groupby(['Tissue_ID', 'cell_type', 'diversity_feature', 'metric', 'subset']) +# filtered_diversity_df_timepoint = filtered_diversity_df_grouped['value'].agg([np.mean, np.std]) +# filtered_diversity_df_timepoint.reset_index(inplace=True) +# filtered_diversity_df_timepoint = filtered_diversity_df_timepoint.merge(harmonized_metadata.drop(['fov', 'MIBI_data_generated'], axis=1).drop_duplicates(), on='Tissue_ID') + +# # save timepoint df +# filtered_diversity_df_timepoint.to_csv(os.path.join(output_dir, 'diversity_df_per_timepoint_filtered.csv'), index=False) + + +# # investigate correlation between diversity scores +# fov_data = 
filtered_diversity_df.copy() +# fov_data['feature_name_unique'] = fov_data['cell_type'] + '_' + fov_data['diversity_feature'] +# fov_data = fov_data.loc[(fov_data.subset == 'all') & (fov_data.metric == 'cluster_freq')] +# fov_data = fov_data.loc[fov_data.diversity_feature != 'diversity_cell_meta_cluster'] +# fov_data_wide = fov_data.pivot(index='fov', columns='feature_name_unique', values='value') + +# corr_df = fov_data_wide.corr(method='spearman') + +# # replace Nans +# corr_df = corr_df.fillna(0) +# clustergrid = sns.clustermap(corr_df, cmap='vlag', vmin=-1, vmax=1, figsize=(20, 20)) + +# # save deduped df that excludes cell meta cluster +# deduped_diversity_df = filtered_diversity_df.loc[filtered_diversity_df.diversity_feature != 'diversity_cell_meta_cluster'] +# deduped_diversity_df.to_csv(os.path.join(output_dir, 'diversity_df_per_core_filtered_deduped.csv'), index=False) + +# # create version aggregated by timepoint +# deduped_diversity_df_grouped = deduped_diversity_df.groupby(['Tissue_ID', 'cell_type', 'diversity_feature', 'metric', 'subset']) +# deduped_diversity_df_timepoint = deduped_diversity_df_grouped['value'].agg([np.mean, np.std]) +# deduped_diversity_df_timepoint.reset_index(inplace=True) +# deduped_diversity_df_timepoint = deduped_diversity_df_timepoint.merge(harmonized_metadata.drop(['fov', 'MIBI_data_generated'], axis=1).drop_duplicates(), on='Tissue_ID') + +# # save timepoint df +# deduped_diversity_df_timepoint.to_csv(os.path.join(output_dir, 'diversity_df_per_timepoint_filtered_deduped.csv'), index=False) + + +# # process linear distance dfs +# total_df_distance = create_long_df_by_functional(func_table=cell_table_distances_broad, +# result_name='cluster_broad_freq', +# cluster_col_name='cell_cluster_broad', +# drop_cols=['label'], +# normalize=True, +# subset_col='tumor_region') + +# # create combined df +# total_df_distance = total_df_distance.merge(harmonized_metadata, on='fov', how='inner') +# total_df_distance = total_df_distance.rename(columns={'functional_marker': 'linear_distance'}) +# total_df_distance.dropna(inplace=True) + +# # save df +# total_df_distance.to_csv(os.path.join(output_dir, 'distance_df_per_core.csv'), index=False) + +# # filter distance scores to only include FOVs with at least the specified number of cells +# total_df = pd.read_csv(os.path.join(output_dir, 'cluster_df_per_core.csv')) +# min_cells = 5 + +# filtered_dfs = [] +# metrics = [['cluster_broad_count', 'cluster_broad_freq']] + +# for metric in metrics: +# # subset count df to include cells at the relevant clustering resolution +# for compartment in ['cancer_core', 'cancer_border', 'stroma_core', +# 'stroma_border', 'tagg', 'tls', 'all']: +# count_df = total_df[total_df.metric == metric[0]] +# count_df = count_df[count_df.subset == compartment] + +# # subset distance df to only include distance metrics at this resolution +# distance_df = total_df_distance[total_df_distance.metric == metric[1]] +# distance_df = distance_df[distance_df.subset == compartment] + +# # for each cell type, determine which FOVs have high enough counts to be included +# for cell_type in count_df.cell_type.unique(): +# keep_df = count_df[count_df.cell_type == cell_type] +# keep_df = keep_df[keep_df.value >= min_cells] +# keep_fovs = keep_df.fov.unique() + +# # subset morphology df to only include FOVs with high enough counts +# keep_features = distance_df[distance_df.cell_type == cell_type] +# keep_features = keep_features[keep_features.fov.isin(keep_fovs)] + +# # append to list of filtered dfs +# 
filtered_dfs.append(keep_features) + +# filtered_distance_df = pd.concat(filtered_dfs) + +# # save filtered df +# filtered_distance_df.to_csv(os.path.join(output_dir, 'distance_df_per_core_filtered.csv'), index=False) + +# # remove distances that are correlated with abundance of cell type + +# # load data +# # total_df = pd.read_csv(os.path.join(output_dir, 'cluster_df_per_core.csv')) +# # density_df = total_df.loc[(total_df.metric == 'cluster_broad_density') & (total_df.subset == 'all')] +# # filtered_distance_df = filtered_distance_df.loc[(filtered_distance_df.metric == 'cluster_broad_freq') & (filtered_distance_df.subset == 'all')] +# # +# # # remove images without tumor cells +# # density_df = density_df.loc[density_df.Timepoint != 'lymphnode_neg', :] +# # filtered_distance_df = filtered_distance_df.loc[filtered_distance_df.fov.isin(density_df.fov.unique())] +# # cell_types = filtered_distance_df.cell_type.unique() +# # +# # # calculate which pairings to keep +# # keep_cells, keep_features = [], [] +# # +# # for cell_type in cell_types: +# # density_subset = density_df.loc[density_df.cell_type == cell_type] +# # distance_subset = filtered_distance_df.loc[filtered_distance_df.linear_distance == cell_type] +# # distance_wide = distance_subset.pivot(index='fov', columns='cell_type', values='value') +# # distance_wide.reset_index(inplace=True) +# # distance_wide = pd.merge(distance_wide, density_subset[['fov', 'value']], on='fov', how='inner') +# # +# # # get correlations +# # corr_df = distance_wide.corr(method='spearman') +# # +# # # determine which features to keep +# # corr_vals = corr_df.loc['value', :].abs() +# # corr_vals = corr_vals[corr_vals < 0.7] +# # +# # # add to list of features to keep +# # keep_cells.extend(corr_vals.index) +# # keep_features.extend([cell_type] * len(corr_vals.index)) +# # +# # keep_df = pd.DataFrame({'cell_type': keep_cells, 'feature_name': keep_features}) +# # +# # keep_df.to_csv(os.path.join(intermediate_dir, 'post_processing', 'distance_df_keep_features.csv'), index=False) + +# # filter distance df to only include features with low correlation with abundance +# keep_df = pd.read_csv(os.path.join(intermediate_dir, 'post_processing', 'distance_df_keep_features.csv')) + + +# deduped_dfs = [] +# for cell_type in keep_df.cell_type.unique(): +# keep_features = keep_df.loc[keep_df.cell_type == cell_type, 'feature_name'].unique() +# if len(keep_features) > 0: +# keep_df_subset = filtered_distance_df.loc[filtered_distance_df.cell_type == cell_type] +# keep_df_subset = keep_df_subset.loc[keep_df_subset.linear_distance.isin(keep_features)] +# deduped_dfs.append(keep_df_subset) + +# deduped_distance_df = pd.concat(deduped_dfs) + +# # save filtered df +# deduped_distance_df.to_csv(os.path.join(output_dir, 'distance_df_per_core_deduped.csv'), index=False) + +# # create version aggregated by timepoint +# deduped_distance_df_grouped = deduped_distance_df.groupby(['Tissue_ID', 'cell_type', 'linear_distance', 'metric', 'subset']) +# deduped_distance_df_timepoint = deduped_distance_df_grouped['value'].agg([np.mean, np.std]) +# deduped_distance_df_timepoint.reset_index(inplace=True) +# deduped_distance_df_timepoint = deduped_distance_df_timepoint.merge(harmonized_metadata.drop(['fov', 'MIBI_data_generated'], axis=1).drop_duplicates(), on='Tissue_ID') + +# # save timepoint df +# deduped_distance_df_timepoint.to_csv(os.path.join(output_dir, 'distance_df_per_timepoint_deduped.csv'), index=False) # occupancy stat summary @@ -1043,7 +1047,7 @@ result_name=result_name, 

@@ -1152,142 +1156,142 @@ deduped_occupancy_df_timepoint.to_csv(os.path.join(output_dir, 'occupancy_df_per_timepoint_deduped.csv'), index=False)

-# create manual df with total morphology marker average across all cells in an image
-morph_table_small = cell_table_morph.loc[:, ~cell_table_morph.columns.isin(['cell_cluster', 'cell_cluster_broad', 'cell_meta_cluster', 'label', 'tumor_region'])]
+# # create manual df with total morphology marker average across all cells in an image
+# morph_table_small = cell_table_morph.loc[:, ~cell_table_morph.columns.isin(['cell_cluster', 'cell_cluster_broad', 'cell_meta_cluster', 'label', 'tumor_region'])]

-# group by specified columns
-grouped_table = morph_table_small.groupby('fov')
-transformed = grouped_table.agg(np.mean)
-transformed.reset_index(inplace=True)
+# # group by specified columns
+# grouped_table = morph_table_small.groupby('fov')
+# transformed = grouped_table.agg(np.mean)
+# transformed.reset_index(inplace=True)

-# reshape to long df
-long_df = pd.melt(transformed, id_vars=['fov'], var_name='morphology_feature')
-long_df['metric'] = 'total_freq'
-long_df['cell_type'] = 'all'
-long_df['subset'] = 'all'
+# # reshape to long df
+# long_df = pd.melt(transformed, id_vars=['fov'], var_name='morphology_feature')
+# long_df['metric'] = 'total_freq'
+# long_df['cell_type'] = 'all'
+# long_df['subset'] = 'all'

-long_df = long_df.merge(harmonized_metadata, on='fov', how='inner')
+# long_df = long_df.merge(harmonized_metadata, on='fov', how='inner')

-long_df.to_csv(os.path.join(output_dir, 'total_morph_per_core.csv'), index=False)
+# long_df.to_csv(os.path.join(output_dir, 'total_morph_per_core.csv'), index=False)
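+# # pd.melt reshapes the one-row-per-fov wide table into one row per (fov, feature):
+# # an illustrative row (fov1, area=200.0, nc_ratio=0.4) becomes two long rows,
+# # (fov1, 'area', 200.0) and (fov1, 'nc_ratio', 0.4), with var_name='morphology_feature'
+# # and the measurement landing in the default 'value' column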

-# filter morphology markers to only include FOVs with at least the specified number of cells
-total_df = pd.read_csv(os.path.join(output_dir, 'cluster_df_per_core.csv'))
-min_cells = 5
+# # filter morphology markers to only include FOVs with at least the specified number of cells
+# total_df = pd.read_csv(os.path.join(output_dir, 'cluster_df_per_core.csv'))
+# min_cells = 5

-filtered_dfs = []
-metrics = [['cluster_broad_count', 'cluster_broad_freq'],
- ['cluster_count', 'cluster_freq'],
- ['meta_cluster_count', 'meta_cluster_freq']]
-for metric in metrics:
- # subset count df to include cells at the relevant clustering resolution
- for compartment in ['cancer_core', 'cancer_border', 'stroma_core', 'stroma_border',
- 'tls', 'tagg', 'all']:
- count_df = total_df[total_df.metric == metric[0]]
- count_df = count_df[count_df.subset == compartment]
+# filtered_dfs = []
+# metrics = [['cluster_broad_count', 'cluster_broad_freq'],
+# ['cluster_count', 'cluster_freq'],
+# ['meta_cluster_count', 'meta_cluster_freq']]
+# for metric in metrics:
+# # subset count df to include cells at the relevant clustering resolution
+# for compartment in ['cancer_core', 'cancer_border', 'stroma_core', 'stroma_border',
+# 'tls', 'tagg', 'all']:
+# count_df = total_df[total_df.metric == metric[0]]
+# count_df = count_df[count_df.subset == compartment]

- # subset morphology df to only include morphology metrics at this resolution
- morph_df = total_df_morph[total_df_morph.metric == metric[1]]
- morph_df = morph_df[morph_df.subset == compartment]
+# # subset morphology df to only include morphology metrics at this resolution
+# morph_df = total_df_morph[total_df_morph.metric == metric[1]]
+# morph_df = morph_df[morph_df.subset == compartment]

- # for each cell type, determine which FOVs have high enough counts to be included
- for cell_type in count_df.cell_type.unique():
- keep_df = count_df[count_df.cell_type == cell_type]
- keep_df = keep_df[keep_df.value >= min_cells]
- keep_fovs = keep_df.fov.unique()
+# # for each cell type, determine which FOVs have high enough counts to be included
+# for cell_type in count_df.cell_type.unique():
+# keep_df = count_df[count_df.cell_type == cell_type]
+# keep_df = keep_df[keep_df.value >= min_cells]
+# keep_fovs = keep_df.fov.unique()

- # subset morphology df to only include FOVs with high enough counts
- keep_features = morph_df[morph_df.cell_type == cell_type]
- keep_features = keep_features[keep_features.fov.isin(keep_fovs)]
+# # subset morphology df to only include FOVs with high enough counts
+# keep_features = morph_df[morph_df.cell_type == cell_type]
+# keep_features = keep_features[keep_features.fov.isin(keep_fovs)]

- # append to list of filtered dfs
- filtered_dfs.append(keep_features)
+# # append to list of filtered dfs
+# filtered_dfs.append(keep_features)

-filtered_dfs.append(long_df)
-filtered_morph_df = pd.concat(filtered_dfs)
+# filtered_dfs.append(long_df)
+# filtered_morph_df = pd.concat(filtered_dfs)

-# save filtered df
-filtered_morph_df.to_csv(os.path.join(output_dir, 'morph_df_per_core_filtered.csv'), index=False)
+# # save filtered df
+# filtered_morph_df.to_csv(os.path.join(output_dir, 'morph_df_per_core_filtered.csv'), index=False)

-# create version aggregated by timepoint
-filtered_morph_df_grouped = filtered_morph_df.groupby(['Tissue_ID', 'cell_type', 'morphology_feature', 'metric', 'subset'])
-filtered_morph_df_timepoint = filtered_morph_df_grouped['value'].agg([np.mean, np.std])
-filtered_morph_df_timepoint.reset_index(inplace=True)
-filtered_morph_df_timepoint = filtered_morph_df_timepoint.merge(harmonized_metadata.drop(['fov', 'MIBI_data_generated'], axis=1).drop_duplicates(), on='Tissue_ID')
+# # create version aggregated by timepoint
+# filtered_morph_df_grouped = filtered_morph_df.groupby(['Tissue_ID', 'cell_type', 'morphology_feature', 'metric', 'subset'])
+# filtered_morph_df_timepoint = filtered_morph_df_grouped['value'].agg([np.mean, np.std])
+# filtered_morph_df_timepoint.reset_index(inplace=True)
+# filtered_morph_df_timepoint = filtered_morph_df_timepoint.merge(harmonized_metadata.drop(['fov', 'MIBI_data_generated'], axis=1).drop_duplicates(), on='Tissue_ID')

-# save timepoint df
-filtered_morph_df_timepoint.to_csv(os.path.join(output_dir, 'morph_df_per_timepoint_filtered.csv'), index=False)
+# # save timepoint df
+# filtered_morph_df_timepoint.to_csv(os.path.join(output_dir, 'morph_df_per_timepoint_filtered.csv'), index=False)
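+# # the agg([np.mean, np.std]) step collapses replicate cores per Tissue_ID: e.g. two
+# # cores with values 0.2 and 0.4 (illustrative) become one row with mean=0.3 and
+# # std=0.1, since np.std defaults to the population std (ddof=0)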

-# remove redundant morphology features
-block1 = ['area', 'major_axis_length', 'minor_axis_length', 'perimeter', 'convex_area', 'equivalent_diameter']
+# # remove redundant morphology features
+# block1 = ['area', 'major_axis_length', 'minor_axis_length', 'perimeter', 'convex_area', 'equivalent_diameter']

-block2 = ['area_nuclear', 'major_axis_length_nuclear', 'minor_axis_length_nuclear', 'perimeter_nuclear', 'convex_area_nuclear', 'equivalent_diameter_nuclear']
+# block2 = ['area_nuclear', 'major_axis_length_nuclear', 'minor_axis_length_nuclear', 'perimeter_nuclear', 'convex_area_nuclear', 'equivalent_diameter_nuclear']

-block3 = ['eccentricity', 'major_axis_equiv_diam_ratio']
+# block3 = ['eccentricity', 'major_axis_equiv_diam_ratio']

-block4 = ['eccentricity_nuclear', 'major_axis_equiv_diam_ratio_nuclear', 'perim_square_over_area_nuclear']
+# block4 = ['eccentricity_nuclear', 'major_axis_equiv_diam_ratio_nuclear', 'perim_square_over_area_nuclear']

-deduped_morph_df = filtered_morph_df.loc[~filtered_morph_df.morphology_feature.isin(block1[1:] + block2[1:] + block3[1:] + block4[1:]), :]
+# deduped_morph_df = filtered_morph_df.loc[~filtered_morph_df.morphology_feature.isin(block1[1:] + block2[1:] + block3[1:] + block4[1:]), :]

-# only keep complex morphology features for cancer cells, remove everything except area and nc for others
-cancer_clusters = ['Cancer', 'Cancer_EMT', 'Cancer_Other', 'Cancer_CD56', 'Cancer_CK17',
- 'Cancer_Ecad', 'Cancer_Mono', 'Cancer_SMA', 'Cancer_Vim']
-basic_morph_features = ['area', 'area_nuclear', 'nc_ratio']
+# # only keep complex morphology features for cancer cells, remove everything except area and nc for others
+# cancer_clusters = ['Cancer', 'Cancer_EMT', 'Cancer_Other', 'Cancer_CD56', 'Cancer_CK17',
+# 'Cancer_Ecad', 'Cancer_Mono', 'Cancer_SMA', 'Cancer_Vim']
+# basic_morph_features = ['area', 'area_nuclear', 'nc_ratio']

-deduped_morph_df = deduped_morph_df.loc[~(~(deduped_morph_df.cell_type.isin(cancer_clusters)) & ~(deduped_morph_df.morphology_feature.isin(basic_morph_features))), :]
+# deduped_morph_df = deduped_morph_df.loc[~(~(deduped_morph_df.cell_type.isin(cancer_clusters)) & ~(deduped_morph_df.morphology_feature.isin(basic_morph_features))), :]

-# saved deduped
-deduped_morph_df.to_csv(os.path.join(output_dir, 'morph_df_per_core_filtered_deduped.csv'), index=False)
+# # save deduped df
+# deduped_morph_df.to_csv(os.path.join(output_dir, 'morph_df_per_core_filtered_deduped.csv'), index=False)

-# same for timepoints
-deduped_morph_df_timepoint = filtered_morph_df_timepoint.loc[~filtered_morph_df_timepoint.morphology_feature.isin(block1[1:] + block2[1:] + block3[1:] + block4[1:]), :]
-deduped_morph_df_timepoint = deduped_morph_df_timepoint.loc[~(~(deduped_morph_df_timepoint.cell_type.isin(cancer_clusters)) & ~(deduped_morph_df_timepoint.morphology_feature.isin(basic_morph_features))), :]
-deduped_morph_df_timepoint.to_csv(os.path.join(output_dir, 'morph_df_per_timepoint_filtered_deduped.csv'), index=False)
+# # same for timepoints
+# deduped_morph_df_timepoint = filtered_morph_df_timepoint.loc[~filtered_morph_df_timepoint.morphology_feature.isin(block1[1:] + block2[1:] + block3[1:] + block4[1:]), :]
+# deduped_morph_df_timepoint = deduped_morph_df_timepoint.loc[~(~(deduped_morph_df_timepoint.cell_type.isin(cancer_clusters)) & ~(deduped_morph_df_timepoint.morphology_feature.isin(basic_morph_features))), :]
+# deduped_morph_df_timepoint.to_csv(os.path.join(output_dir, 'morph_df_per_timepoint_filtered_deduped.csv'), index=False)
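+# # block1[1:] + block2[1:] + block3[1:] + block4[1:] lists every feature except the
+# # first of each correlated block, so of the blocked features only 'area',
+# # 'area_nuclear', 'eccentricity' and 'eccentricity_nuclear' survive the ~isin filter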

-# fiber analysis
-fiber_df = fiber_df.loc[:, ~fiber_df.columns.isin(['label', 'centroid-0', 'centroid-1'])]
+# # fiber analysis
+# fiber_df = fiber_df.loc[:, ~fiber_df.columns.isin(['label', 'centroid-0', 'centroid-1'])]

-fiber_df = fiber_df.merge(harmonized_metadata[['Tissue_ID', 'fov']], on=['fov'], how='left')
+# fiber_df = fiber_df.merge(harmonized_metadata[['Tissue_ID', 'fov']], on=['fov'], how='left')

-fiber_df_means = fiber_df.groupby(['Tissue_ID', 'fov']).agg(np.mean)
-fiber_df_means.reset_index(inplace=True)
+# fiber_df_means = fiber_df.groupby(['Tissue_ID', 'fov']).agg(np.mean)
+# fiber_df_means.reset_index(inplace=True)

-fiber_df_long = pd.melt(fiber_df_means, id_vars=['Tissue_ID', 'fov'], var_name='fiber_metric', value_name='value')
-fiber_df_long['fiber_metric'] = 'fiber_' + fiber_df_long['fiber_metric']
+# fiber_df_long = pd.melt(fiber_df_means, id_vars=['Tissue_ID', 'fov'], var_name='fiber_metric', value_name='value')
+# fiber_df_long['fiber_metric'] = 'fiber_' + fiber_df_long['fiber_metric']

-fiber_df_long.to_csv(os.path.join(output_dir, 'fiber_df_per_core.csv'), index=False)
+# fiber_df_long.to_csv(os.path.join(output_dir, 'fiber_df_per_core.csv'), index=False)

-# create version aggregated by timepoint
-fiber_df_grouped = fiber_df_long.groupby(['Tissue_ID', 'fiber_metric'])
-fiber_df_timepoint = fiber_df_grouped['value'].agg([np.mean, np.std])
-fiber_df_timepoint.reset_index(inplace=True)
-fiber_df_timepoint = fiber_df_timepoint.merge(harmonized_metadata.drop(['fov', 'MIBI_data_generated'], axis=1).drop_duplicates(), on='Tissue_ID')
+# # create version aggregated by timepoint
+# fiber_df_grouped = fiber_df_long.groupby(['Tissue_ID', 'fiber_metric'])
+# fiber_df_timepoint = fiber_df_grouped['value'].agg([np.mean, np.std])
+# fiber_df_timepoint.reset_index(inplace=True)
+# fiber_df_timepoint = fiber_df_timepoint.merge(harmonized_metadata.drop(['fov', 'MIBI_data_generated'], axis=1).drop_duplicates(), on='Tissue_ID')

-# save timepoint df
-fiber_df_timepoint.to_csv(os.path.join(output_dir, 'fiber_df_per_timepoint.csv'), index=False)
+# # save timepoint df
+# fiber_df_timepoint.to_csv(os.path.join(output_dir, 'fiber_df_per_timepoint.csv'), index=False)

-# for tiles, get max per image
-fiber_tile_df = fiber_tile_df.dropna()
-fiber_tile_df = fiber_tile_df.loc[:, ~fiber_tile_df.columns.isin(['pixel_density', 'tile_y', 'tile_x'])]
-fiber_tile_df.columns = fiber_tile_df.columns.str.replace('avg_', '')
-fiber_tile_df.columns = fiber_tile_df.columns.str.replace('fiber_', '')
-fiber_tile_df = fiber_tile_df.merge(harmonized_metadata[['Tissue_ID', 'fov']], on=['fov'], how='left')
+# # for tiles, get max per image
+# fiber_tile_df = fiber_tile_df.dropna()
+# fiber_tile_df = fiber_tile_df.loc[:, ~fiber_tile_df.columns.isin(['pixel_density', 'tile_y', 'tile_x'])]
+# fiber_tile_df.columns = fiber_tile_df.columns.str.replace('avg_', '')
+# fiber_tile_df.columns = fiber_tile_df.columns.str.replace('fiber_', '')
+# fiber_tile_df = fiber_tile_df.merge(harmonized_metadata[['Tissue_ID', 'fov']], on=['fov'], how='left')

-# group by fov
-fiber_tile_df_means = fiber_tile_df.groupby(['Tissue_ID', 'fov']).agg(np.max)
-fiber_tile_df_means.reset_index(inplace=True)
+# # group by Tissue_ID and fov, taking the max across tiles
+# fiber_tile_df_means = fiber_tile_df.groupby(['Tissue_ID', 'fov']).agg(np.max)
+# fiber_tile_df_means.reset_index(inplace=True)

-fiber_tile_df_long = pd.melt(fiber_tile_df_means, id_vars=['Tissue_ID', 'fov'], var_name='fiber_metric', value_name='value')
-fiber_tile_df_long['fiber_metric'] = 'max_fiber_' + fiber_tile_df_long['fiber_metric']
+# fiber_tile_df_long = pd.melt(fiber_tile_df_means, id_vars=['Tissue_ID', 'fov'], var_name='fiber_metric', value_name='value')
+# fiber_tile_df_long['fiber_metric'] = 'max_fiber_' + fiber_tile_df_long['fiber_metric']

-fiber_tile_df_long.to_csv(os.path.join(output_dir, 'fiber_df_per_tile.csv'), index=False)
+# fiber_tile_df_long.to_csv(os.path.join(output_dir, 'fiber_df_per_tile.csv'), index=False)

-# create version aggregated by timepoint
-fiber_tile_df_grouped = fiber_tile_df_long.groupby(['Tissue_ID', 'fiber_metric'])
-fiber_tile_df_timepoint = fiber_tile_df_grouped['value'].agg([np.mean, np.std])
-fiber_tile_df_timepoint.reset_index(inplace=True)
-fiber_tile_df_timepoint = 
fiber_tile_df_timepoint.merge(harmonized_metadata.drop(['fov', 'MIBI_data_generated'], axis=1).drop_duplicates(), on='Tissue_ID') +# # create version aggregated by timepoint +# fiber_tile_df_grouped = fiber_tile_df_long.groupby(['Tissue_ID', 'fiber_metric']) +# fiber_tile_df_timepoint = fiber_tile_df_grouped['value'].agg([np.mean, np.std]) +# fiber_tile_df_timepoint.reset_index(inplace=True) +# fiber_tile_df_timepoint = fiber_tile_df_timepoint.merge(harmonized_metadata.drop(['fov', 'MIBI_data_generated'], axis=1).drop_duplicates(), on='Tissue_ID') -# save timepoint df -fiber_tile_df_timepoint.to_csv(os.path.join(output_dir, 'fiber_df_per_tile_timepoint.csv'), index=False) +# # save timepoint df +# fiber_tile_df_timepoint.to_csv(os.path.join(output_dir, 'fiber_df_per_tile_timepoint.csv'), index=False) diff --git a/python_files/utils.py b/python_files/utils.py index 24a319c..2ea1f8a 100644 --- a/python_files/utils.py +++ b/python_files/utils.py @@ -250,16 +250,6 @@ def occupancy_df_helper(cell_table, cluster_col_name, drop_cols, result_name, su # get all the FOV names fov_names = cell_table_small["fov"].unique() - # if a population subset is specified, first validate then truncate the cell table accordingly - if pop_subset: - verify_in_list( - specified_populations=pop_subset, - valid_populations=cell_table_small[cluster_col_name].unique() - ) - # otherwise, use all possible subsets as defined by pop_col - else: - pop_subset = cell_table_small[cluster_col_name].unique() - # define the tile size in pixels for each FOV tile_size = max_image_size // tiles_per_row_col @@ -273,7 +263,7 @@ def occupancy_df_helper(cell_table, cluster_col_name, drop_cols, result_name, su cell_table_small["fov"].unique(), cell_table_small["tile_row"].unique(), cell_table_small["tile_col"].unique(), - cell_table_small[pop_col].unique() + cell_table_small[cluster_col_name].unique() ], names=["fov", "tile_row", "tile_col", cluster_col_name] ).to_frame(index=False) @@ -296,8 +286,8 @@ def occupancy_df_helper(cell_table, cluster_col_name, drop_cols, result_name, su # print(occupancy_counts[["tile_row", "tile_col"]].drop_duplicates()) # zscore the tile counts across the cohort grouped by the cell types - occupancy_counts["zscore_occupancy"] = occupancy_stats.groupby( - [pop_col] + occupancy_counts["zscore_occupancy"] = occupancy_counts.groupby( + [cluster_col_name] )["occupancy"].transform(lambda x: zscore(x, ddof=0)) # mark tiles as positive depending on the positive_threshold value @@ -309,7 +299,7 @@ def occupancy_df_helper(cell_table, cluster_col_name, drop_cols, result_name, su occupancy_zscore_groupby.append(subset_col) occupancy_counts_grouped: pd.DataFrame = occupancy_counts.groupby(occupancy_zscore_groupby).apply( lambda row: row["is_positive"].sum() - ).reset_ondex("total_positive_tiles") + ).reset_index(name="total_positive_tiles") # get the max image size, this will determine how many tiles there are when finding percentages occupancy_counts_grouped["image_size"] = occupancy_counts_grouped.apply(