From b6c088a7f31777c25bf60cb7e84cc495e0672ad1 Mon Sep 17 00:00:00 2001 From: alex-l-kong Date: Wed, 14 Feb 2024 13:45:31 -0800 Subject: [PATCH 01/14] Initial commit of occupancy statistic visualizations --- python_files/generate_supplementary_plots.py | 18 +++- python_files/supplementary_plot_helpers.py | 93 ++++++++++++++++++++ 2 files changed, 110 insertions(+), 1 deletion(-) diff --git a/python_files/generate_supplementary_plots.py b/python_files/generate_supplementary_plots.py index fa7bf23..dcf3134 100644 --- a/python_files/generate_supplementary_plots.py +++ b/python_files/generate_supplementary_plots.py @@ -297,7 +297,7 @@ cell_table = pd.read_csv( os.path.join(ANALYSIS_DIR, "combined_cell_table_normalized_cell_labels_updated.csv") ) -functional_marker_viz_dir = os.path.join(SUPPLEMENTARY_FIG_DIR, "functional_marker_dist_thresholds_test") +functional_marker_viz_dir = os.path.join(SUPPLEMENTARY_FIG_DIR, "functional_marker_dist_thresholds") if not os.path.exists(functional_marker_viz_dir): os.makedirs(functional_marker_viz_dir) @@ -431,3 +431,19 @@ # Feature extraction +# Occupancy statistics +# TODO: make a constant in supplementary_plot_helpers +cell_table = pd.read_csv( + os.path.join(ANALYSIS_DIR, "combined_cell_table_normalized_cell_labels_updated.csv") +) +occupancy_stats_viz_dir = os.path.join(SUPPLEMENTARY_FIG_DIR, "occupancy_stats") +if not os.path.exists(occupancy_stats_viz_dir): + os.makedirs(occupancy_stats_viz_dir) + +# massive GridSearch +for tiles_per_row_col in [2, 4, 8, 16, 32, 64, 128]: + for positive_threshold in np.arange(1, 26): + supplementary_plot_helpers.visualize_occupancy_statistics( + cell_table, occupancy_stats_viz_dir, tiles_per_row_col=tiles_per_row_col, + positive_threshold=positive_threshold, + ) diff --git a/python_files/supplementary_plot_helpers.py b/python_files/supplementary_plot_helpers.py index 34f2b11..5eb12cb 100644 --- a/python_files/supplementary_plot_helpers.py +++ b/python_files/supplementary_plot_helpers.py @@ -1,4 +1,6 @@ +import itertools import math +import matplotlib.colors as mcolors import matplotlib.pyplot as plt import natsort as ns import numpy as np @@ -337,3 +339,94 @@ def stitch_before_after_norm( pre_norm_tiled.save(pre_norm_stitched_path) post_norm_tiled.save(post_norm_stitched_path) + + +# occupancy statistic helpers +def visualize_occupancy_statistics( + cell_table: pd.DataFrame, save_dir: Union[str, pathlib.Path]max_image_size: int = 2048, + tiles_per_row_col: int = 8, positive_threshold: int = 5 +): + """Define occupancy statistics over a cohort, and visualize distribution of positive + tile counts across FOV. 
+ + Args: + cell_table (pd.DataFrame): + The cell table associated with the cohort, need to contain all FOVs + save_dir (Union[str, pathlib.Path]): + Directory to save the visualizations in + max_image_size (int): + The max image size to load in + tiles_per_row_col (int): + The row/col dims of tiles to define over each image + positive_threshold (int): + The cell count in a tile required for positivity + """ + # get all the FOV names + fov_names: np.ndarray = cell_table["fov"].unique() + + # define the tile size in pixels + tile_size: int = max_image_size // tiles_per_row_col + + # define the occupancy_stats array + occupancy_stats: xr.DataArray = xr.DataArray( + np.zeros((len(fov_names), tile_size, tile_size), dtype=np.int16), + coords=[fov_names, np.zeros(tile_size), np.zeros(tile_size)], + dims=["fov", "x", "y"] + ) + + # for each FOV, update the corresponding tile count + for fov in fov_names: + cell_table_fov: pd.DataFrame = cell_table[cell_table["fov"] == fov].copy() + cell_table_fov["tile_row"] = cell_table_fov["centroid-1"] // tile_size + cell_table_row["tile_col"] = cell_table_fov["centroid-0"] // tile_size + fov_occupancy_table: pd.DataFrame = cell_table_fov.groupby( + ["tile_row", "tile_col"] + ).size().reset_index(name="occupancy") + occupancy_stats.loc[ + fov, fov_occupancy_table["tile_row"].values, fov_occupancy_table["tile_col"].values + ] += fov_occupancy_table["occupancy"].values + + # define the tiles that are positive based on threshold + occupancy_stats_positivity: xr.DataArray = occupancy_stats > positive_threshold + + # count number of positive tiles per FOV + fov_positive_tiles: Dict[str, int] = occupancy_stats_positivity.sum( + dim=["tile_row", "tile_col"] + ).to_dict()["data"] + + # 1. visualize histogram of positive tile counts across cohort + fig, axs = plt.subplots( + 3, + 1, + figsize=(10, 10) + ) + axs[0].hist(list(fov_positive_tiles.values())) + + # 2. visualize heatmap of average positivity of tiles across FOVs + cmap_positivity = mcolors.LinearSegmentedColormap.from_list("", ["white", "red"]) + c_positivity = axs[1].imshow( + occupancy_stats_positivity.mean(dim="fov").values, + cmap=cmap_positivity, + aspect="auto", + origin="lower" + ) + for x, y in itertools.pairwise(occupancy_stats_positivity.tile_row.values) + axs[1].text(y, x, f"R{x}C{y}", ha="center", va="center", color="black") + fig.colorbar(c_positivity, ax=axs[1]) + + # 3. 
visualize heatmap of average number of counts across FOVs + cmap_counts = mcolors.LinearSegmentedColormap.from_list("", ["white", "red"]) + c_counts = axs[2].imshow( + occupancy_stats.mean(dim="fov").values, + cmap=cmap_counts, + aspect="auto" + ) + for x, y in itertools.pairwise(occupancy_stats.tile_row.values) + axs[2].text(y, x, f"R{x}C{y}", ha="center", va="center", color="black") + fig.colorbar(c_positivity, ax=axs[2]) + + # save the figure to save_dir + fig.savefig( + pathlib.Path(save_dir) / f"occupancy_stats_ts{tiles_per_row_col}_pt{positive_threshold}.png" + dpi=300 + ) From 73e9692587ec066a3e6ebc6d461055e9c75377d0 Mon Sep 17 00:00:00 2001 From: alex-l-kong Date: Wed, 14 Feb 2024 13:50:50 -0800 Subject: [PATCH 02/14] Missed a comma --- python_files/supplementary_plot_helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python_files/supplementary_plot_helpers.py b/python_files/supplementary_plot_helpers.py index 5eb12cb..d9add96 100644 --- a/python_files/supplementary_plot_helpers.py +++ b/python_files/supplementary_plot_helpers.py @@ -343,7 +343,7 @@ def stitch_before_after_norm( # occupancy statistic helpers def visualize_occupancy_statistics( - cell_table: pd.DataFrame, save_dir: Union[str, pathlib.Path]max_image_size: int = 2048, + cell_table: pd.DataFrame, save_dir: Union[str, pathlib.Path], max_image_size: int = 2048, tiles_per_row_col: int = 8, positive_threshold: int = 5 ): """Define occupancy statistics over a cohort, and visualize distribution of positive From 8400c3c9a99faf4374e46c89bc0e6f987534a197 Mon Sep 17 00:00:00 2001 From: alex-l-kong Date: Wed, 14 Feb 2024 13:51:09 -0800 Subject: [PATCH 03/14] Missed a few colons --- python_files/supplementary_plot_helpers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python_files/supplementary_plot_helpers.py b/python_files/supplementary_plot_helpers.py index d9add96..a3d1b63 100644 --- a/python_files/supplementary_plot_helpers.py +++ b/python_files/supplementary_plot_helpers.py @@ -410,7 +410,7 @@ def visualize_occupancy_statistics( aspect="auto", origin="lower" ) - for x, y in itertools.pairwise(occupancy_stats_positivity.tile_row.values) + for x, y in itertools.pairwise(occupancy_stats_positivity.tile_row.values): axs[1].text(y, x, f"R{x}C{y}", ha="center", va="center", color="black") fig.colorbar(c_positivity, ax=axs[1]) @@ -421,7 +421,7 @@ def visualize_occupancy_statistics( cmap=cmap_counts, aspect="auto" ) - for x, y in itertools.pairwise(occupancy_stats.tile_row.values) + for x, y in itertools.pairwise(occupancy_stats.tile_row.values): axs[2].text(y, x, f"R{x}C{y}", ha="center", va="center", color="black") fig.colorbar(c_positivity, ax=axs[2]) From fab6b08d741fc9a6f367d9945d309a0925397a2b Mon Sep 17 00:00:00 2001 From: alex-l-kong Date: Wed, 14 Feb 2024 13:52:09 -0800 Subject: [PATCH 04/14] Forgot a comma --- python_files/supplementary_plot_helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python_files/supplementary_plot_helpers.py b/python_files/supplementary_plot_helpers.py index a3d1b63..40644f5 100644 --- a/python_files/supplementary_plot_helpers.py +++ b/python_files/supplementary_plot_helpers.py @@ -427,6 +427,6 @@ def visualize_occupancy_statistics( # save the figure to save_dir fig.savefig( - pathlib.Path(save_dir) / f"occupancy_stats_ts{tiles_per_row_col}_pt{positive_threshold}.png" + pathlib.Path(save_dir) / f"occupancy_stats_{tiles_per_row_col}_{positive_threshold}.png", dpi=300 ) From dd271fc53e3f9bdc9f2e4f5ef46e4b6e217da0b2 Mon Sep 
17 00:00:00 2001 From: alex-l-kong Date: Thu, 15 Feb 2024 13:11:34 -0800 Subject: [PATCH 05/14] More efficient occupancy statistic calculations, and don't overlay tile names on heatmap --- python_files/generate_supplementary_plots.py | 821 ++++++++++--------- python_files/supplementary_plot_helpers.py | 69 +- 2 files changed, 455 insertions(+), 435 deletions(-) diff --git a/python_files/generate_supplementary_plots.py b/python_files/generate_supplementary_plots.py index dcf3134..1215dc1 100644 --- a/python_files/generate_supplementary_plots.py +++ b/python_files/generate_supplementary_plots.py @@ -6,7 +6,7 @@ import pandas as pd import matplotlib from ark.utils.plot_utils import cohort_cluster_plot -from toffy import qc_comp, qc_metrics_plots +# from toffy import qc_comp, qc_metrics_plots from alpineer import io_utils @@ -24,411 +24,411 @@ SUPPLEMENTARY_FIG_DIR = "/Volumes/Shared/Noah Greenwald/TONIC_Cohort/supplementary_figs" -# Panel validation - - -# ROI selection - - -# QC -qc_metrics = ["Non-zero mean intensity"] -channel_exclude = ["chan_39", "chan_45", "CD11c_nuc_exclude", "CD11c_nuc_exclude_update", - "FOXP3_nuc_include", "FOXP3_nuc_include_update", "CK17_smoothed", - "FOXP3_nuc_exclude_update", "chan_48", "chan_141", "chan_115", "LAG3"] - -## FOV spatial location -cohort_path = "/Volumes/Shared/Noah Greenwald/TONIC_Cohort/image_data/samples" -qc_tma_metrics_dir = "/Volumes/Shared/Noah Greenwald/TONIC_Acquisition/qc_metrics/qc_tma_metrics" -if not os.path.exists(qc_tma_metrics_dir): - os.makedirs(qc_tma_metrics_dir) - -fovs = io_utils.list_folders(cohort_path) -tmas = list(set([fov.split('_R')[0] for fov in fovs])) - -qc_tmas = qc_comp.QCTMA( - qc_metrics=qc_metrics, - cohort_path=cohort_path, - metrics_dir=qc_tma_metrics_dir, -) - -qc_tmas.compute_qc_tma_metrics(tmas=tmas) -qc_tmas.qc_tma_metrics_zscore(tmas=tmas, channel_exclude=channel_exclude) -qc_metrics_plots.qc_tmas_metrics_plot(qc_tmas=qc_tmas, tmas=tmas, save_figure=True, dpi=300) - -## longitudinal controls -control_path = "/Volumes/Shared/Noah Greenwald/TONIC_Cohort/image_data/controls" -qc_control_metrics_dir = "/Volumes/Shared/Noah Greenwald/TONIC_Acquisition/qc_metrics/qc_longitudinal_control" -if not os.path.exists(qc_control_metrics_dir): - os.makedirs(qc_control_metrics_dir) - -folders = io_utils.list_folders(control_path, "TMA3_") -control_substrs = [name.split("_")[2] + '_' + name.split("_")[3] if len(name.split("_")) == 4 - else name.split("_")[2] + '_' + name.split("_")[3]+'_' + name.split("_")[4] - for name in folders] - -all_folders = io_utils.list_folders(control_path) -for i, control in enumerate(control_substrs): - control_sample_name = control - print(control) - if control == 'tonsil_bottom': - fovs = [folder for folder in all_folders if control in folder and len(folder) <= 25] - else: - fovs = [folder for folder in all_folders if control in folder] - - qc_control = qc_comp.QCControlMetrics( - qc_metrics=qc_metrics, - cohort_path=control_path, - metrics_dir=qc_control_metrics_dir, - ) - - qc_control.compute_control_qc_metrics( - control_sample_name=control_sample_name, - fovs=fovs, - channel_exclude=channel_exclude, - channel_include=None, - ) - - qc_metrics_plots.longitudinal_control_heatmap( - qc_control=qc_control, control_sample_name=control_sample_name, save_figure=True, dpi=300 - ) - -dfs = [] -for control in control_substrs: - df = pd.read_csv(os.path.join(qc_control_metrics_dir, f"{control}_combined_nonzero_mean_stats.csv")) - df['fov'] = [i.replace('_' + control, '') for i in list(df['fov'])] - 
log2_norm_df: pd.DataFrame = df.pivot( - index="channel", columns="fov", values="Non-zero mean intensity" - ).transform(func=lambda row: np.log2(row / row.mean()), axis=1) - if control != 'tonsil_bottom_duplicate1': - dup_col = [col for col in log2_norm_df.columns if 'duplicate1' in col] - log2_norm_df = log2_norm_df.drop(columns=dup_col) if dup_col else log2_norm_df - - mean_t_df: pd.DataFrame = ( - log2_norm_df.mean(axis=0) - .to_frame(name="mean") - .transpose() - .sort_values(by="mean", axis=1) - ) - transformed_df: pd.DataFrame = pd.concat( - objs=[log2_norm_df, mean_t_df] - ).sort_values(by="mean", axis=1, inplace=False) - transformed_df.rename_axis("channel", axis=0, inplace=True) - transformed_df.rename_axis("fov", axis=1, inplace=True) - - dfs.append(transformed_df) -all_data = pd.concat(dfs).replace([np.inf, -np.inf], 0, inplace=True) -all_data = all_data.groupby(['channel']).mean() -all_data = all_data.sort_values(by="mean", axis=1, inplace=False).round(2) - - -fig = plt.figure(figsize=(12,12), dpi=300) -fig.set_layout_engine(layout="constrained") -gs = gridspec.GridSpec(nrows=2, ncols=1, figure=fig, height_ratios=[len(all_data.index) - 1, 1]) -_norm = Normalize(vmin=-1, vmax=1) -_cmap = sns.color_palette("vlag", as_cmap=True) -fig.suptitle(f"Average per TMA - QC: Non-zero Mean Intensity ") - -annotation_kws = { - "horizontalalignment": "center", - "verticalalignment": "center", - "fontsize": 8, -} - -ax_heatmap = fig.add_subplot(gs[0, 0]) -sns.heatmap( - data=all_data[~all_data.index.isin(["mean"])], - ax=ax_heatmap, - linewidths=1, - linecolor="black", - cbar_kws={"shrink": 0.5}, - annot=True, - annot_kws=annotation_kws, - xticklabels=False, - norm=_norm, - cmap=_cmap, -) - -ax_heatmap.collections[0].colorbar.ax.set_title(r"$\log_2(QC)$") -ax_heatmap.set_yticks( - ticks=ax_heatmap.get_yticks(), - labels=ax_heatmap.get_yticklabels(), - rotation=0, -) -ax_heatmap.set_xlabel(None) - -ax_avg = fig.add_subplot(gs[1, 0]) -sns.heatmap( - data=all_data[all_data.index.isin(["mean"])], - ax=ax_avg, - linewidths=1, - linecolor="black", - annot=True, - annot_kws=annotation_kws, - fmt=".2f", - cmap=ListedColormap(["white"]), - cbar=False, -) -ax_avg.set_yticks( - ticks=ax_avg.get_yticks(), - labels=["Mean"], - rotation=0, -) -ax_avg.set_xticks( - ticks=ax_avg.get_xticks(), - labels=ax_avg.get_xticklabels(), - rotation=45, - ha="right", - rotation_mode="anchor", -) -ax_heatmap.set_ylabel("Channel") -ax_avg.set_xlabel("FOV") - -fig.savefig(fname=os.path.join(qc_control_metrics_dir, "figures/log2_avgs.png"), dpi=300, - bbox_inches="tight") - - -# Image processing -## show a run with images stitched in acquisition order pre- and post-normalization -acquisition_order_viz_dir = os.path.join(SUPPLEMENTARY_FIG_DIR, "acquisition_order") -if not os.path.exists(acquisition_order_viz_dir): - os.makedirs(acquisition_order_viz_dir) - -run_name = "2022-01-14_TONIC_TMA2_run1" -pre_norm_dir = "/Volumes/Shared/Noah Greenwald/TONIC_Acquisition/rosetta" -post_norm_dir = "/Volumes/Shared/Noah Greenwald/TONIC_Acquisition/normalized" -save_dir = "/Volumes/Shared/Noah Greenwald/TONIC_Cohort/supplementary_figs" - -# NOTE: image not scaled up, this happens in Photoshop -supplementary_plot_helpers.stitch_before_after_norm( - pre_norm_dir, post_norm_dir, acquisition_order_viz_dir, run_name, - "H3K9ac", pre_norm_subdir="normalized", padding=0, step=1 -) - - -# Cell identification and classification -cell_table = pd.read_csv('/Volumes/Shared/Noah Greenwald/TONIC_Cohort/analysis_files/cell_table_clusters.csv') 
-cluster_order = {'Cancer': 0, 'Cancer_EMT': 1, 'Cancer_Other': 2, 'CD4T': 3, 'CD8T': 4, 'Treg': 5, - 'T_Other': 6, 'B': 7, 'NK': 8, 'M1_Mac': 9, 'M2_Mac': 10, 'Mac_Other': 11, - 'Monocyte': 12, 'APC': 13, 'Mast': 14, 'Neutrophil': 15, 'Fibroblast': 16, - 'Stroma': 17, 'Endothelium': 18, 'Other': 19, 'Immune_Other': 20} -cell_table = cell_table.sort_values(by=['cell_cluster'], key=lambda x: x.map(cluster_order)) - -save_dir = '/Volumes/Shared/Noah Greenwald/TONIC_Cohort/supplementary_figs' - -## cell cluster counts -sns.histplot(data=cell_table, x="cell_cluster") -sns.despine() -plt.title("Cell Cluster Counts") -plt.xlabel("Cell Cluster") -plt.xticks(rotation=75) -plt.tight_layout() -plt.savefig(os.path.join(save_dir, "cells_per_cluster.png"), dpi=300) - -## fov cell counts -cluster_counts = np.unique(cell_table.fov, return_counts=True)[1] -plt.figure(figsize=(8, 6)) -g = sns.histplot(data=cluster_counts, kde=True) -sns.despine() -plt.title("Histogram of Cell Counts per Image") -plt.xlabel("Number of Cells in an Image") -plt.tight_layout() -plt.savefig(os.path.join(save_dir, "cells_per_fov.png"), dpi=300) - -## cell type composition by tissue location of met -meta_data = pd.read_csv('/Volumes/Shared/Noah Greenwald/TONIC_Cohort/analysis_files/harmonized_metadata.csv') -meta_data = meta_data[['fov', 'Patient_ID', 'Timepoint', 'Localization']] - -all_data = cell_table.merge(meta_data, on=['fov'], how='left') -base_data = all_data[all_data.Timepoint == 'baseline'] - -all_locals = np.unique(base_data.Localization) -dfs = [] -for region in all_locals: - localization_data = base_data[base_data.Localization == region] - - df = localization_data.groupby("cell_cluster_broad").count().reset_index() - df = df.set_index('cell_cluster_broad').transpose() - sub_df = df.iloc[:1].reset_index(drop=True) - sub_df.insert(0, "Localization", [region]) - sub_df['Localization'] = sub_df['Localization'].map(str) - sub_df = sub_df.set_index('Localization') - - dfs.append(sub_df) -prop_data = pd.concat(dfs).transform(func=lambda row: row / row.sum(), axis=1) - -color_map = {'cell_cluster_broad': ['Cancer', 'Stroma', 'Mono_Mac', 'T','Other', 'Granulocyte', 'NK', 'B'], - 'color': ['dimgrey', 'darksalmon', 'red', 'navajowhite', 'yellowgreen', 'aqua', 'dodgerblue', 'darkviolet']} -prop_data = prop_data[color_map['cell_cluster_broad']] - -colors = color_map['color'] -prop_data.plot(kind='bar', stacked=True, color=colors) -plt.ticklabel_format(style='plain', useOffset=False, axis='y') -plt.gca().set_ylabel("Cell Proportions") -plt.gca().set_xlabel("Tissue Location") -plt.xticks(rotation=30) -plt.title("Cell Type Composition by Tissue Location") -plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.) 
-plt.tight_layout() -plt.savefig(os.path.join(save_dir, "cell_props_by_tissue_loc.png"), dpi=300) - -## colored cell cluster masks from random subset of 20 FOVs -random.seed(13) -seg_dir = '/Volumes/Shared/Noah Greenwald/TONIC_Cohort/segmentation_data/deepcell_output' - -all_fovs = list(cell_table['fov'].unique()) -fovs = random.sample(all_fovs, 20) -cell_table_subset = cell_table[cell_table.fov.isin(fovs)] - -cohort_cluster_plot( - fovs=fovs, - seg_dir=seg_dir, - save_dir=save_dir, - cell_data=cell_table_subset, - erode=True, - fov_col='fov', - label_col='label', - cluster_col='cell_cluster_broad', - seg_suffix="_whole_cell.tiff", - cmap=color_map, - display_fig=False, -) - -# Functional marker thresholding -cell_table = pd.read_csv( - os.path.join(ANALYSIS_DIR, "combined_cell_table_normalized_cell_labels_updated.csv") -) -functional_marker_viz_dir = os.path.join(SUPPLEMENTARY_FIG_DIR, "functional_marker_dist_thresholds") -if not os.path.exists(functional_marker_viz_dir): - os.makedirs(functional_marker_viz_dir) - -marker_info = { - "Ki67": { - "populations": ["Cancer", "Mast"], - "threshold": 0.002, - "x_range": (0, 0.012), - "x_ticks": np.array([0, 0.004, 0.008, 0.012]), - "x_tick_labels": np.array([0, 0.004, 0.008, 0.012]), - }, - "CD38": { - "populations": ["Endothelium", "Cancer_EMT"], - "threshold": 0.004, - "x_range": (0, 0.02), - "x_ticks": np.array([0, 0.005, 0.01, 0.015, 0.02]), - "x_tick_labels": np.array([0, 0.005, 0.01, 0.015, 0.02]), - }, - "CD45RB": { - "populations": ["CD4T", "Stroma"], - "threshold": 0.001, - "x_range": (0, 0.015), - "x_ticks": np.array([0, 0.005, 0.010, 0.015]), - "x_tick_labels": np.array([0, 0.005, 0.010, 0.015]) - }, - "CD45RO": { - "populations": ["CD4T", "Fibroblast"], - "threshold": 0.002, - "x_range": (0, 0.02), - "x_ticks": np.array([0, 0.005, 0.01, 0.015, 0.02]), - "x_tick_labels": np.array([0, 0.005, 0.01, 0.015, 0.02]) - }, - "CD57": { - "populations": ["CD8T", "B"], - "threshold": 0.002, - "x_range": (0, 0.006), - "x_ticks": np.array([0, 0.002, 0.004, 0.006]), - "x_tick_labels": np.array([0, 0.002, 0.004, 0.006]) - }, - "CD69": { - "populations": ["Treg", "Cancer"], - "threshold": 0.002, - "x_range": (0, 0.008), - "x_ticks": np.array([0, 0.002, 0.004, 0.006, 0.008]), - "x_tick_labels": np.array([0, 0.002, 0.004, 0.006, 0.008]) - }, - "GLUT1": { - "populations": ["Cancer_EMT", "M2_Mac"], - "threshold": 0.002, - "x_range": (0, 0.02), - "x_ticks": np.array([0, 0.005, 0.01, 0.015, 0.02]), - "x_tick_labels": np.array([0, 0.005, 0.01, 0.015, 0.02]) - }, - "IDO": { - "populations": ["APC", "M1_Mac"], - "threshold": 0.001, - "x_range": (0, 0.003), - "x_ticks": np.array([0, 0.001, 0.002, 0.003]), - "x_tick_labels": np.array([0, 0.001, 0.002, 0.003]) - }, - "PD1": { - "populations": ["CD8T", "Stroma"], - "threshold": 0.0005, - "x_range": (0, 0.002), - "x_ticks": np.array([0, 0.0005, 0.001, 0.0015, 0.002]), - "x_tick_labels": np.array([0, 0.0005, 0.001, 0.0015, 0.002]) - }, - "PDL1": { - "populations": ["Cancer", "Stroma"], - "threshold": 0.001, - "x_range": (0, 0.003), - "x_ticks": np.array([0, 0.001, 0.002, 0.003]), - "x_tick_labels": np.array([0, 0.001, 0.002, 0.003]), - }, - "HLA1": { - "populations": ["APC", "Stroma"], - "threshold": 0.001, - "x_range": (0, 0.025), - "x_ticks": np.array([0, 0.0125, 0.025]), - "x_tick_labels": np.array([0, 0.0125, 0.025]) - }, - "HLADR": { - "populations": ["APC", "Neutrophil"], - "threshold": 0.001, - "x_range": (0, 0.025), - "x_ticks": np.array([0, 0.0125, 0.025]), - "x_tick_labels": np.array([0, 0.0125, 0.025]) - 
}, - "TBET": { - "populations": ["NK", "B"], - "threshold": 0.0015, - "x_range": (0, 0.0045), - "x_ticks": np.array([0, 0.0015, 0.003, 0.0045]), - "x_tick_labels": np.array([0, 0.0015, 0.003, 0.0045]) - }, - "TCF1": { - "populations": ["CD4T", "M1_Mac"], - "threshold": 0.001, - "x_range": (0, 0.003), - "x_ticks": np.array([0, 0.001, 0.002, 0.003]), - "x_tick_labels": np.array([0, 0.001, 0.002, 0.003]) - }, - "TIM3": { - "populations": ["Monocyte", "Endothelium"], - "threshold": 0.001, - "x_range": (0, 0.004), - "x_ticks": np.array([0, 0.001, 0.002, 0.003, 0.004]), - "x_tick_labels": np.array([0, 0.001, 0.002, 0.003, 0.004]) - }, - "Vim": { - "populations": ["Endothelium", "B"], - "threshold": 0.002, - "x_range": (0, 0.06), - "x_ticks": np.array([0, 0.02, 0.04, 0.06]), - "x_tick_labels": np.array([0, 0.02, 0.04, 0.06]) - }, - "Fe": { - "populations": ["Fibroblast", "Cancer"], - "threshold": 0.1, - "x_range": (0, 0.3), - "x_ticks": np.array([0, 0.1, 0.2, 0.3]), - "x_tick_labels": np.array([0, 0.1, 0.2, 0.3]), - } -} -supplementary_plot_helpers.functional_marker_thresholding( - cell_table, functional_marker_viz_dir, marker_info=marker_info, - figsize=(20, 40) -) - - -# Feature extraction +# # Panel validation + + +# # ROI selection + + +# # QC +# qc_metrics = ["Non-zero mean intensity"] +# channel_exclude = ["chan_39", "chan_45", "CD11c_nuc_exclude", "CD11c_nuc_exclude_update", +# "FOXP3_nuc_include", "FOXP3_nuc_include_update", "CK17_smoothed", +# "FOXP3_nuc_exclude_update", "chan_48", "chan_141", "chan_115", "LAG3"] + +# ## FOV spatial location +# cohort_path = "/Volumes/Shared/Noah Greenwald/TONIC_Cohort/image_data/samples" +# qc_tma_metrics_dir = "/Volumes/Shared/Noah Greenwald/TONIC_Acquisition/qc_metrics/qc_tma_metrics" +# if not os.path.exists(qc_tma_metrics_dir): +# os.makedirs(qc_tma_metrics_dir) + +# fovs = io_utils.list_folders(cohort_path) +# tmas = list(set([fov.split('_R')[0] for fov in fovs])) + +# qc_tmas = qc_comp.QCTMA( +# qc_metrics=qc_metrics, +# cohort_path=cohort_path, +# metrics_dir=qc_tma_metrics_dir, +# ) + +# qc_tmas.compute_qc_tma_metrics(tmas=tmas) +# qc_tmas.qc_tma_metrics_zscore(tmas=tmas, channel_exclude=channel_exclude) +# qc_metrics_plots.qc_tmas_metrics_plot(qc_tmas=qc_tmas, tmas=tmas, save_figure=True, dpi=300) + +# ## longitudinal controls +# control_path = "/Volumes/Shared/Noah Greenwald/TONIC_Cohort/image_data/controls" +# qc_control_metrics_dir = "/Volumes/Shared/Noah Greenwald/TONIC_Acquisition/qc_metrics/qc_longitudinal_control" +# if not os.path.exists(qc_control_metrics_dir): +# os.makedirs(qc_control_metrics_dir) + +# folders = io_utils.list_folders(control_path, "TMA3_") +# control_substrs = [name.split("_")[2] + '_' + name.split("_")[3] if len(name.split("_")) == 4 +# else name.split("_")[2] + '_' + name.split("_")[3]+'_' + name.split("_")[4] +# for name in folders] + +# all_folders = io_utils.list_folders(control_path) +# for i, control in enumerate(control_substrs): +# control_sample_name = control +# print(control) +# if control == 'tonsil_bottom': +# fovs = [folder for folder in all_folders if control in folder and len(folder) <= 25] +# else: +# fovs = [folder for folder in all_folders if control in folder] + +# qc_control = qc_comp.QCControlMetrics( +# qc_metrics=qc_metrics, +# cohort_path=control_path, +# metrics_dir=qc_control_metrics_dir, +# ) + +# qc_control.compute_control_qc_metrics( +# control_sample_name=control_sample_name, +# fovs=fovs, +# channel_exclude=channel_exclude, +# channel_include=None, +# ) + +# 
qc_metrics_plots.longitudinal_control_heatmap( +# qc_control=qc_control, control_sample_name=control_sample_name, save_figure=True, dpi=300 +# ) + +# dfs = [] +# for control in control_substrs: +# df = pd.read_csv(os.path.join(qc_control_metrics_dir, f"{control}_combined_nonzero_mean_stats.csv")) +# df['fov'] = [i.replace('_' + control, '') for i in list(df['fov'])] +# log2_norm_df: pd.DataFrame = df.pivot( +# index="channel", columns="fov", values="Non-zero mean intensity" +# ).transform(func=lambda row: np.log2(row / row.mean()), axis=1) +# if control != 'tonsil_bottom_duplicate1': +# dup_col = [col for col in log2_norm_df.columns if 'duplicate1' in col] +# log2_norm_df = log2_norm_df.drop(columns=dup_col) if dup_col else log2_norm_df + +# mean_t_df: pd.DataFrame = ( +# log2_norm_df.mean(axis=0) +# .to_frame(name="mean") +# .transpose() +# .sort_values(by="mean", axis=1) +# ) +# transformed_df: pd.DataFrame = pd.concat( +# objs=[log2_norm_df, mean_t_df] +# ).sort_values(by="mean", axis=1, inplace=False) +# transformed_df.rename_axis("channel", axis=0, inplace=True) +# transformed_df.rename_axis("fov", axis=1, inplace=True) + +# dfs.append(transformed_df) +# all_data = pd.concat(dfs).replace([np.inf, -np.inf], 0, inplace=True) +# all_data = all_data.groupby(['channel']).mean() +# all_data = all_data.sort_values(by="mean", axis=1, inplace=False).round(2) + + +# fig = plt.figure(figsize=(12,12), dpi=300) +# fig.set_layout_engine(layout="constrained") +# gs = gridspec.GridSpec(nrows=2, ncols=1, figure=fig, height_ratios=[len(all_data.index) - 1, 1]) +# _norm = Normalize(vmin=-1, vmax=1) +# _cmap = sns.color_palette("vlag", as_cmap=True) +# fig.suptitle(f"Average per TMA - QC: Non-zero Mean Intensity ") + +# annotation_kws = { +# "horizontalalignment": "center", +# "verticalalignment": "center", +# "fontsize": 8, +# } + +# ax_heatmap = fig.add_subplot(gs[0, 0]) +# sns.heatmap( +# data=all_data[~all_data.index.isin(["mean"])], +# ax=ax_heatmap, +# linewidths=1, +# linecolor="black", +# cbar_kws={"shrink": 0.5}, +# annot=True, +# annot_kws=annotation_kws, +# xticklabels=False, +# norm=_norm, +# cmap=_cmap, +# ) + +# ax_heatmap.collections[0].colorbar.ax.set_title(r"$\log_2(QC)$") +# ax_heatmap.set_yticks( +# ticks=ax_heatmap.get_yticks(), +# labels=ax_heatmap.get_yticklabels(), +# rotation=0, +# ) +# ax_heatmap.set_xlabel(None) + +# ax_avg = fig.add_subplot(gs[1, 0]) +# sns.heatmap( +# data=all_data[all_data.index.isin(["mean"])], +# ax=ax_avg, +# linewidths=1, +# linecolor="black", +# annot=True, +# annot_kws=annotation_kws, +# fmt=".2f", +# cmap=ListedColormap(["white"]), +# cbar=False, +# ) +# ax_avg.set_yticks( +# ticks=ax_avg.get_yticks(), +# labels=["Mean"], +# rotation=0, +# ) +# ax_avg.set_xticks( +# ticks=ax_avg.get_xticks(), +# labels=ax_avg.get_xticklabels(), +# rotation=45, +# ha="right", +# rotation_mode="anchor", +# ) +# ax_heatmap.set_ylabel("Channel") +# ax_avg.set_xlabel("FOV") + +# fig.savefig(fname=os.path.join(qc_control_metrics_dir, "figures/log2_avgs.png"), dpi=300, +# bbox_inches="tight") + + +# # Image processing +# ## show a run with images stitched in acquisition order pre- and post-normalization +# acquisition_order_viz_dir = os.path.join(SUPPLEMENTARY_FIG_DIR, "acquisition_order") +# if not os.path.exists(acquisition_order_viz_dir): +# os.makedirs(acquisition_order_viz_dir) + +# run_name = "2022-01-14_TONIC_TMA2_run1" +# pre_norm_dir = "/Volumes/Shared/Noah Greenwald/TONIC_Acquisition/rosetta" +# post_norm_dir = "/Volumes/Shared/Noah 
Greenwald/TONIC_Acquisition/normalized" +# save_dir = "/Volumes/Shared/Noah Greenwald/TONIC_Cohort/supplementary_figs" + +# # NOTE: image not scaled up, this happens in Photoshop +# supplementary_plot_helpers.stitch_before_after_norm( +# pre_norm_dir, post_norm_dir, acquisition_order_viz_dir, run_name, +# "H3K9ac", pre_norm_subdir="normalized", padding=0, step=1 +# ) + + +# # Cell identification and classification +# cell_table = pd.read_csv('/Volumes/Shared/Noah Greenwald/TONIC_Cohort/analysis_files/cell_table_clusters.csv') +# cluster_order = {'Cancer': 0, 'Cancer_EMT': 1, 'Cancer_Other': 2, 'CD4T': 3, 'CD8T': 4, 'Treg': 5, +# 'T_Other': 6, 'B': 7, 'NK': 8, 'M1_Mac': 9, 'M2_Mac': 10, 'Mac_Other': 11, +# 'Monocyte': 12, 'APC': 13, 'Mast': 14, 'Neutrophil': 15, 'Fibroblast': 16, +# 'Stroma': 17, 'Endothelium': 18, 'Other': 19, 'Immune_Other': 20} +# cell_table = cell_table.sort_values(by=['cell_cluster'], key=lambda x: x.map(cluster_order)) + +# save_dir = '/Volumes/Shared/Noah Greenwald/TONIC_Cohort/supplementary_figs' + +# ## cell cluster counts +# sns.histplot(data=cell_table, x="cell_cluster") +# sns.despine() +# plt.title("Cell Cluster Counts") +# plt.xlabel("Cell Cluster") +# plt.xticks(rotation=75) +# plt.tight_layout() +# plt.savefig(os.path.join(save_dir, "cells_per_cluster.png"), dpi=300) + +# ## fov cell counts +# cluster_counts = np.unique(cell_table.fov, return_counts=True)[1] +# plt.figure(figsize=(8, 6)) +# g = sns.histplot(data=cluster_counts, kde=True) +# sns.despine() +# plt.title("Histogram of Cell Counts per Image") +# plt.xlabel("Number of Cells in an Image") +# plt.tight_layout() +# plt.savefig(os.path.join(save_dir, "cells_per_fov.png"), dpi=300) + +# ## cell type composition by tissue location of met +# meta_data = pd.read_csv('/Volumes/Shared/Noah Greenwald/TONIC_Cohort/analysis_files/harmonized_metadata.csv') +# meta_data = meta_data[['fov', 'Patient_ID', 'Timepoint', 'Localization']] + +# all_data = cell_table.merge(meta_data, on=['fov'], how='left') +# base_data = all_data[all_data.Timepoint == 'baseline'] + +# all_locals = np.unique(base_data.Localization) +# dfs = [] +# for region in all_locals: +# localization_data = base_data[base_data.Localization == region] + +# df = localization_data.groupby("cell_cluster_broad").count().reset_index() +# df = df.set_index('cell_cluster_broad').transpose() +# sub_df = df.iloc[:1].reset_index(drop=True) +# sub_df.insert(0, "Localization", [region]) +# sub_df['Localization'] = sub_df['Localization'].map(str) +# sub_df = sub_df.set_index('Localization') + +# dfs.append(sub_df) +# prop_data = pd.concat(dfs).transform(func=lambda row: row / row.sum(), axis=1) + +# color_map = {'cell_cluster_broad': ['Cancer', 'Stroma', 'Mono_Mac', 'T','Other', 'Granulocyte', 'NK', 'B'], +# 'color': ['dimgrey', 'darksalmon', 'red', 'navajowhite', 'yellowgreen', 'aqua', 'dodgerblue', 'darkviolet']} +# prop_data = prop_data[color_map['cell_cluster_broad']] + +# colors = color_map['color'] +# prop_data.plot(kind='bar', stacked=True, color=colors) +# plt.ticklabel_format(style='plain', useOffset=False, axis='y') +# plt.gca().set_ylabel("Cell Proportions") +# plt.gca().set_xlabel("Tissue Location") +# plt.xticks(rotation=30) +# plt.title("Cell Type Composition by Tissue Location") +# plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.) 
+# plt.tight_layout() +# plt.savefig(os.path.join(save_dir, "cell_props_by_tissue_loc.png"), dpi=300) + +# ## colored cell cluster masks from random subset of 20 FOVs +# random.seed(13) +# seg_dir = '/Volumes/Shared/Noah Greenwald/TONIC_Cohort/segmentation_data/deepcell_output' + +# all_fovs = list(cell_table['fov'].unique()) +# fovs = random.sample(all_fovs, 20) +# cell_table_subset = cell_table[cell_table.fov.isin(fovs)] + +# cohort_cluster_plot( +# fovs=fovs, +# seg_dir=seg_dir, +# save_dir=save_dir, +# cell_data=cell_table_subset, +# erode=True, +# fov_col='fov', +# label_col='label', +# cluster_col='cell_cluster_broad', +# seg_suffix="_whole_cell.tiff", +# cmap=color_map, +# display_fig=False, +# ) + +# # Functional marker thresholding +# cell_table = pd.read_csv( +# os.path.join(ANALYSIS_DIR, "combined_cell_table_normalized_cell_labels_updated.csv") +# ) +# functional_marker_viz_dir = os.path.join(SUPPLEMENTARY_FIG_DIR, "functional_marker_dist_thresholds") +# if not os.path.exists(functional_marker_viz_dir): +# os.makedirs(functional_marker_viz_dir) + +# marker_info = { +# "Ki67": { +# "populations": ["Cancer", "Mast"], +# "threshold": 0.002, +# "x_range": (0, 0.012), +# "x_ticks": np.array([0, 0.004, 0.008, 0.012]), +# "x_tick_labels": np.array([0, 0.004, 0.008, 0.012]), +# }, +# "CD38": { +# "populations": ["Endothelium", "Cancer_EMT"], +# "threshold": 0.004, +# "x_range": (0, 0.02), +# "x_ticks": np.array([0, 0.005, 0.01, 0.015, 0.02]), +# "x_tick_labels": np.array([0, 0.005, 0.01, 0.015, 0.02]), +# }, +# "CD45RB": { +# "populations": ["CD4T", "Stroma"], +# "threshold": 0.001, +# "x_range": (0, 0.015), +# "x_ticks": np.array([0, 0.005, 0.010, 0.015]), +# "x_tick_labels": np.array([0, 0.005, 0.010, 0.015]) +# }, +# "CD45RO": { +# "populations": ["CD4T", "Fibroblast"], +# "threshold": 0.002, +# "x_range": (0, 0.02), +# "x_ticks": np.array([0, 0.005, 0.01, 0.015, 0.02]), +# "x_tick_labels": np.array([0, 0.005, 0.01, 0.015, 0.02]) +# }, +# "CD57": { +# "populations": ["CD8T", "B"], +# "threshold": 0.002, +# "x_range": (0, 0.006), +# "x_ticks": np.array([0, 0.002, 0.004, 0.006]), +# "x_tick_labels": np.array([0, 0.002, 0.004, 0.006]) +# }, +# "CD69": { +# "populations": ["Treg", "Cancer"], +# "threshold": 0.002, +# "x_range": (0, 0.008), +# "x_ticks": np.array([0, 0.002, 0.004, 0.006, 0.008]), +# "x_tick_labels": np.array([0, 0.002, 0.004, 0.006, 0.008]) +# }, +# "GLUT1": { +# "populations": ["Cancer_EMT", "M2_Mac"], +# "threshold": 0.002, +# "x_range": (0, 0.02), +# "x_ticks": np.array([0, 0.005, 0.01, 0.015, 0.02]), +# "x_tick_labels": np.array([0, 0.005, 0.01, 0.015, 0.02]) +# }, +# "IDO": { +# "populations": ["APC", "M1_Mac"], +# "threshold": 0.001, +# "x_range": (0, 0.003), +# "x_ticks": np.array([0, 0.001, 0.002, 0.003]), +# "x_tick_labels": np.array([0, 0.001, 0.002, 0.003]) +# }, +# "PD1": { +# "populations": ["CD8T", "Stroma"], +# "threshold": 0.0005, +# "x_range": (0, 0.002), +# "x_ticks": np.array([0, 0.0005, 0.001, 0.0015, 0.002]), +# "x_tick_labels": np.array([0, 0.0005, 0.001, 0.0015, 0.002]) +# }, +# "PDL1": { +# "populations": ["Cancer", "Stroma"], +# "threshold": 0.001, +# "x_range": (0, 0.003), +# "x_ticks": np.array([0, 0.001, 0.002, 0.003]), +# "x_tick_labels": np.array([0, 0.001, 0.002, 0.003]), +# }, +# "HLA1": { +# "populations": ["APC", "Stroma"], +# "threshold": 0.001, +# "x_range": (0, 0.025), +# "x_ticks": np.array([0, 0.0125, 0.025]), +# "x_tick_labels": np.array([0, 0.0125, 0.025]) +# }, +# "HLADR": { +# "populations": ["APC", "Neutrophil"], +# "threshold": 
0.001, +# "x_range": (0, 0.025), +# "x_ticks": np.array([0, 0.0125, 0.025]), +# "x_tick_labels": np.array([0, 0.0125, 0.025]) +# }, +# "TBET": { +# "populations": ["NK", "B"], +# "threshold": 0.0015, +# "x_range": (0, 0.0045), +# "x_ticks": np.array([0, 0.0015, 0.003, 0.0045]), +# "x_tick_labels": np.array([0, 0.0015, 0.003, 0.0045]) +# }, +# "TCF1": { +# "populations": ["CD4T", "M1_Mac"], +# "threshold": 0.001, +# "x_range": (0, 0.003), +# "x_ticks": np.array([0, 0.001, 0.002, 0.003]), +# "x_tick_labels": np.array([0, 0.001, 0.002, 0.003]) +# }, +# "TIM3": { +# "populations": ["Monocyte", "Endothelium"], +# "threshold": 0.001, +# "x_range": (0, 0.004), +# "x_ticks": np.array([0, 0.001, 0.002, 0.003, 0.004]), +# "x_tick_labels": np.array([0, 0.001, 0.002, 0.003, 0.004]) +# }, +# "Vim": { +# "populations": ["Endothelium", "B"], +# "threshold": 0.002, +# "x_range": (0, 0.06), +# "x_ticks": np.array([0, 0.02, 0.04, 0.06]), +# "x_tick_labels": np.array([0, 0.02, 0.04, 0.06]) +# }, +# "Fe": { +# "populations": ["Fibroblast", "Cancer"], +# "threshold": 0.1, +# "x_range": (0, 0.3), +# "x_ticks": np.array([0, 0.1, 0.2, 0.3]), +# "x_tick_labels": np.array([0, 0.1, 0.2, 0.3]), +# } +# } +# supplementary_plot_helpers.functional_marker_thresholding( +# cell_table, functional_marker_viz_dir, marker_info=marker_info, +# figsize=(20, 40) +# ) + + +# # Feature extraction # Occupancy statistics @@ -440,10 +440,15 @@ if not os.path.exists(occupancy_stats_viz_dir): os.makedirs(occupancy_stats_viz_dir) +# supplementary_plot_helpers.visualize_occupancy_statistics( +# cell_table, occupancy_stats_viz_dir +# ) + # massive GridSearch -for tiles_per_row_col in [2, 4, 8, 16, 32, 64, 128]: - for positive_threshold in np.arange(1, 26): +for tiles_per_row_col in [32, 64, 128, 256, 512]: + for positive_threshold in [1, 2, 5, 10]: supplementary_plot_helpers.visualize_occupancy_statistics( cell_table, occupancy_stats_viz_dir, tiles_per_row_col=tiles_per_row_col, positive_threshold=positive_threshold, ) + print(f"Generated plot for # tiles {tiles_per_row_col} and threshold {positive_threshold}") diff --git a/python_files/supplementary_plot_helpers.py b/python_files/supplementary_plot_helpers.py index 40644f5..ce8ed62 100644 --- a/python_files/supplementary_plot_helpers.py +++ b/python_files/supplementary_plot_helpers.py @@ -344,7 +344,7 @@ def stitch_before_after_norm( # occupancy statistic helpers def visualize_occupancy_statistics( cell_table: pd.DataFrame, save_dir: Union[str, pathlib.Path], max_image_size: int = 2048, - tiles_per_row_col: int = 8, positive_threshold: int = 5 + tiles_per_row_col: int = 8, positive_threshold: int = 20 ): """Define occupancy statistics over a cohort, and visualize distribution of positive tile counts across FOV. 
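A note on the second hunk below: the rewritten fill still walks occupancy_counts row by row with iterrows, even though its comment says "without explicit Python loops". A minimal sketch of a genuinely vectorized fill, assuming occupancy_counts, fov_names, and tiles_per_row_col exactly as this patch defines them (fill_occupancy_array is a hypothetical helper, not part of the PR):

import numpy as np
import pandas as pd

def fill_occupancy_array(
    occupancy_counts: pd.DataFrame, fov_names: np.ndarray, tiles_per_row_col: int
) -> np.ndarray:
    # map each FOV name to its row index in the output array
    fov_index = {fov: i for i, fov in enumerate(fov_names)}
    counts = np.zeros(
        (len(fov_names), tiles_per_row_col, tiles_per_row_col), dtype=np.int16
    )
    # scatter every (fov, tile_row, tile_col) count in one advanced-indexing
    # assignment; assumes tile_row/tile_col already fall inside the grid
    counts[
        occupancy_counts["fov"].map(fov_index).to_numpy(),
        occupancy_counts["tile_row"].to_numpy(),
        occupancy_counts["tile_col"].to_numpy(),
    ] = occupancy_counts["occupancy"].to_numpy()
    return counts

The resulting array can be handed to the same xr.DataArray constructor the hunk uses, so only the iterrows loop changes.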
@@ -367,32 +367,34 @@ def visualize_occupancy_statistics( # define the tile size in pixels tile_size: int = max_image_size // tiles_per_row_col + cell_table["tile_row"] = (cell_table["centroid-1"] // tile_size).astype(int) + cell_table["tile_col"] = (cell_table["centroid-0"] // tile_size).astype(int) + + # Group by FOV and tile positions, then count occupancy + occupancy_counts: pd.DataFrame = cell_table.groupby( + ["fov", "tile_row", "tile_col"] + ).size().reset_index(name="occupancy") + # define the occupancy_stats array - occupancy_stats: xr.DataArray = xr.DataArray( - np.zeros((len(fov_names), tile_size, tile_size), dtype=np.int16), - coords=[fov_names, np.zeros(tile_size), np.zeros(tile_size)], + occupancy_stats = xr.DataArray( + np.zeros((len(fov_names), tiles_per_row_col, tiles_per_row_col), dtype=np.int16), + coords=[fov_names, np.arange(tiles_per_row_col), np.arange(tiles_per_row_col)], dims=["fov", "x", "y"] ) - # for each FOV, update the corresponding tile count - for fov in fov_names: - cell_table_fov: pd.DataFrame = cell_table[cell_table["fov"] == fov].copy() - cell_table_fov["tile_row"] = cell_table_fov["centroid-1"] // tile_size - cell_table_row["tile_col"] = cell_table_fov["centroid-0"] // tile_size - fov_occupancy_table: pd.DataFrame = cell_table_fov.groupby( - ["tile_row", "tile_col"] - ).size().reset_index(name="occupancy") - occupancy_stats.loc[ - fov, fov_occupancy_table["tile_row"].values, fov_occupancy_table["tile_col"].values - ] += fov_occupancy_table["occupancy"].values + # Update the DataArray based on occupancy_counts without explicit Python loops + for index, row in occupancy_counts.iterrows(): + occupancy_stats.loc[row["fov"], row["tile_row"], row["tile_col"]] = row["occupancy"] # define the tiles that are positive based on threshold occupancy_stats_positivity: xr.DataArray = occupancy_stats > positive_threshold # count number of positive tiles per FOV - fov_positive_tiles: Dict[str, int] = occupancy_stats_positivity.sum( - dim=["tile_row", "tile_col"] - ).to_dict()["data"] + fov_positive_tiles: xr.DataArray = occupancy_stats_positivity.sum(dim=["x", "y"]) + print(fov_positive_tiles) + fov_positive_tile_counts: Dict[str, int] = dict( + zip(fov_positive_tiles.fov.values, fov_positive_tiles.values) + ) # 1. visualize histogram of positive tile counts across cohort fig, axs = plt.subplots( @@ -400,7 +402,9 @@ def visualize_occupancy_statistics( 1, figsize=(10, 10) ) - axs[0].hist(list(fov_positive_tiles.values())) + print(list(fov_positive_tile_counts.values())) + axs[0].hist(list(fov_positive_tile_counts.values())) + axs[0].set_title("Distribution of tile positivity") # 2. visualize heatmap of average positivity of tiles across FOVs cmap_positivity = mcolors.LinearSegmentedColormap.from_list("", ["white", "red"]) @@ -408,22 +412,33 @@ def visualize_occupancy_statistics( occupancy_stats_positivity.mean(dim="fov").values, cmap=cmap_positivity, aspect="auto", - origin="lower" + vmin=0, + vmax=1 ) - for x, y in itertools.pairwise(occupancy_stats_positivity.tile_row.values): - axs[1].text(y, x, f"R{x}C{y}", ha="center", va="center", color="black") - fig.colorbar(c_positivity, ax=axs[1]) + axs[1].set_xticks([]) + axs[1].set_yticks([]) + axs[1].set_title("Mean positivity per tile") + fig.colorbar(c_positivity, ax=axs[1], ticks=[0, 0.25, 0.5, 0.75, 1]) # 3. 
visualize heatmap of average number of counts across FOVs cmap_counts = mcolors.LinearSegmentedColormap.from_list("", ["white", "red"]) + max_occupancy_stat = np.round(np.max(occupancy_stats.mean(dim="fov").values), 2) c_counts = axs[2].imshow( occupancy_stats.mean(dim="fov").values, cmap=cmap_counts, - aspect="auto" + aspect="auto", + vmin=0, + vmax=max_occupancy_stat + ) + axs[2].set_xticks([]) + axs[2].set_yticks([]) + axs[2].set_title("Mean num cells per tile") + fig.colorbar(c_counts, ax=axs[2], ticks=np.round(np.linspace(0, max_occupancy_stat, 4), 2)) + + fig.suptitle( + f"Stats for # tiles: {tiles_per_row_col}, positive threshold: {positive_threshold}", + fontsize=24 ) - for x, y in itertools.pairwise(occupancy_stats.tile_row.values): - axs[2].text(y, x, f"R{x}C{y}", ha="center", va="center", color="black") - fig.colorbar(c_positivity, ax=axs[2]) # save the figure to save_dir fig.savefig( From fc56d8eed020f3c4aec8540a0410d295ba69ce63 Mon Sep 17 00:00:00 2001 From: alex-l-kong Date: Tue, 27 Feb 2024 11:14:25 -0800 Subject: [PATCH 06/14] Update grid sizes to grid search over: don't need to be too granular --- python_files/generate_supplementary_plots.py | 4 ++-- python_files/supplementary_plot_helpers.py | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/python_files/generate_supplementary_plots.py b/python_files/generate_supplementary_plots.py index 1215dc1..4822c14 100644 --- a/python_files/generate_supplementary_plots.py +++ b/python_files/generate_supplementary_plots.py @@ -445,8 +445,8 @@ # ) # massive GridSearch -for tiles_per_row_col in [32, 64, 128, 256, 512]: - for positive_threshold in [1, 2, 5, 10]: +for tiles_per_row_col in [4, 8, 16]: + for positive_threshold in [10, 25, 50, 100, 200, 500]: supplementary_plot_helpers.visualize_occupancy_statistics( cell_table, occupancy_stats_viz_dir, tiles_per_row_col=tiles_per_row_col, positive_threshold=positive_threshold, diff --git a/python_files/supplementary_plot_helpers.py b/python_files/supplementary_plot_helpers.py index ce8ed62..ba77dac 100644 --- a/python_files/supplementary_plot_helpers.py +++ b/python_files/supplementary_plot_helpers.py @@ -391,7 +391,6 @@ def visualize_occupancy_statistics( # count number of positive tiles per FOV fov_positive_tiles: xr.DataArray = occupancy_stats_positivity.sum(dim=["x", "y"]) - print(fov_positive_tiles) fov_positive_tile_counts: Dict[str, int] = dict( zip(fov_positive_tiles.fov.values, fov_positive_tiles.values) ) From be8083df0b284aee4d4748f47b87033b605b5488 Mon Sep 17 00:00:00 2001 From: alex-l-kong Date: Fri, 1 Mar 2024 12:07:28 -0800 Subject: [PATCH 07/14] Save the occupancy statistics for faster loading --- python_files/generate_supplementary_plots.py | 55 +++++++++++--- python_files/supplementary_plot_helpers.py | 75 ++++++++++++++++++++ 2 files changed, 122 insertions(+), 8 deletions(-) diff --git a/python_files/generate_supplementary_plots.py b/python_files/generate_supplementary_plots.py index 4822c14..4ec8be4 100644 --- a/python_files/generate_supplementary_plots.py +++ b/python_files/generate_supplementary_plots.py @@ -8,6 +8,7 @@ from ark.utils.plot_utils import cohort_cluster_plot # from toffy import qc_comp, qc_metrics_plots from alpineer import io_utils +from alpineer.load_utils import load_imgs_from_tree matplotlib.rcParams['pdf.fonttype'] = 42 @@ -21,6 +22,7 @@ import supplementary_plot_helpers ANALYSIS_DIR = "/Volumes/Shared/Noah Greenwald/TONIC_Cohort/analysis_files" +IMAGE_DIR = "/Volumes/Shared/Noah Greenwald/TONIC_Cohort/image_data/samples" 
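# A sketch outside this patch: the hunk further down sizes every FOV by loading a
# full CD20 channel image through load_imgs_from_tree, which is why the merged cell
# table is then cached to CSV. If tifffile is available (an assumption; this PR does
# not import it), reading just the TIFF header recovers the same per-FOV pixel size
# without decoding the image:
import os
import tifffile

def fov_pixel_size(image_dir: str, fov: str, channel: str = "CD20") -> int:
    # pages[0].shape is (rows, cols); rows matches the .shape[1] lookup used below
    with tifffile.TiffFile(os.path.join(image_dir, fov, f"{channel}.tiff")) as tif:
        return tif.pages[0].shape[0]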
SUPPLEMENTARY_FIG_DIR = "/Volumes/Shared/Noah Greenwald/TONIC_Cohort/supplementary_figs" @@ -436,19 +438,56 @@ cell_table = pd.read_csv( os.path.join(ANALYSIS_DIR, "combined_cell_table_normalized_cell_labels_updated.csv") ) + occupancy_stats_viz_dir = os.path.join(SUPPLEMENTARY_FIG_DIR, "occupancy_stats") if not os.path.exists(occupancy_stats_viz_dir): os.makedirs(occupancy_stats_viz_dir) -# supplementary_plot_helpers.visualize_occupancy_statistics( -# cell_table, occupancy_stats_viz_dir -# ) +# because not all of the images are of the same size, make sure to append that as a feature +# NOTE: this is expensive to create, load in a previously-generated version if it exists +if os.path.exists(os.path.join(occupancy_stats_viz_dir, "cell_table_with_pixel_size.csv")): + cell_table = pd.read_csv( + os.path.join(occupancy_stats_viz_dir, "cell_table_with_pixel_size.csv") + ) +else: + cell_table = pd.read_csv( + os.path.join(ANALYSIS_DIR, "combined_cell_table_normalized_cell_labels_updated.csv") + ) + fov_sizes = { + fov: load_imgs_from_tree( + IMAGE_DIR, img_sub_folder=None, fovs=[fov], channels=["CD20"] + ).shape[1] + for fov in list(cell_table["fov"].unique()) + } + fov_sizes = pd.DataFrame( + {"fov": list(fov_sizes.keys()), "fov_pixel_size": list(fov_sizes.values())} + ) + + cell_table = cell_table.merge(fov_sizes, on="fov") + cell_table.to_csv( + os.path.join(occupancy_stats_viz_dir, "cell_table_with_pixel_size.csv"), + index=False + ) # massive GridSearch +total_occupancy_stats_df = pd.DataFrame() for tiles_per_row_col in [4, 8, 16]: - for positive_threshold in [10, 25, 50, 100, 200, 500]: - supplementary_plot_helpers.visualize_occupancy_statistics( - cell_table, occupancy_stats_viz_dir, tiles_per_row_col=tiles_per_row_col, - positive_threshold=positive_threshold, + for positive_threshold in [5, 10, 15, 20]: + occupancy_stats = supplementary_plot_helpers.compute_occupancy_statistics( + cell_table, pop_subset=["CD4T", "CD8T", "Treg", "T_Other"], + tiles_per_row_col=tiles_per_row_col, positive_threshold=positive_threshold + ) + occupancy_stats_df = pd.DataFrame( + { + "fov": list(occupancy_stats.keys()), + "percent_positive_tiles": list(occupancy_stats.values()) + } ) - print(f"Generated plot for # tiles {tiles_per_row_col} and threshold {positive_threshold}") + occupancy_stats_df["num_tiles"] = tiles_per_row_col ** 2 + occupancy_stats_df["positive_threshold"] = positive_threshold + + total_occupancy_stats_df = pd.concat([total_occupancy_stats_df, occupancy_stats_df]) + +total_occupancy_stats_df.to_csv( + os.path.join(occupancy_stats_viz_dir, "occupancy_stats_trials_t_cells_updated.csv"), index=False +) diff --git a/python_files/supplementary_plot_helpers.py b/python_files/supplementary_plot_helpers.py index ba77dac..7868c90 100644 --- a/python_files/supplementary_plot_helpers.py +++ b/python_files/supplementary_plot_helpers.py @@ -341,6 +341,81 @@ def stitch_before_after_norm( post_norm_tiled.save(post_norm_stitched_path) +def compute_occupancy_statistics( + cell_table: pd.DataFrame, pop_col: str = "cell_cluster", + pop_subset: Optional[List[str]] = None, tiles_per_row_col: int = 8, + positive_threshold: int = 20 +): + """Compute the occupancy statistics over a cohort. + + Args: + cell_table (pd.DataFrame): + The cell table associated with the cohort, need to contain all FOVs as well as + a column indicating the size of the FOV. + pop_col (str): + Column containing the names of the cell populations + pop_subset (Optional[List[str]]): + Which populations, if any, to subset on. 
If None, use all populations + tiles_per_row_col (int): + The row/col dims of tiles to define over each image + positive_threshold (int): + The cell count in a tile required for positivity + + Returns: + Dict[str, float]: + A dictionary mapping each FOV to the percentage of tiles above the positive threshold + for number of cells. + """ + # get all the FOV names + fov_names: np.ndarray = cell_table["fov"].unique() + + # if a population subset is specified, first validate then truncate the cell table accordingly + if pop_subset: + verify_in_list( + specified_populations=pop_subset, + valid_populations=cell_table[pop_col].unique() + ) + + cell_table = cell_table[cell_table[pop_col].isin(pop_subset)].copy() + + # define the tile size in pixels for each FOV + cell_table["tile_size"] = cell_table["fov_pixel_size"] // tiles_per_row_col + + # define the tile for each cell + cell_table["tile_row"] = (cell_table["centroid-1"] // cell_table["tile_size"]).astype(int) + cell_table["tile_col"] = (cell_table["centroid-0"] // cell_table["tile_size"]).astype(int) + + # Group by FOV and tile positions, then count occupancy + occupancy_counts: pd.DataFrame = cell_table.groupby( + ["fov", "tile_row", "tile_col"] + ).size().reset_index(name="occupancy") + + # define the occupancy_stats array + occupancy_stats = xr.DataArray( + np.zeros((len(fov_names), tiles_per_row_col, tiles_per_row_col), dtype=np.int16), + coords=[fov_names, np.arange(tiles_per_row_col), np.arange(tiles_per_row_col)], + dims=["fov", "x", "y"] + ) + + # Update the DataArray based on occupancy_counts without explicit Python loops + for index, row in occupancy_counts.iterrows(): + occupancy_stats.loc[row["fov"], row["tile_row"], row["tile_col"]] = row["occupancy"] + + # define the tiles that are positive based on threshold + occupancy_stats_positivity: xr.DataArray = occupancy_stats > positive_threshold + + # compute the percentage of positive tiles for each FOV + occupancy_stats_sum: xr.DataArray = occupancy_stats_positivity.sum( + dim=['x', 'y'] + ) / (tiles_per_row_col ** 2) + + # convert to dict + occupancy_stats_dict: Dict[str, float] = occupancy_stats_sum.to_series().to_dict() + occupancy_stats_dict = {fov: percentage for fov, percentage in occupancy_stats_dict.items()} + + return occupancy_stats_dict + + # occupancy statistic helpers def visualize_occupancy_statistics( cell_table: pd.DataFrame, save_dir: Union[str, pathlib.Path], max_image_size: int = 2048, From e26583eb805a40aa8395766497cc20e741ca676b Mon Sep 17 00:00:00 2001 From: alex-l-kong Date: Tue, 5 Mar 2024 11:39:29 -0800 Subject: [PATCH 08/14] File naming updates --- python_files/generate_supplementary_plots.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python_files/generate_supplementary_plots.py b/python_files/generate_supplementary_plots.py index 4ec8be4..0fe0b9f 100644 --- a/python_files/generate_supplementary_plots.py +++ b/python_files/generate_supplementary_plots.py @@ -489,5 +489,5 @@ total_occupancy_stats_df = pd.concat([total_occupancy_stats_df, occupancy_stats_df]) total_occupancy_stats_df.to_csv( - os.path.join(occupancy_stats_viz_dir, "occupancy_stats_trials_t_cells_updated.csv"), index=False + os.path.join(occupancy_stats_viz_dir, "occupancy_stats_trials_t_cells.csv"), index=False ) From dbd50c7da29fa9bb55d3eebc3964b4080c4ad188 Mon Sep 17 00:00:00 2001 From: alex-l-kong Date: Wed, 6 Mar 2024 11:44:43 -0800 Subject: [PATCH 09/14] Updated plotting --- python_files/supplementary_plot_helpers.py | 71 +++++++++++++++++++++- 1 file changed, 70 
insertions(+), 1 deletion(-) diff --git a/python_files/supplementary_plot_helpers.py b/python_files/supplementary_plot_helpers.py index 8ebbeef..d02f78e 100644 --- a/python_files/supplementary_plot_helpers.py +++ b/python_files/supplementary_plot_helpers.py @@ -410,6 +410,7 @@ def stitch_before_after_norm( post_norm_tiled.save(post_norm_stitched_path) +# occupancy statistic helpers def compute_occupancy_statistics( cell_table: pd.DataFrame, pop_col: str = "cell_cluster", pop_subset: Optional[List[str]] = None, tiles_per_row_col: int = 8, @@ -485,8 +486,76 @@ def compute_occupancy_statistics( return occupancy_stats_dict -# occupancy statistic helpers def visualize_occupancy_statistics( + occupancy_stats_table: pd.DataFrame, save_dir: Union[str, pathlib.Path], + pop_subset: Optional[List[str]] = None, figsize: Optional[Tuple[float, float]] = (10, 10) +): + """Visualize the distribution of the percentage of tiles above a positive threshold. + + Data needs to be generated by compute_occupancy_statistics. + + Args: + occupancy_stats_table (pd.DataFrame): + The table generated by compute_occupancy_statistics, lists the percentage of positive + tiles for each image of the cohort at different positivity thresholds and grid sizes + save_dir (Union[str, pathlib.Path]): + Directory to save the visualizations in + pop_subset (Optional[List[str]]): + Which populations, if any, to subset on. If None, use all populations + fig_size (Optional[Tuple[float, float]]): + The figure size to use for the image. + """ + # generate each unique test pair + tiles_threshold_trials = occupancy_stats_table[ + ["num_tiles", "positive_threshold"] + ].drop_duplicates().reset_index(drop=True) + + # define a separate plot on the same grid for each test pair + fig, axs = plt.subplots( + tiles_threshold_trials.shape[0], 1, figsize=figsize + ) + populations_str = f"{', '.join(pop_subset)} populations" if pop_subset else "all populations" + # fig.suptitle(f"Percentage positive tile distributions for {populations_str}", fontsize=24) + # fig.subplots_adjust(top=1.50) + + # define an index to iterate through the subplots + subplot_i = 0 + + for _, trial in tiles_threshold_trials.iterrows(): + grid_size = int(np.sqrt(trial["num_tiles"])) + positive_threshold = trial["positive_threshold"] + + # subset data obtained just for the specified trial + trial_data = occupancy_stats_table[ + (occupancy_stats_table["num_tiles"] == trial["num_tiles"]) & + (occupancy_stats_table["positive_threshold"] == trial["positive_threshold"]) + ] + + # visualize the distribution using a histogram + positive_tile_percentages = trial_data["percent_positive_tiles"].values + axs[subplot_i].hist( + positive_tile_percentages, + facecolor="g", + bins=20, + alpha=0.75 + ) + axs[subplot_i].set_title( + f"Grid size: {grid_size}x{grid_size}, Positive threshold: {positive_threshold}" + ) + + # update the subplot index + subplot_i += 1 + + plt.tight_layout() + + # save the figure to save_dir + fig.savefig( + pathlib.Path(save_dir) / f"occupancy_statistic_distributions.png", + dpi=300 + ) + + +def visualize_occupancy_statistics_old( cell_table: pd.DataFrame, save_dir: Union[str, pathlib.Path], max_image_size: int = 2048, tiles_per_row_col: int = 8, positive_threshold: int = 20 ): From c82a02f7877ab794177cb2b773b1d0f91e479fe5 Mon Sep 17 00:00:00 2001 From: alex-l-kong Date: Wed, 6 Mar 2024 11:44:56 -0800 Subject: [PATCH 10/14] Comment out code that doesn't need to be run --- python_files/generate_supplementary_plots.py | 1109 +++++++++--------- 1 file changed, 559 
insertions(+), 550 deletions(-) diff --git a/python_files/generate_supplementary_plots.py b/python_files/generate_supplementary_plots.py index 97855ff..fdcf24a 100644 --- a/python_files/generate_supplementary_plots.py +++ b/python_files/generate_supplementary_plots.py @@ -26,535 +26,535 @@ SUPPLEMENTARY_FIG_DIR = "/Volumes/Shared/Noah Greenwald/TONIC_Cohort/supplementary_figs" -# Panel validation -panel_validation_viz_dir = os.path.join(SUPPLEMENTARY_FIG_DIR, "panel_validation") -if not os.path.exists(panel_validation_viz_dir): - os.makedirs(panel_validation_viz_dir) - -controls_dir = "/Volumes/Shared/Noah Greenwald/TONIC_Cohort/image_data/controls" -controls_fov = "TONIC_TMA1_colon_bottom" -supplementary_plot_helpers.validate_panel( - controls_dir, controls_fov, panel_validation_viz_dir, font_size=180 -) - -samples_dir = "/Volumes/Shared/Noah Greenwald/TONIC_Cohort/image_data/samples" -samples_fov = "TONIC_TMA2_R5C4" -samples_channels = sorted(io_utils.remove_file_extensions( - io_utils.list_files(os.path.join(samples_dir, samples_fov), substrs=".tiff") -)) -exclude_chans = ["CD11c_nuc_exclude", "CK17_smoothed", "ECAD_smoothed", "FOXP3_nuc_include", - "chan_39", "chan_45", "chan_48", "chan_115", "chan_141"] -for ec in exclude_chans: - if ec in samples_channels: - samples_channels.remove(ec) -supplementary_plot_helpers.validate_panel( - samples_dir, samples_fov, panel_validation_viz_dir, channels=samples_channels, font_size=320 -) - - -# ROI selection - - -# QC -qc_metrics = ["Non-zero mean intensity"] -channel_exclude = ["chan_39", "chan_45", "CD11c_nuc_exclude", "CD11c_nuc_exclude_update", - "FOXP3_nuc_include", "FOXP3_nuc_include_update", "CK17_smoothed", - "FOXP3_nuc_exclude_update", "chan_48", "chan_141", "chan_115", "LAG3"] - -## FOV spatial location -cohort_path = "/Volumes/Shared/Noah Greenwald/TONIC_Cohort/image_data/samples" -qc_tma_metrics_dir = "/Volumes/Shared/Noah Greenwald/TONIC_Acquisition/qc_metrics/qc_tma_metrics" -if not os.path.exists(qc_tma_metrics_dir): - os.makedirs(qc_tma_metrics_dir) - -fovs = io_utils.list_folders(cohort_path) -tmas = list(set([fov.split('_R')[0] for fov in fovs])) - -qc_tmas = qc_comp.QCTMA( - qc_metrics=qc_metrics, - cohort_path=cohort_path, - metrics_dir=qc_tma_metrics_dir, -) - -qc_tmas.compute_qc_tma_metrics(tmas=tmas) -qc_tmas.qc_tma_metrics_zscore(tmas=tmas, channel_exclude=channel_exclude) -qc_metrics_plots.qc_tmas_metrics_plot(qc_tmas=qc_tmas, tmas=tmas, save_figure=True, dpi=300) - -## longitudinal controls -control_path = "/Volumes/Shared/Noah Greenwald/TONIC_Cohort/image_data/controls" -qc_control_metrics_dir = "/Volumes/Shared/Noah Greenwald/TONIC_Acquisition/qc_metrics/qc_longitudinal_control" -if not os.path.exists(qc_control_metrics_dir): - os.makedirs(qc_control_metrics_dir) - -folders = io_utils.list_folders(control_path, "TMA3_") -control_substrs = [name.split("_")[2] + '_' + name.split("_")[3] if len(name.split("_")) == 4 - else name.split("_")[2] + '_' + name.split("_")[3]+'_' + name.split("_")[4] - for name in folders] - -all_folders = io_utils.list_folders(control_path) -for i, control in enumerate(control_substrs): - control_sample_name = control - print(control) - if control == 'tonsil_bottom': - fovs = [folder for folder in all_folders if control in folder and len(folder) <= 25] - else: - fovs = [folder for folder in all_folders if control in folder] - - qc_control = qc_comp.QCControlMetrics( - qc_metrics=qc_metrics, - cohort_path=control_path, - metrics_dir=qc_control_metrics_dir, - ) - - 
qc_control.compute_control_qc_metrics( - control_sample_name=control_sample_name, - fovs=fovs, - channel_exclude=channel_exclude, - channel_include=None, - ) - - qc_metrics_plots.longitudinal_control_heatmap( - qc_control=qc_control, control_sample_name=control_sample_name, save_figure=True, dpi=300 - ) - -dfs = [] -for control in control_substrs: - df = pd.read_csv(os.path.join(qc_control_metrics_dir, f"{control}_combined_nonzero_mean_stats.csv")) - df['fov'] = [i.replace('_' + control, '') for i in list(df['fov'])] - log2_norm_df: pd.DataFrame = df.pivot( - index="channel", columns="fov", values="Non-zero mean intensity" - ).transform(func=lambda row: np.log2(row / row.mean()), axis=1) - if control != 'tonsil_bottom_duplicate1': - dup_col = [col for col in log2_norm_df.columns if 'duplicate1' in col] - log2_norm_df = log2_norm_df.drop(columns=dup_col) if dup_col else log2_norm_df - - mean_t_df: pd.DataFrame = ( - log2_norm_df.mean(axis=0) - .to_frame(name="mean") - .transpose() - .sort_values(by="mean", axis=1) - ) - transformed_df: pd.DataFrame = pd.concat( - objs=[log2_norm_df, mean_t_df] - ).sort_values(by="mean", axis=1, inplace=False) - transformed_df.rename_axis("channel", axis=0, inplace=True) - transformed_df.rename_axis("fov", axis=1, inplace=True) - - dfs.append(transformed_df) -all_data = pd.concat(dfs).replace([np.inf, -np.inf], 0, inplace=True) -all_data = all_data.groupby(['channel']).mean() -all_data = all_data.sort_values(by="mean", axis=1, inplace=False).round(2) - - -fig = plt.figure(figsize=(12,12), dpi=300) -fig.set_layout_engine(layout="constrained") -gs = gridspec.GridSpec(nrows=2, ncols=1, figure=fig, height_ratios=[len(all_data.index) - 1, 1]) -_norm = Normalize(vmin=-1, vmax=1) -_cmap = sns.color_palette("vlag", as_cmap=True) -fig.suptitle(f"Average per TMA - QC: Non-zero Mean Intensity ") - -annotation_kws = { - "horizontalalignment": "center", - "verticalalignment": "center", - "fontsize": 8, -} - -ax_heatmap = fig.add_subplot(gs[0, 0]) -sns.heatmap( - data=all_data[~all_data.index.isin(["mean"])], - ax=ax_heatmap, - linewidths=1, - linecolor="black", - cbar_kws={"shrink": 0.5}, - annot=True, - annot_kws=annotation_kws, - xticklabels=False, - norm=_norm, - cmap=_cmap, -) - -ax_heatmap.collections[0].colorbar.ax.set_title(r"$\log_2(QC)$") -ax_heatmap.set_yticks( - ticks=ax_heatmap.get_yticks(), - labels=ax_heatmap.get_yticklabels(), - rotation=0, -) -ax_heatmap.set_xlabel(None) - -ax_avg = fig.add_subplot(gs[1, 0]) -sns.heatmap( - data=all_data[all_data.index.isin(["mean"])], - ax=ax_avg, - linewidths=1, - linecolor="black", - annot=True, - annot_kws=annotation_kws, - fmt=".2f", - cmap=ListedColormap(["white"]), - cbar=False, -) -ax_avg.set_yticks( - ticks=ax_avg.get_yticks(), - labels=["Mean"], - rotation=0, -) -ax_avg.set_xticks( - ticks=ax_avg.get_xticks(), - labels=ax_avg.get_xticklabels(), - rotation=45, - ha="right", - rotation_mode="anchor", -) -ax_heatmap.set_ylabel("Channel") -ax_avg.set_xlabel("FOV") - -fig.savefig(fname=os.path.join(qc_control_metrics_dir, "figures/log2_avgs.png"), dpi=300, - bbox_inches="tight") - - -# Image processing -## show a run with images stitched in acquisition order pre- and post-normalization -acquisition_order_viz_dir = os.path.join(SUPPLEMENTARY_FIG_DIR, "acquisition_order") -if not os.path.exists(acquisition_order_viz_dir): - os.makedirs(acquisition_order_viz_dir) - -run_name = "2022-01-14_TONIC_TMA2_run1" -pre_norm_dir = "/Volumes/Shared/Noah Greenwald/TONIC_Acquisition/rosetta" -post_norm_dir = "/Volumes/Shared/Noah 
Greenwald/TONIC_Acquisition/normalized" -save_dir = "/Volumes/Shared/Noah Greenwald/TONIC_Cohort/supplementary_figs" - -# NOTE: image not scaled up, this happens in Photoshop -supplementary_plot_helpers.stitch_before_after_norm( - pre_norm_dir, post_norm_dir, acquisition_order_viz_dir, run_name, - "H3K9ac", pre_norm_subdir="normalized", padding=0, step=1 -) - - -# Cell identification and classification -cell_table = pd.read_csv('/Volumes/Shared/Noah Greenwald/TONIC_Cohort/analysis_files/cell_table_clusters.csv') -cluster_order = {'Cancer': 0, 'Cancer_EMT': 1, 'Cancer_Other': 2, 'CD4T': 3, 'CD8T': 4, 'Treg': 5, - 'T_Other': 6, 'B': 7, 'NK': 8, 'M1_Mac': 9, 'M2_Mac': 10, 'Mac_Other': 11, - 'Monocyte': 12, 'APC': 13, 'Mast': 14, 'Neutrophil': 15, 'Fibroblast': 16, - 'Stroma': 17, 'Endothelium': 18, 'Other': 19, 'Immune_Other': 20} -cell_table = cell_table.sort_values(by=['cell_cluster'], key=lambda x: x.map(cluster_order)) - -save_dir = '/Volumes/Shared/Noah Greenwald/TONIC_Cohort/supplementary_figs' - -## cell cluster counts -sns.histplot(data=cell_table, x="cell_cluster") -sns.despine() -plt.title("Cell Cluster Counts") -plt.xlabel("Cell Cluster") -plt.xticks(rotation=75) -plt.tight_layout() -plt.savefig(os.path.join(save_dir, "cells_per_cluster.png"), dpi=300) - -## fov cell counts -cluster_counts = np.unique(cell_table.fov, return_counts=True)[1] -plt.figure(figsize=(8, 6)) -g = sns.histplot(data=cluster_counts, kde=True) -sns.despine() -plt.title("Histogram of Cell Counts per Image") -plt.xlabel("Number of Cells in an Image") -plt.tight_layout() -plt.savefig(os.path.join(save_dir, "cells_per_fov.png"), dpi=300) - -## cell type composition by tissue location of met and timepoint -meta_data = pd.read_csv('/Volumes/Shared/Noah Greenwald/TONIC_Cohort/analysis_files/harmonized_metadata.csv') -meta_data = meta_data[['fov', 'Patient_ID', 'Timepoint', 'Localization']] - -all_data = cell_table.merge(meta_data, on=['fov'], how='left') - -for metric in ['Localization', 'Timepoint']: - data = all_data[all_data.Timepoint == 'baseline'] if metric == 'Localization' else all_data - - groups = np.unique(data.Localization) if metric == 'Localization' else \ - ['primary', 'baseline', 'post_induction', 'on_nivo'] - dfs = [] - for group in groups: - sub_data = data[data[metric] == group] - - df = sub_data.groupby("cell_cluster_broad").count().reset_index() - df = df.set_index('cell_cluster_broad').transpose() - sub_df = df.iloc[:1].reset_index(drop=True) - sub_df.insert(0, metric, [group]) - sub_df[metric] = sub_df[metric].map(str) - sub_df = sub_df.set_index(metric) - - dfs.append(sub_df) - prop_data = pd.concat(dfs).transform(func=lambda row: row / row.sum(), axis=1) - - color_map = {'Cancer': 'dimgrey', 'Stroma': 'darksalmon', 'T': 'navajowhite', - 'Mono_Mac': 'red', 'B': 'darkviolet', 'Other': 'yellowgreen', - 'Granulocyte': 'aqua', 'NK': 'dodgerblue'} - - means = prop_data.mean(axis=0).reset_index() - means = means.sort_values(by=[0], ascending=False) - prop_data = prop_data[means.cell_cluster_broad] - - colors = [color_map[cluster] for cluster in means.cell_cluster_broad] - prop_data.plot(kind='bar', stacked=True, color=colors) - sns.despine() - plt.ticklabel_format(style='plain', useOffset=False, axis='y') - plt.gca().set_ylabel("Cell Proportions") - xlabel = "Tissue Location" if metric == 'Localization' else "Timepoint" - plt.gca().set_xlabel(xlabel) - plt.xticks(rotation=30) - plt.title(f"Cell Type Composition by {xlabel}") - handles, labels = plt.gca().get_legend_handles_labels() - 
plt.legend(handles[::-1], labels[::-1], - bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0) - plt.tight_layout() - plot_name = "cell_props_by_tissue_loc.png" if metric == 'Localization' else "cell_props_by_timepoint.png" - plt.savefig(os.path.join(save_dir, plot_name), dpi=300) - -## colored cell cluster masks from random subset of 20 FOVs -random.seed(13) -seg_dir = '/Volumes/Shared/Noah Greenwald/TONIC_Cohort/segmentation_data/deepcell_output' - -all_fovs = list(cell_table['fov'].unique()) -fovs = random.sample(all_fovs, 20) -cell_table_subset = cell_table[cell_table.fov.isin(fovs)] - -cohort_cluster_plot( - fovs=fovs, - seg_dir=seg_dir, - save_dir=save_dir, - cell_data=cell_table_subset, - erode=True, - fov_col='fov', - label_col='label', - cluster_col='cell_cluster_broad', - seg_suffix="_whole_cell.tiff", - cmap=color_map, - display_fig=False, -) - - -# Cell identification and classification -cell_table = pd.read_csv('/Volumes/Shared/Noah Greenwald/TONIC_Cohort/analysis_files/cell_table_clusters.csv') -cluster_order = {'Cancer': 0, 'Cancer_EMT': 1, 'Cancer_Other': 2, 'CD4T': 3, 'CD8T': 4, 'Treg': 5, - 'T_Other': 6, 'B': 7, 'NK': 8, 'M1_Mac': 9, 'M2_Mac': 10, 'Mac_Other': 11, - 'Monocyte': 12, 'APC': 13, 'Mast': 14, 'Neutrophil': 15, 'Fibroblast': 16, - 'Stroma': 17, 'Endothelium': 18, 'Other': 19, 'Immune_Other': 20} -cell_table = cell_table.sort_values(by=['cell_cluster'], key=lambda x: x.map(cluster_order)) - -save_dir = '/Volumes/Shared/Noah Greenwald/TONIC_Cohort/supplementary_figs' - -## cell cluster counts -sns.histplot(data=cell_table, x="cell_cluster") -sns.despine() -plt.title("Cell Cluster Counts") -plt.xlabel("Cell Cluster") -plt.xticks(rotation=75) -plt.tight_layout() -plt.savefig(os.path.join(save_dir, "cells_per_cluster.png"), dpi=300) - -## fov cell counts -cluster_counts = np.unique(cell_table.fov, return_counts=True)[1] -plt.figure(figsize=(8, 6)) -g = sns.histplot(data=cluster_counts, kde=True) -sns.despine() -plt.title("Histogram of Cell Counts per Image") -plt.xlabel("Number of Cells in an Image") -plt.tight_layout() -plt.savefig(os.path.join(save_dir, "cells_per_fov.png"), dpi=300) - -## cell type composition by tissue location of met -meta_data = pd.read_csv('/Volumes/Shared/Noah Greenwald/TONIC_Cohort/analysis_files/harmonized_metadata.csv') -meta_data = meta_data[['fov', 'Patient_ID', 'Timepoint', 'Localization']] - -all_data = cell_table.merge(meta_data, on=['fov'], how='left') -base_data = all_data[all_data.Timepoint == 'baseline'] - -all_locals = np.unique(base_data.Localization) -dfs = [] -for region in all_locals: - localization_data = base_data[base_data.Localization == region] - - df = localization_data.groupby("cell_cluster_broad").count().reset_index() - df = df.set_index('cell_cluster_broad').transpose() - sub_df = df.iloc[:1].reset_index(drop=True) - sub_df.insert(0, "Localization", [region]) - sub_df['Localization'] = sub_df['Localization'].map(str) - sub_df = sub_df.set_index('Localization') - - dfs.append(sub_df) -prop_data = pd.concat(dfs).transform(func=lambda row: row / row.sum(), axis=1) - -color_map = {'cell_cluster_broad': ['Cancer', 'Stroma', 'Mono_Mac', 'T','Other', 'Granulocyte', 'NK', 'B'], - 'color': ['dimgrey', 'darksalmon', 'red', 'navajowhite', 'yellowgreen', 'aqua', 'dodgerblue', 'darkviolet']} -prop_data = prop_data[color_map['cell_cluster_broad']] - -colors = color_map['color'] -prop_data.plot(kind='bar', stacked=True, color=colors) -plt.ticklabel_format(style='plain', useOffset=False, axis='y') -plt.gca().set_ylabel("Cell 
Proportions") -plt.gca().set_xlabel("Tissue Location") -plt.xticks(rotation=30) -plt.title("Cell Type Composition by Tissue Location") -plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.) -plt.tight_layout() -plt.savefig(os.path.join(save_dir, "cell_props_by_tissue_loc.png"), dpi=300) - -## colored cell cluster masks from random subset of 20 FOVs -random.seed(13) -seg_dir = '/Volumes/Shared/Noah Greenwald/TONIC_Cohort/segmentation_data/deepcell_output' - -all_fovs = list(cell_table['fov'].unique()) -fovs = random.sample(all_fovs, 20) -cell_table_subset = cell_table[cell_table.fov.isin(fovs)] - -cohort_cluster_plot( - fovs=fovs, - seg_dir=seg_dir, - save_dir=save_dir, - cell_data=cell_table_subset, - erode=True, - fov_col='fov', - label_col='label', - cluster_col='cell_cluster_broad', - seg_suffix="_whole_cell.tiff", - cmap=color_map, - display_fig=False, -) - -# Functional marker thresholding -cell_table = pd.read_csv( - os.path.join(ANALYSIS_DIR, "combined_cell_table_normalized_cell_labels_updated.csv") -) -functional_marker_viz_dir = os.path.join(SUPPLEMENTARY_FIG_DIR, "functional_marker_dist_thresholds") -if not os.path.exists(functional_marker_viz_dir): - os.makedirs(functional_marker_viz_dir) - -marker_info = { - "Ki67": { - "populations": ["Cancer", "Mast"], - "threshold": 0.002, - "x_range": (0, 0.012), - "x_ticks": np.array([0, 0.004, 0.008, 0.012]), - "x_tick_labels": np.array([0, 0.004, 0.008, 0.012]), - }, - "CD38": { - "populations": ["Endothelium", "Cancer_EMT"], - "threshold": 0.004, - "x_range": (0, 0.02), - "x_ticks": np.array([0, 0.005, 0.01, 0.015, 0.02]), - "x_tick_labels": np.array([0, 0.005, 0.01, 0.015, 0.02]), - }, - "CD45RB": { - "populations": ["CD4T", "Stroma"], - "threshold": 0.001, - "x_range": (0, 0.015), - "x_ticks": np.array([0, 0.005, 0.010, 0.015]), - "x_tick_labels": np.array([0, 0.005, 0.010, 0.015]) - }, - "CD45RO": { - "populations": ["CD4T", "Fibroblast"], - "threshold": 0.002, - "x_range": (0, 0.02), - "x_ticks": np.array([0, 0.005, 0.01, 0.015, 0.02]), - "x_tick_labels": np.array([0, 0.005, 0.01, 0.015, 0.02]) - }, - "CD57": { - "populations": ["CD8T", "B"], - "threshold": 0.002, - "x_range": (0, 0.006), - "x_ticks": np.array([0, 0.002, 0.004, 0.006]), - "x_tick_labels": np.array([0, 0.002, 0.004, 0.006]) - }, - "CD69": { - "populations": ["Treg", "Cancer"], - "threshold": 0.002, - "x_range": (0, 0.008), - "x_ticks": np.array([0, 0.002, 0.004, 0.006, 0.008]), - "x_tick_labels": np.array([0, 0.002, 0.004, 0.006, 0.008]) - }, - "GLUT1": { - "populations": ["Cancer_EMT", "M2_Mac"], - "threshold": 0.002, - "x_range": (0, 0.02), - "x_ticks": np.array([0, 0.005, 0.01, 0.015, 0.02]), - "x_tick_labels": np.array([0, 0.005, 0.01, 0.015, 0.02]) - }, - "IDO": { - "populations": ["APC", "M1_Mac"], - "threshold": 0.001, - "x_range": (0, 0.003), - "x_ticks": np.array([0, 0.001, 0.002, 0.003]), - "x_tick_labels": np.array([0, 0.001, 0.002, 0.003]) - }, - "PD1": { - "populations": ["CD8T", "Stroma"], - "threshold": 0.0005, - "x_range": (0, 0.002), - "x_ticks": np.array([0, 0.0005, 0.001, 0.0015, 0.002]), - "x_tick_labels": np.array([0, 0.0005, 0.001, 0.0015, 0.002]) - }, - "PDL1": { - "populations": ["Cancer", "Stroma"], - "threshold": 0.001, - "x_range": (0, 0.003), - "x_ticks": np.array([0, 0.001, 0.002, 0.003]), - "x_tick_labels": np.array([0, 0.001, 0.002, 0.003]), - }, - "HLA1": { - "populations": ["APC", "Stroma"], - "threshold": 0.001, - "x_range": (0, 0.025), - "x_ticks": np.array([0, 0.0125, 0.025]), - "x_tick_labels": np.array([0, 0.0125, 0.025]) 
- }, - "HLADR": { - "populations": ["APC", "Neutrophil"], - "threshold": 0.001, - "x_range": (0, 0.025), - "x_ticks": np.array([0, 0.0125, 0.025]), - "x_tick_labels": np.array([0, 0.0125, 0.025]) - }, - "TBET": { - "populations": ["NK", "B"], - "threshold": 0.0015, - "x_range": (0, 0.0045), - "x_ticks": np.array([0, 0.0015, 0.003, 0.0045]), - "x_tick_labels": np.array([0, 0.0015, 0.003, 0.0045]) - }, - "TCF1": { - "populations": ["CD4T", "M1_Mac"], - "threshold": 0.001, - "x_range": (0, 0.003), - "x_ticks": np.array([0, 0.001, 0.002, 0.003]), - "x_tick_labels": np.array([0, 0.001, 0.002, 0.003]) - }, - "TIM3": { - "populations": ["Monocyte", "Endothelium"], - "threshold": 0.001, - "x_range": (0, 0.004), - "x_ticks": np.array([0, 0.001, 0.002, 0.003, 0.004]), - "x_tick_labels": np.array([0, 0.001, 0.002, 0.003, 0.004]) - }, - "Vim": { - "populations": ["Endothelium", "B"], - "threshold": 0.002, - "x_range": (0, 0.06), - "x_ticks": np.array([0, 0.02, 0.04, 0.06]), - "x_tick_labels": np.array([0, 0.02, 0.04, 0.06]) - }, - "Fe": { - "populations": ["Fibroblast", "Cancer"], - "threshold": 0.1, - "x_range": (0, 0.3), - "x_ticks": np.array([0, 0.1, 0.2, 0.3]), - "x_tick_labels": np.array([0, 0.1, 0.2, 0.3]), - } -} -supplementary_plot_helpers.functional_marker_thresholding( - cell_table, functional_marker_viz_dir, marker_info=marker_info, - figsize=(20, 40) -) - - -# Feature extraction +# # Panel validation +# panel_validation_viz_dir = os.path.join(SUPPLEMENTARY_FIG_DIR, "panel_validation") +# if not os.path.exists(panel_validation_viz_dir): +# os.makedirs(panel_validation_viz_dir) + +# controls_dir = "/Volumes/Shared/Noah Greenwald/TONIC_Cohort/image_data/controls" +# controls_fov = "TONIC_TMA1_colon_bottom" +# supplementary_plot_helpers.validate_panel( +# controls_dir, controls_fov, panel_validation_viz_dir, font_size=180 +# ) + +# samples_dir = "/Volumes/Shared/Noah Greenwald/TONIC_Cohort/image_data/samples" +# samples_fov = "TONIC_TMA2_R5C4" +# samples_channels = sorted(io_utils.remove_file_extensions( +# io_utils.list_files(os.path.join(samples_dir, samples_fov), substrs=".tiff") +# )) +# exclude_chans = ["CD11c_nuc_exclude", "CK17_smoothed", "ECAD_smoothed", "FOXP3_nuc_include", +# "chan_39", "chan_45", "chan_48", "chan_115", "chan_141"] +# for ec in exclude_chans: +# if ec in samples_channels: +# samples_channels.remove(ec) +# supplementary_plot_helpers.validate_panel( +# samples_dir, samples_fov, panel_validation_viz_dir, channels=samples_channels, font_size=320 +# ) + + +# # ROI selection + + +# # QC +# qc_metrics = ["Non-zero mean intensity"] +# channel_exclude = ["chan_39", "chan_45", "CD11c_nuc_exclude", "CD11c_nuc_exclude_update", +# "FOXP3_nuc_include", "FOXP3_nuc_include_update", "CK17_smoothed", +# "FOXP3_nuc_exclude_update", "chan_48", "chan_141", "chan_115", "LAG3"] + +# ## FOV spatial location +# cohort_path = "/Volumes/Shared/Noah Greenwald/TONIC_Cohort/image_data/samples" +# qc_tma_metrics_dir = "/Volumes/Shared/Noah Greenwald/TONIC_Acquisition/qc_metrics/qc_tma_metrics" +# if not os.path.exists(qc_tma_metrics_dir): +# os.makedirs(qc_tma_metrics_dir) + +# fovs = io_utils.list_folders(cohort_path) +# tmas = list(set([fov.split('_R')[0] for fov in fovs])) + +# qc_tmas = qc_comp.QCTMA( +# qc_metrics=qc_metrics, +# cohort_path=cohort_path, +# metrics_dir=qc_tma_metrics_dir, +# ) + +# qc_tmas.compute_qc_tma_metrics(tmas=tmas) +# qc_tmas.qc_tma_metrics_zscore(tmas=tmas, channel_exclude=channel_exclude) +# qc_metrics_plots.qc_tmas_metrics_plot(qc_tmas=qc_tmas, tmas=tmas, 
save_figure=True, dpi=300) + +# ## longitudinal controls +# control_path = "/Volumes/Shared/Noah Greenwald/TONIC_Cohort/image_data/controls" +# qc_control_metrics_dir = "/Volumes/Shared/Noah Greenwald/TONIC_Acquisition/qc_metrics/qc_longitudinal_control" +# if not os.path.exists(qc_control_metrics_dir): +# os.makedirs(qc_control_metrics_dir) + +# folders = io_utils.list_folders(control_path, "TMA3_") +# control_substrs = [name.split("_")[2] + '_' + name.split("_")[3] if len(name.split("_")) == 4 +# else name.split("_")[2] + '_' + name.split("_")[3]+'_' + name.split("_")[4] +# for name in folders] + +# all_folders = io_utils.list_folders(control_path) +# for i, control in enumerate(control_substrs): +# control_sample_name = control +# print(control) +# if control == 'tonsil_bottom': +# fovs = [folder for folder in all_folders if control in folder and len(folder) <= 25] +# else: +# fovs = [folder for folder in all_folders if control in folder] + +# qc_control = qc_comp.QCControlMetrics( +# qc_metrics=qc_metrics, +# cohort_path=control_path, +# metrics_dir=qc_control_metrics_dir, +# ) + +# qc_control.compute_control_qc_metrics( +# control_sample_name=control_sample_name, +# fovs=fovs, +# channel_exclude=channel_exclude, +# channel_include=None, +# ) + +# qc_metrics_plots.longitudinal_control_heatmap( +# qc_control=qc_control, control_sample_name=control_sample_name, save_figure=True, dpi=300 +# ) + +# dfs = [] +# for control in control_substrs: +# df = pd.read_csv(os.path.join(qc_control_metrics_dir, f"{control}_combined_nonzero_mean_stats.csv")) +# df['fov'] = [i.replace('_' + control, '') for i in list(df['fov'])] +# log2_norm_df: pd.DataFrame = df.pivot( +# index="channel", columns="fov", values="Non-zero mean intensity" +# ).transform(func=lambda row: np.log2(row / row.mean()), axis=1) +# if control != 'tonsil_bottom_duplicate1': +# dup_col = [col for col in log2_norm_df.columns if 'duplicate1' in col] +# log2_norm_df = log2_norm_df.drop(columns=dup_col) if dup_col else log2_norm_df + +# mean_t_df: pd.DataFrame = ( +# log2_norm_df.mean(axis=0) +# .to_frame(name="mean") +# .transpose() +# .sort_values(by="mean", axis=1) +# ) +# transformed_df: pd.DataFrame = pd.concat( +# objs=[log2_norm_df, mean_t_df] +# ).sort_values(by="mean", axis=1, inplace=False) +# transformed_df.rename_axis("channel", axis=0, inplace=True) +# transformed_df.rename_axis("fov", axis=1, inplace=True) + +# dfs.append(transformed_df) +# all_data = pd.concat(dfs).replace([np.inf, -np.inf], 0, inplace=True) +# all_data = all_data.groupby(['channel']).mean() +# all_data = all_data.sort_values(by="mean", axis=1, inplace=False).round(2) + + +# fig = plt.figure(figsize=(12,12), dpi=300) +# fig.set_layout_engine(layout="constrained") +# gs = gridspec.GridSpec(nrows=2, ncols=1, figure=fig, height_ratios=[len(all_data.index) - 1, 1]) +# _norm = Normalize(vmin=-1, vmax=1) +# _cmap = sns.color_palette("vlag", as_cmap=True) +# fig.suptitle(f"Average per TMA - QC: Non-zero Mean Intensity ") + +# annotation_kws = { +# "horizontalalignment": "center", +# "verticalalignment": "center", +# "fontsize": 8, +# } + +# ax_heatmap = fig.add_subplot(gs[0, 0]) +# sns.heatmap( +# data=all_data[~all_data.index.isin(["mean"])], +# ax=ax_heatmap, +# linewidths=1, +# linecolor="black", +# cbar_kws={"shrink": 0.5}, +# annot=True, +# annot_kws=annotation_kws, +# xticklabels=False, +# norm=_norm, +# cmap=_cmap, +# ) + +# ax_heatmap.collections[0].colorbar.ax.set_title(r"$\log_2(QC)$") +# ax_heatmap.set_yticks( +# ticks=ax_heatmap.get_yticks(), +# 
labels=ax_heatmap.get_yticklabels(), +# rotation=0, +# ) +# ax_heatmap.set_xlabel(None) + +# ax_avg = fig.add_subplot(gs[1, 0]) +# sns.heatmap( +# data=all_data[all_data.index.isin(["mean"])], +# ax=ax_avg, +# linewidths=1, +# linecolor="black", +# annot=True, +# annot_kws=annotation_kws, +# fmt=".2f", +# cmap=ListedColormap(["white"]), +# cbar=False, +# ) +# ax_avg.set_yticks( +# ticks=ax_avg.get_yticks(), +# labels=["Mean"], +# rotation=0, +# ) +# ax_avg.set_xticks( +# ticks=ax_avg.get_xticks(), +# labels=ax_avg.get_xticklabels(), +# rotation=45, +# ha="right", +# rotation_mode="anchor", +# ) +# ax_heatmap.set_ylabel("Channel") +# ax_avg.set_xlabel("FOV") + +# fig.savefig(fname=os.path.join(qc_control_metrics_dir, "figures/log2_avgs.png"), dpi=300, +# bbox_inches="tight") + + +# # Image processing +# ## show a run with images stitched in acquisition order pre- and post-normalization +# acquisition_order_viz_dir = os.path.join(SUPPLEMENTARY_FIG_DIR, "acquisition_order") +# if not os.path.exists(acquisition_order_viz_dir): +# os.makedirs(acquisition_order_viz_dir) + +# run_name = "2022-01-14_TONIC_TMA2_run1" +# pre_norm_dir = "/Volumes/Shared/Noah Greenwald/TONIC_Acquisition/rosetta" +# post_norm_dir = "/Volumes/Shared/Noah Greenwald/TONIC_Acquisition/normalized" +# save_dir = "/Volumes/Shared/Noah Greenwald/TONIC_Cohort/supplementary_figs" + +# # NOTE: image not scaled up, this happens in Photoshop +# supplementary_plot_helpers.stitch_before_after_norm( +# pre_norm_dir, post_norm_dir, acquisition_order_viz_dir, run_name, +# "H3K9ac", pre_norm_subdir="normalized", padding=0, step=1 +# ) + + +# # Cell identification and classification +# cell_table = pd.read_csv('/Volumes/Shared/Noah Greenwald/TONIC_Cohort/analysis_files/cell_table_clusters.csv') +# cluster_order = {'Cancer': 0, 'Cancer_EMT': 1, 'Cancer_Other': 2, 'CD4T': 3, 'CD8T': 4, 'Treg': 5, +# 'T_Other': 6, 'B': 7, 'NK': 8, 'M1_Mac': 9, 'M2_Mac': 10, 'Mac_Other': 11, +# 'Monocyte': 12, 'APC': 13, 'Mast': 14, 'Neutrophil': 15, 'Fibroblast': 16, +# 'Stroma': 17, 'Endothelium': 18, 'Other': 19, 'Immune_Other': 20} +# cell_table = cell_table.sort_values(by=['cell_cluster'], key=lambda x: x.map(cluster_order)) + +# save_dir = '/Volumes/Shared/Noah Greenwald/TONIC_Cohort/supplementary_figs' + +# ## cell cluster counts +# sns.histplot(data=cell_table, x="cell_cluster") +# sns.despine() +# plt.title("Cell Cluster Counts") +# plt.xlabel("Cell Cluster") +# plt.xticks(rotation=75) +# plt.tight_layout() +# plt.savefig(os.path.join(save_dir, "cells_per_cluster.png"), dpi=300) + +# ## fov cell counts +# cluster_counts = np.unique(cell_table.fov, return_counts=True)[1] +# plt.figure(figsize=(8, 6)) +# g = sns.histplot(data=cluster_counts, kde=True) +# sns.despine() +# plt.title("Histogram of Cell Counts per Image") +# plt.xlabel("Number of Cells in an Image") +# plt.tight_layout() +# plt.savefig(os.path.join(save_dir, "cells_per_fov.png"), dpi=300) + +# ## cell type composition by tissue location of met and timepoint +# meta_data = pd.read_csv('/Volumes/Shared/Noah Greenwald/TONIC_Cohort/analysis_files/harmonized_metadata.csv') +# meta_data = meta_data[['fov', 'Patient_ID', 'Timepoint', 'Localization']] + +# all_data = cell_table.merge(meta_data, on=['fov'], how='left') + +# for metric in ['Localization', 'Timepoint']: +# data = all_data[all_data.Timepoint == 'baseline'] if metric == 'Localization' else all_data + +# groups = np.unique(data.Localization) if metric == 'Localization' else \ +# ['primary', 'baseline', 'post_induction', 'on_nivo'] +# 
dfs = [] +# for group in groups: +# sub_data = data[data[metric] == group] + +# df = sub_data.groupby("cell_cluster_broad").count().reset_index() +# df = df.set_index('cell_cluster_broad').transpose() +# sub_df = df.iloc[:1].reset_index(drop=True) +# sub_df.insert(0, metric, [group]) +# sub_df[metric] = sub_df[metric].map(str) +# sub_df = sub_df.set_index(metric) + +# dfs.append(sub_df) +# prop_data = pd.concat(dfs).transform(func=lambda row: row / row.sum(), axis=1) + +# color_map = {'Cancer': 'dimgrey', 'Stroma': 'darksalmon', 'T': 'navajowhite', +# 'Mono_Mac': 'red', 'B': 'darkviolet', 'Other': 'yellowgreen', +# 'Granulocyte': 'aqua', 'NK': 'dodgerblue'} + +# means = prop_data.mean(axis=0).reset_index() +# means = means.sort_values(by=[0], ascending=False) +# prop_data = prop_data[means.cell_cluster_broad] + +# colors = [color_map[cluster] for cluster in means.cell_cluster_broad] +# prop_data.plot(kind='bar', stacked=True, color=colors) +# sns.despine() +# plt.ticklabel_format(style='plain', useOffset=False, axis='y') +# plt.gca().set_ylabel("Cell Proportions") +# xlabel = "Tissue Location" if metric == 'Localization' else "Timepoint" +# plt.gca().set_xlabel(xlabel) +# plt.xticks(rotation=30) +# plt.title(f"Cell Type Composition by {xlabel}") +# handles, labels = plt.gca().get_legend_handles_labels() +# plt.legend(handles[::-1], labels[::-1], +# bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0) +# plt.tight_layout() +# plot_name = "cell_props_by_tissue_loc.png" if metric == 'Localization' else "cell_props_by_timepoint.png" +# plt.savefig(os.path.join(save_dir, plot_name), dpi=300) + +# ## colored cell cluster masks from random subset of 20 FOVs +# random.seed(13) +# seg_dir = '/Volumes/Shared/Noah Greenwald/TONIC_Cohort/segmentation_data/deepcell_output' + +# all_fovs = list(cell_table['fov'].unique()) +# fovs = random.sample(all_fovs, 20) +# cell_table_subset = cell_table[cell_table.fov.isin(fovs)] + +# cohort_cluster_plot( +# fovs=fovs, +# seg_dir=seg_dir, +# save_dir=save_dir, +# cell_data=cell_table_subset, +# erode=True, +# fov_col='fov', +# label_col='label', +# cluster_col='cell_cluster_broad', +# seg_suffix="_whole_cell.tiff", +# cmap=color_map, +# display_fig=False, +# ) + + +# # Cell identification and classification +# cell_table = pd.read_csv('/Volumes/Shared/Noah Greenwald/TONIC_Cohort/analysis_files/cell_table_clusters.csv') +# cluster_order = {'Cancer': 0, 'Cancer_EMT': 1, 'Cancer_Other': 2, 'CD4T': 3, 'CD8T': 4, 'Treg': 5, +# 'T_Other': 6, 'B': 7, 'NK': 8, 'M1_Mac': 9, 'M2_Mac': 10, 'Mac_Other': 11, +# 'Monocyte': 12, 'APC': 13, 'Mast': 14, 'Neutrophil': 15, 'Fibroblast': 16, +# 'Stroma': 17, 'Endothelium': 18, 'Other': 19, 'Immune_Other': 20} +# cell_table = cell_table.sort_values(by=['cell_cluster'], key=lambda x: x.map(cluster_order)) + +# save_dir = '/Volumes/Shared/Noah Greenwald/TONIC_Cohort/supplementary_figs' + +# ## cell cluster counts +# sns.histplot(data=cell_table, x="cell_cluster") +# sns.despine() +# plt.title("Cell Cluster Counts") +# plt.xlabel("Cell Cluster") +# plt.xticks(rotation=75) +# plt.tight_layout() +# plt.savefig(os.path.join(save_dir, "cells_per_cluster.png"), dpi=300) + +# ## fov cell counts +# cluster_counts = np.unique(cell_table.fov, return_counts=True)[1] +# plt.figure(figsize=(8, 6)) +# g = sns.histplot(data=cluster_counts, kde=True) +# sns.despine() +# plt.title("Histogram of Cell Counts per Image") +# plt.xlabel("Number of Cells in an Image") +# plt.tight_layout() +# plt.savefig(os.path.join(save_dir, "cells_per_fov.png"), dpi=300) + +# 
## cell type composition by tissue location of met +# meta_data = pd.read_csv('/Volumes/Shared/Noah Greenwald/TONIC_Cohort/analysis_files/harmonized_metadata.csv') +# meta_data = meta_data[['fov', 'Patient_ID', 'Timepoint', 'Localization']] + +# all_data = cell_table.merge(meta_data, on=['fov'], how='left') +# base_data = all_data[all_data.Timepoint == 'baseline'] + +# all_locals = np.unique(base_data.Localization) +# dfs = [] +# for region in all_locals: +# localization_data = base_data[base_data.Localization == region] + +# df = localization_data.groupby("cell_cluster_broad").count().reset_index() +# df = df.set_index('cell_cluster_broad').transpose() +# sub_df = df.iloc[:1].reset_index(drop=True) +# sub_df.insert(0, "Localization", [region]) +# sub_df['Localization'] = sub_df['Localization'].map(str) +# sub_df = sub_df.set_index('Localization') + +# dfs.append(sub_df) +# prop_data = pd.concat(dfs).transform(func=lambda row: row / row.sum(), axis=1) + +# color_map = {'cell_cluster_broad': ['Cancer', 'Stroma', 'Mono_Mac', 'T','Other', 'Granulocyte', 'NK', 'B'], +# 'color': ['dimgrey', 'darksalmon', 'red', 'navajowhite', 'yellowgreen', 'aqua', 'dodgerblue', 'darkviolet']} +# prop_data = prop_data[color_map['cell_cluster_broad']] + +# colors = color_map['color'] +# prop_data.plot(kind='bar', stacked=True, color=colors) +# plt.ticklabel_format(style='plain', useOffset=False, axis='y') +# plt.gca().set_ylabel("Cell Proportions") +# plt.gca().set_xlabel("Tissue Location") +# plt.xticks(rotation=30) +# plt.title("Cell Type Composition by Tissue Location") +# plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.) +# plt.tight_layout() +# plt.savefig(os.path.join(save_dir, "cell_props_by_tissue_loc.png"), dpi=300) + +# ## colored cell cluster masks from random subset of 20 FOVs +# random.seed(13) +# seg_dir = '/Volumes/Shared/Noah Greenwald/TONIC_Cohort/segmentation_data/deepcell_output' + +# all_fovs = list(cell_table['fov'].unique()) +# fovs = random.sample(all_fovs, 20) +# cell_table_subset = cell_table[cell_table.fov.isin(fovs)] + +# cohort_cluster_plot( +# fovs=fovs, +# seg_dir=seg_dir, +# save_dir=save_dir, +# cell_data=cell_table_subset, +# erode=True, +# fov_col='fov', +# label_col='label', +# cluster_col='cell_cluster_broad', +# seg_suffix="_whole_cell.tiff", +# cmap=color_map, +# display_fig=False, +# ) + +# # Functional marker thresholding +# cell_table = pd.read_csv( +# os.path.join(ANALYSIS_DIR, "combined_cell_table_normalized_cell_labels_updated.csv") +# ) +# functional_marker_viz_dir = os.path.join(SUPPLEMENTARY_FIG_DIR, "functional_marker_dist_thresholds") +# if not os.path.exists(functional_marker_viz_dir): +# os.makedirs(functional_marker_viz_dir) + +# marker_info = { +# "Ki67": { +# "populations": ["Cancer", "Mast"], +# "threshold": 0.002, +# "x_range": (0, 0.012), +# "x_ticks": np.array([0, 0.004, 0.008, 0.012]), +# "x_tick_labels": np.array([0, 0.004, 0.008, 0.012]), +# }, +# "CD38": { +# "populations": ["Endothelium", "Cancer_EMT"], +# "threshold": 0.004, +# "x_range": (0, 0.02), +# "x_ticks": np.array([0, 0.005, 0.01, 0.015, 0.02]), +# "x_tick_labels": np.array([0, 0.005, 0.01, 0.015, 0.02]), +# }, +# "CD45RB": { +# "populations": ["CD4T", "Stroma"], +# "threshold": 0.001, +# "x_range": (0, 0.015), +# "x_ticks": np.array([0, 0.005, 0.010, 0.015]), +# "x_tick_labels": np.array([0, 0.005, 0.010, 0.015]) +# }, +# "CD45RO": { +# "populations": ["CD4T", "Fibroblast"], +# "threshold": 0.002, +# "x_range": (0, 0.02), +# "x_ticks": np.array([0, 0.005, 0.01, 0.015, 0.02]), +# 
"x_tick_labels": np.array([0, 0.005, 0.01, 0.015, 0.02]) +# }, +# "CD57": { +# "populations": ["CD8T", "B"], +# "threshold": 0.002, +# "x_range": (0, 0.006), +# "x_ticks": np.array([0, 0.002, 0.004, 0.006]), +# "x_tick_labels": np.array([0, 0.002, 0.004, 0.006]) +# }, +# "CD69": { +# "populations": ["Treg", "Cancer"], +# "threshold": 0.002, +# "x_range": (0, 0.008), +# "x_ticks": np.array([0, 0.002, 0.004, 0.006, 0.008]), +# "x_tick_labels": np.array([0, 0.002, 0.004, 0.006, 0.008]) +# }, +# "GLUT1": { +# "populations": ["Cancer_EMT", "M2_Mac"], +# "threshold": 0.002, +# "x_range": (0, 0.02), +# "x_ticks": np.array([0, 0.005, 0.01, 0.015, 0.02]), +# "x_tick_labels": np.array([0, 0.005, 0.01, 0.015, 0.02]) +# }, +# "IDO": { +# "populations": ["APC", "M1_Mac"], +# "threshold": 0.001, +# "x_range": (0, 0.003), +# "x_ticks": np.array([0, 0.001, 0.002, 0.003]), +# "x_tick_labels": np.array([0, 0.001, 0.002, 0.003]) +# }, +# "PD1": { +# "populations": ["CD8T", "Stroma"], +# "threshold": 0.0005, +# "x_range": (0, 0.002), +# "x_ticks": np.array([0, 0.0005, 0.001, 0.0015, 0.002]), +# "x_tick_labels": np.array([0, 0.0005, 0.001, 0.0015, 0.002]) +# }, +# "PDL1": { +# "populations": ["Cancer", "Stroma"], +# "threshold": 0.001, +# "x_range": (0, 0.003), +# "x_ticks": np.array([0, 0.001, 0.002, 0.003]), +# "x_tick_labels": np.array([0, 0.001, 0.002, 0.003]), +# }, +# "HLA1": { +# "populations": ["APC", "Stroma"], +# "threshold": 0.001, +# "x_range": (0, 0.025), +# "x_ticks": np.array([0, 0.0125, 0.025]), +# "x_tick_labels": np.array([0, 0.0125, 0.025]) +# }, +# "HLADR": { +# "populations": ["APC", "Neutrophil"], +# "threshold": 0.001, +# "x_range": (0, 0.025), +# "x_ticks": np.array([0, 0.0125, 0.025]), +# "x_tick_labels": np.array([0, 0.0125, 0.025]) +# }, +# "TBET": { +# "populations": ["NK", "B"], +# "threshold": 0.0015, +# "x_range": (0, 0.0045), +# "x_ticks": np.array([0, 0.0015, 0.003, 0.0045]), +# "x_tick_labels": np.array([0, 0.0015, 0.003, 0.0045]) +# }, +# "TCF1": { +# "populations": ["CD4T", "M1_Mac"], +# "threshold": 0.001, +# "x_range": (0, 0.003), +# "x_ticks": np.array([0, 0.001, 0.002, 0.003]), +# "x_tick_labels": np.array([0, 0.001, 0.002, 0.003]) +# }, +# "TIM3": { +# "populations": ["Monocyte", "Endothelium"], +# "threshold": 0.001, +# "x_range": (0, 0.004), +# "x_ticks": np.array([0, 0.001, 0.002, 0.003, 0.004]), +# "x_tick_labels": np.array([0, 0.001, 0.002, 0.003, 0.004]) +# }, +# "Vim": { +# "populations": ["Endothelium", "B"], +# "threshold": 0.002, +# "x_range": (0, 0.06), +# "x_ticks": np.array([0, 0.02, 0.04, 0.06]), +# "x_tick_labels": np.array([0, 0.02, 0.04, 0.06]) +# }, +# "Fe": { +# "populations": ["Fibroblast", "Cancer"], +# "threshold": 0.1, +# "x_range": (0, 0.3), +# "x_ticks": np.array([0, 0.1, 0.2, 0.3]), +# "x_tick_labels": np.array([0, 0.1, 0.2, 0.3]), +# } +# } +# supplementary_plot_helpers.functional_marker_thresholding( +# cell_table, functional_marker_viz_dir, marker_info=marker_info, +# figsize=(20, 40) +# ) + + +# # Feature extraction # Occupancy statistics @@ -593,26 +593,35 @@ index=False ) - # massive GridSearch -total_occupancy_stats_df = pd.DataFrame() -for tiles_per_row_col in [4, 8, 16]: - for positive_threshold in [5, 10, 15, 20]: - occupancy_stats = supplementary_plot_helpers.compute_occupancy_statistics( - cell_table, pop_subset=["CD4T", "CD8T", "Treg", "T_Other"], - tiles_per_row_col=tiles_per_row_col, positive_threshold=positive_threshold - ) - occupancy_stats_df = pd.DataFrame( - { - "fov": list(occupancy_stats.keys()), - 
"percent_positive_tiles": list(occupancy_stats.values()) - } - ) - occupancy_stats_df["num_tiles"] = tiles_per_row_col ** 2 - occupancy_stats_df["positive_threshold"] = positive_threshold - - total_occupancy_stats_df = pd.concat([total_occupancy_stats_df, occupancy_stats_df]) - -total_occupancy_stats_df.to_csv( - os.path.join(occupancy_stats_viz_dir, "occupancy_stats_trials_t_cells.csv"), index=False +if os.path.exists(os.path.join(occupancy_stats_viz_dir, "occupancy_stats_trials_t_cells.csv")): + total_occupancy_stats_df = pd.read_csv( + os.path.join(occupancy_stats_viz_dir, "occupancy_stats_trials_t_cells.csv") + ) +else: + total_occupancy_stats_df = pd.DataFrame() + for tiles_per_row_col in [4, 8, 16]: + for positive_threshold in [5, 10, 15, 20]: + occupancy_stats = supplementary_plot_helpers.compute_occupancy_statistics( + cell_table, pop_subset=["CD4T", "CD8T", "Treg", "T_Other"], + tiles_per_row_col=tiles_per_row_col, positive_threshold=positive_threshold + ) + occupancy_stats_df = pd.DataFrame( + { + "fov": list(occupancy_stats.keys()), + "percent_positive_tiles": list(occupancy_stats.values()) + } + ) + occupancy_stats_df["num_tiles"] = tiles_per_row_col ** 2 + occupancy_stats_df["positive_threshold"] = positive_threshold + + total_occupancy_stats_df = pd.concat([total_occupancy_stats_df, occupancy_stats_df]) + + total_occupancy_stats_df.to_csv( + os.path.join(occupancy_stats_viz_dir, "occupancy_stats_trials_t_cells.csv"), index=False + ) + +# visualize histograms for each trial in the occupancy stats table +supplementary_plot_helpers.visualize_occupancy_statistics( + total_occupancy_stats_df, occupancy_stats_viz_dir, pop_subset=["T cells"], figsize=(20, 40) ) From e9aaa0c2c208398ed8af7fae63d2ad54145f4415 Mon Sep 17 00:00:00 2001 From: alex-l-kong Date: Thu, 9 May 2024 15:16:34 -0700 Subject: [PATCH 11/14] Update to explicitly pass max_size_image param --- python_files/generate_supplementary_plots.py | 10 ++++++---- python_files/supplementary_plot_helpers.py | 7 +++++-- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/python_files/generate_supplementary_plots.py b/python_files/generate_supplementary_plots.py index fdcf24a..b0e1af3 100644 --- a/python_files/generate_supplementary_plots.py +++ b/python_files/generate_supplementary_plots.py @@ -594,9 +594,9 @@ ) # massive GridSearch -if os.path.exists(os.path.join(occupancy_stats_viz_dir, "occupancy_stats_trials_t_cells.csv")): +if os.path.exists(os.path.join(occupancy_stats_viz_dir, "occupancy_stats_trials_t_cells_tile_size_update.csv")): total_occupancy_stats_df = pd.read_csv( - os.path.join(occupancy_stats_viz_dir, "occupancy_stats_trials_t_cells.csv") + os.path.join(occupancy_stats_viz_dir, "occupancy_stats_trials_t_cells_tile_size_update.csv") ) else: total_occupancy_stats_df = pd.DataFrame() @@ -604,7 +604,9 @@ for positive_threshold in [5, 10, 15, 20]: occupancy_stats = supplementary_plot_helpers.compute_occupancy_statistics( cell_table, pop_subset=["CD4T", "CD8T", "Treg", "T_Other"], - tiles_per_row_col=tiles_per_row_col, positive_threshold=positive_threshold + tiles_per_row_col=tiles_per_row_col, + max_image_size=cell_table["fov_pixel_size"].max(), + positive_threshold=positive_threshold ) occupancy_stats_df = pd.DataFrame( { @@ -618,7 +620,7 @@ total_occupancy_stats_df = pd.concat([total_occupancy_stats_df, occupancy_stats_df]) total_occupancy_stats_df.to_csv( - os.path.join(occupancy_stats_viz_dir, "occupancy_stats_trials_t_cells.csv"), index=False + os.path.join(occupancy_stats_viz_dir, 
"occupancy_stats_trials_t_cells_tile_size_update.csv"), index=False ) # visualize histograms for each trial in the occupancy stats table diff --git a/python_files/supplementary_plot_helpers.py b/python_files/supplementary_plot_helpers.py index d02f78e..f79df4e 100644 --- a/python_files/supplementary_plot_helpers.py +++ b/python_files/supplementary_plot_helpers.py @@ -414,7 +414,7 @@ def stitch_before_after_norm( def compute_occupancy_statistics( cell_table: pd.DataFrame, pop_col: str = "cell_cluster", pop_subset: Optional[List[str]] = None, tiles_per_row_col: int = 8, - positive_threshold: int = 20 + max_image_size: int = 2048, positive_threshold: int = 20 ): """Compute the occupancy statistics over a cohort. @@ -428,6 +428,9 @@ def compute_occupancy_statistics( Which populations, if any, to subset on. If None, use all populations tiles_per_row_col (int): The row/col dims of tiles to define over each image + max_image_size (int): + Maximum size of an image passed in, assuming square images + NOTE: all images should have an image size that is a factor of max_image_size positive_threshold (int): The cell count in a tile required for positivity @@ -449,7 +452,7 @@ def compute_occupancy_statistics( cell_table = cell_table[cell_table[pop_col].isin(pop_subset)].copy() # define the tile size in pixels for each FOV - cell_table["tile_size"] = cell_table["fov_pixel_size"] // tiles_per_row_col + cell_table["tile_size"] = max_image_size // tiles_per_row_col # define the tile for each cell cell_table["tile_row"] = (cell_table["centroid-1"] // cell_table["tile_size"]).astype(int) From 9128e2c5bd07896176302d0efe40e58b234a12b9 Mon Sep 17 00:00:00 2001 From: alex-l-kong Date: Thu, 16 May 2024 10:58:49 -0700 Subject: [PATCH 12/14] Update occupancy stat workflow --- python_files/generate_supplementary_plots.py | 45 ++-- python_files/supplementary_plot_helpers.py | 232 +++++++++++++------ 2 files changed, 182 insertions(+), 95 deletions(-) diff --git a/python_files/generate_supplementary_plots.py b/python_files/generate_supplementary_plots.py index b0e1af3..58d4006 100644 --- a/python_files/generate_supplementary_plots.py +++ b/python_files/generate_supplementary_plots.py @@ -558,11 +558,6 @@ # Occupancy statistics -# TODO: make a constant in supplementary_plot_helpers -cell_table = pd.read_csv( - os.path.join(ANALYSIS_DIR, "combined_cell_table_normalized_cell_labels_updated.csv") -) - occupancy_stats_viz_dir = os.path.join(SUPPLEMENTARY_FIG_DIR, "occupancy_stats") if not os.path.exists(occupancy_stats_viz_dir): os.makedirs(occupancy_stats_viz_dir) @@ -574,6 +569,7 @@ os.path.join(occupancy_stats_viz_dir, "cell_table_with_pixel_size.csv") ) else: + # TODO: make constant earlier in this script cell_table = pd.read_csv( os.path.join(ANALYSIS_DIR, "combined_cell_table_normalized_cell_labels_updated.csv") ) @@ -594,36 +590,41 @@ ) # massive GridSearch -if os.path.exists(os.path.join(occupancy_stats_viz_dir, "occupancy_stats_trials_t_cells_tile_size_update.csv")): +if os.path.exists(os.path.join(occupancy_stats_viz_dir, "occupancy_stats_trials_cell_cluster_broad.csv")): total_occupancy_stats_df = pd.read_csv( - os.path.join(occupancy_stats_viz_dir, "occupancy_stats_trials_t_cells_tile_size_update.csv") + os.path.join(occupancy_stats_viz_dir, "occupancy_stats_trials_cell_cluster_broad.csv") ) else: total_occupancy_stats_df = pd.DataFrame() + total_occupancy_stats_grouped_df = pd.DataFrame() for tiles_per_row_col in [4, 8, 16]: for positive_threshold in [5, 10, 15, 20]: - occupancy_stats = 
From 9128e2c5bd07896176302d0efe40e58b234a12b9 Mon Sep 17 00:00:00 2001
From: alex-l-kong
Date: Thu, 16 May 2024 10:58:49 -0700
Subject: [PATCH 12/14] Update occupancy stat workflow

---
 python_files/generate_supplementary_plots.py |  45 ++--
 python_files/supplementary_plot_helpers.py   | 232 +++++++++++++------
 2 files changed, 182 insertions(+), 95 deletions(-)

diff --git a/python_files/generate_supplementary_plots.py b/python_files/generate_supplementary_plots.py
index b0e1af3..58d4006 100644
--- a/python_files/generate_supplementary_plots.py
+++ b/python_files/generate_supplementary_plots.py
@@ -558,11 +558,6 @@

 # Occupancy statistics
-# TODO: make a constant in supplementary_plot_helpers
-cell_table = pd.read_csv(
-    os.path.join(ANALYSIS_DIR, "combined_cell_table_normalized_cell_labels_updated.csv")
-)
-
 occupancy_stats_viz_dir = os.path.join(SUPPLEMENTARY_FIG_DIR, "occupancy_stats")
 if not os.path.exists(occupancy_stats_viz_dir):
     os.makedirs(occupancy_stats_viz_dir)
@@ -574,6 +569,7 @@
         os.path.join(occupancy_stats_viz_dir, "cell_table_with_pixel_size.csv")
     )
 else:
+    # TODO: make constant earlier in this script
     cell_table = pd.read_csv(
         os.path.join(ANALYSIS_DIR, "combined_cell_table_normalized_cell_labels_updated.csv")
     )
@@ -594,36 +590,41 @@
 )

 # massive GridSearch
-if os.path.exists(os.path.join(occupancy_stats_viz_dir, "occupancy_stats_trials_t_cells_tile_size_update.csv")):
+if os.path.exists(os.path.join(occupancy_stats_viz_dir, "occupancy_stats_trials_cell_cluster_broad.csv")):
     total_occupancy_stats_df = pd.read_csv(
-        os.path.join(occupancy_stats_viz_dir, "occupancy_stats_trials_t_cells_tile_size_update.csv")
+        os.path.join(occupancy_stats_viz_dir, "occupancy_stats_trials_cell_cluster_broad.csv")
     )
 else:
     total_occupancy_stats_df = pd.DataFrame()
+    total_occupancy_stats_grouped_df = pd.DataFrame()
     for tiles_per_row_col in [4, 8, 16]:
         for positive_threshold in [5, 10, 15, 20]:
-            occupancy_stats = supplementary_plot_helpers.compute_occupancy_statistics(
-                cell_table, pop_subset=["CD4T", "CD8T", "Treg", "T_Other"],
+            occupancy_stats_df, occupancy_stats_grouped_df = supplementary_plot_helpers.compute_occupancy_statistics(
+                cell_table,
                 tiles_per_row_col=tiles_per_row_col,
                 max_image_size=cell_table["fov_pixel_size"].max(),
                 positive_threshold=positive_threshold
             )
-            occupancy_stats_df = pd.DataFrame(
-                {
-                    "fov": list(occupancy_stats.keys()),
-                    "percent_positive_tiles": list(occupancy_stats.values())
-                }
-            )
+            # occupancy_stats_df = pd.DataFrame(
+            #     {
+            #         "fov": list(occupancy_stats.keys()),
+            #         "percent_positive_tiles": list(occupancy_stats.values())
+            #     }
+            # )
+            occupancy_stats_df["num_tiles"] = tiles_per_row_col ** 2
+            occupancy_stats_df["positive_threshold"] = positive_threshold
             occupancy_stats_grouped_df["num_tiles"] = tiles_per_row_col ** 2
             occupancy_stats_grouped_df["positive_threshold"] = positive_threshold

             total_occupancy_stats_df = pd.concat([total_occupancy_stats_df, occupancy_stats_df])
+            total_occupancy_stats_grouped_df = pd.concat([total_occupancy_stats_grouped_df, occupancy_stats_grouped_df])

     total_occupancy_stats_df.to_csv(
-        os.path.join(occupancy_stats_viz_dir, "occupancy_stats_trials_t_cells_tile_size_update.csv"), index=False
+        os.path.join(occupancy_stats_viz_dir, "occupancy_stats_trials_cell_cluster_broad.csv"), index=False
+    )
+    total_occupancy_stats_grouped_df.to_csv(
+        os.path.join(occupancy_stats_viz_dir, "occupancy_stats_trials_cell_cluster_broad_grouped.csv"), index=False
     )

-# visualize histograms for each trial in the occupancy stats table
-supplementary_plot_helpers.visualize_occupancy_statistics(
-    total_occupancy_stats_df, occupancy_stats_viz_dir, pop_subset=["T cells"], figsize=(20, 40)
-)
+# # visualize histograms for each trial in the occupancy stats table
+# supplementary_plot_helpers.visualize_occupancy_statistics(
+#     total_occupancy_stats_df, occupancy_stats_viz_dir, figsize=(20, 40)
+# )

diff --git a/python_files/supplementary_plot_helpers.py b/python_files/supplementary_plot_helpers.py
index f79df4e..5426185 100644
--- a/python_files/supplementary_plot_helpers.py
+++ b/python_files/supplementary_plot_helpers.py
@@ -410,9 +410,35 @@ def stitch_before_after_norm(
     post_norm_tiled.save(post_norm_stitched_path)


+def occupancy_division_factor(occupancy_group: pd.DataFrame,
+                              tiles_per_row_col: int,
+                              max_image_size: int,
+                              cell_table: pd.DataFrame):
+    """Compute the denominator to ensure the proper percentage is computed for occupancy stats
+
+    Args:
+        occupancy_group (pd.DataFrame):
+            The per-group data for one FOV and cell type, as handed over by a groupby apply
+        tiles_per_row_col (int):
+            The row/col dims of tiles to define over each image
+        max_image_size (int):
+            Maximum size of an image passed in, assuming square images
+            NOTE: all images should have an image size that is a factor of max_image_size
+        cell_table (pd.DataFrame):
+            The cell table associated with the cohort
+
+    Returns:
+        float:
+            The factor to divide the occupancy stat positive counts by
+    """
+    fov_name = occupancy_group["fov"].values[0]
+    image_size = cell_table.loc[cell_table["fov"] == fov_name, "fov_pixel_size"].values[0]
+    # the number of tiles a FOV actually contains scales with the squared
+    # (area) ratio of its size to max_image_size, not the linear ratio
+    return (tiles_per_row_col * image_size / max_image_size) ** 2
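A quick numeric check of the factor above, with illustrative sizes rather than cohort values: a 1024 px FOV tiled at 2048 // 8 = 256 px contains 4 x 4 = 16 tiles, which matches the squared linear ratio; an unsquared ratio would wrongly give 32.

max_image_size, tiles_per_row_col, image_size = 2048, 8, 1024
tile_size = max_image_size // tiles_per_row_col
assert (image_size // tile_size) ** 2 == (tiles_per_row_col * image_size / max_image_size) ** 2 == 16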


 # occupancy statistic helpers
 def compute_occupancy_statistics(
-    cell_table: pd.DataFrame, pop_col: str = "cell_cluster",
+    cell_table: pd.DataFrame, pop_col: str = "cell_cluster_broad",
     pop_subset: Optional[List[str]] = None, tiles_per_row_col: int = 8,
     max_image_size: int = 2048, positive_threshold: int = 20
 ):
@@ -448,8 +474,9 @@ def compute_occupancy_statistics(
             specified_populations=pop_subset, valid_populations=cell_table[pop_col].unique()
         )
-
-        cell_table = cell_table[cell_table[pop_col].isin(pop_subset)].copy()
+    # otherwise, use all possible subsets as defined by pop_col
+    else:
+        pop_subset = cell_table[pop_col].unique()

     # define the tile size in pixels for each FOV
     cell_table["tile_size"] = max_image_size // tiles_per_row_col
@@ -458,40 +485,86 @@ def compute_occupancy_statistics(
     cell_table["tile_row"] = (cell_table["centroid-1"] // cell_table["tile_size"]).astype(int)
     cell_table["tile_col"] = (cell_table["centroid-0"] // cell_table["tile_size"]).astype(int)

-    # Group by FOV and tile positions, then count occupancy
+    # Create a DataFrame with all combinations of fov, tile_row, tile_col, and pop_col
+    all_combos = pd.MultiIndex.from_product(
+        [
+            cell_table["fov"].unique(),
+            cell_table["tile_row"].unique(),
+            cell_table["tile_col"].unique(),
+            pop_subset
+        ],
+        names=["fov", "tile_row", "tile_col", pop_col]
+    ).to_frame(index=False)
+
     occupancy_counts: pd.DataFrame = cell_table.groupby(
-        ["fov", "tile_row", "tile_col"]
+        ["fov", "tile_row", "tile_col", pop_col]
     ).size().reset_index(name="occupancy")
+    occupancy_counts = pd.merge(
+        all_combos, occupancy_counts,
+        on=["fov", "tile_row", "tile_col", pop_col],
+        how="left"
+    )
+    occupancy_counts["occupancy"] = occupancy_counts["occupancy"].fillna(0)
+
+    occupancy_counts["is_positive"] = occupancy_counts["occupancy"] > positive_threshold
+    # occupancy_counts_grouped: pd.DataFrame = occupancy_counts.groupby(["fov", pop_col]).apply(
+    #     lambda group: group["is_positive"].sum() / occupancy_division_factor(
+    #         group, tiles_per_row_col, max_image_size, cell_table
+    #     ) * 100
+    # ).reset_index(name="percent_positive")
+    occupancy_counts_grouped: pd.DataFrame = occupancy_counts.groupby(["fov", pop_col]).apply(
+        lambda group: group["is_positive"].sum()
+    ).reset_index(name="total_positive")
+
+    return occupancy_counts, occupancy_counts_grouped

-    # define the occupancy_stats array
-    occupancy_stats = xr.DataArray(
-        np.zeros((len(fov_names), tiles_per_row_col, tiles_per_row_col), dtype=np.int16),
-        coords=[fov_names, np.arange(tiles_per_row_col), np.arange(tiles_per_row_col)],
-        dims=["fov", "x", "y"]
-    )
-
-    # Update the DataArray based on occupancy_counts without explicit Python loops
-    for index, row in occupancy_counts.iterrows():
-        occupancy_stats.loc[row["fov"], row["tile_row"], row["tile_col"]] = row["occupancy"]
-
-    # define the tiles that are positive based on threshold
-    occupancy_stats_positivity: xr.DataArray = occupancy_stats > positive_threshold
-
-    # compute the percentage of positive tiles for each FOV
-    occupancy_stats_sum: xr.DataArray = occupancy_stats_positivity.sum(
-        dim=['x', 'y']
-    ) / (tiles_per_row_col ** 2)
-
-    # convert to dict
-    occupancy_stats_dict: Dict[str, float] = occupancy_stats_sum.to_series().to_dict()
-    occupancy_stats_dict = {fov: percentage for fov, percentage in occupancy_stats_dict.items()}
-
-    return occupancy_stats_dict
+    # # define the occupancy_stats array
+    # occupancy_stats = xr.DataArray(
+    #     np.zeros((len(fov_names), tiles_per_row_col, tiles_per_row_col), dtype=np.int16),
+    #     coords=[fov_names, np.arange(tiles_per_row_col), np.arange(tiles_per_row_col)],
+    #     dims=["fov", "x", "y"]
+    # )

+    # # iterate over each population
+    # for pop in pop_subset:
+    #     cell_table_pop = cell_table[cell_table[pop_col] == pop].copy()
+
+    #     # Group by FOV and tile positions, then count occupancy
+    #     occupancy_counts: pd.DataFrame = cell_table_pop.groupby(
+    #         ["fov", "tile_row", "tile_col"]
+    #     ).size().reset_index(name="occupancy")
+
+    #     # define the occupancy_stats array
+    #     occupancy_stats = xr.DataArray(
+    #         np.zeros((len(fov_names), tiles_per_row_col, tiles_per_row_col), dtype=np.int16),
+    #         coords=[fov_names, np.arange(tiles_per_row_col), np.arange(tiles_per_row_col)],
+    #         dims=["fov", "x", "y"]
+    #     )
+
+    #     # Update the DataArray based on occupancy_counts without explicit Python loops
+    #     for index, row in occupancy_counts.iterrows():
+    #         occupancy_stats.loc[row["fov"], row["tile_row"], row["tile_col"]] = row["occupancy"]
+
+    #     # define the tiles that are positive based on threshold
+    #     occupancy_stats_positivity: xr.DataArray = occupancy_stats > positive_threshold
+
+    #     # compute the percentage of positive tiles for each FOV
+    #     occupancy_stats_sum: xr.DataArray = occupancy_stats_positivity.sum(
+    #         dim=['x', 'y']
+    #     ) / (tiles_per_row_col ** 2)
+
+    #     # convert to dict
+    #     occupancy_stats_dict: Dict[str, float] = occupancy_stats_sum.to_series().to_dict()
+    #     occupancy_stats_dict = {fov: percentage for fov, percentage in occupancy_stats_dict.items()}
+
+    #     return occupancy_stats_dict
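A hypothetical call against a toy table (the FOV, populations, and import path are assumptions for illustration, not cohort data) showing the two frames the rewritten function returns: long-format per-tile counts, and per-FOV, per-population positive-tile totals.

import numpy as np
import pandas as pd
from python_files import supplementary_plot_helpers  # assumed import path

rng = np.random.default_rng(0)
toy = pd.DataFrame({
    "fov": ["fov1"] * 400,
    "centroid-0": rng.uniform(0, 2048, 400),
    "centroid-1": rng.uniform(0, 2048, 400),
    "cell_cluster_broad": rng.choice(["T", "Cancer"], 400),
})
counts, grouped = supplementary_plot_helpers.compute_occupancy_statistics(
    toy, tiles_per_row_col=4, max_image_size=2048, positive_threshold=10
)
print(counts.columns.tolist())   # fov, tile_row, tile_col, cell_cluster_broad, occupancy, is_positive
print(grouped.columns.tolist())  # fov, cell_cluster_broad, total_positive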
 def visualize_occupancy_statistics(
     occupancy_stats_table: pd.DataFrame, save_dir: Union[str, pathlib.Path],
-    pop_subset: Optional[List[str]] = None, figsize: Optional[Tuple[float, float]] = (10, 10)
+    pop_col: str = "cell_cluster_broad", pop_subset: Optional[List[str]] = None,
+    figsize: Optional[Tuple[float, float]] = (10, 10)
 ):
     """Visualize the distribution of the percentage of tiles above a positive threshold.
@@ -503,59 +576,72 @@ def visualize_occupancy_statistics(
             positive tiles for each image of the cohort at different positivity thresholds and
             grid sizes
         save_dir (Union[str, pathlib.Path]):
             Directory to save the visualizations in
+        pop_col (str):
+            Column containing the names of the cell populations
         pop_subset (Optional[List[str]]):
            Which populations, if any, to subset on. If None, use all populations
         figsize (Optional[Tuple[float, float]]):
            The figure size to use for the image.
""" - # generate each unique test pair - tiles_threshold_trials = occupancy_stats_table[ - ["num_tiles", "positive_threshold"] - ].drop_duplicates().reset_index(drop=True) - - # define a separate plot on the same grid for each test pair - fig, axs = plt.subplots( - tiles_threshold_trials.shape[0], 1, figsize=figsize - ) - populations_str = f"{', '.join(pop_subset)} populations" if pop_subset else "all populations" - # fig.suptitle(f"Percentage positive tile distributions for {populations_str}", fontsize=24) - # fig.subplots_adjust(top=1.50) - - # define an index to iterate through the subplots - subplot_i = 0 - - for _, trial in tiles_threshold_trials.iterrows(): - grid_size = int(np.sqrt(trial["num_tiles"])) - positive_threshold = trial["positive_threshold"] - - # subset data obtained just for the specified trial - trial_data = occupancy_stats_table[ - (occupancy_stats_table["num_tiles"] == trial["num_tiles"]) & - (occupancy_stats_table["positive_threshold"] == trial["positive_threshold"]) - ] - - # visualize the distribution using a histogram - positive_tile_percentages = trial_data["percent_positive_tiles"].values - axs[subplot_i].hist( - positive_tile_percentages, - facecolor="g", - bins=20, - alpha=0.75 + if pop_subset is not None: + verify_in_list( + specified_populations=pop_subset, + valid_populations=occupancy_stats_table[pop_col].unique() ) - axs[subplot_i].set_title( - f"Grid size: {grid_size}x{grid_size}, Positive threshold: {positive_threshold}" + else: + pop_subset = occupancy_stats_table[pop_col].unique() + + for pop in pop_subset: + occupancy_stats_table_sub = occupancy_stats_table[occupancy_stats_table[pop_col] == pop] + + # generate each unique test pair + tiles_threshold_trials = occupancy_stats_table_sub[ + ["num_tiles", "positive_threshold"] + ].drop_duplicates().reset_index(drop=True) + + # define a separate plot on the same grid for each test pair + fig, axs = plt.subplots( + tiles_threshold_trials.shape[0], 1, figsize=figsize ) + populations_str = f"{pop} cell populations" + # fig.suptitle(f"Percentage positive tile distributions for {populations_str}", fontsize=24) + # fig.subplots_adjust(top=1.50) + + # define an index to iterate through the subplots + subplot_i = 0 + + for _, trial in tiles_threshold_trials.iterrows(): + grid_size = int(np.sqrt(trial["num_tiles"])) + positive_threshold = trial["positive_threshold"] + + # subset data obtained just for the specified trial + trial_data = occupancy_stats_table_sub[ + (occupancy_stats_table_sub["num_tiles"] == trial["num_tiles"]) & + (occupancy_stats_table_sub["positive_threshold"] == trial["positive_threshold"]) + ] + + # visualize the distribution using a histogram + positive_tile_percentages = trial_data["percent_positive"].values + axs[subplot_i].hist( + positive_tile_percentages, + facecolor="g", + bins=20, + alpha=0.75 + ) + axs[subplot_i].set_title( + f"Grid size: {grid_size}x{grid_size}, Positive threshold: {positive_threshold}" + ) - # update the subplot index - subplot_i += 1 + # update the subplot index + subplot_i += 1 - plt.tight_layout() + plt.tight_layout() - # save the figure to save_dir - fig.savefig( - pathlib.Path(save_dir) / f"occupancy_statistic_distributions.png", - dpi=300 - ) + # save the figure to save_dir + fig.savefig( + pathlib.Path(save_dir) / f"occupancy_statistic_distributions_{pop}.png", + dpi=300 + ) def visualize_occupancy_statistics_old( From 3b50d3480468fdca48c24386c59463989e23e33d Mon Sep 17 00:00:00 2001 From: alex-l-kong Date: Wed, 4 Sep 2024 13:56:44 -0700 Subject: [PATCH 13/14] 
From 3b50d3480468fdca48c24386c59463989e23e33d Mon Sep 17 00:00:00 2001
From: alex-l-kong
Date: Wed, 4 Sep 2024 13:56:44 -0700
Subject: [PATCH 13/14] Integrate occupancy stat generation into DF per core

---
 .../postprocessing/5_create_dfs_per_core.py  | 226 +++++++++++++++++-
 python_files/supplementary_plot_helpers.py   |  60 ++++-
 python_files/utils.py                        | 160 +++++++++++++
 3 files changed, 436 insertions(+), 10 deletions(-)

diff --git a/python_files/postprocessing/5_create_dfs_per_core.py b/python_files/postprocessing/5_create_dfs_per_core.py
index a911cc8..7d6e905 100644
--- a/python_files/postprocessing/5_create_dfs_per_core.py
+++ b/python_files/postprocessing/5_create_dfs_per_core.py
@@ -5,7 +5,7 @@
 import numpy as np
 from scipy.stats import spearmanr

-from python_files.utils import create_long_df_by_functional, create_long_df_by_cluster
+from python_files.utils import create_long_df_by_functional, create_long_df_by_cluster, create_long_df_by_occupancy


 # # This file creates plotting-ready data structures to enumerate the frequency, count, and density
@@ -36,6 +36,9 @@
 cell_table_morph = pd.read_csv(os.path.join(analysis_dir, 'cell_table_morph.csv'))
 cell_table_diversity = pd.read_csv(os.path.join(intermediate_dir, 'spatial_analysis/cell_neighbor_analysis/neighborhood_diversity_radius50.csv'))
 cell_table_distances_broad = pd.read_csv(os.path.join(intermediate_dir, 'spatial_analysis/cell_neighbor_analysis/cell_cluster_broad_avg_dists-nearest_1.csv'))
+cell_table_occupancy = pd.read_csv(os.path.join(analysis_dir, 'combined_cell_table_normalized_cell_labels_updated.csv'), usecols=[
+    "fov", "label", "centroid-0", "centroid-1", "cell_meta_cluster", "cell_cluster", "cell_cluster_broad"
+])
 area_df = pd.read_csv(os.path.join(intermediate_dir, 'mask_dir/individual_masks-no_tagg_tls', 'fov_annotation_mask_area.csv'))
 annotations_by_mask = pd.read_csv(os.path.join(intermediate_dir, 'mask_dir/individual_masks-no_tagg_tls', 'cell_annotation_mask.csv'))
 fiber_df = pd.read_csv(os.path.join(intermediate_dir, 'fiber_segmentation_processed_data', 'fiber_object_table.csv'))
@@ -51,6 +54,7 @@
 cell_table_morph = cell_table_morph.merge(harmonized_annotations, on=['fov', 'label'], how='left')
 cell_table_diversity = cell_table_diversity.merge(harmonized_annotations, on=['fov', 'label'], how='left')
 cell_table_distances_broad = cell_table_distances_broad.merge(harmonized_annotations, on=['fov', 'label'], how='left')
+cell_table_occupancy = cell_table_occupancy.merge(harmonized_annotations, on=['fov', 'label'], how='left')

 # check for FOVs present in imaged data that aren't in core metadata
 missing_fovs = cell_table_clusters.loc[~cell_table_clusters.fov.isin(core_metadata.fov), 'fov'].unique()
@@ -61,6 +65,7 @@
 cell_table_morph = cell_table_morph.loc[~cell_table_morph.fov.isin(missing_fovs), :]
 cell_table_diversity = cell_table_diversity.loc[~cell_table_diversity.fov.isin(missing_fovs), :]
 cell_table_distances_broad = cell_table_distances_broad.loc[~cell_table_distances_broad.fov.isin(missing_fovs), :]
+cell_table_occupancy = cell_table_occupancy.loc[~cell_table_occupancy.fov.isin(missing_fovs), :]
 fiber_df = fiber_df.loc[~fiber_df.fov.isin(missing_fovs), :]
 fiber_tile_df = fiber_tile_df.loc[~fiber_tile_df.fov.isin(missing_fovs), :]
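For orientation before the occupancy hunk below: the filtering it performs assumes long-format tables with fov, cell_type, value, metric, and subset columns (a schema inferred from this patch, not taken from the real utils module). A toy version of the min-cells rule the hunk applies:

import pandas as pd

counts = pd.DataFrame({
    "fov": ["f1", "f1", "f2", "f2"],
    "cell_type": ["B", "T", "B", "T"],
    "value": [2, 40, 12, 33],
    "metric": "cluster_broad_count",
    "subset": "all",
})
freqs = counts.assign(metric="cluster_broad_freq", value=[0.05, 0.95, 0.27, 0.73])

min_cells = 5
kept = []
for cell_type in counts.cell_type.unique():
    # keep a FOV's frequency only if it holds enough cells of that type
    keep_fovs = counts.loc[(counts.cell_type == cell_type) & (counts.value >= min_cells), "fov"]
    kept.append(freqs[(freqs.cell_type == cell_type) & (freqs.fov.isin(keep_fovs))])
print(pd.concat(kept))  # the f1 / B row (2 cells) is dropped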
specified for the df
+    drop_cols = ['cell_meta_cluster', 'cell_cluster', 'cell_cluster_broad', 'label']
+    drop_cols.remove(cluster_col_name)
+
+    # create df
+    occupancy_dfs.append(create_long_df_by_occupancy(cell_table=cell_table_occupancy,
+                                                     result_name=result_name,
+                                                     cluster_col_name=cluster_col_name,
+                                                     drop_cols=drop_cols,
+                                                     positive_threshold=0.0))
+
+# create combined df
+total_df_occupancy = pd.concat(occupancy_dfs, axis=0)
+total_df_occupancy = total_df_occupancy.merge(harmonized_metadata, on='fov', how='inner')
+total_df_occupancy = total_df_occupancy.rename(columns={'functional_marker': 'occupancy_feature'})
+
+# save df
+total_df_occupancy.to_csv(os.path.join(output_dir, 'occupancy_df_per_core.csv'), index=False)
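For orientation, the rows written to occupancy_df_per_core.csv above look roughly like this (a hypothetical sketch; the FOV name and values are toy, and occupancy_feature comes from renaming the melted functional_marker column):

    import pandas as pd

    # hypothetical per-core occupancy rows (toy values, not real cohort data)
    occupancy_rows = pd.DataFrame({
        "fov": ["TONIC_TMA2_R1C1", "TONIC_TMA2_R1C1"],
        "cell_type": ["B", "T"],
        "occupancy_feature": ["percent_positive_tiles"] * 2,
        "value": [0.1875, 0.4375],  # fraction of a FOV's tiles above the z-score threshold
        "metric": ["cluster_broad_freq"] * 2,
        "subset": ["all"] * 2,
    })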
+# filter occupancy scores to only include FOVs with at least the specified number of cells
+total_df = pd.read_csv(os.path.join(output_dir, 'cluster_df_per_core.csv'))
+min_cells = 5
+
+filtered_dfs = []
+metrics = [['cluster_broad_count', 'cluster_broad_freq'],
+           ['cluster_count', 'cluster_freq'],
+           ['meta_cluster_count', 'meta_cluster_freq']]
+for metric in metrics:
+    # subset count df to include cells at the relevant clustering resolution
+    for compartment in ['all']:
+        count_df = total_df[total_df.metric == metric[0]]
+        count_df = count_df[count_df.subset == compartment]
+
+        # subset occupancy df to only include occupancy metrics at this resolution
+        occupancy_df = total_df_occupancy[total_df_occupancy.metric == metric[1]]
+        occupancy_df = occupancy_df[occupancy_df.subset == compartment]
+
+        # for each cell type, determine which FOVs have high enough counts to be included
+        for cell_type in count_df.cell_type.unique():
+            keep_df = count_df[count_df.cell_type == cell_type]
+            keep_df = keep_df[keep_df.value >= min_cells]
+            keep_fovs = keep_df.fov.unique()
+
+            # subset occupancy df to only include FOVs with high enough counts
+            keep_features = occupancy_df[occupancy_df.cell_type == cell_type]
+            keep_features = keep_features[keep_features.fov.isin(keep_fovs)]
+
+            # append to list of filtered dfs
+            filtered_dfs.append(keep_features)
+
+filtered_occupancy_df = pd.concat(filtered_dfs)
+
+# save filtered df
+filtered_occupancy_df.to_csv(os.path.join(output_dir, 'occupancy_df_per_core_filtered.csv'), index=False)
+
+# remove occupancy features that are correlated with abundance of the same cell type
+
+if not os.path.exists(os.path.join(intermediate_dir, 'post_processing', 'occupancy_df_keep_features.csv')):
+    density_df = total_df.loc[(total_df.metric == 'cluster_broad_density') & (total_df.subset == 'all')]
+
+    # NOTE: this is purely for consistency, the table when created only includes the cluster_broad_freq metric and all compartments
+    filtered_occupancy_df = filtered_occupancy_df.loc[(filtered_occupancy_df.metric == 'cluster_broad_freq') & (filtered_occupancy_df.subset == 'all')]
+
+    # remove negative lymph node images
+    density_df = density_df.loc[density_df.Timepoint != 'lymphnode_neg', :]
+    filtered_occupancy_df = filtered_occupancy_df.loc[filtered_occupancy_df.fov.isin(density_df.fov.unique())]
+    cell_types = filtered_occupancy_df.cell_type.unique()
+
+    # calculate which pairings to keep
+    keep_cells, keep_features = [], []
+
+    for cell_type in cell_types:
+        density_subset = density_df.loc[density_df.cell_type == cell_type]
+        occupancy_subset = filtered_occupancy_df.loc[filtered_occupancy_df.cell_type == cell_type]
+        occupancy_wide = occupancy_subset.pivot(index='fov', columns='occupancy_feature', values='value')
+        occupancy_wide.reset_index(inplace=True)
+        occupancy_wide = pd.merge(occupancy_wide, density_subset[['fov', 'value']], on='fov', how='inner')
+
+        # get correlations
+        corr_df = occupancy_wide.corr(method='spearman')
+
+        # determine which features to keep
+        corr_vals = corr_df.loc['value', :].abs()
+        corr_vals = corr_vals[corr_vals < 0.7]
+
+        # add to list of features to keep
+        keep_cells.extend(corr_vals.index)
+        keep_features.extend([cell_type] * len(corr_vals.index))
+
+    # NOTE: keep_cells holds the retained feature names, keep_features the matching cell types
+    keep_df = pd.DataFrame({'cell_type': keep_features, 'feature_name': keep_cells})
+
+    keep_df.to_csv(os.path.join(intermediate_dir, 'post_processing', 'occupancy_df_keep_features.csv'), index=False)
+else:
+    keep_df = pd.read_csv(os.path.join(intermediate_dir, 'post_processing', 'occupancy_df_keep_features.csv'))
+
+deduped_dfs = []
+for cell_type in keep_df.cell_type.unique():
+    keep_features = keep_df.loc[keep_df.cell_type == cell_type, 'feature_name'].unique()
+    if len(keep_features) > 0:
+        keep_df_subset = filtered_occupancy_df.loc[filtered_occupancy_df.cell_type == cell_type]
+        keep_df_subset = keep_df_subset.loc[keep_df_subset.occupancy_feature.isin(keep_features)]
+        deduped_dfs.append(keep_df_subset)
+
+deduped_occupancy_df = pd.concat(deduped_dfs)
+
+# save deduped df
+deduped_occupancy_df.to_csv(os.path.join(output_dir, 'occupancy_df_per_core_deduped.csv'), index=False)
+
+# create version aggregated by timepoint
+deduped_occupancy_df_grouped = deduped_occupancy_df.groupby(['Tissue_ID', 'cell_type', 'occupancy_feature', 'metric', 'subset'])
+deduped_occupancy_df_timepoint = deduped_occupancy_df_grouped['value'].agg([np.mean, np.std])
+deduped_occupancy_df_timepoint.reset_index(inplace=True)
+deduped_occupancy_df_timepoint = deduped_occupancy_df_timepoint.merge(harmonized_metadata.drop(['fov', 'MIBI_data_generated'], axis=1).drop_duplicates(), on='Tissue_ID')
+
+# save timepoint df
+deduped_occupancy_df_timepoint.to_csv(os.path.join(output_dir, 'occupancy_df_per_timepoint_deduped.csv'), index=False)
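The correlation screen above follows the same pattern as the distance-feature dedup; a minimal sketch of the idea on toy values (the 0.7 Spearman cutoff matches the script):

    import pandas as pd

    # one occupancy feature alongside the density ('value') of the same cell type (toy values)
    occupancy_wide = pd.DataFrame({
        "percent_positive_tiles": [0.10, 0.30, 0.50, 0.70],
        "value": [5.0, 9.0, 16.0, 20.0],  # monotonic with the feature, so Spearman rho = 1.0
    })

    corr_vals = occupancy_wide.corr(method="spearman").loc["value", :].abs()
    keep = corr_vals[corr_vals < 0.7].index.tolist()
    print(keep)  # [] here: the feature is dropped because it just tracks cell density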
+
+
+# create manual df with total morphology marker average across all cells in an image
+morph_table_small = cell_table_morph.loc[:, ~cell_table_morph.columns.isin(['cell_cluster', 'cell_cluster_broad', 'cell_meta_cluster', 'label', 'tumor_region'])]
+
+# group by specified columns
+grouped_table = morph_table_small.groupby('fov')
+transformed = grouped_table.agg(np.mean)
+transformed.reset_index(inplace=True)
+
+# reshape to long df
+long_df = pd.melt(transformed, id_vars=['fov'], var_name='morphology_feature')
+long_df['metric'] = 'total_freq'
+long_df['cell_type'] = 'all'
+long_df['subset'] = 'all'
+
+long_df = long_df.merge(harmonized_metadata, on='fov', how='inner')
+
+long_df.to_csv(os.path.join(output_dir, 'total_morph_per_core.csv'), index=False)
+
+# filter morphology markers to only include FOVs with at least the specified number of cells
+total_df = pd.read_csv(os.path.join(output_dir, 'cluster_df_per_core.csv'))
+min_cells = 5
+
+filtered_dfs = []
+metrics = [['cluster_broad_count', 'cluster_broad_freq'],
+           ['cluster_count', 'cluster_freq'],
+           ['meta_cluster_count', 'meta_cluster_freq']]
+for metric in metrics:
+    # subset count df to include cells at the relevant clustering resolution
+    for compartment in ['cancer_core', 'cancer_border', 'stroma_core', 'stroma_border',
+                        'tls', 'tagg', 'all']:
+        count_df = total_df[total_df.metric == metric[0]]
+        count_df = count_df[count_df.subset == compartment]
+
+        # subset morphology df to only include morphology metrics at this resolution
+        morph_df = total_df_morph[total_df_morph.metric == metric[1]]
+        morph_df = morph_df[morph_df.subset == compartment]
+
+        # for each cell type, determine which FOVs have high enough counts to be included
+        for cell_type in count_df.cell_type.unique():
+            keep_df = count_df[count_df.cell_type == cell_type]
+            keep_df = keep_df[keep_df.value >= min_cells]
+            keep_fovs = keep_df.fov.unique()
+
+            # subset morphology df to only include FOVs with high enough counts
+            keep_features = morph_df[morph_df.cell_type == cell_type]
+            keep_features = keep_features[keep_features.fov.isin(keep_fovs)]
+
+            # append to list of filtered dfs
+            filtered_dfs.append(keep_features)
+
+filtered_dfs.append(long_df)
+filtered_morph_df = pd.concat(filtered_dfs)
+
+# save filtered df
+filtered_morph_df.to_csv(os.path.join(output_dir, 'morph_df_per_core_filtered.csv'), index=False)
+
+# create version aggregated by timepoint
+filtered_morph_df_grouped = filtered_morph_df.groupby(['Tissue_ID', 'cell_type', 'morphology_feature', 'metric', 'subset'])
+filtered_morph_df_timepoint = filtered_morph_df_grouped['value'].agg([np.mean, np.std])
+filtered_morph_df_timepoint.reset_index(inplace=True)
+filtered_morph_df_timepoint = filtered_morph_df_timepoint.merge(harmonized_metadata.drop(['fov', 'MIBI_data_generated'], axis=1).drop_duplicates(), on='Tissue_ID')
+
+# save timepoint df
+filtered_morph_df_timepoint.to_csv(os.path.join(output_dir, 'morph_df_per_timepoint_filtered.csv'), index=False)
+
+# remove redundant morphology features
+block1 = ['area', 'major_axis_length', 'minor_axis_length', 'perimeter', 'convex_area', 'equivalent_diameter']
+
+block2 = ['area_nuclear', 'major_axis_length_nuclear', 'minor_axis_length_nuclear', 'perimeter_nuclear', 'convex_area_nuclear', 'equivalent_diameter_nuclear']
+
+block3 = ['eccentricity', 'major_axis_equiv_diam_ratio']
+
+block4 = ['eccentricity_nuclear', 'major_axis_equiv_diam_ratio_nuclear', 'perim_square_over_area_nuclear']
+
+deduped_morph_df = filtered_morph_df.loc[~filtered_morph_df.morphology_feature.isin(block1[1:] + block2[1:] + block3[1:] + block4[1:]), :]
+
+# only keep complex morphology features for cancer cells, remove everything except area and nc_ratio for others
+cancer_clusters = ['Cancer', 'Cancer_EMT', 'Cancer_Other', 'Cancer_CD56', 'Cancer_CK17',
+                   'Cancer_Ecad', 'Cancer_Mono', 'Cancer_SMA', 'Cancer_Vim']
+basic_morph_features = ['area', 'area_nuclear', 'nc_ratio']
+
+deduped_morph_df = deduped_morph_df.loc[deduped_morph_df.cell_type.isin(cancer_clusters) | deduped_morph_df.morphology_feature.isin(basic_morph_features), :]
+
+# save deduped df
+deduped_morph_df.to_csv(os.path.join(output_dir, 'morph_df_per_core_filtered_deduped.csv'), index=False)
+
+# same for timepoints
+deduped_morph_df_timepoint = filtered_morph_df_timepoint.loc[~filtered_morph_df_timepoint.morphology_feature.isin(block1[1:] + block2[1:] + block3[1:] + block4[1:]), :]
+deduped_morph_df_timepoint = deduped_morph_df_timepoint.loc[deduped_morph_df_timepoint.cell_type.isin(cancer_clusters) | deduped_morph_df_timepoint.morphology_feature.isin(basic_morph_features), :]
+deduped_morph_df_timepoint.to_csv(os.path.join(output_dir, 'morph_df_per_timepoint_filtered_deduped.csv'), index=False)
+
+
+
 # fiber analysis
 fiber_df = fiber_df.loc[:, ~fiber_df.columns.isin(['label', 'centroid-0', 'centroid-1'])]
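The min-cells filter applied twice above (occupancy, then morphology) keeps a (cell type, FOV) pair only when that FOV contains at least min_cells cells of the type; a self-contained sketch with toy tables:

    import pandas as pd

    # toy count and feature tables mirroring the long format of cluster_df_per_core.csv
    count_df = pd.DataFrame({
        "fov": ["fov1", "fov1", "fov2"],
        "cell_type": ["B", "T", "B"],
        "value": [12, 3, 50],
    })
    feature_df = pd.DataFrame({
        "fov": ["fov1", "fov1", "fov2"],
        "cell_type": ["B", "T", "B"],
        "value": [0.25, 0.10, 0.40],
    })

    min_cells = 5
    filtered = []
    for cell_type in count_df.cell_type.unique():
        # FOVs with enough cells of this type
        keep_fovs = count_df.loc[
            (count_df.cell_type == cell_type) & (count_df.value >= min_cells), "fov"
        ].unique()
        filtered.append(feature_df[
            (feature_df.cell_type == cell_type) & (feature_df.fov.isin(keep_fovs))
        ])

    filtered_df = pd.concat(filtered)  # the fov1/T row is dropped: only 3 T cells there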
diff --git a/python_files/supplementary_plot_helpers.py b/python_files/supplementary_plot_helpers.py
index 5426185..7cc2900 100644
--- a/python_files/supplementary_plot_helpers.py
+++ b/python_files/supplementary_plot_helpers.py
@@ -439,10 +439,11 @@ def occupancy_division_factor(occupancy_group: pd.api.typing.DataFrameGroupBy,
 # occupancy statistic helpers
 def compute_occupancy_statistics(
     cell_table: pd.DataFrame, pop_col: str = "cell_cluster_broad",
-    pop_subset: Optional[List[str]] = None, tiles_per_row_col: int = 8,
-    max_image_size: int = 2048, positive_threshold: int = 20
+    pop_subset: Optional[List[str]] = None, tiles_per_row_col: int = 4,
+    max_image_size: int = 2048, positive_threshold: float = 0.0,
+    sample_dir: Optional[Union[str, pathlib.Path]] = None
 ):
-    """Compute the occupancy statistics over a cohort.
+    """Compute the occupancy statistics over a cohort, based on z-scored cell counts per tile
+    across the cohort.
 
     Args:
         cell_table (pd.DataFrame):
@@ -456,9 +457,9 @@ def compute_occupancy_statistics(
             The row/col dims of tiles to define over each image
         max_image_size (int):
             Maximum size of an image passed in, assuming square images
-            NOTE: all images should have an image size that is a factor of max_image_size
-        positive_threshold (int):
-            The cell count in a tile required for positivity
+            NOTE: all images should have an image size that is a factor of `max_image_size`
+        positive_threshold (float):
+            The z-scored cell count in a tile required for positivity
+        sample_dir (Optional[Union[str, pathlib.Path]]):
+            The directory containing the image data for each FOV, used to look up the
+            per-FOV image size
 
     Returns:
-        Dict[str, float]:
+        pd.DataFrame:
+            The positive tile percentages per FOV and cell type
@@ -496,6 +497,8 @@ def compute_occupancy_statistics(
         names=["fov", "tile_row", "tile_col", pop_col]
     ).to_frame(index=False)
 
+    # compute the total occupancy of each tile for each population type
+    # NOTE: need to merge this way to account for tiles with 0 counts
     occupancy_counts: pd.DataFrame = cell_table.groupby(
         ["fov", "tile_row", "tile_col", pop_col]
     ).size().reset_index(name="occupancy")
@@ -505,9 +508,21 @@ def compute_occupancy_statistics(
         how="left"
     )
     occupancy_counts["occupancy"].fillna(0, inplace=True)
-    print(occupancy_counts[["tile_row", "tile_col"]].drop_duplicates())
+    # print(occupancy_counts[["tile_row", "tile_col"]].drop_duplicates())
+
+    # zscore the tile counts across the cohort grouped by the cell types
+    occupancy_counts["zscore_occupancy"] = occupancy_counts.groupby(
+        [pop_col]
+    )["occupancy"].transform(lambda x: zscore(x, ddof=0))
 
-    occupancy_counts["is_positive"] = occupancy_counts["occupancy"] > positive_threshold
+    # mark tiles as positive depending on the positive_threshold value
+    occupancy_counts["is_positive"] = occupancy_counts["zscore_occupancy"] > positive_threshold
+
+    # total up the positive tiles per FOV and cell type
     # occupancy_counts_grouped: pd.DataFrame = occupancy_counts.groupby(["fov", pop_col]).apply(
     #     lambda row: row["is_positive"].sum() / occupancy_division_factor(
     #         row, tiles_per_row_col, max_image_size, cell_table
     #     )
     # ).reset_index(name="percent_positive")
     occupancy_counts_grouped: pd.DataFrame = occupancy_counts.groupby(["fov", pop_col]).apply(
         lambda row: row["is_positive"].sum()
-    ).reset_index(name="total_positive")
+    ).reset_index(name="total_positive_tiles")
+
+    # get the per-FOV image size, this determines how many tiles there are when finding
+    # percentages; if no sample_dir is given, assume every FOV is max_image_size square
+    if sample_dir is None:
+        occupancy_counts_grouped["image_size"] = max_image_size
+    else:
+        occupancy_counts_grouped["image_size"] = occupancy_counts_grouped.apply(
+            lambda row: io.imread(
+                os.path.join(sample_dir, row["fov"], "Calprotectin.tiff")
+            ).shape[0],
+            axis=1
+        )
+    occupancy_counts_grouped["max_tiles"] = occupancy_counts_grouped.apply(
+        lambda row: (tiles_per_row_col * row["image_size"] / max_image_size) ** 2,
+        axis=1
+    )
+
+    # determine the percent positivity of tiles
+    occupancy_counts_grouped["percent_positive_tiles"] = \
+        occupancy_counts_grouped["total_positive_tiles"] / occupancy_counts_grouped["max_tiles"]
+
+    return occupancy_counts_grouped
 
-    return occupancy_counts, occupancy_counts_grouped
+    # return occupancy_counts, occupancy_counts_grouped
 
     # # define the occupancy_stats array
     # occupancy_stats = xr.DataArray(
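A quick sanity check of the per-FOV tile accounting in compute_occupancy_statistics: the tile size in pixels is fixed cohort-wide, so a smaller square image simply holds proportionally fewer tiles per row. Assuming image sides divide max_image_size, as the docstring requires:

    tiles_per_row_col = 4
    max_image_size = 2048
    tile_size = max_image_size // tiles_per_row_col  # 512 px for every FOV

    for image_size in [2048, 1024, 512]:
        tiles_per_row = image_size // tile_size
        max_tiles = (tiles_per_row_col * image_size / max_image_size) ** 2
        assert max_tiles == tiles_per_row ** 2
        print(image_size, int(max_tiles))  # 2048 -> 16, 1024 -> 4, 512 -> 1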
diff --git a/python_files/utils.py b/python_files/utils.py
index 7bfe0fa..24a319c 100644
--- a/python_files/utils.py
+++ b/python_files/utils.py
@@ -4,6 +4,7 @@
 import pandas as pd
 from skimage import morphology
 from scipy.ndimage import gaussian_filter
+from scipy.stats import zscore
 from skimage.measure import label
 import skimage.io as io
 
@@ -221,6 +222,165 @@ def create_long_df_by_functional(func_table, cluster_col_name, drop_cols, result
     return long_df_all
 
 
+def occupancy_df_helper(cell_table, cluster_col_name, drop_cols, result_name, subset_col=None,
+                        tiles_per_row_col=4, max_image_size=2048, positive_threshold=0,
+                        sample_dir=None):
+    """Helper function for creating the long-format occupancy table
+
+    Args:
+        cell_table (pd.DataFrame): the dataframe containing information on each cell
+        cluster_col_name (str): the column name in cell_table that contains the cluster information
+        drop_cols (list): list of columns to drop from cell_table
+        result_name (str): the name of this statistic in the returned df
+        subset_col (str): the name of the subset col to group by, if not provided then skip
+        tiles_per_row_col (int): the row/col dims of tiles to define over each image
+        max_image_size (int): maximum size of an image passed in, assuming square images
+        positive_threshold (float): the z-scored cell count in a tile required for positivity
+        sample_dir (str): directory containing the image data for each FOV, used to look up
+            the per-FOV image size
+
+    Returns:
+        pd.DataFrame: occupancy stats in long format"""
+
+    verify_in_list(cell_type_col=cluster_col_name, cell_table_columns=cell_table.columns)
+    verify_in_list(drop_cols=drop_cols, cell_table_columns=cell_table.columns)
+    if subset_col is not None:
+        verify_in_list(subset_col=subset_col, cell_table_columns=cell_table.columns)
+
+    # remove the drop columns
+    cell_table_small = cell_table.loc[:, ~cell_table.columns.isin(drop_cols)]
+
+    # get all the FOV names
+    fov_names = cell_table_small["fov"].unique()
+
+    # if a population subset is specified, first validate then truncate the cell table accordingly
+    if pop_subset:
+        verify_in_list(
+            specified_populations=pop_subset,
+            valid_populations=cell_table_small[cluster_col_name].unique()
+        )
+    # otherwise, use all possible subsets as defined by pop_col
+    else:
+        pop_subset = cell_table_small[cluster_col_name].unique()
+
+    # define the tile size in pixels for each FOV
+    tile_size = max_image_size // tiles_per_row_col
+
+    # define the tile for each cell
+    cell_table_small["tile_row"] = (cell_table_small["centroid-1"] // tile_size).astype(int)
+    cell_table_small["tile_col"] = (cell_table_small["centroid-0"] // tile_size).astype(int)
+
+    # Create a DataFrame with all combinations of fov, tile_row, tile_col, and cluster_col_name
+    all_combos: pd.DataFrame = pd.MultiIndex.from_product(
+        [
+            cell_table_small["fov"].unique(),
+            cell_table_small["tile_row"].unique(),
+            cell_table_small["tile_col"].unique(),
+            cell_table_small[cluster_col_name].unique()
+        ],
+        names=["fov", "tile_row", "tile_col", cluster_col_name]
+    ).to_frame(index=False)
+
+    occupancy_count_groupby = ["fov", "tile_row", "tile_col", cluster_col_name]
+    if subset_col is not None:
+        occupancy_count_groupby.append(subset_col)
+
+    # compute the total occupancy of each tile for each population type
+    # NOTE: need to merge this way to account for tiles with 0 counts
+    occupancy_counts = cell_table_small.groupby(
+        occupancy_count_groupby
+    ).size().reset_index(name="occupancy")
+    occupancy_counts = pd.merge(
+        all_combos, occupancy_counts,
+        on=occupancy_count_groupby,
+        how="left"
+    )
+    occupancy_counts["occupancy"].fillna(0, inplace=True)
+    # print(occupancy_counts[["tile_row", "tile_col"]].drop_duplicates())
+
+    # zscore the tile counts across the cohort grouped by the cell types
+    occupancy_counts["zscore_occupancy"] = occupancy_counts.groupby(
+        [cluster_col_name]
+    )["occupancy"].transform(lambda x: zscore(x, ddof=0))
+
+    # mark tiles as positive depending on the positive_threshold value
+    occupancy_counts["is_positive"] = occupancy_counts["zscore_occupancy"] > positive_threshold
+
+    # total up the positive tiles per FOV and cell type
+    occupancy_zscore_groupby = ["fov", cluster_col_name]
+    if subset_col is not None:
+        occupancy_zscore_groupby.append(subset_col)
+    occupancy_counts_grouped: pd.DataFrame = occupancy_counts.groupby(occupancy_zscore_groupby).apply(
+        lambda row: row["is_positive"].sum()
+    ).reset_index(name="total_positive_tiles")
+
+    # get the per-FOV image size, this determines how many tiles there are when finding
+    # percentages; if no sample_dir is given, assume every FOV is max_image_size square
+    if sample_dir is None:
+        occupancy_counts_grouped["image_size"] = max_image_size
+    else:
+        occupancy_counts_grouped["image_size"] = occupancy_counts_grouped.apply(
+            lambda row: io.imread(
+                os.path.join(sample_dir, row["fov"], "Calprotectin.tiff")
+            ).shape[0],
+            axis=1
+        )
+    occupancy_counts_grouped["max_tiles"] = occupancy_counts_grouped.apply(
+        lambda row: (tiles_per_row_col * row["image_size"] / max_image_size) ** 2,
+        axis=1
+    )
+
+    # determine the percent positivity of tiles
+    occupancy_counts_grouped["percent_positive_tiles"] = \
+        occupancy_counts_grouped["total_positive_tiles"] / occupancy_counts_grouped["max_tiles"]
+
+    # reshape to long df
+    occupancy_long_df = pd.melt(
+        occupancy_counts_grouped[occupancy_zscore_groupby + ["percent_positive_tiles"]],
+        id_vars=occupancy_zscore_groupby,
+        var_name='functional_marker'
+    )
+    occupancy_long_df['metric'] = result_name
+    occupancy_long_df = occupancy_long_df.rename(columns={cluster_col_name: "cell_type"})
+    if subset_col is not None:
+        occupancy_long_df = occupancy_long_df.rename(columns={subset_col: "subset"})
+
+    return occupancy_long_df
+
+
+def create_long_df_by_occupancy(cell_table, cluster_col_name, drop_cols, result_name,
+                                subset_col=None, tiles_per_row_col=4, max_image_size=2048,
+                                positive_threshold=0, sample_dir=None):
+    """Compute the occupancy statistics over a cohort, based on z-scored cell counts per tile
+    across the cohort.
+
+    Args:
+        cell_table (pd.DataFrame): the dataframe containing information on each cell
+        cluster_col_name (str): the column name in cell_table that contains the cluster information
+        drop_cols (list): list of columns to drop from cell_table
+        result_name (str): the name of this statistic in the returned df
+        subset_col (str): the column name in cell_table to subset by
+        tiles_per_row_col (int): the row/col dims of tiles to define over each image
+        max_image_size (int): maximum size of an image passed in, assuming square images
+        positive_threshold (float): the z-scored cell count in a tile required for positivity
+        sample_dir (str): directory containing the image data for each FOV, used to look up
+            the per-FOV image size
+
+    Returns:
+        pd.DataFrame: long format dataframe containing the summarized occupancy data"""
+
+    # first generate df without subsetting
+    drop_cols_all = drop_cols.copy()
+    if subset_col is not None:
+        drop_cols_all = drop_cols + [subset_col]
+
+    occupancy_long_all_compartments = occupancy_df_helper(
+        cell_table, cluster_col_name, drop_cols_all, result_name, None,
+        tiles_per_row_col, max_image_size, positive_threshold, sample_dir
+    )
+    occupancy_long_all_compartments["subset"] = "all"
+
+    # if no subset column is given, the all-compartments df is the final result
+    occupancy_long_comb = occupancy_long_all_compartments
+
+    if subset_col is not None:
+        occupancy_long_per_compartment = occupancy_df_helper(
+            cell_table, cluster_col_name, drop_cols, result_name, subset_col,
+            tiles_per_row_col, max_image_size, positive_threshold, sample_dir
+        )
+        occupancy_long_comb = pd.concat(
+            [occupancy_long_all_compartments, occupancy_long_per_compartment]
+        )
+
+    return occupancy_long_comb
+
+
 def create_channel_mask(img, intensity_thresh, sigma, min_mask_size=0, max_hole_size=100000):
     """Generates a binary mask from a single channel image
 
From 3ed1549029bc0585808c823da6f2a93e9009be9a Mon Sep 17 00:00:00 2001
From: alex-l-kong
Date: Wed, 4 Sep 2024 15:05:09 -0700
Subject: [PATCH 14/14] Remove unnecessary pop_subset

---
 .../postprocessing/5_create_dfs_per_core.py  | 2040 +++++++++--------
 python_files/utils.py                        |   18 +-
 2 files changed, 1026 insertions(+), 1032 deletions(-)

diff --git a/python_files/postprocessing/5_create_dfs_per_core.py b/python_files/postprocessing/5_create_dfs_per_core.py
index 7d6e905..01b0970 100644
--- a/python_files/postprocessing/5_create_dfs_per_core.py
+++ b/python_files/postprocessing/5_create_dfs_per_core.py
@@ -5,7 +5,11 @@
 import numpy as np
 from scipy.stats import spearmanr
 
-from python_files.utils import create_long_df_by_functional, create_long_df_by_cluster, create_long_df_by_occupancy
+# from python_files.utils import create_long_df_by_functional, create_long_df_by_cluster, create_long_df_by_occupancy
+import sys
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+# from python_files.utils import create_long_df_by_functional, create_long_df_by_cluster, create_long_df_by_occupancy
+from utils import create_long_df_by_functional, create_long_df_by_cluster, create_long_df_by_occupancy
 
 
 # # This file creates plotting-ready data structures to enumerate the frequency, count, and density
@@ -29,1001 +33,1001 @@
 # load relevant tables
 core_metadata = pd.read_csv(os.path.join(intermediate_dir, 'metadata', 'TONIC_data_per_core.csv'))
-timepoint_metadata = pd.read_csv(os.path.join(intermediate_dir, 'metadata', 'TONIC_data_per_timepoint.csv'))
+# timepoint_metadata = pd.read_csv(os.path.join(intermediate_dir, 'metadata', 'TONIC_data_per_timepoint.csv'))
 harmonized_metadata = pd.read_csv(os.path.join(analysis_dir, 'harmonized_metadata.csv'))
 cell_table_clusters = pd.read_csv(os.path.join(analysis_dir, 'cell_table_clusters.csv'))
-cell_table_func = pd.read_csv(os.path.join(analysis_dir, 'cell_table_func_all.csv'))
-cell_table_morph = pd.read_csv(os.path.join(analysis_dir, 'cell_table_morph.csv')) -cell_table_diversity = pd.read_csv(os.path.join(intermediate_dir, 'spatial_analysis/cell_neighbor_analysis/neighborhood_diversity_radius50.csv')) -cell_table_distances_broad = pd.read_csv(os.path.join(intermediate_dir, 'spatial_analysis/cell_neighbor_analysis/cell_cluster_broad_avg_dists-nearest_1.csv')) -cell_table_occupancy = pd.read_csv(os.path.join(analysis_dir, 'combined_cell_table_normalized_cell_labels_updated'), usecols=[ +# cell_table_func = pd.read_csv(os.path.join(analysis_dir, 'cell_table_func_all.csv')) +# cell_table_morph = pd.read_csv(os.path.join(analysis_dir, 'cell_table_morph.csv')) +# cell_table_diversity = pd.read_csv(os.path.join(intermediate_dir, 'spatial_analysis/cell_neighbor_analysis/neighborhood_diversity_radius50.csv')) +# cell_table_distances_broad = pd.read_csv(os.path.join(intermediate_dir, 'spatial_analysis/cell_neighbor_analysis/cell_cluster_broad_avg_dists-nearest_1.csv')) +cell_table_occupancy = pd.read_csv(os.path.join(analysis_dir, 'combined_cell_table_normalized_cell_labels_updated.csv'), usecols=[ "fov", "label", "centroid-0", "centroid-1", "cell_meta_cluster", "cell_cluster", "cell_cluster_broad" ]) -area_df = pd.read_csv(os.path.join(intermediate_dir, 'mask_dir/individual_masks-no_tagg_tls', 'fov_annotation_mask_area.csv')) -annotations_by_mask = pd.read_csv(os.path.join(intermediate_dir, 'mask_dir/individual_masks-no_tagg_tls', 'cell_annotation_mask.csv')) -fiber_df = pd.read_csv(os.path.join(intermediate_dir, 'fiber_segmentation_processed_data', 'fiber_object_table.csv')) -fiber_tile_df = pd.read_csv(os.path.join(intermediate_dir, 'fiber_segmentation_processed_data/tile_stats_512', 'fiber_stats_table-tile_512.csv')) +# area_df = pd.read_csv(os.path.join(intermediate_dir, 'mask_dir/individual_masks-no_tagg_tls', 'fov_annotation_mask_area.csv')) +annotations_by_mask = pd.read_csv(os.path.join(intermediate_dir, 'mask_dir', 'cell_annotation_mask.csv')) +# fiber_df = pd.read_csv(os.path.join(intermediate_dir, 'fiber_segmentation_processed_data', 'fiber_object_table.csv')) +# fiber_tile_df = pd.read_csv(os.path.join(intermediate_dir, 'fiber_segmentation_processed_data/tile_stats_512', 'fiber_stats_table-tile_512.csv')) # merge cell-level annotations harmonized_annotations = annotations_by_mask harmonized_annotations = harmonized_annotations.rename(columns={'mask_name': 'tumor_region'}) assert len(harmonized_annotations) == len(cell_table_clusters) -cell_table_clusters = cell_table_clusters.merge(harmonized_annotations, on=['fov', 'label'], how='left') -cell_table_func = cell_table_func.merge(harmonized_annotations, on=['fov', 'label'], how='left') -cell_table_morph = cell_table_morph.merge(harmonized_annotations, on=['fov', 'label'], how='left') -cell_table_diversity = cell_table_diversity.merge(harmonized_annotations, on=['fov', 'label'], how='left') -cell_table_distances_broad = cell_table_distances_broad.merge(harmonized_annotations, on=['fov', 'label'], how='left') +# cell_table_clusters = cell_table_clusters.merge(harmonized_annotations, on=['fov', 'label'], how='left') +# cell_table_func = cell_table_func.merge(harmonized_annotations, on=['fov', 'label'], how='left') +# cell_table_morph = cell_table_morph.merge(harmonized_annotations, on=['fov', 'label'], how='left') +# cell_table_diversity = cell_table_diversity.merge(harmonized_annotations, on=['fov', 'label'], how='left') +# cell_table_distances_broad = cell_table_distances_broad.merge(harmonized_annotations, 
on=['fov', 'label'], how='left') cell_table_occupancy = cell_table_occupancy.merge(harmonized_annotations, on=['fov', 'label'], how='left') # check for FOVs present in imaged data that aren't in core metadata missing_fovs = cell_table_clusters.loc[~cell_table_clusters.fov.isin(core_metadata.fov), 'fov'].unique() -# TONIC_TMA24_R11C2 and TONIC_TMA24_R11C3 are wrong tissue -cell_table_clusters = cell_table_clusters.loc[~cell_table_clusters.fov.isin(missing_fovs), :] -cell_table_func = cell_table_func.loc[~cell_table_func.fov.isin(missing_fovs), :] -cell_table_morph = cell_table_morph.loc[~cell_table_morph.fov.isin(missing_fovs), :] -cell_table_diversity = cell_table_diversity.loc[~cell_table_diversity.fov.isin(missing_fovs), :] -cell_table_distances_broad = cell_table_distances_broad.loc[~cell_table_distances_broad.fov.isin(missing_fovs), :] +# # TONIC_TMA24_R11C2 and TONIC_TMA24_R11C3 are wrong tissue +# cell_table_clusters = cell_table_clusters.loc[~cell_table_clusters.fov.isin(missing_fovs), :] +# cell_table_func = cell_table_func.loc[~cell_table_func.fov.isin(missing_fovs), :] +# cell_table_morph = cell_table_morph.loc[~cell_table_morph.fov.isin(missing_fovs), :] +# cell_table_diversity = cell_table_diversity.loc[~cell_table_diversity.fov.isin(missing_fovs), :] +# cell_table_distances_broad = cell_table_distances_broad.loc[~cell_table_distances_broad.fov.isin(missing_fovs), :] cell_table_occupancy = cell_table_occupancy.loc[~cell_table_occupancy.fov.isin(missing_fovs), :] -fiber_df = fiber_df.loc[~fiber_df.fov.isin(missing_fovs), :] -fiber_tile_df = fiber_tile_df.loc[~fiber_tile_df.fov.isin(missing_fovs), :] +# fiber_df = fiber_df.loc[~fiber_df.fov.isin(missing_fovs), :] +# fiber_tile_df = fiber_tile_df.loc[~fiber_tile_df.fov.isin(missing_fovs), :] + +# # +# # Generate counts and proportions of cell clusters per FOV +# # + +# # Specify the types of cluster dfs to produce. Each row corresponds to a different way of +# # summarizing the data. 
The first item in each list is the name for the df, the second is the +# # name of the column to use to for the cluster labels, and the third is a boolean flag controlling +# # whether normalized frequencies or total counts will be returned + +# cluster_df_params = [['cluster_broad_freq', 'cell_cluster_broad', True], +# ['cluster_broad_count', 'cell_cluster_broad', False], +# ['cluster_freq', 'cell_cluster', True], +# ['cluster_count', 'cell_cluster', False], +# ['meta_cluster_freq', 'cell_meta_cluster', True], +# ['meta_cluster_count', 'cell_meta_cluster', False]] + +# cluster_dfs = [] +# for result_name, cluster_col_name, normalize in cluster_df_params: +# cluster_dfs.append(create_long_df_by_cluster(cell_table=cell_table_clusters, +# result_name=result_name, +# cluster_col_name=cluster_col_name, +# subset_col='tumor_region', +# normalize=normalize)) + + +# # create masks for dfs looking at only a subset of cells + +# # proportion of T cell subsets +# tcell_mask = cell_table_clusters['cell_cluster'].isin(['Treg', 'CD8T', 'CD4T', 'T_Other']) + +# # proportion of immune cell subsets +# immune_mask = cell_table_clusters['cell_cluster_broad'].isin(['Mono_Mac', 'T', +# 'Granulocyte', 'NK', 'B']) +# immune_mask_2 = cell_table_clusters.cell_cluster == 'Immune_Other' +# immune_mask = np.logical_or(immune_mask, immune_mask_2) + +# # proportion of stromal subsets +# stroma_mask = cell_table_clusters['cell_cluster_broad'].isin(['Stroma']) + +# # proportion of cancer subsets +# cancer_mask = cell_table_clusters['cell_cluster_broad'].isin(['Cancer']) + +# cluster_mask_params = [['tcell_freq', 'cell_cluster', True, tcell_mask], +# ['immune_freq', 'cell_cluster', True, immune_mask], +# ['stroma_freq', 'cell_meta_cluster', True, stroma_mask], +# ['cancer_freq', 'cell_meta_cluster', True, cancer_mask]] + +# for result_name, cluster_col_name, normalize, mask in cluster_mask_params: +# cluster_dfs.append(create_long_df_by_cluster(cell_table=cell_table_clusters.loc[mask, :], +# result_name=result_name, +# cluster_col_name=cluster_col_name, +# subset_col='tumor_region', +# normalize=normalize)) + +# # calculate total number of cells per image +# grouped_cell_counts = cell_table_clusters[['fov']].groupby('fov').value_counts() +# grouped_cell_counts = pd.DataFrame(grouped_cell_counts) +# grouped_cell_counts.columns = ['value'] +# grouped_cell_counts.reset_index(inplace=True) +# grouped_cell_counts['metric'] = 'total_cell_count' +# grouped_cell_counts['cell_type'] = 'all' +# grouped_cell_counts['subset'] = 'all' + +# # + +# # calculate total number of cells per region per image +# grouped_cell_counts_region = cell_table_clusters[['fov', 'tumor_region']].groupby(['fov', 'tumor_region']).value_counts() +# grouped_cell_counts_region = pd.DataFrame(grouped_cell_counts_region) +# grouped_cell_counts_region.columns = ['value'] +# grouped_cell_counts_region.reset_index(inplace=True) +# grouped_cell_counts_region['metric'] = 'total_cell_count' +# grouped_cell_counts_region.rename(columns={'tumor_region': 'subset'}, inplace=True) +# grouped_cell_counts_region['cell_type'] = 'all' + +# # calculate proportions of cells per region per image +# grouped_cell_freq_region = cell_table_clusters[['fov', 'tumor_region']].groupby(['fov']) +# grouped_cell_freq_region = grouped_cell_freq_region['tumor_region'].value_counts(normalize=True) +# grouped_cell_freq_region = pd.DataFrame(grouped_cell_freq_region) +# grouped_cell_freq_region.columns = ['value'] +# grouped_cell_freq_region.reset_index(inplace=True) +# 
grouped_cell_freq_region['metric'] = 'total_cell_freq' +# grouped_cell_freq_region.rename(columns={'tumor_region': 'subset'}, inplace=True) +# grouped_cell_freq_region['cell_type'] = 'all' + +# # add manually defined dfs to overall list +# cluster_dfs.extend([grouped_cell_counts, +# grouped_cell_counts_region, +# grouped_cell_freq_region]) + +# # create single df with appropriate metadata +# total_df = pd.concat(cluster_dfs, axis=0) + +# # compute density of cells for counts-based metrics +# count_metrics = total_df.metric.unique() +# count_metrics = [x for x in count_metrics if 'count' in x] + +# count_df = total_df.loc[total_df.metric.isin(count_metrics), :] +# area_df = area_df.rename(columns={'compartment': 'subset'}) +# count_df = count_df.merge(area_df, on=['fov', 'subset'], how='left') +# count_df['value'] = count_df['value'] / count_df['area'] +# count_df['value'] = count_df['value'] * 1000 + +# # rename metric from count to density +# count_df['metric'] = count_df['metric'].str.replace('count', 'density') +# count_df = count_df.drop(columns=['area']) +# total_df = pd.concat([total_df, count_df], axis=0) + +# # check that all metadata from core_metadata succesfully transferred over +# len_total_df = len(total_df) +# total_df = total_df.merge(harmonized_metadata, on='fov', how='left') +# assert len_total_df == len(total_df) + + +# # save annotated cluster counts +# total_df.to_csv(os.path.join(output_dir, 'cluster_df_per_core.csv'), index=False) + +# # create version aggregated by timepoint +# total_df_grouped = total_df.groupby(['Tissue_ID', 'cell_type', 'metric', 'subset']) +# total_df_timepoint = total_df_grouped['value'].agg([np.mean, np.std]) +# total_df_timepoint.reset_index(inplace=True) +# total_df_timepoint = total_df_timepoint.merge(harmonized_metadata.drop(['fov', 'MIBI_data_generated'], axis=1).drop_duplicates(), on='Tissue_ID') + +# # save timepoint df +# total_df_timepoint.to_csv(os.path.join(output_dir, 'cluster_df_per_timepoint.csv'), index=False) + + +# # +# # Create summary dataframe with proportions and counts of different functional marker populations +# # + + +# # Columns which are not thresholded (such as ratios between markers) can only be calculated for +# # dfs looking at normalized expression, and need to be dropped when calculating counts +# count_drop_cols = ['H3K9ac_H3K27me3_ratio', 'CD45RO_CD45RB_ratio'] + +# # Create list to hold parameters for each df that will be produced +# func_df_params = [['cluster_broad_count', 'cell_cluster_broad', False], +# ['cluster_broad_freq', 'cell_cluster_broad', True], +# ['cluster_count', 'cell_cluster', False], +# ['cluster_freq', 'cell_cluster', True], +# ['meta_cluster_count', 'cell_meta_cluster', False], +# ['meta_cluster_freq', 'cell_meta_cluster', True]] + + +# func_dfs = [] +# for result_name, cluster_col_name, normalize in func_df_params: +# # columns which are not functional markers need to be dropped from the df +# drop_cols = ['label'] +# if not normalize: +# drop_cols.extend(count_drop_cols) + +# # remove cluster_names except for the one specified for the df +# cluster_names = ['cell_meta_cluster', 'cell_cluster', 'cell_cluster_broad'] # , 'kmeans_labels'] +# cluster_names.remove(cluster_col_name) +# drop_cols.extend(cluster_names) + +# # create df +# func_dfs.append(create_long_df_by_functional(func_table=cell_table_func, +# result_name=result_name, +# cluster_col_name=cluster_col_name, +# drop_cols=drop_cols, +# normalize=normalize, +# subset_col='tumor_region')) + +# # create combined df +# total_df_func 
= pd.concat(func_dfs, axis=0) +# total_df_func = total_df_func.merge(harmonized_metadata, on='fov', how='inner') + +# # save combined df +# total_df_func.to_csv(os.path.join(output_dir, 'functional_df_per_core.csv'), index=False) + +# # create version aggregated by timepoint +# total_df_grouped_func = total_df_func.groupby(['Tissue_ID', 'cell_type', 'functional_marker', 'metric', 'subset']) +# total_df_timepoint_func = total_df_grouped_func['value'].agg([np.mean, np.std]) +# total_df_timepoint_func.reset_index(inplace=True) +# total_df_timepoint_func = total_df_timepoint_func.merge(harmonized_metadata.drop(['fov', 'MIBI_data_generated'], axis=1).drop_duplicates(), on='Tissue_ID') + +# # save timepoint df +# total_df_timepoint_func.to_csv(os.path.join(output_dir, 'functional_df_per_timepoint.csv'), index=False) + +# # +# # Filter functional markers +# # -# -# Generate counts and proportions of cell clusters per FOV -# - -# Specify the types of cluster dfs to produce. Each row corresponds to a different way of -# summarizing the data. The first item in each list is the name for the df, the second is the -# name of the column to use to for the cluster labels, and the third is a boolean flag controlling -# whether normalized frequencies or total counts will be returned - -cluster_df_params = [['cluster_broad_freq', 'cell_cluster_broad', True], - ['cluster_broad_count', 'cell_cluster_broad', False], - ['cluster_freq', 'cell_cluster', True], - ['cluster_count', 'cell_cluster', False], - ['meta_cluster_freq', 'cell_meta_cluster', True], - ['meta_cluster_count', 'cell_meta_cluster', False]] - -cluster_dfs = [] -for result_name, cluster_col_name, normalize in cluster_df_params: - cluster_dfs.append(create_long_df_by_cluster(cell_table=cell_table_clusters, - result_name=result_name, - cluster_col_name=cluster_col_name, - subset_col='tumor_region', - normalize=normalize)) - - -# create masks for dfs looking at only a subset of cells - -# proportion of T cell subsets -tcell_mask = cell_table_clusters['cell_cluster'].isin(['Treg', 'CD8T', 'CD4T', 'T_Other']) - -# proportion of immune cell subsets -immune_mask = cell_table_clusters['cell_cluster_broad'].isin(['Mono_Mac', 'T', - 'Granulocyte', 'NK', 'B']) -immune_mask_2 = cell_table_clusters.cell_cluster == 'Immune_Other' -immune_mask = np.logical_or(immune_mask, immune_mask_2) - -# proportion of stromal subsets -stroma_mask = cell_table_clusters['cell_cluster_broad'].isin(['Stroma']) - -# proportion of cancer subsets -cancer_mask = cell_table_clusters['cell_cluster_broad'].isin(['Cancer']) - -cluster_mask_params = [['tcell_freq', 'cell_cluster', True, tcell_mask], - ['immune_freq', 'cell_cluster', True, immune_mask], - ['stroma_freq', 'cell_meta_cluster', True, stroma_mask], - ['cancer_freq', 'cell_meta_cluster', True, cancer_mask]] - -for result_name, cluster_col_name, normalize, mask in cluster_mask_params: - cluster_dfs.append(create_long_df_by_cluster(cell_table=cell_table_clusters.loc[mask, :], - result_name=result_name, - cluster_col_name=cluster_col_name, - subset_col='tumor_region', - normalize=normalize)) - -# calculate total number of cells per image -grouped_cell_counts = cell_table_clusters[['fov']].groupby('fov').value_counts() -grouped_cell_counts = pd.DataFrame(grouped_cell_counts) -grouped_cell_counts.columns = ['value'] -grouped_cell_counts.reset_index(inplace=True) -grouped_cell_counts['metric'] = 'total_cell_count' -grouped_cell_counts['cell_type'] = 'all' -grouped_cell_counts['subset'] = 'all' - -# - -# calculate total number of 
cells per region per image -grouped_cell_counts_region = cell_table_clusters[['fov', 'tumor_region']].groupby(['fov', 'tumor_region']).value_counts() -grouped_cell_counts_region = pd.DataFrame(grouped_cell_counts_region) -grouped_cell_counts_region.columns = ['value'] -grouped_cell_counts_region.reset_index(inplace=True) -grouped_cell_counts_region['metric'] = 'total_cell_count' -grouped_cell_counts_region.rename(columns={'tumor_region': 'subset'}, inplace=True) -grouped_cell_counts_region['cell_type'] = 'all' - -# calculate proportions of cells per region per image -grouped_cell_freq_region = cell_table_clusters[['fov', 'tumor_region']].groupby(['fov']) -grouped_cell_freq_region = grouped_cell_freq_region['tumor_region'].value_counts(normalize=True) -grouped_cell_freq_region = pd.DataFrame(grouped_cell_freq_region) -grouped_cell_freq_region.columns = ['value'] -grouped_cell_freq_region.reset_index(inplace=True) -grouped_cell_freq_region['metric'] = 'total_cell_freq' -grouped_cell_freq_region.rename(columns={'tumor_region': 'subset'}, inplace=True) -grouped_cell_freq_region['cell_type'] = 'all' - -# add manually defined dfs to overall list -cluster_dfs.extend([grouped_cell_counts, - grouped_cell_counts_region, - grouped_cell_freq_region]) - -# create single df with appropriate metadata -total_df = pd.concat(cluster_dfs, axis=0) - -# compute density of cells for counts-based metrics -count_metrics = total_df.metric.unique() -count_metrics = [x for x in count_metrics if 'count' in x] - -count_df = total_df.loc[total_df.metric.isin(count_metrics), :] -area_df = area_df.rename(columns={'compartment': 'subset'}) -count_df = count_df.merge(area_df, on=['fov', 'subset'], how='left') -count_df['value'] = count_df['value'] / count_df['area'] -count_df['value'] = count_df['value'] * 1000 - -# rename metric from count to density -count_df['metric'] = count_df['metric'].str.replace('count', 'density') -count_df = count_df.drop(columns=['area']) -total_df = pd.concat([total_df, count_df], axis=0) - -# check that all metadata from core_metadata succesfully transferred over -len_total_df = len(total_df) -total_df = total_df.merge(harmonized_metadata, on='fov', how='left') -assert len_total_df == len(total_df) - - -# save annotated cluster counts -total_df.to_csv(os.path.join(output_dir, 'cluster_df_per_core.csv'), index=False) - -# create version aggregated by timepoint -total_df_grouped = total_df.groupby(['Tissue_ID', 'cell_type', 'metric', 'subset']) -total_df_timepoint = total_df_grouped['value'].agg([np.mean, np.std]) -total_df_timepoint.reset_index(inplace=True) -total_df_timepoint = total_df_timepoint.merge(harmonized_metadata.drop(['fov', 'MIBI_data_generated'], axis=1).drop_duplicates(), on='Tissue_ID') - -# save timepoint df -total_df_timepoint.to_csv(os.path.join(output_dir, 'cluster_df_per_timepoint.csv'), index=False) - - -# -# Create summary dataframe with proportions and counts of different functional marker populations -# - - -# Columns which are not thresholded (such as ratios between markers) can only be calculated for -# dfs looking at normalized expression, and need to be dropped when calculating counts -count_drop_cols = ['H3K9ac_H3K27me3_ratio', 'CD45RO_CD45RB_ratio'] - -# Create list to hold parameters for each df that will be produced -func_df_params = [['cluster_broad_count', 'cell_cluster_broad', False], - ['cluster_broad_freq', 'cell_cluster_broad', True], - ['cluster_count', 'cell_cluster', False], - ['cluster_freq', 'cell_cluster', True], - ['meta_cluster_count', 
'cell_meta_cluster', False], - ['meta_cluster_freq', 'cell_meta_cluster', True]] - - -func_dfs = [] -for result_name, cluster_col_name, normalize in func_df_params: - # columns which are not functional markers need to be dropped from the df - drop_cols = ['label'] - if not normalize: - drop_cols.extend(count_drop_cols) - - # remove cluster_names except for the one specified for the df - cluster_names = ['cell_meta_cluster', 'cell_cluster', 'cell_cluster_broad'] # , 'kmeans_labels'] - cluster_names.remove(cluster_col_name) - drop_cols.extend(cluster_names) - - # create df - func_dfs.append(create_long_df_by_functional(func_table=cell_table_func, - result_name=result_name, - cluster_col_name=cluster_col_name, - drop_cols=drop_cols, - normalize=normalize, - subset_col='tumor_region')) - -# create combined df -total_df_func = pd.concat(func_dfs, axis=0) -total_df_func = total_df_func.merge(harmonized_metadata, on='fov', how='inner') - -# save combined df -total_df_func.to_csv(os.path.join(output_dir, 'functional_df_per_core.csv'), index=False) - -# create version aggregated by timepoint -total_df_grouped_func = total_df_func.groupby(['Tissue_ID', 'cell_type', 'functional_marker', 'metric', 'subset']) -total_df_timepoint_func = total_df_grouped_func['value'].agg([np.mean, np.std]) -total_df_timepoint_func.reset_index(inplace=True) -total_df_timepoint_func = total_df_timepoint_func.merge(harmonized_metadata.drop(['fov', 'MIBI_data_generated'], axis=1).drop_duplicates(), on='Tissue_ID') - -# save timepoint df -total_df_timepoint_func.to_csv(os.path.join(output_dir, 'functional_df_per_timepoint.csv'), index=False) - -# -# Filter functional markers -# - -# filter functional markers to only include FOVs with at least the specified number of cells -total_df = pd.read_csv(os.path.join(output_dir, 'cluster_df_per_core.csv')) -min_cells = 5 - -filtered_dfs = [] -metrics = [['cluster_broad_count', 'cluster_broad_freq'], - ['cluster_count', 'cluster_freq'], - ['meta_cluster_count', 'meta_cluster_freq']] - -for metric in metrics: - # subset count df to include cells at the relevant clustering resolution - for compartment in ['cancer_core', 'cancer_border', 'stroma_core', 'stroma_border', - 'tls', 'tagg', 'all']: - count_df = total_df[total_df.metric == metric[0]] - count_df = count_df[count_df.subset == compartment] - - # subset functional df to only include functional markers at this resolution - func_df = total_df_func[total_df_func.metric.isin(metric)] - func_df = func_df[func_df.subset == compartment] - - # for each cell type, determine which FOVs have high enough counts to be included - for cell_type in count_df.cell_type.unique(): - keep_df = count_df[count_df.cell_type == cell_type] - keep_df = keep_df[keep_df.value >= min_cells] - keep_fovs = keep_df.fov.unique() - - # subset functional df to only include FOVs with high enough counts - keep_markers = func_df[func_df.cell_type == cell_type] - keep_markers = keep_markers[keep_markers.fov.isin(keep_fovs)] - - # append to list of filtered dfs - filtered_dfs.append(keep_markers) - -filtered_func_df = pd.concat(filtered_dfs) - -# take subset for plotting average functional marker expression -filtered_func_df_plot = filtered_func_df.loc[filtered_func_df.subset == 'all', :] -filtered_func_df_plot = filtered_func_df_plot.loc[filtered_func_df_plot.metric.isin(['cluster_broad_freq', 'cluster_freq', 'meta_cluster_freq']), :] -filtered_func_df_plot = filtered_func_df_plot.loc[filtered_func_df_plot.functional_marker.isin(sp_markers), :] - -# save filtered df 
-filtered_func_df_plot.to_csv(os.path.join(output_dir, 'functional_df_per_core_filtered.csv'), index=False) - -# # identify combinations of markers and cell types to include in analysis based on threshold -# mean_percent_positive = 0.05 -# sp_markers = [x for x in filtered_func_df.functional_marker.unique() if '__' not in x] -# broad_df = filtered_func_df[filtered_func_df.metric == 'cluster_broad_freq'] -# broad_df = broad_df[broad_df.functional_marker.isin(sp_markers)] -# broad_df = broad_df[broad_df.subset == 'all'] -# broad_df = broad_df[broad_df.Timepoint.isin(['primary_untreated', 'baseline', 'post_induction', 'on_nivo'])] -# broad_df_agg = broad_df[['fov', 'functional_marker', 'cell_type', 'value']].groupby(['cell_type', 'functional_marker']).agg(np.mean) -# -# broad_df_agg = broad_df_agg.reset_index() -# broad_df = broad_df_agg.pivot(index='cell_type', columns='functional_marker', values='value') -# broad_df_include = broad_df > mean_percent_positive -# -# # include for all cells -# general_markers = ['Ki67', 'HLA1', 'Vim', 'H3K9ac_H3K27me3_ratio'] -# broad_df_include[[general_markers]] = True -# -# # CD45 isoform ratios -# double_pos = np.logical_and(broad_df_include['CD45RO'], broad_df_include['CD45RB']) -# broad_df_include['CD45RO_CD45RB_ratio'] = double_pos -# -# # Cancer expression -# broad_df_include.loc['Cancer', ['HLADR', 'CD57']] = True -# -# broad_df_include.to_csv(os.path.join(intermediate_dir, 'post_processing', 'inclusion_matrix_broad.csv')) -# -# # apply thresholds to medium level clustering -# assignment_dict_med = {'Cancer': ['Cancer', 'Cancer_EMT', 'Cancer_Other'], -# 'Mono_Mac': ['M1_Mac', 'M2_Mac', 'Mac_Other', 'Monocyte', 'APC'], -# 'B': ['B'], -# 'T': ['CD4T', 'CD8T', 'Treg', 'T_Other', 'Immune_Other'], -# 'Granulocyte': ['Neutrophil', 'Mast'], -# 'Stroma': ['Endothelium', 'Fibroblast', 'Stroma'], -# 'NK': ['NK'], -# 'Other': ['Other']} -# -# # get a list of all cell types -# cell_types = np.concatenate([assignment_dict_med[key] for key in assignment_dict_med.keys()]) -# cell_types.sort() -# -# med_df_include = pd.DataFrame(index=cell_types, columns=broad_df.columns) -# -# for key in assignment_dict_med.keys(): -# values = assignment_dict_med[key] -# med_df_include.loc[values] = broad_df_include.loc[key].values -# -# # check if assignment makes sense -# med_df = filtered_func_df[filtered_func_df.metric == 'cluster_freq'] -# med_df = med_df[med_df.functional_marker.isin(sp_markers)] -# med_df = med_df[med_df.subset == 'all'] -# med_df = med_df[med_df.Timepoint.isin(['primary_untreated', 'baseline', 'post_induction', 'on_nivo'])] -# med_df_agg = med_df[['fov', 'functional_marker', 'cell_type', 'value']].groupby(['cell_type', 'functional_marker']).agg(np.mean) -# -# med_df_agg.reset_index(inplace=True) -# med_df = med_df_agg.pivot(index='cell_type', columns='functional_marker', values='value') -# med_df_bin = med_df > mean_percent_positive -# -# # add CD38 signal -# med_df_include.loc[['Endothelium', 'Immune_Other'], 'CD38'] = True -# -# # add IDO signal -# med_df_include.loc[['APC'], 'IDO'] = True -# -# # compare to see where assignments disagree, to see if any others need to be added -# new_includes = (med_df_bin == True) & (med_df_include == False) -# -# med_df_include.to_csv(os.path.join(intermediate_dir, 'post_processing', 'inclusion_matrix_med.csv')) -# -# # do the same for the fine-grained clustering -# assignment_dict_meta = {'Cancer': ['Cancer_CD56', 'Cancer_CK17', 'Cancer_Ecad'], -# 'Cancer_EMT': ['Cancer_SMA', 'Cancer_Vim'], -# 'Cancer_Other': 
['Cancer_Other', 'Cancer_Mono'], -# 'M1_Mac': ['CD68'], -# 'M2_Mac': ['CD163'], -# 'Mac_Other': ['CD68_CD163_DP'], -# 'Monocyte': ['CD4_Mono', 'CD14'], -# 'APC': ['CD11c_HLADR'], -# 'B': ['CD20'], -# 'Endothelium': ['CD31', 'CD31_VIM'], -# 'Fibroblast': ['FAP', 'FAP_SMA', 'SMA'], -# 'Stroma': ['Stroma_Collagen', 'Stroma_Fibronectin', 'VIM'], -# 'NK': ['CD56'], -# 'Neutrophil': ['Neutrophil'], -# 'Mast': ['Mast'], -# 'CD4T': ['CD4T','CD4T_HLADR'], -# 'CD8T': ['CD8T'], -# 'Treg': ['Treg'], -# 'T_Other': ['CD3_DN','CD4T_CD8T_DP'], -# 'Immune_Other': ['Immune_Other'], -# 'Other': ['Other']} -# -# # get a list of all cell types -# cell_types = np.concatenate([assignment_dict_meta[key] for key in assignment_dict_meta.keys()]) -# cell_types.sort() -# -# meta_df_include = pd.DataFrame(index=cell_types, columns=broad_df.columns) -# -# for key in assignment_dict_meta.keys(): -# values = assignment_dict_meta[key] -# meta_df_include.loc[values] = med_df_include.loc[key].values -# -# # check if assignment makes sense -# meta_df = filtered_func_df[filtered_func_df.metric == 'meta_cluster_freq'] -# meta_df = meta_df[meta_df.functional_marker.isin(sp_markers)] -# meta_df = meta_df[meta_df.subset == 'all'] -# meta_df = meta_df[meta_df.Timepoint.isin(['primary_untreated', 'baseline', 'post_induction', 'on_nivo'])] -# meta_df_agg = meta_df[['fov', 'functional_marker', 'cell_type', 'value']].groupby(['cell_type', 'functional_marker']).agg(np.mean) -# -# meta_df_agg.reset_index(inplace=True) -# meta_df = meta_df_agg.pivot(index='cell_type', columns='functional_marker', values='value') -# meta_df_bin = meta_df > mean_percent_positive -# -# # compare to see where assignments disagree -# new_includes = (meta_df_bin == True) & (meta_df_include == False) -# -# meta_df_include.to_csv(os.path.join(intermediate_dir, 'post_processing', 'inclusion_matrix_meta.csv')) - -# process functional data so that only the specified cell type/marker combos are included - -# load matrices -broad_df_include = pd.read_csv(os.path.join(intermediate_dir, 'post_processing', 'inclusion_matrix_broad.csv'), index_col=0) -med_df_include = pd.read_csv(os.path.join(intermediate_dir, 'post_processing', 'inclusion_matrix_med.csv'), index_col=0) -meta_df_include = pd.read_csv(os.path.join(intermediate_dir, 'post_processing', 'inclusion_matrix_meta.csv'), index_col=0) - -# identify metrics and dfs that will be filtered -filtering = [['cluster_broad_count', 'cluster_broad_freq', broad_df_include], - ['cluster_count', 'cluster_freq', med_df_include], - ['meta_cluster_count', 'meta_cluster_freq', meta_df_include]] - -combo_dfs = [] - -for filters in filtering: - # get variables - metric_names = filters[:2] - metric_df = filters[2] - - # subset functional df to only include functional markers at this resolution - func_df = filtered_func_df[filtered_func_df.metric.isin(metric_names)] - - # loop over each cell type, and get the corresponding markers - for cell_type in metric_df.index: - markers = metric_df.columns[metric_df.loc[cell_type] == True] - - # subset functional df to only include this cell type - func_df_cell = func_df[func_df.cell_type == cell_type] - - # subset functional df to only include markers for this cell type - func_df_cell = func_df_cell[func_df_cell.functional_marker.isin(markers)] - - # append to list of dfs - combo_dfs.append(func_df_cell) - -# do the same thing for double positive cells - -# # identify combinations of markers and cell types to include in analysis based on threshold -# dp_markers = [x for x in 
filtered_func_df.functional_marker.unique() if '__' in x] -# mean_percent_positive_dp = 0.05 -# broad_df_dp = filtered_func_df[filtered_func_df.metric == 'cluster_broad_freq'] -# broad_df_dp = broad_df_dp[broad_df_dp.subset == 'all'] -# broad_df_dp = broad_df_dp[broad_df_dp.Timepoint.isin(['primary_untreated', 'baseline', 'post_induction', 'on_nivo'])] -# broad_df_dp = broad_df_dp[broad_df_dp.functional_marker.isin(dp_markers)] -# broad_df_dp_agg = broad_df_dp[['fov', 'functional_marker', 'cell_type', 'value']].groupby(['cell_type', 'functional_marker']).agg(np.mean) -# -# broad_df_dp_agg = broad_df_dp_agg.reset_index() -# broad_df_dp = broad_df_dp_agg.pivot(index='cell_type', columns='functional_marker', values='value') -# broad_df_dp_include = broad_df_dp > mean_percent_positive_dp -# -# broad_df_dp_include.to_csv(os.path.join(intermediate_dir, 'post_processing', 'inclusion_matrix_broad_dp.csv')) -# -# # do the same for medium-level clustering -# med_df_dp = filtered_func_df[filtered_func_df.metric == 'cluster_freq'] -# med_df_dp = med_df_dp[med_df_dp.subset == 'all'] -# med_df_dp = med_df_dp[med_df_dp.Timepoint.isin(['primary_untreated', 'baseline', 'post_induction', 'on_nivo'])] -# med_df_dp = med_df_dp[med_df_dp.functional_marker.isin(dp_markers)] -# med_df_dp_agg = med_df_dp[['fov', 'functional_marker', 'cell_type', 'value']].groupby(['cell_type', 'functional_marker']).agg(np.mean) -# -# # create matrix of cell types and markers -# med_df_dp_agg = med_df_dp_agg.reset_index() -# med_df_dp = med_df_dp_agg.pivot(index='cell_type', columns='functional_marker', values='value') -# med_df_dp_include = med_df_dp > mean_percent_positive_dp -# -# med_df_dp_include.to_csv(os.path.join(intermediate_dir, 'post_processing', 'inclusion_matrix_med_dp.csv')) -# -# -# # do the same for finest-level clustering -# meta_df_dp = filtered_func_df[filtered_func_df.metric == 'meta_cluster_freq'] -# meta_df_dp = meta_df_dp[meta_df_dp.subset == 'all'] -# meta_df_dp = meta_df_dp[meta_df_dp.Timepoint.isin(['primary_untreated', 'baseline', 'post_induction', 'on_nivo'])] -# meta_df_dp = meta_df_dp[meta_df_dp.functional_marker.isin(dp_markers)] -# meta_df_dp_agg = meta_df_dp[['fov', 'functional_marker', 'cell_type', 'value']].groupby(['cell_type', 'functional_marker']).agg(np.mean) -# -# # create matrix of cell types and markers -# meta_df_dp_agg = meta_df_dp_agg.reset_index() -# meta_df_dp = meta_df_dp_agg.pivot(index='cell_type', columns='functional_marker', values='value') -# meta_df_dp_include = meta_df_dp > mean_percent_positive_dp -# -# meta_df_dp_include.to_csv(os.path.join(intermediate_dir, 'post_processing', 'inclusion_matrix_meta_dp.csv')) - -# load inclusion matrices -broad_df_include_dp = pd.read_csv(os.path.join(intermediate_dir, 'post_processing', 'inclusion_matrix_broad_dp.csv'), index_col=0) -med_df_include_dp = pd.read_csv(os.path.join(intermediate_dir, 'post_processing', 'inclusion_matrix_med_dp.csv'), index_col=0) -meta_df_include_dp = pd.read_csv(os.path.join(intermediate_dir, 'post_processing', 'inclusion_matrix_meta_dp.csv'), index_col=0) - -# identify metrics and dfs that will be filtered -filtering = [['cluster_broad_count', 'cluster_broad_freq', broad_df_include_dp], - ['cluster_count', 'cluster_freq', med_df_include_dp], - ['meta_cluster_count', 'meta_cluster_freq', meta_df_include_dp]] - -for filters in filtering: - # get variables - metric_names = filters[:2] - metric_df = filters[2] - - # subset functional df to only include functional markers at this resolution - func_df = 
filtered_func_df[filtered_func_df.metric.isin(metric_names)] - - # loop over each cell type, and get the corresponding markers - for cell_type in metric_df.index: - markers = metric_df.columns[metric_df.loc[cell_type] == True] - - # subset functional df to only include this cell type - func_df_cell = func_df[func_df.cell_type == cell_type] - - # subset functional df to only include markers for this cell type - func_df_cell = func_df_cell[func_df_cell.functional_marker.isin(markers)] - - # append to list of dfs - combo_dfs.append(func_df_cell) - - -# create manual df with total functional marker positivity across all cells in an image -# dp_markers = [x for x in filtered_func_df.functional_marker.unique() if '__' in x] -# func_table_small = cell_table_func.loc[:, ~cell_table_func.columns.isin(['cell_cluster', 'cell_cluster_broad', 'cell_meta_cluster', 'label', 'tumor_region'])] -# func_table_small = func_table_small.loc[:, ~func_table_small.columns.isin(dp_markers)] -# -# # group by specified columns -# grouped_table = func_table_small.groupby('fov') -# transformed = grouped_table.agg(np.mean) -# transformed.reset_index(inplace=True) -# -# # reshape to long df -# long_df = pd.melt(transformed, id_vars=['fov'], var_name='functional_marker') -# long_df['metric'] = 'total_freq' -# long_df['cell_type'] = 'all' -# long_df['subset'] = 'all' -# -# long_df = long_df.merge(harmonized_metadata, on='fov', how='inner') -# -# long_df.to_csv(os.path.join(output_dir, 'total_func_per_core.csv'), index=False) - -# append to list of dfs -long_df = pd.read_csv(os.path.join(output_dir, 'total_func_per_core.csv')) -combo_dfs.append(long_df) - -# combine -combo_df = pd.concat(combo_dfs) -combo_df.to_csv(os.path.join(output_dir, 'functional_df_per_core_filtered.csv'), index=False) - -# create version of filtered df aggregated by timepoint -combo_df_grouped_func = combo_df.groupby(['Tissue_ID', 'cell_type', 'functional_marker', 'metric', 'subset']) -combo_df_timepoint_func = combo_df_grouped_func['value'].agg([np.mean, np.std]) -combo_df_timepoint_func.reset_index(inplace=True) -combo_df_timepoint_func = combo_df_timepoint_func.merge(harmonized_metadata.drop(['fov', 'MIBI_data_generated'], axis=1).drop_duplicates(), on='Tissue_ID') - -# save timepoint df -combo_df_timepoint_func.to_csv(os.path.join(output_dir, 'functional_df_per_timepoint_filtered.csv'), index=False) +# # filter functional markers to only include FOVs with at least the specified number of cells +# total_df = pd.read_csv(os.path.join(output_dir, 'cluster_df_per_core.csv')) +# min_cells = 5 + +# filtered_dfs = [] +# metrics = [['cluster_broad_count', 'cluster_broad_freq'], +# ['cluster_count', 'cluster_freq'], +# ['meta_cluster_count', 'meta_cluster_freq']] + +# for metric in metrics: +# # subset count df to include cells at the relevant clustering resolution +# for compartment in ['cancer_core', 'cancer_border', 'stroma_core', 'stroma_border', +# 'tls', 'tagg', 'all']: +# count_df = total_df[total_df.metric == metric[0]] +# count_df = count_df[count_df.subset == compartment] + +# # subset functional df to only include functional markers at this resolution +# func_df = total_df_func[total_df_func.metric.isin(metric)] +# func_df = func_df[func_df.subset == compartment] + +# # for each cell type, determine which FOVs have high enough counts to be included +# for cell_type in count_df.cell_type.unique(): +# keep_df = count_df[count_df.cell_type == cell_type] +# keep_df = keep_df[keep_df.value >= min_cells] +# keep_fovs = keep_df.fov.unique() + +# # 
subset functional df to only include FOVs with high enough counts +# keep_markers = func_df[func_df.cell_type == cell_type] +# keep_markers = keep_markers[keep_markers.fov.isin(keep_fovs)] + +# # append to list of filtered dfs +# filtered_dfs.append(keep_markers) + +# filtered_func_df = pd.concat(filtered_dfs) + +# # take subset for plotting average functional marker expression +# filtered_func_df_plot = filtered_func_df.loc[filtered_func_df.subset == 'all', :] +# filtered_func_df_plot = filtered_func_df_plot.loc[filtered_func_df_plot.metric.isin(['cluster_broad_freq', 'cluster_freq', 'meta_cluster_freq']), :] +# filtered_func_df_plot = filtered_func_df_plot.loc[filtered_func_df_plot.functional_marker.isin(sp_markers), :] + +# # save filtered df +# filtered_func_df_plot.to_csv(os.path.join(output_dir, 'functional_df_per_core_filtered.csv'), index=False) + +# # # identify combinations of markers and cell types to include in analysis based on threshold +# # mean_percent_positive = 0.05 +# # sp_markers = [x for x in filtered_func_df.functional_marker.unique() if '__' not in x] +# # broad_df = filtered_func_df[filtered_func_df.metric == 'cluster_broad_freq'] +# # broad_df = broad_df[broad_df.functional_marker.isin(sp_markers)] +# # broad_df = broad_df[broad_df.subset == 'all'] +# # broad_df = broad_df[broad_df.Timepoint.isin(['primary_untreated', 'baseline', 'post_induction', 'on_nivo'])] +# # broad_df_agg = broad_df[['fov', 'functional_marker', 'cell_type', 'value']].groupby(['cell_type', 'functional_marker']).agg(np.mean) +# # +# # broad_df_agg = broad_df_agg.reset_index() +# # broad_df = broad_df_agg.pivot(index='cell_type', columns='functional_marker', values='value') +# # broad_df_include = broad_df > mean_percent_positive +# # +# # # include for all cells +# # general_markers = ['Ki67', 'HLA1', 'Vim', 'H3K9ac_H3K27me3_ratio'] +# # broad_df_include[[general_markers]] = True +# # +# # # CD45 isoform ratios +# # double_pos = np.logical_and(broad_df_include['CD45RO'], broad_df_include['CD45RB']) +# # broad_df_include['CD45RO_CD45RB_ratio'] = double_pos +# # +# # # Cancer expression +# # broad_df_include.loc['Cancer', ['HLADR', 'CD57']] = True +# # +# # broad_df_include.to_csv(os.path.join(intermediate_dir, 'post_processing', 'inclusion_matrix_broad.csv')) +# # +# # # apply thresholds to medium level clustering +# # assignment_dict_med = {'Cancer': ['Cancer', 'Cancer_EMT', 'Cancer_Other'], +# # 'Mono_Mac': ['M1_Mac', 'M2_Mac', 'Mac_Other', 'Monocyte', 'APC'], +# # 'B': ['B'], +# # 'T': ['CD4T', 'CD8T', 'Treg', 'T_Other', 'Immune_Other'], +# # 'Granulocyte': ['Neutrophil', 'Mast'], +# # 'Stroma': ['Endothelium', 'Fibroblast', 'Stroma'], +# # 'NK': ['NK'], +# # 'Other': ['Other']} +# # +# # # get a list of all cell types +# # cell_types = np.concatenate([assignment_dict_med[key] for key in assignment_dict_med.keys()]) +# # cell_types.sort() +# # +# # med_df_include = pd.DataFrame(index=cell_types, columns=broad_df.columns) +# # +# # for key in assignment_dict_med.keys(): +# # values = assignment_dict_med[key] +# # med_df_include.loc[values] = broad_df_include.loc[key].values +# # +# # # check if assignment makes sense +# # med_df = filtered_func_df[filtered_func_df.metric == 'cluster_freq'] +# # med_df = med_df[med_df.functional_marker.isin(sp_markers)] +# # med_df = med_df[med_df.subset == 'all'] +# # med_df = med_df[med_df.Timepoint.isin(['primary_untreated', 'baseline', 'post_induction', 'on_nivo'])] +# # med_df_agg = med_df[['fov', 'functional_marker', 'cell_type', 
'value']].groupby(['cell_type', 'functional_marker']).agg(np.mean) +# # +# # med_df_agg.reset_index(inplace=True) +# # med_df = med_df_agg.pivot(index='cell_type', columns='functional_marker', values='value') +# # med_df_bin = med_df > mean_percent_positive +# # +# # # add CD38 signal +# # med_df_include.loc[['Endothelium', 'Immune_Other'], 'CD38'] = True +# # +# # # add IDO signal +# # med_df_include.loc[['APC'], 'IDO'] = True +# # +# # # compare to see where assignments disagree, to see if any others need to be added +# # new_includes = (med_df_bin == True) & (med_df_include == False) +# # +# # med_df_include.to_csv(os.path.join(intermediate_dir, 'post_processing', 'inclusion_matrix_med.csv')) +# # +# # # do the same for the fine-grained clustering +# # assignment_dict_meta = {'Cancer': ['Cancer_CD56', 'Cancer_CK17', 'Cancer_Ecad'], +# # 'Cancer_EMT': ['Cancer_SMA', 'Cancer_Vim'], +# # 'Cancer_Other': ['Cancer_Other', 'Cancer_Mono'], +# # 'M1_Mac': ['CD68'], +# # 'M2_Mac': ['CD163'], +# # 'Mac_Other': ['CD68_CD163_DP'], +# # 'Monocyte': ['CD4_Mono', 'CD14'], +# # 'APC': ['CD11c_HLADR'], +# # 'B': ['CD20'], +# # 'Endothelium': ['CD31', 'CD31_VIM'], +# # 'Fibroblast': ['FAP', 'FAP_SMA', 'SMA'], +# # 'Stroma': ['Stroma_Collagen', 'Stroma_Fibronectin', 'VIM'], +# # 'NK': ['CD56'], +# # 'Neutrophil': ['Neutrophil'], +# # 'Mast': ['Mast'], +# # 'CD4T': ['CD4T','CD4T_HLADR'], +# # 'CD8T': ['CD8T'], +# # 'Treg': ['Treg'], +# # 'T_Other': ['CD3_DN','CD4T_CD8T_DP'], +# # 'Immune_Other': ['Immune_Other'], +# # 'Other': ['Other']} +# # +# # # get a list of all cell types +# # cell_types = np.concatenate([assignment_dict_meta[key] for key in assignment_dict_meta.keys()]) +# # cell_types.sort() +# # +# # meta_df_include = pd.DataFrame(index=cell_types, columns=broad_df.columns) +# # +# # for key in assignment_dict_meta.keys(): +# # values = assignment_dict_meta[key] +# # meta_df_include.loc[values] = med_df_include.loc[key].values +# # +# # # check if assignment makes sense +# # meta_df = filtered_func_df[filtered_func_df.metric == 'meta_cluster_freq'] +# # meta_df = meta_df[meta_df.functional_marker.isin(sp_markers)] +# # meta_df = meta_df[meta_df.subset == 'all'] +# # meta_df = meta_df[meta_df.Timepoint.isin(['primary_untreated', 'baseline', 'post_induction', 'on_nivo'])] +# # meta_df_agg = meta_df[['fov', 'functional_marker', 'cell_type', 'value']].groupby(['cell_type', 'functional_marker']).agg(np.mean) +# # +# # meta_df_agg.reset_index(inplace=True) +# # meta_df = meta_df_agg.pivot(index='cell_type', columns='functional_marker', values='value') +# # meta_df_bin = meta_df > mean_percent_positive +# # +# # # compare to see where assignments disagree +# # new_includes = (meta_df_bin == True) & (meta_df_include == False) +# # +# # meta_df_include.to_csv(os.path.join(intermediate_dir, 'post_processing', 'inclusion_matrix_meta.csv')) + +# # process functional data so that only the specified cell type/marker combos are included + +# # load matrices +# broad_df_include = pd.read_csv(os.path.join(intermediate_dir, 'post_processing', 'inclusion_matrix_broad.csv'), index_col=0) +# med_df_include = pd.read_csv(os.path.join(intermediate_dir, 'post_processing', 'inclusion_matrix_med.csv'), index_col=0) +# meta_df_include = pd.read_csv(os.path.join(intermediate_dir, 'post_processing', 'inclusion_matrix_meta.csv'), index_col=0) + +# # identify metrics and dfs that will be filtered +# filtering = [['cluster_broad_count', 'cluster_broad_freq', broad_df_include], +# ['cluster_count', 'cluster_freq', 
med_df_include], +# ['meta_cluster_count', 'meta_cluster_freq', meta_df_include]] + +# combo_dfs = [] + +# for filters in filtering: +# # get variables +# metric_names = filters[:2] +# metric_df = filters[2] +# # subset functional df to only include functional markers at this resolution +# func_df = filtered_func_df[filtered_func_df.metric.isin(metric_names)] + +# # loop over each cell type, and get the corresponding markers +# for cell_type in metric_df.index: +# markers = metric_df.columns[metric_df.loc[cell_type] == True] + +# # subset functional df to only include this cell type +# func_df_cell = func_df[func_df.cell_type == cell_type] + +# # subset functional df to only include markers for this cell type +# func_df_cell = func_df_cell[func_df_cell.functional_marker.isin(markers)] + +# # append to list of dfs +# combo_dfs.append(func_df_cell) + +# # do the same thing for double positive cells + +# # # identify combinations of markers and cell types to include in analysis based on threshold +# # dp_markers = [x for x in filtered_func_df.functional_marker.unique() if '__' in x] +# # mean_percent_positive_dp = 0.05 +# # broad_df_dp = filtered_func_df[filtered_func_df.metric == 'cluster_broad_freq'] +# # broad_df_dp = broad_df_dp[broad_df_dp.subset == 'all'] +# # broad_df_dp = broad_df_dp[broad_df_dp.Timepoint.isin(['primary_untreated', 'baseline', 'post_induction', 'on_nivo'])] +# # broad_df_dp = broad_df_dp[broad_df_dp.functional_marker.isin(dp_markers)] +# # broad_df_dp_agg = broad_df_dp[['fov', 'functional_marker', 'cell_type', 'value']].groupby(['cell_type', 'functional_marker']).agg(np.mean) +# # +# # broad_df_dp_agg = broad_df_dp_agg.reset_index() +# # broad_df_dp = broad_df_dp_agg.pivot(index='cell_type', columns='functional_marker', values='value') +# # broad_df_dp_include = broad_df_dp > mean_percent_positive_dp +# # +# # broad_df_dp_include.to_csv(os.path.join(intermediate_dir, 'post_processing', 'inclusion_matrix_broad_dp.csv')) +# # +# # # do the same for medium-level clustering +# # med_df_dp = filtered_func_df[filtered_func_df.metric == 'cluster_freq'] +# # med_df_dp = med_df_dp[med_df_dp.subset == 'all'] +# # med_df_dp = med_df_dp[med_df_dp.Timepoint.isin(['primary_untreated', 'baseline', 'post_induction', 'on_nivo'])] +# # med_df_dp = med_df_dp[med_df_dp.functional_marker.isin(dp_markers)] +# # med_df_dp_agg = med_df_dp[['fov', 'functional_marker', 'cell_type', 'value']].groupby(['cell_type', 'functional_marker']).agg(np.mean) +# # +# # # create matrix of cell types and markers +# # med_df_dp_agg = med_df_dp_agg.reset_index() +# # med_df_dp = med_df_dp_agg.pivot(index='cell_type', columns='functional_marker', values='value') +# # med_df_dp_include = med_df_dp > mean_percent_positive_dp +# # +# # med_df_dp_include.to_csv(os.path.join(intermediate_dir, 'post_processing', 'inclusion_matrix_med_dp.csv')) +# # +# # +# # # do the same for finest-level clustering +# # meta_df_dp = filtered_func_df[filtered_func_df.metric == 'meta_cluster_freq'] +# # meta_df_dp = meta_df_dp[meta_df_dp.subset == 'all'] +# # meta_df_dp = meta_df_dp[meta_df_dp.Timepoint.isin(['primary_untreated', 'baseline', 'post_induction', 'on_nivo'])] +# # meta_df_dp = meta_df_dp[meta_df_dp.functional_marker.isin(dp_markers)] +# # meta_df_dp_agg = meta_df_dp[['fov', 'functional_marker', 'cell_type', 'value']].groupby(['cell_type', 'functional_marker']).agg(np.mean) +# # +# # # create matrix of cell types and markers +# # meta_df_dp_agg = meta_df_dp_agg.reset_index() +# # meta_df_dp = 
meta_df_dp_agg.pivot(index='cell_type', columns='functional_marker', values='value') +# # meta_df_dp_include = meta_df_dp > mean_percent_positive_dp +# # +# # meta_df_dp_include.to_csv(os.path.join(intermediate_dir, 'post_processing', 'inclusion_matrix_meta_dp.csv')) + +# # load inclusion matrices +# broad_df_include_dp = pd.read_csv(os.path.join(intermediate_dir, 'post_processing', 'inclusion_matrix_broad_dp.csv'), index_col=0) +# med_df_include_dp = pd.read_csv(os.path.join(intermediate_dir, 'post_processing', 'inclusion_matrix_med_dp.csv'), index_col=0) +# meta_df_include_dp = pd.read_csv(os.path.join(intermediate_dir, 'post_processing', 'inclusion_matrix_meta_dp.csv'), index_col=0) + +# # identify metrics and dfs that will be filtered +# filtering = [['cluster_broad_count', 'cluster_broad_freq', broad_df_include_dp], +# ['cluster_count', 'cluster_freq', med_df_include_dp], +# ['meta_cluster_count', 'meta_cluster_freq', meta_df_include_dp]] + +# for filters in filtering: +# # get variables +# metric_names = filters[:2] +# metric_df = filters[2] -# -# Remove double positive functional markers that are highly correlated with single positive scores -# +# # subset functional df to only include functional markers at this resolution +# func_df = filtered_func_df[filtered_func_df.metric.isin(metric_names)] + +# # loop over each cell type, and get the corresponding markers +# for cell_type in metric_df.index: +# markers = metric_df.columns[metric_df.loc[cell_type] == True] + +# # subset functional df to only include this cell type +# func_df_cell = func_df[func_df.cell_type == cell_type] + +# # subset functional df to only include markers for this cell type +# func_df_cell = func_df_cell[func_df_cell.functional_marker.isin(markers)] + +# # append to list of dfs +# combo_dfs.append(func_df_cell) + + +# # create manual df with total functional marker positivity across all cells in an image +# # dp_markers = [x for x in filtered_func_df.functional_marker.unique() if '__' in x] +# # func_table_small = cell_table_func.loc[:, ~cell_table_func.columns.isin(['cell_cluster', 'cell_cluster_broad', 'cell_meta_cluster', 'label', 'tumor_region'])] +# # func_table_small = func_table_small.loc[:, ~func_table_small.columns.isin(dp_markers)] +# # +# # # group by specified columns +# # grouped_table = func_table_small.groupby('fov') +# # transformed = grouped_table.agg(np.mean) +# # transformed.reset_index(inplace=True) +# # +# # # reshape to long df +# # long_df = pd.melt(transformed, id_vars=['fov'], var_name='functional_marker') +# # long_df['metric'] = 'total_freq' +# # long_df['cell_type'] = 'all' +# # long_df['subset'] = 'all' +# # +# # long_df = long_df.merge(harmonized_metadata, on='fov', how='inner') +# # +# # long_df.to_csv(os.path.join(output_dir, 'total_func_per_core.csv'), index=False) + +# # append to list of dfs +# long_df = pd.read_csv(os.path.join(output_dir, 'total_func_per_core.csv')) +# combo_dfs.append(long_df) + +# # combine +# combo_df = pd.concat(combo_dfs) +# combo_df.to_csv(os.path.join(output_dir, 'functional_df_per_core_filtered.csv'), index=False) + +# # create version of filtered df aggregated by timepoint +# combo_df_grouped_func = combo_df.groupby(['Tissue_ID', 'cell_type', 'functional_marker', 'metric', 'subset']) +# combo_df_timepoint_func = combo_df_grouped_func['value'].agg([np.mean, np.std]) +# combo_df_timepoint_func.reset_index(inplace=True) +# combo_df_timepoint_func = combo_df_timepoint_func.merge(harmonized_metadata.drop(['fov', 'MIBI_data_generated'], 
axis=1).drop_duplicates(), on='Tissue_ID') + +# # save timepoint df +# combo_df_timepoint_func.to_csv(os.path.join(output_dir, 'functional_df_per_timepoint_filtered.csv'), index=False) + + +# # +# # Remove double positive functional markers that are highly correlated with single positive scores +# # + +# # cluster_resolution = [['cluster_broad_freq', 'cluster_broad_count'], +# # ['cluster_freq', 'cluster_count'], +# # ['meta_cluster_freq', 'meta_cluster_count']] +# # +# # all_markers = combo_df.functional_marker.unique() +# # dp_markers = [x for x in all_markers if '__' in x] +# # +# # exclude_lists = [] +# # for cluster in cluster_resolution: +# # +# # # subset functional df to only include functional markers at this resolution +# # func_df = combo_df[combo_df.metric.isin(cluster)] +# # +# # # add unique identifier for cell + marker combo +# # func_df['feature_name'] = func_df['cell_type'] + '__' + func_df['functional_marker'] +# # +# # # subset the df further to look at just frequency and just one compartment +# # func_df_subset = func_df[func_df.metric == cluster[0]] +# # func_df_subset = func_df_subset[func_df_subset.subset == 'all'] +# # +# # # loop over each cell type, and each double positive functional marker +# # exclude_markers = [] +# # cell_types = func_df_subset.cell_type.unique() +# # for cell_type in cell_types: +# # for marker in dp_markers: +# # # get the two markers that make up the double positive marker +# # marker_1, marker_2 = marker.split('__') +# # +# # # subset to only include this cell type and these markers +# # current_df = func_df_subset.loc[func_df_subset.cell_type == cell_type, :] +# # current_df = current_df.loc[current_df.functional_marker.isin([marker, marker_1, marker_2]), :] +# # +# # # these markers are not present in this cell type +# # if len(current_df) == 0: +# # continue +# # +# # # this double positive marker is not present in this cell type +# # if marker not in current_df.functional_marker.unique(): +# # continue +# # +# # current_df_wide = current_df.pivot(index='fov', columns='functional_marker', values='value') +# # +# # # the double positive marker is present, but both single positives are not; exclude it +# # if len(current_df_wide.columns) != 3: +# # exclude_markers.append(marker) +# # continue +# # +# # corr_1, _ = spearmanr(current_df_wide[marker_1].values, current_df_wide[marker].values) +# # corr_2, _ = spearmanr(current_df_wide[marker_2].values, current_df_wide[marker].values) +# # +# # if (corr_1 > 0.7) | (corr_2 > 0.7): +# # exclude_markers.append(cell_type + '__' + marker) +# # +# # # add to list +# # exclude_lists.append(exclude_markers) +# # +# # # construct df to hold list of exlcuded cells +# # exclude_df_cluster = ['cluster_broad_freq'] * len(exclude_lists[0]) + ['cluster_freq'] * len(exclude_lists[1]) + ['meta_cluster_freq'] * len(exclude_lists[2]) +# # exclude_df_name = exclude_lists[0] + exclude_lists[1] + exclude_lists[2] +# # +# # exclude_df = pd.DataFrame({'metric': exclude_df_cluster, 'feature_name': exclude_df_name}) +# # exclude_df.to_csv(os.path.join(intermediate_dir, 'post_processing', 'exclude_double_positive_markers.csv'), index=False) + +# # use previously generated exclude list +# exclude_df = pd.read_csv(os.path.join(intermediate_dir, 'post_processing', 'exclude_double_positive_markers.csv')) + +# dedup_dfs = [] # cluster_resolution = [['cluster_broad_freq', 'cluster_broad_count'], # ['cluster_freq', 'cluster_count'], # ['meta_cluster_freq', 'meta_cluster_count']] -# -# all_markers = 
combo_df.functional_marker.unique() -# dp_markers = [x for x in all_markers if '__' in x] -# -# exclude_lists = [] + # for cluster in cluster_resolution: -# # # subset functional df to only include functional markers at this resolution # func_df = combo_df[combo_df.metric.isin(cluster)] -# + # # add unique identifier for cell + marker combo # func_df['feature_name'] = func_df['cell_type'] + '__' + func_df['functional_marker'] -# -# # subset the df further to look at just frequency and just one compartment -# func_df_subset = func_df[func_df.metric == cluster[0]] -# func_df_subset = func_df_subset[func_df_subset.subset == 'all'] -# -# # loop over each cell type, and each double positive functional marker -# exclude_markers = [] -# cell_types = func_df_subset.cell_type.unique() -# for cell_type in cell_types: -# for marker in dp_markers: -# # get the two markers that make up the double positive marker -# marker_1, marker_2 = marker.split('__') -# -# # subset to only include this cell type and these markers -# current_df = func_df_subset.loc[func_df_subset.cell_type == cell_type, :] -# current_df = current_df.loc[current_df.functional_marker.isin([marker, marker_1, marker_2]), :] -# -# # these markers are not present in this cell type -# if len(current_df) == 0: -# continue -# -# # this double positive marker is not present in this cell type -# if marker not in current_df.functional_marker.unique(): -# continue -# -# current_df_wide = current_df.pivot(index='fov', columns='functional_marker', values='value') -# -# # the double positive marker is present, but both single positives are not; exclude it -# if len(current_df_wide.columns) != 3: -# exclude_markers.append(marker) -# continue -# -# corr_1, _ = spearmanr(current_df_wide[marker_1].values, current_df_wide[marker].values) -# corr_2, _ = spearmanr(current_df_wide[marker_2].values, current_df_wide[marker].values) -# -# if (corr_1 > 0.7) | (corr_2 > 0.7): -# exclude_markers.append(cell_type + '__' + marker) -# -# # add to list -# exclude_lists.append(exclude_markers) -# -# # construct df to hold list of exlcuded cells -# exclude_df_cluster = ['cluster_broad_freq'] * len(exclude_lists[0]) + ['cluster_freq'] * len(exclude_lists[1]) + ['meta_cluster_freq'] * len(exclude_lists[2]) -# exclude_df_name = exclude_lists[0] + exclude_lists[1] + exclude_lists[2] -# -# exclude_df = pd.DataFrame({'metric': exclude_df_cluster, 'feature_name': exclude_df_name}) -# exclude_df.to_csv(os.path.join(intermediate_dir, 'post_processing', 'exclude_double_positive_markers.csv'), index=False) - -# use previously generated exclude list -exclude_df = pd.read_csv(os.path.join(intermediate_dir, 'post_processing', 'exclude_double_positive_markers.csv')) - -dedup_dfs = [] - -cluster_resolution = [['cluster_broad_freq', 'cluster_broad_count'], - ['cluster_freq', 'cluster_count'], - ['meta_cluster_freq', 'meta_cluster_count']] - -for cluster in cluster_resolution: - # subset functional df to only include functional markers at this resolution - func_df = combo_df[combo_df.metric.isin(cluster)] - - # add unique identifier for cell + marker combo - func_df['feature_name'] = func_df['cell_type'] + '__' + func_df['functional_marker'] - - exclude_names = exclude_df.loc[exclude_df.metric == cluster[0], 'feature_name'].values - - # remove double positive markers that are highly correlated with single positive scores - func_df = func_df[~func_df.feature_name.isin(exclude_names)] - dedup_dfs.append(func_df) - - -dedup_dfs.append(long_df) -deduped_df = pd.concat(dedup_dfs) 
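Editor's note: the deduplication being retired above keeps a double-positive feature only when it carries signal beyond its parent single positives. For each cell type, the per-FOV frequency of a marker pair `A__B` is correlated (Spearman) against each constituent, and the pair is dropped when either correlation exceeds 0.7, or when a constituent was never scored for that cell type. The sketch below restates that rule in a self-contained form; `find_redundant_dp_features`, its argument names, and the assumption of one row per (fov, functional_marker) pair (i.e., a single metric and compartment already selected) are illustrative, not the repository's API, and unlike the original it records the combined `cell_type__A__B` name in both exclusion branches.

import pandas as pd
from scipy.stats import spearmanr

def find_redundant_dp_features(func_df: pd.DataFrame, cutoff: float = 0.7) -> list:
    """Flag 'cell_type__A__B' features whose values track a single positive."""
    exclude = []
    dp_markers = [m for m in func_df.functional_marker.unique() if '__' in m]
    for cell_type in func_df.cell_type.unique():
        sub = func_df[func_df.cell_type == cell_type]
        for dp in dp_markers:
            m1, m2 = dp.split('__')
            # wide frame: one row per FOV, one column per marker of interest
            wide = sub[sub.functional_marker.isin([dp, m1, m2])].pivot(
                index='fov', columns='functional_marker', values='value')
            if dp not in wide.columns:
                continue  # pair never scored for this cell type
            if wide.shape[1] != 3:
                # DP present but a single positive is missing: nothing to compare
                exclude.append(cell_type + '__' + dp)
                continue
            corr_1, _ = spearmanr(wide[m1].values, wide[dp].values)
            corr_2, _ = spearmanr(wide[m2].values, wide[dp].values)
            if (corr_1 > cutoff) or (corr_2 > cutoff):
                exclude.append(cell_type + '__' + dp)
    return exclude

Everything flagged this way is then removable with a single mask, mirroring the surviving `func_df[~func_df.feature_name.isin(exclude_names)]` lines in this hunk.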
-deduped_df = deduped_df.drop('feature_name', axis=1) - -# save deduped df -deduped_df.to_csv(os.path.join(output_dir, 'functional_df_per_core_filtered_deduped.csv'), index=False) - -# create version aggregated by timepoint -deduped_df_grouped = deduped_df.groupby(['Tissue_ID', 'cell_type', 'functional_marker', 'metric', 'subset']) -deduped_df_timepoint = deduped_df_grouped['value'].agg([np.mean, np.std]) -deduped_df_timepoint.reset_index(inplace=True) -deduped_df_timepoint = deduped_df_timepoint.merge(harmonized_metadata.drop(['fov', 'MIBI_data_generated'], axis=1).drop_duplicates(), on='Tissue_ID') - -# save timepoint df -deduped_df_timepoint.to_csv(os.path.join(output_dir, 'functional_df_per_timepoint_filtered_deduped.csv'), index=False) - - -# morphology metric summary - -# Create list to hold parameters for each df that will be produced -morph_df_params = [['cluster_broad_freq', 'cell_cluster_broad'], - ['cluster_freq', 'cell_cluster'], - ['meta_cluster_freq', 'cell_meta_cluster']] -morph_dfs = [] -for result_name, cluster_col_name in morph_df_params: +# exclude_names = exclude_df.loc[exclude_df.metric == cluster[0], 'feature_name'].values - # remove cluster_names except for the one specified for the df - drop_cols = ['cell_meta_cluster', 'cell_cluster', 'cell_cluster_broad', 'label'] - drop_cols.remove(cluster_col_name) - - # create df - morph_dfs.append(create_long_df_by_functional(func_table=cell_table_morph, - result_name=result_name, - cluster_col_name=cluster_col_name, - drop_cols=drop_cols, - normalize=True, - subset_col='tumor_region')) - -# create combined df -total_df_morph = pd.concat(morph_dfs, axis=0) -total_df_morph = total_df_morph.merge(harmonized_metadata, on='fov', how='inner') -total_df_morph = total_df_morph.rename(columns={'functional_marker': 'morphology_feature'}) - -# save df -total_df_morph.to_csv(os.path.join(output_dir, 'morph_df_per_core.csv'), index=False) - - -# create manual df with total morphology marker average across all cells in an image -morph_table_small = cell_table_morph.loc[:, ~cell_table_morph.columns.isin(['cell_cluster', 'cell_cluster_broad', 'cell_meta_cluster', 'label', 'tumor_region'])] - -# group by specified columns -grouped_table = morph_table_small.groupby('fov') -transformed = grouped_table.agg(np.mean) -transformed.reset_index(inplace=True) - -# reshape to long df -long_df = pd.melt(transformed, id_vars=['fov'], var_name='morphology_feature') -long_df['metric'] = 'total_freq' -long_df['cell_type'] = 'all' -long_df['subset'] = 'all' - -long_df = long_df.merge(harmonized_metadata, on='fov', how='inner') - -long_df.to_csv(os.path.join(output_dir, 'total_morph_per_core.csv'), index=False) - -# filter morphology markers to only include FOVs with at least the specified number of cells -total_df = pd.read_csv(os.path.join(output_dir, 'cluster_df_per_core.csv')) -min_cells = 5 - -filtered_dfs = [] -metrics = [['cluster_broad_count', 'cluster_broad_freq'], - ['cluster_count', 'cluster_freq'], - ['meta_cluster_count', 'meta_cluster_freq']] -for metric in metrics: - # subset count df to include cells at the relevant clustering resolution - for compartment in ['cancer_core', 'cancer_border', 'stroma_core', 'stroma_border', - 'tls', 'tagg', 'all']: - count_df = total_df[total_df.metric == metric[0]] - count_df = count_df[count_df.subset == compartment] +# # remove double positive markers that are highly correlated with single positive scores +# func_df = func_df[~func_df.feature_name.isin(exclude_names)] +# dedup_dfs.append(func_df) - # subset 
morphology df to only include morphology metrics at this resolution - morph_df = total_df_morph[total_df_morph.metric == metric[1]] - morph_df = morph_df[morph_df.subset == compartment] - # for each cell type, determine which FOVs have high enough counts to be included - for cell_type in count_df.cell_type.unique(): - keep_df = count_df[count_df.cell_type == cell_type] - keep_df = keep_df[keep_df.value >= min_cells] - keep_fovs = keep_df.fov.unique() +# dedup_dfs.append(long_df) +# deduped_df = pd.concat(dedup_dfs) +# deduped_df = deduped_df.drop('feature_name', axis=1) - # subset morphology df to only include FOVs with high enough counts - keep_features = morph_df[morph_df.cell_type == cell_type] - keep_features = keep_features[keep_features.fov.isin(keep_fovs)] +# # save deduped df +# deduped_df.to_csv(os.path.join(output_dir, 'functional_df_per_core_filtered_deduped.csv'), index=False) - # append to list of filtered dfs - filtered_dfs.append(keep_features) +# # create version aggregated by timepoint +# deduped_df_grouped = deduped_df.groupby(['Tissue_ID', 'cell_type', 'functional_marker', 'metric', 'subset']) +# deduped_df_timepoint = deduped_df_grouped['value'].agg([np.mean, np.std]) +# deduped_df_timepoint.reset_index(inplace=True) +# deduped_df_timepoint = deduped_df_timepoint.merge(harmonized_metadata.drop(['fov', 'MIBI_data_generated'], axis=1).drop_duplicates(), on='Tissue_ID') -filtered_dfs.append(long_df) -filtered_morph_df = pd.concat(filtered_dfs) +# # save timepoint df +# deduped_df_timepoint.to_csv(os.path.join(output_dir, 'functional_df_per_timepoint_filtered_deduped.csv'), index=False) -# save filtered df -filtered_morph_df.to_csv(os.path.join(output_dir, 'morph_df_per_core_filtered.csv'), index=False) -# create version aggregated by timepoint -filtered_morph_df_grouped = filtered_morph_df.groupby(['Tissue_ID', 'cell_type', 'morphology_feature', 'metric', 'subset']) -filtered_morph_df_timepoint = filtered_morph_df_grouped['value'].agg([np.mean, np.std]) -filtered_morph_df_timepoint.reset_index(inplace=True) -filtered_morph_df_timepoint = filtered_morph_df_timepoint.merge(harmonized_metadata.drop(['fov', 'MIBI_data_generated'], axis=1).drop_duplicates(), on='Tissue_ID') +# # morphology metric summary -# save timepoint df -filtered_morph_df_timepoint.to_csv(os.path.join(output_dir, 'morph_df_per_timepoint_filtered.csv'), index=False) +# # Create list to hold parameters for each df that will be produced +# morph_df_params = [['cluster_broad_freq', 'cell_cluster_broad'], +# ['cluster_freq', 'cell_cluster'], +# ['meta_cluster_freq', 'cell_meta_cluster']] -# remove redundant morphology features -block1 = ['area', 'major_axis_length', 'minor_axis_length', 'perimeter', 'convex_area', 'equivalent_diameter'] +# morph_dfs = [] +# for result_name, cluster_col_name in morph_df_params: -block2 = ['area_nuclear', 'major_axis_length_nuclear', 'minor_axis_length_nuclear', 'perimeter_nuclear', 'convex_area_nuclear', 'equivalent_diameter_nuclear'] +# # remove cluster_names except for the one specified for the df +# drop_cols = ['cell_meta_cluster', 'cell_cluster', 'cell_cluster_broad', 'label'] +# drop_cols.remove(cluster_col_name) -block3 = ['eccentricity', 'major_axis_equiv_diam_ratio'] +# # create df +# morph_dfs.append(create_long_df_by_functional(func_table=cell_table_morph, +# result_name=result_name, +# cluster_col_name=cluster_col_name, +# drop_cols=drop_cols, +# normalize=True, +# subset_col='tumor_region')) -block4 = ['eccentricity_nuclear', 'major_axis_equiv_diam_ratio_nuclear', 
'perim_square_over_area_nuclear'] +# # create combined df +# total_df_morph = pd.concat(morph_dfs, axis=0) +# total_df_morph = total_df_morph.merge(harmonized_metadata, on='fov', how='inner') +# total_df_morph = total_df_morph.rename(columns={'functional_marker': 'morphology_feature'}) -deduped_morph_df = filtered_morph_df.loc[~filtered_morph_df.morphology_feature.isin(block1[1:] + block2[1:] + block3[1:] + block4[1:]), :] +# # save df +# total_df_morph.to_csv(os.path.join(output_dir, 'morph_df_per_core.csv'), index=False) -# only keep complex morphology features for cancer cells, remove everything except area and nc for others -cancer_clusters = ['Cancer', 'Cancer_EMT', 'Cancer_Other', 'Cancer_CD56', 'Cancer_CK17', - 'Cancer_Ecad', 'Cancer_Mono', 'Cancer_SMA', 'Cancer_Vim'] -basic_morph_features = ['area', 'area_nuclear', 'nc_ratio'] -deduped_morph_df = deduped_morph_df.loc[~(~(deduped_morph_df.cell_type.isin(cancer_clusters)) & ~(deduped_morph_df.morphology_feature.isin(basic_morph_features))), :] +# # create manual df with total morphology marker average across all cells in an image +# morph_table_small = cell_table_morph.loc[:, ~cell_table_morph.columns.isin(['cell_cluster', 'cell_cluster_broad', 'cell_meta_cluster', 'label', 'tumor_region'])] -# saved deduped -deduped_morph_df.to_csv(os.path.join(output_dir, 'morph_df_per_core_filtered_deduped.csv'), index=False) +# # group by specified columns +# grouped_table = morph_table_small.groupby('fov') +# transformed = grouped_table.agg(np.mean) +# transformed.reset_index(inplace=True) -# same for timepoints -deduped_morph_df_timepoint = filtered_morph_df_timepoint.loc[~filtered_morph_df_timepoint.morphology_feature.isin(block1[1:] + block2[1:] + block3[1:] + block4[1:]), :] -deduped_morph_df_timepoint = deduped_morph_df_timepoint.loc[~(~(deduped_morph_df_timepoint.cell_type.isin(cancer_clusters)) & ~(deduped_morph_df_timepoint.morphology_feature.isin(basic_morph_features))), :] -deduped_morph_df_timepoint.to_csv(os.path.join(output_dir, 'morph_df_per_timepoint_filtered_deduped.csv'), index=False) +# # reshape to long df +# long_df = pd.melt(transformed, id_vars=['fov'], var_name='morphology_feature') +# long_df['metric'] = 'total_freq' +# long_df['cell_type'] = 'all' +# long_df['subset'] = 'all' -# -# spatial features -# +# long_df = long_df.merge(harmonized_metadata, on='fov', how='inner') -# format mixing scores -mixing_scores = pd.read_csv(os.path.join(intermediate_dir, 'spatial_analysis/mixing_score/cell_cluster_broad/homogeneous_mixing_scores.csv')) -cols = mixing_scores.columns.tolist() -keep_cols = [col for col in cols if 'mixing_score' in col] -mixing_scores = mixing_scores[['fov'] + keep_cols] +# long_df.to_csv(os.path.join(output_dir, 'total_morph_per_core.csv'), index=False) -mixing_scores = pd.melt(mixing_scores, id_vars=['fov'], var_name='mixing_score', value_name='value') -mixing_scores.to_csv(os.path.join(output_dir, 'formatted_mixing_scores.csv'), index=False) +# # filter morphology markers to only include FOVs with at least the specified number of cells +# total_df = pd.read_csv(os.path.join(output_dir, 'cluster_df_per_core.csv')) +# min_cells = 5 -# compute local diversity scores per image +# filtered_dfs = [] +# metrics = [['cluster_broad_count', 'cluster_broad_freq'], +# ['cluster_count', 'cluster_freq'], +# ['meta_cluster_count', 'meta_cluster_freq']] +# for metric in metrics: +# # subset count df to include cells at the relevant clustering resolution +# for compartment in ['cancer_core', 'cancer_border', 'stroma_core', 
'stroma_border', +# 'tls', 'tagg', 'all']: +# count_df = total_df[total_df.metric == metric[0]] +# count_df = count_df[count_df.subset == compartment] -# Create list to hold parameters for each df that will be produced -diversity_df_params = [['cluster_broad_freq', 'cell_cluster_broad'], - ['cluster_freq', 'cell_cluster'], - ['meta_cluster_freq', 'cell_meta_cluster']] +# # subset morphology df to only include morphology metrics at this resolution +# morph_df = total_df_morph[total_df_morph.metric == metric[1]] +# morph_df = morph_df[morph_df.subset == compartment] -diversity_dfs = [] -for result_name, cluster_col_name in diversity_df_params: +# # for each cell type, determine which FOVs have high enough counts to be included +# for cell_type in count_df.cell_type.unique(): +# keep_df = count_df[count_df.cell_type == cell_type] +# keep_df = keep_df[keep_df.value >= min_cells] +# keep_fovs = keep_df.fov.unique() - # remove cluster_names except for the one specified for the df - drop_cols = ['cell_meta_cluster', 'cell_cluster', 'cell_cluster_broad', 'label'] - drop_cols.remove(cluster_col_name) +# # subset morphology df to only include FOVs with high enough counts +# keep_features = morph_df[morph_df.cell_type == cell_type] +# keep_features = keep_features[keep_features.fov.isin(keep_fovs)] - # create df - diversity_dfs.append(create_long_df_by_functional(func_table=cell_table_diversity, - result_name=result_name, - cluster_col_name=cluster_col_name, - drop_cols=drop_cols, - normalize=True, - subset_col='tumor_region')) +# # append to list of filtered dfs +# filtered_dfs.append(keep_features) -# create combined df -total_df_diversity = pd.concat(diversity_dfs, axis=0) -total_df_diversity = total_df_diversity.merge(harmonized_metadata, on='fov', how='inner') -total_df_diversity = total_df_diversity.rename(columns={'functional_marker': 'diversity_feature'}) +# filtered_dfs.append(long_df) +# filtered_morph_df = pd.concat(filtered_dfs) -# save df -total_df_diversity.to_csv(os.path.join(output_dir, 'diversity_df_per_core.csv'), index=False) +# # save filtered df +# filtered_morph_df.to_csv(os.path.join(output_dir, 'morph_df_per_core_filtered.csv'), index=False) -# filter diversity scores to only include FOVs with at least the specified number of cells -total_df = pd.read_csv(os.path.join(output_dir, 'cluster_df_per_core.csv')) -min_cells = 5 +# # create version aggregated by timepoint +# filtered_morph_df_grouped = filtered_morph_df.groupby(['Tissue_ID', 'cell_type', 'morphology_feature', 'metric', 'subset']) +# filtered_morph_df_timepoint = filtered_morph_df_grouped['value'].agg([np.mean, np.std]) +# filtered_morph_df_timepoint.reset_index(inplace=True) +# filtered_morph_df_timepoint = filtered_morph_df_timepoint.merge(harmonized_metadata.drop(['fov', 'MIBI_data_generated'], axis=1).drop_duplicates(), on='Tissue_ID') -filtered_dfs = [] -metrics = [['cluster_broad_count', 'cluster_broad_freq'], - ['cluster_count', 'cluster_freq'], - ['meta_cluster_count', 'meta_cluster_freq']] -for metric in metrics: - # subset count df to include cells at the relevant clustering resolution - for compartment in ['cancer_core', 'cancer_border', 'stroma_core', - 'stroma_border', 'tagg', 'tls', 'all']: - count_df = total_df[total_df.metric == metric[0]] - count_df = count_df[count_df.subset == compartment] +# # save timepoint df +# filtered_morph_df_timepoint.to_csv(os.path.join(output_dir, 'morph_df_per_timepoint_filtered.csv'), index=False) - # subset diversity df to only include diversity metrics at this 
resolution - diversity_df = total_df_diversity[total_df_diversity.metric == metric[1]] - diversity_df = diversity_df[diversity_df.subset == compartment] +# # remove redundant morphology features +# block1 = ['area', 'major_axis_length', 'minor_axis_length', 'perimeter', 'convex_area', 'equivalent_diameter'] - # for each cell type, determine which FOVs have high enough counts to be included - for cell_type in count_df.cell_type.unique(): - keep_df = count_df[count_df.cell_type == cell_type] - keep_df = keep_df[keep_df.value >= min_cells] - keep_fovs = keep_df.fov.unique() +# block2 = ['area_nuclear', 'major_axis_length_nuclear', 'minor_axis_length_nuclear', 'perimeter_nuclear', 'convex_area_nuclear', 'equivalent_diameter_nuclear'] - # subset morphology df to only include FOVs with high enough counts - keep_features = diversity_df[diversity_df.cell_type == cell_type] - keep_features = keep_features[keep_features.fov.isin(keep_fovs)] +# block3 = ['eccentricity', 'major_axis_equiv_diam_ratio'] - # append to list of filtered dfs - filtered_dfs.append(keep_features) +# block4 = ['eccentricity_nuclear', 'major_axis_equiv_diam_ratio_nuclear', 'perim_square_over_area_nuclear'] -filtered_diversity_df = pd.concat(filtered_dfs) +# deduped_morph_df = filtered_morph_df.loc[~filtered_morph_df.morphology_feature.isin(block1[1:] + block2[1:] + block3[1:] + block4[1:]), :] -# save filtered df -filtered_diversity_df.to_csv(os.path.join(output_dir, 'diversity_df_per_core_filtered.csv'), index=False) +# # only keep complex morphology features for cancer cells, remove everything except area and nc for others +# cancer_clusters = ['Cancer', 'Cancer_EMT', 'Cancer_Other', 'Cancer_CD56', 'Cancer_CK17', +# 'Cancer_Ecad', 'Cancer_Mono', 'Cancer_SMA', 'Cancer_Vim'] +# basic_morph_features = ['area', 'area_nuclear', 'nc_ratio'] -# create version aggregated by timepoint -filtered_diversity_df_grouped = filtered_diversity_df.groupby(['Tissue_ID', 'cell_type', 'diversity_feature', 'metric', 'subset']) -filtered_diversity_df_timepoint = filtered_diversity_df_grouped['value'].agg([np.mean, np.std]) -filtered_diversity_df_timepoint.reset_index(inplace=True) -filtered_diversity_df_timepoint = filtered_diversity_df_timepoint.merge(harmonized_metadata.drop(['fov', 'MIBI_data_generated'], axis=1).drop_duplicates(), on='Tissue_ID') +# deduped_morph_df = deduped_morph_df.loc[~(~(deduped_morph_df.cell_type.isin(cancer_clusters)) & ~(deduped_morph_df.morphology_feature.isin(basic_morph_features))), :] -# save timepoint df -filtered_diversity_df_timepoint.to_csv(os.path.join(output_dir, 'diversity_df_per_timepoint_filtered.csv'), index=False) +# # saved deduped +# deduped_morph_df.to_csv(os.path.join(output_dir, 'morph_df_per_core_filtered_deduped.csv'), index=False) +# # same for timepoints +# deduped_morph_df_timepoint = filtered_morph_df_timepoint.loc[~filtered_morph_df_timepoint.morphology_feature.isin(block1[1:] + block2[1:] + block3[1:] + block4[1:]), :] +# deduped_morph_df_timepoint = deduped_morph_df_timepoint.loc[~(~(deduped_morph_df_timepoint.cell_type.isin(cancer_clusters)) & ~(deduped_morph_df_timepoint.morphology_feature.isin(basic_morph_features))), :] +# deduped_morph_df_timepoint.to_csv(os.path.join(output_dir, 'morph_df_per_timepoint_filtered_deduped.csv'), index=False) -# investigate correlation between diversity scores -fov_data = filtered_diversity_df.copy() -fov_data['feature_name_unique'] = fov_data['cell_type'] + '_' + fov_data['diversity_feature'] -fov_data = fov_data.loc[(fov_data.subset == 'all') & 
(fov_data.metric == 'cluster_freq')] -fov_data = fov_data.loc[fov_data.diversity_feature != 'diversity_cell_meta_cluster'] -fov_data_wide = fov_data.pivot(index='fov', columns='feature_name_unique', values='value') +# # +# # spatial features +# # -corr_df = fov_data_wide.corr(method='spearman') +# # format mixing scores +# mixing_scores = pd.read_csv(os.path.join(intermediate_dir, 'spatial_analysis/mixing_score/cell_cluster_broad/homogeneous_mixing_scores.csv')) +# cols = mixing_scores.columns.tolist() +# keep_cols = [col for col in cols if 'mixing_score' in col] +# mixing_scores = mixing_scores[['fov'] + keep_cols] -# replace Nans -corr_df = corr_df.fillna(0) -clustergrid = sns.clustermap(corr_df, cmap='vlag', vmin=-1, vmax=1, figsize=(20, 20)) +# mixing_scores = pd.melt(mixing_scores, id_vars=['fov'], var_name='mixing_score', value_name='value') +# mixing_scores.to_csv(os.path.join(output_dir, 'formatted_mixing_scores.csv'), index=False) -# save deduped df that excludes cell meta cluster -deduped_diversity_df = filtered_diversity_df.loc[filtered_diversity_df.diversity_feature != 'diversity_cell_meta_cluster'] -deduped_diversity_df.to_csv(os.path.join(output_dir, 'diversity_df_per_core_filtered_deduped.csv'), index=False) +# # compute local diversity scores per image -# create version aggregated by timepoint -deduped_diversity_df_grouped = deduped_diversity_df.groupby(['Tissue_ID', 'cell_type', 'diversity_feature', 'metric', 'subset']) -deduped_diversity_df_timepoint = deduped_diversity_df_grouped['value'].agg([np.mean, np.std]) -deduped_diversity_df_timepoint.reset_index(inplace=True) -deduped_diversity_df_timepoint = deduped_diversity_df_timepoint.merge(harmonized_metadata.drop(['fov', 'MIBI_data_generated'], axis=1).drop_duplicates(), on='Tissue_ID') +# # Create list to hold parameters for each df that will be produced +# diversity_df_params = [['cluster_broad_freq', 'cell_cluster_broad'], +# ['cluster_freq', 'cell_cluster'], +# ['meta_cluster_freq', 'cell_meta_cluster']] -# save timepoint df -deduped_diversity_df_timepoint.to_csv(os.path.join(output_dir, 'diversity_df_per_timepoint_filtered_deduped.csv'), index=False) +# diversity_dfs = [] +# for result_name, cluster_col_name in diversity_df_params: +# # remove cluster_names except for the one specified for the df +# drop_cols = ['cell_meta_cluster', 'cell_cluster', 'cell_cluster_broad', 'label'] +# drop_cols.remove(cluster_col_name) -# process linear distance dfs -total_df_distance = create_long_df_by_functional(func_table=cell_table_distances_broad, - result_name='cluster_broad_freq', - cluster_col_name='cell_cluster_broad', - drop_cols=['label'], - normalize=True, - subset_col='tumor_region') +# # create df +# diversity_dfs.append(create_long_df_by_functional(func_table=cell_table_diversity, +# result_name=result_name, +# cluster_col_name=cluster_col_name, +# drop_cols=drop_cols, +# normalize=True, +# subset_col='tumor_region')) -# create combined df -total_df_distance = total_df_distance.merge(harmonized_metadata, on='fov', how='inner') -total_df_distance = total_df_distance.rename(columns={'functional_marker': 'linear_distance'}) -total_df_distance.dropna(inplace=True) +# # create combined df +# total_df_diversity = pd.concat(diversity_dfs, axis=0) +# total_df_diversity = total_df_diversity.merge(harmonized_metadata, on='fov', how='inner') +# total_df_diversity = total_df_diversity.rename(columns={'functional_marker': 'diversity_feature'}) -# save df -total_df_distance.to_csv(os.path.join(output_dir, 'distance_df_per_core.csv'), 
index=False) +# # save df +# total_df_diversity.to_csv(os.path.join(output_dir, 'diversity_df_per_core.csv'), index=False) -# filter distance scores to only include FOVs with at least the specified number of cells -total_df = pd.read_csv(os.path.join(output_dir, 'cluster_df_per_core.csv')) -min_cells = 5 - -filtered_dfs = [] -metrics = [['cluster_broad_count', 'cluster_broad_freq']] - -for metric in metrics: - # subset count df to include cells at the relevant clustering resolution - for compartment in ['cancer_core', 'cancer_border', 'stroma_core', - 'stroma_border', 'tagg', 'tls', 'all']: - count_df = total_df[total_df.metric == metric[0]] - count_df = count_df[count_df.subset == compartment] - - # subset distance df to only include distance metrics at this resolution - distance_df = total_df_distance[total_df_distance.metric == metric[1]] - distance_df = distance_df[distance_df.subset == compartment] - - # for each cell type, determine which FOVs have high enough counts to be included - for cell_type in count_df.cell_type.unique(): - keep_df = count_df[count_df.cell_type == cell_type] - keep_df = keep_df[keep_df.value >= min_cells] - keep_fovs = keep_df.fov.unique() - - # subset morphology df to only include FOVs with high enough counts - keep_features = distance_df[distance_df.cell_type == cell_type] - keep_features = keep_features[keep_features.fov.isin(keep_fovs)] - - # append to list of filtered dfs - filtered_dfs.append(keep_features) - -filtered_distance_df = pd.concat(filtered_dfs) - -# save filtered df -filtered_distance_df.to_csv(os.path.join(output_dir, 'distance_df_per_core_filtered.csv'), index=False) - -# remove distances that are correlated with abundance of cell type - -# load data +# # filter diversity scores to only include FOVs with at least the specified number of cells # total_df = pd.read_csv(os.path.join(output_dir, 'cluster_df_per_core.csv')) -# density_df = total_df.loc[(total_df.metric == 'cluster_broad_density') & (total_df.subset == 'all')] -# filtered_distance_df = filtered_distance_df.loc[(filtered_distance_df.metric == 'cluster_broad_freq') & (filtered_distance_df.subset == 'all')] -# -# # remove images without tumor cells -# density_df = density_df.loc[density_df.Timepoint != 'lymphnode_neg', :] -# filtered_distance_df = filtered_distance_df.loc[filtered_distance_df.fov.isin(density_df.fov.unique())] -# cell_types = filtered_distance_df.cell_type.unique() -# -# # calculate which pairings to keep -# keep_cells, keep_features = [], [] -# -# for cell_type in cell_types: -# density_subset = density_df.loc[density_df.cell_type == cell_type] -# distance_subset = filtered_distance_df.loc[filtered_distance_df.linear_distance == cell_type] -# distance_wide = distance_subset.pivot(index='fov', columns='cell_type', values='value') -# distance_wide.reset_index(inplace=True) -# distance_wide = pd.merge(distance_wide, density_subset[['fov', 'value']], on='fov', how='inner') -# -# # get correlations -# corr_df = distance_wide.corr(method='spearman') -# -# # determine which features to keep -# corr_vals = corr_df.loc['value', :].abs() -# corr_vals = corr_vals[corr_vals < 0.7] -# -# # add to list of features to keep -# keep_cells.extend(corr_vals.index) -# keep_features.extend([cell_type] * len(corr_vals.index)) -# -# keep_df = pd.DataFrame({'cell_type': keep_cells, 'feature_name': keep_features}) -# -# keep_df.to_csv(os.path.join(intermediate_dir, 'post_processing', 'distance_df_keep_features.csv'), index=False) - -# filter distance df to only include features with low 
correlation with abundance -keep_df = pd.read_csv(os.path.join(intermediate_dir, 'post_processing', 'distance_df_keep_features.csv')) - - -deduped_dfs = [] -for cell_type in keep_df.cell_type.unique(): - keep_features = keep_df.loc[keep_df.cell_type == cell_type, 'feature_name'].unique() - if len(keep_features) > 0: - keep_df_subset = filtered_distance_df.loc[filtered_distance_df.cell_type == cell_type] - keep_df_subset = keep_df_subset.loc[keep_df_subset.linear_distance.isin(keep_features)] - deduped_dfs.append(keep_df_subset) - -deduped_distance_df = pd.concat(deduped_dfs) - -# save filtered df -deduped_distance_df.to_csv(os.path.join(output_dir, 'distance_df_per_core_deduped.csv'), index=False) - -# create version aggregated by timepoint -deduped_distance_df_grouped = deduped_distance_df.groupby(['Tissue_ID', 'cell_type', 'linear_distance', 'metric', 'subset']) -deduped_distance_df_timepoint = deduped_distance_df_grouped['value'].agg([np.mean, np.std]) -deduped_distance_df_timepoint.reset_index(inplace=True) -deduped_distance_df_timepoint = deduped_distance_df_timepoint.merge(harmonized_metadata.drop(['fov', 'MIBI_data_generated'], axis=1).drop_duplicates(), on='Tissue_ID') - -# save timepoint df -deduped_distance_df_timepoint.to_csv(os.path.join(output_dir, 'distance_df_per_timepoint_deduped.csv'), index=False) +# min_cells = 5 + +# filtered_dfs = [] +# metrics = [['cluster_broad_count', 'cluster_broad_freq'], +# ['cluster_count', 'cluster_freq'], +# ['meta_cluster_count', 'meta_cluster_freq']] +# for metric in metrics: +# # subset count df to include cells at the relevant clustering resolution +# for compartment in ['cancer_core', 'cancer_border', 'stroma_core', +# 'stroma_border', 'tagg', 'tls', 'all']: +# count_df = total_df[total_df.metric == metric[0]] +# count_df = count_df[count_df.subset == compartment] + +# # subset diversity df to only include diversity metrics at this resolution +# diversity_df = total_df_diversity[total_df_diversity.metric == metric[1]] +# diversity_df = diversity_df[diversity_df.subset == compartment] + +# # for each cell type, determine which FOVs have high enough counts to be included +# for cell_type in count_df.cell_type.unique(): +# keep_df = count_df[count_df.cell_type == cell_type] +# keep_df = keep_df[keep_df.value >= min_cells] +# keep_fovs = keep_df.fov.unique() + +# # subset morphology df to only include FOVs with high enough counts +# keep_features = diversity_df[diversity_df.cell_type == cell_type] +# keep_features = keep_features[keep_features.fov.isin(keep_fovs)] + +# # append to list of filtered dfs +# filtered_dfs.append(keep_features) + +# filtered_diversity_df = pd.concat(filtered_dfs) + +# # save filtered df +# filtered_diversity_df.to_csv(os.path.join(output_dir, 'diversity_df_per_core_filtered.csv'), index=False) + +# # create version aggregated by timepoint +# filtered_diversity_df_grouped = filtered_diversity_df.groupby(['Tissue_ID', 'cell_type', 'diversity_feature', 'metric', 'subset']) +# filtered_diversity_df_timepoint = filtered_diversity_df_grouped['value'].agg([np.mean, np.std]) +# filtered_diversity_df_timepoint.reset_index(inplace=True) +# filtered_diversity_df_timepoint = filtered_diversity_df_timepoint.merge(harmonized_metadata.drop(['fov', 'MIBI_data_generated'], axis=1).drop_duplicates(), on='Tissue_ID') + +# # save timepoint df +# filtered_diversity_df_timepoint.to_csv(os.path.join(output_dir, 'diversity_df_per_timepoint_filtered.csv'), index=False) + + +# # investigate correlation between diversity scores +# fov_data = 
filtered_diversity_df.copy() +# fov_data['feature_name_unique'] = fov_data['cell_type'] + '_' + fov_data['diversity_feature'] +# fov_data = fov_data.loc[(fov_data.subset == 'all') & (fov_data.metric == 'cluster_freq')] +# fov_data = fov_data.loc[fov_data.diversity_feature != 'diversity_cell_meta_cluster'] +# fov_data_wide = fov_data.pivot(index='fov', columns='feature_name_unique', values='value') + +# corr_df = fov_data_wide.corr(method='spearman') + +# # replace Nans +# corr_df = corr_df.fillna(0) +# clustergrid = sns.clustermap(corr_df, cmap='vlag', vmin=-1, vmax=1, figsize=(20, 20)) + +# # save deduped df that excludes cell meta cluster +# deduped_diversity_df = filtered_diversity_df.loc[filtered_diversity_df.diversity_feature != 'diversity_cell_meta_cluster'] +# deduped_diversity_df.to_csv(os.path.join(output_dir, 'diversity_df_per_core_filtered_deduped.csv'), index=False) + +# # create version aggregated by timepoint +# deduped_diversity_df_grouped = deduped_diversity_df.groupby(['Tissue_ID', 'cell_type', 'diversity_feature', 'metric', 'subset']) +# deduped_diversity_df_timepoint = deduped_diversity_df_grouped['value'].agg([np.mean, np.std]) +# deduped_diversity_df_timepoint.reset_index(inplace=True) +# deduped_diversity_df_timepoint = deduped_diversity_df_timepoint.merge(harmonized_metadata.drop(['fov', 'MIBI_data_generated'], axis=1).drop_duplicates(), on='Tissue_ID') + +# # save timepoint df +# deduped_diversity_df_timepoint.to_csv(os.path.join(output_dir, 'diversity_df_per_timepoint_filtered_deduped.csv'), index=False) + + +# # process linear distance dfs +# total_df_distance = create_long_df_by_functional(func_table=cell_table_distances_broad, +# result_name='cluster_broad_freq', +# cluster_col_name='cell_cluster_broad', +# drop_cols=['label'], +# normalize=True, +# subset_col='tumor_region') + +# # create combined df +# total_df_distance = total_df_distance.merge(harmonized_metadata, on='fov', how='inner') +# total_df_distance = total_df_distance.rename(columns={'functional_marker': 'linear_distance'}) +# total_df_distance.dropna(inplace=True) + +# # save df +# total_df_distance.to_csv(os.path.join(output_dir, 'distance_df_per_core.csv'), index=False) + +# # filter distance scores to only include FOVs with at least the specified number of cells +# total_df = pd.read_csv(os.path.join(output_dir, 'cluster_df_per_core.csv')) +# min_cells = 5 + +# filtered_dfs = [] +# metrics = [['cluster_broad_count', 'cluster_broad_freq']] + +# for metric in metrics: +# # subset count df to include cells at the relevant clustering resolution +# for compartment in ['cancer_core', 'cancer_border', 'stroma_core', +# 'stroma_border', 'tagg', 'tls', 'all']: +# count_df = total_df[total_df.metric == metric[0]] +# count_df = count_df[count_df.subset == compartment] + +# # subset distance df to only include distance metrics at this resolution +# distance_df = total_df_distance[total_df_distance.metric == metric[1]] +# distance_df = distance_df[distance_df.subset == compartment] + +# # for each cell type, determine which FOVs have high enough counts to be included +# for cell_type in count_df.cell_type.unique(): +# keep_df = count_df[count_df.cell_type == cell_type] +# keep_df = keep_df[keep_df.value >= min_cells] +# keep_fovs = keep_df.fov.unique() + +# # subset morphology df to only include FOVs with high enough counts +# keep_features = distance_df[distance_df.cell_type == cell_type] +# keep_features = keep_features[keep_features.fov.isin(keep_fovs)] + +# # append to list of filtered dfs +# 
filtered_dfs.append(keep_features) + +# filtered_distance_df = pd.concat(filtered_dfs) + +# # save filtered df +# filtered_distance_df.to_csv(os.path.join(output_dir, 'distance_df_per_core_filtered.csv'), index=False) + +# # remove distances that are correlated with abundance of cell type + +# # load data +# # total_df = pd.read_csv(os.path.join(output_dir, 'cluster_df_per_core.csv')) +# # density_df = total_df.loc[(total_df.metric == 'cluster_broad_density') & (total_df.subset == 'all')] +# # filtered_distance_df = filtered_distance_df.loc[(filtered_distance_df.metric == 'cluster_broad_freq') & (filtered_distance_df.subset == 'all')] +# # +# # # remove images without tumor cells +# # density_df = density_df.loc[density_df.Timepoint != 'lymphnode_neg', :] +# # filtered_distance_df = filtered_distance_df.loc[filtered_distance_df.fov.isin(density_df.fov.unique())] +# # cell_types = filtered_distance_df.cell_type.unique() +# # +# # # calculate which pairings to keep +# # keep_cells, keep_features = [], [] +# # +# # for cell_type in cell_types: +# # density_subset = density_df.loc[density_df.cell_type == cell_type] +# # distance_subset = filtered_distance_df.loc[filtered_distance_df.linear_distance == cell_type] +# # distance_wide = distance_subset.pivot(index='fov', columns='cell_type', values='value') +# # distance_wide.reset_index(inplace=True) +# # distance_wide = pd.merge(distance_wide, density_subset[['fov', 'value']], on='fov', how='inner') +# # +# # # get correlations +# # corr_df = distance_wide.corr(method='spearman') +# # +# # # determine which features to keep +# # corr_vals = corr_df.loc['value', :].abs() +# # corr_vals = corr_vals[corr_vals < 0.7] +# # +# # # add to list of features to keep +# # keep_cells.extend(corr_vals.index) +# # keep_features.extend([cell_type] * len(corr_vals.index)) +# # +# # keep_df = pd.DataFrame({'cell_type': keep_cells, 'feature_name': keep_features}) +# # +# # keep_df.to_csv(os.path.join(intermediate_dir, 'post_processing', 'distance_df_keep_features.csv'), index=False) + +# # filter distance df to only include features with low correlation with abundance +# keep_df = pd.read_csv(os.path.join(intermediate_dir, 'post_processing', 'distance_df_keep_features.csv')) + + +# deduped_dfs = [] +# for cell_type in keep_df.cell_type.unique(): +# keep_features = keep_df.loc[keep_df.cell_type == cell_type, 'feature_name'].unique() +# if len(keep_features) > 0: +# keep_df_subset = filtered_distance_df.loc[filtered_distance_df.cell_type == cell_type] +# keep_df_subset = keep_df_subset.loc[keep_df_subset.linear_distance.isin(keep_features)] +# deduped_dfs.append(keep_df_subset) + +# deduped_distance_df = pd.concat(deduped_dfs) + +# # save filtered df +# deduped_distance_df.to_csv(os.path.join(output_dir, 'distance_df_per_core_deduped.csv'), index=False) + +# # create version aggregated by timepoint +# deduped_distance_df_grouped = deduped_distance_df.groupby(['Tissue_ID', 'cell_type', 'linear_distance', 'metric', 'subset']) +# deduped_distance_df_timepoint = deduped_distance_df_grouped['value'].agg([np.mean, np.std]) +# deduped_distance_df_timepoint.reset_index(inplace=True) +# deduped_distance_df_timepoint = deduped_distance_df_timepoint.merge(harmonized_metadata.drop(['fov', 'MIBI_data_generated'], axis=1).drop_duplicates(), on='Tissue_ID') + +# # save timepoint df +# deduped_distance_df_timepoint.to_csv(os.path.join(output_dir, 'distance_df_per_timepoint_deduped.csv'), index=False) # occupancy stat summary @@ -1043,7 +1047,7 @@ result_name=result_name, 

@@ -1152,142 +1156,142 @@ deduped_occupancy_df_timepoint.to_csv(os.path.join(output_dir, 'occupancy_df_per_timepoint_deduped.csv'), index=False)

-# create manual df with total morphology marker average across all cells in an image
-morph_table_small = cell_table_morph.loc[:, ~cell_table_morph.columns.isin(['cell_cluster', 'cell_cluster_broad', 'cell_meta_cluster', 'label', 'tumor_region'])]
+# # create manual df with total morphology marker average across all cells in an image
+# morph_table_small = cell_table_morph.loc[:, ~cell_table_morph.columns.isin(['cell_cluster', 'cell_cluster_broad', 'cell_meta_cluster', 'label', 'tumor_region'])]

-# group by specified columns
-grouped_table = morph_table_small.groupby('fov')
-transformed = grouped_table.agg(np.mean)
-transformed.reset_index(inplace=True)
+# # group by specified columns
+# grouped_table = morph_table_small.groupby('fov')
+# transformed = grouped_table.agg(np.mean)
+# transformed.reset_index(inplace=True)

-# reshape to long df
-long_df = pd.melt(transformed, id_vars=['fov'], var_name='morphology_feature')
-long_df['metric'] = 'total_freq'
-long_df['cell_type'] = 'all'
-long_df['subset'] = 'all'
+# # reshape to long df
+# long_df = pd.melt(transformed, id_vars=['fov'], var_name='morphology_feature')
+# long_df['metric'] = 'total_freq'
+# long_df['cell_type'] = 'all'
+# long_df['subset'] = 'all'

-long_df = long_df.merge(harmonized_metadata, on='fov', how='inner')
+# long_df = long_df.merge(harmonized_metadata, on='fov', how='inner')

-long_df.to_csv(os.path.join(output_dir, 'total_morph_per_core.csv'), index=False)
+# long_df.to_csv(os.path.join(output_dir, 'total_morph_per_core.csv'), index=False)
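+# # pd.melt reshapes the one-row-per-fov wide table into one row per (fov, feature):
+# # an illustrative row (fov1, area=200.0, nc_ratio=0.4) becomes two long rows,
+# # (fov1, 'area', 200.0) and (fov1, 'nc_ratio', 0.4), with var_name='morphology_feature'
+# # and the measurement landing in the default 'value' column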

-# filter morphology markers to only include FOVs with at least the specified number of cells
-total_df = pd.read_csv(os.path.join(output_dir, 'cluster_df_per_core.csv'))
-min_cells = 5
+# # filter morphology markers to only include FOVs with at least the specified number of cells
+# total_df = pd.read_csv(os.path.join(output_dir, 'cluster_df_per_core.csv'))
+# min_cells = 5

-filtered_dfs = []
-metrics = [['cluster_broad_count', 'cluster_broad_freq'],
- ['cluster_count', 'cluster_freq'],
- ['meta_cluster_count', 'meta_cluster_freq']]
-for metric in metrics:
- # subset count df to include cells at the relevant clustering resolution
- for compartment in ['cancer_core', 'cancer_border', 'stroma_core', 'stroma_border',
- 'tls', 'tagg', 'all']:
- count_df = total_df[total_df.metric == metric[0]]
- count_df = count_df[count_df.subset == compartment]
+# filtered_dfs = []
+# metrics = [['cluster_broad_count', 'cluster_broad_freq'],
+# ['cluster_count', 'cluster_freq'],
+# ['meta_cluster_count', 'meta_cluster_freq']]
+# for metric in metrics:
+# # subset count df to include cells at the relevant clustering resolution
+# for compartment in ['cancer_core', 'cancer_border', 'stroma_core', 'stroma_border',
+# 'tls', 'tagg', 'all']:
+# count_df = total_df[total_df.metric == metric[0]]
+# count_df = count_df[count_df.subset == compartment]

- # subset morphology df to only include morphology metrics at this resolution
- morph_df = total_df_morph[total_df_morph.metric == metric[1]]
- morph_df = morph_df[morph_df.subset == compartment]
+# # subset morphology df to only include morphology metrics at this resolution
+# morph_df = total_df_morph[total_df_morph.metric == metric[1]]
+# morph_df = morph_df[morph_df.subset == compartment]

- # for each cell type, determine which FOVs have high enough counts to be included
- for cell_type in count_df.cell_type.unique():
- keep_df = count_df[count_df.cell_type == cell_type]
- keep_df = keep_df[keep_df.value >= min_cells]
- keep_fovs = keep_df.fov.unique()
+# # for each cell type, determine which FOVs have high enough counts to be included
+# for cell_type in count_df.cell_type.unique():
+# keep_df = count_df[count_df.cell_type == cell_type]
+# keep_df = keep_df[keep_df.value >= min_cells]
+# keep_fovs = keep_df.fov.unique()

- # subset morphology df to only include FOVs with high enough counts
- keep_features = morph_df[morph_df.cell_type == cell_type]
- keep_features = keep_features[keep_features.fov.isin(keep_fovs)]
+# # subset morphology df to only include FOVs with high enough counts
+# keep_features = morph_df[morph_df.cell_type == cell_type]
+# keep_features = keep_features[keep_features.fov.isin(keep_fovs)]

- # append to list of filtered dfs
- filtered_dfs.append(keep_features)
+# # append to list of filtered dfs
+# filtered_dfs.append(keep_features)

-filtered_dfs.append(long_df)
-filtered_morph_df = pd.concat(filtered_dfs)
+# filtered_dfs.append(long_df)
+# filtered_morph_df = pd.concat(filtered_dfs)

-# save filtered df
-filtered_morph_df.to_csv(os.path.join(output_dir, 'morph_df_per_core_filtered.csv'), index=False)
+# # save filtered df
+# filtered_morph_df.to_csv(os.path.join(output_dir, 'morph_df_per_core_filtered.csv'), index=False)

-# create version aggregated by timepoint
-filtered_morph_df_grouped = filtered_morph_df.groupby(['Tissue_ID', 'cell_type', 'morphology_feature', 'metric', 'subset'])
-filtered_morph_df_timepoint = filtered_morph_df_grouped['value'].agg([np.mean, np.std])
-filtered_morph_df_timepoint.reset_index(inplace=True)
-filtered_morph_df_timepoint = filtered_morph_df_timepoint.merge(harmonized_metadata.drop(['fov', 'MIBI_data_generated'], axis=1).drop_duplicates(), on='Tissue_ID')
+# # create version aggregated by timepoint
+# filtered_morph_df_grouped = filtered_morph_df.groupby(['Tissue_ID', 'cell_type', 'morphology_feature', 'metric', 'subset'])
+# filtered_morph_df_timepoint = filtered_morph_df_grouped['value'].agg([np.mean, np.std])
+# filtered_morph_df_timepoint.reset_index(inplace=True)
+# filtered_morph_df_timepoint = filtered_morph_df_timepoint.merge(harmonized_metadata.drop(['fov', 'MIBI_data_generated'], axis=1).drop_duplicates(), on='Tissue_ID')

-# save timepoint df
-filtered_morph_df_timepoint.to_csv(os.path.join(output_dir, 'morph_df_per_timepoint_filtered.csv'), index=False)
+# # save timepoint df
+# filtered_morph_df_timepoint.to_csv(os.path.join(output_dir, 'morph_df_per_timepoint_filtered.csv'), index=False)
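+# # the agg([np.mean, np.std]) step collapses replicate cores per Tissue_ID: e.g. two
+# # cores with values 0.2 and 0.4 (illustrative) become one row with mean=0.3 and
+# # std=0.1, since np.std defaults to the population std (ddof=0)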

-# remove redundant morphology features
-block1 = ['area', 'major_axis_length', 'minor_axis_length', 'perimeter', 'convex_area', 'equivalent_diameter']
+# # remove redundant morphology features
+# block1 = ['area', 'major_axis_length', 'minor_axis_length', 'perimeter', 'convex_area', 'equivalent_diameter']

-block2 = ['area_nuclear', 'major_axis_length_nuclear', 'minor_axis_length_nuclear', 'perimeter_nuclear', 'convex_area_nuclear', 'equivalent_diameter_nuclear']
+# block2 = ['area_nuclear', 'major_axis_length_nuclear', 'minor_axis_length_nuclear', 'perimeter_nuclear', 'convex_area_nuclear', 'equivalent_diameter_nuclear']

-block3 = ['eccentricity', 'major_axis_equiv_diam_ratio']
+# block3 = ['eccentricity', 'major_axis_equiv_diam_ratio']

-block4 = ['eccentricity_nuclear', 'major_axis_equiv_diam_ratio_nuclear', 'perim_square_over_area_nuclear']
+# block4 = ['eccentricity_nuclear', 'major_axis_equiv_diam_ratio_nuclear', 'perim_square_over_area_nuclear']

-deduped_morph_df = filtered_morph_df.loc[~filtered_morph_df.morphology_feature.isin(block1[1:] + block2[1:] + block3[1:] + block4[1:]), :]
+# deduped_morph_df = filtered_morph_df.loc[~filtered_morph_df.morphology_feature.isin(block1[1:] + block2[1:] + block3[1:] + block4[1:]), :]

-# only keep complex morphology features for cancer cells, remove everything except area and nc for others
-cancer_clusters = ['Cancer', 'Cancer_EMT', 'Cancer_Other', 'Cancer_CD56', 'Cancer_CK17',
- 'Cancer_Ecad', 'Cancer_Mono', 'Cancer_SMA', 'Cancer_Vim']
-basic_morph_features = ['area', 'area_nuclear', 'nc_ratio']
+# # only keep complex morphology features for cancer cells, remove everything except area and nc for others
+# cancer_clusters = ['Cancer', 'Cancer_EMT', 'Cancer_Other', 'Cancer_CD56', 'Cancer_CK17',
+# 'Cancer_Ecad', 'Cancer_Mono', 'Cancer_SMA', 'Cancer_Vim']
+# basic_morph_features = ['area', 'area_nuclear', 'nc_ratio']

-deduped_morph_df = deduped_morph_df.loc[~(~(deduped_morph_df.cell_type.isin(cancer_clusters)) & ~(deduped_morph_df.morphology_feature.isin(basic_morph_features))), :]
+# deduped_morph_df = deduped_morph_df.loc[~(~(deduped_morph_df.cell_type.isin(cancer_clusters)) & ~(deduped_morph_df.morphology_feature.isin(basic_morph_features))), :]

-# saved deduped
-deduped_morph_df.to_csv(os.path.join(output_dir, 'morph_df_per_core_filtered_deduped.csv'), index=False)
+# # save deduped df
+# deduped_morph_df.to_csv(os.path.join(output_dir, 'morph_df_per_core_filtered_deduped.csv'), index=False)

-# same for timepoints
-deduped_morph_df_timepoint = filtered_morph_df_timepoint.loc[~filtered_morph_df_timepoint.morphology_feature.isin(block1[1:] + block2[1:] + block3[1:] + block4[1:]), :]
-deduped_morph_df_timepoint = deduped_morph_df_timepoint.loc[~(~(deduped_morph_df_timepoint.cell_type.isin(cancer_clusters)) & ~(deduped_morph_df_timepoint.morphology_feature.isin(basic_morph_features))), :]
-deduped_morph_df_timepoint.to_csv(os.path.join(output_dir, 'morph_df_per_timepoint_filtered_deduped.csv'), index=False)
+# # same for timepoints
+# deduped_morph_df_timepoint = filtered_morph_df_timepoint.loc[~filtered_morph_df_timepoint.morphology_feature.isin(block1[1:] + block2[1:] + block3[1:] + block4[1:]), :]
+# deduped_morph_df_timepoint = deduped_morph_df_timepoint.loc[~(~(deduped_morph_df_timepoint.cell_type.isin(cancer_clusters)) & ~(deduped_morph_df_timepoint.morphology_feature.isin(basic_morph_features))), :]
+# deduped_morph_df_timepoint.to_csv(os.path.join(output_dir, 'morph_df_per_timepoint_filtered_deduped.csv'), index=False)
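+# # block1[1:] + block2[1:] + block3[1:] + block4[1:] lists every feature except the
+# # first of each correlated block, so of the blocked features only 'area',
+# # 'area_nuclear', 'eccentricity' and 'eccentricity_nuclear' survive the ~isin filter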

-# fiber analysis
-fiber_df = fiber_df.loc[:, ~fiber_df.columns.isin(['label', 'centroid-0', 'centroid-1'])]
+# # fiber analysis
+# fiber_df = fiber_df.loc[:, ~fiber_df.columns.isin(['label', 'centroid-0', 'centroid-1'])]

-fiber_df = fiber_df.merge(harmonized_metadata[['Tissue_ID', 'fov']], on=['fov'], how='left')
+# fiber_df = fiber_df.merge(harmonized_metadata[['Tissue_ID', 'fov']], on=['fov'], how='left')

-fiber_df_means = fiber_df.groupby(['Tissue_ID', 'fov']).agg(np.mean)
-fiber_df_means.reset_index(inplace=True)
+# fiber_df_means = fiber_df.groupby(['Tissue_ID', 'fov']).agg(np.mean)
+# fiber_df_means.reset_index(inplace=True)

-fiber_df_long = pd.melt(fiber_df_means, id_vars=['Tissue_ID', 'fov'], var_name='fiber_metric', value_name='value')
-fiber_df_long['fiber_metric'] = 'fiber_' + fiber_df_long['fiber_metric']
+# fiber_df_long = pd.melt(fiber_df_means, id_vars=['Tissue_ID', 'fov'], var_name='fiber_metric', value_name='value')
+# fiber_df_long['fiber_metric'] = 'fiber_' + fiber_df_long['fiber_metric']

-fiber_df_long.to_csv(os.path.join(output_dir, 'fiber_df_per_core.csv'), index=False)
+# fiber_df_long.to_csv(os.path.join(output_dir, 'fiber_df_per_core.csv'), index=False)

-# create version aggregated by timepoint
-fiber_df_grouped = fiber_df_long.groupby(['Tissue_ID', 'fiber_metric'])
-fiber_df_timepoint = fiber_df_grouped['value'].agg([np.mean, np.std])
-fiber_df_timepoint.reset_index(inplace=True)
-fiber_df_timepoint = fiber_df_timepoint.merge(harmonized_metadata.drop(['fov', 'MIBI_data_generated'], axis=1).drop_duplicates(), on='Tissue_ID')
+# # create version aggregated by timepoint
+# fiber_df_grouped = fiber_df_long.groupby(['Tissue_ID', 'fiber_metric'])
+# fiber_df_timepoint = fiber_df_grouped['value'].agg([np.mean, np.std])
+# fiber_df_timepoint.reset_index(inplace=True)
+# fiber_df_timepoint = fiber_df_timepoint.merge(harmonized_metadata.drop(['fov', 'MIBI_data_generated'], axis=1).drop_duplicates(), on='Tissue_ID')

-# save timepoint df
-fiber_df_timepoint.to_csv(os.path.join(output_dir, 'fiber_df_per_timepoint.csv'), index=False)
+# # save timepoint df
+# fiber_df_timepoint.to_csv(os.path.join(output_dir, 'fiber_df_per_timepoint.csv'), index=False)

-# for tiles, get max per image
-fiber_tile_df = fiber_tile_df.dropna()
-fiber_tile_df = fiber_tile_df.loc[:, ~fiber_tile_df.columns.isin(['pixel_density', 'tile_y', 'tile_x'])]
-fiber_tile_df.columns = fiber_tile_df.columns.str.replace('avg_', '')
-fiber_tile_df.columns = fiber_tile_df.columns.str.replace('fiber_', '')
-fiber_tile_df = fiber_tile_df.merge(harmonized_metadata[['Tissue_ID', 'fov']], on=['fov'], how='left')
+# # for tiles, get max per image
+# fiber_tile_df = fiber_tile_df.dropna()
+# fiber_tile_df = fiber_tile_df.loc[:, ~fiber_tile_df.columns.isin(['pixel_density', 'tile_y', 'tile_x'])]
+# fiber_tile_df.columns = fiber_tile_df.columns.str.replace('avg_', '')
+# fiber_tile_df.columns = fiber_tile_df.columns.str.replace('fiber_', '')
+# fiber_tile_df = fiber_tile_df.merge(harmonized_metadata[['Tissue_ID', 'fov']], on=['fov'], how='left')

-# group by fov
-fiber_tile_df_means = fiber_tile_df.groupby(['Tissue_ID', 'fov']).agg(np.max)
-fiber_tile_df_means.reset_index(inplace=True)
+# # group by Tissue_ID and fov, taking the max across tiles
+# fiber_tile_df_means = fiber_tile_df.groupby(['Tissue_ID', 'fov']).agg(np.max)
+# fiber_tile_df_means.reset_index(inplace=True)

-fiber_tile_df_long = pd.melt(fiber_tile_df_means, id_vars=['Tissue_ID', 'fov'], var_name='fiber_metric', value_name='value')
-fiber_tile_df_long['fiber_metric'] = 'max_fiber_' + fiber_tile_df_long['fiber_metric']
+# fiber_tile_df_long = pd.melt(fiber_tile_df_means, id_vars=['Tissue_ID', 'fov'], var_name='fiber_metric', value_name='value')
+# fiber_tile_df_long['fiber_metric'] = 'max_fiber_' + fiber_tile_df_long['fiber_metric']

-fiber_tile_df_long.to_csv(os.path.join(output_dir, 'fiber_df_per_tile.csv'), index=False)
+# fiber_tile_df_long.to_csv(os.path.join(output_dir, 'fiber_df_per_tile.csv'), index=False)

-# create version aggregated by timepoint
-fiber_tile_df_grouped = fiber_tile_df_long.groupby(['Tissue_ID', 'fiber_metric'])
-fiber_tile_df_timepoint = fiber_tile_df_grouped['value'].agg([np.mean, np.std])
-fiber_tile_df_timepoint.reset_index(inplace=True)
-fiber_tile_df_timepoint = 
fiber_tile_df_timepoint.merge(harmonized_metadata.drop(['fov', 'MIBI_data_generated'], axis=1).drop_duplicates(), on='Tissue_ID') +# # create version aggregated by timepoint +# fiber_tile_df_grouped = fiber_tile_df_long.groupby(['Tissue_ID', 'fiber_metric']) +# fiber_tile_df_timepoint = fiber_tile_df_grouped['value'].agg([np.mean, np.std]) +# fiber_tile_df_timepoint.reset_index(inplace=True) +# fiber_tile_df_timepoint = fiber_tile_df_timepoint.merge(harmonized_metadata.drop(['fov', 'MIBI_data_generated'], axis=1).drop_duplicates(), on='Tissue_ID') -# save timepoint df -fiber_tile_df_timepoint.to_csv(os.path.join(output_dir, 'fiber_df_per_tile_timepoint.csv'), index=False) +# # save timepoint df +# fiber_tile_df_timepoint.to_csv(os.path.join(output_dir, 'fiber_df_per_tile_timepoint.csv'), index=False) diff --git a/python_files/utils.py b/python_files/utils.py index 24a319c..2ea1f8a 100644 --- a/python_files/utils.py +++ b/python_files/utils.py @@ -250,16 +250,6 @@ def occupancy_df_helper(cell_table, cluster_col_name, drop_cols, result_name, su # get all the FOV names fov_names = cell_table_small["fov"].unique() - # if a population subset is specified, first validate then truncate the cell table accordingly - if pop_subset: - verify_in_list( - specified_populations=pop_subset, - valid_populations=cell_table_small[cluster_col_name].unique() - ) - # otherwise, use all possible subsets as defined by pop_col - else: - pop_subset = cell_table_small[cluster_col_name].unique() - # define the tile size in pixels for each FOV tile_size = max_image_size // tiles_per_row_col @@ -273,7 +263,7 @@ def occupancy_df_helper(cell_table, cluster_col_name, drop_cols, result_name, su cell_table_small["fov"].unique(), cell_table_small["tile_row"].unique(), cell_table_small["tile_col"].unique(), - cell_table_small[pop_col].unique() + cell_table_small[cluster_col_name].unique() ], names=["fov", "tile_row", "tile_col", cluster_col_name] ).to_frame(index=False) @@ -296,8 +286,8 @@ def occupancy_df_helper(cell_table, cluster_col_name, drop_cols, result_name, su # print(occupancy_counts[["tile_row", "tile_col"]].drop_duplicates()) # zscore the tile counts across the cohort grouped by the cell types - occupancy_counts["zscore_occupancy"] = occupancy_stats.groupby( - [pop_col] + occupancy_counts["zscore_occupancy"] = occupancy_counts.groupby( + [cluster_col_name] )["occupancy"].transform(lambda x: zscore(x, ddof=0)) # mark tiles as positive depending on the positive_threshold value @@ -309,7 +299,7 @@ def occupancy_df_helper(cell_table, cluster_col_name, drop_cols, result_name, su occupancy_zscore_groupby.append(subset_col) occupancy_counts_grouped: pd.DataFrame = occupancy_counts.groupby(occupancy_zscore_groupby).apply( lambda row: row["is_positive"].sum() - ).reset_ondex("total_positive_tiles") + ).reset_index(name="total_positive_tiles") # get the max image size, this will determine how many tiles there are when finding percentages occupancy_counts_grouped["image_size"] = occupancy_counts_grouped.apply(