diff --git a/python_files/postprocessing/5_create_dfs_per_core.py b/python_files/postprocessing/5_create_dfs_per_core.py index e8b14c1..5669383 100644 --- a/python_files/postprocessing/5_create_dfs_per_core.py +++ b/python_files/postprocessing/5_create_dfs_per_core.py @@ -1042,6 +1042,132 @@ deduped_distance_df_timepoint.to_csv(os.path.join(output_dir, 'distance_df_per_timepoint_deduped.csv'), index=False) +# occupancy stat summary + +# Create list to hold parameters for each df that will be produced +occupancy_df_params = [['cluster_broad_freq', 'cell_cluster_broad']] + +occupancy_dfs = [] +for result_name, cluster_col_name in occupancy_df_params: + + # remove cluster_names except for the one specified for the df + drop_cols = ['cell_meta_cluster', 'cell_cluster', 'cell_cluster_broad', 'label'] + drop_cols.remove(cluster_col_name) + + # create df + occupancy_dfs.append(create_long_df_by_occupancy(cell_table=cell_table_occupancy, + result_name=result_name, + cluster_col_name=cluster_col_name, + drop_cols=drop_cols, + zscore_threshold=0.0)) + +# create combined df +total_df_occupancy = pd.concat(occupancy_dfs, axis=0) +total_df_occupancy = total_df_occupancy.merge(harmonized_metadata, on='fov', how='inner') +total_df_occupancy = total_df_occupancy.rename(columns={'functional_marker': 'occupancy_feature'}) + +# save df +total_df_occupancy.to_csv(os.path.join(output_dir, 'occupancy_df_per_core.csv'), index=False) + +# filter diversity scores to only include FOVs with at least the specified number of cells +total_df = pd.read_csv(os.path.join(output_dir, 'cluster_df_per_core.csv')) +min_cells = 5 + +filtered_dfs = [] +metrics = [['cluster_broad_count', 'cluster_broad_freq'], + ['cluster_count', 'cluster_freq'], + ['meta_cluster_count', 'meta_cluster_freq']] +for metric in metrics: + # subset count df to include cells at the relevant clustering resolution + for compartment in ['all']: + count_df = total_df[total_df.metric == metric[0]] + count_df = count_df[count_df.subset == compartment] + + # subset diversity df to only include diversity metrics at this resolution + occupancy_df = total_df_occupancy[total_df_occupancy.metric == metric[1]] + occupancy_df = occupancy_df[occupancy_df.subset == compartment] + + # for each cell type, determine which FOVs have high enough counts to be included + for cell_type in count_df.cell_type.unique(): + keep_df = count_df[count_df.cell_type == cell_type] + keep_df = keep_df[keep_df.value >= min_cells] + keep_fovs = keep_df.fov.unique() + + # subset morphology df to only include FOVs with high enough counts + keep_features = occupancy_df[occupancy_df.cell_type == cell_type] + keep_features = keep_features[keep_features.fov.isin(keep_fovs)] + + # append to list of filtered dfs + filtered_dfs.append(keep_features) + +filtered_occupancy_df = pd.concat(filtered_dfs) + +# save filtered df +filtered_occupancy_df.to_csv(os.path.join(output_dir, 'occupancy_df_per_core_filtered.csv'), index=False) + +# remove distances that are correlated with abundance of cell type + +if not os.path.exists(os.path.join(intermediate_dir, 'post_processing', 'occupancy_df_keep_features.csv')): + density_df = total_df.loc[(total_df.metric == 'cluster_broad_density') & (total_df.subset == 'all')] + + # NOTE: this is purely for consistency, the table when created only includes the cluster_broad_freq metric and all compartments + filtered_occupancy_df = filtered_occupancy_df.loc[(filtered_occupancy_df.metric == 'cluster_broad_freq') & (filtered_occupancy_df.subset == 'all')] + + # remove images without tumor cells + density_df = density_df.loc[density_df.Timepoint != 'lymphnode_neg', :] + filtered_occupancy_df = filtered_occupancy_df.loc[filtered_occupancy_df.fov.isin(density_df.fov.unique())] + cell_types = filtered_occupancy_df.cell_type.unique() + + # calculate which pairings to keep + keep_cells, keep_features = [], [] + + for cell_type in cell_types: + density_subset = density_df.loc[density_df.cell_type == cell_type] + occupancy_subset = filtered_occupancy_df.loc[filtered_occupancy_df.linear_distance == cell_type] + occupancy_wide = occupancy_subset.pivot(index='fov', columns='cell_type', values='value') + occupancy_wide.reset_index(inplace=True) + occupancy_wide = pd.merge(occupancy_wide, density_subset[['fov', 'value']], on='fov', how='inner') + + # get correlations + corr_df = occupancy_wide.corr(method='spearman') + + # determine which features to keep + corr_vals = corr_df.loc['value', :].abs() + corr_vals = corr_vals[corr_vals < 0.7] + + # add to list of features to keep + keep_cells.extend(corr_vals.index) + keep_features.extend([cell_type] * len(corr_vals.index)) + + keep_df = pd.DataFrame({'cell_type': keep_cells, 'feature_name': keep_features}) + + keep_df.to_csv(os.path.join(intermediate_dir, 'post_processing', 'occupancy_df_keep_features.csv'), index=False) +else: + keep_df = pd.read_csv(os.path.join(intermediate_dir, 'post_processing', 'occupancy_df_keep_features.csv')) + +deduped_dfs = [] +for cell_type in keep_df.cell_type.unique(): + keep_features = keep_df.loc[keep_df.cell_type == cell_type, 'feature_name'].unique() + if len(keep_features) > 0: + keep_df_subset = filtered_occupancy_df.loc[filtered_occupancy_df.cell_type == cell_type] + keep_df_subset = keep_df_subset.loc[keep_df_subset.linear_distance.isin(keep_features)] + deduped_dfs.append(keep_df_subset) + +deduped_occupancy_df = pd.concat(deduped_dfs) + +# save filtered df +deduped_occupancy_df.to_csv(os.path.join(output_dir, 'occupancy_df_per_core_deduped.csv'), index=False) + +# create version aggregated by timepoint +deduped_occupancy_df_grouped = deduped_occupancy_df.groupby(['Tissue_ID', 'cell_type', 'linear_distance', 'metric', 'subset']) +deduped_occupancy_df_timepoint = deduped_occupancy_df_grouped['value'].agg([np.mean, np.std]) +deduped_occupancy_df_timepoint.reset_index(inplace=True) +deduped_occupancy_df_timepoint = deduped_occupancy_df_timepoint.merge(harmonized_metadata.drop(['fov', 'MIBI_data_generated'], axis=1).drop_duplicates(), on='Tissue_ID') + +# save timepoint df +deduped_occupancy_df_timepoint.to_csv(os.path.join(output_dir, 'occupancy_df_per_timepoint_deduped.csv'), index=False) + + # fiber analysis fiber_stats.columns = fiber_stats.columns.str.replace('avg_', '') fiber_stats.columns = fiber_stats.columns.str.replace('fiber_', '') diff --git a/python_files/supplementary_plot_helpers.py b/python_files/supplementary_plot_helpers.py index db6ca2a..bb7de7d 100644 --- a/python_files/supplementary_plot_helpers.py +++ b/python_files/supplementary_plot_helpers.py @@ -1,5 +1,7 @@ +import itertools import json import math +import matplotlib.colors as mcolors import matplotlib.pyplot as plt from matplotlib.ticker import ScalarFormatter, MaxNLocator import natsort as ns @@ -722,6 +724,386 @@ def stitch_before_after_norm( post_norm_tiled.save(post_norm_stitched_path) +def occupancy_division_factor(occupancy_group: pd.api.typing.DataFrameGroupBy, + tiles_per_row_col: int, + max_image_size: int, + cell_table: pd.DataFrame): + """Compute the denominator to ensure the proper percentage is computed for occupancy stats + + Args: + occupancy_group (pd.api.typing.DataFrameGroupBy): + The group by object for the FOV and cell type + tiles_per_row_col (int): + The row/col dims of tiles to define over each image + max_image_size (int): + Maximum size of an image passed in, assuming square images + NOTE: all images should have an image size that is a factor of max_image_size + cell_table (pd.DataFrame): + The cell table associated with the cohort + + Returns: + float: + The factor to divide the occupancy stat positive counts by + """ + fov_name = occupancy_group["fov"].values[0] + image_size = cell_table.loc[cell_table["fov"] == fov_name, "fov_pixel_size"].values[0] + return tiles_per_row_col ** 2 / (max_image_size / image_size) + + +# occupancy statistic helpers +def compute_occupancy_statistics( + cell_table: pd.DataFrame, pop_col: str = "cell_cluster_broad", + pop_subset: Optional[List[str]] = None, tiles_per_row_col: int = 4, + max_image_size: int = 2048, positive_threshold: int = 0.0, sample_dir: Union[str, pathlib.Path] +): + """Compute the occupancy statistics over a cohort, based on z-scored cell counts per tile + across the cohort. + + Args: + cell_table (pd.DataFrame): + The cell table associated with the cohort, need to contain all FOVs as well as + a column indicating the size of the FOV. + pop_col (str): + Column containing the names of the cell populations + pop_subset (Optional[List[str]]): + Which populations, if any, to subset on. If None, use all populations + tiles_per_row_col (int): + The row/col dims of tiles to define over each image + max_image_size (int): + Maximum size of an image passed in, assuming square images + NOTE: all images should have an image size that is a factor of `max_image_size` + positive_threshold (int): + The z-scored cell count in a tile required for positivity + + Returns: + Dict[str, float]: + A dictionary mapping each FOV to the percentage of tiles above the positive threshold + for number of cells. + """ + # get all the FOV names + fov_names: np.ndarray = cell_table["fov"].unique() + + # if a population subset is specified, first validate then truncate the cell table accordingly + if pop_subset: + verify_in_list( + specified_populations=pop_subset, + valid_populations=cell_table[pop_col].unique() + ) + # otherwise, use all possible subsets as defined by pop_col + else: + pop_subset = cell_table[pop_col].unique() + + # define the tile size in pixels for each FOV + cell_table["tile_size"] = max_image_size // tiles_per_row_col + + # define the tile for each cell + cell_table["tile_row"] = (cell_table["centroid-1"] // cell_table["tile_size"]).astype(int) + cell_table["tile_col"] = (cell_table["centroid-0"] // cell_table["tile_size"]).astype(int) + + # Create a DataFrame with all combinations of fov, tile_row, tile_col, and pop_col + all_combos = pd.MultiIndex.from_product( + [ + cell_table["fov"].unique(), + cell_table["tile_row"].unique(), + cell_table["tile_col"].unique(), + cell_table[pop_col].unique() + ], + names=["fov", "tile_row", "tile_col", pop_col] + ).to_frame(index=False) + + # compute the total occupancy of each tile for each population type + # NOTE: need to merge this way to account for tiles with 0 counts + occupancy_counts: pd.DataFrame = cell_table.groupby( + ["fov", "tile_row", "tile_col", pop_col] + ).size().reset_index(name="occupancy") + occupancy_counts = pd.merge( + all_combos, occupancy_counts, + on=["fov", "tile_row", "tile_col", pop_col], + how="left" + ) + occupancy_counts["occupancy"].fillna(0, inplace=True) + # print(occupancy_counts[["tile_row", "tile_col"]].drop_duplicates()) + + # zscore the tile counts across the cohort grouped by the cell types + occupancy_counts["zscore_occupancy"] = occupancy_stats.groupby( + [pop_col] + )["occupancy"].transform(lambda x: zscore(x, ddof=0)) + + # mark tiles as positive depending on the positive_threshold value + occupancy_counts["is_positive"] = occupancy_counts["zscore_occupancy"] > positive_threshold + + # total up the positive tiles per FOV and cell type + occupancy_counts_grouped: pd.DataFrame = occupancy_counts.groupby(["fov", pop_col]).apply( + lambda row: row["is_positive"].sum() + ).reset_ondex("total_positive_tiles") + + # occupancy_counts_grouped: pd.DataFrame = occupancy_counts.groupby(["fov", pop_col]).apply( + # lambda row: row["is_positive"].sum() / occupancy_division_factor( + # row, tiles_per_row_col, max_image_size, cell_table + # ) * 100 + # ).reset_index(name="percent_positive") + occupancy_counts_grouped: pd.DataFrame = occupancy_counts.groupby(["fov", pop_col]).apply( + lambda row: row["is_positive"].sum() + ).reset_index(name="total_positive_tiles") + + # get the max image size, this will determine how many tiles there are when finding percentages + occupancy_counts_grouped["image_size"] = occupancy_counts_grouped.apply( + lambda row: io.imread(os.path.join(samples_dir, row["fov"], "Calprotectin.tiff")).shape[0], + axis=1 + ) + occupancy_stats_grouped["max_tiles"] = occupancy_stats_grouped.apply( + lambda row: tiles_per_row_col ** 2 ** (1 / (max_image_size / row["image_size"])), + axis=1 + ) + + # determine the percent positivity of tiles + occupancy_counts_grouped["percent_positive_tiles"] = \ + occupancy_stats_grouped["total_positive_tiles"] / occupancy_stats_grouped["max_tiles"] + + # occupancy_stats_grouped: pd.DataFrame = occupancy_stats.groupby( + # ["fov", "cell_cluster_broad"] + # ).apply(lambda row: row["is_positive"].sum()).reset_index(name="total_positive_tiles") + + # # everything after here is grouped + # occupancy_stats_grouped["num_cells"] = occupancy_stats.groupby( + # ["fov", "cell_cluster_broad", "positivity_threshold"] + # ).apply(lambda row: row["occupancy"].sum()).values + + # occuapcny_stats_grouped["max_tiles"] = occupancy_count + + return occupancy_counts_grouped + + # return occupancy_counts, occupancy_counts_grouped + + # # define the occupancy_stats array + # occupancy_stats = xr.DataArray( + # np.zeros((len(fov_names), tiles_per_row_col, tiles_per_row_col), dtype=np.int16), + # coords=[fov_names, np.arange(tiles_per_row_col), np.arange(tiles_per_row_col)], + # dims=["fov", "x", "y"] + # ) + + # # iterate over each population + # for pop in pop_subset: + # cell_table_pop = cell_table[cell_table[pop_col] == pop].copy() + + # # Group by FOV and tile positions, then count occupancy + # occupancy_counts: pd.DataFrame = cell_table_pop.groupby( + # ["fov", "tile_row", "tile_col"] + # ).size().reset_index(name="occupancy") + + # # define the occupancy_stats array + # occupancy_stats = xr.DataArray( + # np.zeros((len(fov_names), tiles_per_row_col, tiles_per_row_col), dtype=np.int16), + # coords=[fov_names, np.arange(tiles_per_row_col), np.arange(tiles_per_row_col)], + # dims=["fov", "x", "y"] + # ) + + # # Update the DataArray based on occupancy_counts without explicit Python loops + # for index, row in occupancy_counts.iterrows(): + # occupancy_stats.loc[row["fov"], row["tile_row"], row["tile_col"]] = row["occupancy"] + + # # define the tiles that are positive based on threshold + # occupancy_stats_positivity: xr.DataArray = occupancy_stats > positive_threshold + + # # compute the percentage of positive tiles for each FOV + # occupancy_stats_sum: xr.DataArray = occupancy_stats_positivity.sum( + # dim=['x', 'y'] + # ) / (tiles_per_row_col ** 2) + + # # convert to dict + # occupancy_stats_dict: Dict[str, float] = occupancy_stats_sum.to_series().to_dict() + # occupancy_stats_dict = {fov: percentage for fov, percentage in occupancy_stats_dict.items()} + + # return occupancy_stats_dict + + +def visualize_occupancy_statistics( + occupancy_stats_table: pd.DataFrame, save_dir: Union[str, pathlib.Path], + pop_col: str = "cell_cluster_broad", pop_subset: Optional[List[str]] = None, + figsize: Optional[Tuple[float, float]] = (10, 10) +): + """Visualize the distribution of the percentage of tiles above a positive threshold. + + Data needs to be generated by compute_occupancy_statistics. + + Args: + occupancy_stats_table (pd.DataFrame): + The table generated by compute_occupancy_statistics, lists the percentage of positive + tiles for each image of the cohort at different positivity thresholds and grid sizes + save_dir (Union[str, pathlib.Path]): + Directory to save the visualizations in + pop_col (str): + Column containing the names of the cell populations + pop_subset (Optional[List[str]]): + Which populations, if any, to subset on. If None, use all populations + fig_size (Optional[Tuple[float, float]]): + The figure size to use for the image. + """ + if pop_subset is not None: + verify_in_list( + specified_populations=pop_subset, + valid_populations=occupancy_stats_table[pop_col].unique() + ) + else: + pop_subset = occupancy_stats_table[pop_col].unique() + + for pop in pop_subset: + occupancy_stats_table_sub = occupancy_stats_table[occupancy_stats_table[pop_col] == pop] + + # generate each unique test pair + tiles_threshold_trials = occupancy_stats_table_sub[ + ["num_tiles", "positive_threshold"] + ].drop_duplicates().reset_index(drop=True) + + # define a separate plot on the same grid for each test pair + fig, axs = plt.subplots( + tiles_threshold_trials.shape[0], 1, figsize=figsize + ) + populations_str = f"{pop} cell populations" + # fig.suptitle(f"Percentage positive tile distributions for {populations_str}", fontsize=24) + # fig.subplots_adjust(top=1.50) + + # define an index to iterate through the subplots + subplot_i = 0 + + for _, trial in tiles_threshold_trials.iterrows(): + grid_size = int(np.sqrt(trial["num_tiles"])) + positive_threshold = trial["positive_threshold"] + + # subset data obtained just for the specified trial + trial_data = occupancy_stats_table_sub[ + (occupancy_stats_table_sub["num_tiles"] == trial["num_tiles"]) & + (occupancy_stats_table_sub["positive_threshold"] == trial["positive_threshold"]) + ] + + # visualize the distribution using a histogram + positive_tile_percentages = trial_data["percent_positive"].values + axs[subplot_i].hist( + positive_tile_percentages, + facecolor="g", + bins=20, + alpha=0.75 + ) + axs[subplot_i].set_title( + f"Grid size: {grid_size}x{grid_size}, Positive threshold: {positive_threshold}" + ) + + # update the subplot index + subplot_i += 1 + + plt.tight_layout() + + # save the figure to save_dir + fig.savefig( + pathlib.Path(save_dir) / f"occupancy_statistic_distributions_{pop}.png", + dpi=300 + ) + + +def visualize_occupancy_statistics_old( + cell_table: pd.DataFrame, save_dir: Union[str, pathlib.Path], max_image_size: int = 2048, + tiles_per_row_col: int = 8, positive_threshold: int = 20 +): + """Define occupancy statistics over a cohort, and visualize distribution of positive + tile counts across FOV. + + Args: + cell_table (pd.DataFrame): + The cell table associated with the cohort, need to contain all FOVs + save_dir (Union[str, pathlib.Path]): + Directory to save the visualizations in + max_image_size (int): + The max image size to load in + tiles_per_row_col (int): + The row/col dims of tiles to define over each image + positive_threshold (int): + The cell count in a tile required for positivity + """ + # get all the FOV names + fov_names: np.ndarray = cell_table["fov"].unique() + + # define the tile size in pixels + tile_size: int = max_image_size // tiles_per_row_col + + cell_table["tile_row"] = (cell_table["centroid-1"] // tile_size).astype(int) + cell_table["tile_col"] = (cell_table["centroid-0"] // tile_size).astype(int) + + # Group by FOV and tile positions, then count occupancy + occupancy_counts: pd.DataFrame = cell_table.groupby( + ["fov", "tile_row", "tile_col"] + ).size().reset_index(name="occupancy") + + # define the occupancy_stats array + occupancy_stats = xr.DataArray( + np.zeros((len(fov_names), tiles_per_row_col, tiles_per_row_col), dtype=np.int16), + coords=[fov_names, np.arange(tiles_per_row_col), np.arange(tiles_per_row_col)], + dims=["fov", "x", "y"] + ) + + # Update the DataArray based on occupancy_counts without explicit Python loops + for index, row in occupancy_counts.iterrows(): + occupancy_stats.loc[row["fov"], row["tile_row"], row["tile_col"]] = row["occupancy"] + + # define the tiles that are positive based on threshold + occupancy_stats_positivity: xr.DataArray = occupancy_stats > positive_threshold + + # count number of positive tiles per FOV + fov_positive_tiles: xr.DataArray = occupancy_stats_positivity.sum(dim=["x", "y"]) + fov_positive_tile_counts: Dict[str, int] = dict( + zip(fov_positive_tiles.fov.values, fov_positive_tiles.values) + ) + + # 1. visualize histogram of positive tile counts across cohort + fig, axs = plt.subplots( + 3, + 1, + figsize=(10, 10) + ) + print(list(fov_positive_tile_counts.values())) + axs[0].hist(list(fov_positive_tile_counts.values())) + axs[0].set_title("Distribution of tile positivity") + + # 2. visualize heatmap of average positivity of tiles across FOVs + cmap_positivity = mcolors.LinearSegmentedColormap.from_list("", ["white", "red"]) + c_positivity = axs[1].imshow( + occupancy_stats_positivity.mean(dim="fov").values, + cmap=cmap_positivity, + aspect="auto", + vmin=0, + vmax=1 + ) + axs[1].set_xticks([]) + axs[1].set_yticks([]) + axs[1].set_title("Mean positivity per tile") + fig.colorbar(c_positivity, ax=axs[1], ticks=[0, 0.25, 0.5, 0.75, 1]) + + # 3. visualize heatmap of average number of counts across FOVs + cmap_counts = mcolors.LinearSegmentedColormap.from_list("", ["white", "red"]) + max_occupancy_stat = np.round(np.max(occupancy_stats.mean(dim="fov").values), 2) + c_counts = axs[2].imshow( + occupancy_stats.mean(dim="fov").values, + cmap=cmap_counts, + aspect="auto", + vmin=0, + vmax=max_occupancy_stat + ) + axs[2].set_xticks([]) + axs[2].set_yticks([]) + axs[2].set_title("Mean num cells per tile") + fig.colorbar(c_counts, ax=axs[2], ticks=np.round(np.linspace(0, max_occupancy_stat, 4), 2)) + + fig.suptitle( + f"Stats for # tiles: {tiles_per_row_col}, positive threshold: {positive_threshold}", + fontsize=24 + ) + + # save the figure to save_dir + fig.savefig( + pathlib.Path(save_dir) / f"occupancy_stats_{tiles_per_row_col}_{positive_threshold}.png", + dpi=300 + ) + + def run_functional_marker_positivity_tuning_tests( cell_table: pd.DataFrame, save_dir: Union[str, pathlib.Path], marker_info: Dict[str, MarkerDict], threshold_mults: List[float] diff --git a/python_files/utils.py b/python_files/utils.py index cdd4192..c9e323a 100644 --- a/python_files/utils.py +++ b/python_files/utils.py @@ -11,6 +11,7 @@ import pandas as pd from skimage import morphology from scipy.ndimage import gaussian_filter +from scipy.stats import zscore from skimage.segmentation import find_boundaries from skimage.measure import label import skimage.io as io @@ -229,6 +230,155 @@ def create_long_df_by_functional(func_table, cluster_col_name, drop_cols, result return long_df_all +def occupancy_df_helper(cell_table, cluster_col_name, drop_cols, result_name, subset_col=None, + tiles_per_row_col=4, max_image_size=2048, positive_threshold=0): + """Helper function for creating the unmelted occupancy table + + Args: + cell_table (pd.DataFrame): the dataframe containing information on each cell + cluster_col_name (str): the column name in cell_table that contains the cluster information + drop_cols (list): list of columns to drop from cell_table + result_name (str): the name of this statistic in the returned df + subset_col (str): the name of the subset col to group by, if not provided then skip + tiles_per_row_col (int): the row/col dims of tiles to define over each image + max_image_size (int): maximum size of an image passed in, assuming square images + positive_threshold (int): the z-scored cell count in a tile required for positivity + + Returns: + pd.DataFrame: occupancy stats in unmelted format""" + + verify_in_list(cell_type_col=cluster_col_name, cell_table_columns=cell_table.columns) + verify_in_list(drop_cols=drop_cols, cell_table_columns=cell_table.columns) + if subset_col is not None: + verify_in_list(subset_col=subset_col, cell_table_columns=cell_table.columns) + + # remove the drop columns + cell_table_small = cell_table.loc[:, ~cell_table.columns.isin(drop_cols)] + + # get all the FOV names + fov_names = cell_table_small["fov"].unique() + + # define the tile size in pixels for each FOV + tile_size = max_image_size // tiles_per_row_col + + # define the tile for each cell + cell_table_small["tile_row"] = (cell_table_small["centroid-1"] // tile_size).astype(int) + cell_table_small["tile_col"] = (cell_table_small["centroid-0"] // tile_size).astype(int) + + # Create a DataFrame with all combinations of fov, tile_row, tile_col, and pop_col + all_combos: pd.DataFrame = pd.MultiIndex.from_product( + [ + cell_table_small["fov"].unique(), + cell_table_small["tile_row"].unique(), + cell_table_small["tile_col"].unique(), + cell_table_small[cluster_col_name].unique() + ], + names=["fov", "tile_row", "tile_col", cluster_col_name] + ).to_frame(index=False) + + occupancy_count_groupby = ["fov", "tile_row", "tile_col", cluster_col_name] + if subset_col: + occupancy_group_by.append("subset_col") + + # compute the total occupancy of each tile for each population type + # NOTE: need to merge this way to account for tiles with 0 counts + occupancy_counts = cell_table_small.groupby( + occupancy_count_groupby + ).size().reset_index(name="occupancy") + occupancy_counts = pd.merge( + all_combos, occupancy_counts, + on=occupancy_count_groupby, + how="left" + ) + occupancy_counts["occupancy"].fillna(0, inplace=True) + # print(occupancy_counts[["tile_row", "tile_col"]].drop_duplicates()) + + # zscore the tile counts across the cohort grouped by the cell types + occupancy_counts["zscore_occupancy"] = occupancy_counts.groupby( + [cluster_col_name] + )["occupancy"].transform(lambda x: zscore(x, ddof=0)) + + # mark tiles as positive depending on the positive_threshold value + occupancy_counts["is_positive"] = occupancy_counts["zscore_occupancy"] > positive_threshold + + # total up the positive tiles per FOV and cell type + occupancy_zscore_groupby = ["fov", cluster_col_name] + if subset_col is not None: + occupancy_zscore_groupby.append(subset_col) + occupancy_counts_grouped: pd.DataFrame = occupancy_counts.groupby(occupancy_zscore_groupby).apply( + lambda row: row["is_positive"].sum() + ).reset_index(name="total_positive_tiles") + + # get the max image size, this will determine how many tiles there are when finding percentages + occupancy_counts_grouped["image_size"] = occupancy_counts_grouped.apply( + lambda row: io.imread(os.path.join(samples_dir, row["fov"], "Calprotectin.tiff")).shape[0], + axis=1 + ) + occupancy_stats_grouped["max_tiles"] = occupancy_stats_grouped.apply( + lambda row: tiles_per_row_col ** 2 ** (1 / (max_image_size / row["image_size"])), + axis=1 + ) + + # determine the percent positivity of tiles + occupancy_counts_grouped["percent_positive_tiles"] = \ + occupancy_stats_grouped["total_positive_tiles"] / occupancy_stats_grouped["max_tiles"] + + # reshape to long df + occupancy_long_df = pd.melt( + occupancy_counts_grouped[occupancy_zscore_groupby + ["percent_positive_tiles"]], + id_vars=occupancy_zscore_groupby, + var_name='functional_marker' + ) + occupancy_long_df['metric'] = result_name + occupancy_long_df = long_df.rename(columns={cluster_col_name: "cell_type"}) + if subset_col is not None: + occupancy_long_df.rename(columns={subset_col: "subset"}) + + return occupancy_long_df + + +def create_long_df_by_occupancy(cell_table, cluster_col_name, drop_cols, result_name, + subset_col=None, tiles_per_row_col=4, max_image_size=2048, + positive_threshold=0): + """Compute the occupancy statistics over a cohort, based on z-scored cell counts per tile + across the cohort. + + Args: + cell_table (pd.DataFrame): the dataframe containing information on each cell + cluster_col_name (str): the column name in cell_table that contains the cluster information + drop_cols (list): list of columns to drop from cell_table + result_name (str): the name of this statistic in the returned df + subset_col (str): the column name in cell_table to subset by + tiles_per_row_col (int): the row/col dims of tiles to define over each image + max_image_size (int): maximum size of an image passed in, assuming square images + positive_threshold (int): the z-scored cell count in a tile required for positivity + + Returns: + pd.DataFrame: long format dataframe containing the summarized occupancy data""" + + # first generate df without subsetting + drop_cols_all = drop_cols.copy() + if subset_col is not None: + drop_cols_all = drop_cols + [subset_col] + + occupancy_long_all_compartments = occupancy_df_helper( + cell_table, cluster_col_name, drop_cols, result_name, None, + tiles_per_row_col, max_image_size, positive_threshold + ) + long_df_all["subset"] = "all" + + if subset_col is not None: + occupancy_long_per_compartment = occupancy_df_helper( + cell_table, cluster_col_name, drop_cols, result_name, subset_col, + tiles_per_row_col, max_image_size, positive_threshold + ) + occupancy_long_comb = pd.concat( + [occupancy_long_all_compartments, occupancy_long_per_compartment] + ) + + return occupancy_long_comb + + def create_channel_mask(img, intensity_thresh, sigma, min_mask_size=0, max_hole_size=100000): """Generates a binary mask from a single channel image