From 56048455e9eadd97cb42cd781bbda5ad14a354e6 Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Fri, 16 Jan 2026 18:05:27 +0100 Subject: [PATCH 1/7] perf: Use ds.variables to avoid _construct_dataarray overhead Optimize several functions by using ds.variables instead of iterating over data_vars.items() or accessing ds[name], which triggers slow _construct_dataarray calls. Changes: - io.py: save_dataset_to_netcdf, load_dataset_from_netcdf, _reduce_constant_arrays - structure.py: from_dataset (use coord_cache pattern) - core.py: drop_constant_arrays (use numpy operations) Co-Authored-By: Claude Opus 4.5 --- flixopt/core.py | 17 +++++++---- flixopt/io.py | 69 +++++++++++++++++++++++++++++--------------- flixopt/structure.py | 12 +++++++- 3 files changed, 68 insertions(+), 30 deletions(-) diff --git a/flixopt/core.py b/flixopt/core.py index 0470c1995..ba8618e1a 100644 --- a/flixopt/core.py +++ b/flixopt/core.py @@ -629,17 +629,24 @@ def drop_constant_arrays( Dataset with constant variables removed. """ drop_vars = [] + # Use ds.variables for faster access (avoids _construct_dataarray overhead) + variables = ds.variables - for name, da in ds.data_vars.items(): + for name in ds.data_vars: + var = variables[name] # Skip variables without the dimension - if dim not in da.dims: + if dim not in var.dims: if drop_arrays_without_dim: drop_vars.append(name) continue - # Check if variable is constant along the dimension (ptp < atol) - ptp = da.max(dim, skipna=True) - da.min(dim, skipna=True) - if (ptp < atol).all().item(): + # Check if variable is constant along the dimension using numpy (ptp < atol) + axis = var.dims.index(dim) + data = var.values + # Use numpy operations directly for speed + with np.errstate(invalid='ignore'): # Ignore NaN warnings + ptp = np.nanmax(data, axis=axis) - np.nanmin(data, axis=axis) + if np.all(ptp < atol): drop_vars.append(name) if drop_vars: diff --git a/flixopt/io.py b/flixopt/io.py index bbc6ec80b..e2919a89e 100644 --- a/flixopt/io.py +++ b/flixopt/io.py @@ -561,14 +561,18 @@ def save_dataset_to_netcdf( ds.attrs = {'attrs': json.dumps(ds.attrs)} # Convert all DataArray attrs to JSON strings - for var_name, data_var in ds.data_vars.items(): - if data_var.attrs: # Only if there are attrs - ds[var_name].attrs = {'attrs': json.dumps(data_var.attrs)} + # Use ds.variables to avoid slow _construct_dataarray calls + variables = ds.variables + for var_name in ds.data_vars: + var = variables[var_name] + if var.attrs: # Only if there are attrs + var.attrs = {'attrs': json.dumps(var.attrs)} # Also handle coordinate attrs if they exist - for coord_name, coord_var in ds.coords.items(): - if hasattr(coord_var, 'attrs') and coord_var.attrs: - ds[coord_name].attrs = {'attrs': json.dumps(coord_var.attrs)} + for coord_name in ds.coords: + var = variables[coord_name] + if var.attrs: + var.attrs = {'attrs': json.dumps(var.attrs)} # Suppress numpy binary compatibility warnings from netCDF4 (numpy 1->2 transition) with warnings.catch_warnings(): @@ -602,25 +606,38 @@ def _reduce_constant_arrays(ds: xr.Dataset) -> xr.Dataset: Dataset with constant dimensions reduced. 
""" new_data_vars = {} + variables = ds.variables + + for name in ds.data_vars: + var = variables[name] + dims = var.dims + data = var.values - for name, da in ds.data_vars.items(): - if not da.dims or da.size == 0: - new_data_vars[name] = da + if not dims or data.size == 0: + new_data_vars[name] = var continue - # Try to reduce each dimension - reduced = da - for dim in list(da.dims): - if dim not in reduced.dims: + # Try to reduce each dimension using numpy operations + reduced_data = data + reduced_dims = list(dims) + + for _axis, dim in enumerate(dims): + if dim not in reduced_dims: continue # Already removed - # Check if constant along this dimension - first_slice = reduced.isel({dim: 0}) - is_constant = (reduced == first_slice).all() + + current_axis = reduced_dims.index(dim) + # Check if constant along this axis using numpy + first_slice = np.take(reduced_data, 0, axis=current_axis) + # Broadcast first_slice to compare + expanded = np.expand_dims(first_slice, axis=current_axis) + is_constant = np.allclose(reduced_data, expanded, equal_nan=True) + if is_constant: # Remove this dimension by taking first slice - reduced = first_slice + reduced_data = first_slice + reduced_dims.pop(current_axis) - new_data_vars[name] = reduced + new_data_vars[name] = xr.Variable(tuple(reduced_dims), reduced_data, attrs=var.attrs) return xr.Dataset(new_data_vars, coords=ds.coords, attrs=ds.attrs) @@ -754,14 +771,18 @@ def load_dataset_from_netcdf(path: str | pathlib.Path) -> xr.Dataset: ds.attrs = json.loads(ds.attrs['attrs']) # Restore DataArray attrs (before unstacking, as stacked vars have no individual attrs) - for var_name, data_var in ds.data_vars.items(): - if 'attrs' in data_var.attrs: - ds[var_name].attrs = json.loads(data_var.attrs['attrs']) + # Use ds.variables to avoid slow _construct_dataarray calls + variables = ds.variables + for var_name in ds.data_vars: + var = variables[var_name] + if 'attrs' in var.attrs: + var.attrs = json.loads(var.attrs['attrs']) # Restore coordinate attrs - for coord_name, coord_var in ds.coords.items(): - if hasattr(coord_var, 'attrs') and 'attrs' in coord_var.attrs: - ds[coord_name].attrs = json.loads(coord_var.attrs['attrs']) + for coord_name in ds.coords: + var = variables[coord_name] + if 'attrs' in var.attrs: + var.attrs = json.loads(var.attrs['attrs']) # Unstack variables if they were stacked during saving # Detection: check if any dataset dimension starts with '__stacked__' diff --git a/flixopt/structure.py b/flixopt/structure.py index 8df65aae8..d165667bb 100644 --- a/flixopt/structure.py +++ b/flixopt/structure.py @@ -1116,7 +1116,17 @@ def from_dataset(cls, ds: xr.Dataset) -> Interface: reference_structure.pop('__class__', None) # Create arrays dictionary from dataset variables - arrays_dict = {name: array for name, array in ds.data_vars.items()} + # Use ds.variables with coord_cache for faster DataArray construction + variables = ds.variables + coord_cache = {k: ds.coords[k] for k in ds.coords} + arrays_dict = { + name: xr.DataArray( + variables[name], + coords={k: coord_cache[k] for k in variables[name].dims if k in coord_cache}, + name=name, + ) + for name in ds.data_vars + } # Resolve all references using the centralized method resolved_params = cls._resolve_reference_structure(reference_structure, arrays_dict) From 022f7a4e838a8605cb37592e166210f4a53e5a2a Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Fri, 16 Jan 2026 18:16:54 +0100 Subject: [PATCH 2/7] perf: Optimize clustering serialization with 
ds.variables Use ds.variables for faster access in clustering/base.py: - _create_reference_structure: original_data and metrics iteration - compare plot: duration_curve generation with direct numpy indexing Co-Authored-By: Claude Opus 4.5 --- flixopt/clustering/base.py | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/flixopt/clustering/base.py b/flixopt/clustering/base.py index d2b46a236..ee0d2bf43 100644 --- a/flixopt/clustering/base.py +++ b/flixopt/clustering/base.py @@ -1113,12 +1113,17 @@ def _create_reference_structure(self, include_original_data: bool = True) -> tup original_data_refs = None if include_original_data and self.original_data is not None: original_data_refs = [] - for name, da in self.original_data.data_vars.items(): + # Use variables for faster access (avoids _construct_dataarray overhead) + variables = self.original_data.variables + for name in self.original_data.data_vars: + var = variables[name] ref_name = f'original_data|{name}' # Rename time dim to avoid xarray alignment issues - if 'time' in da.dims: - da = da.rename({'time': 'original_time'}) - arrays[ref_name] = da + if 'time' in var.dims: + new_dims = tuple('original_time' if d == 'time' else d for d in var.dims) + arrays[ref_name] = xr.Variable(new_dims, var.values, attrs=var.attrs) + else: + arrays[ref_name] = var original_data_refs.append(f':::{ref_name}') # NOTE: aggregated_data is NOT serialized - it's identical to the FlowSystem's @@ -1129,9 +1134,11 @@ def _create_reference_structure(self, include_original_data: bool = True) -> tup metrics_refs = None if self._metrics is not None: metrics_refs = [] - for name, da in self._metrics.data_vars.items(): + # Use variables for faster access (avoids _construct_dataarray overhead) + metrics_vars = self._metrics.variables + for name in self._metrics.data_vars: ref_name = f'metrics|{name}' - arrays[ref_name] = da + arrays[ref_name] = metrics_vars[name] metrics_refs.append(f':::{ref_name}') reference = { @@ -1415,9 +1422,15 @@ def compare( if kind == 'duration_curve': sorted_vars = {} + # Use variables for faster access (avoids _construct_dataarray overhead) + variables = ds.variables + rep_values = ds.coords['representation'].values + rep_idx = {rep: i for i, rep in enumerate(rep_values)} for var in ds.data_vars: - for rep in ds.coords['representation'].values: - values = np.sort(ds[var].sel(representation=rep).values.flatten())[::-1] + data = variables[var].values + for rep in rep_values: + # Direct numpy indexing instead of .sel() + values = np.sort(data[rep_idx[rep]].flatten())[::-1] sorted_vars[(var, rep)] = values # Get length from first sorted array n = len(next(iter(sorted_vars.values()))) From cdcc194040f2784708183ff1883bea54568c1fe9 Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Fri, 16 Jan 2026 22:12:59 +0100 Subject: [PATCH 3/7] perf: Use batch assignment for clustering arrays (24x speedup) _add_clustering_to_dataset was slow due to 210 individual ds[name] = arr assignments. Each triggers xarray's expensive dataset_update_method. 
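A minimal standalone sketch of the two assignment patterns (the toy dataset
and variable names are illustrative, not taken from flixopt):

    import numpy as np
    import xarray as xr

    ds = xr.Dataset({'base': ('time', np.zeros(4))}, coords={'time': np.arange(4)})
    new_arrays = {f'var_{i}': xr.DataArray(np.random.rand(4), dims='time') for i in range(3)}

    # Slow: each item assignment re-runs xarray's dataset update/merge machinery
    for name, arr in new_arrays.items():
        ds[name] = arr

    # Fast: one assign() call merges all new variables in a single pass
    ds = ds.assign(new_arrays)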
Changed to batch assignment with ds.assign(dict): - Before: ~2600ms for to_dataset with clustering - After: ~109ms for to_dataset with clustering Co-Authored-By: Claude Opus 4.5 --- flixopt/io.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/flixopt/io.py b/flixopt/io.py index e2919a89e..2ec6a9edc 100644 --- a/flixopt/io.py +++ b/flixopt/io.py @@ -1886,9 +1886,10 @@ def _add_clustering_to_dataset( clustering_ref, clustering_arrays = clustering._create_reference_structure( include_original_data=include_original_data ) - # Add clustering arrays with prefix - for name, arr in clustering_arrays.items(): - ds[f'{cls.CLUSTERING_PREFIX}{name}'] = arr + # Add clustering arrays with prefix using batch assignment + # (individual ds[name] = arr assignments are slow) + prefixed_arrays = {f'{cls.CLUSTERING_PREFIX}{name}': arr for name, arr in clustering_arrays.items()} + ds = ds.assign(prefixed_arrays) ds.attrs['clustering'] = json.dumps(clustering_ref) return ds From b215959ac354945111d5f257d0c91c4028037002 Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Fri, 16 Jan 2026 22:20:29 +0100 Subject: [PATCH 4/7] perf: Use ds.variables in _build_reduced_dataset (12% faster) Avoided _construct_dataarray overhead by: - Using ds.variables instead of ds.data_vars.items() - Using numpy slicing instead of .isel() - Passing attrs dict directly instead of DataArray cluster() benchmark: - Before: ~10.1s - After: ~8.9s Co-Authored-By: Claude Opus 4.5 --- flixopt/transform_accessor.py | 45 ++++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 16 deletions(-) diff --git a/flixopt/transform_accessor.py b/flixopt/transform_accessor.py index 05a95ba07..196e9f014 100644 --- a/flixopt/transform_accessor.py +++ b/flixopt/transform_accessor.py @@ -525,35 +525,48 @@ def _build_reduced_dataset( all_keys = {(p, s) for p in periods for s in scenarios} ds_new_vars = {} - for name, original_da in ds.data_vars.items(): - if 'time' not in original_da.dims: - ds_new_vars[name] = original_da.copy() + # Use ds.variables to avoid _construct_dataarray overhead + variables = ds.variables + coord_cache = {k: ds.coords[k].values for k in ds.coords} + + for name in ds.data_vars: + var = variables[name] + if 'time' not in var.dims: + # No time dimension - wrap Variable in DataArray + coords = {d: coord_cache[d] for d in var.dims if d in coord_cache} + ds_new_vars[name] = xr.DataArray(var.values, dims=var.dims, coords=coords, attrs=var.attrs, name=name) elif name not in typical_das or set(typical_das[name].keys()) != all_keys: # Time-dependent but constant: reshape to (cluster, time, ...) 
- sliced = original_da.isel(time=slice(0, n_reduced_timesteps)) - other_dims = [d for d in sliced.dims if d != 'time'] - other_shape = [sliced.sizes[d] for d in other_dims] + # Use numpy slicing instead of .isel() + time_idx = var.dims.index('time') + slices = [slice(None)] * len(var.dims) + slices[time_idx] = slice(0, n_reduced_timesteps) + sliced_values = var.values[tuple(slices)] + + other_dims = [d for d in var.dims if d != 'time'] + other_shape = [var.sizes[d] for d in other_dims] new_shape = [actual_n_clusters, n_time_points] + other_shape - reshaped = sliced.values.reshape(new_shape) + reshaped = sliced_values.reshape(new_shape) new_coords = {'cluster': cluster_coords, 'time': time_coords} for dim in other_dims: - new_coords[dim] = sliced.coords[dim].values + if dim in coord_cache: + new_coords[dim] = coord_cache[dim] ds_new_vars[name] = xr.DataArray( reshaped, dims=['cluster', 'time'] + other_dims, coords=new_coords, - attrs=original_da.attrs, + attrs=var.attrs, ) else: # Time-varying: combine per-(period, scenario) slices da = self._combine_slices_to_dataarray_2d( slices=typical_das[name], - original_da=original_da, + attrs=var.attrs, periods=periods, scenarios=scenarios, ) - if TimeSeriesData.is_timeseries_data(original_da): - da = TimeSeriesData.from_dataarray(da.assign_attrs(original_da.attrs)) + if var.attrs.get('__timeseries_data__', False): + da = TimeSeriesData.from_dataarray(da.assign_attrs(var.attrs)) ds_new_vars[name] = da # Copy attrs but remove cluster_weight @@ -1639,7 +1652,7 @@ def _combine_slices_to_dataarray_generic( @staticmethod def _combine_slices_to_dataarray_2d( slices: dict[tuple, xr.DataArray], - original_da: xr.DataArray, + attrs: dict, periods: list, scenarios: list, ) -> xr.DataArray: @@ -1647,7 +1660,7 @@ def _combine_slices_to_dataarray_2d( Args: slices: Dict mapping (period, scenario) tuples to DataArrays with (cluster, time) dims. - original_da: Original DataArray to get attrs from. + attrs: Attributes to assign to the result. periods: List of period labels ([None] if no periods dimension). scenarios: List of scenario labels ([None] if no scenarios dimension). @@ -1660,7 +1673,7 @@ def _combine_slices_to_dataarray_2d( # Simple case: no period/scenario dimensions if not has_periods and not has_scenarios: - return slices[first_key].assign_attrs(original_da.attrs) + return slices[first_key].assign_attrs(attrs) # Multi-dimensional: use xr.concat to stack along period/scenario dims if has_periods and has_scenarios: @@ -1678,7 +1691,7 @@ def _combine_slices_to_dataarray_2d( # Put cluster and time first (standard order for clustered data) result = result.transpose('cluster', 'time', ...) - return result.assign_attrs(original_da.attrs) + return result.assign_attrs(attrs) def _validate_for_expansion(self) -> Clustering: """Validate FlowSystem can be expanded and return clustering info. From 0754f06f2da159fb7971e64ed3a2462660b676d1 Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Fri, 16 Jan 2026 22:35:00 +0100 Subject: [PATCH 5/7] perf: Use numpy reshape in _build_typical_das (4.4x faster) Eliminated 451,856 slow pandas .loc calls by using numpy reshape for segmented clustering data instead of iterating per-cluster. 
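Roughly, the per-cluster lookups become a single reshape of the underlying
array. A standalone sketch with made-up sizes (it assumes the MultiIndex rows
are ordered cluster-major with exactly n_time_points rows per cluster, which
is what the reshape in this patch relies on):

    import numpy as np
    import pandas as pd

    n_clusters, n_time_points = 3, 4
    index = pd.MultiIndex.from_product([range(n_clusters), range(n_time_points)])
    typical_df = pd.DataFrame(
        np.random.rand(n_clusters * n_time_points, 2), index=index, columns=['heat', 'power']
    )

    # Old: one .loc lookup per (cluster, column) pair
    slow = {
        col: np.stack([typical_df.loc[c, col].to_numpy() for c in range(n_clusters)])
        for col in typical_df.columns
    }

    # New: reshape the raw values once to (n_clusters, n_time_points, n_columns)
    reshaped = typical_df.to_numpy().reshape(n_clusters, n_time_points, -1)
    fast = {col: reshaped[:, :, i] for i, col in enumerate(typical_df.columns)}

    assert all(np.array_equal(slow[c], fast[c]) for c in typical_df.columns)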
cluster() with segments benchmark (50 clusters, 4 segments): - Before: ~93.7s - After: ~21.1s - Speedup: 4.4x Co-Authored-By: Claude Opus 4.5 --- flixopt/transform_accessor.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/flixopt/transform_accessor.py b/flixopt/transform_accessor.py index 196e9f014..0ec24b57b 100644 --- a/flixopt/transform_accessor.py +++ b/flixopt/transform_accessor.py @@ -195,15 +195,22 @@ def _build_typical_das( for key, tsam_result in tsam_aggregation_results.items(): typical_df = tsam_result.cluster_representatives if is_segmented: - # Segmented data: MultiIndex (Segment Step, Segment Duration) - # Need to extract by cluster (first level of index) - for col in typical_df.columns: - data = np.zeros((actual_n_clusters, n_time_points)) - for cluster_id in range(actual_n_clusters): - cluster_data = typical_df.loc[cluster_id, col] - data[cluster_id, :] = cluster_data.values[:n_time_points] + # Segmented data: MultiIndex with cluster as first level + # Each cluster has exactly n_time_points rows (segments) + # Extract all data at once using numpy reshape, avoiding slow .loc calls + columns = typical_df.columns.tolist() + + # Get all values as numpy array: (n_clusters * n_time_points, n_columns) + all_values = typical_df.values + + # Reshape to (n_clusters, n_time_points, n_columns) + reshaped = all_values.reshape(actual_n_clusters, n_time_points, -1) + + for col_idx, col in enumerate(columns): + # reshaped[:, :, col_idx] selects all clusters, all time points, single column + # Result shape: (n_clusters, n_time_points) typical_das.setdefault(col, {})[key] = xr.DataArray( - data, + reshaped[:, :, col_idx], dims=['cluster', 'time'], coords={'cluster': cluster_coords, 'time': time_coords}, ) From 0203945d6b8f08ac4bb51f3d835569b4a8538083 Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Fri, 16 Jan 2026 23:06:39 +0100 Subject: [PATCH 6/7] fix: Multiple clustering and IO bug fixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - benchmark_io_performance.py: Add Gurobi → HiGHS solver fallback - components.py: Fix storage decay to use sum (not mean) for hours per cluster - flow_system.py: Add RangeIndex validation requiring explicit timestep_duration - io.py: Include auxiliary coordinates in _fast_get_dataarray - transform_accessor.py: Add empty dataset guard after drop_constant_arrays - transform_accessor.py: Fix timestep_mapping indexing for segmented clustering Co-Authored-By: Claude Opus 4.5 --- benchmarks/benchmark_io_performance.py | 13 ++++++++++++- flixopt/components.py | 6 +++--- flixopt/flow_system.py | 6 ++++++ flixopt/io.py | 5 ++++- flixopt/transform_accessor.py | 16 ++++++++++++++-- 5 files changed, 39 insertions(+), 7 deletions(-) diff --git a/benchmarks/benchmark_io_performance.py b/benchmarks/benchmark_io_performance.py index 3001850ea..e73032901 100644 --- a/benchmarks/benchmark_io_performance.py +++ b/benchmarks/benchmark_io_performance.py @@ -142,7 +142,18 @@ def run_io_benchmarks( print('\n2. 
Clustering and solving...') fs_clustered = fs.transform.cluster(n_clusters=n_clusters, cluster_duration='1D') - fs_clustered.optimize(fx.solvers.GurobiSolver()) + + # Try Gurobi first, fall back to HiGHS if not available + try: + solver = fx.solvers.GurobiSolver() + fs_clustered.optimize(solver) + except Exception as e: + if 'gurobi' in str(e).lower() or 'license' in str(e).lower(): + print(f' Gurobi not available ({e}), falling back to HiGHS...') + solver = fx.solvers.HighsSolver() + fs_clustered.optimize(solver) + else: + raise print('\n3. Expanding...') fs_expanded = fs_clustered.transform.expand() diff --git a/flixopt/components.py b/flixopt/components.py index 481135d1c..6535a1dd3 100644 --- a/flixopt/components.py +++ b/flixopt/components.py @@ -1505,11 +1505,11 @@ def _add_linking_constraints( # Apply self-discharge decay factor (1-loss)^hours to soc_before per Eq. 5 # relative_loss_per_hour is per-hour, so we need total hours per cluster - # Use sum over time to handle both regular and segmented systems + # Use sum over time to get total duration (handles both regular and segmented systems) # Keep as DataArray to respect per-period/scenario values rel_loss = _scalar_safe_reduce(self.element.relative_loss_per_hour, 'time', 'mean') - hours_per_cluster = _scalar_safe_reduce(self._model.timestep_duration, 'time', 'mean') - decay_n = (1 - rel_loss) ** hours_per_cluster + total_hours_per_cluster = _scalar_safe_reduce(self._model.timestep_duration, 'time', 'sum') + decay_n = (1 - rel_loss) ** total_hours_per_cluster lhs = soc_after - soc_before * decay_n - delta_soc_ordered self.add_constraints(lhs == 0, short_name='link') diff --git a/flixopt/flow_system.py b/flixopt/flow_system.py index 2ca950b17..a68333e98 100644 --- a/flixopt/flow_system.py +++ b/flixopt/flow_system.py @@ -214,6 +214,12 @@ def __init__( elif computed_timestep_duration is not None: self.timestep_duration = self.fit_to_model_coords('timestep_duration', computed_timestep_duration) else: + # RangeIndex (segmented systems) requires explicit timestep_duration + if isinstance(self.timesteps, pd.RangeIndex): + raise ValueError( + 'timestep_duration is required when using RangeIndex timesteps (segmented systems). ' + 'Provide timestep_duration explicitly or use DatetimeIndex timesteps.' 
+ ) self.timestep_duration = None # Cluster weight for cluster() optimization (default 1.0) diff --git a/flixopt/io.py b/flixopt/io.py index 2ec6a9edc..d5b055051 100644 --- a/flixopt/io.py +++ b/flixopt/io.py @@ -1598,7 +1598,10 @@ def _fast_get_dataarray(ds: xr.Dataset, name: str, coord_cache: dict[str, xr.Dat Constructed DataArray """ variable = ds.variables[name] - coords = {k: coord_cache[k] for k in variable.dims if k in coord_cache} + var_dims = set(variable.dims) + # Include coordinates whose dims are a subset of the variable's dims + # This preserves both dimension coordinates and auxiliary coordinates + coords = {k: v for k, v in coord_cache.items() if set(v.dims).issubset(var_dims)} return xr.DataArray(variable, coords=coords, name=name) @staticmethod diff --git a/flixopt/transform_accessor.py b/flixopt/transform_accessor.py index 0ec24b57b..98a4d5c3b 100644 --- a/flixopt/transform_accessor.py +++ b/flixopt/transform_accessor.py @@ -1401,6 +1401,16 @@ def cluster( ds_for_clustering.sel(**selector, drop=True) if selector else ds_for_clustering ) temporaly_changing_ds_for_clustering = drop_constant_arrays(ds_slice_for_clustering, dim='time') + + # Guard against empty dataset after removing constant arrays + if not temporaly_changing_ds_for_clustering.data_vars: + filter_info = f'data_vars={data_vars}' if data_vars else 'all variables' + selector_info = f', selector={selector}' if selector else '' + raise ValueError( + f'No time-varying data found for clustering ({filter_info}{selector_info}). ' + f'All variables are constant over time. Check your data_vars filter or input data.' + ) + df_for_clustering = temporaly_changing_ds_for_clustering.to_dataframe() if selector: @@ -1920,13 +1930,15 @@ def _interpolate_charge_state_segmented( position_within_segment = clustering.results.position_within_segment # Decode timestep_mapping into cluster and time indices - # For segmented systems, use n_segments as the divisor (matches expand_data/build_expansion_divisor) + # For segmented systems: + # - Use n_segments for cluster division (matches expand_data/build_expansion_divisor) + # - Use timesteps_per_cluster for time position (actual position within original cluster) if clustering.is_segmented and clustering.n_segments is not None: time_dim_size = clustering.n_segments else: time_dim_size = clustering.timesteps_per_cluster cluster_indices = timestep_mapping // time_dim_size - time_indices = timestep_mapping % time_dim_size + time_indices = timestep_mapping % clustering.timesteps_per_cluster # Get segment index and position for each original timestep seg_indices = segment_assignments.isel(cluster=cluster_indices, time=time_indices) From 68e46953041a6377fdf92caf88733e07714f7b3e Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Fri, 16 Jan 2026 23:54:08 +0100 Subject: [PATCH 7/7] perf: Use ds.variables pattern in expand() (2.2x faster) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace data_vars.items() iteration with ds.variables pattern to avoid slow _construct_dataarray calls (5502 calls × ~1.5ms each). 
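The pattern in isolation (standalone sketch; the toy dataset below is
illustrative, not from the codebase):

    import numpy as np
    import xarray as xr

    ds = xr.Dataset(
        {f'var_{i}': ('time', np.random.rand(8760)) for i in range(50)},
        coords={'time': np.arange(8760)},
    )

    # Slow: data_vars.items() (or ds[name]) builds a full DataArray per variable
    # through xarray's internal _construct_dataarray
    slow = dict(ds.data_vars.items())

    # Fast: take the raw Variable and attach only the coords its dims actually use
    coord_cache = dict(ds.coords.items())
    fast = {}
    for name in ds.data_vars:
        var = ds.variables[name]
        coords = {k: v for k, v in coord_cache.items() if set(v.dims) <= set(var.dims)}
        fast[name] = xr.DataArray(var, coords=coords, name=name)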
Before: 3.73s After: 1.72s Co-Authored-By: Claude Opus 4.5 --- flixopt/transform_accessor.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/flixopt/transform_accessor.py b/flixopt/transform_accessor.py index 98a4d5c3b..07a167099 100644 --- a/flixopt/transform_accessor.py +++ b/flixopt/transform_accessor.py @@ -2140,14 +2140,24 @@ def expand_da(da: xr.DataArray, var_name: str = '', is_solution: bool = False) - return expanded + # Helper to construct DataArray without slow _construct_dataarray + def _fast_get_da(ds: xr.Dataset, name: str, coord_cache: dict) -> xr.DataArray: + variable = ds.variables[name] + var_dims = set(variable.dims) + coords = {k: v for k, v in coord_cache.items() if set(v.dims).issubset(var_dims)} + return xr.DataArray(variable, coords=coords, name=name) + # 1. Expand FlowSystem data reduced_ds = self._fs.to_dataset(include_solution=False) clustering_attrs = {'is_clustered', 'n_clusters', 'timesteps_per_cluster', 'clustering', 'cluster_weight'} skip_vars = {'cluster_weight', 'timestep_duration'} # These have special handling data_vars = {} - for name, da in reduced_ds.data_vars.items(): + # Use ds.variables pattern to avoid slow _construct_dataarray calls + coord_cache = {k: v for k, v in reduced_ds.coords.items()} + for name in reduced_ds.data_vars: if name in skip_vars or name.startswith('clustering|'): continue + da = _fast_get_da(reduced_ds, name, coord_cache) # Skip vars with cluster dim but no time dim - they don't make sense after expansion # (e.g., representative_weights with dims ('cluster',) or ('cluster', 'period')) if 'cluster' in da.dims and 'time' not in da.dims: @@ -2164,10 +2174,13 @@ def expand_da(da: xr.DataArray, var_name: str = '', is_solution: bool = False) - # 2. Expand solution (with segment total correction for segmented systems) reduced_solution = self._fs.solution - expanded_fs._solution = xr.Dataset( - {name: expand_da(da, name, is_solution=True) for name, da in reduced_solution.data_vars.items()}, - attrs=reduced_solution.attrs, - ) + # Use ds.variables pattern to avoid slow _construct_dataarray calls + sol_coord_cache = {k: v for k, v in reduced_solution.coords.items()} + expanded_sol_vars = {} + for name in reduced_solution.data_vars: + da = _fast_get_da(reduced_solution, name, sol_coord_cache) + expanded_sol_vars[name] = expand_da(da, name, is_solution=True) + expanded_fs._solution = xr.Dataset(expanded_sol_vars, attrs=reduced_solution.attrs) expanded_fs._solution = expanded_fs._solution.reindex(time=original_timesteps_extra) # 3. Combine charge_state with SOC_boundary for intercluster storages