From 56048455e9eadd97cb42cd781bbda5ad14a354e6 Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Fri, 16 Jan 2026 18:05:27 +0100 Subject: [PATCH 1/7] perf: Use ds.variables to avoid _construct_dataarray overhead Optimize several functions by using ds.variables instead of iterating over data_vars.items() or accessing ds[name], which triggers slow _construct_dataarray calls. Changes: - io.py: save_dataset_to_netcdf, load_dataset_from_netcdf, _reduce_constant_arrays - structure.py: from_dataset (use coord_cache pattern) - core.py: drop_constant_arrays (use numpy operations) Co-Authored-By: Claude Opus 4.5 --- flixopt/core.py | 17 +++++++---- flixopt/io.py | 69 +++++++++++++++++++++++++++++--------------- flixopt/structure.py | 12 +++++++- 3 files changed, 68 insertions(+), 30 deletions(-) diff --git a/flixopt/core.py b/flixopt/core.py index 0470c1995..ba8618e1a 100644 --- a/flixopt/core.py +++ b/flixopt/core.py @@ -629,17 +629,24 @@ def drop_constant_arrays( Dataset with constant variables removed. """ drop_vars = [] + # Use ds.variables for faster access (avoids _construct_dataarray overhead) + variables = ds.variables - for name, da in ds.data_vars.items(): + for name in ds.data_vars: + var = variables[name] # Skip variables without the dimension - if dim not in da.dims: + if dim not in var.dims: if drop_arrays_without_dim: drop_vars.append(name) continue - # Check if variable is constant along the dimension (ptp < atol) - ptp = da.max(dim, skipna=True) - da.min(dim, skipna=True) - if (ptp < atol).all().item(): + # Check if variable is constant along the dimension using numpy (ptp < atol) + axis = var.dims.index(dim) + data = var.values + # Use numpy operations directly for speed + with np.errstate(invalid='ignore'): # Ignore NaN warnings + ptp = np.nanmax(data, axis=axis) - np.nanmin(data, axis=axis) + if np.all(ptp < atol): drop_vars.append(name) if drop_vars: diff --git a/flixopt/io.py b/flixopt/io.py index bbc6ec80b..e2919a89e 100644 --- a/flixopt/io.py +++ b/flixopt/io.py @@ -561,14 +561,18 @@ def save_dataset_to_netcdf( ds.attrs = {'attrs': json.dumps(ds.attrs)} # Convert all DataArray attrs to JSON strings - for var_name, data_var in ds.data_vars.items(): - if data_var.attrs: # Only if there are attrs - ds[var_name].attrs = {'attrs': json.dumps(data_var.attrs)} + # Use ds.variables to avoid slow _construct_dataarray calls + variables = ds.variables + for var_name in ds.data_vars: + var = variables[var_name] + if var.attrs: # Only if there are attrs + var.attrs = {'attrs': json.dumps(var.attrs)} # Also handle coordinate attrs if they exist - for coord_name, coord_var in ds.coords.items(): - if hasattr(coord_var, 'attrs') and coord_var.attrs: - ds[coord_name].attrs = {'attrs': json.dumps(coord_var.attrs)} + for coord_name in ds.coords: + var = variables[coord_name] + if var.attrs: + var.attrs = {'attrs': json.dumps(var.attrs)} # Suppress numpy binary compatibility warnings from netCDF4 (numpy 1->2 transition) with warnings.catch_warnings(): @@ -602,25 +606,38 @@ def _reduce_constant_arrays(ds: xr.Dataset) -> xr.Dataset: Dataset with constant dimensions reduced. 
""" new_data_vars = {} + variables = ds.variables + + for name in ds.data_vars: + var = variables[name] + dims = var.dims + data = var.values - for name, da in ds.data_vars.items(): - if not da.dims or da.size == 0: - new_data_vars[name] = da + if not dims or data.size == 0: + new_data_vars[name] = var continue - # Try to reduce each dimension - reduced = da - for dim in list(da.dims): - if dim not in reduced.dims: + # Try to reduce each dimension using numpy operations + reduced_data = data + reduced_dims = list(dims) + + for _axis, dim in enumerate(dims): + if dim not in reduced_dims: continue # Already removed - # Check if constant along this dimension - first_slice = reduced.isel({dim: 0}) - is_constant = (reduced == first_slice).all() + + current_axis = reduced_dims.index(dim) + # Check if constant along this axis using numpy + first_slice = np.take(reduced_data, 0, axis=current_axis) + # Broadcast first_slice to compare + expanded = np.expand_dims(first_slice, axis=current_axis) + is_constant = np.allclose(reduced_data, expanded, equal_nan=True) + if is_constant: # Remove this dimension by taking first slice - reduced = first_slice + reduced_data = first_slice + reduced_dims.pop(current_axis) - new_data_vars[name] = reduced + new_data_vars[name] = xr.Variable(tuple(reduced_dims), reduced_data, attrs=var.attrs) return xr.Dataset(new_data_vars, coords=ds.coords, attrs=ds.attrs) @@ -754,14 +771,18 @@ def load_dataset_from_netcdf(path: str | pathlib.Path) -> xr.Dataset: ds.attrs = json.loads(ds.attrs['attrs']) # Restore DataArray attrs (before unstacking, as stacked vars have no individual attrs) - for var_name, data_var in ds.data_vars.items(): - if 'attrs' in data_var.attrs: - ds[var_name].attrs = json.loads(data_var.attrs['attrs']) + # Use ds.variables to avoid slow _construct_dataarray calls + variables = ds.variables + for var_name in ds.data_vars: + var = variables[var_name] + if 'attrs' in var.attrs: + var.attrs = json.loads(var.attrs['attrs']) # Restore coordinate attrs - for coord_name, coord_var in ds.coords.items(): - if hasattr(coord_var, 'attrs') and 'attrs' in coord_var.attrs: - ds[coord_name].attrs = json.loads(coord_var.attrs['attrs']) + for coord_name in ds.coords: + var = variables[coord_name] + if 'attrs' in var.attrs: + var.attrs = json.loads(var.attrs['attrs']) # Unstack variables if they were stacked during saving # Detection: check if any dataset dimension starts with '__stacked__' diff --git a/flixopt/structure.py b/flixopt/structure.py index 8df65aae8..d165667bb 100644 --- a/flixopt/structure.py +++ b/flixopt/structure.py @@ -1116,7 +1116,17 @@ def from_dataset(cls, ds: xr.Dataset) -> Interface: reference_structure.pop('__class__', None) # Create arrays dictionary from dataset variables - arrays_dict = {name: array for name, array in ds.data_vars.items()} + # Use ds.variables with coord_cache for faster DataArray construction + variables = ds.variables + coord_cache = {k: ds.coords[k] for k in ds.coords} + arrays_dict = { + name: xr.DataArray( + variables[name], + coords={k: coord_cache[k] for k in variables[name].dims if k in coord_cache}, + name=name, + ) + for name in ds.data_vars + } # Resolve all references using the centralized method resolved_params = cls._resolve_reference_structure(reference_structure, arrays_dict) From 022f7a4e838a8605cb37592e166210f4a53e5a2a Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Fri, 16 Jan 2026 18:16:54 +0100 Subject: [PATCH 2/7] perf: Optimize clustering serialization with 
ds.variables Use ds.variables for faster access in clustering/base.py: - _create_reference_structure: original_data and metrics iteration - compare plot: duration_curve generation with direct numpy indexing Co-Authored-By: Claude Opus 4.5 --- flixopt/clustering/base.py | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/flixopt/clustering/base.py b/flixopt/clustering/base.py index d2b46a236..ee0d2bf43 100644 --- a/flixopt/clustering/base.py +++ b/flixopt/clustering/base.py @@ -1113,12 +1113,17 @@ def _create_reference_structure(self, include_original_data: bool = True) -> tup original_data_refs = None if include_original_data and self.original_data is not None: original_data_refs = [] - for name, da in self.original_data.data_vars.items(): + # Use variables for faster access (avoids _construct_dataarray overhead) + variables = self.original_data.variables + for name in self.original_data.data_vars: + var = variables[name] ref_name = f'original_data|{name}' # Rename time dim to avoid xarray alignment issues - if 'time' in da.dims: - da = da.rename({'time': 'original_time'}) - arrays[ref_name] = da + if 'time' in var.dims: + new_dims = tuple('original_time' if d == 'time' else d for d in var.dims) + arrays[ref_name] = xr.Variable(new_dims, var.values, attrs=var.attrs) + else: + arrays[ref_name] = var original_data_refs.append(f':::{ref_name}') # NOTE: aggregated_data is NOT serialized - it's identical to the FlowSystem's @@ -1129,9 +1134,11 @@ def _create_reference_structure(self, include_original_data: bool = True) -> tup metrics_refs = None if self._metrics is not None: metrics_refs = [] - for name, da in self._metrics.data_vars.items(): + # Use variables for faster access (avoids _construct_dataarray overhead) + metrics_vars = self._metrics.variables + for name in self._metrics.data_vars: ref_name = f'metrics|{name}' - arrays[ref_name] = da + arrays[ref_name] = metrics_vars[name] metrics_refs.append(f':::{ref_name}') reference = { @@ -1415,9 +1422,15 @@ def compare( if kind == 'duration_curve': sorted_vars = {} + # Use variables for faster access (avoids _construct_dataarray overhead) + variables = ds.variables + rep_values = ds.coords['representation'].values + rep_idx = {rep: i for i, rep in enumerate(rep_values)} for var in ds.data_vars: - for rep in ds.coords['representation'].values: - values = np.sort(ds[var].sel(representation=rep).values.flatten())[::-1] + data = variables[var].values + for rep in rep_values: + # Direct numpy indexing instead of .sel() + values = np.sort(data[rep_idx[rep]].flatten())[::-1] sorted_vars[(var, rep)] = values # Get length from first sorted array n = len(next(iter(sorted_vars.values()))) From cdcc194040f2784708183ff1883bea54568c1fe9 Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Fri, 16 Jan 2026 22:12:59 +0100 Subject: [PATCH 3/7] perf: Use batch assignment for clustering arrays (24x speedup) _add_clustering_to_dataset was slow due to 210 individual ds[name] = arr assignments. Each triggers xarray's expensive dataset_update_method. 
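A minimal standalone sketch of the two assignment patterns (the toy dataset
and variable names are illustrative, not taken from flixopt):

    import numpy as np
    import xarray as xr

    ds = xr.Dataset({'base': ('time', np.zeros(4))}, coords={'time': np.arange(4)})
    new_arrays = {f'var_{i}': xr.DataArray(np.random.rand(4), dims='time') for i in range(3)}

    # Slow: each item assignment re-runs xarray's dataset update/merge machinery
    for name, arr in new_arrays.items():
        ds[name] = arr

    # Fast: one assign() call merges all new variables in a single pass
    ds = ds.assign(new_arrays)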
Changed to batch assignment with ds.assign(dict): - Before: ~2600ms for to_dataset with clustering - After: ~109ms for to_dataset with clustering Co-Authored-By: Claude Opus 4.5 --- flixopt/io.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/flixopt/io.py b/flixopt/io.py index e2919a89e..2ec6a9edc 100644 --- a/flixopt/io.py +++ b/flixopt/io.py @@ -1886,9 +1886,10 @@ def _add_clustering_to_dataset( clustering_ref, clustering_arrays = clustering._create_reference_structure( include_original_data=include_original_data ) - # Add clustering arrays with prefix - for name, arr in clustering_arrays.items(): - ds[f'{cls.CLUSTERING_PREFIX}{name}'] = arr + # Add clustering arrays with prefix using batch assignment + # (individual ds[name] = arr assignments are slow) + prefixed_arrays = {f'{cls.CLUSTERING_PREFIX}{name}': arr for name, arr in clustering_arrays.items()} + ds = ds.assign(prefixed_arrays) ds.attrs['clustering'] = json.dumps(clustering_ref) return ds From b215959ac354945111d5f257d0c91c4028037002 Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Fri, 16 Jan 2026 22:20:29 +0100 Subject: [PATCH 4/7] perf: Use ds.variables in _build_reduced_dataset (12% faster) Avoided _construct_dataarray overhead by: - Using ds.variables instead of ds.data_vars.items() - Using numpy slicing instead of .isel() - Passing attrs dict directly instead of DataArray cluster() benchmark: - Before: ~10.1s - After: ~8.9s Co-Authored-By: Claude Opus 4.5 --- flixopt/transform_accessor.py | 45 ++++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 16 deletions(-) diff --git a/flixopt/transform_accessor.py b/flixopt/transform_accessor.py index 05a95ba07..196e9f014 100644 --- a/flixopt/transform_accessor.py +++ b/flixopt/transform_accessor.py @@ -525,35 +525,48 @@ def _build_reduced_dataset( all_keys = {(p, s) for p in periods for s in scenarios} ds_new_vars = {} - for name, original_da in ds.data_vars.items(): - if 'time' not in original_da.dims: - ds_new_vars[name] = original_da.copy() + # Use ds.variables to avoid _construct_dataarray overhead + variables = ds.variables + coord_cache = {k: ds.coords[k].values for k in ds.coords} + + for name in ds.data_vars: + var = variables[name] + if 'time' not in var.dims: + # No time dimension - wrap Variable in DataArray + coords = {d: coord_cache[d] for d in var.dims if d in coord_cache} + ds_new_vars[name] = xr.DataArray(var.values, dims=var.dims, coords=coords, attrs=var.attrs, name=name) elif name not in typical_das or set(typical_das[name].keys()) != all_keys: # Time-dependent but constant: reshape to (cluster, time, ...) 
- sliced = original_da.isel(time=slice(0, n_reduced_timesteps)) - other_dims = [d for d in sliced.dims if d != 'time'] - other_shape = [sliced.sizes[d] for d in other_dims] + # Use numpy slicing instead of .isel() + time_idx = var.dims.index('time') + slices = [slice(None)] * len(var.dims) + slices[time_idx] = slice(0, n_reduced_timesteps) + sliced_values = var.values[tuple(slices)] + + other_dims = [d for d in var.dims if d != 'time'] + other_shape = [var.sizes[d] for d in other_dims] new_shape = [actual_n_clusters, n_time_points] + other_shape - reshaped = sliced.values.reshape(new_shape) + reshaped = sliced_values.reshape(new_shape) new_coords = {'cluster': cluster_coords, 'time': time_coords} for dim in other_dims: - new_coords[dim] = sliced.coords[dim].values + if dim in coord_cache: + new_coords[dim] = coord_cache[dim] ds_new_vars[name] = xr.DataArray( reshaped, dims=['cluster', 'time'] + other_dims, coords=new_coords, - attrs=original_da.attrs, + attrs=var.attrs, ) else: # Time-varying: combine per-(period, scenario) slices da = self._combine_slices_to_dataarray_2d( slices=typical_das[name], - original_da=original_da, + attrs=var.attrs, periods=periods, scenarios=scenarios, ) - if TimeSeriesData.is_timeseries_data(original_da): - da = TimeSeriesData.from_dataarray(da.assign_attrs(original_da.attrs)) + if var.attrs.get('__timeseries_data__', False): + da = TimeSeriesData.from_dataarray(da.assign_attrs(var.attrs)) ds_new_vars[name] = da # Copy attrs but remove cluster_weight @@ -1639,7 +1652,7 @@ def _combine_slices_to_dataarray_generic( @staticmethod def _combine_slices_to_dataarray_2d( slices: dict[tuple, xr.DataArray], - original_da: xr.DataArray, + attrs: dict, periods: list, scenarios: list, ) -> xr.DataArray: @@ -1647,7 +1660,7 @@ def _combine_slices_to_dataarray_2d( Args: slices: Dict mapping (period, scenario) tuples to DataArrays with (cluster, time) dims. - original_da: Original DataArray to get attrs from. + attrs: Attributes to assign to the result. periods: List of period labels ([None] if no periods dimension). scenarios: List of scenario labels ([None] if no scenarios dimension). @@ -1660,7 +1673,7 @@ def _combine_slices_to_dataarray_2d( # Simple case: no period/scenario dimensions if not has_periods and not has_scenarios: - return slices[first_key].assign_attrs(original_da.attrs) + return slices[first_key].assign_attrs(attrs) # Multi-dimensional: use xr.concat to stack along period/scenario dims if has_periods and has_scenarios: @@ -1678,7 +1691,7 @@ def _combine_slices_to_dataarray_2d( # Put cluster and time first (standard order for clustered data) result = result.transpose('cluster', 'time', ...) - return result.assign_attrs(original_da.attrs) + return result.assign_attrs(attrs) def _validate_for_expansion(self) -> Clustering: """Validate FlowSystem can be expanded and return clustering info. From 0754f06f2da159fb7971e64ed3a2462660b676d1 Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Fri, 16 Jan 2026 22:35:00 +0100 Subject: [PATCH 5/7] perf: Use numpy reshape in _build_typical_das (4.4x faster) Eliminated 451,856 slow pandas .loc calls by using numpy reshape for segmented clustering data instead of iterating per-cluster. 
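Roughly, the per-cluster lookups become a single reshape of the underlying
array. A standalone sketch with made-up sizes (it assumes the MultiIndex rows
are ordered cluster-major with exactly n_time_points rows per cluster, which
is what the reshape in this patch relies on):

    import numpy as np
    import pandas as pd

    n_clusters, n_time_points = 3, 4
    index = pd.MultiIndex.from_product([range(n_clusters), range(n_time_points)])
    typical_df = pd.DataFrame(
        np.random.rand(n_clusters * n_time_points, 2), index=index, columns=['heat', 'power']
    )

    # Old: one .loc lookup per (cluster, column) pair
    slow = {
        col: np.stack([typical_df.loc[c, col].to_numpy() for c in range(n_clusters)])
        for col in typical_df.columns
    }

    # New: reshape the raw values once to (n_clusters, n_time_points, n_columns)
    reshaped = typical_df.to_numpy().reshape(n_clusters, n_time_points, -1)
    fast = {col: reshaped[:, :, i] for i, col in enumerate(typical_df.columns)}

    assert all(np.array_equal(slow[c], fast[c]) for c in typical_df.columns)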
cluster() with segments benchmark (50 clusters, 4 segments): - Before: ~93.7s - After: ~21.1s - Speedup: 4.4x Co-Authored-By: Claude Opus 4.5 --- flixopt/transform_accessor.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/flixopt/transform_accessor.py b/flixopt/transform_accessor.py index 196e9f014..0ec24b57b 100644 --- a/flixopt/transform_accessor.py +++ b/flixopt/transform_accessor.py @@ -195,15 +195,22 @@ def _build_typical_das( for key, tsam_result in tsam_aggregation_results.items(): typical_df = tsam_result.cluster_representatives if is_segmented: - # Segmented data: MultiIndex (Segment Step, Segment Duration) - # Need to extract by cluster (first level of index) - for col in typical_df.columns: - data = np.zeros((actual_n_clusters, n_time_points)) - for cluster_id in range(actual_n_clusters): - cluster_data = typical_df.loc[cluster_id, col] - data[cluster_id, :] = cluster_data.values[:n_time_points] + # Segmented data: MultiIndex with cluster as first level + # Each cluster has exactly n_time_points rows (segments) + # Extract all data at once using numpy reshape, avoiding slow .loc calls + columns = typical_df.columns.tolist() + + # Get all values as numpy array: (n_clusters * n_time_points, n_columns) + all_values = typical_df.values + + # Reshape to (n_clusters, n_time_points, n_columns) + reshaped = all_values.reshape(actual_n_clusters, n_time_points, -1) + + for col_idx, col in enumerate(columns): + # reshaped[:, :, col_idx] selects all clusters, all time points, single column + # Result shape: (n_clusters, n_time_points) typical_das.setdefault(col, {})[key] = xr.DataArray( - data, + reshaped[:, :, col_idx], dims=['cluster', 'time'], coords={'cluster': cluster_coords, 'time': time_coords}, ) From 0203945d6b8f08ac4bb51f3d835569b4a8538083 Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Fri, 16 Jan 2026 23:06:39 +0100 Subject: [PATCH 6/7] fix: Multiple clustering and IO bug fixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - benchmark_io_performance.py: Add Gurobi → HiGHS solver fallback - components.py: Fix storage decay to use sum (not mean) for hours per cluster - flow_system.py: Add RangeIndex validation requiring explicit timestep_duration - io.py: Include auxiliary coordinates in _fast_get_dataarray - transform_accessor.py: Add empty dataset guard after drop_constant_arrays - transform_accessor.py: Fix timestep_mapping indexing for segmented clustering Co-Authored-By: Claude Opus 4.5 --- benchmarks/benchmark_io_performance.py | 13 ++++++++++++- flixopt/components.py | 6 +++--- flixopt/flow_system.py | 6 ++++++ flixopt/io.py | 5 ++++- flixopt/transform_accessor.py | 16 ++++++++++++++-- 5 files changed, 39 insertions(+), 7 deletions(-) diff --git a/benchmarks/benchmark_io_performance.py b/benchmarks/benchmark_io_performance.py index 3001850ea..e73032901 100644 --- a/benchmarks/benchmark_io_performance.py +++ b/benchmarks/benchmark_io_performance.py @@ -142,7 +142,18 @@ def run_io_benchmarks( print('\n2. 
Clustering and solving...') fs_clustered = fs.transform.cluster(n_clusters=n_clusters, cluster_duration='1D') - fs_clustered.optimize(fx.solvers.GurobiSolver()) + + # Try Gurobi first, fall back to HiGHS if not available + try: + solver = fx.solvers.GurobiSolver() + fs_clustered.optimize(solver) + except Exception as e: + if 'gurobi' in str(e).lower() or 'license' in str(e).lower(): + print(f' Gurobi not available ({e}), falling back to HiGHS...') + solver = fx.solvers.HighsSolver() + fs_clustered.optimize(solver) + else: + raise print('\n3. Expanding...') fs_expanded = fs_clustered.transform.expand() diff --git a/flixopt/components.py b/flixopt/components.py index 481135d1c..6535a1dd3 100644 --- a/flixopt/components.py +++ b/flixopt/components.py @@ -1505,11 +1505,11 @@ def _add_linking_constraints( # Apply self-discharge decay factor (1-loss)^hours to soc_before per Eq. 5 # relative_loss_per_hour is per-hour, so we need total hours per cluster - # Use sum over time to handle both regular and segmented systems + # Use sum over time to get total duration (handles both regular and segmented systems) # Keep as DataArray to respect per-period/scenario values rel_loss = _scalar_safe_reduce(self.element.relative_loss_per_hour, 'time', 'mean') - hours_per_cluster = _scalar_safe_reduce(self._model.timestep_duration, 'time', 'mean') - decay_n = (1 - rel_loss) ** hours_per_cluster + total_hours_per_cluster = _scalar_safe_reduce(self._model.timestep_duration, 'time', 'sum') + decay_n = (1 - rel_loss) ** total_hours_per_cluster lhs = soc_after - soc_before * decay_n - delta_soc_ordered self.add_constraints(lhs == 0, short_name='link') diff --git a/flixopt/flow_system.py b/flixopt/flow_system.py index 2ca950b17..a68333e98 100644 --- a/flixopt/flow_system.py +++ b/flixopt/flow_system.py @@ -214,6 +214,12 @@ def __init__( elif computed_timestep_duration is not None: self.timestep_duration = self.fit_to_model_coords('timestep_duration', computed_timestep_duration) else: + # RangeIndex (segmented systems) requires explicit timestep_duration + if isinstance(self.timesteps, pd.RangeIndex): + raise ValueError( + 'timestep_duration is required when using RangeIndex timesteps (segmented systems). ' + 'Provide timestep_duration explicitly or use DatetimeIndex timesteps.' 
+ ) self.timestep_duration = None # Cluster weight for cluster() optimization (default 1.0) diff --git a/flixopt/io.py b/flixopt/io.py index 2ec6a9edc..d5b055051 100644 --- a/flixopt/io.py +++ b/flixopt/io.py @@ -1598,7 +1598,10 @@ def _fast_get_dataarray(ds: xr.Dataset, name: str, coord_cache: dict[str, xr.Dat Constructed DataArray """ variable = ds.variables[name] - coords = {k: coord_cache[k] for k in variable.dims if k in coord_cache} + var_dims = set(variable.dims) + # Include coordinates whose dims are a subset of the variable's dims + # This preserves both dimension coordinates and auxiliary coordinates + coords = {k: v for k, v in coord_cache.items() if set(v.dims).issubset(var_dims)} return xr.DataArray(variable, coords=coords, name=name) @staticmethod diff --git a/flixopt/transform_accessor.py b/flixopt/transform_accessor.py index 0ec24b57b..98a4d5c3b 100644 --- a/flixopt/transform_accessor.py +++ b/flixopt/transform_accessor.py @@ -1401,6 +1401,16 @@ def cluster( ds_for_clustering.sel(**selector, drop=True) if selector else ds_for_clustering ) temporaly_changing_ds_for_clustering = drop_constant_arrays(ds_slice_for_clustering, dim='time') + + # Guard against empty dataset after removing constant arrays + if not temporaly_changing_ds_for_clustering.data_vars: + filter_info = f'data_vars={data_vars}' if data_vars else 'all variables' + selector_info = f', selector={selector}' if selector else '' + raise ValueError( + f'No time-varying data found for clustering ({filter_info}{selector_info}). ' + f'All variables are constant over time. Check your data_vars filter or input data.' + ) + df_for_clustering = temporaly_changing_ds_for_clustering.to_dataframe() if selector: @@ -1920,13 +1930,15 @@ def _interpolate_charge_state_segmented( position_within_segment = clustering.results.position_within_segment # Decode timestep_mapping into cluster and time indices - # For segmented systems, use n_segments as the divisor (matches expand_data/build_expansion_divisor) + # For segmented systems: + # - Use n_segments for cluster division (matches expand_data/build_expansion_divisor) + # - Use timesteps_per_cluster for time position (actual position within original cluster) if clustering.is_segmented and clustering.n_segments is not None: time_dim_size = clustering.n_segments else: time_dim_size = clustering.timesteps_per_cluster cluster_indices = timestep_mapping // time_dim_size - time_indices = timestep_mapping % time_dim_size + time_indices = timestep_mapping % clustering.timesteps_per_cluster # Get segment index and position for each original timestep seg_indices = segment_assignments.isel(cluster=cluster_indices, time=time_indices) From 68e46953041a6377fdf92caf88733e07714f7b3e Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Fri, 16 Jan 2026 23:54:08 +0100 Subject: [PATCH 7/7] perf: Use ds.variables pattern in expand() (2.2x faster) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace data_vars.items() iteration with ds.variables pattern to avoid slow _construct_dataarray calls (5502 calls × ~1.5ms each). 
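The pattern in isolation (standalone sketch; the toy dataset below is
illustrative, not from the codebase):

    import numpy as np
    import xarray as xr

    ds = xr.Dataset(
        {f'var_{i}': ('time', np.random.rand(8760)) for i in range(50)},
        coords={'time': np.arange(8760)},
    )

    # Slow: data_vars.items() (or ds[name]) builds a full DataArray per variable
    # through xarray's internal _construct_dataarray
    slow = dict(ds.data_vars.items())

    # Fast: take the raw Variable and attach only the coords its dims actually use
    coord_cache = dict(ds.coords.items())
    fast = {}
    for name in ds.data_vars:
        var = ds.variables[name]
        coords = {k: v for k, v in coord_cache.items() if set(v.dims) <= set(var.dims)}
        fast[name] = xr.DataArray(var, coords=coords, name=name)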
Before: 3.73s After: 1.72s Co-Authored-By: Claude Opus 4.5 --- flixopt/transform_accessor.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/flixopt/transform_accessor.py b/flixopt/transform_accessor.py index 98a4d5c3b..07a167099 100644 --- a/flixopt/transform_accessor.py +++ b/flixopt/transform_accessor.py @@ -2140,14 +2140,24 @@ def expand_da(da: xr.DataArray, var_name: str = '', is_solution: bool = False) - return expanded + # Helper to construct DataArray without slow _construct_dataarray + def _fast_get_da(ds: xr.Dataset, name: str, coord_cache: dict) -> xr.DataArray: + variable = ds.variables[name] + var_dims = set(variable.dims) + coords = {k: v for k, v in coord_cache.items() if set(v.dims).issubset(var_dims)} + return xr.DataArray(variable, coords=coords, name=name) + # 1. Expand FlowSystem data reduced_ds = self._fs.to_dataset(include_solution=False) clustering_attrs = {'is_clustered', 'n_clusters', 'timesteps_per_cluster', 'clustering', 'cluster_weight'} skip_vars = {'cluster_weight', 'timestep_duration'} # These have special handling data_vars = {} - for name, da in reduced_ds.data_vars.items(): + # Use ds.variables pattern to avoid slow _construct_dataarray calls + coord_cache = {k: v for k, v in reduced_ds.coords.items()} + for name in reduced_ds.data_vars: if name in skip_vars or name.startswith('clustering|'): continue + da = _fast_get_da(reduced_ds, name, coord_cache) # Skip vars with cluster dim but no time dim - they don't make sense after expansion # (e.g., representative_weights with dims ('cluster',) or ('cluster', 'period')) if 'cluster' in da.dims and 'time' not in da.dims: @@ -2164,10 +2174,13 @@ def expand_da(da: xr.DataArray, var_name: str = '', is_solution: bool = False) - # 2. Expand solution (with segment total correction for segmented systems) reduced_solution = self._fs.solution - expanded_fs._solution = xr.Dataset( - {name: expand_da(da, name, is_solution=True) for name, da in reduced_solution.data_vars.items()}, - attrs=reduced_solution.attrs, - ) + # Use ds.variables pattern to avoid slow _construct_dataarray calls + sol_coord_cache = {k: v for k, v in reduced_solution.coords.items()} + expanded_sol_vars = {} + for name in reduced_solution.data_vars: + da = _fast_get_da(reduced_solution, name, sol_coord_cache) + expanded_sol_vars[name] = expand_da(da, name, is_solution=True) + expanded_fs._solution = xr.Dataset(expanded_sol_vars, attrs=reduced_solution.attrs) expanded_fs._solution = expanded_fs._solution.reindex(time=original_timesteps_extra) # 3. Combine charge_state with SOC_boundary for intercluster storages