From b16a6ff139aedc8ec76b9cfb4da91a815cbf10c0 Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Mon, 21 Oct 2024 16:47:08 +0100 Subject: [PATCH 001/217] Added required changes to configure.ac to allow compilation with CUDA and HIP. Added CUDA and HIP Makefile.am --- configure.ac | 78 ++++++++++++++++++++++++++++++++++++++++++++ src/cuda/Makefile.am | 66 +++++++++++++++++++++++++++++++++++++ src/hip/Makefile.am | 55 +++++++++++++++++++++++++++++++ 3 files changed, 199 insertions(+) create mode 100644 src/cuda/Makefile.am create mode 100755 src/hip/Makefile.am diff --git a/configure.ac b/configure.ac index b0173c6954..30fc62a147 100644 --- a/configure.ac +++ b/configure.ac @@ -995,6 +995,78 @@ AH_VERBATIM([__STDC_FORMAT_MACROS], #define __STDC_FORMAT_MACROS 1 #endif]) + + +# Check for CUDA +have_cuda="no" +AC_ARG_WITH([cuda], + [AS_HELP_STRING([--with-cuda=PATH], + [root directory where CUDA is installed @<:@yes/no@:>@] + )], + [], + [with_cuda="no"] +) +if test "x$with_cuda" != "xno"; then + if test "x$with_cuda" != "xyes"; then + CUDA_CFLAGS="-I$with_cuda/include" + CUDA_LIBS="-L$with_cuda/lib -L$with_cuda/lib64 -lcudart" + NVCC="$with_cuda/bin/nvcc" + have_cuda="yes" + else + AC_PATH_PROG([NVCC],[nvcc]) + echo "Found nvcc = $NVCC" + if test -n "$NVCC"; then + CUDA_ROOT="`dirname $NVCC`/.." + CUDA_CFLAGS="-I${CUDA_ROOT}/include" + CUDA_LIBS="-L${CUDA_ROOT}/lib -L${CUDA_ROOT}/lib64 -lcudart" + have_cuda="yes" + fi + fi + if test "x$have_cuda" != "xno"; then + AC_DEFINE([HAVE_CUDA], 1, [The CUDA compiler is installed.]) + fi + CFLAGS="${CFLAGS} " +fi +AC_SUBST(CUDA_CFLAGS) +AC_SUBST(CUDA_LIBS) +AC_SUBST(NVCC) +AM_CONDITIONAL([HAVECUDA],[test -n "$NVCC"]) + +# Check for HIP +have_hip="no" +AC_ARG_WITH([hip], + [AS_HELP_STRING([--with-hip=PATH], + [root directory where HIP is installed @<:@yes/no@:>@] + )], + [], + [with_hip="no"] +) +if test "x$with_hip" != "xno"; then + if test "x$with_hip" != "xyes"; then + HIP_CFLAGS="-I$with_hip/include" + HIP_LIBS="-L$with_hip/lib -L$with_hip/lib64" + HIPCC="$with_hip/bin/hipcc" + have_hip="yes" + else + AC_PATH_PROG([HIPCC],[hipcc]) + echo "Found hipcc = $HIPCC" + if test -n "$HIPCC"; then + HIP_ROOT="`dirname $HIPCC`/.." + HIP_CFLAGS="-I${HIP_ROOT}/include" + HIP_LIBS="-L${HIP_ROOT}/lib -L${HIP_ROOT}/lib64" + have_hip="yes" + fi + fi + if test "x$have_hip" != "xno"; then + AC_DEFINE([HAVE_HIP], 1, [The HIP compiler is installed.]) + fi + CFLAGS="${CFLAGS} " +fi +AC_SUBST(HIP_CFLAGS) +AC_SUBST(HIP_LIBS) +AC_SUBST(HIPCC) +AM_CONDITIONAL([HAVEHIP],[test -n "$HIPCC"]) + # Check for FFTW. We test for this in the standard directories by default, # and only disable if using --with-fftw=no or --without-fftw. When a value # is given FFTW must be found.
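With the checks above, GPU support becomes an opt-in configure option: passing --with-cuda or --with-hip with no value makes configure search the PATH for nvcc/hipcc, while passing a value uses that directory as the installation root. A minimal usage sketch (the installation paths shown are illustrative assumptions, not part of the patch):

  # locate nvcc and hipcc on the PATH
  ./configure --with-cuda --with-hip
  # or point at explicit installation roots (example paths only)
  ./configure --with-cuda=/usr/local/cuda --with-hip=/opt/rocm

Either form defines HAVE_CUDA/HAVE_HIP and enables the HAVECUDA/HAVEHIP automake conditionals that the new src/cuda and src/hip Makefiles test.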
@@ -3246,6 +3318,10 @@ AC_CONFIG_FILES([tests/testSelectOutput.sh], [chmod +x tests/testSelectOutput.sh AC_CONFIG_FILES([tests/testFormat.sh], [chmod +x tests/testFormat.sh]) AC_CONFIG_FILES([tests/testNeutrinoCosmology.sh], [chmod +x tests/testNeutrinoCosmology.sh]) AC_CONFIG_FILES([tests/output_list_params.yml]) +# cuda .in file +AC_CONFIG_FILES([src/cuda/Makefile]) +# hip .in file +AC_CONFIG_FILES([src/hip/Makefile]) # Save the compilation options AC_DEFINE_UNQUOTED([SWIFT_CONFIG_FLAGS],["$swift_config_flags"],[Flags passed to configure]) @@ -3276,6 +3352,8 @@ AC_MSG_RESULT([ HDF5 enabled : $with_hdf5 - parallel : $have_parallel_hdf5 METIS/ParMETIS : $have_metis / $have_parmetis + CUDA enabled : $have_cuda + HIP enabled : $have_hip FFTW3 enabled : $have_fftw - threaded/openmp : $have_threaded_fftw / $have_openmp_fftw - MPI : $have_mpi_fftw diff --git a/src/cuda/Makefile.am b/src/cuda/Makefile.am new file mode 100644 index 0000000000..5fb5bbc34f --- /dev/null +++ b/src/cuda/Makefile.am @@ -0,0 +1,66 @@ +SOURCES_CUDA = GPU_runner_functions.cu tester.cu ../files_for_new_functions/arrays_malloc.cu ../files_for_new_functions/host_device_data_transfer.cu #../runner_main.cu +include_HEADERS = GPU_runner_functions.h device_functions.h BLOCK_SIZE.h tester.h ../files_for_new_functions/arrays_malloc.h ../files_for_new_functions/host_device_data_transfer.h +EXTRA_DIST = $(SOURCES_CUDA) $(include_HEADERS) + +if HAVECUDA + +AM_CFLAGS = -I.. $(HDF5_CPPFLAGS) +CUDA_MYFLAGS = -D_FORCE_INLINES -O4 -lineinfo -src-in-ptx --maxrregcount=64 -ftz=true -DWITH_CUDA --default-stream per-thread --use_fast_math -lcudadevrt #-dlink -ccbin=gcc +CUDA_MYFLAGS += -arch=sm_70 +CUDA_MYFLAGS += --extra-device-vectorization + +#CUDA_MYFLAGS = -D_FORCE_INLINES -O3 -lineinfo -src-in-ptx --maxrregcount=32 -ftz=true -DWITH_CUDA -ccbin=gcc -m64 --default-stream per-thread #-dlink +#CUDA_MYFLAGS += -arch=sm_80 \ +#-gencode=arch=compute_80,code=sm_80 \ +#-gencode=arch=compute_86,code=sm_86 \ +#-gencode=arch=compute_87,code=sm_87 \ +#-gencode=arch=compute_86,code=compute_86 +#CUDA_MYFLAGS += --extra-device-vectorization + +# Assign a "safe" version number +AM_LDFLAGS = $(HDF5_LDFLAGS) $(FFTW_LIBS) -version-info 0:0:0 + +#bin_PROGRAMS = test_27_cells test_125_cells + +# Rules to compile CUDA code. +.cu.o: + $(NVCC) -c $(NVCCFLAGS) $(AM_CFLAGS) $(CUDA_CFLAGS) $(CUDA_MYFLAGS) $< -o $@ +.cu.lo: + PATH=$(top_srcdir):$(PATH) && cudalt.py $@ $(NVCC) -c $(NVCCFLAGS) $(AM_CFLAGS) $(CUDA_CFLAGS) $(CUDA_MYFLAGS) $< + +# The library. Dummy C library so that we get libtool linking setup. +lib_LTLIBRARIES = libswiftCUDA.la libswiftdummy.la + +# Special link command to avoid including CFLAGS which are not understood. 
+libswiftCUDA_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC \ + $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CCLD) \ + $(libswiftCUDA_la_LDFLAGS) \ + $(LDFLAGS) -o $@ + +libswiftCUDA_la_SOURCES = $(SOURCES_CUDA) +libswiftCUDA_la_CFLAGS = $(AM_CFLAGS) $(CUDA_CFLAGS) $(CUDA_MYFLAGS) ../libswiftsim_cuda.la -I../ +libswiftCUDA_la_CXXFLAGS = $(AM_CFLAGS) $(CUDA_CFLAGS) $(CUDA_MYFLAGS) ../libswiftsim_cuda.la -I../ +libswiftCUDA_la_LIBADD = ../.libs/libswiftsim_cuda.la +libswiftCUDA_la_LDFLAGS = $(AM_LDFLAGS) + +if HAVEMPI +libswiftCUDA_la_CFLAGS += ../libswiftsim_mpicuda.la +libswiftCUDA_la_CXXFLAGS += ../libswiftsim_mpicuda.la +libswiftCUDA_la_LIBADD += ../.libs/libswiftsim_mpicuda.la +endif + +libswiftdummy_la_SOURCES = dummy.c +libswiftdummy_la_CFLAGS = $(AM_CFLAGS) +libswiftdummy_la_LDFLAGS = $(AM_LDFLAGS) + +#test_27_cells_SOURCES=test27cells.c +#test_27_cells_CFLAGS=$(AM_CFLAGS) -DWITH_CUDA $(CUDA_CFLAGS) +#test_27_cells_LDADD= ../.libs/libswiftsim_cuda.la ../.libs/libswiftsim_mpicuda.la libswiftCUDA.la $(MPI_LIBS) $(EXTRA_LIBS) $(CUDA_LIBS) -L/home/aidan/cuda_7.5/lib64/ -lcudart +#test_27_cells_LDFLAGS = $(AM_LDFLAGS) $(CUDA_CFLAGS) + +#test_125_cells_SOURCES=test125cells.c +#test_125_cells_CFLAGS=$(AM_CFLAGS) -DWITH_CUDA $(CUDA_CFLAGS) +#test_125_cells_LDADD= ../libswiftsim_cuda.la ../libswiftsim_mpicuda.la libswiftCUDA.la $(MPI_LIBS) $(EXTRA_LIBS) $(CUDA_LIBS) -L/home/aidan/cuda_7.5/lib64/ -lcudart +#test_125_cells_LDFLAGS = $(AM_LDFLAGS) $(CUDA_CFLAGS) + +endif diff --git a/src/hip/Makefile.am b/src/hip/Makefile.am new file mode 100755 index 0000000000..fc626b8831 --- /dev/null +++ b/src/hip/Makefile.am @@ -0,0 +1,55 @@ +SOURCES_HIP = HIP_runner_functions.hip +include_HEADERS = HIP_runner_functions.h device_functions.h BLOCK_SIZE.h tester.h +EXTRA_DIST = $(SOURCES_HIP) $(include_HEADERS) + +if HAVEHIP + +AM_CFLAGS = -I.. $(HDF5_CPPFLAGS) +HIP_MYFLAGS = -D_FORCE_INLINES -O3 -g -DWITH_HIP --offload-arch=gfx90a +#HIP_MYFLAGS = -D_FORCE_INLINES -O3 -g -v -lineinfo -src-in-ptx --maxrregcount=32 -ftz=true -DWITH_HIP -ccbin=gcc -m64 --default-stream per-thread#-dlink + +# Assign a "safe" version number +AM_LDFLAGS = $(HDF5_LDFLAGS) $(FFTW_LIBS) -version-info 0:0:0 + +#bin_PROGRAMS = test_27_cells test_125_cells + +# Rules to compile HIP code. +.hip.o: + $(HIPCC) -c $(HIPFLAGS) $(AM_CFLAGS) $(HIP_CFLAGS) $(HIP_MYFLAGS) $< -o $@ +.hip.lo: + PATH=$(top_srcdir):$(PATH) && cudalt.py $@ $(HIPCC) -c $(HIPFLAGS) $(AM_CFLAGS) $(HIP_CFLAGS) $(HIP_MYFLAGS) $< + +# The library. Dummy C library so that we get libtool linking setup. +lib_LTLIBRARIES = libswiftHIP.la libswiftdummy.la + +# Special link command to avoid including CFLAGS which are not understood. 
+libswiftHIP_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC \ + $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CCLD) \ + $(libswiftHIP_la_LDFLAGS) \ + $(LDFLAGS) -o $@ + +libswiftHIP_la_SOURCES = $(SOURCES_HIP) +libswiftHIP_la_CFLAGS = $(AM_CFLAGS) $(HIP_CFLAGS) $(HIP_MYFLAGS) ../libswiftsim_hip.la -I../ +libswiftHIP_la_LIBADD = ../.libs/libswiftsim_hip.la +libswiftHIP_la_LDFLAGS = $(AM_LDFLAGS) + +if HAVEMPI +libswiftHIP_la_CFLAGS += ../libswiftsim_mpihip.la +libswiftHIP_la_LIBADD += ../.libs/libswiftsim_mpihip.la +endif + +libswiftdummy_la_SOURCES = dummy.c +libswiftdummy_la_CFLAGS = $(AM_CFLAGS) +libswiftdummy_la_LDFLAGS = $(AM_LDFLAGS) + +#test_27_cells_SOURCES=test27cells.c +#test_27_cells_CFLAGS=$(AM_CFLAGS) -DWITH_HIP $(HIP_CFLAGS) +#test_27_cells_LDADD= ../.libs/libswiftsim_hip.la ../.libs/libswiftsim_mpihip.la libswiftHIP.la $(MPI_LIBS) $(EXTRA_LIBS) $(HIP_LIBS) +#test_27_cells_LDFLAGS = $(AM_LDFLAGS) $(HIP_CFLAGS) + +#test_125_cells_SOURCES=test125cells.c +#test_125_cells_CFLAGS=$(AM_CFLAGS) -DWITH_HIP $(HIP_CFLAGS) +#test_125_cells_LDADD= ../libswiftsim_hip.la ../libswiftsim_mpihip.la libswiftHIP.la $(MPI_LIBS) $(EXTRA_LIBS) $(HIP_LIBS) +#test_125_cells_LDFLAGS = $(AM_LDFLAGS) $(HIP_CFLAGS) + +endif From d9ca273e8c1fe2c0b29009d7919d6ab4260b3f68 Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Mon, 21 Oct 2024 17:04:11 +0100 Subject: [PATCH 002/217] Added cuda and hip linking directives to Makefile.am --- Makefile.am | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/Makefile.am b/Makefile.am index b5ede6fd97..51f34ac1ed 100644 --- a/Makefile.am +++ b/Makefile.am @@ -74,6 +74,23 @@ bin_PROGRAMS += fof_mpi endif endif +# BUILD CUDA versions as well? +if HAVECUDA +bin_PROGRAMS += swift_cuda +if HAVEMPI +bin_PROGRAMS += swift_mpicuda +endif +endif + + +# BUILD HIP versions as well? +if HAVEHIP +bin_PROGRAMS += swift_hip +if HAVEMPI +bin_PROGRAMS += swift_mpihip +endif +endif + # engine_policy_setaffinity is available? if HAVESETAFFINITY ENGINE_POLICY_SETAFFINITY=| engine_policy_setaffinity @@ -91,6 +108,28 @@ swift_mpi_SOURCES = swift.c swift_mpi_CFLAGS = $(MYFLAGS) $(AM_CFLAGS) $(MPI_FLAGS) -DENGINE_POLICY="engine_policy_keep $(ENGINE_POLICY_SETAFFINITY)" swift_mpi_LDADD = src/libswiftsim_mpi.la argparse/libargparse.la $(MPI_LIBS) $(VELOCIRAPTOR_MPI_LIBS) $(EXTRA_LIBS) $(LD_CSDS) +# Sources for swift_cuda +swift_cuda_SOURCES = swift.c dummy.C +swift_cuda_CXXFLAGS = $(MYFLAGS) $(AM_CFLAGS) $(CUDA_CFLAGS) -DENGINE_POLICY="engine_policy_keep $(ENGINE_POLICY_SETAFFINITY)" -DWITH_CUDA +swift_cuda_LDADD = src/.libs/libswiftsim_cuda.a src/cuda/.libs/libswiftCUDA.a $(EXTRA_LIBS) $(CUDA_LIBS) -lcudart argparse/.libs/libargparse.a src/.libs/libgrav.la + +# Sources for swift_hip +swift_hip_SOURCES = swift.c dummy.C +swift_hip_CXXFLAGS = $(MYFLAGS) $(AM_CFLAGS) $(HIP_CFLAGS) -DENGINE_POLICY="engine_policy_keep $(ENGINE_POLICY_SETAFFINITY)" -DWITH_HIP +swift_hip_LDADD = src/.libs/libswiftsim_hip.a src/hip/.libs/libswiftHIP.a $(EXTRA_LIBS) $(HIP_LIBS) -lamdhip64 -L/opt/rocm-5.1.0/lib -lhsa-runtime64 -L/opt/rocm-5.1.0/lib64 -lamd_comgr argparse/.libs/libargparse.a src/.libs/libgrav.la + +# Sources for swift_mpicuda, do we need an affinity policy for MPI? 
+swift_mpicuda_SOURCES = swift.c dummy.C +swift_mpicuda_CXXFLAGS = $(MYFLAGS) $(AM_CFLAGS) $(MPI_FLAGS) $(CUDA_CFLAGS) -DENGINE_POLICY="engine_policy_keep $(ENGINE_POLICY_SETAFFINITY)" -DWITH_CUDA +swift_mpicuda_CFLAGS = $(MYFLAGS) $(AM_CFLAGS) $(MPI_FLAGS) $(CUDA_CFLAGS) -DENGINE_POLICY="engine_policy_keep $(ENGINE_POLICY_SETAFFINITY)" -DWITH_CUDA +swift_mpicuda_LDADD = src/.libs/libswiftsim_mpicuda.a argparse/.libs/libargparse.a src/.libs/libgrav.la src/cuda/.libs/libswiftCUDA.a $(MPI_LIBS) $(EXTRA_LIBS) $(CUDA_LIBS) -lcudart + +# Sources for swift_mpihip, do we need an affinity policy for MPI? +swift_mpihip_SOURCES = swift.c dummy.C +swift_mpihip_CXXFLAGS = $(MYFLAGS) $(AM_CFLAGS) $(MPI_FLAGS) $(HIP_CFLAGS) -DENGINE_POLICY="engine_policy_keep $(ENGINE_POLICY_SETAFFINITY)" -DWITH_HIP +swift_mpihip_CFLAGS = $(MYFLAGS) $(AM_CFLAGS) $(MPI_FLAGS) $(HIP_CFLAGS) -DENGINE_POLICY="engine_policy_keep $(ENGINE_POLICY_SETAFFINITY)" -DWITH_HIP +swift_mpihip_LDADD = src/.libs/libswiftsim_mpihip.a argparse/.libs/libargparse.a src/.libs/libgrav.la src/hip/.libs/libswiftHIP.a $(MPI_LIBS) $(EXTRA_LIBS) $(HIP_LIBS) -lamdhip64 + # Sources for fof fof_SOURCES = swift_fof.c fof_CFLAGS = $(MYFLAGS) $(AM_CFLAGS) -DENGINE_POLICY="engine_policy_keep $(ENGINE_POLICY_SETAFFINITY)" From ecef19e47fa08936b2f4bae71ba76dbb6d5f2c5b Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Mon, 21 Oct 2024 17:09:56 +0100 Subject: [PATCH 003/217] Added AC_PROG_CXX to configure.ac --- configure.ac | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/configure.ac b/configure.ac index 30fc62a147..59fc40aba5 100644 --- a/configure.ac +++ b/configure.ac @@ -41,6 +41,10 @@ AC_USE_SYSTEM_EXTENSIONS AC_PROG_CC AM_PROG_CC_C_O +# Find and test the C++ compiler. +AC_PROG_CXX +AC_PROG_CXX_C_O + # We need this for compilation hints and possibly FFTW. AX_OPENMP From 5cb37051cccea272f20994676fc86b677be8b21b Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Mon, 21 Oct 2024 17:27:06 +0100 Subject: [PATCH 004/217] Added first GPU files: src/runner_gpu_pack_functionc.c and src/cuda/part_gpu.h --- src/Makefile.am | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/src/Makefile.am b/src/Makefile.am index 8099524651..7881a8fff8 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -18,6 +18,9 @@ # Add the non-standard paths to the included library headers AM_CFLAGS = $(HDF5_CPPFLAGS) $(GSL_INCS) $(FFTW_INCS) $(NUMA_INCS) $(GRACKLE_INCS) $(SUNDIALS_INCS) $(CHEALPIX_CFLAGS) +# Add HIP Path +AM_CFLAGS += -D__HIP_PLATFORM_AMD__ + # Assign a "safe" version number AM_LDFLAGS = $(HDF5_LDFLAGS) $(FFTW_LIBS) @@ -40,6 +43,22 @@ lib_LTLIBRARIES += libswiftsim_mpi.la noinst_LTLIBRARIES += libgrav_mpi.la endif +# Build a cuda version too? +if HAVECUDA +lib_LTLIBRARIES += libswiftsim_cuda.la +if HAVEMPI +lib_LTLIBRARIES += libswiftsim_mpicuda.la +endif +endif + +# Build a hip version too? 
+if HAVEHIP +lib_LTLIBRARIES += libswiftsim_hip.la +if HAVEMPI +lib_LTLIBRARIES += libswiftsim_mpihip.la +endif +endif + # List required headers include_HEADERS = space.h runner.h queue.h task.h lock.h cell.h part.h const.h include_HEADERS += cell_hydro.h cell_stars.h cell_grav.h cell_sinks.h cell_black_holes.h cell_rt.h cell_grid.h @@ -161,7 +180,7 @@ endif AM_SOURCES = space.c space_rebuild.c space_regrid.c space_unique_id.c AM_SOURCES += space_sort.c space_split.c space_extras.c space_first_init.c space_init.c AM_SOURCES += space_cell_index.c space_recycle.c -AM_SOURCES += runner_main.c runner_doiact_hydro.c runner_doiact_limiter.c +AM_SOURCES += runner_main.c runner_doiact_hydro.c runner_doiact_limiter.c runner_gpu_pack_functions.c AM_SOURCES += runner_doiact_stars.c runner_doiact_black_holes.c runner_ghost.c AM_SOURCES += runner_recv.c runner_pack.c AM_SOURCES += runner_sort.c runner_drift.c runner_black_holes.c runner_time_integration.c @@ -208,7 +227,7 @@ AM_SOURCES += $(SPHM1RT_RT_SOURCES) AM_SOURCES += $(GEAR_RT_SOURCES) # Include files for distribution, not installation. -nobase_noinst_HEADERS = align.h approx_math.h atomic.h barrier.h cycle.h error.h inline.h kernel_hydro.h kernel_gravity.h +nobase_noinst_HEADERS = align.h approx_math.h atomic.h barrier.h cycle.h error.h inline.h kernel_hydro.h kernel_gravity.h runner_gpu_pack_functions.h nobase_noinst_HEADERS += gravity_iact.h kernel_long_gravity.h vector.h accumulate.h cache.h exp.h log.h nobase_noinst_HEADERS += runner_doiact_nosort.h runner_doiact_hydro.h runner_doiact_stars.h runner_doiact_black_holes.h runner_doiact_grav.h nobase_noinst_HEADERS += runner_doiact_functions_hydro.h runner_doiact_functions_stars.h runner_doiact_functions_black_holes.h From 4edf58ef1bf1da377cddb5ac95e0502bc6319d3d Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Mon, 21 Oct 2024 17:29:44 +0100 Subject: [PATCH 005/217] ACTUALLY added first GPU files: src/runner_gpu_pack_functionc.c and src/cuda/part_gpu.h --- src/cuda/part_gpu.h | 411 +++++++ src/runner_gpu_pack_functions.c | 1934 +++++++++++++++++++++++++++++++ src/runner_gpu_pack_functions.h | 160 +++ 3 files changed, 2505 insertions(+) create mode 100755 src/cuda/part_gpu.h create mode 100755 src/runner_gpu_pack_functions.c create mode 100755 src/runner_gpu_pack_functions.h diff --git a/src/cuda/part_gpu.h b/src/cuda/part_gpu.h new file mode 100755 index 0000000000..1af9029416 --- /dev/null +++ b/src/cuda/part_gpu.h @@ -0,0 +1,411 @@ +#ifndef PART_GPU_H +#define PART_GPU_H +/* Config parameters. */ +#include "../../config.h" +#include "../align.h" +typedef int8_t timebin_t; + +#ifdef __WITH_CUDA +extern "C" { +#endif + +#include "/usr/local/cuda-12.3/targets/x86_64-linux/include/vector_types.h" + +typedef struct part_soa { + /*Task ID*/ + int *tid_p; + /*bundle ID*/ + int *bid_p; + /*! Particle unique ID. */ + long long *id; + /*! Pointer to corresponding gravity part. */ + // struct gpu_gpart* gpart; + /*! Particle position. */ + double *x_p; + double *y_p; + double *z_p; + /*! Particle predicted velocity. */ + float *ux; + float *uy; + float *uz; + /*! Particle acceleration. */ + float *a_hydrox; + float *a_hydroy; + float *a_hydroz; + /*! Particle mass. */ + float *mass; + /*! Particle smoothing length. */ + float *h; + /*! Particle internal energy. */ + float *u; + /*! Time derivative of the internal energy. */ + float *u_dt; + /*! Particle density. */ + float *rho; + /*! Kernel summation (For testing/debugging). */ + float *SPH_sum; + + /* Cell information */ + /*! 
The cell location on the grid (corner nearest to the origin). */ + float *locx; + float *locy; + float *locz; + /*! The cell dimensions. */ + float *widthx; + float *widthy; + float *widthz; + float *h_max; + int *count_p; + int *count_test; + /* Density information */ + + /*! Neighbour number count. */ + float *wcount; + + /*! Derivative of the neighbour number with respect to h. */ + float *wcount_dh; + + /*! Derivative of density with respect to h */ + float *rho_dh; + + /*! Particle velocity curl. */ + float *rot_ux; + float *rot_uy; + float *rot_uz; + + /* viscosity information */ + + /*! Particle velocity divergence */ + float *div_v; + + /*! Particle velocity divergence from previous step */ + float *div_v_previous_step; + + /*! Artificial viscosity parameter */ + float *alpha_visc; + + /*! Signal velocity */ + float *v_sig; + + /* thermal diffusion information */ + + /*! del^2 u, a smoothed quantity */ + float *laplace_u; + + /*! Thermal diffusion coefficient */ + float *alpha_diff; + + /* force information */ + + /*! "Grad h" term -- only partial in P-U */ + float *f; + + /*! Particle soundspeed. */ + float *soundspeed; + + /*! Time derivative of smoothing length */ + float *h_dt; + + /*! Balsara switch */ + float *balsara; + + /*! Particle pressure. */ + float *pressure; + /*! Maximal alpha (viscosity) over neighbours */ + float *alpha_visc_max_ngb; + + /* timestep stuff */ + + /*! Time-step length */ + timebin_t *time_bin; + + /*all part of struct timestep_limiter_data, we had to destruct it + as GPUs don't like pointer chasing especially when memcpying*/ + /* Need waking-up ? */ + timebin_t *wakeup; + + /*! Minimal time-bin across all neighbours */ + timebin_t *min_ngb_time_bin; + + /* Do we want this particle to be synched back on the time-line? */ + char *to_be_synchronized; +} part_soa; +/*Container for particle data requierd for density calcs*/ +typedef struct part_aos { + + /*! Particle position. */ + double x_p; + double y_p; + double z_p; + + /*! Particle position. */ + double locx; + double locy; + double locz; + + /*! Particle predicted velocity. */ + float ux; + float uy; + float uz; + /*! Particle mass. */ + float mass; + /*! Particle smoothing length. */ + float h; + /*! Particle density. */ + float rho; + + /* Density information */ + /*! Neighbour number count. */ + float wcount; + /*! Derivative of the neighbour number with respect to h. */ + float wcount_dh; + /*! Derivative of density with respect to h */ + float rho_dh; + /*! Particle velocity curl. */ + float rot_ux; + float rot_uy; + float rot_uz; + + /* viscosity information */ + /*! Particle velocity divergence */ + float div_v; + + /* timestep stuff */ + /*! Time-step length */ + int time_bin; +} part_aos; + +/*Container for particle data requierd for density calcs*/ +typedef struct part_aos_f4_send { + /*! Particle position and h -> x, y, z, h */ + float4 x_p_h; + + /*! Particle predicted velocity and mass -> ux, uy, uz, m */ + float4 ux_m; + /*Markers for where neighbour cell j starts and stops in array indices for pair tasks*/ + int2 cjs_cje; +}part_aos_f4_send __attribute__((aligned(SWIFT_STRUCT_ALIGNMENT))); + +typedef struct part_aos_f4_recv{ + /* Density information; rho */ + /*! Derivative of density with respect to h; rho_dh, + * Neighbour number count; w_count + * * Derivative of the neighbour number with respect to h; w_count_dh */ + float4 rho_dh_wcount; + /*! 
Particle velocity curl; rot_ux and + * velocity divergence; div_v */ + float4 rot_ux_div_v; +} part_aos_f4_recv; + +/*Container for particle data required for density calcs*/ +typedef struct part_aos_f4 { + /*! Particle position and h -> x, y, z, h */ + float4 x_p_h; + + /*! Particle predicted velocity and mass -> ux, uy, uz, m */ + float4 ux_m; + /* Density information; rho */ + /*! Derivative of density with respect to h; rho_dh, + * Neighbour number count; w_count + * * Derivative of the neighbour number with respect to h; w_count_dh */ + float4 rho_dh_wcount; + + /*! Particle velocity curl; rot_ux and + * velocity divergence; div_v */ + float4 rot_ux_div_v; + +} part_aos_f4; + +/*Container for particle data required for force calcs*/ +typedef struct part_aos_f { + + /*! Particle position. */ + double x_p; + double y_p; + double z_p; + + /*! Particle predicted velocity. */ + float ux; + float uy; + float uz; + /*! Particle mass. */ + float mass; + /*! Particle smoothing length. */ + float h; + /*! Particle density. */ + float rho; + /*! Particle pressure. */ + float pressure; + + /* Density information */ + /*! Speed of sound. */ + float soundspeed; + /*! Variable smoothing length term */ + float f; + /*! Derivative of density with respect to h */ + float balsara; + /*! Particle velocity curl. */ + float alpha_visc; + float a_hydrox; + float a_hydroy; + float a_hydroz; + float alpha_diff; + + /* viscosity information */ + /*! Internal energy */ + float u; + float u_dt; + /*! h time derivative */ + float h_dt; + float v_sig; + + /* timestep stuff */ + /*! Time-step length */ + int time_bin; + int min_ngb_time_bin; +} part_aos_f; + +/*Container for particle data requierd for force calcs*/ +typedef struct part_aos_f4_f { + + /*Data required for the calculation: + Values read to local GPU memory*/ + /*! Particle position smoothing length */ + float4 x_h; + /*! Particle predicted velocity and mass */ + float4 ux_m; + /*! Variable smoothing length term f, balsara, timebin + * and initial value of min neighbour timebin */ + float4 f_bals_timebin_mintimebin_ngb; + /*! Particle density, pressure, speed of sound & v_sig to read*/ + float4 rho_p_c_vsigi; + /*! Particle Internal energy u, alpha constants for visc and diff */ + float3 u_alphavisc_alphadiff; + + /*Result: Values output to global GPU memory*/ + /* change of u and h with dt, v_sig and returned value of + * minimum neighbour timebin */ + float4 udt_hdt_vsig_mintimebin_ngb; + /*Particle acceleration vector*/ + float3 a_hydro; + +} part_aos_f4_f; + +/*Container for particle data requierd for force calcs*/ +typedef struct part_aos_f4_f_send { + + /*Data required for the calculation: + Values read to local GPU memory*/ + /*! Particle position smoothing length */ + float4 x_h; + /*! Particle predicted velocity and mass */ + float4 ux_m; + /*! Variable smoothing length term f, balsara, timebin + * and initial value of min neighbour timebin */ + float4 f_bals_timebin_mintimebin_ngb; + /*! Particle density, pressure, speed of sound & v_sig to read*/ + float4 rho_p_c_vsigi; + /*! 
Particle Internal energy u, alpha constants for visc and diff */ + float3 u_alphavisc_alphadiff; + + int2 cjs_cje; + +} part_aos_f4_f_send; + +/*Container for particle data requierd for force calcs*/ +typedef struct part_aos_f4_f_recv { + + /*Result: Values output to global GPU memory*/ + /* change of u and h with dt, v_sig and returned value of + * minimum neighbour timebin */ + float4 udt_hdt_vsig_mintimebin_ngb; + /*Particle acceleration vector*/ + float3 a_hydro; + +} part_aos_f4_f_recv; + +/*Container for particle data requierd for gradient calcs*/ +typedef struct part_aos_g { + + /*! Particle position. */ + double x_p; + double y_p; + double z_p; + + /*! Particle velocity. */ + float ux; + float uy; + float uz; + /*! Particle mass. */ + float mass; + /*! Particle smoothing length. */ + float h; + /*! Particle density. */ + float rho; + + /* viscosity information */ + float visc_alpha; + float laplace_u; + float alpha_visc_max_ngb; + float v_sig; + + float u; + + float soundspeed; + + /* timestep stuff */ + /*! Time-step length */ + int time_bin; +} part_aos_g; + +/*Container for particle data requierd for gradient calcs*/ +typedef struct part_aos_f4_g { + + /*! Particle position & smoothing length */ + float4 x_h; + + /*! Particle velocity and mass */ + float4 ux_m; + + /*! Particle density alpha visc internal energy u and speed of sound c */ + float4 rho_avisc_u_c; + + /* viscosity information results */ + float3 vsig_lapu_aviscmax_empty; + +} part_aos_f4_g; + +/*Container for particle data requierd for gradient calcs*/ +typedef struct part_aos_f4_g_send { + + /*! Particle position & smoothing length */ + float4 x_h; + + /*! Particle velocity and mass */ + float4 ux_m; + + /*! Particle density alpha visc internal energy u and speed of sound c */ + float4 rho_avisc_u_c; + + /* viscosity information results */ + float3 vsig_lapu_aviscmax; + + /*Data for cell start and end*/ + int2 cjs_cje; + +} part_aos_f4_g_send; + +/*Container for particle data requierd for gradient calcs*/ +typedef struct part_aos_f4_g_recv { + + /* viscosity information results */ + float3 vsig_lapu_aviscmax; + +} part_aos_f4_g_recv; + + +#ifdef __WITH_CUDA +} +#endif + +#endif // PART_GPU_H diff --git a/src/runner_gpu_pack_functions.c b/src/runner_gpu_pack_functions.c new file mode 100755 index 0000000000..1394fb0cad --- /dev/null +++ b/src/runner_gpu_pack_functions.c @@ -0,0 +1,1934 @@ +//#include "active.h" +//#include +//#include +//#include "cuda/cell_gpu.h" +//#include "runner_gpu_functions.cuh" +/* This object's header. */ +#include "runner.h" +/* Local headers. */ +#include "active.h" +#include "engine.h" +#include "runner_gpu_pack_functions.h" +#include "scheduler.h" +#include "space_getsid.h" +#include "timers.h" + +//#ifdef WITHCUDA +//extern "C" { +//#endif + +void runner_doself1_gpu_pack_neat( + struct runner *r, struct cell *c, struct part_soa parts_soa_buffer, + int timer, int *pack_length, int tid, int count_max_parts_tmp) { + + TIMER_TIC; + + /* Anything to do here? */ + if (c->hydro.count == 0) + return; + + int count = c->hydro.count; + int local_pack_position = (*pack_length); + +#ifdef SWIFT_DEBUG_CHECKS + if (local_pack_position + count >= count_max_parts_tmp) { + fprintf(stderr, "Exceeded count_max_parts_tmp. 
Make arrays bigger!\n"); + exit(0); + } +#endif + + /* Pack the particle data into CPU-side buffers*/ + pack_neat(c, parts_soa_buffer, tid, local_pack_position, count); + /* Increment pack length accordingly */ + (*pack_length) += count; + + if (timer) + TIMER_TOC(timer_doself_gpu_pack); +} + +void runner_doself1_gpu_pack_neat_aos( + struct runner *r, struct cell *c, struct part_aos *parts_aos_buffer, + int timer, int *pack_length, int tid, int count_max_parts_tmp) { + + TIMER_TIC; + + /* Anything to do here? */ + if (c->hydro.count == 0) + return; + + int count = c->hydro.count; + int local_pack_position = (*pack_length); + +#ifdef SWIFT_DEBUG_CHECKS + if (local_pack_position + count >= count_max_parts_tmp) { + fprintf(stderr, "Exceeded count_max_parts_tmp. Make arrays bigger!\n"); + error("0"); + } +#endif + + /* Pack the particle data into CPU-side buffers*/ + pack_neat_aos(c, parts_aos_buffer, tid, local_pack_position, count); + /* Increment pack length accordingly */ + (*pack_length) += count; + + if (timer) + TIMER_TOC(timer_doself_gpu_pack); +} + +void runner_doself1_gpu_pack_neat_aos_f4( + struct runner *r, struct cell * __restrict__ c, struct part_aos_f4_send * __restrict__ parts_aos_buffer, + int timer, int *pack_length, int tid, int count_max_parts_tmp) { + + TIMER_TIC; + + /* Anything to do here? */ + if (c->hydro.count == 0) + return; + + int count = c->hydro.count; + int local_pack_position = (*pack_length); + +#ifdef SWIFT_DEBUG_CHECKS + if (local_pack_position + count >= count_max_parts_tmp) { + fprintf(stderr, "Exceeded count_max_parts_tmp. Make arrays bigger!\n"); + error("0"); + } +#endif + int2 frst_lst_prts = {local_pack_position, local_pack_position + count}; + /* Pack the particle data into CPU-side buffers*/ + pack_neat_aos_f4(c, parts_aos_buffer, tid, local_pack_position, count, frst_lst_prts); + /* Increment pack length accordingly */ + (*pack_length) += count; + + if (timer) + TIMER_TOC(timer_doself_gpu_pack); +} + +void runner_doself1_gpu_pack_neat_aos_g( + struct runner *r, struct cell *c, struct part_aos_g *parts_aos_buffer, + int timer, int *pack_length, int tid, int count_max_parts_tmp) { + + TIMER_TIC; + + /* Anything to do here? */ + if (c->hydro.count == 0) + return; + + int count = c->hydro.count; + int local_pack_position = (*pack_length); + +#ifdef SWIFT_DEBUG_CHECKS + if (local_pack_position + count >= count_max_parts_tmp) { + fprintf(stderr, "Exceeded count_max_parts_tmp. Make arrays bigger!\n"); + exit(0); + } +#endif + + /* Pack the particle data into CPU-side buffers*/ + pack_neat_aos_g(c, parts_aos_buffer, tid, local_pack_position, count); + /* Increment pack length accordingly */ + (*pack_length) += count; + + if (timer) + TIMER_TOC(timer_doself_gpu_pack); +} + +void runner_doself1_gpu_pack_neat_aos_f4_g( + struct runner *r, struct cell *c, struct part_aos_f4_g_send *parts_aos_buffer, + int timer, int *pack_length, int tid, int count_max_parts_tmp) { + + TIMER_TIC; + + /* Anything to do here? */ + if (c->hydro.count == 0) + return; + + int count = c->hydro.count; + int local_pack_position = (*pack_length); + +#ifdef SWIFT_DEBUG_CHECKS + if (local_pack_position + count >= count_max_parts_tmp) { + fprintf(stderr, "Exceeded count_max_parts_tmp. 
Make arrays bigger!\n"); + exit(0); + } +#endif + + /* Pack the particle data into CPU-side buffers*/ + pack_neat_aos_f4_g(c, parts_aos_buffer, tid, local_pack_position, count); + /* Increment pack length accordingly */ + (*pack_length) += count; + + if (timer) + TIMER_TOC(timer_doself_gpu_pack); +} + +void runner_doself1_gpu_pack_neat_aos_f( + struct runner *r, struct cell *c, struct part_aos_f *parts_aos_buffer, + int timer, int *pack_length, int tid, int count_max_parts_tmp) { + + TIMER_TIC; + + /* Anything to do here? */ + if (c->hydro.count == 0) + return; + + int count = c->hydro.count; + int local_pack_position = (*pack_length); + +#ifdef SWIFT_DEBUG_CHECKS + if (local_pack_position + count >= count_max_parts_tmp) { + fprintf(stderr, "Exceeded count_max_parts_tmp. Make arrays bigger!\n"); + exit(0); + } +#endif + + /* Pack the particle data into CPU-side buffers*/ + pack_neat_aos_f(c, parts_aos_buffer, tid, local_pack_position, count); + /* Increment pack length accordingly */ + (*pack_length) += count; + + if (timer) + TIMER_TOC(timer_doself_gpu_pack); +} + +void runner_doself1_gpu_pack_neat_aos_f4_f( + struct runner *r, struct cell *restrict c, struct part_aos_f4_f_send * restrict parts_aos_buffer, + int timer, int *pack_length, int tid, int count_max_parts_tmp) { + + TIMER_TIC; + + /* Anything to do here? */ + if (c->hydro.count == 0) + return; + + int count = c->hydro.count; + int local_pack_position = (*pack_length); + +#ifdef SWIFT_DEBUG_CHECKS + if (local_pack_position + count >= count_max_parts_tmp) { + fprintf(stderr, "Exceeded count_max_parts_tmp. Make arrays bigger!\n"); + exit(0); + } +#endif + + /* Pack the particle data into CPU-side buffers*/ + pack_neat_aos_f4_f(c, parts_aos_buffer, tid, local_pack_position, count); + /* Increment pack length accordingly */ + (*pack_length) += count; + + if (timer) + TIMER_TOC(timer_doself_gpu_pack); +} + +void pack_neat(struct cell *c, struct part_soa parts_soa_buffer, int tid, int local_pack_position, int count) { + + const struct part *ptmps; + ptmps = c->hydro.parts; + for (int i = 0; i < count; i++) { + int id_in_pack = i + local_pack_position; + const struct part p = ptmps[i]; + /*Data to be copied to GPU*/ + parts_soa_buffer.x_p[id_in_pack] = p.x[0]; + parts_soa_buffer.y_p[id_in_pack] = p.x[1]; + parts_soa_buffer.z_p[id_in_pack] = p.x[2]; + parts_soa_buffer.tid_p[id_in_pack] = tid; + parts_soa_buffer.ux[id_in_pack] = p.v[0]; + parts_soa_buffer.uy[id_in_pack] = p.v[1]; + parts_soa_buffer.uz[id_in_pack] = p.v[2]; + parts_soa_buffer.locx[id_in_pack] = c->loc[0]; + parts_soa_buffer.locy[id_in_pack] = c->loc[1]; + parts_soa_buffer.locz[id_in_pack] = c->loc[2]; + parts_soa_buffer.mass[id_in_pack] = p.mass; + parts_soa_buffer.h[id_in_pack] = p.h; +// parts_soa_buffer.time_bin[id_in_pack] = p.time_bin; + /*Initialise sums to zero before CPU/GPU copy*/ + parts_soa_buffer.rho[id_in_pack] = 0.f;//p.rho; + parts_soa_buffer.rho_dh[id_in_pack] = 0.f;//p.density.rho_dh; + parts_soa_buffer.wcount[id_in_pack] = 0.f;//p.density.wcount; + parts_soa_buffer.wcount_dh[id_in_pack] = 0.f;//p.density.wcount_dh; + parts_soa_buffer.div_v[id_in_pack] = 0.f;//p.viscosity.div_v; + parts_soa_buffer.rot_ux[id_in_pack] = 0.f;//p.density.rot_v[0]; + parts_soa_buffer.rot_uy[id_in_pack] = 0.f;//p.density.rot_v[1]; + parts_soa_buffer.rot_uz[id_in_pack] = 0.f;//p.density.rot_v[2]; + } +} + +void pack_neat_aos(struct cell *c, struct part_aos *parts_aos_buffer, int tid, int local_pack_position, int count) { + + const struct part *ptmps; + ptmps = c->hydro.parts; + 
for (int i = 0; i < count; i++) { + int id_in_pack = i + local_pack_position; + const struct part p = ptmps[i]; + /*Data to be copied to GPU*/ + parts_aos_buffer[id_in_pack].x_p = p.x[0]; + parts_aos_buffer[id_in_pack].y_p = p.x[1]; + parts_aos_buffer[id_in_pack].z_p = p.x[2]; + parts_aos_buffer[id_in_pack].ux = p.v[0]; + parts_aos_buffer[id_in_pack].uy = p.v[1]; + parts_aos_buffer[id_in_pack].uz = p.v[2]; + parts_aos_buffer[id_in_pack].mass = p.mass; + parts_aos_buffer[id_in_pack].h = p.h; + parts_aos_buffer[id_in_pack].time_bin = 1000;//p.time_bin; + /*Initialise sums to zero before CPU/GPU copy*/ + parts_aos_buffer[id_in_pack].rho = 0.f;//p.rho; + parts_aos_buffer[id_in_pack].rho_dh = 0.f;//p.density.rho_dh; + parts_aos_buffer[id_in_pack].wcount = 0.f;//p.density.wcount; + parts_aos_buffer[id_in_pack].wcount_dh = 0.f;//p.density.wcount_dh; + parts_aos_buffer[id_in_pack].div_v = 0.f;//p.viscosity.div_v; + parts_aos_buffer[id_in_pack].rot_ux = 0.f;//p.density.rot_v[0]; + parts_aos_buffer[id_in_pack].rot_uy = 0.f;//p.density.rot_v[1]; + parts_aos_buffer[id_in_pack].rot_uz = 0.f;//p.density.rot_v[2]; + } +} + +void pack_neat_pair_aos(struct cell *c, struct part_aos *parts_aos_buffer, int tid, int local_pack_position, int count, float3 shift) { + + const struct part *ptmps; + ptmps = c->hydro.parts; + for (int i = 0; i < count; i++) { + int id_in_pack = i + local_pack_position; + const struct part p = ptmps[i]; + /*Data to be copied to GPU*/ + parts_aos_buffer[id_in_pack].x_p = p.x[0] - shift.x; + parts_aos_buffer[id_in_pack].y_p = p.x[1] - shift.y; + parts_aos_buffer[id_in_pack].z_p = p.x[2] - shift.z; + parts_aos_buffer[id_in_pack].ux = p.v[0]; + parts_aos_buffer[id_in_pack].uy = p.v[1]; + parts_aos_buffer[id_in_pack].uz = p.v[2]; + parts_aos_buffer[id_in_pack].mass = p.mass; + parts_aos_buffer[id_in_pack].h = p.h; + parts_aos_buffer[id_in_pack].time_bin = 1000;//p.time_bin; + /*Initialise sums to zero before CPU/GPU copy*/ + parts_aos_buffer[id_in_pack].rho = 0.f;//p.rho; + parts_aos_buffer[id_in_pack].rho_dh = 0.f;//p.density.rho_dh; + parts_aos_buffer[id_in_pack].wcount = 0.f;//p.density.wcount; + parts_aos_buffer[id_in_pack].wcount_dh = 0.f;//p.density.wcount_dh; + parts_aos_buffer[id_in_pack].div_v = 0.f;//p.viscosity.div_v; + parts_aos_buffer[id_in_pack].rot_ux = 0.f;//p.density.rot_v[0]; + parts_aos_buffer[id_in_pack].rot_uy = 0.f;//p.density.rot_v[1]; + parts_aos_buffer[id_in_pack].rot_uz = 0.f;//p.density.rot_v[2]; + } +} + +extern inline void pack_neat_pair_aos_f4(struct cell * __restrict c, struct part_aos_f4_send * __restrict parts_aos_buffer, int tid, const int local_pack_position, const int count, const float3 shift, const int2 cstarts) { + /*Data to be copied to GPU*/ + for (int i = 0; i < count; i++) { + const int id_in_pack = i + local_pack_position; + parts_aos_buffer[id_in_pack].x_p_h.x = c->hydro.parts[i].x[0] - shift.x; + parts_aos_buffer[id_in_pack].x_p_h.y = c->hydro.parts[i].x[1] - shift.y; + parts_aos_buffer[id_in_pack].x_p_h.z = c->hydro.parts[i].x[2] - shift.z; + parts_aos_buffer[id_in_pack].x_p_h.w = c->hydro.parts[i].h; + parts_aos_buffer[id_in_pack].ux_m.x = c->hydro.parts[i].v[0]; + parts_aos_buffer[id_in_pack].ux_m.y = c->hydro.parts[i].v[1]; + parts_aos_buffer[id_in_pack].ux_m.z = c->hydro.parts[i].v[2]; + parts_aos_buffer[id_in_pack].ux_m.w = c->hydro.parts[i].mass; + parts_aos_buffer[id_in_pack].cjs_cje.x = cstarts.x; + parts_aos_buffer[id_in_pack].cjs_cje.y = cstarts.y; + } +} + +void pack_neat_aos_f4(struct cell * __restrict__ c, struct part_aos_f4_send 
* __restrict__ parts_aos_buffer, int tid, int local_pack_position, int count, int2 frst_lst_prts) { + + struct part ptmps[count]; + memcpy(ptmps, (c->hydro.parts), count * sizeof(struct part)); +// ptmps = c->hydro.parts; + const float cellx = c->loc[0], celly = c->loc[1], cellz = c->loc[2]; + for (int i = 0; i < count; i++) { + const int id_in_pack = i + local_pack_position; +// const struct part p = ptmps[i]; + /*Data to be copied to GPU*/ + parts_aos_buffer[id_in_pack].x_p_h.x = ptmps[i].x[0] - cellx; + parts_aos_buffer[id_in_pack].x_p_h.y = ptmps[i].x[1] - celly; + parts_aos_buffer[id_in_pack].x_p_h.z = ptmps[i].x[2] - cellz; + parts_aos_buffer[id_in_pack].x_p_h.w = ptmps[i].h; + parts_aos_buffer[id_in_pack].ux_m.x = ptmps[i].v[0]; + parts_aos_buffer[id_in_pack].ux_m.y = ptmps[i].v[1]; + parts_aos_buffer[id_in_pack].ux_m.z = ptmps[i].v[2]; + parts_aos_buffer[id_in_pack].ux_m.w = ptmps[i].mass; +// /*Initialise sums to zero before CPU/GPU copy*/ +// const float4 zeroes = {0.0, 0.0, 0.0, 0.0}; +// parts_aos_buffer[id_in_pack].rho_dh_wcount = zeroes; +// parts_aos_buffer[id_in_pack].rot_ux_div_v = zeroes; + } +} + +void pack_neat_aos_g(struct cell *c, struct part_aos_g *parts_aos_buffer, int tid, int local_pack_position, int count) { + + const struct part *ptmps; + ptmps = c->hydro.parts; + for (int i = 0; i < count; i++) { + int id_in_pack = i + local_pack_position; + const struct part p = ptmps[i]; + /*Data to be copied to GPU*/ + parts_aos_buffer[id_in_pack].x_p = p.x[0]; + parts_aos_buffer[id_in_pack].y_p = p.x[1]; + parts_aos_buffer[id_in_pack].z_p = p.x[2]; + parts_aos_buffer[id_in_pack].ux = p.v[0]; + parts_aos_buffer[id_in_pack].uy = p.v[1]; + parts_aos_buffer[id_in_pack].uz = p.v[2]; + parts_aos_buffer[id_in_pack].mass = p.mass; + parts_aos_buffer[id_in_pack].h = p.h; + parts_aos_buffer[id_in_pack].time_bin = 1000; + parts_aos_buffer[id_in_pack].rho = p.rho; + parts_aos_buffer[id_in_pack].visc_alpha = p.viscosity.alpha; + parts_aos_buffer[id_in_pack].alpha_visc_max_ngb = p.force.alpha_visc_max_ngb;//p.density.wcount_dh; + parts_aos_buffer[id_in_pack].v_sig = p.viscosity.v_sig;//p.viscosity.div_v; + parts_aos_buffer[id_in_pack].soundspeed = p.force.soundspeed;//p.density.rot_v[0]; + parts_aos_buffer[id_in_pack].u = p.u;//p.density.rot_v[0]; + /*Initialise sums to zero before CPU/GPU copy*/ + parts_aos_buffer[id_in_pack].laplace_u = 0.f;//p.density.wcount; + } +} + +void pack_neat_aos_f4_g(struct cell *c, struct part_aos_f4_g_send *parts_aos_buffer, int tid, int local_pack_position, int count) { + + const struct part *ptmps; + ptmps = c->hydro.parts; + const float cellx = c->loc[0], celly = c->loc[1], + cellz = c->loc[2]; + for (int i = 0; i < count; i++) { + int id_in_pack = i + local_pack_position; + const struct part p = ptmps[i]; + /*Data to be copied to GPU*/ + parts_aos_buffer[id_in_pack].x_h.x = p.x[0] - cellx; + parts_aos_buffer[id_in_pack].x_h.y = p.x[1] - celly; + parts_aos_buffer[id_in_pack].x_h.z = p.x[2] - cellz; + parts_aos_buffer[id_in_pack].x_h.w = p.h; + parts_aos_buffer[id_in_pack].ux_m.x = p.v[0]; + parts_aos_buffer[id_in_pack].ux_m.y = p.v[1]; + parts_aos_buffer[id_in_pack].ux_m.z = p.v[2]; + parts_aos_buffer[id_in_pack].ux_m.w = p.mass; + parts_aos_buffer[id_in_pack].rho_avisc_u_c.x = p.rho; + parts_aos_buffer[id_in_pack].rho_avisc_u_c.y = p.viscosity.alpha; + parts_aos_buffer[id_in_pack].rho_avisc_u_c.z = p.u;//p.density.rot_v[0]; + parts_aos_buffer[id_in_pack].rho_avisc_u_c.w = p.force.soundspeed;//p.density.rot_v[0]; + } +} + +extern inline void 
pack_neat_pair_aos_f4_g(struct cell * __restrict c, struct part_aos_f4_g_send * __restrict parts_aos_buffer, int tid, const int local_pack_position, const int count, const float3 shift, const int2 cstarts) { + /*Data to be copied to GPU*/ + for (int i = 0; i < count; i++) { + const int id_in_pack = i + local_pack_position; + parts_aos_buffer[id_in_pack].x_h.x = c->hydro.parts[i].x[0] - shift.x; + parts_aos_buffer[id_in_pack].x_h.y = c->hydro.parts[i].x[1] - shift.y; + parts_aos_buffer[id_in_pack].x_h.z = c->hydro.parts[i].x[2] - shift.z; + parts_aos_buffer[id_in_pack].x_h.w = c->hydro.parts[i].h; + parts_aos_buffer[id_in_pack].ux_m.x = c->hydro.parts[i].v[0]; + parts_aos_buffer[id_in_pack].ux_m.y = c->hydro.parts[i].v[1]; + parts_aos_buffer[id_in_pack].ux_m.z = c->hydro.parts[i].v[2]; + parts_aos_buffer[id_in_pack].ux_m.w = c->hydro.parts[i].mass; + parts_aos_buffer[id_in_pack].rho_avisc_u_c.x = c->hydro.parts[i].rho; + parts_aos_buffer[id_in_pack].rho_avisc_u_c.y = c->hydro.parts[i].viscosity.alpha; + parts_aos_buffer[id_in_pack].rho_avisc_u_c.z = c->hydro.parts[i].u;//p.density.rot_v[0]; + parts_aos_buffer[id_in_pack].rho_avisc_u_c.w = c->hydro.parts[i].force.soundspeed;//p.density.rot_v[0]; + parts_aos_buffer[id_in_pack].cjs_cje.x = cstarts.x; + parts_aos_buffer[id_in_pack].cjs_cje.y = cstarts.y; + } +} + +void pack_neat_aos_f(struct cell *c, struct part_aos_f *parts_aos, int tid, int local_pack_position, int count) { + + const struct part *ptmps; + ptmps = c->hydro.parts; + for (int i = 0; i < count; i++) { + int id_in_pack = i + local_pack_position; + const struct part p = ptmps[i]; + /*Data to be copied to GPU*/ + parts_aos[id_in_pack].x_p = p.x[0]; + parts_aos[id_in_pack].y_p = p.x[1]; + parts_aos[id_in_pack].z_p = p.x[2]; + parts_aos[id_in_pack].ux = p.v[0]; + parts_aos[id_in_pack].uy = p.v[1]; + parts_aos[id_in_pack].uz = p.v[2]; + parts_aos[id_in_pack].mass = p.mass; + parts_aos[id_in_pack].h = p.h; + parts_aos[id_in_pack].time_bin = p.time_bin; + parts_aos[id_in_pack].min_ngb_time_bin = p.limiter_data.min_ngb_time_bin; + parts_aos[id_in_pack].rho = p.rho; + parts_aos[id_in_pack].pressure = p.force.pressure; + parts_aos[id_in_pack].soundspeed = p.force.soundspeed; + parts_aos[id_in_pack].f = p.force.f; + parts_aos[id_in_pack].balsara = p.force.balsara; + parts_aos[id_in_pack].alpha_visc = p.viscosity.alpha; + parts_aos[id_in_pack].a_hydrox = 0.0; + parts_aos[id_in_pack].a_hydroy = 0.0; + parts_aos[id_in_pack].a_hydroz = 0.0; + parts_aos[id_in_pack].alpha_diff = p.diffusion.alpha; + parts_aos[id_in_pack].u = p.u; + parts_aos[id_in_pack].u_dt = 0.0; + parts_aos[id_in_pack].h_dt = 0.0; + /*Initialise sums to zero before CPU/GPU copy*/ + parts_aos[id_in_pack].v_sig = p.viscosity.v_sig; + } +} + +void pack_neat_aos_f4_f(const struct cell *restrict c, struct part_aos_f4_f_send *restrict parts_aos, int tid, int local_pack_position, int count) { + +// const struct part *restrict ptmps; +// ptmps = c->hydro.parts; + const int pp = local_pack_position; + const float cellx = c->loc[0]; + const float celly = c->loc[1]; + const float cellz = c->loc[2]; + /*Data to be copied to GPU local memory*/ + for (int i = 0; i < count; i++) { + parts_aos[i + pp].x_h.x = c->hydro.parts[i].x[0] - cellx; + parts_aos[i + pp].x_h.y = c->hydro.parts[i].x[1] - celly; + parts_aos[i + pp].x_h.z = c->hydro.parts[i].x[2] - cellz; + parts_aos[i + pp].x_h.w = c->hydro.parts[i].h; + } + for (int i = 0; i < count; i++) { + parts_aos[i + pp].ux_m.x = c->hydro.parts[i].v[0]; + parts_aos[i + pp].ux_m.y = 
c->hydro.parts[i].v[1]; + parts_aos[i + pp].ux_m.z = c->hydro.parts[i].v[2]; + parts_aos[i + pp].ux_m.w = c->hydro.parts[i].mass; + } + for (int i = 0; i < count; i++) { + parts_aos[i + pp].f_bals_timebin_mintimebin_ngb.x = c->hydro.parts[i].force.f; + parts_aos[i + pp].f_bals_timebin_mintimebin_ngb.y = c->hydro.parts[i].force.balsara; + parts_aos[i + pp].f_bals_timebin_mintimebin_ngb.z = c->hydro.parts[i].time_bin; + parts_aos[i + pp].f_bals_timebin_mintimebin_ngb.w = c->hydro.parts[i].limiter_data.min_ngb_time_bin; + } + for (int i = 0; i < count; i++) { + parts_aos[i + pp].rho_p_c_vsigi.x = c->hydro.parts[i].rho; + parts_aos[i + pp].rho_p_c_vsigi.y = c->hydro.parts[i].force.pressure; + parts_aos[i + pp].rho_p_c_vsigi.z = c->hydro.parts[i].force.soundspeed; + parts_aos[i + pp].rho_p_c_vsigi.w = c->hydro.parts[i].viscosity.v_sig; + } + for (int i = 0; i < count; i++) { + parts_aos[i + pp].u_alphavisc_alphadiff.x = c->hydro.parts[i].u; + parts_aos[i + pp].u_alphavisc_alphadiff.y = c->hydro.parts[i].viscosity.alpha; + parts_aos[i + pp].u_alphavisc_alphadiff.z = c->hydro.parts[i].diffusion.alpha; + } + +} + +extern inline void pack_neat_pair_aos_f4_f(struct cell * __restrict c, struct part_aos_f4_f_send * __restrict parts_aos, int tid, const int local_pack_position, const int count, const float3 shift, const int2 cstarts) { + // const struct part *restrict ptmps; + // ptmps = c->hydro.parts; + const int pp = local_pack_position; + /*Data to be copied to GPU local memory*/ + for (int i = 0; i < count; i++) { + const int id = i + pp; + parts_aos[id].x_h.x = c->hydro.parts[i].x[0] - shift.x; + parts_aos[id].x_h.y = c->hydro.parts[i].x[1] - shift.y; + parts_aos[id].x_h.z = c->hydro.parts[i].x[2] - shift.z; + parts_aos[id].x_h.w = c->hydro.parts[i].h; + parts_aos[id].ux_m.x = c->hydro.parts[i].v[0]; + parts_aos[id].ux_m.y = c->hydro.parts[i].v[1]; + parts_aos[id].ux_m.z = c->hydro.parts[i].v[2]; + parts_aos[id].ux_m.w = c->hydro.parts[i].mass; + parts_aos[id].f_bals_timebin_mintimebin_ngb.x = c->hydro.parts[i].force.f; + parts_aos[id].f_bals_timebin_mintimebin_ngb.y = c->hydro.parts[i].force.balsara; + parts_aos[id].f_bals_timebin_mintimebin_ngb.z = c->hydro.parts[i].time_bin; + parts_aos[id].f_bals_timebin_mintimebin_ngb.w = c->hydro.parts[i].limiter_data.min_ngb_time_bin; + parts_aos[id].rho_p_c_vsigi.x = c->hydro.parts[i].rho; + parts_aos[id].rho_p_c_vsigi.y = c->hydro.parts[i].force.pressure; + parts_aos[id].rho_p_c_vsigi.z = c->hydro.parts[i].force.soundspeed; + parts_aos[id].rho_p_c_vsigi.w = c->hydro.parts[i].viscosity.v_sig; + parts_aos[id].u_alphavisc_alphadiff.x = c->hydro.parts[i].u; + parts_aos[id].u_alphavisc_alphadiff.y = c->hydro.parts[i].viscosity.alpha; + parts_aos[id].u_alphavisc_alphadiff.z = c->hydro.parts[i].diffusion.alpha; + parts_aos[id].cjs_cje.x = cstarts.x; + parts_aos[id].cjs_cje.y = cstarts.y; + } +} + +void runner_doself1_gpu_unpack_neat(struct runner *r, struct cell *c, struct part_soa parts_soa_buffer, + int timer, int *pack_length, int tid, int count_max_parts_tmp, struct engine *e) { + TIMER_TIC; + + /* Anything to do here? */ + if (c->hydro.count == 0) + return; + if (!cell_is_active_hydro(c, e)){ + message("Inactive cell\n"); + return; + } + int count = c->hydro.count; + int local_pack_position = (*pack_length); + +# ifdef SWIFT_DEBUG_CHECKS + if (local_pack_position + count >= count_max_parts_tmp) { + fprintf(stderr, + "Exceeded count_max_parts_tmp. Make arrays bigger! 
pack_length is " + "%i pointer to pack_length is %i, local_pack_position is % i, " + "count is %i\n", + (*pack_length), pack_length, local_pack_position, count, e); + } +#endif + + /* Copy particle data from CPU buffers to cells */ + unpack_neat(c, parts_soa_buffer, tid, local_pack_position, count, e); + // Increment pack length accordingly + (*pack_length) += count; +} + +void runner_doself1_gpu_unpack_neat_aos(struct runner *r, struct cell *c, struct part_aos *parts_aos_buffer, + int timer, int *pack_length, int tid, int count_max_parts_tmp, struct engine *e) { + TIMER_TIC; + + /* Anything to do here? */ + if (c->hydro.count == 0) + return; + if (!cell_is_active_hydro(c, e)){ + message("Inactive cell\n"); + return; + } + int count = c->hydro.count; + int local_pack_position = (*pack_length); + +# ifdef SWIFT_DEBUG_CHECKS + if (local_pack_position + count >= count_max_parts_tmp) { + fprintf(stderr, + "Exceeded count_max_parts_tmp. Make arrays bigger! pack_length is " + "%i pointer to pack_length is %i, local_pack_position is % i, " + "count is %i\n", + (*pack_length), pack_length, local_pack_position, count, e); + } +#endif + + /* Copy particle data from CPU buffers to cells */ + unpack_neat_aos(c, parts_aos_buffer, tid, local_pack_position, count, e); + // Increment pack length accordingly + (*pack_length) += count; +} + +void runner_doself1_gpu_unpack_neat_aos_f4(struct runner *r, struct cell *c, struct part_aos_f4_recv *parts_aos_buffer, + int timer, int *pack_length, int tid, int count_max_parts_tmp, struct engine *e) { + TIMER_TIC; + + /* Anything to do here? */ + if (c->hydro.count == 0) + return; + if (!cell_is_active_hydro(c, e)){ + message("Inactive cell\n"); + return; + } + int count = c->hydro.count; + int local_pack_position = (*pack_length); + +# ifdef SWIFT_DEBUG_CHECKS + if (local_pack_position + count >= count_max_parts_tmp) { + fprintf(stderr, + "Exceeded count_max_parts_tmp. Make arrays bigger! pack_length is " + "%i pointer to pack_length is %i, local_pack_position is % i, " + "count is %i\n", + (*pack_length), pack_length, local_pack_position, count, e); + } +#endif + + /* Copy particle data from CPU buffers to cells */ + unpack_neat_aos_f4(c, parts_aos_buffer, tid, local_pack_position, count, e); + // Increment pack length accordingly + (*pack_length) += count; +} + +void runner_doself1_gpu_unpack_neat_aos_g(struct runner *r, struct cell *c, struct part_aos_g *parts_aos_buffer, + int timer, int *pack_length, int tid, int count_max_parts_tmp, struct engine *e) { + TIMER_TIC; + + /* Anything to do here? */ + if (c->hydro.count == 0) + return; + if (!cell_is_active_hydro(c, e)){ + message("Inactive cell\n"); + return; + } + int count = c->hydro.count; + int local_pack_position = (*pack_length); + +# ifdef SWIFT_DEBUG_CHECKS + if (local_pack_position + count >= count_max_parts_tmp) { + fprintf(stderr, + "Exceeded count_max_parts_tmp. Make arrays bigger! 
pack_length is " + "%i pointer to pack_length is %i, local_pack_position is % i, " + "count is %i\n", + (*pack_length), pack_length, local_pack_position, count, e); + } +#endif + + /* Copy particle data from CPU buffers to cells */ + unpack_neat_aos_g(c, parts_aos_buffer, tid, local_pack_position, count, e); + // Increment pack length accordingly + (*pack_length) += count; +} + + +void runner_doself1_gpu_unpack_neat_aos_f4_g(struct runner *r, struct cell *c, struct part_aos_f4_g_recv *parts_aos_buffer, + int timer, int *pack_length, int tid, int count_max_parts_tmp, struct engine *e) { + TIMER_TIC; + + /* Anything to do here? */ + if (c->hydro.count == 0) + return; + if (!cell_is_active_hydro(c, e)){ + message("Inactive cell\n"); + return; + } + int count = c->hydro.count; + int local_pack_position = (*pack_length); + +# ifdef SWIFT_DEBUG_CHECKS + if (local_pack_position + count >= count_max_parts_tmp) { + fprintf(stderr, + "Exceeded count_max_parts_tmp. Make arrays bigger! pack_length is " + "%i pointer to pack_length is %i, local_pack_position is % i, " + "count is %i\n", + (*pack_length), pack_length, local_pack_position, count, e); + } +#endif + + /* Copy particle data from CPU buffers to cells */ + unpack_neat_aos_f4_g(c, parts_aos_buffer, tid, local_pack_position, count, e); + // Increment pack length accordingly + (*pack_length) += count; +} + +void runner_doself1_gpu_unpack_neat_aos_f(struct runner *r, struct cell *c, struct part_aos_f *parts_aos_buffer, + int timer, int *pack_length, int tid, int count_max_parts_tmp, struct engine *e) { + TIMER_TIC; + + /* Anything to do here? */ + if (c->hydro.count == 0) + return; + if (!cell_is_active_hydro(c, e)){ + message("Inactive cell\n"); + return; + } + int count = c->hydro.count; + int local_pack_position = (*pack_length); + +# ifdef SWIFT_DEBUG_CHECKS + if (local_pack_position + count >= count_max_parts_tmp) { + fprintf(stderr, + "Exceeded count_max_parts_tmp. Make arrays bigger! pack_length is " + "%i pointer to pack_length is %i, local_pack_position is % i, " + "count is %i\n", + (*pack_length), pack_length, local_pack_position, count, e); + } +#endif + + /* Copy particle data from CPU buffers to cells */ + unpack_neat_aos_f(c, parts_aos_buffer, tid, local_pack_position, count, e); + // Increment pack length accordingly + (*pack_length) += count; +} + +void runner_doself1_gpu_unpack_neat_aos_f4_f(struct runner *r, struct cell *c, struct part_aos_f4_f_recv *parts_aos_buffer, + int timer, int *pack_length, int tid, int count_max_parts_tmp, struct engine *e) { + TIMER_TIC; + + /* Anything to do here? */ + if (c->hydro.count == 0) + return; + if (!cell_is_active_hydro(c, e)){ + message("Inactive cell\n"); + return; + } + int count = c->hydro.count; + int local_pack_position = (*pack_length); + +# ifdef SWIFT_DEBUG_CHECKS + if (local_pack_position + count >= count_max_parts_tmp) { + fprintf(stderr, + "Exceeded count_max_parts_tmp. Make arrays bigger! 
pack_length is " + "%i pointer to pack_length is %i, local_pack_position is % i, " + "count is %i\n", + (*pack_length), pack_length, local_pack_position, count, e); + } +#endif + + /* Copy particle data from CPU buffers to cells */ + unpack_neat_aos_f4_f(c, parts_aos_buffer, tid, local_pack_position, count, e); + // Increment pack length accordingly + (*pack_length) += count; +} + +void unpack_neat(struct cell *c, struct part_soa parts_soa_buffer, int tid, int local_pack_position, int count, struct engine *e){ + +// struct part *ptmps; +// ptmps=c->hydro.parts; + +// memcpy(&rho[0], &parts_soa_buffer.rho[local_pack_position], count * sizeof(float)); +// fprintf(stderr, "count %i\n", count); +// memcpy(rho, &parts_soa_buffer.rho[local_pack_position], count * sizeof(float)); +// memcpy(rho_dh, &parts_soa_buffer.rho_dh[local_pack_position], count * sizeof(float)); +// memcpy(wcount, &parts_soa_buffer.wcount[local_pack_position], count * sizeof(float)); +// memcpy(wcount_dh, &parts_soa_buffer.wcount_dh[local_pack_position], count * sizeof(float)); +// memcpy(div_v, &parts_soa_buffer.div_v[local_pack_position], count * sizeof(float)); +// memcpy(rot_ux, &parts_soa_buffer.rot_ux[local_pack_position], count * sizeof(float)); +// memcpy(rot_uy, &parts_soa_buffer.rot_uy[local_pack_position], count * sizeof(float)); +// memcpy(rot_uz, &parts_soa_buffer.rot_uz[local_pack_position], count * sizeof(float)); + float *rho = &parts_soa_buffer.rho[local_pack_position];// = calloc(count, sizeof(float));// + float *rho_dh = &parts_soa_buffer.rho_dh[local_pack_position];// = calloc(count, sizeof(float));// + float *wcount = &parts_soa_buffer.wcount[local_pack_position];// = calloc(count, sizeof(float));// + float *wcount_dh = &parts_soa_buffer.wcount_dh[local_pack_position];// = calloc(count, sizeof(float));// + float *div_v = &parts_soa_buffer.div_v[local_pack_position];// = calloc(count, sizeof(float));// + float *rot_ux = &parts_soa_buffer.rot_ux[local_pack_position];// = calloc(count, sizeof(float));// + float *rot_uy = &parts_soa_buffer.rot_uy[local_pack_position];// = calloc(count, sizeof(float));// + float *rot_uz = &parts_soa_buffer.rot_uz[local_pack_position];// = calloc(count, sizeof(float));// + +// fprintf(stderr, "rho %f rho %f\n", rho[1], parts_soa_buffer.rho[local_pack_position+1]); + for (int i = 0; i < count; i++) { +// int id_in_pack = i + local_pack_position; +// struct part *part_cpu = &c->hydro.parts[i]; + struct part *pi = &c->hydro.parts[i]; +// if (part_is_inhibited(pi, e)) { +// fprintf(stderr, "inhibited part\n"); +// continue; +// } +// const int pi_active = part_is_active(pi, e); +// if (pi_active) { + pi->rho += rho[i]; +// c->hydro.parts[i].rho += parts_soa_buffer.rho[id_in_pack]; + pi->density.rho_dh += rho_dh[i]; + pi->density.wcount += wcount[i]; + pi->density.wcount_dh += wcount_dh[i]; + pi->viscosity.div_v += div_v[i]; + pi->density.rot_v[0] += rot_ux[i]; + pi->density.rot_v[1] += rot_uy[i]; + pi->density.rot_v[2] += rot_uz[i]; + +// c->hydro.parts[i].rho += rho[i]; +// c->hydro.parts[i].density.rho_dh += rho_dh[i]; +// c->hydro.parts[i].density.wcount += wcount[i]; +// c->hydro.parts[i].density.wcount_dh += wcount_dh[i]; +// c->hydro.parts[i].viscosity.div_v += div_v[i]; +// c->hydro.parts[i].density.rot_v[0] += rot_ux[i]; +// c->hydro.parts[i].density.rot_v[1] += rot_uy[i]; +// c->hydro.parts[i].density.rot_v[2] += rot_uz[i]; + +// c->hydro.parts[i].rho += parts_tmp->rho[i]; +// c->hydro.parts[i].density.rho_dh += parts_tmp->rho_dh[i]; +// c->hydro.parts[i].density.wcount 
+= parts_tmp->wcount[i]; +// c->hydro.parts[i].density.wcount_dh += parts_tmp->wcount_dh[i]; +// c->hydro.parts[i].viscosity.div_v += parts_tmp->div_v[i]; +// c->hydro.parts[i].density.rot_v[0] += parts_tmp->rot_ux[i]; +// c->hydro.parts[i].density.rot_v[1] += parts_tmp->rot_uy[i]; +// c->hydro.parts[i].density.rot_v[2] += parts_tmp->rot_uz[i]; + +// part_cpu[i].rho += parts_soa_buffer.rho[i]; +// part_cpu[i].density.rho_dh += parts_soa_buffer.rho_dh[i]; +// part_cpu[i].density.wcount += parts_soa_buffer.wcount[i]; +// part_cpu[i].density.wcount_dh += parts_soa_buffer.wcount_dh[i]; +// part_cpu[i].viscosity.div_v += parts_soa_buffer.div_v[i]; +// part_cpu[i].density.rot_v[0] += parts_soa_buffer.rot_ux[i]; +// part_cpu[i].density.rot_v[1] += parts_soa_buffer.rot_uy[i]; +// part_cpu[i].density.rot_v[2] += parts_soa_buffer.rot_uz[i]; +// } +// else fprintf(stderr,"a part is not active\n"); + } +// c->hydro.parts=ptmps; +} +void unpack_neat_aos(struct cell *c, struct part_aos *parts_aos_buffer, int tid, int local_pack_position, int count, struct engine *e){ + + // float *rho = &parts_aos_buffer[local_pack_position].rho;// = calloc(count, sizeof(float));// + // float *rho_dh = &parts_aos_buffer[local_pack_position].rho_dh;// = calloc(count, sizeof(float));// + // float *wcount = &parts_aos_buffer[local_pack_position].wcount;// = calloc(count, sizeof(float));// + // float *wcount_dh = &parts_aos_buffer[local_pack_position].wcount_dh;// = calloc(count, sizeof(float));// + // float *div_v = &parts_aos_buffer[local_pack_position].div_v;// = calloc(count, sizeof(float));// + // float *rot_ux = &parts_aos_buffer[local_pack_position].rot_ux;// = calloc(count, sizeof(float));// + // float *rot_uy = &parts_aos_buffer[local_pack_position].rot_uy;// = calloc(count, sizeof(float));// + // float *rot_uz = &parts_aos_buffer[local_pack_position].rot_uz;// = calloc(count, sizeof(float));// + struct part_aos *parts_tmp = &parts_aos_buffer[local_pack_position]; + for (int i = 0; i < count; i++) { + + struct part_aos p_tmp = parts_tmp[i]; + struct part *p = &c->hydro.parts[i]; + p->rho += p_tmp.rho; + p->density.rho_dh += p_tmp.rho_dh; + p->density.wcount += p_tmp.wcount; + p->density.wcount_dh += p_tmp.wcount_dh; + p->viscosity.div_v += p_tmp.div_v; + p->density.rot_v[0] += p_tmp.rot_ux; + p->density.rot_v[1] += p_tmp.rot_uy; + p->density.rot_v[2] += p_tmp.rot_uz; + } +} +#include +void unpack_neat_aos_f4(struct cell *c, struct part_aos_f4_recv *parts_aos_buffer, int tid, int local_pack_position, int count, struct engine *e){ + + // float *rho = &parts_aos_buffer[local_pack_position].rho;// = calloc(count, sizeof(float));// + // float *rho_dh = &parts_aos_buffer[local_pack_position].rho_dh;// = calloc(count, sizeof(float));// + // float *wcount = &parts_aos_buffer[local_pack_position].wcount;// = calloc(count, sizeof(float));// + // float *wcount_dh = &parts_aos_buffer[local_pack_position].wcount_dh;// = calloc(count, sizeof(float));// + // float *div_v = &parts_aos_buffer[local_pack_position].div_v;// = calloc(count, sizeof(float));// + // float *rot_ux = &parts_aos_buffer[local_pack_position].rot_ux;// = calloc(count, sizeof(float));// + // float *rot_uy = &parts_aos_buffer[local_pack_position].rot_uy;// = calloc(count, sizeof(float));// + // float *rot_uz = &parts_aos_buffer[local_pack_position].rot_uz;// = calloc(count, sizeof(float));// + struct part_aos_f4_recv *parts_tmp = &parts_aos_buffer[local_pack_position]; + for (int i = 0; i < count; i++) { + + struct part_aos_f4_recv p_tmp = parts_tmp[i]; + float4 
rho_dh_wcount = p_tmp.rho_dh_wcount; + float4 rot_ux_div_v = p_tmp.rot_ux_div_v; + struct part *p = &c->hydro.parts[i]; + + p->rho += rho_dh_wcount.x; + p->density.rho_dh += rho_dh_wcount.y; + p->density.wcount += rho_dh_wcount.z; + p->density.wcount_dh += rho_dh_wcount.w; + p->density.rot_v[0] += rot_ux_div_v.x; + p->density.rot_v[1] += rot_ux_div_v.y; + p->density.rot_v[2] += rot_ux_div_v.z; + p->viscosity.div_v += rot_ux_div_v.w; +// fprintf(stderr, "rho %f div_v %f\n", p_tmp.rho_dh_wcount.x, p_tmp.rot_ux_div_v.w); + } +} + +void unpack_neat_aos_g(struct cell *c, struct part_aos_g *parts_aos_buffer, int tid, int local_pack_position, int count, struct engine *e){ + + struct part_aos_g *parts_tmp = &parts_aos_buffer[local_pack_position]; + for (int i = 0; i < count; i++) { + struct part_aos_g p_tmp = parts_tmp[i]; + struct part *p = &c->hydro.parts[i]; + const float v_sig = p->viscosity.v_sig; + p->viscosity.v_sig = max(p_tmp.v_sig, v_sig); + p->diffusion.laplace_u += p_tmp.laplace_u; + const float max_ngb = p->force.alpha_visc_max_ngb; + p->force.alpha_visc_max_ngb = max(p_tmp.alpha_visc_max_ngb, max_ngb); + } + +} + +void unpack_neat_aos_f4_g(struct cell *c, struct part_aos_f4_g_recv *parts_aos_buffer, int tid, int local_pack_position, int count, struct engine *e){ + + struct part_aos_f4_g_recv *parts_tmp = &parts_aos_buffer[local_pack_position]; + for (int i = 0; i < count; i++) { + struct part_aos_f4_g_recv p_tmp = parts_tmp[i]; + struct part *p = &c->hydro.parts[i]; + const float v_sig = p->viscosity.v_sig; + p->viscosity.v_sig = fmaxf(p_tmp.vsig_lapu_aviscmax.x, v_sig); + p->diffusion.laplace_u += p_tmp.vsig_lapu_aviscmax.y; + const float max_ngb = p->force.alpha_visc_max_ngb; + p->force.alpha_visc_max_ngb = fmaxf(p_tmp.vsig_lapu_aviscmax.z, max_ngb); + } + +} + +void unpack_neat_aos_f(struct cell *c, struct part_aos_f *parts_aos_buffer, int tid, int local_pack_position, int count, struct engine *e){ + + struct part_aos_f *parts_tmp = &parts_aos_buffer[local_pack_position]; + for (int i = 0; i < count; i++) { + struct part_aos_f p_tmp = parts_tmp[i]; + struct part *p = &c->hydro.parts[i]; + p->a_hydro[0] += p_tmp.a_hydrox; + p->a_hydro[1] += p_tmp.a_hydroy; + p->a_hydro[2] += p_tmp.a_hydroz; + p->u_dt += p_tmp.u_dt; + p->force.h_dt += p_tmp.h_dt; +// p->limiter_data.min_ngb_time_bin = min(p_tmp.min_ngb_time_bin, p->limiter_data.min_ngb_time_bin); + p->limiter_data.min_ngb_time_bin = p_tmp.min_ngb_time_bin; + const float v_sig = p->viscosity.v_sig; + p->viscosity.v_sig = max(p_tmp.v_sig, v_sig); +// p->viscosity.v_sig = p_tmp.v_sig; + +// fprintf(stderr, "ax %f ay %f az %f\n", p_tmp.a_hydrox, p_tmp.a_hydroy, p_tmp.a_hydroz); + } + +} + +void unpack_neat_aos_f4_f(struct cell *restrict c, struct part_aos_f4_f_recv *restrict parts_aos_buffer, int tid, int local_pack_position, int count, struct engine *e){ + +// struct part_aos_f4_f_recv *restrict parts_tmp = &parts_aos_buffer[local_pack_position]; + int pp = local_pack_position; + for (int i = 0; i < count; i++) { +// struct part_aos_f4_f_recv p_tmp = parts_tmp[i]; +// struct part *restrict p = &c->hydro.parts[i]; + c->hydro.parts[i].a_hydro[0] += parts_aos_buffer[i + pp].a_hydro.x; + c->hydro.parts[i].a_hydro[1] += parts_aos_buffer[i + pp].a_hydro.y; + c->hydro.parts[i].a_hydro[2] += parts_aos_buffer[i + pp].a_hydro.z; + } + for (int i = 0; i < count; i++) { + c->hydro.parts[i].viscosity.v_sig = fmaxf(parts_aos_buffer[i + pp].udt_hdt_vsig_mintimebin_ngb.z, c->hydro.parts[i].viscosity.v_sig); + 
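+    /* min_ngb_time_bin is carried back in the .w component of the packed
+       float4; add 0.5f and truncate to round it back to the integer bin. */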
c->hydro.parts[i].limiter_data.min_ngb_time_bin = (int)(parts_aos_buffer[i + pp].udt_hdt_vsig_mintimebin_ngb.w + 0.5f); + } + for (int i = 0; i < count; i++) { + c->hydro.parts[i].u_dt += parts_aos_buffer[i + pp].udt_hdt_vsig_mintimebin_ngb.x; + c->hydro.parts[i].force.h_dt += parts_aos_buffer[i + pp].udt_hdt_vsig_mintimebin_ngb.y; + } + +} + +void unpack_neat_pair(struct runner *r, struct cell *c, struct part_soa parts_soa_buffer, int tid, int local_pack_position, int count, struct engine *e){ + +// struct part *ptmps; +// ptmps=c->hydro.parts; + for (int i = 0; i < count; i++) { + int id_in_pack = i + local_pack_position; +// struct part *pi = &c->hydro.parts[i]; +// if (part_is_inhibited(pi, e)) { +// fprintf(stderr, "inhibited part\n"); +// continue; +// } +// const int pi_active = part_is_active(pi, e); +// if (pi_active) { + c->hydro.parts[i].rho += parts_soa_buffer.rho[id_in_pack]; + c->hydro.parts[i].density.rho_dh += parts_soa_buffer.rho_dh[id_in_pack]; + c->hydro.parts[i].density.wcount += parts_soa_buffer.wcount[id_in_pack]; + c->hydro.parts[i].density.wcount_dh += parts_soa_buffer.wcount_dh[id_in_pack]; + c->hydro.parts[i].viscosity.div_v += parts_soa_buffer.div_v[id_in_pack]; + c->hydro.parts[i].density.rot_v[0] += parts_soa_buffer.rot_ux[id_in_pack]; + c->hydro.parts[i].density.rot_v[1] += parts_soa_buffer.rot_uy[id_in_pack]; + c->hydro.parts[i].density.rot_v[2] += parts_soa_buffer.rot_uz[id_in_pack]; +// if(r->cpuid == 0)fprintf(stderr, "i %i rho %lf\n", i, parts_soa_buffer.rho[id_in_pack]); +// } +// else fprintf(stderr,"a part is not active\n"); + } +// c->hydro.parts=ptmps; +} + +void unpack_neat_pair_aos(struct runner *r, struct cell *c, struct part_aos *parts_aos_buffer, int tid, int local_pack_position, int count, struct engine *e){ + +// float *rho = &parts_aos_buffer[local_pack_position].rho;// = calloc(count, sizeof(float));// +// float *rho_dh = &parts_aos_buffer[local_pack_position].rho_dh;// = calloc(count, sizeof(float));// +// float *wcount = &parts_aos_buffer[local_pack_position].wcount;// = calloc(count, sizeof(float));// +// float *wcount_dh = &parts_aos_buffer[local_pack_position].wcount_dh;// = calloc(count, sizeof(float));// +// float *div_v = &parts_aos_buffer[local_pack_position].div_v;// = calloc(count, sizeof(float));// +// float *rot_ux = &parts_aos_buffer[local_pack_position].rot_ux;// = calloc(count, sizeof(float));// +// float *rot_uy = &parts_aos_buffer[local_pack_position].rot_uy;// = calloc(count, sizeof(float));// +// float *rot_uz = &parts_aos_buffer[local_pack_position].rot_uz;// = calloc(count, sizeof(float));// + struct part_aos *parts_tmp = &parts_aos_buffer[local_pack_position]; +// struct part *ptmps; +// ptmps=c->hydro.parts; +// struct part *part_cpu = c->hydro.parts; + for (int i = 0; i < count; i++) { +// int id_in_pack = i + local_pack_position; +// struct part_aos part_gpu = parts_aos_buffer[id_in_pack]; +// struct part *pi = &c->hydro.parts[i]; +// if (part_is_inhibited(pi, e)) { +// fprintf(stderr, "inhibited part\n"); +// continue; +// } +// const int pi_active = part_is_active(pi, e); +// if (pi_active) { +// if(parts_aos_buffer[id_in_pack].time_bin == 1000)(*count1000)++ ;//fprintf(stderr, "timebin %i\n", parts_aos_buffer[id_in_pack].time_bin); +// else if(parts_aos_buffer[id_in_pack].time_bin == 20)(*count20)++ ;//fprintf(stderr, "timebin %i\n", parts_aos_buffer[id_in_pack].time_bin); +// else fprintf(stderr, "not 20 or 1000\n"); +// + struct part_aos p_tmp = parts_tmp[i]; + struct part *p = &c->hydro.parts[i]; + p->rho += 
p_tmp.rho; + p->density.rho_dh += p_tmp.rho_dh; + p->density.wcount += p_tmp.wcount; + p->density.wcount_dh += p_tmp.wcount_dh; + p->viscosity.div_v += p_tmp.div_v; + p->density.rot_v[0] += p_tmp.rot_ux; + p->density.rot_v[1] += p_tmp.rot_uy; + p->density.rot_v[2] += p_tmp.rot_uz; + +// c->hydro.parts[i].rho += parts_aos_buffer[id_in_pack].rho; +// c->hydro.parts[i].density.rho_dh += parts_aos_buffer[id_in_pack].rho_dh; +// c->hydro.parts[i].density.wcount += parts_aos_buffer[id_in_pack].wcount; +// c->hydro.parts[i].density.wcount_dh += parts_aos_buffer[id_in_pack].wcount_dh; +// c->hydro.parts[i].viscosity.div_v += parts_aos_buffer[id_in_pack].div_v; +// c->hydro.parts[i].density.rot_v[0] += parts_aos_buffer[id_in_pack].rot_ux; +// c->hydro.parts[i].density.rot_v[1] += parts_aos_buffer[id_in_pack].rot_uy; +// c->hydro.parts[i].density.rot_v[2] += parts_aos_buffer[id_in_pack].rot_uz; + +// part_cpu[i].rho += part_gpu.rho; +// part_cpu[i].density.rho_dh += part_gpu.rho_dh; +// part_cpu[i].density.wcount += part_gpu.wcount; +// part_cpu[i].density.wcount_dh += part_gpu.wcount_dh; +// part_cpu[i].viscosity.div_v += part_gpu.div_v; +// part_cpu[i].density.rot_v[0] += part_gpu.rot_ux; +// part_cpu[i].density.rot_v[1] += part_gpu.rot_uy; +// part_cpu[i].density.rot_v[2] += part_gpu.rot_uz; +// if(r->cpuid == 0)fprintf(stderr, "i %i rho %lf\n", i, parts_soa_buffer.rho[id_in_pack]); +// } +// else fprintf(stderr,"a part is not active\n"); + } +// c->hydro.parts=ptmps; +} + +void unpack_neat_pair_aos_f4(struct runner *r, struct cell * restrict c, struct part_aos_f4_recv * restrict parts_aos_buffer, int tid, int local_pack_position, + int count, struct engine *e){ + +// struct part_aos_f4_recv * restrict parts_tmp = &parts_aos_buffer[local_pack_position]; + if (cell_is_active_hydro(c, e)){ + int pp = local_pack_position; + for (int i = 0; i < count; i++) { + int j = i + pp; + c->hydro.parts[i].rho += parts_aos_buffer[j].rho_dh_wcount.x; + c->hydro.parts[i].density.rho_dh += parts_aos_buffer[j].rho_dh_wcount.y; + c->hydro.parts[i].density.wcount += parts_aos_buffer[j].rho_dh_wcount.z; + c->hydro.parts[i].density.wcount_dh += parts_aos_buffer[j].rho_dh_wcount.w; + c->hydro.parts[i].density.rot_v[0] += parts_aos_buffer[j].rot_ux_div_v.x; + c->hydro.parts[i].density.rot_v[1] += parts_aos_buffer[j].rot_ux_div_v.y; + c->hydro.parts[i].density.rot_v[2] += parts_aos_buffer[j].rot_ux_div_v.z; + c->hydro.parts[i].viscosity.div_v += parts_aos_buffer[j].rot_ux_div_v.w; + } + } + +} + +void unpack_neat_pair_aos_g(struct runner *r, struct cell *c, struct part_aos_g *parts_aos_buffer, int tid, int local_pack_position, int count, struct engine *e){ + struct part_aos_g *parts_tmp = &parts_aos_buffer[local_pack_position]; + for (int i = 0; i < count; i++) { + struct part_aos_g p_tmp = parts_tmp[i]; + struct part *p = &c->hydro.parts[i]; + p->viscosity.v_sig = p_tmp.v_sig; + p->diffusion.laplace_u += p_tmp.laplace_u; + p->force.alpha_visc_max_ngb = p_tmp.alpha_visc_max_ngb; + } +} + +void unpack_neat_pair_aos_f4_g(struct runner *r, struct cell * restrict c, struct part_aos_f4_g_recv * restrict parts_aos_buffer, int tid, int local_pack_position, + int count, struct engine *e){ +// struct part_aos_f4_recv * restrict parts_tmp = &parts_aos_buffer[local_pack_position]; +// int pp = local_pack_position; +// for (int i = 0; i < count; i++) { +// int j = i + pp; +// c->hydro.parts[i].viscosity.v_sig = parts_aos_buffer[j].vsig_lapu_aviscmax.x; +// c->hydro.parts[i].diffusion.laplace_u += 
parts_aos_buffer[j].vsig_lapu_aviscmax.y; +// c->hydro.parts[i].force.alpha_visc_max_ngb = parts_aos_buffer[j].vsig_lapu_aviscmax.z; +// } + if (cell_is_active_hydro(c, e)){ + + struct part_aos_f4_g_recv *parts_tmp = &parts_aos_buffer[local_pack_position]; + for (int i = 0; i < count; i++) { + struct part_aos_f4_g_recv p_tmp = parts_tmp[i]; + struct part *p = &c->hydro.parts[i]; + const float v_sig = p->viscosity.v_sig; + p->viscosity.v_sig = fmaxf(p_tmp.vsig_lapu_aviscmax.x, v_sig); + p->diffusion.laplace_u += p_tmp.vsig_lapu_aviscmax.y; + const float max_ngb = p->force.alpha_visc_max_ngb; + p->force.alpha_visc_max_ngb = fmaxf(p_tmp.vsig_lapu_aviscmax.z, max_ngb); + } + } +} + + +void unpack_neat_pair_aos_f(struct runner *r, struct cell *c, struct part_aos_f *parts_aos_buffer, int tid, int local_pack_position, int count, struct engine *e){ + struct part_aos_f *parts_tmp = &parts_aos_buffer[local_pack_position]; + for (int i = 0; i < count; i++) { + struct part_aos_f p_tmp = parts_tmp[i]; + struct part *p = &c->hydro.parts[i]; + p->a_hydro[0] += p_tmp.a_hydrox; + p->a_hydro[1] += p_tmp.a_hydroy; + p->a_hydro[2] += p_tmp.a_hydroz; + p->u_dt += p_tmp.u_dt; + p->force.h_dt += p_tmp.h_dt; + const float v_sig = p->viscosity.v_sig; + p->viscosity.v_sig = max(p_tmp.v_sig, v_sig); + p->limiter_data.min_ngb_time_bin = p_tmp.min_ngb_time_bin; +// p->viscosity.v_sig = p_tmp.v_sig; + } +} + +void unpack_neat_pair_aos_f4_f(struct runner *r, struct cell * restrict c, struct part_aos_f4_f_recv * restrict parts_aos_buffer, int tid, int local_pack_position, + int count, struct engine *e){ +// struct part_aos_f4_f_recv *restrict parts_tmp = &parts_aos_buffer[local_pack_position]; + if (cell_is_active_hydro(c, e)){ + int pp = local_pack_position; + for (int i = 0; i < count; i++) { +// struct part_aos_f4_f_recv p_tmp = parts_tmp[i]; +// struct part *restrict p = &c->hydro.parts[i]; + int j = i + pp; + c->hydro.parts[i].a_hydro[0] += parts_aos_buffer[j].a_hydro.x; + c->hydro.parts[i].a_hydro[1] += parts_aos_buffer[j].a_hydro.y; + c->hydro.parts[i].a_hydro[2] += parts_aos_buffer[j].a_hydro.z; + c->hydro.parts[i].viscosity.v_sig = fmaxf(parts_aos_buffer[j].udt_hdt_vsig_mintimebin_ngb.z, c->hydro.parts[i].viscosity.v_sig); + c->hydro.parts[i].limiter_data.min_ngb_time_bin = (int)(parts_aos_buffer[j].udt_hdt_vsig_mintimebin_ngb.w + 0.5f); + c->hydro.parts[i].u_dt += parts_aos_buffer[j].udt_hdt_vsig_mintimebin_ngb.x; + c->hydro.parts[i].force.h_dt += parts_aos_buffer[j].udt_hdt_vsig_mintimebin_ngb.y; + } + } +} + +void runner_do_ci_cj_gpu_unpack_neat(struct runner *r, struct cell *ci, struct cell *cj, struct part_soa parts_soa_buffer, +int timer, int *pack_length, int tid, int count_max_parts_tmp, struct engine *e) { + TIMER_TIC; + + /* Anything to do here? */ +// if (c->hydro.count == 0) +// return; + if (!cell_is_active_hydro(ci, e)){ + message("Inactive cell\n"); + return; + } + int count_ci = ci->hydro.count; + int count_cj = cj->hydro.count; + int local_pack_position = (*pack_length); + +# ifdef SWIFT_DEBUG_CHECKS + if (local_pack_position + count_ci + count_cj >= count_max_parts_tmp) { + fprintf(stderr, + "Exceeded count_max_parts_tmp. Make arrays bigger! 
pack_length is "
+            "%i, pointer to pack_length is %p, local_pack_position is %i, "
+            "count is %i\n",
+            (*pack_length), (void *)pack_length, local_pack_position, count_ci);
+  }
+#endif
+
+  /* Unpack the particle data from the CPU-side buffers */
+//  if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j %i\n", local_pack_position, count_ci, count_cj);
+  unpack_neat_pair(r, ci, parts_soa_buffer, tid, local_pack_position, count_ci, e);
+  local_pack_position += count_ci;
+  /* Unpack the particle data from the CPU-side buffers */
+//  if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j %i\n", local_pack_position, count_ci, count_cj);
+  unpack_neat_pair(r, cj, parts_soa_buffer, tid, local_pack_position, count_cj, e);
+  /* Increment pack length accordingly */
+  (*pack_length) += count_ci + count_cj;
+
+  if (timer)
+    TIMER_TOC(timer_doself_gpu_pack);
+//  if(r->cpuid == 0)exit(0);
+}
+
+void runner_do_ci_cj_gpu_unpack_neat_aos(struct runner *r, struct cell *ci, struct cell *cj, struct part_aos *parts_aos_buffer,
+int timer, int *pack_length, int tid, int count_max_parts_tmp, struct engine *e) {
+
+  /* Anything to do here? */
+//  if (c->hydro.count == 0)
+//    return;
+  if (!cell_is_active_hydro(ci, e)){
+    message("Inactive cell\n");
+    return;
+  }
+  int count_ci = ci->hydro.count;
+  int count_cj = cj->hydro.count;
+  int local_pack_position = (*pack_length);
+
+# ifdef SWIFT_DEBUG_CHECKS
+  if (local_pack_position + count_ci + count_cj >= count_max_parts_tmp) {
+    fprintf(stderr,
+            "Exceeded count_max_parts_tmp. Make arrays bigger! pack_length is "
+            "%i, pointer to pack_length is %p, local_pack_position is %i, "
+            "count is %i\n",
+            (*pack_length), (void *)pack_length, local_pack_position, count_ci);
+  }
+#endif
+
+  /* Unpack the particle data from the CPU-side buffers */
+//  if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j %i\n", local_pack_position, count_ci, count_cj);
+  unpack_neat_pair_aos(r, ci, parts_aos_buffer, tid, local_pack_position, count_ci, e);
+  local_pack_position += count_ci;
+  /* Unpack the particle data from the CPU-side buffers */
+//  if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j %i\n", local_pack_position, count_ci, count_cj);
+  unpack_neat_pair_aos(r, cj, parts_aos_buffer, tid, local_pack_position, count_cj, e);
+  /* Increment pack length accordingly */
+  (*pack_length) += count_ci + count_cj;
+//  if(r->cpuid == 0)exit(0);
+}
+
+void runner_do_ci_cj_gpu_unpack_neat_aos_f4(struct runner *r, struct cell *ci, struct cell *cj, struct part_aos_f4_recv *parts_aos_buffer,
+int timer, int *pack_length, int tid, int count_max_parts_tmp, struct engine *e) {
+
+  /* Anything to do here? */
+//  if (c->hydro.count == 0)
+//    return;
+  if (!cell_is_active_hydro(ci, e) && !cell_is_active_hydro(cj, e)){
+    message("Inactive cell\n");
+    return;
+  }
+  int count_ci = ci->hydro.count;
+  int count_cj = cj->hydro.count;
+  int local_pack_position = (*pack_length);
+
+# ifdef SWIFT_DEBUG_CHECKS
+  if (local_pack_position + count_ci + count_cj >= count_max_parts_tmp) {
+    fprintf(stderr,
+            "Exceeded count_max_parts_tmp. Make arrays bigger! 
pack_length is " + "%i pointer to pack_length is %i, local_pack_position is % i, " + "count is %i\n", + (*pack_length), pack_length, local_pack_position, count_ci, e); + } +#endif + + /* Pack the particle data into CPU-side buffers*/ +// if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j %i\n", local_pack_position, count_ci, count_cj); + unpack_neat_pair_aos_f4(r, ci, parts_aos_buffer, tid, local_pack_position, count_ci, e); + local_pack_position += count_ci; + /* Pack the particle data into CPU-side buffers*/ +// if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j %i\n", local_pack_position, count_ci, count_cj); + unpack_neat_pair_aos_f4(r, cj, parts_aos_buffer, tid, local_pack_position, count_cj, e); + /* Increment pack length accordingly */ + (*pack_length) += count_ci + count_cj; +// if(r->cpuid == 0)exit(0); +} + +void runner_do_ci_cj_gpu_unpack_neat_aos_g(struct runner *r, struct cell *ci, struct cell *cj, struct part_aos_g *parts_aos_buffer, +int timer, int *pack_length, int tid, int count_max_parts_tmp, struct engine *e) { + + /* Anything to do here? */ +// if (c->hydro.count == 0) +// return; + if (!cell_is_active_hydro(ci, e)){ + message("Inactive cell\n"); + return; + } + int count_ci = ci->hydro.count; + int count_cj = cj->hydro.count; + int local_pack_position = (*pack_length); + +# ifdef SWIFT_DEBUG_CHECKS + if (local_pack_position + count_ci + count_cj >= count_max_parts_tmp) { + fprintf(stderr, + "Exceeded count_max_parts_tmp. Make arrays bigger! pack_length is " + "%i pointer to pack_length is %i, local_pack_position is % i, " + "count is %i\n", + (*pack_length), pack_length, local_pack_position, count_ci, e); + } +#endif + + /* Pack the particle data into CPU-side buffers*/ +// if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j %i\n", local_pack_position, count_ci, count_cj); + unpack_neat_pair_aos_g(r, ci, parts_aos_buffer, tid, local_pack_position, count_ci, e); + local_pack_position += count_ci; + /* Pack the particle data into CPU-side buffers*/ +// if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j %i\n", local_pack_position, count_ci, count_cj); + unpack_neat_pair_aos_g(r, cj, parts_aos_buffer, tid, local_pack_position, count_cj, e); + /* Increment pack length accordingly */ + (*pack_length) += count_ci + count_cj; +// if(r->cpuid == 0)exit(0); +} + +void runner_do_ci_cj_gpu_unpack_neat_aos_f4_g(struct runner *r, struct cell *ci, struct cell *cj, struct part_aos_f4_g_recv *parts_aos_buffer, +int timer, int *pack_length, int tid, int count_max_parts_tmp, struct engine *e) { + + /* Anything to do here? */ +// if (c->hydro.count == 0) +// return; + if (!cell_is_active_hydro(ci, e) && !cell_is_active_hydro(cj, e)){ + message("Inactive cell\n"); + return; + } + int count_ci = ci->hydro.count; + int count_cj = cj->hydro.count; + int local_pack_position = (*pack_length); + +# ifdef SWIFT_DEBUG_CHECKS + if (local_pack_position + count_ci + count_cj >= count_max_parts_tmp) { + fprintf(stderr, + "Exceeded count_max_parts_tmp. Make arrays bigger! 
pack_length is " + "%i pointer to pack_length is %i, local_pack_position is % i, " + "count is %i\n", + (*pack_length), pack_length, local_pack_position, count_ci, e); + } +#endif + + /* Pack the particle data into CPU-side buffers*/ +// if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j %i\n", local_pack_position, count_ci, count_cj); + unpack_neat_pair_aos_f4_g(r, ci, parts_aos_buffer, tid, local_pack_position, count_ci, e); + local_pack_position += count_ci; + /* Pack the particle data into CPU-side buffers*/ +// if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j %i\n", local_pack_position, count_ci, count_cj); + unpack_neat_pair_aos_f4_g(r, cj, parts_aos_buffer, tid, local_pack_position, count_cj, e); + /* Increment pack length accordingly */ + (*pack_length) += count_ci + count_cj; +// if(r->cpuid == 0)exit(0); +} + +void runner_do_ci_cj_gpu_unpack_neat_aos_f(struct runner *r, struct cell *ci, struct cell *cj, struct part_aos_f *parts_aos_buffer, +int timer, int *pack_length, int tid, int count_max_parts_tmp, struct engine *e) { + + /* Anything to do here? */ +// if (c->hydro.count == 0) +// return; + if (!cell_is_active_hydro(ci, e)){ + message("Inactive cell\n"); + return; + } + int count_ci = ci->hydro.count; + int count_cj = cj->hydro.count; + int local_pack_position = (*pack_length); + +# ifdef SWIFT_DEBUG_CHECKS + if (local_pack_position + count_ci + count_cj >= count_max_parts_tmp) { + fprintf(stderr, + "Exceeded count_max_parts_tmp. Make arrays bigger! pack_length is " + "%i pointer to pack_length is %i, local_pack_position is % i, " + "count is %i\n", + (*pack_length), pack_length, local_pack_position, count_ci, e); + } +#endif + + /* Pack the particle data into CPU-side buffers*/ +// if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j %i\n", local_pack_position, count_ci, count_cj); + unpack_neat_pair_aos_f(r, ci, parts_aos_buffer, tid, local_pack_position, count_ci, e); + local_pack_position += count_ci; +// for (int i = 0; i < count_ci; i++){ +// struct part *p = &ci->hydro.parts[i]; +// fprintf(stderr, "ax %f, ay %f, az %f, u_dt %f, h_dt %f\n", p->a_hydro[0], p->a_hydro[1], p->a_hydro[2], p->u_dt, p->force.h_dt); +// } +// p->viscosity.v_sig = p_tmp.v_sig; + /* Pack the particle data into CPU-side buffers*/ +// if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j %i\n", local_pack_position, count_ci, count_cj); + unpack_neat_pair_aos_f(r, cj, parts_aos_buffer, tid, local_pack_position, count_cj, e); + /* Increment pack length accordingly */ + (*pack_length) += count_ci + count_cj; +// if(r->cpuid == 0)exit(0); +} + +void runner_do_ci_cj_gpu_unpack_neat_aos_f4_f(struct runner *r, struct cell *ci, struct cell *cj, struct part_aos_f4_f_recv *parts_aos_buffer, +int timer, int *pack_length, int tid, int count_max_parts_tmp, struct engine *e) { + + /* Anything to do here? */ + // if (c->hydro.count == 0) + // return; + if (!cell_is_active_hydro(ci, e) && !cell_is_active_hydro(cj, e)){ + message("Inactive cell\n"); + return; + } + int count_ci = ci->hydro.count; + int count_cj = cj->hydro.count; + int local_pack_position = (*pack_length); + + # ifdef SWIFT_DEBUG_CHECKS + if (local_pack_position + count_ci + count_cj >= count_max_parts_tmp) { + fprintf(stderr, + "Exceeded count_max_parts_tmp. Make arrays bigger! 
pack_length is " + "%i pointer to pack_length is %i, local_pack_position is % i, " + "count is %i\n", + (*pack_length), pack_length, local_pack_position, count_ci, e); + } + #endif + + /* Pack the particle data into CPU-side buffers*/ + // if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j %i\n", local_pack_position, count_ci, count_cj); + unpack_neat_pair_aos_f4_f(r, ci, parts_aos_buffer, tid, local_pack_position, count_ci, e); + local_pack_position += count_ci; + /* Pack the particle data into CPU-side buffers*/ + // if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j %i\n", local_pack_position, count_ci, count_cj); + unpack_neat_pair_aos_f4_f(r, cj, parts_aos_buffer, tid, local_pack_position, count_cj, e); + /* Increment pack length accordingly */ + (*pack_length) += count_ci + count_cj; + // if(r->cpuid == 0)exit(0); +} + +void runner_dopair_gpu_pack_neat( + struct runner *r, struct cell *c, struct part_soa parts_soa_buffer, + int timer, int *pack_length, int tid, int count_max_parts_tmp) { + + TIMER_TIC; + + /* Anything to do here? */ + if (c->hydro.count == 0) + return; + + int count = c->hydro.count; + int local_pack_position = (*pack_length); + +#ifdef SWIFT_DEBUG_CHECKS + if (local_pack_position + count >= count_max_parts_tmp) { + fprintf(stderr, "Exceeded count_max_parts_tmp. Make arrays bigger!\n"); + exit(0); + } +#endif + + /* Pack the particle data into CPU-side buffers*/ + pack_neat(c, parts_soa_buffer, tid, local_pack_position, count); + /* Increment pack length accordingly */ + (*pack_length) += count; + + if (timer) + TIMER_TOC(timer_doself_gpu_pack); +} + +void runner_do_ci_cj_gpu_pack_neat( + struct runner *r, struct cell *ci, struct cell *cj, struct part_soa parts_soa_buffer, + int timer, int *pack_length, int tid, int count_max_parts_tmp, int count_ci, int count_cj) { + + TIMER_TIC; + + /* Anything to do here? */ + if (ci->hydro.count == 0) + return; + + int local_pack_position = (*pack_length); + +#ifdef SWIFT_DEBUG_CHECKS + if (local_pack_position + count_ci + count_cj >= 2 * count_max_parts_tmp) { + fprintf(stderr, "Exceeded count_max_parts_tmp. Make arrays bigger!\n"); + exit(0); + } +#endif + + /* Pack the particle data into CPU-side buffers*/ + pack_neat(ci, parts_soa_buffer, tid, local_pack_position, count_ci); + local_pack_position += count_ci; + /* Pack the particle data into CPU-side buffers*/ + pack_neat(cj, parts_soa_buffer, tid, local_pack_position, count_cj); + /* Increment pack length accordingly */ + (*pack_length) += count_ci + count_cj; + + if (timer) + TIMER_TOC(timer_doself_gpu_pack); +} + +void runner_do_ci_cj_gpu_pack_neat_aos( + struct runner *r, struct cell *ci, struct cell *cj, struct part_aos *parts_aos_buffer, + int timer, int *pack_length, int tid, int count_max_parts_tmp, int count_ci, int count_cj, float3 shift_tmp) { + + TIMER_TIC; + + /* Anything to do here? */ + if (ci->hydro.count == 0) + return; + + int local_pack_position = (*pack_length); + +#ifdef SWIFT_DEBUG_CHECKS + if (local_pack_position + count_ci + count_cj >= 2 * count_max_parts_tmp) { + fprintf(stderr, "Exceeded count_max_parts_tmp. Make arrays bigger! 
Pack pos %i" + "ci %i cj %i count_max %i\n", local_pack_position, count_ci, count_cj, count_max_parts_tmp); + error(); + } +#endif + + /* Pack the particle data into CPU-side buffers*/ + float3 shift_i = {shift_tmp.x + cj->loc[0], shift_tmp.y + cj->loc[1], shift_tmp.z + cj->loc[2]}; + pack_neat_pair_aos(ci, parts_aos_buffer, tid, local_pack_position, count_ci, shift_i); + local_pack_position += count_ci; + /* Pack the particle data into CPU-side buffers*/ + float3 shift_j = {cj->loc[0], cj->loc[1], cj->loc[2]}; + pack_neat_pair_aos(cj, parts_aos_buffer, tid, local_pack_position, count_cj, shift_j); + /* Increment pack length accordingly */ + (*pack_length) += count_ci + count_cj; + + if (timer) + TIMER_TOC(timer_doself_gpu_pack); +} + +void runner_do_ci_cj_gpu_pack_neat_aos_f4( + struct runner *r, struct cell * restrict ci, struct cell * restrict cj, struct part_aos_f4_send * restrict parts_aos_buffer, + int timer, int *pack_length, int tid, int count_max_parts_tmp, const int count_ci, const int count_cj, float3 shift_tmp){ + + TIMER_TIC; + + /* Anything to do here? */ + if (ci->hydro.count == 0) + return; + + int local_pack_position = (*pack_length); + +#ifdef SWIFT_DEBUG_CHECKS + if (local_pack_position + count_ci + count_cj >= 2 * count_max_parts_tmp) { + fprintf(stderr, "Exceeded count_max_parts_tmp. Make arrays bigger! Pack pos %i" + "ci %i cj %i count_max %i\n", local_pack_position, count_ci, count_cj, count_max_parts_tmp); + error(); + } +#endif + + /* Pack the particle data into CPU-side buffers*/ + const float3 shift_i = {shift_tmp.x + cj->loc[0], shift_tmp.y + cj->loc[1], shift_tmp.z + cj->loc[2]}; + const int lpp1 = local_pack_position; + + const int2 cis_cie = {local_pack_position, local_pack_position + count_ci}; + + const int2 cjs_cje = {local_pack_position + count_ci, local_pack_position + count_ci + count_cj}; + + pack_neat_pair_aos_f4(ci, parts_aos_buffer, tid, lpp1, count_ci, shift_i, cjs_cje); + + local_pack_position += count_ci; + /* Pack the particle data into CPU-side buffers*/ + const float3 shift_j = {cj->loc[0], cj->loc[1], cj->loc[2]}; + const int lpp2 = local_pack_position; + + pack_neat_pair_aos_f4(cj, parts_aos_buffer, tid, lpp2, count_cj, shift_j, cis_cie); + /* Increment pack length accordingly */ + (*pack_length) += count_ci + count_cj; + + if (timer) + TIMER_TOC(timer_doself_gpu_pack); +} + +void runner_do_ci_cj_gpu_pack_neat_aos_g( + struct runner *r, struct cell *ci, struct cell *cj, struct part_aos_g *parts_aos_buffer, + int timer, int *pack_length, int tid, int count_max_parts_tmp, int count_ci, int count_cj) { + + TIMER_TIC; + + /* Anything to do here? */ + if (ci->hydro.count == 0) + return; + + int local_pack_position = (*pack_length); + +#ifdef SWIFT_DEBUG_CHECKS + if (local_pack_position + count_ci + count_cj >= 2 * count_max_parts_tmp) { + fprintf(stderr, "Exceeded count_max_parts_tmp. Make arrays bigger! 
Pack pos %i" + "ci %i cj %i count_max %i\n", local_pack_position, count_ci, count_cj, count_max_parts_tmp); + error(); + } +#endif + + /* Pack the particle data into CPU-side buffers*/ + pack_neat_aos_g(ci, parts_aos_buffer, tid, local_pack_position, count_ci); + local_pack_position += count_ci; + /* Pack the particle data into CPU-side buffers*/ + pack_neat_aos_g(cj, parts_aos_buffer, tid, local_pack_position, count_cj); + /* Increment pack length accordingly */ + (*pack_length) += count_ci + count_cj; + + if (timer) + TIMER_TOC(timer_doself_gpu_pack); +} + + +void runner_do_ci_cj_gpu_pack_neat_aos_f4_g( + struct runner *r, struct cell * restrict ci, struct cell * restrict cj, struct part_aos_f4_g_send * restrict parts_aos_buffer, + int timer, int *pack_length, int tid, int count_max_parts_tmp, const int count_ci, const int count_cj, float3 shift_tmp) { + + TIMER_TIC; + + /* Anything to do here? */ + if (ci->hydro.count == 0) + return; + + int local_pack_position = (*pack_length); + + #ifdef SWIFT_DEBUG_CHECKS + if (local_pack_position + count_ci + count_cj >= 2 * count_max_parts_tmp) { + fprintf(stderr, "Exceeded count_max_parts_tmp. Make arrays bigger! Pack pos %i" + "ci %i cj %i count_max %i\n", local_pack_position, count_ci, count_cj, count_max_parts_tmp); + error(); + } + #endif + + /* Pack the particle data into CPU-side buffers*/ + const float3 shift_i = {shift_tmp.x + cj->loc[0], shift_tmp.y + cj->loc[1], shift_tmp.z + cj->loc[2]}; + const int lpp1 = local_pack_position; + + const int2 cis_cie = {local_pack_position, local_pack_position + count_ci}; + + const int2 cjs_cje = {local_pack_position + count_ci, local_pack_position + count_ci + count_cj}; + + pack_neat_pair_aos_f4_g(ci, parts_aos_buffer, tid, lpp1, count_ci, shift_i, cjs_cje); + + local_pack_position += count_ci; + /* Pack the particle data into CPU-side buffers*/ + const float3 shift_j = {cj->loc[0], cj->loc[1], cj->loc[2]}; + const int lpp2 = local_pack_position; + + pack_neat_pair_aos_f4_g(cj, parts_aos_buffer, tid, lpp2, count_cj, shift_j, cis_cie); + /* Increment pack length accordingly */ + (*pack_length) += count_ci + count_cj; + + if (timer) + TIMER_TOC(timer_doself_gpu_pack); +} + +void runner_do_ci_cj_gpu_pack_neat_aos_f( + struct runner *r, struct cell *ci, struct cell *cj, struct part_aos_f *parts_aos_buffer, + int timer, int *pack_length, int tid, int count_max_parts_tmp, int count_ci, int count_cj) { + + TIMER_TIC; + + /* Anything to do here? */ + if (ci->hydro.count == 0) + return; + + int local_pack_position = (*pack_length); + +#ifdef SWIFT_DEBUG_CHECKS + if (local_pack_position + count_ci + count_cj >= 2 * count_max_parts_tmp) { + fprintf(stderr, "Exceeded count_max_parts_tmp. Make arrays bigger! 
Pack pos %i" + "ci %i cj %i count_max %i\n", local_pack_position, count_ci, count_cj, count_max_parts_tmp); + error(); + } +#endif + + /* Pack the particle data into CPU-side buffers*/ + pack_neat_aos_f(ci, parts_aos_buffer, tid, local_pack_position, count_ci); + local_pack_position += count_ci; + /* Pack the particle data into CPU-side buffers*/ + pack_neat_aos_f(cj, parts_aos_buffer, tid, local_pack_position, count_cj); + /* Increment pack length accordingly */ + (*pack_length) += count_ci + count_cj; + + if (timer) + TIMER_TOC(timer_doself_gpu_pack); +} + +void runner_do_ci_cj_gpu_pack_neat_aos_f4_f( + struct runner *r, struct cell * restrict ci, struct cell * restrict cj, struct part_aos_f4_f_send * restrict parts_aos_buffer, + int timer, int *pack_length, int tid, int count_max_parts_tmp, const int count_ci, const int count_cj, float3 shift_tmp) { + + TIMER_TIC; + + /* Anything to do here? */ + if (ci->hydro.count == 0) + return; + + int local_pack_position = (*pack_length); + + #ifdef SWIFT_DEBUG_CHECKS + if (local_pack_position + count_ci + count_cj >= 2 * count_max_parts_tmp) { + fprintf(stderr, "Exceeded count_max_parts_tmp. Make arrays bigger! Pack pos %i" + "ci %i cj %i count_max %i\n", local_pack_position, count_ci, count_cj, count_max_parts_tmp); + error(); + } + #endif + + /* Pack the particle data into CPU-side buffers*/ + const float3 shift_i = {shift_tmp.x + cj->loc[0], shift_tmp.y + cj->loc[1], shift_tmp.z + cj->loc[2]}; + const int lpp1 = local_pack_position; + + const int2 cis_cie = {local_pack_position, local_pack_position + count_ci}; + + const int2 cjs_cje = {local_pack_position + count_ci, local_pack_position + count_ci + count_cj}; + + pack_neat_pair_aos_f4_f(ci, parts_aos_buffer, tid, lpp1, count_ci, shift_i, cjs_cje); + + local_pack_position += count_ci; + /* Pack the particle data into CPU-side buffers*/ + const float3 shift_j = {cj->loc[0], cj->loc[1], cj->loc[2]}; + const int lpp2 = local_pack_position; + + pack_neat_pair_aos_f4_f(cj, parts_aos_buffer, tid, lpp2, count_cj, shift_j, cis_cie); + /* Increment pack length accordingly */ + (*pack_length) += count_ci + count_cj; + + if (timer) + TIMER_TOC(timer_doself_gpu_pack); +} + +void runner_doself1_gpu_pack( + struct runner *r, struct cell *c, int timer, int *pack_length, double *x_p, + double *y_p, double *z_p, int tid, int *tid_p, long long *id, float *ux, + float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, + float *mass, float *h, float *u, float *u_dt, float *rho, float *SPH_sum, + float *locx, float *locy, float *locz, float *widthx, float *widthy, + float *widthz, float *h_max, int *count_p, float *wcount, float *wcount_dh, + float *rho_dh, float *rot_u, float *rot_v, float *rot_w, float *div_v, + float *div_v_previous_step, float *alpha_visc, float *v_sig, + float *laplace_u, float *alpha_diff, float *f, float *soundspeed, + float *h_dt, float *balsara, float *pressure, float *alpha_visc_max_ngb, + timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, + char *to_be_synchronized, int count_max_parts_tmp) { + + TIMER_TIC; + // fprintf(stderr,"Entered outer packing code!\n"); + + /* Anything to do here? */ + if (c->hydro.count == 0) + return; + /* Recurse? 
*/ + // if (c->split) { + //// fprintf(stderr,"Entered recursive packing code!\n"); + // for (int k = 0; k < 8; k++){ + // if (c->progeny[k] != NULL){ + // runner_doself1_gpu_pack(r, c, timer, pack_length, + // x_p, y_p, z_p, tid, tid_p, id, ux, uy, uz, a_hydrox, + // a_hydroy, a_hydroz, mass, h, u, u_dt, rho, SPH_sum, locx, + // locy, locz, widthx, widthy, widthz, h_max, count_p, wcount, wcount_dh, + // rho_dh, rot_u, rot_v, rot_w, div_v, div_v_previous_step, alpha_visc, + // v_sig, laplace_u, alpha_diff, f, soundspeed, h_dt, balsara, pressure, + // alpha_visc_max_ngb, time_bin, wakeup, min_ngb_time_bin, + // to_be_synchronized, count_max_parts_tmp, fgpuin); + // fprintf(stderr,"working on a split cell\n"); + // } + // } + // } + // else { + // fprintf(stderr,"Entered inner packing code!\n"); + int count = c->hydro.count; + int local_pack_position = (*pack_length); + if (local_pack_position + count >= count_max_parts_tmp) { + fprintf(stderr, "Exceeded count_max_parts_tmp. Make arrays bigger!\n"); + exit(0); + } + // Pack the particle data + pack(c, x_p, y_p, z_p, tid, tid_p, id, ux, uy, uz, a_hydrox, a_hydroy, + a_hydroz, mass, h, u, u_dt, rho, SPH_sum, locx, locy, locz, widthx, + widthy, widthz, h_max, count_p, wcount, wcount_dh, rho_dh, rot_u, rot_v, + rot_w, div_v, div_v_previous_step, alpha_visc, v_sig, laplace_u, + alpha_diff, f, soundspeed, h_dt, balsara, pressure, alpha_visc_max_ngb, + time_bin, wakeup, min_ngb_time_bin, to_be_synchronized, + local_pack_position, count); + // Increment pack length accordingly + (*pack_length) += count; + // } + if (timer) + TIMER_TOC(timer_doself_gpu_pack); +} + +void pack(struct cell *c, double *x_p, double *y_p, double *z_p, int tid, + int *tid_p, long long *id, float *ux, float *uy, float *uz, + float *a_hydrox, float *a_hydroy, float *a_hydroz, float *mass, + float *h, float *u, float *u_dt, float *rho, float *SPH_sum, + float *locx, float *locy, float *locz, float *widthx, float *widthy, + float *widthz, float *h_max, int *count_p, float *wcount, + float *wcount_dh, float *rho_dh, float *rot_u, float *rot_v, + float *rot_w, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, + float *pressure, float *alpha_visc_max_ngb, timebin_t *time_bin, + timebin_t *wakeup, timebin_t *min_ngb_time_bin, + char *to_be_synchronized, int local_pack_position, int count) { + + const struct part *ptmps; + ptmps = c->hydro.parts; + for (int i = 0; i < count; i++) { + int id_in_pack = i + local_pack_position; + x_p[id_in_pack] = ptmps[i].x[0]; + y_p[id_in_pack] = ptmps[i].x[1]; + z_p[id_in_pack] = ptmps[i].x[2]; + // id[id_in_pack]=ptmps[i].id; + // count_p[id_in_pack]=count; + tid_p[id_in_pack] = tid; + // h_max[id_in_pack]=c->hydro.h_max; + ux[id_in_pack] = ptmps[i].v[0]; + uy[id_in_pack] = ptmps[i].v[1]; + uz[id_in_pack] = ptmps[i].v[2]; + // a_hydrox[id_in_pack]=ptmps[i].a_hydro[0]; + // a_hydroy[id_in_pack]=ptmps[i].a_hydro[1]; + // a_hydroz[id_in_pack]=ptmps[i].a_hydro[2]; + locx[id_in_pack] = c->loc[0]; + locy[id_in_pack] = c->loc[1]; + locz[id_in_pack] = c->loc[2]; + mass[id_in_pack] = ptmps[i].mass; + h[id_in_pack] = ptmps[i].h; + // u[id_in_pack]=ptmps[i].u; + // u_dt[id_in_pack]=ptmps[i].u_dt; + ////////////////////////////////////////////////////// + rho[id_in_pack] = 0.f;//ptmps[i].rho; + ///////////////////////////////////////////////////// + // div_v_previous_step[id_in_pack]=ptmps[i].viscosity.div_v_previous_step; + // 
alpha_visc[id_in_pack]=ptmps[i].viscosity.alpha; + // v_sig[id_in_pack]=ptmps[i].viscosity.v_sig; + // laplace_u[id_in_pack]=ptmps[i].diffusion.laplace_u; + // alpha_diff[id_in_pack]=ptmps[i].diffusion.alpha; + // f[id_in_pack]=ptmps[i].force.f; + // soundspeed[id_in_pack]=ptmps[i].force.soundspeed; + // h_dt[id_in_pack]=ptmps[i].force.h_dt; + // balsara[id_in_pack]=ptmps[i].force.balsara; + // pressure[id_in_pack]=ptmps[i].force.pressure; +// time_bin[id_in_pack] = ptmps[i].time_bin; + // wakeup[id_in_pack]=ptmps[i].limiter_data.wakeup; + // min_ngb_time_bin[id_in_pack]=ptmps[i].limiter_data.min_ngb_time_bin; + // to_be_synchronized[id_in_pack]=ptmps[i].limiter_data.to_be_synchronized; + /////////////////////////////////////////////////////////////////// + wcount[id_in_pack] = 0.f;//ptmps[i].density.wcount; + wcount_dh[id_in_pack] = 0.f;//ptmps[i].density.wcount_dh; + rho_dh[id_in_pack] = 0.f;//ptmps[i].density.rho_dh; + div_v[id_in_pack] = 0.f;//ptmps[i].viscosity.div_v; + rot_u[id_in_pack] = 0.f;//ptmps[i].density.rot_v[0]; + rot_v[id_in_pack] = 0.f;//ptmps[i].density.rot_v[1]; + rot_w[id_in_pack] = 0.f;//ptmps[i].density.rot_v[2]; + /////////////////////////////////////////////////////////////////// + } +} + +void runner_doself1_gpu_unpack( + struct runner *r, struct cell *c, int timer, int *pack_length, double *x_p, + double *y_p, double *z_p, int tid, int *tid_p, long long *id, float *ux, + float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, + float *mass, float *h, float *u, float *u_dt, float *rho, float *SPH_sum, + float *locx, float *locy, float *locz, float *widthx, float *widthy, + float *widthz, float *h_max, int *count_p, float *wcount, float *wcount_dh, + float *rho_dh, float *rot_u, float *rot_v, float *rot_w, float *div_v, + float *div_v_previous_step, float *alpha_visc, float *v_sig, + float *laplace_u, float *alpha_diff, float *f, float *soundspeed, + float *h_dt, float *balsara, float *pressure, float *alpha_visc_max_ngb, + timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, + char *to_be_synchronized, int count_max_parts_tmp, struct engine *e) { + TIMER_TIC; + // fprintf(stderr, "got into pack function\n"); + /* Anything to do here? */ + if (c->hydro.count == 0) + return; + if (!cell_is_active_hydro(c, e)) + return; + /* Anything to do here? */ + /* Recurse? */ + // if (c->split) { + // fprintf(stderr,"working on a split cell\n"); + // for (int k = 0; k < 8; k++){ + // if (c->progeny[k] != NULL){ + // runner_doself1_gpu_unpack(r, c, timer, pack_length, + // x_p, y_p, z_p, tid, tid_p, id, ux, uy, uz, a_hydrox, + // a_hydroy, a_hydroz, mass, h, u, u_dt, rho, SPH_sum, locx, + // locy, locz, widthx, widthy, widthz, h_max, count_p, wcount, wcount_dh, + // rho_dh, rot_u, rot_v, rot_w, div_v, div_v_previous_step, alpha_visc, + // v_sig, laplace_u, alpha_diff, f, soundspeed, h_dt, balsara, pressure, + // alpha_visc_max_ngb, time_bin, wakeup, min_ngb_time_bin, + // to_be_synchronized, count_max_parts_tmp, fgpuin); + // fprintf(stderr,"working on a split cell\n"); + // } + // } + // } else { + int count = c->hydro.count; + int local_pack_position = (*pack_length); + if (local_pack_position + count >= count_max_parts_tmp) { + fprintf(stderr, + "Exceeded count_max_parts_tmp. Make arrays bigger! 
pack_length is " + "%i, local_pack_position is % i, " + "count is %i\n", + (*pack_length), local_pack_position, count); + // exit(0); + } + // Pack the particle data + unpack(c, x_p, y_p, z_p, tid, tid_p, id, ux, uy, uz, a_hydrox, a_hydroy, + a_hydroz, mass, h, u, u_dt, rho, SPH_sum, locx, locy, locz, widthx, + widthy, widthz, h_max, count_p, wcount, wcount_dh, rho_dh, rot_u, + rot_v, rot_w, div_v, div_v_previous_step, alpha_visc, v_sig, laplace_u, + alpha_diff, f, soundspeed, h_dt, balsara, pressure, alpha_visc_max_ngb, + time_bin, wakeup, min_ngb_time_bin, to_be_synchronized, + local_pack_position, count, e); +// for (int i = *pack_length; i < count+*pack_length; i++) { +// for (int i = 0; i < count; i++) { +// message("wcount is %f", c->hydro.parts[i].density.wcount); +// } + // Increment pack length accordingly + (*pack_length) += count; + // } + if (timer) + TIMER_TOC(timer_doself_gpu_pack); +} + +void unpack(struct cell *c, double *x_p, double *y_p, double *z_p, int tid, + int *tid_p, long long *id, float *ux, float *uy, float *uz, + float *a_hydrox, float *a_hydroy, float *a_hydroz, float *mass, + float *h, float *u, float *u_dt, float *rho, float *SPH_sum, + float *locx, float *locy, float *locz, float *widthx, float *widthy, + float *widthz, float *h_max, int *count_p, float *wcount, + float *wcount_dh, float *rho_dh, float *rot_u, float *rot_v, + float *rot_w, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, + float *alpha_diff, float *f, float *soundspeed, float *h_dt, + float *balsara, float *pressure, float *alpha_visc_max_ngb, + timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, + char *to_be_synchronized, int local_pack_position, int count, + struct engine *e) { + + // struct part *ptmps; + // ptmps=c->hydro.parts; + for (int i = 0; i < count; i++) { + int id_in_pack = i + local_pack_position; + struct part *pi = &c->hydro.parts[i]; + if (part_is_inhibited(pi, e)) { + fprintf(stderr, "inhibited part\n"); + continue; + } + const int pi_active = part_is_active(pi, e); + if (!pi_active)fprintf(stderr, "Inactive part\n"); + else if (pi_active) { + // c->hydro.parts[i].rho = rho[id_in_pack]; + // c->hydro.parts[i].viscosity.div_v = div_v[id_in_pack]; + // c->hydro.parts[i].density.rho_dh = rho_dh[id_in_pack]; + // c->hydro.parts[i].density.wcount = wcount[id_in_pack]; + // c->hydro.parts[i].density.wcount_dh = wcount_dh[id_in_pack]; + // c->hydro.parts[i].density.rot_v[0] = rot_u[id_in_pack]; + // c->hydro.parts[i].density.rot_v[1] = rot_v[id_in_pack]; + // c->hydro.parts[i].density.rot_v[2] = rot_w[id_in_pack]; + pi->rho += rho[id_in_pack]; + pi->viscosity.div_v += div_v[id_in_pack]; + pi->density.rho_dh += rho_dh[id_in_pack]; + pi->density.wcount += wcount[id_in_pack]; + pi->density.wcount_dh += wcount_dh[id_in_pack]; + pi->density.rot_v[0] += rot_u[id_in_pack]; + pi->density.rot_v[1] += rot_v[id_in_pack]; + pi->density.rot_v[2] += rot_w[id_in_pack]; + } + // else fprintf(stderr,"a part is not active\n"); + } + // c->hydro.parts=ptmps; +} +//#ifdef WITHCUDA +//} +//#endif diff --git a/src/runner_gpu_pack_functions.h b/src/runner_gpu_pack_functions.h new file mode 100755 index 0000000000..797e06519e --- /dev/null +++ b/src/runner_gpu_pack_functions.h @@ -0,0 +1,160 @@ +#include "cuda/part_gpu.h" +void runner_doself1_gpu_pack( + struct runner *r, struct cell *c, int timer, int *pack_length, double *x_p, + double *y_p, double *z_p, int tid, int *tid_p, long long *id, float *ux, + float *uy, float *uz, float *a_hydrox, 
float *a_hydroy, float *a_hydroz, + float *mass, float *h, float *u, float *u_dt, float *rho, float *SPH_sum, + float *locx, float *locy, float *locz, float *widthx, float *widthy, + float *widthz, float *h_max, int *count_p, float *wcount, float *wcount_dh, + float *rho_dh, float *rot_u, float *rot_v, float *rot_w, float *div_v, + float *div_v_previous_step, float *alpha_visc, float *v_sig, + float *laplace_u, float *alpha_diff, float *f, float *soundspeed, + float *h_dt, float *balsara, float *pressure, float *alpha_visc_max_ngb, + timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, + char *to_be_synchronized, int count_max_parts_tmp); +void runner_doself1_gpu_pack_neat( + struct runner *r, struct cell *c, struct part_soa parts_soa, + int timer, int *pack_length, int tid, int count_max_parts_tmp); +void runner_doself1_gpu_pack_neat_aos( + struct runner *r, struct cell *c, struct part_aos *parts_aos, + int timer, int *pack_length, int tid, int count_max_parts_tmp); +void runner_doself1_gpu_pack_neat_aos_f4( + struct runner *r, struct cell * __restrict__ c, struct part_aos_f4_send * __restrict__ parts_aos, + int timer, int *pack_length, int tid, int count_max_parts_tmp); +void runner_doself1_gpu_pack_neat_aos_g( + struct runner *r, struct cell *c, struct part_aos_g *parts_aos, + int timer, int *pack_length, int tid, int count_max_parts_tmp); +void runner_doself1_gpu_pack_neat_aos_f4_g( + struct runner *r, struct cell *c, struct part_aos_f4_g_send *parts_aos, + int timer, int *pack_length, int tid, int count_max_parts_tmp); +void runner_doself1_gpu_pack_neat_aos_f( + struct runner *r, struct cell *c, struct part_aos_f *parts_aos, + int timer, int *pack_length, int tid, int count_max_parts_tmp); +void runner_doself1_gpu_pack_neat_aos_f4_f( + struct runner *r, struct cell * restrict c, struct part_aos_f4_f_send * restrict parts_aos, + int timer, int *pack_length, int tid, int count_max_parts_tmp); +void runner_doself1_gpu_pack_forc_aos( + struct runner *r, struct cell *c, struct part_aos *parts_aos, + int timer, int *pack_length, int tid, int count_max_parts_tmp); +void runner_doself1_gpu_pack_grad_aos( + struct runner *r, struct cell *c, struct part_aos *parts_aos, + int timer, int *pack_length, int tid, int count_max_parts_tmp); +void runner_doself1_gpu_unpack_neat(struct runner *r, struct cell *c, struct part_soa parts_soa, +int timer, int *pack_length, int tid, int count_max_parts_tmp, struct engine *e); +void runner_doself1_gpu_unpack_neat_aos(struct runner *r, struct cell *c, struct part_aos *parts_aos_buffer, +int timer, int *pack_length, int tid, int count_max_parts_tmp, struct engine *e); +void runner_doself1_gpu_unpack_neat_aos_f4(struct runner *r, struct cell *c, struct part_aos_f4_recv *parts_aos_buffer, +int timer, int *pack_length, int tid, int count_max_parts_tmp, struct engine *e); +void runner_doself1_gpu_unpack_neat_aos_g(struct runner *r, struct cell *c, struct part_aos_g *parts_aos_buffer, +int timer, int *pack_length, int tid, int count_max_parts_tmp, struct engine *e); +void runner_doself1_gpu_unpack_neat_aos_f4_g(struct runner *r, struct cell *c, struct part_aos_f4_g_recv *parts_aos_buffer, +int timer, int *pack_length, int tid, int count_max_parts_tmp, struct engine *e); +void runner_doself1_gpu_unpack_neat_aos_f(struct runner *r, struct cell *c, struct part_aos_f *parts_aos_buffer, +int timer, int *pack_length, int tid, int count_max_parts_tmp, struct engine *e); +void runner_doself1_gpu_unpack_neat_aos_f4_f(struct runner *r, struct cell *restrict c, 
struct part_aos_f4_f_recv * restrict parts_aos_buffer, +int timer, int *pack_length, int tid, int count_max_parts_tmp, struct engine *e); +void pack(struct cell *c, double *x_p, double *y_p, double *z_p, int tid, + int *tid_p, long long *id, float *ux, float *uy, float *uz, + float *a_hydrox, float *a_hydroy, float *a_hydroz, float *mass, + float *h, float *u, float *u_dt, float *rho, float *SPH_sum, + float *locx, float *locy, float *locz, float *widthx, float *widthy, + float *widthz, float *h_max, int *count_p, float *wcount, + float *wcount_dh, float *rho_dh, float *rot_u, float *rot_v, + float *rot_w, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, + float *pressure, float *alpha_visc_max_ngb, timebin_t *time_bin, + timebin_t *wakeup, timebin_t *min_ngb_time_bin, + char *to_be_synchronized, int local_pack_position, int count); +void pack_neat(struct cell *c, struct part_soa parts_soa, int tid, int local_pack_position, int count); +void pack_neat_aos(struct cell *c, struct part_aos *parts_aos_buffer, int tid, int local_pack_position, int count); +void pack_neat_aos_g(struct cell *c, struct part_aos_g *parts_aos_buffer, int tid, int local_pack_position, int count); +void pack_neat_aos_f(struct cell *c, struct part_aos_f *parts_aos, int tid, int local_pack_position, int count); +void pack_neat_aos_f4(struct cell *c, struct part_aos_f4_send *parts_aos_buffer, int tid, int local_pack_position, int count, int2 frst_lst_prts); +void pack_neat_aos_f4_g(struct cell *c, struct part_aos_f4_g_send *parts_aos_buffer, int tid, int local_pack_position, int count); +void pack_neat_aos_f4_f(const struct cell *restrict c, struct part_aos_f4_f_send *restrict parts_aos, int tid, int local_pack_position, int count); +void unpack_neat(struct cell *c, struct part_soa parts_soa_buffer, int tid, int local_pack_position, int count, struct engine *e); +void unpack_neat_aos(struct cell *c, struct part_aos *parts_aos_buffer, int tid, int local_pack_position, int count, struct engine *e); +void unpack_neat_aos_f4(struct cell *c, struct part_aos_f4_recv *parts_aos_buffer, int tid, int local_pack_position, int count, struct engine *e); +void unpack_neat_aos_g(struct cell *c, struct part_aos_g *parts_aos_buffer, int tid, int local_pack_position, int count, struct engine *e); +void unpack_neat_aos_f4_g(struct cell *c, struct part_aos_f4_g_recv *parts_aos_buffer, int tid, int local_pack_position, int count, struct engine *e); +void unpack_neat_aos_f(struct cell *c, struct part_aos_f *parts_aos_buffer, int tid, int local_pack_position, int count, struct engine *e); +void unpack_neat_aos_f4_f(struct cell *restrict c, struct part_aos_f4_f_recv *restrict parts_aos_buffer, int tid, int local_pack_position, int count, struct engine *e); +void unpack(struct cell *c, double *x_p, double *y_p, double *z_p, int tid, + int *tid_p, long long *id, float *ux, float *uy, float *uz, + float *a_hydrox, float *a_hydroy, float *a_hydroz, float *mass, + float *h, float *u, float *u_dt, float *rho, float *SPH_sum, + float *locx, float *locy, float *locz, float *widthx, float *widthy, + float *widthz, float *h_max, int *count_p, float *wcount, + float *wcount_dh, float *rho_dh, float *rot_u, float *rot_v, + float *rot_w, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, + float *alpha_diff, float *f, float *soundspeed, float *h_dt, + float *balsara, float *pressure, float 
*alpha_visc_max_ngb, + timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, + char *to_be_synchronized, int local_pack_position, int count, + struct engine *e); +void runner_doself1_gpu_unpack( + struct runner *r, struct cell *c, int timer, int *pack_length, double *x_p, + double *y_p, double *z_p, int tid, int *tid_p, long long *id, float *ux, + float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, + float *mass, float *h, float *u, float *u_dt, float *rho, float *SPH_sum, + float *locx, float *locy, float *locz, float *widthx, float *widthy, + float *widthz, float *h_max, int *count_p, float *wcount, float *wcount_dh, + float *rho_dh, float *rot_u, float *rot_v, float *rot_w, float *div_v, + float *div_v_previous_step, float *alpha_visc, float *v_sig, + float *laplace_u, float *alpha_diff, float *f, float *soundspeed, + float *h_dt, float *balsara, float *pressure, float *alpha_visc_max_ngb, + timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, + char *to_be_synchronized, int count_max_parts_tmp, struct engine *e); + +void runner_do_ci_cj_gpu_pack_neat( + struct runner *r, struct cell *ci, struct cell *cj, struct part_soa parts_soa_buffer, + int timer, int *pack_length, int tid, int count_max_parts_tmp, int count_ci, int count_cj); + +void runner_do_ci_cj_gpu_pack_neat_aos( + struct runner *r, struct cell *ci, struct cell *cj, struct part_aos *parts_aos_buffer, + int timer, int *pack_length, int tid, int count_max_parts_tmp, int count_ci, int count_cj, float3 shift_tmp); + +void runner_do_ci_cj_gpu_pack_neat_aos_f4( + struct runner *r, struct cell * restrict ci, struct cell * restrict cj, struct part_aos_f4_send * restrict parts_aos_buffer, + int timer, int *pack_length, int tid, int count_max_parts_tmp, const int count_ci, const int count_cj, float3 shift_tmp); + +void runner_do_ci_cj_gpu_pack_neat_aos_g( + struct runner *r, struct cell *ci, struct cell *cj, struct part_aos_g *parts_aos_buffer, + int timer, int *pack_length, int tid, int count_max_parts_tmp, int count_ci, int count_cj); + + +void runner_do_ci_cj_gpu_pack_neat_aos_f4_g( + struct runner *r, struct cell * restrict ci, struct cell * restrict cj, struct part_aos_f4_g_send * restrict parts_aos_buffer, + int timer, int *pack_length, int tid, int count_max_parts_tmp, const int count_ci, const int count_cj, float3 shift_tmp); + +void runner_do_ci_cj_gpu_pack_neat_aos_f( + struct runner *r, struct cell *ci, struct cell *cj, struct part_aos_f *parts_aos_buffer, + int timer, int *pack_length, int tid, int count_max_parts_tmp, int count_ci, int count_cj); + +void runner_do_ci_cj_gpu_pack_neat_aos_f4_f( + struct runner *r, struct cell * restrict ci, struct cell * restrict cj, struct part_aos_f4_f_send * restrict parts_aos_buffer, + int timer, int *pack_length, int tid, int count_max_parts_tmp, const int count_ci, const int count_cj, float3 shift_tmp); + +void runner_do_ci_cj_gpu_unpack_neat(struct runner *r, struct cell *ci, struct cell *cj, struct part_soa parts_soa_buffer, + int timer, int *pack_length, int tid, int count_max_parts_tmp, struct engine *e); + +void runner_do_ci_cj_gpu_unpack_neat_aos(struct runner *r, struct cell *ci, struct cell *cj, struct part_aos *parts_aos_buffer, + int timer, int *pack_length, int tid, int count_max_parts_tmp, struct engine *e); + +void runner_do_ci_cj_gpu_unpack_neat_aos_f4(struct runner *r, struct cell *ci, struct cell *cj, struct part_aos_f4_recv *parts_aos_buffer, + int timer, int *pack_length, int tid, int count_max_parts_tmp, struct 
engine *e); + +void runner_do_ci_cj_gpu_unpack_neat_aos_f4_g(struct runner *r, struct cell *ci, struct cell *cj, struct part_aos_f4_g_recv *parts_aos_buffer, + int timer, int *pack_length, int tid, int count_max_parts_tmp, struct engine *e); + +void runner_do_ci_cj_gpu_unpack_neat_aos_g(struct runner *r, struct cell *ci, struct cell *cj, struct part_aos_g *parts_aos_buffer, + int timer, int *pack_length, int tid, int count_max_parts_tmp, struct engine *e); + +void runner_do_ci_cj_gpu_unpack_neat_aos_f(struct runner *r, struct cell *ci, struct cell *cj, struct part_aos_f *parts_aos_buffer, + int timer, int *pack_length, int tid, int count_max_parts_tmp, struct engine *e); + +void runner_do_ci_cj_gpu_unpack_neat_aos_f4_f(struct runner *r, struct cell *ci, struct cell *cj, struct part_aos_f4_f_recv *parts_aos_buffer, + int timer, int *pack_length, int tid, int count_max_parts_tmp, struct engine *e); + From 0c90ab5c122792591b495b7e2b17a8d58fb2f90b Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Tue, 22 Oct 2024 17:18:01 +0100 Subject: [PATCH 006/217] Added more files for GPU code. Seems to work fine aside from config.h not being found by the cuda library --- src/Makefile.am | 27 + src/cuda/BLOCK_SIZE.h | 12 + src/cuda/GPU_runner_functions.cu | 4224 +++++++++++++++++ src/cuda/GPU_runner_functions.h | 116 + src/cuda/device_functions.h | 149 + src/cuda/kernel_definitions.cu | 115 + src/cuda/tester.h | 9 + src/files_for_new_functions/arrays_malloc.cu | 295 ++ src/files_for_new_functions/arrays_malloc.h | 50 + .../host_device_data_transfer.cu | 529 +++ .../host_device_data_transfer.h | 176 + 11 files changed, 5702 insertions(+) create mode 100755 src/cuda/BLOCK_SIZE.h create mode 100755 src/cuda/GPU_runner_functions.cu create mode 100755 src/cuda/GPU_runner_functions.h create mode 100755 src/cuda/device_functions.h create mode 100755 src/cuda/kernel_definitions.cu create mode 100755 src/cuda/tester.h create mode 100755 src/files_for_new_functions/arrays_malloc.cu create mode 100755 src/files_for_new_functions/arrays_malloc.h create mode 100755 src/files_for_new_functions/host_device_data_transfer.cu create mode 100755 src/files_for_new_functions/host_device_data_transfer.h diff --git a/src/Makefile.am b/src/Makefile.am index 7881a8fff8..bfb38a5929 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -545,6 +545,33 @@ libswiftsim_mpi_la_LDFLAGS = $(AM_LDFLAGS) $(MPI_LIBS) $(EXTRA_LIBS) -version-in libswiftsim_mpi_la_SHORTNAME = mpi libswiftsim_mpi_la_LIBADD = $(GRACKLE_LIBS) $(VELOCIRAPTOR_LIBS) $(MPI_LIBS) libgrav_mpi.la +# Sources and flags for regular CUDA library +libswiftsim_cuda_la_SOURCES = $(AM_SOURCES) +libswiftsim_cuda_la_CFLAGS = $(AM_CFLAGS) $(CUDA_CFLAGS) -DWITH_CUDA +libswiftsim_cuda_la_CXXFLAGS = $(AM_CFLAGS) $(CUDA_CFLAGS) -DWITH_CUDA +libswiftsim_cuda_la_LDFLAGS = $(AM_LDFLAGS) $(EXTRA_LIBS) $(CUDA_LIBS) +libswiftsim_cuda_la_SHORTNAME = cuda +libswiftsim_cuda_la_LIBADD = $(GRACKLE_LIBS) $(VELOCIRAPTOR_LIBS) $(MPI_LIBS) libgrav.la + +# Sources and flags for regular HIP library +libswiftsim_hip_la_SOURCES = $(AM_SOURCES) +libswiftsim_hip_la_CFLAGS = $(AM_CFLAGS) $(HIP_CFLAGS) -DWITH_HIP +libswiftsim_hip_la_LDFLAGS = $(AM_LDFLAGS) $(EXTRA_LIBS) $(HIP_LIBS) -lamdhip64 +libswiftsim_hip_la_SHORTNAME = hip +libswiftsim_hip_la_LIBADD = $(GRACKLE_LIBS) $(VELOCIRAPTOR_LIBS) $(MPI_LIBS) libgrav.la + +# Sources and flags for MPI CUDA library +libswiftsim_mpicuda_la_SOURCES = $(AM_SOURCES) +libswiftsim_mpicuda_la_CFLAGS = $(AM_CFLAGS) $(MPI_FLAGS) $(CUDA_CFLAGS) -DWITH_CUDA 
+libswiftsim_mpicuda_la_CXXFLAGS = $(AM_CFLAGS) $(MPI_FLAGS) $(CUDA_CFLAGS) -DWITH_CUDA +libswiftsim_mpicuda_la_LDFLAGS = $(AM_LDFLAGS) $(MPI_LIBS) $(EXTRA_LIBS) $(CUDA_LIBS) +libswiftsim_mpicuda_la_SHORTNAME = mpicuda +libswiftsim_mpicuda_la_LIBADD = $(GRACKLE_LIBS) $(VELOCIRAPTOR_LIBS) $(MPI_LIBS) libgrav_mpi.la + +#subdir +SUBDIRS = . cuda +SUBDIRS += . hip + # Versioning. If any sources change then update the version_string.h file with # the current git revision and package version. # May have a checkout without a version_string.h file and no git command (tar/zip diff --git a/src/cuda/BLOCK_SIZE.h b/src/cuda/BLOCK_SIZE.h new file mode 100755 index 0000000000..f3897234a3 --- /dev/null +++ b/src/cuda/BLOCK_SIZE.h @@ -0,0 +1,12 @@ +#ifndef BLOCK_SIZE_H +#define BLOCK_SIZE_H + +#define BLOCK_SIZE 64 +#define N_TASKS_PER_PACK_SELF 64 +#define N_TASKS_BUNDLE_SELF 8 + +#define BLOCK_SIZE_PAIR 64 +#define N_TASKS_PER_PACK_PAIR 32 +#define N_TASKS_BUNDLE_PAIR 4 + +#endif // BLOCK_SIZE_H diff --git a/src/cuda/GPU_runner_functions.cu b/src/cuda/GPU_runner_functions.cu new file mode 100755 index 0000000000..a0ca64ca6a --- /dev/null +++ b/src/cuda/GPU_runner_functions.cu @@ -0,0 +1,4224 @@ +/******************************************************************************* + * This file contains functions used to setup and execute GPU tasks from within + *runner_main.c. Consider this a translator allowing .cu based functions to be + *called from within runner_main.c + ******************************************************************************/ + +/* Hacky method to make c++ compilers not die. */ +#ifdef WITH_CUDA +#ifndef static +#define static +#endif +#ifndef restrict +#define restrict __restrict__ +#endif +#endif + +/* Required header files */ +#include +/*ifdef WITH_CUDA prevents name mangling. 
C code sees exact names + of functions rather than mangled template names produced by C++*/ +#ifdef WITH_CUDA +extern "C" { +#endif + +#include "../../config.h" + +#ifndef BLOCK_SIZE_H +#include "BLOCK_SIZE.h" +#endif + +#include "GPU_runner_functions.h" +#include "device_functions.h" +#include "part_gpu.h" +#include + +#ifdef WITH_CUDA +} +#endif + +/* function to initialise GPU and printout GPU name*/ +#ifdef WITH_CUDA +extern "C" { +#endif +void Initialise_GPU() { + int devId = 0; + // find and print device name + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, devId); + printf("Device : %s\n", prop.name); + cudaSetDevice(devId); + // cuda +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void tester( + struct part_soa parts_soa, int *d_task_first_part, int *d_task_last_part, float d_a, float d_H, + int bid, int tid, int count_tasks, int tasksperbundle, int nBlocks_per_task, + int bundle_first_task, int max_parts, int time_bin_inhibited) { + extern __shared__ float vars[]; + __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); + __shared__ int first_part_in_task_blocks, last_part_in_task_blocks; + first_part_in_task_blocks = d_task_first_part[task_id], + last_part_in_task_blocks = d_task_last_part[task_id]; + __syncthreads(); + const int pid = threadid + first_part_in_task_blocks; + + if (pid < last_part_in_task_blocks) { + parts_soa.tid_p[pid] = 1; + } +// if(parts_soa.tid_p[pid] == 1 && pid < last_part_in_task_blocks) +// printf("tid %i last_part_in_blocks %i\n", parts_soa.tid_p[pid], last_part_in_task_blocks); + +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void runner_do_self_density_GPU( + struct part_soa parts_soa, int *d_task_first_part, int *d_task_last_part, float d_a, float d_H, + int count_tasks, int tasksperbundle, int nBlocks_per_task, + int bundle_first_task, int max_parts) { + extern __shared__ float vars[]; + __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); +// __shared__ int first_part_in_task_blocks, last_part_in_task_blocks; + int first_part_in_task_blocks, last_part_in_task_blocks; + first_part_in_task_blocks = d_task_first_part[task_id], + last_part_in_task_blocks = d_task_last_part[task_id]; +// __syncthreads(); + const int pid = threadid + first_part_in_task_blocks; + + int ttid = 0; + int first_part = 0; + int count = 0; + int last_part = 0; + float cellx = 0.0, celly = 0.0, cellz = 0.0; + float hi = 0.0, hig2 = hi * hi * kernel_gamma2; + float mi = 0.0; + float uxi = 0.0; + float uyi = 0.0; + float uzi = 0.0; + float pix = 0.0; + float piy = 0.0; + float piz = 0.0; + float rhoi = 0.0; + float rho_dhi = 0.0; + float wcounti = 0.0; + float wcount_dhi = 0.0; + float div_vi = 0.0; + float rot_uxi = 0.0; + float rot_uyi = 0.0; + float rot_uzi = 0.0; + int Found_neighbours = 0; + // if(pid (0.01f/128.f)*(0.01f/128.f)) { +// if (r2 < hig2 && r2 > (0.01f/256.f)*(0.01f/256.f)) { + Found_neighbours=1; + const float r = sqrt(r2); + /* Recover some data */ + const float mj = mass_tmp[j_block]; + /* Get the kernel for hi. 
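The dimensionless separation ui = r/hi below is fed to d_kernel_deval, which returns the kernel value wi and its gradient wi_dx; these feed the running sums for rho, rho_dh, wcount and wcount_dh.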
*/ + if(hi<1.f/256.f)printf("h < dx\n"); +// if(hi<1.f/256.f)printf("h < dx\n"); + const float h_inv = 1.f / hi; + const float ui = r * h_inv; + float wi, wi_dx; + + d_kernel_deval(ui, &wi, &wi_dx); + + rhoi += mj * wi; + rho_dhi -= mj * (hydro_dimension * wi + ui * wi_dx); + + wcounti += wi; + wcount_dhi -= (hydro_dimension * wi + ui * wi_dx); + + const float r_inv = 1.f / r; + const float faci = mj * wi_dx * r_inv; + + /* Compute dv dot r */ + float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block], + dvz = uzi - uz_tmp[j_block]; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + + div_vi -= faci * dvdr; + + /* Compute dv cross r */ + float curlvrx = dvy * zij - dvz * yij; + float curlvry = dvz * xij - dvx * zij; + float curlvrz = dvx * yij - dvy * xij; + + rot_uxi += faci * curlvrx; + rot_uyi += faci * curlvry; + rot_uzi += faci * curlvrz; + } + } + } + __syncthreads(); + } + if (pid < last_part_in_task_blocks) { +// float wi, wi_dx; +// d_kernel_deval(0.f, &wi, &wi_dx); +// printf("mass i %e, self rho %e sum rho %e\n", mi, mi*wi, rhoi); +// if(Found_neighbours == 0) printf("Not sure what's going on but no neighbours found in GPU loop\n"); + parts_soa.rho[pid] = rhoi, parts_soa.rho_dh[pid] = rho_dhi; + parts_soa.wcount[pid] = wcounti, parts_soa.wcount_dh[pid] = wcount_dhi; + parts_soa.div_v[pid] = div_vi; + parts_soa.rot_ux[pid] = rot_uxi, parts_soa.rot_uy[pid] = rot_uyi, + parts_soa.rot_uz[pid] = rot_uzi; + } +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void DOSELF_GPU_AOS( + struct part_aos *parts_aos, int *d_task_first_part, int *d_task_last_part, float d_a, float d_H, + int count_tasks, int tasksperbundle, int nBlocks_per_task, + int bundle_first_task, int max_parts, double * d_cell_x, + double * d_cell_y, double * d_cell_z) { + extern __shared__ float vars[]; + __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); +// __shared__ int first_part_in_task_blocks, last_part_in_task_blocks; + int first_part_in_task_blocks, last_part_in_task_blocks; + first_part_in_task_blocks = d_task_first_part[task_id], + last_part_in_task_blocks = d_task_last_part[task_id]; +// __syncthreads(); + const int pid = threadid + first_part_in_task_blocks; + + int ttid = 0; + int first_part = 0; + int count = 0; + int last_part = 0; + float cellx = 0.0, celly = 0.0, cellz = 0.0; + float hi = 0.0, hig2 = hi * hi * kernel_gamma2; + float mi = 0.0; + float uxi = 0.0; + float uyi = 0.0; + float uzi = 0.0; + float pix = 0.0; + float piy = 0.0; + float piz = 0.0; + float rhoi = 0.0; + float rho_dhi = 0.0; + float wcounti = 0.0; + float wcount_dhi = 0.0; + float div_vi = 0.0; + float rot_uxi = 0.0; + float rot_uyi = 0.0; + float rot_uzi = 0.0; + int Found_neighbours = 0; + struct part_aos ipart = parts_aos[pid]; + // if(pid (0.01f/128.f)*(0.01f/128.f)) { + Found_neighbours=1; + const float r = sqrt(r2); + /* Recover some data */ + const float mj = mass_tmp[j_block]; + /* Get the kernel for hi. 
*/ + const float h_inv = 1.f / hi; + const float ui = r * h_inv; + float wi, wi_dx; + + d_kernel_deval(ui, &wi, &wi_dx); + + rhoi += mj * wi; + rho_dhi -= mj * (hydro_dimension * wi + ui * wi_dx); + + wcounti += wi; + wcount_dhi -= (hydro_dimension * wi + ui * wi_dx); + + const float r_inv = 1.f / r; + const float faci = mj * wi_dx * r_inv; + + /* Compute dv dot r */ + float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block], + dvz = uzi - uz_tmp[j_block]; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + + div_vi -= faci * dvdr; + + /* Compute dv cross r */ + float curlvrx = dvy * zij - dvz * yij; + float curlvry = dvz * xij - dvx * zij; + float curlvrz = dvx * yij - dvy * xij; + + rot_uxi += faci * curlvrx; + rot_uyi += faci * curlvry; + rot_uzi += faci * curlvrz; + } + } + } + __syncthreads(); + } + if (pid < last_part_in_task_blocks) { +// float wi, wi_dx; +// d_kernel_deval(0.f, &wi, &wi_dx); +// printf("mass i %e, self rho %e sum rho %e\n", mi, mi*wi, rhoi); +// if(Found_neighbours == 0) printf("Not sure what's going on but no neighbours found in GPU loop\n"); + parts_aos[pid].rho = rhoi, parts_aos[pid].rho_dh = rho_dhi; + parts_aos[pid].wcount = wcounti, parts_aos[pid].wcount_dh = wcount_dhi; + parts_aos[pid].div_v = div_vi; + parts_aos[pid].rot_ux = rot_uxi, parts_aos[pid].rot_uy = rot_uyi, + parts_aos[pid].rot_uz = rot_uzi; + } +} +#ifdef WITH_CUDA +} +#endif + +//template + +#ifdef WITH_CUDA +extern "C" { +#endif +//#include +__global__ void DOSELF_GPU_AOS_F4( + struct part_aos_f4_send * __restrict__ parts_send, struct part_aos_f4_recv * __restrict__ parts_recv, + const float d_a, const float d_H, + const int bundle_first_task, const int2 * __restrict__ d_task_first_part_f4) { + + extern __shared__ float4 vars_f4[]; + +// auto group = cooperative_groups::this_thread_block(); + __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; +// cuda::barrier bar; + + int first_part_in_task_blocks, last_part_in_task_blocks; + int2 first_last_parts = d_task_first_part_f4[task_id]; + first_part_in_task_blocks = first_last_parts.x; + last_part_in_task_blocks = first_last_parts.y; + + const int pid = threadid + first_part_in_task_blocks; + + float4 res_rho = {0.0, 0.0, 0.0, 0.0}; + float4 res_rot = {0.0, 0.0, 0.0, 0.0}; + const part_aos_f4_send pi = parts_send[pid]; + const float4 x_pi = pi.x_p_h; + const float4 ux_pi = pi.ux_m; + const float hi = x_pi.w, hig2 = hi * hi * kernel_gamma2; + int n_neighbours = 0; + /*Here we use different pointers "x_p_tmp", etc. to point to different regions + * of the single shared memory space "vars" which we allocate in kernel + * invocation*/ + float4 * __restrict__ x_p_h_tmp = (float4 *)&vars_f4[0]; + float4 * __restrict__ ux_m_tmp = (float4 *)&vars_f4[BLOCK_SIZE]; + /*Particles copied in blocks to shared memory*/ + for (int b = first_part_in_task_blocks; b < last_part_in_task_blocks; + b += BLOCK_SIZE) { + int j = b + threadIdx.x; + struct part_aos_f4_send pj = parts_send[j]; + x_p_h_tmp[threadIdx.x] = pj.x_p_h; + ux_m_tmp[threadIdx.x] = pj.ux_m; + __syncthreads(); + for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) { + j = j_block + b; + if (j < last_part_in_task_blocks) { + /* Compute the pairwise distance. 
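Each neighbour is read back from shared memory as two float4 loads: x_p_h packs the position with h in .w, and ux_m packs the velocity with the mass in .w.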
*/ + const float4 x_p_h_j = x_p_h_tmp[j_block]; + const float4 ux_m_j = ux_m_tmp[j_block]; + const float xij = x_pi.x - x_p_h_j.x, + yij = x_pi.y - x_p_h_j.y, + zij = x_pi.z - x_p_h_j.z; + const float r2 = xij * xij + yij * yij + zij * zij; + if (r2 < hig2 && r2 > (0.01f/128.f)*(0.01f/128.f)) { + const float r = sqrtf(r2); + /* Recover some data */ + const float mj = ux_m_j.w; + /* Get the kernel for hi. */ + const float h_inv = 1.f / hi; + const float ui = r * h_inv; + float wi, wi_dx; + + d_kernel_deval(ui, &wi, &wi_dx); + /*Add to sums of rho, rho_dh, wcount and wcount_dh*/ + res_rho.x += mj * wi; + res_rho.y -= mj * (hydro_dimension * wi + ui * wi_dx); + res_rho.z += wi; + res_rho.w -= (hydro_dimension * wi + ui * wi_dx); + + const float r_inv = 1.f / r; + const float faci = mj * wi_dx * r_inv; + + /* Compute dv dot r */ + const float dvx = ux_pi.x - ux_m_j.x, dvy = ux_pi.y - ux_m_j.y, + dvz = ux_pi.z - ux_m_j.z; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + + /* Compute dv cross r */ + const float curlvrx = dvy * zij - dvz * yij; + const float curlvry = dvz * xij - dvx * zij; + const float curlvrz = dvx * yij - dvy * xij; + /*Add to sums of rot_u and div_v*/ + res_rot.x += faci * curlvrx; + res_rot.y += faci * curlvry; + res_rot.z += faci * curlvrz; + res_rot.w -= faci * dvdr; + } + } + } + __syncthreads(); + } + if (pid < last_part_in_task_blocks) { + parts_recv[pid].rho_dh_wcount = res_rho; + parts_recv[pid].rot_ux_div_v = res_rot; + } +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void launch_density_aos(struct part_aos *parts_aos, int *d_task_first_part, + int *d_task_last_part, float d_a, float d_H, + const char *loop_type, cudaStream_t stream, + int block_size, int count_tasks, int tasksperbundle, + int numBlocks_x, int numBlocks_y, + int bundle_first_task, int max_parts, + double * d_cell_x, + double * d_cell_y, double * d_cell_z) { + + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + DOSELF_GPU_AOS<<>>( + parts_aos, d_task_first_part, d_task_last_part, d_a, d_H, count_tasks, tasksperbundle, + nBlocks_per_task, bundle_first_task, max_parts, d_cell_x, + d_cell_y, d_cell_z); +// runner_do_self_density_GPU_naive<<>>( +// parts_soa, d_task_first_part, d_task_last_part, d_a, d_H, bid, tid, count_tasks, tasksperbundle, +// nBlocks_per_task, bundle_first_task, max_parts, time_bin_inhibited); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +struct first_part{ + int list[32]; +}; +void launch_density_aos_f4(struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv, + float d_a, float d_H, cudaStream_t stream, int numBlocks_x, int numBlocks_y, + int bundle_first_task, int2 *d_task_first_part_f4) { + + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + DOSELF_GPU_AOS_F4<<>>( + parts_send, parts_recv, d_a, d_H, bundle_first_task, d_task_first_part_f4); +// runner_do_self_density_GPU_naive<<>>( +// parts_soa, d_task_first_part, d_task_last_part, d_a, d_H, bid, tid, count_tasks, tasksperbundle, +// nBlocks_per_task, bundle_first_task, max_parts, time_bin_inhibited); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void DOSELF_GPU_AOS_G( + struct part_aos_g *parts_aos, int *d_task_first_part, int *d_task_last_part, float d_a, float d_H, + int count_tasks, int tasksperbundle, int nBlocks_per_task, + int bundle_first_task, int max_parts, double * d_cell_x, + double * d_cell_y, double * d_cell_z) { + 
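/* Gradient ("prepare force") self-interaction kernel: for each particle this
   accumulates the maximum signal velocity over its neighbours, the SPH
   estimate of the Laplacian of the internal energy u, and the largest
   viscosity alpha seen among neighbours, using the same shared-memory
   tiling as the density kernel. */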
extern __shared__ float varsg[]; + __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); +// __shared__ int first_part_in_task_blocks, last_part_in_task_blocks; + int first_part_in_task_blocks, last_part_in_task_blocks; + first_part_in_task_blocks = d_task_first_part[task_id], + last_part_in_task_blocks = d_task_last_part[task_id]; +// __syncthreads(); + const int pid = threadid + first_part_in_task_blocks; + + int ttid = 0; + int first_part = 0; + int count = 0; + int last_part = 0; + float cellx = 0.0, celly = 0.0, cellz = 0.0; + float ci = 0.0, cj = 0.0; + float hi = 0.0, hig2 = hi * hi * kernel_gamma2; + float mi = 0.0; + float uxi = 0.0; + float uyi = 0.0; + float uzi = 0.0; + float pix = 0.0; + float piy = 0.0; + float piz = 0.0; + float rhoi = 0.0; + float div_vi = 0.0; + int Found_neighbours = 0; + float v_sig; + float u = 0.f; + float laplace_u = 0.0; + float alpha_visc_max_ngb = 0.0; + if (pid < last_part_in_task_blocks) { + ttid = task_id; + first_part = d_task_first_part[ttid]; + last_part = d_task_last_part[ttid]; + count = last_part - first_part; + cellx = d_cell_x[ttid], celly = d_cell_y[ttid], + cellz = d_cell_z[ttid]; + hi = parts_aos[pid].h, hig2 = hi * hi * kernel_gamma2; + mi = parts_aos[pid].mass; + uxi = parts_aos[pid].ux; + uyi = parts_aos[pid].uy; + uzi = parts_aos[pid].uz; + pix = parts_aos[pid].x_p - cellx; + piy = parts_aos[pid].y_p - celly; + piz = parts_aos[pid].z_p - cellz; + ci = parts_aos[pid].soundspeed; + v_sig = parts_aos[pid].v_sig; + u = parts_aos[pid].u; + laplace_u = parts_aos[pid].laplace_u; + alpha_visc_max_ngb = parts_aos[pid].alpha_visc_max_ngb; + } +// if (threadIdx.x == 0) { +// first_part_tid_0 = first_part; +// last_part_tid_0 = last_part; +// } +// __syncthreads(); + int n_neighbours = 0; + /*Here we use different pointers "x_p_tmp", etc. 
to point to different regions + * of the single shared memory space "vars" which we allocate in kernel + * invocation*/ + float *x_p_tmp = (float *)&varsg[0]; + float *y_p_tmp = (float *)&varsg[BLOCK_SIZE]; + float *z_p_tmp = (float *)&varsg[BLOCK_SIZE * 2]; + float *h_tmp = (float *)&varsg[BLOCK_SIZE * 3]; + float *mass_tmp = (float *)&varsg[BLOCK_SIZE * 4]; + float *ux_tmp = (float *)&varsg[BLOCK_SIZE * 5]; + float *uy_tmp = (float *)&varsg[BLOCK_SIZE * 6]; + float *uz_tmp = (float *)&varsg[BLOCK_SIZE * 7]; + float *cj_tmp = (float *)&varsg[BLOCK_SIZE * 8]; + float *alpha_tmp = (float *)&varsg[BLOCK_SIZE * 9]; + float *u_tmp = (float *)&varsg[BLOCK_SIZE * 10]; + float *rho_tmp = (float *)&varsg[BLOCK_SIZE * 11]; + int *timebin = (int *)&varsg[BLOCK_SIZE * 12]; + /*Particles copied in blocks to shared memory*/ + for (int b = first_part_in_task_blocks; b < last_part_in_task_blocks; + b += BLOCK_SIZE) { + int j = b + threadIdx.x; + x_p_tmp[threadIdx.x] = parts_aos[j].x_p; + y_p_tmp[threadIdx.x] = parts_aos[j].y_p; + z_p_tmp[threadIdx.x] = parts_aos[j].z_p; + h_tmp[threadIdx.x] = parts_aos[j].h; + mass_tmp[threadIdx.x] = parts_aos[j].mass; + ux_tmp[threadIdx.x] = parts_aos[j].ux; + uy_tmp[threadIdx.x] = parts_aos[j].uy; + uz_tmp[threadIdx.x] = parts_aos[j].uz; + timebin[threadIdx.x] = parts_aos[j].time_bin; + cj_tmp[threadIdx.x] = parts_aos[j].soundspeed; + alpha_tmp[threadIdx.x] = parts_aos[j].visc_alpha; + u_tmp[threadIdx.x] = parts_aos[j].u; + rho_tmp[threadIdx.x] = parts_aos[j].rho; + __syncthreads(); + for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) { + j = j_block + b; +// if ((j != pid) && (j < last_part_in_task_blocks) && +// timebin[j_block] != time_bin_inhibited) { +// if ((j < last_part_in_task_blocks) && +// timebin[j_block] != time_bin_inhibited) { + if (j < last_part_in_task_blocks) { + /* Compute the pairwise distance. */ + const float pjx = x_p_tmp[j_block] - cellx; + const float pjy = y_p_tmp[j_block] - celly; + const float pjz = z_p_tmp[j_block] - cellz; + const float xij = pix - pjx, yij = piy - pjy, zij = piz - pjz; + const float r2 = xij * xij + yij * yij + zij * zij; + if (r2 < hig2 && r2 > (0.01f/128.f)*(0.01f/128.f)) { + Found_neighbours=1; + const float r = sqrt(r2); + const float r_inv = 1.f / r; + /* Recover some data */ + const float mj = mass_tmp[j_block]; + /* Get the kernel for hi. */ + const float h_inv = 1.f / hi; + float wi, wi_dx; + /* Cosmology terms for the signal velocity */ + const float fac_mu = d_pow_three_gamma_minus_five_over_two(d_a); + const float a2_Hubble = d_a * d_a * d_H; + /* Compute dv dot r */ + float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block], + dvz = uzi - uz_tmp[j_block]; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + /* Add Hubble flow */ + const float dvdr_Hubble = dvdr + a2_Hubble * r2; + /* Are the particles moving towards each others ? */ + const float omega_ij = min(dvdr_Hubble, 0.f); + const float mu_ij = fac_mu * r_inv * omega_ij; /* This is 0 or negative */ + + /* Signal velocity */ + const float new_v_sig = ci + cj_tmp[j_block] - const_viscosity_beta * mu_ij; + /* Update if we need to */ + v_sig = max(v_sig, new_v_sig); + /* Calculate Del^2 u for the thermal diffusion coefficient. 
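The discrete estimate accumulated below is laplace_u += m_j (u_i - u_j) wi_dx / (r rho_j), a kernel-weighted sum of internal-energy differences over the neighbours.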
*/ + /* Need to get some kernel values F_ij = wi_dx */ + const float ui = r * h_inv; + d_kernel_deval(ui, &wi, &wi_dx); + + const float delta_u_factor = (u - u_tmp[j_block]) * r_inv; + laplace_u += mj * delta_u_factor * wi_dx / rho_tmp[j_block]; + + /* Set the maximal alpha from the previous step over the neighbours + * (this is used to limit the diffusion in hydro_prepare_force) */ + const float alpha_j = alpha_tmp[j_block]; + alpha_visc_max_ngb = max(alpha_visc_max_ngb, alpha_j); + + } + } + } + __syncthreads(); + } + if (pid < last_part_in_task_blocks) { + parts_aos[pid].v_sig = v_sig, parts_aos[pid].laplace_u = laplace_u; + parts_aos[pid].alpha_visc_max_ngb = alpha_visc_max_ngb; + } +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void DOSELF_GPU_AOS_F4_G( + struct part_aos_f4_g_send * __restrict__ parts_send, struct part_aos_f4_g_recv * __restrict__ parts_recv, + const float d_a, const float d_H, + const int bundle_first_task, const int2 * __restrict__ d_task_first_part_f4) { + + extern __shared__ float4 varsf4_g[]; + + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + +// __shared__ int first_part_in_task_blocks, last_part_in_task_blocks; + int2 first_last_parts = d_task_first_part_f4[task_id]; + int first_part_in_task_blocks = first_last_parts.x; + int last_part_in_task_blocks = first_last_parts.y; +// __syncthreads(); + const int pid = threadid + first_part_in_task_blocks; + + /*Keep this*/ + float v_sig = 0.f; + float alpha_visc_max_ngb = 0.f; + ///////////// + + struct part_aos_f4_g_send pi = parts_send[pid]; + float4 x_h_i = pi.x_h; + float4 ux_m_i = pi.ux_m; + float4 rho_avisc_u_c_i = pi.rho_avisc_u_c; + float3 vsig_lapu_aviscmax_i = {0.f, 0.f, 0.f}; + + const float hi = x_h_i.w, hig2 = hi * hi * kernel_gamma2; + + int n_neighbours = 0; + /*Here we use different pointers "x_p_tmp", etc. to point to different regions + * of the single shared memory space "vars" which we allocate in kernel + * invocation*/ + float4 * __restrict__ x_h_tmp = (float4 *)&varsf4_g[0]; + float4 * __restrict__ ux_m_tmp = (float4 *)&varsf4_g[BLOCK_SIZE]; + float4 * __restrict__ rho_avisc_u_c_tmp = (float4 *)&varsf4_g[BLOCK_SIZE * 2]; + + /*Particles copied in blocks to shared memory*/ + for (int b = first_part_in_task_blocks; b < last_part_in_task_blocks; + b += BLOCK_SIZE) { + + int j = b + threadIdx.x; + + struct part_aos_f4_g_send pj = parts_send[j]; + x_h_tmp[threadIdx.x] = pj.x_h; + ux_m_tmp[threadIdx.x] = pj.ux_m; + rho_avisc_u_c_tmp[threadIdx.x] = pj.rho_avisc_u_c; + + __syncthreads(); + for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) { + j = j_block + b; + if (j < last_part_in_task_blocks) { + float4 x_h_j = x_h_tmp[j_block]; + float4 ux_m_j = ux_m_tmp[j_block]; + float4 rho_avisc_u_c_j = rho_avisc_u_c_tmp[j_block]; + /* Compute the pairwise distance. */ + const float xij = x_h_i.x - x_h_j.x, yij = x_h_i.y - x_h_j.y, zij = x_h_i.z - x_h_j.z; + const float r2 = xij * xij + yij * yij + zij * zij; + + if (r2 < hig2 && r2 > (0.01f/128.f)*(0.01f/128.f)) { + const float r = sqrt(r2); + const float r_inv = 1.f / r; + /* Recover some data */ + const float mj = ux_m_j.w; + /* Get the kernel for hi. 
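The per-neighbour contributions below accumulate into the packed float3 vsig_lapu_aviscmax_i: .x holds the running maximum signal velocity, .y the Del^2 u sum and .z the largest neighbour alpha.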
*/ + const float h_inv = 1.f / hi; + float wi, wi_dx; + /* Cosmology terms for the signal velocity */ + const float fac_mu = d_pow_three_gamma_minus_five_over_two(d_a); + const float a2_Hubble = d_a * d_a * d_H; + /* Compute dv dot r */ + float dvx = ux_m_i.x - ux_m_j.x, dvy = ux_m_i.y - ux_m_j.y, + dvz = ux_m_i.z - ux_m_j.z; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + /* Add Hubble flow */ + const float dvdr_Hubble = dvdr + a2_Hubble * r2; + /* Are the particles moving towards each others ? */ + const float omega_ij = min(dvdr_Hubble, 0.f); + const float mu_ij = fac_mu * r_inv * omega_ij; /* This is 0 or negative */ + + /* Signal velocity */ + const float new_v_sig = rho_avisc_u_c_i.w + rho_avisc_u_c_j.w - const_viscosity_beta * mu_ij; + /* Update if we need to */ + vsig_lapu_aviscmax_i.x = fmaxf(vsig_lapu_aviscmax_i.x, new_v_sig); + /* Calculate Del^2 u for the thermal diffusion coefficient. */ + /* Need to get some kernel values F_ij = wi_dx */ + const float ui = r * h_inv; + d_kernel_deval(ui, &wi, &wi_dx); + + const float delta_u_factor = (rho_avisc_u_c_i.z - rho_avisc_u_c_j.z) * r_inv; + vsig_lapu_aviscmax_i.y += mj * delta_u_factor * wi_dx / rho_avisc_u_c_j.x; + + /* Set the maximal alpha from the previous step over the neighbours + * (this is used to limit the diffusion in hydro_prepare_force) */ + const float alpha_j = rho_avisc_u_c_j.y; + vsig_lapu_aviscmax_i.z = fmaxf(vsig_lapu_aviscmax_i.z, alpha_j); + + } + } + } + __syncthreads(); + } + if (pid < last_part_in_task_blocks) { +// printf("v %f lap %f maxvisc %f\n", vsig_lapu_aviscmax_empty_i.x, vsig_lapu_aviscmax_empty_i.y, vsig_lapu_aviscmax_empty_i.z); + parts_recv[pid].vsig_lapu_aviscmax = vsig_lapu_aviscmax_i; + } +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void DOSELF_GPU_AOS_F( + struct part_aos_f *parts_aos, int *d_task_first_part, int *d_task_last_part, float d_a, float d_H, + int count_tasks, int tasksperbundle, int nBlocks_per_task, + int bundle_first_task, int max_parts, double * d_cell_x, + double * d_cell_y, double * d_cell_z) { + extern __shared__ float varsf[]; + __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + int first_part_in_task_blocks, last_part_in_task_blocks; + first_part_in_task_blocks = d_task_first_part[task_id], + last_part_in_task_blocks = d_task_last_part[task_id]; + + const int pid = threadid + first_part_in_task_blocks; + + int ttid = 0; + int first_part = 0; + int count = 0; + int last_part = 0; + float cellx = 0.0, celly = 0.0, cellz = 0.0; + float ci = 0.0, cj = 0.0; + float hi = 0.0, hig2 = 0.0; + float mi = 0.0; + float uxi = 0.0; + float uyi = 0.0; + float uzi = 0.0; + float pix = 0.0; + float piy = 0.0; + float piz = 0.0; + float rhoi = 0.0; + float div_vi = 0.0; + int Found_neighbours = 0; + float v_sigi; + float ui = 0.f; + float u_dti = 0.f; + float laplace_ui = 0.0; + float alpha_visc_max_ngb = 0.0; + float pressurei = 0.0; + float alphavisci = 0.0; + float alphadiffi = 0.0; + float fi = 0.0; + float balsarai = 0.0; + float ahydroxi = 0.0; + float ahydroyi = 0.0; + float ahydrozi = 0.0; + float h_dti = 0.0; + int min_ngb_time_bin = 0; + if (pid < last_part_in_task_blocks) { + ttid = task_id; + first_part = d_task_first_part[ttid]; + last_part = d_task_last_part[ttid]; + count = last_part - first_part; + cellx = d_cell_x[ttid], celly = d_cell_y[ttid], + cellz = d_cell_z[ttid]; + hi = parts_aos[pid].h, hig2 = hi * 
hi * kernel_gamma2; + mi = parts_aos[pid].mass; + uxi = parts_aos[pid].ux; + uyi = parts_aos[pid].uy; + uzi = parts_aos[pid].uz; + pix = parts_aos[pid].x_p - cellx; + piy = parts_aos[pid].y_p - celly; + piz = parts_aos[pid].z_p - cellz; + ci = parts_aos[pid].soundspeed; + fi = parts_aos[pid].f; + v_sigi = parts_aos[pid].v_sig; + ui = parts_aos[pid].u; + rhoi = parts_aos[pid].rho; + pressurei = parts_aos[pid].pressure; + balsarai = parts_aos[pid].balsara; + alphavisci = parts_aos[pid].alpha_visc; + alphadiffi = parts_aos[pid].alpha_diff; + min_ngb_time_bin = parts_aos[pid].min_ngb_time_bin; +// laplace_u = parts_aos[pid].laplace_u; +// alpha_visc_max_ngb = parts_aos[pid].alpha_visc_max_ngb; + } +// if (threadIdx.x == 0) { +// first_part_tid_0 = first_part; +// last_part_tid_0 = last_part; +// } +// __syncthreads(); + int n_neighbours = 0; + /*Here we use different pointers "x_p_tmp", etc. to point to different regions + * of the single shared memory space "vars" which we allocate in kernel + * invocation*/ + float *x_p_tmp = (float *)&varsf[0]; + float *y_p_tmp = (float *)&varsf[BLOCK_SIZE]; + float *z_p_tmp = (float *)&varsf[BLOCK_SIZE * 2]; + float *h_tmp = (float *)&varsf[BLOCK_SIZE * 3]; + float *mass_tmp = (float *)&varsf[BLOCK_SIZE * 4]; + float *ux_tmp = (float *)&varsf[BLOCK_SIZE * 5]; + float *uy_tmp = (float *)&varsf[BLOCK_SIZE * 6]; + float *uz_tmp = (float *)&varsf[BLOCK_SIZE * 7]; + float *cj_tmp = (float *)&varsf[BLOCK_SIZE * 8]; + float *alphavisc_tmp = (float *)&varsf[BLOCK_SIZE * 9]; + float *alphadiff_tmp = (float *)&varsf[BLOCK_SIZE * 10]; + float *u_tmp = (float *)&varsf[BLOCK_SIZE * 11]; + float *rho_tmp = (float *)&varsf[BLOCK_SIZE * 12]; + float *pressure_tmp = (float *)&varsf[BLOCK_SIZE * 13]; + float *f_tmp = (float *)&varsf[BLOCK_SIZE * 14]; + float *balsara_tmp = (float *)&varsf[BLOCK_SIZE * 15]; + int *timebin = (int *)&varsf[BLOCK_SIZE * 16]; + /*Particles copied in blocks to shared memory*/ + for (int b = first_part_in_task_blocks; b < last_part_in_task_blocks; + b += BLOCK_SIZE) { + int j = b + threadIdx.x; + x_p_tmp[threadIdx.x] = parts_aos[j].x_p; + y_p_tmp[threadIdx.x] = parts_aos[j].y_p; + z_p_tmp[threadIdx.x] = parts_aos[j].z_p; + h_tmp[threadIdx.x] = parts_aos[j].h; + mass_tmp[threadIdx.x] = parts_aos[j].mass; + ux_tmp[threadIdx.x] = parts_aos[j].ux; + uy_tmp[threadIdx.x] = parts_aos[j].uy; + uz_tmp[threadIdx.x] = parts_aos[j].uz; + timebin[threadIdx.x] = parts_aos[j].time_bin; + cj_tmp[threadIdx.x] = parts_aos[j].soundspeed; +// alpha_tmp[threadIdx.x] = parts_aos[j].visc_alpha; + u_tmp[threadIdx.x] = parts_aos[j].u; + rho_tmp[threadIdx.x] = parts_aos[j].rho; + alphavisc_tmp[threadIdx.x] = parts_aos[j].alpha_visc; + alphadiff_tmp[threadIdx.x] = parts_aos[j].alpha_diff; + pressure_tmp[threadIdx.x] = parts_aos[j].pressure; + f_tmp[threadIdx.x] = parts_aos[j].f; + balsara_tmp[threadIdx.x] = parts_aos[j].balsara; + __syncthreads(); + for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) { + j = j_block + b; + if (j < last_part_in_task_blocks) { + /* Compute the pairwise distance. 
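Unlike the density loop, this force loop needs kernel gradients for both hi and hj together with both particles' pressures, sound speeds and limiter data, which is why so many more fields are staged in shared memory above.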
*/ + const float pjx = x_p_tmp[j_block] - cellx; + const float pjy = y_p_tmp[j_block] - celly; + const float pjz = z_p_tmp[j_block] - cellz; + const float xij = pix - pjx, yij = piy - pjy, zij = piz - pjz; + const float r2 = xij * xij + yij * yij + zij * zij; + if (r2 < hig2 && r2 > (0.01f/128.f)*(0.01f/128.f)) { + + // /* Cosmology terms for the signal velocity */ + const float fac_mu = d_pow_three_gamma_minus_five_over_two(d_a); + const float a2_Hubble = d_a * d_a * d_H; + const float r = sqrt(r2); + const float r_inv = 1.f / r; +// /* Recover some data */ + const float mj = mass_tmp[j_block]; +// /* Get the kernel for hi. */ + const float hi_inv = 1.f / hi; + const float hid_inv = d_pow_dimension_plus_one(hi_inv); /* 1/h^(d+1) */ + const float xi = r * hi_inv; + float wi, wi_dx; + d_kernel_deval(xi, &wi, &wi_dx); + const float wi_dr = hid_inv * wi_dx; + /* Get the kernel for hj. */ + const float hj = h_tmp[j_block]; + const float hj_inv = 1.0f / hj; + const float hjd_inv = d_pow_dimension_plus_one(hj_inv); /* 1/h^(d+1) */ + const float xj = r * hj_inv; + float wj, wj_dx; + d_kernel_deval(xj, &wj, &wj_dx); + const float wj_dr = hjd_inv * wj_dx; +// /* Compute dv dot r */ + float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block], + dvz = uzi - uz_tmp[j_block]; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; +// /* Add Hubble flow */ + const float dvdr_Hubble = dvdr + a2_Hubble * r2; +// /* Are the particles moving towards each others ? */ + const float omega_ij = min(dvdr_Hubble, 0.f); + const float mu_ij = fac_mu * r_inv * omega_ij; /* This is 0 or negative */ +// +// /* Signal velocity */ + const float v_sig = ci + cj_tmp[j_block] - const_viscosity_beta * mu_ij; + + /* Variable smoothing length term */ + const float f_ij = 1.f - fi / mj; + const float f_ji = 1.f - f_tmp[j_block] / mi; + + /* Balsara term */ + const float balsaraj = balsara_tmp[j_block]; + /* Construct the full viscosity term */ + const float rhoj = rho_tmp[j_block]; + const float pressurej = pressure_tmp[j_block]; + const float rho_ij = rhoi + rhoj; + const float alpha = alphavisci + alphavisc_tmp[j_block]; + const float visc = + -0.25f * alpha * v_sig * mu_ij * (balsarai + balsaraj) / rho_ij; + /* Convolve with the kernel */ + const float visc_acc_term = + 0.5f * visc * (wi_dr * f_ij + wj_dr * f_ji) * r_inv; + /* Compute gradient terms */ + const float P_over_rho2_i = pressurei / (rhoi * rhoi) * f_ij; + const float P_over_rho2_j = pressurej / (rhoj * rhoj) * f_ji; + + /* SPH acceleration term */ + const float sph_acc_term = + (P_over_rho2_i * wi_dr + P_over_rho2_j * wj_dr) * r_inv; + + /* Assemble the acceleration */ + const float acc = sph_acc_term + visc_acc_term; + /* Use the force Luke ! */ + ahydroxi -= mj * acc * xij; + ahydroyi -= mj * acc * yij; + ahydrozi -= mj * acc * zij; +// if(rhoi == 0 || rhoj == 0 || pressurei == 0 || pressurej == 0)printf("ri %f rj %f pi %f pj %f\n", rhoi, rhoj, pressurei, pressurej); + /* Get the time derivative for u. */ + const float sph_du_term_i = P_over_rho2_i * dvdr * r_inv * wi_dr; + + /* Viscosity term */ + const float visc_du_term = 0.5f * visc_acc_term * dvdr_Hubble; + const float press_sum = pressurei + pressurej; + /* Diffusion term */ + /* Combine the alpha_diff into a pressure-based switch -- this allows the + * alpha from the highest pressure particle to dominate, so that the + * diffusion limited particles always take precedence - another trick to + * allow the scheme to work with thermal feedback. 
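* Concretely, alpha_diff = (P_i alpha_diff_i + P_j alpha_diff_j) / (P_i + P_j), and it is zeroed when the pressure sum is vanishingly small.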
*/ + float alpha_diff = + (pressurei * alphadiffi + pressurej * alphadiff_tmp[j_block]) / + (press_sum); + if (fabsf(press_sum) < 1e-10) alpha_diff = 0.f; + const float v_diff = alpha_diff * 0.5f * + (sqrtf(2.f * fabsf(pressurei - pressurej) / rho_ij) + + fabsf(fac_mu * r_inv * dvdr_Hubble)); + /* wi_dx + wj_dx / 2 is F_ij */ + const float diff_du_term = + v_diff * (ui - u_tmp[j_block]) * (f_ij * wi_dr / rhoi + f_ji * wj_dr / rhoj); + + /* Assemble the energy equation term */ + const float du_dt_i = sph_du_term_i + visc_du_term + diff_du_term; + + /* Internal energy time derivative */ + u_dti += du_dt_i * mj; + if(mj == 0.f)printf("zero mass mj %f\n", mj); + + /* Get the time derivative for h. */ + h_dti -= mj * dvdr * r_inv / rhoj * wi_dr; + + /* Update if we need to; this should be guaranteed by the gradient loop but + * due to some possible synchronisation problems this is here as a _quick + * fix_. Added: 14th August 2019. To be removed by 1st Jan 2020. (JB) */ + v_sigi = max(v_sigi, v_sig); + int time_bin_j = timebin[j_block]; + if(time_bin_j > 0)min_ngb_time_bin = min(min_ngb_time_bin, time_bin_j); +// printf("Got in\n"); + } + } + } + __syncthreads(); + } + if (pid < last_part_in_task_blocks) { + parts_aos[pid].v_sig = v_sigi; + parts_aos[pid].h_dt = h_dti; + parts_aos[pid].u_dt = u_dti; + parts_aos[pid].a_hydrox = ahydroxi; + parts_aos[pid].a_hydroy = ahydroyi; + parts_aos[pid].a_hydroz = ahydrozi; + parts_aos[pid].min_ngb_time_bin = min_ngb_time_bin; + } +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void DOSELF_GPU_AOS_F4_F( + struct part_aos_f4_f_send * __restrict__ parts_send, struct part_aos_f4_f_recv * __restrict__ parts_recv, + const float d_a, const float d_H, + const int bundle_first_task, const int2 * __restrict__ d_task_first_part_f4) { + + extern __shared__ float4 varsf4_f[]; + + __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + int first_part_in_task_blocks, last_part_in_task_blocks; +// first_part_in_task_blocks = d_task_first_part[task_id], +// last_part_in_task_blocks = d_task_last_part[task_id]; + int2 first_last_parts = d_task_first_part_f4[task_id]; + first_part_in_task_blocks = first_last_parts.x; + last_part_in_task_blocks = first_last_parts.y; + + const int pid = threadid + first_part_in_task_blocks; + + int ttid = 0; + int first_part = 0; + int count = 0; + int last_part = 0; + const part_aos_f4_f_send pi = parts_send[pid]; + float4 x_h_i = pi.x_h; + float4 ux_m_i = pi.ux_m; + float4 f_b_t_mintbinngb_i = pi.f_bals_timebin_mintimebin_ngb; + float4 rho_p_c_vsig_i = pi.rho_p_c_vsigi; + float3 u_avisc_adiff_i = pi.u_alphavisc_alphadiff; + + const float mi = ux_m_i.w; + int Found_neighbours = 0; + float pressurei = rho_p_c_vsig_i.y; + const float ci = rho_p_c_vsig_i.z; + float3 ahydro = {0.0, 0.0, 0.0}; + float4 udt_hdt_vsig_mintbinngb = {0.0, 0.0, 0.0, 0.0}; + udt_hdt_vsig_mintbinngb.z = rho_p_c_vsig_i.w; + udt_hdt_vsig_mintbinngb.w = f_b_t_mintbinngb_i.w; + + float hi = x_h_i.w; + float hig2 = hi * hi * kernel_gamma2; + + /*Here we use different pointers "x_p_tmp", etc. 
to point to different regions + * of the single shared memory space "vars" which we allocate in kernel + * invocation*/ + float4 * __restrict__ x_h_tmp = (float4 *)&varsf4_f[0]; + float4 * __restrict__ ux_m_tmp = (float4 *)&varsf4_f[BLOCK_SIZE]; + float4 * __restrict__ f_b_t_mintbinngb_tmp = (float4 *)&varsf4_f[BLOCK_SIZE * 2]; + float4 * __restrict__ rho_p_c_vsig_tmp = (float4 *)&varsf4_f[BLOCK_SIZE * 3]; + float3 * __restrict__ u_avisc_adiff_tmp = (float3 *)&varsf4_f[BLOCK_SIZE * 4]; + /*Particles copied in blocks to shared memory*/ + for (int b = first_part_in_task_blocks; b < last_part_in_task_blocks; + b += BLOCK_SIZE) { + int j = b + threadIdx.x; + struct part_aos_f4_f_send pj = parts_send[j]; + x_h_tmp[threadIdx.x] = pj.x_h; + ux_m_tmp[threadIdx.x] = pj.ux_m; + f_b_t_mintbinngb_tmp[threadIdx.x] = pj.f_bals_timebin_mintimebin_ngb; + rho_p_c_vsig_tmp[threadIdx.x] = pj.rho_p_c_vsigi; +// alpha_tmp[threadIdx.x] = parts_aos[j].visc_alpha; + u_avisc_adiff_tmp[threadIdx.x] = pj.u_alphavisc_alphadiff; + __syncthreads(); + for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) { + j = j_block + b; + if (j < last_part_in_task_blocks) { + /* Compute the pairwise distance. */ + float4 x_h_j = x_h_tmp[j_block]; + float4 ux_m_j = ux_m_tmp[j_block]; + float4 f_b_t_mintbinngb_j = f_b_t_mintbinngb_tmp[j_block]; + float4 rho_p_c_vsig_j = rho_p_c_vsig_tmp[j_block]; + float3 u_avisc_adiff_j = u_avisc_adiff_tmp[j_block]; + const float xij = x_h_i.x - x_h_j.x, + yij = x_h_i.y - x_h_j.y, + zij = x_h_i.z - x_h_j.z; + const float r2 = xij * xij + yij * yij + zij * zij; + if (r2 < hig2 && r2 > (0.01f/128.f)*(0.01f/128.f)) { + // /* Cosmology terms for the signal velocity */ + const float fac_mu = d_pow_three_gamma_minus_five_over_two(d_a); + const float a2_Hubble = d_a * d_a * d_H; + const float r = sqrt(r2); + const float r_inv = 1.f / r; +// /* Recover some data */ + const float mj = ux_m_j.w; +// /* Get the kernel for hi. */ + const float hi_inv = 1.f / hi; + const float hid_inv = d_pow_dimension_plus_one(hi_inv); /* 1/h^(d+1) */ + const float xi = r * hi_inv; + float wi, wi_dx; + d_kernel_deval(xi, &wi, &wi_dx); + const float wi_dr = hid_inv * wi_dx; + /* Get the kernel for hj. */ + const float hj = x_h_j.w; + const float hj_inv = 1.0f / hj; + const float hjd_inv = d_pow_dimension_plus_one(hj_inv); /* 1/h^(d+1) */ + const float xj = r * hj_inv; + float wj, wj_dx; + d_kernel_deval(xj, &wj, &wj_dx); + const float wj_dr = hjd_inv * wj_dx; +// /* Compute dv dot r */ + float dvx = ux_m_i.x - ux_m_j.x, dvy = ux_m_i.y - ux_m_j.y, + dvz = ux_m_i.z - ux_m_j.z; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; +// /* Add Hubble flow */ + const float dvdr_Hubble = dvdr + a2_Hubble * r2; +// /* Are the particles moving towards each others ? 
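min(dvdr_Hubble, 0) keeps only approaching pairs, so mu_ij is zero or negative and the viscosity term switches on only in compression.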
*/ + const float omega_ij = min(dvdr_Hubble, 0.f); + const float mu_ij = fac_mu * r_inv * omega_ij; /* This is 0 or negative */ +// +// /* Signal velocity */ + const float cj = rho_p_c_vsig_j.z; + const float v_sig = ci + cj - const_viscosity_beta * mu_ij; + + /* Variable smoothing length term */ + const float f_ij = 1.f - f_b_t_mintbinngb_i.x / mj; + const float f_ji = 1.f - f_b_t_mintbinngb_j.x / mi; + + /* Construct the full viscosity term */ + const float pressurej = rho_p_c_vsig_j.y; + const float rho_ij = rho_p_c_vsig_i.x + rho_p_c_vsig_j.x; + const float alpha = u_avisc_adiff_i.y + u_avisc_adiff_j.y; + const float visc = + -0.25f * alpha * v_sig * mu_ij * (f_b_t_mintbinngb_i.y + f_b_t_mintbinngb_j.y) / rho_ij; + /* Convolve with the kernel */ + const float visc_acc_term = + 0.5f * visc * (wi_dr * f_ij + wj_dr * f_ji) * r_inv; + /* Compute gradient terms */ + const float rhoi2 = rho_p_c_vsig_i.x * rho_p_c_vsig_i.x; + const float rhoj2 = rho_p_c_vsig_j.x * rho_p_c_vsig_j.x; + const float P_over_rho2_i = pressurei / (rhoi2) * f_ij; + const float P_over_rho2_j = pressurej / (rhoj2) * f_ji; + + /* SPH acceleration term */ + const float sph_acc_term = + (P_over_rho2_i * wi_dr + P_over_rho2_j * wj_dr) * r_inv; + + /* Assemble the acceleration */ + const float acc = sph_acc_term + visc_acc_term; + /* Use the force Luke ! */ + ahydro.x -= mj * acc * xij; + ahydro.y -= mj * acc * yij; + ahydro.z -= mj * acc * zij; + /* Get the time derivative for u. */ + const float sph_du_term_i = P_over_rho2_i * dvdr * r_inv * wi_dr; + + /* Viscosity term */ + const float visc_du_term = 0.5f * visc_acc_term * dvdr_Hubble; + /* Diffusion term */ + /* Combine the alpha_diff into a pressure-based switch -- this allows the + * alpha from the highest pressure particle to dominate, so that the + * diffusion limited particles always take precedence - another trick to + * allow the scheme to work with thermal feedback. */ + float alpha_diff = + (pressurei * u_avisc_adiff_i.z + pressurej * u_avisc_adiff_j.z) / + (pressurei + pressurej); + if (fabsf(pressurei + pressurej) < 1e-10) alpha_diff = 0.f; + const float v_diff = alpha_diff * 0.5f * + (sqrtf(2.f * fabsf(pressurei - pressurej) / rho_ij) + + fabsf(fac_mu * r_inv * dvdr_Hubble)); + /* wi_dx + wj_dx / 2 is F_ij */ + const float diff_du_term = + v_diff * (u_avisc_adiff_i.x - u_avisc_adiff_j.x) * + (f_ij * wi_dr / rho_p_c_vsig_i.x + f_ji * wj_dr / rho_p_c_vsig_j.x); + + /* Assemble the energy equation term */ + const float du_dt_i = sph_du_term_i + visc_du_term + diff_du_term; + + /* Internal energy time derivative */ + udt_hdt_vsig_mintbinngb.x += du_dt_i * mj; + + /* Get the time derivative for h. */ + udt_hdt_vsig_mintbinngb.y -= mj * dvdr * r_inv / rho_p_c_vsig_j.x * wi_dr; + + /* Update if we need to; this should be guaranteed by the gradient loop but + * due to some possible synchronisation problems this is here as a _quick + * fix_. Added: 14th August 2019. To be removed by 1st Jan 2020. 
(JB) */ + udt_hdt_vsig_mintbinngb.z = fmaxf(udt_hdt_vsig_mintbinngb.z, v_sig); + unsigned int time_bin_j = (f_b_t_mintbinngb_j.z + 0.5f); + unsigned int min_tb_i = (f_b_t_mintbinngb_i.w + 0.5f); + if(time_bin_j > 0)f_b_t_mintbinngb_i.w = + min(min_tb_i, time_bin_j); +// printf("Got in\n"); + } + } + } + __syncthreads(); + } + if (pid < last_part_in_task_blocks) { + udt_hdt_vsig_mintbinngb.w = f_b_t_mintbinngb_i.w; + parts_recv[pid].udt_hdt_vsig_mintimebin_ngb = udt_hdt_vsig_mintbinngb; + parts_recv[pid].a_hydro = ahydro; + } +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void runner_do_pair_density_GPU_naive( + struct part_soa parts_soa_ci, struct part_soa parts_soa_cj, int *d_task_first_part_ci, int *d_task_first_part_cj, + int *d_task_last_part_ci, int *d_task_last_part_cj, float d_a, float d_H, int bid, int tid, int count_tasks, + int tasksperbundle, int nBlocks_per_task, int bundle_first_task, int time_bin_inhibited) { + + extern __shared__ float vars[]; +// __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); + __shared__ int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; + __shared__ int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; + + first_part_in_task_blocks_ci = d_task_first_part_ci[task_id]; + last_part_in_task_blocks_ci = d_task_last_part_ci[task_id]; + first_part_in_task_blocks_cj = d_task_first_part_cj[task_id]; + last_part_in_task_blocks_cj = d_task_last_part_cj[task_id]; + + __syncthreads(); + // Now we start calculations for particles in cell i + const int pid = threadid + first_part_in_task_blocks_ci; + + float dx = 1.f/64.f; //Value used to avoid interacting parts with themselves + int ttid = 0; + int first_part = 0; + int count = 0; + int last_part = 0; + float cellx = 0.0, celly = 0.0, cellz = 0.0; + float hi = 0.0, hig2 = hi * hi * kernel_gamma2; + float mi = 0.0; + float uxi = 0.0; + float uyi = 0.0; + float uzi = 0.0; + float pix = 0.0; + float piy = 0.0; + float piz = 0.0; + float rhoi = 0.0; + float rho_dhi = 0.0; + float wcounti = 0.0; + float wcount_dhi = 0.0; + float div_vi = 0.0; + float rot_uxi = 0.0; + float rot_uyi = 0.0; + float rot_uzi = 0.0; + int Found_neighbours = 0; + // if(pid (0.01f/128.f)*(0.01f/128.f)) { + if (r2 < hig2 && r2 > (0.01f/dx)*(0.01f/dx)) { + Found_neighbours=1; + const float r = sqrt(r2); + /* Recover some data */ + const float mj = mass_tmp[j_block]; + /* Get the kernel for hi. 
*/ + if(hi<1.f/dx)printf("h < dx\n"); +// if(hi<1.f/256.f)printf("h < dx\n"); + const float h_inv = 1.f / hi; + const float ui = r * h_inv; + float wi, wi_dx; + + d_kernel_deval(ui, &wi, &wi_dx); + + rhoi += mj * wi; + rho_dhi -= mj * (hydro_dimension * wi + ui * wi_dx); + + wcounti += wi; + wcount_dhi -= (hydro_dimension * wi + ui * wi_dx); + + const float r_inv = 1.f / r; + const float faci = mj * wi_dx * r_inv; + + /* Compute dv dot r */ + float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block], + dvz = uzi - uz_tmp[j_block]; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + + div_vi -= faci * dvdr; + + /* Compute dv cross r */ + float curlvrx = dvy * zij - dvz * yij; + float curlvry = dvz * xij - dvx * zij; + float curlvrz = dvx * yij - dvy * xij; + + rot_uxi += faci * curlvrx; + rot_uyi += faci * curlvry; + rot_uzi += faci * curlvrz; + } + } + } + __syncthreads(); + } + if (pid < last_part_in_task_blocks_ci) { + parts_soa_ci.rho[pid] = rhoi, parts_soa_ci.rho_dh[pid] = rho_dhi; + parts_soa_ci.wcount[pid] = wcounti, parts_soa_ci.wcount_dh[pid] = wcount_dhi; + parts_soa_ci.div_v[pid] = div_vi; + parts_soa_ci.rot_ux[pid] = rot_uxi, parts_soa_ci.rot_uy[pid] = rot_uyi; + parts_soa_ci.rot_uz[pid] = rot_uzi; + } +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void launch_density_pair_two_kernels(struct part_soa parts_soa_ci, struct part_soa parts_soa_cj, int *d_task_first_part_ci, + int *d_task_first_part_cj, int *d_task_last_part_ci, int *d_task_last_part_cj, float d_a, float d_H, + const char *loop_type, cudaStream_t stream, int bid, int block_size, int count_tasks, int tasksperbundle, + int max_parts_i, int max_parts_j, int numBlocks_y, int tid, int offset, int bundle_first_task, int time_bin_inhibited) { + + + int max_parts = max(max_parts_j, max_parts_i); + int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + + /*Do ci*/ + runner_do_pair_density_GPU_naive<<>>( + parts_soa_ci, parts_soa_cj, d_task_first_part_ci, d_task_first_part_cj, d_task_last_part_ci, + d_task_last_part_cj, d_a, d_H, bid, tid, count_tasks, tasksperbundle, + nBlocks_per_task, bundle_first_task, time_bin_inhibited); + +// numBlocks_x = (max_parts_i + BLOCK_SIZE - 1) / BLOCK_SIZE; +// gridShape = dim3(numBlocks_x, numBlocks_y); +// nBlocks_per_task = numBlocks_x; + /*Now do cj*/ + runner_do_pair_density_GPU_naive<<>>( + parts_soa_cj, parts_soa_ci, d_task_first_part_cj, d_task_first_part_ci, d_task_last_part_cj, + d_task_last_part_ci, d_a, d_H, bid, tid, count_tasks, tasksperbundle, + nBlocks_per_task, bundle_first_task, time_bin_inhibited); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__device__ void DOPAIRGPU( + struct part_soa parts_soa, int pid, + int last_part_in_task_blocks_ci, int first_part_in_task_blocks_cj, + int last_part_in_task_blocks_cj, float d_a, float d_H, + int time_bin_inhibited, float *vars) { + + float dx = 1.f/64.f; //Value used to avoid interacting parts with themselves + + float cellx = 0.0, celly = 0.0, cellz = 0.0; + float hi = 0.0, hig2 = hi * hi * kernel_gamma2; + float mi = 0.0; + float uxi = 0.0; + float uyi = 0.0; + float uzi = 0.0; + float pix = 0.0; + float piy = 0.0; + float piz = 0.0; + float rhoi = 0.0; + float rho_dhi = 0.0; + float wcounti = 0.0; + float wcount_dhi = 0.0; + float div_vi = 0.0; + float rot_uxi = 0.0; + float rot_uyi = 0.0; + float rot_uzi = 0.0; + int Found_neighbours = 0; + + if (pid < 
last_part_in_task_blocks_ci) { + cellx = parts_soa.locx[pid], celly = parts_soa.locy[pid], + cellz = parts_soa.locz[pid]; + hi = parts_soa.h[pid], hig2 = hi * hi * kernel_gamma2; + mi = parts_soa.mass[pid]; + uxi = parts_soa.ux[pid]; + uyi = parts_soa.uy[pid]; + uzi = parts_soa.uz[pid]; + pix = parts_soa.x_p[pid] - cellx; + piy = parts_soa.y_p[pid] - celly; + piz = parts_soa.z_p[pid] - cellz; + } + + int n_neighbours = 0; + /*Here we use different pointers "x_p_tmp", etc. to point to different regions + * of the single shared memory space "vars" which we allocate in kernel + * invocation*/ + float *x_p_tmp = (float *)&vars[0]; + float *y_p_tmp = (float *)&vars[BLOCK_SIZE]; + float *z_p_tmp = (float *)&vars[BLOCK_SIZE * 2]; + float *h_tmp = (float *)&vars[BLOCK_SIZE * 3]; + float *mass_tmp = (float *)&vars[BLOCK_SIZE * 4]; + float *ux_tmp = (float *)&vars[BLOCK_SIZE * 5]; + float *uy_tmp = (float *)&vars[BLOCK_SIZE * 6]; + float *uz_tmp = (float *)&vars[BLOCK_SIZE * 7]; + timebin_t *timebin = (timebin_t *)&uz_tmp[BLOCK_SIZE]; + /*Particles copied in blocks to shared memory*/ + for (int b = first_part_in_task_blocks_cj; b < last_part_in_task_blocks_cj; + b += BLOCK_SIZE) { + int j = b + threadIdx.x; + x_p_tmp[threadIdx.x] = parts_soa.x_p[j]; + y_p_tmp[threadIdx.x] = parts_soa.y_p[j]; + z_p_tmp[threadIdx.x] = parts_soa.z_p[j]; + h_tmp[threadIdx.x] = parts_soa.h[j]; + mass_tmp[threadIdx.x] = parts_soa.mass[j]; + ux_tmp[threadIdx.x] = parts_soa.ux[j]; + uy_tmp[threadIdx.x] = parts_soa.uy[j]; + uz_tmp[threadIdx.x] = parts_soa.uz[j]; + timebin[threadIdx.x] = parts_soa.time_bin[j]; + __syncthreads(); + for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) { + j = j_block + b; + if (j < last_part_in_task_blocks_cj) { + /* Compute the pairwise distance. */ + const float pjx = x_p_tmp[j_block] - cellx; + const float pjy = y_p_tmp[j_block] - celly; + const float pjz = z_p_tmp[j_block] - cellz; + const float xij = pix - pjx, yij = piy - pjy, zij = piz - pjz; + const float r2 = xij * xij + yij * yij + zij * zij; + + if (r2 < hig2 && r2 > (0.01f/dx)*(0.01f/dx)) { + Found_neighbours=1; + const float r = sqrt(r2); + /* Recover some data */ + const float mj = mass_tmp[j_block]; + /* Get the kernel for hi. 
*/ + if(hi<1.f/dx)printf("h < dx\n"); + const float h_inv = 1.f / hi; + const float ui = r * h_inv; + float wi, wi_dx; + + d_kernel_deval(ui, &wi, &wi_dx); + + rhoi += mj * wi; + rho_dhi -= mj * (hydro_dimension * wi + ui * wi_dx); + + wcounti += wi; + wcount_dhi -= (hydro_dimension * wi + ui * wi_dx); + + const float r_inv = 1.f / r; + const float faci = mj * wi_dx * r_inv; + + /* Compute dv dot r */ + float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block], + dvz = uzi - uz_tmp[j_block]; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + + div_vi -= faci * dvdr; + + /* Compute dv cross r */ + float curlvrx = dvy * zij - dvz * yij; + float curlvry = dvz * xij - dvx * zij; + float curlvrz = dvx * yij - dvy * xij; + + rot_uxi += faci * curlvrx; + rot_uyi += faci * curlvry; + rot_uzi += faci * curlvrz; + } + } + } + __syncthreads(); + } + if (pid < last_part_in_task_blocks_ci) { + parts_soa.rho[pid] = rhoi, parts_soa.rho_dh[pid] = rho_dhi; + parts_soa.wcount[pid] = wcounti, parts_soa.wcount_dh[pid] = wcount_dhi; + parts_soa.div_v[pid] = div_vi; + parts_soa.rot_ux[pid] = rot_uxi, parts_soa.rot_uy[pid] = rot_uyi; + parts_soa.rot_uz[pid] = rot_uzi; + } + +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__device__ void DOPAIR2NONSYMGPU( + struct part_soa parts_soa, int pid, const int ci_start, + const int ci_end, const int cj_start, + const int cj_end, float d_a, float d_H, + float *vars_pair, double *d_shift_x, double *d_shift_y, double *d_shift_z, const int task_id_tmp, int flip_order) { + + float dx = 1.f/64.f; //Value used to avoid interacting parts with themselves + + float hi = 0.0, hig2 = hi * hi * kernel_gamma2; + + float mi = 0.0; + float uxi = 0.0; + float uyi = 0.0; + float uzi = 0.0; + float pix = 0.0; + float piy = 0.0; + float piz = 0.0; + float rhoi = 0.0; + float rho_dhi = 0.0; + float wcounti = 0.0; + float wcount_dhi = 0.0; + float div_vi = 0.0; + float rot_uxi = 0.0; + float rot_uyi = 0.0; + float rot_uzi = 0.0; + int Found_neighbours = 0; + int count_i = cj_start; +// printf("first_part_in_task_blocks_cj %i last_part_in_task_blocks_cj %i last_part_in_task_blocks_ci %i\n", +// first_part_in_task_blocks_cj, last_part_in_task_blocks_cj, last_part_in_task_blocks_ci); + if (pid < ci_end) { + hi = parts_soa.h[pid], hig2 = hi * hi * kernel_gamma2; + mi = parts_soa.mass[pid]; + uxi = parts_soa.ux[pid]; + uyi = parts_soa.uy[pid]; + uzi = parts_soa.uz[pid]; + pix = parts_soa.x_p[pid] - d_shift_x[task_id_tmp]; + piy = parts_soa.y_p[pid] - d_shift_y[task_id_tmp]; + piz = parts_soa.z_p[pid] - d_shift_z[task_id_tmp]; + } + + /*Here we use different pointers "x_p_tmp", etc. 
to point to different regions + * of the single shared memory space "vars" which we allocate in kernel + * invocation*/ + float *x_p_tmp = (float *)&vars_pair[0]; + float *y_p_tmp = (float *)&x_p_tmp[BLOCK_SIZE]; + float *z_p_tmp = (float *)&y_p_tmp[BLOCK_SIZE]; + float *h_tmp = (float *)&z_p_tmp[BLOCK_SIZE]; + float *mass_tmp = (float *)&h_tmp[BLOCK_SIZE]; + float *ux_tmp = (float *)&mass_tmp[BLOCK_SIZE]; + float *uy_tmp = (float *)&ux_tmp[BLOCK_SIZE]; + float *uz_tmp = (float *)&uy_tmp[BLOCK_SIZE]; + timebin_t *timebin = (timebin_t *)&uz_tmp[BLOCK_SIZE]; + + /*Particles copied in blocks to shared memory*/ + for (int b = cj_start; b < cj_end; + b += BLOCK_SIZE) { + const int tid_x = threadIdx.x; + int j = b + tid_x; + x_p_tmp[tid_x] = parts_soa.x_p[j]; + y_p_tmp[tid_x] = parts_soa.y_p[j]; + z_p_tmp[tid_x] = parts_soa.z_p[j]; +// h_tmp[tid_x] = parts_soa.h[j]; + mass_tmp[tid_x] = parts_soa.mass[j]; + ux_tmp[tid_x] = parts_soa.ux[j]; + uy_tmp[tid_x] = parts_soa.uy[j]; + uz_tmp[tid_x] = parts_soa.uz[j]; + timebin[tid_x] = parts_soa.time_bin[j]; + + __syncthreads(); + const float shift_x_j = d_shift_x[task_id_tmp + flip_order]; + const float shift_y_j = d_shift_y[task_id_tmp + flip_order]; + const float shift_z_j = d_shift_z[task_id_tmp + flip_order]; + /*j_block is the particle's index in the block. Loop through particles in shared memory one by one*/ + for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) { + int jj = b + j_block; + if (jj < cj_end && pid < ci_end && pid >= ci_start) { + + const float pjx = x_p_tmp[j_block] - shift_x_j; + const float pjy = y_p_tmp[j_block] - shift_y_j; + const float pjz = z_p_tmp[j_block] - shift_z_j; + + const float xij = pix - pjx, yij = piy - pjy, zij = piz - pjz; +// const float xij = (pix - pjx) * flip_order, yij = (piy - pjy) * flip_order, zij = (piz - pjz) * flip_order; + const float r2 = xij * xij + yij * yij + zij * zij; + if (r2 < hig2) { + /* Recover some data */ + const float mj = mass_tmp[j_block]; + const float r = sqrt(r2); + /* Get the kernel for hi. 
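+           * The r2 < hig2 test just above is the neighbour criterion: hig2 is
+           * (kernel_gamma * hi)^2, the square of the kernel's compact-support
+           * radius in SWIFT's convention, so only particles within gamma * h of
+           * particle i contribute. This non-symmetric routine only updates
+           * particle i; the reverse contributions are obtained by calling it a
+           * second time with the ci and cj ranges (and the shift index) swapped,
+           * as the runner kernels further down do.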
*/ + const float h_inv = 1.f / hi; + const float ui = r * h_inv; + float wi, wi_dx; + + d_kernel_deval(ui, &wi, &wi_dx); + + rhoi += mj * wi; + rho_dhi -= mj * (hydro_dimension * wi + ui * wi_dx); + + wcounti += wi; + wcount_dhi -= (hydro_dimension * wi + ui * wi_dx); + + const float r_inv = 1.f / r; + const float faci = mj * wi_dx * r_inv; + /* Compute dv dot r */ + const float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block], + dvz = uzi - uz_tmp[j_block]; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + /* Compute dv cross r */ + const float curlvrx = dvy * zij - dvz * yij; + const float curlvry = dvz * xij - dvx * zij; + const float curlvrz = dvx * yij - dvy * xij; + + div_vi -= faci * dvdr; + + rot_uxi += faci * curlvrx; + rot_uyi += faci * curlvry; + rot_uzi += faci * curlvrz; + } + } /*if (jj < cj_end && pid < ci_end && pid >= ci_start)*/ + } /*End of looping through particles in shared memory---Shared arrays zero'ed for next step in outer loop*/ + __syncthreads(); + } /*Loop through parts in cell j one BLOCK_SIZE at a time*/ + if (pid >= ci_start && pid < ci_end) { + parts_soa.rho[pid] = rhoi, parts_soa.rho_dh[pid] = rho_dhi; + parts_soa.wcount[pid] = wcounti, parts_soa.wcount_dh[pid] = wcount_dhi; + parts_soa.div_v[pid] = div_vi; + parts_soa.rot_ux[pid] = rot_uxi, parts_soa.rot_uy[pid] = rot_uyi; + parts_soa.rot_uz[pid] = rot_uzi; + } +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__device__ void DOPAIR2NONSYMGPUAOS( + struct part_aos *parts_aos, int pid, const int ci_start, + const int ci_end, const int cj_start, + const int cj_end, float d_a, float d_H, + float *vars_pair_aos, double *d_shift_x, double *d_shift_y, double *d_shift_z, const int task_id_tmp, int flip_order) { + + float dx = 1.f/64.f; //Value used to avoid interacting parts with themselves + + float hi = 0.0, hig2 = 0.0; + + float mi = 0.0; + float uxi = 0.0; + float uyi = 0.0; + float uzi = 0.0; + float pix = 0.0; + float piy = 0.0; + float piz = 0.0; + float rhoi = 0.0; + float rho_dhi = 0.0; + float wcounti = 0.0; + float wcount_dhi = 0.0; + float div_vi = 0.0; + float rot_uxi = 0.0; + float rot_uyi = 0.0; + float rot_uzi = 0.0; + int Found_neighbours = 0; + int count_i = cj_start; +// printf("first_part_in_task_blocks_cj %i last_part_in_task_blocks_cj %i last_part_in_task_blocks_ci %i\n", +// first_part_in_task_blocks_cj, last_part_in_task_blocks_cj, last_part_in_task_blocks_ci); + if (pid < ci_end) { + hi = parts_aos[pid].h, hig2 = hi * hi * kernel_gamma2; + mi = parts_aos[pid].mass; + uxi = parts_aos[pid].ux; + uyi = parts_aos[pid].uy; + uzi = parts_aos[pid].uz; + pix = parts_aos[pid].x_p;// - d_shift_x[task_id_tmp]; + piy = parts_aos[pid].y_p;// - d_shift_y[task_id_tmp]; + piz = parts_aos[pid].z_p;// - d_shift_z[task_id_tmp]; + } + + /*Here we use different pointers "x_p_tmp", etc. 
to point to different regions + * of the single shared memory space "vars" which we allocate in kernel + * invocation*/ + float *x_p_tmp = (float *)&vars_pair_aos[0]; + float *y_p_tmp = (float *)&x_p_tmp[BLOCK_SIZE]; + float *z_p_tmp = (float *)&y_p_tmp[BLOCK_SIZE]; + float *h_tmp = (float *)&z_p_tmp[BLOCK_SIZE]; + float *mass_tmp = (float *)&h_tmp[BLOCK_SIZE]; + float *ux_tmp = (float *)&mass_tmp[BLOCK_SIZE]; + float *uy_tmp = (float *)&ux_tmp[BLOCK_SIZE]; + float *uz_tmp = (float *)&uy_tmp[BLOCK_SIZE]; + int *timebin = (int *)&uz_tmp[BLOCK_SIZE]; + + /*Particles copied in blocks to shared memory*/ + for (int b = cj_start; b < cj_end; + b += BLOCK_SIZE) { + const int tid_x = threadIdx.x; + int j = b + tid_x; + x_p_tmp[tid_x] = parts_aos[j].x_p; + y_p_tmp[tid_x] = parts_aos[j].y_p; + z_p_tmp[tid_x] = parts_aos[j].z_p; +// h_tmp[tid_x] = parts_aos[j].h; + mass_tmp[tid_x] = parts_aos[j].mass; + ux_tmp[tid_x] = parts_aos[j].ux; + uy_tmp[tid_x] = parts_aos[j].uy; + uz_tmp[tid_x] = parts_aos[j].uz; + timebin[tid_x] = parts_aos[j].time_bin; +// const float shift_x_j = d_shift_x[task_id_tmp + flip_order]; +// const float shift_y_j = d_shift_y[task_id_tmp + flip_order]; +// const float shift_z_j = d_shift_z[task_id_tmp + flip_order]; + __syncthreads(); + /*j_block is the particle's index in the block. Loop through particles in shared memory one by one*/ + for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) { + int jj = b + j_block; + if (jj < cj_end && pid < ci_end && pid >= ci_start) { + + const float pjx = x_p_tmp[j_block];// - shift_x_j; + const float pjy = y_p_tmp[j_block];// - shift_y_j; + const float pjz = z_p_tmp[j_block];// - shift_z_j; + + const float xij = pix - pjx, yij = piy - pjy, zij = piz - pjz; +// const float xij = (pix - pjx) * flip_order, yij = (piy - pjy) * flip_order, zij = (piz - pjz) * flip_order; + const float r2 = xij * xij + yij * yij + zij * zij; + if (r2 < hig2) { + /* Recover some data */ + const float mj = mass_tmp[j_block]; + const float r = sqrt(r2); + /* Get the kernel for hi. 
*/ + const float h_inv = 1.f / hi; + const float ui = r * h_inv; + float wi, wi_dx; + + d_kernel_deval(ui, &wi, &wi_dx); + + rhoi += mj * wi; + rho_dhi -= mj * (hydro_dimension * wi + ui * wi_dx); + + wcounti += wi; + wcount_dhi -= (hydro_dimension * wi + ui * wi_dx); + + const float r_inv = 1.f / r; + const float faci = mj * wi_dx * r_inv; + /* Compute dv dot r */ + const float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block], + dvz = uzi - uz_tmp[j_block]; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + /* Compute dv cross r */ + const float curlvrx = dvy * zij - dvz * yij; + const float curlvry = dvz * xij - dvx * zij; + const float curlvrz = dvx * yij - dvy * xij; + + div_vi -= faci * dvdr; + + rot_uxi += faci * curlvrx; + rot_uyi += faci * curlvry; + rot_uzi += faci * curlvrz; +// if(timebin[j_block] != 1000 && timebin[j_block] != 20)printf("incorrect timebin %i\n", timebin[j_block]); + } + } /*if (jj < cj_end && pid < ci_end && pid >= ci_start)*/ + } /*End of looping through particles in shared memory---Shared arrays zero'ed for next step in outer loop*/ + __syncthreads(); + } /*Loop through parts in cell j one BLOCK_SIZE at a time*/ + if (pid >= ci_start && pid < ci_end) { +// printf("timebin %i\n", parts_aos[pid].time_bin); + parts_aos[pid].rho = rhoi, parts_aos[pid].rho_dh = rho_dhi; + parts_aos[pid].wcount = wcounti, parts_aos[pid].wcount_dh = wcount_dhi; + parts_aos[pid].div_v = div_vi; + parts_aos[pid].rot_ux = rot_uxi, parts_aos[pid].rot_uy = rot_uyi; + parts_aos[pid].rot_uz = rot_uzi; + parts_aos[pid].time_bin = 20; + } +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__device__ void DOPAIR2NONSYMGPUAOSF4( + struct part_aos_f4_send * __restrict__ parts_send, struct part_aos_f4_recv * __restrict__ parts_recv, int pid, + const int ci_start, const int ci_end, const int cj_start, const int cj_end, float d_a, float d_H, float4 *vars_pair_aos_f4) { + + float dx = 1.f/64.f; //Value used to avoid interacting parts with themselves + + float hi = 0.0, hig2 = 0.0; + + int Found_neighbours = 0; + int count_i = cj_start; + + float4 res_rho = {0.0, 0.0, 0.0, 0.0}; + float4 res_rot = {0.0, 0.0, 0.0, 0.0}; + const part_aos_f4_send pi = parts_send[pid]; + const float4 x_pi = pi.x_p_h; + const float4 ux_pi = pi.ux_m; +// printf("first_part_in_task_blocks_cj %i last_part_in_task_blocks_cj %i last_part_in_task_blocks_ci %i\n", +// first_part_in_task_blocks_cj, last_part_in_task_blocks_cj, last_part_in_task_blocks_ci); +// if (pid < ci_end) { + hi = x_pi.w, hig2 = hi * hi * kernel_gamma2; +// } + + /*Here we use different pointers "x_p_tmp", etc. to point to different regions + * of the single shared memory space "vars" which we allocate in kernel + * invocation*/ + float4 * __restrict__ x_p_h_tmp = (float4 *)&vars_pair_aos_f4[0]; + float4 * __restrict__ ux_m_tmp = (float4 *)&vars_pair_aos_f4[BLOCK_SIZE]; + + /*Particles copied in blocks to shared memory*/ + for (int b = cj_start; b < cj_end; + b += BLOCK_SIZE) { + const int tid_x = threadIdx.x; + int j = b + tid_x; + struct part_aos_f4_send pj = parts_send[j]; + x_p_h_tmp[tid_x] = pj.x_p_h; + ux_m_tmp[tid_x] = pj.ux_m; + __syncthreads(); + /*j_block is the particle's index in the block. 
Loop through particles in shared memory one by one*/ + for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) { + int jj = b + j_block; + if (jj < cj_end && pid < ci_end && pid >= ci_start) { + + const float4 x_p_h_j = x_p_h_tmp[j_block]; + const float4 ux_m_j = ux_m_tmp[j_block]; + + const float xij = x_pi.x - x_p_h_j.x, yij = x_pi.y - x_p_h_j.y, + zij = x_pi.z - x_p_h_j.z; + const float r2 = xij * xij + yij * yij + zij * zij; + if (r2 < hig2) { + /* Recover some data */ + const float mj = ux_m_j.w; + const float r = sqrt(r2); + /* Get the kernel for hi. */ + const float h_inv = 1.f / hi; + const float ui = r * h_inv; + float wi, wi_dx; + + d_kernel_deval(ui, &wi, &wi_dx); + /*Add to sums of rho, rho_dh, wcount and wcount_dh*/ + res_rho.x += mj * wi; + res_rho.y -= mj * (hydro_dimension * wi + ui * wi_dx); + res_rho.z += wi; + res_rho.w -= (hydro_dimension * wi + ui * wi_dx); + + const float r_inv = 1.f / r; + const float faci = mj * wi_dx * r_inv; + /* Compute dv dot r */ + const float dvx = ux_pi.x - ux_m_j.x, dvy = ux_pi.y - ux_m_j.y, + dvz = ux_pi.z - ux_m_j.z; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + /* Compute dv cross r */ + const float curlvrx = dvy * zij - dvz * yij; + const float curlvry = dvz * xij - dvx * zij; + const float curlvrz = dvx * yij - dvy * xij; + + res_rot.x += faci * curlvrx; + res_rot.y += faci * curlvry; + res_rot.z += faci * curlvrz; + res_rot.w -= faci * dvdr; + } + } /*if (jj < cj_end && pid < ci_end && pid >= ci_start)*/ + } /*End of looping through particles in shared memory---Shared arrays zero'ed for next step in outer loop*/ + __syncthreads(); + } /*Loop through parts in cell j one BLOCK_SIZE at a time*/ + if (pid >= ci_start && pid < ci_end) { + parts_recv[pid].rho_dh_wcount = res_rho; + parts_recv[pid].rot_ux_div_v = res_rot; + } +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__device__ void DOPAIR2NAIVEGPUAOSF4(const struct part_aos_f4_send pi, + struct part_aos_f4_send * __restrict__ parts_send, struct part_aos_f4_recv * __restrict__ parts_recv, int pid, + const int cj_start, const int cj_end, float d_a, float d_H) { + + float dx = 1.f/64.f; //Value used to avoid interacting parts with themselves + + float hi = 0.0, hig2 = 0.0; + + int Found_neighbours = 0; + int count_i = cj_start; + + float4 res_rho = {0.0, 0.0, 0.0, 0.0}; + float4 res_rot = {0.0, 0.0, 0.0, 0.0}; +// const part_aos_f4_send pi = parts_send[pid]; + const float4 x_pi = pi.x_p_h; + const float4 ux_pi = pi.ux_m; +// printf("first_part_in_task_blocks_cj %i last_part_in_task_blocks_cj %i last_part_in_task_blocks_ci %i\n", +// first_part_in_task_blocks_cj, last_part_in_task_blocks_cj, last_part_in_task_blocks_ci); +// if (pid < ci_end) { + hi = x_pi.w, hig2 = hi * hi * kernel_gamma2; +// } + +// printf("js %i je %i\n", cj_start, cj_end); + /*Particles copied in blocks to shared memory*/ + for (int j = cj_start; j < cj_end; j ++) { + struct part_aos_f4_send pj = parts_send[j]; + + const float4 x_p_h_j = pj.x_p_h; + const float4 ux_m_j = pj.ux_m; + + const float xij = x_pi.x - x_p_h_j.x, yij = x_pi.y - x_p_h_j.y, + zij = x_pi.z - x_p_h_j.z; + const float r2 = xij * xij + yij * yij + zij * zij; +// printf("r2 %f \n", r2); + if (r2 < hig2) { + /* Recover some data */ + const float mj = ux_m_j.w; + const float r = sqrt(r2); + /* Get the kernel for hi. 
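+       * In this packed-float4 layout x_p_h carries (x, y, z, h) and ux_m carries
+       * (ux, uy, uz, mass), hence hi = x_pi.w earlier and mj = ux_m_j.w here; the
+       * four density sums are returned packed in res_rho (rho, rho_dh, wcount,
+       * wcount_dh) and the curl/divergence terms in res_rot, which are written to
+       * parts_recv at the end.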
*/ + const float h_inv = 1.f / hi; + const float ui = r * h_inv; + float wi, wi_dx; + + d_kernel_deval(ui, &wi, &wi_dx); + /*Add to sums of rho, rho_dh, wcount and wcount_dh*/ + res_rho.x += mj * wi; + res_rho.y -= mj * (hydro_dimension * wi + ui * wi_dx); + res_rho.z += wi; + res_rho.w -= (hydro_dimension * wi + ui * wi_dx); + + const float r_inv = 1.f / r; + const float faci = mj * wi_dx * r_inv; + /* Compute dv dot r */ + const float dvx = ux_pi.x - ux_m_j.x, dvy = ux_pi.y - ux_m_j.y, + dvz = ux_pi.z - ux_m_j.z; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + /* Compute dv cross r */ + const float curlvrx = dvy * zij - dvz * yij; + const float curlvry = dvz * xij - dvx * zij; + const float curlvrz = dvx * yij - dvy * xij; + + res_rot.x += faci * curlvrx; + res_rot.y += faci * curlvry; + res_rot.z += faci * curlvrz; + res_rot.w -= faci * dvdr; + } + } /*Loop through parts in cell j one BLOCK_SIZE at a time*/ +// if (pid >= ci_start && pid < ci_end) { + parts_recv[pid].rho_dh_wcount = res_rho; + parts_recv[pid].rot_ux_div_v = res_rot; +// } +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__device__ void DOPAIR2NONSYMGPUAOSG( + struct part_aos_g *parts_aos, int pid, const int ci_start, + const int ci_end, const int cj_start, + const int cj_end, float d_a, float d_H, + float *vars_pair_aosg, double *d_shift_x, double *d_shift_y, double *d_shift_z, const int task_id_tmp, int flip_order) { + + float dx = 1.f/64.f; //Value used to avoid interacting parts with themselves + + float hi = 0.0, hig2 = 0.0; + + float mi = 0.0; + float uxi = 0.0; + float uyi = 0.0; + float uzi = 0.0; + float pix = 0.0; + float piy = 0.0; + float piz = 0.0; + float rhoi = 0.0; + float div_vi = 0.0; + int Found_neighbours = 0; + float v_sig; + float u = 0.f; + float laplace_u = 0.0; + float alpha_visc_max_ngb = 0.0; + float ci = 0.0; + + int count_i = cj_start; + if (pid < ci_end) { + hi = parts_aos[pid].h, hig2 = hi * hi * kernel_gamma2; + mi = parts_aos[pid].mass; + uxi = parts_aos[pid].ux; + uyi = parts_aos[pid].uy; + uzi = parts_aos[pid].uz; + ci = parts_aos[pid].soundspeed; + v_sig = parts_aos[pid].v_sig; + u = parts_aos[pid].u; + laplace_u = parts_aos[pid].laplace_u; + alpha_visc_max_ngb = parts_aos[pid].alpha_visc_max_ngb; + + pix = parts_aos[pid].x_p - d_shift_x[task_id_tmp]; + piy = parts_aos[pid].y_p - d_shift_y[task_id_tmp]; + piz = parts_aos[pid].z_p - d_shift_z[task_id_tmp]; + } + + /*Here we use different pointers "x_p_tmp", etc. 
to point to different regions + * of the single shared memory space "vars" which we allocate in kernel + * invocation*/ + float *x_p_tmp = (float *)&vars_pair_aosg[0]; + float *y_p_tmp = (float *)&vars_pair_aosg[BLOCK_SIZE]; + float *z_p_tmp = (float *)&vars_pair_aosg[BLOCK_SIZE * 2]; + float *h_tmp = (float *)&vars_pair_aosg[BLOCK_SIZE * 3]; + float *mass_tmp = (float *)&vars_pair_aosg[BLOCK_SIZE * 4]; + float *ux_tmp = (float *)&vars_pair_aosg[BLOCK_SIZE * 5]; + float *uy_tmp = (float *)&vars_pair_aosg[BLOCK_SIZE * 6]; + float *uz_tmp = (float *)&vars_pair_aosg[BLOCK_SIZE * 7]; + float *cj_tmp = (float *)&vars_pair_aosg[BLOCK_SIZE * 8]; + float *alpha_tmp = (float *)&vars_pair_aosg[BLOCK_SIZE * 9]; + float *u_tmp = (float *)&vars_pair_aosg[BLOCK_SIZE * 10]; + float *rho_tmp = (float *)&vars_pair_aosg[BLOCK_SIZE * 11]; + int *timebin = (int *)&vars_pair_aosg[BLOCK_SIZE * 12]; + + /*Particles copied in blocks to shared memory*/ + for (int b = cj_start; b < cj_end; + b += BLOCK_SIZE) { + const int tid_x = threadIdx.x; + int j = b + tid_x; + x_p_tmp[threadIdx.x] = parts_aos[j].x_p; + y_p_tmp[threadIdx.x] = parts_aos[j].y_p; + z_p_tmp[threadIdx.x] = parts_aos[j].z_p; + h_tmp[threadIdx.x] = parts_aos[j].h; + mass_tmp[threadIdx.x] = parts_aos[j].mass; + ux_tmp[threadIdx.x] = parts_aos[j].ux; + uy_tmp[threadIdx.x] = parts_aos[j].uy; + uz_tmp[threadIdx.x] = parts_aos[j].uz; + timebin[threadIdx.x] = parts_aos[j].time_bin; + cj_tmp[threadIdx.x] = parts_aos[j].soundspeed; + alpha_tmp[threadIdx.x] = parts_aos[j].visc_alpha; + u_tmp[threadIdx.x] = parts_aos[j].u; + rho_tmp[threadIdx.x] = parts_aos[j].rho; + const float shift_x_j = d_shift_x[task_id_tmp + flip_order]; + const float shift_y_j = d_shift_y[task_id_tmp + flip_order]; + const float shift_z_j = d_shift_z[task_id_tmp + flip_order]; + __syncthreads(); + /*j_block is the particle's index in the block. Loop through particles in shared memory one by one*/ + for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) { + int jj = b + j_block; + if (jj < cj_end && pid < ci_end && pid >= ci_start) { + + const float pjx = x_p_tmp[j_block] - shift_x_j; + const float pjy = y_p_tmp[j_block] - shift_y_j; + const float pjz = z_p_tmp[j_block] - shift_z_j; + const float xij = pix - pjx, yij = piy - pjy, zij = piz - pjz; + const float r2 = xij * xij + yij * yij + zij * zij; + if (r2 < hig2) { + /* Recover some data */ + const float mj = mass_tmp[j_block]; + const float r = sqrt(r2); + const float r_inv = 1.f / r; + /* Get the kernel for hi. */ + const float h_inv = 1.f / hi; + float wi, wi_dx; + /* Cosmology terms for the signal velocity */ + const float fac_mu = d_pow_three_gamma_minus_five_over_two(d_a); + const float a2_Hubble = d_a * d_a * d_H; + /* Compute dv dot r */ + const float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block], + dvz = uzi - uz_tmp[j_block]; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + /* Add Hubble flow */ + const float dvdr_Hubble = dvdr + a2_Hubble * r2; + /* Are the particles moving towards each others ? */ + const float omega_ij = min(dvdr_Hubble, 0.f); + const float mu_ij = fac_mu * r_inv * omega_ij; /* This is 0 or negative */ + + /* Signal velocity */ + const float new_v_sig = ci + cj_tmp[j_block] - const_viscosity_beta * mu_ij; + /* Update if we need to */ + v_sig = max(v_sig, new_v_sig); + /* Calculate Del^2 u for the thermal diffusion coefficient. 
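+             * Each neighbour below contributes  mj * (u_i - u_j) * wi_dx / (r * rho_j)
+             * to laplace_u, the usual SPH pairwise building block for the Laplacian
+             * of the internal energy used by the thermal-diffusion switch, with
+             * wi_dx = dW/du evaluated at u = r / hi just underneath.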
*/ + /* Need to get some kernel values F_ij = wi_dx */ + const float ui = r * h_inv; + d_kernel_deval(ui, &wi, &wi_dx); + + const float delta_u_factor = (u - u_tmp[j_block]) * r_inv; + laplace_u += mj * delta_u_factor * wi_dx / rho_tmp[j_block]; + + /* Set the maximal alpha from the previous step over the neighbours + * (this is used to limit the diffusion in hydro_prepare_force) */ + const float alpha_j = alpha_tmp[j_block]; + alpha_visc_max_ngb = max(alpha_visc_max_ngb, alpha_j); + } + } /*if (jj < cj_end && pid < ci_end && pid >= ci_start)*/ + } /*End of looping through particles in shared memory---Shared arrays zero'ed for next step in outer loop*/ + __syncthreads(); + } /*Loop through parts in cell j one BLOCK_SIZE at a time*/ + if (pid >= ci_start && pid < ci_end) { + parts_aos[pid].v_sig = v_sig, parts_aos[pid].laplace_u = laplace_u; + parts_aos[pid].alpha_visc_max_ngb = alpha_visc_max_ngb; + } +} +#ifdef WITH_CUDA +} +#endif + + +#ifdef WITH_CUDA +extern "C" { +#endif +__device__ void DOPAIR2NAIVEGPUAOSF4G(const struct part_aos_f4_g_send pi, + struct part_aos_f4_g_send * __restrict__ parts_send, struct part_aos_f4_g_recv * __restrict__ parts_recv, int pid, + const int cj_start, const int cj_end, float d_a, float d_H) { + + float dx = 1.f/64.f; //Value used to avoid interacting parts with themselves + + float hi = 0.0, hig2 = 0.0; + + int Found_neighbours = 0; + int count_i = cj_start; + + float4 res_rho = {0.0, 0.0, 0.0, 0.0}; + float4 res_rot = {0.0, 0.0, 0.0, 0.0}; +// const part_aos_f4_send pi = parts_send[pid]; + const float4 x_h_i = pi.x_h; + const float4 ux_m_i = pi.ux_m; + const float4 rho_avisc_u_c_i = pi.rho_avisc_u_c; + float3 vsig_lapu_aviscmax_i = {0.f, 0.f, 0.f}; + +// printf("first_part_in_task_blocks_cj %i last_part_in_task_blocks_cj %i last_part_in_task_blocks_ci %i\n", +// first_part_in_task_blocks_cj, last_part_in_task_blocks_cj, last_part_in_task_blocks_ci); +// if (pid < ci_end) { + hi = x_h_i.w, hig2 = hi * hi * kernel_gamma2; +// } + +// printf("js %i je %i\n", cj_start, cj_end); + /*Particles copied in blocks to shared memory*/ + for (int j = cj_start; j < cj_end; j ++) { + struct part_aos_f4_g_send pj = parts_send[j]; + + const float4 x_h_j = pj.x_h; + const float4 ux_m_j = pj.ux_m; + const float4 rho_avisc_u_c_j = pj.rho_avisc_u_c; + const float xij = x_h_i.x - x_h_j.x, yij = x_h_i.y - x_h_j.y, zij = x_h_i.z - x_h_j.z; + const float r2 = xij * xij + yij * yij + zij * zij; +// printf("r2 %f \n", r2); + if (r2 < hig2) { + const float r = sqrt(r2); + const float r_inv = 1.f / r; + /* Recover some data */ + const float mj = ux_m_j.w; + /* Get the kernel for hi. */ + const float h_inv = 1.f / hi; + float wi, wi_dx; + /* Cosmology terms for the signal velocity */ + const float fac_mu = d_pow_three_gamma_minus_five_over_two(d_a); + const float a2_Hubble = d_a * d_a * d_H; + /* Compute dv dot r */ + float dvx = ux_m_i.x - ux_m_j.x, dvy = ux_m_i.y - ux_m_j.y, + dvz = ux_m_i.z - ux_m_j.z; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + /* Add Hubble flow */ + const float dvdr_Hubble = dvdr + a2_Hubble * r2; + /* Are the particles moving towards each others ? 
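+     * Only the approaching part of the relative motion matters: omega_ij below is
+     * min(dvdr_Hubble, 0), so mu_ij is zero or negative and the signal velocity
+     * c_i + c_j - const_viscosity_beta * mu_ij only grows for converging pairs
+     * (matching ci + cj_tmp in the SoA routine above, the .w components of
+     * rho_avisc_u_c carry the sound speeds in this packed layout).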
*/ + const float omega_ij = min(dvdr_Hubble, 0.f); + const float mu_ij = fac_mu * r_inv * omega_ij; /* This is 0 or negative */ + + /* Signal velocity */ + const float new_v_sig = rho_avisc_u_c_i.w + rho_avisc_u_c_j.w - const_viscosity_beta * mu_ij; + /* Update if we need to */ + vsig_lapu_aviscmax_i.x = fmaxf(vsig_lapu_aviscmax_i.x, new_v_sig); + /* Calculate Del^2 u for the thermal diffusion coefficient. */ + /* Need to get some kernel values F_ij = wi_dx */ + const float ui = r * h_inv; + d_kernel_deval(ui, &wi, &wi_dx); + + const float delta_u_factor = (rho_avisc_u_c_i.z - rho_avisc_u_c_j.z) * r_inv; + vsig_lapu_aviscmax_i.y += mj * delta_u_factor * wi_dx / rho_avisc_u_c_j.x; + + /* Set the maximal alpha from the previous step over the neighbours + * (this is used to limit the diffusion in hydro_prepare_force) */ + const float alpha_j = rho_avisc_u_c_j.y; + vsig_lapu_aviscmax_i.z = fmaxf(vsig_lapu_aviscmax_i.z, alpha_j); + } + } /*Loop through parts in cell j one BLOCK_SIZE at a time*/ +// if (pid >= ci_start && pid < ci_end) { + parts_recv[pid].vsig_lapu_aviscmax = vsig_lapu_aviscmax_i; +// } +} +#ifdef WITH_CUDA +} +#endif + + +#ifdef WITH_CUDA +extern "C" { +#endif +__device__ void DOPAIR2NONSYMGPUAOSF( + struct part_aos_f *parts_aos, int pid, const int ci_start, + const int ci_end, const int cj_start, + const int cj_end, float d_a, float d_H, + float *vars_pair_aosf, double *d_shift_x, double *d_shift_y, double *d_shift_z, const int task_id_tmp, int flip_order) { + + float ci = 0.0, cj = 0.0; + float hi = 0.0, hig2 = 0.0; + float mi = 0.0; + float uxi = 0.0; + float uyi = 0.0; + float uzi = 0.0; + float pix = 0.0; + float piy = 0.0; + float piz = 0.0; + float rhoi = 0.0; + float div_vi = 0.0; + int Found_neighbours = 0; + float v_sigi; + float ui = 0.f; + float u_dti = 0.f; + float laplace_ui = 0.0; + float alpha_visc_max_ngb = 0.0; + float pressurei = 0.0; + float alphavisci = 0.0; + float alphadiffi = 0.0; + float fi = 0.0; + float balsarai = 0.0; + float ahydroxi = 0.0; + float ahydroyi = 0.0; + float ahydrozi = 0.0; + float h_dti = 0.0; + int min_ngb_time_bin = 0; + if (pid < ci_end) { + hi = parts_aos[pid].h, hig2 = hi * hi * kernel_gamma2; + mi = parts_aos[pid].mass; + uxi = parts_aos[pid].ux; + uyi = parts_aos[pid].uy; + uzi = parts_aos[pid].uz; + ci = parts_aos[pid].soundspeed; + fi = parts_aos[pid].f; + v_sigi = parts_aos[pid].v_sig; + ui = parts_aos[pid].u; + rhoi = parts_aos[pid].rho; + pressurei = parts_aos[pid].pressure; + balsarai = parts_aos[pid].balsara; + alphavisci = parts_aos[pid].alpha_visc; + alphadiffi = parts_aos[pid].alpha_diff; + min_ngb_time_bin = parts_aos[pid].min_ngb_time_bin; + pix = parts_aos[pid].x_p - d_shift_x[task_id_tmp]; + piy = parts_aos[pid].y_p - d_shift_y[task_id_tmp]; + piz = parts_aos[pid].z_p - d_shift_z[task_id_tmp]; + } +// if (threadIdx.x == 0) { +// first_part_tid_0 = first_part; +// last_part_tid_0 = last_part; +// } +// __syncthreads(); + int n_neighbours = 0; + /*Here we use different pointers "x_p_tmp", etc. 
to point to different regions + * of the single shared memory space "vars" which we allocate in kernel + * invocation*/ + float *x_p_tmp = (float *)&vars_pair_aosf[0]; + float *y_p_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE]; + float *z_p_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE * 2]; + float *h_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE * 3]; + float *mass_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE * 4]; + float *ux_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE * 5]; + float *uy_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE * 6]; + float *uz_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE * 7]; + float *cj_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE * 8]; + float *alphavisc_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE * 9]; + float *alphadiff_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE * 10]; + float *u_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE * 11]; + float *rho_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE * 12]; + float *pressure_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE * 13]; + float *f_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE * 14]; + float *balsara_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE * 15]; + int *timebin = (int *)&vars_pair_aosf[BLOCK_SIZE * 16]; + /*Particles copied in blocks to shared memory*/ + for (int b = cj_start; b < cj_end; + b += BLOCK_SIZE) { + int j = b + threadIdx.x; + x_p_tmp[threadIdx.x] = parts_aos[j].x_p; + y_p_tmp[threadIdx.x] = parts_aos[j].y_p; + z_p_tmp[threadIdx.x] = parts_aos[j].z_p; + h_tmp[threadIdx.x] = parts_aos[j].h; + mass_tmp[threadIdx.x] = parts_aos[j].mass; + ux_tmp[threadIdx.x] = parts_aos[j].ux; + uy_tmp[threadIdx.x] = parts_aos[j].uy; + uz_tmp[threadIdx.x] = parts_aos[j].uz; + timebin[threadIdx.x] = parts_aos[j].time_bin; + cj_tmp[threadIdx.x] = parts_aos[j].soundspeed; +// alpha_tmp[threadIdx.x] = parts_aos[j].visc_alpha; + u_tmp[threadIdx.x] = parts_aos[j].u; + rho_tmp[threadIdx.x] = parts_aos[j].rho; + alphavisc_tmp[threadIdx.x] = parts_aos[j].alpha_visc; + alphadiff_tmp[threadIdx.x] = parts_aos[j].alpha_diff; + pressure_tmp[threadIdx.x] = parts_aos[j].pressure; + f_tmp[threadIdx.x] = parts_aos[j].f; + balsara_tmp[threadIdx.x] = parts_aos[j].balsara; + const float shift_x_j = d_shift_x[task_id_tmp + flip_order]; + const float shift_y_j = d_shift_y[task_id_tmp + flip_order]; + const float shift_z_j = d_shift_z[task_id_tmp + flip_order]; + __syncthreads(); + /*j_block is the particle's index in the block. Loop through particles in shared memory one by one*/ + for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) { + int jj = b + j_block; + if (jj < cj_end && pid < ci_end && pid >= ci_start) { + /* Compute the pairwise distance. */ + const float pjx = x_p_tmp[j_block] - shift_x_j; + const float pjy = y_p_tmp[j_block] - shift_y_j; + const float pjz = z_p_tmp[j_block] - shift_z_j; + const float xij = pix - pjx, yij = piy - pjy, zij = piz - pjz; + const float r2 = xij * xij + yij * yij + zij * zij; + if (r2 < hig2) { + + // /* Cosmology terms for the signal velocity */ + const float fac_mu = d_pow_three_gamma_minus_five_over_two(d_a); + const float a2_Hubble = d_a * d_a * d_H; + const float r = sqrt(r2); + const float r_inv = 1.f / r; +// /* Recover some data */ + const float mj = mass_tmp[j_block]; +// /* Get the kernel for hi. */ + const float hi_inv = 1.f / hi; + const float hid_inv = d_pow_dimension_plus_one(hi_inv); /* 1/h^(d+1) */ + const float xi = r * hi_inv; + float wi, wi_dx; + d_kernel_deval(xi, &wi, &wi_dx); + const float wi_dr = hid_inv * wi_dx; + /* Get the kernel for hj. 
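+             * wi_dr above is dW/dx at x = r / hi scaled by 1 / hi^(d+1); the
+             * mirrored wj_dr for hj is built next, so the pair can use the usual
+             * symmetrised density-energy SPH acceleration (same names as below):
+             *   sph_acc_term = (P_i/rho_i^2 * f_ij * wi_dr + P_j/rho_j^2 * f_ji * wj_dr) / r
+             *   a_i         -= mj * (sph_acc_term + visc_acc_term) * (dx_ij, dy_ij, dz_ij)
+             * where f_ij and f_ji are the variable-smoothing-length ("grad-h")
+             * correction factors assembled from f and the masses.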
*/ + const float hj = h_tmp[j_block]; + const float hj_inv = 1.0f / hj; + const float hjd_inv = d_pow_dimension_plus_one(hj_inv); /* 1/h^(d+1) */ + const float xj = r * hj_inv; + float wj, wj_dx; + d_kernel_deval(xj, &wj, &wj_dx); + const float wj_dr = hjd_inv * wj_dx; +// /* Compute dv dot r */ + float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block], + dvz = uzi - uz_tmp[j_block]; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; +// /* Add Hubble flow */ + const float dvdr_Hubble = dvdr + a2_Hubble * r2; +// /* Are the particles moving towards each others ? */ + const float omega_ij = min(dvdr_Hubble, 0.f); + const float mu_ij = fac_mu * r_inv * omega_ij; /* This is 0 or negative */ +// +// /* Signal velocity */ + const float v_sig = ci + cj_tmp[j_block] - const_viscosity_beta * mu_ij; + + /* Variable smoothing length term */ + const float f_ij = 1.f - fi / mj; + const float f_ji = 1.f - f_tmp[j_block] / mi; + + /* Balsara term */ + const float balsaraj = balsara_tmp[j_block]; + /* Construct the full viscosity term */ + const float rhoj = rho_tmp[j_block]; + const float pressurej = pressure_tmp[j_block]; + const float rho_ij = rhoi + rhoj; + const float alpha = alphavisci + alphavisc_tmp[j_block]; + const float visc = + -0.25f * alpha * v_sig * mu_ij * (balsarai + balsaraj) / rho_ij; + /* Convolve with the kernel */ + const float visc_acc_term = + 0.5f * visc * (wi_dr * f_ij + wj_dr * f_ji) * r_inv; + /* Compute gradient terms */ + const float P_over_rho2_i = pressurei / (rhoi * rhoi) * f_ij; + const float P_over_rho2_j = pressurej / (rhoj * rhoj) * f_ji; + + /* SPH acceleration term */ + const float sph_acc_term = + (P_over_rho2_i * wi_dr + P_over_rho2_j * wj_dr) * r_inv; + + /* Assemble the acceleration */ + const float acc = sph_acc_term + visc_acc_term; + /* Use the force Luke ! */ + ahydroxi -= mj * acc * xij; + ahydroyi -= mj * acc * yij; + ahydrozi -= mj * acc * zij; +// if(rhoi == 0 || rhoj == 0 || pressurei == 0 || pressurej == 0)printf("ri %f rj %f pi %f pj %f\n", rhoi, rhoj, pressurei, pressurej); + /* Get the time derivative for u. */ + const float sph_du_term_i = P_over_rho2_i * dvdr * r_inv * wi_dr; + + /* Viscosity term */ + const float visc_du_term = 0.5f * visc_acc_term * dvdr_Hubble; + const float press_sum = pressurei + pressurej; + /* Diffusion term */ + /* Combine the alpha_diff into a pressure-based switch -- this allows the + * alpha from the highest pressure particle to dominate, so that the + * diffusion limited particles always take precedence - another trick to + * allow the scheme to work with thermal feedback. */ + float alpha_diff = + (pressurei * alphadiffi + pressurej * alphadiff_tmp[j_block]) / + (press_sum); + if (fabsf(press_sum) < 1e-10) alpha_diff = 0.f; + const float v_diff = alpha_diff * 0.5f * + (sqrtf(2.f * fabsf(pressurei - pressurej) / rho_ij) + + fabsf(fac_mu * r_inv * dvdr_Hubble)); + /* wi_dx + wj_dx / 2 is F_ij */ + const float diff_du_term = + v_diff * (ui - u_tmp[j_block]) * (f_ij * wi_dr / rhoi + f_ji * wj_dr / rhoj); + + /* Assemble the energy equation term */ + const float du_dt_i = sph_du_term_i + visc_du_term + diff_du_term; + + /* Internal energy time derivative */ + u_dti += du_dt_i * mj; + if(mj == 0.f)printf("zero mass mj %f\n", mj); + + /* Get the time derivative for h. */ + h_dti -= mj * dvdr * r_inv / rhoj * wi_dr; + + /* Update if we need to; this should be guaranteed by the gradient loop but + * due to some possible synchronisation problems this is here as a _quick + * fix_. Added: 14th August 2019. 
To be removed by 1st Jan 2020. (JB) */ + v_sigi = max(v_sigi, v_sig); + int time_bin_j = timebin[j_block]; + if(time_bin_j > 0)min_ngb_time_bin = min(min_ngb_time_bin, time_bin_j); +// printf("Got in\n"); + } + } + } + __syncthreads(); + } /*Loop through parts in cell j one BLOCK_SIZE at a time*/ + if (pid >= ci_start && pid < ci_end) { + parts_aos[pid].v_sig = v_sigi; + parts_aos[pid].h_dt = h_dti; + parts_aos[pid].u_dt = u_dti; + parts_aos[pid].a_hydrox = ahydroxi; + parts_aos[pid].a_hydroy = ahydroyi; + parts_aos[pid].a_hydroz = ahydrozi; + parts_aos[pid].min_ngb_time_bin = min_ngb_time_bin; +// printf("%f %f %f %f %f %f\n", v_sigi, h_dti, u_dti, ahydroxi, ahydroyi, ahydrozi); + } +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__device__ void DOPAIR2NAIVEGPUAOSF4F(const struct part_aos_f4_f_send pi, + struct part_aos_f4_f_send * __restrict__ parts_send, struct part_aos_f4_f_recv * __restrict__ parts_recv, int pid, + const int cj_start, const int cj_end, float d_a, float d_H) { + + float dx = 1.f/64.f; //Value used to avoid interacting parts with themselves + + int Found_neighbours = 0; + +// const part_aos_f4_send pi = parts_send[pid]; + const float4 x_h_i = pi.x_h; + const float4 ux_m_i = pi.ux_m; + + float4 f_b_t_mintbinngb_i = pi.f_bals_timebin_mintimebin_ngb; + const float4 rho_p_c_vsig_i = pi.rho_p_c_vsigi; + const float3 u_avisc_adiff_i = pi.u_alphavisc_alphadiff; + + const float mi = ux_m_i.w; + const float pressurei = rho_p_c_vsig_i.y; + const float ci = rho_p_c_vsig_i.z; + float3 ahydro = {0.0, 0.0, 0.0}; + float4 udt_hdt_vsig_mintbinngb = {0.0, 0.0, 0.0, 0.0}; + udt_hdt_vsig_mintbinngb.z = rho_p_c_vsig_i.w; + udt_hdt_vsig_mintbinngb.w = f_b_t_mintbinngb_i.w; + + const float hi = x_h_i.w; + const float hig2 = hi * hi * kernel_gamma2; + +// printf("js %i je %i\n", cj_start, cj_end); + /*Particles copied in blocks to shared memory*/ + for (int j = cj_start; j < cj_end; j ++) { + struct part_aos_f4_f_send pj = parts_send[j]; + const float4 x_h_j = pj.x_h; + const float4 ux_m_j = pj.ux_m; + const float4 f_b_t_mintbinngb_j = pj.f_bals_timebin_mintimebin_ngb; + const float4 rho_p_c_vsig_j = pj.rho_p_c_vsigi; +// alpha_tmp[threadIdx.x] = parts_aos[j].visc_alpha; + const float3 u_avisc_adiff_j = pj.u_alphavisc_alphadiff; + const float xij = x_h_i.x - x_h_j.x, + yij = x_h_i.y - x_h_j.y, + zij = x_h_i.z - x_h_j.z; + const float r2 = xij * xij + yij * yij + zij * zij; +// printf("r2 %f \n", r2); + if (r2 < hig2) { + // /* Cosmology terms for the signal velocity */ + const float fac_mu = d_pow_three_gamma_minus_five_over_two(d_a); + const float a2_Hubble = d_a * d_a * d_H; + const float r = sqrt(r2); + const float r_inv = 1.f / r; +// /* Recover some data */ + const float mj = ux_m_j.w; +// /* Get the kernel for hi. */ + const float hi_inv = 1.f / hi; + const float hid_inv = d_pow_dimension_plus_one(hi_inv); /* 1/h^(d+1) */ + const float xi = r * hi_inv; + float wi, wi_dx; + d_kernel_deval(xi, &wi, &wi_dx); + const float wi_dr = hid_inv * wi_dx; + /* Get the kernel for hj. 
*/ + const float hj = x_h_j.w; + const float hj_inv = 1.0f / hj; + const float hjd_inv = d_pow_dimension_plus_one(hj_inv); /* 1/h^(d+1) */ + const float xj = r * hj_inv; + float wj, wj_dx; + d_kernel_deval(xj, &wj, &wj_dx); + const float wj_dr = hjd_inv * wj_dx; +// /* Compute dv dot r */ + float dvx = ux_m_i.x - ux_m_j.x, dvy = ux_m_i.y - ux_m_j.y, + dvz = ux_m_i.z - ux_m_j.z; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; +// /* Add Hubble flow */ + const float dvdr_Hubble = dvdr + a2_Hubble * r2; +// /* Are the particles moving towards each others ? */ + const float omega_ij = min(dvdr_Hubble, 0.f); + const float mu_ij = fac_mu * r_inv * omega_ij; /* This is 0 or negative */ +// +// /* Signal velocity */ + const float cj = rho_p_c_vsig_j.z; + const float v_sig = ci + cj - const_viscosity_beta * mu_ij; + + /* Variable smoothing length term */ + const float f_ij = 1.f - f_b_t_mintbinngb_i.x / mj; + const float f_ji = 1.f - f_b_t_mintbinngb_j.x / mi; + + /* Construct the full viscosity term */ + const float pressurej = rho_p_c_vsig_j.y; + const float rho_ij = rho_p_c_vsig_i.x + rho_p_c_vsig_j.x; + const float alpha = u_avisc_adiff_i.y + u_avisc_adiff_j.y; + const float visc = + -0.25f * alpha * v_sig * mu_ij * (f_b_t_mintbinngb_i.y + f_b_t_mintbinngb_j.y) / rho_ij; + /* Convolve with the kernel */ + const float visc_acc_term = + 0.5f * visc * (wi_dr * f_ij + wj_dr * f_ji) * r_inv; + /* Compute gradient terms */ + const float rhoi2 = rho_p_c_vsig_i.x * rho_p_c_vsig_i.x; + const float rhoj2 = rho_p_c_vsig_j.x * rho_p_c_vsig_j.x; + const float P_over_rho2_i = pressurei / (rhoi2) * f_ij; + const float P_over_rho2_j = pressurej / (rhoj2) * f_ji; + + /* SPH acceleration term */ + const float sph_acc_term = + (P_over_rho2_i * wi_dr + P_over_rho2_j * wj_dr) * r_inv; + + /* Assemble the acceleration */ + const float acc = sph_acc_term + visc_acc_term; + /* Use the force Luke ! */ + ahydro.x -= mj * acc * xij; + ahydro.y -= mj * acc * yij; + ahydro.z -= mj * acc * zij; + /* Get the time derivative for u. */ + const float sph_du_term_i = P_over_rho2_i * dvdr * r_inv * wi_dr; + + /* Viscosity term */ + const float visc_du_term = 0.5f * visc_acc_term * dvdr_Hubble; + /* Diffusion term */ + /* Combine the alpha_diff into a pressure-based switch -- this allows the + * alpha from the highest pressure particle to dominate, so that the + * diffusion limited particles always take precedence - another trick to + * allow the scheme to work with thermal feedback. */ + float alpha_diff = + (pressurei * u_avisc_adiff_i.z + pressurej * u_avisc_adiff_j.z) / + (pressurei + pressurej); + if (fabsf(pressurei + pressurej) < 1e-10) alpha_diff = 0.f; + const float v_diff = alpha_diff * 0.5f * + (sqrtf(2.f * fabsf(pressurei - pressurej) / rho_ij) + + fabsf(fac_mu * r_inv * dvdr_Hubble)); + /* wi_dx + wj_dx / 2 is F_ij */ + const float diff_du_term = + v_diff * (u_avisc_adiff_i.x - u_avisc_adiff_j.x) * + (f_ij * wi_dr / rho_p_c_vsig_i.x + f_ji * wj_dr / rho_p_c_vsig_j.x); + + /* Assemble the energy equation term */ + const float du_dt_i = sph_du_term_i + visc_du_term + diff_du_term; + + /* Internal energy time derivative */ + udt_hdt_vsig_mintbinngb.x += du_dt_i * mj; + + /* Get the time derivative for h. */ + udt_hdt_vsig_mintbinngb.y -= mj * dvdr * r_inv / rho_p_c_vsig_j.x * wi_dr; + + /* Update if we need to; this should be guaranteed by the gradient loop but + * due to some possible synchronisation problems this is here as a _quick + * fix_. Added: 14th August 2019. To be removed by 1st Jan 2020. 
(JB) */ + udt_hdt_vsig_mintbinngb.z = fmaxf(udt_hdt_vsig_mintbinngb.z, v_sig); + unsigned int time_bin_j = (f_b_t_mintbinngb_j.z + 0.5f); + unsigned int min_tb_i = (f_b_t_mintbinngb_i.w + 0.5f); + if(time_bin_j > 0)f_b_t_mintbinngb_i.w = + min(min_tb_i, time_bin_j); +// printf("Got in\n"); + } + } /*Loop through parts in cell j one BLOCK_SIZE at a time*/ +// if (pid >= ci_start && pid < ci_end) { + udt_hdt_vsig_mintbinngb.w = f_b_t_mintbinngb_i.w; + parts_recv[pid].udt_hdt_vsig_mintimebin_ngb = udt_hdt_vsig_mintbinngb; + parts_recv[pid].a_hydro = ahydro; +// } +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__device__ void DOPAIR2GPU( + struct part_soa parts_soa, int pid, const int ci_start, + const int ci_end, const int cj_start, + const int cj_end, float d_a, float d_H, + int time_bin_inhibited, float *vars_pair, double *d_shift_x, double *d_shift_y, double *d_shift_z, const int task_id_tmp) { + + float dx = 1.f/64.f; //Value used to avoid interacting parts with themselves + + float cellx = 0.0, celly = 0.0, cellz = 0.0; + float cellxj = 0.0, cellyj = 0.0, cellzj = 0.0; + float hi = 0.0, hig2 = hi * hi * kernel_gamma2; + + float mi = 0.0; + float uxi = 0.0; + float uyi = 0.0; + float uzi = 0.0; + double pix = 0.0; + double piy = 0.0; + double piz = 0.0; + float rhoi = 0.0; + float rho_dhi = 0.0; + float wcounti = 0.0; + float wcount_dhi = 0.0; + float div_vi = 0.0; + float rot_uxi = 0.0; + float rot_uyi = 0.0; + float rot_uzi = 0.0; + int Found_neighbours = 0; + int count_i = cj_start; +// printf("first_part_in_task_blocks_cj %i last_part_in_task_blocks_cj %i last_part_in_task_blocks_ci %i\n", +// first_part_in_task_blocks_cj, last_part_in_task_blocks_cj, last_part_in_task_blocks_ci); + + if (pid < ci_end) { + cellx = parts_soa.locx[pid]; + celly = parts_soa.locy[pid]; + cellz = parts_soa.locz[pid]; + const int j = cj_start; + cellxj = parts_soa.locx[j]; + cellyj = parts_soa.locy[j]; + cellzj = parts_soa.locz[j]; + hi = parts_soa.h[pid], hig2 = hi * hi * kernel_gamma2; + mi = parts_soa.mass[pid]; + uxi = parts_soa.ux[pid]; + uyi = parts_soa.uy[pid]; + uzi = parts_soa.uz[pid]; + pix = parts_soa.x_p[pid] - d_shift_x[task_id_tmp]; + piy = parts_soa.y_p[pid] - d_shift_y[task_id_tmp]; + piz = parts_soa.z_p[pid] - d_shift_z[task_id_tmp]; + } + + int n_neighbours = 0; + float av_dist = 0.f; + float av_distx = 0.f; + float av_disty = 0.f; + float av_distz = 0.f; + float distby2h = 0.f; + /*Here we use different pointers "x_p_tmp", etc. 
to point to different regions + * of the single shared memory space "vars" which we allocate in kernel + * invocation*/ + double *x_p_tmp = (double *)&vars_pair[0]; + double *y_p_tmp = (double *)&x_p_tmp[BLOCK_SIZE]; + double *z_p_tmp = (double *)&y_p_tmp[BLOCK_SIZE]; + float *h_tmp = (float *)&z_p_tmp[BLOCK_SIZE]; + float *mass_tmp = (float *)&h_tmp[BLOCK_SIZE]; + float *ux_tmp = (float *)&mass_tmp[BLOCK_SIZE]; + float *uy_tmp = (float *)&ux_tmp[BLOCK_SIZE]; + float *uz_tmp = (float *)&uy_tmp[BLOCK_SIZE]; + timebin_t *timebin = (timebin_t *)&uz_tmp[BLOCK_SIZE]; + float *rho_tmp = (float *)&timebin[BLOCK_SIZE]; + float *rho_dh_tmp = (float *)&rho_tmp[BLOCK_SIZE]; + float *wcount_tmp = (float *)&rho_dh_tmp[BLOCK_SIZE]; + float *wcount_dh_tmp = (float *)&wcount_tmp[BLOCK_SIZE]; + float *div_v_tmp = (float *)&wcount_dh_tmp[BLOCK_SIZE]; + float *rot_ux_tmp = (float *)&div_v_tmp[BLOCK_SIZE]; + float *rot_uy_tmp = (float *)&rot_ux_tmp[BLOCK_SIZE]; + float *rot_uz_tmp = (float *)&rot_uy_tmp[BLOCK_SIZE]; + + /*Particles copied in blocks to shared memory*/ + for (int b = cj_start; b < cj_end; + b += BLOCK_SIZE) { + const int tid_x = threadIdx.x; + int j = b + tid_x; + x_p_tmp[tid_x] = parts_soa.x_p[j]; + y_p_tmp[tid_x] = parts_soa.y_p[j]; + z_p_tmp[tid_x] = parts_soa.z_p[j]; + h_tmp[tid_x] = parts_soa.h[j]; + mass_tmp[tid_x] = parts_soa.mass[j]; + ux_tmp[tid_x] = parts_soa.ux[j]; + uy_tmp[tid_x] = parts_soa.uy[j]; + uz_tmp[tid_x] = parts_soa.uz[j]; + timebin[tid_x] = parts_soa.time_bin[j]; + rho_tmp[tid_x] = 0.f; + rho_dh_tmp[tid_x] = 0.f; + wcount_tmp[tid_x] = 0.f; + wcount_dh_tmp[tid_x] = 0.f; + div_v_tmp[tid_x] = 0.f; + rot_ux_tmp[tid_x] = 0.f; + rot_uy_tmp[tid_x] = 0.f; + rot_uz_tmp[tid_x] = 0.f; + __syncthreads(); + const double shift_x_j = d_shift_x[task_id_tmp + 1]; + const double shift_y_j = d_shift_y[task_id_tmp + 1]; + const double shift_z_j = d_shift_z[task_id_tmp + 1]; + /*j_block is the particle's index in the block. Loop through particles in shared memory one by one*/ + for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) { + int jj = b + j_block; + if (jj < cj_end && pid < ci_end && pid >= ci_start) { + + const double pjx = x_p_tmp[j_block] - shift_x_j; + const double pjy = y_p_tmp[j_block] - shift_y_j; + const double pjz = z_p_tmp[j_block] - shift_z_j; + + const float xij = pix - pjx, yij = piy - pjy, zij = piz - pjz; +// const float xij = pjx - pix, yij = pjy - piy, zij = pjz - piz; + const float r2 = xij * xij + yij * yij + zij * zij; + const float hj = h_tmp[j_block]; + const float hjg2 = hj * hj * kernel_gamma2; +// if(r2 > 32.f * hig2 && hig2 != 0.f) printf("x %f y %f z %f r %f hig2 %f\n", xij/dx, yij/dx, zij/dx, sqrt(r2)/dx); + /* Compute dv dot r */ + const float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block], + dvz = uzi - uz_tmp[j_block]; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + /* Compute dv cross r */ + const float curlvrx = dvy * zij - dvz * yij; + const float curlvry = dvz * xij - dvx * zij; + const float curlvrz = dvx * yij - dvy * xij; + + const float r = sqrt(r2); + if (r2 < hig2) { + /* Recover some data */ + const float mj = mass_tmp[j_block]; + /* Get the kernel for hi. 
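+         * This symmetric variant updates both sides of the pair: the r2 < hig2
+         * branch below accumulates into particle i's private registers, while the
+         * r2 < hjg2 branch pushes particle j's sums straight to global memory with
+         * atomicAdd(), because several threads (one per particle i in the block)
+         * can hit the same particle j in the same pass and a plain += would race.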
*/ +// if(hi<1.f/dx)printf("h < dx\n"); + const float h_inv = 1.f / hi; + const float ui = r * h_inv; + float wi, wi_dx; + + d_kernel_deval(ui, &wi, &wi_dx); + + rhoi += mj * wi; + rho_dhi -= mj * (hydro_dimension * wi + ui * wi_dx); + + wcounti += wi; + wcount_dhi -= (hydro_dimension * wi + ui * wi_dx); + + const float r_inv = 1.f / r; + const float faci = mj * wi_dx * r_inv; + + div_vi -= faci * dvdr; + + rot_uxi += faci * curlvrx; + rot_uyi += faci * curlvry; + rot_uzi += faci * curlvrz; +// + } + if (r2 < hjg2) { + /* Recover some data */ + /* Get the kernel for hi. */ + const float hj_inv = 1.f / hj; + const float uj = r * hj_inv; + float wj, wj_dx; + + d_kernel_deval(uj, &wj, &wj_dx); + +// atomicAdd(&rho_tmp[j_block], mi * wj); + atomicAdd(&parts_soa.rho[j], mi * wj); +// atomicAdd(&rho_dh_tmp[j_block], -mi * (hydro_dimension * wj + uj * wj_dx)); + atomicAdd(&parts_soa.rho_dh[j], -mi * (hydro_dimension * wj + uj * wj_dx)); + +// atomicAdd(&wcount_tmp[j_block], wj); + atomicAdd(&parts_soa.wcount[j], wj); +// atomicAdd(&wcount_dh_tmp[j_block], -(hydro_dimension * wj + uj * wj_dx)); + atomicAdd(&parts_soa.wcount_dh[j], -(hydro_dimension * wj + uj * wj_dx)); + + const float r_inv = 1.f / r; + const float facj = mi * wj_dx * r_inv; + +// atomicAdd(&div_v_tmp[j_block], -facj * dvdr); + atomicAdd(&parts_soa.div_v[j], -facj * dvdr); + +// atomicAdd(&rot_ux_tmp[j_block], facj * curlvrx); +// atomicAdd(&rot_uy_tmp[j_block], facj * curlvry); +// atomicAdd(&rot_uz_tmp[j_block], facj * curlvrz); + atomicAdd(&parts_soa.rot_ux[j], facj * curlvrx); + atomicAdd(&parts_soa.rot_uy[j], facj * curlvry); + atomicAdd(&parts_soa.rot_uz[j], facj * curlvrz); +// printf("rho %f rho_dh %f wcount %f wcount_dh %f div_v %f rotux %f rotuy %f rotuz %f\n" +// ,rhoi, rho_dhi, wcounti, wcount_dhi, div_vi, rot_uxi, rot_uyi, rot_uzi); + } /*if r2= ci_start)*/ + } /*End of looping through particles in shared memory---Shared arrays zero'ed for next step in outer loop*/ + __syncthreads(); +// if(j < cj_end){ +// atomicAdd(&parts_soa.rho[j], rho_tmp[threadIdx.x]); +// atomicAdd(&parts_soa.rho_dh[j], rho_dh_tmp[threadIdx.x]); +// atomicAdd(&parts_soa.wcount[j], wcount_tmp[threadIdx.x]); +// atomicAdd(&parts_soa.wcount_dh[j], wcount_dh_tmp[threadIdx.x]); +// atomicAdd(&parts_soa.div_v[j], div_v_tmp[threadIdx.x]); +// atomicAdd(&parts_soa.rot_ux[j], rot_ux_tmp[threadIdx.x]); +// atomicAdd(&parts_soa.rot_uy[j], rot_uy_tmp[threadIdx.x]); +// atomicAdd(&parts_soa.rot_uz[j], rot_uz_tmp[threadIdx.x]); +// } +// __syncthreads(); +// parts_soa.rho[j] += rho_tmp[threadIdx.x]; +// parts_soa.rho_dh[j] += rho_dh_tmp[threadIdx.x]; +// parts_soa.wcount[j] += wcount_tmp[threadIdx.x]; +// parts_soa.wcount_dh[j] =+ wcount_dh_tmp[threadIdx.x]; +// parts_soa.div_v[j] += div_v_tmp[threadIdx.x]; +// parts_soa.rot_ux[j] += rot_ux_tmp[threadIdx.x]; +// parts_soa.rot_uy[j] =+ rot_uy_tmp[threadIdx.x]; +// parts_soa.rot_uz[j] += rot_uz_tmp[threadIdx.x]; + } /*Loop through parts in cell j one BLOCK_SIZE at a time*/ + if (pid >= ci_start && pid < ci_end) { +// if(n_neighbours > 0){ +// distby2h = distby2h/n_neighbours; +// av_dist = av_dist/(n_neighbours*dx); +// } +// av_distx = av_distx/(n_neighbours*dx); +// av_disty = av_disty/(n_neighbours*dx); +// av_distz = av_distz/(n_neighbours*dx); + parts_soa.rho[pid] = rhoi, parts_soa.rho_dh[pid] = rho_dhi; + parts_soa.wcount[pid] = wcounti, parts_soa.wcount_dh[pid] = wcount_dhi; + parts_soa.div_v[pid] = div_vi; + parts_soa.rot_ux[pid] = rot_uxi, parts_soa.rot_uy[pid] = rot_uyi; + parts_soa.rot_uz[pid] = 
rot_uzi; +// if(rhoi != 0.f)printf("rho i %f, rho_dh i %f\n", rhoi, rho_dhi); + } + +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void runner_do_pair_density_GPU( + struct part_soa parts_soa, int *d_task_first_part_ci, int *d_task_first_part_cj, + int *d_task_last_part_ci, int *d_task_last_part_cj, float d_a, float d_H, int bid, int tid, int count_tasks, + int tasksperbundle, int nBlocks_per_task, int bundle_first_task, int time_bin_inhibited) { + + extern __shared__ float vars[]; +// __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); + int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; + int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; + + first_part_in_task_blocks_ci = d_task_first_part_ci[task_id]; + last_part_in_task_blocks_ci = d_task_last_part_ci[task_id]; + first_part_in_task_blocks_cj = d_task_first_part_cj[task_id]; + last_part_in_task_blocks_cj = d_task_last_part_cj[task_id]; + + // Now we start calculations for particles in cell i + const int pid = threadid + first_part_in_task_blocks_ci; + + /*Don't ever put me in an if statement. I've got __syncthreads inside*/ + DOPAIRGPU( + parts_soa, pid, last_part_in_task_blocks_ci, + first_part_in_task_blocks_cj, last_part_in_task_blocks_cj, d_a, d_H, + time_bin_inhibited, vars); +// __syncthreads(); + // Now we start calculations for particles in cell i + const int pjd = threadid + last_part_in_task_blocks_ci; + /*Don't ever put me in an if statement. I've got __syncthreads inside*/ + DOPAIRGPU( + parts_soa, pjd, last_part_in_task_blocks_cj, + first_part_in_task_blocks_ci, last_part_in_task_blocks_ci, d_a, d_H, + time_bin_inhibited, vars); + +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void runner_do_pair_sym_density_GPU( + struct part_soa parts_soa, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, int bid, int tid, int count_tasks, + int tasksperbundle, int nBlocks_per_task, int bundle_first_task, int time_bin_inhibited, double *d_shift_x + , double *d_shift_y, double *d_shift_z) { + + extern __shared__ float vars_pair[]; +// __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); +// int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; +// int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; + const int task_id_tmp = 2 * task_id; + const int ci_start = d_task_first_parts_pair[task_id_tmp]; + const int ci_end = d_task_last_parts_pair[task_id_tmp]; + const int cj_start = d_task_first_parts_pair[task_id_tmp + 1]; + const int cj_end = d_task_last_parts_pair[task_id_tmp + 1]; + + // Now we start calculations for particles in cell i + const int pid = threadid + ci_start; + + /*Don't ever put me in an if statement. 
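+   * (A __syncthreads() barrier must be reached by every thread of the block, so
+   * wrapping this call in a divergent branch is undefined and will typically hang
+   * the kernel; out-of-range threads are instead masked inside the helper with
+   * per-particle if tests while the barriers stay on the common path.)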
I've got __syncthreads inside*/ + DOPAIR2GPU( + parts_soa, pid, ci_start, ci_end, + cj_start, cj_end, d_a, d_H, + time_bin_inhibited, vars_pair, d_shift_x, d_shift_y, d_shift_z, task_id_tmp); +// __syncthreads(); + +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void runner_do_pair_nonsym_density_GPU( + struct part_soa parts_soa, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, int bid, int tid, int count_tasks, + int tasksperbundle, int nBlocks_per_task, int bundle_first_task, int time_bin_inhibited, double *d_shift_x + , double *d_shift_y, double *d_shift_z) { + + extern __shared__ float vars_pair[]; +// __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); +// int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; +// int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; + const int task_id_tmp = 2 * task_id; + const int ci_start = d_task_first_parts_pair[task_id_tmp]; + const int ci_end = d_task_last_parts_pair[task_id_tmp]; + const int cj_start = d_task_first_parts_pair[task_id_tmp + 1]; + const int cj_end = d_task_last_parts_pair[task_id_tmp + 1]; + + /* Start calculations for particles in cell i + * Don't ever put me in an if statement. I've got __syncthreads inside*/ + const int pid = threadid + ci_start; + const int flip_i = 1; + DOPAIR2NONSYMGPU( + parts_soa, pid, ci_start, ci_end, + cj_start, cj_end, d_a, d_H, + vars_pair, d_shift_x, d_shift_y, d_shift_z, task_id_tmp, flip_i); + + /*Necessary evil to stop parts from j and i co-existing on shared memory for sums*/ + __syncthreads(); + + /*Now do cj + * Don't ever put me in an if statement. I've got __syncthreads inside*/ + const int pjd = threadid + cj_start; + const int flip_j = -1; + DOPAIR2NONSYMGPU( + parts_soa, pjd, cj_start, cj_end, + ci_start, ci_end, d_a, d_H, + vars_pair, d_shift_x, d_shift_y, d_shift_z, task_id_tmp + 1, flip_j); + +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void runner_do_pair_ci_density_GPU( + struct part_soa parts_soa, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, int bid, int tid, int count_tasks, + int tasksperbundle, int nBlocks_per_task, int bundle_first_task, double *d_shift_x + , double *d_shift_y, double *d_shift_z) { + + extern __shared__ float vars_pair[]; +// __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); +// int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; +// int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; + const int task_id_tmp = 2 * task_id; + const int ci_start = d_task_first_parts_pair[task_id_tmp]; + const int ci_end = d_task_last_parts_pair[task_id_tmp]; + const int cj_start = d_task_first_parts_pair[task_id_tmp + 1]; + const int cj_end = d_task_last_parts_pair[task_id_tmp + 1]; + + /* Start calculations for particles in cell i + * Don't ever put me in an if statement. 
I've got __syncthreads inside*/ + const int pid = threadid + ci_start; + const int flip_i = 1; + DOPAIR2NONSYMGPU( + parts_soa, pid, ci_start, ci_end, + cj_start, cj_end, d_a, d_H, + vars_pair, d_shift_x, d_shift_y, d_shift_z, task_id_tmp, flip_i); + +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void runner_do_pair_cj_density_GPU( + struct part_soa parts_soa, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, int bid, int tid, int count_tasks, + int tasksperbundle, int nBlocks_per_task, int bundle_first_task, double *d_shift_x + , double *d_shift_y, double *d_shift_z) { + + extern __shared__ float vars_pair[]; +// __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); +// int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; +// int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; + const int task_id_tmp = 2 * task_id; + const int ci_start = d_task_first_parts_pair[task_id_tmp]; + const int ci_end = d_task_last_parts_pair[task_id_tmp]; + const int cj_start = d_task_first_parts_pair[task_id_tmp + 1]; + const int cj_end = d_task_last_parts_pair[task_id_tmp + 1]; + + /*Now do cj + * Don't ever put me in an if statement. I've got __syncthreads inside*/ + const int pjd = threadid + cj_start; + const int flip_j = -1; + DOPAIR2NONSYMGPU( + parts_soa, pjd, cj_start, cj_end, + ci_start, ci_end, d_a, d_H, + vars_pair, d_shift_x, d_shift_y, d_shift_z, task_id_tmp + 1, flip_j); + +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void runner_do_pair_ci_density_GPU_aos( + struct part_aos *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, int bid, int tid, int count_tasks, + int tasksperbundle, int nBlocks_per_task, int bundle_first_task, double *d_shift_x + , double *d_shift_y, double *d_shift_z) { + + extern __shared__ float vars_pair_aos[]; +// __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); +// int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; +// int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; + const int task_id_tmp = 2 * task_id; + const int ci_start = d_task_first_parts_pair[task_id_tmp]; + const int ci_end = d_task_last_parts_pair[task_id_tmp]; + const int cj_start = d_task_first_parts_pair[task_id_tmp + 1]; + const int cj_end = d_task_last_parts_pair[task_id_tmp + 1]; + + /* Start calculations for particles in cell i + * Don't ever put me in an if statement. 
I've got __syncthreads inside*/ + const int pid = threadid + ci_start; + const int flip_i = 1; + DOPAIR2NONSYMGPUAOS( + parts_aos, pid, ci_start, ci_end, + cj_start, cj_end, d_a, d_H, + vars_pair_aos, d_shift_x, d_shift_y, d_shift_z, task_id_tmp, flip_i); + +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void runner_do_pair_cj_density_GPU_aos( + struct part_aos *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, int bid, int tid, int count_tasks, + int tasksperbundle, int nBlocks_per_task, int bundle_first_task, double *d_shift_x + , double *d_shift_y, double *d_shift_z) { + + extern __shared__ float vars_pair_aos[]; +// __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); +// int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; +// int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; + const int task_id_tmp = 2 * task_id; + const int ci_start = d_task_first_parts_pair[task_id_tmp]; + const int ci_end = d_task_last_parts_pair[task_id_tmp]; + const int cj_start = d_task_first_parts_pair[task_id_tmp + 1]; + const int cj_end = d_task_last_parts_pair[task_id_tmp + 1]; + + /*Now do cj + * Don't ever put me in an if statement. I've got __syncthreads inside*/ + const int pjd = threadid + cj_start; + const int flip_j = -1; + DOPAIR2NONSYMGPUAOS( + parts_aos, pjd, cj_start, cj_end, + ci_start, ci_end, d_a, d_H, + vars_pair_aos, d_shift_x, d_shift_y, d_shift_z, task_id_tmp + 1, flip_j); + +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void runner_do_pair_ci_density_GPU_aos_f4( + struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv, int4 *fparti_fpartj_lparti_lpartj_dens, + float d_a, float d_H, int bundle_first_task) { + + extern __shared__ float4 vars_pair_i_f4[]; +// __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); +// int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; +// int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; + const int ci_start = fparti_fpartj_lparti_lpartj_dens[task_id].x; + const int cj_start = fparti_fpartj_lparti_lpartj_dens[task_id].y; + const int ci_end = fparti_fpartj_lparti_lpartj_dens[task_id].z; + const int cj_end = fparti_fpartj_lparti_lpartj_dens[task_id].w; + + /* Start calculations for particles in cell i + * Don't ever put me in an if statement. 
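(In the _f4 variants the four per-task particle indices travel packed in one int4: as unpacked just above, .x and .y hold the first particles of ci and cj and .z and .w the corresponding last indices. A hypothetical host-side helper that would produce that layout; pack_pair_task and its argument names are illustrative, only make_int4 is standard CUDA:)

static inline int4 pack_pair_task(const int ci_first, const int ci_last,
                                  const int cj_first, const int cj_last) {
  return make_int4(ci_first, cj_first, ci_last, cj_last);
}
// e.g. fparti_fpartj_lparti_lpartj_dens[t] = pack_pair_task(fi, li, fj, lj);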
I've got __syncthreads inside*/ + const int pid = threadid + ci_start; + + DOPAIR2NONSYMGPUAOSF4(parts_send, parts_recv, pid, ci_start, ci_end, cj_start, cj_end, d_a, d_H, vars_pair_i_f4); + +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void runner_do_pair_cj_density_GPU_aos_f4( + struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv, int4 *fparti_fpartj_lparti_lpartj_dens, + float d_a, float d_H, int bundle_first_task) { + + extern __shared__ float4 vars_pair_j_f4[]; +// __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + const int ci_start = fparti_fpartj_lparti_lpartj_dens[task_id].x; + const int cj_start = fparti_fpartj_lparti_lpartj_dens[task_id].y; + const int ci_end = fparti_fpartj_lparti_lpartj_dens[task_id].z; + const int cj_end =fparti_fpartj_lparti_lpartj_dens[task_id].w; + + /*Now do cj + * Don't ever put me in an if statement. I've got __syncthreads inside*/ + const int pjd = threadid + cj_start; + DOPAIR2NONSYMGPUAOSF4(parts_send, parts_recv, pjd, cj_start, cj_end, ci_start, ci_end, d_a, d_H, vars_pair_j_f4); + +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void runner_do_pair_density_GPU_aos_f4( + struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv, + float d_a, float d_H, int bundle_first_part, int bundle_n_parts) { + +// extern __shared__ float4 vars_pair_i_f4[]; +// __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int pid = bundle_first_part + threadid; +// const int task_id = bundle_first_part + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); +// int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; +// int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; + if(pid < bundle_first_part + bundle_n_parts){ + const struct part_aos_f4_send pi = parts_send[pid]; + const int cj_start = pi.cjs_cje.x; + const int cj_end = pi.cjs_cje.y; + + /* Start calculations for particles in cell i*/ + DOPAIR2NAIVEGPUAOSF4(pi, parts_send, parts_recv, pid, cj_start, cj_end, d_a, d_H); + } + +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void runner_do_pair_ci_density_GPU_aos_g( + struct part_aos_g *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, int bid, int tid, int count_tasks, + int tasksperbundle, int nBlocks_per_task, int bundle_first_task, double *d_shift_x + , double *d_shift_y, double *d_shift_z) { + + extern __shared__ float vars_pair_aosg[]; +// __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); +// int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; +// int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; + const int task_id_tmp = 2 * task_id; + const int ci_start = d_task_first_parts_pair[task_id_tmp]; + const int ci_end = d_task_last_parts_pair[task_id_tmp]; + const int cj_start = d_task_first_parts_pair[task_id_tmp + 1]; + const int cj_end = d_task_last_parts_pair[task_id_tmp + 1]; + + /* Start calculations for particles in cell i + * Don't ever put me in an if statement. 
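(For the per-particle bundle kernels such as runner_do_pair_density_GPU_aos_f4 above, the thread count is rounded up to whole blocks and the in-kernel bound check discards the surplus threads. A rough sketch of the matching host-side sizing; the dynamic shared-memory size and the stream argument in the launch are assumptions, not taken from this patch:)

const int numBlocks_x = (bundle_n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE;  // ceil(n / BLOCK_SIZE)
runner_do_pair_density_GPU_aos_f4<<<dim3(numBlocks_x, numBlocks_y), BLOCK_SIZE, 0, stream>>>(
    parts_send, parts_recv, d_a, d_H, bundle_first_part, bundle_n_parts);
// The guard  if (pid < bundle_first_part + bundle_n_parts)  in the kernel
// absorbs the rounding-up of the last block.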
I've got __syncthreads inside*/ + const int pid = threadid + ci_start; + const int flip_i = 1; + DOPAIR2NONSYMGPUAOSG( + parts_aos, pid, ci_start, ci_end, + cj_start, cj_end, d_a, d_H, + vars_pair_aosg, d_shift_x, d_shift_y, d_shift_z, task_id_tmp, flip_i); + +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void runner_do_pair_cj_density_GPU_aos_g( + struct part_aos_g *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, int bid, int tid, int count_tasks, + int tasksperbundle, int nBlocks_per_task, int bundle_first_task, double *d_shift_x + , double *d_shift_y, double *d_shift_z) { + + extern __shared__ float vars_pair_aosg[]; +// __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); +// int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; +// int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; + const int task_id_tmp = 2 * task_id; + const int ci_start = d_task_first_parts_pair[task_id_tmp]; + const int ci_end = d_task_last_parts_pair[task_id_tmp]; + const int cj_start = d_task_first_parts_pair[task_id_tmp + 1]; + const int cj_end = d_task_last_parts_pair[task_id_tmp + 1]; + + /*Now do cj + * Don't ever put me in an if statement. I've got __syncthreads inside*/ + const int pjd = threadid + cj_start; + const int flip_j = -1; + DOPAIR2NONSYMGPUAOSG( + parts_aos, pjd, cj_start, cj_end, + ci_start, ci_end, d_a, d_H, + vars_pair_aosg, d_shift_x, d_shift_y, d_shift_z, task_id_tmp + 1, flip_j); + +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void runner_do_pair_gradient_GPU_aos_f4( + struct part_aos_f4_g_send *parts_send, struct part_aos_f4_g_recv *parts_recv, + float d_a, float d_H, int bundle_first_part, int bundle_n_parts) { + +// extern __shared__ float4 vars_pair_i_f4[]; +// __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int pid = bundle_first_part + threadid; +// const int task_id = bundle_first_part + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); + if(pid < bundle_first_part + bundle_n_parts){ + const struct part_aos_f4_g_send pi = parts_send[pid]; + const int cj_start = pi.cjs_cje.x; + const int cj_end = pi.cjs_cje.y; + /* Start calculations for particles in cell i*/ + DOPAIR2NAIVEGPUAOSF4G(pi, parts_send, parts_recv, pid, cj_start, cj_end, d_a, d_H); + } + +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void runner_do_pair_ci_density_GPU_aos_f( + struct part_aos_f *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, int bid, int tid, int count_tasks, + int tasksperbundle, int nBlocks_per_task, int bundle_first_task, double *d_shift_x + , double *d_shift_y, double *d_shift_z) { + + extern __shared__ float vars_pair_aosf[]; +// __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); +// int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; +// int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; + const int task_id_tmp = 2 * task_id; + const int ci_start = 
d_task_first_parts_pair[task_id_tmp]; + const int ci_end = d_task_last_parts_pair[task_id_tmp]; + const int cj_start = d_task_first_parts_pair[task_id_tmp + 1]; + const int cj_end = d_task_last_parts_pair[task_id_tmp + 1]; + + /* Start calculations for particles in cell i + * Don't ever put me in an if statement. I've got __syncthreads inside*/ + const int pid = threadid + ci_start; + const int flip_i = 1; + DOPAIR2NONSYMGPUAOSF( + parts_aos, pid, ci_start, ci_end, + cj_start, cj_end, d_a, d_H, + vars_pair_aosf, d_shift_x, d_shift_y, d_shift_z, task_id_tmp, flip_i); + +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void runner_do_pair_cj_density_GPU_aos_f( + struct part_aos_f *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, int bid, int tid, int count_tasks, + int tasksperbundle, int nBlocks_per_task, int bundle_first_task, double *d_shift_x + , double *d_shift_y, double *d_shift_z) { + + extern __shared__ float vars_pair_aosf[]; +// __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); +// int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; +// int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; + const int task_id_tmp = 2 * task_id; + const int ci_start = d_task_first_parts_pair[task_id_tmp]; + const int ci_end = d_task_last_parts_pair[task_id_tmp]; + const int cj_start = d_task_first_parts_pair[task_id_tmp + 1]; + const int cj_end = d_task_last_parts_pair[task_id_tmp + 1]; + + /*Now do cj + * Don't ever put me in an if statement. I've got __syncthreads inside*/ + const int pjd = threadid + cj_start; + const int flip_j = -1; + DOPAIR2NONSYMGPUAOSF( + parts_aos, pjd, cj_start, cj_end, + ci_start, ci_end, d_a, d_H, + vars_pair_aosf, d_shift_x, d_shift_y, d_shift_z, task_id_tmp + 1, flip_j); + +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__global__ void runner_do_pair_force_GPU_aos_f4( + struct part_aos_f4_f_send *parts_send, struct part_aos_f4_f_recv *parts_recv, + float d_a, float d_H, int bundle_first_part, int bundle_n_parts) { + +// extern __shared__ float4 vars_pair_i_f4[]; +// __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int pid = bundle_first_part + threadid; +// const int task_id = bundle_first_part + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); + if(pid < bundle_first_part + bundle_n_parts){ + const struct part_aos_f4_f_send pi = parts_send[pid]; + const int cj_start = pi.cjs_cje.x; + const int cj_end = pi.cjs_cje.y; + /* Start calculations for particles in cell i */ + DOPAIR2NAIVEGPUAOSF4F(pi, parts_send, parts_recv, pid, cj_start, cj_end, d_a, d_H); + } + +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void runner_dopair1_branch_density_gpu(struct part_soa parts_soa, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, cudaStream_t stream, + int bid, int block_size, int count_tasks, int tasksperbundle,int max_parts_i, int max_parts_j, + int numBlocks_y, int tid, int offset, int bundle_first_task, int time_bin_inhibited, double * d_shift_x + , double * d_shift_y, double * d_shift_z) { + + + int max_parts = max(max_parts_j, max_parts_i); + int numBlocks_x = (max_parts + 
BLOCK_SIZE - 1) / BLOCK_SIZE; + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; +// fprintf(stderr,"max_parts %i, max_partsi %i, max_partsj %i\n, " +// "numBlocks_x %i, numBlocks_y %i, BLOCK_SIZE %i\n", max_parts, +// max_parts_i, max_parts_j, numBlocks_x, numBlocks_y, BLOCK_SIZE); + + /*Do ci & cj*/ +// fprintf(stderr, "BLOCK_SIZE %i max parts %i num idle threads %i\n", BLOCK_SIZE, max_parts, numBlocks_x * BLOCK_SIZE - max_parts); + +// runner_do_pair_sym_density_GPU<<>>( +// parts_soa, d_task_first_parts_pair, d_task_last_parts_pair, +// d_a, d_H, bid, tid, count_tasks, tasksperbundle, +// nBlocks_per_task, bundle_first_task, time_bin_inhibited, d_shift_x, d_shift_y, d_shift_z); + + runner_do_pair_nonsym_density_GPU<<>>( + parts_soa, d_task_first_parts_pair, d_task_last_parts_pair, + d_a, d_H, bid, tid, count_tasks, tasksperbundle, + nBlocks_per_task, bundle_first_task, time_bin_inhibited, d_shift_x, d_shift_y, d_shift_z); + +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void runner_dopairci_branch_density_gpu(struct part_soa parts_soa, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, cudaStream_t stream, + int bid, int block_size, int count_tasks, int tasksperbundle,int max_parts_i, int max_parts_j, + int numBlocks_y, int tid, int offset, int bundle_first_task, double * d_shift_x + , double * d_shift_y, double * d_shift_z) { + + + int max_parts = max_parts_i; + int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + + + runner_do_pair_ci_density_GPU<<>>( + parts_soa, d_task_first_parts_pair, d_task_last_parts_pair, + d_a, d_H, bid, tid, count_tasks, tasksperbundle, + nBlocks_per_task, bundle_first_task, d_shift_x, d_shift_y, d_shift_z); + +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void runner_dopaircj_branch_density_gpu(struct part_soa parts_soa, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, cudaStream_t stream, + int bid, int block_size, int count_tasks, int tasksperbundle,int max_parts_i, int max_parts_j, + int numBlocks_y, int tid, int offset, int bundle_first_task, double * d_shift_x + , double * d_shift_y, double * d_shift_z) { + + + int max_parts = max_parts_j; + int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + + + runner_do_pair_cj_density_GPU<<>>( + parts_soa, d_task_first_parts_pair, d_task_last_parts_pair, + d_a, d_H, bid, tid, count_tasks, tasksperbundle, + nBlocks_per_task, bundle_first_task, d_shift_x, d_shift_y, d_shift_z); + +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void runner_dopairci_branch_density_gpu_aos(struct part_aos *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, cudaStream_t stream, + int bid, int block_size, int count_tasks, int tasksperbundle,int max_parts_i, int max_parts_j, + int numBlocks_y, int tid, int offset, int bundle_first_task, double * d_shift_x + , double * d_shift_y, double * d_shift_z) { + + + int max_parts = max_parts_i; + int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + + + runner_do_pair_ci_density_GPU_aos<<>>( + parts_aos, 
d_task_first_parts_pair, d_task_last_parts_pair, + d_a, d_H, bid, tid, count_tasks, tasksperbundle, + nBlocks_per_task, bundle_first_task, d_shift_x, d_shift_y, d_shift_z); + +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void runner_dopaircj_branch_density_gpu_aos(struct part_aos *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, cudaStream_t stream, + int bid, int block_size, int count_tasks, int tasksperbundle,int max_parts_i, int max_parts_j, + int numBlocks_y, int tid, int offset, int bundle_first_task, double * d_shift_x + , double * d_shift_y, double * d_shift_z) { + + + int max_parts = max_parts_j; + int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + + + runner_do_pair_cj_density_GPU_aos<<>>( + parts_aos, d_task_first_parts_pair, d_task_last_parts_pair, + d_a, d_H, bid, tid, count_tasks, tasksperbundle, + nBlocks_per_task, bundle_first_task, d_shift_x, d_shift_y, d_shift_z); + +} +#ifdef WITH_CUDA +} +#endif + + +#ifdef WITH_CUDA +extern "C" { +#endif +void runner_dopairci_branch_density_gpu_aos_f4(struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv, + float d_a, float d_H, cudaStream_t stream, + int numBlocks_x, int numBlocks_y, int bundle_first_task, int4 *fparti_fpartj_lparti_lpartj_dens){ + + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + + + runner_do_pair_ci_density_GPU_aos_f4<<>>( + parts_send, parts_recv, fparti_fpartj_lparti_lpartj_dens, d_a, d_H, bundle_first_task); + +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void runner_dopaircj_branch_density_gpu_aos_f4(struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv, + float d_a, float d_H, cudaStream_t stream, + int numBlocks_x, int numBlocks_y, int bundle_first_task, int4 *fparti_fpartj_lparti_lpartj_dens) { + + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + + runner_do_pair_cj_density_GPU_aos_f4<<>>( + parts_send, parts_recv, fparti_fpartj_lparti_lpartj_dens, d_a, d_H, bundle_first_task); + +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void runner_dopair_branch_density_gpu_aos_f4(struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv, + float d_a, float d_H, cudaStream_t stream, + int numBlocks_x, int numBlocks_y, int bundle_first_part, int bundle_n_parts){ + + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + +// fprintf(stderr, "nblocks %i\n", numBlocks_x); + runner_do_pair_density_GPU_aos_f4<<>>( + parts_send, parts_recv, d_a, d_H, bundle_first_part, bundle_n_parts); + +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void runner_dopairci_branch_density_gpu_aos_g(struct part_aos_g *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, cudaStream_t stream, + int bid, int block_size, int count_tasks, int tasksperbundle,int max_parts_i, int max_parts_j, + int numBlocks_y, int tid, int offset, int bundle_first_task, double * d_shift_x + , double * d_shift_y, double * d_shift_z) { + + + int max_parts = max_parts_i; + int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + + + runner_do_pair_ci_density_GPU_aos_g<<>>( + 
parts_aos, d_task_first_parts_pair, d_task_last_parts_pair, + d_a, d_H, bid, tid, count_tasks, tasksperbundle, + nBlocks_per_task, bundle_first_task, d_shift_x, d_shift_y, d_shift_z); + +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void runner_dopaircj_branch_density_gpu_aos_g(struct part_aos_g *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, cudaStream_t stream, + int bid, int block_size, int count_tasks, int tasksperbundle,int max_parts_i, int max_parts_j, + int numBlocks_y, int tid, int offset, int bundle_first_task, double * d_shift_x + , double * d_shift_y, double * d_shift_z) { + + + int max_parts = max_parts_j; + int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + + + runner_do_pair_cj_density_GPU_aos_g<<>>( + parts_aos, d_task_first_parts_pair, d_task_last_parts_pair, + d_a, d_H, bid, tid, count_tasks, tasksperbundle, + nBlocks_per_task, bundle_first_task, d_shift_x, d_shift_y, d_shift_z); + +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void runner_dopair_branch_gradient_gpu_aos_f4(struct part_aos_f4_g_send *parts_send, struct part_aos_f4_g_recv *parts_recv, + float d_a, float d_H, cudaStream_t stream, + int numBlocks_x, int numBlocks_y, int bundle_first_part, int bundle_n_parts){ + + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + +// fprintf(stderr, "nblocks %i\n", numBlocks_x); + runner_do_pair_gradient_GPU_aos_f4<<>>( + parts_send, parts_recv, d_a, d_H, bundle_first_part, bundle_n_parts); + +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void runner_dopairci_branch_density_gpu_aos_f(struct part_aos_f *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, cudaStream_t stream, + int bid, int block_size, int count_tasks, int tasksperbundle,int max_parts_i, int max_parts_j, + int numBlocks_y, int tid, int offset, int bundle_first_task, double * d_shift_x + , double * d_shift_y, double * d_shift_z) { + + + int max_parts = max_parts_i; + int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + + + runner_do_pair_ci_density_GPU_aos_f<<>>( + parts_aos, d_task_first_parts_pair, d_task_last_parts_pair, + d_a, d_H, bid, tid, count_tasks, tasksperbundle, + nBlocks_per_task, bundle_first_task, d_shift_x, d_shift_y, d_shift_z); + +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void runner_dopaircj_branch_density_gpu_aos_f(struct part_aos_f *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, cudaStream_t stream, + int bid, int block_size, int count_tasks, int tasksperbundle,int max_parts_i, int max_parts_j, + int numBlocks_y, int tid, int offset, int bundle_first_task, double * d_shift_x + , double * d_shift_y, double * d_shift_z) { + + + int max_parts = max_parts_j; + int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + + + runner_do_pair_cj_density_GPU_aos_f<<>>( + parts_aos, d_task_first_parts_pair, d_task_last_parts_pair, + d_a, d_H, bid, tid, count_tasks, tasksperbundle, + nBlocks_per_task, bundle_first_task, d_shift_x, d_shift_y, d_shift_z); + +} +#ifdef 
WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void runner_dopair_branch_force_gpu_aos_f4(struct part_aos_f4_f_send *parts_send, struct part_aos_f4_f_recv *parts_recv, + float d_a, float d_H, cudaStream_t stream, + int numBlocks_x, int numBlocks_y, int bundle_first_part, int bundle_n_parts){ + + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + +// fprintf(stderr, "nblocks %i\n", numBlocks_x); + runner_do_pair_force_GPU_aos_f4<<>>( + parts_send, parts_recv, d_a, d_H, bundle_first_part, bundle_n_parts); + +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif + +__global__ void runner_do_self_density_GPU_naive( + struct part_soa parts_soa, int *d_task_first_part, int *d_task_last_part, float d_a, float d_H, + int bid, int tid, int count_tasks, int tasksperbundle, int nBlocks_per_task, + int bundle_first_task, int max_parts, int time_bin_inhibited) { + + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); + __shared__ int first_part_in_task_blocks, last_part_in_task_blocks; + first_part_in_task_blocks = d_task_first_part[task_id]; + last_part_in_task_blocks = d_task_last_part[task_id]; + + const int pid = threadid + first_part_in_task_blocks; + + int ttid = 0; + int first_part = 0; + int count = 0; + int last_part = 0; + float cellx = 0.0, celly = 0.0, cellz = 0.0; + float hi = 0.0, hig2 = hi * hi * kernel_gamma2; + float mi = 0.0; + float uxi = 0.0; + float uyi = 0.0; + float uzi = 0.0; + float pix = 0.0; + float piy = 0.0; + float piz = 0.0; + float rhoi = 0.0; + float rho_dhi = 0.0; + float wcounti = 0.0; + float wcount_dhi = 0.0; + float div_vi = 0.0; + float rot_uxi = 0.0; + float rot_uyi = 0.0; + float rot_uzi = 0.0; + int Found_neighbours = 0; + + if (pid < last_part_in_task_blocks) { + ttid = parts_soa.tid_p[pid]; + first_part = d_task_first_part[ttid]; + last_part = d_task_last_part[ttid]; + count = last_part - first_part; + cellx = parts_soa.locx[pid], celly = parts_soa.locy[pid], + cellz = parts_soa.locz[pid]; + hi = parts_soa.h[pid], hig2 = hi * hi * kernel_gamma2; + mi = parts_soa.mass[pid]; + uxi = parts_soa.ux[pid]; + uyi = parts_soa.uy[pid]; + uzi = parts_soa.uz[pid]; + pix = parts_soa.x_p[pid] - cellx; + piy = parts_soa.y_p[pid] - celly; + piz = parts_soa.z_p[pid] - cellz; + + int n_neighbours = 0; + + /*Naive loop over neighbours*/ + for (int b = first_part_in_task_blocks; b < last_part_in_task_blocks; + b += BLOCK_SIZE) { + for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) { + int j = j_block + b; + if (j < last_part_in_task_blocks) { + const float x_p_tmp = parts_soa.x_p[j]; + const float y_p_tmp = parts_soa.y_p[j]; + const float z_p_tmp = parts_soa.z_p[j]; + const float h_tmp = parts_soa.h[j]; + const float mass_tmp = parts_soa.mass[j]; + const float ux_tmp = parts_soa.ux[j]; + const float uy_tmp = parts_soa.uy[j]; + const float uz_tmp = parts_soa.uz[j]; + const timebin_t timebin = parts_soa.time_bin[j]; + + /* Compute the pairwise distance. 
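 * The statements below accumulate, for particle i over every neighbour j
 * with r_ij < gamma * h_i, the usual SPH density-loop sums, where W and
 * dW/du come from d_kernel_deval, u = r_ij / h_i, v_ij = v_i - v_j and
 * x_ij = x_i - x_j (3 = hydro_dimension):
 *
 *   rho_i     += m_j W            rho_dh_i    -= m_j (3 W + u dW/du)
 *   wcount_i  += W                wcount_dh_i -= (3 W + u dW/du)
 *   div_v_i   -= m_j (dW/du / r_ij) (v_ij . x_ij)
 *   rot_u_i   += m_j (dW/du / r_ij) (v_ij x x_ij)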
*/ + const float pjx = x_p_tmp - cellx; + const float pjy = y_p_tmp - celly; + const float pjz = z_p_tmp - cellz; + const float xij = pix - pjx, yij = piy - pjy, zij = piz - pjz; + const float r2 = xij * xij + yij * yij + zij * zij; + const float hj = h_tmp, hjg2 = hj * hj * kernel_gamma2; + if (r2 < hig2 && r2 > (0.01f/128.f)*(0.01f/128.f)) { + Found_neighbours=1; + const float r = sqrt(r2); + /* Recover some data */ + const float mj = mass_tmp; + /* Get the kernel for hi. */ + if(hi<1.f/128.f)printf("h < dx\n"); + const float h_inv = 1.f / hi; + const float ui = r * h_inv; + float wi, wi_dx; + + d_kernel_deval(ui, &wi, &wi_dx); + + rhoi += mj * wi; + rho_dhi -= mj * (hydro_dimension * wi + ui * wi_dx); + + wcounti += wi; + wcount_dhi -= (hydro_dimension * wi + ui * wi_dx); + + const float r_inv = 1.f / r; + const float faci = mj * wi_dx * r_inv; + + /* Compute dv dot r */ + float dvx = uxi - ux_tmp, dvy = uyi - uy_tmp, + dvz = uzi - uz_tmp; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + + div_vi -= faci * dvdr; + + /* Compute dv cross r */ + float curlvrx = dvy * zij - dvz * yij; + float curlvry = dvz * xij - dvx * zij; + float curlvrz = dvx * yij - dvy * xij; + + rot_uxi += faci * curlvrx; + rot_uyi += faci * curlvry; + rot_uzi += faci * curlvrz; + } + } + } + } +// float wi, wi_dx; +// d_kernel_deval(0.f, &wi, &wi_dx); + if(Found_neighbours == 0) printf("Not sure what's going on but no neighbours found in GPU loop\n"); + parts_soa.rho[pid] = rhoi, parts_soa.rho_dh[pid] = rho_dhi; + parts_soa.wcount[pid] = wcounti, parts_soa.wcount_dh[pid] = wcount_dhi; + parts_soa.div_v[pid] = div_vi; + parts_soa.rot_ux[pid] = rot_uxi, parts_soa.rot_uy[pid] = rot_uyi, + parts_soa.rot_uz[pid] = rot_uzi; + } +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void launch_tester_kernel(struct part_soa parts_soa, int *d_task_first_part, + int *d_task_last_part, float d_a, float d_H, + const char *loop_type, cudaStream_t stream, int bid, + int block_size, int count_tasks, int tasksperbundle, + int numBlocks_x, int numBlocks_y, int tid, + int offset, int bundle_first_task, int max_parts, + int time_bin_inhibited) { + + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + tester<<>>( + parts_soa, d_task_first_part, d_task_last_part, d_a, d_H, bid, tid, count_tasks, tasksperbundle, + nBlocks_per_task, bundle_first_task, max_parts, time_bin_inhibited); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void launch_density_kernel(struct part_soa parts_soa, int *d_task_first_part, + int *d_task_last_part, float d_a, float d_H, + const char *loop_type, cudaStream_t stream, + int block_size, int count_tasks, int tasksperbundle, + int numBlocks_x, int numBlocks_y, + int bundle_first_task, int max_parts) { + + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + runner_do_self_density_GPU<<>>( + parts_soa, d_task_first_part, d_task_last_part, d_a, d_H, count_tasks, tasksperbundle, + nBlocks_per_task, bundle_first_task, max_parts); +// runner_do_self_density_GPU_naive<<>>( +// parts_soa, d_task_first_part, d_task_last_part, d_a, d_H, bid, tid, count_tasks, tasksperbundle, +// nBlocks_per_task, bundle_first_task, max_parts, time_bin_inhibited); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void launch_gradient_aos(struct part_aos_g *parts_aos, int *d_task_first_part, + int *d_task_last_part, float d_a, float d_H, + const char *loop_type, cudaStream_t stream, + 
int block_size, int count_tasks, int tasksperbundle, + int numBlocks_x, int numBlocks_y, + int bundle_first_task, int max_parts, + double * d_cell_x, + double * d_cell_y, double * d_cell_z) { + + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + DOSELF_GPU_AOS_G<<>>( + parts_aos, d_task_first_part, d_task_last_part, d_a, d_H, count_tasks, tasksperbundle, + nBlocks_per_task, bundle_first_task, max_parts, d_cell_x, + d_cell_y, d_cell_z); +// runner_do_self_density_GPU_naive<<>>( +// parts_soa, d_task_first_part, d_task_last_part, d_a, d_H, bid, tid, count_tasks, tasksperbundle, +// nBlocks_per_task, bundle_first_task, max_parts, time_bin_inhibited); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void launch_gradient_aos_f4(struct part_aos_f4_g_send *parts_send, struct part_aos_f4_g_recv *parts_recv, float d_a, float d_H, + cudaStream_t stream, int numBlocks_x, int numBlocks_y, + int bundle_first_task, int2 * d_task_first_part_f4) { + + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + DOSELF_GPU_AOS_F4_G<<>>( + parts_send, parts_recv, d_a, d_H, bundle_first_task, d_task_first_part_f4); +// runner_do_self_density_GPU_naive<<>>( +// parts_soa, d_task_first_part, d_task_last_part, d_a, d_H, bid, tid, count_tasks, tasksperbundle, +// nBlocks_per_task, bundle_first_task, max_parts, time_bin_inhibited); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void launch_force_aos(struct part_aos_f *parts_aos, int *d_task_first_part, + int *d_task_last_part, float d_a, float d_H, + const char *loop_type, cudaStream_t stream, + int block_size, int count_tasks, int tasksperbundle, + int numBlocks_x, int numBlocks_y, + int bundle_first_task, int max_parts, + double * d_cell_x, + double * d_cell_y, double * d_cell_z) { + + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + DOSELF_GPU_AOS_F<<>>( + parts_aos, d_task_first_part, d_task_last_part, d_a, d_H, count_tasks, tasksperbundle, + nBlocks_per_task, bundle_first_task, max_parts, d_cell_x, + d_cell_y, d_cell_z); +// runner_do_self_density_GPU_naive<<>>( +// parts_soa, d_task_first_part, d_task_last_part, d_a, d_H, bid, tid, count_tasks, tasksperbundle, +// nBlocks_per_task, bundle_first_task, max_parts, time_bin_inhibited); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void launch_force_aos_f4(struct part_aos_f4_f_send *d_parts_send, struct part_aos_f4_f_recv *d_parts_recv, float d_a, float d_H, + cudaStream_t stream, int numBlocks_x, int numBlocks_y, + int bundle_first_task, int2 * d_task_first_part_f4) { + + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + DOSELF_GPU_AOS_F4_F<<>>( + d_parts_send, d_parts_recv, d_a, d_H, bundle_first_task, d_task_first_part_f4); +// runner_do_self_density_GPU_naive<<>>( +// parts_soa, d_task_first_part, d_task_last_part, d_a, d_H, bid, tid, count_tasks, tasksperbundle, +// nBlocks_per_task, bundle_first_task, max_parts, time_bin_inhibited); +} +#ifdef WITH_CUDA +} +#endif diff --git a/src/cuda/GPU_runner_functions.h b/src/cuda/GPU_runner_functions.h new file mode 100755 index 0000000000..d43fc6f2ff --- /dev/null +++ b/src/cuda/GPU_runner_functions.h @@ -0,0 +1,116 @@ +#ifndef CUDA_HEADERS_H +#define CUDA_HEADERS_H +#define n_streams 1024 + +#ifdef __cplusplus +extern "C" { +#endif +#include "part_gpu.h" +void launch_density_kernel(struct part_soa parts_soa, int *d_task_first_part, + int 
*d_task_last_part, float d_a, float d_H, + const char *loop_type, cudaStream_t stream, + int block_size, int count_tasks, int tasksperbundle, + int numBlocks_x, int numBlocks_y, + int bundle_first_task, int max_parts); +void launch_density_aos(struct part_aos *parts_aos, int *d_task_first_part, + int *d_task_last_part, float d_a, float d_H, + const char *loop_type, cudaStream_t stream, + int block_size, int count_tasks, int tasksperbundle, + int numBlocks_x, int numBlocks_y, + int bundle_first_task, int max_parts, + double * d_cell_x, + double * d_cell_y, double * d_cell_z); +void launch_density_aos_f4(struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv, float d_a, float d_H, + cudaStream_t stream, int numBlocks_x, int numBlocks_y, + int bundle_first_task, int2 *d_task_first_part_f4); +void launch_gradient_aos(struct part_aos_g *parts_aos, int *d_task_first_part, + int *d_task_last_part, float d_a, float d_H, + const char *loop_type, cudaStream_t stream, + int block_size, int count_tasks, int tasksperbundle, + int numBlocks_x, int numBlocks_y, + int bundle_first_task, int max_parts, + double * d_cell_x, + double * d_cell_y, double * d_cell_z); +void launch_gradient_aos_f4(struct part_aos_f4_g_send *parts_send, struct part_aos_f4_g_recv *parts_recv, float d_a, float d_H, + cudaStream_t stream, int numBlocks_x, int numBlocks_y, + int bundle_first_task, int2 * d_task_first_part_f4); +void launch_force_aos(struct part_aos_f *parts_aos, int *d_task_first_part, + int *d_task_last_part, float d_a, float d_H, + const char *loop_type, cudaStream_t stream, + int block_size, int count_tasks, int tasksperbundle, + int numBlocks_x, int numBlocks_y, + int bundle_first_task, int max_parts, + double * d_cell_x, + double * d_cell_y, double * d_cell_z); +void launch_force_aos_f4(struct part_aos_f4_f_send *parts_send, struct part_aos_f4_f_recv *parts_recv, float d_a, float d_H, + cudaStream_t stream, int numBlocks_x, int numBlocks_y, + int bundle_first_task, int2 * d_task_first_part_f4); +void launch_density_pair_two_kernels(struct part_soa parts_soa_ci, struct part_soa parts_soa_cj, int *d_task_first_part_ci, + int *d_task_first_part_cj, int *d_task_last_part_ci, int *d_task_last_part_cj, float d_a, float d_H, + const char *loop_type, cudaStream_t stream, int bid, int block_size, int count_tasks, int tasksperbundle, + int max_parts_i, int max_parts_j, int numBlocks_y, int tid, int offset, int bundle_first_task, int max_active_bin); +void runner_dopair1_branch_density_gpu(struct part_soa parts_soa, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, cudaStream_t stream, + int bid, int block_size, int count_tasks, int tasksperbundle,int max_parts_i, int max_parts_j, + int numBlocks_y, int tid, int offset, int bundle_first_task, int max_active_bin, double * d_shift_x, + double * d_shift_y, double * d_shift_z); +void runner_dopairci_branch_density_gpu(struct part_soa parts_soa, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, cudaStream_t stream, + int bid, int block_size, int count_tasks, int tasksperbundle,int max_parts_i, int max_parts_j, + int numBlocks_y, int tid, int offset, int bundle_first_task, double * d_shift_x + , double * d_shift_y, double * d_shift_z); +void runner_dopaircj_branch_density_gpu(struct part_soa parts_soa, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, cudaStream_t stream, + int bid, int 
block_size, int count_tasks, int tasksperbundle,int max_parts_i, int max_parts_j, + int numBlocks_y, int tid, int offset, int bundle_first_task, double * d_shift_x + , double * d_shift_y, double * d_shift_z); +void runner_dopairci_branch_density_gpu_aos(struct part_aos *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, cudaStream_t stream, + int bid, int block_size, int count_tasks, int tasksperbundle,int max_parts_i, int max_parts_j, + int numBlocks_y, int tid, int offset, int bundle_first_task, double * d_shift_x + , double * d_shift_y, double * d_shift_z); +void runner_dopairci_branch_density_gpu_aos_f4(struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv, + float d_a, float d_H, cudaStream_t stream, + int numBlocks_x, int numBlocks_y, int bundle_first_task, int4 *fparti_fpartj_lparti_lpartj_dens); +void runner_dopaircj_branch_density_gpu_aos_f4(struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv, + float d_a, float d_H, cudaStream_t stream, + int numBlocks_x, int numBlocks_y, int bundle_first_task, int4 *fparti_fpartj_lparti_lpartj_dens); +void runner_dopair_branch_density_gpu_aos_f4(struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv, + float d_a, float d_H, cudaStream_t stream, + int numBlocks_x, int numBlocks_y, int bundle_first_part, int bundle_n_parts); +void runner_dopaircj_branch_density_gpu_aos(struct part_aos *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, cudaStream_t stream, + int bid, int block_size, int count_tasks, int tasksperbundle,int max_parts_i, int max_parts_j, + int numBlocks_y, int tid, int offset, int bundle_first_task, double * d_shift_x + , double * d_shift_y, double * d_shift_z); +void runner_dopairci_branch_density_gpu_aos_g(struct part_aos_g *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, cudaStream_t stream, + int bid, int block_size, int count_tasks, int tasksperbundle,int max_parts_i, int max_parts_j, + int numBlocks_y, int tid, int offset, int bundle_first_task, double * d_shift_x + , double * d_shift_y, double * d_shift_z); +void runner_dopaircj_branch_density_gpu_aos_g(struct part_aos_g *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, cudaStream_t stream, + int bid, int block_size, int count_tasks, int tasksperbundle,int max_parts_i, int max_parts_j, + int numBlocks_y, int tid, int offset, int bundle_first_task, double * d_shift_x + , double * d_shift_y, double * d_shift_z); +void runner_dopair_branch_gradient_gpu_aos_f4(struct part_aos_f4_g_send *parts_send, struct part_aos_f4_g_recv *parts_recv, + float d_a, float d_H, cudaStream_t stream, + int numBlocks_x, int numBlocks_y, int bundle_first_part, int bundle_n_parts); +void runner_dopairci_branch_density_gpu_aos_f(struct part_aos_f *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, cudaStream_t stream, + int bid, int block_size, int count_tasks, int tasksperbundle,int max_parts_i, int max_parts_j, + int numBlocks_y, int tid, int offset, int bundle_first_task, double * d_shift_x + , double * d_shift_y, double * d_shift_z); +void runner_dopaircj_branch_density_gpu_aos_f(struct part_aos_f *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, 
cudaStream_t stream, + int bid, int block_size, int count_tasks, int tasksperbundle,int max_parts_i, int max_parts_j, + int numBlocks_y, int tid, int offset, int bundle_first_task, double * d_shift_x + , double * d_shift_y, double * d_shift_z); +void runner_dopair_branch_force_gpu_aos_f4(struct part_aos_f4_f_send *parts_send, struct part_aos_f4_f_recv *parts_recv, + float d_a, float d_H, cudaStream_t stream, + int numBlocks_x, int numBlocks_y, int bundle_first_part, int bundle_n_parts); +#ifdef __cplusplus +} +#endif + +#endif // CUDA_HEADER_H diff --git a/src/cuda/device_functions.h b/src/cuda/device_functions.h new file mode 100755 index 0000000000..6e4edac6ac --- /dev/null +++ b/src/cuda/device_functions.h @@ -0,0 +1,149 @@ +#ifndef DEVICE_FUNCTIONS_H +#define DEVICE_FUNCTIONS_H +#include "../../config.h" + +/* Local headers. */ +//#include "../dimension.h" +//#include "../error.h" +//#include "../inline.h" +//#include "../minmax.h" +//#include "../vector.h" + +// Is this even necessary? Probably not as our code will operate differently +#define num_cuda_threads 128 +#define hydro_dimension 3.f + +/// Here we define stuff from kernel_hydro.h when using cubic_spline_kernel. +/// Will worry about sorting 'if statements for different kernels later//// +/* First some powers of gamma = H/h */ +#define kernel_gamma ((float)(1.825742)) +#define kernel_gamma_inv ((float)(1. / kernel_gamma)) +#define kernel_gamma2 ((float)(kernel_gamma * kernel_gamma)) +#define kernel_ivals 2 +#define kernel_degree 3 /*!< Degree of the polynomial */ +#define kernel_gamma_dim ((float)(kernel_gamma * kernel_gamma * kernel_gamma)) +#define kernel_gamma_dim_plus_one \ + ((float)(kernel_gamma * kernel_gamma * kernel_gamma * kernel_gamma)) +#define kernel_gamma_inv_dim \ + ((float)(1. / (kernel_gamma * kernel_gamma * kernel_gamma))) +#define kernel_gamma_inv_dim_plus_one \ + ((float)(1. / (kernel_gamma * kernel_gamma * kernel_gamma * kernel_gamma))) +#define kernel_ivals_f ((float)kernel_ivals) /*!< Number of branches */ +#define kernel_constant ((float)(16. * M_1_PI)) +/*! Cosmology default beta=3.0. + * Alpha can be set in the parameter file. + * Beta is defined as in e.g. Price (2010) Eqn (103) */ +#define const_viscosity_beta 3.0f +#ifdef WITH_CUDA +extern "C" { +#endif +/** + * @brief Returns the argument to the power given by the dimension plus one + * + * Computes \f$x^{d+1}\f$. + */ +__device__ float d_pow_dimension_plus_one(float x) { + +#if defined(HYDRO_DIMENSION_3D) + + const float x2 = x * x; + return x2 * x2; + +#elif defined(HYDRO_DIMENSION_2D) + + return x * x * x; + +#elif defined(HYDRO_DIMENSION_1D) + + return x * x; + +#else + + error("The dimension is not defined !"); + return 0.f; + +#endif +} + +/** + * @brief Return the argument to the power three adiabatic index minus five over + * two. + * + * Computes \f$x^{(3\gamma - 5)/2}\f$. + * + * @param x Argument + */ +__device__ float d_pow_three_gamma_minus_five_over_two(float x) { +#if defined(HYDRO_GAMMA_5_3) + + return 1.f; /* x^(0) */ + +#elif defined(HYDRO_GAMMA_7_5) + + return powf(x, -0.4f); /* x^(-2/5) */ + +#elif defined(HYDRO_GAMMA_4_3) + + return 1.f / sqrtf(x); /* x^(-1/2) */ + +#elif defined(HYDRO_GAMMA_2_1) + + return sqrtf(x); /* x^(1/2) */ + +#else + + error("The adiabatic index is not defined !"); + return 0.f; + +#endif +} + +/** + * @brief Computes the kernel function and its derivative. 
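 * For the cubic-spline coefficient table in the body below, writing
 * \f$x = u / \gamma\f$, the polynomial evaluated here is
 * \f$3x^3 - 3x^2 + 1/2\f$ for \f$x < 1/2\f$, \f$(1 - x)^3\f$ for
 * \f$1/2 \le x < 1\f$ and zero beyond, multiplied at the end by the
 * normalisation \f$16 / (\pi \gamma^3)\f$ (kernel_constant *
 * kernel_gamma_inv_dim in 3D).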
+ * + * The kernel function needs to be mutliplied by \f$h^{-d}\f$ and the gradient + * by \f$h^{-(d+1)}\f$, where \f$d\f$ is the dimensionality of the problem. + * + * Returns 0 if \f$u > \gamma = H/h\f$. + * + * @param u The ratio of the distance to the smoothing length \f$u = x/h\f$. + * @param W (return) The value of the kernel function \f$W(x,h)\f$. + * @param dW_dx (return) The norm of the gradient of \f$|\nabla W(x,h)|\f$. + */ +__device__ void d_kernel_deval(float u, float * __restrict__ W, + float * __restrict__ dW_dx) { + + /* Go to the range [0,1[ from [0,H[ */ + const float x = u * kernel_gamma_inv; + + /* Pick the correct branch of the kernel */ + const int temp = (int)(x * kernel_ivals_f); + const int ind = temp > kernel_ivals ? kernel_ivals : temp; + static const float kernel_coeffs[(kernel_degree + 1) * (kernel_ivals + 1)] = { + 3.f, -3.f, 0.f, 0.5f, /* 0 < u < 0.5 */ + -1.f, 3.f, -3.f, 1.f, /* 0.5 < u < 1 */ + 0.f, 0.f, 0.f, 0.f}; /* 1 < u */ + const float *const coeffs = &kernel_coeffs[ind * (kernel_degree + 1)]; + /* First two terms of the polynomial ... */ + float w = coeffs[0] * x + coeffs[1]; + float dw_dx = coeffs[0]; + + /* ... and the rest of them */ + for (int k = 2; k <= kernel_degree; k++) { + dw_dx = dw_dx * x + w; + w = x * w + coeffs[k]; + } + + w = fmaxf(w, 0.f); + dw_dx = fminf(dw_dx, 0.f); + + /* Return everything */ + *W = w * kernel_constant * kernel_gamma_inv_dim; + *dW_dx = dw_dx * kernel_constant * kernel_gamma_inv_dim_plus_one; +} + +#ifdef WITH_CUDA +} +#endif + +#endif // DEVICE_FUNCTIONS_H diff --git a/src/cuda/kernel_definitions.cu b/src/cuda/kernel_definitions.cu new file mode 100755 index 0000000000..82e749725d --- /dev/null +++ b/src/cuda/kernel_definitions.cu @@ -0,0 +1,115 @@ +/******************************************************************************* + * This file contains functions used to setup and execute GPU tasks from within + *runner_main.c. Consider this a translator allowing .cu based functions to be + *called from within runner_main.c + ******************************************************************************/ +#ifdef WITH_CUDA +#ifndef static +#define static +#endif +//#ifndef restrict +//#define restrict __restrict__ +//#endif +#endif + +/* Required header files */ +#include +/*ifdef __cplusplus prevents name mangling. C code sees exact names + of functions rather than mangled template names produced by C++*/ +#ifdef __cplusplus +extern "C" { +#endif +#include "cell_gpu.h" +#include "cuda_headers.h" +#ifdef __cplusplus +} +#endif + +/* function to initialise and printout GPU name*/ +#ifdef __cplusplus +extern "C" { +#endif +void Initialise_GPU() { + int devId = 0; + // find and print device name + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, devId); + printf("Device : %s\n", prop.name); + cudaSetDevice(devId); + // cuda +} +#ifdef __cplusplus +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void CPU_runner_doself1_branch_gradient(struct cell_gpu *restrict ci_gpu) { + int id = ci_gpu->hydro.parts[0].id; + printf("id of first part %d\n", id); + // Do stuff here for interactions on CPU but using the temporary GPU arrays + // const int count_i = ci_gpu->hydro.count; + // const int count_j = cj_gpu->hydro.count; + // system("pause"); + /* Anything to do here? 
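(GPU_runner_doself1_branch_gradient below follows the usual allocate / copy-in / launch / copy-back sequence but discards every CUDA return code, and the SPH_Sum_Self it launches is declared __device__ further down; a kernel entry point has to be __global__ for such a launch to compile. A rough sketch of the same round trip with the return codes checked and the buffer freed; check_cuda and gpu_self_gradient_sketch are illustrative names, and the launch configuration shown is simply the natural one given numBlocks and BLOCK_SIZE:)

static void check_cuda(const cudaError_t err, const char *what) {
  if (err != cudaSuccess)
    fprintf(stderr, "%s: %s\n", what, cudaGetErrorString(err));
}

void gpu_self_gradient_sketch(struct cell_gpu *ci_gpu) {
  const int count = ci_gpu->hydro.count;
  const int numBlocks = (count + BLOCK_SIZE - 1) / BLOCK_SIZE;
  struct cell_gpu *d_ci_gpu = NULL;
  check_cuda(cudaMalloc((void **)&d_ci_gpu, sizeof(struct cell_gpu)), "cudaMalloc");
  check_cuda(cudaMemcpy(d_ci_gpu, ci_gpu, sizeof(struct cell_gpu),
                        cudaMemcpyHostToDevice), "copy to device");
  SPH_Sum_Self<<<numBlocks, BLOCK_SIZE>>>(d_ci_gpu);  // needs a __global__ qualifier
  check_cuda(cudaGetLastError(), "kernel launch");
  check_cuda(cudaMemcpy(ci_gpu, d_ci_gpu, sizeof(struct cell_gpu),
                        cudaMemcpyDeviceToHost), "copy to host");
  check_cuda(cudaFree(d_ci_gpu), "cudaFree");
}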
*/ + // if (!cell_is_active_hydro(ci_gpu, e) && !cell_is_active_hydro(cj_gpu, + // e)) return; +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +void GPU_runner_doself1_branch_gradient(struct cell_gpu *restrict ci_gpu) { + int count = ci_gpu->hydro.count; + int numBlocks = (count + BLOCK_SIZE - 1) / BLOCK_SIZE; + + struct cell_gpu *d_ci_gpu; + cudaMalloc((void **)&d_ci_gpu, sizeof(cell_gpu)); + + cudaMemcpy(d_ci_gpu, ci_gpu, sizeof(cell_gpu), cudaMemcpyHostToDevice); + SPH_Sum_Self<<>>(d_ci_gpu); + cudaMemcpy(ci_gpu, d_ci_gpu, sizeof(cell_gpu), cudaMemcpyDeviceToHost); +} +#ifdef WITH_CUDA +} +#endif + +#ifdef WITH_CUDA +extern "C" { +#endif +__device__ void SPH_Sum_Self(cell_gpu *d_ci_gpu) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int i = index; + float sumLoc, xi, yi, zi; + struct part_gpu *restrict parts = d_ci_gpu->hydro.parts; + xi = parts[i].x[0]; + yi = parts[i].x[1]; + zi = parts[i].x[2]; + sumLoc = 0.f; + float h = parts[i].h, mass = parts[i].mass, rho = parts[i].rho; + const int count = d_ci_gpu->hydro.count; + //__shared__ float sh_x[BLOCK_SIZE], sh_y[BLOCK_SIZE]; + // copy neighbour particles data to shared memory + // for (unsigned int j1=0; j1 +#include +#include +#include + +#ifdef WITH_CUDA +extern "C" { +#endif + +#include "arrays_malloc.h" + +void allocate_host(struct part_soa *parts_soa, int count_max_parts_tmp){ + ///////////Malloc Host arrays + cudaMallocHost((void **)&parts_soa->tid_p, count_max_parts_tmp * sizeof(int)); + cudaMallocHost((void **)&parts_soa->id, count_max_parts_tmp * sizeof(long long)); + cudaMallocHost((void **)&parts_soa->mass, count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->h, count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->u, count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->u_dt, count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->rho, count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->SPH_sum, count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->x_p, count_max_parts_tmp * sizeof(double)); + cudaMallocHost((void **)&parts_soa->y_p, count_max_parts_tmp * sizeof(double)); + cudaMallocHost((void **)&parts_soa->z_p, count_max_parts_tmp * sizeof(double)); + cudaMallocHost((void **)&parts_soa->ux, count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->uy, count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->uz, count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->a_hydrox, count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->a_hydroy, count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->a_hydroz, count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->locx, count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->locy, count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->locz, count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->widthx, count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->widthy, count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->widthz, count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->h_max, count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->count_p, count_max_parts_tmp * sizeof(int)); + cudaMallocHost((void **)&parts_soa->wcount, count_max_parts_tmp * 
sizeof(float)); + cudaMallocHost((void **)&parts_soa->wcount_dh, count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->rho_dh, count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->rot_ux, count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->rot_uy, count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->rot_uz, count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->div_v, count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->div_v_previous_step, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->alpha_visc, count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->v_sig, count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->laplace_u, count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->alpha_diff, count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->f, count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->soundspeed, count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->h_dt, count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->balsara, count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->pressure, count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->alpha_visc_max_ngb, + count_max_parts_tmp * sizeof(float)); + /* timestep stuff */ + cudaMallocHost((void **)&parts_soa->time_bin, count_max_parts_tmp * sizeof(timebin_t)); + cudaMallocHost((void **)&parts_soa->wakeup, count_max_parts_tmp * sizeof(timebin_t)); + cudaMallocHost((void **)&parts_soa->min_ngb_time_bin, + count_max_parts_tmp * sizeof(timebin_t)); + cudaMallocHost((void **)&parts_soa->to_be_synchronized, + count_max_parts_tmp * sizeof(char)); +} + +void allocate_device(struct part_soa d_parts_soa, int count_max_parts_tmp){ + ////////now malloc variables for particle data on the GPU. 
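(As written, allocate_device receives its struct part_soa argument by value, so the device pointers that cudaMalloc stores into d_parts_soa.* exist only in the local copy and are lost to the caller when the function returns; the later allocate_device_dirty avoids this by taking pointer-to-pointer arguments. If these allocations are meant to be handed back through the struct, a by-pointer variant along these lines would be needed; allocate_device_ptr is an illustrative name and only the first few fields are shown:)

void allocate_device_ptr(struct part_soa *d_parts_soa, int count_max_parts_tmp) {
  cudaMalloc((void **)&d_parts_soa->tid_p, sizeof(int) * count_max_parts_tmp);
  cudaMalloc((void **)&d_parts_soa->id, sizeof(long long) * count_max_parts_tmp);
  cudaMalloc((void **)&d_parts_soa->x_p, sizeof(double) * count_max_parts_tmp);
  // ... remaining fields exactly as in allocate_device ...
}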
Sheesh + fprintf(stderr, "before malloc\n"); + cudaMalloc((void **)&(d_parts_soa.tid_p), sizeof(int) * count_max_parts_tmp); + fprintf(stderr, "after malloc\n"); + cudaMalloc((void **)&(d_parts_soa.id), sizeof(long long) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.x_p), sizeof(double) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.y_p), sizeof(double) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.z_p), sizeof(double) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.ux), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.uy), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.uz), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.a_hydrox), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.a_hydroy), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.a_hydroz), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.mass), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.h), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.u), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.u_dt), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.rho), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.SPH_sum), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.locx), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.locy), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.locz), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.widthx), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.widthy), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.widthz), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.h_max), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.count_p), sizeof(int) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.wcount), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.wcount_dh), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.rho_dh), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.rot_ux), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.rot_uy), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.rot_uz), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.div_v), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.div_v_previous_step), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.alpha_visc), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.v_sig), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.laplace_u), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.alpha_diff), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.f), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.soundspeed), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.h_dt), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.balsara), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.pressure), sizeof(float) * 
count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.alpha_visc_max_ngb), + sizeof(float) * count_max_parts_tmp); + /* timestep stuff */ + cudaMalloc((void **)&(d_parts_soa.time_bin), sizeof(timebin_t) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.wakeup), sizeof(timebin_t) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.min_ngb_time_bin), + sizeof(timebin_t) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.to_be_synchronized), + sizeof(char) * count_max_parts_tmp); +} + +cudaError_t cudaAllocInt(int ** d_var, int elements){ + return cudaMalloc((void**)d_var, elements * sizeof(int)); +} +cudaError_t cudaAllocFloat(float ** d_var, int elements){ + return cudaMalloc((void**)d_var, elements * sizeof(float)); +} +cudaError_t cudaAllocDouble(double ** d_var, int elements){ + return cudaMalloc((void**)d_var, elements * sizeof(double)); +} +cudaError_t cudaAllocLonglong(long long ** d_var, int elements){ + return cudaMalloc((void**)d_var, elements * sizeof(long long)); +} +cudaError_t cudaAllocChar(char ** d_var, int elements){ + return cudaMalloc((void**)d_var, elements * sizeof(char)); +} +cudaError_t cudaAllocTimebin(timebin_t ** d_var, int elements){ + return cudaMalloc((void**)d_var, elements * sizeof(timebin_t)); +} + + +void allocate_device_dirty(int **d_tid_p, long long **d_id, double **d_x_p, double **d_y_p, double **d_z_p, + float **d_ux, float **d_uy, float **d_uz, float **d_a_hydrox, float **d_a_hydroy, float **d_a_hydroz, + float **d_mass, float **d_h ,float **d_u, float **d_u_dt, float **d_rho, float **d_locx, float **d_locy, + float **d_locz, float **d_widthx, float **d_widthy, float **d_widthz, float **d_h_max, int **d_count_p, + float **d_wcount, float **d_wcount_dh, float **d_rho_dh, float **d_rot_ux, float **d_rot_uy, float **d_rot_uz, + float **d_div_v, float **d_div_v_previous_step, float **d_alpha_visc, float **d_v_sig, float **d_laplace_u, + float **d_alpha_diff, float **d_f, float **d_soundspeed, float **d_h_dt, float **d_balsara,float **d_pressure, + float **d_alpha_visc_max_ngb, timebin_t **d_time_bin, timebin_t **d_wakeup, timebin_t **d_min_ngb_time_bin, + char **d_to_be_synchronized, int count_max_parts_tmp){ + ////////Malloc variables for particle data on the GPU. 
+
+  size_t free_byte;
+  size_t total_byte;
+
+  cudaError_t cuda_status = cudaMemGetInfo(&free_byte, &total_byte);
+  double free = (double)free_byte;
+  double available = (double)total_byte;
+  double used = (available - free);
+//  message("free %lf used %lf", free/10.E8, used/10.E8);
+
+  cudaError_t cu_error = cudaAllocInt(d_tid_p, count_max_parts_tmp);
+  cu_error = cudaAllocLonglong(d_id, count_max_parts_tmp);
+  cu_error = cudaAllocDouble(d_x_p, count_max_parts_tmp);
+  cu_error = cudaAllocDouble(d_y_p, count_max_parts_tmp);
+  cu_error = cudaAllocDouble(d_z_p, count_max_parts_tmp);
+  cu_error = cudaAllocFloat(d_ux, count_max_parts_tmp);
+  cu_error = cudaAllocFloat(d_uy, count_max_parts_tmp);
+  cu_error = cudaAllocFloat(d_uz, count_max_parts_tmp);
+  cu_error = cudaAllocFloat(d_a_hydrox, count_max_parts_tmp);
+  cu_error = cudaAllocFloat(d_a_hydroy, count_max_parts_tmp);
+  cu_error = cudaAllocFloat(d_a_hydroz, count_max_parts_tmp);
+  cu_error = cudaAllocFloat(d_mass, count_max_parts_tmp);
+  cu_error = cudaAllocFloat(d_h, count_max_parts_tmp);
+  cu_error = cudaAllocFloat(d_u, count_max_parts_tmp);
+  cu_error = cudaAllocFloat(d_u_dt, count_max_parts_tmp);
+  cu_error = cudaAllocFloat(d_rho, count_max_parts_tmp);
+  cu_error = cudaAllocFloat(d_locx, count_max_parts_tmp);
+  cu_error = cudaAllocFloat(d_locy, count_max_parts_tmp);
+  cu_error = cudaAllocFloat(d_locz, count_max_parts_tmp);
+  cu_error = cudaAllocFloat(d_widthx, count_max_parts_tmp);
+  cu_error = cudaAllocFloat(d_widthy, count_max_parts_tmp);
+  cu_error = cudaAllocFloat(d_widthz, count_max_parts_tmp);
+  cu_error = cudaAllocFloat(d_h_max, count_max_parts_tmp);
+  cu_error = cudaAllocInt(d_count_p, count_max_parts_tmp);
+  cu_error = cudaAllocFloat(d_wcount, count_max_parts_tmp);
+  cu_error = cudaAllocFloat(d_wcount_dh, count_max_parts_tmp);
+  cu_error = cudaAllocFloat(d_rho_dh, count_max_parts_tmp);
+  cu_error = cudaAllocFloat(d_rot_ux, count_max_parts_tmp);
+  cu_error = cudaAllocFloat(d_rot_uy, count_max_parts_tmp);
+  cu_error = cudaAllocFloat(d_rot_uz, count_max_parts_tmp);
+  cu_error = cudaAllocFloat(d_div_v, count_max_parts_tmp);
+  cu_error = cudaAllocFloat(d_div_v_previous_step, count_max_parts_tmp);
+  cu_error = cudaAllocFloat(d_alpha_visc, count_max_parts_tmp);
+  cu_error = cudaAllocFloat(d_v_sig, count_max_parts_tmp);
+  cu_error = cudaAllocFloat(d_laplace_u, count_max_parts_tmp);
+  cu_error = cudaAllocFloat(d_alpha_diff, count_max_parts_tmp);
+  cu_error = cudaAllocFloat(d_f, count_max_parts_tmp);
+  cu_error = cudaAllocFloat(d_soundspeed, count_max_parts_tmp);
+  cu_error = cudaAllocFloat(d_h_dt, count_max_parts_tmp);
+  cu_error = cudaAllocFloat(d_balsara, count_max_parts_tmp);
+  cu_error = cudaAllocFloat(d_pressure, count_max_parts_tmp);
+  cu_error = cudaAllocFloat(d_alpha_visc_max_ngb, count_max_parts_tmp);
+  /* timestep stuff */
+  cu_error = cudaAllocTimebin(d_time_bin, count_max_parts_tmp);
+  cu_error = cudaAllocTimebin(d_wakeup, count_max_parts_tmp);
+  cu_error = cudaAllocTimebin(d_min_ngb_time_bin, count_max_parts_tmp);
+  cu_error = cudaAllocChar(d_to_be_synchronized, count_max_parts_tmp);
+//  cuda_status = cudaMemGetInfo(&free_byte, &total_byte);
+//  double free_end = (double)free_byte;
+//  available = (double)total_byte;
+//  double used_end = (available - free_end);
+//  message("cuda malloc self free %lf GB used %lf GB used to allocate self"
+//          " data %lf MB", free_end/10.E8, used_end/10.E8, (used_end - used)/10.E5);
+//  message("at end of malloc dirty: %s",
+//          cudaGetErrorString(cu_error));
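Note that each cudaAlloc* call above overwrites cu_error, so the CUDA_DEBUG check that follows only catches a failure in the very last allocation (d_to_be_synchronized); an earlier out-of-memory error would go unreported. A minimal per-allocation check could look like the sketch below. The CHECK_CUDA macro name is hypothetical and not part of the patch; it assumes <stdio.h>, <stdlib.h> and the CUDA runtime headers are available and reuses the cudaAlloc* wrappers defined earlier in this file.

/* Hypothetical helper (not part of the patch): abort with a useful
 * message as soon as any device allocation fails. */
#define CHECK_CUDA(call)                                          \
  do {                                                            \
    const cudaError_t chk_err = (call);                           \
    if (chk_err != cudaSuccess) {                                 \
      fprintf(stderr, "CUDA error '%s' at %s:%d\n",               \
              cudaGetErrorString(chk_err), __FILE__, __LINE__);   \
      exit(1);                                                    \
    }                                                             \
  } while (0)

/* Example usage with the wrappers above:
 *   CHECK_CUDA(cudaAllocFloat(d_rho, count_max_parts_tmp));
 *   CHECK_CUDA(cudaAllocTimebin(d_time_bin, count_max_parts_tmp));
 */

The same pattern would apply to the cudaMallocHost calls in allocate_host, which currently return unchecked.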
+#ifdef CUDA_DEBUG + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error at end of malloc dirty: %s\n", + cudaGetErrorString(cu_error)); + exit(0); + } +#endif + + +} + +void allocate_device_test(int **tid_test, int count_max_parts_tmp){ + ////////now malloc variables for particle data on the GPU. Sheesh + + cudaMalloc((void **) tid_test, sizeof(int) * count_max_parts_tmp); + + cudaError_t cu_error = cudaPeekAtLastError(); // Get error code + fprintf(stderr, + "malloc tid: %s\n", + cudaGetErrorString(cu_error)); + + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with malloc tid: %s\n", + cudaGetErrorString(cu_error)); + exit(0); + } + +} +/*Function to be overloaded using different part_soa structs + * and allocate their internal arrays + * alloc_type 0 for density, 1 for force, 2 for gradient*/ +void host_malloc(struct part_soa *parts_soa, int alloc_type, int count_max_parts_tmp){ + allocate_host(parts_soa, count_max_parts_tmp); +} +/*Function to be overloaded using different part_soa structs + * and allocate their internal arrays + * alloc_type 0 for density, 1 for force, 2 for gradient*/ +void device_malloc(struct part_soa d_parts_soa, int alloc_type, int count_max_parts_tmp){ + allocate_device(d_parts_soa, count_max_parts_tmp); +} + +void device_malloc_dirty(int **d_tid_p, long long **d_id, double **d_x_p, double **d_y_p, double **d_z_p, + float **d_ux, float **d_uy, float **d_uz, float **d_a_hydrox, float **d_a_hydroy, float **d_a_hydroz, + float **d_mass, float **d_h ,float **d_u, float **d_u_dt, float **d_rho, float **d_locx, float **d_locy, + float **d_locz, float **d_widthx, float **d_widthy, float **d_widthz, float **d_h_max, int **d_count_p, + float **d_wcount, float **d_wcount_dh, float **d_rho_dh, float **d_rot_ux, float **d_rot_uy, float **d_rot_uz, + float **d_div_v, float **d_div_v_previous_step, float **d_alpha_visc, float **d_v_sig, float **d_laplace_u, + float **d_alpha_diff, float **d_f, float **d_soundspeed, float **d_h_dt, float **d_balsara,float **d_pressure, + float **d_alpha_visc_max_ngb, timebin_t **d_time_bin, timebin_t **d_wakeup, timebin_t **d_min_ngb_time_bin, + char **d_to_be_synchronized, int count_max_parts_tmp){ + + allocate_device_dirty(d_tid_p, d_id, d_x_p, d_y_p, d_z_p, + d_ux, d_uy, d_uz, d_a_hydrox, d_a_hydroy, d_a_hydroz, + d_mass, d_h , d_u, d_u_dt, d_rho, d_locx, d_locy, + d_locz, d_widthx, d_widthy, d_widthz, d_h_max, d_count_p, + d_wcount, d_wcount_dh, d_rho_dh, d_rot_ux, d_rot_uy, d_rot_uz, + d_div_v, d_div_v_previous_step, d_alpha_visc, d_v_sig, d_laplace_u, + d_alpha_diff, d_f, d_soundspeed, d_h_dt, d_balsara, d_pressure, + d_alpha_visc_max_ngb, d_time_bin, d_wakeup, d_min_ngb_time_bin, + d_to_be_synchronized, count_max_parts_tmp); + +} + +void device_malloc_test(int **tid_test, int count_max_parts_tmp){ + + allocate_device_test(tid_test, count_max_parts_tmp); + +} + +#ifdef WITH_CUDA +} +#endif diff --git a/src/files_for_new_functions/arrays_malloc.h b/src/files_for_new_functions/arrays_malloc.h new file mode 100755 index 0000000000..798dc7895b --- /dev/null +++ b/src/files_for_new_functions/arrays_malloc.h @@ -0,0 +1,50 @@ +#include "cuda/part_gpu.h" +#include +#include +#include +#include + +cudaError_t cudaAllocInt(int ** d_var, int elements); +cudaError_t cudaAllocFloat(float ** d_var, int elements); +cudaError_t cudaAllocDouble(double ** d_var, int elements); +cudaError_t cudaAllocLonglong(long long ** d_var, int elements); +cudaError_t cudaAllocChar(char ** d_var, int elements); +cudaError_t 
cudaAllocTimebin(timebin_t ** d_var, int elements); + +void allocate_host(struct part_soa *parts_soa, int count_max_parts_tmp); + +void allocate_device(struct part_soa d_parts_soa, int count_max_parts_tmp); + +void allocate_device_dirty(int **d_tid_p, long long **d_id, double **d_x_p, double **d_y_p, double **d_z_p, + float **d_ux, float **d_uy, float **d_uz, float **d_a_hydrox, float **d_a_hydroy, float **d_a_hydroz, + float **d_mass, float **d_h ,float **d_u, float **d_u_dt, float **d_rho, float **d_locx, float **d_locy, + float **d_locz, float **d_widthx, float **d_widthy, float **d_widthz, float **d_h_max, int **d_count_p, + float **d_wcount, float **d_wcount_dh, float **d_rho_dh, float **d_rot_ux, float **d_rot_uy, float **d_rot_uz, + float **d_div_v, float **d_div_v_previous_step, float **d_alpha_visc, float **d_v_sig, float **d_laplace_u, + float **d_alpha_diff, float **d_f, float **d_soundspeed, float **d_h_dt, float **d_balsara,float **d_pressure, + float **d_alpha_visc_max_ngb, timebin_t **d_time_bin, timebin_t **d_wakeup, timebin_t **d_min_ngb_time_bin, + char **d_to_be_synchronized, int count_max_parts_tmp); + +void allocate_device_test(int **tid_test, int count_max_parts_tmp); +/*Function to be overloaded using different part_soa structs + * and allocate their internal arrays + * alloc_type 0 for density, 1 for force, 2 for gradient*/ +void host_malloc(struct part_soa *parts_soa, int alloc_type, int count_max_parts_tmp); + +/*Function to be overloaded using different part_soa structs + * and allocate their internal arrays + * alloc_type 0 for density, 1 for force, 2 for gradient*/ +void device_malloc(struct part_soa d_parts_soa, int alloc_type, int count_max_parts_tmp); + +void device_malloc_dirty(int **d_tid_p, long long **d_id, double **d_x_p, double **d_y_p, double **d_z_p, + float **d_ux, float **d_uy, float **d_uz, float **d_a_hydrox, float **d_a_hydroy, float **d_a_hydroz, + float **d_mass, float **d_h ,float **d_u, float **d_u_dt, float **d_rho, float **d_locx, float **d_locy, + float **d_locz, float **d_widthx, float **d_widthy, float **d_widthz, float **d_h_max, int **d_count_p, + float **d_wcount, float **d_wcount_dh, float **d_rho_dh, float **d_rot_ux, float **d_rot_uy, float **d_rot_uz, + float **d_div_v, float **d_div_v_previous_step, float **d_alpha_visc, float **d_v_sig, float **d_laplace_u, + float **d_alpha_diff, float **d_f, float **d_soundspeed, float **d_h_dt, float **d_balsara,float **d_pressure, + float **d_alpha_visc_max_ngb, timebin_t **d_time_bin, timebin_t **d_wakeup, timebin_t **d_min_ngb_time_bin, + char **d_to_be_synchronized, int count_max_parts_tmp); + +void device_malloc_test(int **tid_test, int count_max_parts_tmp); + diff --git a/src/files_for_new_functions/host_device_data_transfer.cu b/src/files_for_new_functions/host_device_data_transfer.cu new file mode 100755 index 0000000000..238e345e5f --- /dev/null +++ b/src/files_for_new_functions/host_device_data_transfer.cu @@ -0,0 +1,529 @@ +#include "cuda/part_gpu.h" +#include +#include +#include +#include + +#ifdef WITH_CUDA +extern "C" { +#endif + +void host2device_test(int *d_tid_p, int *tid_h, int count_max_parts_tmp){ +// int * tid_h; +// cudaMallocHost((void **)&tid_h, +// count_max_parts_tmp * sizeof(int)); + for (int i =0; i< count_max_parts_tmp; i++){ + tid_h[i] = 100; +// fprintf(stderr,"tid_h %i\n", tid_h[i]); + } + + cudaMemcpy(d_tid_p, tid_h, count_max_parts_tmp * sizeof(int), cudaMemcpyHostToDevice); + cudaDeviceSynchronize(); +// cudaFree(tid_h); +} + +void device2host_test(struct 
part_soa parts_soa, int *tid_h, int count_max_parts_tmp){ + int *tid_p = parts_soa.tid_p; + cudaMemcpy(tid_h, tid_p, count_max_parts_tmp * sizeof(int), cudaMemcpyDeviceToHost); + for (int i =0; i< count_max_parts_tmp; i++){ + fprintf(stderr,"tid is %i\n", tid_h[i]); + } +} + +void device2device_test(int *tid_p, struct part_soa parts_soa, int count_max_parts_tmp){ + cudaMemcpy(tid_p, parts_soa.tid_p, sizeof(int *), cudaMemcpyHostToDevice); +} + +/*Function to be overloaded using different part_soa structs + * and allocate their internal arrays + * alloc_type 0 for density, 1 for force, 2 for gradient*/ +void host_device_test(int *d_tid_p, int *tid_h, int count_max_parts_tmp){ + + host2device_test(d_tid_p, tid_h, count_max_parts_tmp); + +} + +void device_host_test(struct part_soa parts_soa, int *tid_h, int count_max_parts_tmp){ + + device2host_test(parts_soa, tid_h, count_max_parts_tmp); + +} + +void device_device_test(int *tid_p, struct part_soa parts_soa, int count_max_parts_tmp){ + + device2device_test(tid_p, parts_soa, count_max_parts_tmp); + +} + +void device2host_density(struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p, + float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, + float *mass, float *h ,float *u, float *u_dt, float *rho, float *locx, float *locy, + float *locz, float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, float *rot_uy, float *rot_uz, + float *div_v, float *div_v_previous_step, float *alpha_visc, float *v_sig, float *laplace_u, + float *alpha_diff, float *f, float *soundspeed, float *h_dt, float *balsara,float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, + char *to_be_synchronized, int count_max_parts_tmp){ + cudaMemcpy(parts_soa_buffer.tid_p, tid_p, count_max_parts_tmp * sizeof(int), cudaMemcpyDeviceToHost); +} +void device_host_cpy(struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p, + float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, + float *mass, float *h ,float *u, float *u_dt, float *rho, float *locx, float *locy, + float *locz, float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, float *rot_uy, float *rot_uz, + float *div_v, float *div_v_previous_step, float *alpha_visc, float *v_sig, float *laplace_u, + float *alpha_diff, float *f, float *soundspeed, float *h_dt, float *balsara,float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, + char *to_be_synchronized, int count_max_parts_tmp){ + + device2host_density(parts_soa_buffer, tid_p, id, x_p, y_p, z_p, + ux, uy, uz, a_hydrox, a_hydroy, a_hydroz, + mass, h , u, u_dt, rho, locx, locy, + locz, widthx, widthy, widthz, h_max, count_p, + wcount, wcount_dh, rho_dh, rot_ux, rot_uy, rot_uz, + div_v, div_v_previous_step, alpha_visc, v_sig, laplace_u, + alpha_diff, f, soundspeed, h_dt, balsara, pressure, + alpha_visc_max_ngb, time_bin, wakeup, min_ngb_time_bin, + to_be_synchronized, count_max_parts_tmp); + +} + +void device2device_density(struct part_soa *parts_soa_buffer, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p, + float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, + float *mass, float *h ,float *u, float *u_dt, 
float *rho, float *locx, float *locy, + float *locz, float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, float *rot_uy, float *rot_uz, + float *div_v, float *div_v_previous_step, float *alpha_visc, float *v_sig, float *laplace_u, + float *alpha_diff, float *f, float *soundspeed, float *h_dt, float *balsara,float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, + char *to_be_synchronized, int count_max_parts_tmp, cudaStream_t stream){ + + cudaMemcpyAsync(&(parts_soa_buffer->tid_p), &tid_p, + sizeof(int *), cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&(parts_soa_buffer->locx), &locx, + sizeof(float *), cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&(parts_soa_buffer->locy), &locy, + sizeof(float *), cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&(parts_soa_buffer->locz), &locz, + sizeof(float *), cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&(parts_soa_buffer->h), &h, + sizeof(float *), cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&(parts_soa_buffer->mass), &mass, + sizeof(float *), cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&(parts_soa_buffer->x_p), &x_p, + sizeof(double *), cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&(parts_soa_buffer->y_p), &y_p, + sizeof(double *), cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&(parts_soa_buffer->z_p), &z_p, + sizeof(double *), cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&(parts_soa_buffer->ux), &ux, + sizeof(float *), cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&(parts_soa_buffer->uy), &uy, + sizeof(float *), cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&(parts_soa_buffer->uz), &uz, + sizeof(float *), cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&(parts_soa_buffer->time_bin), &time_bin, + sizeof(timebin_t *), cudaMemcpyHostToDevice, stream); + +} + + +void host2device_density(struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p, + float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, + float *mass, float *h ,float *u, float *u_dt, float *rho, float *locx, float *locy, + float *locz, float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, float *rot_uy, float *rot_uz, + float *div_v, float *div_v_previous_step, float *alpha_visc, float *v_sig, float *laplace_u, + float *alpha_diff, float *f, float *soundspeed, float *h_dt, float *balsara,float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, + char *to_be_synchronized, int count_max_parts_tmp){ + cudaError_t cu_error; + cudaMemcpy(&tid_p, &(parts_soa_buffer.tid_p), + count_max_parts_tmp * sizeof(int), cudaMemcpyHostToDevice); +} + +/*Function to be overloaded using different part_soa structs + * and allocate their internal arrays + * alloc_type 0 for density, 1 for force, 2 for gradient*/ +void host_device_cpy(struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p, + float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, + float *mass, float *h ,float *u, float *u_dt, float *rho, float *locx, float *locy, + float *locz, float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, float *rot_uy, float *rot_uz, + float *div_v, 
float *div_v_previous_step, float *alpha_visc, float *v_sig, float *laplace_u, + float *alpha_diff, float *f, float *soundspeed, float *h_dt, float *balsara,float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, + char *to_be_synchronized, int count_max_parts_tmp){ + + host2device_density(parts_soa_buffer, tid_p, id, x_p, y_p, z_p, + ux, uy, uz, a_hydrox, a_hydroy, a_hydroz, + mass, h , u, u_dt, rho, locx, locy, + locz, widthx, widthy, widthz, h_max, count_p, + wcount, wcount_dh, rho_dh, rot_ux, rot_uy, rot_uz, + div_v, div_v_previous_step, alpha_visc, v_sig, laplace_u, + alpha_diff, f, soundspeed, h_dt, balsara, pressure, + alpha_visc_max_ngb, time_bin, wakeup, min_ngb_time_bin, + to_be_synchronized, count_max_parts_tmp); + +} + +/*Function to be overloaded using different part_soa structs + * and allocate their internal arrays + * alloc_type 0 for density, 1 for force, 2 for gradient*/ +void device_device_bind(struct part_soa *parts_soa_buffer, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p, + float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, + float *mass, float *h ,float *u, float *u_dt, float *rho, float *locx, float *locy, + float *locz, float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, float *rot_uy, float *rot_uz, + float *div_v, float *div_v_previous_step, float *alpha_visc, float *v_sig, float *laplace_u, + float *alpha_diff, float *f, float *soundspeed, float *h_dt, float *balsara,float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, + char *to_be_synchronized, int count_max_parts_tmp, cudaStream_t stream){ + + device2device_density(parts_soa_buffer, tid_p, id, x_p, y_p, z_p, + ux, uy, uz, a_hydrox, a_hydroy, a_hydroz, + mass, h , u, u_dt, rho, locx, locy, + locz, widthx, widthy, widthz, h_max, count_p, + wcount, wcount_dh, rho_dh, rot_ux, rot_uy, rot_uz, + div_v, div_v_previous_step, alpha_visc, v_sig, laplace_u, + alpha_diff, f, soundspeed, h_dt, balsara, pressure, + alpha_visc_max_ngb, time_bin, wakeup, min_ngb_time_bin, + to_be_synchronized, count_max_parts_tmp, stream); + +} + +void host2device_async_density(struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p, + float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, + float *mass, float *h ,float *u, float *u_dt, float *rho, float *locx, float *locy, + float *locz, float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, float *rot_uy, float *rot_uz, + float *div_v, float *div_v_previous_step, float *alpha_visc, float *v_sig, float *laplace_u, + float *alpha_diff, float *f, float *soundspeed, float *h_dt, float *balsara,float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, + char *to_be_synchronized, int first_part_tmp, int bundle_n_parts, cudaStream_t stream){ + cudaError_t cu_error; + cudaMemcpyAsync(&tid_p[first_part_tmp], &(parts_soa_buffer.tid_p[first_part_tmp]), + bundle_n_parts * sizeof(int), cudaMemcpyHostToDevice, + stream); + cudaMemcpyAsync(&locx[first_part_tmp], &(parts_soa_buffer.locx[first_part_tmp]), + bundle_n_parts * sizeof(float), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&locy[first_part_tmp], 
&(parts_soa_buffer.locy[first_part_tmp]), + bundle_n_parts * sizeof(float), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&locz[first_part_tmp], &parts_soa_buffer.locz[first_part_tmp], + bundle_n_parts * sizeof(float), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&h[first_part_tmp], &parts_soa_buffer.h[first_part_tmp], + bundle_n_parts * sizeof(float), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&mass[first_part_tmp], &parts_soa_buffer.mass[first_part_tmp], + bundle_n_parts * sizeof(float), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&x_p[first_part_tmp], &parts_soa_buffer.x_p[first_part_tmp], + bundle_n_parts * sizeof(double), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&y_p[first_part_tmp], &parts_soa_buffer.y_p[first_part_tmp], + bundle_n_parts * sizeof(double), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&z_p[first_part_tmp], &parts_soa_buffer.z_p[first_part_tmp], + bundle_n_parts * sizeof(double), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&ux[first_part_tmp], &parts_soa_buffer.ux[first_part_tmp], + bundle_n_parts * sizeof(float), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&uy[first_part_tmp], &parts_soa_buffer.uy[first_part_tmp], + bundle_n_parts * sizeof(float), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&uz[first_part_tmp], &parts_soa_buffer.uz[first_part_tmp], + bundle_n_parts * sizeof(float), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&time_bin[first_part_tmp], + &parts_soa_buffer.time_bin[first_part_tmp], + bundle_n_parts * sizeof(timebin_t), + cudaMemcpyHostToDevice, stream); +} + +void host2device_async_density_pair(struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p, + float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, + float *mass, float *h ,float *u, float *u_dt, float *rho, float *locx, float *locy, + float *locz, float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, float *rot_uy, float *rot_uz, + float *div_v, float *div_v_previous_step, float *alpha_visc, float *v_sig, float *laplace_u, + float *alpha_diff, float *f, float *soundspeed, float *h_dt, float *balsara,float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, + char *to_be_synchronized, int first_part_tmp, int bundle_n_parts, cudaStream_t stream){ + +// int bundle_n_parts = bundle_n_parts_i + bundle_n_parts_j; + cudaError_t cu_error; +// cudaMemcpyAsync(&tid_p[first_part_tmp], &(parts_soa_buffer.tid_p[first_part_tmp]), +// bundle_n_parts * sizeof(int), cudaMemcpyHostToDevice, +// stream); +// cudaMemcpyAsync(&locx[first_part_tmp], &(parts_soa_buffer.locx[first_part_tmp]), +// bundle_n_parts * sizeof(float), +// cudaMemcpyHostToDevice, stream); +// cudaMemcpyAsync(&locy[first_part_tmp], &(parts_soa_buffer.locy[first_part_tmp]), +// bundle_n_parts * sizeof(float), +// cudaMemcpyHostToDevice, stream); +// cudaMemcpyAsync(&locz[first_part_tmp], &parts_soa_buffer.locz[first_part_tmp], +// bundle_n_parts * sizeof(float), +// cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&h[first_part_tmp], &parts_soa_buffer.h[first_part_tmp], + bundle_n_parts * sizeof(float), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&mass[first_part_tmp], &parts_soa_buffer.mass[first_part_tmp], + bundle_n_parts * sizeof(float), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&x_p[first_part_tmp], 
&parts_soa_buffer.x_p[first_part_tmp], + bundle_n_parts * sizeof(double), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&y_p[first_part_tmp], &parts_soa_buffer.y_p[first_part_tmp], + bundle_n_parts * sizeof(double), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&z_p[first_part_tmp], &parts_soa_buffer.z_p[first_part_tmp], + bundle_n_parts * sizeof(double), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&ux[first_part_tmp], &parts_soa_buffer.ux[first_part_tmp], + bundle_n_parts * sizeof(float), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&uy[first_part_tmp], &parts_soa_buffer.uy[first_part_tmp], + bundle_n_parts * sizeof(float), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&uz[first_part_tmp], &parts_soa_buffer.uz[first_part_tmp], + bundle_n_parts * sizeof(float), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&time_bin[first_part_tmp], + &parts_soa_buffer.time_bin[first_part_tmp], + bundle_n_parts * sizeof(timebin_t), + cudaMemcpyHostToDevice, stream); +} + +/*Function to be overloaded using different part_soa structs + * and allocate their internal arrays + * alloc_type 0 for density, 1 for force, 2 for gradient*/ +void host_device_async_cpy(struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p, + float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, + float *mass, float *h ,float *u, float *u_dt, float *rho, float *locx, float *locy, + float *locz, float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, float *rot_uy, float *rot_uz, + float *div_v, float *div_v_previous_step, float *alpha_visc, float *v_sig, float *laplace_u, + float *alpha_diff, float *f, float *soundspeed, float *h_dt, float *balsara,float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, + char *to_be_synchronized, int first_part_tmp, int bundle_n_parts, cudaStream_t stream){ + + host2device_async_density(parts_soa_buffer, tid_p, id, x_p, y_p, z_p, + ux, uy, uz, a_hydrox, a_hydroy, a_hydroz, + mass, h , u, u_dt, rho, locx, locy, + locz, widthx, widthy, widthz, h_max, count_p, + wcount, wcount_dh, rho_dh, rot_ux, rot_uy, rot_uz, + div_v, div_v_previous_step, alpha_visc, v_sig, laplace_u, + alpha_diff, f, soundspeed, h_dt, balsara, pressure, + alpha_visc_max_ngb, time_bin, wakeup, min_ngb_time_bin, + to_be_synchronized, first_part_tmp, bundle_n_parts, stream); + +} + +/*Function to be overloaded using different part_soa structs + * and allocate their internal arrays + * alloc_type 0 for density, 1 for force, 2 for gradient*/ +void host_device_async_cpy_pair(struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p, + float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, + float *mass, float *h ,float *u, float *u_dt, float *rho, float *locx, float *locy, + float *locz, float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, float *rot_uy, float *rot_uz, + float *div_v, float *div_v_previous_step, float *alpha_visc, float *v_sig, float *laplace_u, + float *alpha_diff, float *f, float *soundspeed, float *h_dt, float *balsara,float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, + char *to_be_synchronized, int first_part_tmp_i, int bundle_n_parts, cudaStream_t 
stream){ + + host2device_async_density_pair(parts_soa_buffer, tid_p, id, x_p, y_p, z_p, + ux, uy, uz, a_hydrox, a_hydroy, a_hydroz, + mass, h , u, u_dt, rho, locx, locy, + locz, widthx, widthy, widthz, h_max, count_p, + wcount, wcount_dh, rho_dh, rot_ux, rot_uy, rot_uz, + div_v, div_v_previous_step, alpha_visc, v_sig, laplace_u, + alpha_diff, f, soundspeed, h_dt, balsara, pressure, + alpha_visc_max_ngb, time_bin, wakeup, min_ngb_time_bin, + to_be_synchronized, first_part_tmp_i, bundle_n_parts, + stream); + +} + +void device2host_async_density(struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p, + float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, + float *mass, float *h ,float *u, float *u_dt, float *rho, float *locx, float *locy, + float *locz, float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, float *rot_uy, float *rot_uz, + float *div_v, float *div_v_previous_step, float *alpha_visc, float *v_sig, float *laplace_u, + float *alpha_diff, float *f, float *soundspeed, float *h_dt, float *balsara,float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, + char *to_be_synchronized, int first_part_tmp, int bundle_n_parts, cudaStream_t stream){ + cudaError_t cu_error; + + cudaMemcpyAsync(&parts_soa_buffer.rho[first_part_tmp], &rho[first_part_tmp], + bundle_n_parts * sizeof(float), + cudaMemcpyDeviceToHost, stream); + cudaMemcpyAsync(&parts_soa_buffer.rho_dh[first_part_tmp], + &rho_dh[first_part_tmp], + bundle_n_parts * sizeof(float), + cudaMemcpyDeviceToHost, stream); + cudaMemcpyAsync(&parts_soa_buffer.wcount[first_part_tmp], + &wcount[first_part_tmp], + bundle_n_parts * sizeof(float), + cudaMemcpyDeviceToHost, stream); + cudaMemcpyAsync(&parts_soa_buffer.wcount_dh[first_part_tmp], + &wcount_dh[first_part_tmp], + bundle_n_parts * sizeof(float), + cudaMemcpyDeviceToHost, stream); + cudaMemcpyAsync(&parts_soa_buffer.div_v[first_part_tmp], &div_v[first_part_tmp], + bundle_n_parts * sizeof(float), + cudaMemcpyDeviceToHost, stream); + cudaMemcpyAsync(&parts_soa_buffer.rot_ux[first_part_tmp], + &rot_ux[first_part_tmp], + bundle_n_parts * sizeof(float), + cudaMemcpyDeviceToHost, stream); + cudaMemcpyAsync(&parts_soa_buffer.rot_uy[first_part_tmp], + &rot_uy[first_part_tmp], + bundle_n_parts * sizeof(float), + cudaMemcpyDeviceToHost, stream); + cudaMemcpyAsync(&parts_soa_buffer.rot_uz[first_part_tmp], + &rot_uz[first_part_tmp], + bundle_n_parts * sizeof(float), + cudaMemcpyDeviceToHost, stream); +} + +void device2host_async_density_pair(struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p, + float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, + float *mass, float *h ,float *u, float *u_dt, float *rho, float *locx, float *locy, + float *locz, float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, float *rot_uy, float *rot_uz, + float *div_v, float *div_v_previous_step, float *alpha_visc, float *v_sig, float *laplace_u, + float *alpha_diff, float *f, float *soundspeed, float *h_dt, float *balsara,float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, + char *to_be_synchronized, int first_part_tmp, int bundle_n_parts, cudaStream_t stream){ + cudaError_t cu_error; +// 
fprintf(stderr, "parts i %i parts j %i\n", bundle_n_parts_i, bundle_n_parts_j); +// int bundle_n_parts = bundle_n_parts_i + bundle_n_parts_j; + + cudaMemcpyAsync(&parts_soa_buffer.rho[first_part_tmp], &rho[first_part_tmp], + bundle_n_parts * sizeof(float), + cudaMemcpyDeviceToHost, stream); + cudaMemcpyAsync(&parts_soa_buffer.rho_dh[first_part_tmp], + &rho_dh[first_part_tmp], + bundle_n_parts * sizeof(float), + cudaMemcpyDeviceToHost, stream); + cudaMemcpyAsync(&parts_soa_buffer.wcount[first_part_tmp], + &wcount[first_part_tmp], + bundle_n_parts * sizeof(float), + cudaMemcpyDeviceToHost, stream); + cudaMemcpyAsync(&parts_soa_buffer.wcount_dh[first_part_tmp], + &wcount_dh[first_part_tmp], + bundle_n_parts * sizeof(float), + cudaMemcpyDeviceToHost, stream); + cudaMemcpyAsync(&parts_soa_buffer.div_v[first_part_tmp], &div_v[first_part_tmp], + bundle_n_parts * sizeof(float), + cudaMemcpyDeviceToHost, stream); + cudaMemcpyAsync(&parts_soa_buffer.rot_ux[first_part_tmp], + &rot_ux[first_part_tmp], + bundle_n_parts * sizeof(float), + cudaMemcpyDeviceToHost, stream); + cudaMemcpyAsync(&parts_soa_buffer.rot_uy[first_part_tmp], + &rot_uy[first_part_tmp], + bundle_n_parts * sizeof(float), + cudaMemcpyDeviceToHost, stream); + cudaMemcpyAsync(&parts_soa_buffer.rot_uz[first_part_tmp], + &rot_uz[first_part_tmp], + bundle_n_parts * sizeof(float), + cudaMemcpyDeviceToHost, stream); +} + +/*Function to be overloaded using different part_soa structs + * and allocate their internal arrays + * alloc_type 0 for density, 1 for force, 2 for gradient*/ +void device_host_async_cpy(struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p, + float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, + float *mass, float *h ,float *u, float *u_dt, float *rho, float *locx, float *locy, + float *locz, float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, float *rot_uy, float *rot_uz, + float *div_v, float *div_v_previous_step, float *alpha_visc, float *v_sig, float *laplace_u, + float *alpha_diff, float *f, float *soundspeed, float *h_dt, float *balsara,float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, + char *to_be_synchronized, int first_part_tmp, int bundle_n_parts, cudaStream_t stream){ + + device2host_async_density(parts_soa_buffer, tid_p, id, x_p, y_p, z_p, + ux, uy, uz, a_hydrox, a_hydroy, a_hydroz, + mass, h , u, u_dt, rho, locx, locy, + locz, widthx, widthy, widthz, h_max, count_p, + wcount, wcount_dh, rho_dh, rot_ux, rot_uy, rot_uz, + div_v, div_v_previous_step, alpha_visc, v_sig, laplace_u, + alpha_diff, f, soundspeed, h_dt, balsara, pressure, + alpha_visc_max_ngb, time_bin, wakeup, min_ngb_time_bin, + to_be_synchronized, first_part_tmp, bundle_n_parts, stream); + +} + +/*Function to be overloaded using different part_soa structs + * and allocate their internal arrays + * alloc_type 0 for density, 1 for force, 2 for gradient*/ +void device_host_async_cpy_pair(struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p, + float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, + float *mass, float *h ,float *u, float *u_dt, float *rho, float *locx, float *locy, + float *locz, float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, float *rot_uy, float 
*rot_uz, + float *div_v, float *div_v_previous_step, float *alpha_visc, float *v_sig, float *laplace_u, + float *alpha_diff, float *f, float *soundspeed, float *h_dt, float *balsara,float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, + char *to_be_synchronized, int first_part_tmp, int bundle_n_parts, cudaStream_t stream){ + + device2host_async_density_pair(parts_soa_buffer, tid_p, id, x_p, y_p, z_p, + ux, uy, uz, a_hydrox, a_hydroy, a_hydroz, + mass, h , u, u_dt, rho, locx, locy, + locz, widthx, widthy, widthz, h_max, count_p, + wcount, wcount_dh, rho_dh, rot_ux, rot_uy, rot_uz, + div_v, div_v_previous_step, alpha_visc, v_sig, laplace_u, + alpha_diff, f, soundspeed, h_dt, balsara, pressure, + alpha_visc_max_ngb, time_bin, wakeup, min_ngb_time_bin, + to_be_synchronized, first_part_tmp, bundle_n_parts, stream); + +} + +/*Function to be overloaded using different part_soa structs + * and allocate their internal arrays + * alloc_type 0 for density, 1 for force, 2 for gradient*/ +void device_device_async_bind(struct part_soa *parts_soa, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p, + float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, + float *mass, float *h ,float *u, float *u_dt, float *rho, float *locx, float *locy, + float *locz, float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, float *rot_uy, float *rot_uz, + float *div_v, float *div_v_previous_step, float *alpha_visc, float *v_sig, float *laplace_u, + float *alpha_diff, float *f, float *soundspeed, float *h_dt, float *balsara,float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, + char *to_be_synchronized){ + + parts_soa->tid_p = tid_p; + parts_soa->locx = locx; + parts_soa->locy = locy; + parts_soa->locz = locz; + parts_soa->h = h; + parts_soa->mass = mass; + parts_soa->x_p = x_p; + parts_soa->y_p = y_p; + parts_soa->z_p = z_p; + parts_soa->rho = rho; + parts_soa->rho_dh = rho_dh; + parts_soa->wcount = wcount; + parts_soa->wcount_dh = wcount_dh; + parts_soa->ux = ux; + parts_soa->uy = uy; + parts_soa->uz = uz; + parts_soa->div_v = div_v; + parts_soa->rot_ux = rot_ux; + parts_soa->rot_uy = rot_uy; + parts_soa->rot_uz = rot_uz; + parts_soa->time_bin = time_bin; + +} + +#ifdef WITH_CUDA +} +#endif diff --git a/src/files_for_new_functions/host_device_data_transfer.h b/src/files_for_new_functions/host_device_data_transfer.h new file mode 100755 index 0000000000..c97b4a5d49 --- /dev/null +++ b/src/files_for_new_functions/host_device_data_transfer.h @@ -0,0 +1,176 @@ +#include "cuda/part_gpu.h" +#include +#include +#include +#include + +void host2device_test(int *d_tid_p, int *tid_h, int count_max_parts_tmp); + +void device2host_test(struct part_soa parts_soa, int *tid_h, int count_max_parts_tmp); + +void device2device_test(int *tid_p, struct part_soa parts_soa, int count_max_parts_tmp); + +/*Function to be overloaded using different part_soa structs + * and allocate their internal arrays + * alloc_type 0 for density, 1 for force, 2 for gradient*/ +void host_device_test(int *d_tid_p, int *tid_h, int count_max_parts_tmp); + +void device_host_test(struct part_soa parts_soa, int *tid_h, int count_max_parts_tmp); + +void device_device_test(int *tid_p, struct part_soa parts_soa, int count_max_parts_tmp); + +void device2host_density(struct part_soa parts_soa_buffer, int *tid_p, long long 
*id, double *x_p, double *y_p, double *z_p, + float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, + float *mass, float *h ,float *u, float *u_dt, float *rho, float *locx, float *locy, + float *locz, float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, float *rot_uy, float *rot_uz, + float *div_v, float *div_v_previous_step, float *alpha_visc, float *v_sig, float *laplace_u, + float *alpha_diff, float *f, float *soundspeed, float *h_dt, float *balsara,float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, + char *to_be_synchronized, int count_max_parts_tmp); + +void device_host_cpy(struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p, + float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, + float *mass, float *h ,float *u, float *u_dt, float *rho, float *locx, float *locy, + float *locz, float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, float *rot_uy, float *rot_uz, + float *div_v, float *div_v_previous_step, float *alpha_visc, float *v_sig, float *laplace_u, + float *alpha_diff, float *f, float *soundspeed, float *h_dt, float *balsara,float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, + char *to_be_synchronized, int count_max_parts_tmp); + +void device2device_density(struct part_soa *parts_soa_buffer, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p, + float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, + float *mass, float *h ,float *u, float *u_dt, float *rho, float *locx, float *locy, + float *locz, float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, float *rot_uy, float *rot_uz, + float *div_v, float *div_v_previous_step, float *alpha_visc, float *v_sig, float *laplace_u, + float *alpha_diff, float *f, float *soundspeed, float *h_dt, float *balsara,float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, + char *to_be_synchronized, int count_max_parts_tmp, cudaStream_t stream); + + +void host2device_density(struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p, + float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, + float *mass, float *h ,float *u, float *u_dt, float *rho, float *locx, float *locy, + float *locz, float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, float *rot_uy, float *rot_uz, + float *div_v, float *div_v_previous_step, float *alpha_visc, float *v_sig, float *laplace_u, + float *alpha_diff, float *f, float *soundspeed, float *h_dt, float *balsara,float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, + char *to_be_synchronized, int count_max_parts_tmp); + +/*Function to be overloaded using different part_soa structs + * and allocate their internal arrays + * alloc_type 0 for density, 1 for force, 2 for gradient*/ +void host_device_cpy(struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p, + float *ux, float *uy, float 
*uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, + float *mass, float *h ,float *u, float *u_dt, float *rho, float *locx, float *locy, + float *locz, float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, float *rot_uy, float *rot_uz, + float *div_v, float *div_v_previous_step, float *alpha_visc, float *v_sig, float *laplace_u, + float *alpha_diff, float *f, float *soundspeed, float *h_dt, float *balsara,float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, + char *to_be_synchronized, int count_max_parts_tmp); + +/*Function to be overloaded using different part_soa structs + * and allocate their internal arrays + * alloc_type 0 for density, 1 for force, 2 for gradient*/ +void device_device_bind(struct part_soa *parts_soa_buffer, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p, + float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, + float *mass, float *h ,float *u, float *u_dt, float *rho, float *locx, float *locy, + float *locz, float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, float *rot_uy, float *rot_uz, + float *div_v, float *div_v_previous_step, float *alpha_visc, float *v_sig, float *laplace_u, + float *alpha_diff, float *f, float *soundspeed, float *h_dt, float *balsara,float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, + char *to_be_synchronized, int count_max_parts_tmp, cudaStream_t stream); + +void host2device_async_density(struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p, + float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, + float *mass, float *h ,float *u, float *u_dt, float *rho, float *locx, float *locy, + float *locz, float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, float *rot_uy, float *rot_uz, + float *div_v, float *div_v_previous_step, float *alpha_visc, float *v_sig, float *laplace_u, + float *alpha_diff, float *f, float *soundspeed, float *h_dt, float *balsara,float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, + char *to_be_synchronized, int first_part_tmp, int bundle_n_parts, cudaStream_t stream); + +/*Function to be overloaded using different part_soa structs + * and allocate their internal arrays + * alloc_type 0 for density, 1 for force, 2 for gradient*/ +void host_device_async_cpy(struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p, + float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, + float *mass, float *h ,float *u, float *u_dt, float *rho, float *locx, float *locy, + float *locz, float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, float *rot_uy, float *rot_uz, + float *div_v, float *div_v_previous_step, float *alpha_visc, float *v_sig, float *laplace_u, + float *alpha_diff, float *f, float *soundspeed, float *h_dt, float *balsara,float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, + char *to_be_synchronized, int first_part_tmp, int bundle_n_parts, 
cudaStream_t stream); + +void device2host_async_density(struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p, + float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, + float *mass, float *h ,float *u, float *u_dt, float *rho, float *locx, float *locy, + float *locz, float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, float *rot_uy, float *rot_uz, + float *div_v, float *div_v_previous_step, float *alpha_visc, float *v_sig, float *laplace_u, + float *alpha_diff, float *f, float *soundspeed, float *h_dt, float *balsara,float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, + char *to_be_synchronized, int first_part_tmp, int bundle_n_parts, cudaStream_t stream); +/*Function to be overloaded using different part_soa structs + * and allocate their internal arrays + * alloc_type 0 for density, 1 for force, 2 for gradient*/ +void device_host_async_cpy(struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p, + float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, + float *mass, float *h ,float *u, float *u_dt, float *rho, float *locx, float *locy, + float *locz, float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, float *rot_uy, float *rot_uz, + float *div_v, float *div_v_previous_step, float *alpha_visc, float *v_sig, float *laplace_u, + float *alpha_diff, float *f, float *soundspeed, float *h_dt, float *balsara,float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, + char *to_be_synchronized, int first_part_tmp, int bundle_n_parts, cudaStream_t stream); + +/*Function to be overloaded using different part_soa structs + * and allocate their internal arrays + * alloc_type 0 for density, 1 for force, 2 for gradient*/ +void device_device_async_bind(struct part_soa *parts_soa, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p, + float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, + float *mass, float *h ,float *u, float *u_dt, float *rho, float *locx, float *locy, + float *locz, float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, float *rot_uy, float *rot_uz, + float *div_v, float *div_v_previous_step, float *alpha_visc, float *v_sig, float *laplace_u, + float *alpha_diff, float *f, float *soundspeed, float *h_dt, float *balsara,float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, + char *to_be_synchronized); + +void host_device_async_cpy_pair(struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p, + float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, + float *mass, float *h ,float *u, float *u_dt, float *rho, float *locx, float *locy, + float *locz, float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, float *rot_uy, float *rot_uz, + float *div_v, float *div_v_previous_step, float *alpha_visc, float *v_sig, float *laplace_u, + float *alpha_diff, float *f, float *soundspeed, float *h_dt, float *balsara,float *pressure, + 
float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, + char *to_be_synchronized, int first_part_tmp, int bundle_n_parts, cudaStream_t stream); + +void device_host_async_cpy_pair(struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p, + float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, + float *mass, float *h ,float *u, float *u_dt, float *rho, float *locx, float *locy, + float *locz, float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, float *rot_uy, float *rot_uz, + float *div_v, float *div_v_previous_step, float *alpha_visc, float *v_sig, float *laplace_u, + float *alpha_diff, float *f, float *soundspeed, float *h_dt, float *balsara,float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, + char *to_be_synchronized, int first_part_tmp, int bundle_n_parts, cudaStream_t stream); + +void device2host_async_density_pair(struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p, + float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, + float *mass, float *h ,float *u, float *u_dt, float *rho, float *locx, float *locy, + float *locz, float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, float *rot_uy, float *rot_uz, + float *div_v, float *div_v_previous_step, float *alpha_visc, float *v_sig, float *laplace_u, + float *alpha_diff, float *f, float *soundspeed, float *h_dt, float *balsara,float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, + char *to_be_synchronized, int first_part_tmp, int bundle_n_parts_i, int bundle_n_parts_j, + cudaStream_t stream); From f19c35a79e3a36cb7645076c421b89e934cb4d0a Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Tue, 22 Oct 2024 17:54:14 +0100 Subject: [PATCH 007/217] Added ifdefs to a few files to a) Stop can't find config.h errors and b to stop an error related to prototype overloading in dummy.c in src/cuda --- src/clocks.h | 5 +- src/error.h | 4 + src/hip/BLOCK_SIZE.h | 10 +++ src/hip/HIP_runner_functions.h | 22 +++++ src/hip/device_functions.h | 149 +++++++++++++++++++++++++++++++++ src/hip/tester.h | 9 ++ src/memuse.h | 5 +- 7 files changed, 202 insertions(+), 2 deletions(-) create mode 100755 src/hip/BLOCK_SIZE.h create mode 100755 src/hip/HIP_runner_functions.h create mode 100755 src/hip/device_functions.h create mode 100755 src/hip/tester.h diff --git a/src/clocks.h b/src/clocks.h index e39d8e8195..4cc7cdaac7 100644 --- a/src/clocks.h +++ b/src/clocks.h @@ -20,8 +20,11 @@ #define SWIFT_CLOCKS_H /* Config parameters. */ +#ifdef WITH_CUDA +#include "../config.h" +#else #include - +#endif /* System includes. */ #include diff --git a/src/error.h b/src/error.h index a9b7481cf4..806b74f123 100644 --- a/src/error.h +++ b/src/error.h @@ -22,7 +22,11 @@ #define SWIFT_ERROR_H /* Config parameters. */ +#ifdef WITH_CUDA +#include "../config.h" +#else #include +#endif /* Some standard headers. 
*/ #include diff --git a/src/hip/BLOCK_SIZE.h b/src/hip/BLOCK_SIZE.h new file mode 100755 index 0000000000..b476b4d766 --- /dev/null +++ b/src/hip/BLOCK_SIZE.h @@ -0,0 +1,10 @@ +#ifndef BLOCK_SIZE_H +#define BLOCK_SIZE_H +#ifdef WITH_CUDA +// extern "C" { +#endif +#define BLOCK_SIZE 512 +#ifdef WITH_CUDA +//} +#endif +#endif // BLOCK_SIZE_H diff --git a/src/hip/HIP_runner_functions.h b/src/hip/HIP_runner_functions.h new file mode 100755 index 0000000000..b85772f6b0 --- /dev/null +++ b/src/hip/HIP_runner_functions.h @@ -0,0 +1,22 @@ +#ifndef CUDA_HEADERS_H +#define CUDA_HEADERS_H +#define n_streams 1024 + +#ifdef __cplusplus +extern "C" { +#endif +#include "part_gpu.h" +void launch_density_kernel(struct part_soa parts_soa, int *d_task_first_part, + int *d_task_last_part, int *d_bundle_first_part, + int *d_bundle_last_part, float d_a, float d_H, + const char *loop_type, hipStream_t stream, int bid, + int block_size, int count_tasks, int tasksperbundle, + int numBlocks_x, int numBlocks_y, int tid, + int offset, int bundle_first_task, int max_parts, + int max_active_bin); + +#ifdef __cplusplus +} +#endif + +#endif // CUDA_HEADER_H diff --git a/src/hip/device_functions.h b/src/hip/device_functions.h new file mode 100755 index 0000000000..2cba0e9829 --- /dev/null +++ b/src/hip/device_functions.h @@ -0,0 +1,149 @@ +#ifndef DEVICE_FUNCTIONS_H +#define DEVICE_FUNCTIONS_H +#include "../../config.h" + +/* Local headers. */ +//#include "../dimension.h" +//#include "../error.h" +//#include "../inline.h" +//#include "../minmax.h" +//#include "../vector.h" + +// Is this even necessary? Probably not as our code will operate differently +#define num_cuda_threads 128 +#define hydro_dimension 3.f + +/// Here we define stuff from kernel_hydro.h when using cubic_spline_kernel. +/// Will worry about sorting 'if statements for different kernels later//// +/* First some powers of gamma = H/h */ +#define kernel_gamma ((float)(1.825742)) +#define kernel_gamma_inv ((float)(1. / kernel_gamma)) +#define kernel_gamma2 ((float)(kernel_gamma * kernel_gamma)) +#define kernel_ivals 2 +#define kernel_degree 3 /*!< Degree of the polynomial */ +#define kernel_gamma_dim ((float)(kernel_gamma * kernel_gamma * kernel_gamma)) +#define kernel_gamma_dim_plus_one \ + ((float)(kernel_gamma * kernel_gamma * kernel_gamma * kernel_gamma)) +#define kernel_gamma_inv_dim \ + ((float)(1. / (kernel_gamma * kernel_gamma * kernel_gamma))) +#define kernel_gamma_inv_dim_plus_one \ + ((float)(1. / (kernel_gamma * kernel_gamma * kernel_gamma * kernel_gamma))) +#define kernel_ivals_f ((float)kernel_ivals) /*!< Number of branches */ +#define kernel_constant ((float)(16. * M_1_PI)) +/*! Cosmology default beta=3.0. + * Alpha can be set in the parameter file. + * Beta is defined as in e.g. Price (2010) Eqn (103) */ +#define const_viscosity_beta 3.0f +#ifdef WITH_CUDA +extern "C" { +#endif +/** + * @brief Returns the argument to the power given by the dimension plus one + * + * Computes \f$x^{d+1}\f$. + */ +__device__ float d_pow_dimension_plus_one(float x) { + +#if defined(HYDRO_DIMENSION_3D) + + const float x2 = x * x; + return x2 * x2; + +#elif defined(HYDRO_DIMENSION_2D) + + return x * x * x; + +#elif defined(HYDRO_DIMENSION_1D) + + return x * x; + +#else + + error("The dimension is not defined !"); + return 0.f; + +#endif +} + +/** + * @brief Return the argument to the power three adiabatic index minus five over + * two. + * + * Computes \f$x^{(3\gamma - 5)/2}\f$. 
+ * + * @param x Argument + */ +__device__ float d_pow_three_gamma_minus_five_over_two(float x) { +#if defined(HYDRO_GAMMA_5_3) + + return 1.f; /* x^(0) */ + +#elif defined(HYDRO_GAMMA_7_5) + + return powf(x, -0.4f); /* x^(-2/5) */ + +#elif defined(HYDRO_GAMMA_4_3) + + return 1.f / sqrtf(x); /* x^(-1/2) */ + +#elif defined(HYDRO_GAMMA_2_1) + + return sqrtf(x); /* x^(1/2) */ + +#else + + error("The adiabatic index is not defined !"); + return 0.f; + +#endif +} + +/** + * @brief Computes the kernel function and its derivative. + * + * The kernel function needs to be multiplied by \f$h^{-d}\f$ and the gradient + * by \f$h^{-(d+1)}\f$, where \f$d\f$ is the dimensionality of the problem. + * + * Returns 0 if \f$u > \gamma = H/h\f$. + * + * @param u The ratio of the distance to the smoothing length \f$u = x/h\f$. + * @param W (return) The value of the kernel function \f$W(x,h)\f$. + * @param dW_dx (return) The norm of the gradient of \f$|\nabla W(x,h)|\f$. + */ +__device__ void d_kernel_deval(float u, float *restrict W, + float *restrict dW_dx) { + + /* Go to the range [0,1[ from [0,H[ */ + const float x = u * kernel_gamma_inv; + + /* Pick the correct branch of the kernel */ + const int temp = (int)(x * kernel_ivals_f); + const int ind = temp > kernel_ivals ? kernel_ivals : temp; + static const float kernel_coeffs[(kernel_degree + 1) * (kernel_ivals + 1)] = { + 3.f, -3.f, 0.f, 0.5f, /* 0 < u < 0.5 */ + -1.f, 3.f, -3.f, 1.f, /* 0.5 < u < 1 */ + 0.f, 0.f, 0.f, 0.f}; /* 1 < u */ + const float *const coeffs = &kernel_coeffs[ind * (kernel_degree + 1)]; + /* First two terms of the polynomial ... */ + float w = coeffs[0] * x + coeffs[1]; + float dw_dx = coeffs[0]; + + /* ... and the rest of them */ + for (int k = 2; k <= kernel_degree; k++) { + dw_dx = dw_dx * x + w; + w = x * w + coeffs[k]; + } + + w = max(w, 0.f); + dw_dx = min(dw_dx, 0.f); + + /* Return everything */ + *W = w * kernel_constant * kernel_gamma_inv_dim; + *dW_dx = dw_dx * kernel_constant * kernel_gamma_inv_dim_plus_one; +} + +#ifdef WITH_CUDA +} +#endif + +#endif // DEVICE_FUNCTIONS_H diff --git a/src/hip/tester.h b/src/hip/tester.h new file mode 100755 index 0000000000..5729e66904 --- /dev/null +++ b/src/hip/tester.h @@ -0,0 +1,9 @@ +#ifdef __cplusplus +extern "C" { +#endif + +void testing_linkage(int a, float *b, float c); + +#ifdef __cplusplus +} +#endif diff --git a/src/memuse.h b/src/memuse.h index 5883e68684..d51ab4282d 100644 --- a/src/memuse.h +++ b/src/memuse.h @@ -20,8 +20,11 @@ #define SWIFT_MEMUSE_H /* Config parameters. */ +#ifdef WITH_CUDA +#include "../config.h" +#else #include <config.h> - +#endif /* Includes.
*/ #include From 6334924682849bb8b62d4b94e0ae4b2167f1ca04 Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Tue, 22 Oct 2024 17:54:42 +0100 Subject: [PATCH 008/217] Added dummy.c in src/cuda --- src/cuda/dummy.c | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100755 src/cuda/dummy.c diff --git a/src/cuda/dummy.c b/src/cuda/dummy.c new file mode 100755 index 0000000000..c75d2d873c --- /dev/null +++ b/src/cuda/dummy.c @@ -0,0 +1,9 @@ +#include + +#ifdef __cplusplus +extern "C" { +#endif +void swiftcudadummy(void) {} +#ifdef __cplusplus +} +#endif From 8b64cbb3df0b429f9e67e25517bdce8a1b206cff Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Wed, 23 Oct 2024 17:25:47 +0100 Subject: [PATCH 009/217] Added cudalt.py dummy.C src/cuda/tester.cu --- cudalt.py | 80 ++++++++++++++++++++++++++++++++++++++++++++++ dummy.C | 3 ++ src/cuda/tester.cu | 20 ++++++++++++ 3 files changed, 103 insertions(+) create mode 100755 cudalt.py create mode 100755 dummy.C create mode 100755 src/cuda/tester.cu diff --git a/cudalt.py b/cudalt.py new file mode 100755 index 0000000000..e8643cd1e6 --- /dev/null +++ b/cudalt.py @@ -0,0 +1,80 @@ +#!/usr/bin/python3 +# libtoolish hack: compile a .cu file like libtool does +import sys +import os + +lo_filepath = sys.argv[1] +o_filepath = lo_filepath.replace(".lo", ".o") + +try: + i = o_filepath.rindex("/") + lo_dir = o_filepath[0:i+1] + o_filename = o_filepath[i+1:] + +except ValueError: + lo_dir = "" + o_filename = o_filepath + +local_pic_dir = ".libs/" +local_npic_dir = "" +pic_dir = lo_dir + local_pic_dir +npic_dir = lo_dir + local_npic_dir + +pic_filepath = pic_dir + o_filename +npic_filepath = npic_dir + o_filename +local_pic_filepath = local_pic_dir + o_filename +local_npic_filepath = local_npic_dir + o_filename + +# Make lib dir +try: + os.mkdir(pic_dir) +except OSError: + pass + +# generate the command to compile the .cu for shared library +args = sys.argv[2:] +args.extend(["-Xcompiler","-fPIC"]) +# position indep code +args.append("-o") +args.append(pic_filepath) +command = " ".join(args) +print (command) + +# compile the .cu +rv = os.system(command) +if rv != 0: + sys.exit(1) + +# generate the command to compile the .cu for static library +args = sys.argv[2:] +args.append("-o") +args.append(npic_filepath) +command = " ".join(args) +print (command) + +# compile the .cu +rv = os.system(command) +if rv != 0: + sys.exit(1) + +# get libtool version +fd = os.popen("libtool --version") +libtool_version = fd.readline() +fd.close() + +# generate the .lo file +f = open(lo_filepath, "w") +f.write("# " + lo_filepath + " - a libtool object file\n") +f.write("# Generated by " + libtool_version + "\n") +f.write("#\n") +f.write("# Please DO NOT delete this file!\n") +f.write("# It is necessary for linking the library.\n\n") + +f.write("# Name of the PIC object.\n") +f.write("pic_object='" + local_pic_filepath + "'\n\n") + +f.write("# Name of the non-PIC object.\n") +f.write("non_pic_object='" + local_npic_filepath + "'\n") +f.close() + +sys.exit(0) diff --git a/dummy.C b/dummy.C new file mode 100755 index 0000000000..bbf68f8cea --- /dev/null +++ b/dummy.C @@ -0,0 +1,3 @@ +void dummy(){ + +} diff --git a/src/cuda/tester.cu b/src/cuda/tester.cu new file mode 100755 index 0000000000..5e70230211 --- /dev/null +++ b/src/cuda/tester.cu @@ -0,0 +1,20 @@ +#include "tester.h" +#include +#include +#ifdef __cplusplus +extern "C" { +#endif +void testing_linkage(int a, float *b, float c) { + std::vector b_value_list; + b_value_list.reserve(a); + for (int i = 0; i < a; i++) { + (*b) = 
(*b) + c; + b_value_list.push_back((*b)); + std::cout << "Vector value is " << b_value_list[i] << " b value is " << (*b) + << std::endl; + } + std::cout << "Final value of b is " << (*b) << std::endl; +} +#ifdef __cplusplus +} +#endif From a822b0b05724687f402daa713ccaf0289f3e5979 Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Thu, 24 Oct 2024 16:17:59 +0100 Subject: [PATCH 010/217] Added code to cell.c and h cell_hydro.c cell_unskip.c engine_config.c task.c and h queue.h and removed unused vars from runner_doiact_functions_hydro_gpu.h --- src/cell.h | 33 +++++++++++ src/cell_hydro.h | 18 ++++++ src/cell_unskip.c | 60 ++++++++++++++++++++ src/engine_config.c | 29 +++++++++- src/queue.h | 9 +++ src/task.c | 130 ++++++++++++++++++++++++++++++++++++++++++++ src/task.h | 15 +++++ 7 files changed, 292 insertions(+), 2 deletions(-) diff --git a/src/cell.h b/src/cell.h index cac5c49878..1d2aa0d7e1 100644 --- a/src/cell.h +++ b/src/cell.h @@ -360,6 +360,39 @@ enum cell_flags { */ struct cell { + /*Marks a cell for GPU execution A. Nasar */ + bool is_gpu_cell; + + int unpacker_cell; + + /*Marks a cell as having done its pack task 0->not 1-> yes*/ + int pack_done; + /*Marks a cell as having done its pack task 0->not 1-> yes*/ + int pack_done_g; + /*Marks a cell as having done its pack task 0->not 1-> yes*/ + int pack_done_f; + + /*Has the task run on the GPU? 0->No, 1-> Yes*/ + int gpu_done; + /*Has the task run on the GPU? 0->No, 1-> Yes*/ + int gpu_done_g; + /*Has the task run on the GPU? 0->No, 1-> Yes*/ + int gpu_done_f; + + /*Has the task run on the GPU? 0->No, 1-> Yes*/ + int unpack_done; + /*Has the task run on the GPU? 0->No, 1-> Yes*/ + int unpack_done_g; + /*Has the task run on the GPU? 0->No, 1-> Yes*/ + int unpack_done_f; + + /*Has the pair task run on the GPU? 0->No, 1-> Yes*/ + int gpu_done_pair; + /*Has the pair task run on the GPU? 0->No, 1-> Yes*/ + int gpu_done_pair_g; + /*Has the pair task run on the GPU? 0->No, 1-> Yes*/ + int gpu_done_pair_f; + /*! The cell location on the grid (corner nearest to the origin). */ double loc[3]; diff --git a/src/cell_hydro.h b/src/cell_hydro.h index 39db7bc219..4b9446f731 100644 --- a/src/cell_hydro.h +++ b/src/cell_hydro.h @@ -61,6 +61,24 @@ struct cell_hydro { /*! Linked list of the tasks computing this cell's hydro density. */ struct link *density; + /*! Linked list of the tasks computing this cell's hydro density pack. A. Nasar */ + struct link *density_pack; + struct link *density_unpack; + /*! Linked list of the tasks computing this cell's hydro force pack. */ + struct link *force_pack; + struct link *force_unpack; + /*! Linked list of the tasks computing this cell's hydro gradient pack. */ + struct link *gradient_pack; + struct link *gradient_unpack; + + struct task *d_pack; + struct task *g_pack; + struct task *f_pack; + + struct task *d_unpack; + struct task *g_unpack; + struct task *f_unpack; + /* Linked list of the tasks computing this cell's hydro gradients. */ struct link *gradient; diff --git a/src/cell_unskip.c b/src/cell_unskip.c index 6ad14a3560..5fe8b0ef3f 100644 --- a/src/cell_unskip.c +++ b/src/cell_unskip.c @@ -1916,6 +1916,66 @@ int cell_unskip_hydro_tasks(struct cell *c, struct scheduler *s) { for (struct link *l = c->hydro.limiter; l != NULL; l = l->next) scheduler_activate(s, l->t); + for (struct link *l = c->hydro.density_pack; l != NULL; l = l->next) { /* A. 
Nasar */ + scheduler_activate(s, l->t); + if (l->t->ci != NULL) + l->t->ci->pack_done = 0; + if (l->t->ci != NULL) + l->t->ci->gpu_done = 0; + if (l->t->ci != NULL) + l->t->ci->unpack_done = 0; + if (l->t->cj != NULL) + l->t->cj->pack_done = 0; + if (l->t->cj != NULL) + l->t->cj->gpu_done = 0; + if (l->t->cj != NULL) + l->t->cj->unpack_done = 0; + } + for (struct link *l = c->hydro.density_unpack; l != NULL; l = l->next) { + scheduler_activate(s, l->t); + l->t->gpu_done = 0; + } + for (struct link *l = c->hydro.force_pack; l != NULL; l = l->next) { + scheduler_activate(s, l->t); + if (l->t->ci != NULL) + l->t->ci->pack_done_f = 0; + if (l->t->ci != NULL) + l->t->ci->gpu_done_f = 0; + if (l->t->ci != NULL) + l->t->ci->unpack_done_f = 0; + if (l->t->cj != NULL) + l->t->cj->pack_done_f = 0; + if (l->t->cj != NULL) + l->t->cj->gpu_done_f = 0; + if (l->t->cj != NULL) + l->t->cj->unpack_done_f = 0; + } + for (struct link *l = c->hydro.force_unpack; l != NULL; l = l->next) { + scheduler_activate(s, l->t); + l->t->gpu_done = 0; + } + +#ifdef EXTRA_HYDRO_LOOP + for (struct link *l = c->hydro.gradient_pack; l != NULL; l = l->next) { + scheduler_activate(s, l->t); + if (l->t->ci != NULL) + l->t->ci->pack_done_g = 0; + if (l->t->ci != NULL) + l->t->ci->gpu_done_g = 0; + if (l->t->ci != NULL) + l->t->ci->unpack_done_g = 0; + if (l->t->cj != NULL) + l->t->cj->pack_done_g = 0; + if (l->t->cj != NULL) + l->t->cj->gpu_done_g = 0; + if (l->t->cj != NULL) + l->t->cj->unpack_done_g = 0; + } + for (struct link *l = c->hydro.gradient_unpack; l != NULL; l = l->next) { + scheduler_activate(s, l->t); + l->t->gpu_done = 0; + } +#endif if (c->hydro.extra_ghost != NULL) scheduler_activate(s, c->hydro.extra_ghost); diff --git a/src/engine_config.c b/src/engine_config.c index 5e6c4eb98c..336e3d155d 100644 --- a/src/engine_config.c +++ b/src/engine_config.c @@ -32,6 +32,17 @@ #include #endif +#ifdef WITH_CUDA +#include /* A. Nasar */ +#include "runner_main_clean.cu" +#endif + +#ifdef WITH_HIP +//#include "/opt/rocm-5.1.0/hip/include/hip/hip_runtime.h" +#include +#include "runner_main_clean.hip" +#endif + /* This object's header. */ #include "engine.h" @@ -909,9 +920,12 @@ void engine_config(int restart, int fof, struct engine *e, e->links_per_tasks = parser_get_opt_param_float(params, "Scheduler:links_per_tasks", 25.); - /* Init the scheduler. */ + /* Init the scheduler. Allow stealing*/ +// scheduler_init(&e->sched, e->s, maxtasks, nr_queues, +// (e->policy & scheduler_flag_steal), e->nodeID, &e->threadpool); + /* Init the scheduler. NO stealing A. Nasar */ scheduler_init(&e->sched, e->s, maxtasks, nr_queues, - (e->policy & scheduler_flag_steal), e->nodeID, &e->threadpool); + 0, e->nodeID, &e->threadpool); /* Maximum size of MPI task messages, in KB, that should not be buffered, * that is sent using MPI_Issend, not MPI_Isend. 4Mb by default. 
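The unskip loops above repeat the same three resets for each of the density, force and gradient pack lists. A compact restatement of that pattern (a hypothetical helper, not part of the patch, shown for the density flavour only; the gradient and force variants touch the *_g and *_f fields instead):

static void cell_reset_gpu_density_flags(struct task *t) {
  /* Reset the per-cell progress markers of an activated density pack task. */
  if (t->ci != NULL) {
    t->ci->pack_done = 0;
    t->ci->gpu_done = 0;
    t->ci->unpack_done = 0;
  }
  if (t->cj != NULL) {
    t->cj->pack_done = 0;
    t->cj->gpu_done = 0;
    t->cj->unpack_done = 0;
  }
}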
Can be @@ -981,9 +995,20 @@ void engine_config(int restart, int fof, struct engine *e, for (int k = 0; k < e->nr_threads; k++) { e->runners[k].id = k; e->runners[k].e = e; + +#ifdef WITH_CUDA + if (pthread_create(&e->runners[k].thread, NULL, &runner_main2, + &e->runners[k]) != 0) + error("Failed to create GPU runner thread."); +#elif WITH_HIP + if (pthread_create(&e->runners[k].thread, NULL, &runner_main_hip, + &e->runners[k]) != 0) + error("Failed to create runner thread."); +#else if (pthread_create(&e->runners[k].thread, NULL, &runner_main, &e->runners[k]) != 0) error("Failed to create runner thread."); +#endif /* Try to pin the runner to a given core */ if (with_aff && diff --git a/src/queue.h b/src/queue.h index 0576403bef..9ff0787cfa 100644 --- a/src/queue.h +++ b/src/queue.h @@ -75,6 +75,15 @@ struct queue { int *tid_incoming; volatile unsigned int first_incoming, last_incoming, count_incoming; + /*Number of pack tasks left in queue A. Nasar */ + int n_packs_self_left; /*Number of density pack tasks left in queue*/ + int n_packs_self_left_f; /*Number of force pack tasks left in queue*/ + int n_packs_self_left_g; /*Number of gradient pack tasks left in queue*/ + + int n_packs_pair_left; + int n_packs_pair_left_f; + int n_packs_pair_left_g; + } __attribute__((aligned(queue_struct_align))); /* Function prototypes. */ diff --git a/src/task.c b/src/task.c index 3b504a79e6..d1bfe8d9ca 100644 --- a/src/task.c +++ b/src/task.c @@ -164,6 +164,12 @@ const char *subtaskID_names[task_subtype_count] = { "sink_do_gas_swallow", "rt_gradient", "rt_transport", + "gpu_pack", // A. Nasar + "gpu_pack_g", + "gpu_pack_f", + "gpu_unpack", + "gpu_unpack_g", + "gpu_unpack_f", }; const char *task_category_names[task_category_count] = { @@ -598,6 +604,20 @@ void task_unlock(struct task *t) { #ifdef SWIFT_TASKS_WITHOUT_ATOMICS cell_unlocktree(ci); #endif + } else if (subtype == task_subtype_gpu_unpack) { +// for(int pp = 0; pp < 128 /*should be sched->pack_size*/; pp++){ +// cell_unlocktree(t->ci_unpack[pp]); +// } + /*Do nothing and be on your way*/ + } else if (subtype == task_subtype_gpu_unpack_f) { + /*Do nothing and be on your way*/ + } else if (subtype == task_subtype_gpu_unpack_g) { + /*Do nothing and be on your way*/ + } else if (subtype == task_subtype_gpu_pack) { + cell_unlocktree(ci); + } else if (subtype == task_subtype_gpu_pack_f) { + cell_unlocktree(ci); + } else if (subtype == task_subtype_gpu_pack_g) { } else { /* hydro */ cell_unlocktree(ci); } @@ -645,6 +665,21 @@ void task_unlock(struct task *t) { cell_unlocktree(ci); cell_unlocktree(cj); #endif + } else if (subtype == task_subtype_gpu_pack) { + cell_unlocktree(ci); + cell_unlocktree(cj); + } else if (subtype == task_subtype_gpu_pack_f) { + cell_unlocktree(ci); + cell_unlocktree(cj); + } else if (subtype == task_subtype_gpu_pack_g) { + cell_unlocktree(ci); + cell_unlocktree(cj); + } else if (subtype == task_subtype_gpu_unpack) { + /* Nothing to do */ + } else if (subtype == task_subtype_gpu_unpack_f) { + /* Nothing to do */ + } else if (subtype == task_subtype_gpu_unpack_g) { + /* Nothing to do */ } else { /* hydro */ cell_unlocktree(ci); cell_unlocktree(cj); @@ -848,6 +883,43 @@ int task_lock(struct task *t) { if (ci->hydro.hold) return 0; if (cell_locktree(ci) != 0) return 0; #endif + } else if (subtype == task_subtype_gpu_pack) { + /* Attempt to lock the cell */ + if (ci->hydro.hold) + return 0; + if (cell_locktree(ci) != 0) + return 0; + } else if (subtype == task_subtype_gpu_pack_f) { + /* Attempt to lock the cell */ + if 
(ci->hydro.hold) + return 0; + if (cell_locktree(ci) != 0) + return 0; + } else if (subtype == task_subtype_gpu_pack_g) { + /* Attempt to lock the cell */ + if (ci->hydro.hold) + return 0; + if (cell_locktree(ci) != 0) + return 0; + } else if (subtype == task_subtype_gpu_unpack) { + // for(int pp = 0; pp < 128 /*should be sched->pack_size*/; pp++){ + // if (t->ci_unpack[pp]->gpu_done == 0){ + // message("trying to queue an unpack before all packs done on GPU"); + // return 0; + // } + //// if (t->ci_unpack[pp]->hydro.hold) + //// return 0; + //// if (cell_locktree(t->ci_unpack[pp]) != 0) + //// return 0; + // } + /* Nothing to do here */ + return 1; + } else if (subtype == task_subtype_gpu_unpack_f) { + /* Nothing to do here */ + return 1; + } else if (subtype == task_subtype_gpu_unpack_g) { + /* Nothing to do here */ + return 1; } else { /* subtype == hydro */ if (ci->hydro.hold) return 0; if (cell_locktree(ci) != 0) return 0; @@ -964,6 +1036,45 @@ int task_lock(struct task *t) { return 0; } #endif + } else if (subtype == task_subtype_gpu_pack) { + /* Lock the parts in both cells */ + if (ci->hydro.hold || cj->hydro.hold) + return 0; + if (cell_locktree(ci) != 0) + return 0; + if (cell_locktree(cj) != 0) { + cell_unlocktree(ci); + return 0; + } + } else if (subtype == task_subtype_gpu_pack_f) { + /* Lock the parts in both cells */ + if (ci->hydro.hold || cj->hydro.hold) + return 0; + if (cell_locktree(ci) != 0) + return 0; + if (cell_locktree(cj) != 0) { + cell_unlocktree(ci); + return 0; + } + } else if (subtype == task_subtype_gpu_pack_g) { + /* Lock the parts in both cells */ + if (ci->hydro.hold || cj->hydro.hold) + return 0; + if (cell_locktree(ci) != 0) + return 0; + if (cell_locktree(cj) != 0) { + cell_unlocktree(ci); + return 0; + } + } else if (subtype == task_subtype_gpu_unpack) { + /* Nothing to do here. */ + return 1; + } else if (subtype == task_subtype_gpu_unpack_f) { + /* Nothing to do here. */ + return 1; + } else if (subtype == task_subtype_gpu_unpack_g) { + /* Nothing to do here. */ + return 1; } else { /* subtype == hydro */ /* Lock the parts in both cells */ if (ci->hydro.hold || cj->hydro.hold) return 0; @@ -1127,6 +1238,19 @@ void task_get_group_name(int type, int subtype, char *cluster) { } switch (subtype) { + /* A. Nasar */ + case task_subtype_gpu_pack: + case task_subtype_gpu_unpack: + strcpy(cluster, "Density"); + break; + case task_subtype_gpu_pack_f: + case task_subtype_gpu_unpack_f: + strcpy(cluster, "Force"); + break; + case task_subtype_gpu_pack_g: + case task_subtype_gpu_unpack_g: + strcpy(cluster, "Gradient"); + break; case task_subtype_density: strcpy(cluster, "Density"); break; @@ -1755,6 +1879,12 @@ enum task_categories task_get_category(const struct task *t) { case task_subtype_density: case task_subtype_gradient: case task_subtype_force: + case task_subtype_gpu_pack: // A. Nasar + case task_subtype_gpu_unpack: + case task_subtype_gpu_pack_f: + case task_subtype_gpu_unpack_f: + case task_subtype_gpu_pack_g: + case task_subtype_gpu_unpack_g: return task_category_hydro; case task_subtype_limiter: diff --git a/src/task.h b/src/task.h index b405a0795f..68c495cf17 100644 --- a/src/task.h +++ b/src/task.h @@ -160,6 +160,12 @@ enum task_subtypes { task_subtype_sink_do_gas_swallow, task_subtype_rt_gradient, task_subtype_rt_transport, + task_subtype_gpu_pack, // A. 
Nasar + task_subtype_gpu_pack_g, + task_subtype_gpu_pack_f, + task_subtype_gpu_unpack, + task_subtype_gpu_unpack_g, + task_subtype_gpu_unpack_f, task_subtype_count } __attribute__((packed)); @@ -235,6 +241,15 @@ struct task { /*! Pointers to the cells this task acts upon */ struct cell *ci, *cj; + int done; // A. Nasar + + int gpu_done; + + int corner_pair; + + /*! Pointers to the cells this task acts upon */ + struct cell **ci_unpack;//, **cj; + /*! List of tasks unlocked by this one */ struct task **unlock_tasks; From b0193cba92596fb531f9a0f211a23ed9e6924685 Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Thu, 24 Oct 2024 16:36:03 +0100 Subject: [PATCH 011/217] Added code to engine.c --- src/engine.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/engine.c b/src/engine.c index 6d1fa0e3f7..cd5a4bcf56 100644 --- a/src/engine.c +++ b/src/engine.c @@ -1092,12 +1092,20 @@ int engine_estimate_nr_tasks(const struct engine *e) { */ n1 += 38; n2 += 2; +#ifdef WITH_CUDA + n1 += 2; //Self force and density packs + n1 += 26; //Pair force and density packs +#endif #ifdef WITH_MPI n1 += 6; #endif #ifdef EXTRA_HYDRO_LOOP n1 += 15; +#ifdef WITH_CUDA + n1 += 1; //Self gradient packs + n1 += 13; //Pair gradient packs +#endif #ifdef WITH_MPI n1 += 2; #endif @@ -1750,9 +1758,13 @@ void engine_skip_force_and_kick(struct engine *e) { t->type == task_type_rt_ghost2 || t->type == task_type_rt_tchem || t->type == task_type_rt_advance_cell_time || t->type == task_type_neutrino_weight || t->type == task_type_csds || - t->subtype == task_subtype_force || + t->subtype == task_subtype_force || // A. Nasar + t->subtype == task_subtype_gpu_pack_f || + t->subtype == task_subtype_gpu_unpack_f || t->subtype == task_subtype_limiter || t->subtype == task_subtype_gradient || + t->subtype == task_subtype_gpu_pack_g || + t->subtype == task_subtype_gpu_unpack_g || t->subtype == task_subtype_stars_prep1 || t->subtype == task_subtype_stars_prep2 || t->subtype == task_subtype_stars_feedback || @@ -2192,6 +2204,7 @@ void engine_init_particles(struct engine *e, int flag_entropy_ICs, } #endif + scheduler_write_dependencies(&e->sched, e->verbose); // A. Nasar write deps before running first step /* Now, launch the calculation */ TIMER_TIC; engine_launch(e, "tasks"); From 391d2bf3c9339556cca9006298b64b3b555f6008 Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Thu, 24 Oct 2024 16:41:22 +0100 Subject: [PATCH 012/217] Removed bug from engine.c and added some code to scheduler.c --- src/engine.c | 2 +- src/scheduler.c | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/src/engine.c b/src/engine.c index cd5a4bcf56..b461060084 100644 --- a/src/engine.c +++ b/src/engine.c @@ -2204,7 +2204,7 @@ void engine_init_particles(struct engine *e, int flag_entropy_ICs, } #endif - scheduler_write_dependencies(&e->sched, e->verbose); // A. Nasar write deps before running first step + scheduler_write_dependencies(&e->sched, e->verbose, e->step); // A. Nasar write deps before running first step /* Now, launch the calculation */ TIMER_TIC; engine_launch(e, "tasks"); diff --git a/src/scheduler.c b/src/scheduler.c index 2b156f8250..f935fc0ea9 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -1651,6 +1651,14 @@ void scheduler_splittasks_mapper(void *map_data, int num_elements, scheduler_splittask_gravity(t, s); } else if (t->subtype == task_subtype_grav) { scheduler_splittask_gravity(t, s); + // if task is gpu task do not split A. 
Nasar + }else if (t->subtype == task_subtype_gpu_pack || + t->subtype == task_subtype_gpu_unpack|| + t->subtype == task_subtype_gpu_pack_g || + t->subtype == task_subtype_gpu_unpack_g|| + t->subtype == task_subtype_gpu_pack_f || + t->subtype == task_subtype_gpu_unpack_f) { + continue; /*Do nothing and grab next task to split*/ } else { #ifdef SWIFT_DEBUG_CHECKS error("Unexpected task sub-type %s/%s", taskID_names[t->type], From 9bd433d12c57d893c224fd686406f04c8faadcec Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Thu, 24 Oct 2024 17:35:57 +0100 Subject: [PATCH 013/217] added code to scheduler.* engine_unskip.c --- src/engine_unskip.c | 3 + src/scheduler.c | 194 ++++++++++++++++++++++++++++++++++++++------ src/scheduler.h | 27 ++++++ 3 files changed, 200 insertions(+), 24 deletions(-) diff --git a/src/engine_unskip.c b/src/engine_unskip.c index 43af8b5aed..d14c29cffe 100644 --- a/src/engine_unskip.c +++ b/src/engine_unskip.c @@ -79,6 +79,9 @@ struct unskip_data { */ static void engine_do_unskip_hydro(struct cell *c, struct engine *e) { +// scheduler_activate(&e->sched, c->hydro.super->hydro.gpu_pack); A. Nasar +// scheduler_activate(&e->sched, c->hydro.super->hydro.gpu_unpack); + /* Early abort (are we below the level where tasks are)? */ if (!cell_get_flag(c, cell_flag_has_tasks)) return; diff --git a/src/scheduler.c b/src/scheduler.c index f935fc0ea9..b4eb8ae70d 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -61,6 +61,7 @@ int activate_by_unskip = 1; #endif +#include "cuda/BLOCK_SIZE.h" /** * @brief Re-set the list of active tasks. */ @@ -1756,6 +1757,29 @@ struct task *scheduler_addtask(struct scheduler *s, enum task_types type, if (ci != NULL) cell_set_flag(ci, cell_flag_has_tasks); if (cj != NULL) cell_set_flag(cj, cell_flag_has_tasks); + //#ifdef WITH_CUDA A. Nasar + if (t->type == task_type_self && t->subtype == task_subtype_gpu_pack) { + atomic_inc(&s->nr_self_pack_tasks); + } + if (t->type == task_type_pair && t->subtype == task_subtype_gpu_pack) { + atomic_inc(&s->nr_pair_pack_tasks); + } + //#ifdef WITH_CUDA + if (t->type == task_type_self && t->subtype == task_subtype_gpu_pack_g) { + atomic_inc(&s->nr_self_pack_tasks_g); + } + if (t->type == task_type_pair && t->subtype == task_subtype_gpu_pack_g) { + atomic_inc(&s->nr_pair_pack_tasks_g); + } + //#ifdef WITH_CUDA + if (t->type == task_type_self && t->subtype == task_subtype_gpu_pack_f) { + atomic_inc(&s->nr_self_pack_tasks_f); + } + if (t->type == task_type_pair && t->subtype == task_subtype_gpu_pack_f) { + atomic_inc(&s->nr_pair_pack_tasks_f); + } + + //#endif /* Add an index for it. */ // lock_lock( &s->lock ); s->tasks_ind[atomic_inc(&s->nr_tasks)] = ind; @@ -1948,13 +1972,22 @@ void scheduler_reset(struct scheduler *s, int size) { /* Reset the counters. */ s->size = size; s->nr_tasks = 0; + s->nr_self_pack_tasks = 0; // A. Nasar + s->nr_pair_pack_tasks = 0; + s->nr_self_pack_tasks_f = 0; + s->nr_pair_pack_tasks_f = 0; + s->nr_self_pack_tasks_g = 0; + s->nr_pair_pack_tasks_g = 0; s->tasks_next = 0; s->waiting = 0; s->nr_unlocks = 0; s->completed_unlock_writes = 0; s->active_count = 0; s->total_ticks = 0; - + s->pack_size = N_TASKS_PER_PACK_SELF; + s->pack_size_pair = N_TASKS_PER_PACK_PAIR; + if (s->pack_tasks_ind != NULL) + free(s->pack_tasks_ind); /* Set the task pointers in the queues. 
*/ for (int k = 0; k < s->nr_queues; k++) s->queues[k].tasks = s->tasks; } @@ -2015,6 +2048,18 @@ void scheduler_reweight(struct scheduler *s, int verbose) { cost = 1.f * (wscale * gcount_i) * gcount_i; } else if (t->subtype == task_subtype_external_grav) cost = 1.f * wscale * gcount_i; + else if (t->subtype == task_subtype_gpu_pack) // A. Nasar + cost = 1.f * (wscale * count_i) * count_i;// * s->pack_size; + else if (t->subtype == task_subtype_gpu_pack_f) + cost = 1.f * (wscale * count_i) * count_i;// * s->pack_size; + else if (t->subtype == task_subtype_gpu_pack_g) + cost = 1.f * (wscale * count_i) * count_i;// * s->pack_size; + else if (t->subtype == task_subtype_gpu_unpack) + cost = 1.f * wscale * s->pack_size; + else if (t->subtype == task_subtype_gpu_unpack_f) + cost = 1.f * wscale * s->pack_size; + else if (t->subtype == task_subtype_gpu_unpack_g) + cost = 1.f * wscale * s->pack_size; else if (t->subtype == task_subtype_stars_density || t->subtype == task_subtype_stars_prep1 || t->subtype == task_subtype_stars_prep2 || @@ -2053,7 +2098,19 @@ void scheduler_reweight(struct scheduler *s, int verbose) { cost = 3.f * (wscale * gcount_i) * gcount_j; else cost = 2.f * (wscale * gcount_i) * gcount_j; - + // Abouzied: Think about good cost (for rainy days) A. Nasar + } else if (t->subtype == task_subtype_gpu_pack) { + cost = 2.f * (wscale * count_i) * count_i; + } else if (t->subtype == task_subtype_gpu_pack_f) { + cost = 2.f * (wscale * count_i) * count_i; + } else if (t->subtype == task_subtype_gpu_pack_g) { + cost = 2.f * (wscale * count_i) * count_i; + } else if (t->subtype == task_subtype_gpu_unpack) { + cost = 1.f * wscale; + } else if (t->subtype == task_subtype_gpu_unpack_f) { + cost = 1.f * wscale; + } else if (t->subtype == task_subtype_gpu_unpack_g) { + cost = 1.f * wscale; } else if (t->subtype == task_subtype_stars_density || t->subtype == task_subtype_stars_prep1 || t->subtype == task_subtype_stars_prep2 || @@ -2379,9 +2436,24 @@ void scheduler_rewait_mapper(void *map_data, int num_elements, /* Ignore skipped tasks. */ if (t->skip) continue; + if (t->type == task_type_self && t->subtype == task_subtype_gpu_pack) // A. Nasar + atomic_inc(&s->queues[t->ci->hydro.super->owner].n_packs_self_left); + if (t->type == task_type_self && t->subtype == task_subtype_gpu_pack_f) + atomic_inc(&s->queues[t->ci->hydro.super->owner].n_packs_self_left_f); + if (t->type == task_type_self && t->subtype == task_subtype_gpu_pack_g) + atomic_inc(&s->queues[t->ci->hydro.super->owner].n_packs_self_left_g); + + if (t->type == task_type_pair && t->subtype == task_subtype_gpu_pack) + atomic_inc(&s->queues[t->ci->hydro.super->owner].n_packs_pair_left); + if (t->type == task_type_pair && t->subtype == task_subtype_gpu_pack_f) + atomic_inc(&s->queues[t->ci->hydro.super->owner].n_packs_pair_left_f); + if (t->type == task_type_pair && t->subtype == task_subtype_gpu_pack_g) + atomic_inc(&s->queues[t->ci->hydro.super->owner].n_packs_pair_left_g); /* Increment the task's own wait counter for the enqueueing. */ atomic_inc(&t->wait); + t->done = 0; + t->gpu_done = 0; #ifdef SWIFT_DEBUG_CHECKS /* Check that we don't have more waits that what can be stored. */ @@ -2419,7 +2491,14 @@ void scheduler_enqueue_mapper(void *map_data, int num_elements, * @param s The #scheduler. */ void scheduler_start(struct scheduler *s) { - + for (int i = 0; i < s->nr_queues; i++){ // A. 
Nasar + s->queues[i].n_packs_self_left = 0; + s->queues[i].n_packs_pair_left = 0; + s->queues[i].n_packs_self_left_f = 0; + s->queues[i].n_packs_pair_left_f = 0; + s->queues[i].n_packs_self_left_g = 0; + s->queues[i].n_packs_pair_left_g = 0; + } /* Re-wait the tasks. */ if (s->active_count > 1000) { threadpool_map(s->threadpool, scheduler_rewait_mapper, s->tid_active, @@ -2495,6 +2574,23 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) { t->subtype == task_subtype_external_grav) { qid = t->ci->grav.super->owner; owner = &t->ci->grav.super->owner; + } else if (t->subtype == task_subtype_gpu_pack) { // A. Nasar + qid = t->ci->hydro.super->owner; + // fprintf(stderr,"nqueues %i waiting %i active_count %i\n", + // s->nr_queues, s->waiting, s->active_count); + // if(qid==-1)fprintf(stderr,"queue id is negative\n"); + // else fprintf(stderr,"queue id is %i\n", qid); + } else if (t->subtype == task_subtype_gpu_pack_f) { + qid = t->ci->hydro.super->owner; + } else if (t->subtype == task_subtype_gpu_pack_g) { + qid = t->ci->hydro.super->owner; + } else if (t->subtype == task_subtype_gpu_unpack) { + //// qid = t->ci->owner; + qid = -1; + } else if (t->subtype == task_subtype_gpu_unpack_f) { + qid = -1; + } else if (t->subtype == task_subtype_gpu_unpack_g) { + qid = -1; } else { qid = t->ci->hydro.super->owner; owner = &t->ci->hydro.super->owner; @@ -2529,6 +2625,9 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) { qid = t->cj->super->owner; owner = &t->cj->super->owner; } + if(t->subtype == task_subtype_gpu_unpack || + t->subtype == task_subtype_gpu_unpack_f || + t->subtype == task_subtype_gpu_unpack_g) qid = -1; break; case task_type_recv: #ifdef WITH_MPI @@ -2792,6 +2891,41 @@ struct task *scheduler_done(struct scheduler *s, struct task *t) { return NULL; } +struct task *signal_sleeping_runners(struct scheduler *s, struct task *t) { + /* Mark the task as skip. */ +// t->skip = 1; + + /* Task definitely done, signal any sleeping runners. */ + if (!t->implicit) { + t->toc = getticks(); + t->total_ticks += t->toc - t->tic; + pthread_mutex_lock(&s->sleep_mutex); + atomic_dec(&s->waiting); + pthread_cond_broadcast(&s->sleep_cond); + pthread_mutex_unlock(&s->sleep_mutex); + } + return NULL; +} + +struct task *enqueue_dependencies(struct scheduler *s, struct task *t) { +// t->skip = 1; + /* Loop through the dependencies and add them to a queue if + they are ready. */ + for (int k = 0; k < t->nr_unlock_tasks; k++) { + struct task *t2 = t->unlock_tasks[k]; + if (t2->skip) + continue; + const int res = atomic_dec(&t2->wait); + if (res < 1) { + error("Negative wait!"); + } else if (res == 1) { + scheduler_enqueue(s, t2); + } + } + + return NULL; +} + /** * @brief Resolve a single dependency by hand. * @@ -2914,7 +3048,7 @@ struct task *scheduler_gettask(struct scheduler *s, int qid, const struct task *prev) { struct task *res = NULL; const int nr_queues = s->nr_queues; - unsigned int seed = qid; +// unsigned int seed = qid; /* Check qid. */ if (qid >= nr_queues || qid < 0) error("Bad queue ID."); @@ -2932,26 +3066,26 @@ struct task *scheduler_gettask(struct scheduler *s, int qid, if (res != NULL) break; } - /* If unsuccessful, try stealing from the other queues. 
*/ - if (s->flags & scheduler_flag_steal) { - int count = 0, qids[nr_queues]; - for (int k = 0; k < nr_queues; k++) - if (s->queues[k].count > 0 || s->queues[k].count_incoming > 0) { - qids[count++] = k; - } - for (int k = 0; k < scheduler_maxsteal && count > 0; k++) { - const int ind = rand_r(&seed) % count; - TIMER_TIC - res = queue_gettask(&s->queues[qids[ind]], prev, 0); - TIMER_TOC(timer_qsteal); - if (res != NULL) { - break; - } else { - qids[ind] = qids[--count]; - } - } - if (res != NULL) break; - } + /* If unsuccessful, try stealing from the other queues. A. Nasar commented out for GPU work*/ +// if (s->flags & scheduler_flag_steal) { +// int count = 0, qids[nr_queues]; +// for (int k = 0; k < nr_queues; k++) +// if (s->queues[k].count > 0 || s->queues[k].count_incoming > 0) { +// qids[count++] = k; +// } +// for (int k = 0; k < scheduler_maxsteal && count > 0; k++) { +// const int ind = rand_r(&seed) % count; +// TIMER_TIC +// res = queue_gettask(&s->queues[qids[ind]], prev, 0); +// TIMER_TOC(timer_qsteal); +// if (res != NULL) { +// break; +// } else { +// qids[ind] = qids[--count]; +// } +// } +// if (res != NULL) break; +// } } /* If we failed, take a short nap. */ @@ -3036,6 +3170,7 @@ void scheduler_init(struct scheduler *s, struct space *space, int nr_tasks, s->size = 0; s->tasks = NULL; s->tasks_ind = NULL; + s->pack_tasks_ind = NULL; // A. Nasar scheduler_reset(s, nr_tasks); #if defined(SWIFT_DEBUG_CHECKS) @@ -3098,6 +3233,17 @@ void scheduler_free_tasks(struct scheduler *s) { } s->size = 0; s->nr_tasks = 0; + //reset GPU task counters too + if (s->pack_tasks_ind != NULL) { // A. Nasar + swift_free("pack_tasks_ind", s->pack_tasks_ind); + s->pack_tasks_ind = NULL; + } + s->nr_self_pack_tasks = 0; + s->nr_self_pack_tasks_f = 0; + s->nr_self_pack_tasks_g = 0; + s->nr_pair_pack_tasks = 0; + s->nr_pair_pack_tasks_f = 0; + s->nr_pair_pack_tasks_g = 0; } /** diff --git a/src/scheduler.h b/src/scheduler.h index 6ea7b41d58..578a1442d4 100644 --- a/src/scheduler.h +++ b/src/scheduler.h @@ -60,6 +60,33 @@ extern int activate_by_unskip; /* Data of a scheduler. */ struct scheduler { + + int nr_packs_self_dens_done; //A. Nasar + int nr_packs_pair_dens_done; + int nr_packs_self_forc_done; + int nr_packs_pair_forc_done; + int nr_packs_self_grad_done; + int nr_packs_pair_grad_done; + + /* Actual number of GPU tasks. */ + int nr_gpu_tasks; + /* Number of tasks we want*/ + int target_gpu_tasks; + /* Actual number of density pack tasks. */ + int nr_self_pack_tasks, nr_pair_pack_tasks; + /* Actual number of force pack tasks. */ + int nr_self_pack_tasks_f, nr_pair_pack_tasks_f; + /* Actual number of gradient pack tasks. */ + int nr_self_pack_tasks_g, nr_pair_pack_tasks_g; + /* Pack task indices */ + + // MATTHIEU: To be removed as unused !!! + int *pack_tasks_ind; + + /*how many tasks we want to try and work on at once on the GPU*/ + int pack_size; + int pack_size_pair; + /* Scheduler flags. 
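One reading of why task stealing is disabled in this patch (an inference from the surrounding code, not something the patch states): the per-queue pack counters only stay balanced if a pack task is executed by a runner serving the queue it was counted against. scheduler_rewait_mapper() counts each active pack task against its cell's home queue, while the pack routines in runner_doiact_functions_hydro_gpu.h (added below) decrement the counter of the packing runner's own queue and use it to decide when to flush a partially filled bundle. In condensed form:

  /* scheduler_rewait_mapper(): counted against the cell's home queue */
  atomic_inc(&s->queues[t->ci->hydro.super->owner].n_packs_self_left);

  /* runner_doself1_pack(): decremented against the packing runner's queue */
  int qid = r->qid;
  atomic_dec(&(s->queues[qid].n_packs_self_left));
  if (s->queues[qid].n_packs_self_left == 0) pack_vars->launch_leftovers = 1;

With stealing, r->qid could differ from the home queue, so one counter could go negative while another never reaches zero and the leftover bundle would never be flushed. Passing 0 for the steal flag in scheduler_init() and commenting out the stealing branch of scheduler_gettask() keeps the two sides consistent.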
*/ unsigned int flags; From 8d0d437c421def86c0ff08a3d418b57f0e1c0950 Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Thu, 24 Oct 2024 17:41:51 +0100 Subject: [PATCH 014/217] I had made a mistake by putting runner_doiact_functions_hydro_gpu.h in main dir instead of src --- src/runner_doiact_functions_hydro_gpu.h | 4249 +++++++++++++++++++++++ src/runner_main_clean.cu | 1936 +++++++++++ 2 files changed, 6185 insertions(+) create mode 100644 src/runner_doiact_functions_hydro_gpu.h create mode 100755 src/runner_main_clean.cu diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h new file mode 100644 index 0000000000..696a11e7f2 --- /dev/null +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -0,0 +1,4249 @@ +#include "scheduler.h" +struct pack_vars_self{ + /*List of tasks and respective cells to be packed*/ + struct task **task_list; + struct cell **cell_list; + /*List of cell positions*/ + double *cellx; + double *celly; + double *cellz; + /*List of cell positions*/ + double *d_cellx; + double *d_celly; + double *d_cellz; + int bundle_size; + /*How many particles in a bundle*/ + int count_parts; + /**/ + int tasks_packed; + int *task_first_part; + int *task_last_part; + int *d_task_first_part; + int *d_task_last_part; + int * bundle_first_part; + int * bundle_last_part; + int * bundle_first_task_list; + int count_max_parts; + int launch; + int launch_leftovers; + int target_n_tasks; + int nBundles; + int tasksperbundle; + +}pack_vars_self; + +struct pack_vars_pair{ + /*List of tasks and respective cells to be packed*/ + struct task **task_list; + struct cell **ci_list; + struct cell **cj_list; + /*List of cell shifts*/ + double *shiftx; + double *shifty; + double *shiftz; + /*List of cell shifts*/ + double *d_shiftx; + double *d_shifty; + double *d_shiftz; + int bundle_size; + /*How many particles in a bundle*/ + int count_parts; + /**/ + int tasks_packed; + int *task_first_part; + int *task_last_part; + int *d_task_first_part; + int *d_task_last_part; + int * bundle_first_part; + int * bundle_last_part; + int * bundle_first_task_list; + int count_max_parts; + int launch; + int launch_leftovers; + int target_n_tasks; + int nBundles; + int tasksperbundle; + +}pack_vars_pair; + +struct pack_vars_pair_f4{ + /*List of tasks and respective cells to be packed*/ + struct task **task_list; + struct cell **ci_list; + struct cell **cj_list; + /*List of cell shifts*/ + float3 *shift; + /*List of cell shifts*/ + float3 *d_shift; + int bundle_size; + /*How many particles in a bundle*/ + int count_parts; + /**/ + int tasks_packed; + int4 *fparti_fpartj_lparti_lpartj; + int4 *d_fparti_fpartj_lparti_lpartj; + int * bundle_first_part; + int * bundle_last_part; + int * bundle_first_task_list; + int count_max_parts; + int launch; + int launch_leftovers; + int target_n_tasks; + int nBundles; + int tasksperbundle; + +}pack_vars_pair_f4; + +#include "task.h" +#include "runner_gpu_pack_functions.h" +#include "cuda/BLOCK_SIZE.h" +#include "cuda/GPU_runner_functions.h" +#define CUDA_DEBUG +void runner_doself1_pack(struct runner *r, struct scheduler *s, struct pack_vars_self *pack_vars, struct cell *ci, + struct task *t, struct part_aos *parts_aos, int * packing_time){ + /* Timers for how long this all takes. 
+ * t0 and t1 are from start to finish including GPU calcs + * tp0 and tp1 only time packing and unpacking*/ + struct timespec t0, t1; // + clock_gettime(CLOCK_REALTIME, &t0); + + int tasks_packed = pack_vars->tasks_packed; + pack_vars->cellx[tasks_packed] = ci->loc[0]; + pack_vars->celly[tasks_packed] = ci->loc[1]; + pack_vars->cellz[tasks_packed] = ci->loc[2]; + /*Get pointers to the list of tasks and cells packed*/ + pack_vars->task_list[tasks_packed] = t; + pack_vars->cell_list[tasks_packed] = ci; + // /* Identify row in particle arrays where this task starts*/ + pack_vars->task_first_part[tasks_packed] = pack_vars->count_parts; + int *count_parts_self = &pack_vars->count_parts; + /* This re-arranges the particle data from cell->hydro->parts into a + long array of part structs*/ + runner_doself1_gpu_pack_neat_aos(r, ci, parts_aos, 0/*timer. 0 no timing, 1 for timing*/, + count_parts_self, tasks_packed, pack_vars->count_max_parts); + // // identify the row in the array where this task ends (row id of its last particle) + pack_vars->task_last_part[tasks_packed] = pack_vars->count_parts; + /* Identify first particle for each bundle of tasks */ + const int bundle_size = pack_vars->bundle_size; + if (tasks_packed % bundle_size == 0) { + int bid = tasks_packed / bundle_size; + pack_vars->bundle_first_part[bid] = pack_vars->task_first_part[tasks_packed]; + pack_vars->bundle_first_task_list[bid] = tasks_packed; + } + /* Tell the cell it has been packed */ + ci->pack_done++; + /* Record that we have now done a packing (self) */ + int qid = r->qid; + atomic_dec(&(s->queues[qid].n_packs_self_left)); + t->done = 1; + /* Release the lock on the cell */ + task_unlock(t); + pack_vars->tasks_packed++; + pack_vars->launch = 0; + pack_vars->launch_leftovers = 0; + if ((s->queues[qid].n_packs_self_left == 0)) + pack_vars->launch_leftovers = 1; + if(pack_vars->tasks_packed == pack_vars->target_n_tasks) + pack_vars->launch = 1; + /*Add time to packing_time. Timer for end of GPU work after the if(launch || launch_leftovers statement)*/ + clock_gettime(CLOCK_REALTIME, &t1); + *packing_time += (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + +} + +double runner_doself1_pack_f4(struct runner *r, struct scheduler *s, struct pack_vars_self *pack_vars, struct cell *ci, + struct task *t, struct part_aos_f4_send *parts_send, int2 *task_first_part_f4){ + /* Timers for how long this all takes. + * t0 and t1 are from start to finish including GPU calcs + * tp0 and tp1 only time packing and unpacking*/ + struct timespec t0, t1; // + clock_gettime(CLOCK_REALTIME, &t0); + + int tasks_packed = pack_vars->tasks_packed; +// pack_vars->cellx[tasks_packed] = ci->loc[0]; +// pack_vars->celly[tasks_packed] = ci->loc[1]; +// pack_vars->cellz[tasks_packed] = ci->loc[2]; + /*Get pointers to the list of tasks and cells packed*/ + pack_vars->task_list[tasks_packed] = t; + pack_vars->cell_list[tasks_packed] = ci; + // /* Identify row in particle arrays where this task starts*/ +// pack_vars->task_first_part[tasks_packed] = pack_vars->count_parts; +// d_task_first_part_self_dens_f4[tasks_packed].x = pack_vars->count_parts; + task_first_part_f4[tasks_packed].x = pack_vars->count_parts; + int *count_parts_self = &pack_vars->count_parts; + /* This re-arranges the particle data from cell->hydro->parts into a + long array of part structs*/ + runner_doself1_gpu_pack_neat_aos_f4(r, ci, parts_send, 0/*timer. 
0 no timing, 1 for timing*/, + count_parts_self, tasks_packed, pack_vars->count_max_parts); + // // identify the row in the array where this task ends (row id of its last particle) +// pack_vars->task_last_part[tasks_packed] = pack_vars->count_parts; +// d_task_first_part_self_dens_f4[tasks_packed].y = pack_vars->count_parts; + task_first_part_f4[tasks_packed].y = pack_vars->count_parts; + /* Identify first particle for each bundle of tasks */ + const int bundle_size = pack_vars->bundle_size; + if (tasks_packed % bundle_size == 0) { + int bid = tasks_packed / bundle_size; + pack_vars->bundle_first_part[bid] = task_first_part_f4[tasks_packed].x; + pack_vars->bundle_first_task_list[bid] = tasks_packed; + } + /* Tell the cell it has been packed */ + ci->pack_done++; + /* Record that we have now done a packing (self) */ + int qid = r->qid; + atomic_dec(&(s->queues[qid].n_packs_self_left)); + t->done = 1; + pack_vars->tasks_packed++; + pack_vars->launch = 0; + pack_vars->launch_leftovers = 0; + if ((s->queues[qid].n_packs_self_left == 0)) + pack_vars->launch_leftovers = 1; + if(pack_vars->tasks_packed == pack_vars->target_n_tasks) + pack_vars->launch = 1; + /*Add time to packing_time. Timer for end of GPU work after the if(launch || launch_leftovers statement)*/ + clock_gettime(CLOCK_REALTIME, &t1); + /* Release the lock on the cell */ +// task_unlock(t); + cell_unlocktree(ci); + return (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + +} + +void runner_doself1_pack_g(struct runner *r, struct scheduler *s, struct pack_vars_self *pack_vars, struct cell *ci, + struct task *t, struct part_aos_g *parts_aos, double *packing_time){ + + /* Timers for how long this all takes. + * t0 and t1 are from start to finish including GPU calcs + * tp0 and tp1 only time packing and unpacking*/ + struct timespec t0, t1; // + clock_gettime(CLOCK_REALTIME, &t0); + + int tasks_packed = pack_vars->tasks_packed; + pack_vars->cellx[tasks_packed] = ci->loc[0]; + pack_vars->celly[tasks_packed] = ci->loc[1]; + pack_vars->cellz[tasks_packed] = ci->loc[2]; + /*Get pointers to the list of tasks and cells packed*/ + pack_vars->task_list[tasks_packed] = t; + pack_vars->cell_list[tasks_packed] = ci; + // /* Identify row in particle arrays where this task starts*/ + pack_vars->task_first_part[tasks_packed] = pack_vars->count_parts; + int *count_parts_self = &pack_vars->count_parts; + /* This re-arranges the particle data from cell->hydro->parts into a + long array of part structs*/ + runner_doself1_gpu_pack_neat_aos_g(r, ci, parts_aos, 0/*timer. 
0 no timing, 1 for timing*/, + count_parts_self, tasks_packed, pack_vars->count_max_parts); + // // identify the row in the array where this task ends (row id of its last particle) + pack_vars->task_last_part[tasks_packed] = pack_vars->count_parts; + /* Identify first particle for each bundle of tasks */ + const int bundle_size = pack_vars->bundle_size; + if (tasks_packed % bundle_size == 0) { + int bid = tasks_packed / bundle_size; + pack_vars->bundle_first_part[bid] = pack_vars->task_first_part[tasks_packed]; + pack_vars->bundle_first_task_list[bid] = tasks_packed; + } + /* Tell the cell it has been packed */ + ci->pack_done_g++; + /* Record that we have now done a packing (self) */ + int qid = r->qid; + atomic_dec(&(s->queues[qid].n_packs_self_left_g)); + t->done = 1; + /* Release the lock on the cell */ + task_unlock(t); + pack_vars->tasks_packed++; + pack_vars->launch = 0; + pack_vars->launch_leftovers = 0; + if ((s->queues[qid].n_packs_self_left_g == 0)) + pack_vars->launch_leftovers = 1; + if(pack_vars->tasks_packed == pack_vars->target_n_tasks) + pack_vars->launch = 1; + /*Add time to packing_time. Timer for end of GPU work after the if(launch || launch_leftovers statement)*/ + clock_gettime(CLOCK_REALTIME, &t1); + *packing_time += (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + +} + +double runner_doself1_pack_f4_g(struct runner *r, struct scheduler *s, struct pack_vars_self *pack_vars, struct cell *ci, + struct task *t, struct part_aos_f4_g_send *parts_send, int2 * task_first_part_f4){ + + /* Timers for how long this all takes. + * t0 and t1 are from start to finish including GPU calcs + * tp0 and tp1 only time packing and unpacking*/ + struct timespec t0, t1; // + clock_gettime(CLOCK_REALTIME, &t0); + + int tasks_packed = pack_vars->tasks_packed; +// pack_vars->cellx[tasks_packed] = ci->loc[0]; +// pack_vars->celly[tasks_packed] = ci->loc[1]; +// pack_vars->cellz[tasks_packed] = ci->loc[2]; + /*Get pointers to the list of tasks and cells packed*/ + pack_vars->task_list[tasks_packed] = t; + pack_vars->cell_list[tasks_packed] = ci; + // /* Identify row in particle arrays where this task starts*/ +// pack_vars->task_first_part[tasks_packed] = pack_vars->count_parts; + task_first_part_f4[tasks_packed].x = pack_vars->count_parts; + int *count_parts_self = &pack_vars->count_parts; + /* This re-arranges the particle data from cell->hydro->parts into a + long array of part structs*/ + runner_doself1_gpu_pack_neat_aos_f4_g(r, ci, parts_send, 0/*timer. 
0 no timing, 1 for timing*/, + count_parts_self, tasks_packed, pack_vars->count_max_parts); + // // identify the row in the array where this task ends (row id of its last particle) +// pack_vars->task_last_part[tasks_packed] = pack_vars->count_parts; + task_first_part_f4[tasks_packed].y = pack_vars->count_parts; + /* Identify first particle for each bundle of tasks */ + const int bundle_size = pack_vars->bundle_size; + if (tasks_packed % bundle_size == 0) { + int bid = tasks_packed / bundle_size; + pack_vars->bundle_first_part[bid] = task_first_part_f4[tasks_packed].x; + pack_vars->bundle_first_task_list[bid] = tasks_packed; + } + /* Tell the cell it has been packed */ + ci->pack_done_g++; + /* Record that we have now done a packing (self) */ + int qid = r->qid; + atomic_dec(&(s->queues[qid].n_packs_self_left_g)); + t->done = 1; + pack_vars->tasks_packed++; + pack_vars->launch = 0; + pack_vars->launch_leftovers = 0; + if ((s->queues[qid].n_packs_self_left_g == 0)) + pack_vars->launch_leftovers = 1; + if(pack_vars->tasks_packed == pack_vars->target_n_tasks) + pack_vars->launch = 1; + /*Add time to packing_time. Timer for end of GPU work after the if(launch || launch_leftovers statement)*/ + clock_gettime(CLOCK_REALTIME, &t1); + /* Release the lock on the cell */ +// task_unlock(t); + cell_unlocktree(ci); + return (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +} + +void runner_doself1_pack_f(struct runner *r, struct scheduler *s, struct pack_vars_self *pack_vars, struct cell *ci, + struct task *t, struct part_aos_f *parts_aos, double *packing_time){ + + /* Timers for how long this all takes. + * t0 and t1 are from start to finish including GPU calcs + * tp0 and tp1 only time packing and unpacking*/ + struct timespec t0, t1; // + clock_gettime(CLOCK_REALTIME, &t0); + + int tasks_packed = pack_vars->tasks_packed; + pack_vars->cellx[tasks_packed] = ci->loc[0]; + pack_vars->celly[tasks_packed] = ci->loc[1]; + pack_vars->cellz[tasks_packed] = ci->loc[2]; + /*Get pointers to the list of tasks and cells packed*/ + pack_vars->task_list[tasks_packed] = t; + pack_vars->cell_list[tasks_packed] = ci; + // /* Identify row in particle arrays where this task starts*/ + pack_vars->task_first_part[tasks_packed] = pack_vars->count_parts; + int *count_parts_self = &pack_vars->count_parts; + /* This re-arranges the particle data from cell->hydro->parts into a + long array of part structs*/ + runner_doself1_gpu_pack_neat_aos_f(r, ci, parts_aos, 0/*timer. 
0 no timing, 1 for timing*/, + count_parts_self, tasks_packed, pack_vars->count_max_parts); + // // identify the row in the array where this task ends (row id of its last particle) + pack_vars->task_last_part[tasks_packed] = pack_vars->count_parts; + /* Identify first particle for each bundle of tasks */ + const int bundle_size = pack_vars->bundle_size; + if (tasks_packed % bundle_size == 0) { + int bid = tasks_packed / bundle_size; + pack_vars->bundle_first_part[bid] = pack_vars->task_first_part[tasks_packed]; + pack_vars->bundle_first_task_list[bid] = tasks_packed; + } + /* Tell the cell it has been packed */ + ci->pack_done_f++; + /* Record that we have now done a packing (self) */ + int qid = r->qid; + atomic_dec(&(s->queues[qid].n_packs_self_left_f)); + t->done = 1; + /* Release the lock on the cell */ + task_unlock(t); + pack_vars->tasks_packed++; + pack_vars->launch = 0; + pack_vars->launch_leftovers = 0; + if ((s->queues[qid].n_packs_self_left_f == 0)) + pack_vars->launch_leftovers = 1; + if(pack_vars->tasks_packed == pack_vars->target_n_tasks) + pack_vars->launch = 1; + /*Add time to packing_time. Timer for end of GPU work after the if(launch || launch_leftovers statement)*/ + clock_gettime(CLOCK_REALTIME, &t1); + *packing_time += (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +} + +double runner_doself1_pack_f4_f(struct runner *r, struct scheduler *s, struct pack_vars_self *pack_vars, struct cell *ci, + struct task *t, struct part_aos_f4_f_send *parts_send, int2 * task_first_part_f4){ + + /* Timers for how long this all takes. + * t0 and t1 are from start to finish including GPU calcs + * tp0 and tp1 only time packing and unpacking*/ + struct timespec t0, t1; // + clock_gettime(CLOCK_REALTIME, &t0); + + int tasks_packed = pack_vars->tasks_packed; +// pack_vars->cellx[tasks_packed] = ci->loc[0]; +// pack_vars->celly[tasks_packed] = ci->loc[1]; +// pack_vars->cellz[tasks_packed] = ci->loc[2]; + /*Get pointers to the list of tasks and cells packed*/ + pack_vars->task_list[tasks_packed] = t; + pack_vars->cell_list[tasks_packed] = ci; + // /* Identify row in particle arrays where this task starts*/ +// pack_vars->task_first_part[tasks_packed] = pack_vars->count_parts; + task_first_part_f4[tasks_packed].x = pack_vars->count_parts; + int *count_parts_self = &pack_vars->count_parts; + /* This re-arranges the particle data from cell->hydro->parts into a + long array of part structs*/ + runner_doself1_gpu_pack_neat_aos_f4_f(r, ci, parts_send, 0/*timer. 
0 no timing, 1 for timing*/, + count_parts_self, tasks_packed, pack_vars->count_max_parts); + // // identify the row in the array where this task ends (row id of its last particle) +// pack_vars->task_last_part[tasks_packed] = pack_vars->count_parts; + task_first_part_f4[tasks_packed].y = pack_vars->count_parts; + /* Identify first particle for each bundle of tasks */ + const int bundle_size = pack_vars->bundle_size; + if (tasks_packed % bundle_size == 0) { + int bid = tasks_packed / bundle_size; + pack_vars->bundle_first_part[bid] = task_first_part_f4[tasks_packed].x; + pack_vars->bundle_first_task_list[bid] = tasks_packed; + } + /* Tell the cell it has been packed */ + ci->pack_done_f++; + /* Record that we have now done a packing (self) */ + int qid = r->qid; + atomic_dec(&(s->queues[qid].n_packs_self_left_f)); + t->done = 1; + pack_vars->tasks_packed++; + pack_vars->launch = 0; + pack_vars->launch_leftovers = 0; + if ((s->queues[qid].n_packs_self_left_f == 0)) + pack_vars->launch_leftovers = 1; + if(pack_vars->tasks_packed == pack_vars->target_n_tasks) + pack_vars->launch = 1; + /*Add time to packing_time. Timer for end of GPU work after the if(launch || launch_leftovers statement)*/ + clock_gettime(CLOCK_REALTIME, &t1); + /* Release the lock on the cell */ +// task_unlock(t); + cell_unlocktree(ci); + return (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +} + +void runner_dopair1_pack(struct runner *r, struct scheduler *s, struct pack_vars_pair *pack_vars, struct cell *ci, + struct cell *cj, struct task *t, struct part_aos *parts_aos, struct engine *e, double *packing_time){ + + /* Timers for how long this all takes. + * t0 and t1 are from start to finish including GPU calcs + * tp0 and tp1 only time packing and unpacking*/ + struct timespec t0, t1; // + clock_gettime(CLOCK_REALTIME, &t0); + int tasks_packed = pack_vars->tasks_packed; + + double x_tmp = 0.0, y_tmp = 0.0, z_tmp = 0.0; + /*Get the shifts in case of periodics*/ + space_getsid_GPU(e->s, &ci, &cj, &x_tmp, &y_tmp, &z_tmp); + + /*Get pointers to the list of tasks and cells packed*/ + pack_vars->task_list[tasks_packed] = t; + pack_vars->ci_list[tasks_packed] = ci; + pack_vars->cj_list[tasks_packed] = cj; + + float3 shift_tmp = {x_tmp, y_tmp, z_tmp}; + + const int count_ci = ci->hydro.count; + const int count_cj = cj->hydro.count; + + /*Assign an id for this task*/ + const int tid = tasks_packed; + /*Indexing increment per task is 2 fot these arrays*/ + const int packed_tmp = tasks_packed * 2; + + /* Find first parts in task for ci and cj. Packed_tmp is index for cell i. packed_tmp+1 is index for cell j */ + pack_vars->task_first_part[packed_tmp] = pack_vars->count_parts; + pack_vars->task_first_part[packed_tmp + 1] = pack_vars->count_parts + count_ci; + + int *count_parts = &pack_vars->count_parts; +// if(r->cpuid == 0)fprintf(stderr, "cpu %i before count %i\n", r->cpuid, pack_vars->count_parts); + /* This re-arranges the particle data from cell->hydro->parts into a + long array of part structs*/ + runner_do_ci_cj_gpu_pack_neat_aos(r, ci, cj, parts_aos, 0/*timer. 0 no timing, 1 for timing*/, + count_parts, tid, pack_vars->count_max_parts, count_ci, count_cj, shift_tmp); +// runner_doself1_gpu_pack_neat_aos(r, ci, parts_aos, 0/*timer. 0 no timing, 1 for timing*/, +// count_parts, tasks_packed, pack_vars->count_max_parts); //This may cause an issue. + //Be sure to test that pack_vars->count_parts + //is actually increment here + /* Find last parts in task for ci and cj. Packed_tmp is index for cell i. 
packed_tmp+1 is index for cell j */ + +// if(r->cpuid == 0)fprintf(stderr, "cpu %i after count %i pack_vars_count %i\n", r->cpuid, *count_parts, +// pack_vars->count_parts); + pack_vars->task_last_part[packed_tmp] = pack_vars->count_parts - count_cj; + pack_vars->task_last_part[packed_tmp + 1] = pack_vars->count_parts; + + /* Tell the cells they have been packed */ + ci->pack_done++; + cj->pack_done++; + + /* Identify first particle for each bundle of tasks */ + const int bundle_size = pack_vars->bundle_size; + if (tasks_packed % bundle_size == 0) { + int bid = tasks_packed / bundle_size; + pack_vars->bundle_first_part[bid] = pack_vars->task_first_part[packed_tmp]; + pack_vars->bundle_first_task_list[bid] = tasks_packed; + } + + /* Record that we have now done a packing (self) */ + int qid = r->qid; + atomic_dec(&(s->queues[qid].n_packs_pair_left)); + t->done = 1; + + pack_vars->tasks_packed++; + pack_vars->launch = 0; + pack_vars->launch_leftovers = 0; + if ((s->queues[qid].n_packs_pair_left == 0)) + pack_vars->launch_leftovers = 1; + if(pack_vars->tasks_packed == pack_vars->target_n_tasks) + pack_vars->launch = 1; + /*Add time to packing_time. Timer for end of GPU work after the if(launch || launch_leftovers statement)*/ + clock_gettime(CLOCK_REALTIME, &t1); + *packing_time += (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + /* Copies done. Release the lock ! */ +// task_unlock(t); + cell_unlocktree(ci); + cell_unlocktree(cj); +} + +double runner_dopair1_pack_f4(struct runner *r, struct scheduler *s, struct pack_vars_pair * restrict pack_vars, struct cell * restrict ci, + struct cell * restrict cj, struct task *t, struct part_aos_f4_send *parts_send, struct engine *e, int4 *fparti_fpartj_lparti_lpartj){ + + /* Timers for how long this all takes. + * t0 and t1 are from start to finish including GPU calcs + * tp0 and tp1 only time packing and unpacking*/ + struct timespec t0, t1; // + clock_gettime(CLOCK_REALTIME, &t0); + int tasks_packed = pack_vars->tasks_packed; + + double x_tmp = 0.0, y_tmp = 0.0, z_tmp = 0.0; + /*Get the shifts in case of periodics*/ + space_getsid_GPU(e->s, &ci, &cj, &x_tmp, &y_tmp, &z_tmp); + + /*Get pointers to the list of tasks and cells packed*/ + pack_vars->task_list[tasks_packed] = t; + pack_vars->ci_list[tasks_packed] = ci; + pack_vars->cj_list[tasks_packed] = cj; + + + float3 shift_tmp = {x_tmp, y_tmp, z_tmp}; + + const int count_ci = ci->hydro.count; + const int count_cj = cj->hydro.count; + + /*Assign an id for this task*/ + const int tid = tasks_packed; + + /* Find first parts in task for ci and cj. Packed_tmp is index for cell i. packed_tmp+1 is index for cell j */ +// pack_vars->task_first_part[packed_tmp] = pack_vars->count_parts; +// pack_vars->task_first_part[packed_tmp + 1] = pack_vars->count_parts + count_ci; + + fparti_fpartj_lparti_lpartj[tasks_packed].x = pack_vars->count_parts; + fparti_fpartj_lparti_lpartj[tasks_packed].y = pack_vars->count_parts + count_ci; + + int *count_parts = &pack_vars->count_parts; +// if(r->cpuid == 0)fprintf(stderr, "cpu %i before count %i\n", r->cpuid, pack_vars->count_parts); + /* This re-arranges the particle data from cell->hydro->parts into a + long array of part structs*/ + runner_do_ci_cj_gpu_pack_neat_aos_f4(r, ci, cj, parts_send, 0/*timer. 0 no timing, 1 for timing*/, + count_parts, tid, pack_vars->count_max_parts, count_ci, count_cj, shift_tmp); +// runner_doself1_gpu_pack_neat_aos(r, ci, parts_aos, 0/*timer. 
0 no timing, 1 for timing*/, +// count_parts, tasks_packed, pack_vars->count_max_parts); //This may cause an issue. + //Be sure to test that pack_vars->count_parts + //is actually increment here + /* Find last parts in task for ci and cj. Packed_tmp is index for cell i. packed_tmp+1 is index for cell j */ + +// if(r->cpuid == 0)fprintf(stderr, "cpu %i after count %i pack_vars_count %i\n", r->cpuid, *count_parts, +// pack_vars->count_parts); + fparti_fpartj_lparti_lpartj[tasks_packed].z = pack_vars->count_parts - count_cj; + fparti_fpartj_lparti_lpartj[tasks_packed].w = pack_vars->count_parts; +// pack_vars->task_last_part[packed_tmp] = pack_vars->count_parts - count_cj; +// pack_vars->task_last_part[packed_tmp + 1] = pack_vars->count_parts; + + /* Tell the cells they have been packed */ + ci->pack_done++; + cj->pack_done++; + + /* Identify first particle for each bundle of tasks */ + const int bundle_size = pack_vars->bundle_size; + if (tasks_packed % bundle_size == 0) { + int bid = tasks_packed / bundle_size; + pack_vars->bundle_first_part[bid] = fparti_fpartj_lparti_lpartj[tasks_packed].x; + pack_vars->bundle_first_task_list[bid] = tasks_packed; + } + + /* Record that we have now done a packing (self) */ + int qid = r->qid; + atomic_dec(&(s->queues[qid].n_packs_pair_left)); + t->done = 1; + /* Copies done. Release the lock ! */ + task_unlock(t); + pack_vars->tasks_packed++; + pack_vars->launch = 0; + pack_vars->launch_leftovers = 0; + if ((s->queues[qid].n_packs_pair_left == 0)) + pack_vars->launch_leftovers = 1; + if(pack_vars->tasks_packed == pack_vars->target_n_tasks) + pack_vars->launch = 1; + /*Add time to packing_time. Timer for end of GPU work after the if(launch || launch_leftovers statement)*/ + clock_gettime(CLOCK_REALTIME, &t1); + return (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +} + +void runner_dopair1_pack_g(struct runner *r, struct scheduler *s, struct pack_vars_pair *pack_vars, struct cell *ci, + struct cell *cj, struct task *t, struct part_aos_g *parts_aos, struct engine *e, double * packing_time){ + + /* Timers for how long this all takes. + * t0 and t1 are from start to finish including GPU calcs + * tp0 and tp1 only time packing and unpacking*/ + struct timespec t0, t1; // + clock_gettime(CLOCK_REALTIME, &t0); + int tasks_packed = pack_vars->tasks_packed; + const int tid_tmp = 2 * tasks_packed; + /*shifts for ci*/ + pack_vars->shiftx[tid_tmp] = 0.0; + pack_vars->shifty[tid_tmp] = 0.0; + pack_vars->shiftz[tid_tmp] = 0.0; + /*shifts for cj. Stored using strided indexing (stride of two per task)*/ + pack_vars->shiftx[tid_tmp + 1] = 0.0; + pack_vars->shifty[tid_tmp + 1] = 0.0; + pack_vars->shiftz[tid_tmp + 1] = 0.0; + + double x_tmp = 0.0, y_tmp = 0.0, z_tmp = 0.0; + /*Get the shifts in case of periodics*/ + space_getsid_GPU(e->s, &ci, &cj, &x_tmp, &y_tmp, &z_tmp); + + /*Get pointers to the list of tasks and cells packed*/ + pack_vars->task_list[tasks_packed] = t; + pack_vars->ci_list[tasks_packed] = ci; + pack_vars->cj_list[tasks_packed] = cj; + + const double cjx = cj->loc[0]; + const double cjy = cj->loc[1]; + const double cjz = cj->loc[2]; + + /*Correct the shifts for cell i*/ + pack_vars->shiftx[tid_tmp] = x_tmp + cjx; + pack_vars->shifty[tid_tmp] = y_tmp + cjy; + pack_vars->shiftz[tid_tmp] = z_tmp + cjz; + /*Shift for cell j is it's position. 
Stored using strided indexing (stride of two per task)*/ + pack_vars->shiftx[tid_tmp + 1] = cjx; + pack_vars->shifty[tid_tmp + 1] = cjy; + pack_vars->shiftz[tid_tmp + 1] = cjz; + + const int count_ci = ci->hydro.count; + const int count_cj = cj->hydro.count; + + /*Assign an id for this task*/ + const int tid = tasks_packed; + /*Indexing increment per task is 2 fot these arrays*/ + const int packed_tmp = tasks_packed * 2; + + /* Find first parts in task for ci and cj. Packed_tmp is index for cell i. packed_tmp+1 is index for cell j */ + pack_vars->task_first_part[packed_tmp] = pack_vars->count_parts; + pack_vars->task_first_part[packed_tmp + 1] = pack_vars->count_parts + count_ci; + + int *count_parts = &pack_vars->count_parts; + /* This re-arranges the particle data from cell->hydro->parts into a + long array of part structs*/ + runner_do_ci_cj_gpu_pack_neat_aos_g(r, ci, cj, parts_aos, 0/*timer. 0 no timing, 1 for timing*/, + count_parts, tid, pack_vars->count_max_parts, count_ci, count_cj); + /* Find last parts in task for ci and cj. Packed_tmp is index for cell i. packed_tmp+1 is index for cell j */ + + pack_vars->task_last_part[packed_tmp] = pack_vars->count_parts - count_cj; + pack_vars->task_last_part[packed_tmp + 1] = pack_vars->count_parts; + + /* Tell the cells they have been packed */ + ci->pack_done_g++; + cj->pack_done_g++; + + /* Identify first particle for each bundle of tasks */ + const int bundle_size = pack_vars->bundle_size; + if (tasks_packed % bundle_size == 0) { + int bid = tasks_packed / bundle_size; + pack_vars->bundle_first_part[bid] = pack_vars->task_first_part[packed_tmp]; + pack_vars->bundle_first_task_list[bid] = tasks_packed; + } + + /* Record that we have now done a packing (self) */ + int qid = r->qid; + atomic_dec(&(s->queues[qid].n_packs_pair_left_g)); + t->done = 1; + /* Copies done. Release the lock ! */ + task_unlock(t); + pack_vars->tasks_packed++; + pack_vars->launch = 0; + pack_vars->launch_leftovers = 0; + if ((s->queues[qid].n_packs_pair_left_g == 0)) + pack_vars->launch_leftovers = 1; + if(pack_vars->tasks_packed == pack_vars->target_n_tasks) + pack_vars->launch = 1; + /*Add time to packing_time. Timer for end of GPU work after the if(launch || launch_leftovers statement)*/ + clock_gettime(CLOCK_REALTIME, &t1); + *packing_time += (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +} + + +double runner_dopair1_pack_f4_g(struct runner *r, struct scheduler *s, struct pack_vars_pair * restrict pack_vars, struct cell * restrict ci, + struct cell * restrict cj, struct task *t, struct part_aos_f4_g_send *parts_send, struct engine *e, int4 *fparti_fpartj_lparti_lpartj){ + + /* Timers for how long this all takes. + * t0 and t1 are from start to finish including GPU calcs + * tp0 and tp1 only time packing and unpacking*/ + struct timespec t0, t1; // + clock_gettime(CLOCK_REALTIME, &t0); + int tasks_packed = pack_vars->tasks_packed; + + double x_tmp = 0.0, y_tmp = 0.0, z_tmp = 0.0; + /*Get the shifts in case of periodics*/ + space_getsid_GPU(e->s, &ci, &cj, &x_tmp, &y_tmp, &z_tmp); + + /*Get pointers to the list of tasks and cells packed*/ + pack_vars->task_list[tasks_packed] = t; + pack_vars->ci_list[tasks_packed] = ci; + pack_vars->cj_list[tasks_packed] = cj; + + + float3 shift_tmp = {x_tmp, y_tmp, z_tmp}; + + const int count_ci = ci->hydro.count; + const int count_cj = cj->hydro.count; + + /*Assign an id for this task*/ + const int tid = tasks_packed; + + /* Find first parts in task for ci and cj. Packed_tmp is index for cell i. 
packed_tmp+1 is index for cell j */ +// pack_vars->task_first_part[packed_tmp] = pack_vars->count_parts; +// pack_vars->task_first_part[packed_tmp + 1] = pack_vars->count_parts + count_ci; + + fparti_fpartj_lparti_lpartj[tasks_packed].x = pack_vars->count_parts; + fparti_fpartj_lparti_lpartj[tasks_packed].y = pack_vars->count_parts + count_ci; + + int *count_parts = &pack_vars->count_parts; +// if(r->cpuid == 0)fprintf(stderr, "cpu %i before count %i\n", r->cpuid, pack_vars->count_parts); + /* This re-arranges the particle data from cell->hydro->parts into a + long array of part structs*/ + runner_do_ci_cj_gpu_pack_neat_aos_f4_g(r, ci, cj, parts_send, 0/*timer. 0 no timing, 1 for timing*/, + count_parts, tid, pack_vars->count_max_parts, count_ci, count_cj, shift_tmp); +// runner_doself1_gpu_pack_neat_aos(r, ci, parts_aos, 0/*timer. 0 no timing, 1 for timing*/, +// count_parts, tasks_packed, pack_vars->count_max_parts); //This may cause an issue. + //Be sure to test that pack_vars->count_parts + //is actually increment here + /* Find last parts in task for ci and cj. Packed_tmp is index for cell i. packed_tmp+1 is index for cell j */ + +// if(r->cpuid == 0)fprintf(stderr, "cpu %i after count %i pack_vars_count %i\n", r->cpuid, *count_parts, +// pack_vars->count_parts); + fparti_fpartj_lparti_lpartj[tasks_packed].z = pack_vars->count_parts - count_cj; + fparti_fpartj_lparti_lpartj[tasks_packed].w = pack_vars->count_parts; +// pack_vars->task_last_part[packed_tmp] = pack_vars->count_parts - count_cj; +// pack_vars->task_last_part[packed_tmp + 1] = pack_vars->count_parts; + + /* Tell the cells they have been packed */ + ci->pack_done_g++; + cj->pack_done_g++; + + /* Identify first particle for each bundle of tasks */ + const int bundle_size = pack_vars->bundle_size; + if (tasks_packed % bundle_size == 0) { + int bid = tasks_packed / bundle_size; + pack_vars->bundle_first_part[bid] = fparti_fpartj_lparti_lpartj[tasks_packed].x; + pack_vars->bundle_first_task_list[bid] = tasks_packed; + } + + /* Record that we have now done a packing (self) */ + int qid = r->qid; + atomic_dec(&(s->queues[qid].n_packs_pair_left_g)); + t->done = 1; + /* Copies done. Release the lock ! */ + task_unlock(t); + pack_vars->tasks_packed++; + pack_vars->launch = 0; + pack_vars->launch_leftovers = 0; + if ((s->queues[qid].n_packs_pair_left_g == 0)) + pack_vars->launch_leftovers = 1; + if(pack_vars->tasks_packed == pack_vars->target_n_tasks) + pack_vars->launch = 1; + /*Add time to packing_time. Timer for end of GPU work after the if(launch || launch_leftovers statement)*/ + clock_gettime(CLOCK_REALTIME, &t1); + return (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +} + + +void runner_dopair1_pack_f(struct runner *r, struct scheduler *s, struct pack_vars_pair *pack_vars, struct cell *ci, + struct cell *cj, struct task *t, struct part_aos_f *parts_aos, struct engine *e, double *packing_time){ + + /* Timers for how long this all takes. + * t0 and t1 are from start to finish including GPU calcs + * tp0 and tp1 only time packing and unpacking*/ + struct timespec t0, t1; // + clock_gettime(CLOCK_REALTIME, &t0); + int tasks_packed = pack_vars->tasks_packed; //Copy pasted this code again. Issue isn't here + const int tid_tmp = 2 * tasks_packed; + /*shifts for ci*/ + pack_vars->shiftx[tid_tmp] = 0.0; + pack_vars->shifty[tid_tmp] = 0.0; + pack_vars->shiftz[tid_tmp] = 0.0; + /*shifts for cj. 
Stored using strided indexing (stride of two per task)*/ + pack_vars->shiftx[tid_tmp + 1] = 0.0; + pack_vars->shifty[tid_tmp + 1] = 0.0; + pack_vars->shiftz[tid_tmp + 1] = 0.0; + + double x_tmp = 0.0, y_tmp = 0.0, z_tmp = 0.0; + /*Get the shifts in case of periodics*/ + space_getsid_GPU(e->s, &ci, &cj, &x_tmp, &y_tmp, &z_tmp); + + /*Get pointers to the list of tasks and cells packed*/ + pack_vars->task_list[tasks_packed] = t; + pack_vars->ci_list[tasks_packed] = ci; + pack_vars->cj_list[tasks_packed] = cj; + + const double cjx = cj->loc[0]; + const double cjy = cj->loc[1]; + const double cjz = cj->loc[2]; + + /*Correct the shifts for cell i*/ + pack_vars->shiftx[tid_tmp] = x_tmp + cjx; + pack_vars->shifty[tid_tmp] = y_tmp + cjy; + pack_vars->shiftz[tid_tmp] = z_tmp + cjz; + /*Shift for cell j is it's position. Stored using strided indexing (stride of two per task)*/ + pack_vars->shiftx[tid_tmp + 1] = cjx; + pack_vars->shifty[tid_tmp + 1] = cjy; + pack_vars->shiftz[tid_tmp + 1] = cjz; + + const int count_ci = ci->hydro.count; + const int count_cj = cj->hydro.count; + + /*Assign an id for this task*/ + const int tid = tasks_packed; + /*Indexing increment per task is 2 fot these arrays*/ + const int packed_tmp = tasks_packed * 2; + + /* Find first parts in task for ci and cj. Packed_tmp is index for cell i. packed_tmp+1 is index for cell j */ + pack_vars->task_first_part[packed_tmp] = pack_vars->count_parts; + pack_vars->task_first_part[packed_tmp + 1] = pack_vars->count_parts + count_ci; + + int *count_parts = &pack_vars->count_parts; + /* This re-arranges the particle data from cell->hydro->parts into a + long array of part structs*/ + runner_do_ci_cj_gpu_pack_neat_aos_f(r, ci, cj, parts_aos, 0/*timer. 0 no timing, 1 for timing*/, + count_parts, tid, pack_vars->count_max_parts, count_ci, count_cj); + /* Find last parts in task for ci and cj. Packed_tmp is index for cell i. packed_tmp+1 is index for cell j */ + + pack_vars->task_last_part[packed_tmp] = pack_vars->count_parts - count_cj; + pack_vars->task_last_part[packed_tmp + 1] = pack_vars->count_parts; + + /* Tell the cells they have been packed */ + ci->pack_done_f++; + cj->pack_done_f++; + + /* Identify first particle for each bundle of tasks */ + const int bundle_size = pack_vars->bundle_size; + if (tasks_packed % bundle_size == 0) { + int bid = tasks_packed / bundle_size; + pack_vars->bundle_first_part[bid] = pack_vars->task_first_part[packed_tmp]; + pack_vars->bundle_first_task_list[bid] = tasks_packed; + } + + /* Record that we have now done a packing (self) */ + int qid = r->qid; + atomic_dec(&(s->queues[qid].n_packs_pair_left_f)); + t->done = 1; + /* Copies done. Release the lock ! */ + task_unlock(t); + pack_vars->tasks_packed++; + pack_vars->launch = 0; + pack_vars->launch_leftovers = 0; + if ((s->queues[qid].n_packs_pair_left_f == 0)) + pack_vars->launch_leftovers = 1; + if(pack_vars->tasks_packed == pack_vars->target_n_tasks) + pack_vars->launch = 1; + /*Add time to packing_time. Timer for end of GPU work after the if(launch || launch_leftovers statement)*/ + clock_gettime(CLOCK_REALTIME, &t1); + *packing_time += (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +} + +double runner_dopair1_pack_f4_f(struct runner *r, struct scheduler *s, struct pack_vars_pair * restrict pack_vars, struct cell * restrict ci, + struct cell * restrict cj, struct task *t, struct part_aos_f4_f_send *parts_send, struct engine *e, int4 *fparti_fpartj_lparti_lpartj){ + + /* Timers for how long this all takes. 
+ * t0 and t1 are from start to finish including GPU calcs + * tp0 and tp1 only time packing and unpacking*/ + struct timespec t0, t1; // + clock_gettime(CLOCK_REALTIME, &t0); + int tasks_packed = pack_vars->tasks_packed; + + double x_tmp = 0.0, y_tmp = 0.0, z_tmp = 0.0; + /*Get the shifts in case of periodics*/ + space_getsid_GPU(e->s, &ci, &cj, &x_tmp, &y_tmp, &z_tmp); + + /*Get pointers to the list of tasks and cells packed*/ + pack_vars->task_list[tasks_packed] = t; + pack_vars->ci_list[tasks_packed] = ci; + pack_vars->cj_list[tasks_packed] = cj; + + + float3 shift_tmp = {x_tmp, y_tmp, z_tmp}; + + const int count_ci = ci->hydro.count; + const int count_cj = cj->hydro.count; + + /*Assign an id for this task*/ + const int tid = tasks_packed; + + /* Find first parts in task for ci and cj. Packed_tmp is index for cell i. packed_tmp+1 is index for cell j */ +// pack_vars->task_first_part[packed_tmp] = pack_vars->count_parts; +// pack_vars->task_first_part[packed_tmp + 1] = pack_vars->count_parts + count_ci; + + fparti_fpartj_lparti_lpartj[tasks_packed].x = pack_vars->count_parts; + fparti_fpartj_lparti_lpartj[tasks_packed].y = pack_vars->count_parts + count_ci; + + int *count_parts = &pack_vars->count_parts; +// if(r->cpuid == 0)fprintf(stderr, "cpu %i before count %i\n", r->cpuid, pack_vars->count_parts); + /* This re-arranges the particle data from cell->hydro->parts into a + long array of part structs*/ + runner_do_ci_cj_gpu_pack_neat_aos_f4_f(r, ci, cj, parts_send, 0/*timer. 0 no timing, 1 for timing*/, + count_parts, tid, pack_vars->count_max_parts, count_ci, count_cj, shift_tmp); +// runner_doself1_gpu_pack_neat_aos(r, ci, parts_aos, 0/*timer. 0 no timing, 1 for timing*/, +// count_parts, tasks_packed, pack_vars->count_max_parts); //This may cause an issue. + //Be sure to test that pack_vars->count_parts + //is actually increment here + /* Find last parts in task for ci and cj. Packed_tmp is index for cell i. packed_tmp+1 is index for cell j */ + +// if(r->cpuid == 0)fprintf(stderr, "cpu %i after count %i pack_vars_count %i\n", r->cpuid, *count_parts, +// pack_vars->count_parts); + fparti_fpartj_lparti_lpartj[tasks_packed].z = pack_vars->count_parts - count_cj; + fparti_fpartj_lparti_lpartj[tasks_packed].w = pack_vars->count_parts; +// pack_vars->task_last_part[packed_tmp] = pack_vars->count_parts - count_cj; +// pack_vars->task_last_part[packed_tmp + 1] = pack_vars->count_parts; + + /* Tell the cells they have been packed */ + ci->pack_done_f++; + cj->pack_done_f++; + + /* Identify first particle for each bundle of tasks */ + const int bundle_size = pack_vars->bundle_size; + if (tasks_packed % bundle_size == 0) { + int bid = tasks_packed / bundle_size; + pack_vars->bundle_first_part[bid] = fparti_fpartj_lparti_lpartj[tasks_packed].x; + pack_vars->bundle_first_task_list[bid] = tasks_packed; + } + + /* Record that we have now done a packing (self) */ + int qid = r->qid; + atomic_dec(&(s->queues[qid].n_packs_pair_left_f)); + t->done = 1; + /* Copies done. Release the lock ! */ + task_unlock(t); + pack_vars->tasks_packed++; + pack_vars->launch = 0; + pack_vars->launch_leftovers = 0; + if ((s->queues[qid].n_packs_pair_left_f == 0)) + pack_vars->launch_leftovers = 1; + if(pack_vars->tasks_packed == pack_vars->target_n_tasks) + pack_vars->launch = 1; + /*Add time to packing_time. 
Timer for end of GPU work after the if(launch || launch_leftovers statement)*/ + clock_gettime(CLOCK_REALTIME, &t1); + return (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +} + +void runner_doself1_launch(struct runner *r, struct scheduler *s, struct pack_vars_self *pack_vars, struct cell *ci, + struct task *t, struct part_aos *parts_aos, struct part_aos *d_parts_aos, cudaStream_t * stream, float d_a, float d_H, + struct engine *e, double *packing_time, double *gpu_time, double *hmemcpy_time){ + + struct timespec t0, t1, t0hmemcpy, t1hmemcpy, tp0, tp1; // + clock_gettime(CLOCK_REALTIME, &t0); + + /* Identify the number of GPU bundles to run in ideal case*/ + int nBundles_temp = pack_vars->nBundles; + + /*How many tasks have we packed?*/ + const int tasks_packed = pack_vars->tasks_packed; + + /*How many tasks should be in a bundle?*/ + const int bundle_size = pack_vars->bundle_size; + + /* Special case for incomplete bundles (when having leftover tasks not enough to fill a bundle) */ + if (pack_vars->launch_leftovers) { + nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; + if(tasks_packed == 0) error("zero tasks packed but somehow got into GPU loop"); + pack_vars->bundle_first_part[nBundles_temp] = pack_vars->task_first_part[tasks_packed - 1]; + } + /* Identify the last particle for each bundle of tasks */ + for (int bid = 0; bid < nBundles_temp - 1; bid++) { + pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; + } + /* special treatment for the last bundle */ + if(nBundles_temp > 1)pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; + else pack_vars->bundle_last_part[0] = pack_vars->count_parts; + clock_gettime(CLOCK_REALTIME, &t0hmemcpy); + /*Copy arrays containing first and last part for each task to GPU*/ + cudaMemcpy(pack_vars->d_task_first_part, pack_vars->task_first_part, + tasks_packed * sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(pack_vars->d_task_last_part, pack_vars->task_last_part, + tasks_packed * sizeof(int), cudaMemcpyHostToDevice); + + /*Copy cell shifts to device*/ + cudaMemcpy(pack_vars->d_cellx, pack_vars->cellx, + tasks_packed * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(pack_vars->d_celly, pack_vars->celly, + tasks_packed * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(pack_vars->d_cellz, pack_vars->cellz, + tasks_packed * sizeof(double), cudaMemcpyHostToDevice); + clock_gettime(CLOCK_REALTIME, &t1hmemcpy); + *hmemcpy_time += (t1hmemcpy.tv_sec - t0hmemcpy.tv_sec) + + (t1hmemcpy.tv_nsec - t0hmemcpy.tv_nsec) / 1000000000.0; + /* Launch the copies for each bundle and run the GPU kernel */ + /*We don't go into this loop if tasks_left_self == 1 as + nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ + int max_parts = 0; + for (int bid = 0; bid < nBundles_temp; bid++) { + + max_parts = 0; + int parts_in_bundle = 0; + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; + tid++) { + if (tid < tasks_packed) { + /*Get an estimate for the max number of parts per cell in the bundle. 
+ * Used for determining the number of GPU CUDA blocks*/ + int count = pack_vars->task_last_part[tid] - pack_vars->task_first_part[tid]; + parts_in_bundle += count; + max_parts = max(max_parts, count); + } + } + + const int first_part_tmp = pack_vars->bundle_first_part[bid]; + const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp; + + cudaMemcpyAsync(&d_parts_aos[first_part_tmp], &parts_aos[first_part_tmp], + bundle_n_parts * sizeof(struct part_aos), cudaMemcpyHostToDevice, stream[bid]); + +//#ifdef CUDA_DEBUG +// cudaError_t cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // +// // Get error code +// if (cu_error != cudaSuccess) { +// fprintf( +// stderr, +// "CUDA error in density self host 2 device memcpy: %s cpuid id is: %i\n ", +// cudaGetErrorString(cu_error), r->cpuid); +// exit(0); +// } +//#endif + const int tasksperbundle = pack_vars->tasksperbundle; + int tasks_left = tasksperbundle; + if (bid == nBundles_temp - 1) { + tasks_left = + tasks_packed - (nBundles_temp - 1) * tasksperbundle; + } + // Will launch a 2d grid of GPU thread blocks (number of tasks is + // the y dimension and max_parts is the x dimension + int numBlocks_y = tasks_left; + int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + int bundle_first_task = pack_vars->bundle_first_task_list[bid]; + const char *loop_type = "density"; +// fprintf(stderr, "Launching kernel with %i tasks leftovers %i\n", +// tasks_packed, pack_vars->launch_leftovers); + // Launch the kernel + launch_density_aos( + d_parts_aos, pack_vars->d_task_first_part, pack_vars->d_task_last_part, d_a, d_H, loop_type, + stream[bid], BLOCK_SIZE, tasks_packed, tasksperbundle, + numBlocks_x, numBlocks_y, bundle_first_task, + max_parts, pack_vars->d_cellx, pack_vars->d_celly, pack_vars->d_cellz); +//#ifdef CUDA_DEBUG +// cu_error = cudaPeekAtLastError(); // Get error code +// if (cu_error != cudaSuccess) { +// fprintf(stderr, +// "CUDA error with self density kernel launch: %s cpuid id is: %i\n ", +// cudaGetErrorString(cu_error), r->cpuid); +// exit(0); +// } +//#endif + cudaMemcpyAsync(&parts_aos[first_part_tmp], &d_parts_aos[first_part_tmp], + bundle_n_parts * sizeof(struct part_aos), cudaMemcpyDeviceToHost, stream[bid]); + +//#ifdef CUDA_DEBUG +// cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // +// // Get error code +// if (cu_error != cudaSuccess) { +// fprintf(stderr, +// "CUDA error with self density D2H memcpy: %s cpuid id is: %i\n ", +// cudaGetErrorString(cu_error), r->cpuid); +// error("Something's up with your cuda code"); +// } +//#endif + }/*End of looping over bundles to launch in streams*/ + /* Make sure all the kernels and copies back are finished */ + cudaDeviceSynchronize(); + + /*Time end of GPU work*/ + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + /*Time unpacking*/ + clock_gettime(CLOCK_REALTIME, &tp0); + /* Now copy the data back from the CPU thread-local buffers to the cells */ + /* Pack length counter for use in unpacking */ + int pack_length_unpack=0; + for (int tid = 0; tid < tasks_packed; tid++) { + + struct cell *cii = pack_vars->cell_list[tid]; + struct task *tii = pack_vars->task_list[tid]; + +// struct cell *cii = ci_list_self_dens[tid]; +// struct task *tii = task_list_self_dens[tid]; + + while(cell_locktree(cii)) { + ; /* spin until we acquire the lock */ + } + /* Do the copy */ + runner_doself1_gpu_unpack_neat_aos(r, cii, parts_aos, 0, &pack_length_unpack, tid, pack_vars->count_max_parts, 
e); + + /* Record things for debugging */ + cii->gpu_done++; + + /* Release the lock */ + cell_unlocktree(cii); + + /*schedule my dependencies (Only unpacks really)*/ + enqueue_dependencies(s, tii); + /*Signal sleeping runners*/ + signal_sleeping_runners(s, tii); + + tii->gpu_done = 1; + } + /* Zero counters for the next pack operations */ + pack_vars->count_parts = 0; + pack_vars->tasks_packed = 0; + /*Time end of unpacking*/ + clock_gettime(CLOCK_REALTIME, &tp1); + *packing_time += (tp1.tv_sec - tp0.tv_sec) + + (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; +} /*End of GPU work Self*/ + +void runner_doself1_launch_f4(struct runner *r, struct scheduler *s, struct pack_vars_self *pack_vars, struct cell *ci, + struct task *t, struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv, struct part_aos_f4_send *d_parts_send, + struct part_aos_f4_recv *d_parts_recv, cudaStream_t * stream, float d_a, float d_H, + struct engine *e, double *packing_time, double *gpu_time, double *unpack_time, int2 * d_task_first_part_self_dens_f4, + int devId, int2 * task_first_part_f4, int2 * d_task_first_part_f4, cudaEvent_t * self_end){ + + struct timespec t0, t1, tp0, tp1; // + clock_gettime(CLOCK_REALTIME, &t0); + + /* Identify the number of GPU bundles to run in ideal case*/ + int nBundles_temp = pack_vars->nBundles; + + /*How many tasks have we packed?*/ + const int tasks_packed = pack_vars->tasks_packed; + + /*How many tasks should be in a bundle?*/ + const int bundle_size = pack_vars->bundle_size; + + /* Special case for incomplete bundles (when having leftover tasks not enough to fill a bundle) */ + if (pack_vars->launch_leftovers) { + nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; + if(tasks_packed == 0) error("zero tasks packed but somehow got into GPU loop"); +// pack_vars->bundle_first_part[nBundles_temp] = pack_vars->task_first_part[tasks_packed - 1]; + pack_vars->bundle_first_part[nBundles_temp] = task_first_part_f4[tasks_packed - 1].x; + } + /* Identify the last particle for each bundle of tasks */ + for (int bid = 0; bid < nBundles_temp - 1; bid++) { + pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; + } + /* special treatment for the last bundle */ + if(nBundles_temp > 1)pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; + else pack_vars->bundle_last_part[0] = pack_vars->count_parts; +// clock_gettime(CLOCK_REALTIME, &t0hmemcpy); + /*Copy arrays containing first and last part for each task to GPU*/ +// cudaMemcpy(pack_vars->d_task_first_part, pack_vars->task_first_part, +// tasks_packed * sizeof(int), cudaMemcpyHostToDevice); +// cudaMemcpy(pack_vars->d_task_last_part, pack_vars->task_last_part, +// tasks_packed * sizeof(int), cudaMemcpyHostToDevice); +// cudaMemPrefetchAsync(d_task_first_part_self_dens_f4, tasks_packed * sizeof(int2), devId, NULL); + /*Copy cell shifts to device*/ +// cudaMemcpy(pack_vars->d_cellx, pack_vars->cellx, +// tasks_packed * sizeof(double), cudaMemcpyHostToDevice); +// cudaMemcpy(pack_vars->d_celly, pack_vars->celly, +// tasks_packed * sizeof(double), cudaMemcpyHostToDevice); +// cudaMemcpy(pack_vars->d_cellz, pack_vars->cellz, +// tasks_packed * sizeof(double), cudaMemcpyHostToDevice); +// clock_gettime(CLOCK_REALTIME, &t1hmemcpy); +// *hmemcpy_time += (t1hmemcpy.tv_sec - t0hmemcpy.tv_sec) + +// (t1hmemcpy.tv_nsec - t0hmemcpy.tv_nsec) / 1000000000.0; + /* Launch the copies for each bundle and run the GPU kernel */ + /*We don't go into this loop if tasks_left_self == 1 as + nBundles_temp will 
be zero DUHDUHDUHDUHHHHHH!!!!!*/ + int max_parts; + for (int bid = 0; bid < nBundles_temp; bid++) { + + max_parts = 0; + int parts_in_bundle = 0; + const int first_task = bid * bundle_size; + int last_task = (bid + 1) * bundle_size; + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; + tid++) { + if (tid < tasks_packed) { + /*Get an estimate for the max number of parts per cell in the bundle. + * Used for determining the number of GPU CUDA blocks*/ + int count = task_first_part_f4[tid].y - task_first_part_f4[tid].x; + parts_in_bundle += count; + max_parts = max(max_parts, count); + last_task = tid; + } + } +// const int n_tasks = last_task - first_task; + + const int first_part_tmp = pack_vars->bundle_first_part[bid]; + const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp; +// clock_gettime(CLOCK_REALTIME, &t0hmemcpy); +// cudaMemPrefetchAsync(&d_task_first_part_self_dens_f4[first_task], (last_task - first_task) * sizeof(int2), +// devId, stream[bid]); + cudaMemcpyAsync(&d_task_first_part_f4[first_task], &task_first_part_f4[first_task], + (last_task + 1 - first_task) * sizeof(int2), cudaMemcpyHostToDevice, stream[bid]); +// cudaError_t cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // +// if (cu_error != cudaSuccess) { +// fprintf( +// stderr, +// "CUDA error in density self host 2 device memcpy: %s cpuid id is: %i\n ", +// cudaGetErrorString(cu_error), r->cpuid); +// exit(0); +// } +// clock_gettime(CLOCK_REALTIME, &t1hmemcpy); +// *hmemcpy_time += (t1hmemcpy.tv_sec - t0hmemcpy.tv_sec) + +// (t1hmemcpy.tv_nsec - t0hmemcpy.tv_nsec) / 1000000000.0; + cudaMemcpyAsync(&d_parts_send[first_part_tmp], &parts_send[first_part_tmp], + bundle_n_parts * sizeof(struct part_aos_f4_send), cudaMemcpyHostToDevice, stream[bid]); + +//#ifdef CUDA_DEBUG +// cudaError_t cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // +// // Get error code +// if (cu_error != cudaSuccess) { +// fprintf( +// stderr, +// "CUDA error in density self host 2 device memcpy: %s cpuid id is: %i\n ", +// cudaGetErrorString(cu_error), r->cpuid); +// exit(0); +// } +//#endif + const int tasksperbundle = pack_vars->tasksperbundle; + int tasks_left = tasksperbundle; + if (bid == nBundles_temp - 1) { + tasks_left = + tasks_packed - (nBundles_temp - 1) * tasksperbundle; + } + // Will launch a 2d grid of GPU thread blocks (number of tasks is + // the y dimension and max_parts is the x dimension + int numBlocks_y = tasks_left; + int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + int bundle_first_task = pack_vars->bundle_first_task_list[bid]; +// const char *loop_type = "density"; +// struct first_part first_parts; +// for(int i = 0; i < numBlocks_y; i++) first_parts.list[i] = pack_vars->task_first_part[i]; +// fprintf(stderr, "Launching kernel with %i tasks leftovers %i\n", +// tasks_packed, pack_vars->launch_leftovers); + // Launch the kernel + launch_density_aos_f4( + d_parts_send, d_parts_recv, d_a, d_H, stream[bid], numBlocks_x, numBlocks_y, bundle_first_task, + d_task_first_part_f4); +//#ifdef CUDA_DEBUG +// cu_error = cudaPeekAtLastError(); // Get error code +// if (cu_error != cudaSuccess) { +// fprintf(stderr, +// "CUDA error with self density kernel launch: %s cpuid id is: %i\n ", +// cudaGetErrorString(cu_error), r->cpuid); +// exit(0); +// } +//#endif + cudaMemcpyAsync(&parts_recv[first_part_tmp], &d_parts_recv[first_part_tmp], + bundle_n_parts * sizeof(struct part_aos_f4_recv), cudaMemcpyDeviceToHost, stream[bid]); + cudaEventRecord(self_end[bid], 
stream[bid]); +//#ifdef CUDA_DEBUG +// cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // +// // Get error code +// if (cu_error != cudaSuccess) { +// fprintf(stderr, +// "CUDA error with self density D2H memcpy: %s cpuid id is: %i\n ", +// cudaGetErrorString(cu_error), r->cpuid); +// error("Something's up with your cuda code"); +// } +//#endif + }/*End of looping over bundles to launch in streams*/ + /* Make sure all the kernels and copies back are finished */ +// cudaDeviceSynchronize(); + + /*Time end of GPU work*/ + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + + /* Now copy the data back from the CPU thread-local buffers to the cells */ + /* Pack length counter for use in unpacking */ + int pack_length_unpack=0; + for (int bid = 0; bid < nBundles_temp; bid++){ + + clock_gettime(CLOCK_REALTIME, &t0); + +// cudaStreamSynchronize(stream[bid]); + cudaEventSynchronize(self_end[bid]); + + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + + /*Time unpacking*/ +// clock_gettime(CLOCK_REALTIME, &tp0); + + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { + + if(tid < tasks_packed){ + struct cell *cii = pack_vars->cell_list[tid]; + struct task *tii = pack_vars->task_list[tid]; + + // struct cell *cii = ci_list_self_dens[tid]; + // struct task *tii = task_list_self_dens[tid]; + + clock_gettime(CLOCK_REALTIME, &tp0); + +// clock_gettime(CLOCK_REALTIME, &t0hmemcpy); + while(cell_locktree(cii)) { + ; /* spin until we acquire the lock */ + } +// clock_gettime(CLOCK_REALTIME, &t1hmemcpy); +// *hmemcpy_time += (t1hmemcpy.tv_sec - t0hmemcpy.tv_sec) + +// (t1hmemcpy.tv_nsec - t0hmemcpy.tv_nsec) / 1000000000.0; + /* Do the copy */ + runner_doself1_gpu_unpack_neat_aos_f4(r, cii, parts_recv, 0, &pack_length_unpack, tid, pack_vars->count_max_parts, e); + + /* Record things for debugging */ + cii->gpu_done++; + /*Time end of unpacking*/ + clock_gettime(CLOCK_REALTIME, &tp1); + *unpack_time += (tp1.tv_sec - tp0.tv_sec) + + (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; + + /* Release the lock */ + cell_unlocktree(cii); + + /*schedule my dependencies (Only unpacks really)*/ + enqueue_dependencies(s, tii); + /*Signal sleeping runners*/ + signal_sleeping_runners(s, tii); + + tii->gpu_done = 1; + + } + + } + /*Time end of unpacking*/ +// clock_gettime(CLOCK_REALTIME, &tp1); +// *hmemcpy_time += (tp1.tv_sec - tp0.tv_sec) + +// (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; +// *packing_time += (tp1.tv_sec - tp0.tv_sec) + +// (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; + + } + /* Zero counters for the next pack operations */ + pack_vars->count_parts = 0; + pack_vars->tasks_packed = 0; + +} /*End of GPU work Self*/ + +void runner_doself1_launch_g(struct runner *r, struct scheduler *s, struct pack_vars_self *pack_vars, struct cell *ci, + struct task *t, struct part_aos_g *parts_aos, struct part_aos_g *d_parts_aos, cudaStream_t * stream, float d_a, float d_H, + struct engine *e, double *packing_time, double *gpu_time){ + + + struct timespec t0, t1; // + clock_gettime(CLOCK_REALTIME, &t0); + + /* Identify the number of GPU bundles to run in ideal case*/ + int nBundles_temp = pack_vars->nBundles; + + /*How many tasks have we packed?*/ + const int tasks_packed = pack_vars->tasks_packed; + + /*How many tasks should be in a bundle?*/ + const int bundle_size = pack_vars->bundle_size; + + /* Special case for incomplete bundles (when having leftover 
tasks not enough to fill a bundle) */ + if (pack_vars->launch_leftovers) { + nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; + if(tasks_packed == 0) error("zero tasks packed but somehow got into GPU loop"); + pack_vars->bundle_first_part[nBundles_temp] = pack_vars->task_first_part[tasks_packed - 1]; + } + /* Identify the last particle for each bundle of tasks */ + for (int bid = 0; bid < nBundles_temp - 1; bid++) { + pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; + } + /* special treatment for the last bundle */ + if(nBundles_temp > 1)pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; + else pack_vars->bundle_last_part[0] = pack_vars->count_parts; + /*Copy arrays containing first and last part for each task to GPU*/ + cudaMemcpy(pack_vars->d_task_first_part, pack_vars->task_first_part, + tasks_packed * sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(pack_vars->d_task_last_part, pack_vars->task_last_part, + tasks_packed * sizeof(int), cudaMemcpyHostToDevice); + + /*Copy cell shifts to device*/ + cudaMemcpy(pack_vars->d_cellx, pack_vars->cellx, + tasks_packed * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(pack_vars->d_celly, pack_vars->celly, + tasks_packed * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(pack_vars->d_cellz, pack_vars->cellz, + tasks_packed * sizeof(double), cudaMemcpyHostToDevice); + + /* Launch the copies for each bundle and run the GPU kernel */ + /*We don't go into this loop if tasks_left_self == 1 as + nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ + int max_parts = 0; + for (int bid = 0; bid < nBundles_temp; bid++) { + + max_parts = 0; + int parts_in_bundle = 0; + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; + tid++) { + if (tid < tasks_packed) { + /*Get an estimate for the max number of parts per cell in the bundle. 
+ * Used for determining the number of GPU CUDA blocks*/ + int count = pack_vars->task_last_part[tid] - pack_vars->task_first_part[tid]; + parts_in_bundle += count; + max_parts = max(max_parts, count); + } + } + + const int first_part_tmp = pack_vars->bundle_first_part[bid]; + const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp; + + cudaMemcpyAsync(&d_parts_aos[first_part_tmp], &parts_aos[first_part_tmp], + bundle_n_parts * sizeof(struct part_aos_g), cudaMemcpyHostToDevice, stream[bid]); + +#ifdef CUDA_DEBUG + cudaError_t cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf( + stderr, + "CUDA error in density self host 2 device memcpy: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + exit(0); + } +#endif + const int tasksperbundle = pack_vars->tasksperbundle; + int tasks_left = tasksperbundle; + if (bid == nBundles_temp - 1) { + tasks_left = + tasks_packed - (nBundles_temp - 1) * tasksperbundle; + } + // Will launch a 2d grid of GPU thread blocks (number of tasks is + // the y dimension and max_parts is the x dimension + int numBlocks_y = tasks_left; + int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + int bundle_first_task = pack_vars->bundle_first_task_list[bid]; + const char *loop_type = "density"; + // Launch the kernel + launch_gradient_aos( + d_parts_aos, pack_vars->d_task_first_part, pack_vars->d_task_last_part, d_a, d_H, loop_type, + stream[bid], BLOCK_SIZE, tasks_packed, tasksperbundle, + numBlocks_x, numBlocks_y, bundle_first_task, + max_parts, pack_vars->d_cellx, pack_vars->d_celly, pack_vars->d_cellz); +#ifdef CUDA_DEBUG + cu_error = cudaPeekAtLastError(); // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with self density kernel launch: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + exit(0); + } +#endif + cudaMemcpyAsync(&parts_aos[first_part_tmp], &d_parts_aos[first_part_tmp], + bundle_n_parts * sizeof(struct part_aos_g), cudaMemcpyDeviceToHost, stream[bid]); + +#ifdef CUDA_DEBUG + cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with self density D2H memcpy: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + error("Something's up with your cuda code"); + } +#endif + }/*End of looping over bundles to launch in streams*/ + + /* Make sure all the kernels and copies back are finished */ + cudaDeviceSynchronize(); + + /*Time end of GPU work*/ + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + /*Time unpacking*/ + clock_gettime(CLOCK_REALTIME, &t0); + /* Now copy the data back from the CPU thread-local buffers to the cells */ + /* Pack length counter for use in unpacking */ + int pack_length_unpack=0; + for (int tid = 0; tid < tasks_packed; tid++) { + + struct cell *cii = pack_vars->cell_list[tid]; + struct task *tii = pack_vars->task_list[tid]; + +// struct cell *cii = ci_list_self_dens[tid]; +// struct task *tii = task_list_self_dens[tid]; + + while(cell_locktree(cii)) { + ; /* spin until we acquire the lock */ + } + /* Do the copy */ + runner_doself1_gpu_unpack_neat_aos_g(r, cii, parts_aos, 0, &pack_length_unpack, tid, pack_vars->count_max_parts, e); + + /* Record things for debugging */ + cii->gpu_done_g++; + + /* Release the lock */ + cell_unlocktree(cii); + + /*schedule my dependencies (Only unpacks really)*/ + 
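/* A minimal sketch (illustration only, not part of this patch): the elapsed-
 * time bookkeeping repeated throughout these pack/launch functions turns two
 * CLOCK_REALTIME timespec samples into seconds. The helper name
 * timespec_elapsed() is hypothetical; the arithmetic is the same as the
 * packing_time / gpu_time / unpack_time accumulations above and below. */
static inline double timespec_elapsed(const struct timespec t0,
                                      const struct timespec t1) {
  return (double)(t1.tv_sec - t0.tv_sec) +
         (double)(t1.tv_nsec - t0.tv_nsec) / 1000000000.0;
}
/* Usage: clock_gettime(CLOCK_REALTIME, &t0); ... work ...
 *        clock_gettime(CLOCK_REALTIME, &t1);
 *        *packing_time += timespec_elapsed(t0, t1); */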
enqueue_dependencies(s, tii); + /*Signal sleeping runners*/ + signal_sleeping_runners(s, tii); + + tii->gpu_done = 1; + } + /* Zero counters for the next pack operations */ + pack_vars->count_parts = 0; + pack_vars->tasks_packed = 0; + /*Time end of unpacking*/ + clock_gettime(CLOCK_REALTIME, &t1); + *packing_time += (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + } /*End of GPU work Self Gradient*/ + +void runner_doself1_launch_f4_g(struct runner *r, struct scheduler *s, struct pack_vars_self *pack_vars, struct cell *ci, + struct task *t, struct part_aos_f4_g_send *parts_send, struct part_aos_f4_g_recv *parts_recv, struct part_aos_f4_g_send *d_parts_send, + struct part_aos_f4_g_recv *d_parts_recv, cudaStream_t * stream, float d_a, float d_H, + struct engine *e, double *packing_time, double *gpu_time, int2 *task_first_part_f4, int2 *d_task_first_part_f4, + cudaEvent_t *self_end, double *unpack_time){ + + + struct timespec t0, t1, tp0, tp1; + clock_gettime(CLOCK_REALTIME, &t0); + + /* Identify the number of GPU bundles to run in ideal case*/ + int nBundles_temp = pack_vars->nBundles; + + /*How many tasks have we packed?*/ + const int tasks_packed = pack_vars->tasks_packed; + + /*How many tasks should be in a bundle?*/ + const int bundle_size = pack_vars->bundle_size; + + /* Special case for incomplete bundles (when having leftover tasks not enough to fill a bundle) */ + if (pack_vars->launch_leftovers) { + nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; +// if(tasks_packed == 0) error("zero tasks packed but somehow got into GPU loop"); + pack_vars->bundle_first_part[nBundles_temp] = task_first_part_f4[tasks_packed - 1].x; + } + /* Identify the last particle for each bundle of tasks */ + for (int bid = 0; bid < nBundles_temp - 1; bid++) { + pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; + } + /* special treatment for the last bundle */ + if(nBundles_temp > 1)pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; + else pack_vars->bundle_last_part[0] = pack_vars->count_parts; + + /* Launch the copies for each bundle and run the GPU kernel */ + /*We don't go into this loop if tasks_left_self == 1 as + nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ + int max_parts; + for (int bid = 0; bid < nBundles_temp; bid++) { + + max_parts = 0; + int parts_in_bundle = 0; + const int first_task = bid * bundle_size; + int last_task = (bid + 1) * bundle_size; + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; + tid++) { + if (tid < tasks_packed) { + /*Get an estimate for the max number of parts per cell in the bundle. 
+ * Used for determining the number of GPU CUDA blocks*/ + int count = task_first_part_f4[tid].y - task_first_part_f4[tid].x; + parts_in_bundle += count; + max_parts = max(max_parts, count); + last_task = tid; + } + } + + const int first_part_tmp = pack_vars->bundle_first_part[bid]; + const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp; + + cudaMemcpyAsync(&d_task_first_part_f4[first_task], &task_first_part_f4[first_task], + (last_task + 1 - first_task) * sizeof(int2), cudaMemcpyHostToDevice, stream[bid]); + + cudaMemcpyAsync(&d_parts_send[first_part_tmp], &parts_send[first_part_tmp], + bundle_n_parts * sizeof(struct part_aos_f4_g_send), cudaMemcpyHostToDevice, stream[bid]); +// fprintf(stderr, "bid %i first_part %i nparts %i\n", bid, first_part_tmp, bundle_n_parts); + +#ifdef CUDA_DEBUG + cudaError_t cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf( + stderr, + "CUDA error in gradient self host 2 device memcpy: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + exit(0); + } +#endif + const int tasksperbundle = pack_vars->tasksperbundle; + int tasks_left = tasksperbundle; + if (bid == nBundles_temp - 1) { + tasks_left = + tasks_packed - (nBundles_temp - 1) * tasksperbundle; + } + // Will launch a 2d grid of GPU thread blocks (number of tasks is + // the y dimension and max_parts is the x dimension + int numBlocks_y = tasks_left; + int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + int bundle_first_task = pack_vars->bundle_first_task_list[bid]; +// const char *loop_type = "density"; + // Launch the kernel + launch_gradient_aos_f4( + d_parts_send, d_parts_recv, d_a, d_H, stream[bid], numBlocks_x, numBlocks_y, bundle_first_task, + d_task_first_part_f4); +#ifdef CUDA_DEBUG + cu_error = cudaPeekAtLastError(); // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with self gradient kernel launch: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + exit(0); + } +#endif + cudaMemcpyAsync(&parts_recv[first_part_tmp], &d_parts_recv[first_part_tmp], + bundle_n_parts * sizeof(struct part_aos_f4_g_recv), cudaMemcpyDeviceToHost, stream[bid]); + cudaEventRecord(self_end[bid], stream[bid]); + +#ifdef CUDA_DEBUG + cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with self gradient D2H memcpy: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + error("Something's up with your cuda code"); + } +#endif + }/*End of looping over bundles to launch in streams*/ +// exit(0); + /* Make sure all the kernels and copies back are finished */ +// cudaDeviceSynchronize(); + + /*Time end of GPU work*/ + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + + /* Now copy the data back from the CPU thread-local buffers to the cells */ + /* Pack length counter for use in unpacking */ + int pack_length_unpack=0; + for (int bid = 0; bid < nBundles_temp; bid++){ + + clock_gettime(CLOCK_REALTIME, &t0); + +// cudaStreamSynchronize(stream[bid]); + cudaEventSynchronize(self_end[bid]); + + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + + /*Time unpacking*/ +// clock_gettime(CLOCK_REALTIME, &tp0); + + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { + + if(tid < tasks_packed){ + + 
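/* A minimal sketch (illustration only, not part of this patch) of the
 * per-bundle stream/event overlap this loop relies on: each bundle's H2D
 * copy, kernel and D2H copy are queued on its own stream, an event is
 * recorded after the D2H copy, and the host then waits on one event at a
 * time so bundle b can be unpacked while later bundles are still running.
 * Assumed (hypothetical) names: n_bundles, off[], nbytes[], grid[], block,
 * h_buf/d_buf, my_kernel; done[] is assumed created with cudaEventCreate().
 * cudaMemcpyAsync / cudaEventRecord / cudaEventSynchronize are real CUDA
 * runtime calls. */
for (int b = 0; b < n_bundles; b++) {
  cudaMemcpyAsync(d_buf + off[b], h_buf + off[b], nbytes[b],
                  cudaMemcpyHostToDevice, stream[b]);
  my_kernel<<<grid[b], block, 0, stream[b]>>>(d_buf + off[b]);
  cudaMemcpyAsync(h_buf + off[b], d_buf + off[b], nbytes[b],
                  cudaMemcpyDeviceToHost, stream[b]);
  cudaEventRecord(done[b], stream[b]); /* marks the end of bundle b's work */
}
for (int b = 0; b < n_bundles; b++) {
  cudaEventSynchronize(done[b]); /* wait for bundle b only */
  /* unpack bundle b on the host; bundles b+1, ... may still be in flight */
}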
struct cell *cii = pack_vars->cell_list[tid]; + struct task *tii = pack_vars->task_list[tid]; + + // struct cell *cii = ci_list_self_dens[tid]; + // struct task *tii = task_list_self_dens[tid]; + + while(cell_locktree(cii)) { + ; /* spin until we acquire the lock */ + } + /*Time unpacking*/ + clock_gettime(CLOCK_REALTIME, &tp0); + /* Do the copy */ + runner_doself1_gpu_unpack_neat_aos_f4_g(r, cii, parts_recv, 0, &pack_length_unpack, tid, pack_vars->count_max_parts, e); + /*Time end of unpacking*/ + clock_gettime(CLOCK_REALTIME, &tp1); + *unpack_time += (tp1.tv_sec - tp0.tv_sec) + + (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; + + /* Record things for debugging */ + cii->gpu_done_g++; + + /* Release the lock */ + cell_unlocktree(cii); + + /*schedule my dependencies (Only unpacks really)*/ + enqueue_dependencies(s, tii); + /*Signal sleeping runners*/ + signal_sleeping_runners(s, tii); + + tii->gpu_done = 1; + + } + } + /*Time end of unpacking*/ +// clock_gettime(CLOCK_REALTIME, &tp1); +// *unpack_time += (tp1.tv_sec - tp0.tv_sec) + +// (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; +// *packing_time += (tp1.tv_sec - tp0.tv_sec) + +// (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; + + } + /* Zero counters for the next pack operations */ + pack_vars->count_parts = 0; + pack_vars->tasks_packed = 0; + +} /*End of GPU work Self Gradient*/ + + +void runner_doself1_launch_f(struct runner *r, struct scheduler *s, struct pack_vars_self *pack_vars, struct cell *ci, + struct task *t, struct part_aos_f *parts_aos, struct part_aos_f *d_parts_aos, cudaStream_t * stream, float d_a, float d_H, + struct engine *e, double *packing_time, double *gpu_time){ + + struct timespec t0, t1; // + clock_gettime(CLOCK_REALTIME, &t0); + + /* Identify the number of GPU bundles to run in ideal case*/ + int nBundles_temp = pack_vars->nBundles; + + /*How many tasks have we packed?*/ + const int tasks_packed = pack_vars->tasks_packed; + + /*How many tasks should be in a bundle?*/ + const int bundle_size = pack_vars->bundle_size; + + /* Special case for incomplete bundles (when having leftover tasks not enough to fill a bundle) */ + if (pack_vars->launch_leftovers) { + nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; + if(tasks_packed == 0) error("zero tasks packed but somehow got into GPU loop"); + pack_vars->bundle_first_part[nBundles_temp] = pack_vars->task_first_part[tasks_packed - 1]; + } + /* Identify the last particle for each bundle of tasks */ + for (int bid = 0; bid < nBundles_temp - 1; bid++) { + pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; + } + /* special treatment for the last bundle */ + if(nBundles_temp > 1)pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; + else pack_vars->bundle_last_part[0] = pack_vars->count_parts; + /*Copy arrays containing first and last part for each task to GPU*/ + cudaMemcpy(pack_vars->d_task_first_part, pack_vars->task_first_part, + tasks_packed * sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(pack_vars->d_task_last_part, pack_vars->task_last_part, + tasks_packed * sizeof(int), cudaMemcpyHostToDevice); + + /*Copy cell shifts to device*/ + cudaMemcpy(pack_vars->d_cellx, pack_vars->cellx, + tasks_packed * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(pack_vars->d_celly, pack_vars->celly, + tasks_packed * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(pack_vars->d_cellz, pack_vars->cellz, + tasks_packed * sizeof(double), cudaMemcpyHostToDevice); + + /* Launch the copies for each bundle and run the GPU kernel 
*/ + /*We don't go into this loop if tasks_left_self == 1 as + nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ + int max_parts = 0; + for (int bid = 0; bid < nBundles_temp; bid++) { + + max_parts = 0; + int parts_in_bundle = 0; + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; + tid++) { + if (tid < tasks_packed) { + /*Get an estimate for the max number of parts per cell in the bundle. + * Used for determining the number of GPU CUDA blocks*/ + int count = pack_vars->task_last_part[tid] - pack_vars->task_first_part[tid]; + parts_in_bundle += count; + max_parts = max(max_parts, count); + } + } + + const int first_part_tmp = pack_vars->bundle_first_part[bid]; + const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp; + + cudaMemcpyAsync(&d_parts_aos[first_part_tmp], &parts_aos[first_part_tmp], + bundle_n_parts * sizeof(struct part_aos_f), cudaMemcpyHostToDevice, stream[bid]); + +#ifdef CUDA_DEBUG + cudaError_t cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf( + stderr, + "CUDA error in density self host 2 device memcpy: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + exit(0); + } +#endif + const int tasksperbundle = pack_vars->tasksperbundle; + int tasks_left = tasksperbundle; + if (bid == nBundles_temp - 1) { + tasks_left = + tasks_packed - (nBundles_temp - 1) * tasksperbundle; + } + // Will launch a 2d grid of GPU thread blocks (number of tasks is + // the y dimension and max_parts is the x dimension + int numBlocks_y = tasks_left; + int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + int bundle_first_task = pack_vars->bundle_first_task_list[bid]; + const char *loop_type = "density"; + // Launch the kernel + launch_force_aos( + d_parts_aos, pack_vars->d_task_first_part, pack_vars->d_task_last_part, d_a, d_H, loop_type, + stream[bid], BLOCK_SIZE, tasks_packed, tasksperbundle, + numBlocks_x, numBlocks_y, bundle_first_task, + max_parts, pack_vars->d_cellx, pack_vars->d_celly, pack_vars->d_cellz); +#ifdef CUDA_DEBUG + cu_error = cudaPeekAtLastError(); // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with self force kernel launch: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + exit(0); + } +#endif + cudaMemcpyAsync(&parts_aos[first_part_tmp], &d_parts_aos[first_part_tmp], + bundle_n_parts * sizeof(struct part_aos_f), cudaMemcpyDeviceToHost, stream[bid]); + +#ifdef CUDA_DEBUG + cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with self firce D2H memcpy: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + error("Something's up with your cuda code"); + } +#endif + }/*End of looping over bundles to launch in streams*/ + + /* Make sure all the kernels and copies back are finished */ + cudaDeviceSynchronize(); + + /*Time end of GPU work*/ + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + /*Time unpacking*/ + clock_gettime(CLOCK_REALTIME, &t0); + /* Now copy the data back from the CPU thread-local buffers to the cells */ + /* Pack length counter for use in unpacking */ + int pack_length_unpack=0; + for (int tid = 0; tid < tasks_packed; tid++) { + + struct cell *cii = pack_vars->cell_list[tid]; + struct task *tii = pack_vars->task_list[tid]; + +// struct cell *cii = ci_list_self_dens[tid]; +// struct task *tii = 
task_list_self_dens[tid]; + + while(cell_locktree(cii)) { + ; /* spin until we acquire the lock */ + } + /* Do the copy */ + runner_doself1_gpu_unpack_neat_aos_f(r, cii, parts_aos, 0, &pack_length_unpack, tid, pack_vars->count_max_parts, e); + + /* Record things for debugging */ + cii->gpu_done_f++; + + /* Release the lock */ + cell_unlocktree(cii); + + /*schedule my dependencies (Only unpacks really)*/ + enqueue_dependencies(s, tii); + /*Signal sleeping runners*/ + signal_sleeping_runners(s, tii); + + tii->gpu_done = 1; + } + /* Zero counters for the next pack operations */ + pack_vars->count_parts = 0; + pack_vars->tasks_packed = 0; + /*Time end of unpacking*/ + clock_gettime(CLOCK_REALTIME, &t1); + *packing_time += (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + } /*End of GPU work Self Gradient*/ + +void runner_doself1_launch_f4_f(struct runner *r, struct scheduler *s, struct pack_vars_self *pack_vars, struct cell *ci, + struct task *t, struct part_aos_f4_f_send *parts_send, struct part_aos_f4_f_recv *parts_recv, + struct part_aos_f4_f_send *d_parts_send, struct part_aos_f4_f_recv *d_parts_recv, cudaStream_t * stream, float d_a, float d_H, + struct engine *e, double *packing_time, double *gpu_time, int2 *task_first_part_f4_f, int2 *d_task_first_part_f4_f, cudaEvent_t * self_end, + double *unpack_time){ + + struct timespec t0, t1, tp0, tp1; // + clock_gettime(CLOCK_REALTIME, &t0); + + /* Identify the number of GPU bundles to run in ideal case*/ + int nBundles_temp = pack_vars->nBundles; + + /*How many tasks have we packed?*/ + const int tasks_packed = pack_vars->tasks_packed; + + /*How many tasks should be in a bundle?*/ + const int bundle_size = pack_vars->bundle_size; + + /* Special case for incomplete bundles (when having leftover tasks not enough to fill a bundle) */ + if (pack_vars->launch_leftovers) { + nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; + if(tasks_packed == 0) error("zero tasks packed but somehow got into GPU loop"); + pack_vars->bundle_first_part[nBundles_temp] = task_first_part_f4_f[tasks_packed - 1].x; + } + /* Identify the last particle for each bundle of tasks */ + for (int bid = 0; bid < nBundles_temp - 1; bid++) { + pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; + } + /* special treatment for the last bundle */ + if(nBundles_temp > 1)pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; + else pack_vars->bundle_last_part[0] = pack_vars->count_parts; + /*Copy arrays containing first and last part for each task to GPU*/ +// cudaMemcpy(pack_vars->d_task_first_part, pack_vars->task_first_part, +// tasks_packed * sizeof(int), cudaMemcpyHostToDevice); +// cudaMemcpy(pack_vars->d_task_last_part, pack_vars->task_last_part, +// tasks_packed * sizeof(int), cudaMemcpyHostToDevice); + + /*Copy cell shifts to device*/ +// cudaMemcpy(pack_vars->d_cellx, pack_vars->cellx, +// tasks_packed * sizeof(double), cudaMemcpyHostToDevice); +// cudaMemcpy(pack_vars->d_celly, pack_vars->celly, +// tasks_packed * sizeof(double), cudaMemcpyHostToDevice); +// cudaMemcpy(pack_vars->d_cellz, pack_vars->cellz, +// tasks_packed * sizeof(double), cudaMemcpyHostToDevice); + + /* Launch the copies for each bundle and run the GPU kernel */ + /*We don't go into this loop if tasks_left_self == 1 as + nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ + int max_parts = 0; + for (int bid = 0; bid < nBundles_temp; bid++) { + + max_parts = 0; + int parts_in_bundle = 0; + const int first_task = bid * bundle_size; + 
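/* A minimal sketch (illustration only, not part of this patch) of the grid
 * sizing computed from the bundle scan below: the x dimension uses ceiling
 * division so the largest cell in the bundle is fully covered by
 * BLOCK_SIZE-wide thread blocks, and the y dimension is one row of blocks
 * per task in the bundle (tasks_left). The helper name
 * bundle_num_blocks_x() is hypothetical; BLOCK_SIZE comes from BLOCK_SIZE.h. */
static inline int bundle_num_blocks_x(const int max_parts) {
  /* ceil(max_parts / BLOCK_SIZE) without floating point */
  return (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE;
}
/* Example: max_parts = 1000, BLOCK_SIZE = 128 gives
 * (1000 + 127) / 128 = 8 blocks of 128 threads per task row. */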
int last_task = (bid + 1) * bundle_size; + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; + tid++) { + if (tid < tasks_packed) { + /*Get an estimate for the max number of parts per cell in the bundle. + * Used for determining the number of GPU CUDA blocks*/ + int count = task_first_part_f4_f[tid].y - task_first_part_f4_f[tid].x; + parts_in_bundle += count; + max_parts = max(max_parts, count); + last_task = tid; + } + } + + const int first_part_tmp = pack_vars->bundle_first_part[bid]; + const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp; + cudaMemcpyAsync(&d_task_first_part_f4_f[first_task], &task_first_part_f4_f[first_task], + (last_task + 1 - first_task) * sizeof(int2), cudaMemcpyHostToDevice, stream[bid]); + + cudaMemcpyAsync(&d_parts_send[first_part_tmp], &parts_send[first_part_tmp], + bundle_n_parts * sizeof(struct part_aos_f4_f_send), cudaMemcpyHostToDevice, stream[bid]); + +#ifdef CUDA_DEBUG + cudaError_t cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf( + stderr, + "CUDA error in density self host 2 device memcpy: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + exit(0); + } +#endif + const int tasksperbundle = pack_vars->tasksperbundle; + int tasks_left = tasksperbundle; + if (bid == nBundles_temp - 1) { + tasks_left = + tasks_packed - (nBundles_temp - 1) * tasksperbundle; + } + // Will launch a 2d grid of GPU thread blocks (number of tasks is + // the y dimension and max_parts is the x dimension + int numBlocks_y = tasks_left; + int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + int bundle_first_task = pack_vars->bundle_first_task_list[bid]; + // Launch the kernel + launch_force_aos_f4( + d_parts_send, d_parts_recv, d_a, d_H, stream[bid], numBlocks_x, numBlocks_y, bundle_first_task, + d_task_first_part_f4_f); +#ifdef CUDA_DEBUG + cu_error = cudaPeekAtLastError(); // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with self force kernel launch: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + exit(0); + } +#endif + cudaMemcpyAsync(&parts_recv[first_part_tmp], &d_parts_recv[first_part_tmp], + bundle_n_parts * sizeof(struct part_aos_f4_f_recv), cudaMemcpyDeviceToHost, stream[bid]); + cudaEventRecord(self_end[bid], stream[bid]); + +#ifdef CUDA_DEBUG + cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with self firce D2H memcpy: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + error("Something's up with your cuda code"); + } +#endif + }/*End of looping over bundles to launch in streams*/ + + /* Make sure all the kernels and copies back are finished */ +// cudaDeviceSynchronize(); + + /*Time end of GPU work*/ + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + + /* Now copy the data back from the CPU thread-local buffers to the cells */ + /* Pack length counter for use in unpacking */ + int pack_length_unpack=0; + for (int bid = 0; bid < nBundles_temp; bid++){ + + clock_gettime(CLOCK_REALTIME, &t0); + +// cudaStreamSynchronize(stream[bid]); + cudaEventSynchronize(self_end[bid]); + + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + + /*Time unpacking*/ +// clock_gettime(CLOCK_REALTIME, &tp0); + + for (int tid = bid * bundle_size; 
tid < (bid + 1) * bundle_size; tid++) {
+
+      if(tid < tasks_packed){
+        struct cell *cii = pack_vars->cell_list[tid];
+        struct task *tii = pack_vars->task_list[tid];
+
+        //      struct cell *cii = ci_list_self_dens[tid];
+        //      struct task *tii = task_list_self_dens[tid];
+
+        while(cell_locktree(cii)) {
+          ; /* spin until we acquire the lock */
+        }
+        clock_gettime(CLOCK_REALTIME, &tp0);
+        /* Do the copy */
+        runner_doself1_gpu_unpack_neat_aos_f4_f(r, cii, parts_recv, 0, &pack_length_unpack, tid, pack_vars->count_max_parts, e);
+
+        /* Record things for debugging */
+        cii->gpu_done_f++;
+        clock_gettime(CLOCK_REALTIME, &tp1);
+        *unpack_time += (tp1.tv_sec - tp0.tv_sec) +
+                        (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0;
+        /* Release the lock */
+        cell_unlocktree(cii);
+
+        /*schedule my dependencies (Only unpacks really)*/
+        enqueue_dependencies(s, tii);
+        /*Signal sleeping runners*/
+        signal_sleeping_runners(s, tii);
+
+        tii->gpu_done = 1;
+      }
+    }
+    /*Time end of unpacking*/
+//    clock_gettime(CLOCK_REALTIME, &tp1);
+//    *unpack_time += (tp1.tv_sec - tp0.tv_sec) +
+//                    (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0;
+  }
+
+  /* Zero counters for the next pack operations */
+  pack_vars->count_parts = 0;
+  pack_vars->tasks_packed = 0;
+  } /*End of GPU work Self Force*/
+
+void runner_dopair1_launch(struct runner *r, struct scheduler *s, struct pack_vars_pair *pack_vars, struct cell *ci,
+    struct task *t, struct part_aos *parts_aos, struct part_aos *d_parts_aos, cudaStream_t * stream, float d_a, float d_H,
+    struct engine *e, double *packing_time, double *gpu_time){
+
+  struct timespec t0, t1; //
+  clock_gettime(CLOCK_REALTIME, &t0);
+
+  /* Identify the number of GPU bundles to run in ideal case*/
+  int nBundles_temp = pack_vars->nBundles;
+  /*How many tasks have we packed?*/
+  const int tasks_packed = pack_vars->tasks_packed;
+
+  /*How many tasks should be in a bundle?*/
+  const int bundle_size = pack_vars->bundle_size;
+
+  /* tasks_packed was already incremented in runner_dopair1_pack, so subtract one before computing packed_tmp */
+  const int packed_tmp = 2 * (tasks_packed - 1);
+
+  /* Special case for incomplete bundles (when having leftover tasks not enough to fill a bundle) */
+  if (pack_vars->launch_leftovers) {
+    nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size;
+    if(tasks_packed == 0) error("zero pair tasks packed but somehow got into GPU loop");
+    pack_vars->bundle_first_part[nBundles_temp] = pack_vars->task_first_part[packed_tmp - 2];
+  }
+  /* Identify the last particle for each bundle of tasks */
+  for (int bid = 0; bid < nBundles_temp - 1; bid++) {
+    pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1];
+  }
+  /* special treatment for the last bundle */
+  if(nBundles_temp > 1)pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts;
+  else pack_vars->bundle_last_part[0] = pack_vars->count_parts;
+  /*Copy arrays containing first and last part for each task to GPU*/
+  cudaMemcpy(pack_vars->d_task_first_part, pack_vars->task_first_part,
+             2 * tasks_packed * sizeof(int), cudaMemcpyHostToDevice);
+  cudaMemcpy(pack_vars->d_task_last_part, pack_vars->task_last_part,
+             2 * tasks_packed * sizeof(int), cudaMemcpyHostToDevice);
+
+  /*Copy cell shifts to device*/
+//  cudaMemcpy(pack_vars->d_shiftx, pack_vars->shiftx,
+//             2 * tasks_packed * sizeof(double), cudaMemcpyHostToDevice);
+//  cudaMemcpy(pack_vars->d_shifty, pack_vars->shifty,
+//             2 * tasks_packed * sizeof(double), cudaMemcpyHostToDevice);
+//  cudaMemcpy(pack_vars->d_shiftz, pack_vars->shiftz,
+//             2 * 
tasks_packed * sizeof(double), cudaMemcpyHostToDevice); + + /* Launch the copies for each bundle and run the GPU kernel */ + /*We don't go into this loop if tasks_left_self == 1 as + nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ + for (int bid = 0; bid < nBundles_temp; bid++) { + + int max_parts_i = 0; + int max_parts_j = 0; + int parts_in_bundle_ci = 0; + int parts_in_bundle_cj = 0; + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; + tid++) { + if (tid < tasks_packed) { + /*Get an estimate for the max number of parts per cell in each bundle. + * Used for determining the number of GPU CUDA blocks*/ + const int tid_tmp = 2 * tid; + int count_i = pack_vars->task_last_part[tid_tmp] + - pack_vars->task_first_part[tid_tmp]; + parts_in_bundle_ci += count_i; + max_parts_i = max(max_parts_i, count_i); + int count_j = pack_vars->task_last_part[tid_tmp + 1] + - pack_vars->task_first_part[tid_tmp + 1]; + parts_in_bundle_cj += count_j; + max_parts_j = max(max_parts_j, count_j); + } + } + const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; + const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp_i; + cudaMemcpyAsync(&d_parts_aos[first_part_tmp_i], &parts_aos[first_part_tmp_i], + bundle_n_parts * sizeof(struct part_aos), cudaMemcpyHostToDevice, stream[bid]); + +//#ifdef CUDA_DEBUG +// cudaError_t cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // +// // Get error code +// if (cu_error != cudaSuccess) { +// fprintf(stderr, +// "CUDA error with pair density H2D async memcpy ci: %s cpuid id is: %i\n ", +// cudaGetErrorString(cu_error), r->cpuid); +// error("Something's up with your cuda code"); +// } +//#endif + + const int tasksperbundle = pack_vars->tasksperbundle; + /* LAUNCH THE GPU KERNELS for ci & cj */ + int tid = 0; + int offset = bid * tasksperbundle; + int tasks_left = tasksperbundle; + if (bid == nBundles_temp - 1) { + tasks_left = + tasks_packed - (nBundles_temp - 1) * tasksperbundle; + } + + // Setup 2d grid of GPU thread blocks for ci (number of tasks is + // the y dimension and max_parts is the x dimension + int numBlocks_y = tasks_left; + int bundle_first_task = pack_vars->bundle_first_task_list[bid]; + const char *loop_type = "density"; + + /* Launch the kernel for ci using data for ci and cj */ + runner_dopairci_branch_density_gpu_aos(d_parts_aos, + pack_vars->d_task_first_part, pack_vars->d_task_last_part, + d_a, d_H, loop_type, stream[bid], bid, BLOCK_SIZE, + tasks_packed, tasksperbundle, max_parts_i, max_parts_j, numBlocks_y, tid, + offset, bundle_first_task, pack_vars->d_shiftx, pack_vars->d_shifty, pack_vars->d_shiftz); + + /* Launch the kernel for ci using data for ci and cj */ + runner_dopaircj_branch_density_gpu_aos(d_parts_aos, + pack_vars->d_task_first_part, pack_vars->d_task_last_part, + d_a, d_H, loop_type, stream[bid], bid, BLOCK_SIZE, + tasks_packed, tasksperbundle, max_parts_i, max_parts_j, numBlocks_y, tid, + offset, bundle_first_task, pack_vars->d_shiftx, pack_vars->d_shifty, pack_vars->d_shiftz); + +//#ifdef CUDA_DEBUG +// cu_error = cudaPeekAtLastError(); // Get error code +// if (cu_error != cudaSuccess) { +// fprintf(stderr, +// "CUDA error with self density kernel launch: %s cpuid id is: %i\n ", +// cudaGetErrorString(cu_error), r->cpuid); +// exit(0); +// } +//#endif + + // Copy results back to CPU BUFFERS + cudaMemcpyAsync(&parts_aos[first_part_tmp_i], &d_parts_aos[first_part_tmp_i], + bundle_n_parts * sizeof(struct part_aos), cudaMemcpyDeviceToHost, stream[bid]); + +//#ifdef CUDA_DEBUG +// cu_error 
= cudaPeekAtLastError(); // cudaGetLastError(); // +// // Get error code +// if (cu_error != cudaSuccess) { +// fprintf(stderr, +// "CUDA error with self density D2H memcpy: %s cpuid id is: %i\n ", +// cudaGetErrorString(cu_error), r->cpuid); +// error("Something's up with your cuda code"); +// } +//#endif + }/*End of looping over bundles to launch in streams*/ + + /* Make sure all the kernels and copies back are finished */ + cudaDeviceSynchronize(); + + /*Time end of GPU work*/ + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + /*Time unpacking*/ + clock_gettime(CLOCK_REALTIME, &t0); + /* Now copy the data back from the CPU thread-local buffers to the cells */ + /* Pack length counter for use in unpacking */ + int pack_length_unpack=0; + for (int tid = 0; tid < tasks_packed; tid++) { + + /*grab cell and task pointers*/ + struct cell *cii = pack_vars->ci_list[tid]; + struct cell *cjj = pack_vars->cj_list[tid]; + struct task *tii = pack_vars->task_list[tid]; + + /*Let's lock ci*/ + while(cell_locktree(cii)) { + ; /* spin until we acquire the lock */ + } + /*Let's lock cj*/ + while(cell_locktree(cjj)) { + ; /* spin until we acquire the lock */ + } + /* Do the copy */ + runner_do_ci_cj_gpu_unpack_neat_aos(r, cii, cjj, parts_aos, 0, + &pack_length_unpack, tid, 2 * pack_vars->count_max_parts, e); + + /* Record things for debugging */ + cii->gpu_done_pair++; + cjj->gpu_done_pair++; + + tii->gpu_done = 1; + + /*schedule my dependencies (Only unpacks really)*/ + enqueue_dependencies(s, tii); + /*Signal sleeping runners*/ + signal_sleeping_runners(s, tii); + + /* Release the locks */ + cell_unlocktree(cii); + /* Release the locks */ + cell_unlocktree(cjj); + } + /* Zero counters for the next pack operations */ + pack_vars->count_parts = 0; + pack_vars->tasks_packed = 0; + /*Time end of unpacking*/ + clock_gettime(CLOCK_REALTIME, &t1); + *packing_time += (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + } /*End of GPU work*/ + +void runner_dopair1_launch_f4(struct runner *r, struct scheduler *s, struct pack_vars_pair *pack_vars, + struct task *t, struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv, struct part_aos_f4_send *d_parts_send, + struct part_aos_f4_recv *d_parts_recv, cudaStream_t * stream, float d_a, float d_H, + struct engine *e, double *packing_time, double *gpu_time, double *unpack_time, int4 *fparti_fpartj_lparti_lpartj_dens, + int4 *d_fparti_fpartj_lparti_lpartj_dens, cudaEvent_t * pair_end){ + + struct timespec t0, t1; // + clock_gettime(CLOCK_REALTIME, &t0); + + /* Identify the number of GPU bundles to run in ideal case*/ + int nBundles_temp = pack_vars->nBundles; + /*How many tasks have we packed?*/ + const int tasks_packed = pack_vars->tasks_packed; + + /*How many tasks should be in a bundle?*/ + const int bundle_size = pack_vars->bundle_size; + + /* Special case for incomplete bundles (when having leftover tasks not enough to fill a bundle) */ + if (pack_vars->launch_leftovers) { + nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; + if(tasks_packed == 0) error("zero pair tasks packed but somehow got into GPU loop"); +// pack_vars->bundle_first_part[nBundles_temp] = pack_vars->task_first_part[packed_tmp - 2]; + pack_vars->bundle_first_part[nBundles_temp] = fparti_fpartj_lparti_lpartj_dens[tasks_packed - 1].x; + } + /* Identify the last particle for each bundle of tasks */ + for (int bid = 0; bid < nBundles_temp - 1; bid++) { + 
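+    /* Note (descriptive): the packed particle buffer is laid out contiguously
+     * bundle after bundle, so the last particle of bundle bid is simply the
+     * first particle of bundle bid + 1; the final bundle is closed off with
+     * pack_vars->count_parts just below. */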
pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; + } + /* special treatment for the last bundle */ + if(nBundles_temp > 1)pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; + else pack_vars->bundle_last_part[0] = pack_vars->count_parts; + + /* Launch the copies for each bundle and run the GPU kernel */ + /*We don't go into this loop if tasks_left_self == 1 as + nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ + for (int bid = 0; bid < nBundles_temp; bid++) { + + int max_parts_i = 0; + int max_parts_j = 0; + int parts_in_bundle_ci = 0; + int parts_in_bundle_cj = 0; + const int first_task = bid * pack_vars->bundle_size; + int last_task = (bid + 1) * bundle_size; + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; + tid++) { + if (tid < tasks_packed) { + /*Get an estimate for the max number of parts per cell in each bundle. + * Used for determining the number of GPU CUDA blocks*/ + int count_i = fparti_fpartj_lparti_lpartj_dens[tid].z + - fparti_fpartj_lparti_lpartj_dens[tid].x; + parts_in_bundle_ci += count_i; + max_parts_i = max(max_parts_i, count_i); + int count_j = fparti_fpartj_lparti_lpartj_dens[tid].w + - fparti_fpartj_lparti_lpartj_dens[tid].y; + parts_in_bundle_cj += count_j; + max_parts_j = max(max_parts_j, count_j); + + last_task = tid; + } + } + const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; + const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp_i; + + cudaMemcpyAsync(&d_fparti_fpartj_lparti_lpartj_dens[first_task], &fparti_fpartj_lparti_lpartj_dens[first_task], + (last_task + 1 - first_task) * sizeof(int4), cudaMemcpyHostToDevice, stream[bid]); + + cudaMemcpyAsync(&d_parts_send[first_part_tmp_i], &parts_send[first_part_tmp_i], + bundle_n_parts * sizeof(struct part_aos_f4_send), cudaMemcpyHostToDevice, stream[bid]); + +#ifdef CUDA_DEBUG + cudaError_t cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with pair density H2D async memcpy ci: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + error("Something's up with your cuda code"); + } +#endif + + const int tasksperbundle = pack_vars->tasksperbundle; + /* LAUNCH THE GPU KERNELS for ci & cj */ + int tasks_left = tasksperbundle; + if (bid == nBundles_temp - 1) { + tasks_left = + tasks_packed - (nBundles_temp - 1) * tasksperbundle; + } + + // Setup 2d grid of GPU thread blocks for ci (number of tasks is + // the y dimension and max_parts is the x dimension + int numBlocks_y = tasks_left; + int numBlocks_x = (max_parts_i + BLOCK_SIZE - 1) / BLOCK_SIZE; + int bundle_first_task = pack_vars->bundle_first_task_list[bid]; + + /* Launch the kernel for ci using data for ci and cj */ + runner_dopairci_branch_density_gpu_aos_f4(d_parts_send, d_parts_recv, + d_a, d_H, stream[bid], numBlocks_x, numBlocks_y, bundle_first_task, d_fparti_fpartj_lparti_lpartj_dens); + + numBlocks_x = (max_parts_j + BLOCK_SIZE - 1) / BLOCK_SIZE; + + /* Launch the kernel for ci using data for ci and cj */ + runner_dopaircj_branch_density_gpu_aos_f4(d_parts_send, d_parts_recv, + d_a, d_H, stream[bid], numBlocks_x, numBlocks_y, bundle_first_task, d_fparti_fpartj_lparti_lpartj_dens); + +#ifdef CUDA_DEBUG + cu_error = cudaPeekAtLastError(); // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with pair density kernel launch: %s cpuid id is: %i\n " + "nbx %i nby %i max_parts_i %i max_parts_j %i\n", + 
cudaGetErrorString(cu_error), r->cpuid, numBlocks_x, numBlocks_y, max_parts_i, max_parts_j); + exit(0); + } +#endif + + // Copy results back to CPU BUFFERS + cudaMemcpyAsync(&parts_recv[first_part_tmp_i], &d_parts_recv[first_part_tmp_i], + bundle_n_parts * sizeof(struct part_aos_f4_recv), cudaMemcpyDeviceToHost, stream[bid]); + +#ifdef CUDA_DEBUG + cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with self density D2H memcpy: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + error("Something's up with your cuda code"); + } +#endif + }/*End of looping over bundles to launch in streams*/ + + /* Make sure all the kernels and copies back are finished */ + cudaDeviceSynchronize(); + + /*Time end of GPU work*/ + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + /*Time unpacking*/ + clock_gettime(CLOCK_REALTIME, &t0); + /* Now copy the data back from the CPU thread-local buffers to the cells */ + /* Pack length counter for use in unpacking */ + int pack_length_unpack=0; + for (int tid = 0; tid < tasks_packed; tid++) { + + /*grab cell and task pointers*/ + struct cell *cii = pack_vars->ci_list[tid]; + struct cell *cjj = pack_vars->cj_list[tid]; + struct task *tii = pack_vars->task_list[tid]; + + /*Let's lock ci*/ + while(cell_locktree(cii)) { + ; /* spin until we acquire the lock */ + } + /*Let's lock cj*/ + while(cell_locktree(cjj)) { + ; /* spin until we acquire the lock */ + } + /* Do the copy */ + runner_do_ci_cj_gpu_unpack_neat_aos_f4(r, cii, cjj, parts_recv, 0, + &pack_length_unpack, tid, 2 * pack_vars->count_max_parts, e); + + /* Record things for debugging */ + cii->gpu_done_pair++; + cjj->gpu_done_pair++; + + tii->gpu_done = 1; + + /*schedule my dependencies (Only unpacks really)*/ + enqueue_dependencies(s, tii); + /*Signal sleeping runners*/ + signal_sleeping_runners(s, tii); + + /* Release the locks */ + cell_unlocktree(cii); + /* Release the locks */ + cell_unlocktree(cjj); + } + /* Zero counters for the next pack operations */ + pack_vars->count_parts = 0; + pack_vars->tasks_packed = 0; + /*Time end of unpacking*/ + clock_gettime(CLOCK_REALTIME, &t1); + *packing_time += (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + } /*End of GPU work*/ +void runner_dopair1_launch_f4_one_memcpy(struct runner *r, struct scheduler *s, struct pack_vars_pair *pack_vars, + struct task *t, struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv, struct part_aos_f4_send *d_parts_send, + struct part_aos_f4_recv *d_parts_recv, cudaStream_t * stream, float d_a, float d_H, + struct engine *e, double *packing_time, double *gpu_time, double *unpack_time, int4 *fparti_fpartj_lparti_lpartj_dens, + cudaEvent_t * pair_end){ + + struct timespec t0, t1, tp0, tp1; // + clock_gettime(CLOCK_REALTIME, &t0); + + /* Identify the number of GPU bundles to run in ideal case*/ + int nBundles_temp = pack_vars->nBundles; + /*How many tasks have we packed?*/ + const int tasks_packed = pack_vars->tasks_packed; + + /*How many tasks should be in a bundle?*/ + const int bundle_size = pack_vars->bundle_size; + + /* Special case for incomplete bundles (when having leftover tasks not enough to fill a bundle) */ + if (pack_vars->launch_leftovers) { + nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; + if(tasks_packed == 0) error("zero pair tasks packed but somehow got into GPU loop"); +// 
pack_vars->bundle_first_part[nBundles_temp] = pack_vars->task_first_part[packed_tmp - 2]; + pack_vars->bundle_first_part[nBundles_temp] = fparti_fpartj_lparti_lpartj_dens[tasks_packed - 1].x; + } + /* Identify the last particle for each bundle of tasks */ + for (int bid = 0; bid < nBundles_temp - 1; bid++) { + pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; + } + /* special treatment for the last bundle */ + if(nBundles_temp > 1)pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; + else pack_vars->bundle_last_part[0] = pack_vars->count_parts; + + /* Launch the copies for each bundle and run the GPU kernel */ + /*We don't go into this loop if tasks_left_self == 1 as + nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ + for (int bid = 0; bid < nBundles_temp; bid++) { + + int max_parts_i = 0; + int max_parts_j = 0; + int parts_in_bundle_ci = 0; + int parts_in_bundle_cj = 0; + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; + tid++) { + if (tid < tasks_packed) { + /*Get an estimate for the max number of parts per cell in each bundle. + * Used for determining the number of GPU CUDA blocks*/ + int count_i = fparti_fpartj_lparti_lpartj_dens[tid].z + - fparti_fpartj_lparti_lpartj_dens[tid].x; + parts_in_bundle_ci += count_i; + max_parts_i = max(max_parts_i, count_i); + int count_j = fparti_fpartj_lparti_lpartj_dens[tid].w + - fparti_fpartj_lparti_lpartj_dens[tid].y; + parts_in_bundle_cj += count_j; + max_parts_j = max(max_parts_j, count_j); + } + } + const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; + const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp_i; + + cudaMemcpyAsync(&d_parts_send[first_part_tmp_i], &parts_send[first_part_tmp_i], + bundle_n_parts * sizeof(struct part_aos_f4_send), cudaMemcpyHostToDevice, stream[bid]); + +#ifdef CUDA_DEBUG + cudaError_t cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with pair density H2D async memcpy ci: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + error("Something's up with your cuda code"); + } +#endif + /* LAUNCH THE GPU KERNELS for ci & cj */ + // Setup 2d grid of GPU thread blocks for ci (number of tasks is + // the y dimension and max_parts is the x dimension + int numBlocks_y = 0;//tasks_left; + int numBlocks_x = (bundle_n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + int bundle_part_0 = pack_vars->bundle_first_part[bid]; + /* Launch the kernel for ci using data for ci and cj */ + runner_dopair_branch_density_gpu_aos_f4(d_parts_send, d_parts_recv, + d_a, d_H, stream[bid], numBlocks_x, numBlocks_y, bundle_part_0, bundle_n_parts); + +#ifdef CUDA_DEBUG + cu_error = cudaPeekAtLastError(); // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with pair density kernel launch: %s cpuid id is: %i\n " + "nbx %i nby %i max_parts_i %i max_parts_j %i\n", + cudaGetErrorString(cu_error), r->cpuid, numBlocks_x, numBlocks_y, max_parts_i, max_parts_j); + exit(0); + } +#endif + + // Copy results back to CPU BUFFERS + cudaMemcpyAsync(&parts_recv[first_part_tmp_i], &d_parts_recv[first_part_tmp_i], + bundle_n_parts * sizeof(struct part_aos_f4_recv), cudaMemcpyDeviceToHost, stream[bid]); + cudaEventRecord(pair_end[bid], stream[bid]); + +#ifdef CUDA_DEBUG + cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with self density D2H 
memcpy: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + error("Something's up with your cuda code"); + } +#endif + }/*End of looping over bundles to launch in streams*/ + + /* Make sure all the kernels and copies back are finished */ +// cudaDeviceSynchronize(); + + /*Time end of GPU work*/ + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + /* Now copy the data back from the CPU thread-local buffers to the cells */ + /* Pack length counter for use in unpacking */ + int pack_length_unpack=0; + for (int bid = 0; bid < nBundles_temp; bid++){ + /*Time unpacking*/ + clock_gettime(CLOCK_REALTIME, &t0); + +// cudaStreamSynchronize(stream[bid]); + cudaEventSynchronize(pair_end[bid]); + + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + + /*Time unpacking*/ +// clock_gettime(CLOCK_REALTIME, &tp0); + + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { + + if(tid < tasks_packed){ + clock_gettime(CLOCK_REALTIME, &tp0); + /*grab cell and task pointers*/ + struct cell *cii = pack_vars->ci_list[tid]; + struct cell *cjj = pack_vars->cj_list[tid]; + struct task *tii = pack_vars->task_list[tid]; + + /*Let's lock ci*/ + if(tii->corner_pair == 1)fprintf(stderr, "Corner task\n"); + while(cell_locktree(cii)) { + ; /* spin until we acquire the lock */ + } + /*Let's lock cj*/ + while(cell_locktree(cjj)) { + ; /* spin until we acquire the lock */ + } + /* Do the copy */ + runner_do_ci_cj_gpu_unpack_neat_aos_f4(r, cii, cjj, parts_recv, 0, + &pack_length_unpack, tid, 2 * pack_vars->count_max_parts, e); + + /* Record things for debugging */ + cii->gpu_done_pair++; + cjj->gpu_done_pair++; + + /* Release the locks */ + cell_unlocktree(cii); + /* Release the locks */ + cell_unlocktree(cjj); + + /*Time end of unpacking*/ + clock_gettime(CLOCK_REALTIME, &tp1); + *unpack_time += (tp1.tv_sec - tp0.tv_sec) + + (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; + + /*schedule my dependencies (Only unpacks really)*/ + enqueue_dependencies(s, tii); + /*Signal sleeping runners*/ + signal_sleeping_runners(s, tii); + + tii->gpu_done = 1; + + + } + } + } + /* Zero counters for the next pack operations */ + pack_vars->count_parts = 0; + pack_vars->tasks_packed = 0; +// /*Time end of unpacking*/ +// clock_gettime(CLOCK_REALTIME, &t1); +// *packing_time += (t1.tv_sec - t0.tv_sec) + +// (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + } /*End of GPU work*/ + +void runner_dopair1_launch_f4_mcpy_Ker_mcpy(struct runner *r, struct scheduler *s, struct pack_vars_pair *pack_vars, + struct task *t, struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv, struct part_aos_f4_send *d_parts_send, + struct part_aos_f4_recv *d_parts_recv, cudaStream_t * stream, float d_a, float d_H, + struct engine *e, double *packing_time, double *gpu_time, double *unpack_time, int4 *fparti_fpartj_lparti_lpartj_dens, + cudaEvent_t * pair_end){ + + struct timespec t0, t1, tp0, tp1; // + clock_gettime(CLOCK_REALTIME, &t0); + + /* Identify the number of GPU bundles to run in ideal case*/ + int nBundles_temp = pack_vars->nBundles; + /*How many tasks have we packed?*/ + const int tasks_packed = pack_vars->tasks_packed; + + /*How many tasks should be in a bundle?*/ + const int bundle_size = pack_vars->bundle_size; + + /*tasks-packed needs decrementing before calculating packed_tmp as it was incremented in runner_dopair1_pack*/ +// const int packed_tmp = 2 * (tasks_packed - 
1); + + /* Special case for incomplete bundles (when having leftover tasks not enough to fill a bundle) */ + if (pack_vars->launch_leftovers) { + nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; + if(tasks_packed == 0) error("zero pair tasks packed but somehow got into GPU loop"); +// pack_vars->bundle_first_part[nBundles_temp] = pack_vars->task_first_part[packed_tmp - 2]; + pack_vars->bundle_first_part[nBundles_temp] = fparti_fpartj_lparti_lpartj_dens[tasks_packed - 1].x; + } + /* Identify the last particle for each bundle of tasks */ + for (int bid = 0; bid < nBundles_temp - 1; bid++) { + pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; + } + /* special treatment for the last bundle */ + if(nBundles_temp > 1)pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; + else pack_vars->bundle_last_part[0] = pack_vars->count_parts; + + /* Launch the copies for each bundle and run the GPU kernel */ + /*We don't go into this loop if tasks_left_self == 1 as + nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ +// int max_parts = 0; + for (int bid = 0; bid < nBundles_temp; bid++) { + + int max_parts_i = 0; + int max_parts_j = 0; + int parts_in_bundle_ci = 0; + int parts_in_bundle_cj = 0; + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; + tid++) { + if (tid < tasks_packed) { + /*Get an estimate for the max number of parts per cell in each bundle. + * Used for determining the number of GPU CUDA blocks*/ + int count_i = fparti_fpartj_lparti_lpartj_dens[tid].z + - fparti_fpartj_lparti_lpartj_dens[tid].x; + parts_in_bundle_ci += count_i; + max_parts_i = max(max_parts_i, count_i); + int count_j = fparti_fpartj_lparti_lpartj_dens[tid].w + - fparti_fpartj_lparti_lpartj_dens[tid].y; + parts_in_bundle_cj += count_j; + max_parts_j = max(max_parts_j, count_j); + } + } + const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; + const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp_i; + + cudaMemcpyAsync(&d_parts_send[first_part_tmp_i], &parts_send[first_part_tmp_i], + bundle_n_parts * sizeof(struct part_aos_f4_send), cudaMemcpyHostToDevice, stream[bid]); + +#ifdef CUDA_DEBUG + cudaError_t cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with pair density H2D async memcpy ci: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + error("Something's up with your cuda code"); + } +#endif + } + for (int bid = 0; bid < nBundles_temp; bid++) { + + int max_parts_i = 0; + int max_parts_j = 0; + int parts_in_bundle_ci = 0; + int parts_in_bundle_cj = 0; + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; + tid++) { + if (tid < tasks_packed) { + /*Get an estimate for the max number of parts per cell in each bundle. 
+ * Used for determining the number of GPU CUDA blocks*/ + int count_i = fparti_fpartj_lparti_lpartj_dens[tid].z + - fparti_fpartj_lparti_lpartj_dens[tid].x; + parts_in_bundle_ci += count_i; + max_parts_i = max(max_parts_i, count_i); + int count_j = fparti_fpartj_lparti_lpartj_dens[tid].w + - fparti_fpartj_lparti_lpartj_dens[tid].y; + parts_in_bundle_cj += count_j; + max_parts_j = max(max_parts_j, count_j); + } + } + const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; + const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp_i; +////////////////////////////////// +// const int tasksperbundle = pack_vars->tasksperbundle; + /* LAUNCH THE GPU KERNELS for ci & cj */ + // Setup 2d grid of GPU thread blocks for ci (number of tasks is + // the y dimension and max_parts is the x dimension + int numBlocks_y = 0;//tasks_left; + int numBlocks_x = (bundle_n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + int bundle_part_0 = pack_vars->bundle_first_part[bid]; +// int bundle_first_task = pack_vars->bundle_first_task_list[bid]; +// fprintf(stderr, "bundle_part_0 %i bundle_first_task %i\n", bundle_part_0, bundle_first_task); + + /* Launch the kernel for ci using data for ci and cj */ + runner_dopair_branch_density_gpu_aos_f4(d_parts_send, d_parts_recv, + d_a, d_H, stream[bid], numBlocks_x, numBlocks_y, bundle_part_0, bundle_n_parts); + +#ifdef CUDA_DEBUG + cudaError_t cu_error = cudaPeekAtLastError(); // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with pair density kernel launch: %s cpuid id is: %i\n " + "nbx %i nby %i max_parts_i %i max_parts_j %i\n", + cudaGetErrorString(cu_error), r->cpuid, numBlocks_x, numBlocks_y, max_parts_i, max_parts_j); + exit(0); + } +#endif + } + + for (int bid = 0; bid < nBundles_temp; bid++) { + + int max_parts_i = 0; + int max_parts_j = 0; + int parts_in_bundle_ci = 0; + int parts_in_bundle_cj = 0; + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; + tid++) { + if (tid < tasks_packed) { + /*Get an estimate for the max number of parts per cell in each bundle. 
+ * Used for determining the number of GPU CUDA blocks*/ + int count_i = fparti_fpartj_lparti_lpartj_dens[tid].z + - fparti_fpartj_lparti_lpartj_dens[tid].x; + parts_in_bundle_ci += count_i; + max_parts_i = max(max_parts_i, count_i); + int count_j = fparti_fpartj_lparti_lpartj_dens[tid].w + - fparti_fpartj_lparti_lpartj_dens[tid].y; + parts_in_bundle_cj += count_j; + max_parts_j = max(max_parts_j, count_j); + } + } + const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; + const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp_i; +/////////////////////////////////////////////////////////////////// + // Copy results back to CPU BUFFERS + cudaMemcpyAsync(&parts_recv[first_part_tmp_i], &d_parts_recv[first_part_tmp_i], + bundle_n_parts * sizeof(struct part_aos_f4_recv), cudaMemcpyDeviceToHost, stream[bid]); + cudaEventRecord(pair_end[bid], stream[bid]); + +#ifdef CUDA_DEBUG + cudaError_t cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with self density D2H memcpy: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + error("Something's up with your cuda code"); + } +#endif + }/*End of looping over bundles to launch in streams*/ + + /* Make sure all the kernels and copies back are finished */ +// cudaDeviceSynchronize(); + + /*Time end of GPU work*/ + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + /*Time unpacking*/ +// clock_gettime(CLOCK_REALTIME, &t0); + /* Now copy the data back from the CPU thread-local buffers to the cells */ + /* Pack length counter for use in unpacking */ + int pack_length_unpack=0; + for (int bid = 0; bid < nBundles_temp; bid++){ + clock_gettime(CLOCK_REALTIME, &t0); + +// cudaStreamSynchronize(stream[bid]); + cudaEventSynchronize(pair_end[bid]); + + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + + /*Time unpacking*/ +// clock_gettime(CLOCK_REALTIME, &tp0); +// int bundle_first_task = pack_vars->bundle_first_task_list[bid]; + + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { + + if(tid < tasks_packed){ + + /*grab cell and task pointers*/ + struct cell *cii = pack_vars->ci_list[tid]; + struct cell *cjj = pack_vars->cj_list[tid]; + struct task *tii = pack_vars->task_list[tid]; + + /*Let's lock ci*/ + while(cell_locktree(cii)) { + ; /* spin until we acquire the lock */ + } + /*Let's lock cj*/ + while(cell_locktree(cjj)) { + ; /* spin until we acquire the lock */ + } + /* Do the copy */ + /*Time unpacking*/ + clock_gettime(CLOCK_REALTIME, &tp0); + runner_do_ci_cj_gpu_unpack_neat_aos_f4(r, cii, cjj, parts_recv, 0, + &pack_length_unpack, tid, 2 * pack_vars->count_max_parts, e); + + /* Record things for debugging */ + cii->gpu_done_pair++; + cjj->gpu_done_pair++; + + tii->gpu_done = 1; + /*Time end of unpacking*/ + clock_gettime(CLOCK_REALTIME, &tp1); + *unpack_time += (tp1.tv_sec - tp0.tv_sec) + + (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; + /*schedule my dependencies (Only unpacks really)*/ + enqueue_dependencies(s, tii); + /*Signal sleeping runners*/ + signal_sleeping_runners(s, tii); + + /* Release the locks */ + cell_unlocktree(cii); + /* Release the locks */ + cell_unlocktree(cjj); + + } + } + /*Time end of unpacking*/ +// clock_gettime(CLOCK_REALTIME, &tp1); +// *unpack_time += (tp1.tv_sec - tp0.tv_sec) + +// (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; 
+// *packing_time += (tp1.tv_sec - tp0.tv_sec) + +// (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; +// *packing_time += (tp1.tv_sec - tp0.tv_sec) + +// (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; + } + /* Zero counters for the next pack operations */ + pack_vars->count_parts = 0; + pack_vars->tasks_packed = 0; + /*Time end of unpacking*/ +// clock_gettime(CLOCK_REALTIME, &t1); +// *packing_time += (t1.tv_sec - t0.tv_sec) + +// (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + } /*End of GPU work*/ + +void runner_dopair1_launch_g(struct runner *r, struct scheduler *s, struct pack_vars_pair *pack_vars, struct cell *ci, + struct task *t, struct part_aos_g *parts_aos, struct part_aos_g *d_parts_aos, cudaStream_t * stream, float d_a, float d_H, + struct engine *e, double *packing_time, double *gpu_time){ + + struct timespec t0, t1; // + clock_gettime(CLOCK_REALTIME, &t0); + + /* Identify the number of GPU bundles to run in ideal case*/ + int nBundles_temp = pack_vars->nBundles; + /*How many tasks have we packed?*/ + const int tasks_packed = pack_vars->tasks_packed; + + /*How many tasks should be in a bundle?*/ + const int bundle_size = pack_vars->bundle_size; + + /*tasks-packed needs decrementing before calculating packed_tmp as it was incremented in runner_dopair1_pack*/ + const int packed_tmp = 2 * (tasks_packed - 1); + + /* Special case for incomplete bundles (when having leftover tasks not enough to fill a bundle) */ + if (pack_vars->launch_leftovers) { + nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; + if(tasks_packed == 0) error("zero pair tasks packed but somehow got into GPU loop"); + pack_vars->bundle_first_part[nBundles_temp] = pack_vars->task_first_part[packed_tmp - 2]; + } + /* Identify the last particle for each bundle of tasks */ + for (int bid = 0; bid < nBundles_temp - 1; bid++) { + pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; + } + /* special treatment for the last bundle */ + if(nBundles_temp > 1)pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; + else pack_vars->bundle_last_part[0] = pack_vars->count_parts; + /*Copy arrays containing first and last part for each task to GPU*/ + cudaMemcpy(pack_vars->d_task_first_part, pack_vars->task_first_part, + 2 * tasks_packed * sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(pack_vars->d_task_last_part, pack_vars->task_last_part, + 2 * tasks_packed * sizeof(int), cudaMemcpyHostToDevice); + + /*Copy cell shifts to device*/ + cudaMemcpy(pack_vars->d_shiftx, pack_vars->shiftx, + 2 * tasks_packed * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(pack_vars->d_shifty, pack_vars->shifty, + 2 * tasks_packed * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(pack_vars->d_shiftz, pack_vars->shiftz, + 2 * tasks_packed * sizeof(double), cudaMemcpyHostToDevice); + + /* Launch the copies for each bundle and run the GPU kernel */ + /*We don't go into this loop if tasks_left_self == 1 as + nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ + for (int bid = 0; bid < nBundles_temp; bid++) { + + int max_parts_i = 0; + int max_parts_j = 0; + int parts_in_bundle_ci = 0; + int parts_in_bundle_cj = 0; + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; + tid++) { + if (tid < tasks_packed) { + /*Get an estimate for the max number of parts per cell in each bundle. 
+ * Used for determining the number of GPU CUDA blocks*/ + const int tid_tmp = 2 * tid; + int count_i = pack_vars->task_last_part[tid_tmp] + - pack_vars->task_first_part[tid_tmp]; + parts_in_bundle_ci += count_i; + max_parts_i = max(max_parts_i, count_i); + int count_j = pack_vars->task_last_part[tid_tmp + 1] + - pack_vars->task_first_part[tid_tmp + 1]; + parts_in_bundle_cj += count_j; + max_parts_j = max(max_parts_j, count_j); + } + } + const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; + const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp_i; + cudaMemcpyAsync(&d_parts_aos[first_part_tmp_i], &parts_aos[first_part_tmp_i], + bundle_n_parts * sizeof(struct part_aos_g), cudaMemcpyHostToDevice, stream[bid]); + +#ifdef CUDA_DEBUG + cudaError_t cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with pair density H2D async memcpy ci: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + error("Something's up with your cuda code"); + } +#endif + + const int tasksperbundle = pack_vars->tasksperbundle; + /* LAUNCH THE GPU KERNELS for ci & cj */ + int tid = 0; + int offset = bid * tasksperbundle; + int tasks_left = tasksperbundle; + if (bid == nBundles_temp - 1) { + tasks_left = + tasks_packed - (nBundles_temp - 1) * tasksperbundle; + } + + // Setup 2d grid of GPU thread blocks for ci (number of tasks is + // the y dimension and max_parts is the x dimension + int numBlocks_y = tasks_left; + int bundle_first_task = pack_vars->bundle_first_task_list[bid]; + const char *loop_type = "density"; + + /* Launch the kernel for ci using data for ci and cj */ + runner_dopairci_branch_density_gpu_aos_g(d_parts_aos, + pack_vars->d_task_first_part, pack_vars->d_task_last_part, + d_a, d_H, loop_type, stream[bid], bid, BLOCK_SIZE, + tasks_packed, tasksperbundle, max_parts_i, max_parts_j, numBlocks_y, tid, + offset, bundle_first_task, pack_vars->d_shiftx, pack_vars->d_shifty, pack_vars->d_shiftz); + + /* Launch the kernel for ci using data for ci and cj */ + runner_dopaircj_branch_density_gpu_aos_g(d_parts_aos, + pack_vars->d_task_first_part, pack_vars->d_task_last_part, + d_a, d_H, loop_type, stream[bid], bid, BLOCK_SIZE, + tasks_packed, tasksperbundle, max_parts_i, max_parts_j, numBlocks_y, tid, + offset, bundle_first_task, pack_vars->d_shiftx, pack_vars->d_shifty, pack_vars->d_shiftz); + +#ifdef CUDA_DEBUG + cu_error = cudaPeekAtLastError(); // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with self density kernel launch: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + exit(0); + } +#endif + + // Copy results back to CPU BUFFERS + cudaMemcpyAsync(&parts_aos[first_part_tmp_i], &d_parts_aos[first_part_tmp_i], + bundle_n_parts * sizeof(struct part_aos_g), cudaMemcpyDeviceToHost, stream[bid]); + +#ifdef CUDA_DEBUG + cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with self density D2H memcpy: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + error("Something's up with your cuda code"); + } +#endif + }/*End of looping over bundles to launch in streams*/ + + /* Make sure all the kernels and copies back are finished */ + cudaDeviceSynchronize(); + + /*Time end of GPU work*/ + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + /*Time 
unpacking*/ + clock_gettime(CLOCK_REALTIME, &t0); + /* Now copy the data back from the CPU thread-local buffers to the cells */ + /* Pack length counter for use in unpacking */ + int pack_length_unpack=0; + for (int tid = 0; tid < tasks_packed; tid++) { + + /*grab cell and task pointers*/ + struct cell *cii = pack_vars->ci_list[tid]; + struct cell *cjj = pack_vars->cj_list[tid]; + struct task *tii = pack_vars->task_list[tid]; + + /*Let's lock ci*/ + while(cell_locktree(cii)) { + ; /* spin until we acquire the lock */ + } + /*Let's lock cj*/ + while(cell_locktree(cjj)) { + ; /* spin until we acquire the lock */ + } + /* Do the copy */ + runner_do_ci_cj_gpu_unpack_neat_aos_g(r, cii, cjj, parts_aos, 0, + &pack_length_unpack, tid, 2 * pack_vars->count_max_parts, e); + + /* Record things for debugging */ + cii->gpu_done_pair_g++; + cjj->gpu_done_pair_g++; + + tii->gpu_done = 1; + + /*schedule my dependencies (Only unpacks really)*/ + enqueue_dependencies(s, tii); + /*Signal sleeping runners*/ + signal_sleeping_runners(s, tii); + + /* Release the locks */ + cell_unlocktree(cii); + /* Release the locks */ + cell_unlocktree(cjj); + } + /* Zero counters for the next pack operations */ + pack_vars->count_parts = 0; + pack_vars->tasks_packed = 0; + /*Time end of unpacking*/ + clock_gettime(CLOCK_REALTIME, &t1); + *packing_time += (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +} /*End of GPU work*/ + +void runner_dopair1_launch_f4_g_one_memcpy(struct runner *r, struct scheduler *s, struct pack_vars_pair *pack_vars, + struct task *t, struct part_aos_f4_g_send *parts_send, struct part_aos_f4_g_recv *parts_recv, struct part_aos_f4_g_send *d_parts_send, + struct part_aos_f4_g_recv *d_parts_recv, cudaStream_t * stream, float d_a, float d_H, + struct engine *e, double *packing_time, double *gpu_time, double *unpack_time, int4 *fparti_fpartj_lparti_lpartj, + cudaEvent_t * pair_end){ + + struct timespec t0, t1, tp0, tp1; // + clock_gettime(CLOCK_REALTIME, &t0); + + /* Identify the number of GPU bundles to run in ideal case*/ + int nBundles_temp = pack_vars->nBundles; + /*How many tasks have we packed?*/ + const int tasks_packed = pack_vars->tasks_packed; + + /*How many tasks should be in a bundle?*/ + const int bundle_size = pack_vars->bundle_size; + + /*tasks-packed needs decrementing before calculating packed_tmp as it was incremented in runner_dopair1_pack*/ +// const int packed_tmp = 2 * (tasks_packed - 1); + + /* Special case for incomplete bundles (when having leftover tasks not enough to fill a bundle) */ + if (pack_vars->launch_leftovers) { + nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; + if(tasks_packed == 0) error("zero pair tasks packed but somehow got into GPU loop"); +// pack_vars->bundle_first_part[nBundles_temp] = pack_vars->task_first_part[packed_tmp - 2]; + pack_vars->bundle_first_part[nBundles_temp] = fparti_fpartj_lparti_lpartj[tasks_packed - 1].x; + } + /* Identify the last particle for each bundle of tasks */ + for (int bid = 0; bid < nBundles_temp - 1; bid++) { + pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; + } + /* special treatment for the last bundle */ + if(nBundles_temp > 1)pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; + else pack_vars->bundle_last_part[0] = pack_vars->count_parts; + + /* Launch the copies for each bundle and run the GPU kernel */ + /*We don't go into this loop if tasks_left_self == 1 as + nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ +// int max_parts = 0; + for 
(int bid = 0; bid < nBundles_temp; bid++) { + + int max_parts_i = 0; + int max_parts_j = 0; + int parts_in_bundle_ci = 0; + int parts_in_bundle_cj = 0; + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; + tid++) { + if (tid < tasks_packed) { + /*Get an estimate for the max number of parts per cell in each bundle. + * Used for determining the number of GPU CUDA blocks*/ + int count_i = fparti_fpartj_lparti_lpartj[tid].z + - fparti_fpartj_lparti_lpartj[tid].x; + parts_in_bundle_ci += count_i; + max_parts_i = max(max_parts_i, count_i); + int count_j = fparti_fpartj_lparti_lpartj[tid].w + - fparti_fpartj_lparti_lpartj[tid].y; + parts_in_bundle_cj += count_j; + max_parts_j = max(max_parts_j, count_j); + } + } + const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; + const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp_i; + + cudaMemcpyAsync(&d_parts_send[first_part_tmp_i], &parts_send[first_part_tmp_i], + bundle_n_parts * sizeof(struct part_aos_f4_g_send), cudaMemcpyHostToDevice, stream[bid]); + +#ifdef CUDA_DEBUG + cudaError_t cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with pair density H2D async memcpy ci: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + error("Something's up with your cuda code"); + } +#endif + +// const int tasksperbundle = pack_vars->tasksperbundle; + /* LAUNCH THE GPU KERNELS for ci & cj */ + // Setup 2d grid of GPU thread blocks for ci (number of tasks is + // the y dimension and max_parts is the x dimension + int numBlocks_y = 0;//tasks_left; + int numBlocks_x = (bundle_n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + int bundle_part_0 = pack_vars->bundle_first_part[bid]; +// fprintf(stderr, "bundle_part_0 %i bundle_first_task %i\n", bundle_part_0, bundle_first_task); + + /* Launch the kernel for ci using data for ci and cj */ + runner_dopair_branch_gradient_gpu_aos_f4(d_parts_send, d_parts_recv, + d_a, d_H, stream[bid], numBlocks_x, numBlocks_y, bundle_part_0, bundle_n_parts); + +#ifdef CUDA_DEBUG + cu_error = cudaPeekAtLastError(); // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with pair density kernel launch: %s cpuid id is: %i\n " + "nbx %i nby %i max_parts_i %i max_parts_j %i\n", + cudaGetErrorString(cu_error), r->cpuid, numBlocks_x, numBlocks_y, max_parts_i, max_parts_j); + exit(0); + } +#endif + + // Copy results back to CPU BUFFERS + cudaMemcpyAsync(&parts_recv[first_part_tmp_i], &d_parts_recv[first_part_tmp_i], + bundle_n_parts * sizeof(struct part_aos_f4_g_recv), cudaMemcpyDeviceToHost, stream[bid]); + cudaEventRecord(pair_end[bid], stream[bid]); + +#ifdef CUDA_DEBUG + cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with self density D2H memcpy: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + error("Something's up with your cuda code"); + } +#endif + }/*End of looping over bundles to launch in streams*/ + + /* Make sure all the kernels and copies back are finished */ +// cudaDeviceSynchronize(); + + /*Time end of GPU work*/ + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + /* Now copy the data back from the CPU thread-local buffers to the cells */ + /* Pack length counter for use in unpacking */ + int pack_length_unpack=0; + for (int bid = 0; bid < nBundles_temp; bid++){ + 
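+    /* Note (descriptive): unpacking waits on the per-bundle event recorded
+     * after that bundle's D2H copy (pair_end[bid]), so cells from early
+     * bundles can be unpacked while kernels and copies for later bundles are
+     * still in flight on their own streams. */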
/*Time unpacking*/ + clock_gettime(CLOCK_REALTIME, &t0); + +// cudaStreamSynchronize(stream[bid]); + cudaEventSynchronize(pair_end[bid]); + + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + + /*Time unpacking*/ +// clock_gettime(CLOCK_REALTIME, &tp0); +// int bundle_first_task = pack_vars->bundle_first_task_list[bid]; + + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { + + if(tid < tasks_packed){ + clock_gettime(CLOCK_REALTIME, &tp0); + /*grab cell and task pointers*/ + struct cell *cii = pack_vars->ci_list[tid]; + struct cell *cjj = pack_vars->cj_list[tid]; + struct task *tii = pack_vars->task_list[tid]; + /*Let's lock ci*/ + while(cell_locktree(cii)) { + ; /* spin until we acquire the lock */ + } + /*Let's lock cj*/ + while(cell_locktree(cjj)) { + ; /* spin until we acquire the lock */ + } + /* Do the copy */ + runner_do_ci_cj_gpu_unpack_neat_aos_f4_g(r, cii, cjj, parts_recv, 0, + &pack_length_unpack, tid, 2 * pack_vars->count_max_parts, e); + + /* Record things for debugging */ + cii->gpu_done_pair_g++; + cjj->gpu_done_pair_g++; + + /* Release the locks */ + cell_unlocktree(cii); + /* Release the locks */ + cell_unlocktree(cjj); + + /*Time end of unpacking*/ + clock_gettime(CLOCK_REALTIME, &tp1); + *unpack_time += (tp1.tv_sec - tp0.tv_sec) + + (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; + + /*schedule my dependencies (Only unpacks really)*/ + enqueue_dependencies(s, tii); + /*Signal sleeping runners*/ + signal_sleeping_runners(s, tii); + + tii->gpu_done = 1; + + + } + } + } + /* Zero counters for the next pack operations */ + pack_vars->count_parts = 0; + pack_vars->tasks_packed = 0; +// /*Time end of unpacking*/ +// clock_gettime(CLOCK_REALTIME, &t1); +// *packing_time += (t1.tv_sec - t0.tv_sec) + +// (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + } /*End of GPU work*/ + +void runner_dopair1_launch_f4_g_mcpy_Ker_mcpy(struct runner *r, struct scheduler *s, struct pack_vars_pair *pack_vars, + struct task *t, struct part_aos_f4_g_send *parts_send, struct part_aos_f4_g_recv *parts_recv, struct part_aos_f4_g_send *d_parts_send, + struct part_aos_f4_g_recv *d_parts_recv, cudaStream_t * stream, float d_a, float d_H, + struct engine *e, double *packing_time, double *gpu_time, double *unpack_time, int4 *fparti_fpartj_lparti_lpartj, + cudaEvent_t * pair_end){ + + struct timespec t0, t1, tp0, tp1; // + clock_gettime(CLOCK_REALTIME, &t0); + + /* Identify the number of GPU bundles to run in ideal case*/ + int nBundles_temp = pack_vars->nBundles; + /*How many tasks have we packed?*/ + const int tasks_packed = pack_vars->tasks_packed; + + /*How many tasks should be in a bundle?*/ + const int bundle_size = pack_vars->bundle_size; + + /*tasks-packed needs decrementing before calculating packed_tmp as it was incremented in runner_dopair1_pack*/ +// const int packed_tmp = 2 * (tasks_packed - 1); + + /* Special case for incomplete bundles (when having leftover tasks not enough to fill a bundle) */ + if (pack_vars->launch_leftovers) { + nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; + if(tasks_packed == 0) error("zero pair tasks packed but somehow got into GPU loop"); +// pack_vars->bundle_first_part[nBundles_temp] = pack_vars->task_first_part[packed_tmp - 2]; + pack_vars->bundle_first_part[nBundles_temp] = fparti_fpartj_lparti_lpartj[tasks_packed - 1].x; + } + /* Identify the last particle for each bundle of tasks */ + for (int bid = 0; bid < nBundles_temp - 1; bid++) { + 
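+    /* Note (descriptive): unlike the one_memcpy variant above, this
+     * mcpy_Ker_mcpy version walks the bundles three times further down:
+     * once to queue every host-to-device copy, once to launch every gradient
+     * kernel, and once to queue every device-to-host copy, so each class of
+     * operation is issued back-to-back across the streams. */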
pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; + } + /* special treatment for the last bundle */ + if(nBundles_temp > 1)pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; + else pack_vars->bundle_last_part[0] = pack_vars->count_parts; + + /* Launch the copies for each bundle and run the GPU kernel */ + /*We don't go into this loop if tasks_left_self == 1 as + nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ +// int max_parts = 0; + for (int bid = 0; bid < nBundles_temp; bid++) { + + int max_parts_i = 0; + int max_parts_j = 0; + int parts_in_bundle_ci = 0; + int parts_in_bundle_cj = 0; +// const int first_task = bid * pack_vars->bundle_size; +// int last_task = (bid + 1) * bundle_size; + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; + tid++) { + if (tid < tasks_packed) { + /*Get an estimate for the max number of parts per cell in each bundle. + * Used for determining the number of GPU CUDA blocks*/ + int count_i = fparti_fpartj_lparti_lpartj[tid].z + - fparti_fpartj_lparti_lpartj[tid].x; + parts_in_bundle_ci += count_i; + max_parts_i = max(max_parts_i, count_i); + int count_j = fparti_fpartj_lparti_lpartj[tid].w + - fparti_fpartj_lparti_lpartj[tid].y; + parts_in_bundle_cj += count_j; + max_parts_j = max(max_parts_j, count_j); + +// last_task = tid; + } + } + const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; + const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp_i; + + cudaMemcpyAsync(&d_parts_send[first_part_tmp_i], &parts_send[first_part_tmp_i], + bundle_n_parts * sizeof(struct part_aos_f4_g_send), cudaMemcpyHostToDevice, stream[bid]); + +#ifdef CUDA_DEBUG + cudaError_t cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with pair density H2D async memcpy ci: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + error("Something's up with your cuda code"); + } +#endif + } + for (int bid = 0; bid < nBundles_temp; bid++) { + + int max_parts_i = 0; + int max_parts_j = 0; + int parts_in_bundle_ci = 0; + int parts_in_bundle_cj = 0; +// const int first_task = bid * pack_vars->bundle_size; +// int last_task = (bid + 1) * bundle_size; + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; + tid++) { + if (tid < tasks_packed) { + /*Get an estimate for the max number of parts per cell in each bundle. 
+ * Used for determining the number of GPU CUDA blocks*/ + int count_i = fparti_fpartj_lparti_lpartj[tid].z + - fparti_fpartj_lparti_lpartj[tid].x; + parts_in_bundle_ci += count_i; + max_parts_i = max(max_parts_i, count_i); + int count_j = fparti_fpartj_lparti_lpartj[tid].w + - fparti_fpartj_lparti_lpartj[tid].y; + parts_in_bundle_cj += count_j; + max_parts_j = max(max_parts_j, count_j); + +// last_task = tid; + } + } + const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; + const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp_i; +////////////////////////////////// +// const int tasksperbundle = pack_vars->tasksperbundle; + /* LAUNCH THE GPU KERNELS for ci & cj */ + // Setup 2d grid of GPU thread blocks for ci (number of tasks is + // the y dimension and max_parts is the x dimension + int numBlocks_y = 0;//tasks_left; + int numBlocks_x = (bundle_n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + int bundle_part_0 = pack_vars->bundle_first_part[bid]; +// int bundle_first_task = pack_vars->bundle_first_task_list[bid]; +// fprintf(stderr, "bundle_part_0 %i bundle_first_task %i\n", bundle_part_0, bundle_first_task); + + /* Launch the kernel for ci using data for ci and cj */ + runner_dopair_branch_gradient_gpu_aos_f4(d_parts_send, d_parts_recv, + d_a, d_H, stream[bid], numBlocks_x, numBlocks_y, bundle_part_0, bundle_n_parts); + +#ifdef CUDA_DEBUG + cudaError_t cu_error = cudaPeekAtLastError(); // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with pair density kernel launch: %s cpuid id is: %i\n " + "nbx %i nby %i max_parts_i %i max_parts_j %i\n", + cudaGetErrorString(cu_error), r->cpuid, numBlocks_x, numBlocks_y, max_parts_i, max_parts_j); + exit(0); + } +#endif + } + + for (int bid = 0; bid < nBundles_temp; bid++) { + + int max_parts_i = 0; + int max_parts_j = 0; + int parts_in_bundle_ci = 0; + int parts_in_bundle_cj = 0; +// const int first_task = bid * pack_vars->bundle_size; +// int last_task = (bid + 1) * bundle_size; + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; + tid++) { + if (tid < tasks_packed) { + /*Get an estimate for the max number of parts per cell in each bundle. 
+ * Used for determining the number of GPU CUDA blocks*/ + int count_i = fparti_fpartj_lparti_lpartj[tid].z + - fparti_fpartj_lparti_lpartj[tid].x; + parts_in_bundle_ci += count_i; + max_parts_i = max(max_parts_i, count_i); + int count_j = fparti_fpartj_lparti_lpartj[tid].w + - fparti_fpartj_lparti_lpartj[tid].y; + parts_in_bundle_cj += count_j; + max_parts_j = max(max_parts_j, count_j); + +// last_task = tid; + } + } + const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; + const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp_i; +/////////////////////////////////////////////////////////////////// + // Copy results back to CPU BUFFERS + cudaMemcpyAsync(&parts_recv[first_part_tmp_i], &d_parts_recv[first_part_tmp_i], + bundle_n_parts * sizeof(struct part_aos_f4_g_recv), cudaMemcpyDeviceToHost, stream[bid]); + cudaEventRecord(pair_end[bid], stream[bid]); + +#ifdef CUDA_DEBUG + cudaError_t cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with self density D2H memcpy: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + error("Something's up with your cuda code"); + } +#endif + }/*End of looping over bundles to launch in streams*/ + + /* Make sure all the kernels and copies back are finished */ +// cudaDeviceSynchronize(); + + /*Time end of GPU work*/ + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + /*Time unpacking*/ +// clock_gettime(CLOCK_REALTIME, &t0); + /* Now copy the data back from the CPU thread-local buffers to the cells */ + /* Pack length counter for use in unpacking */ + int pack_length_unpack=0; + for (int bid = 0; bid < nBundles_temp; bid++){ + clock_gettime(CLOCK_REALTIME, &t0); + +// cudaStreamSynchronize(stream[bid]); + cudaEventSynchronize(pair_end[bid]); + + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + + /*Time unpacking*/ +// clock_gettime(CLOCK_REALTIME, &tp0); +// int bundle_first_task = pack_vars->bundle_first_task_list[bid]; + + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { + + if(tid < tasks_packed){ + + /*grab cell and task pointers*/ + struct cell *cii = pack_vars->ci_list[tid]; + struct cell *cjj = pack_vars->cj_list[tid]; + struct task *tii = pack_vars->task_list[tid]; + + /*Let's lock ci*/ + while(cell_locktree(cii)) { + ; /* spin until we acquire the lock */ + } + /*Let's lock cj*/ + while(cell_locktree(cjj)) { + ; /* spin until we acquire the lock */ + } + /* Do the copy */ + /*Time unpacking*/ + clock_gettime(CLOCK_REALTIME, &tp0); + runner_do_ci_cj_gpu_unpack_neat_aos_f4_g(r, cii, cjj, parts_recv, 0, + &pack_length_unpack, tid, 2 * pack_vars->count_max_parts, e); + + /* Record things for debugging */ + cii->gpu_done_pair_g++; + cjj->gpu_done_pair_g++; + + tii->gpu_done = 1; + /*Time end of unpacking*/ + clock_gettime(CLOCK_REALTIME, &tp1); + *unpack_time += (tp1.tv_sec - tp0.tv_sec) + + (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; + /*schedule my dependencies (Only unpacks really)*/ + enqueue_dependencies(s, tii); + /*Signal sleeping runners*/ + signal_sleeping_runners(s, tii); + + /* Release the locks */ + cell_unlocktree(cii); + /* Release the locks */ + cell_unlocktree(cjj); + + } + } + /*Time end of unpacking*/ +// clock_gettime(CLOCK_REALTIME, &tp1); +// *unpack_time += (tp1.tv_sec - tp0.tv_sec) + +// (tp1.tv_nsec - tp0.tv_nsec) / 
1000000000.0; +// *packing_time += (tp1.tv_sec - tp0.tv_sec) + +// (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; +// *packing_time += (tp1.tv_sec - tp0.tv_sec) + +// (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; + } + /* Zero counters for the next pack operations */ + pack_vars->count_parts = 0; + pack_vars->tasks_packed = 0; + /*Time end of unpacking*/ +// clock_gettime(CLOCK_REALTIME, &t1); +// *packing_time += (t1.tv_sec - t0.tv_sec) + +// (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + } /*End of GPU work*/ + +void runner_dopair1_launch_f(struct runner *r, struct scheduler *s, struct pack_vars_pair *pack_vars, struct cell *ci, + struct task *t, struct part_aos_f *parts_aos, struct part_aos_f *d_parts_aos, cudaStream_t * stream, float d_a, float d_H, + struct engine *e, double *packing_time, double *gpu_time){ + + struct timespec t0, t1; // + clock_gettime(CLOCK_REALTIME, &t0); + + /* Identify the number of GPU bundles to run in ideal case*/ + int nBundles_temp = pack_vars->nBundles; + /*How many tasks have we packed?*/ + const int tasks_packed = pack_vars->tasks_packed; + + /*How many tasks should be in a bundle?*/ + const int bundle_size = pack_vars->bundle_size; + + /*tasks-packed needs decrementing before calculating packed_tmp as it was incremented in runner_dopair1_pack*/ + const int packed_tmp = 2 * (tasks_packed - 1); + + /* Special case for incomplete bundles (when having leftover tasks not enough to fill a bundle) */ + if (pack_vars->launch_leftovers) { + nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; + if(tasks_packed == 0) error("zero pair tasks packed but somehow got into GPU loop"); + pack_vars->bundle_first_part[nBundles_temp] = pack_vars->task_first_part[packed_tmp - 2]; + } + /* Identify the last particle for each bundle of tasks */ + for (int bid = 0; bid < nBundles_temp - 1; bid++) { + pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; + } + /* special treatment for the last bundle */ + if(nBundles_temp > 1)pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; + else pack_vars->bundle_last_part[0] = pack_vars->count_parts; + /*Copy arrays containing first and last part for each task to GPU*/ + cudaMemcpy(pack_vars->d_task_first_part, pack_vars->task_first_part, + 2 * tasks_packed * sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(pack_vars->d_task_last_part, pack_vars->task_last_part, + 2 * tasks_packed * sizeof(int), cudaMemcpyHostToDevice); + + /*Copy cell shifts to device*/ + cudaMemcpy(pack_vars->d_shiftx, pack_vars->shiftx, + 2 * tasks_packed * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(pack_vars->d_shifty, pack_vars->shifty, + 2 * tasks_packed * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(pack_vars->d_shiftz, pack_vars->shiftz, + 2 * tasks_packed * sizeof(double), cudaMemcpyHostToDevice); + + /* Launch the copies for each bundle and run the GPU kernel */ + /*We don't go into this loop if tasks_left_self == 1 as + nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ +// int max_parts = 0; + for (int bid = 0; bid < nBundles_temp; bid++) { + + int max_parts_i = 0; + int max_parts_j = 0; + int parts_in_bundle_ci = 0; + int parts_in_bundle_cj = 0; +// const int first_task = bid * pack_vars->bundle_size; + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; + tid++) { + if (tid < tasks_packed) { + /*Get an estimate for the max number of parts per cell in each bundle. 
+ * Used for determining the number of GPU CUDA blocks*/ + const int tid_tmp = 2 * tid; + int count_i = pack_vars->task_last_part[tid_tmp] + - pack_vars->task_first_part[tid_tmp]; + parts_in_bundle_ci += count_i; + max_parts_i = max(max_parts_i, count_i); + int count_j = pack_vars->task_last_part[tid_tmp + 1] + - pack_vars->task_first_part[tid_tmp + 1]; + parts_in_bundle_cj += count_j; + max_parts_j = max(max_parts_j, count_j); + } + } + const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; + const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp_i; + cudaMemcpyAsync(&d_parts_aos[first_part_tmp_i], &parts_aos[first_part_tmp_i], + bundle_n_parts * sizeof(struct part_aos_f), cudaMemcpyHostToDevice, stream[bid]); + +#ifdef CUDA_DEBUG + cudaError_t cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with pair density H2D async memcpy ci: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + error("Something's up with your cuda code"); + } +#endif + + const int tasksperbundle = pack_vars->tasksperbundle; + /* LAUNCH THE GPU KERNELS for ci & cj */ + int tid = 0; + int offset = bid * tasksperbundle; + int tasks_left = tasksperbundle; + if (bid == nBundles_temp - 1) { + tasks_left = + tasks_packed - (nBundles_temp - 1) * tasksperbundle; + } + + // Setup 2d grid of GPU thread blocks for ci (number of tasks is + // the y dimension and max_parts is the x dimension + int numBlocks_y = tasks_left; +// int numBlocks_x = (max_parts_i + BLOCK_SIZE - 1) / BLOCK_SIZE; + int bundle_first_task = pack_vars->bundle_first_task_list[bid]; + const char *loop_type = "density"; + + /* Launch the kernel for ci using data for ci and cj */ + runner_dopairci_branch_density_gpu_aos_f(d_parts_aos, + pack_vars->d_task_first_part, pack_vars->d_task_last_part, + d_a, d_H, loop_type, stream[bid], bid, BLOCK_SIZE, + tasks_packed, tasksperbundle, max_parts_i, max_parts_j, numBlocks_y, tid, + offset, bundle_first_task, pack_vars->d_shiftx, pack_vars->d_shifty, pack_vars->d_shiftz); + + /* Launch the kernel for ci using data for ci and cj */ + runner_dopaircj_branch_density_gpu_aos_f(d_parts_aos, + pack_vars->d_task_first_part, pack_vars->d_task_last_part, + d_a, d_H, loop_type, stream[bid], bid, BLOCK_SIZE, + tasks_packed, tasksperbundle, max_parts_i, max_parts_j, numBlocks_y, tid, + offset, bundle_first_task, pack_vars->d_shiftx, pack_vars->d_shifty, pack_vars->d_shiftz); + +#ifdef CUDA_DEBUG + cu_error = cudaPeekAtLastError(); // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with self density kernel launch: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + exit(0); + } +#endif + + // Copy results back to CPU BUFFERS + cudaMemcpyAsync(&parts_aos[first_part_tmp_i], &d_parts_aos[first_part_tmp_i], + bundle_n_parts * sizeof(struct part_aos_f), cudaMemcpyDeviceToHost, stream[bid]); + +#ifdef CUDA_DEBUG + cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with self density D2H memcpy: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + error("Something's up with your cuda code"); + } +#endif + }/*End of looping over bundles to launch in streams*/ + + /* Make sure all the kernels and copies back are finished */ + cudaDeviceSynchronize(); + + /*Time end of GPU work*/ + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += (t1.tv_sec - 
t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + /*Time unpacking*/ + clock_gettime(CLOCK_REALTIME, &t0); + /* Now copy the data back from the CPU thread-local buffers to the cells */ + /* Pack length counter for use in unpacking */ + int pack_length_unpack=0; + for (int tid = 0; tid < tasks_packed; tid++) { + + /*grab cell and task pointers*/ + struct cell *cii = pack_vars->ci_list[tid]; + struct cell *cjj = pack_vars->cj_list[tid]; + struct task *tii = pack_vars->task_list[tid]; + + /*Let's lock ci*/ + while(cell_locktree(cii)) { + ; /* spin until we acquire the lock */ + } + /*Let's lock cj*/ + while(cell_locktree(cjj)) { + ; /* spin until we acquire the lock */ + } + /* Do the copy */ + runner_do_ci_cj_gpu_unpack_neat_aos_f(r, cii, cjj, parts_aos, 0, + &pack_length_unpack, tid, 2 * pack_vars->count_max_parts, e); + + /* Record things for debugging */ + cii->gpu_done_pair_f++; + cjj->gpu_done_pair_f++; + + tii->gpu_done = 1; + + /*schedule my dependencies (Only unpacks really)*/ + enqueue_dependencies(s, tii); + /*Signal sleeping runners*/ + signal_sleeping_runners(s, tii); + + /* Release the locks */ + cell_unlocktree(cii); + /* Release the locks */ + cell_unlocktree(cjj); + } + /* Zero counters for the next pack operations */ + pack_vars->count_parts = 0; + pack_vars->tasks_packed = 0; + /*Time end of unpacking*/ + clock_gettime(CLOCK_REALTIME, &t1); + *packing_time += (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +} /*End of GPU work*/ + +void runner_dopair1_launch_f4_f_one_memcpy(struct runner *r, struct scheduler *s, struct pack_vars_pair *pack_vars, + struct task *t, struct part_aos_f4_f_send *parts_send, struct part_aos_f4_f_recv *parts_recv, struct part_aos_f4_f_send *d_parts_send, + struct part_aos_f4_f_recv *d_parts_recv, cudaStream_t * stream, float d_a, float d_H, + struct engine *e, double *packing_time, double *gpu_time, double *unpack_time, int4 *fparti_fpartj_lparti_lpartj, + cudaEvent_t * pair_end){ + + struct timespec t0, t1, tp0, tp1; // + clock_gettime(CLOCK_REALTIME, &t0); + + /* Identify the number of GPU bundles to run in ideal case*/ + int nBundles_temp = pack_vars->nBundles; + /*How many tasks have we packed?*/ + const int tasks_packed = pack_vars->tasks_packed; + + /*How many tasks should be in a bundle?*/ + const int bundle_size = pack_vars->bundle_size; + + /*tasks-packed needs decrementing before calculating packed_tmp as it was incremented in runner_dopair1_pack*/ +// const int packed_tmp = 2 * (tasks_packed - 1); + + /* Special case for incomplete bundles (when having leftover tasks not enough to fill a bundle) */ + if (pack_vars->launch_leftovers) { + nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; + if(tasks_packed == 0) error("zero pair tasks packed but somehow got into GPU loop"); +// pack_vars->bundle_first_part[nBundles_temp] = pack_vars->task_first_part[packed_tmp - 2]; + pack_vars->bundle_first_part[nBundles_temp] = fparti_fpartj_lparti_lpartj[tasks_packed - 1].x; + } + /* Identify the last particle for each bundle of tasks */ + for (int bid = 0; bid < nBundles_temp - 1; bid++) { + pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; + } + /* special treatment for the last bundle */ + if(nBundles_temp > 1)pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; + else pack_vars->bundle_last_part[0] = pack_vars->count_parts; + + /* Launch the copies for each bundle and run the GPU kernel */ + /*We don't go into this loop if tasks_left_self == 1 as + nBundles_temp 
will be zero DUHDUHDUHDUHHHHHH!!!!!*/ +// int max_parts = 0; + for (int bid = 0; bid < nBundles_temp; bid++) { + + int max_parts_i = 0; + int max_parts_j = 0; + int parts_in_bundle_ci = 0; + int parts_in_bundle_cj = 0; +// const int first_task = bid * pack_vars->bundle_size; +// int last_task = (bid + 1) * bundle_size; + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; + tid++) { + if (tid < tasks_packed) { + /*Get an estimate for the max number of parts per cell in each bundle. + * Used for determining the number of GPU CUDA blocks*/ + int count_i = fparti_fpartj_lparti_lpartj[tid].z + - fparti_fpartj_lparti_lpartj[tid].x; + parts_in_bundle_ci += count_i; + max_parts_i = max(max_parts_i, count_i); + int count_j = fparti_fpartj_lparti_lpartj[tid].w + - fparti_fpartj_lparti_lpartj[tid].y; + parts_in_bundle_cj += count_j; + max_parts_j = max(max_parts_j, count_j); + +// last_task = tid; + } + } + const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; + const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp_i; + + cudaMemcpyAsync(&d_parts_send[first_part_tmp_i], &parts_send[first_part_tmp_i], + bundle_n_parts * sizeof(struct part_aos_f4_f_send), cudaMemcpyHostToDevice, stream[bid]); + +#ifdef CUDA_DEBUG + cudaError_t cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with pair density H2D async memcpy ci: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + error("Something's up with your cuda code"); + } +#endif + +// const int tasksperbundle = pack_vars->tasksperbundle; + /* LAUNCH THE GPU KERNELS for ci & cj */ +// int tid = 0; +// int offset = bid * tasksperbundle; +// int tasks_left = tasksperbundle; +// if (bid == nBundles_temp - 1) { +// tasks_left = +// tasks_packed - (nBundles_temp - 1) * tasksperbundle; +// } + + // Setup 2d grid of GPU thread blocks for ci (number of tasks is + // the y dimension and max_parts is the x dimension + int numBlocks_y = 0;//tasks_left; + int numBlocks_x = (bundle_n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + int bundle_part_0 = pack_vars->bundle_first_part[bid]; +// int bundle_first_task = pack_vars->bundle_first_task_list[bid]; +// fprintf(stderr, "bundle_part_0 %i bundle_first_task %i\n", bundle_part_0, bundle_first_task); + + /* Launch the kernel for ci using data for ci and cj */ + runner_dopair_branch_force_gpu_aos_f4(d_parts_send, d_parts_recv, + d_a, d_H, stream[bid], numBlocks_x, numBlocks_y, bundle_part_0, bundle_n_parts); + +#ifdef CUDA_DEBUG + cu_error = cudaPeekAtLastError(); // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with pair density kernel launch: %s cpuid id is: %i\n " + "nbx %i nby %i max_parts_i %i max_parts_j %i\n", + cudaGetErrorString(cu_error), r->cpuid, numBlocks_x, numBlocks_y, max_parts_i, max_parts_j); + exit(0); + } +#endif + + // Copy results back to CPU BUFFERS + cudaMemcpyAsync(&parts_recv[first_part_tmp_i], &d_parts_recv[first_part_tmp_i], + bundle_n_parts * sizeof(struct part_aos_f4_f_recv), cudaMemcpyDeviceToHost, stream[bid]); + cudaEventRecord(pair_end[bid], stream[bid]); + +#ifdef CUDA_DEBUG + cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with self density D2H memcpy: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + error("Something's up with your cuda code"); + } +#endif + }/*End of looping over bundles to 
launch in streams*/ + + /* Make sure all the kernels and copies back are finished */ +// cudaDeviceSynchronize(); + + /*Time end of GPU work*/ + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + /* Now copy the data back from the CPU thread-local buffers to the cells */ + /* Pack length counter for use in unpacking */ + int pack_length_unpack=0; + for (int bid = 0; bid < nBundles_temp; bid++){ + /*Time unpacking*/ + clock_gettime(CLOCK_REALTIME, &t0); + +// cudaStreamSynchronize(stream[bid]); + cudaEventSynchronize(pair_end[bid]); + + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + + /*Time unpacking*/ +// clock_gettime(CLOCK_REALTIME, &tp0); +// int bundle_first_task = pack_vars->bundle_first_task_list[bid]; + + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { + + if(tid < tasks_packed){ + clock_gettime(CLOCK_REALTIME, &tp0); + /*grab cell and task pointers*/ + struct cell *cii = pack_vars->ci_list[tid]; + struct cell *cjj = pack_vars->cj_list[tid]; + struct task *tii = pack_vars->task_list[tid]; + /*Let's lock ci*/ + while(cell_locktree(cii)) { + ; /* spin until we acquire the lock */ + } + /*Let's lock cj*/ + while(cell_locktree(cjj)) { + ; /* spin until we acquire the lock */ + } + /* Do the copy */ + runner_do_ci_cj_gpu_unpack_neat_aos_f4_f(r, cii, cjj, parts_recv, 0, + &pack_length_unpack, tid, 2 * pack_vars->count_max_parts, e); + + /* Record things for debugging */ + cii->gpu_done_pair_f++; + cjj->gpu_done_pair_f++; + + /* Release the locks */ + cell_unlocktree(cii); + /* Release the locks */ + cell_unlocktree(cjj); + + /*Time end of unpacking*/ + clock_gettime(CLOCK_REALTIME, &tp1); + *unpack_time += (tp1.tv_sec - tp0.tv_sec) + + (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; + + /*schedule my dependencies (Only unpacks really)*/ + enqueue_dependencies(s, tii); + /*Signal sleeping runners*/ + signal_sleeping_runners(s, tii); + + tii->gpu_done = 1; + + + } + } + } + /* Zero counters for the next pack operations */ + pack_vars->count_parts = 0; + pack_vars->tasks_packed = 0; +// /*Time end of unpacking*/ +// clock_gettime(CLOCK_REALTIME, &t1); +// *packing_time += (t1.tv_sec - t0.tv_sec) + +// (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + } /*End of GPU work*/ + +void runner_dopair1_launch_f4_f_mcpy_Ker_mcpy(struct runner *r, struct scheduler *s, struct pack_vars_pair *pack_vars, + struct task *t, struct part_aos_f4_f_send *parts_send, struct part_aos_f4_f_recv *parts_recv, struct part_aos_f4_f_send *d_parts_send, + struct part_aos_f4_f_recv *d_parts_recv, cudaStream_t * stream, float d_a, float d_H, + struct engine *e, double *packing_time, double *gpu_time, double *unpack_time, int4 *fparti_fpartj_lparti_lpartj, + cudaEvent_t * pair_end){ + + struct timespec t0, t1, tp0, tp1; // + clock_gettime(CLOCK_REALTIME, &t0); + + /* Identify the number of GPU bundles to run in ideal case*/ + int nBundles_temp = pack_vars->nBundles; + /*How many tasks have we packed?*/ + const int tasks_packed = pack_vars->tasks_packed; + + /*How many tasks should be in a bundle?*/ + const int bundle_size = pack_vars->bundle_size; + + /* Special case for incomplete bundles (when having leftover tasks not enough to fill a bundle) */ + if (pack_vars->launch_leftovers) { + nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; + if(tasks_packed == 0) error("zero pair tasks packed but somehow got into GPU loop"); +// 
pack_vars->bundle_first_part[nBundles_temp] = pack_vars->task_first_part[packed_tmp - 2]; + pack_vars->bundle_first_part[nBundles_temp] = fparti_fpartj_lparti_lpartj[tasks_packed - 1].x; + } + /* Identify the last particle for each bundle of tasks */ + for (int bid = 0; bid < nBundles_temp - 1; bid++) { + pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; + } + /* special treatment for the last bundle */ + if(nBundles_temp > 1)pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; + else pack_vars->bundle_last_part[0] = pack_vars->count_parts; + + /* Launch the copies for each bundle and run the GPU kernel */ + /*We don't go into this loop if tasks_left_self == 1 as + nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ + for (int bid = 0; bid < nBundles_temp; bid++) { + + int max_parts_i = 0; + int max_parts_j = 0; + int parts_in_bundle_ci = 0; + int parts_in_bundle_cj = 0; +// const int first_task = bid * pack_vars->bundle_size; +// int last_task = (bid + 1) * bundle_size; + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; + tid++) { + if (tid < tasks_packed) { + /*Get an estimate for the max number of parts per cell in each bundle. + * Used for determining the number of GPU CUDA blocks*/ + int count_i = fparti_fpartj_lparti_lpartj[tid].z + - fparti_fpartj_lparti_lpartj[tid].x; + parts_in_bundle_ci += count_i; + max_parts_i = max(max_parts_i, count_i); + int count_j = fparti_fpartj_lparti_lpartj[tid].w + - fparti_fpartj_lparti_lpartj[tid].y; + parts_in_bundle_cj += count_j; + max_parts_j = max(max_parts_j, count_j); + +// last_task = tid; + } + } + const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; + const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp_i; + + cudaMemcpyAsync(&d_parts_send[first_part_tmp_i], &parts_send[first_part_tmp_i], + bundle_n_parts * sizeof(struct part_aos_f4_f_send), cudaMemcpyHostToDevice, stream[bid]); + +#ifdef CUDA_DEBUG + cudaError_t cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with pair density H2D async memcpy ci: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + error("Something's up with your cuda code"); + } +#endif + } + for (int bid = 0; bid < nBundles_temp; bid++) { + + int max_parts_i = 0; + int max_parts_j = 0; + int parts_in_bundle_ci = 0; + int parts_in_bundle_cj = 0; +// const int first_task = bid * pack_vars->bundle_size; +// int last_task = (bid + 1) * bundle_size; + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; + tid++) { + if (tid < tasks_packed) { + /*Get an estimate for the max number of parts per cell in each bundle. 
+ * Used for determining the number of GPU CUDA blocks*/ + int count_i = fparti_fpartj_lparti_lpartj[tid].z + - fparti_fpartj_lparti_lpartj[tid].x; + parts_in_bundle_ci += count_i; + max_parts_i = max(max_parts_i, count_i); + int count_j = fparti_fpartj_lparti_lpartj[tid].w + - fparti_fpartj_lparti_lpartj[tid].y; + parts_in_bundle_cj += count_j; + max_parts_j = max(max_parts_j, count_j); + +// last_task = tid; + } + } + const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; + const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp_i; +////////////////////////////////// + /* LAUNCH THE GPU KERNELS for ci & cj */ + // Setup 2d grid of GPU thread blocks for ci (number of tasks is + // the y dimension and max_parts is the x dimension + int numBlocks_y = 0;//tasks_left; + int numBlocks_x = (bundle_n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + int bundle_part_0 = pack_vars->bundle_first_part[bid]; +// int bundle_first_task = pack_vars->bundle_first_task_list[bid]; +// fprintf(stderr, "bundle_part_0 %i bundle_first_task %i\n", bundle_part_0, bundle_first_task); + + /* Launch the kernel for ci using data for ci and cj */ + runner_dopair_branch_force_gpu_aos_f4(d_parts_send, d_parts_recv, + d_a, d_H, stream[bid], numBlocks_x, numBlocks_y, bundle_part_0, bundle_n_parts); + +#ifdef CUDA_DEBUG + cudaError_t cu_error = cudaPeekAtLastError(); // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with pair density kernel launch: %s cpuid id is: %i\n " + "nbx %i nby %i max_parts_i %i max_parts_j %i\n", + cudaGetErrorString(cu_error), r->cpuid, numBlocks_x, numBlocks_y, max_parts_i, max_parts_j); + exit(0); + } +#endif + } + + for (int bid = 0; bid < nBundles_temp; bid++) { + + int max_parts_i = 0; + int max_parts_j = 0; + int parts_in_bundle_ci = 0; + int parts_in_bundle_cj = 0; + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; + tid++) { + if (tid < tasks_packed) { + /*Get an estimate for the max number of parts per cell in each bundle. 
+ * Used for determining the number of GPU CUDA blocks*/ + int count_i = fparti_fpartj_lparti_lpartj[tid].z + - fparti_fpartj_lparti_lpartj[tid].x; + parts_in_bundle_ci += count_i; + max_parts_i = max(max_parts_i, count_i); + int count_j = fparti_fpartj_lparti_lpartj[tid].w + - fparti_fpartj_lparti_lpartj[tid].y; + parts_in_bundle_cj += count_j; + max_parts_j = max(max_parts_j, count_j); + } + } + const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; + const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp_i; +/////////////////////////////////////////////////////////////////// + // Copy results back to CPU BUFFERS + cudaMemcpyAsync(&parts_recv[first_part_tmp_i], &d_parts_recv[first_part_tmp_i], + bundle_n_parts * sizeof(struct part_aos_f4_f_recv), cudaMemcpyDeviceToHost, stream[bid]); + cudaEventRecord(pair_end[bid], stream[bid]); + +#ifdef CUDA_DEBUG + cudaError_t cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with self density D2H memcpy: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + error("Something's up with your cuda code"); + } +#endif + }/*End of looping over bundles to launch in streams*/ + + /* Make sure all the kernels and copies back are finished */ +// cudaDeviceSynchronize(); + + /*Time end of GPU work*/ + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + /*Time unpacking*/ +// clock_gettime(CLOCK_REALTIME, &t0); + /* Now copy the data back from the CPU thread-local buffers to the cells */ + /* Pack length counter for use in unpacking */ + int pack_length_unpack=0; + for (int bid = 0; bid < nBundles_temp; bid++){ + clock_gettime(CLOCK_REALTIME, &t0); + +// cudaStreamSynchronize(stream[bid]); + cudaEventSynchronize(pair_end[bid]); + + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + + /*Time unpacking*/ +// clock_gettime(CLOCK_REALTIME, &tp0); + + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { + + if(tid < tasks_packed){ + + /*grab cell and task pointers*/ + struct cell *cii = pack_vars->ci_list[tid]; + struct cell *cjj = pack_vars->cj_list[tid]; + struct task *tii = pack_vars->task_list[tid]; + + /*Let's lock ci*/ + while(cell_locktree(cii)) { + ; /* spin until we acquire the lock */ + } + /*Let's lock cj*/ + while(cell_locktree(cjj)) { + ; /* spin until we acquire the lock */ + } + /* Do the copy */ + /*Time unpacking*/ + clock_gettime(CLOCK_REALTIME, &tp0); + runner_do_ci_cj_gpu_unpack_neat_aos_f4_f(r, cii, cjj, parts_recv, 0, + &pack_length_unpack, tid, 2 * pack_vars->count_max_parts, e); + + /* Record things for debugging */ + cii->gpu_done_pair_f++; + cjj->gpu_done_pair_f++; + + tii->gpu_done = 1; + /*Time end of unpacking*/ + clock_gettime(CLOCK_REALTIME, &tp1); + *unpack_time += (tp1.tv_sec - tp0.tv_sec) + + (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; + /*schedule my dependencies (Only unpacks really)*/ + enqueue_dependencies(s, tii); + /*Signal sleeping runners*/ + signal_sleeping_runners(s, tii); + + /* Release the locks */ + cell_unlocktree(cii); + /* Release the locks */ + cell_unlocktree(cjj); + + } + } + /*Time end of unpacking*/ +// clock_gettime(CLOCK_REALTIME, &tp1); +// *unpack_time += (tp1.tv_sec - tp0.tv_sec) + +// (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; +// *packing_time += (tp1.tv_sec - tp0.tv_sec) + +// (tp1.tv_nsec - tp0.tv_nsec) 
/ 1000000000.0; +// *packing_time += (tp1.tv_sec - tp0.tv_sec) + +// (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; + } + /* Zero counters for the next pack operations */ + pack_vars->count_parts = 0; + pack_vars->tasks_packed = 0; + /*Time end of unpacking*/ +// clock_gettime(CLOCK_REALTIME, &t1); +// *packing_time += (t1.tv_sec - t0.tv_sec) + +// (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + } /*End of GPU work*/ + diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu new file mode 100755 index 0000000000..6b307350b5 --- /dev/null +++ b/src/runner_main_clean.cu @@ -0,0 +1,1936 @@ +/******************************************************************************* + * This file is part of SWIFT. + * Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk) + * Matthieu Schaller (matthieu.schaller@durham.ac.uk) + * 2015 Peter W. Draper (p.w.draper@durham.ac.uk) + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. If not, see . + * + ******************************************************************************/ +/* Config parameters. */ +#define GPUOFFLOAD 1 //off-load hydro to GPU +#define DO_CORNERS 1 //do corner pair tasks on CPU +#include "../config.h" + +/* MPI headers. */ +#ifdef WITH_MPI +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* Config parameters. */ +#include + +/* MPI headers. */ +#ifdef WITH_MPI +#include +#endif + +/* This object's header. */ +#include "runner.h" + +/* Local headers. */ +#include "engine.h" +#include "feedback.h" +#include "runner_doiact_sinks.h" +#include "scheduler.h" +#include "space_getsid.h" +#include "timers.h" + +/* Import the gravity loop functions. */ +#include "runner_doiact_grav.h" + +/* Import the density loop functions. */ +#define FUNCTION density +#define FUNCTION_TASK_LOOP TASK_LOOP_DENSITY +#include "runner_doiact_hydro.h" +#include "runner_doiact_undef.h" + +/* Import the gradient loop functions (if required). */ +#ifdef EXTRA_HYDRO_LOOP +#define FUNCTION gradient +#define FUNCTION_TASK_LOOP TASK_LOOP_GRADIENT +#include "runner_doiact_hydro.h" +#include "runner_doiact_undef.h" +#endif + +/* Import the force loop functions. */ +#define FUNCTION force +#define FUNCTION_TASK_LOOP TASK_LOOP_FORCE +#include "runner_doiact_hydro.h" +#include "runner_doiact_undef.h" + +/* Import the limiter loop functions. */ +#define FUNCTION limiter +#define FUNCTION_TASK_LOOP TASK_LOOP_LIMITER +#include "runner_doiact_limiter.h" +#include "runner_doiact_undef.h" + +/* Import the stars density loop functions. */ +#define FUNCTION density +#define FUNCTION_TASK_LOOP TASK_LOOP_DENSITY +#include "runner_doiact_stars.h" +#include "runner_doiact_undef.h" + +#ifdef EXTRA_STAR_LOOPS + +/* Import the stars prepare1 loop functions. */ +#define FUNCTION prep1 +#define FUNCTION_TASK_LOOP TASK_LOOP_STARS_PREP1 +#include "runner_doiact_stars.h" +#include "runner_doiact_undef.h" + +/* Import the stars prepare2 loop functions. 
*/ +#define FUNCTION prep2 +#define FUNCTION_TASK_LOOP TASK_LOOP_STARS_PREP2 +#include "runner_doiact_stars.h" +#include "runner_doiact_undef.h" + +#endif /* EXTRA_STAR_LOOPS */ + +/* Import the stars feedback loop functions. */ +#define FUNCTION feedback +#define FUNCTION_TASK_LOOP TASK_LOOP_FEEDBACK +#include "runner_doiact_stars.h" +#include "runner_doiact_undef.h" + +/* Import the black hole density loop functions. */ +#define FUNCTION density +#define FUNCTION_TASK_LOOP TASK_LOOP_DENSITY +#include "runner_doiact_black_holes.h" +#include "runner_doiact_undef.h" + +/* Import the black hole feedback loop functions. */ +#define FUNCTION swallow +#define FUNCTION_TASK_LOOP TASK_LOOP_SWALLOW +#include "runner_doiact_black_holes.h" +#include "runner_doiact_undef.h" + +/* Import the black hole feedback loop functions. */ +#define FUNCTION feedback +#define FUNCTION_TASK_LOOP TASK_LOOP_FEEDBACK +#include "runner_doiact_black_holes.h" +#include "runner_doiact_undef.h" + +/* Import the RT gradient loop functions */ +#define FUNCTION rt_gradient +#define FUNCTION_TASK_LOOP TASK_LOOP_RT_GRADIENT +#include "runner_doiact_hydro.h" +#include "runner_doiact_undef.h" + +/* Import the RT transport (force) loop functions. */ +#define FUNCTION rt_transport +#define FUNCTION_TASK_LOOP TASK_LOOP_RT_TRANSPORT +#include "runner_doiact_hydro.h" +#include "runner_doiact_undef.h" + +#ifdef __cplusplus +} +#endif +/** + * @brief The #runner main thread routine. + * + * @param data A pointer to this thread's data. + **/ + +/* CUDA Header */ +#ifdef WITH_CUDA +#ifdef __cplusplus +extern "C" { +#endif + +#include "cuda/part_gpu.h" +#include "runner_gpu_pack_functions.h" +#include "runner_doiact_functions_hydro_gpu.h" +#include "files_for_new_functions/host_device_data_transfer.h" +#include "files_for_new_functions/arrays_malloc.h" +//#include "./cuda/BLOCK_SIZE.h" +#include "cuda/GPU_runner_functions.h" +#include +#include +#include + +#ifdef __cplusplus +} +#endif +// Convenience function for checking CUDA runtime API results +// can be wrapped around any runtime API call. No-op in release builds. 
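+// Illustrative usage sketch (editor's note, not a call site added by this
+// patch): checkCuda() below can wrap any CUDA runtime call so a failure
+// aborts early with a readable message. The names d_buf, h_buf, n and
+// stream in this example are placeholders, not identifiers from this file:
+//   checkCuda(cudaMalloc((void **)&d_buf, n * sizeof(float)));
+//   checkCuda(cudaMemcpyAsync(d_buf, h_buf, n * sizeof(float),
+//                             cudaMemcpyHostToDevice, stream));
+//   checkCuda(cudaStreamSynchronize(stream));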
+#define CUDA_DEBUG + +inline cudaError_t checkCuda(cudaError_t result) { + if (result != cudaSuccess) { + fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result)); + assert(result == cudaSuccess); + } + return result; +} + +//inline void gpuErrchk(cudaError_t code) { +//#define __FILE__ __LINE__ +// inline void gpuAssert(cudaError_t code, const char *file, int line) { +// int abort = 0; +// if (code != cudaSuccess) { +// // fprintf( stderr, "cudaCheckError() failed at +// //%s:%i : %s\n", +// // file, line, cudaGetErrorString( code ) ); +// abort = 1; +// fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, +// line); +// if (abort) +// exit(code); +// } +// } +//} + +void *runner_main2(void *data) { + struct runner *r = (struct runner *)data; + struct engine *e = r->e; + struct scheduler *sched = &e->sched; + struct space *space = e->s; + unsigned int seed = r->id; + pthread_setspecific(sched->local_seed_pointer, &seed); + /*pack_vars contain data required for packing tasks destined for the GPU*/ + struct pack_vars_self *pack_vars_self_dens; + struct pack_vars_self *pack_vars_self_forc; + struct pack_vars_self *pack_vars_self_grad; + + /*pack_vars contain data required for packing tasks destined for the GPU*/ + struct pack_vars_pair *pack_vars_pair_dens; + struct pack_vars_pair *pack_vars_pair_forc; + struct pack_vars_pair *pack_vars_pair_grad; + + cudaMallocHost((void **)&pack_vars_self_dens, + sizeof(struct pack_vars_self *)); + cudaMallocHost((void **)&pack_vars_self_forc, + sizeof(struct pack_vars_self *)); + cudaMallocHost((void **)&pack_vars_self_grad, + sizeof(struct pack_vars_self *)); + + cudaMallocHost((void **)&pack_vars_pair_dens, + sizeof(struct pack_vars_pair *)); + cudaMallocHost((void **)&pack_vars_pair_forc, + sizeof(struct pack_vars_pair *)); + cudaMallocHost((void **)&pack_vars_pair_grad, + sizeof(struct pack_vars_pair *)); + + int devId = 0; // find and print gpu device name + struct cudaDeviceProp prop; + int nDevices; + int maxBlocksSM; + int nSMs; + cudaGetDeviceCount(&nDevices); + cudaGetDeviceProperties(&prop, devId); + cudaDeviceGetAttribute(&maxBlocksSM, cudaDevAttrMaxBlocksPerMultiprocessor, devId); + cudaDeviceGetAttribute(&nSMs, cudaDevAttrMultiProcessorCount, devId); + int nPartsPerCell = space->nr_parts/space->tot_cells; + int mpi_rank = 0; +#ifdef WITH_MPI + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); +#endif + if (r->cpuid == 0 && mpi_rank == 0) { + fprintf(stderr, "%i devices available device id is %i\n", nDevices, devId); + fprintf(stderr, "Device : %s\n", prop.name); + fprintf(stderr, "nSMs %i max blocks per SM %i maxnBlocks per stream %i\n", nSMs, maxBlocksSM, nSMs * maxBlocksSM); + fprintf(stderr, "Target nBlocks per kernel is %i\n", N_TASKS_BUNDLE_SELF*nPartsPerCell/BLOCK_SIZE); + fprintf(stderr, "Target nBlocks per stream is %i\n", N_TASKS_PER_PACK_SELF*nPartsPerCell/BLOCK_SIZE); + } + if(nDevices == 1)cudaSetDevice(devId); +#ifndef WITH_MPI + else{ + cudaSetDevice(devId); + } +#endif +#ifdef WITH_MPI + else{ + cudaSetDevice(mpi_rank * 2); + fprintf(stderr, "%i devices available device id is %i\n", nDevices, mpi_rank * 2); + } +#endif + fprintf(stderr, "after dev select engine_rank %i rank %i\n", engine_rank, mpi_rank); + + cudaError_t cu_error; + // how many tasks do we want for each launch of GPU kernel + // fprintf(stderr,"pack_size is %i\n", sched->pack_size); + const int target_n_tasks = sched->pack_size; + const int target_n_tasks_pair = sched->pack_size_pair; + pack_vars_self_dens->target_n_tasks = 
target_n_tasks; + pack_vars_pair_dens->target_n_tasks = target_n_tasks_pair; + pack_vars_self_forc->target_n_tasks = target_n_tasks; + pack_vars_pair_forc->target_n_tasks = target_n_tasks_pair; + pack_vars_self_grad->target_n_tasks = target_n_tasks; + pack_vars_pair_grad->target_n_tasks = target_n_tasks_pair; + // how many tasks we want in each bundle (used for launching kernels in + // different streams) + const int bundle_size = N_TASKS_BUNDLE_SELF; + const int bundle_size_pair = N_TASKS_BUNDLE_PAIR; + + pack_vars_self_dens->bundle_size = bundle_size; + pack_vars_pair_dens->bundle_size = bundle_size_pair; + pack_vars_self_forc->bundle_size = bundle_size; + pack_vars_pair_forc->bundle_size = bundle_size_pair; + pack_vars_self_grad->bundle_size = bundle_size; + pack_vars_pair_grad->bundle_size = bundle_size_pair; +// fprintf(stderr, "size %i size %i\n", sizeof(*pack_vars_self_dens), sizeof(pack_vars_self)); +// const int bundle_size_pair = bundle_size/2; + // Keep track of first and last particles for each task (particle data is + // arranged in long arrays containing particles from all the tasks we will + // work with) + + // Copy of the above residing on the GPU + int *d_task_first_part_self_dens, *d_task_last_part_self_dens; + int2 *task_first_part_self_dens_f4; + int2 *task_first_part_f4; + int2 *task_first_part_f4_f; + int2 *task_first_part_f4_g; + int2 *d_task_first_part_f4; + int2 *d_task_first_part_f4_f; + int2 *d_task_first_part_f4_g; + int *d_task_first_part_self_forc, *d_task_last_part_self_forc; + int *d_task_first_part_self_grad, *d_task_last_part_self_grad; + int *d_task_first_parts_pair_dens, *d_task_last_parts_pair_dens; + + int4 *fparti_fpartj_lparti_lpartj_dens; + int4 *fparti_fpartj_lparti_lpartj_forc, *d_fparti_fpartj_lparti_lpartj_forc; + int4 *fparti_fpartj_lparti_lpartj_grad, *d_fparti_fpartj_lparti_lpartj_grad; + + int *d_task_first_parts_pair_forc, *d_task_last_parts_pair_forc; + int *d_task_first_parts_pair_grad, *d_task_last_parts_pair_grad; + + cudaMallocManaged((void**)&task_first_part_self_dens_f4, + target_n_tasks * sizeof(int2), cudaMemAttachGlobal); + cudaMallocHost((void**)&task_first_part_f4, + target_n_tasks * sizeof(int2)); + cudaMalloc((void**)&d_task_first_part_f4, + target_n_tasks * sizeof(int2)); + cudaMallocHost((void**)&task_first_part_f4_f, + target_n_tasks * sizeof(int2)); + cudaMalloc((void**)&d_task_first_part_f4_f, + target_n_tasks * sizeof(int2)); + cudaMallocHost((void**)&task_first_part_f4_g, + target_n_tasks * sizeof(int2)); + cudaMalloc((void**)&d_task_first_part_f4_g, + target_n_tasks * sizeof(int2)); + + cudaMallocHost((void**)&fparti_fpartj_lparti_lpartj_dens, + target_n_tasks * sizeof(int4)); +// cudaMalloc((void**)&d_fparti_fpartj_lparti_lpartj_dens, +// target_n_tasks * sizeof(int4)); + + cudaMallocHost((void**)&fparti_fpartj_lparti_lpartj_forc, + target_n_tasks * sizeof(int4)); + cudaMalloc((void**)&d_fparti_fpartj_lparti_lpartj_forc, + target_n_tasks * sizeof(int4)); + + cudaMallocHost((void**)&fparti_fpartj_lparti_lpartj_grad, + target_n_tasks * sizeof(int4)); + cudaMalloc((void**)&d_fparti_fpartj_lparti_lpartj_grad, + target_n_tasks * sizeof(int4)); + + +// cudaMallocManaged((void**)&d_task_last_part_self_dens_f4, +// target_n_tasks * sizeof(int), cudaMemAttachGlobal); + + // Arrays keeping track of the row numbers of the first and last particles + // within each bundle. 
Required by the GPU code + + cudaMallocHost((void **)&pack_vars_self_dens->task_first_part, + target_n_tasks * sizeof(int)); + cudaMallocHost((void **)&pack_vars_self_dens->task_last_part, + target_n_tasks * sizeof(int)); + + cudaMallocHost((void **)&pack_vars_pair_dens->task_first_part, + 2 * target_n_tasks * sizeof(int)); + cudaMallocHost((void **)&pack_vars_pair_dens->task_last_part, + 2 * target_n_tasks * sizeof(int)); + + cudaMallocHost((void **)&pack_vars_self_forc->task_first_part, + target_n_tasks * sizeof(int)); + cudaMallocHost((void **)&pack_vars_self_forc->task_last_part, + target_n_tasks * sizeof(int)); + + cudaMallocHost((void **)&pack_vars_pair_forc->task_first_part, + 2 * target_n_tasks * sizeof(int)); + cudaMallocHost((void **)&pack_vars_pair_forc->task_last_part, + 2 * target_n_tasks * sizeof(int)); + + cudaMallocHost((void **)&pack_vars_self_grad->task_first_part, + target_n_tasks * sizeof(int)); + cudaMallocHost((void **)&pack_vars_self_grad->task_last_part, + target_n_tasks * sizeof(int)); + + cudaMallocHost((void **)&pack_vars_pair_grad->task_first_part, + 2 * target_n_tasks * sizeof(int)); + cudaMallocHost((void **)&pack_vars_pair_grad->task_last_part, + 2 * target_n_tasks * sizeof(int)); + + /* nBundles is the number of task bundles each + thread has ==> Used to loop through bundles */ + int nBundles = (target_n_tasks + bundle_size - 1) / + bundle_size; + int nBundles_pair = (target_n_tasks_pair + bundle_size_pair - 1) / + bundle_size_pair; + + if(r->cpuid == 0){ + fprintf(stderr, "engine_rank %i cpuid %i nBundles/nStreams %i\n", engine_rank, r->cpuid, nBundles); + fprintf(stderr, "nBundles/nStreams Pair %i\n", nBundles_pair); + } + + pack_vars_self_dens->nBundles = nBundles; + pack_vars_pair_dens->nBundles = nBundles_pair; + pack_vars_self_forc->nBundles = nBundles; + pack_vars_pair_forc->nBundles = nBundles_pair; + pack_vars_self_grad->nBundles = nBundles; + pack_vars_pair_grad->nBundles = nBundles_pair; + + // first part and last part are the first and last particle ids (locally + // within this thread) + + cudaMallocHost((void **)&pack_vars_self_dens->bundle_first_part, + nBundles * sizeof(int)); + cudaMallocHost((void **)&pack_vars_self_dens->bundle_last_part, + nBundles * sizeof(int)); + cudaMallocHost((void **)&pack_vars_self_dens->bundle_first_task_list, + nBundles * sizeof(int)); + + cudaMallocHost((void **)&pack_vars_pair_dens->bundle_first_part, + 2 * nBundles * sizeof(int)); + cudaMallocHost((void **)&pack_vars_pair_dens->bundle_last_part, + 2 * nBundles * sizeof(int)); + cudaMallocHost((void **)&pack_vars_pair_dens->bundle_first_task_list, + 2 * nBundles * sizeof(int)); + + cudaMallocHost((void **)&pack_vars_self_forc->bundle_first_part, + nBundles * sizeof(int)); + cudaMallocHost((void **)&pack_vars_self_forc->bundle_last_part, + nBundles * sizeof(int)); + cudaMallocHost((void **)&pack_vars_self_forc->bundle_first_task_list, + nBundles * sizeof(int)); + + cudaMallocHost((void **)&pack_vars_pair_forc->bundle_first_part, + 2 * nBundles * sizeof(int)); + cudaMallocHost((void **)&pack_vars_pair_forc->bundle_last_part, + 2 * nBundles * sizeof(int)); + cudaMallocHost((void **)&pack_vars_pair_forc->bundle_first_task_list, + 2 * nBundles * sizeof(int)); + + cudaMallocHost((void **)&pack_vars_self_grad->bundle_first_part, + nBundles * sizeof(int)); + cudaMallocHost((void **)&pack_vars_self_grad->bundle_last_part, + nBundles * sizeof(int)); + cudaMallocHost((void **)&pack_vars_self_grad->bundle_first_task_list, + nBundles * sizeof(int)); + + cudaMallocHost((void 
**)&pack_vars_pair_grad->bundle_first_part, + 2 * nBundles * sizeof(int)); + cudaMallocHost((void **)&pack_vars_pair_grad->bundle_last_part, + 2 * nBundles * sizeof(int)); + cudaMallocHost((void **)&pack_vars_pair_grad->bundle_first_task_list, + 2 * nBundles * sizeof(int)); + +//These I need to keep///////////////// + cudaMalloc((void **)&d_task_first_part_self_dens, target_n_tasks * sizeof(int)); + cudaMalloc((void **)&d_task_first_part_self_forc, target_n_tasks * sizeof(int)); + cudaMalloc((void **)&d_task_first_part_self_grad, target_n_tasks * sizeof(int)); + + cudaMalloc((void **)&d_task_last_part_self_dens, target_n_tasks * sizeof(int)); + cudaMalloc((void **)&d_task_last_part_self_forc, target_n_tasks * sizeof(int)); + cudaMalloc((void **)&d_task_last_part_self_grad, target_n_tasks * sizeof(int)); + //These I need to keep///////////////// + pack_vars_self_dens->d_task_first_part = d_task_first_part_self_dens; + pack_vars_self_dens->d_task_last_part = d_task_last_part_self_dens; + //These I need to keep///////////////// + pack_vars_self_forc->d_task_first_part = d_task_first_part_self_forc; + pack_vars_self_forc->d_task_last_part = d_task_last_part_self_forc; + //These I need to keep///////////////// + pack_vars_self_grad->d_task_first_part = d_task_first_part_self_grad; + pack_vars_self_grad->d_task_last_part = d_task_last_part_self_grad; + + //These I need to keep///////////////// + cudaMalloc((void **)&d_task_first_parts_pair_dens, 2 * target_n_tasks * sizeof(int)); + cudaMalloc((void **)&d_task_first_parts_pair_forc, 2 * target_n_tasks * sizeof(int)); + cudaMalloc((void **)&d_task_first_parts_pair_grad, 2 * target_n_tasks * sizeof(int)); + + cudaMalloc((void **)&d_task_last_parts_pair_dens, 2 * target_n_tasks * sizeof(int)); + cudaMalloc((void **)&d_task_last_parts_pair_forc, 2 * target_n_tasks * sizeof(int)); + cudaMalloc((void **)&d_task_last_parts_pair_grad, 2 * target_n_tasks * sizeof(int)); + //These I need to keep///////////////// + pack_vars_pair_dens->d_task_first_part = d_task_first_parts_pair_dens; + pack_vars_pair_dens->d_task_last_part = d_task_last_parts_pair_dens; + pack_vars_pair_forc->d_task_first_part = d_task_first_parts_pair_forc; + pack_vars_pair_forc->d_task_last_part = d_task_last_parts_pair_forc; + pack_vars_pair_grad->d_task_first_part = d_task_first_parts_pair_grad; + pack_vars_pair_grad->d_task_last_part = d_task_last_parts_pair_grad; + //cell positions for self tasks REMEMBER to remove CPU copies as these are no longer necessary + double *d_dens_cell_x, *d_dens_cell_y, *d_dens_cell_z; + float3 *d_dens_f3_cell_x; + double *d_grad_cell_x, *d_grad_cell_y, *d_grad_cell_z; + double *d_forc_cell_x, *d_forc_cell_y, *d_forc_cell_z; + //Shifts for pair tasks REMEMBER to remove CPU copies as these are no longer necessary + double *d_dens_shift_x, *d_dens_shift_y, *d_dens_shift_z; + double *d_grad_shift_x, *d_grad_shift_y, *d_grad_shift_z; + double *d_forc_shift_x, *d_forc_shift_y, *d_forc_shift_z; + + //These I need to keep///////////////// + cudaMalloc((void **)&d_dens_cell_x, target_n_tasks * sizeof(double)); + cudaMalloc((void **)&d_dens_cell_y, target_n_tasks * sizeof(double)); + cudaMalloc((void **)&d_dens_cell_z, target_n_tasks * sizeof(double)); + + cudaMalloc((void **)&d_dens_f3_cell_x, target_n_tasks * sizeof(float3)); + + cudaMalloc((void **)&d_forc_cell_x, target_n_tasks * sizeof(double)); + cudaMalloc((void **)&d_forc_cell_y, target_n_tasks * sizeof(double)); + cudaMalloc((void **)&d_forc_cell_z, target_n_tasks * sizeof(double)); + + cudaMalloc((void 
**)&d_grad_cell_x, target_n_tasks * sizeof(double)); + cudaMalloc((void **)&d_grad_cell_y, target_n_tasks * sizeof(double)); + cudaMalloc((void **)&d_grad_cell_z, target_n_tasks * sizeof(double)); + + cudaMalloc((void **)&d_dens_shift_x, 2 * target_n_tasks * sizeof(double)); + cudaMalloc((void **)&d_dens_shift_y, 2 * target_n_tasks * sizeof(double)); + cudaMalloc((void **)&d_dens_shift_z, 2 * target_n_tasks * sizeof(double)); + + cudaMalloc((void **)&d_forc_shift_x, 2 * target_n_tasks * sizeof(double)); + cudaMalloc((void **)&d_forc_shift_y, 2 * target_n_tasks * sizeof(double)); + cudaMalloc((void **)&d_forc_shift_z, 2 * target_n_tasks * sizeof(double)); + + cudaMalloc((void **)&d_grad_shift_x, 2 * target_n_tasks * sizeof(double)); + cudaMalloc((void **)&d_grad_shift_y, 2 * target_n_tasks * sizeof(double)); + cudaMalloc((void **)&d_grad_shift_z, 2 * target_n_tasks * sizeof(double)); + //These I need to keep///////////////// + + cudaMallocHost((void **)&pack_vars_self_dens->cellx, target_n_tasks * sizeof(double)); // Pinned allocation on host + cudaMallocHost((void **)&pack_vars_self_dens->celly, target_n_tasks * sizeof(double)); // Pinned allocation on host + cudaMallocHost((void **)&pack_vars_self_dens->cellz, target_n_tasks * sizeof(double)); // Pinned allocation on host + + pack_vars_self_dens->d_cellx = d_dens_cell_x; + pack_vars_self_dens->d_celly = d_dens_cell_y; + pack_vars_self_dens->d_cellz = d_dens_cell_z; + + cudaMallocHost((void **)&pack_vars_pair_dens->shiftx, 2 * target_n_tasks * sizeof(double)); // Pinned allocation on host + cudaMallocHost((void **)&pack_vars_pair_dens->shifty, 2 * target_n_tasks * sizeof(double)); // Pinned allocation on host + cudaMallocHost((void **)&pack_vars_pair_dens->shiftz, 2 * target_n_tasks * sizeof(double)); // Pinned allocation on host + + pack_vars_pair_dens->d_shiftx = d_dens_shift_x; + pack_vars_pair_dens->d_shifty = d_dens_shift_y; + pack_vars_pair_dens->d_shiftz = d_dens_shift_z; + + cudaMallocHost((void **)&pack_vars_self_forc->cellx, target_n_tasks * sizeof(double)); // Pinned allocation on host + cudaMallocHost((void **)&pack_vars_self_forc->celly, target_n_tasks * sizeof(double)); // Pinned allocation on host + cudaMallocHost((void **)&pack_vars_self_forc->cellz, target_n_tasks * sizeof(double)); // Pinned allocation on host + + pack_vars_self_forc->d_cellx = d_forc_cell_x; + pack_vars_self_forc->d_celly = d_forc_cell_y; + pack_vars_self_forc->d_cellz = d_forc_cell_z; + + cudaMallocHost((void **)&pack_vars_pair_forc->shiftx, 2 * target_n_tasks * sizeof(double)); // Pinned allocation on host + cudaMallocHost((void **)&pack_vars_pair_forc->shifty, 2 * target_n_tasks * sizeof(double)); // Pinned allocation on host + cudaMallocHost((void **)&pack_vars_pair_forc->shiftz, 2 * target_n_tasks * sizeof(double)); // Pinned allocation on host + + pack_vars_pair_forc->d_shiftx = d_forc_shift_x; + pack_vars_pair_forc->d_shifty = d_forc_shift_y; + pack_vars_pair_forc->d_shiftz = d_forc_shift_z; + + cudaMallocHost((void **)&pack_vars_self_grad->cellx, target_n_tasks * sizeof(double)); // Pinned allocation on host + cudaMallocHost((void **)&pack_vars_self_grad->celly, target_n_tasks * sizeof(double)); // Pinned allocation on host + cudaMallocHost((void **)&pack_vars_self_grad->cellz, target_n_tasks * sizeof(double)); // Pinned allocation on host + + pack_vars_self_grad->d_cellx = d_grad_cell_x; + pack_vars_self_grad->d_celly = d_grad_cell_y; + pack_vars_self_grad->d_cellz = d_grad_cell_z; + + cudaMallocHost((void **)&pack_vars_pair_grad->shiftx, 2 * 
target_n_tasks * sizeof(double)); // Pinned allocation on host + cudaMallocHost((void **)&pack_vars_pair_grad->shifty, 2 * target_n_tasks * sizeof(double)); // Pinned allocation on host + cudaMallocHost((void **)&pack_vars_pair_grad->shiftz, 2 * target_n_tasks * sizeof(double)); // Pinned allocation on host + + pack_vars_pair_grad->d_shiftx = d_grad_shift_x; + pack_vars_pair_grad->d_shifty = d_grad_shift_y; + pack_vars_pair_grad->d_shiftz = d_grad_shift_z; + + cudaStream_t stream[nBundles]; + cudaStream_t stream_pairs[nBundles_pair]; + + cudaEvent_t self_end[nBundles]; + for (int i =0; i < nBundles; i++) + cudaEventCreate(&self_end[i]); + + cudaEvent_t self_end_g[nBundles]; + for (int i =0; i < nBundles; i++) + cudaEventCreate(&self_end_g[i]); + + cudaEvent_t self_end_f[nBundles]; + for (int i =0; i < nBundles; i++) + cudaEventCreate(&self_end_f[i]); + + cudaEvent_t pair_end[nBundles]; + for (int i =0; i < nBundles; i++) + cudaEventCreate(&pair_end[i]); + + cudaEvent_t pair_end_g[nBundles]; + for (int i =0; i < nBundles; i++) + cudaEventCreate(&pair_end_g[i]); + + cudaEvent_t pair_end_f[nBundles]; + for (int i =0; i < nBundles; i++) + cudaEventCreate(&pair_end_f[i]); + + int tasksperbundle = (target_n_tasks + nBundles - 1) / nBundles; + int tasksperbundle_pair = (target_n_tasks_pair + nBundles_pair - 1) / nBundles_pair; + + pack_vars_self_dens->tasksperbundle = tasksperbundle; + pack_vars_pair_dens->tasksperbundle = tasksperbundle_pair; + pack_vars_self_forc->tasksperbundle = tasksperbundle; + pack_vars_pair_forc->tasksperbundle = tasksperbundle_pair; + pack_vars_self_grad->tasksperbundle = tasksperbundle; + pack_vars_pair_grad->tasksperbundle = tasksperbundle_pair; + + + for (int i = 0; i < nBundles; ++i) + cudaStreamCreateWithFlags(&stream[i], cudaStreamNonBlocking); + + for (int i = 0; i < nBundles_pair; ++i) + cudaStreamCreateWithFlags(&stream_pairs[i], cudaStreamNonBlocking); + + pack_vars_self_dens->count_parts = 0; + pack_vars_pair_dens->count_parts = 0; + pack_vars_self_forc->count_parts = 0; + pack_vars_pair_forc->count_parts = 0; + pack_vars_self_grad->count_parts = 0; + pack_vars_pair_grad->count_parts = 0; + + /*Estimate how many particles to pack for GPU for each GPU launch + * instruction*/ + int nr_nodes = 1, res = 0; +#ifdef WITH_MPI + if ((res = MPI_Comm_size(MPI_COMM_WORLD, &nr_nodes)) != MPI_SUCCESS) + error("MPI_Comm_size failed with error %i.", res); +#endif + int count_max_parts_tmp = + 2 * target_n_tasks * space->nr_parts * nr_nodes / + space->nr_cells; + pack_vars_self_dens->count_max_parts = count_max_parts_tmp; + pack_vars_pair_dens->count_max_parts = count_max_parts_tmp; + pack_vars_self_forc->count_max_parts = count_max_parts_tmp; + pack_vars_pair_forc->count_max_parts = count_max_parts_tmp; + pack_vars_self_grad->count_max_parts = count_max_parts_tmp; + pack_vars_pair_grad->count_max_parts = count_max_parts_tmp; + + struct part_aos *parts_aos_dens; + struct part_aos_f4 *parts_aos_dens_f4; + struct part_aos_f4_send *parts_aos_f4_send; + struct part_aos_f4_recv *parts_aos_f4_recv; + + struct part_aos_f *parts_aos_forc; + struct part_aos_f4_f *parts_aos_forc_f4; + struct part_aos_f4_f_send *parts_aos_forc_f4_send; + struct part_aos_f4_f_recv *parts_aos_forc_f4_recv; + + struct part_aos_g *parts_aos_grad; + struct part_aos_f4_g *parts_aos_grad_f4; + struct part_aos_f4_g_send *parts_aos_grad_f4_send; + struct part_aos_f4_g_recv *parts_aos_grad_f4_recv; + + struct part_aos *d_parts_aos_dens; + struct part_aos_f4 *d_parts_aos_dens_f4; + struct part_aos_f4_send 
*d_parts_aos_f4_send; + struct part_aos_f4_recv *d_parts_aos_f4_recv; + + struct part_aos_f *d_parts_aos_forc; + struct part_aos_f4_f *d_parts_aos_forc_f4; + struct part_aos_f4_f_send *d_parts_aos_forc_f4_send; + struct part_aos_f4_f_recv *d_parts_aos_forc_f4_recv; + + struct part_aos_g *d_parts_aos_grad; + struct part_aos_f4_g *d_parts_aos_grad_f4; + struct part_aos_f4_g_send *d_parts_aos_grad_f4_send; + struct part_aos_f4_g_recv *d_parts_aos_grad_f4_recv; + + struct part_aos *parts_aos_pair_dens; + struct part_aos_f4_send *parts_aos_pair_f4_send; + struct part_aos_f4_recv *parts_aos_pair_f4_recv; + + struct part_aos *d_parts_aos_pair_dens; + struct part_aos_f4_send *d_parts_aos_pair_f4_send; + struct part_aos_f4_recv *d_parts_aos_pair_f4_recv; + + struct part_aos_f *parts_aos_pair_forc; + struct part_aos_f4_f_send *parts_aos_pair_f4_f_send; + struct part_aos_f4_f_recv *parts_aos_pair_f4_f_recv; + + struct part_aos_f *d_parts_aos_pair_forc; + struct part_aos_f4_f_send *d_parts_aos_pair_f4_f_send; + struct part_aos_f4_f_recv *d_parts_aos_pair_f4_f_recv; + + struct part_aos_g *parts_aos_pair_grad; + struct part_aos_f4_g_send *parts_aos_pair_f4_g_send; + struct part_aos_f4_g_recv *parts_aos_pair_f4_g_recv; + + struct part_aos_g *d_parts_aos_pair_grad; + struct part_aos_f4_g_send *d_parts_aos_pair_f4_g_send; + struct part_aos_f4_g_recv *d_parts_aos_pair_f4_g_recv; + + + + +// cudaMalloc((void**)&d_parts_aos_dens, count_max_parts_tmp * sizeof(struct part_aos)); + cudaMalloc((void**)&d_parts_aos_f4_send, count_max_parts_tmp * sizeof(struct part_aos_f4_send)); + cudaMalloc((void**)&d_parts_aos_f4_recv, count_max_parts_tmp * sizeof(struct part_aos_f4_recv)); +// cudaMalloc((void**)&d_parts_aos_dens_f4, count_max_parts_tmp * sizeof(struct part_aos_f4)); +// cudaMalloc((void**)&d_parts_aos_forc, count_max_parts_tmp * sizeof(struct part_aos_f)); +// cudaMalloc((void**)&d_parts_aos_forc_f4, count_max_parts_tmp * sizeof(struct part_aos_f4_f)); + cudaMalloc((void**)&d_parts_aos_forc_f4_send, count_max_parts_tmp * sizeof(struct part_aos_f4_f_send)); + cudaMalloc((void**)&d_parts_aos_forc_f4_recv, count_max_parts_tmp * sizeof(struct part_aos_f4_f_recv)); +// cudaMalloc((void**)&d_parts_aos_grad, count_max_parts_tmp * sizeof(struct part_aos_g)); +// cudaMalloc((void**)&d_parts_aos_grad_f4, count_max_parts_tmp * sizeof(struct part_aos_f4_g)); + cudaMalloc((void**)&d_parts_aos_grad_f4_send, count_max_parts_tmp * sizeof(struct part_aos_f4_g_send)); + cudaMalloc((void**)&d_parts_aos_grad_f4_recv, count_max_parts_tmp * sizeof(struct part_aos_f4_g_recv)); + +// cudaMallocHost((void **)&parts_aos_dens, count_max_parts_tmp * sizeof(struct part_aos)); + cudaMallocHost((void **)&parts_aos_f4_send, count_max_parts_tmp * sizeof(struct part_aos_f4_send)); + cudaMallocHost((void **)&parts_aos_f4_recv, count_max_parts_tmp * sizeof(struct part_aos_f4_recv)); +// cudaMallocHost((void **)&parts_aos_dens_f4, count_max_parts_tmp * sizeof(struct part_aos_f4)); +// cudaMallocHost((void **)&parts_aos_forc, count_max_parts_tmp * sizeof(struct part_aos_f)); +// cudaMallocHost((void **)&parts_aos_forc_f4, count_max_parts_tmp * sizeof(struct part_aos_f4_f)); + cudaMallocHost((void **)&parts_aos_forc_f4_send, count_max_parts_tmp * sizeof(struct part_aos_f4_f_send)); + cudaMallocHost((void **)&parts_aos_forc_f4_recv, count_max_parts_tmp * sizeof(struct part_aos_f4_f_recv)); +// cudaMallocHost((void **)&parts_aos_grad, count_max_parts_tmp * sizeof(struct part_aos_g)); +// cudaMallocHost((void **)&parts_aos_grad_f4, count_max_parts_tmp 
* sizeof(struct part_aos_f4_g)); + cudaMallocHost((void **)&parts_aos_grad_f4_send, count_max_parts_tmp * sizeof(struct part_aos_f4_g_send)); + cudaMallocHost((void **)&parts_aos_grad_f4_recv, count_max_parts_tmp * sizeof(struct part_aos_f4_g_recv)); + +// cudaMalloc((void**)&d_parts_aos_pair_dens, 2 * count_max_parts_tmp * sizeof(struct part_aos)); + cudaMalloc((void**)&d_parts_aos_pair_f4_send, 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_send)); + cudaMalloc((void**)&d_parts_aos_pair_f4_recv, 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_recv)); + + cudaMalloc((void**)&d_parts_aos_pair_f4_f_send, 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_f_send)); + cudaMalloc((void**)&d_parts_aos_pair_f4_f_recv, 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_f_recv)); + + cudaMalloc((void**)&d_parts_aos_pair_f4_g_send, 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_g_send)); + cudaMalloc((void**)&d_parts_aos_pair_f4_g_recv, 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_g_recv)); + + +///////////Probably not needed anymore//////////////////////////////////////////////////////////////// + cudaMalloc((void**)&d_parts_aos_pair_forc, 2 * count_max_parts_tmp * sizeof(struct part_aos_f)); + cudaMalloc((void**)&d_parts_aos_pair_grad, 2 * count_max_parts_tmp * sizeof(struct part_aos_g)); + ///////////Probably not needed anymore//////////////////////////////////////////////////////////////// + +// cudaMallocHost((void **)&parts_aos_pair_dens, 2 * count_max_parts_tmp * sizeof(struct part_aos)); + cudaMallocHost((void **)&parts_aos_pair_f4_send, 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_send)); + cudaMallocHost((void **)&parts_aos_pair_f4_recv, 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_recv)); + + cudaMallocHost((void **)&parts_aos_pair_f4_g_send, 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_g_send)); + cudaMallocHost((void **)&parts_aos_pair_f4_g_recv, 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_g_recv)); + + cudaMallocHost((void **)&parts_aos_pair_f4_f_send, 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_f_send)); + cudaMallocHost((void **)&parts_aos_pair_f4_f_recv, 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_f_recv)); + + cudaMallocHost((void **)&parts_aos_pair_forc, 2 * count_max_parts_tmp * sizeof(struct part_aos_f)); + cudaMallocHost((void **)&parts_aos_pair_grad, 2 * count_max_parts_tmp * sizeof(struct part_aos_g)); + + /*Declare some global variables*/ + float d_a = e->cosmology->a; + float d_H = e->cosmology->H; + int step = 0; + + // a list of the cells and tasks the GPU will work on + pack_vars_self_dens->task_list = + (struct task **)calloc(target_n_tasks, sizeof(struct task *)); + pack_vars_self_dens->cell_list = + (struct cell **)calloc(target_n_tasks, sizeof(struct cell *)); + + pack_vars_pair_dens->task_list = + (struct task **)calloc(target_n_tasks, sizeof(struct task *)); + pack_vars_pair_dens->ci_list = + (struct cell **)calloc(target_n_tasks, sizeof(struct cell *)); + pack_vars_pair_dens->cj_list = + (struct cell **)calloc(target_n_tasks, sizeof(struct cell *)); + + pack_vars_self_forc->task_list = + (struct task **)calloc(target_n_tasks, sizeof(struct task *)); + pack_vars_self_forc->cell_list = + (struct cell **)calloc(target_n_tasks, sizeof(struct cell *)); + + pack_vars_pair_forc->task_list = + (struct task **)calloc(target_n_tasks, sizeof(struct task *)); + pack_vars_pair_forc->ci_list = + (struct cell **)calloc(target_n_tasks, sizeof(struct cell *)); + pack_vars_pair_forc->cj_list = + (struct cell 
**)calloc(target_n_tasks, sizeof(struct cell *)); + + pack_vars_self_grad->task_list = + (struct task **)calloc(target_n_tasks, sizeof(struct task *)); + pack_vars_self_grad->cell_list = + (struct cell **)calloc(target_n_tasks, sizeof(struct cell *)); + + pack_vars_pair_grad->task_list = + (struct task **)calloc(target_n_tasks, sizeof(struct task *)); + pack_vars_pair_grad->ci_list = + (struct cell **)calloc(target_n_tasks, sizeof(struct cell *)); + pack_vars_pair_grad->cj_list = + (struct cell **)calloc(target_n_tasks, sizeof(struct cell *)); + + // number of density self tasks executed + int tasks_done_cpu = 0; + int tasks_done_gpu = 0; + int tasks_done_gpu_inc = 0; + + /* Main loop. */ + while (1) { + /*Stuff for debugging*/ + int n_full_d_bundles = 0, n_full_g_bundles = 0, n_full_f_bundles = 0; + int n_full_p_d_bundles = 0, n_full_p_g_bundles = 0, n_full_p_f_bundles = 0; + int n_partial_d_bundles = 0, n_partial_g_bundles = 0, n_partial_f_bundles = 0; + int n_partial_p_d_bundles = 0, n_partial_p_g_bundles = 0, n_partial_p_f_bundles = 0; + int output = 0; + int packed_self = 0; + int packed_pair = 0; + int packed_self_f = 0; + int packed_pair_f = 0; + int packed_self_g = 0; + int packed_pair_g = 0; + int density = 0; + int density_sub = 0; + int unpacked = 0; + int unpacked_f = 0; + int unpacked_g = 0; + int unpacked_pair = 0; + int unpacked_pair_f = 0; + int unpacked_pair_g = 0; + int ghost_in = 0; + int cpu_self = 0; + int cpu_self_f = 0; + int cpu_self_g = 0; + int cpu_pair = 0; + int cpu_pair_f = 0; + int cpu_pair_g = 0; + // Initialise timers to zero + double time_for_density_cpu = 0.0; + double time_for_density_cpu_pair = 0.0; + double time_for_cpu_g = 0.0; + double time_for_cpu_pair_g = 0.0; + double time_for_cpu_f = 0.0; + double time_for_cpu_pair_f = 0.0; + double time_for_density_cpu_sub = 0.0; + double time_for_density_gpu = 0.0; + double time_for_density_gpu_pair = 0.0; + double time_for_gpu_f = 0.0; + double time_for_gpu_pair_f = 0.0; + double time_for_gpu_g = 0.0; + double time_for_gpu_pair_g = 0.0; + double unpack_time_self_g = 0.0; + double unpack_time_self_f = 0.0; + double unpack_time_self = 0.0; + double time_for_gpu_pair = 0.0; + int nr_cells = space->nr_cells; + /* Wait at the barrier. */ + engine_barrier(e); + // Initialise packing counters + pack_vars_self_dens->tasks_packed = 0; + pack_vars_pair_dens->tasks_packed = 0; + pack_vars_self_dens->count_parts = 0; + pack_vars_pair_dens->count_parts = 0; + // Initialise packing counters + pack_vars_self_forc->tasks_packed = 0; + pack_vars_pair_forc->tasks_packed = 0; + pack_vars_self_forc->count_parts = 0; + pack_vars_pair_forc->count_parts = 0; + // Initialise packing counters + pack_vars_self_grad->tasks_packed = 0; + pack_vars_pair_grad->tasks_packed = 0; + pack_vars_self_grad->count_parts = 0; + pack_vars_pair_grad->count_parts = 0; + + int total_tasks_packed_this_time_pair = 0; + double packing_time = 0.0; + double packing_time_f = 0.0; + double packing_time_g = 0.0; + double unpacking_time = 0.0; + double unpacking_time_f = 0.0; + double unpacking_time_g = 0.0; + double packing_time_pair = 0.0; + double packing_time_pair_f = 0.0; + double packing_time_pair_g = 0.0; + double unpacking_time_pair = 0.0; + double unpacking_time_pair_f = 0.0; + double unpacking_time_pair_g = 0.0; + double time_for_copy_to_struct = 0.0; + double tot_time_for_hard_memcpys = 0.0; + /* Can we go home yet? */ + if (e->step_props & engine_step_prop_done) + break; + /* Re-set the pointer to the previous task, as there is none. 
*/ + struct task *t = NULL; + struct task *prev = NULL; + int zeropacks = 0; + int lesspacks = 0; + /*Some bits for output in case of debug*/ + char buf5[20]; + snprintf(buf5, sizeof(buf5), "t%dr%dstep%d", r->cpuid, engine_rank, step); + + FILE *fgpu_steps; +// if(step == 0 || step%10 == 0)fgpu_steps = fopen(buf5, "w"); + fgpu_steps = fopen(buf5, "w"); +// if (step == 0) cudaProfilerStart(); + step++; + + sched->nr_packs_self_dens_done = 0; + sched->nr_packs_pair_dens_done = 0; + sched->nr_packs_self_forc_done = 0; + sched->nr_packs_pair_forc_done = 0; + sched->nr_packs_self_grad_done = 0; + sched->nr_packs_pair_grad_done = 0; + + /* Loop while there are tasks... */ + tasks_done_gpu_inc = 0; + while (1) { + + /* If there's no old task, try to get a new one. */ + if (t == NULL) { + /* Get the task. */ + TIMER_TIC + t = scheduler_gettask(sched, r->qid, prev); + TIMER_TOC(timer_gettask); + /* Did I get anything? */ + if (t == NULL) + break; + } + + /* Get the cells. */ + struct cell *ci = t->ci; + struct cell *cj = t->cj; + +#ifdef SWIFT_DEBUG_TASKS + /* Mark the thread we run on */ + t->rid = r->cpuid; + + /* And recover the pair direction */ + if (t->type == task_type_pair || t->type == task_type_sub_pair) { + struct cell *ci_temp = ci; + struct cell *cj_temp = cj; + double shift[3]; + t->sid = space_getsid(e->s, &ci_temp, &cj_temp, shift); + } else { + t->sid = -1; + } +#endif + + +#ifdef SWIFT_DEBUG_CHECKS + /* Check that we haven't scheduled an inactive task */ + t->ti_run = e->ti_current; + /* Store the task that will be running (for debugging only) */ + r->t = t; +#endif + /* Different types of tasks... */ + switch (t->type) { + case task_type_self: + if (t->subtype == task_subtype_gpu_unpack) { + unpacked++; + } else if (t->subtype == task_subtype_gpu_unpack_g) { + unpacked_g++; + } else if (t->subtype == task_subtype_gpu_unpack_f) { + unpacked_f++; + } else if (t->subtype == task_subtype_density) { + cpu_self++; +#ifndef GPUOFFLOAD + struct timespec t0, t1, dt; + clock_gettime(CLOCK_REALTIME, &t0); + runner_doself1_branch_density(r, ci); + clock_gettime(CLOCK_REALTIME, &t1); + tasks_done_cpu++; + time_for_density_cpu += + (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / + 1000000000.0; + density++; +#endif + + /* GPU WORK */ + } else if (t->subtype == task_subtype_gpu_pack) { + packed_self++; +#ifdef GPUOFFLOAD +// struct timespec t0, t1; // +// clock_gettime(CLOCK_REALTIME, &t0); + packing_time += runner_doself1_pack_f4(r, sched, pack_vars_self_dens, ci, + t, parts_aos_f4_send, task_first_part_f4); +// clock_gettime(CLOCK_REALTIME, &t1); +// packing_time += (t1.tv_sec - t0.tv_sec) + +// (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +// runner_doself1_pack(r, sched, pack_vars_self_dens, ci, +// t, parts_aos_dens, &packing_time); + /* No pack tasks left in queue, flag that we want to run */ + int launch_leftovers = pack_vars_self_dens->launch_leftovers; + /*Packed enough tasks let's go*/ + int launch = pack_vars_self_dens->launch; + /* Do we have enough stuff to run the GPU ? 
*/ + if(launch)n_full_d_bundles++; + if(launch_leftovers)n_partial_d_bundles++; + if (launch || launch_leftovers) { + /*Launch GPU tasks*/ + runner_doself1_launch_f4(r, sched, pack_vars_self_dens, ci, t, parts_aos_f4_send, parts_aos_f4_recv, + d_parts_aos_f4_send, d_parts_aos_f4_recv, stream, d_a, d_H, e, &packing_time, &time_for_density_gpu, + &unpack_time_self, task_first_part_self_dens_f4, devId, + task_first_part_f4, d_task_first_part_f4, self_end); +// runner_doself1_launch(r, sched, pack_vars_self_dens, ci, t, parts_aos_dens, +// d_parts_aos_dens, stream, d_a, d_H, e, &packing_time, &time_for_density_gpu, +// &tot_time_for_hard_memcpys); + } /*End of GPU work Self*/ +#endif //GPUDENSSELF + } /* self / pack */ + else if (t->subtype == task_subtype_gpu_pack_g){ + packed_self_g++; +#ifdef GPUOFFLOAD +// runner_doself1_pack_g(r, sched, pack_vars_self_grad, ci, +// t, parts_aos_grad, &packing_time_g); + packing_time_g += runner_doself1_pack_f4_g(r, sched, pack_vars_self_grad, ci, + t, parts_aos_grad_f4_send, task_first_part_f4_g); + /* No pack tasks left in queue, flag that we want to run */ + int launch_leftovers = pack_vars_self_grad->launch_leftovers; + /*Packed enough tasks let's go*/ + int launch = pack_vars_self_grad->launch; + /* Do we have enough stuff to run the GPU ? */ + if (launch || launch_leftovers) { + /*Launch GPU tasks*/ +// runner_doself1_launch_g(r, sched, pack_vars_self_grad, ci, t, parts_aos_grad, +// d_parts_aos_grad, stream, d_a, d_H, e, &packing_time_g, &time_for_gpu_g); + runner_doself1_launch_f4_g(r, sched, pack_vars_self_grad, ci, + t, parts_aos_grad_f4_send, parts_aos_grad_f4_recv, d_parts_aos_grad_f4_send, + d_parts_aos_grad_f4_recv, stream, d_a, d_H, + e, &packing_time_g, &time_for_gpu_g, task_first_part_f4_g, d_task_first_part_f4_g, + self_end_g, &unpack_time_self_g); + } /*End of GPU work Self*/ +#endif //GPUGRADSELF + } + else if (t->subtype == task_subtype_gpu_pack_f){ + packed_self_f++; +#ifdef GPUOFFLOAD +// runner_doself1_pack_f(r, sched, pack_vars_self_forc, ci, +// t, parts_aos_forc, &packing_time_f); + packing_time_f += runner_doself1_pack_f4_f(r, sched, pack_vars_self_forc, ci, + t, parts_aos_forc_f4_send, task_first_part_f4_f); +// int count = ci->hydro.count; +// for(int i = 0; i < count; i++){ +// int pid = pack_vars_self_forc->count_parts - count + i; +// if(parts_aos_forc_f4_send[pid].ux_m.w < 1e-9)fprintf(stderr, "zero mass after packing %i %f\n", pid, parts_aos_forc_f4_send[pid].ux_m.w); +// } + /* No pack tasks left in queue, flag that we want to run */ + int launch_leftovers = pack_vars_self_forc->launch_leftovers; + /*Packed enough tasks let's go*/ + int launch = pack_vars_self_forc->launch; + /* Do we have enough stuff to run the GPU ? 
*/ + if (launch || launch_leftovers) { + /*Launch GPU tasks*/ +// runner_doself1_launch_f(r, sched, pack_vars_self_forc, ci, t, parts_aos_forc, +// d_parts_aos_forc, stream, d_a, d_H, e, &packing_time_f, &time_for_gpu_f); + runner_doself1_launch_f4_f(r, sched, pack_vars_self_forc, ci, + t, parts_aos_forc_f4_send, parts_aos_forc_f4_recv, d_parts_aos_forc_f4_send, + d_parts_aos_forc_f4_recv, stream, d_a, d_H, + e, &packing_time_f, &time_for_gpu_f, task_first_part_f4_f, d_task_first_part_f4_f, + self_end_f, &unpack_time_self_f); + } /*End of GPU work Self*/ +#endif //GPUFORCSELF + } +#ifdef EXTRA_HYDRO_LOOP + else if (t->subtype == task_subtype_gradient) { + cpu_self_g++; +#ifndef GPUOFFLOAD + struct timespec t0, t1, dt; + clock_gettime(CLOCK_REALTIME, &t0); + runner_doself1_branch_gradient(r, ci); + clock_gettime(CLOCK_REALTIME, &t1); + tasks_done_cpu++; + time_for_cpu_g += + (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / + 1000000000.0; +#endif //GPUGRADSELF + } +#endif + else if (t->subtype == task_subtype_force) { + cpu_self_f++; +#ifndef GPUOFFLOAD + struct timespec t0, t1; + clock_gettime(CLOCK_REALTIME, &t0); + runner_doself2_branch_force(r, ci); + clock_gettime(CLOCK_REALTIME, &t1); + tasks_done_cpu++; + time_for_cpu_f += + (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / + 1000000000.0; +#endif //GPUFORCSELF + } else if (t->subtype == task_subtype_limiter) + runner_doself1_branch_limiter(r, ci); + else if (t->subtype == task_subtype_grav) + runner_doself_recursive_grav(r, ci, 1); + else if (t->subtype == task_subtype_external_grav) + runner_do_grav_external(r, ci, 1); + else if (t->subtype == task_subtype_stars_density) + runner_doself_branch_stars_density(r, ci); + else if (t->subtype == task_subtype_stars_feedback) + runner_doself_branch_stars_feedback(r, ci); + else if (t->subtype == task_subtype_bh_density) + runner_doself_branch_bh_density(r, ci); + else if (t->subtype == task_subtype_bh_swallow) + runner_doself_branch_bh_swallow(r, ci); + else if (t->subtype == task_subtype_do_gas_swallow) + runner_do_gas_swallow_self(r, ci, 1); + else if (t->subtype == task_subtype_do_bh_swallow) + runner_do_bh_swallow_self(r, ci, 1); + else if (t->subtype == task_subtype_bh_feedback) + runner_doself_branch_bh_feedback(r, ci); + else if (t->subtype == task_subtype_rt_inject) + runner_doself_branch_rt_inject(r, ci, 1); + else if (t->subtype == task_subtype_sink_compute_formation) + runner_doself_branch_sinks_compute_formation(r, ci); + else + error("Self Unknown/invalid task subtype (%s).", + subtaskID_names[t->subtype]); + break; + + case task_type_pair: + if (t->subtype == task_subtype_density) { + /* Abouzied: To be commented out when the GPU pairs have been coded up */ + cpu_pair++; +#ifndef GPUOFFLOAD + struct timespec t0, t1, dt; + clock_gettime(CLOCK_REALTIME, &t0); + runner_dopair1_branch_density(r, ci, cj); + clock_gettime(CLOCK_REALTIME, &t1); + tasks_done_cpu++; + time_for_density_cpu_pair += + (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / + 1000000000.0; +#endif + } + /* GPU WORK */ + else if (t->subtype == task_subtype_gpu_pack) { + packed_pair++; +#ifdef GPUOFFLOAD +#ifdef DO_CORNERS + struct timespec t0, t1, dt; + clock_gettime(CLOCK_REALTIME, &t0); + double shift[3] = {0.0}; + t->corner_pair = 0; + int sid = space_getsid_filter(e->s, &ci, &cj, shift); + clock_gettime(CLOCK_REALTIME, &t1); + packing_time_pair += + (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / + 1000000000.0; + if((sid == 0 || sid == 2 || sid == 6 || sid == 8) && step > 1){ +// 
if((sid != 4 && sid != 10 && sid == 12) && step > 1){ + clock_gettime(CLOCK_REALTIME, &t0); + runner_dopair1_branch_density(r, ci, cj); + t->corner_pair = 1; + int qid = r->qid; + atomic_dec(&(sched->queues[qid].n_packs_pair_left)); + /* Tell the cells they have been packed */ + ci->pack_done++; + cj->pack_done++; + t->done = 1; + int launch = 0, launch_leftovers = 0; + if ((sched->queues[qid].n_packs_pair_left == 0)) + launch_leftovers = 1; + /* Tasks done. Release the lock ! */ + task_unlock(t); + /*schedule my dependencies (Only unpacks really)*/ + enqueue_dependencies(sched, t); + /*Signal sleeping runners*/ + signal_sleeping_runners(sched, t); + clock_gettime(CLOCK_REALTIME, &t1); + packing_time_pair += + (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / + 1000000000.0; + if (launch_leftovers) { + pack_vars_pair_dens->launch_leftovers = 1; + runner_dopair1_launch_f4_one_memcpy(r, sched, pack_vars_pair_dens, t, parts_aos_pair_f4_send, parts_aos_pair_f4_recv, + d_parts_aos_pair_f4_send, d_parts_aos_pair_f4_recv, stream_pairs, d_a, d_H, e, &packing_time_pair, &time_for_density_gpu_pair, + &unpacking_time_pair, fparti_fpartj_lparti_lpartj_dens, pair_end); + } + } + else{ +#endif //DO_CORNERS + packing_time_pair += runner_dopair1_pack_f4(r, sched, pack_vars_pair_dens, ci, + cj, t, parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens); + /* Packed enough tasks or no pack tasks left in queue, flag that we want to run */ + int launch = pack_vars_pair_dens->launch; + int launch_leftovers = pack_vars_pair_dens->launch_leftovers; + /* Do we have enough stuff to run the GPU ? */ + if(launch)n_full_p_d_bundles++; + if(launch_leftovers)n_partial_p_d_bundles++; + if(launch || launch_leftovers) { + /*Launch GPU tasks*/ +// runner_dopair1_launch(r, sched, pack_vars_pair_dens, ci, t, parts_aos_pair_dens, +// d_parts_aos_pair_dens, stream, d_a, d_H, e, &packing_time_pair, &time_for_density_gpu_pair); + runner_dopair1_launch_f4_one_memcpy(r, sched, pack_vars_pair_dens, t, parts_aos_pair_f4_send, parts_aos_pair_f4_recv, + d_parts_aos_pair_f4_send, d_parts_aos_pair_f4_recv, stream_pairs, d_a, d_H, e, &packing_time_pair, &time_for_density_gpu_pair, + &unpacking_time_pair, fparti_fpartj_lparti_lpartj_dens, pair_end); + } +#ifdef DO_CORNERS + } /* End of GPU work Pairs */ +#endif //DO_CORNERS +#endif //GPUDENS + } /* pair / pack */ + else if (t->subtype == task_subtype_gpu_pack_g){ + packed_pair_g++; +#ifdef GPUOFFLOAD +#ifdef DO_CORNERS + struct timespec t0, t1, dt; + clock_gettime(CLOCK_REALTIME, &t0); + double shift[3] = {0.0}; + t->corner_pair = 0; + int sid = space_getsid_filter(e->s, &ci, &cj, shift); + clock_gettime(CLOCK_REALTIME, &t1); + packing_time_pair += + (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / + 1000000000.0; + if((sid == 0 || sid == 2 || sid == 6 || sid == 8) && step > 1){ +// if((sid != 4 && sid != 10 && sid == 12) && step > 1){ + clock_gettime(CLOCK_REALTIME, &t0); + runner_dopair1_branch_gradient(r, ci, cj); + t->corner_pair = 1; + int qid = r->qid; + atomic_dec(&(sched->queues[qid].n_packs_pair_left_g)); + /* Tell the cells they have been packed */ + ci->pack_done++; + cj->pack_done++; + t->done = 1; + int launch = 0, launch_leftovers = 0; + if ((sched->queues[qid].n_packs_pair_left_g == 0)) + launch_leftovers = 1; + /* Tasks done. Release the lock ! 
*/ + task_unlock(t); + /*schedule my dependencies (Only unpacks really)*/ + enqueue_dependencies(sched, t); + /*Signal sleeping runners*/ + signal_sleeping_runners(sched, t); + clock_gettime(CLOCK_REALTIME, &t1); + packing_time_pair_g += + (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / + 1000000000.0; + if (launch_leftovers) { + pack_vars_pair_grad->launch_leftovers = 1; + runner_dopair1_launch_f4_g_one_memcpy(r, sched, pack_vars_pair_grad, t, parts_aos_pair_f4_g_send, parts_aos_pair_f4_g_recv, + d_parts_aos_pair_f4_g_send, d_parts_aos_pair_f4_g_recv, stream_pairs, d_a, d_H, e, &packing_time_pair_g, &time_for_gpu_pair_g, + &unpacking_time_pair_g, fparti_fpartj_lparti_lpartj_grad, pair_end_g); + } + } + else{ +#endif //DO_CORNERS +// runner_dopair1_pack_g(r, sched, pack_vars_pair_grad, ci, +// cj, t, parts_aos_pair_grad, e, &packing_time_g); + packing_time_pair_g += runner_dopair1_pack_f4_g(r, sched, pack_vars_pair_grad, ci, + cj, t, parts_aos_pair_f4_g_send, e, fparti_fpartj_lparti_lpartj_grad); + /* No pack tasks left in queue, flag that we want to run */ + int launch_leftovers = pack_vars_pair_grad->launch_leftovers; + /*Packed enough tasks let's go*/ + int launch = pack_vars_pair_grad->launch; + /* Do we have enough stuff to run the GPU ? */ + if (launch || launch_leftovers) { + /*Launch GPU tasks*/ +// runner_dopair1_launch_g(r, sched, pack_vars_pair_grad, ci, t, parts_aos_pair_grad, +// d_parts_aos_pair_grad, stream, d_a, d_H, e, &packing_time_pair_g, &time_for_gpu_pair_g); + runner_dopair1_launch_f4_g_one_memcpy(r, sched, pack_vars_pair_grad, t, parts_aos_pair_f4_g_send, parts_aos_pair_f4_g_recv, + d_parts_aos_pair_f4_g_send, d_parts_aos_pair_f4_g_recv, stream_pairs, d_a, d_H, e, &packing_time_pair_g, &time_for_gpu_pair_g, + &unpacking_time_pair_g, fparti_fpartj_lparti_lpartj_grad, pair_end_g); + } +#ifdef DO_CORNERS + }/* End of GPU work Pairs */ +#endif //DO_CORNERS +#endif //GPUGRADPAIR + } + else if (t->subtype == task_subtype_gpu_pack_f){ + packed_pair_f++; +#ifdef GPUOFFLOAD +#ifdef DO_CORNERS + struct timespec t0, t1, dt; + clock_gettime(CLOCK_REALTIME, &t0); + double shift[3] = {0.0}; + t->corner_pair = 0; + int sid = space_getsid_filter(e->s, &ci, &cj, shift); + clock_gettime(CLOCK_REALTIME, &t1); + packing_time_pair += + (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / + 1000000000.0; + if((sid == 0 || sid == 2 || sid == 6 || sid == 8) && step > 1){ +// if((sid != 4 && sid != 10 && sid == 12) && step > 1){ + runner_dopair1_branch_force(r, ci, cj); + t->corner_pair = 1; + int qid = r->qid; + atomic_dec(&(sched->queues[qid].n_packs_pair_left_f)); + /* Tell the cells they have been packed */ + ci->pack_done++; + cj->pack_done++; + t->done = 1; + int launch = 0, launch_leftovers = 0; + if ((sched->queues[qid].n_packs_pair_left_f == 0)) + launch_leftovers = 1; + /* Tasks done. Release the lock ! 
*/ + task_unlock(t); + /*schedule my dependencies (Only unpacks really)*/ + enqueue_dependencies(sched, t); + /*Signal sleeping runners*/ + signal_sleeping_runners(sched, t); + clock_gettime(CLOCK_REALTIME, &t1); + packing_time_pair_f += + (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / + 1000000000.0; + if (launch_leftovers) { + pack_vars_pair_forc->launch_leftovers = 1; + runner_dopair1_launch_f4_f_one_memcpy(r, sched, pack_vars_pair_forc, t, parts_aos_pair_f4_f_send, parts_aos_pair_f4_f_recv, + d_parts_aos_pair_f4_f_send, d_parts_aos_pair_f4_f_recv, stream_pairs, d_a, d_H, e, &packing_time_pair_f, &time_for_gpu_pair_f, + &unpacking_time_pair_f, fparti_fpartj_lparti_lpartj_forc, pair_end_f); + } + } + else{ +#endif //DO_CORNERS +// runner_dopair1_pack_f(r, sched, pack_vars_pair_forc, ci, +// cj, t, parts_aos_pair_forc, e, &packing_time_f); + packing_time_pair_f += runner_dopair1_pack_f4_f(r, sched, pack_vars_pair_forc, ci, + cj, t, parts_aos_pair_f4_f_send, e, fparti_fpartj_lparti_lpartj_forc); + /* No pack tasks left in queue, flag that we want to run */ + int launch_leftovers = pack_vars_pair_forc->launch_leftovers; + /*Packed enough tasks let's go*/ + int launch = pack_vars_pair_forc->launch; + /* Do we have enough stuff to run the GPU ? */ + if (launch || launch_leftovers) { + /*Launch GPU tasks*/ +// runner_dopair1_launch_f(r, sched, pack_vars_pair_forc, ci, t, parts_aos_pair_forc, +// d_parts_aos_pair_forc, stream, d_a, d_H, e, &packing_time_pair_f, &time_for_gpu_pair_f); + runner_dopair1_launch_f4_f_one_memcpy(r, sched, pack_vars_pair_forc, t, parts_aos_pair_f4_f_send, parts_aos_pair_f4_f_recv, + d_parts_aos_pair_f4_f_send, d_parts_aos_pair_f4_f_recv, stream_pairs, d_a, d_H, e, &packing_time_pair_f, &time_for_gpu_pair_f, + &unpacking_time_pair_f, fparti_fpartj_lparti_lpartj_forc, pair_end_f); + } /* End of GPU work Pairs */ +#ifdef DO_CORNERS + } +#endif //DO_CORNERS +#endif //GPUFORCPAIR + } + else if (t->subtype == task_subtype_gpu_unpack) { + unpacked_pair++; + } + else if (t->subtype == task_subtype_gpu_unpack_g) { + unpacked_pair_g++; + } + else if (t->subtype == task_subtype_gpu_unpack_f) { + unpacked_pair_f++; + } +#ifdef EXTRA_HYDRO_LOOP + else if (t->subtype == task_subtype_gradient){ + int Do_nothing = 0; +#ifndef GPUOFFLOAD + struct timespec t0, t1, dt; + clock_gettime(CLOCK_REALTIME, &t0); + runner_dopair1_branch_gradient(r, ci, cj); + clock_gettime(CLOCK_REALTIME, &t1); + tasks_done_cpu++; + time_for_cpu_pair_g += + (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / + 1000000000.0; +#endif //GPUGRADPAIR + } +#endif //EXTRA_HYDRO_LOOP + else if (t->subtype == task_subtype_force){ + int Do_nothing = 0; +#ifndef GPUOFFLOAD + struct timespec t0, t1, dt; + clock_gettime(CLOCK_REALTIME, &t0); + runner_dopair2_branch_force(r, ci, cj); + clock_gettime(CLOCK_REALTIME, &t1); + tasks_done_cpu++; + time_for_cpu_pair_f += + (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / + 1000000000.0; +#endif //GPUFORCPAIR + } + else if (t->subtype == task_subtype_limiter) + runner_dopair1_branch_limiter(r, ci, cj); + else if (t->subtype == task_subtype_grav) + runner_dopair_recursive_grav(r, ci, cj, 1); + else if (t->subtype == task_subtype_stars_density) + runner_dopair_branch_stars_density(r, ci, cj); + else if (t->subtype == task_subtype_stars_feedback) + runner_dopair_branch_stars_feedback(r, ci, cj); + else if (t->subtype == task_subtype_bh_density) + runner_dopair_branch_bh_density(r, ci, cj); + else if (t->subtype == task_subtype_bh_swallow) + 
runner_dopair_branch_bh_swallow(r, ci, cj); + else if (t->subtype == task_subtype_do_gas_swallow) + runner_do_gas_swallow_pair(r, ci, cj, 1); + else if (t->subtype == task_subtype_do_bh_swallow) + runner_do_bh_swallow_pair(r, ci, cj, 1); + else if (t->subtype == task_subtype_bh_feedback) + runner_dopair_branch_bh_feedback(r, ci, cj); + else if (t->subtype == task_subtype_rt_inject) + runner_dopair_branch_rt_inject(r, ci, cj, 1); + else if (t->subtype == task_subtype_sink_compute_formation) + runner_dopair_branch_sinks_compute_formation(r, ci, cj); + else { + error("Pair Unknown/invalid task subtype (%s/%s).", + taskID_names[t->type], subtaskID_names[t->subtype]); + } + break; + + case task_type_sub_self: + if (t->subtype == task_subtype_density) { + struct timespec t0, t1, dt; + const int count = ci->hydro.count; + density_sub++; + clock_gettime(CLOCK_REALTIME, &t0); + runner_dosub_self1_density(r, ci, 1); + clock_gettime(CLOCK_REALTIME, &t1); + tasks_done_cpu++; + time_for_density_cpu_sub += + (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / + 1000000000.0; + } +#ifdef EXTRA_HYDRO_LOOP + else if (t->subtype == task_subtype_gradient){ + runner_dosub_self1_gradient(r, ci, 1); + fprintf(stderr, "split a g task\n"); + } +#endif + else if (t->subtype == task_subtype_force){ + runner_dosub_self2_force(r, ci, 1); + fprintf(stderr, "split a f task\n"); + } + else if (t->subtype == task_subtype_limiter) + runner_dosub_self1_limiter(r, ci, 1); + else if (t->subtype == task_subtype_stars_density) + runner_dosub_self_stars_density(r, ci, 1); + else if (t->subtype == task_subtype_stars_feedback) + runner_dosub_self_stars_feedback(r, ci, 1); + else if (t->subtype == task_subtype_bh_density) + runner_dosub_self_bh_density(r, ci, 1); + else if (t->subtype == task_subtype_bh_swallow) + runner_dosub_self_bh_swallow(r, ci, 1); + else if (t->subtype == task_subtype_do_gas_swallow) + runner_do_gas_swallow_self(r, ci, 1); + else if (t->subtype == task_subtype_do_bh_swallow) + runner_do_bh_swallow_self(r, ci, 1); + else if (t->subtype == task_subtype_bh_feedback) + runner_dosub_self_bh_feedback(r, ci, 1); + else if (t->subtype == task_subtype_rt_inject) + runner_dosub_self_rt_inject(r, ci, 1); + else if (t->subtype == task_subtype_sink_compute_formation) + runner_dosub_self_sinks_compute_formation(r, ci, 1); + else + error("Sub Self Unknown/invalid task subtype (%s/%s).", + taskID_names[t->type], subtaskID_names[t->subtype]); + break; + + case task_type_sub_pair: + if (t->subtype == task_subtype_density){ + int nothing = 0; + fprintf(stderr,"Doing a pair sub task"); + runner_dosub_pair1_density(r, ci, cj, 1); + } +#ifdef EXTRA_HYDRO_LOOP + else if (t->subtype == task_subtype_gradient){ + runner_dosub_pair1_gradient(r, ci, cj, 1); + fprintf(stderr, "split a g task\n"); + } +#endif + else if (t->subtype == task_subtype_force){ + runner_dosub_pair2_force(r, ci, cj, 1); + fprintf(stderr, "split a f task\n"); + } + else if (t->subtype == task_subtype_limiter) + runner_dosub_pair1_limiter(r, ci, cj, 1); + else if (t->subtype == task_subtype_stars_density) + runner_dosub_pair_stars_density(r, ci, cj, 1); + else if (t->subtype == task_subtype_stars_feedback) + runner_dosub_pair_stars_feedback(r, ci, cj, 1); + else if (t->subtype == task_subtype_bh_density) + runner_dosub_pair_bh_density(r, ci, cj, 1); + else if (t->subtype == task_subtype_bh_swallow) + runner_dosub_pair_bh_swallow(r, ci, cj, 1); + else if (t->subtype == task_subtype_do_gas_swallow) + runner_do_gas_swallow_pair(r, ci, cj, 1); + else if 
(t->subtype == task_subtype_do_bh_swallow) + runner_do_bh_swallow_pair(r, ci, cj, 1); + else if (t->subtype == task_subtype_bh_feedback) + runner_dosub_pair_bh_feedback(r, ci, cj, 1); + else if (t->subtype == task_subtype_rt_inject) + runner_dosub_pair_rt_inject(r, ci, cj, 1); + else if (t->subtype == task_subtype_sink_compute_formation) + runner_dosub_pair_sinks_compute_formation(r, ci, cj, 1); + else + error("Sub Pair Unknown/invalid task subtype (%s/%s).", + taskID_names[t->type], subtaskID_names[t->subtype]); + break; + + case task_type_sort: + /* Cleanup only if any of the indices went stale. */ + runner_do_hydro_sort( + r, ci, t->flags, + ci->hydro.dx_max_sort_old > space_maxreldx * ci->dmin, 1); + /* Reset the sort flags as our work here is done. */ + t->flags = 0; + break; + case task_type_stars_sort: + /* Cleanup only if any of the indices went stale. */ + runner_do_stars_sort( + r, ci, t->flags, + ci->stars.dx_max_sort_old > space_maxreldx * ci->dmin, 1); + /* Reset the sort flags as our work here is done. */ + t->flags = 0; + break; + case task_type_init_grav: + runner_do_init_grav(r, ci, 1); + break; + case task_type_ghost: + runner_do_ghost(r, ci, 1); + break; +#ifdef EXTRA_HYDRO_LOOP + case task_type_extra_ghost: + runner_do_extra_ghost(r, ci, 1); + break; +#endif + case task_type_stars_ghost: + runner_do_stars_ghost(r, ci, 1); + break; + case task_type_bh_density_ghost: + runner_do_black_holes_density_ghost(r, ci, 1); + break; + case task_type_bh_swallow_ghost3: + runner_do_black_holes_swallow_ghost(r, ci, 1); + break; + case task_type_drift_part: + runner_do_drift_part(r, ci, 1); + break; + case task_type_drift_spart: + runner_do_drift_spart(r, ci, 1); + break; + case task_type_drift_sink: + runner_do_drift_sink(r, ci, 1); + break; + case task_type_drift_bpart: + runner_do_drift_bpart(r, ci, 1); + break; + case task_type_drift_gpart: + runner_do_drift_gpart(r, ci, 1); + break; + case task_type_kick1: + runner_do_kick1(r, ci, 1); + break; + case task_type_kick2: + runner_do_kick2(r, ci, 1); + break; + case task_type_end_hydro_force: + runner_do_end_hydro_force(r, ci, 1); + break; + case task_type_end_grav_force: + runner_do_end_grav_force(r, ci, 1); + break; + case task_type_logger: + runner_do_logger(r, ci, 1); + break; + case task_type_timestep: + runner_do_timestep(r, ci, 1); + break; + case task_type_timestep_limiter: + runner_do_limiter(r, ci, 0, 1); + break; + case task_type_timestep_sync: + runner_do_sync(r, ci, 0, 1); + break; +#ifdef WITH_MPI + case task_type_send: + if (t->subtype == task_subtype_tend_part) { + free(t->buff); + } else if (t->subtype == task_subtype_tend_gpart) { + free(t->buff); + } else if (t->subtype == task_subtype_tend_spart) { + free(t->buff); + } else if (t->subtype == task_subtype_tend_bpart) { + free(t->buff); + } else if (t->subtype == task_subtype_sf_counts) { + free(t->buff); + } else if (t->subtype == task_subtype_part_swallow) { + free(t->buff); + } else if (t->subtype == task_subtype_bpart_merger) { + free(t->buff); + } + break; + case task_type_recv: + if (t->subtype == task_subtype_tend_part) { + cell_unpack_end_step_hydro(ci, (struct pcell_step_hydro *)t->buff); + free(t->buff); + } else if (t->subtype == task_subtype_tend_gpart) { + cell_unpack_end_step_grav(ci, (struct pcell_step_grav *)t->buff); + free(t->buff); + } else if (t->subtype == task_subtype_tend_spart) { + cell_unpack_end_step_stars(ci, (struct pcell_step_stars *)t->buff); + free(t->buff); + } else if (t->subtype == task_subtype_tend_bpart) { + 
cell_unpack_end_step_black_holes( + ci, (struct pcell_step_black_holes *)t->buff); + free(t->buff); + } else if (t->subtype == task_subtype_sf_counts) { + cell_unpack_sf_counts(ci, (struct pcell_sf *)t->buff); + cell_clear_stars_sort_flags(ci, /*clear_unused_flags=*/0); + free(t->buff); + } else if (t->subtype == task_subtype_xv) { + runner_do_recv_part(r, ci, 1, 1); + } else if (t->subtype == task_subtype_rho) { + runner_do_recv_part(r, ci, 0, 1); + } else if (t->subtype == task_subtype_gradient) { + runner_do_recv_part(r, ci, 0, 1); + } else if (t->subtype == task_subtype_part_swallow) { + cell_unpack_part_swallow(ci, (struct black_holes_part_data *)t->buff); + free(t->buff); + } else if (t->subtype == task_subtype_bpart_merger) { + cell_unpack_bpart_swallow(ci, + (struct black_holes_bpart_data *)t->buff); + free(t->buff); + } else if (t->subtype == task_subtype_limiter) { + runner_do_recv_part(r, ci, 0, 1); + } else if (t->subtype == task_subtype_gpart) { + runner_do_recv_gpart(r, ci, 1); + } else if (t->subtype == task_subtype_spart) { + runner_do_recv_spart(r, ci, 1, 1); + } else if (t->subtype == task_subtype_bpart_rho) { + runner_do_recv_bpart(r, ci, 1, 1); + } else if (t->subtype == task_subtype_bpart_swallow) { + runner_do_recv_bpart(r, ci, 0, 1); + } else if (t->subtype == task_subtype_bpart_feedback) { + runner_do_recv_bpart(r, ci, 0, 1); + } else if (t->subtype == task_subtype_multipole) { + cell_unpack_multipoles(ci, (struct gravity_tensors *)t->buff); + free(t->buff); + } else { + error("Unknown/invalid task subtype (%d).", t->subtype); + } + break; +#endif + case task_type_grav_down: + runner_do_grav_down(r, t->ci, 1); + break; + case task_type_grav_long_range: + runner_do_grav_long_range(r, t->ci, 1); + break; + case task_type_grav_mm: + runner_dopair_grav_mm_progenies(r, t->flags, t->ci, t->cj); + break; + case task_type_cooling: + runner_do_cooling(r, t->ci, 1); + break; + case task_type_star_formation: + runner_do_star_formation(r, t->ci, 1); + break; + case task_type_stars_resort: + runner_do_stars_resort(r, t->ci, 1); + break; + case task_type_sink_formation: + runner_do_sink_formation(r, t->ci); + break; + case task_type_fof_self: + runner_do_fof_self(r, t->ci, 1); + break; + case task_type_fof_pair: + runner_do_fof_pair(r, t->ci, t->cj, 1); + break; + case task_type_rt_ghost1: + runner_do_rt_ghost1(r, t->ci, 1); + break; + default: + error("Unknown/invalid task type (%d).", t->type); + } + +/* Mark that we have run this task on these cells */ +#ifdef SWIFT_DEBUG_CHECKS + if (ci != NULL) { + ci->tasks_executed[t->type]++; + ci->subtasks_executed[t->subtype]++; + } + if (cj != NULL) { + cj->tasks_executed[t->type]++; + cj->subtasks_executed[t->subtype]++; + } + /* This runner is not doing a task anymore */ + r->t = NULL; +#endif + + + /* We're done with this task, see if we get a next one. */ + prev = t; +#ifdef GPUOFFLOAD +// if (t->type == task_type_self && t->subtype == task_subtype_gpu_pack){ + if (t->subtype == task_subtype_gpu_pack){ + /* Don't enqueue unpacks yet. Just signal the runners */ + t->skip = 1; + t = NULL; + } +// else if (t->subtype == task_subtype_gpu_pack_g && t->type == task_type_self){ + else if (t->subtype == task_subtype_gpu_pack_g){ + /* Don't enqueue unpacks yet. Just signal the runners */ + t->skip = 1; + t = NULL; + } +// else if (t->subtype == task_subtype_gpu_pack_f && t->type == task_type_self){ + else if (t->subtype == task_subtype_gpu_pack_f){ + /* Don't enqueue unpacks yet. 
Just signal the runners */ + t->skip = 1; + t = NULL; + } + else{ /* Mark task as done, as per usual */ + t = scheduler_done(sched, t); + } +#endif //GPUOFFLOAD +#ifndef GPUOFFLOAD + t = scheduler_done(sched, t); +#endif //GPUOFFLOAD + + } /* main loop. */ + // Stuff for writing debug data to file for validation +//// if (step % 10 == 0 || step == 1) { +// if(r->cpuid == 0 && engine_rank == 0)fprintf(fgpu_steps, "x, y, z, rho, rhodh, v_sig, lap_u, a_visc_max, ax, ay, az\n"); +// for (int tid = 0; tid < space->nr_local_cells; +// tid++) { /* This should indeed be tasks_done_gpu as they are the only +//// tasks which have been done*/ +// struct cell *ctemp = &(space->cells_top[tid]); +// for (int i = 0; i < ctemp->hydro.count; i++) { +// fprintf(fgpu_steps, "%f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f\n", +// ctemp->hydro.parts[i].x[0], ctemp->hydro.parts[i].x[1], +// ctemp->hydro.parts[i].x[2], ctemp->hydro.parts[i].rho, +// ctemp->hydro.parts[i].density.rho_dh, +// ctemp->hydro.parts[i].viscosity.v_sig, ctemp->hydro.parts[i].diffusion.laplace_u, +// ctemp->hydro.parts[i].force.alpha_visc_max_ngb, ctemp->hydro.parts[i].a_hydro[0], +// ctemp->hydro.parts[i].a_hydro[1], +// ctemp->hydro.parts[i].a_hydro[2]); +//// message("wcount %f density %f", ctemp->hydro.parts[i].density.wcount, ctemp->hydro.parts[i].rho); +//// message("wcount is %f\n", ctemp->hydro.parts[i].density.wcount); +// } +// } +//// } + /*Output compute times to separate files. cat later into one file*/ +// if (step % 11 == 0 || step == 1) { +#ifdef GPUOFFLOAD +// char buffer[30]; +// snprintf(buffer, sizeof(buffer), "t%d_stepnfullbundles%d", r->cpuid, step); +// FILE *fullbundles = fopen(buffer, "w"); +// if(r->cpuid == 0)fprintf(fullbundles, "nfull, npartial, nfullpair, npartialpair\n"); +// else fprintf(fullbundles, "%i, %i, %i, %i\n", +// n_full_d_bundles, n_partial_d_bundles, n_full_p_d_bundles, n_partial_p_d_bundles); +// fflush(fullbundles); + +/////////////////////////////////////////////////////////////// +/// to output timings, uncomment this +/////////////////////////////////////////////////////////////// + if(r->cpuid == 0 && engine_rank == 0)fprintf(fgpu_steps, "GPU_SD, P_SD, U_SD, GPU_PD, P_PD, U_PD, " + "GPU_SF, P_SF, U_SF, GPU_PF, P_PF, U_PF, GPU_SG, P_SG, U_SG, GPU_PG, P_PG, U_PG\n " + "%e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e\n", + time_for_density_gpu, packing_time, unpack_time_self, time_for_density_gpu_pair, packing_time_pair, unpacking_time_pair, + time_for_gpu_f, packing_time_f, unpack_time_self_f, time_for_gpu_pair_f, packing_time_pair_f, unpacking_time_pair_f, + time_for_gpu_g, packing_time_g, unpack_time_self_g, time_for_gpu_pair_g, packing_time_pair_g, unpacking_time_pair_g); + + else fprintf(fgpu_steps, "%e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e\n", + time_for_density_gpu, packing_time, unpack_time_self, time_for_density_gpu_pair, packing_time_pair, unpacking_time_pair, + time_for_gpu_f, packing_time_f, unpack_time_self_f, time_for_gpu_pair_f, packing_time_pair_f, unpacking_time_pair_f, + time_for_gpu_g, packing_time_g, unpack_time_self_g, time_for_gpu_pair_g, packing_time_pair_g, unpacking_time_pair_g); +////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////// + +#endif +#ifndef GPUOFFLOAD + if(r->cpuid == 0 && engine_rank == 0)fprintf(fgpu_steps, "CPU TIME SELF, CPU TIME PAIR, " + "CPU TIME SELF F, CPU 
TIME PAIR F, CPU TIME SELF G, CPU TIME PAIR G\n " + "%e, %e, %e, %e, %e, %e\n", time_for_density_cpu, time_for_density_cpu_pair, + time_for_cpu_f, time_for_cpu_pair_f, time_for_cpu_g, time_for_cpu_pair_g); + + else fprintf(fgpu_steps,"%e, %e, %e, %e, %e, %e,\n", time_for_density_cpu, time_for_density_cpu_pair, + time_for_cpu_f, time_for_cpu_pair_f, time_for_cpu_g, time_for_cpu_pair_g); +#endif +// } + fflush(fgpu_steps); + fclose(fgpu_steps); + time_for_density_cpu = 0.0; + time_for_density_gpu = 0.0; + time_for_density_cpu_pair = 0.0; + time_for_density_gpu_pair = 0.0; + time_for_density_cpu_sub = 0.0; + tot_time_for_hard_memcpys = 0.0; + tasks_done_gpu = 0; + tasks_done_cpu = 0; + tasks_done_gpu_inc = 0; + if(ghost_in > 0)fprintf(stderr,"total tasks not done on GPU %i is %i\n", r->cpuid, ghost_in); + packed_self = 0; + packed_pair = 0; + packed_self_f = 0; + packed_pair_f = 0; + packed_self_g = 0; + packed_pair_g = 0; + density = 0; + density_sub = 0; + unpacked = 0; +// if(step == 2)cudaProfilerStop(); +// if(step == 2)exit(0); +// size_t free_byte ; +// size_t total_byte ; +// cudaError_t cuda_status = cudaMemGetInfo( &free_byte, &total_byte ) ; +// double free = (double)free_byte; +// double available = (double)total_byte; +// double used = (available - free); +// fprintf(stderr, "Used %f GB GPU memory\n", used/1e9); + /* Wait at the wait barrier. */ +// swift_barrier_wait(&e->wait_barrier); + + } + // Free all data +// cudaFree(d_tid_p); +// cudaFree(d_id); +// cudaFree(d_x_p); +// cudaFree(d_y_p); +// cudaFree(d_z_p); +// cudaFree(d_ux); +// cudaFree(d_uy); +// cudaFree(d_uz); +// cudaFree(d_a_hydrox); +// cudaFree(d_a_hydroy); +// cudaFree(d_a_hydroz); +// cudaFree(d_mass); +// cudaFree(d_h); +// cudaFree(d_u); +// cudaFree(d_u_dt); +// cudaFree(d_rho); +// cudaFree(d_SPH_sum); +// cudaFree(d_locx); +// cudaFree(d_locy); +// cudaFree(d_locz); +// cudaFree(d_widthx); +// cudaFree(d_widthy); +// cudaFree(d_widthz); +// cudaFree(d_h_max); +// cudaFree(d_count_p); +// cudaFree(d_wcount); +// cudaFree(d_wcount_dh); +// cudaFree(d_rho_dh); +// cudaFree(d_rot_ux); +// cudaFree(d_rot_uy); +// cudaFree(d_rot_uz); +// cudaFree(d_div_v); +// cudaFree(d_div_v_previous_step); +// cudaFree(d_alpha_visc); +// cudaFree(d_v_sig); +// cudaFree(d_laplace_u); +// cudaFree(d_alpha_diff); +// cudaFree(d_f); +// cudaFree(d_soundspeed); +// cudaFree(d_h_dt); +// cudaFree(d_balsara); +// cudaFree(d_pressure); +// cudaFree(d_alpha_visc_max_ngb); +// cudaFree(d_time_bin); +// cudaFree(d_wakeup); +// cudaFree(d_min_ngb_time_bin); +// cudaFree(d_to_be_synchronized); +// cudaFree(tid_p); +// cudaFree(id); +// cudaFree(mass); +// cudaFree(h); +// cudaFree(u); +// cudaFree(u_dt); +// cudaFree(rho); +// cudaFree(SPH_sum); +// cudaFree(x_p); +// cudaFree(y_p); +// cudaFree(z_p); +// cudaFree(ux); +// cudaFree(uy); +// cudaFree(uz); +// cudaFree(a_hydrox); +// cudaFree(a_hydroy); +// cudaFree(a_hydroz); +// cudaFree(locx); +// cudaFree(locy); +// cudaFree(locz); +// cudaFree(widthx); +// cudaFree(widthy); +// cudaFree(widthz); +// cudaFree(h_max); +// cudaFree(count_p); +// cudaFree(wcount); +// cudaFree(wcount_dh); +// cudaFree(rho_dh); +// cudaFree(rot_ux); +// cudaFree(rot_uy); +// cudaFree(rot_uz); +// cudaFree(div_v); +// cudaFree(div_v_previous_step); +// cudaFree(alpha_visc); +// cudaFree(v_sig); +// cudaFree(laplace_u); +// cudaFree(alpha_diff); +// cudaFree(f); +// cudaFree(soundspeed); +// cudaFree(h_dt); +// cudaFree(balsara); +// cudaFree(pressure); +// cudaFree(alpha_visc_max_ngb); +// 
cudaFree(time_bin); +// cudaFree(wakeup); +// cudaFree(min_ngb_time_bin); +// cudaFree(to_be_synchronized); +// cudaFree(partid_p); +// cudaFree(d_task_first_part); +// cudaFree(d_task_last_part); +// cudaFree(task_first_part_self_dens); +// cudaFree(task_last_part_self_dens); +// cudaFree(task_first_part_pair_ci); +// cudaFree(task_last_part_pair_ci); +// cudaFree(task_first_part_pair_cj); +// cudaFree(task_last_part_pair_cj); +// cudaFree(d_bundle_first_part_self_dens); +// cudaFree(d_bundle_last_part_self_dens); +// cudaFree(bundle_first_part_self_dens); +// cudaFree(bundle_last_part_self_dens); +// cudaFree(bundle_first_part_pair_ci); +// cudaFree(bundle_last_part_pair_ci); +// cudaFree(bundle_first_part_pair_cj); +// cudaFree(bundle_last_part_pair_cj); +// free(ci_list_self_dens); +// free(ci_list_pair); +// free(cj_list_pair); + + /* Be kind, rewind. */ + return NULL; +} + +#endif // WITH_CUDA + +#include +#include +#include + +// uint64_t time_used ( ) { +// struct rusage ru; +// struct timeval t; +// getrusage(RUSAGE_THREAD,&ru); +// t = ru.ru_utime; +// return (uint64_t) t.tv_sec*1000 + t.tv_usec/1000; +// } From caa0852c85f50a7a50b768c46422a0e6231cbe7f Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Thu, 24 Oct 2024 18:51:12 +0100 Subject: [PATCH 015/217] Sorted most of the code out. Compiles and runs fine with gpu offload off because I have not made gpu tasks yet --- src/runner_doiact_functions_hydro_gpu.h | 12 +- src/runner_main_clean.cu | 211 +++++++++++++++++------- src/scheduler.h | 2 + src/space_getsid.h | 87 +++++++++- src/space_recycle.c | 6 + 5 files changed, 248 insertions(+), 70 deletions(-) diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index 696a11e7f2..2b047ae25a 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -513,8 +513,8 @@ void runner_dopair1_pack(struct runner *r, struct scheduler *s, struct pack_vars cell_unlocktree(cj); } -double runner_dopair1_pack_f4(struct runner *r, struct scheduler *s, struct pack_vars_pair * restrict pack_vars, struct cell * restrict ci, - struct cell * restrict cj, struct task *t, struct part_aos_f4_send *parts_send, struct engine *e, int4 *fparti_fpartj_lparti_lpartj){ +double runner_dopair1_pack_f4(struct runner *r, struct scheduler *s, struct pack_vars_pair * restrict pack_vars, struct cell * ci, + struct cell * cj, struct task *t, struct part_aos_f4_send *parts_send, struct engine *e, int4 *fparti_fpartj_lparti_lpartj){ /* Timers for how long this all takes. * t0 and t1 are from start to finish including GPU calcs @@ -693,8 +693,8 @@ void runner_dopair1_pack_g(struct runner *r, struct scheduler *s, struct pack_va } -double runner_dopair1_pack_f4_g(struct runner *r, struct scheduler *s, struct pack_vars_pair * restrict pack_vars, struct cell * restrict ci, - struct cell * restrict cj, struct task *t, struct part_aos_f4_g_send *parts_send, struct engine *e, int4 *fparti_fpartj_lparti_lpartj){ +double runner_dopair1_pack_f4_g(struct runner *r, struct scheduler *s, struct pack_vars_pair * restrict pack_vars, struct cell * ci, + struct cell * cj, struct task *t, struct part_aos_f4_g_send *parts_send, struct engine *e, int4 *fparti_fpartj_lparti_lpartj){ /* Timers for how long this all takes. 
* t0 and t1 are from start to finish including GPU calcs @@ -873,8 +873,8 @@ void runner_dopair1_pack_f(struct runner *r, struct scheduler *s, struct pack_va (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; } -double runner_dopair1_pack_f4_f(struct runner *r, struct scheduler *s, struct pack_vars_pair * restrict pack_vars, struct cell * restrict ci, - struct cell * restrict cj, struct task *t, struct part_aos_f4_f_send *parts_send, struct engine *e, int4 *fparti_fpartj_lparti_lpartj){ +double runner_dopair1_pack_f4_f(struct runner *r, struct scheduler *s, struct pack_vars_pair * restrict pack_vars, struct cell * ci, + struct cell * cj, struct task *t, struct part_aos_f4_f_send *parts_send, struct engine *e, int4 *fparti_fpartj_lparti_lpartj){ /* Timers for how long this all takes. * t0 and t1 are from start to finish including GPU calcs diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 6b307350b5..ce59cc5f0b 100755 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -19,8 +19,9 @@ * ******************************************************************************/ /* Config parameters. */ -#define GPUOFFLOAD 1 //off-load hydro to GPU +//#define GPUOFFLOAD 1 //off-load hydro to GPU #define DO_CORNERS 1 //do corner pair tasks on CPU +//#define DUMP_TIMINGS 1 #include "../config.h" /* MPI headers. */ @@ -201,8 +202,7 @@ void *runner_main2(void *data) { struct engine *e = r->e; struct scheduler *sched = &e->sched; struct space *space = e->s; - unsigned int seed = r->id; - pthread_setspecific(sched->local_seed_pointer, &seed); + /*pack_vars contain data required for packing tasks destined for the GPU*/ struct pack_vars_self *pack_vars_self_dens; struct pack_vars_self *pack_vars_self_forc; @@ -874,8 +874,6 @@ void *runner_main2(void *data) { /* Re-set the pointer to the previous task, as there is none. */ struct task *t = NULL; struct task *prev = NULL; - int zeropacks = 0; - int lesspacks = 0; /*Some bits for output in case of debug*/ char buf5[20]; snprintf(buf5, sizeof(buf5), "t%dr%dstep%d", r->cpuid, engine_rank, step); @@ -921,19 +919,20 @@ void *runner_main2(void *data) { struct cell *ci_temp = ci; struct cell *cj_temp = cj; double shift[3]; - t->sid = space_getsid(e->s, &ci_temp, &cj_temp, shift); + t->sid = space_getsid_and_swap_cells(e->s, &ci_temp, &cj_temp, shift); } else { t->sid = -1; } #endif - #ifdef SWIFT_DEBUG_CHECKS /* Check that we haven't scheduled an inactive task */ t->ti_run = e->ti_current; /* Store the task that will be running (for debugging only) */ r->t = t; #endif + + const ticks task_beg = getticks(); /* Different types of tasks... 
*/ switch (t->type) { case task_type_self: @@ -957,7 +956,6 @@ void *runner_main2(void *data) { 1000000000.0; density++; #endif - /* GPU WORK */ } else if (t->subtype == task_subtype_gpu_pack) { packed_self++; @@ -1080,6 +1078,12 @@ void *runner_main2(void *data) { runner_do_grav_external(r, ci, 1); else if (t->subtype == task_subtype_stars_density) runner_doself_branch_stars_density(r, ci); +#ifdef EXTRA_STAR_LOOPS + else if (t->subtype == task_subtype_stars_prep1) + runner_doself_branch_stars_prep1(r, ci); + else if (t->subtype == task_subtype_stars_prep2) + runner_doself_branch_stars_prep2(r, ci); +#endif else if (t->subtype == task_subtype_stars_feedback) runner_doself_branch_stars_feedback(r, ci); else if (t->subtype == task_subtype_bh_density) @@ -1092,12 +1096,18 @@ void *runner_main2(void *data) { runner_do_bh_swallow_self(r, ci, 1); else if (t->subtype == task_subtype_bh_feedback) runner_doself_branch_bh_feedback(r, ci); - else if (t->subtype == task_subtype_rt_inject) - runner_doself_branch_rt_inject(r, ci, 1); - else if (t->subtype == task_subtype_sink_compute_formation) - runner_doself_branch_sinks_compute_formation(r, ci); + else if (t->subtype == task_subtype_rt_gradient) + runner_doself1_branch_rt_gradient(r, ci); + else if (t->subtype == task_subtype_rt_transport) + runner_doself2_branch_rt_transport(r, ci); + else if (t->subtype == task_subtype_sink_swallow) + runner_doself_branch_sinks_swallow(r, ci); + else if (t->subtype == task_subtype_sink_do_gas_swallow) + runner_do_sinks_gas_swallow_self(r, ci, 1); + else if (t->subtype == task_subtype_sink_do_sink_swallow) + runner_do_sinks_sink_swallow_self(r, ci, 1); else - error("Self Unknown/invalid task subtype (%s).", + error("Unknown/invalid task subtype (%s).", subtaskID_names[t->subtype]); break; @@ -1371,6 +1381,12 @@ void *runner_main2(void *data) { runner_dopair_recursive_grav(r, ci, cj, 1); else if (t->subtype == task_subtype_stars_density) runner_dopair_branch_stars_density(r, ci, cj); +#ifdef EXTRA_STAR_LOOPS + else if (t->subtype == task_subtype_stars_prep1) + runner_dopair_branch_stars_prep1(r, ci, cj); + else if (t->subtype == task_subtype_stars_prep2) + runner_dopair_branch_stars_prep2(r, ci, cj); +#endif else if (t->subtype == task_subtype_stars_feedback) runner_dopair_branch_stars_feedback(r, ci, cj); else if (t->subtype == task_subtype_bh_density) @@ -1383,14 +1399,19 @@ void *runner_main2(void *data) { runner_do_bh_swallow_pair(r, ci, cj, 1); else if (t->subtype == task_subtype_bh_feedback) runner_dopair_branch_bh_feedback(r, ci, cj); - else if (t->subtype == task_subtype_rt_inject) - runner_dopair_branch_rt_inject(r, ci, cj, 1); - else if (t->subtype == task_subtype_sink_compute_formation) - runner_dopair_branch_sinks_compute_formation(r, ci, cj); - else { - error("Pair Unknown/invalid task subtype (%s/%s).", + else if (t->subtype == task_subtype_rt_gradient) + runner_dopair1_branch_rt_gradient(r, ci, cj); + else if (t->subtype == task_subtype_rt_transport) + runner_dopair2_branch_rt_transport(r, ci, cj); + else if (t->subtype == task_subtype_sink_swallow) + runner_dopair_branch_sinks_swallow(r, ci, cj); + else if (t->subtype == task_subtype_sink_do_gas_swallow) + runner_do_sinks_gas_swallow_pair(r, ci, cj, 1); + else if (t->subtype == task_subtype_sink_do_sink_swallow) + runner_do_sinks_sink_swallow_pair(r, ci, cj, 1); + else + error("Unknown/invalid task subtype (%s/%s).", taskID_names[t->type], subtaskID_names[t->subtype]); - } break; case task_type_sub_self: @@ -1421,6 +1442,12 @@ void *runner_main2(void 
*data) { runner_dosub_self1_limiter(r, ci, 1); else if (t->subtype == task_subtype_stars_density) runner_dosub_self_stars_density(r, ci, 1); +#ifdef EXTRA_STAR_LOOPS + else if (t->subtype == task_subtype_stars_prep1) + runner_dosub_self_stars_prep1(r, ci, 1); + else if (t->subtype == task_subtype_stars_prep2) + runner_dosub_self_stars_prep2(r, ci, 1); +#endif else if (t->subtype == task_subtype_stars_feedback) runner_dosub_self_stars_feedback(r, ci, 1); else if (t->subtype == task_subtype_bh_density) @@ -1433,12 +1460,18 @@ void *runner_main2(void *data) { runner_do_bh_swallow_self(r, ci, 1); else if (t->subtype == task_subtype_bh_feedback) runner_dosub_self_bh_feedback(r, ci, 1); - else if (t->subtype == task_subtype_rt_inject) - runner_dosub_self_rt_inject(r, ci, 1); - else if (t->subtype == task_subtype_sink_compute_formation) - runner_dosub_self_sinks_compute_formation(r, ci, 1); + else if (t->subtype == task_subtype_rt_gradient) + runner_dosub_self1_rt_gradient(r, ci, 1); + else if (t->subtype == task_subtype_rt_transport) + runner_dosub_self2_rt_transport(r, ci, 1); + else if (t->subtype == task_subtype_sink_swallow) + runner_dosub_self_sinks_swallow(r, ci, 1); + else if (t->subtype == task_subtype_sink_do_gas_swallow) + runner_do_sinks_gas_swallow_self(r, ci, 1); + else if (t->subtype == task_subtype_sink_do_sink_swallow) + runner_do_sinks_sink_swallow_self(r, ci, 1); else - error("Sub Self Unknown/invalid task subtype (%s/%s).", + error("Unknown/invalid task subtype (%s/%s).", taskID_names[t->type], subtaskID_names[t->subtype]); break; @@ -1462,6 +1495,12 @@ void *runner_main2(void *data) { runner_dosub_pair1_limiter(r, ci, cj, 1); else if (t->subtype == task_subtype_stars_density) runner_dosub_pair_stars_density(r, ci, cj, 1); +#ifdef EXTRA_STAR_LOOPS + else if (t->subtype == task_subtype_stars_prep1) + runner_dosub_pair_stars_prep1(r, ci, cj, 1); + else if (t->subtype == task_subtype_stars_prep2) + runner_dosub_pair_stars_prep2(r, ci, cj, 1); +#endif else if (t->subtype == task_subtype_stars_feedback) runner_dosub_pair_stars_feedback(r, ci, cj, 1); else if (t->subtype == task_subtype_bh_density) @@ -1474,12 +1513,18 @@ void *runner_main2(void *data) { runner_do_bh_swallow_pair(r, ci, cj, 1); else if (t->subtype == task_subtype_bh_feedback) runner_dosub_pair_bh_feedback(r, ci, cj, 1); - else if (t->subtype == task_subtype_rt_inject) - runner_dosub_pair_rt_inject(r, ci, cj, 1); - else if (t->subtype == task_subtype_sink_compute_formation) - runner_dosub_pair_sinks_compute_formation(r, ci, cj, 1); + else if (t->subtype == task_subtype_rt_gradient) + runner_dosub_pair1_rt_gradient(r, ci, cj, 1); + else if (t->subtype == task_subtype_rt_transport) + runner_dosub_pair2_rt_transport(r, ci, cj, 1); + else if (t->subtype == task_subtype_sink_swallow) + runner_dosub_pair_sinks_swallow(r, ci, cj, 1); + else if (t->subtype == task_subtype_sink_do_gas_swallow) + runner_do_sinks_gas_swallow_pair(r, ci, cj, 1); + else if (t->subtype == task_subtype_sink_do_sink_swallow) + runner_do_sinks_sink_swallow_pair(r, ci, cj, 1); else - error("Sub Pair Unknown/invalid task subtype (%s/%s).", + error("Unknown/invalid task subtype (%s/%s).", taskID_names[t->type], subtaskID_names[t->subtype]); break; @@ -1487,7 +1532,19 @@ void *runner_main2(void *data) { /* Cleanup only if any of the indices went stale. 
*/ runner_do_hydro_sort( r, ci, t->flags, - ci->hydro.dx_max_sort_old > space_maxreldx * ci->dmin, 1); + ci->hydro.dx_max_sort_old > space_maxreldx * ci->dmin, + cell_get_flag(ci, cell_flag_rt_requests_sort), 1); + /* Reset the sort flags as our work here is done. */ + t->flags = 0; + break; + case task_type_rt_sort: + /* Cleanup only if any of the indices went stale. + * NOTE: we check whether we reset the sort flags when the + * recv tasks are running. Cells without an RT recv task + * don't have rt_sort tasks. */ + runner_do_hydro_sort( + r, ci, t->flags, + ci->hydro.dx_max_sort_old > space_maxreldx * ci->dmin, 1, 1); /* Reset the sort flags as our work here is done. */ t->flags = 0; break; @@ -1546,8 +1603,8 @@ void *runner_main2(void *data) { case task_type_end_grav_force: runner_do_end_grav_force(r, ci, 1); break; - case task_type_logger: - runner_do_logger(r, ci, 1); + case task_type_csds: + runner_do_csds(r, ci, 1); break; case task_type_timestep: runner_do_timestep(r, ci, 1); @@ -1558,15 +1615,15 @@ void *runner_main2(void *data) { case task_type_timestep_sync: runner_do_sync(r, ci, 0, 1); break; + case task_type_collect: + runner_do_timestep_collect(r, ci, 1); + break; + case task_type_rt_collect_times: + runner_do_collect_rt_times(r, ci, 1); + break; #ifdef WITH_MPI case task_type_send: - if (t->subtype == task_subtype_tend_part) { - free(t->buff); - } else if (t->subtype == task_subtype_tend_gpart) { - free(t->buff); - } else if (t->subtype == task_subtype_tend_spart) { - free(t->buff); - } else if (t->subtype == task_subtype_tend_bpart) { + if (t->subtype == task_subtype_tend) { free(t->buff); } else if (t->subtype == task_subtype_sf_counts) { free(t->buff); @@ -1574,21 +1631,13 @@ void *runner_main2(void *data) { free(t->buff); } else if (t->subtype == task_subtype_bpart_merger) { free(t->buff); + } else if (t->subtype == task_subtype_limiter) { + free(t->buff); } break; case task_type_recv: - if (t->subtype == task_subtype_tend_part) { - cell_unpack_end_step_hydro(ci, (struct pcell_step_hydro *)t->buff); - free(t->buff); - } else if (t->subtype == task_subtype_tend_gpart) { - cell_unpack_end_step_grav(ci, (struct pcell_step_grav *)t->buff); - free(t->buff); - } else if (t->subtype == task_subtype_tend_spart) { - cell_unpack_end_step_stars(ci, (struct pcell_step_stars *)t->buff); - free(t->buff); - } else if (t->subtype == task_subtype_tend_bpart) { - cell_unpack_end_step_black_holes( - ci, (struct pcell_step_black_holes *)t->buff); + if (t->subtype == task_subtype_tend) { + cell_unpack_end_step(ci, (struct pcell_step *)t->buff); free(t->buff); } else if (t->subtype == task_subtype_sf_counts) { cell_unpack_sf_counts(ci, (struct pcell_sf *)t->buff); @@ -1600,32 +1649,44 @@ void *runner_main2(void *data) { runner_do_recv_part(r, ci, 0, 1); } else if (t->subtype == task_subtype_gradient) { runner_do_recv_part(r, ci, 0, 1); + } else if (t->subtype == task_subtype_rt_gradient) { + runner_do_recv_part(r, ci, 2, 1); + } else if (t->subtype == task_subtype_rt_transport) { + runner_do_recv_part(r, ci, -1, 1); } else if (t->subtype == task_subtype_part_swallow) { - cell_unpack_part_swallow(ci, (struct black_holes_part_data *)t->buff); + cell_unpack_part_swallow(ci, + (struct black_holes_part_data *)t->buff); free(t->buff); } else if (t->subtype == task_subtype_bpart_merger) { cell_unpack_bpart_swallow(ci, (struct black_holes_bpart_data *)t->buff); free(t->buff); } else if (t->subtype == task_subtype_limiter) { - runner_do_recv_part(r, ci, 0, 1); + /* Nothing to do here. 
Unpacking done in a separate task */ } else if (t->subtype == task_subtype_gpart) { runner_do_recv_gpart(r, ci, 1); - } else if (t->subtype == task_subtype_spart) { + } else if (t->subtype == task_subtype_spart_density) { runner_do_recv_spart(r, ci, 1, 1); + } else if (t->subtype == task_subtype_part_prep1) { + runner_do_recv_part(r, ci, 0, 1); + } else if (t->subtype == task_subtype_spart_prep2) { + runner_do_recv_spart(r, ci, 0, 1); } else if (t->subtype == task_subtype_bpart_rho) { runner_do_recv_bpart(r, ci, 1, 1); - } else if (t->subtype == task_subtype_bpart_swallow) { - runner_do_recv_bpart(r, ci, 0, 1); } else if (t->subtype == task_subtype_bpart_feedback) { runner_do_recv_bpart(r, ci, 0, 1); - } else if (t->subtype == task_subtype_multipole) { - cell_unpack_multipoles(ci, (struct gravity_tensors *)t->buff); - free(t->buff); } else { error("Unknown/invalid task subtype (%d).", t->subtype); } break; + + case task_type_pack: + runner_do_pack_limiter(r, ci, &t->buff, 1); + task_get_unique_dependent(t)->buff = t->buff; + break; + case task_type_unpack: + runner_do_unpack_limiter(r, ci, t->buff, 1); + break; #endif case task_type_grav_down: runner_do_grav_down(r, t->ci, 1); @@ -1642,6 +1703,9 @@ void *runner_main2(void *data) { case task_type_star_formation: runner_do_star_formation(r, t->ci, 1); break; + case task_type_star_formation_sink: + runner_do_star_formation_sink(r, t->ci, 1); + break; case task_type_stars_resort: runner_do_stars_resort(r, t->ci, 1); break; @@ -1649,17 +1713,36 @@ void *runner_main2(void *data) { runner_do_sink_formation(r, t->ci); break; case task_type_fof_self: - runner_do_fof_self(r, t->ci, 1); + runner_do_fof_search_self(r, t->ci, 1); break; case task_type_fof_pair: - runner_do_fof_pair(r, t->ci, t->cj, 1); + runner_do_fof_search_pair(r, t->ci, t->cj, 1); + break; + case task_type_fof_attach_self: + runner_do_fof_attach_self(r, t->ci, 1); + break; + case task_type_fof_attach_pair: + runner_do_fof_attach_pair(r, t->ci, t->cj, 1); + break; + case task_type_neutrino_weight: + runner_do_neutrino_weighting(r, ci, 1); break; case task_type_rt_ghost1: runner_do_rt_ghost1(r, t->ci, 1); break; + case task_type_rt_ghost2: + runner_do_rt_ghost2(r, t->ci, 1); + break; + case task_type_rt_tchem: + runner_do_rt_tchem(r, t->ci, 1); + break; + case task_type_rt_advance_cell_time: + runner_do_rt_advance_cell_time(r, t->ci, 1); + break; default: error("Unknown/invalid task type (%d).", t->type); - } + } + r->active_time += (getticks() - task_beg); /* Mark that we have run this task on these cells */ #ifdef SWIFT_DEBUG_CHECKS @@ -1729,6 +1812,7 @@ void *runner_main2(void *data) { //// } /*Output compute times to separate files. 
cat later into one file*/ // if (step % 11 == 0 || step == 1) { +#ifdef DUMP_TIMINGS #ifdef GPUOFFLOAD // char buffer[30]; // snprintf(buffer, sizeof(buffer), "t%d_stepnfullbundles%d", r->cpuid, step); @@ -1766,6 +1850,7 @@ void *runner_main2(void *data) { else fprintf(fgpu_steps,"%e, %e, %e, %e, %e, %e,\n", time_for_density_cpu, time_for_density_cpu_pair, time_for_cpu_f, time_for_cpu_pair_f, time_for_cpu_g, time_for_cpu_pair_g); #endif +#endif // } fflush(fgpu_steps); fclose(fgpu_steps); diff --git a/src/scheduler.h b/src/scheduler.h index 578a1442d4..155360ede6 100644 --- a/src/scheduler.h +++ b/src/scheduler.h @@ -350,5 +350,7 @@ void scheduler_write_task_level(const struct scheduler *s, int step); void scheduler_dump_queues(struct engine *e); void scheduler_report_task_times(const struct scheduler *s, const int nr_threads); +struct task *enqueue_dependencies(struct scheduler *s, struct task *t); +struct task *signal_sleeping_runners(struct scheduler *s, struct task *t); #endif /* SWIFT_SCHEDULER_H */ diff --git a/src/space_getsid.h b/src/space_getsid.h index df81615d3c..8b115a251d 100644 --- a/src/space_getsid.h +++ b/src/space_getsid.h @@ -46,7 +46,6 @@ __attribute__((always_inline, nonnull)) INLINE static int space_getsid_and_swap_cells(const struct space *s, struct cell **ci, struct cell **cj, double shift[3]) { - /* Get the relative distance between the pairs, wrapping. */ const int periodic = s->periodic; double dx[3]; @@ -79,4 +78,90 @@ space_getsid_and_swap_cells(const struct space *s, struct cell **ci, return sid; } +__attribute__((always_inline, nonnull)) INLINE static int // A. Nasar Same as usual but only used to pack GPU cells +space_getsid_GPU(const struct space *s, struct cell **ci, + struct cell **cj, double *shift_x, double *shift_y, + double *shift_z) { + /* Get the relative distance between the pairs, wrapping. */ + const int periodic = s->periodic; + double dx[3]; + for(int k = 0; k < 3; k++) + dx[k] = (*cj)->loc[k] - (*ci)->loc[k]; + + if (periodic && dx[0] < -s->dim[0] / 2) + *(shift_x) = s->dim[0]; + else if (periodic && dx[0] > s->dim[0] / 2) + *(shift_x) = -s->dim[0]; + else + *(shift_x) = 0.0; + + dx[0] += *(shift_x); + + if (periodic && dx[1] < -s->dim[1] / 2) + *(shift_y) = s->dim[1]; + else if (periodic && dx[1] > s->dim[1] / 2) + *(shift_y) = -s->dim[1]; + else + *(shift_y) = 0.0; + + dx[1] += *(shift_y); + + if (periodic && dx[2] < -s->dim[2] / 2) + *(shift_z) = s->dim[2]; + else if (periodic && dx[2] > s->dim[2] / 2) + *(shift_z) = -s->dim[2]; + else + *(shift_z) = 0.0; + + dx[2] += *(shift_z); + + /* Get the sorting index. */ + int sid = 0; + for (int k = 0; k < 3; k++) + sid = 3 * sid + ((dx[k] < 0.0) ? 0 : ((dx[k] > 0.0) ? 2 : 1)); + + /* Switch the cells around? */ + if (runner_flip[sid]) { + struct cell *temp = *ci; + *ci = *cj; + *cj = temp; + *(shift_x) = -*(shift_x); + *(shift_y) = -*(shift_y); + *(shift_z) = -*(shift_z); + } + sid = sortlistID[sid]; + + /* Return the sort ID. */ + return sid; +} + +__attribute__((always_inline, nonnull)) INLINE static int +space_getsid_filter(const struct space *s, struct cell **ci, struct cell **cj, + double shift[3]) { + + /* Get the relative distance between the pairs, wrapping. 
*/ + const int periodic = s->periodic; + double dx[3]; + for (int k = 0; k < 3; k++) { + dx[k] = (*cj)->loc[k] - (*ci)->loc[k]; + if (periodic && dx[k] < -s->dim[k] / 2) + shift[k] = s->dim[k]; + else if (periodic && dx[k] > s->dim[k] / 2) + shift[k] = -s->dim[k]; + else + shift[k] = 0.0; + dx[k] += shift[k]; + } + + /* Get the sorting index. */ + int sid = 0; + for (int k = 0; k < 3; k++) + sid = 3 * sid + ((dx[k] < 0.0) ? 0 : ((dx[k] > 0.0) ? 2 : 1)); + + sid = sortlistID[sid]; + + /* Return the sort ID. */ + return sid; +} + #endif /* SWIFT_SPACE_GETSID_H */ diff --git a/src/space_recycle.c b/src/space_recycle.c index cf84227302..47ed2e43d7 100644 --- a/src/space_recycle.c +++ b/src/space_recycle.c @@ -232,6 +232,12 @@ void space_rebuild_recycle_mapper(void *map_data, int num_elements, c->mpi.recv = NULL; c->mpi.send = NULL; #endif + c->hydro.density_pack = NULL; // A. Nasar + c->hydro.density_unpack = NULL; + c->hydro.gradient_pack = NULL; + c->hydro.gradient_unpack = NULL; + c->hydro.force_pack = NULL; + c->hydro.force_unpack = NULL; } } From 294f7aec0212250b8fa8b32a6747a88a6960d206 Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Fri, 25 Oct 2024 12:39:20 +0100 Subject: [PATCH 016/217] Added code to engine_marktasks.c --- src/cell_unskip.c | 1 - src/engine_marktasks.c | 67 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 1 deletion(-) diff --git a/src/cell_unskip.c b/src/cell_unskip.c index 5fe8b0ef3f..95368320f4 100644 --- a/src/cell_unskip.c +++ b/src/cell_unskip.c @@ -1913,7 +1913,6 @@ int cell_unskip_hydro_tasks(struct cell *c, struct scheduler *s) { for (struct link *l = c->hydro.force; l != NULL; l = l->next) { scheduler_activate(s, l->t); } - for (struct link *l = c->hydro.limiter; l != NULL; l = l->next) scheduler_activate(s, l->t); for (struct link *l = c->hydro.density_pack; l != NULL; l = l->next) { /* A. Nasar */ diff --git a/src/engine_marktasks.c b/src/engine_marktasks.c index 27b31c99c4..0975b7e9d8 100644 --- a/src/engine_marktasks.c +++ b/src/engine_marktasks.c @@ -86,6 +86,20 @@ void engine_marktasks_mapper(void *map_data, int num_elements, const enum task_types t_type = t->type; const enum task_subtypes t_subtype = t->subtype; + //Activate GPU unpack tasks (cell-less dummy tasks so need activating separately) + if (t_type == task_type_self && (t_subtype == task_subtype_gpu_unpack || + t_subtype == task_subtype_gpu_unpack_g || + t_subtype == task_subtype_gpu_unpack_f)){ // A. Nasar + scheduler_activate(s, t); + } + + if (t_type == task_type_pair && (t_subtype == task_subtype_gpu_unpack || + t_subtype == task_subtype_gpu_unpack_g || + t_subtype == task_subtype_gpu_unpack_f)){ // A. Nasar + scheduler_activate(s, t); +// fprintf(stderr,"activated pair unpack in marktasks\n"); + } + /* Single-cell task? */ if (t_type == task_type_self || t_type == task_type_sub_self) { @@ -115,6 +129,36 @@ void engine_marktasks_mapper(void *map_data, int num_elements, } } + /* Activate packing for GPU A. 
Nasar */ + else if (t_type == task_type_self && t_subtype == task_subtype_gpu_pack) { + if (ci_active_hydro) { + scheduler_activate(s, t); + ci->pack_done = 0; + ci->gpu_done = 0; + ci->unpack_done = 0; + } + } + + /* Activate packing for GPU */ + else if (t_type == task_type_self && t_subtype == task_subtype_gpu_pack_g) { + if (ci_active_hydro) { + scheduler_activate(s, t); + ci->pack_done_g = 0; + ci->gpu_done_g = 0; + ci->unpack_done_g = 0; + } + } + + /* Activate packing for GPU */ + else if (t_type == task_type_self && t_subtype == task_subtype_gpu_pack_f) { + if (ci_active_hydro) { + scheduler_activate(s, t); + ci->pack_done_f = 0; + ci->gpu_done_f = 0; + ci->unpack_done_f = 0; + } + } + /* Store current values of dx_max and h_max. */ else if (t_type == task_type_sub_self && t_subtype == task_subtype_density) { @@ -409,6 +453,29 @@ void engine_marktasks_mapper(void *map_data, int num_elements, const int ci_active_rt = cell_is_rt_active(ci, e); const int cj_active_rt = cell_is_rt_active(cj, e); + /* Activate packing for GPU A. Nasar */ + if(t_subtype == task_subtype_gpu_pack && + ((ci_active_hydro && ci_nodeID == nodeID) || + (cj_active_hydro && cj_nodeID == nodeID))) { + scheduler_activate(s, t); + ci->gpu_done_pair = 0; + cj->gpu_done_pair = 0; + } + else if (t_subtype == task_subtype_gpu_pack_g && + ((ci_active_hydro && ci_nodeID == nodeID) || + (cj_active_hydro && cj_nodeID == nodeID))) { + scheduler_activate(s, t); + ci->gpu_done_pair_g = 0; + cj->gpu_done_pair_g = 0; + } + else if (t_subtype == task_subtype_gpu_pack_f && + ((ci_active_hydro && ci_nodeID == nodeID) || + (cj_active_hydro && cj_nodeID == nodeID))) { + scheduler_activate(s, t); + ci->gpu_done_pair_f = 0; + cj->gpu_done_pair_f = 0; + } + /* Only activate tasks that involve a local active cell. */ if ((t_subtype == task_subtype_density || t_subtype == task_subtype_gradient || From 61c83a1f9b001b06552224fbf84ad7a1d06fcf99 Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Fri, 25 Oct 2024 18:44:31 +0100 Subject: [PATCH 017/217] All coded up but there seems to be a problem with duplicate unlocks. Not sure where the issue is --- .../HydroTests/GreshoVortex_3D/gresho.yml | 9 +- src/cell_unskip.c | 31 +- src/engine_maketasks.c | 349 +++++++++++++++++- src/engine_marktasks.c | 11 + 4 files changed, 375 insertions(+), 25 deletions(-) diff --git a/examples/HydroTests/GreshoVortex_3D/gresho.yml b/examples/HydroTests/GreshoVortex_3D/gresho.yml index a95a0eae32..e058f5117d 100644 --- a/examples/HydroTests/GreshoVortex_3D/gresho.yml +++ b/examples/HydroTests/GreshoVortex_3D/gresho.yml @@ -7,7 +7,10 @@ InternalUnitSystem: UnitTemp_in_cgs: 1 # Kelvin Scheduler: - max_top_level_cells: 15 + max_top_level_cells: 16 + cell_split_size: 1000 + dependency_graph_frequency: 1 + tasks_per_cell: 300 # Parameters governing the time integration TimeIntegration: @@ -29,10 +32,10 @@ Statistics: # Parameters for the hydrodynamics scheme SPH: - resolution_eta: 1.2348 # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel). + resolution_eta: 2.01 # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel). CFL_condition: 0.1 # Courant-Friedrich-Levy condition for time integration. 
# Parameters related to the initial conditions InitialConditions: file_name: ./greshoVortex.hdf5 # The file to read - periodic: 1 \ No newline at end of file + periodic: 1 diff --git a/src/cell_unskip.c b/src/cell_unskip.c index 95368320f4..3f1ea8baa8 100644 --- a/src/cell_unskip.c +++ b/src/cell_unskip.c @@ -1917,18 +1917,16 @@ int cell_unskip_hydro_tasks(struct cell *c, struct scheduler *s) { scheduler_activate(s, l->t); for (struct link *l = c->hydro.density_pack; l != NULL; l = l->next) { /* A. Nasar */ scheduler_activate(s, l->t); - if (l->t->ci != NULL) + if (l->t->ci != NULL){ l->t->ci->pack_done = 0; - if (l->t->ci != NULL) l->t->ci->gpu_done = 0; - if (l->t->ci != NULL) l->t->ci->unpack_done = 0; - if (l->t->cj != NULL) + } + if (l->t->cj != NULL){ l->t->cj->pack_done = 0; - if (l->t->cj != NULL) l->t->cj->gpu_done = 0; - if (l->t->cj != NULL) l->t->cj->unpack_done = 0; + } } for (struct link *l = c->hydro.density_unpack; l != NULL; l = l->next) { scheduler_activate(s, l->t); @@ -1936,18 +1934,16 @@ int cell_unskip_hydro_tasks(struct cell *c, struct scheduler *s) { } for (struct link *l = c->hydro.force_pack; l != NULL; l = l->next) { scheduler_activate(s, l->t); - if (l->t->ci != NULL) + if (l->t->ci != NULL){ l->t->ci->pack_done_f = 0; - if (l->t->ci != NULL) l->t->ci->gpu_done_f = 0; - if (l->t->ci != NULL) l->t->ci->unpack_done_f = 0; - if (l->t->cj != NULL) + } + if (l->t->cj != NULL){ l->t->cj->pack_done_f = 0; - if (l->t->cj != NULL) l->t->cj->gpu_done_f = 0; - if (l->t->cj != NULL) l->t->cj->unpack_done_f = 0; + } } for (struct link *l = c->hydro.force_unpack; l != NULL; l = l->next) { scheduler_activate(s, l->t); @@ -1957,18 +1953,16 @@ int cell_unskip_hydro_tasks(struct cell *c, struct scheduler *s) { #ifdef EXTRA_HYDRO_LOOP for (struct link *l = c->hydro.gradient_pack; l != NULL; l = l->next) { scheduler_activate(s, l->t); - if (l->t->ci != NULL) + if (l->t->ci != NULL){ l->t->ci->pack_done_g = 0; - if (l->t->ci != NULL) l->t->ci->gpu_done_g = 0; - if (l->t->ci != NULL) l->t->ci->unpack_done_g = 0; - if (l->t->cj != NULL) + } + if (l->t->cj != NULL){ l->t->cj->pack_done_g = 0; - if (l->t->cj != NULL) l->t->cj->gpu_done_g = 0; - if (l->t->cj != NULL) l->t->cj->unpack_done_g = 0; + } } for (struct link *l = c->hydro.gradient_unpack; l != NULL; l = l->next) { scheduler_activate(s, l->t); @@ -2000,6 +1994,7 @@ int cell_unskip_hydro_tasks(struct cell *c, struct scheduler *s) { * so, we have to do this now, from the active remote cell). */ else if (c->nodeID != nodeID && c_active) { #if defined(MPI_SYMMETRIC_FORCE_INTERACTION) && defined(WITH_MPI) + // A. Nasar POSSIBLE BUG HERE MISSING ACTIVATION OF PACK TASKS for (struct link *l = c->hydro.force; l != NULL; l = l->next) { struct task *t = l->t; if (t->type != task_type_pair && t->type != task_type_sub_pair) continue; diff --git a/src/engine_maketasks.c b/src/engine_maketasks.c index 1c5a65d88f..da03b2bfc5 100644 --- a/src/engine_maketasks.c +++ b/src/engine_maketasks.c @@ -584,7 +584,11 @@ void engine_addtasks_recv_hydro( if (!cell_get_flag(c, cell_flag_has_tasks)) return; /* Have we reached a level where there are any hydro tasks ? */ +#ifdef WITH_CUDA // A. Nasar + if (t_xv == NULL && c->hydro.density != NULL && c->hydro.density_pack != NULL) { +#else if (t_xv == NULL && c->hydro.density != NULL) { +#endif /*WITH_CUDA*/ #ifdef SWIFT_DEBUG_CHECKS /* Make sure this cell has a valid tag. 
*/ @@ -711,6 +715,13 @@ void engine_addtasks_recv_hydro( scheduler_addunlock(s, t_xv, l->t); scheduler_addunlock(s, l->t, t_rho); } +#ifdef WITH_CUDA /* A. Nasar POSSIBLE BUG HERE (More like PROBABLE)*/ + for (struct link *l = c->hydro.density_pack; l != NULL; l = l->next) { + scheduler_addunlock(s, t_xv, l->t); + scheduler_addunlock(s, l->t, t_rho); + } + scheduler_addunlock(s, c->hydro.super->hydro.d_unpack, t_rho); +#endif #ifdef EXTRA_HYDRO_LOOP for (struct link *l = c->hydro.gradient; l != NULL; l = l->next) { scheduler_addunlock(s, t_rho, l->t); @@ -720,12 +731,32 @@ void engine_addtasks_recv_hydro( scheduler_addunlock(s, t_gradient, l->t); scheduler_addunlock(s, l->t, tend); } -#else +#ifdef WITH_CUDA + for (struct link *l = c->hydro.gradient_pack; l != NULL; l = l->next) { + scheduler_addunlock(s, t_rho, l->t); +// scheduler_addunlock(s, l->t, t_gradient); + } + scheduler_addunlock(s, c->hydro.super->hydro.g_unpack, t_gradient); + + for (struct link *l = c->hydro.force_pack; l != NULL; l = l->next) { + scheduler_addunlock(s, t_gradient, l->t); +// scheduler_addunlock(s, l->t, t_ti); + } + scheduler_addunlock(s, c->hydro.super->hydro.f_unpack, tend); +#endif /*WITH_CUDA*/ +#else /*EXTRA_HYDRO_LOOP*/ for (struct link *l = c->hydro.force; l != NULL; l = l->next) { scheduler_addunlock(s, t_rho, l->t); scheduler_addunlock(s, l->t, tend); } -#endif +#ifdef WITH_CUDA + for (struct link *l = c->hydro.force_pack; l != NULL; l = l->next) { + scheduler_addunlock(s, t_rho, l->t); +// scheduler_addunlock(s, l->t, t_ti); + } + scheduler_addunlock(s, c->hydro.super->hydro.f_unpack, tend); +#endif /*WITH_CUDA*/ +#endif/*EXTRA_HYDRO_LOOP*/ if (with_limiter) { for (struct link *l = c->hydro.limiter; l != NULL; l = l->next) { @@ -2116,6 +2147,12 @@ void engine_count_and_link_tasks_mapper(void *map_data, int num_elements, if (t_subtype == task_subtype_density) { engine_addlink(e, &ci->hydro.density, t); + } else if (t_subtype == task_subtype_gpu_pack) { // A. Nasar + engine_addlink(e, &ci->hydro.density_pack, t); + } else if (t_subtype == task_subtype_gpu_pack_f) { + engine_addlink(e, &ci->hydro.force_pack, t); + } else if (t_subtype == task_subtype_gpu_pack_g) { + engine_addlink(e, &ci->hydro.gradient_pack, t); } else if (t_subtype == task_subtype_grav) { engine_addlink(e, &ci->grav.grav, t); } else if (t_subtype == task_subtype_external_grav) { @@ -2130,6 +2167,15 @@ void engine_count_and_link_tasks_mapper(void *map_data, int num_elements, if (t_subtype == task_subtype_density) { engine_addlink(e, &ci->hydro.density, t); engine_addlink(e, &cj->hydro.density, t); + } else if (t_subtype == task_subtype_gpu_pack) { // A. Nasar + engine_addlink(e, &ci->hydro.density_pack, t); + engine_addlink(e, &cj->hydro.density_pack, t); + } else if (t_subtype == task_subtype_gpu_pack_f) { + engine_addlink(e, &ci->hydro.force_pack, t); + engine_addlink(e, &cj->hydro.force_pack, t); + } else if (t_subtype == task_subtype_gpu_pack_g) { + engine_addlink(e, &ci->hydro.gradient_pack, t); + engine_addlink(e, &cj->hydro.gradient_pack, t); } else if (t_subtype == task_subtype_grav) { engine_addlink(e, &ci->grav.grav, t); engine_addlink(e, &cj->grav.grav, t); @@ -2146,6 +2192,12 @@ void engine_count_and_link_tasks_mapper(void *map_data, int num_elements, if (t_subtype == task_subtype_density) { engine_addlink(e, &ci->hydro.density, t); + } else if (t_subtype == task_subtype_gpu_pack) { // A. 
Nasar + error("Abouzied: you need to code this up!"); + } else if (t_subtype == task_subtype_gpu_pack_f) { + error("Abouzied: you need to code this up!"); + } else if (t_subtype == task_subtype_gpu_pack_g) { + error("Abouzied: you need to code this up!"); } else if (t_subtype == task_subtype_grav) { engine_addlink(e, &ci->grav.grav, t); } else if (t_subtype == task_subtype_external_grav) { @@ -2160,6 +2212,12 @@ void engine_count_and_link_tasks_mapper(void *map_data, int num_elements, if (t_subtype == task_subtype_density) { engine_addlink(e, &ci->hydro.density, t); engine_addlink(e, &cj->hydro.density, t); + } else if (t_subtype == task_subtype_gpu_pack) { + error("Abouzied: you need to code this up!"); + } else if (t_subtype == task_subtype_gpu_pack_f) { + error("Abouzied: you need to code this up!"); + } else if (t_subtype == task_subtype_gpu_pack_g) { + error("Abouzied: you need to code this up!"); } else if (t_subtype == task_subtype_grav) { engine_addlink(e, &ci->grav.grav, t); engine_addlink(e, &cj->grav.grav, t); @@ -2425,12 +2483,14 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, const int with_sink = (e->policy & engine_policy_sinks); #ifdef EXTRA_HYDRO_LOOP struct task *t_gradient = NULL; + struct task *t_gradient_gpu = NULL; // A. Nasar #endif #ifdef EXTRA_STAR_LOOPS struct task *t_star_prep1 = NULL; struct task *t_star_prep2 = NULL; #endif struct task *t_force = NULL; + struct task *t_force_gpu = NULL; struct task *t_limiter = NULL; struct task *t_star_density = NULL; struct task *t_star_feedback = NULL; @@ -2466,6 +2526,25 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, scheduler_addunlock(sched, ci->hydro.super->hydro.drift, t); } + /*Make packing depend on sorts and drift A. Nasar */ + else if (t_type == task_type_self && t_subtype == task_subtype_gpu_pack && ci->nodeID == nodeID) { + scheduler_addunlock(sched, ci->hydro.super->hydro.drift, t); + } + + /*Make packing depend on sorts and drift A. Nasar */ + else if (t_type == task_type_pair && t_subtype == task_subtype_gpu_pack) { + scheduler_addunlock(sched, ci->hydro.super->hydro.sorts, t); + if (ci->hydro.super != cj->hydro.super){ + scheduler_addunlock(sched, cj->hydro.super->hydro.sorts, t); + } + if(ci->nodeID == nodeID){ + scheduler_addunlock(sched, ci->hydro.super->hydro.drift, t); + } + if((cj->nodeID == nodeID) && (ci->hydro.super != cj->hydro.super)){ + scheduler_addunlock(sched, cj->hydro.super->hydro.drift, t); + } + } + /* Sort tasks depend on the drift of the cell (stars version). */ else if (t_type == task_type_stars_sort && ci->nodeID == nodeID) { scheduler_addunlock(sched, ci->hydro.super->stars.drift, t); @@ -2482,7 +2561,10 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, /* Task for the second hydro loop, */ t_force = scheduler_addtask(sched, task_type_self, task_subtype_force, flags, 0, ci, NULL); - + /* Task for the second GPU hydro loop A. Nasar */ + t_force_gpu = scheduler_addtask(sched, task_type_self, task_subtype_gpu_pack_f, + 0, 0, ci, NULL); + engine_addlink(e, &ci->hydro.force_pack, t_force_gpu); /* the task for the time-step limiter */ if (with_timestep_limiter) { t_limiter = scheduler_addtask(sched, task_type_self, @@ -2583,18 +2665,28 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, t_gradient = scheduler_addtask(sched, task_type_self, task_subtype_gradient, flags, 0, ci, NULL); + /* Same work for the additional GPU hydro loop A. 
Nasar */ + t_gradient_gpu = scheduler_addtask(sched, task_type_self, + task_subtype_gpu_pack_g, 0, 0, ci, NULL); + /* Add the link between the new loops and the cell */ engine_addlink(e, &ci->hydro.gradient, t_gradient); + engine_addlink(e, &ci->hydro.gradient_pack, t_gradient_gpu); + /* Now, build all the dependencies for the hydro */ engine_make_hydro_loops_dependencies(sched, t, t_gradient, t_force, t_limiter, ci, with_cooling, with_timestep_limiter); + + scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, t_gradient_gpu); + scheduler_addunlock(sched, ci->hydro.super->hydro.extra_ghost, t_force_gpu); #else /* Now, build all the dependencies for the hydro */ engine_make_hydro_loops_dependencies(sched, t, t_force, t_limiter, ci, with_cooling, with_timestep_limiter); + scheduler_addunlock(sched, ci->hydro.super->hydro.extra_ghost, t_force_gpu); #endif /* Create the task dependencies */ @@ -2750,6 +2842,12 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, /* New task for the force */ t_force = scheduler_addtask(sched, task_type_pair, task_subtype_force, flags, 0, ci, cj); + /* New task for the force A. Nasar */ + t_force_gpu = scheduler_addtask(sched, task_type_pair, task_subtype_gpu_pack_f, + 0, 0, ci, cj); + /* Add the link between the new loop and both cells */ + engine_addlink(e, &ci->hydro.force_pack, t_force_gpu); + engine_addlink(e, &cj->hydro.force_pack, t_force_gpu); #ifdef MPI_SYMMETRIC_FORCE_INTERACTION /* The order of operations for an inactive local cell interacting @@ -2900,10 +2998,16 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, /* Start by constructing the task for the second and third hydro loop */ t_gradient = scheduler_addtask(sched, task_type_pair, task_subtype_gradient, flags, 0, ci, cj); + /* Start by constructing the task for the second and third hydro loop A. 
Nasar */ + t_gradient_gpu = scheduler_addtask(sched, task_type_pair, + task_subtype_gpu_pack_g, 0, 0, ci, cj); /* Add the link between the new loop and both cells */ engine_addlink(e, &ci->hydro.gradient, t_gradient); engine_addlink(e, &cj->hydro.gradient, t_gradient); + /* Add the link between the new loop and both cells */ + engine_addlink(e, &ci->hydro.gradient_pack, t_gradient_gpu); + engine_addlink(e, &cj->hydro.gradient_pack, t_gradient_gpu); /* Now, build all the dependencies for the hydro for the cells */ /* that are local and are not descendant of the same super_hydro-cells */ @@ -2917,6 +3021,14 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, t_limiter, cj, with_cooling, with_timestep_limiter); } + if(ci->nodeID == nodeID){ + scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, t_gradient_gpu); + scheduler_addunlock(sched, ci->hydro.super->hydro.extra_ghost, t_force_gpu); + } + if ((cj->nodeID == nodeID) && (ci->hydro.super != cj->hydro.super)) { + scheduler_addunlock(sched, cj->hydro.super->hydro.ghost_out, t_gradient_gpu); + scheduler_addunlock(sched, cj->hydro.super->hydro.extra_ghost, t_force_gpu); + } #else /* Now, build all the dependencies for the hydro for the cells */ @@ -2931,6 +3043,11 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, with_cooling, with_timestep_limiter); } + if (ci->nodeID == nodeID) + scheduler_addunlock(sched, ci->hydro.super->hydro.extra_ghost, t_force_gpu); + if ((cj->nodeID == nodeID) && (ci->hydro.super != cj->hydro.super)) + scheduler_addunlock(sched, cj->hydro.super->hydro.extra_ghost, t_force_gpu); + #endif if (with_feedback) { @@ -4142,9 +4259,13 @@ void engine_make_hydroloop_tasks_mapper(void *map_data, int num_elements, continue; /* If the cell is local build a self-interaction */ + struct task *t_pack_self;// A. Nasar if (ci->nodeID == nodeID) { scheduler_addtask(sched, task_type_self, task_subtype_density, 0, 0, ci, NULL); + struct task *t_pack = scheduler_addtask( + sched, task_type_self, task_subtype_gpu_pack, 0, 0, ci, NULL); // A. Nasar + t_pack_self = t_pack; } /* Now loop over all the neighbours of this cell */ @@ -4178,6 +4299,8 @@ void engine_make_hydroloop_tasks_mapper(void *map_data, int num_elements, const int sid = sortlistID[(kk + 1) + 3 * ((jj + 1) + 3 * (ii + 1))]; scheduler_addtask(sched, task_type_pair, task_subtype_density, sid, 0, ci, cj); + struct task *t_pack = scheduler_addtask( + sched, task_type_pair, task_subtype_gpu_pack, sid, 0, ci, cj); // A. Nasar #ifdef SWIFT_DEBUG_CHECKS #ifdef WITH_MPI @@ -4600,11 +4723,15 @@ void engine_maketasks(struct engine *e) { struct cell *cells = s->cells_top; const int nr_cells = s->nr_cells; const ticks tic = getticks(); - + sched->pack_tasks_ind = NULL; // A. Nasar /* Re-set the scheduler. */ scheduler_reset(sched, engine_estimate_nr_tasks(e)); ticks tic2 = getticks(); + /*Initialise GPU task size in prep. for creation A. Nasar */ + sched->target_gpu_tasks = s->nr_cells; // OK AS LONG AS NOT SPLITTING + const int target_gpu_tasks = sched->target_gpu_tasks; + sched->pack_tasks_ind = (int *)calloc(target_gpu_tasks, sizeof(int)); /* Construct the first hydro loop over neighbours */ if (e->policy & engine_policy_hydro) @@ -4716,6 +4843,220 @@ void engine_maketasks(struct engine *e) { * threadpool_auto_chunk_size, e); */ } + /* Now, create unpack tasks based on the existing packs and create + * the dependencies pack->unpack->ghost_in A. 
Nasar */ + + const int pack_size = sched->pack_size; + + int count_current_self = 0; + int count_current_pair = 0; + + struct task *last_created_self_unpack = NULL; + struct task *last_created_pair_unpack = NULL; + + /* Loop over all the currently existing pack tasks */ + for (int i = 0; i < sched->nr_tasks; i++) { + + struct task *t = &sched->tasks[i]; + if (t->subtype != task_subtype_gpu_pack) + continue; + + if (t->type == task_type_self) { + + if (count_current_self % pack_size == 0) { + last_created_self_unpack = scheduler_addtask( + sched, task_type_self, task_subtype_gpu_unpack, 0, 0, NULL, NULL); +// last_created_self_unpack->ci_unpack = (struct cell **)calloc(pack_size, sizeof(struct cell *)); + last_created_self_unpack->gpu_done = 0; + } + + /* pack -> unpack -> ghost_in */ + scheduler_addunlock(sched, t, last_created_self_unpack); + scheduler_addunlock(sched, last_created_self_unpack, + t->ci->hydro.super->hydro.ghost_in); + /*Creating links between a each cell and its unpack task*/ + engine_addlink(e, &t->ci->hydro.density_unpack, last_created_self_unpack); + /*Create a link between each unpack task and all the pack task cells it unlocks*/ +// last_created_self_unpack->ci_unpack[packed_counter]=t->ci; + ++count_current_self; + } + + else if (t->type == task_type_pair) { + if (count_current_pair % pack_size == 0) { + last_created_pair_unpack = scheduler_addtask( + sched, task_type_pair, task_subtype_gpu_unpack, 0, 0, NULL, NULL); + } + + /* pack -> unpack -> ghost_in */ + if(t->ci->hydro.ghost_in == NULL) + fprintf(stderr, "Ghost in for cell i is NULL\n"); + if(t->cj->hydro.ghost_in == NULL) + fprintf(stderr, "Ghost in for cell j is NULL\n"); + + scheduler_addunlock(sched, t, last_created_pair_unpack); +// if(t->ci->hydro.super == t->ci && t->ci->nodeID == e->nodeID) +// scheduler_addunlock(sched, last_created_pair_unpack, +// t->ci->hydro.super->hydro.ghost_in); +// if(t->cj->hydro.super == t->cj && t->cj->nodeID == e->nodeID) +// scheduler_addunlock(sched, last_created_pair_unpack, +// t->cj->hydro.super->hydro.ghost_in); + + engine_addlink(e, &t->ci->hydro.density_unpack, last_created_pair_unpack); + engine_addlink(e, &t->cj->hydro.density_unpack, last_created_pair_unpack); + + + t->ci->hydro.d_unpack = last_created_pair_unpack; + t->cj->hydro.d_unpack = last_created_pair_unpack; + +// t->ci->hydro.super->hydro.d_unpack = last_created_self_unpack; + + ++count_current_pair; + } else { + /* Abouzied: I need to implement the sub-self and sub-pair version */ + error("Something bad happened"); + } + } + message("nr unpacks %i\n", count_current_pair); +#ifdef SWIFT_DEBUG_CHECKS + if (count_current_self != sched->nr_self_pack_tasks) + error("We did not find the correct number of self pack tasks!!"); + if (count_current_pair != sched->nr_pair_pack_tasks) + error("We did not find the correct number of pair pack tasks!!"); +#endif + + /*Now create unpacks for all gpu_pack_g (gradient) tasks A. 
Nasar */ + count_current_self = 0; + count_current_pair = 0; + + last_created_self_unpack = NULL; + last_created_pair_unpack = NULL; + /* Loop over all the currently existing gradient pack tasks */ + for (int i = 0; i < sched->nr_tasks; i++) { + + struct task *t = &sched->tasks[i]; + if (t->subtype != task_subtype_gpu_pack_g) + continue; + + if (t->type == task_type_self) { + + if (count_current_self % pack_size == 0) { + last_created_self_unpack = scheduler_addtask( + sched, task_type_self, task_subtype_gpu_unpack_g, 0, 0, NULL, NULL); + last_created_self_unpack->gpu_done = 0; + } + + /* pack -> unpack -> ghost_in */ + scheduler_addunlock(sched, t, last_created_self_unpack); + scheduler_addunlock(sched, last_created_self_unpack, + t->ci->hydro.super->hydro.extra_ghost); + /*Creating links between a each cell and its unpack task*/ + engine_addlink(e, &t->ci->hydro.gradient_unpack, last_created_self_unpack); + + ++count_current_self; + } + + else if (t->type == task_type_pair) { + if (count_current_pair % pack_size == 0) { + last_created_pair_unpack = scheduler_addtask( + sched, task_type_pair, task_subtype_gpu_unpack_g, 0, 0, NULL, NULL); + } + + /* pack -> unpack -> ghost_in */ + scheduler_addunlock(sched, t, last_created_pair_unpack); + if(t->ci->nodeID == e->nodeID) + scheduler_addunlock(sched, last_created_pair_unpack, + t->ci->hydro.super->hydro.extra_ghost); + if((t->cj->nodeID == e->nodeID) && (t->ci->hydro.super != t->cj->hydro.super)) + scheduler_addunlock(sched, last_created_pair_unpack, + t->cj->hydro.super->hydro.extra_ghost); + + engine_addlink(e, &t->ci->hydro.gradient_unpack, last_created_pair_unpack); + engine_addlink(e, &t->cj->hydro.gradient_unpack, last_created_pair_unpack); + +// t->ci->hydro.super->hydro.d_unpack = last_created_pair_unpack; +// t->cj->hydro.super->hydro.d_unpack = last_created_pair_unpack; + t->ci->hydro.g_unpack = last_created_pair_unpack; + t->cj->hydro.g_unpack = last_created_pair_unpack; + + ++count_current_pair; + } else { + /* Abouzied: I need to implement the sub-self and sub-pair version */ + error("Something bad happened"); + } + } +#ifdef SWIFT_DEBUG_CHECKS + if (count_current_self != sched->nr_self_pack_tasks_g) + error("We did not find the correct number of G self pack tasks!! count %i what it shoudl be %i", count_current_self, sched->nr_self_pack_tasks_g); + if (count_current_pair != sched->nr_pair_pack_tasks_g) + error("We did not find the correct number of G pair pack tasks!! 
count %i what it shoudl be %i", count_current_pair, sched->nr_pair_pack_tasks_g); +#endif + + /*Now create unpacks for all gpu_pack_f (force) tasks*/ + count_current_self = 0; + count_current_pair = 0; + + last_created_self_unpack = NULL; + last_created_pair_unpack = NULL; + /* Loop over all the currently existing gradient pack tasks */ + for (int i = 0; i < sched->nr_tasks; i++) { + + struct task *t = &sched->tasks[i]; + if (t->subtype != task_subtype_gpu_pack_f) + continue; + + if (t->type == task_type_self) { + + if (count_current_self % pack_size == 0) { + last_created_self_unpack = scheduler_addtask( + sched, task_type_self, task_subtype_gpu_unpack_f, 0, 0, NULL, NULL); + last_created_self_unpack->gpu_done = 0; + } + + /* pack -> unpack -> ghost_in */ + scheduler_addunlock(sched, t, last_created_self_unpack); + scheduler_addunlock(sched, last_created_self_unpack, + t->ci->hydro.super->hydro.end_force); + /*Creating links between a each cell and its unpack task*/ + engine_addlink(e, &t->ci->hydro.force_unpack, last_created_self_unpack); + + ++count_current_self; + } + + else if (t->type == task_type_pair) { + if (count_current_pair % pack_size == 0) { + last_created_pair_unpack = scheduler_addtask( + sched, task_type_pair, task_subtype_gpu_unpack_f, 0, 0, NULL, NULL); + } + + /* pack -> unpack -> ghost_in */ + scheduler_addunlock(sched, t, last_created_pair_unpack); + if (t->ci->nodeID == e->nodeID) + scheduler_addunlock(sched, last_created_pair_unpack, + t->ci->hydro.super->hydro.end_force); + if ((t->cj->nodeID == e->nodeID) && (t->ci->hydro.super != t->cj->hydro.super)) + scheduler_addunlock(sched, last_created_pair_unpack, + t->cj->hydro.super->hydro.end_force); + + engine_addlink(e, &t->ci->hydro.force_unpack, last_created_pair_unpack); + engine_addlink(e, &t->cj->hydro.force_unpack, last_created_pair_unpack); + + + t->ci->hydro.f_unpack = last_created_pair_unpack; + t->cj->hydro.f_unpack = last_created_pair_unpack; + + ++count_current_pair; + } else { + /* Abouzied: I need to implement the sub-self and sub-pair version */ + error("Something bad happened"); + } + } +#ifdef SWIFT_DEBUG_CHECKS + if (count_current_self != sched->nr_self_pack_tasks_f) + error("We did not find the correct number of F self pack tasks!!"); + if (count_current_pair != sched->nr_pair_pack_tasks_f) + error("We did not find the correct number of F pair pack tasks!!"); +#endif + if (e->verbose) message("Making extra hydroloop tasks took %.3f %s.", clocks_from_ticks(getticks() - tic2), clocks_getunit()); diff --git a/src/engine_marktasks.c b/src/engine_marktasks.c index 0975b7e9d8..a9bf819ce6 100644 --- a/src/engine_marktasks.c +++ b/src/engine_marktasks.c @@ -91,12 +91,14 @@ void engine_marktasks_mapper(void *map_data, int num_elements, t_subtype == task_subtype_gpu_unpack_g || t_subtype == task_subtype_gpu_unpack_f)){ // A. Nasar scheduler_activate(s, t); + continue; } if (t_type == task_type_pair && (t_subtype == task_subtype_gpu_unpack || t_subtype == task_subtype_gpu_unpack_g || t_subtype == task_subtype_gpu_unpack_f)){ // A. Nasar scheduler_activate(s, t); + continue; // fprintf(stderr,"activated pair unpack in marktasks\n"); } @@ -107,7 +109,16 @@ void engine_marktasks_mapper(void *map_data, int num_elements, struct cell *ci = t->ci; #ifdef SWIFT_DEBUG_CHECKS +#ifndef WITH_CUDA // A. 
Nasar if (ci->nodeID != nodeID) error("Non-local self task found"); +#else + if (ci->nodeID != nodeID && t_subtype != task_subtype_gpu_unpack && + t_subtype != task_subtype_gpu_unpack_f && + t_subtype != task_subtype_gpu_unpack_g){ + fprintf(stderr, "task is %i\n", subtaskID_names[t->subtype]); + error("Non-local self task found. Task is subtaskID_names[%s]", subtaskID_names[t->subtype]); + } +#endif #endif const int ci_active_hydro = cell_is_active_hydro(ci, e); From 21ed5cde81f3a5d90cde8febd0e11fab687c4239 Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Mon, 28 Oct 2024 18:19:02 +0000 Subject: [PATCH 018/217] Made some changes here and there to try and get deps right for unpack tasks. Code now hangs so there must be some issue with task activation. problem for tomorrow! --- .../HydroTests/GreshoVortex_3D/gresho.yml | 8 +- src/engine.c | 6 +- src/engine_maketasks.c | 83 ++++++++++--------- src/engine_marktasks.c | 2 +- src/scheduler.c | 25 +++--- src/scheduler.h | 5 -- src/task.c | 20 ++--- 7 files changed, 75 insertions(+), 74 deletions(-) diff --git a/examples/HydroTests/GreshoVortex_3D/gresho.yml b/examples/HydroTests/GreshoVortex_3D/gresho.yml index e058f5117d..4325c9b9c3 100644 --- a/examples/HydroTests/GreshoVortex_3D/gresho.yml +++ b/examples/HydroTests/GreshoVortex_3D/gresho.yml @@ -9,21 +9,21 @@ InternalUnitSystem: Scheduler: max_top_level_cells: 16 cell_split_size: 1000 + dependency_graph_cell: 10 dependency_graph_frequency: 1 - tasks_per_cell: 300 # Parameters governing the time integration TimeIntegration: time_begin: 0. # The starting time of the simulation (in internal units). - time_end: 1. # The end time of the simulation (in internal units). + time_end: 1.0 # The end time of the simulation (in internal units). dt_min: 1e-6 # The minimal time-step size of the simulation (in internal units). - dt_max: 1e-2 # The maximal time-step size of the simulation (in internal units). + dt_max: 1e-4 # The maximal time-step size of the simulation (in internal units). # Parameters governing the snapshots Snapshots: basename: gresho # Common part of the name of output files time_first: 0. # Time of the first output (in internal units) - delta_time: 1e-1 # Time difference between consecutive outputs (in internal units) + delta_time: 1e-3 # Time difference between consecutive outputs (in internal units) compression: 1 # Parameters governing the conserved quantities statistics diff --git a/src/engine.c b/src/engine.c index b461060084..b353dd4496 100644 --- a/src/engine.c +++ b/src/engine.c @@ -1092,9 +1092,9 @@ int engine_estimate_nr_tasks(const struct engine *e) { */ n1 += 38; n2 += 2; -#ifdef WITH_CUDA - n1 += 2; //Self force and density packs - n1 += 26; //Pair force and density packs +#ifdef WITH_CUDA // A. Nasar + n1 += 4; //Self force and density packs should be 2 but doubled to prevent code crash due to unpack tasks + n1 += 52; //Pair force and density packs should be 26 but doubled to prevent code crash due to unpack tasks #endif #ifdef WITH_MPI n1 += 6; diff --git a/src/engine_maketasks.c b/src/engine_maketasks.c index da03b2bfc5..c9e779e781 100644 --- a/src/engine_maketasks.c +++ b/src/engine_maketasks.c @@ -2857,8 +2857,11 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, /* Make all force tasks depend on the sorts */ scheduler_addunlock(sched, ci->hydro.super->hydro.sorts, t_force); + /* Make GPU force tasks depend on the sorts A. 
Nasar */ + scheduler_addunlock(sched, ci->hydro.super->hydro.sorts, t_force_gpu); if (ci->hydro.super != cj->hydro.super) { scheduler_addunlock(sched, cj->hydro.super->hydro.sorts, t_force); + scheduler_addunlock(sched, cj->hydro.super->hydro.sorts, t_force_gpu); } #endif @@ -2998,7 +3001,7 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, /* Start by constructing the task for the second and third hydro loop */ t_gradient = scheduler_addtask(sched, task_type_pair, task_subtype_gradient, flags, 0, ci, cj); - /* Start by constructing the task for the second and third hydro loop A. Nasar */ + /* Start by constructing the task for the second and third GPU hydro loop A. Nasar */ t_gradient_gpu = scheduler_addtask(sched, task_type_pair, task_subtype_gpu_pack_g, 0, 0, ci, cj); @@ -3015,17 +3018,15 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, engine_make_hydro_loops_dependencies(sched, t, t_gradient, t_force, t_limiter, ci, with_cooling, with_timestep_limiter); + /*Same for GPU tasks*/ + scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, t_gradient_gpu); + scheduler_addunlock(sched, ci->hydro.super->hydro.extra_ghost, t_force_gpu); } if ((cj->nodeID == nodeID) && (ci->hydro.super != cj->hydro.super)) { engine_make_hydro_loops_dependencies(sched, t, t_gradient, t_force, t_limiter, cj, with_cooling, with_timestep_limiter); - } - if(ci->nodeID == nodeID){ - scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, t_gradient_gpu); - scheduler_addunlock(sched, ci->hydro.super->hydro.extra_ghost, t_force_gpu); - } - if ((cj->nodeID == nodeID) && (ci->hydro.super != cj->hydro.super)) { + /*Same for GPU tasks*/ scheduler_addunlock(sched, cj->hydro.super->hydro.ghost_out, t_gradient_gpu); scheduler_addunlock(sched, cj->hydro.super->hydro.extra_ghost, t_force_gpu); } @@ -3037,16 +3038,16 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, engine_make_hydro_loops_dependencies(sched, t, t_force, t_limiter, ci, with_cooling, with_timestep_limiter); + // GPU tasks A. Nasar + scheduler_addunlock(sched, ci->hydro.super->hydro.extra_ghost, t_force_gpu); } if ((cj->nodeID == nodeID) && (ci->hydro.super != cj->hydro.super)) { engine_make_hydro_loops_dependencies(sched, t, t_force, t_limiter, cj, with_cooling, with_timestep_limiter); - } - if (ci->nodeID == nodeID) - scheduler_addunlock(sched, ci->hydro.super->hydro.extra_ghost, t_force_gpu); - if ((cj->nodeID == nodeID) && (ci->hydro.super != cj->hydro.super)) + // GPU tasks A. Nasar scheduler_addunlock(sched, cj->hydro.super->hydro.extra_ghost, t_force_gpu); + } #endif @@ -4723,7 +4724,6 @@ void engine_maketasks(struct engine *e) { struct cell *cells = s->cells_top; const int nr_cells = s->nr_cells; const ticks tic = getticks(); - sched->pack_tasks_ind = NULL; // A. Nasar /* Re-set the scheduler. */ scheduler_reset(sched, engine_estimate_nr_tasks(e)); @@ -4731,7 +4731,6 @@ void engine_maketasks(struct engine *e) { /*Initialise GPU task size in prep. for creation A. 
Nasar */ sched->target_gpu_tasks = s->nr_cells; // OK AS LONG AS NOT SPLITTING const int target_gpu_tasks = sched->target_gpu_tasks; - sched->pack_tasks_ind = (int *)calloc(target_gpu_tasks, sizeof(int)); /* Construct the first hydro loop over neighbours */ if (e->policy & engine_policy_hydro) @@ -4894,12 +4893,6 @@ void engine_maketasks(struct engine *e) { fprintf(stderr, "Ghost in for cell j is NULL\n"); scheduler_addunlock(sched, t, last_created_pair_unpack); -// if(t->ci->hydro.super == t->ci && t->ci->nodeID == e->nodeID) -// scheduler_addunlock(sched, last_created_pair_unpack, -// t->ci->hydro.super->hydro.ghost_in); -// if(t->cj->hydro.super == t->cj && t->cj->nodeID == e->nodeID) -// scheduler_addunlock(sched, last_created_pair_unpack, -// t->cj->hydro.super->hydro.ghost_in); engine_addlink(e, &t->ci->hydro.density_unpack, last_created_pair_unpack); engine_addlink(e, &t->cj->hydro.density_unpack, last_created_pair_unpack); @@ -4916,14 +4909,23 @@ void engine_maketasks(struct engine *e) { error("Something bad happened"); } } - message("nr unpacks %i\n", count_current_pair); #ifdef SWIFT_DEBUG_CHECKS if (count_current_self != sched->nr_self_pack_tasks) error("We did not find the correct number of self pack tasks!!"); if (count_current_pair != sched->nr_pair_pack_tasks) error("We did not find the correct number of pair pack tasks!!"); #endif - + /* Loop over all the currently existing ghost_in tasks to add unpack dependency*/ + for (int i = 0; i < sched->nr_tasks; i++) { + struct task *t = &sched->tasks[i]; + if (t->type != task_type_ghost_in) + continue; + if(t->ci->hydro.super == t->ci && t->ci->nodeID == e->nodeID) + scheduler_addunlock(sched, t->ci->hydro.d_unpack, t); + for (struct link *l = t->ci->hydro.density_pack; l != NULL; l = l->next) { + if(l->t->type == task_type_pair)scheduler_addunlock(sched, l->t, t); + } + } /*Now create unpacks for all gpu_pack_g (gradient) tasks A. Nasar */ count_current_self = 0; count_current_pair = 0; @@ -4963,18 +4965,10 @@ void engine_maketasks(struct engine *e) { /* pack -> unpack -> ghost_in */ scheduler_addunlock(sched, t, last_created_pair_unpack); - if(t->ci->nodeID == e->nodeID) - scheduler_addunlock(sched, last_created_pair_unpack, - t->ci->hydro.super->hydro.extra_ghost); - if((t->cj->nodeID == e->nodeID) && (t->ci->hydro.super != t->cj->hydro.super)) - scheduler_addunlock(sched, last_created_pair_unpack, - t->cj->hydro.super->hydro.extra_ghost); engine_addlink(e, &t->ci->hydro.gradient_unpack, last_created_pair_unpack); engine_addlink(e, &t->cj->hydro.gradient_unpack, last_created_pair_unpack); -// t->ci->hydro.super->hydro.d_unpack = last_created_pair_unpack; -// t->cj->hydro.super->hydro.d_unpack = last_created_pair_unpack; t->ci->hydro.g_unpack = last_created_pair_unpack; t->cj->hydro.g_unpack = last_created_pair_unpack; @@ -4990,7 +4984,17 @@ void engine_maketasks(struct engine *e) { if (count_current_pair != sched->nr_pair_pack_tasks_g) error("We did not find the correct number of G pair pack tasks!! 
count %i what it shoudl be %i", count_current_pair, sched->nr_pair_pack_tasks_g); #endif - + /* Loop over all the currently existing extra_ghost tasks to add unpack dependency*/ + for (int i = 0; i < sched->nr_tasks; i++) { + struct task *t = &sched->tasks[i]; + if (t->type != task_type_extra_ghost) + continue; + if(t->ci->nodeID == e->nodeID) + scheduler_addunlock(sched, t->ci->hydro.g_unpack, t); + for (struct link *l = t->ci->hydro.gradient_pack; l != NULL; l = l->next) { + if(l->t->type == task_type_pair)scheduler_addunlock(sched, l->t, t); + } + } /*Now create unpacks for all gpu_pack_f (force) tasks*/ count_current_self = 0; count_current_pair = 0; @@ -5030,17 +5034,10 @@ void engine_maketasks(struct engine *e) { /* pack -> unpack -> ghost_in */ scheduler_addunlock(sched, t, last_created_pair_unpack); - if (t->ci->nodeID == e->nodeID) - scheduler_addunlock(sched, last_created_pair_unpack, - t->ci->hydro.super->hydro.end_force); - if ((t->cj->nodeID == e->nodeID) && (t->ci->hydro.super != t->cj->hydro.super)) - scheduler_addunlock(sched, last_created_pair_unpack, - t->cj->hydro.super->hydro.end_force); engine_addlink(e, &t->ci->hydro.force_unpack, last_created_pair_unpack); engine_addlink(e, &t->cj->hydro.force_unpack, last_created_pair_unpack); - t->ci->hydro.f_unpack = last_created_pair_unpack; t->cj->hydro.f_unpack = last_created_pair_unpack; @@ -5056,7 +5053,17 @@ void engine_maketasks(struct engine *e) { if (count_current_pair != sched->nr_pair_pack_tasks_f) error("We did not find the correct number of F pair pack tasks!!"); #endif - + /* Loop over all the currently existing end_force tasks to add unpack dependency*/ + for (int i = 0; i < sched->nr_tasks; i++) { + struct task *t = &sched->tasks[i]; + if (t->type != task_type_end_hydro_force) + continue; + if(t->ci->nodeID == e->nodeID) + scheduler_addunlock(sched, t->ci->hydro.f_unpack, t); + for (struct link *l = t->ci->hydro.force_pack; l != NULL; l = l->next) { + if(l->t->type == task_type_pair)scheduler_addunlock(sched, l->t, t); + } + } if (e->verbose) message("Making extra hydroloop tasks took %.3f %s.", clocks_from_ticks(getticks() - tic2), clocks_getunit()); diff --git a/src/engine_marktasks.c b/src/engine_marktasks.c index a9bf819ce6..11d8b46d92 100644 --- a/src/engine_marktasks.c +++ b/src/engine_marktasks.c @@ -487,7 +487,7 @@ void engine_marktasks_mapper(void *map_data, int num_elements, cj->gpu_done_pair_f = 0; } - /* Only activate tasks that involve a local active cell. */ + /* Only activate tasks that involve a local active cell. A. Nasar THIS COULD BE SOURCE OF BUG */ if ((t_subtype == task_subtype_density || t_subtype == task_subtype_gradient || t_subtype == task_subtype_limiter || diff --git a/src/scheduler.c b/src/scheduler.c index b4eb8ae70d..805ea4ce6d 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -901,7 +901,9 @@ void scheduler_write_cell_dependencies(struct scheduler *s, int verbose, int local_count = 0; for (int i = 0; i < s->nr_tasks; i++) { const struct task *ta = &s->tasks[i]; - +// if(ta->subtype == task_subtype_gpu_unpack +// || ta->subtype == task_subtype_gpu_unpack_f +// || ta->subtype == task_subtype_gpu_unpack_g)continue; /* Are we using this task? * For the 0-step, we wish to show all the tasks (even the inactives). 
*/ if (step != 0 && ta->skip) continue; @@ -953,7 +955,9 @@ void scheduler_write_cell_dependencies(struct scheduler *s, int verbose, /* and their dependencies */ for (int j = 0; j < ta->nr_unlock_tasks; j++) { const struct task *tb = ta->unlock_tasks[j]; - + if(tb->subtype == task_subtype_gpu_unpack + || tb->subtype == task_subtype_gpu_unpack_f + || tb->subtype == task_subtype_gpu_unpack_g)continue; /* Are we using this task? * For the 0-step, we wish to show all the tasks (even the inactive). */ if (step != 0 && tb->skip) continue; @@ -1865,6 +1869,8 @@ void scheduler_set_unlocks(struct scheduler *s) { struct task *t = &s->tasks[k]; for (int i = 0; i < t->nr_unlock_tasks; i++) { for (int j = i + 1; j < t->nr_unlock_tasks; j++) { +// if (t->unlock_tasks[i] == t->unlock_tasks[j] && t->subtype != task_subtype_gpu_unpack +// && t->subtype != task_subtype_gpu_unpack_g && t->subtype != task_subtype_gpu_unpack_f) if (t->unlock_tasks[i] == t->unlock_tasks[j]) error("duplicate unlock! t->type=%s/%s unlocking type=%s/%s", taskID_names[t->type], subtaskID_names[t->subtype], @@ -1986,8 +1992,6 @@ void scheduler_reset(struct scheduler *s, int size) { s->total_ticks = 0; s->pack_size = N_TASKS_PER_PACK_SELF; s->pack_size_pair = N_TASKS_PER_PACK_PAIR; - if (s->pack_tasks_ind != NULL) - free(s->pack_tasks_ind); /* Set the task pointers in the queues. */ for (int k = 0; k < s->nr_queues; k++) s->queues[k].tasks = s->tasks; } @@ -2617,6 +2621,10 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) { break; case task_type_pair: case task_type_sub_pair: + if(t->subtype == task_subtype_gpu_unpack || + t->subtype == task_subtype_gpu_unpack_f || + t->subtype == task_subtype_gpu_unpack_g) qid = -1; + break; qid = t->ci->super->owner; owner = &t->ci->super->owner; if ((qid < 0) || @@ -2625,10 +2633,6 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) { qid = t->cj->super->owner; owner = &t->cj->super->owner; } - if(t->subtype == task_subtype_gpu_unpack || - t->subtype == task_subtype_gpu_unpack_f || - t->subtype == task_subtype_gpu_unpack_g) qid = -1; - break; case task_type_recv: #ifdef WITH_MPI { @@ -3170,7 +3174,6 @@ void scheduler_init(struct scheduler *s, struct space *space, int nr_tasks, s->size = 0; s->tasks = NULL; s->tasks_ind = NULL; - s->pack_tasks_ind = NULL; // A. Nasar scheduler_reset(s, nr_tasks); #if defined(SWIFT_DEBUG_CHECKS) @@ -3234,10 +3237,6 @@ void scheduler_free_tasks(struct scheduler *s) { s->size = 0; s->nr_tasks = 0; //reset GPU task counters too - if (s->pack_tasks_ind != NULL) { // A. Nasar - swift_free("pack_tasks_ind", s->pack_tasks_ind); - s->pack_tasks_ind = NULL; - } s->nr_self_pack_tasks = 0; s->nr_self_pack_tasks_f = 0; s->nr_self_pack_tasks_g = 0; diff --git a/src/scheduler.h b/src/scheduler.h index 155360ede6..16ef975be8 100644 --- a/src/scheduler.h +++ b/src/scheduler.h @@ -67,7 +67,6 @@ struct scheduler { int nr_packs_pair_forc_done; int nr_packs_self_grad_done; int nr_packs_pair_grad_done; - /* Actual number of GPU tasks. */ int nr_gpu_tasks; /* Number of tasks we want*/ @@ -78,10 +77,6 @@ struct scheduler { int nr_self_pack_tasks_f, nr_pair_pack_tasks_f; /* Actual number of gradient pack tasks. */ int nr_self_pack_tasks_g, nr_pair_pack_tasks_g; - /* Pack task indices */ - - // MATTHIEU: To be removed as unused !!! 
- int *pack_tasks_ind; /*how many tasks we want to try and work on at once on the GPU*/ int pack_size; diff --git a/src/task.c b/src/task.c index d1bfe8d9ca..e9476e1919 100644 --- a/src/task.c +++ b/src/task.c @@ -902,16 +902,16 @@ int task_lock(struct task *t) { if (cell_locktree(ci) != 0) return 0; } else if (subtype == task_subtype_gpu_unpack) { - // for(int pp = 0; pp < 128 /*should be sched->pack_size*/; pp++){ - // if (t->ci_unpack[pp]->gpu_done == 0){ - // message("trying to queue an unpack before all packs done on GPU"); - // return 0; - // } - //// if (t->ci_unpack[pp]->hydro.hold) - //// return 0; - //// if (cell_locktree(t->ci_unpack[pp]) != 0) - //// return 0; - // } +// for(int pp = 0; pp < 128 /*should be sched->pack_size*/; pp++){ +// if (t->ci_unpack[pp]->gpu_done == 0){ +// message("trying to queue an unpack before all packs done on GPU"); +// return 0; +// } +//// if (t->ci_unpack[pp]->hydro.hold) +//// return 0; +//// if (cell_locktree(t->ci_unpack[pp]) != 0) +//// return 0; +// } /* Nothing to do here */ return 1; } else if (subtype == task_subtype_gpu_unpack_f) { From aa3eeabc8d1d41a43a8acc45a605c8ddc41b05ba Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Tue, 29 Oct 2024 14:08:49 +0000 Subject: [PATCH 019/217] Commented out GPU code from engine_marktasks.c to see if that could have been source of hanging bug. Not the case :( --- src/cell_unskip.c | 2 +- src/engine_marktasks.c | 146 +++++++++++++++++++-------------------- src/runner_main_clean.cu | 6 +- 3 files changed, 77 insertions(+), 77 deletions(-) diff --git a/src/cell_unskip.c b/src/cell_unskip.c index 3f1ea8baa8..f5b910d79d 100644 --- a/src/cell_unskip.c +++ b/src/cell_unskip.c @@ -884,7 +884,7 @@ void cell_activate_subcell_hydro_tasks(struct cell *ci, struct cell *cj, cell_activate_hydro_sorts(ci, sid, s); cell_activate_hydro_sorts(cj, sid, s); } - } /* Otherwise, pair interation */ + } /* Otherwise, pair interaction */ } /** diff --git a/src/engine_marktasks.c b/src/engine_marktasks.c index 11d8b46d92..97af2fdc8d 100644 --- a/src/engine_marktasks.c +++ b/src/engine_marktasks.c @@ -86,21 +86,21 @@ void engine_marktasks_mapper(void *map_data, int num_elements, const enum task_types t_type = t->type; const enum task_subtypes t_subtype = t->subtype; - //Activate GPU unpack tasks (cell-less dummy tasks so need activating separately) - if (t_type == task_type_self && (t_subtype == task_subtype_gpu_unpack || - t_subtype == task_subtype_gpu_unpack_g || - t_subtype == task_subtype_gpu_unpack_f)){ // A. Nasar - scheduler_activate(s, t); - continue; - } - - if (t_type == task_type_pair && (t_subtype == task_subtype_gpu_unpack || - t_subtype == task_subtype_gpu_unpack_g || - t_subtype == task_subtype_gpu_unpack_f)){ // A. Nasar - scheduler_activate(s, t); - continue; -// fprintf(stderr,"activated pair unpack in marktasks\n"); - } +// //Activate GPU unpack tasks (cell-less dummy tasks so need activating separately) +// if (t_type == task_type_self && (t_subtype == task_subtype_gpu_unpack || +// t_subtype == task_subtype_gpu_unpack_g || +// t_subtype == task_subtype_gpu_unpack_f)){ // A. Nasar +// scheduler_activate(s, t); +// continue; +// } +// +// if (t_type == task_type_pair && (t_subtype == task_subtype_gpu_unpack || +// t_subtype == task_subtype_gpu_unpack_g || +// t_subtype == task_subtype_gpu_unpack_f)){ // A. Nasar +// scheduler_activate(s, t); +// continue; +//// fprintf(stderr,"activated pair unpack in marktasks\n"); +// } /* Single-cell task? 
*/ if (t_type == task_type_self || t_type == task_type_sub_self) { @@ -109,16 +109,16 @@ void engine_marktasks_mapper(void *map_data, int num_elements, struct cell *ci = t->ci; #ifdef SWIFT_DEBUG_CHECKS -#ifndef WITH_CUDA // A. Nasar +//#ifndef WITH_CUDA // A. Nasar if (ci->nodeID != nodeID) error("Non-local self task found"); -#else - if (ci->nodeID != nodeID && t_subtype != task_subtype_gpu_unpack && - t_subtype != task_subtype_gpu_unpack_f && - t_subtype != task_subtype_gpu_unpack_g){ - fprintf(stderr, "task is %i\n", subtaskID_names[t->subtype]); - error("Non-local self task found. Task is subtaskID_names[%s]", subtaskID_names[t->subtype]); - } -#endif +//#else +// if (ci->nodeID != nodeID && t_subtype != task_subtype_gpu_unpack && +// t_subtype != task_subtype_gpu_unpack_f && +// t_subtype != task_subtype_gpu_unpack_g){ +// fprintf(stderr, "task is %i\n", subtaskID_names[t->subtype]); +// error("Non-local self task found. Task is subtaskID_names[%s]", subtaskID_names[t->subtype]); +// } +//#endif #endif const int ci_active_hydro = cell_is_active_hydro(ci, e); @@ -141,34 +141,34 @@ void engine_marktasks_mapper(void *map_data, int num_elements, } /* Activate packing for GPU A. Nasar */ - else if (t_type == task_type_self && t_subtype == task_subtype_gpu_pack) { - if (ci_active_hydro) { - scheduler_activate(s, t); - ci->pack_done = 0; - ci->gpu_done = 0; - ci->unpack_done = 0; - } - } - - /* Activate packing for GPU */ - else if (t_type == task_type_self && t_subtype == task_subtype_gpu_pack_g) { - if (ci_active_hydro) { - scheduler_activate(s, t); - ci->pack_done_g = 0; - ci->gpu_done_g = 0; - ci->unpack_done_g = 0; - } - } - - /* Activate packing for GPU */ - else if (t_type == task_type_self && t_subtype == task_subtype_gpu_pack_f) { - if (ci_active_hydro) { - scheduler_activate(s, t); - ci->pack_done_f = 0; - ci->gpu_done_f = 0; - ci->unpack_done_f = 0; - } - } +// else if (t_type == task_type_self && t_subtype == task_subtype_gpu_pack) { +// if (ci_active_hydro) { +// scheduler_activate(s, t); +// ci->pack_done = 0; +// ci->gpu_done = 0; +// ci->unpack_done = 0; +// } +// } +// +// /* Activate packing for GPU */ +// else if (t_type == task_type_self && t_subtype == task_subtype_gpu_pack_g) { +// if (ci_active_hydro) { +// scheduler_activate(s, t); +// ci->pack_done_g = 0; +// ci->gpu_done_g = 0; +// ci->unpack_done_g = 0; +// } +// } +// +// /* Activate packing for GPU */ +// else if (t_type == task_type_self && t_subtype == task_subtype_gpu_pack_f) { +// if (ci_active_hydro) { +// scheduler_activate(s, t); +// ci->pack_done_f = 0; +// ci->gpu_done_f = 0; +// ci->unpack_done_f = 0; +// } +// } /* Store current values of dx_max and h_max. */ else if (t_type == task_type_sub_self && @@ -465,27 +465,27 @@ void engine_marktasks_mapper(void *map_data, int num_elements, const int cj_active_rt = cell_is_rt_active(cj, e); /* Activate packing for GPU A. 
Nasar */ - if(t_subtype == task_subtype_gpu_pack && - ((ci_active_hydro && ci_nodeID == nodeID) || - (cj_active_hydro && cj_nodeID == nodeID))) { - scheduler_activate(s, t); - ci->gpu_done_pair = 0; - cj->gpu_done_pair = 0; - } - else if (t_subtype == task_subtype_gpu_pack_g && - ((ci_active_hydro && ci_nodeID == nodeID) || - (cj_active_hydro && cj_nodeID == nodeID))) { - scheduler_activate(s, t); - ci->gpu_done_pair_g = 0; - cj->gpu_done_pair_g = 0; - } - else if (t_subtype == task_subtype_gpu_pack_f && - ((ci_active_hydro && ci_nodeID == nodeID) || - (cj_active_hydro && cj_nodeID == nodeID))) { - scheduler_activate(s, t); - ci->gpu_done_pair_f = 0; - cj->gpu_done_pair_f = 0; - } +// if(t_subtype == task_subtype_gpu_pack && +// ((ci_active_hydro && ci_nodeID == nodeID) || +// (cj_active_hydro && cj_nodeID == nodeID))) { +// scheduler_activate(s, t); +// ci->gpu_done_pair = 0; +// cj->gpu_done_pair = 0; +// } +// else if (t_subtype == task_subtype_gpu_pack_g && +// ((ci_active_hydro && ci_nodeID == nodeID) || +// (cj_active_hydro && cj_nodeID == nodeID))) { +// scheduler_activate(s, t); +// ci->gpu_done_pair_g = 0; +// cj->gpu_done_pair_g = 0; +// } +// else if (t_subtype == task_subtype_gpu_pack_f && +// ((ci_active_hydro && ci_nodeID == nodeID) || +// (cj_active_hydro && cj_nodeID == nodeID))) { +// scheduler_activate(s, t); +// ci->gpu_done_pair_f = 0; +// cj->gpu_done_pair_f = 0; +// } /* Only activate tasks that involve a local active cell. A. Nasar THIS COULD BE SOURCE OF BUG */ if ((t_subtype == task_subtype_density || diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index ce59cc5f0b..efbb962dda 100755 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -20,7 +20,7 @@ ******************************************************************************/ /* Config parameters. */ //#define GPUOFFLOAD 1 //off-load hydro to GPU -#define DO_CORNERS 1 //do corner pair tasks on CPU +//#define DO_CORNERS 1 //do corner pair tasks on CPU //#define DUMP_TIMINGS 1 #include "../config.h" @@ -1840,8 +1840,7 @@ void *runner_main2(void *data) { /////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////// -#endif -#ifndef GPUOFFLOAD +#else GPUOFFLOAD if(r->cpuid == 0 && engine_rank == 0)fprintf(fgpu_steps, "CPU TIME SELF, CPU TIME PAIR, " "CPU TIME SELF F, CPU TIME PAIR F, CPU TIME SELF G, CPU TIME PAIR G\n " "%e, %e, %e, %e, %e, %e\n", time_for_density_cpu, time_for_density_cpu_pair, @@ -1873,6 +1872,7 @@ void *runner_main2(void *data) { density = 0; density_sub = 0; unpacked = 0; + message("reached end of runner_main2\n"); // if(step == 2)cudaProfilerStop(); // if(step == 2)exit(0); // size_t free_byte ; From 9c1b4941846378e29feb765610cfbc4849b28977 Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Tue, 29 Oct 2024 15:35:03 +0000 Subject: [PATCH 020/217] Removed duplicate engine_addlink for g and f pack tasks. And re-wired deps for unpack tasks. CPU version works perfectly but GPU code now hangs for some reason --- src/engine_maketasks.c | 68 +++++++++++++++++++++------------------- src/runner_main_clean.cu | 4 +-- 2 files changed, 38 insertions(+), 34 deletions(-) diff --git a/src/engine_maketasks.c b/src/engine_maketasks.c index c9e779e781..1de0e17db5 100644 --- a/src/engine_maketasks.c +++ b/src/engine_maketasks.c @@ -2564,7 +2564,7 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, /* Task for the second GPU hydro loop A. 
Nasar */ t_force_gpu = scheduler_addtask(sched, task_type_self, task_subtype_gpu_pack_f, 0, 0, ci, NULL); - engine_addlink(e, &ci->hydro.force_pack, t_force_gpu); +// engine_addlink(e, &ci->hydro.force_pack, t_force_gpu); /* the task for the time-step limiter */ if (with_timestep_limiter) { t_limiter = scheduler_addtask(sched, task_type_self, @@ -2672,13 +2672,13 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, /* Add the link between the new loops and the cell */ engine_addlink(e, &ci->hydro.gradient, t_gradient); - engine_addlink(e, &ci->hydro.gradient_pack, t_gradient_gpu); +// engine_addlink(e, &ci->hydro.gradient_pack, t_gradient_gpu); /* Now, build all the dependencies for the hydro */ engine_make_hydro_loops_dependencies(sched, t, t_gradient, t_force, t_limiter, ci, with_cooling, with_timestep_limiter); - + // A. Nasar add unlocks for pack tasks here. Unpacks depend on packs and will be used to create downstream deps later scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, t_gradient_gpu); scheduler_addunlock(sched, ci->hydro.super->hydro.extra_ghost, t_force_gpu); #else @@ -2845,9 +2845,9 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, /* New task for the force A. Nasar */ t_force_gpu = scheduler_addtask(sched, task_type_pair, task_subtype_gpu_pack_f, 0, 0, ci, cj); - /* Add the link between the new loop and both cells */ - engine_addlink(e, &ci->hydro.force_pack, t_force_gpu); - engine_addlink(e, &cj->hydro.force_pack, t_force_gpu); +// /* Add the link between the new loop and both cells */ +// engine_addlink(e, &ci->hydro.force_pack, t_force_gpu); +// engine_addlink(e, &cj->hydro.force_pack, t_force_gpu); #ifdef MPI_SYMMETRIC_FORCE_INTERACTION /* The order of operations for an inactive local cell interacting @@ -3008,9 +3008,9 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, /* Add the link between the new loop and both cells */ engine_addlink(e, &ci->hydro.gradient, t_gradient); engine_addlink(e, &cj->hydro.gradient, t_gradient); - /* Add the link between the new loop and both cells */ - engine_addlink(e, &ci->hydro.gradient_pack, t_gradient_gpu); - engine_addlink(e, &cj->hydro.gradient_pack, t_gradient_gpu); +// /* Add the link between the new loop and both cells */ +// engine_addlink(e, &ci->hydro.gradient_pack, t_gradient_gpu); +// engine_addlink(e, &cj->hydro.gradient_pack, t_gradient_gpu); /* Now, build all the dependencies for the hydro for the cells */ /* that are local and are not descendant of the same super_hydro-cells */ @@ -4265,7 +4265,7 @@ void engine_make_hydroloop_tasks_mapper(void *map_data, int num_elements, scheduler_addtask(sched, task_type_self, task_subtype_density, 0, 0, ci, NULL); struct task *t_pack = scheduler_addtask( - sched, task_type_self, task_subtype_gpu_pack, 0, 0, ci, NULL); // A. Nasar + sched, task_type_self, task_subtype_gpu_pack, 0, 0, ci, NULL); // A. 
Nasar also add a pack task for GPU t_pack_self = t_pack; } @@ -4865,7 +4865,6 @@ void engine_maketasks(struct engine *e) { if (count_current_self % pack_size == 0) { last_created_self_unpack = scheduler_addtask( sched, task_type_self, task_subtype_gpu_unpack, 0, 0, NULL, NULL); -// last_created_self_unpack->ci_unpack = (struct cell **)calloc(pack_size, sizeof(struct cell *)); last_created_self_unpack->gpu_done = 0; } @@ -4873,10 +4872,9 @@ void engine_maketasks(struct engine *e) { scheduler_addunlock(sched, t, last_created_self_unpack); scheduler_addunlock(sched, last_created_self_unpack, t->ci->hydro.super->hydro.ghost_in); - /*Creating links between a each cell and its unpack task*/ + /*Creating links between each cell and its unpack task*/ engine_addlink(e, &t->ci->hydro.density_unpack, last_created_self_unpack); - /*Create a link between each unpack task and all the pack task cells it unlocks*/ -// last_created_self_unpack->ci_unpack[packed_counter]=t->ci; + t->ci->hydro.d_unpack = last_created_self_unpack; ++count_current_self; } @@ -4897,9 +4895,10 @@ void engine_maketasks(struct engine *e) { engine_addlink(e, &t->ci->hydro.density_unpack, last_created_pair_unpack); engine_addlink(e, &t->cj->hydro.density_unpack, last_created_pair_unpack); - - t->ci->hydro.d_unpack = last_created_pair_unpack; - t->cj->hydro.d_unpack = last_created_pair_unpack; + /*Useless as this ends up only setting one pair unpack as the unpack task for this cell whilst the cell + * interacts with many other cells and can be linked to another unpack task. Rely on links instead*/ +// t->ci->hydro.d_unpack = last_created_pair_unpack; +// t->cj->hydro.d_unpack = last_created_pair_unpack; // t->ci->hydro.super->hydro.d_unpack = last_created_self_unpack; @@ -4920,9 +4919,9 @@ void engine_maketasks(struct engine *e) { struct task *t = &sched->tasks[i]; if (t->type != task_type_ghost_in) continue; - if(t->ci->hydro.super == t->ci && t->ci->nodeID == e->nodeID) - scheduler_addunlock(sched, t->ci->hydro.d_unpack, t); - for (struct link *l = t->ci->hydro.density_pack; l != NULL; l = l->next) { +// if(t->ci->hydro.super == t->ci && t->ci->nodeID == e->nodeID) +// scheduler_addunlock(sched, t->ci->hydro.d_unpack, t); + for (struct link *l = t->ci->hydro.density_unpack; l != NULL; l = l->next) { if(l->t->type == task_type_pair)scheduler_addunlock(sched, l->t, t); } } @@ -4953,6 +4952,7 @@ void engine_maketasks(struct engine *e) { t->ci->hydro.super->hydro.extra_ghost); /*Creating links between a each cell and its unpack task*/ engine_addlink(e, &t->ci->hydro.gradient_unpack, last_created_self_unpack); + t->ci->hydro.g_unpack = last_created_self_unpack; ++count_current_self; } @@ -4968,9 +4968,10 @@ void engine_maketasks(struct engine *e) { engine_addlink(e, &t->ci->hydro.gradient_unpack, last_created_pair_unpack); engine_addlink(e, &t->cj->hydro.gradient_unpack, last_created_pair_unpack); - - t->ci->hydro.g_unpack = last_created_pair_unpack; - t->cj->hydro.g_unpack = last_created_pair_unpack; + /*Useless as this ends up only setting one pair unpack as the unpack task for this cell whilst the cell + * interacts with many other cells and can be linked to another unpack task. 
Rely on links instead*/ +// t->ci->hydro.g_unpack = last_created_pair_unpack; +// t->cj->hydro.g_unpack = last_created_pair_unpack; ++count_current_pair; } else { @@ -4989,9 +4990,9 @@ void engine_maketasks(struct engine *e) { struct task *t = &sched->tasks[i]; if (t->type != task_type_extra_ghost) continue; - if(t->ci->nodeID == e->nodeID) - scheduler_addunlock(sched, t->ci->hydro.g_unpack, t); - for (struct link *l = t->ci->hydro.gradient_pack; l != NULL; l = l->next) { +// if(t->ci->nodeID == e->nodeID) +// scheduler_addunlock(sched, t->ci->hydro.g_unpack, t); + for (struct link *l = t->ci->hydro.gradient_unpack; l != NULL; l = l->next) { if(l->t->type == task_type_pair)scheduler_addunlock(sched, l->t, t); } } @@ -5023,6 +5024,8 @@ void engine_maketasks(struct engine *e) { /*Creating links between a each cell and its unpack task*/ engine_addlink(e, &t->ci->hydro.force_unpack, last_created_self_unpack); + t->ci->hydro.f_unpack = last_created_self_unpack; + ++count_current_self; } @@ -5037,9 +5040,10 @@ void engine_maketasks(struct engine *e) { engine_addlink(e, &t->ci->hydro.force_unpack, last_created_pair_unpack); engine_addlink(e, &t->cj->hydro.force_unpack, last_created_pair_unpack); - - t->ci->hydro.f_unpack = last_created_pair_unpack; - t->cj->hydro.f_unpack = last_created_pair_unpack; + /*Useless as this ends up only setting one pair unpack as the unpack task for this cell whilst the cell + * interacts with many other cells and can be linked to another unpack task. Rely on links instead*/ +// t->ci->hydro.f_unpack = last_created_pair_unpack; +// t->cj->hydro.f_unpack = last_created_pair_unpack; ++count_current_pair; } else { @@ -5058,9 +5062,9 @@ void engine_maketasks(struct engine *e) { struct task *t = &sched->tasks[i]; if (t->type != task_type_end_hydro_force) continue; - if(t->ci->nodeID == e->nodeID) - scheduler_addunlock(sched, t->ci->hydro.f_unpack, t); - for (struct link *l = t->ci->hydro.force_pack; l != NULL; l = l->next) { +// if(t->ci->nodeID == e->nodeID) +// scheduler_addunlock(sched, t->ci->hydro.f_unpack, t); + for (struct link *l = t->ci->hydro.force_unpack; l != NULL; l = l->next) { if(l->t->type == task_type_pair)scheduler_addunlock(sched, l->t, t); } } diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index efbb962dda..aef406bb86 100755 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -19,8 +19,8 @@ * ******************************************************************************/ /* Config parameters. */ -//#define GPUOFFLOAD 1 //off-load hydro to GPU -//#define DO_CORNERS 1 //do corner pair tasks on CPU +#define GPUOFFLOAD 1 //off-load hydro to GPU +#define DO_CORNERS 1 //do corner pair tasks on CPU //#define DUMP_TIMINGS 1 #include "../config.h" From 5e082227ee8e8756128cf75f8e8d97d06cdfbcc6 Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Tue, 29 Oct 2024 17:25:41 +0000 Subject: [PATCH 021/217] Minor changes here and there --- src/runner_main_clean.cu | 6 +++--- src/scheduler.c | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index aef406bb86..b7ee4372c5 100755 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -19,8 +19,8 @@ * ******************************************************************************/ /* Config parameters. 
*/ -#define GPUOFFLOAD 1 //off-load hydro to GPU -#define DO_CORNERS 1 //do corner pair tasks on CPU +//#define GPUOFFLOAD 1 //off-load hydro to GPU +//#define DO_CORNERS 1 //do corner pair tasks on CPU //#define DUMP_TIMINGS 1 #include "../config.h" @@ -1840,7 +1840,7 @@ void *runner_main2(void *data) { /////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////// -#else GPUOFFLOAD +#else //GPUOFFLOAD if(r->cpuid == 0 && engine_rank == 0)fprintf(fgpu_steps, "CPU TIME SELF, CPU TIME PAIR, " "CPU TIME SELF F, CPU TIME PAIR F, CPU TIME SELF G, CPU TIME PAIR G\n " "%e, %e, %e, %e, %e, %e\n", time_for_density_cpu, time_for_density_cpu_pair, diff --git a/src/scheduler.c b/src/scheduler.c index 805ea4ce6d..87b27187fc 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -2889,6 +2889,8 @@ struct task *scheduler_done(struct scheduler *s, struct task *t) { /* Mark the task as skip. */ t->skip = 1; + t->done = 1; + /* Return the next best task. Note that we currently do not implement anything that does this, as getting it to respect priorities is too tricky and currently unnecessary. */ From af0d256addb07160ee4b43ddb703fbae25d89717 Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Wed, 30 Oct 2024 15:23:55 +0000 Subject: [PATCH 022/217] Found a bug in task.c -> Wasn't unlocking gradient pack task --- src/runner_main_clean.cu | 13 ++++++------- src/task.c | 1 + 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index b7ee4372c5..3dd3740e2c 100755 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -19,9 +19,9 @@ * ******************************************************************************/ /* Config parameters. */ -//#define GPUOFFLOAD 1 //off-load hydro to GPU -//#define DO_CORNERS 1 //do corner pair tasks on CPU -//#define DUMP_TIMINGS 1 +#define GPUOFFLOAD 1 //off-load hydro to GPU +#define DO_CORNERS 1 //do corner pair tasks on CPU +#define DUMP_TIMINGS 1 #include "../config.h" /* MPI headers. 
*/ @@ -1783,8 +1783,7 @@ void *runner_main2(void *data) { else{ /* Mark task as done, as per usual */ t = scheduler_done(sched, t); } -#endif //GPUOFFLOAD -#ifndef GPUOFFLOAD +#else //GPUOFFLOAD t = scheduler_done(sched, t); #endif //GPUOFFLOAD @@ -1848,8 +1847,8 @@ void *runner_main2(void *data) { else fprintf(fgpu_steps,"%e, %e, %e, %e, %e, %e,\n", time_for_density_cpu, time_for_density_cpu_pair, time_for_cpu_f, time_for_cpu_pair_f, time_for_cpu_g, time_for_cpu_pair_g); -#endif -#endif +#endif //GPUOFFLOAD +#endif //DUMPTIMINGS // } fflush(fgpu_steps); fclose(fgpu_steps); diff --git a/src/task.c b/src/task.c index e9476e1919..29093634c0 100644 --- a/src/task.c +++ b/src/task.c @@ -618,6 +618,7 @@ void task_unlock(struct task *t) { } else if (subtype == task_subtype_gpu_pack_f) { cell_unlocktree(ci); } else if (subtype == task_subtype_gpu_pack_g) { + cell_unlocktree(ci); } else { /* hydro */ cell_unlocktree(ci); } From 655102745d581026c20f0fa0c39a3f49900a0d35 Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Wed, 30 Oct 2024 15:38:54 +0000 Subject: [PATCH 023/217] Found a bug in task.c -> Wasn't unlocking gradient pack task --- src/runner_doiact_functions_hydro_gpu.h | 8 +++--- src/runner_main_clean.cu | 20 ++++----------- src/scheduler.c | 34 ++++++++++++------------- 3 files changed, 25 insertions(+), 37 deletions(-) diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index 2b047ae25a..d97664dfc9 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -2643,10 +2643,8 @@ void runner_dopair1_launch_f4_one_memcpy(struct runner *r, struct scheduler *s, enqueue_dependencies(s, tii); /*Signal sleeping runners*/ signal_sleeping_runners(s, tii); - + tii->skip = 1; tii->gpu_done = 1; - - } } } @@ -3294,7 +3292,7 @@ void runner_dopair1_launch_f4_g_one_memcpy(struct runner *r, struct scheduler *s enqueue_dependencies(s, tii); /*Signal sleeping runners*/ signal_sleeping_runners(s, tii); - + tii->skip = 1; tii->gpu_done = 1; @@ -3973,7 +3971,7 @@ void runner_dopair1_launch_f4_f_one_memcpy(struct runner *r, struct scheduler *s enqueue_dependencies(s, tii); /*Signal sleeping runners*/ signal_sleeping_runners(s, tii); - + tii->skip = 1; tii->gpu_done = 1; diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 3dd3740e2c..20baa4a4bf 100755 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -1763,22 +1763,12 @@ void *runner_main2(void *data) { prev = t; #ifdef GPUOFFLOAD // if (t->type == task_type_self && t->subtype == task_subtype_gpu_pack){ - if (t->subtype == task_subtype_gpu_pack){ + if (t->subtype == task_subtype_gpu_pack || + t->subtype == task_subtype_gpu_pack_g || + t->subtype == task_subtype_gpu_pack_f){ /* Don't enqueue unpacks yet. Just signal the runners */ - t->skip = 1; - t = NULL; - } -// else if (t->subtype == task_subtype_gpu_pack_g && t->type == task_type_self){ - else if (t->subtype == task_subtype_gpu_pack_g){ - /* Don't enqueue unpacks yet. Just signal the runners */ - t->skip = 1; - t = NULL; - } -// else if (t->subtype == task_subtype_gpu_pack_f && t->type == task_type_self){ - else if (t->subtype == task_subtype_gpu_pack_f){ - /* Don't enqueue unpacks yet. 
Just signal the runners */ - t->skip = 1; - t = NULL; +// t->skip = 1; +// t = NULL; } else{ /* Mark task as done, as per usual */ t = scheduler_done(sched, t); diff --git a/src/scheduler.c b/src/scheduler.c index 87b27187fc..d296f44687 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -2903,30 +2903,30 @@ struct task *signal_sleeping_runners(struct scheduler *s, struct task *t) { /* Task definitely done, signal any sleeping runners. */ if (!t->implicit) { - t->toc = getticks(); - t->total_ticks += t->toc - t->tic; - pthread_mutex_lock(&s->sleep_mutex); - atomic_dec(&s->waiting); - pthread_cond_broadcast(&s->sleep_cond); - pthread_mutex_unlock(&s->sleep_mutex); + t->toc = getticks(); + t->total_ticks += t->toc - t->tic; + pthread_mutex_lock(&s->sleep_mutex); + atomic_dec(&s->waiting); + pthread_cond_broadcast(&s->sleep_cond); + pthread_mutex_unlock(&s->sleep_mutex); } return NULL; } struct task *enqueue_dependencies(struct scheduler *s, struct task *t) { -// t->skip = 1; + /* Loop through the dependencies and add them to a queue if - they are ready. */ + they are ready. */ for (int k = 0; k < t->nr_unlock_tasks; k++) { - struct task *t2 = t->unlock_tasks[k]; - if (t2->skip) - continue; - const int res = atomic_dec(&t2->wait); - if (res < 1) { - error("Negative wait!"); - } else if (res == 1) { - scheduler_enqueue(s, t2); - } + struct task *t2 = t->unlock_tasks[k]; + if (t2->skip) continue; + + const int res = atomic_dec(&t2->wait); + if (res < 1) { + error("Negative wait!"); + } else if (res == 1) { + scheduler_enqueue(s, t2); + } } return NULL; From 1dc8451d81fea54a372fb89de45aa205130d504c Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Wed, 30 Oct 2024 16:05:00 +0000 Subject: [PATCH 024/217] Code still hanging. Will try starting from scratch with runner_main_clean.cu --- src/runner_doiact_functions_hydro_gpu.h | 8 +++++--- src/runner_main_clean.cu | 4 ++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index d97664dfc9..2b047ae25a 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -2643,8 +2643,10 @@ void runner_dopair1_launch_f4_one_memcpy(struct runner *r, struct scheduler *s, enqueue_dependencies(s, tii); /*Signal sleeping runners*/ signal_sleeping_runners(s, tii); - tii->skip = 1; + tii->gpu_done = 1; + + } } } @@ -3292,7 +3294,7 @@ void runner_dopair1_launch_f4_g_one_memcpy(struct runner *r, struct scheduler *s enqueue_dependencies(s, tii); /*Signal sleeping runners*/ signal_sleeping_runners(s, tii); - tii->skip = 1; + tii->gpu_done = 1; @@ -3971,7 +3973,7 @@ void runner_dopair1_launch_f4_f_one_memcpy(struct runner *r, struct scheduler *s enqueue_dependencies(s, tii); /*Signal sleeping runners*/ signal_sleeping_runners(s, tii); - tii->skip = 1; + tii->gpu_done = 1; diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 20baa4a4bf..caabc163dc 100755 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -1767,8 +1767,8 @@ void *runner_main2(void *data) { t->subtype == task_subtype_gpu_pack_g || t->subtype == task_subtype_gpu_pack_f){ /* Don't enqueue unpacks yet. 
Just signal the runners */ -// t->skip = 1; -// t = NULL; + t->skip = 1; + t = NULL; } else{ /* Mark task as done, as per usual */ t = scheduler_done(sched, t); From 8103149fa5aaf5a378c9914da32e09923fbfde16 Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Wed, 30 Oct 2024 16:28:59 +0000 Subject: [PATCH 025/217] Copied over both runner_main_clean and runner_doiact_functions_hydro_gpu and re-worked runner_main_clean.cu but code still hangs --- src/runner_doiact_functions_hydro_gpu.h | 311 +++++++++++++++++++----- src/runner_main_clean.cu | 120 ++++----- 2 files changed, 308 insertions(+), 123 deletions(-) diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index 2b047ae25a..777ebff6b7 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -116,6 +116,8 @@ void runner_doself1_pack(struct runner *r, struct scheduler *s, struct pack_vars pack_vars->cell_list[tasks_packed] = ci; // /* Identify row in particle arrays where this task starts*/ pack_vars->task_first_part[tasks_packed] = pack_vars->count_parts; + //// /* id for the task*/ + const int tid_d = tasks_packed; int *count_parts_self = &pack_vars->count_parts; /* This re-arranges the particle data from cell->hydro->parts into a long array of part structs*/ @@ -171,6 +173,8 @@ double runner_doself1_pack_f4(struct runner *r, struct scheduler *s, struct pack // pack_vars->task_first_part[tasks_packed] = pack_vars->count_parts; // d_task_first_part_self_dens_f4[tasks_packed].x = pack_vars->count_parts; task_first_part_f4[tasks_packed].x = pack_vars->count_parts; + //// /* id for the task*/ + const int tid_d = tasks_packed; int *count_parts_self = &pack_vars->count_parts; /* This re-arranges the particle data from cell->hydro->parts into a long array of part structs*/ @@ -228,6 +232,8 @@ void runner_doself1_pack_g(struct runner *r, struct scheduler *s, struct pack_va pack_vars->cell_list[tasks_packed] = ci; // /* Identify row in particle arrays where this task starts*/ pack_vars->task_first_part[tasks_packed] = pack_vars->count_parts; + //// /* id for the task*/ + const int tid_d = tasks_packed; int *count_parts_self = &pack_vars->count_parts; /* This re-arranges the particle data from cell->hydro->parts into a long array of part structs*/ @@ -283,6 +289,8 @@ double runner_doself1_pack_f4_g(struct runner *r, struct scheduler *s, struct pa // /* Identify row in particle arrays where this task starts*/ // pack_vars->task_first_part[tasks_packed] = pack_vars->count_parts; task_first_part_f4[tasks_packed].x = pack_vars->count_parts; + //// /* id for the task*/ + const int tid_d = tasks_packed; int *count_parts_self = &pack_vars->count_parts; /* This re-arranges the particle data from cell->hydro->parts into a long array of part structs*/ @@ -338,6 +346,8 @@ void runner_doself1_pack_f(struct runner *r, struct scheduler *s, struct pack_va pack_vars->cell_list[tasks_packed] = ci; // /* Identify row in particle arrays where this task starts*/ pack_vars->task_first_part[tasks_packed] = pack_vars->count_parts; + //// /* id for the task*/ + const int tid_d = tasks_packed; int *count_parts_self = &pack_vars->count_parts; /* This re-arranges the particle data from cell->hydro->parts into a long array of part structs*/ @@ -392,6 +402,8 @@ double runner_doself1_pack_f4_f(struct runner *r, struct scheduler *s, struct pa // /* Identify row in particle arrays where this task starts*/ // pack_vars->task_first_part[tasks_packed] = pack_vars->count_parts; 
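/* A minimal host-side sketch of the pack bookkeeping these hunks keep touching,
 * for orientation only: each pack task appends its cell's particles to one flat
 * send buffer and records the offset of its first particle, and the launch
 * functions later size the CUDA grid from the largest per-task count in a
 * bundle. The names below (toy_cell, toy_pack_vars, TOY_BLOCK_SIZE) are
 * hypothetical stand-ins, not the structs used in this file. */
#include <stdio.h>

#define TOY_BLOCK_SIZE 64 /* plays the role of BLOCK_SIZE */

struct toy_cell { int count; }; /* number of particles in the cell */

struct toy_pack_vars {
  int count_parts;        /* running total of packed particles */
  int tasks_packed;       /* number of tasks packed so far */
  int task_first_part[8]; /* buffer offset of each task's first particle */
  int task_last_part[8];  /* one past each task's last particle */
};

/* Append one self task's cell to the flat buffer and record its range,
 * playing the role of task_first_part_f4[tid].x / count_parts above. */
static void toy_pack_self(struct toy_pack_vars *pv, const struct toy_cell *ci) {
  const int tid = pv->tasks_packed;
  pv->task_first_part[tid] = pv->count_parts;
  pv->count_parts += ci->count; /* particle data would be copied here */
  pv->task_last_part[tid] = pv->count_parts;
  pv->tasks_packed++;
}

int main(void) {
  struct toy_pack_vars pv = {0};
  struct toy_cell cells[3] = {{100}, {250}, {180}};
  for (int i = 0; i < 3; i++) toy_pack_self(&pv, &cells[i]);

  /* Grid sizing as in the launch functions: y = tasks in the bundle,
   * x = enough BLOCK_SIZE-wide blocks to cover the largest task. */
  int max_parts = 0;
  for (int t = 0; t < pv.tasks_packed; t++) {
    const int count = pv.task_last_part[t] - pv.task_first_part[t];
    if (count > max_parts) max_parts = count;
  }
  const int numBlocks_y = pv.tasks_packed;
  const int numBlocks_x = (max_parts + TOY_BLOCK_SIZE - 1) / TOY_BLOCK_SIZE;
  printf("launch grid = %d x %d blocks\n", numBlocks_x, numBlocks_y);
  return 0;
}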
task_first_part_f4[tasks_packed].x = pack_vars->count_parts; + //// /* id for the task*/ + const int tid_d = tasks_packed; int *count_parts_self = &pack_vars->count_parts; /* This re-arranges the particle data from cell->hydro->parts into a long array of part structs*/ @@ -438,16 +450,21 @@ void runner_dopair1_pack(struct runner *r, struct scheduler *s, struct pack_vars struct timespec t0, t1; // clock_gettime(CLOCK_REALTIME, &t0); int tasks_packed = pack_vars->tasks_packed; + const int tid_tmp = 2 * tasks_packed; double x_tmp = 0.0, y_tmp = 0.0, z_tmp = 0.0; /*Get the shifts in case of periodics*/ - space_getsid_GPU(e->s, &ci, &cj, &x_tmp, &y_tmp, &z_tmp); + const int sid = space_getsid_GPU(e->s, &ci, &cj, &x_tmp, &y_tmp, &z_tmp); /*Get pointers to the list of tasks and cells packed*/ pack_vars->task_list[tasks_packed] = t; pack_vars->ci_list[tasks_packed] = ci; pack_vars->cj_list[tasks_packed] = cj; + const double cjx = cj->loc[0]; + const double cjy = cj->loc[1]; + const double cjz = cj->loc[2]; + float3 shift_tmp = {x_tmp, y_tmp, z_tmp}; const int count_ci = ci->hydro.count; @@ -513,8 +530,8 @@ void runner_dopair1_pack(struct runner *r, struct scheduler *s, struct pack_vars cell_unlocktree(cj); } -double runner_dopair1_pack_f4(struct runner *r, struct scheduler *s, struct pack_vars_pair * restrict pack_vars, struct cell * ci, - struct cell * cj, struct task *t, struct part_aos_f4_send *parts_send, struct engine *e, int4 *fparti_fpartj_lparti_lpartj){ +double runner_dopair1_pack_f4(struct runner *r, struct scheduler *s, struct pack_vars_pair * restrict pack_vars, struct cell * restrict ci, + struct cell * restrict cj, struct task *t, struct part_aos_f4_send *parts_send, struct engine *e, int4 *fparti_fpartj_lparti_lpartj){ /* Timers for how long this all takes. * t0 and t1 are from start to finish including GPU calcs @@ -522,10 +539,11 @@ double runner_dopair1_pack_f4(struct runner *r, struct scheduler *s, struct pack struct timespec t0, t1; // clock_gettime(CLOCK_REALTIME, &t0); int tasks_packed = pack_vars->tasks_packed; + const int tid_tmp = 2 * tasks_packed; double x_tmp = 0.0, y_tmp = 0.0, z_tmp = 0.0; /*Get the shifts in case of periodics*/ - space_getsid_GPU(e->s, &ci, &cj, &x_tmp, &y_tmp, &z_tmp); + const int sid = space_getsid_GPU(e->s, &ci, &cj, &x_tmp, &y_tmp, &z_tmp); /*Get pointers to the list of tasks and cells packed*/ pack_vars->task_list[tasks_packed] = t; @@ -540,6 +558,8 @@ double runner_dopair1_pack_f4(struct runner *r, struct scheduler *s, struct pack /*Assign an id for this task*/ const int tid = tasks_packed; + /*Indexing increment per task is 2 fot these arrays*/ + const int packed_tmp = tasks_packed * 2; /* Find first parts in task for ci and cj. Packed_tmp is index for cell i. packed_tmp+1 is index for cell j */ // pack_vars->task_first_part[packed_tmp] = pack_vars->count_parts; @@ -619,7 +639,7 @@ void runner_dopair1_pack_g(struct runner *r, struct scheduler *s, struct pack_va double x_tmp = 0.0, y_tmp = 0.0, z_tmp = 0.0; /*Get the shifts in case of periodics*/ - space_getsid_GPU(e->s, &ci, &cj, &x_tmp, &y_tmp, &z_tmp); + const int sid = space_getsid_GPU(e->s, &ci, &cj, &x_tmp, &y_tmp, &z_tmp); /*Get pointers to the list of tasks and cells packed*/ pack_vars->task_list[tasks_packed] = t; @@ -634,7 +654,7 @@ void runner_dopair1_pack_g(struct runner *r, struct scheduler *s, struct pack_va pack_vars->shiftx[tid_tmp] = x_tmp + cjx; pack_vars->shifty[tid_tmp] = y_tmp + cjy; pack_vars->shiftz[tid_tmp] = z_tmp + cjz; - /*Shift for cell j is it's position. 
Stored using strided indexing (stride of two per task)*/ + /* /*Shift for cell j is it's position. Stored using strided indexing (stride of two per task)*/ pack_vars->shiftx[tid_tmp + 1] = cjx; pack_vars->shifty[tid_tmp + 1] = cjy; pack_vars->shiftz[tid_tmp + 1] = cjz; @@ -693,8 +713,8 @@ void runner_dopair1_pack_g(struct runner *r, struct scheduler *s, struct pack_va } -double runner_dopair1_pack_f4_g(struct runner *r, struct scheduler *s, struct pack_vars_pair * restrict pack_vars, struct cell * ci, - struct cell * cj, struct task *t, struct part_aos_f4_g_send *parts_send, struct engine *e, int4 *fparti_fpartj_lparti_lpartj){ +double runner_dopair1_pack_f4_g(struct runner *r, struct scheduler *s, struct pack_vars_pair * restrict pack_vars, struct cell * restrict ci, + struct cell * restrict cj, struct task *t, struct part_aos_f4_g_send *parts_send, struct engine *e, int4 *fparti_fpartj_lparti_lpartj){ /* Timers for how long this all takes. * t0 and t1 are from start to finish including GPU calcs @@ -702,10 +722,11 @@ double runner_dopair1_pack_f4_g(struct runner *r, struct scheduler *s, struct pa struct timespec t0, t1; // clock_gettime(CLOCK_REALTIME, &t0); int tasks_packed = pack_vars->tasks_packed; + const int tid_tmp = 2 * tasks_packed; double x_tmp = 0.0, y_tmp = 0.0, z_tmp = 0.0; /*Get the shifts in case of periodics*/ - space_getsid_GPU(e->s, &ci, &cj, &x_tmp, &y_tmp, &z_tmp); + const int sid = space_getsid_GPU(e->s, &ci, &cj, &x_tmp, &y_tmp, &z_tmp); /*Get pointers to the list of tasks and cells packed*/ pack_vars->task_list[tasks_packed] = t; @@ -720,6 +741,8 @@ double runner_dopair1_pack_f4_g(struct runner *r, struct scheduler *s, struct pa /*Assign an id for this task*/ const int tid = tasks_packed; + /*Indexing increment per task is 2 fot these arrays*/ + const int packed_tmp = tasks_packed * 2; /* Find first parts in task for ci and cj. Packed_tmp is index for cell i. packed_tmp+1 is index for cell j */ // pack_vars->task_first_part[packed_tmp] = pack_vars->count_parts; @@ -800,7 +823,7 @@ void runner_dopair1_pack_f(struct runner *r, struct scheduler *s, struct pack_va double x_tmp = 0.0, y_tmp = 0.0, z_tmp = 0.0; /*Get the shifts in case of periodics*/ - space_getsid_GPU(e->s, &ci, &cj, &x_tmp, &y_tmp, &z_tmp); + const int sid = space_getsid_GPU(e->s, &ci, &cj, &x_tmp, &y_tmp, &z_tmp); /*Get pointers to the list of tasks and cells packed*/ pack_vars->task_list[tasks_packed] = t; @@ -815,7 +838,7 @@ void runner_dopair1_pack_f(struct runner *r, struct scheduler *s, struct pack_va pack_vars->shiftx[tid_tmp] = x_tmp + cjx; pack_vars->shifty[tid_tmp] = y_tmp + cjy; pack_vars->shiftz[tid_tmp] = z_tmp + cjz; - /*Shift for cell j is it's position. Stored using strided indexing (stride of two per task)*/ + /* /*Shift for cell j is it's position. 
Stored using strided indexing (stride of two per task)*/ pack_vars->shiftx[tid_tmp + 1] = cjx; pack_vars->shifty[tid_tmp + 1] = cjy; pack_vars->shiftz[tid_tmp + 1] = cjz; @@ -873,8 +896,8 @@ void runner_dopair1_pack_f(struct runner *r, struct scheduler *s, struct pack_va (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; } -double runner_dopair1_pack_f4_f(struct runner *r, struct scheduler *s, struct pack_vars_pair * restrict pack_vars, struct cell * ci, - struct cell * cj, struct task *t, struct part_aos_f4_f_send *parts_send, struct engine *e, int4 *fparti_fpartj_lparti_lpartj){ +double runner_dopair1_pack_f4_f(struct runner *r, struct scheduler *s, struct pack_vars_pair * restrict pack_vars, struct cell * restrict ci, + struct cell * restrict cj, struct task *t, struct part_aos_f4_f_send *parts_send, struct engine *e, int4 *fparti_fpartj_lparti_lpartj){ /* Timers for how long this all takes. * t0 and t1 are from start to finish including GPU calcs @@ -882,10 +905,11 @@ double runner_dopair1_pack_f4_f(struct runner *r, struct scheduler *s, struct pa struct timespec t0, t1; // clock_gettime(CLOCK_REALTIME, &t0); int tasks_packed = pack_vars->tasks_packed; + const int tid_tmp = 2 * tasks_packed; double x_tmp = 0.0, y_tmp = 0.0, z_tmp = 0.0; /*Get the shifts in case of periodics*/ - space_getsid_GPU(e->s, &ci, &cj, &x_tmp, &y_tmp, &z_tmp); + const int sid = space_getsid_GPU(e->s, &ci, &cj, &x_tmp, &y_tmp, &z_tmp); /*Get pointers to the list of tasks and cells packed*/ pack_vars->task_list[tasks_packed] = t; @@ -900,6 +924,8 @@ double runner_dopair1_pack_f4_f(struct runner *r, struct scheduler *s, struct pa /*Assign an id for this task*/ const int tid = tasks_packed; + /*Indexing increment per task is 2 fot these arrays*/ + const int packed_tmp = tasks_packed * 2; /* Find first parts in task for ci and cj. Packed_tmp is index for cell i. 
packed_tmp+1 is index for cell j */ // pack_vars->task_first_part[packed_tmp] = pack_vars->count_parts; @@ -1012,6 +1038,8 @@ void runner_doself1_launch(struct runner *r, struct scheduler *s, struct pack_va max_parts = 0; int parts_in_bundle = 0; + const int first_task = bid * bundle_size; + int last_task = (bid + 1) * bundle_size; for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { if (tid < tasks_packed) { @@ -1020,8 +1048,10 @@ void runner_doself1_launch(struct runner *r, struct scheduler *s, struct pack_va int count = pack_vars->task_last_part[tid] - pack_vars->task_first_part[tid]; parts_in_bundle += count; max_parts = max(max_parts, count); + last_task = tid; } } + const int n_tasks = last_task - first_task; const int first_part_tmp = pack_vars->bundle_first_part[bid]; const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp; @@ -1041,6 +1071,8 @@ void runner_doself1_launch(struct runner *r, struct scheduler *s, struct pack_va // } //#endif const int tasksperbundle = pack_vars->tasksperbundle; + int tid = 0; + int offset = bid * tasksperbundle; int tasks_left = tasksperbundle; if (bid == nBundles_temp - 1) { tasks_left = @@ -1050,6 +1082,7 @@ void runner_doself1_launch(struct runner *r, struct scheduler *s, struct pack_va // the y dimension and max_parts is the x dimension int numBlocks_y = tasks_left; int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + int bundle_part_0 = pack_vars->bundle_first_part[bid]; int bundle_first_task = pack_vars->bundle_first_task_list[bid]; const char *loop_type = "density"; // fprintf(stderr, "Launching kernel with %i tasks leftovers %i\n", @@ -1137,7 +1170,7 @@ void runner_doself1_launch_f4(struct runner *r, struct scheduler *s, struct pack struct engine *e, double *packing_time, double *gpu_time, double *unpack_time, int2 * d_task_first_part_self_dens_f4, int devId, int2 * task_first_part_f4, int2 * d_task_first_part_f4, cudaEvent_t * self_end){ - struct timespec t0, t1, tp0, tp1; // + struct timespec t0, t1, t0hmemcpy, t1hmemcpy, tp0, tp1; // clock_gettime(CLOCK_REALTIME, &t0); /* Identify the number of GPU bundles to run in ideal case*/ @@ -1236,6 +1269,8 @@ void runner_doself1_launch_f4(struct runner *r, struct scheduler *s, struct pack // } //#endif const int tasksperbundle = pack_vars->tasksperbundle; + int tid = 0; + int offset = bid * tasksperbundle; int tasks_left = tasksperbundle; if (bid == nBundles_temp - 1) { tasks_left = @@ -1245,6 +1280,7 @@ void runner_doself1_launch_f4(struct runner *r, struct scheduler *s, struct pack // the y dimension and max_parts is the x dimension int numBlocks_y = tasks_left; int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + int bundle_part_0 = pack_vars->bundle_first_part[bid]; int bundle_first_task = pack_vars->bundle_first_task_list[bid]; // const char *loop_type = "density"; // struct first_part first_parts; @@ -1302,6 +1338,7 @@ void runner_doself1_launch_f4(struct runner *r, struct scheduler *s, struct pack /*Time unpacking*/ // clock_gettime(CLOCK_REALTIME, &tp0); + int bundle_first_task = pack_vars->bundle_first_task_list[bid]; for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { @@ -1410,6 +1447,8 @@ void runner_doself1_launch_g(struct runner *r, struct scheduler *s, struct pack_ max_parts = 0; int parts_in_bundle = 0; + const int first_task = bid * bundle_size; + int last_task = (bid + 1) * bundle_size; for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { if (tid < tasks_packed) { @@ -1418,8 
+1457,10 @@ void runner_doself1_launch_g(struct runner *r, struct scheduler *s, struct pack_ int count = pack_vars->task_last_part[tid] - pack_vars->task_first_part[tid]; parts_in_bundle += count; max_parts = max(max_parts, count); + last_task = tid; } } + const int n_tasks = last_task - first_task; const int first_part_tmp = pack_vars->bundle_first_part[bid]; const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp; @@ -1439,6 +1480,8 @@ void runner_doself1_launch_g(struct runner *r, struct scheduler *s, struct pack_ } #endif const int tasksperbundle = pack_vars->tasksperbundle; + int tid = 0; + int offset = bid * tasksperbundle; int tasks_left = tasksperbundle; if (bid == nBundles_temp - 1) { tasks_left = @@ -1448,6 +1491,7 @@ void runner_doself1_launch_g(struct runner *r, struct scheduler *s, struct pack_ // the y dimension and max_parts is the x dimension int numBlocks_y = tasks_left; int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + int bundle_part_0 = pack_vars->bundle_first_part[bid]; int bundle_first_task = pack_vars->bundle_first_task_list[bid]; const char *loop_type = "density"; // Launch the kernel @@ -1535,7 +1579,7 @@ void runner_doself1_launch_f4_g(struct runner *r, struct scheduler *s, struct pa cudaEvent_t *self_end, double *unpack_time){ - struct timespec t0, t1, tp0, tp1; + struct timespec t0, t1, t0hmemcpy, t1hmemcpy, tp0, tp1; clock_gettime(CLOCK_REALTIME, &t0); /* Identify the number of GPU bundles to run in ideal case*/ @@ -1582,6 +1626,7 @@ void runner_doself1_launch_f4_g(struct runner *r, struct scheduler *s, struct pa last_task = tid; } } + const int n_tasks = last_task - first_task; const int first_part_tmp = pack_vars->bundle_first_part[bid]; const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp; @@ -1605,6 +1650,8 @@ void runner_doself1_launch_f4_g(struct runner *r, struct scheduler *s, struct pa } #endif const int tasksperbundle = pack_vars->tasksperbundle; + int tid = 0; + int offset = bid * tasksperbundle; int tasks_left = tasksperbundle; if (bid == nBundles_temp - 1) { tasks_left = @@ -1614,6 +1661,7 @@ void runner_doself1_launch_f4_g(struct runner *r, struct scheduler *s, struct pa // the y dimension and max_parts is the x dimension int numBlocks_y = tasks_left; int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + int bundle_part_0 = pack_vars->bundle_first_part[bid]; int bundle_first_task = pack_vars->bundle_first_task_list[bid]; // const char *loop_type = "density"; // Launch the kernel @@ -1669,6 +1717,7 @@ void runner_doself1_launch_f4_g(struct runner *r, struct scheduler *s, struct pa /*Time unpacking*/ // clock_gettime(CLOCK_REALTIME, &tp0); + int bundle_first_task = pack_vars->bundle_first_task_list[bid]; for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { @@ -1773,6 +1822,8 @@ void runner_doself1_launch_f(struct runner *r, struct scheduler *s, struct pack_ max_parts = 0; int parts_in_bundle = 0; + const int first_task = bid * bundle_size; + int last_task = (bid + 1) * bundle_size; for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { if (tid < tasks_packed) { @@ -1781,8 +1832,10 @@ void runner_doself1_launch_f(struct runner *r, struct scheduler *s, struct pack_ int count = pack_vars->task_last_part[tid] - pack_vars->task_first_part[tid]; parts_in_bundle += count; max_parts = max(max_parts, count); + last_task = tid; } } + const int n_tasks = last_task - first_task; const int first_part_tmp = pack_vars->bundle_first_part[bid]; const int 
bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp; @@ -1802,6 +1855,8 @@ void runner_doself1_launch_f(struct runner *r, struct scheduler *s, struct pack_ } #endif const int tasksperbundle = pack_vars->tasksperbundle; + int tid = 0; + int offset = bid * tasksperbundle; int tasks_left = tasksperbundle; if (bid == nBundles_temp - 1) { tasks_left = @@ -1811,6 +1866,7 @@ void runner_doself1_launch_f(struct runner *r, struct scheduler *s, struct pack_ // the y dimension and max_parts is the x dimension int numBlocks_y = tasks_left; int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + int bundle_part_0 = pack_vars->bundle_first_part[bid]; int bundle_first_task = pack_vars->bundle_first_task_list[bid]; const char *loop_type = "density"; // Launch the kernel @@ -1957,6 +2013,7 @@ void runner_doself1_launch_f4_f(struct runner *r, struct scheduler *s, struct pa last_task = tid; } } + const int n_tasks = last_task - first_task; const int first_part_tmp = pack_vars->bundle_first_part[bid]; const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp; @@ -1978,6 +2035,8 @@ void runner_doself1_launch_f4_f(struct runner *r, struct scheduler *s, struct pa } #endif const int tasksperbundle = pack_vars->tasksperbundle; + int tid = 0; + int offset = bid * tasksperbundle; int tasks_left = tasksperbundle; if (bid == nBundles_temp - 1) { tasks_left = @@ -1987,7 +2046,9 @@ void runner_doself1_launch_f4_f(struct runner *r, struct scheduler *s, struct pa // the y dimension and max_parts is the x dimension int numBlocks_y = tasks_left; int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + int bundle_part_0 = pack_vars->bundle_first_part[bid]; int bundle_first_task = pack_vars->bundle_first_task_list[bid]; + const char *loop_type = "density"; // Launch the kernel launch_force_aos_f4( d_parts_send, d_parts_recv, d_a, d_H, stream[bid], numBlocks_x, numBlocks_y, bundle_first_task, @@ -2041,6 +2102,7 @@ void runner_doself1_launch_f4_f(struct runner *r, struct scheduler *s, struct pa /*Time unpacking*/ // clock_gettime(CLOCK_REALTIME, &tp0); + int bundle_first_task = pack_vars->bundle_first_task_list[bid]; for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { @@ -2133,12 +2195,14 @@ void runner_dopair1_launch(struct runner *r, struct scheduler *s, struct pack_va /* Launch the copies for each bundle and run the GPU kernel */ /*We don't go into this loop if tasks_left_self == 1 as nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ + int max_parts = 0; for (int bid = 0; bid < nBundles_temp; bid++) { int max_parts_i = 0; int max_parts_j = 0; int parts_in_bundle_ci = 0; int parts_in_bundle_cj = 0; + const int first_task = bid * pack_vars->bundle_size; for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { if (tid < tasks_packed) { @@ -2184,7 +2248,10 @@ void runner_dopair1_launch(struct runner *r, struct scheduler *s, struct pack_va // Setup 2d grid of GPU thread blocks for ci (number of tasks is // the y dimension and max_parts is the x dimension int numBlocks_y = tasks_left; + int numBlocks_x = (max_parts_i + BLOCK_SIZE - 1) / BLOCK_SIZE; + int bundle_part_0 = pack_vars->bundle_first_part[bid]; int bundle_first_task = pack_vars->bundle_first_task_list[bid]; +// fprintf(stderr, "bundle_part_0 %i bundle_first_task %i\n", bundle_part_0, bundle_first_task); const char *loop_type = "density"; /* Launch the kernel for ci using data for ci and cj */ @@ -2300,6 +2367,9 @@ void runner_dopair1_launch_f4(struct runner *r, struct scheduler *s, 
struct pack /*How many tasks should be in a bundle?*/ const int bundle_size = pack_vars->bundle_size; + /*tasks-packed needs decrementing before calculating packed_tmp as it was incremented in runner_dopair1_pack*/ + const int packed_tmp = 2 * (tasks_packed - 1); + /* Special case for incomplete bundles (when having leftover tasks not enough to fill a bundle) */ if (pack_vars->launch_leftovers) { nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; @@ -2318,6 +2388,7 @@ void runner_dopair1_launch_f4(struct runner *r, struct scheduler *s, struct pack /* Launch the copies for each bundle and run the GPU kernel */ /*We don't go into this loop if tasks_left_self == 1 as nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ + int max_parts = 0; for (int bid = 0; bid < nBundles_temp; bid++) { int max_parts_i = 0; @@ -2331,6 +2402,7 @@ void runner_dopair1_launch_f4(struct runner *r, struct scheduler *s, struct pack if (tid < tasks_packed) { /*Get an estimate for the max number of parts per cell in each bundle. * Used for determining the number of GPU CUDA blocks*/ + const int tid_tmp = 2 * tid; int count_i = fparti_fpartj_lparti_lpartj_dens[tid].z - fparti_fpartj_lparti_lpartj_dens[tid].x; parts_in_bundle_ci += count_i; @@ -2365,6 +2437,8 @@ void runner_dopair1_launch_f4(struct runner *r, struct scheduler *s, struct pack const int tasksperbundle = pack_vars->tasksperbundle; /* LAUNCH THE GPU KERNELS for ci & cj */ + int tid = 0; + int offset = bid * tasksperbundle; int tasks_left = tasksperbundle; if (bid == nBundles_temp - 1) { tasks_left = @@ -2375,7 +2449,10 @@ void runner_dopair1_launch_f4(struct runner *r, struct scheduler *s, struct pack // the y dimension and max_parts is the x dimension int numBlocks_y = tasks_left; int numBlocks_x = (max_parts_i + BLOCK_SIZE - 1) / BLOCK_SIZE; + int bundle_part_0 = pack_vars->bundle_first_part[bid]; int bundle_first_task = pack_vars->bundle_first_task_list[bid]; +// fprintf(stderr, "bundle_part_0 %i bundle_first_task %i\n", bundle_part_0, bundle_first_task); + const char *loop_type = "density"; /* Launch the kernel for ci using data for ci and cj */ runner_dopairci_branch_density_gpu_aos_f4(d_parts_send, d_parts_recv, @@ -2486,6 +2563,9 @@ void runner_dopair1_launch_f4_one_memcpy(struct runner *r, struct scheduler *s, /*How many tasks should be in a bundle?*/ const int bundle_size = pack_vars->bundle_size; + /*tasks-packed needs decrementing before calculating packed_tmp as it was incremented in runner_dopair1_pack*/ + const int packed_tmp = 2 * (tasks_packed - 1); + /* Special case for incomplete bundles (when having leftover tasks not enough to fill a bundle) */ if (pack_vars->launch_leftovers) { nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; @@ -2504,17 +2584,21 @@ void runner_dopair1_launch_f4_one_memcpy(struct runner *r, struct scheduler *s, /* Launch the copies for each bundle and run the GPU kernel */ /*We don't go into this loop if tasks_left_self == 1 as nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ + int max_parts = 0; for (int bid = 0; bid < nBundles_temp; bid++) { int max_parts_i = 0; int max_parts_j = 0; int parts_in_bundle_ci = 0; int parts_in_bundle_cj = 0; + const int first_task = bid * pack_vars->bundle_size; + int last_task = (bid + 1) * bundle_size; for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { if (tid < tasks_packed) { /*Get an estimate for the max number of parts per cell in each bundle. 
* Used for determining the number of GPU CUDA blocks*/ + const int tid_tmp = 2 * tid; int count_i = fparti_fpartj_lparti_lpartj_dens[tid].z - fparti_fpartj_lparti_lpartj_dens[tid].x; parts_in_bundle_ci += count_i; @@ -2523,6 +2607,8 @@ void runner_dopair1_launch_f4_one_memcpy(struct runner *r, struct scheduler *s, - fparti_fpartj_lparti_lpartj_dens[tid].y; parts_in_bundle_cj += count_j; max_parts_j = max(max_parts_j, count_j); + + last_task = tid; } } const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; @@ -2541,12 +2627,26 @@ void runner_dopair1_launch_f4_one_memcpy(struct runner *r, struct scheduler *s, error("Something's up with your cuda code"); } #endif + + const int tasksperbundle = pack_vars->tasksperbundle; /* LAUNCH THE GPU KERNELS for ci & cj */ + int tid = 0; + int offset = bid * tasksperbundle; + int tasks_left = tasksperbundle; + if (bid == nBundles_temp - 1) { + tasks_left = + tasks_packed - (nBundles_temp - 1) * tasksperbundle; + } + // Setup 2d grid of GPU thread blocks for ci (number of tasks is // the y dimension and max_parts is the x dimension int numBlocks_y = 0;//tasks_left; int numBlocks_x = (bundle_n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; int bundle_part_0 = pack_vars->bundle_first_part[bid]; + int bundle_first_task = pack_vars->bundle_first_task_list[bid]; +// fprintf(stderr, "bundle_part_0 %i bundle_first_task %i\n", bundle_part_0, bundle_first_task); + const char *loop_type = "density"; + /* Launch the kernel for ci using data for ci and cj */ runner_dopair_branch_density_gpu_aos_f4(d_parts_send, d_parts_recv, d_a, d_H, stream[bid], numBlocks_x, numBlocks_y, bundle_part_0, bundle_n_parts); @@ -2602,6 +2702,7 @@ void runner_dopair1_launch_f4_one_memcpy(struct runner *r, struct scheduler *s, /*Time unpacking*/ // clock_gettime(CLOCK_REALTIME, &tp0); + int bundle_first_task = pack_vars->bundle_first_task_list[bid]; for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { @@ -2677,7 +2778,7 @@ void runner_dopair1_launch_f4_mcpy_Ker_mcpy(struct runner *r, struct scheduler * const int bundle_size = pack_vars->bundle_size; /*tasks-packed needs decrementing before calculating packed_tmp as it was incremented in runner_dopair1_pack*/ -// const int packed_tmp = 2 * (tasks_packed - 1); + const int packed_tmp = 2 * (tasks_packed - 1); /* Special case for incomplete bundles (when having leftover tasks not enough to fill a bundle) */ if (pack_vars->launch_leftovers) { @@ -2697,18 +2798,21 @@ void runner_dopair1_launch_f4_mcpy_Ker_mcpy(struct runner *r, struct scheduler * /* Launch the copies for each bundle and run the GPU kernel */ /*We don't go into this loop if tasks_left_self == 1 as nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ -// int max_parts = 0; + int max_parts = 0; for (int bid = 0; bid < nBundles_temp; bid++) { int max_parts_i = 0; int max_parts_j = 0; int parts_in_bundle_ci = 0; int parts_in_bundle_cj = 0; + const int first_task = bid * pack_vars->bundle_size; + int last_task = (bid + 1) * bundle_size; for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { if (tid < tasks_packed) { /*Get an estimate for the max number of parts per cell in each bundle. 
* Used for determining the number of GPU CUDA blocks*/ + const int tid_tmp = 2 * tid; int count_i = fparti_fpartj_lparti_lpartj_dens[tid].z - fparti_fpartj_lparti_lpartj_dens[tid].x; parts_in_bundle_ci += count_i; @@ -2717,6 +2821,8 @@ void runner_dopair1_launch_f4_mcpy_Ker_mcpy(struct runner *r, struct scheduler * - fparti_fpartj_lparti_lpartj_dens[tid].y; parts_in_bundle_cj += count_j; max_parts_j = max(max_parts_j, count_j); + + last_task = tid; } } const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; @@ -2742,11 +2848,14 @@ void runner_dopair1_launch_f4_mcpy_Ker_mcpy(struct runner *r, struct scheduler * int max_parts_j = 0; int parts_in_bundle_ci = 0; int parts_in_bundle_cj = 0; + const int first_task = bid * pack_vars->bundle_size; + int last_task = (bid + 1) * bundle_size; for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { if (tid < tasks_packed) { /*Get an estimate for the max number of parts per cell in each bundle. * Used for determining the number of GPU CUDA blocks*/ + const int tid_tmp = 2 * tid; int count_i = fparti_fpartj_lparti_lpartj_dens[tid].z - fparti_fpartj_lparti_lpartj_dens[tid].x; parts_in_bundle_ci += count_i; @@ -2755,20 +2864,31 @@ void runner_dopair1_launch_f4_mcpy_Ker_mcpy(struct runner *r, struct scheduler * - fparti_fpartj_lparti_lpartj_dens[tid].y; parts_in_bundle_cj += count_j; max_parts_j = max(max_parts_j, count_j); + + last_task = tid; } } const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp_i; ////////////////////////////////// -// const int tasksperbundle = pack_vars->tasksperbundle; + const int tasksperbundle = pack_vars->tasksperbundle; /* LAUNCH THE GPU KERNELS for ci & cj */ + int tid = 0; + int offset = bid * tasksperbundle; + int tasks_left = tasksperbundle; + if (bid == nBundles_temp - 1) { + tasks_left = + tasks_packed - (nBundles_temp - 1) * tasksperbundle; + } + // Setup 2d grid of GPU thread blocks for ci (number of tasks is // the y dimension and max_parts is the x dimension int numBlocks_y = 0;//tasks_left; int numBlocks_x = (bundle_n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; int bundle_part_0 = pack_vars->bundle_first_part[bid]; -// int bundle_first_task = pack_vars->bundle_first_task_list[bid]; + int bundle_first_task = pack_vars->bundle_first_task_list[bid]; // fprintf(stderr, "bundle_part_0 %i bundle_first_task %i\n", bundle_part_0, bundle_first_task); + const char *loop_type = "density"; /* Launch the kernel for ci using data for ci and cj */ runner_dopair_branch_density_gpu_aos_f4(d_parts_send, d_parts_recv, @@ -2792,11 +2912,14 @@ void runner_dopair1_launch_f4_mcpy_Ker_mcpy(struct runner *r, struct scheduler * int max_parts_j = 0; int parts_in_bundle_ci = 0; int parts_in_bundle_cj = 0; + const int first_task = bid * pack_vars->bundle_size; + int last_task = (bid + 1) * bundle_size; for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { if (tid < tasks_packed) { /*Get an estimate for the max number of parts per cell in each bundle. 
* Used for determining the number of GPU CUDA blocks*/ + const int tid_tmp = 2 * tid; int count_i = fparti_fpartj_lparti_lpartj_dens[tid].z - fparti_fpartj_lparti_lpartj_dens[tid].x; parts_in_bundle_ci += count_i; @@ -2805,6 +2928,8 @@ void runner_dopair1_launch_f4_mcpy_Ker_mcpy(struct runner *r, struct scheduler * - fparti_fpartj_lparti_lpartj_dens[tid].y; parts_in_bundle_cj += count_j; max_parts_j = max(max_parts_j, count_j); + + last_task = tid; } } const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; @@ -2851,7 +2976,7 @@ void runner_dopair1_launch_f4_mcpy_Ker_mcpy(struct runner *r, struct scheduler * /*Time unpacking*/ // clock_gettime(CLOCK_REALTIME, &tp0); -// int bundle_first_task = pack_vars->bundle_first_task_list[bid]; + int bundle_first_task = pack_vars->bundle_first_task_list[bid]; for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { @@ -2963,12 +3088,14 @@ void runner_dopair1_launch_g(struct runner *r, struct scheduler *s, struct pack_ /* Launch the copies for each bundle and run the GPU kernel */ /*We don't go into this loop if tasks_left_self == 1 as nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ + int max_parts = 0; for (int bid = 0; bid < nBundles_temp; bid++) { int max_parts_i = 0; int max_parts_j = 0; int parts_in_bundle_ci = 0; int parts_in_bundle_cj = 0; + const int first_task = bid * pack_vars->bundle_size; for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { if (tid < tasks_packed) { @@ -3014,7 +3141,10 @@ void runner_dopair1_launch_g(struct runner *r, struct scheduler *s, struct pack_ // Setup 2d grid of GPU thread blocks for ci (number of tasks is // the y dimension and max_parts is the x dimension int numBlocks_y = tasks_left; + int numBlocks_x = (max_parts_i + BLOCK_SIZE - 1) / BLOCK_SIZE; + int bundle_part_0 = pack_vars->bundle_first_part[bid]; int bundle_first_task = pack_vars->bundle_first_task_list[bid]; +// fprintf(stderr, "bundle_part_0 %i bundle_first_task %i\n", bundle_part_0, bundle_first_task); const char *loop_type = "density"; /* Launch the kernel for ci using data for ci and cj */ @@ -3131,7 +3261,7 @@ void runner_dopair1_launch_f4_g_one_memcpy(struct runner *r, struct scheduler *s const int bundle_size = pack_vars->bundle_size; /*tasks-packed needs decrementing before calculating packed_tmp as it was incremented in runner_dopair1_pack*/ -// const int packed_tmp = 2 * (tasks_packed - 1); + const int packed_tmp = 2 * (tasks_packed - 1); /* Special case for incomplete bundles (when having leftover tasks not enough to fill a bundle) */ if (pack_vars->launch_leftovers) { @@ -3151,18 +3281,21 @@ void runner_dopair1_launch_f4_g_one_memcpy(struct runner *r, struct scheduler *s /* Launch the copies for each bundle and run the GPU kernel */ /*We don't go into this loop if tasks_left_self == 1 as nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ -// int max_parts = 0; + int max_parts = 0; for (int bid = 0; bid < nBundles_temp; bid++) { int max_parts_i = 0; int max_parts_j = 0; int parts_in_bundle_ci = 0; int parts_in_bundle_cj = 0; + const int first_task = bid * pack_vars->bundle_size; + int last_task = (bid + 1) * bundle_size; for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { if (tid < tasks_packed) { /*Get an estimate for the max number of parts per cell in each bundle. 
* Used for determining the number of GPU CUDA blocks*/ + const int tid_tmp = 2 * tid; int count_i = fparti_fpartj_lparti_lpartj[tid].z - fparti_fpartj_lparti_lpartj[tid].x; parts_in_bundle_ci += count_i; @@ -3171,6 +3304,8 @@ void runner_dopair1_launch_f4_g_one_memcpy(struct runner *r, struct scheduler *s - fparti_fpartj_lparti_lpartj[tid].y; parts_in_bundle_cj += count_j; max_parts_j = max(max_parts_j, count_j); + + last_task = tid; } } const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; @@ -3190,14 +3325,24 @@ void runner_dopair1_launch_f4_g_one_memcpy(struct runner *r, struct scheduler *s } #endif -// const int tasksperbundle = pack_vars->tasksperbundle; + const int tasksperbundle = pack_vars->tasksperbundle; /* LAUNCH THE GPU KERNELS for ci & cj */ + int tid = 0; + int offset = bid * tasksperbundle; + int tasks_left = tasksperbundle; + if (bid == nBundles_temp - 1) { + tasks_left = + tasks_packed - (nBundles_temp - 1) * tasksperbundle; + } + // Setup 2d grid of GPU thread blocks for ci (number of tasks is // the y dimension and max_parts is the x dimension int numBlocks_y = 0;//tasks_left; int numBlocks_x = (bundle_n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; int bundle_part_0 = pack_vars->bundle_first_part[bid]; + int bundle_first_task = pack_vars->bundle_first_task_list[bid]; // fprintf(stderr, "bundle_part_0 %i bundle_first_task %i\n", bundle_part_0, bundle_first_task); + const char *loop_type = "gradient"; /* Launch the kernel for ci using data for ci and cj */ runner_dopair_branch_gradient_gpu_aos_f4(d_parts_send, d_parts_recv, @@ -3254,7 +3399,7 @@ void runner_dopair1_launch_f4_g_one_memcpy(struct runner *r, struct scheduler *s /*Time unpacking*/ // clock_gettime(CLOCK_REALTIME, &tp0); -// int bundle_first_task = pack_vars->bundle_first_task_list[bid]; + int bundle_first_task = pack_vars->bundle_first_task_list[bid]; for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { @@ -3328,7 +3473,7 @@ void runner_dopair1_launch_f4_g_mcpy_Ker_mcpy(struct runner *r, struct scheduler const int bundle_size = pack_vars->bundle_size; /*tasks-packed needs decrementing before calculating packed_tmp as it was incremented in runner_dopair1_pack*/ -// const int packed_tmp = 2 * (tasks_packed - 1); + const int packed_tmp = 2 * (tasks_packed - 1); /* Special case for incomplete bundles (when having leftover tasks not enough to fill a bundle) */ if (pack_vars->launch_leftovers) { @@ -3348,20 +3493,21 @@ void runner_dopair1_launch_f4_g_mcpy_Ker_mcpy(struct runner *r, struct scheduler /* Launch the copies for each bundle and run the GPU kernel */ /*We don't go into this loop if tasks_left_self == 1 as nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ -// int max_parts = 0; + int max_parts = 0; for (int bid = 0; bid < nBundles_temp; bid++) { int max_parts_i = 0; int max_parts_j = 0; int parts_in_bundle_ci = 0; int parts_in_bundle_cj = 0; -// const int first_task = bid * pack_vars->bundle_size; -// int last_task = (bid + 1) * bundle_size; + const int first_task = bid * pack_vars->bundle_size; + int last_task = (bid + 1) * bundle_size; for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { if (tid < tasks_packed) { /*Get an estimate for the max number of parts per cell in each bundle. 
* Used for determining the number of GPU CUDA blocks*/ + const int tid_tmp = 2 * tid; int count_i = fparti_fpartj_lparti_lpartj[tid].z - fparti_fpartj_lparti_lpartj[tid].x; parts_in_bundle_ci += count_i; @@ -3371,7 +3517,7 @@ void runner_dopair1_launch_f4_g_mcpy_Ker_mcpy(struct runner *r, struct scheduler parts_in_bundle_cj += count_j; max_parts_j = max(max_parts_j, count_j); -// last_task = tid; + last_task = tid; } } const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; @@ -3397,13 +3543,14 @@ void runner_dopair1_launch_f4_g_mcpy_Ker_mcpy(struct runner *r, struct scheduler int max_parts_j = 0; int parts_in_bundle_ci = 0; int parts_in_bundle_cj = 0; -// const int first_task = bid * pack_vars->bundle_size; -// int last_task = (bid + 1) * bundle_size; + const int first_task = bid * pack_vars->bundle_size; + int last_task = (bid + 1) * bundle_size; for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { if (tid < tasks_packed) { /*Get an estimate for the max number of parts per cell in each bundle. * Used for determining the number of GPU CUDA blocks*/ + const int tid_tmp = 2 * tid; int count_i = fparti_fpartj_lparti_lpartj[tid].z - fparti_fpartj_lparti_lpartj[tid].x; parts_in_bundle_ci += count_i; @@ -3413,21 +3560,30 @@ void runner_dopair1_launch_f4_g_mcpy_Ker_mcpy(struct runner *r, struct scheduler parts_in_bundle_cj += count_j; max_parts_j = max(max_parts_j, count_j); -// last_task = tid; + last_task = tid; } } const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp_i; ////////////////////////////////// -// const int tasksperbundle = pack_vars->tasksperbundle; + const int tasksperbundle = pack_vars->tasksperbundle; /* LAUNCH THE GPU KERNELS for ci & cj */ + int tid = 0; + int offset = bid * tasksperbundle; + int tasks_left = tasksperbundle; + if (bid == nBundles_temp - 1) { + tasks_left = + tasks_packed - (nBundles_temp - 1) * tasksperbundle; + } + // Setup 2d grid of GPU thread blocks for ci (number of tasks is // the y dimension and max_parts is the x dimension int numBlocks_y = 0;//tasks_left; int numBlocks_x = (bundle_n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; int bundle_part_0 = pack_vars->bundle_first_part[bid]; -// int bundle_first_task = pack_vars->bundle_first_task_list[bid]; + int bundle_first_task = pack_vars->bundle_first_task_list[bid]; // fprintf(stderr, "bundle_part_0 %i bundle_first_task %i\n", bundle_part_0, bundle_first_task); + const char *loop_type = "density"; /* Launch the kernel for ci using data for ci and cj */ runner_dopair_branch_gradient_gpu_aos_f4(d_parts_send, d_parts_recv, @@ -3451,13 +3607,14 @@ void runner_dopair1_launch_f4_g_mcpy_Ker_mcpy(struct runner *r, struct scheduler int max_parts_j = 0; int parts_in_bundle_ci = 0; int parts_in_bundle_cj = 0; -// const int first_task = bid * pack_vars->bundle_size; -// int last_task = (bid + 1) * bundle_size; + const int first_task = bid * pack_vars->bundle_size; + int last_task = (bid + 1) * bundle_size; for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { if (tid < tasks_packed) { /*Get an estimate for the max number of parts per cell in each bundle. 
* Used for determining the number of GPU CUDA blocks*/ + const int tid_tmp = 2 * tid; int count_i = fparti_fpartj_lparti_lpartj[tid].z - fparti_fpartj_lparti_lpartj[tid].x; parts_in_bundle_ci += count_i; @@ -3467,7 +3624,7 @@ void runner_dopair1_launch_f4_g_mcpy_Ker_mcpy(struct runner *r, struct scheduler parts_in_bundle_cj += count_j; max_parts_j = max(max_parts_j, count_j); -// last_task = tid; + last_task = tid; } } const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; @@ -3514,7 +3671,7 @@ void runner_dopair1_launch_f4_g_mcpy_Ker_mcpy(struct runner *r, struct scheduler /*Time unpacking*/ // clock_gettime(CLOCK_REALTIME, &tp0); -// int bundle_first_task = pack_vars->bundle_first_task_list[bid]; + int bundle_first_task = pack_vars->bundle_first_task_list[bid]; for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { @@ -3626,14 +3783,14 @@ void runner_dopair1_launch_f(struct runner *r, struct scheduler *s, struct pack_ /* Launch the copies for each bundle and run the GPU kernel */ /*We don't go into this loop if tasks_left_self == 1 as nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ -// int max_parts = 0; + int max_parts = 0; for (int bid = 0; bid < nBundles_temp; bid++) { int max_parts_i = 0; int max_parts_j = 0; int parts_in_bundle_ci = 0; int parts_in_bundle_cj = 0; -// const int first_task = bid * pack_vars->bundle_size; + const int first_task = bid * pack_vars->bundle_size; for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { if (tid < tasks_packed) { @@ -3679,8 +3836,10 @@ void runner_dopair1_launch_f(struct runner *r, struct scheduler *s, struct pack_ // Setup 2d grid of GPU thread blocks for ci (number of tasks is // the y dimension and max_parts is the x dimension int numBlocks_y = tasks_left; -// int numBlocks_x = (max_parts_i + BLOCK_SIZE - 1) / BLOCK_SIZE; + int numBlocks_x = (max_parts_i + BLOCK_SIZE - 1) / BLOCK_SIZE; + int bundle_part_0 = pack_vars->bundle_first_part[bid]; int bundle_first_task = pack_vars->bundle_first_task_list[bid]; +// fprintf(stderr, "bundle_part_0 %i bundle_first_task %i\n", bundle_part_0, bundle_first_task); const char *loop_type = "density"; /* Launch the kernel for ci using data for ci and cj */ @@ -3797,7 +3956,7 @@ void runner_dopair1_launch_f4_f_one_memcpy(struct runner *r, struct scheduler *s const int bundle_size = pack_vars->bundle_size; /*tasks-packed needs decrementing before calculating packed_tmp as it was incremented in runner_dopair1_pack*/ -// const int packed_tmp = 2 * (tasks_packed - 1); + const int packed_tmp = 2 * (tasks_packed - 1); /* Special case for incomplete bundles (when having leftover tasks not enough to fill a bundle) */ if (pack_vars->launch_leftovers) { @@ -3817,20 +3976,21 @@ void runner_dopair1_launch_f4_f_one_memcpy(struct runner *r, struct scheduler *s /* Launch the copies for each bundle and run the GPU kernel */ /*We don't go into this loop if tasks_left_self == 1 as nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ -// int max_parts = 0; + int max_parts = 0; for (int bid = 0; bid < nBundles_temp; bid++) { int max_parts_i = 0; int max_parts_j = 0; int parts_in_bundle_ci = 0; int parts_in_bundle_cj = 0; -// const int first_task = bid * pack_vars->bundle_size; -// int last_task = (bid + 1) * bundle_size; + const int first_task = bid * pack_vars->bundle_size; + int last_task = (bid + 1) * bundle_size; for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { if (tid < tasks_packed) { /*Get an estimate for the max number of parts per cell in each 
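/*
 * A short sketch of the per-task int4 layout implied by the counting above
 * (count_i = .z - .x, count_j = .w - .y) and by the array name
 * fparti_fpartj_lparti_lpartj:
 *   x = first packed particle of ci,  y = first packed particle of cj,
 *   z = one-past-last particle of ci, w = one-past-last particle of cj.
 * make_pair_range() below is a hypothetical helper illustrating that
 * convention only.
 */
#include <vector_types.h> /* int4, from the CUDA toolkit */

static inline int4 make_pair_range(const int first_i, const int first_j,
                                   const int last_i, const int last_j) {
  int4 r;
  r.x = first_i; /* fparti */
  r.y = first_j; /* fpartj */
  r.z = last_i;  /* lparti */
  r.w = last_j;  /* lpartj */
  return r;
}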
bundle. * Used for determining the number of GPU CUDA blocks*/ + const int tid_tmp = 2 * tid; int count_i = fparti_fpartj_lparti_lpartj[tid].z - fparti_fpartj_lparti_lpartj[tid].x; parts_in_bundle_ci += count_i; @@ -3840,7 +4000,7 @@ void runner_dopair1_launch_f4_f_one_memcpy(struct runner *r, struct scheduler *s parts_in_bundle_cj += count_j; max_parts_j = max(max_parts_j, count_j); -// last_task = tid; + last_task = tid; } } const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; @@ -3860,23 +4020,24 @@ void runner_dopair1_launch_f4_f_one_memcpy(struct runner *r, struct scheduler *s } #endif -// const int tasksperbundle = pack_vars->tasksperbundle; + const int tasksperbundle = pack_vars->tasksperbundle; /* LAUNCH THE GPU KERNELS for ci & cj */ -// int tid = 0; -// int offset = bid * tasksperbundle; -// int tasks_left = tasksperbundle; -// if (bid == nBundles_temp - 1) { -// tasks_left = -// tasks_packed - (nBundles_temp - 1) * tasksperbundle; -// } + int tid = 0; + int offset = bid * tasksperbundle; + int tasks_left = tasksperbundle; + if (bid == nBundles_temp - 1) { + tasks_left = + tasks_packed - (nBundles_temp - 1) * tasksperbundle; + } // Setup 2d grid of GPU thread blocks for ci (number of tasks is // the y dimension and max_parts is the x dimension int numBlocks_y = 0;//tasks_left; int numBlocks_x = (bundle_n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; int bundle_part_0 = pack_vars->bundle_first_part[bid]; -// int bundle_first_task = pack_vars->bundle_first_task_list[bid]; + int bundle_first_task = pack_vars->bundle_first_task_list[bid]; // fprintf(stderr, "bundle_part_0 %i bundle_first_task %i\n", bundle_part_0, bundle_first_task); + const char *loop_type = "gradient"; /* Launch the kernel for ci using data for ci and cj */ runner_dopair_branch_force_gpu_aos_f4(d_parts_send, d_parts_recv, @@ -3933,7 +4094,7 @@ void runner_dopair1_launch_f4_f_one_memcpy(struct runner *r, struct scheduler *s /*Time unpacking*/ // clock_gettime(CLOCK_REALTIME, &tp0); -// int bundle_first_task = pack_vars->bundle_first_task_list[bid]; + int bundle_first_task = pack_vars->bundle_first_task_list[bid]; for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { @@ -4006,6 +4167,9 @@ void runner_dopair1_launch_f4_f_mcpy_Ker_mcpy(struct runner *r, struct scheduler /*How many tasks should be in a bundle?*/ const int bundle_size = pack_vars->bundle_size; + /*tasks-packed needs decrementing before calculating packed_tmp as it was incremented in runner_dopair1_pack*/ + const int packed_tmp = 2 * (tasks_packed - 1); + /* Special case for incomplete bundles (when having leftover tasks not enough to fill a bundle) */ if (pack_vars->launch_leftovers) { nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; @@ -4024,19 +4188,21 @@ void runner_dopair1_launch_f4_f_mcpy_Ker_mcpy(struct runner *r, struct scheduler /* Launch the copies for each bundle and run the GPU kernel */ /*We don't go into this loop if tasks_left_self == 1 as nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ + int max_parts = 0; for (int bid = 0; bid < nBundles_temp; bid++) { int max_parts_i = 0; int max_parts_j = 0; int parts_in_bundle_ci = 0; int parts_in_bundle_cj = 0; -// const int first_task = bid * pack_vars->bundle_size; -// int last_task = (bid + 1) * bundle_size; + const int first_task = bid * pack_vars->bundle_size; + int last_task = (bid + 1) * bundle_size; for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { if (tid < tasks_packed) { /*Get an estimate for the max number of parts per cell 
in each bundle. * Used for determining the number of GPU CUDA blocks*/ + const int tid_tmp = 2 * tid; int count_i = fparti_fpartj_lparti_lpartj[tid].z - fparti_fpartj_lparti_lpartj[tid].x; parts_in_bundle_ci += count_i; @@ -4046,7 +4212,7 @@ void runner_dopair1_launch_f4_f_mcpy_Ker_mcpy(struct runner *r, struct scheduler parts_in_bundle_cj += count_j; max_parts_j = max(max_parts_j, count_j); -// last_task = tid; + last_task = tid; } } const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; @@ -4072,13 +4238,14 @@ void runner_dopair1_launch_f4_f_mcpy_Ker_mcpy(struct runner *r, struct scheduler int max_parts_j = 0; int parts_in_bundle_ci = 0; int parts_in_bundle_cj = 0; -// const int first_task = bid * pack_vars->bundle_size; -// int last_task = (bid + 1) * bundle_size; + const int first_task = bid * pack_vars->bundle_size; + int last_task = (bid + 1) * bundle_size; for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { if (tid < tasks_packed) { /*Get an estimate for the max number of parts per cell in each bundle. * Used for determining the number of GPU CUDA blocks*/ + const int tid_tmp = 2 * tid; int count_i = fparti_fpartj_lparti_lpartj[tid].z - fparti_fpartj_lparti_lpartj[tid].x; parts_in_bundle_ci += count_i; @@ -4088,20 +4255,30 @@ void runner_dopair1_launch_f4_f_mcpy_Ker_mcpy(struct runner *r, struct scheduler parts_in_bundle_cj += count_j; max_parts_j = max(max_parts_j, count_j); -// last_task = tid; + last_task = tid; } } const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp_i; ////////////////////////////////// + const int tasksperbundle = pack_vars->tasksperbundle; /* LAUNCH THE GPU KERNELS for ci & cj */ + int tid = 0; + int offset = bid * tasksperbundle; + int tasks_left = tasksperbundle; + if (bid == nBundles_temp - 1) { + tasks_left = + tasks_packed - (nBundles_temp - 1) * tasksperbundle; + } + // Setup 2d grid of GPU thread blocks for ci (number of tasks is // the y dimension and max_parts is the x dimension int numBlocks_y = 0;//tasks_left; int numBlocks_x = (bundle_n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; int bundle_part_0 = pack_vars->bundle_first_part[bid]; -// int bundle_first_task = pack_vars->bundle_first_task_list[bid]; + int bundle_first_task = pack_vars->bundle_first_task_list[bid]; // fprintf(stderr, "bundle_part_0 %i bundle_first_task %i\n", bundle_part_0, bundle_first_task); + const char *loop_type = "density"; /* Launch the kernel for ci using data for ci and cj */ runner_dopair_branch_force_gpu_aos_f4(d_parts_send, d_parts_recv, @@ -4125,11 +4302,14 @@ void runner_dopair1_launch_f4_f_mcpy_Ker_mcpy(struct runner *r, struct scheduler int max_parts_j = 0; int parts_in_bundle_ci = 0; int parts_in_bundle_cj = 0; + const int first_task = bid * pack_vars->bundle_size; + int last_task = (bid + 1) * bundle_size; for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { if (tid < tasks_packed) { /*Get an estimate for the max number of parts per cell in each bundle. 
* Used for determining the number of GPU CUDA blocks*/ + const int tid_tmp = 2 * tid; int count_i = fparti_fpartj_lparti_lpartj[tid].z - fparti_fpartj_lparti_lpartj[tid].x; parts_in_bundle_ci += count_i; @@ -4138,6 +4318,8 @@ void runner_dopair1_launch_f4_f_mcpy_Ker_mcpy(struct runner *r, struct scheduler - fparti_fpartj_lparti_lpartj[tid].y; parts_in_bundle_cj += count_j; max_parts_j = max(max_parts_j, count_j); + + last_task = tid; } } const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; @@ -4184,6 +4366,7 @@ void runner_dopair1_launch_f4_f_mcpy_Ker_mcpy(struct runner *r, struct scheduler /*Time unpacking*/ // clock_gettime(CLOCK_REALTIME, &tp0); + int bundle_first_task = pack_vars->bundle_first_task_list[bid]; for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index caabc163dc..97a2773a99 100755 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -21,7 +21,6 @@ /* Config parameters. */ #define GPUOFFLOAD 1 //off-load hydro to GPU #define DO_CORNERS 1 //do corner pair tasks on CPU -#define DUMP_TIMINGS 1 #include "../config.h" /* MPI headers. */ @@ -33,14 +32,6 @@ extern "C" { #endif -/* Config parameters. */ -#include - -/* MPI headers. */ -#ifdef WITH_MPI -#include -#endif - /* This object's header. */ #include "runner.h" @@ -202,7 +193,6 @@ void *runner_main2(void *data) { struct engine *e = r->e; struct scheduler *sched = &e->sched; struct space *space = e->s; - /*pack_vars contain data required for packing tasks destined for the GPU*/ struct pack_vars_self *pack_vars_self_dens; struct pack_vars_self *pack_vars_self_forc; @@ -874,6 +864,8 @@ void *runner_main2(void *data) { /* Re-set the pointer to the previous task, as there is none. 
*/ struct task *t = NULL; struct task *prev = NULL; + int zeropacks = 0; + int lesspacks = 0; /*Some bits for output in case of debug*/ char buf5[20]; snprintf(buf5, sizeof(buf5), "t%dr%dstep%d", r->cpuid, engine_rank, step); @@ -919,12 +911,13 @@ void *runner_main2(void *data) { struct cell *ci_temp = ci; struct cell *cj_temp = cj; double shift[3]; - t->sid = space_getsid_and_swap_cells(e->s, &ci_temp, &cj_temp, shift); + t->sid = space_getsid(e->s, &ci_temp, &cj_temp, shift); } else { t->sid = -1; } #endif + #ifdef SWIFT_DEBUG_CHECKS /* Check that we haven't scheduled an inactive task */ t->ti_run = e->ti_current; @@ -986,7 +979,7 @@ void *runner_main2(void *data) { // d_parts_aos_dens, stream, d_a, d_H, e, &packing_time, &time_for_density_gpu, // &tot_time_for_hard_memcpys); } /*End of GPU work Self*/ -#endif //GPUDENSSELF +#endif //GPUOFFLOAD } /* self / pack */ else if (t->subtype == task_subtype_gpu_pack_g){ packed_self_g++; @@ -1070,46 +1063,46 @@ void *runner_main2(void *data) { (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; #endif //GPUFORCSELF - } else if (t->subtype == task_subtype_limiter) - runner_doself1_branch_limiter(r, ci); - else if (t->subtype == task_subtype_grav) - runner_doself_recursive_grav(r, ci, 1); - else if (t->subtype == task_subtype_external_grav) - runner_do_grav_external(r, ci, 1); - else if (t->subtype == task_subtype_stars_density) - runner_doself_branch_stars_density(r, ci); + }else if (t->subtype == task_subtype_limiter) + runner_doself1_branch_limiter(r, ci); + else if (t->subtype == task_subtype_grav) + runner_doself_recursive_grav(r, ci, 1); + else if (t->subtype == task_subtype_external_grav) + runner_do_grav_external(r, ci, 1); + else if (t->subtype == task_subtype_stars_density) + runner_doself_branch_stars_density(r, ci); #ifdef EXTRA_STAR_LOOPS - else if (t->subtype == task_subtype_stars_prep1) - runner_doself_branch_stars_prep1(r, ci); - else if (t->subtype == task_subtype_stars_prep2) - runner_doself_branch_stars_prep2(r, ci); + else if (t->subtype == task_subtype_stars_prep1) + runner_doself_branch_stars_prep1(r, ci); + else if (t->subtype == task_subtype_stars_prep2) + runner_doself_branch_stars_prep2(r, ci); #endif - else if (t->subtype == task_subtype_stars_feedback) - runner_doself_branch_stars_feedback(r, ci); - else if (t->subtype == task_subtype_bh_density) - runner_doself_branch_bh_density(r, ci); - else if (t->subtype == task_subtype_bh_swallow) - runner_doself_branch_bh_swallow(r, ci); - else if (t->subtype == task_subtype_do_gas_swallow) - runner_do_gas_swallow_self(r, ci, 1); - else if (t->subtype == task_subtype_do_bh_swallow) - runner_do_bh_swallow_self(r, ci, 1); - else if (t->subtype == task_subtype_bh_feedback) - runner_doself_branch_bh_feedback(r, ci); - else if (t->subtype == task_subtype_rt_gradient) - runner_doself1_branch_rt_gradient(r, ci); - else if (t->subtype == task_subtype_rt_transport) - runner_doself2_branch_rt_transport(r, ci); - else if (t->subtype == task_subtype_sink_swallow) - runner_doself_branch_sinks_swallow(r, ci); - else if (t->subtype == task_subtype_sink_do_gas_swallow) - runner_do_sinks_gas_swallow_self(r, ci, 1); - else if (t->subtype == task_subtype_sink_do_sink_swallow) - runner_do_sinks_sink_swallow_self(r, ci, 1); - else - error("Unknown/invalid task subtype (%s).", - subtaskID_names[t->subtype]); - break; + else if (t->subtype == task_subtype_stars_feedback) + runner_doself_branch_stars_feedback(r, ci); + else if (t->subtype == task_subtype_bh_density) + runner_doself_branch_bh_density(r, ci); + else 
if (t->subtype == task_subtype_bh_swallow) + runner_doself_branch_bh_swallow(r, ci); + else if (t->subtype == task_subtype_do_gas_swallow) + runner_do_gas_swallow_self(r, ci, 1); + else if (t->subtype == task_subtype_do_bh_swallow) + runner_do_bh_swallow_self(r, ci, 1); + else if (t->subtype == task_subtype_bh_feedback) + runner_doself_branch_bh_feedback(r, ci); + else if (t->subtype == task_subtype_rt_gradient) + runner_doself1_branch_rt_gradient(r, ci); + else if (t->subtype == task_subtype_rt_transport) + runner_doself2_branch_rt_transport(r, ci); + else if (t->subtype == task_subtype_sink_swallow) + runner_doself_branch_sinks_swallow(r, ci); + else if (t->subtype == task_subtype_sink_do_gas_swallow) + runner_do_sinks_gas_swallow_self(r, ci, 1); + else if (t->subtype == task_subtype_sink_do_sink_swallow) + runner_do_sinks_sink_swallow_self(r, ci, 1); + else + error("Unknown/invalid task subtype (%s).", + subtaskID_names[t->subtype]); + break; case task_type_pair: if (t->subtype == task_subtype_density) { @@ -1763,17 +1756,28 @@ void *runner_main2(void *data) { prev = t; #ifdef GPUOFFLOAD // if (t->type == task_type_self && t->subtype == task_subtype_gpu_pack){ - if (t->subtype == task_subtype_gpu_pack || - t->subtype == task_subtype_gpu_pack_g || - t->subtype == task_subtype_gpu_pack_f){ + if (t->subtype == task_subtype_gpu_pack){ /* Don't enqueue unpacks yet. Just signal the runners */ t->skip = 1; t = NULL; } +// else if (t->subtype == task_subtype_gpu_pack_g && t->type == task_type_self){ + else if (t->subtype == task_subtype_gpu_pack_g){ + /* Don't enqueue unpacks yet. Just signal the runners */ + t->skip = 1; + t = NULL; + } +// else if (t->subtype == task_subtype_gpu_pack_f && t->type == task_type_self){ + else if (t->subtype == task_subtype_gpu_pack_f){ + /* Don't enqueue unpacks yet. Just signal the runners */ + t->skip = 1; + t = NULL; + } else{ /* Mark task as done, as per usual */ t = scheduler_done(sched, t); } -#else //GPUOFFLOAD +#endif //GPUOFFLOAD +#ifndef GPUOFFLOAD t = scheduler_done(sched, t); #endif //GPUOFFLOAD @@ -1801,7 +1805,6 @@ void *runner_main2(void *data) { //// } /*Output compute times to separate files. 
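/*
 * The three GPU pack branches added above perform the same bookkeeping for
 * the density, gradient and force pack subtypes. A compact,
 * behaviour-equivalent sketch, using the same enumerators and the
 * scheduler_done() call visible in the surrounding code:
 */
#ifdef GPUOFFLOAD
      if (t->subtype == task_subtype_gpu_pack ||
          t->subtype == task_subtype_gpu_pack_g ||
          t->subtype == task_subtype_gpu_pack_f) {
        /* Pack tasks are completed later by the launch/unpack path, so do
         * not mark them done here; drop the pointer and keep going. */
        t->skip = 1;
        t = NULL;
      } else {
        /* Mark the task as done, as per usual. */
        t = scheduler_done(sched, t);
      }
#else
      t = scheduler_done(sched, t);
#endif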
cat later into one file*/ // if (step % 11 == 0 || step == 1) { -#ifdef DUMP_TIMINGS #ifdef GPUOFFLOAD // char buffer[30]; // snprintf(buffer, sizeof(buffer), "t%d_stepnfullbundles%d", r->cpuid, step); @@ -1829,7 +1832,8 @@ void *runner_main2(void *data) { /////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////// -#else //GPUOFFLOAD +#endif +#ifndef GPUOFFLOAD if(r->cpuid == 0 && engine_rank == 0)fprintf(fgpu_steps, "CPU TIME SELF, CPU TIME PAIR, " "CPU TIME SELF F, CPU TIME PAIR F, CPU TIME SELF G, CPU TIME PAIR G\n " "%e, %e, %e, %e, %e, %e\n", time_for_density_cpu, time_for_density_cpu_pair, @@ -1837,8 +1841,7 @@ void *runner_main2(void *data) { else fprintf(fgpu_steps,"%e, %e, %e, %e, %e, %e,\n", time_for_density_cpu, time_for_density_cpu_pair, time_for_cpu_f, time_for_cpu_pair_f, time_for_cpu_g, time_for_cpu_pair_g); -#endif //GPUOFFLOAD -#endif //DUMPTIMINGS +#endif // } fflush(fgpu_steps); fclose(fgpu_steps); @@ -1861,7 +1864,6 @@ void *runner_main2(void *data) { density = 0; density_sub = 0; unpacked = 0; - message("reached end of runner_main2\n"); // if(step == 2)cudaProfilerStop(); // if(step == 2)exit(0); // size_t free_byte ; From badc9d4060dbb106f0cfcd794094ab5bf427e543 Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Wed, 30 Oct 2024 16:34:21 +0000 Subject: [PATCH 026/217] Copied over both runner_main_clean and runner_doiact_functions_hydro_gpu and re-worked runner_main_clean.cu but code still hangs --- src/runner_main_clean.cu | 150 +-------------------------------------- 1 file changed, 3 insertions(+), 147 deletions(-) diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 97a2773a99..9e0dd86b68 100755 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -1124,51 +1124,6 @@ void *runner_main2(void *data) { else if (t->subtype == task_subtype_gpu_pack) { packed_pair++; #ifdef GPUOFFLOAD -#ifdef DO_CORNERS - struct timespec t0, t1, dt; - clock_gettime(CLOCK_REALTIME, &t0); - double shift[3] = {0.0}; - t->corner_pair = 0; - int sid = space_getsid_filter(e->s, &ci, &cj, shift); - clock_gettime(CLOCK_REALTIME, &t1); - packing_time_pair += - (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / - 1000000000.0; - if((sid == 0 || sid == 2 || sid == 6 || sid == 8) && step > 1){ -// if((sid != 4 && sid != 10 && sid == 12) && step > 1){ - clock_gettime(CLOCK_REALTIME, &t0); - runner_dopair1_branch_density(r, ci, cj); - t->corner_pair = 1; - int qid = r->qid; - atomic_dec(&(sched->queues[qid].n_packs_pair_left)); - /* Tell the cells they have been packed */ - ci->pack_done++; - cj->pack_done++; - t->done = 1; - int launch = 0, launch_leftovers = 0; - if ((sched->queues[qid].n_packs_pair_left == 0)) - launch_leftovers = 1; - /* Tasks done. Release the lock ! 
*/ - task_unlock(t); - /*schedule my dependencies (Only unpacks really)*/ - enqueue_dependencies(sched, t); - /*Signal sleeping runners*/ - signal_sleeping_runners(sched, t); - clock_gettime(CLOCK_REALTIME, &t1); - packing_time_pair += - (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / - 1000000000.0; - if (launch_leftovers) { - pack_vars_pair_dens->launch_leftovers = 1; - runner_dopair1_launch_f4_one_memcpy(r, sched, pack_vars_pair_dens, t, parts_aos_pair_f4_send, parts_aos_pair_f4_recv, - d_parts_aos_pair_f4_send, d_parts_aos_pair_f4_recv, stream_pairs, d_a, d_H, e, &packing_time_pair, &time_for_density_gpu_pair, - &unpacking_time_pair, fparti_fpartj_lparti_lpartj_dens, pair_end); - } - } - else{ -#endif //DO_CORNERS packing_time_pair += runner_dopair1_pack_f4(r, sched, pack_vars_pair_dens, ci, cj, t, parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens); /* Packed enough tasks or no pack tasks left in queue, flag that we want to run */ @@ -1185,61 +1140,12 @@ void *runner_main2(void *data) { d_parts_aos_pair_f4_send, d_parts_aos_pair_f4_recv, stream_pairs, d_a, d_H, e, &packing_time_pair, &time_for_density_gpu_pair, &unpacking_time_pair, fparti_fpartj_lparti_lpartj_dens, pair_end); } -#ifdef DO_CORNERS - } /* End of GPU work Pairs */ -#endif //DO_CORNERS #endif //GPUDENS + } /* End of GPU work Pairs */ } /* pair / pack */ else if (t->subtype == task_subtype_gpu_pack_g){ packed_pair_g++; #ifdef GPUOFFLOAD -#ifdef DO_CORNERS - struct timespec t0, t1, dt; - clock_gettime(CLOCK_REALTIME, &t0); - double shift[3] = {0.0}; - t->corner_pair = 0; - int sid = space_getsid_filter(e->s, &ci, &cj, shift); - clock_gettime(CLOCK_REALTIME, &t1); - packing_time_pair += - (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / - 1000000000.0; - if((sid == 0 || sid == 2 || sid == 6 || sid == 8) && step > 1){ -// if((sid != 4 && sid != 10 && sid == 12) && step > 1){ - clock_gettime(CLOCK_REALTIME, &t0); - runner_dopair1_branch_gradient(r, ci, cj); - t->corner_pair = 1; - int qid = r->qid; - atomic_dec(&(sched->queues[qid].n_packs_pair_left_g)); - /* Tell the cells they have been packed */ - ci->pack_done++; - cj->pack_done++; - t->done = 1; - int launch = 0, launch_leftovers = 0; - if ((sched->queues[qid].n_packs_pair_left_g == 0)) - launch_leftovers = 1; - /* Tasks done. Release the lock ! 
*/ - task_unlock(t); - /*schedule my dependencies (Only unpacks really)*/ - enqueue_dependencies(sched, t); - /*Signal sleeping runners*/ - signal_sleeping_runners(sched, t); - clock_gettime(CLOCK_REALTIME, &t1); - packing_time_pair_g += - (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / - 1000000000.0; - if (launch_leftovers) { - pack_vars_pair_grad->launch_leftovers = 1; - runner_dopair1_launch_f4_g_one_memcpy(r, sched, pack_vars_pair_grad, t, parts_aos_pair_f4_g_send, parts_aos_pair_f4_g_recv, - d_parts_aos_pair_f4_g_send, d_parts_aos_pair_f4_g_recv, stream_pairs, d_a, d_H, e, &packing_time_pair_g, &time_for_gpu_pair_g, - &unpacking_time_pair_g, fparti_fpartj_lparti_lpartj_grad, pair_end_g); - } - } - else{ -#endif //DO_CORNERS -// runner_dopair1_pack_g(r, sched, pack_vars_pair_grad, ci, -// cj, t, parts_aos_pair_grad, e, &packing_time_g); packing_time_pair_g += runner_dopair1_pack_f4_g(r, sched, pack_vars_pair_grad, ci, cj, t, parts_aos_pair_f4_g_send, e, fparti_fpartj_lparti_lpartj_grad); /* No pack tasks left in queue, flag that we want to run */ @@ -1255,58 +1161,11 @@ void *runner_main2(void *data) { d_parts_aos_pair_f4_g_send, d_parts_aos_pair_f4_g_recv, stream_pairs, d_a, d_H, e, &packing_time_pair_g, &time_for_gpu_pair_g, &unpacking_time_pair_g, fparti_fpartj_lparti_lpartj_grad, pair_end_g); } -#ifdef DO_CORNERS - }/* End of GPU work Pairs */ -#endif //DO_CORNERS + }/* End of GPU work Pairs */ #endif //GPUGRADPAIR - } else if (t->subtype == task_subtype_gpu_pack_f){ packed_pair_f++; #ifdef GPUOFFLOAD -#ifdef DO_CORNERS - struct timespec t0, t1, dt; - clock_gettime(CLOCK_REALTIME, &t0); - double shift[3] = {0.0}; - t->corner_pair = 0; - int sid = space_getsid_filter(e->s, &ci, &cj, shift); - clock_gettime(CLOCK_REALTIME, &t1); - packing_time_pair += - (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / - 1000000000.0; - if((sid == 0 || sid == 2 || sid == 6 || sid == 8) && step > 1){ -// if((sid != 4 && sid != 10 && sid == 12) && step > 1){ - runner_dopair1_branch_force(r, ci, cj); - t->corner_pair = 1; - int qid = r->qid; - atomic_dec(&(sched->queues[qid].n_packs_pair_left_f)); - /* Tell the cells they have been packed */ - ci->pack_done++; - cj->pack_done++; - t->done = 1; - int launch = 0, launch_leftovers = 0; - if ((sched->queues[qid].n_packs_pair_left_f == 0)) - launch_leftovers = 1; - /* Tasks done. Release the lock ! 
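/*
 * The corner-pair blocks above (removed here) and the retained pack
 * branches share one hand-off pattern: each queue keeps a counter of
 * outstanding pair pack tasks, and the runner that sees it reach zero
 * flags the leftovers and performs the GPU copy, kernel launch and unpack
 * for the partially filled bundle. A condensed sketch for the density
 * loop, reusing only names visible in the hunks above:
 */
      const int qid = r->qid;
      if (sched->queues[qid].n_packs_pair_left == 0) {
        pack_vars_pair_dens->launch_leftovers = 1;
        runner_dopair1_launch_f4_one_memcpy(
            r, sched, pack_vars_pair_dens, t, parts_aos_pair_f4_send,
            parts_aos_pair_f4_recv, d_parts_aos_pair_f4_send,
            d_parts_aos_pair_f4_recv, stream_pairs, d_a, d_H, e,
            &packing_time_pair, &time_for_density_gpu_pair,
            &unpacking_time_pair, fparti_fpartj_lparti_lpartj_dens, pair_end);
      }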
*/ - task_unlock(t); - /*schedule my dependencies (Only unpacks really)*/ - enqueue_dependencies(sched, t); - /*Signal sleeping runners*/ - signal_sleeping_runners(sched, t); - clock_gettime(CLOCK_REALTIME, &t1); - packing_time_pair_f += - (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / - 1000000000.0; - if (launch_leftovers) { - pack_vars_pair_forc->launch_leftovers = 1; - runner_dopair1_launch_f4_f_one_memcpy(r, sched, pack_vars_pair_forc, t, parts_aos_pair_f4_f_send, parts_aos_pair_f4_f_recv, - d_parts_aos_pair_f4_f_send, d_parts_aos_pair_f4_f_recv, stream_pairs, d_a, d_H, e, &packing_time_pair_f, &time_for_gpu_pair_f, - &unpacking_time_pair_f, fparti_fpartj_lparti_lpartj_forc, pair_end_f); - } - } - else{ -#endif //DO_CORNERS // runner_dopair1_pack_f(r, sched, pack_vars_pair_forc, ci, // cj, t, parts_aos_pair_forc, e, &packing_time_f); packing_time_pair_f += runner_dopair1_pack_f4_f(r, sched, pack_vars_pair_forc, ci, @@ -1324,11 +1183,8 @@ void *runner_main2(void *data) { d_parts_aos_pair_f4_f_send, d_parts_aos_pair_f4_f_recv, stream_pairs, d_a, d_H, e, &packing_time_pair_f, &time_for_gpu_pair_f, &unpacking_time_pair_f, fparti_fpartj_lparti_lpartj_forc, pair_end_f); } /* End of GPU work Pairs */ -#ifdef DO_CORNERS - } -#endif //DO_CORNERS #endif //GPUFORCPAIR - } + } else if (t->subtype == task_subtype_gpu_unpack) { unpacked_pair++; } From 45ea651d9a8eb456a35f332c7f7568437a601354 Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Wed, 30 Oct 2024 16:41:32 +0000 Subject: [PATCH 027/217] Issue was not with #ifdefs in runner_main_clean.cu or runner_doiact... Removed them all and code still hangs. Will copy back previous files innext commit --- src/runner_main_clean.cu | 127 +++++++++++++++++---------------------- 1 file changed, 56 insertions(+), 71 deletions(-) diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 9e0dd86b68..7644e98ae0 100755 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -937,22 +937,19 @@ void *runner_main2(void *data) { unpacked_f++; } else if (t->subtype == task_subtype_density) { cpu_self++; -#ifndef GPUOFFLOAD - struct timespec t0, t1, dt; - clock_gettime(CLOCK_REALTIME, &t0); - runner_doself1_branch_density(r, ci); - clock_gettime(CLOCK_REALTIME, &t1); - tasks_done_cpu++; - time_for_density_cpu += - (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / - 1000000000.0; - density++; -#endif +// struct timespec t0, t1, dt; +// clock_gettime(CLOCK_REALTIME, &t0); +// runner_doself1_branch_density(r, ci); +// clock_gettime(CLOCK_REALTIME, &t1); +// tasks_done_cpu++; +// time_for_density_cpu += +// (t1.tv_sec - t0.tv_sec) + +// (t1.tv_nsec - t0.tv_nsec) / +// 1000000000.0; +// density++; /* GPU WORK */ } else if (t->subtype == task_subtype_gpu_pack) { packed_self++; -#ifdef GPUOFFLOAD // struct timespec t0, t1; // // clock_gettime(CLOCK_REALTIME, &t0); packing_time += runner_doself1_pack_f4(r, sched, pack_vars_self_dens, ci, @@ -979,11 +976,9 @@ void *runner_main2(void *data) { // d_parts_aos_dens, stream, d_a, d_H, e, &packing_time, &time_for_density_gpu, // &tot_time_for_hard_memcpys); } /*End of GPU work Self*/ -#endif //GPUOFFLOAD } /* self / pack */ else if (t->subtype == task_subtype_gpu_pack_g){ packed_self_g++; -#ifdef GPUOFFLOAD // runner_doself1_pack_g(r, sched, pack_vars_self_grad, ci, // t, parts_aos_grad, &packing_time_g); packing_time_g += runner_doself1_pack_f4_g(r, sched, pack_vars_self_grad, ci, @@ -1003,11 +998,9 @@ void *runner_main2(void *data) { e, &packing_time_g, &time_for_gpu_g, task_first_part_f4_g, 
d_task_first_part_f4_g, self_end_g, &unpack_time_self_g); } /*End of GPU work Self*/ -#endif //GPUGRADSELF } else if (t->subtype == task_subtype_gpu_pack_f){ packed_self_f++; -#ifdef GPUOFFLOAD // runner_doself1_pack_f(r, sched, pack_vars_self_forc, ci, // t, parts_aos_forc, &packing_time_f); packing_time_f += runner_doself1_pack_f4_f(r, sched, pack_vars_self_forc, ci, @@ -1032,36 +1025,35 @@ void *runner_main2(void *data) { e, &packing_time_f, &time_for_gpu_f, task_first_part_f4_f, d_task_first_part_f4_f, self_end_f, &unpack_time_self_f); } /*End of GPU work Self*/ -#endif //GPUFORCSELF } #ifdef EXTRA_HYDRO_LOOP else if (t->subtype == task_subtype_gradient) { cpu_self_g++; #ifndef GPUOFFLOAD - struct timespec t0, t1, dt; - clock_gettime(CLOCK_REALTIME, &t0); - runner_doself1_branch_gradient(r, ci); - clock_gettime(CLOCK_REALTIME, &t1); - tasks_done_cpu++; - time_for_cpu_g += - (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / - 1000000000.0; +// struct timespec t0, t1, dt; +// clock_gettime(CLOCK_REALTIME, &t0); +// runner_doself1_branch_gradient(r, ci); +// clock_gettime(CLOCK_REALTIME, &t1); +// tasks_done_cpu++; +// time_for_cpu_g += +// (t1.tv_sec - t0.tv_sec) + +// (t1.tv_nsec - t0.tv_nsec) / +// 1000000000.0; #endif //GPUGRADSELF } #endif else if (t->subtype == task_subtype_force) { cpu_self_f++; #ifndef GPUOFFLOAD - struct timespec t0, t1; - clock_gettime(CLOCK_REALTIME, &t0); - runner_doself2_branch_force(r, ci); - clock_gettime(CLOCK_REALTIME, &t1); - tasks_done_cpu++; - time_for_cpu_f += - (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / - 1000000000.0; +// struct timespec t0, t1; +// clock_gettime(CLOCK_REALTIME, &t0); +// runner_doself2_branch_force(r, ci); +// clock_gettime(CLOCK_REALTIME, &t1); +// tasks_done_cpu++; +// time_for_cpu_f += +// (t1.tv_sec - t0.tv_sec) + +// (t1.tv_nsec - t0.tv_nsec) / +// 1000000000.0; #endif //GPUFORCSELF }else if (t->subtype == task_subtype_limiter) runner_doself1_branch_limiter(r, ci); @@ -1109,21 +1101,20 @@ void *runner_main2(void *data) { /* Abouzied: To be commented out when the GPU pairs have been coded up */ cpu_pair++; #ifndef GPUOFFLOAD - struct timespec t0, t1, dt; - clock_gettime(CLOCK_REALTIME, &t0); - runner_dopair1_branch_density(r, ci, cj); - clock_gettime(CLOCK_REALTIME, &t1); - tasks_done_cpu++; - time_for_density_cpu_pair += - (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / - 1000000000.0; +// struct timespec t0, t1, dt; +// clock_gettime(CLOCK_REALTIME, &t0); +// runner_dopair1_branch_density(r, ci, cj); +// clock_gettime(CLOCK_REALTIME, &t1); +// tasks_done_cpu++; +// time_for_density_cpu_pair += +// (t1.tv_sec - t0.tv_sec) + +// (t1.tv_nsec - t0.tv_nsec) / +// 1000000000.0; #endif } /* GPU WORK */ else if (t->subtype == task_subtype_gpu_pack) { packed_pair++; -#ifdef GPUOFFLOAD packing_time_pair += runner_dopair1_pack_f4(r, sched, pack_vars_pair_dens, ci, cj, t, parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens); /* Packed enough tasks or no pack tasks left in queue, flag that we want to run */ @@ -1139,13 +1130,10 @@ void *runner_main2(void *data) { runner_dopair1_launch_f4_one_memcpy(r, sched, pack_vars_pair_dens, t, parts_aos_pair_f4_send, parts_aos_pair_f4_recv, d_parts_aos_pair_f4_send, d_parts_aos_pair_f4_recv, stream_pairs, d_a, d_H, e, &packing_time_pair, &time_for_density_gpu_pair, &unpacking_time_pair, fparti_fpartj_lparti_lpartj_dens, pair_end); - } -#endif //GPUDENS - } /* End of GPU work Pairs */ + }/* End of GPU work Pairs */ } /* pair / pack */ else if (t->subtype == 
task_subtype_gpu_pack_g){ packed_pair_g++; -#ifdef GPUOFFLOAD packing_time_pair_g += runner_dopair1_pack_f4_g(r, sched, pack_vars_pair_grad, ci, cj, t, parts_aos_pair_f4_g_send, e, fparti_fpartj_lparti_lpartj_grad); /* No pack tasks left in queue, flag that we want to run */ @@ -1162,10 +1150,8 @@ void *runner_main2(void *data) { &unpacking_time_pair_g, fparti_fpartj_lparti_lpartj_grad, pair_end_g); } }/* End of GPU work Pairs */ -#endif //GPUGRADPAIR else if (t->subtype == task_subtype_gpu_pack_f){ packed_pair_f++; -#ifdef GPUOFFLOAD // runner_dopair1_pack_f(r, sched, pack_vars_pair_forc, ci, // cj, t, parts_aos_pair_forc, e, &packing_time_f); packing_time_pair_f += runner_dopair1_pack_f4_f(r, sched, pack_vars_pair_forc, ci, @@ -1183,7 +1169,6 @@ void *runner_main2(void *data) { d_parts_aos_pair_f4_f_send, d_parts_aos_pair_f4_f_recv, stream_pairs, d_a, d_H, e, &packing_time_pair_f, &time_for_gpu_pair_f, &unpacking_time_pair_f, fparti_fpartj_lparti_lpartj_forc, pair_end_f); } /* End of GPU work Pairs */ -#endif //GPUFORCPAIR } else if (t->subtype == task_subtype_gpu_unpack) { unpacked_pair++; @@ -1198,30 +1183,30 @@ void *runner_main2(void *data) { else if (t->subtype == task_subtype_gradient){ int Do_nothing = 0; #ifndef GPUOFFLOAD - struct timespec t0, t1, dt; - clock_gettime(CLOCK_REALTIME, &t0); - runner_dopair1_branch_gradient(r, ci, cj); - clock_gettime(CLOCK_REALTIME, &t1); - tasks_done_cpu++; - time_for_cpu_pair_g += - (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / - 1000000000.0; +// struct timespec t0, t1, dt; +// clock_gettime(CLOCK_REALTIME, &t0); +// runner_dopair1_branch_gradient(r, ci, cj); +// clock_gettime(CLOCK_REALTIME, &t1); +// tasks_done_cpu++; +// time_for_cpu_pair_g += +// (t1.tv_sec - t0.tv_sec) + +// (t1.tv_nsec - t0.tv_nsec) / +// 1000000000.0; #endif //GPUGRADPAIR } #endif //EXTRA_HYDRO_LOOP else if (t->subtype == task_subtype_force){ int Do_nothing = 0; #ifndef GPUOFFLOAD - struct timespec t0, t1, dt; - clock_gettime(CLOCK_REALTIME, &t0); - runner_dopair2_branch_force(r, ci, cj); - clock_gettime(CLOCK_REALTIME, &t1); - tasks_done_cpu++; - time_for_cpu_pair_f += - (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / - 1000000000.0; +// struct timespec t0, t1, dt; +// clock_gettime(CLOCK_REALTIME, &t0); +// runner_dopair2_branch_force(r, ci, cj); +// clock_gettime(CLOCK_REALTIME, &t1); +// tasks_done_cpu++; +// time_for_cpu_pair_f += +// (t1.tv_sec - t0.tv_sec) + +// (t1.tv_nsec - t0.tv_nsec) / +// 1000000000.0; #endif //GPUFORCPAIR } else if (t->subtype == task_subtype_limiter) From a9f81dd0fc21a6751a1c909ddcaea324e247ef9e Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Wed, 30 Oct 2024 17:27:00 +0000 Subject: [PATCH 028/217] Issue is probably with how I am locking and unlocking tasks or something to with not unskipping tasks properly. 
IFDEFs are totally correct and code does not hang when GPU code commented out --- src/runner_doiact_functions_hydro_gpu.h | 313 ++++--------------- src/runner_main_clean.cu | 393 +++++++++++++++++------- 2 files changed, 340 insertions(+), 366 deletions(-) diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index 777ebff6b7..51a49e3fba 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -116,8 +116,6 @@ void runner_doself1_pack(struct runner *r, struct scheduler *s, struct pack_vars pack_vars->cell_list[tasks_packed] = ci; // /* Identify row in particle arrays where this task starts*/ pack_vars->task_first_part[tasks_packed] = pack_vars->count_parts; - //// /* id for the task*/ - const int tid_d = tasks_packed; int *count_parts_self = &pack_vars->count_parts; /* This re-arranges the particle data from cell->hydro->parts into a long array of part structs*/ @@ -173,8 +171,6 @@ double runner_doself1_pack_f4(struct runner *r, struct scheduler *s, struct pack // pack_vars->task_first_part[tasks_packed] = pack_vars->count_parts; // d_task_first_part_self_dens_f4[tasks_packed].x = pack_vars->count_parts; task_first_part_f4[tasks_packed].x = pack_vars->count_parts; - //// /* id for the task*/ - const int tid_d = tasks_packed; int *count_parts_self = &pack_vars->count_parts; /* This re-arranges the particle data from cell->hydro->parts into a long array of part structs*/ @@ -208,7 +204,7 @@ double runner_doself1_pack_f4(struct runner *r, struct scheduler *s, struct pack clock_gettime(CLOCK_REALTIME, &t1); /* Release the lock on the cell */ // task_unlock(t); - cell_unlocktree(ci); +// cell_unlocktree(ci); return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; @@ -232,8 +228,6 @@ void runner_doself1_pack_g(struct runner *r, struct scheduler *s, struct pack_va pack_vars->cell_list[tasks_packed] = ci; // /* Identify row in particle arrays where this task starts*/ pack_vars->task_first_part[tasks_packed] = pack_vars->count_parts; - //// /* id for the task*/ - const int tid_d = tasks_packed; int *count_parts_self = &pack_vars->count_parts; /* This re-arranges the particle data from cell->hydro->parts into a long array of part structs*/ @@ -289,8 +283,6 @@ double runner_doself1_pack_f4_g(struct runner *r, struct scheduler *s, struct pa // /* Identify row in particle arrays where this task starts*/ // pack_vars->task_first_part[tasks_packed] = pack_vars->count_parts; task_first_part_f4[tasks_packed].x = pack_vars->count_parts; - //// /* id for the task*/ - const int tid_d = tasks_packed; int *count_parts_self = &pack_vars->count_parts; /* This re-arranges the particle data from cell->hydro->parts into a long array of part structs*/ @@ -346,8 +338,6 @@ void runner_doself1_pack_f(struct runner *r, struct scheduler *s, struct pack_va pack_vars->cell_list[tasks_packed] = ci; // /* Identify row in particle arrays where this task starts*/ pack_vars->task_first_part[tasks_packed] = pack_vars->count_parts; - //// /* id for the task*/ - const int tid_d = tasks_packed; int *count_parts_self = &pack_vars->count_parts; /* This re-arranges the particle data from cell->hydro->parts into a long array of part structs*/ @@ -402,8 +392,6 @@ double runner_doself1_pack_f4_f(struct runner *r, struct scheduler *s, struct pa // /* Identify row in particle arrays where this task starts*/ // pack_vars->task_first_part[tasks_packed] = pack_vars->count_parts; task_first_part_f4[tasks_packed].x = pack_vars->count_parts; - 
//// /* id for the task*/ - const int tid_d = tasks_packed; int *count_parts_self = &pack_vars->count_parts; /* This re-arranges the particle data from cell->hydro->parts into a long array of part structs*/ @@ -450,21 +438,16 @@ void runner_dopair1_pack(struct runner *r, struct scheduler *s, struct pack_vars struct timespec t0, t1; // clock_gettime(CLOCK_REALTIME, &t0); int tasks_packed = pack_vars->tasks_packed; - const int tid_tmp = 2 * tasks_packed; double x_tmp = 0.0, y_tmp = 0.0, z_tmp = 0.0; /*Get the shifts in case of periodics*/ - const int sid = space_getsid_GPU(e->s, &ci, &cj, &x_tmp, &y_tmp, &z_tmp); + space_getsid_GPU(e->s, &ci, &cj, &x_tmp, &y_tmp, &z_tmp); /*Get pointers to the list of tasks and cells packed*/ pack_vars->task_list[tasks_packed] = t; pack_vars->ci_list[tasks_packed] = ci; pack_vars->cj_list[tasks_packed] = cj; - const double cjx = cj->loc[0]; - const double cjy = cj->loc[1]; - const double cjz = cj->loc[2]; - float3 shift_tmp = {x_tmp, y_tmp, z_tmp}; const int count_ci = ci->hydro.count; @@ -530,8 +513,8 @@ void runner_dopair1_pack(struct runner *r, struct scheduler *s, struct pack_vars cell_unlocktree(cj); } -double runner_dopair1_pack_f4(struct runner *r, struct scheduler *s, struct pack_vars_pair * restrict pack_vars, struct cell * restrict ci, - struct cell * restrict cj, struct task *t, struct part_aos_f4_send *parts_send, struct engine *e, int4 *fparti_fpartj_lparti_lpartj){ +double runner_dopair1_pack_f4(struct runner *r, struct scheduler *s, struct pack_vars_pair * restrict pack_vars, struct cell * ci, + struct cell * cj, struct task *t, struct part_aos_f4_send *parts_send, struct engine *e, int4 *fparti_fpartj_lparti_lpartj){ /* Timers for how long this all takes. * t0 and t1 are from start to finish including GPU calcs @@ -539,11 +522,10 @@ double runner_dopair1_pack_f4(struct runner *r, struct scheduler *s, struct pack struct timespec t0, t1; // clock_gettime(CLOCK_REALTIME, &t0); int tasks_packed = pack_vars->tasks_packed; - const int tid_tmp = 2 * tasks_packed; double x_tmp = 0.0, y_tmp = 0.0, z_tmp = 0.0; /*Get the shifts in case of periodics*/ - const int sid = space_getsid_GPU(e->s, &ci, &cj, &x_tmp, &y_tmp, &z_tmp); + space_getsid_GPU(e->s, &ci, &cj, &x_tmp, &y_tmp, &z_tmp); /*Get pointers to the list of tasks and cells packed*/ pack_vars->task_list[tasks_packed] = t; @@ -558,8 +540,6 @@ double runner_dopair1_pack_f4(struct runner *r, struct scheduler *s, struct pack /*Assign an id for this task*/ const int tid = tasks_packed; - /*Indexing increment per task is 2 fot these arrays*/ - const int packed_tmp = tasks_packed * 2; /* Find first parts in task for ci and cj. Packed_tmp is index for cell i. packed_tmp+1 is index for cell j */ // pack_vars->task_first_part[packed_tmp] = pack_vars->count_parts; @@ -639,7 +619,7 @@ void runner_dopair1_pack_g(struct runner *r, struct scheduler *s, struct pack_va double x_tmp = 0.0, y_tmp = 0.0, z_tmp = 0.0; /*Get the shifts in case of periodics*/ - const int sid = space_getsid_GPU(e->s, &ci, &cj, &x_tmp, &y_tmp, &z_tmp); + space_getsid_GPU(e->s, &ci, &cj, &x_tmp, &y_tmp, &z_tmp); /*Get pointers to the list of tasks and cells packed*/ pack_vars->task_list[tasks_packed] = t; @@ -654,7 +634,7 @@ void runner_dopair1_pack_g(struct runner *r, struct scheduler *s, struct pack_va pack_vars->shiftx[tid_tmp] = x_tmp + cjx; pack_vars->shifty[tid_tmp] = y_tmp + cjy; pack_vars->shiftz[tid_tmp] = z_tmp + cjz; - /* /*Shift for cell j is it's position. 
Stored using strided indexing (stride of two per task)*/ + /*Shift for cell j is it's position. Stored using strided indexing (stride of two per task)*/ pack_vars->shiftx[tid_tmp + 1] = cjx; pack_vars->shifty[tid_tmp + 1] = cjy; pack_vars->shiftz[tid_tmp + 1] = cjz; @@ -713,8 +693,8 @@ void runner_dopair1_pack_g(struct runner *r, struct scheduler *s, struct pack_va } -double runner_dopair1_pack_f4_g(struct runner *r, struct scheduler *s, struct pack_vars_pair * restrict pack_vars, struct cell * restrict ci, - struct cell * restrict cj, struct task *t, struct part_aos_f4_g_send *parts_send, struct engine *e, int4 *fparti_fpartj_lparti_lpartj){ +double runner_dopair1_pack_f4_g(struct runner *r, struct scheduler *s, struct pack_vars_pair * restrict pack_vars, struct cell * ci, + struct cell * cj, struct task *t, struct part_aos_f4_g_send *parts_send, struct engine *e, int4 *fparti_fpartj_lparti_lpartj){ /* Timers for how long this all takes. * t0 and t1 are from start to finish including GPU calcs @@ -722,11 +702,10 @@ double runner_dopair1_pack_f4_g(struct runner *r, struct scheduler *s, struct pa struct timespec t0, t1; // clock_gettime(CLOCK_REALTIME, &t0); int tasks_packed = pack_vars->tasks_packed; - const int tid_tmp = 2 * tasks_packed; double x_tmp = 0.0, y_tmp = 0.0, z_tmp = 0.0; /*Get the shifts in case of periodics*/ - const int sid = space_getsid_GPU(e->s, &ci, &cj, &x_tmp, &y_tmp, &z_tmp); + space_getsid_GPU(e->s, &ci, &cj, &x_tmp, &y_tmp, &z_tmp); /*Get pointers to the list of tasks and cells packed*/ pack_vars->task_list[tasks_packed] = t; @@ -741,8 +720,6 @@ double runner_dopair1_pack_f4_g(struct runner *r, struct scheduler *s, struct pa /*Assign an id for this task*/ const int tid = tasks_packed; - /*Indexing increment per task is 2 fot these arrays*/ - const int packed_tmp = tasks_packed * 2; /* Find first parts in task for ci and cj. Packed_tmp is index for cell i. packed_tmp+1 is index for cell j */ // pack_vars->task_first_part[packed_tmp] = pack_vars->count_parts; @@ -823,7 +800,7 @@ void runner_dopair1_pack_f(struct runner *r, struct scheduler *s, struct pack_va double x_tmp = 0.0, y_tmp = 0.0, z_tmp = 0.0; /*Get the shifts in case of periodics*/ - const int sid = space_getsid_GPU(e->s, &ci, &cj, &x_tmp, &y_tmp, &z_tmp); + space_getsid_GPU(e->s, &ci, &cj, &x_tmp, &y_tmp, &z_tmp); /*Get pointers to the list of tasks and cells packed*/ pack_vars->task_list[tasks_packed] = t; @@ -838,7 +815,7 @@ void runner_dopair1_pack_f(struct runner *r, struct scheduler *s, struct pack_va pack_vars->shiftx[tid_tmp] = x_tmp + cjx; pack_vars->shifty[tid_tmp] = y_tmp + cjy; pack_vars->shiftz[tid_tmp] = z_tmp + cjz; - /* /*Shift for cell j is it's position. Stored using strided indexing (stride of two per task)*/ + /*Shift for cell j is it's position. 
Stored using strided indexing (stride of two per task)*/ pack_vars->shiftx[tid_tmp + 1] = cjx; pack_vars->shifty[tid_tmp + 1] = cjy; pack_vars->shiftz[tid_tmp + 1] = cjz; @@ -896,8 +873,8 @@ void runner_dopair1_pack_f(struct runner *r, struct scheduler *s, struct pack_va (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; } -double runner_dopair1_pack_f4_f(struct runner *r, struct scheduler *s, struct pack_vars_pair * restrict pack_vars, struct cell * restrict ci, - struct cell * restrict cj, struct task *t, struct part_aos_f4_f_send *parts_send, struct engine *e, int4 *fparti_fpartj_lparti_lpartj){ +double runner_dopair1_pack_f4_f(struct runner *r, struct scheduler *s, struct pack_vars_pair * restrict pack_vars, struct cell * ci, + struct cell * cj, struct task *t, struct part_aos_f4_f_send *parts_send, struct engine *e, int4 *fparti_fpartj_lparti_lpartj){ /* Timers for how long this all takes. * t0 and t1 are from start to finish including GPU calcs @@ -905,11 +882,10 @@ double runner_dopair1_pack_f4_f(struct runner *r, struct scheduler *s, struct pa struct timespec t0, t1; // clock_gettime(CLOCK_REALTIME, &t0); int tasks_packed = pack_vars->tasks_packed; - const int tid_tmp = 2 * tasks_packed; double x_tmp = 0.0, y_tmp = 0.0, z_tmp = 0.0; /*Get the shifts in case of periodics*/ - const int sid = space_getsid_GPU(e->s, &ci, &cj, &x_tmp, &y_tmp, &z_tmp); + space_getsid_GPU(e->s, &ci, &cj, &x_tmp, &y_tmp, &z_tmp); /*Get pointers to the list of tasks and cells packed*/ pack_vars->task_list[tasks_packed] = t; @@ -924,8 +900,6 @@ double runner_dopair1_pack_f4_f(struct runner *r, struct scheduler *s, struct pa /*Assign an id for this task*/ const int tid = tasks_packed; - /*Indexing increment per task is 2 fot these arrays*/ - const int packed_tmp = tasks_packed * 2; /* Find first parts in task for ci and cj. Packed_tmp is index for cell i. 
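/*
 * A sketch of the shift layout used by the pair pack functions above: two
 * slots per pair task, written at stride 2*tid. Slot 2*tid holds the
 * periodic wrap shift plus cell j's position and slot 2*tid+1 holds cell
 * j's position alone, mirroring the assignments in these hunks.
 * store_pair_shifts() is a hypothetical helper.
 */
static void store_pair_shifts(struct pack_vars_pair *pv, const int tid,
                              const double sx, const double sy, const double sz,
                              const double cj_loc[3]) {
  const int s = 2 * tid;
  pv->shiftx[s] = sx + cj_loc[0]; /* slot for cell i: wrap shift + cj position */
  pv->shifty[s] = sy + cj_loc[1];
  pv->shiftz[s] = sz + cj_loc[2];
  pv->shiftx[s + 1] = cj_loc[0];  /* slot for cell j: cj position only */
  pv->shifty[s + 1] = cj_loc[1];
  pv->shiftz[s + 1] = cj_loc[2];
}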
packed_tmp+1 is index for cell j */ // pack_vars->task_first_part[packed_tmp] = pack_vars->count_parts; @@ -1038,8 +1012,6 @@ void runner_doself1_launch(struct runner *r, struct scheduler *s, struct pack_va max_parts = 0; int parts_in_bundle = 0; - const int first_task = bid * bundle_size; - int last_task = (bid + 1) * bundle_size; for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { if (tid < tasks_packed) { @@ -1048,10 +1020,8 @@ void runner_doself1_launch(struct runner *r, struct scheduler *s, struct pack_va int count = pack_vars->task_last_part[tid] - pack_vars->task_first_part[tid]; parts_in_bundle += count; max_parts = max(max_parts, count); - last_task = tid; } } - const int n_tasks = last_task - first_task; const int first_part_tmp = pack_vars->bundle_first_part[bid]; const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp; @@ -1071,8 +1041,6 @@ void runner_doself1_launch(struct runner *r, struct scheduler *s, struct pack_va // } //#endif const int tasksperbundle = pack_vars->tasksperbundle; - int tid = 0; - int offset = bid * tasksperbundle; int tasks_left = tasksperbundle; if (bid == nBundles_temp - 1) { tasks_left = @@ -1082,7 +1050,6 @@ void runner_doself1_launch(struct runner *r, struct scheduler *s, struct pack_va // the y dimension and max_parts is the x dimension int numBlocks_y = tasks_left; int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; - int bundle_part_0 = pack_vars->bundle_first_part[bid]; int bundle_first_task = pack_vars->bundle_first_task_list[bid]; const char *loop_type = "density"; // fprintf(stderr, "Launching kernel with %i tasks leftovers %i\n", @@ -1170,7 +1137,7 @@ void runner_doself1_launch_f4(struct runner *r, struct scheduler *s, struct pack struct engine *e, double *packing_time, double *gpu_time, double *unpack_time, int2 * d_task_first_part_self_dens_f4, int devId, int2 * task_first_part_f4, int2 * d_task_first_part_f4, cudaEvent_t * self_end){ - struct timespec t0, t1, t0hmemcpy, t1hmemcpy, tp0, tp1; // + struct timespec t0, t1, tp0, tp1; // clock_gettime(CLOCK_REALTIME, &t0); /* Identify the number of GPU bundles to run in ideal case*/ @@ -1269,8 +1236,6 @@ void runner_doself1_launch_f4(struct runner *r, struct scheduler *s, struct pack // } //#endif const int tasksperbundle = pack_vars->tasksperbundle; - int tid = 0; - int offset = bid * tasksperbundle; int tasks_left = tasksperbundle; if (bid == nBundles_temp - 1) { tasks_left = @@ -1280,7 +1245,6 @@ void runner_doself1_launch_f4(struct runner *r, struct scheduler *s, struct pack // the y dimension and max_parts is the x dimension int numBlocks_y = tasks_left; int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; - int bundle_part_0 = pack_vars->bundle_first_part[bid]; int bundle_first_task = pack_vars->bundle_first_task_list[bid]; // const char *loop_type = "density"; // struct first_part first_parts; @@ -1338,7 +1302,6 @@ void runner_doself1_launch_f4(struct runner *r, struct scheduler *s, struct pack /*Time unpacking*/ // clock_gettime(CLOCK_REALTIME, &tp0); - int bundle_first_task = pack_vars->bundle_first_task_list[bid]; for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { @@ -1447,8 +1410,6 @@ void runner_doself1_launch_g(struct runner *r, struct scheduler *s, struct pack_ max_parts = 0; int parts_in_bundle = 0; - const int first_task = bid * bundle_size; - int last_task = (bid + 1) * bundle_size; for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { if (tid < tasks_packed) { @@ -1457,10 
+1418,8 @@ void runner_doself1_launch_g(struct runner *r, struct scheduler *s, struct pack_ int count = pack_vars->task_last_part[tid] - pack_vars->task_first_part[tid]; parts_in_bundle += count; max_parts = max(max_parts, count); - last_task = tid; } } - const int n_tasks = last_task - first_task; const int first_part_tmp = pack_vars->bundle_first_part[bid]; const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp; @@ -1480,8 +1439,6 @@ void runner_doself1_launch_g(struct runner *r, struct scheduler *s, struct pack_ } #endif const int tasksperbundle = pack_vars->tasksperbundle; - int tid = 0; - int offset = bid * tasksperbundle; int tasks_left = tasksperbundle; if (bid == nBundles_temp - 1) { tasks_left = @@ -1491,7 +1448,6 @@ void runner_doself1_launch_g(struct runner *r, struct scheduler *s, struct pack_ // the y dimension and max_parts is the x dimension int numBlocks_y = tasks_left; int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; - int bundle_part_0 = pack_vars->bundle_first_part[bid]; int bundle_first_task = pack_vars->bundle_first_task_list[bid]; const char *loop_type = "density"; // Launch the kernel @@ -1579,7 +1535,7 @@ void runner_doself1_launch_f4_g(struct runner *r, struct scheduler *s, struct pa cudaEvent_t *self_end, double *unpack_time){ - struct timespec t0, t1, t0hmemcpy, t1hmemcpy, tp0, tp1; + struct timespec t0, t1, tp0, tp1; clock_gettime(CLOCK_REALTIME, &t0); /* Identify the number of GPU bundles to run in ideal case*/ @@ -1626,7 +1582,6 @@ void runner_doself1_launch_f4_g(struct runner *r, struct scheduler *s, struct pa last_task = tid; } } - const int n_tasks = last_task - first_task; const int first_part_tmp = pack_vars->bundle_first_part[bid]; const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp; @@ -1650,8 +1605,6 @@ void runner_doself1_launch_f4_g(struct runner *r, struct scheduler *s, struct pa } #endif const int tasksperbundle = pack_vars->tasksperbundle; - int tid = 0; - int offset = bid * tasksperbundle; int tasks_left = tasksperbundle; if (bid == nBundles_temp - 1) { tasks_left = @@ -1661,7 +1614,6 @@ void runner_doself1_launch_f4_g(struct runner *r, struct scheduler *s, struct pa // the y dimension and max_parts is the x dimension int numBlocks_y = tasks_left; int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; - int bundle_part_0 = pack_vars->bundle_first_part[bid]; int bundle_first_task = pack_vars->bundle_first_task_list[bid]; // const char *loop_type = "density"; // Launch the kernel @@ -1717,7 +1669,6 @@ void runner_doself1_launch_f4_g(struct runner *r, struct scheduler *s, struct pa /*Time unpacking*/ // clock_gettime(CLOCK_REALTIME, &tp0); - int bundle_first_task = pack_vars->bundle_first_task_list[bid]; for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { @@ -1822,8 +1773,6 @@ void runner_doself1_launch_f(struct runner *r, struct scheduler *s, struct pack_ max_parts = 0; int parts_in_bundle = 0; - const int first_task = bid * bundle_size; - int last_task = (bid + 1) * bundle_size; for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { if (tid < tasks_packed) { @@ -1832,10 +1781,8 @@ void runner_doself1_launch_f(struct runner *r, struct scheduler *s, struct pack_ int count = pack_vars->task_last_part[tid] - pack_vars->task_first_part[tid]; parts_in_bundle += count; max_parts = max(max_parts, count); - last_task = tid; } } - const int n_tasks = last_task - first_task; const int first_part_tmp = pack_vars->bundle_first_part[bid]; const int 
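/*
 * Grid shape used by the self-interaction launches above: one block-row
 * per task in the bundle (y dimension) and enough BLOCK_SIZE-wide blocks
 * to cover the largest cell in the bundle (x dimension). A sketch of the
 * equivalent CUDA launch; the kernel name and argument list below are
 * placeholders, not functions defined by this patch.
 */
      dim3 numBlocks((max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE, tasks_left);
      dim3 threadsPerBlock(BLOCK_SIZE);
      /* Inside the kernel, blockIdx.y picks the task (offset by
       * bundle_first_task) and blockIdx.x * BLOCK_SIZE + threadIdx.x picks
       * the particle within that task's cell. */
      // example_self_kernel<<<numBlocks, threadsPerBlock, 0, stream[bid]>>>(
      //     d_parts_send, d_parts_recv, bundle_first_task, d_a, d_H);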
bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp; @@ -1855,8 +1802,6 @@ void runner_doself1_launch_f(struct runner *r, struct scheduler *s, struct pack_ } #endif const int tasksperbundle = pack_vars->tasksperbundle; - int tid = 0; - int offset = bid * tasksperbundle; int tasks_left = tasksperbundle; if (bid == nBundles_temp - 1) { tasks_left = @@ -1866,7 +1811,6 @@ void runner_doself1_launch_f(struct runner *r, struct scheduler *s, struct pack_ // the y dimension and max_parts is the x dimension int numBlocks_y = tasks_left; int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; - int bundle_part_0 = pack_vars->bundle_first_part[bid]; int bundle_first_task = pack_vars->bundle_first_task_list[bid]; const char *loop_type = "density"; // Launch the kernel @@ -2013,7 +1957,6 @@ void runner_doself1_launch_f4_f(struct runner *r, struct scheduler *s, struct pa last_task = tid; } } - const int n_tasks = last_task - first_task; const int first_part_tmp = pack_vars->bundle_first_part[bid]; const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp; @@ -2035,8 +1978,6 @@ void runner_doself1_launch_f4_f(struct runner *r, struct scheduler *s, struct pa } #endif const int tasksperbundle = pack_vars->tasksperbundle; - int tid = 0; - int offset = bid * tasksperbundle; int tasks_left = tasksperbundle; if (bid == nBundles_temp - 1) { tasks_left = @@ -2046,9 +1987,7 @@ void runner_doself1_launch_f4_f(struct runner *r, struct scheduler *s, struct pa // the y dimension and max_parts is the x dimension int numBlocks_y = tasks_left; int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; - int bundle_part_0 = pack_vars->bundle_first_part[bid]; int bundle_first_task = pack_vars->bundle_first_task_list[bid]; - const char *loop_type = "density"; // Launch the kernel launch_force_aos_f4( d_parts_send, d_parts_recv, d_a, d_H, stream[bid], numBlocks_x, numBlocks_y, bundle_first_task, @@ -2102,7 +2041,6 @@ void runner_doself1_launch_f4_f(struct runner *r, struct scheduler *s, struct pa /*Time unpacking*/ // clock_gettime(CLOCK_REALTIME, &tp0); - int bundle_first_task = pack_vars->bundle_first_task_list[bid]; for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { @@ -2195,14 +2133,12 @@ void runner_dopair1_launch(struct runner *r, struct scheduler *s, struct pack_va /* Launch the copies for each bundle and run the GPU kernel */ /*We don't go into this loop if tasks_left_self == 1 as nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ - int max_parts = 0; for (int bid = 0; bid < nBundles_temp; bid++) { int max_parts_i = 0; int max_parts_j = 0; int parts_in_bundle_ci = 0; int parts_in_bundle_cj = 0; - const int first_task = bid * pack_vars->bundle_size; for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { if (tid < tasks_packed) { @@ -2248,10 +2184,7 @@ void runner_dopair1_launch(struct runner *r, struct scheduler *s, struct pack_va // Setup 2d grid of GPU thread blocks for ci (number of tasks is // the y dimension and max_parts is the x dimension int numBlocks_y = tasks_left; - int numBlocks_x = (max_parts_i + BLOCK_SIZE - 1) / BLOCK_SIZE; - int bundle_part_0 = pack_vars->bundle_first_part[bid]; int bundle_first_task = pack_vars->bundle_first_task_list[bid]; -// fprintf(stderr, "bundle_part_0 %i bundle_first_task %i\n", bundle_part_0, bundle_first_task); const char *loop_type = "density"; /* Launch the kernel for ci using data for ci and cj */ @@ -2367,9 +2300,6 @@ void runner_dopair1_launch_f4(struct runner *r, struct scheduler *s, 
struct pack /*How many tasks should be in a bundle?*/ const int bundle_size = pack_vars->bundle_size; - /*tasks-packed needs decrementing before calculating packed_tmp as it was incremented in runner_dopair1_pack*/ - const int packed_tmp = 2 * (tasks_packed - 1); - /* Special case for incomplete bundles (when having leftover tasks not enough to fill a bundle) */ if (pack_vars->launch_leftovers) { nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; @@ -2388,7 +2318,6 @@ void runner_dopair1_launch_f4(struct runner *r, struct scheduler *s, struct pack /* Launch the copies for each bundle and run the GPU kernel */ /*We don't go into this loop if tasks_left_self == 1 as nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ - int max_parts = 0; for (int bid = 0; bid < nBundles_temp; bid++) { int max_parts_i = 0; @@ -2402,7 +2331,6 @@ void runner_dopair1_launch_f4(struct runner *r, struct scheduler *s, struct pack if (tid < tasks_packed) { /*Get an estimate for the max number of parts per cell in each bundle. * Used for determining the number of GPU CUDA blocks*/ - const int tid_tmp = 2 * tid; int count_i = fparti_fpartj_lparti_lpartj_dens[tid].z - fparti_fpartj_lparti_lpartj_dens[tid].x; parts_in_bundle_ci += count_i; @@ -2437,8 +2365,6 @@ void runner_dopair1_launch_f4(struct runner *r, struct scheduler *s, struct pack const int tasksperbundle = pack_vars->tasksperbundle; /* LAUNCH THE GPU KERNELS for ci & cj */ - int tid = 0; - int offset = bid * tasksperbundle; int tasks_left = tasksperbundle; if (bid == nBundles_temp - 1) { tasks_left = @@ -2449,10 +2375,7 @@ void runner_dopair1_launch_f4(struct runner *r, struct scheduler *s, struct pack // the y dimension and max_parts is the x dimension int numBlocks_y = tasks_left; int numBlocks_x = (max_parts_i + BLOCK_SIZE - 1) / BLOCK_SIZE; - int bundle_part_0 = pack_vars->bundle_first_part[bid]; int bundle_first_task = pack_vars->bundle_first_task_list[bid]; -// fprintf(stderr, "bundle_part_0 %i bundle_first_task %i\n", bundle_part_0, bundle_first_task); - const char *loop_type = "density"; /* Launch the kernel for ci using data for ci and cj */ runner_dopairci_branch_density_gpu_aos_f4(d_parts_send, d_parts_recv, @@ -2563,9 +2486,6 @@ void runner_dopair1_launch_f4_one_memcpy(struct runner *r, struct scheduler *s, /*How many tasks should be in a bundle?*/ const int bundle_size = pack_vars->bundle_size; - /*tasks-packed needs decrementing before calculating packed_tmp as it was incremented in runner_dopair1_pack*/ - const int packed_tmp = 2 * (tasks_packed - 1); - /* Special case for incomplete bundles (when having leftover tasks not enough to fill a bundle) */ if (pack_vars->launch_leftovers) { nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; @@ -2584,21 +2504,17 @@ void runner_dopair1_launch_f4_one_memcpy(struct runner *r, struct scheduler *s, /* Launch the copies for each bundle and run the GPU kernel */ /*We don't go into this loop if tasks_left_self == 1 as nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ - int max_parts = 0; for (int bid = 0; bid < nBundles_temp; bid++) { int max_parts_i = 0; int max_parts_j = 0; int parts_in_bundle_ci = 0; int parts_in_bundle_cj = 0; - const int first_task = bid * pack_vars->bundle_size; - int last_task = (bid + 1) * bundle_size; for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { if (tid < tasks_packed) { /*Get an estimate for the max number of parts per cell in each bundle. 
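Several of the launch hunks above compute the same 2-D grid: the packed tasks of a bundle index blockIdx.y, while blocks of BLOCK_SIZE threads along x cover up to max_parts particles per task. The following is a self-contained sketch of that geometry only; the kernel name, its argument list and the BLOCK_SIZE value are illustrative, and the real kernels in the patch (launch_force_aos_f4, runner_dopairci_branch_density_gpu_aos_f4 and friends) take many more arguments.

#include <cuda_runtime.h>

/* Illustrative kernel: blockIdx.y selects a packed task of the bundle and the
 * x-direction threads cover its particles. */
__global__ void example_density_kernel(int bundle_first_task, int max_parts) {
  const int task = bundle_first_task + blockIdx.y;        /* which packed task */
  const int pid = blockIdx.x * blockDim.x + threadIdx.x;  /* particle slot */
  if (pid >= max_parts) return;
  (void)task; /* ... interact particle pid of this task ... */
}

static void launch_bundle_sketch(int tasks_left, int max_parts,
                                 int bundle_first_task, cudaStream_t stream) {
  const int BLOCK_SIZE = 128; /* assumed value for the sketch */
  /* Round up so every particle gets a thread: ceil(max_parts / BLOCK_SIZE). */
  const int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE;
  const int numBlocks_y = tasks_left; /* one row of blocks per task in the bundle */
  dim3 grid(numBlocks_x, numBlocks_y, 1);
  dim3 block(BLOCK_SIZE, 1, 1);
  example_density_kernel<<<grid, block, 0, stream>>>(bundle_first_task, max_parts);
}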
* Used for determining the number of GPU CUDA blocks*/ - const int tid_tmp = 2 * tid; int count_i = fparti_fpartj_lparti_lpartj_dens[tid].z - fparti_fpartj_lparti_lpartj_dens[tid].x; parts_in_bundle_ci += count_i; @@ -2607,8 +2523,6 @@ void runner_dopair1_launch_f4_one_memcpy(struct runner *r, struct scheduler *s, - fparti_fpartj_lparti_lpartj_dens[tid].y; parts_in_bundle_cj += count_j; max_parts_j = max(max_parts_j, count_j); - - last_task = tid; } } const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; @@ -2627,26 +2541,12 @@ void runner_dopair1_launch_f4_one_memcpy(struct runner *r, struct scheduler *s, error("Something's up with your cuda code"); } #endif - - const int tasksperbundle = pack_vars->tasksperbundle; /* LAUNCH THE GPU KERNELS for ci & cj */ - int tid = 0; - int offset = bid * tasksperbundle; - int tasks_left = tasksperbundle; - if (bid == nBundles_temp - 1) { - tasks_left = - tasks_packed - (nBundles_temp - 1) * tasksperbundle; - } - // Setup 2d grid of GPU thread blocks for ci (number of tasks is // the y dimension and max_parts is the x dimension int numBlocks_y = 0;//tasks_left; int numBlocks_x = (bundle_n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; int bundle_part_0 = pack_vars->bundle_first_part[bid]; - int bundle_first_task = pack_vars->bundle_first_task_list[bid]; -// fprintf(stderr, "bundle_part_0 %i bundle_first_task %i\n", bundle_part_0, bundle_first_task); - const char *loop_type = "density"; - /* Launch the kernel for ci using data for ci and cj */ runner_dopair_branch_density_gpu_aos_f4(d_parts_send, d_parts_recv, d_a, d_H, stream[bid], numBlocks_x, numBlocks_y, bundle_part_0, bundle_n_parts); @@ -2702,7 +2602,6 @@ void runner_dopair1_launch_f4_one_memcpy(struct runner *r, struct scheduler *s, /*Time unpacking*/ // clock_gettime(CLOCK_REALTIME, &tp0); - int bundle_first_task = pack_vars->bundle_first_task_list[bid]; for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { @@ -2778,7 +2677,7 @@ void runner_dopair1_launch_f4_mcpy_Ker_mcpy(struct runner *r, struct scheduler * const int bundle_size = pack_vars->bundle_size; /*tasks-packed needs decrementing before calculating packed_tmp as it was incremented in runner_dopair1_pack*/ - const int packed_tmp = 2 * (tasks_packed - 1); +// const int packed_tmp = 2 * (tasks_packed - 1); /* Special case for incomplete bundles (when having leftover tasks not enough to fill a bundle) */ if (pack_vars->launch_leftovers) { @@ -2798,21 +2697,18 @@ void runner_dopair1_launch_f4_mcpy_Ker_mcpy(struct runner *r, struct scheduler * /* Launch the copies for each bundle and run the GPU kernel */ /*We don't go into this loop if tasks_left_self == 1 as nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ - int max_parts = 0; +// int max_parts = 0; for (int bid = 0; bid < nBundles_temp; bid++) { int max_parts_i = 0; int max_parts_j = 0; int parts_in_bundle_ci = 0; int parts_in_bundle_cj = 0; - const int first_task = bid * pack_vars->bundle_size; - int last_task = (bid + 1) * bundle_size; for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { if (tid < tasks_packed) { /*Get an estimate for the max number of parts per cell in each bundle. 
* Used for determining the number of GPU CUDA blocks*/ - const int tid_tmp = 2 * tid; int count_i = fparti_fpartj_lparti_lpartj_dens[tid].z - fparti_fpartj_lparti_lpartj_dens[tid].x; parts_in_bundle_ci += count_i; @@ -2821,8 +2717,6 @@ void runner_dopair1_launch_f4_mcpy_Ker_mcpy(struct runner *r, struct scheduler * - fparti_fpartj_lparti_lpartj_dens[tid].y; parts_in_bundle_cj += count_j; max_parts_j = max(max_parts_j, count_j); - - last_task = tid; } } const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; @@ -2848,14 +2742,11 @@ void runner_dopair1_launch_f4_mcpy_Ker_mcpy(struct runner *r, struct scheduler * int max_parts_j = 0; int parts_in_bundle_ci = 0; int parts_in_bundle_cj = 0; - const int first_task = bid * pack_vars->bundle_size; - int last_task = (bid + 1) * bundle_size; for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { if (tid < tasks_packed) { /*Get an estimate for the max number of parts per cell in each bundle. * Used for determining the number of GPU CUDA blocks*/ - const int tid_tmp = 2 * tid; int count_i = fparti_fpartj_lparti_lpartj_dens[tid].z - fparti_fpartj_lparti_lpartj_dens[tid].x; parts_in_bundle_ci += count_i; @@ -2864,31 +2755,20 @@ void runner_dopair1_launch_f4_mcpy_Ker_mcpy(struct runner *r, struct scheduler * - fparti_fpartj_lparti_lpartj_dens[tid].y; parts_in_bundle_cj += count_j; max_parts_j = max(max_parts_j, count_j); - - last_task = tid; } } const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp_i; ////////////////////////////////// - const int tasksperbundle = pack_vars->tasksperbundle; +// const int tasksperbundle = pack_vars->tasksperbundle; /* LAUNCH THE GPU KERNELS for ci & cj */ - int tid = 0; - int offset = bid * tasksperbundle; - int tasks_left = tasksperbundle; - if (bid == nBundles_temp - 1) { - tasks_left = - tasks_packed - (nBundles_temp - 1) * tasksperbundle; - } - // Setup 2d grid of GPU thread blocks for ci (number of tasks is // the y dimension and max_parts is the x dimension int numBlocks_y = 0;//tasks_left; int numBlocks_x = (bundle_n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; int bundle_part_0 = pack_vars->bundle_first_part[bid]; - int bundle_first_task = pack_vars->bundle_first_task_list[bid]; +// int bundle_first_task = pack_vars->bundle_first_task_list[bid]; // fprintf(stderr, "bundle_part_0 %i bundle_first_task %i\n", bundle_part_0, bundle_first_task); - const char *loop_type = "density"; /* Launch the kernel for ci using data for ci and cj */ runner_dopair_branch_density_gpu_aos_f4(d_parts_send, d_parts_recv, @@ -2912,14 +2792,11 @@ void runner_dopair1_launch_f4_mcpy_Ker_mcpy(struct runner *r, struct scheduler * int max_parts_j = 0; int parts_in_bundle_ci = 0; int parts_in_bundle_cj = 0; - const int first_task = bid * pack_vars->bundle_size; - int last_task = (bid + 1) * bundle_size; for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { if (tid < tasks_packed) { /*Get an estimate for the max number of parts per cell in each bundle. 
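The pair routines carry each task's particle ranges in an int4 named fparti_fpartj_lparti_lpartj_*. Judging by the name and the subtractions above, .x/.y appear to hold the first particle index of ci and cj and .z/.w the corresponding one-past-last indices, so .z - .x and .w - .y give the two counts. A short sketch of that convention (helper names are illustrative, not part of the patch):

#include <cuda_runtime.h> /* int4 / make_int4 */

/* Pack one pair task's particle ranges the way the fparti_fpartj_lparti_lpartj
 * arrays appear to be laid out: x/y = first particle of ci/cj, z/w = one past
 * the last particle of ci/cj. */
static int4 pack_pair_ranges_sketch(int first_i, int first_j, int end_i, int end_j) {
  return make_int4(first_i, first_j, end_i, end_j);
}

/* Per-bundle reduction mirroring the loops above: total and maximum particle
 * counts for ci and cj over the tasks [first_tid, last_tid). */
static void bundle_counts_sketch(const int4 *ranges, int first_tid, int last_tid,
                                 int *parts_ci, int *parts_cj,
                                 int *max_parts_i, int *max_parts_j) {
  *parts_ci = *parts_cj = *max_parts_i = *max_parts_j = 0;
  for (int tid = first_tid; tid < last_tid; tid++) {
    const int count_i = ranges[tid].z - ranges[tid].x;
    const int count_j = ranges[tid].w - ranges[tid].y;
    *parts_ci += count_i;
    *parts_cj += count_j;
    if (count_i > *max_parts_i) *max_parts_i = count_i;
    if (count_j > *max_parts_j) *max_parts_j = count_j;
  }
}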
* Used for determining the number of GPU CUDA blocks*/ - const int tid_tmp = 2 * tid; int count_i = fparti_fpartj_lparti_lpartj_dens[tid].z - fparti_fpartj_lparti_lpartj_dens[tid].x; parts_in_bundle_ci += count_i; @@ -2928,8 +2805,6 @@ void runner_dopair1_launch_f4_mcpy_Ker_mcpy(struct runner *r, struct scheduler * - fparti_fpartj_lparti_lpartj_dens[tid].y; parts_in_bundle_cj += count_j; max_parts_j = max(max_parts_j, count_j); - - last_task = tid; } } const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; @@ -2976,7 +2851,7 @@ void runner_dopair1_launch_f4_mcpy_Ker_mcpy(struct runner *r, struct scheduler * /*Time unpacking*/ // clock_gettime(CLOCK_REALTIME, &tp0); - int bundle_first_task = pack_vars->bundle_first_task_list[bid]; +// int bundle_first_task = pack_vars->bundle_first_task_list[bid]; for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { @@ -3088,14 +2963,12 @@ void runner_dopair1_launch_g(struct runner *r, struct scheduler *s, struct pack_ /* Launch the copies for each bundle and run the GPU kernel */ /*We don't go into this loop if tasks_left_self == 1 as nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ - int max_parts = 0; for (int bid = 0; bid < nBundles_temp; bid++) { int max_parts_i = 0; int max_parts_j = 0; int parts_in_bundle_ci = 0; int parts_in_bundle_cj = 0; - const int first_task = bid * pack_vars->bundle_size; for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { if (tid < tasks_packed) { @@ -3141,10 +3014,7 @@ void runner_dopair1_launch_g(struct runner *r, struct scheduler *s, struct pack_ // Setup 2d grid of GPU thread blocks for ci (number of tasks is // the y dimension and max_parts is the x dimension int numBlocks_y = tasks_left; - int numBlocks_x = (max_parts_i + BLOCK_SIZE - 1) / BLOCK_SIZE; - int bundle_part_0 = pack_vars->bundle_first_part[bid]; int bundle_first_task = pack_vars->bundle_first_task_list[bid]; -// fprintf(stderr, "bundle_part_0 %i bundle_first_task %i\n", bundle_part_0, bundle_first_task); const char *loop_type = "density"; /* Launch the kernel for ci using data for ci and cj */ @@ -3261,7 +3131,7 @@ void runner_dopair1_launch_f4_g_one_memcpy(struct runner *r, struct scheduler *s const int bundle_size = pack_vars->bundle_size; /*tasks-packed needs decrementing before calculating packed_tmp as it was incremented in runner_dopair1_pack*/ - const int packed_tmp = 2 * (tasks_packed - 1); +// const int packed_tmp = 2 * (tasks_packed - 1); /* Special case for incomplete bundles (when having leftover tasks not enough to fill a bundle) */ if (pack_vars->launch_leftovers) { @@ -3281,21 +3151,18 @@ void runner_dopair1_launch_f4_g_one_memcpy(struct runner *r, struct scheduler *s /* Launch the copies for each bundle and run the GPU kernel */ /*We don't go into this loop if tasks_left_self == 1 as nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ - int max_parts = 0; +// int max_parts = 0; for (int bid = 0; bid < nBundles_temp; bid++) { int max_parts_i = 0; int max_parts_j = 0; int parts_in_bundle_ci = 0; int parts_in_bundle_cj = 0; - const int first_task = bid * pack_vars->bundle_size; - int last_task = (bid + 1) * bundle_size; for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { if (tid < tasks_packed) { /*Get an estimate for the max number of parts per cell in each bundle. 
* Used for determining the number of GPU CUDA blocks*/ - const int tid_tmp = 2 * tid; int count_i = fparti_fpartj_lparti_lpartj[tid].z - fparti_fpartj_lparti_lpartj[tid].x; parts_in_bundle_ci += count_i; @@ -3304,8 +3171,6 @@ void runner_dopair1_launch_f4_g_one_memcpy(struct runner *r, struct scheduler *s - fparti_fpartj_lparti_lpartj[tid].y; parts_in_bundle_cj += count_j; max_parts_j = max(max_parts_j, count_j); - - last_task = tid; } } const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; @@ -3325,24 +3190,14 @@ void runner_dopair1_launch_f4_g_one_memcpy(struct runner *r, struct scheduler *s } #endif - const int tasksperbundle = pack_vars->tasksperbundle; +// const int tasksperbundle = pack_vars->tasksperbundle; /* LAUNCH THE GPU KERNELS for ci & cj */ - int tid = 0; - int offset = bid * tasksperbundle; - int tasks_left = tasksperbundle; - if (bid == nBundles_temp - 1) { - tasks_left = - tasks_packed - (nBundles_temp - 1) * tasksperbundle; - } - // Setup 2d grid of GPU thread blocks for ci (number of tasks is // the y dimension and max_parts is the x dimension int numBlocks_y = 0;//tasks_left; int numBlocks_x = (bundle_n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; int bundle_part_0 = pack_vars->bundle_first_part[bid]; - int bundle_first_task = pack_vars->bundle_first_task_list[bid]; // fprintf(stderr, "bundle_part_0 %i bundle_first_task %i\n", bundle_part_0, bundle_first_task); - const char *loop_type = "gradient"; /* Launch the kernel for ci using data for ci and cj */ runner_dopair_branch_gradient_gpu_aos_f4(d_parts_send, d_parts_recv, @@ -3399,7 +3254,7 @@ void runner_dopair1_launch_f4_g_one_memcpy(struct runner *r, struct scheduler *s /*Time unpacking*/ // clock_gettime(CLOCK_REALTIME, &tp0); - int bundle_first_task = pack_vars->bundle_first_task_list[bid]; +// int bundle_first_task = pack_vars->bundle_first_task_list[bid]; for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { @@ -3473,7 +3328,7 @@ void runner_dopair1_launch_f4_g_mcpy_Ker_mcpy(struct runner *r, struct scheduler const int bundle_size = pack_vars->bundle_size; /*tasks-packed needs decrementing before calculating packed_tmp as it was incremented in runner_dopair1_pack*/ - const int packed_tmp = 2 * (tasks_packed - 1); +// const int packed_tmp = 2 * (tasks_packed - 1); /* Special case for incomplete bundles (when having leftover tasks not enough to fill a bundle) */ if (pack_vars->launch_leftovers) { @@ -3493,21 +3348,20 @@ void runner_dopair1_launch_f4_g_mcpy_Ker_mcpy(struct runner *r, struct scheduler /* Launch the copies for each bundle and run the GPU kernel */ /*We don't go into this loop if tasks_left_self == 1 as nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ - int max_parts = 0; +// int max_parts = 0; for (int bid = 0; bid < nBundles_temp; bid++) { int max_parts_i = 0; int max_parts_j = 0; int parts_in_bundle_ci = 0; int parts_in_bundle_cj = 0; - const int first_task = bid * pack_vars->bundle_size; - int last_task = (bid + 1) * bundle_size; +// const int first_task = bid * pack_vars->bundle_size; +// int last_task = (bid + 1) * bundle_size; for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { if (tid < tasks_packed) { /*Get an estimate for the max number of parts per cell in each bundle. 
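Every launcher repeats the same bundle arithmetic: a ceiling division gives the number of bundles when leftovers must be flushed, and the final bundle holds whatever remains after the full ones. The arithmetic in isolation, with illustrative function names:

/* Bundles needed for tasks_packed tasks at bundle_size tasks per bundle
 * (ceiling division), as used when launch_leftovers is set. */
static int n_bundles_sketch(int tasks_packed, int bundle_size) {
  return (tasks_packed + bundle_size - 1) / bundle_size;
}

/* Tasks actually present in bundle bid: full everywhere except, possibly,
 * the final bundle. */
static int tasks_in_bundle_sketch(int bid, int n_bundles, int tasks_packed,
                                  int tasksperbundle) {
  if (bid == n_bundles - 1)
    return tasks_packed - (n_bundles - 1) * tasksperbundle;
  return tasksperbundle;
}

For example, 10 packed tasks with 4 per bundle give 3 bundles, and the last bundle carries 2 tasks.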
* Used for determining the number of GPU CUDA blocks*/ - const int tid_tmp = 2 * tid; int count_i = fparti_fpartj_lparti_lpartj[tid].z - fparti_fpartj_lparti_lpartj[tid].x; parts_in_bundle_ci += count_i; @@ -3517,7 +3371,7 @@ void runner_dopair1_launch_f4_g_mcpy_Ker_mcpy(struct runner *r, struct scheduler parts_in_bundle_cj += count_j; max_parts_j = max(max_parts_j, count_j); - last_task = tid; +// last_task = tid; } } const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; @@ -3543,14 +3397,13 @@ void runner_dopair1_launch_f4_g_mcpy_Ker_mcpy(struct runner *r, struct scheduler int max_parts_j = 0; int parts_in_bundle_ci = 0; int parts_in_bundle_cj = 0; - const int first_task = bid * pack_vars->bundle_size; - int last_task = (bid + 1) * bundle_size; +// const int first_task = bid * pack_vars->bundle_size; +// int last_task = (bid + 1) * bundle_size; for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { if (tid < tasks_packed) { /*Get an estimate for the max number of parts per cell in each bundle. * Used for determining the number of GPU CUDA blocks*/ - const int tid_tmp = 2 * tid; int count_i = fparti_fpartj_lparti_lpartj[tid].z - fparti_fpartj_lparti_lpartj[tid].x; parts_in_bundle_ci += count_i; @@ -3560,30 +3413,21 @@ void runner_dopair1_launch_f4_g_mcpy_Ker_mcpy(struct runner *r, struct scheduler parts_in_bundle_cj += count_j; max_parts_j = max(max_parts_j, count_j); - last_task = tid; +// last_task = tid; } } const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp_i; ////////////////////////////////// - const int tasksperbundle = pack_vars->tasksperbundle; +// const int tasksperbundle = pack_vars->tasksperbundle; /* LAUNCH THE GPU KERNELS for ci & cj */ - int tid = 0; - int offset = bid * tasksperbundle; - int tasks_left = tasksperbundle; - if (bid == nBundles_temp - 1) { - tasks_left = - tasks_packed - (nBundles_temp - 1) * tasksperbundle; - } - // Setup 2d grid of GPU thread blocks for ci (number of tasks is // the y dimension and max_parts is the x dimension int numBlocks_y = 0;//tasks_left; int numBlocks_x = (bundle_n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; int bundle_part_0 = pack_vars->bundle_first_part[bid]; - int bundle_first_task = pack_vars->bundle_first_task_list[bid]; +// int bundle_first_task = pack_vars->bundle_first_task_list[bid]; // fprintf(stderr, "bundle_part_0 %i bundle_first_task %i\n", bundle_part_0, bundle_first_task); - const char *loop_type = "density"; /* Launch the kernel for ci using data for ci and cj */ runner_dopair_branch_gradient_gpu_aos_f4(d_parts_send, d_parts_recv, @@ -3607,14 +3451,13 @@ void runner_dopair1_launch_f4_g_mcpy_Ker_mcpy(struct runner *r, struct scheduler int max_parts_j = 0; int parts_in_bundle_ci = 0; int parts_in_bundle_cj = 0; - const int first_task = bid * pack_vars->bundle_size; - int last_task = (bid + 1) * bundle_size; +// const int first_task = bid * pack_vars->bundle_size; +// int last_task = (bid + 1) * bundle_size; for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { if (tid < tasks_packed) { /*Get an estimate for the max number of parts per cell in each bundle. 
* Used for determining the number of GPU CUDA blocks*/ - const int tid_tmp = 2 * tid; int count_i = fparti_fpartj_lparti_lpartj[tid].z - fparti_fpartj_lparti_lpartj[tid].x; parts_in_bundle_ci += count_i; @@ -3624,7 +3467,7 @@ void runner_dopair1_launch_f4_g_mcpy_Ker_mcpy(struct runner *r, struct scheduler parts_in_bundle_cj += count_j; max_parts_j = max(max_parts_j, count_j); - last_task = tid; +// last_task = tid; } } const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; @@ -3671,7 +3514,7 @@ void runner_dopair1_launch_f4_g_mcpy_Ker_mcpy(struct runner *r, struct scheduler /*Time unpacking*/ // clock_gettime(CLOCK_REALTIME, &tp0); - int bundle_first_task = pack_vars->bundle_first_task_list[bid]; +// int bundle_first_task = pack_vars->bundle_first_task_list[bid]; for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { @@ -3783,14 +3626,14 @@ void runner_dopair1_launch_f(struct runner *r, struct scheduler *s, struct pack_ /* Launch the copies for each bundle and run the GPU kernel */ /*We don't go into this loop if tasks_left_self == 1 as nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ - int max_parts = 0; +// int max_parts = 0; for (int bid = 0; bid < nBundles_temp; bid++) { int max_parts_i = 0; int max_parts_j = 0; int parts_in_bundle_ci = 0; int parts_in_bundle_cj = 0; - const int first_task = bid * pack_vars->bundle_size; +// const int first_task = bid * pack_vars->bundle_size; for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { if (tid < tasks_packed) { @@ -3836,10 +3679,8 @@ void runner_dopair1_launch_f(struct runner *r, struct scheduler *s, struct pack_ // Setup 2d grid of GPU thread blocks for ci (number of tasks is // the y dimension and max_parts is the x dimension int numBlocks_y = tasks_left; - int numBlocks_x = (max_parts_i + BLOCK_SIZE - 1) / BLOCK_SIZE; - int bundle_part_0 = pack_vars->bundle_first_part[bid]; +// int numBlocks_x = (max_parts_i + BLOCK_SIZE - 1) / BLOCK_SIZE; int bundle_first_task = pack_vars->bundle_first_task_list[bid]; -// fprintf(stderr, "bundle_part_0 %i bundle_first_task %i\n", bundle_part_0, bundle_first_task); const char *loop_type = "density"; /* Launch the kernel for ci using data for ci and cj */ @@ -3956,7 +3797,7 @@ void runner_dopair1_launch_f4_f_one_memcpy(struct runner *r, struct scheduler *s const int bundle_size = pack_vars->bundle_size; /*tasks-packed needs decrementing before calculating packed_tmp as it was incremented in runner_dopair1_pack*/ - const int packed_tmp = 2 * (tasks_packed - 1); +// const int packed_tmp = 2 * (tasks_packed - 1); /* Special case for incomplete bundles (when having leftover tasks not enough to fill a bundle) */ if (pack_vars->launch_leftovers) { @@ -3976,21 +3817,20 @@ void runner_dopair1_launch_f4_f_one_memcpy(struct runner *r, struct scheduler *s /* Launch the copies for each bundle and run the GPU kernel */ /*We don't go into this loop if tasks_left_self == 1 as nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ - int max_parts = 0; +// int max_parts = 0; for (int bid = 0; bid < nBundles_temp; bid++) { int max_parts_i = 0; int max_parts_j = 0; int parts_in_bundle_ci = 0; int parts_in_bundle_cj = 0; - const int first_task = bid * pack_vars->bundle_size; - int last_task = (bid + 1) * bundle_size; +// const int first_task = bid * pack_vars->bundle_size; +// int last_task = (bid + 1) * bundle_size; for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { if (tid < tasks_packed) { /*Get an estimate for the max number of parts per cell in each 
bundle. * Used for determining the number of GPU CUDA blocks*/ - const int tid_tmp = 2 * tid; int count_i = fparti_fpartj_lparti_lpartj[tid].z - fparti_fpartj_lparti_lpartj[tid].x; parts_in_bundle_ci += count_i; @@ -4000,7 +3840,7 @@ void runner_dopair1_launch_f4_f_one_memcpy(struct runner *r, struct scheduler *s parts_in_bundle_cj += count_j; max_parts_j = max(max_parts_j, count_j); - last_task = tid; +// last_task = tid; } } const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; @@ -4020,24 +3860,23 @@ void runner_dopair1_launch_f4_f_one_memcpy(struct runner *r, struct scheduler *s } #endif - const int tasksperbundle = pack_vars->tasksperbundle; +// const int tasksperbundle = pack_vars->tasksperbundle; /* LAUNCH THE GPU KERNELS for ci & cj */ - int tid = 0; - int offset = bid * tasksperbundle; - int tasks_left = tasksperbundle; - if (bid == nBundles_temp - 1) { - tasks_left = - tasks_packed - (nBundles_temp - 1) * tasksperbundle; - } +// int tid = 0; +// int offset = bid * tasksperbundle; +// int tasks_left = tasksperbundle; +// if (bid == nBundles_temp - 1) { +// tasks_left = +// tasks_packed - (nBundles_temp - 1) * tasksperbundle; +// } // Setup 2d grid of GPU thread blocks for ci (number of tasks is // the y dimension and max_parts is the x dimension int numBlocks_y = 0;//tasks_left; int numBlocks_x = (bundle_n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; int bundle_part_0 = pack_vars->bundle_first_part[bid]; - int bundle_first_task = pack_vars->bundle_first_task_list[bid]; +// int bundle_first_task = pack_vars->bundle_first_task_list[bid]; // fprintf(stderr, "bundle_part_0 %i bundle_first_task %i\n", bundle_part_0, bundle_first_task); - const char *loop_type = "gradient"; /* Launch the kernel for ci using data for ci and cj */ runner_dopair_branch_force_gpu_aos_f4(d_parts_send, d_parts_recv, @@ -4094,7 +3933,7 @@ void runner_dopair1_launch_f4_f_one_memcpy(struct runner *r, struct scheduler *s /*Time unpacking*/ // clock_gettime(CLOCK_REALTIME, &tp0); - int bundle_first_task = pack_vars->bundle_first_task_list[bid]; +// int bundle_first_task = pack_vars->bundle_first_task_list[bid]; for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { @@ -4167,9 +4006,6 @@ void runner_dopair1_launch_f4_f_mcpy_Ker_mcpy(struct runner *r, struct scheduler /*How many tasks should be in a bundle?*/ const int bundle_size = pack_vars->bundle_size; - /*tasks-packed needs decrementing before calculating packed_tmp as it was incremented in runner_dopair1_pack*/ - const int packed_tmp = 2 * (tasks_packed - 1); - /* Special case for incomplete bundles (when having leftover tasks not enough to fill a bundle) */ if (pack_vars->launch_leftovers) { nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; @@ -4188,21 +4024,19 @@ void runner_dopair1_launch_f4_f_mcpy_Ker_mcpy(struct runner *r, struct scheduler /* Launch the copies for each bundle and run the GPU kernel */ /*We don't go into this loop if tasks_left_self == 1 as nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ - int max_parts = 0; for (int bid = 0; bid < nBundles_temp; bid++) { int max_parts_i = 0; int max_parts_j = 0; int parts_in_bundle_ci = 0; int parts_in_bundle_cj = 0; - const int first_task = bid * pack_vars->bundle_size; - int last_task = (bid + 1) * bundle_size; +// const int first_task = bid * pack_vars->bundle_size; +// int last_task = (bid + 1) * bundle_size; for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { if (tid < tasks_packed) { /*Get an estimate for the max number of parts per cell 
in each bundle. * Used for determining the number of GPU CUDA blocks*/ - const int tid_tmp = 2 * tid; int count_i = fparti_fpartj_lparti_lpartj[tid].z - fparti_fpartj_lparti_lpartj[tid].x; parts_in_bundle_ci += count_i; @@ -4212,7 +4046,7 @@ void runner_dopair1_launch_f4_f_mcpy_Ker_mcpy(struct runner *r, struct scheduler parts_in_bundle_cj += count_j; max_parts_j = max(max_parts_j, count_j); - last_task = tid; +// last_task = tid; } } const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; @@ -4238,14 +4072,13 @@ void runner_dopair1_launch_f4_f_mcpy_Ker_mcpy(struct runner *r, struct scheduler int max_parts_j = 0; int parts_in_bundle_ci = 0; int parts_in_bundle_cj = 0; - const int first_task = bid * pack_vars->bundle_size; - int last_task = (bid + 1) * bundle_size; +// const int first_task = bid * pack_vars->bundle_size; +// int last_task = (bid + 1) * bundle_size; for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { if (tid < tasks_packed) { /*Get an estimate for the max number of parts per cell in each bundle. * Used for determining the number of GPU CUDA blocks*/ - const int tid_tmp = 2 * tid; int count_i = fparti_fpartj_lparti_lpartj[tid].z - fparti_fpartj_lparti_lpartj[tid].x; parts_in_bundle_ci += count_i; @@ -4255,30 +4088,20 @@ void runner_dopair1_launch_f4_f_mcpy_Ker_mcpy(struct runner *r, struct scheduler parts_in_bundle_cj += count_j; max_parts_j = max(max_parts_j, count_j); - last_task = tid; +// last_task = tid; } } const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp_i; ////////////////////////////////// - const int tasksperbundle = pack_vars->tasksperbundle; /* LAUNCH THE GPU KERNELS for ci & cj */ - int tid = 0; - int offset = bid * tasksperbundle; - int tasks_left = tasksperbundle; - if (bid == nBundles_temp - 1) { - tasks_left = - tasks_packed - (nBundles_temp - 1) * tasksperbundle; - } - // Setup 2d grid of GPU thread blocks for ci (number of tasks is // the y dimension and max_parts is the x dimension int numBlocks_y = 0;//tasks_left; int numBlocks_x = (bundle_n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; int bundle_part_0 = pack_vars->bundle_first_part[bid]; - int bundle_first_task = pack_vars->bundle_first_task_list[bid]; +// int bundle_first_task = pack_vars->bundle_first_task_list[bid]; // fprintf(stderr, "bundle_part_0 %i bundle_first_task %i\n", bundle_part_0, bundle_first_task); - const char *loop_type = "density"; /* Launch the kernel for ci using data for ci and cj */ runner_dopair_branch_force_gpu_aos_f4(d_parts_send, d_parts_recv, @@ -4302,14 +4125,11 @@ void runner_dopair1_launch_f4_f_mcpy_Ker_mcpy(struct runner *r, struct scheduler int max_parts_j = 0; int parts_in_bundle_ci = 0; int parts_in_bundle_cj = 0; - const int first_task = bid * pack_vars->bundle_size; - int last_task = (bid + 1) * bundle_size; for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { if (tid < tasks_packed) { /*Get an estimate for the max number of parts per cell in each bundle. 
* Used for determining the number of GPU CUDA blocks*/ - const int tid_tmp = 2 * tid; int count_i = fparti_fpartj_lparti_lpartj[tid].z - fparti_fpartj_lparti_lpartj[tid].x; parts_in_bundle_ci += count_i; @@ -4318,8 +4138,6 @@ void runner_dopair1_launch_f4_f_mcpy_Ker_mcpy(struct runner *r, struct scheduler - fparti_fpartj_lparti_lpartj[tid].y; parts_in_bundle_cj += count_j; max_parts_j = max(max_parts_j, count_j); - - last_task = tid; } } const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; @@ -4366,7 +4184,6 @@ void runner_dopair1_launch_f4_f_mcpy_Ker_mcpy(struct runner *r, struct scheduler /*Time unpacking*/ // clock_gettime(CLOCK_REALTIME, &tp0); - int bundle_first_task = pack_vars->bundle_first_task_list[bid]; for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 7644e98ae0..caabc163dc 100755 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -21,6 +21,7 @@ /* Config parameters. */ #define GPUOFFLOAD 1 //off-load hydro to GPU #define DO_CORNERS 1 //do corner pair tasks on CPU +#define DUMP_TIMINGS 1 #include "../config.h" /* MPI headers. */ @@ -32,6 +33,14 @@ extern "C" { #endif +/* Config parameters. */ +#include + +/* MPI headers. */ +#ifdef WITH_MPI +#include +#endif + /* This object's header. */ #include "runner.h" @@ -193,6 +202,7 @@ void *runner_main2(void *data) { struct engine *e = r->e; struct scheduler *sched = &e->sched; struct space *space = e->s; + /*pack_vars contain data required for packing tasks destined for the GPU*/ struct pack_vars_self *pack_vars_self_dens; struct pack_vars_self *pack_vars_self_forc; @@ -864,8 +874,6 @@ void *runner_main2(void *data) { /* Re-set the pointer to the previous task, as there is none. 
*/ struct task *t = NULL; struct task *prev = NULL; - int zeropacks = 0; - int lesspacks = 0; /*Some bits for output in case of debug*/ char buf5[20]; snprintf(buf5, sizeof(buf5), "t%dr%dstep%d", r->cpuid, engine_rank, step); @@ -911,13 +919,12 @@ void *runner_main2(void *data) { struct cell *ci_temp = ci; struct cell *cj_temp = cj; double shift[3]; - t->sid = space_getsid(e->s, &ci_temp, &cj_temp, shift); + t->sid = space_getsid_and_swap_cells(e->s, &ci_temp, &cj_temp, shift); } else { t->sid = -1; } #endif - #ifdef SWIFT_DEBUG_CHECKS /* Check that we haven't scheduled an inactive task */ t->ti_run = e->ti_current; @@ -937,19 +944,22 @@ void *runner_main2(void *data) { unpacked_f++; } else if (t->subtype == task_subtype_density) { cpu_self++; -// struct timespec t0, t1, dt; -// clock_gettime(CLOCK_REALTIME, &t0); -// runner_doself1_branch_density(r, ci); -// clock_gettime(CLOCK_REALTIME, &t1); -// tasks_done_cpu++; -// time_for_density_cpu += -// (t1.tv_sec - t0.tv_sec) + -// (t1.tv_nsec - t0.tv_nsec) / -// 1000000000.0; -// density++; +#ifndef GPUOFFLOAD + struct timespec t0, t1, dt; + clock_gettime(CLOCK_REALTIME, &t0); + runner_doself1_branch_density(r, ci); + clock_gettime(CLOCK_REALTIME, &t1); + tasks_done_cpu++; + time_for_density_cpu += + (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / + 1000000000.0; + density++; +#endif /* GPU WORK */ } else if (t->subtype == task_subtype_gpu_pack) { packed_self++; +#ifdef GPUOFFLOAD // struct timespec t0, t1; // // clock_gettime(CLOCK_REALTIME, &t0); packing_time += runner_doself1_pack_f4(r, sched, pack_vars_self_dens, ci, @@ -976,9 +986,11 @@ void *runner_main2(void *data) { // d_parts_aos_dens, stream, d_a, d_H, e, &packing_time, &time_for_density_gpu, // &tot_time_for_hard_memcpys); } /*End of GPU work Self*/ +#endif //GPUDENSSELF } /* self / pack */ else if (t->subtype == task_subtype_gpu_pack_g){ packed_self_g++; +#ifdef GPUOFFLOAD // runner_doself1_pack_g(r, sched, pack_vars_self_grad, ci, // t, parts_aos_grad, &packing_time_g); packing_time_g += runner_doself1_pack_f4_g(r, sched, pack_vars_self_grad, ci, @@ -998,9 +1010,11 @@ void *runner_main2(void *data) { e, &packing_time_g, &time_for_gpu_g, task_first_part_f4_g, d_task_first_part_f4_g, self_end_g, &unpack_time_self_g); } /*End of GPU work Self*/ +#endif //GPUGRADSELF } else if (t->subtype == task_subtype_gpu_pack_f){ packed_self_f++; +#ifdef GPUOFFLOAD // runner_doself1_pack_f(r, sched, pack_vars_self_forc, ci, // t, parts_aos_forc, &packing_time_f); packing_time_f += runner_doself1_pack_f4_f(r, sched, pack_vars_self_forc, ci, @@ -1025,96 +1039,143 @@ void *runner_main2(void *data) { e, &packing_time_f, &time_for_gpu_f, task_first_part_f4_f, d_task_first_part_f4_f, self_end_f, &unpack_time_self_f); } /*End of GPU work Self*/ +#endif //GPUFORCSELF } #ifdef EXTRA_HYDRO_LOOP else if (t->subtype == task_subtype_gradient) { cpu_self_g++; #ifndef GPUOFFLOAD -// struct timespec t0, t1, dt; -// clock_gettime(CLOCK_REALTIME, &t0); -// runner_doself1_branch_gradient(r, ci); -// clock_gettime(CLOCK_REALTIME, &t1); -// tasks_done_cpu++; -// time_for_cpu_g += -// (t1.tv_sec - t0.tv_sec) + -// (t1.tv_nsec - t0.tv_nsec) / -// 1000000000.0; + struct timespec t0, t1, dt; + clock_gettime(CLOCK_REALTIME, &t0); + runner_doself1_branch_gradient(r, ci); + clock_gettime(CLOCK_REALTIME, &t1); + tasks_done_cpu++; + time_for_cpu_g += + (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / + 1000000000.0; #endif //GPUGRADSELF } #endif else if (t->subtype == task_subtype_force) { cpu_self_f++; #ifndef 
GPUOFFLOAD -// struct timespec t0, t1; -// clock_gettime(CLOCK_REALTIME, &t0); -// runner_doself2_branch_force(r, ci); -// clock_gettime(CLOCK_REALTIME, &t1); -// tasks_done_cpu++; -// time_for_cpu_f += -// (t1.tv_sec - t0.tv_sec) + -// (t1.tv_nsec - t0.tv_nsec) / -// 1000000000.0; + struct timespec t0, t1; + clock_gettime(CLOCK_REALTIME, &t0); + runner_doself2_branch_force(r, ci); + clock_gettime(CLOCK_REALTIME, &t1); + tasks_done_cpu++; + time_for_cpu_f += + (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / + 1000000000.0; #endif //GPUFORCSELF - }else if (t->subtype == task_subtype_limiter) - runner_doself1_branch_limiter(r, ci); - else if (t->subtype == task_subtype_grav) - runner_doself_recursive_grav(r, ci, 1); - else if (t->subtype == task_subtype_external_grav) - runner_do_grav_external(r, ci, 1); - else if (t->subtype == task_subtype_stars_density) - runner_doself_branch_stars_density(r, ci); + } else if (t->subtype == task_subtype_limiter) + runner_doself1_branch_limiter(r, ci); + else if (t->subtype == task_subtype_grav) + runner_doself_recursive_grav(r, ci, 1); + else if (t->subtype == task_subtype_external_grav) + runner_do_grav_external(r, ci, 1); + else if (t->subtype == task_subtype_stars_density) + runner_doself_branch_stars_density(r, ci); #ifdef EXTRA_STAR_LOOPS - else if (t->subtype == task_subtype_stars_prep1) - runner_doself_branch_stars_prep1(r, ci); - else if (t->subtype == task_subtype_stars_prep2) - runner_doself_branch_stars_prep2(r, ci); + else if (t->subtype == task_subtype_stars_prep1) + runner_doself_branch_stars_prep1(r, ci); + else if (t->subtype == task_subtype_stars_prep2) + runner_doself_branch_stars_prep2(r, ci); #endif - else if (t->subtype == task_subtype_stars_feedback) - runner_doself_branch_stars_feedback(r, ci); - else if (t->subtype == task_subtype_bh_density) - runner_doself_branch_bh_density(r, ci); - else if (t->subtype == task_subtype_bh_swallow) - runner_doself_branch_bh_swallow(r, ci); - else if (t->subtype == task_subtype_do_gas_swallow) - runner_do_gas_swallow_self(r, ci, 1); - else if (t->subtype == task_subtype_do_bh_swallow) - runner_do_bh_swallow_self(r, ci, 1); - else if (t->subtype == task_subtype_bh_feedback) - runner_doself_branch_bh_feedback(r, ci); - else if (t->subtype == task_subtype_rt_gradient) - runner_doself1_branch_rt_gradient(r, ci); - else if (t->subtype == task_subtype_rt_transport) - runner_doself2_branch_rt_transport(r, ci); - else if (t->subtype == task_subtype_sink_swallow) - runner_doself_branch_sinks_swallow(r, ci); - else if (t->subtype == task_subtype_sink_do_gas_swallow) - runner_do_sinks_gas_swallow_self(r, ci, 1); - else if (t->subtype == task_subtype_sink_do_sink_swallow) - runner_do_sinks_sink_swallow_self(r, ci, 1); - else - error("Unknown/invalid task subtype (%s).", - subtaskID_names[t->subtype]); - break; + else if (t->subtype == task_subtype_stars_feedback) + runner_doself_branch_stars_feedback(r, ci); + else if (t->subtype == task_subtype_bh_density) + runner_doself_branch_bh_density(r, ci); + else if (t->subtype == task_subtype_bh_swallow) + runner_doself_branch_bh_swallow(r, ci); + else if (t->subtype == task_subtype_do_gas_swallow) + runner_do_gas_swallow_self(r, ci, 1); + else if (t->subtype == task_subtype_do_bh_swallow) + runner_do_bh_swallow_self(r, ci, 1); + else if (t->subtype == task_subtype_bh_feedback) + runner_doself_branch_bh_feedback(r, ci); + else if (t->subtype == task_subtype_rt_gradient) + runner_doself1_branch_rt_gradient(r, ci); + else if (t->subtype == 
task_subtype_rt_transport) + runner_doself2_branch_rt_transport(r, ci); + else if (t->subtype == task_subtype_sink_swallow) + runner_doself_branch_sinks_swallow(r, ci); + else if (t->subtype == task_subtype_sink_do_gas_swallow) + runner_do_sinks_gas_swallow_self(r, ci, 1); + else if (t->subtype == task_subtype_sink_do_sink_swallow) + runner_do_sinks_sink_swallow_self(r, ci, 1); + else + error("Unknown/invalid task subtype (%s).", + subtaskID_names[t->subtype]); + break; case task_type_pair: if (t->subtype == task_subtype_density) { /* Abouzied: To be commented out when the GPU pairs have been coded up */ cpu_pair++; #ifndef GPUOFFLOAD -// struct timespec t0, t1, dt; -// clock_gettime(CLOCK_REALTIME, &t0); -// runner_dopair1_branch_density(r, ci, cj); -// clock_gettime(CLOCK_REALTIME, &t1); -// tasks_done_cpu++; -// time_for_density_cpu_pair += -// (t1.tv_sec - t0.tv_sec) + -// (t1.tv_nsec - t0.tv_nsec) / -// 1000000000.0; + struct timespec t0, t1, dt; + clock_gettime(CLOCK_REALTIME, &t0); + runner_dopair1_branch_density(r, ci, cj); + clock_gettime(CLOCK_REALTIME, &t1); + tasks_done_cpu++; + time_for_density_cpu_pair += + (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / + 1000000000.0; #endif } /* GPU WORK */ else if (t->subtype == task_subtype_gpu_pack) { packed_pair++; +#ifdef GPUOFFLOAD +#ifdef DO_CORNERS + struct timespec t0, t1, dt; + clock_gettime(CLOCK_REALTIME, &t0); + double shift[3] = {0.0}; + t->corner_pair = 0; + int sid = space_getsid_filter(e->s, &ci, &cj, shift); + clock_gettime(CLOCK_REALTIME, &t1); + packing_time_pair += + (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / + 1000000000.0; + if((sid == 0 || sid == 2 || sid == 6 || sid == 8) && step > 1){ +// if((sid != 4 && sid != 10 && sid == 12) && step > 1){ + clock_gettime(CLOCK_REALTIME, &t0); + runner_dopair1_branch_density(r, ci, cj); + t->corner_pair = 1; + int qid = r->qid; + atomic_dec(&(sched->queues[qid].n_packs_pair_left)); + /* Tell the cells they have been packed */ + ci->pack_done++; + cj->pack_done++; + t->done = 1; + int launch = 0, launch_leftovers = 0; + if ((sched->queues[qid].n_packs_pair_left == 0)) + launch_leftovers = 1; + /* Tasks done. Release the lock ! 
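The CPU fallbacks re-enabled above and the corner-pair branches all accumulate wall-clock time with the same clock_gettime(CLOCK_REALTIME, ...) arithmetic. A minimal helper expressing that calculation; the helper name is illustrative and not part of the patch:

#include <time.h>

/* Seconds between two CLOCK_REALTIME samples, matching the second-plus-
 * nanosecond accumulation used in the timing blocks above. */
static double elapsed_seconds_sketch(const struct timespec *t0,
                                     const struct timespec *t1) {
  return (double)(t1->tv_sec - t0->tv_sec) +
         (double)(t1->tv_nsec - t0->tv_nsec) / 1e9;
}

/* Typical use:
 *   struct timespec t0, t1;
 *   clock_gettime(CLOCK_REALTIME, &t0);
 *   ... work ...
 *   clock_gettime(CLOCK_REALTIME, &t1);
 *   packing_time_pair += elapsed_seconds_sketch(&t0, &t1);
 */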
*/ + task_unlock(t); + /*schedule my dependencies (Only unpacks really)*/ + enqueue_dependencies(sched, t); + /*Signal sleeping runners*/ + signal_sleeping_runners(sched, t); + clock_gettime(CLOCK_REALTIME, &t1); + packing_time_pair += + (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / + 1000000000.0; + if (launch_leftovers) { + pack_vars_pair_dens->launch_leftovers = 1; + runner_dopair1_launch_f4_one_memcpy(r, sched, pack_vars_pair_dens, t, parts_aos_pair_f4_send, parts_aos_pair_f4_recv, + d_parts_aos_pair_f4_send, d_parts_aos_pair_f4_recv, stream_pairs, d_a, d_H, e, &packing_time_pair, &time_for_density_gpu_pair, + &unpacking_time_pair, fparti_fpartj_lparti_lpartj_dens, pair_end); + } + } + else{ +#endif //DO_CORNERS packing_time_pair += runner_dopair1_pack_f4(r, sched, pack_vars_pair_dens, ci, cj, t, parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens); /* Packed enough tasks or no pack tasks left in queue, flag that we want to run */ @@ -1130,10 +1191,62 @@ void *runner_main2(void *data) { runner_dopair1_launch_f4_one_memcpy(r, sched, pack_vars_pair_dens, t, parts_aos_pair_f4_send, parts_aos_pair_f4_recv, d_parts_aos_pair_f4_send, d_parts_aos_pair_f4_recv, stream_pairs, d_a, d_H, e, &packing_time_pair, &time_for_density_gpu_pair, &unpacking_time_pair, fparti_fpartj_lparti_lpartj_dens, pair_end); - }/* End of GPU work Pairs */ + } +#ifdef DO_CORNERS + } /* End of GPU work Pairs */ +#endif //DO_CORNERS +#endif //GPUDENS } /* pair / pack */ else if (t->subtype == task_subtype_gpu_pack_g){ packed_pair_g++; +#ifdef GPUOFFLOAD +#ifdef DO_CORNERS + struct timespec t0, t1, dt; + clock_gettime(CLOCK_REALTIME, &t0); + double shift[3] = {0.0}; + t->corner_pair = 0; + int sid = space_getsid_filter(e->s, &ci, &cj, shift); + clock_gettime(CLOCK_REALTIME, &t1); + packing_time_pair += + (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / + 1000000000.0; + if((sid == 0 || sid == 2 || sid == 6 || sid == 8) && step > 1){ +// if((sid != 4 && sid != 10 && sid == 12) && step > 1){ + clock_gettime(CLOCK_REALTIME, &t0); + runner_dopair1_branch_gradient(r, ci, cj); + t->corner_pair = 1; + int qid = r->qid; + atomic_dec(&(sched->queues[qid].n_packs_pair_left_g)); + /* Tell the cells they have been packed */ + ci->pack_done++; + cj->pack_done++; + t->done = 1; + int launch = 0, launch_leftovers = 0; + if ((sched->queues[qid].n_packs_pair_left_g == 0)) + launch_leftovers = 1; + /* Tasks done. Release the lock ! 
*/ + task_unlock(t); + /*schedule my dependencies (Only unpacks really)*/ + enqueue_dependencies(sched, t); + /*Signal sleeping runners*/ + signal_sleeping_runners(sched, t); + clock_gettime(CLOCK_REALTIME, &t1); + packing_time_pair_g += + (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / + 1000000000.0; + if (launch_leftovers) { + pack_vars_pair_grad->launch_leftovers = 1; + runner_dopair1_launch_f4_g_one_memcpy(r, sched, pack_vars_pair_grad, t, parts_aos_pair_f4_g_send, parts_aos_pair_f4_g_recv, + d_parts_aos_pair_f4_g_send, d_parts_aos_pair_f4_g_recv, stream_pairs, d_a, d_H, e, &packing_time_pair_g, &time_for_gpu_pair_g, + &unpacking_time_pair_g, fparti_fpartj_lparti_lpartj_grad, pair_end_g); + } + } + else{ +#endif //DO_CORNERS +// runner_dopair1_pack_g(r, sched, pack_vars_pair_grad, ci, +// cj, t, parts_aos_pair_grad, e, &packing_time_g); packing_time_pair_g += runner_dopair1_pack_f4_g(r, sched, pack_vars_pair_grad, ci, cj, t, parts_aos_pair_f4_g_send, e, fparti_fpartj_lparti_lpartj_grad); /* No pack tasks left in queue, flag that we want to run */ @@ -1149,9 +1262,58 @@ void *runner_main2(void *data) { d_parts_aos_pair_f4_g_send, d_parts_aos_pair_f4_g_recv, stream_pairs, d_a, d_H, e, &packing_time_pair_g, &time_for_gpu_pair_g, &unpacking_time_pair_g, fparti_fpartj_lparti_lpartj_grad, pair_end_g); } - }/* End of GPU work Pairs */ +#ifdef DO_CORNERS + }/* End of GPU work Pairs */ +#endif //DO_CORNERS +#endif //GPUGRADPAIR + } else if (t->subtype == task_subtype_gpu_pack_f){ packed_pair_f++; +#ifdef GPUOFFLOAD +#ifdef DO_CORNERS + struct timespec t0, t1, dt; + clock_gettime(CLOCK_REALTIME, &t0); + double shift[3] = {0.0}; + t->corner_pair = 0; + int sid = space_getsid_filter(e->s, &ci, &cj, shift); + clock_gettime(CLOCK_REALTIME, &t1); + packing_time_pair += + (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / + 1000000000.0; + if((sid == 0 || sid == 2 || sid == 6 || sid == 8) && step > 1){ +// if((sid != 4 && sid != 10 && sid == 12) && step > 1){ + runner_dopair1_branch_force(r, ci, cj); + t->corner_pair = 1; + int qid = r->qid; + atomic_dec(&(sched->queues[qid].n_packs_pair_left_f)); + /* Tell the cells they have been packed */ + ci->pack_done++; + cj->pack_done++; + t->done = 1; + int launch = 0, launch_leftovers = 0; + if ((sched->queues[qid].n_packs_pair_left_f == 0)) + launch_leftovers = 1; + /* Tasks done. Release the lock ! 
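The density, gradient and force pair-pack branches each repeat the same DO_CORNERS test: once past the first step, pairs whose sort id is 0, 2, 6 or 8 are interacted directly on the CPU and counted off the queue instead of being packed for the GPU. A sketch of just that predicate, assuming, as the DO_CORNERS comment suggests, that these sid values denote the corner cell pairs; the helper name is illustrative:

/* Decide whether a pair task is treated as a "corner" pair and kept on the
 * CPU, as done under DO_CORNERS above. */
static int is_corner_pair_sketch(int sid, int step) {
  const int corner = (sid == 0 || sid == 2 || sid == 6 || sid == 8);
  return corner && step > 1; /* the first step still takes the GPU path */
}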
*/ + task_unlock(t); + /*schedule my dependencies (Only unpacks really)*/ + enqueue_dependencies(sched, t); + /*Signal sleeping runners*/ + signal_sleeping_runners(sched, t); + clock_gettime(CLOCK_REALTIME, &t1); + packing_time_pair_f += + (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / + 1000000000.0; + if (launch_leftovers) { + pack_vars_pair_forc->launch_leftovers = 1; + runner_dopair1_launch_f4_f_one_memcpy(r, sched, pack_vars_pair_forc, t, parts_aos_pair_f4_f_send, parts_aos_pair_f4_f_recv, + d_parts_aos_pair_f4_f_send, d_parts_aos_pair_f4_f_recv, stream_pairs, d_a, d_H, e, &packing_time_pair_f, &time_for_gpu_pair_f, + &unpacking_time_pair_f, fparti_fpartj_lparti_lpartj_forc, pair_end_f); + } + } + else{ +#endif //DO_CORNERS // runner_dopair1_pack_f(r, sched, pack_vars_pair_forc, ci, // cj, t, parts_aos_pair_forc, e, &packing_time_f); packing_time_pair_f += runner_dopair1_pack_f4_f(r, sched, pack_vars_pair_forc, ci, @@ -1169,7 +1331,11 @@ void *runner_main2(void *data) { d_parts_aos_pair_f4_f_send, d_parts_aos_pair_f4_f_recv, stream_pairs, d_a, d_H, e, &packing_time_pair_f, &time_for_gpu_pair_f, &unpacking_time_pair_f, fparti_fpartj_lparti_lpartj_forc, pair_end_f); } /* End of GPU work Pairs */ - } +#ifdef DO_CORNERS + } +#endif //DO_CORNERS +#endif //GPUFORCPAIR + } else if (t->subtype == task_subtype_gpu_unpack) { unpacked_pair++; } @@ -1183,30 +1349,30 @@ void *runner_main2(void *data) { else if (t->subtype == task_subtype_gradient){ int Do_nothing = 0; #ifndef GPUOFFLOAD -// struct timespec t0, t1, dt; -// clock_gettime(CLOCK_REALTIME, &t0); -// runner_dopair1_branch_gradient(r, ci, cj); -// clock_gettime(CLOCK_REALTIME, &t1); -// tasks_done_cpu++; -// time_for_cpu_pair_g += -// (t1.tv_sec - t0.tv_sec) + -// (t1.tv_nsec - t0.tv_nsec) / -// 1000000000.0; + struct timespec t0, t1, dt; + clock_gettime(CLOCK_REALTIME, &t0); + runner_dopair1_branch_gradient(r, ci, cj); + clock_gettime(CLOCK_REALTIME, &t1); + tasks_done_cpu++; + time_for_cpu_pair_g += + (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / + 1000000000.0; #endif //GPUGRADPAIR } #endif //EXTRA_HYDRO_LOOP else if (t->subtype == task_subtype_force){ int Do_nothing = 0; #ifndef GPUOFFLOAD -// struct timespec t0, t1, dt; -// clock_gettime(CLOCK_REALTIME, &t0); -// runner_dopair2_branch_force(r, ci, cj); -// clock_gettime(CLOCK_REALTIME, &t1); -// tasks_done_cpu++; -// time_for_cpu_pair_f += -// (t1.tv_sec - t0.tv_sec) + -// (t1.tv_nsec - t0.tv_nsec) / -// 1000000000.0; + struct timespec t0, t1, dt; + clock_gettime(CLOCK_REALTIME, &t0); + runner_dopair2_branch_force(r, ci, cj); + clock_gettime(CLOCK_REALTIME, &t1); + tasks_done_cpu++; + time_for_cpu_pair_f += + (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / + 1000000000.0; #endif //GPUFORCPAIR } else if (t->subtype == task_subtype_limiter) @@ -1597,28 +1763,17 @@ void *runner_main2(void *data) { prev = t; #ifdef GPUOFFLOAD // if (t->type == task_type_self && t->subtype == task_subtype_gpu_pack){ - if (t->subtype == task_subtype_gpu_pack){ + if (t->subtype == task_subtype_gpu_pack || + t->subtype == task_subtype_gpu_pack_g || + t->subtype == task_subtype_gpu_pack_f){ /* Don't enqueue unpacks yet. Just signal the runners */ t->skip = 1; t = NULL; } -// else if (t->subtype == task_subtype_gpu_pack_g && t->type == task_type_self){ - else if (t->subtype == task_subtype_gpu_pack_g){ - /* Don't enqueue unpacks yet. 
Just signal the runners */ - t->skip = 1; - t = NULL; - } -// else if (t->subtype == task_subtype_gpu_pack_f && t->type == task_type_self){ - else if (t->subtype == task_subtype_gpu_pack_f){ - /* Don't enqueue unpacks yet. Just signal the runners */ - t->skip = 1; - t = NULL; - } else{ /* Mark task as done, as per usual */ t = scheduler_done(sched, t); } -#endif //GPUOFFLOAD -#ifndef GPUOFFLOAD +#else //GPUOFFLOAD t = scheduler_done(sched, t); #endif //GPUOFFLOAD @@ -1646,6 +1801,7 @@ void *runner_main2(void *data) { //// } /*Output compute times to separate files. cat later into one file*/ // if (step % 11 == 0 || step == 1) { +#ifdef DUMP_TIMINGS #ifdef GPUOFFLOAD // char buffer[30]; // snprintf(buffer, sizeof(buffer), "t%d_stepnfullbundles%d", r->cpuid, step); @@ -1673,8 +1829,7 @@ void *runner_main2(void *data) { /////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////// -#endif -#ifndef GPUOFFLOAD +#else //GPUOFFLOAD if(r->cpuid == 0 && engine_rank == 0)fprintf(fgpu_steps, "CPU TIME SELF, CPU TIME PAIR, " "CPU TIME SELF F, CPU TIME PAIR F, CPU TIME SELF G, CPU TIME PAIR G\n " "%e, %e, %e, %e, %e, %e\n", time_for_density_cpu, time_for_density_cpu_pair, @@ -1682,7 +1837,8 @@ void *runner_main2(void *data) { else fprintf(fgpu_steps,"%e, %e, %e, %e, %e, %e,\n", time_for_density_cpu, time_for_density_cpu_pair, time_for_cpu_f, time_for_cpu_pair_f, time_for_cpu_g, time_for_cpu_pair_g); -#endif +#endif //GPUOFFLOAD +#endif //DUMPTIMINGS // } fflush(fgpu_steps); fclose(fgpu_steps); @@ -1705,6 +1861,7 @@ void *runner_main2(void *data) { density = 0; density_sub = 0; unpacked = 0; + message("reached end of runner_main2\n"); // if(step == 2)cudaProfilerStop(); // if(step == 2)exit(0); // size_t free_byte ; From 2cc07c5f843d276f5e025dadfad34ac778a6b077 Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Thu, 31 Oct 2024 15:06:22 +0000 Subject: [PATCH 029/217] signalling sleeping runners just after packing seems to prevent hanging. Code is doing something but I don't think it's actually progressing through time steps --- src/engine_maketasks.c | 63 +++++----- src/engine_marktasks.c | 146 ++++++++++++------------ src/runner_doiact_functions_hydro_gpu.h | 20 ++-- src/runner_main_clean.cu | 7 +- 4 files changed, 121 insertions(+), 115 deletions(-) diff --git a/src/engine_maketasks.c b/src/engine_maketasks.c index 1de0e17db5..bec401e284 100644 --- a/src/engine_maketasks.c +++ b/src/engine_maketasks.c @@ -2149,10 +2149,10 @@ void engine_count_and_link_tasks_mapper(void *map_data, int num_elements, engine_addlink(e, &ci->hydro.density, t); } else if (t_subtype == task_subtype_gpu_pack) { // A. Nasar engine_addlink(e, &ci->hydro.density_pack, t); - } else if (t_subtype == task_subtype_gpu_pack_f) { - engine_addlink(e, &ci->hydro.force_pack, t); - } else if (t_subtype == task_subtype_gpu_pack_g) { - engine_addlink(e, &ci->hydro.gradient_pack, t); +// } else if (t_subtype == task_subtype_gpu_pack_f) { +// engine_addlink(e, &ci->hydro.force_pack, t); +// } else if (t_subtype == task_subtype_gpu_pack_g) { +// engine_addlink(e, &ci->hydro.gradient_pack, t); } else if (t_subtype == task_subtype_grav) { engine_addlink(e, &ci->grav.grav, t); } else if (t_subtype == task_subtype_external_grav) { @@ -2170,12 +2170,12 @@ void engine_count_and_link_tasks_mapper(void *map_data, int num_elements, } else if (t_subtype == task_subtype_gpu_pack) { // A. 
Nasar engine_addlink(e, &ci->hydro.density_pack, t); engine_addlink(e, &cj->hydro.density_pack, t); - } else if (t_subtype == task_subtype_gpu_pack_f) { - engine_addlink(e, &ci->hydro.force_pack, t); - engine_addlink(e, &cj->hydro.force_pack, t); - } else if (t_subtype == task_subtype_gpu_pack_g) { - engine_addlink(e, &ci->hydro.gradient_pack, t); - engine_addlink(e, &cj->hydro.gradient_pack, t); +// } else if (t_subtype == task_subtype_gpu_pack_f) { +// engine_addlink(e, &ci->hydro.force_pack, t); +// engine_addlink(e, &cj->hydro.force_pack, t); +// } else if (t_subtype == task_subtype_gpu_pack_g) { +// engine_addlink(e, &ci->hydro.gradient_pack, t); +// engine_addlink(e, &cj->hydro.gradient_pack, t); } else if (t_subtype == task_subtype_grav) { engine_addlink(e, &ci->grav.grav, t); engine_addlink(e, &cj->grav.grav, t); @@ -2564,7 +2564,7 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, /* Task for the second GPU hydro loop A. Nasar */ t_force_gpu = scheduler_addtask(sched, task_type_self, task_subtype_gpu_pack_f, 0, 0, ci, NULL); -// engine_addlink(e, &ci->hydro.force_pack, t_force_gpu); + engine_addlink(e, &ci->hydro.force_pack, t_force_gpu); /* the task for the time-step limiter */ if (with_timestep_limiter) { t_limiter = scheduler_addtask(sched, task_type_self, @@ -2672,7 +2672,7 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, /* Add the link between the new loops and the cell */ engine_addlink(e, &ci->hydro.gradient, t_gradient); -// engine_addlink(e, &ci->hydro.gradient_pack, t_gradient_gpu); + engine_addlink(e, &ci->hydro.gradient_pack, t_gradient_gpu); /* Now, build all the dependencies for the hydro */ engine_make_hydro_loops_dependencies(sched, t, t_gradient, t_force, @@ -2846,8 +2846,8 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, t_force_gpu = scheduler_addtask(sched, task_type_pair, task_subtype_gpu_pack_f, 0, 0, ci, cj); // /* Add the link between the new loop and both cells */ -// engine_addlink(e, &ci->hydro.force_pack, t_force_gpu); -// engine_addlink(e, &cj->hydro.force_pack, t_force_gpu); + engine_addlink(e, &ci->hydro.force_pack, t_force_gpu); + engine_addlink(e, &cj->hydro.force_pack, t_force_gpu); #ifdef MPI_SYMMETRIC_FORCE_INTERACTION /* The order of operations for an inactive local cell interacting @@ -3009,8 +3009,8 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, engine_addlink(e, &ci->hydro.gradient, t_gradient); engine_addlink(e, &cj->hydro.gradient, t_gradient); // /* Add the link between the new loop and both cells */ -// engine_addlink(e, &ci->hydro.gradient_pack, t_gradient_gpu); -// engine_addlink(e, &cj->hydro.gradient_pack, t_gradient_gpu); + engine_addlink(e, &ci->hydro.gradient_pack, t_gradient_gpu); + engine_addlink(e, &cj->hydro.gradient_pack, t_gradient_gpu); /* Now, build all the dependencies for the hydro for the cells */ /* that are local and are not descendant of the same super_hydro-cells */ @@ -4827,21 +4827,6 @@ void engine_maketasks(struct engine *e) { tic2 = getticks(); - /* Run through the tasks and make force tasks for each density task. - Each force task depends on the cell ghosts and unlocks the kick task - of its super-cell. */ - if (e->policy & engine_policy_hydro) { - - /* Note that this does not scale well at all so we do not use the - * threadpool version here until the reason for this is found. - * We call the mapper function directly as if there was only 1 thread - * in the pool. 
*/ - engine_make_extra_hydroloop_tasks_mapper(sched->tasks, sched->nr_tasks, e); - /* threadpool_map(&e->threadpool, engine_make_extra_hydroloop_tasks_mapper, - * sched->tasks, sched->nr_tasks, sizeof(struct task), - * threadpool_auto_chunk_size, e); */ - } - /* Now, create unpack tasks based on the existing packs and create * the dependencies pack->unpack->ghost_in A. Nasar */ @@ -4925,6 +4910,22 @@ void engine_maketasks(struct engine *e) { if(l->t->type == task_type_pair)scheduler_addunlock(sched, l->t, t); } } + /* Run through the tasks and make force tasks for each density task. + Each force task depends on the cell ghosts and unlocks the kick task + of its super-cell. */ + if (e->policy & engine_policy_hydro) { + + /* Note that this does not scale well at all so we do not use the + * threadpool version here until the reason for this is found. + * We call the mapper function directly as if there was only 1 thread + * in the pool. */ + engine_make_extra_hydroloop_tasks_mapper(sched->tasks, sched->nr_tasks, e); + /* threadpool_map(&e->threadpool, engine_make_extra_hydroloop_tasks_mapper, + * sched->tasks, sched->nr_tasks, sizeof(struct task), + * threadpool_auto_chunk_size, e); */ + } + + /*Now create unpacks for all gpu_pack_g (gradient) tasks A. Nasar */ count_current_self = 0; count_current_pair = 0; diff --git a/src/engine_marktasks.c b/src/engine_marktasks.c index 97af2fdc8d..11d8b46d92 100644 --- a/src/engine_marktasks.c +++ b/src/engine_marktasks.c @@ -86,21 +86,21 @@ void engine_marktasks_mapper(void *map_data, int num_elements, const enum task_types t_type = t->type; const enum task_subtypes t_subtype = t->subtype; -// //Activate GPU unpack tasks (cell-less dummy tasks so need activating separately) -// if (t_type == task_type_self && (t_subtype == task_subtype_gpu_unpack || -// t_subtype == task_subtype_gpu_unpack_g || -// t_subtype == task_subtype_gpu_unpack_f)){ // A. Nasar -// scheduler_activate(s, t); -// continue; -// } -// -// if (t_type == task_type_pair && (t_subtype == task_subtype_gpu_unpack || -// t_subtype == task_subtype_gpu_unpack_g || -// t_subtype == task_subtype_gpu_unpack_f)){ // A. Nasar -// scheduler_activate(s, t); -// continue; -//// fprintf(stderr,"activated pair unpack in marktasks\n"); -// } + //Activate GPU unpack tasks (cell-less dummy tasks so need activating separately) + if (t_type == task_type_self && (t_subtype == task_subtype_gpu_unpack || + t_subtype == task_subtype_gpu_unpack_g || + t_subtype == task_subtype_gpu_unpack_f)){ // A. Nasar + scheduler_activate(s, t); + continue; + } + + if (t_type == task_type_pair && (t_subtype == task_subtype_gpu_unpack || + t_subtype == task_subtype_gpu_unpack_g || + t_subtype == task_subtype_gpu_unpack_f)){ // A. Nasar + scheduler_activate(s, t); + continue; +// fprintf(stderr,"activated pair unpack in marktasks\n"); + } /* Single-cell task? */ if (t_type == task_type_self || t_type == task_type_sub_self) { @@ -109,16 +109,16 @@ void engine_marktasks_mapper(void *map_data, int num_elements, struct cell *ci = t->ci; #ifdef SWIFT_DEBUG_CHECKS -//#ifndef WITH_CUDA // A. Nasar +#ifndef WITH_CUDA // A. Nasar if (ci->nodeID != nodeID) error("Non-local self task found"); -//#else -// if (ci->nodeID != nodeID && t_subtype != task_subtype_gpu_unpack && -// t_subtype != task_subtype_gpu_unpack_f && -// t_subtype != task_subtype_gpu_unpack_g){ -// fprintf(stderr, "task is %i\n", subtaskID_names[t->subtype]); -// error("Non-local self task found. 
Task is subtaskID_names[%s]", subtaskID_names[t->subtype]); -// } -//#endif +#else + if (ci->nodeID != nodeID && t_subtype != task_subtype_gpu_unpack && + t_subtype != task_subtype_gpu_unpack_f && + t_subtype != task_subtype_gpu_unpack_g){ + fprintf(stderr, "task is %i\n", subtaskID_names[t->subtype]); + error("Non-local self task found. Task is subtaskID_names[%s]", subtaskID_names[t->subtype]); + } +#endif #endif const int ci_active_hydro = cell_is_active_hydro(ci, e); @@ -141,34 +141,34 @@ void engine_marktasks_mapper(void *map_data, int num_elements, } /* Activate packing for GPU A. Nasar */ -// else if (t_type == task_type_self && t_subtype == task_subtype_gpu_pack) { -// if (ci_active_hydro) { -// scheduler_activate(s, t); -// ci->pack_done = 0; -// ci->gpu_done = 0; -// ci->unpack_done = 0; -// } -// } -// -// /* Activate packing for GPU */ -// else if (t_type == task_type_self && t_subtype == task_subtype_gpu_pack_g) { -// if (ci_active_hydro) { -// scheduler_activate(s, t); -// ci->pack_done_g = 0; -// ci->gpu_done_g = 0; -// ci->unpack_done_g = 0; -// } -// } -// -// /* Activate packing for GPU */ -// else if (t_type == task_type_self && t_subtype == task_subtype_gpu_pack_f) { -// if (ci_active_hydro) { -// scheduler_activate(s, t); -// ci->pack_done_f = 0; -// ci->gpu_done_f = 0; -// ci->unpack_done_f = 0; -// } -// } + else if (t_type == task_type_self && t_subtype == task_subtype_gpu_pack) { + if (ci_active_hydro) { + scheduler_activate(s, t); + ci->pack_done = 0; + ci->gpu_done = 0; + ci->unpack_done = 0; + } + } + + /* Activate packing for GPU */ + else if (t_type == task_type_self && t_subtype == task_subtype_gpu_pack_g) { + if (ci_active_hydro) { + scheduler_activate(s, t); + ci->pack_done_g = 0; + ci->gpu_done_g = 0; + ci->unpack_done_g = 0; + } + } + + /* Activate packing for GPU */ + else if (t_type == task_type_self && t_subtype == task_subtype_gpu_pack_f) { + if (ci_active_hydro) { + scheduler_activate(s, t); + ci->pack_done_f = 0; + ci->gpu_done_f = 0; + ci->unpack_done_f = 0; + } + } /* Store current values of dx_max and h_max. */ else if (t_type == task_type_sub_self && @@ -465,27 +465,27 @@ void engine_marktasks_mapper(void *map_data, int num_elements, const int cj_active_rt = cell_is_rt_active(cj, e); /* Activate packing for GPU A. 
Nasar */ -// if(t_subtype == task_subtype_gpu_pack && -// ((ci_active_hydro && ci_nodeID == nodeID) || -// (cj_active_hydro && cj_nodeID == nodeID))) { -// scheduler_activate(s, t); -// ci->gpu_done_pair = 0; -// cj->gpu_done_pair = 0; -// } -// else if (t_subtype == task_subtype_gpu_pack_g && -// ((ci_active_hydro && ci_nodeID == nodeID) || -// (cj_active_hydro && cj_nodeID == nodeID))) { -// scheduler_activate(s, t); -// ci->gpu_done_pair_g = 0; -// cj->gpu_done_pair_g = 0; -// } -// else if (t_subtype == task_subtype_gpu_pack_f && -// ((ci_active_hydro && ci_nodeID == nodeID) || -// (cj_active_hydro && cj_nodeID == nodeID))) { -// scheduler_activate(s, t); -// ci->gpu_done_pair_f = 0; -// cj->gpu_done_pair_f = 0; -// } + if(t_subtype == task_subtype_gpu_pack && + ((ci_active_hydro && ci_nodeID == nodeID) || + (cj_active_hydro && cj_nodeID == nodeID))) { + scheduler_activate(s, t); + ci->gpu_done_pair = 0; + cj->gpu_done_pair = 0; + } + else if (t_subtype == task_subtype_gpu_pack_g && + ((ci_active_hydro && ci_nodeID == nodeID) || + (cj_active_hydro && cj_nodeID == nodeID))) { + scheduler_activate(s, t); + ci->gpu_done_pair_g = 0; + cj->gpu_done_pair_g = 0; + } + else if (t_subtype == task_subtype_gpu_pack_f && + ((ci_active_hydro && ci_nodeID == nodeID) || + (cj_active_hydro && cj_nodeID == nodeID))) { + scheduler_activate(s, t); + ci->gpu_done_pair_f = 0; + cj->gpu_done_pair_f = 0; + } /* Only activate tasks that involve a local active cell. A. Nasar THIS COULD BE SOURCE OF BUG */ if ((t_subtype == task_subtype_density || diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index 51a49e3fba..2df38b122b 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -204,7 +204,8 @@ double runner_doself1_pack_f4(struct runner *r, struct scheduler *s, struct pack clock_gettime(CLOCK_REALTIME, &t1); /* Release the lock on the cell */ // task_unlock(t); -// cell_unlocktree(ci); + cell_unlocktree(ci); + signal_sleeping_runners(s, t); return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; @@ -316,6 +317,7 @@ double runner_doself1_pack_f4_g(struct runner *r, struct scheduler *s, struct pa /* Release the lock on the cell */ // task_unlock(t); cell_unlocktree(ci); + signal_sleeping_runners(s, t); return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; } @@ -425,6 +427,7 @@ double runner_doself1_pack_f4_f(struct runner *r, struct scheduler *s, struct pa /* Release the lock on the cell */ // task_unlock(t); cell_unlocktree(ci); + signal_sleeping_runners(s, t); return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; } @@ -585,6 +588,7 @@ double runner_dopair1_pack_f4(struct runner *r, struct scheduler *s, struct pack t->done = 1; /* Copies done. Release the lock ! */ task_unlock(t); + signal_sleeping_runners(s, t); pack_vars->tasks_packed++; pack_vars->launch = 0; pack_vars->launch_leftovers = 0; @@ -765,6 +769,7 @@ double runner_dopair1_pack_f4_g(struct runner *r, struct scheduler *s, struct pa t->done = 1; /* Copies done. Release the lock ! */ task_unlock(t); + signal_sleeping_runners(s, t); pack_vars->tasks_packed++; pack_vars->launch = 0; pack_vars->launch_leftovers = 0; @@ -945,6 +950,7 @@ double runner_dopair1_pack_f4_f(struct runner *r, struct scheduler *s, struct pa t->done = 1; /* Copies done. Release the lock ! 
*/ task_unlock(t); + signal_sleeping_runners(s, t); pack_vars->tasks_packed++; pack_vars->launch = 0; pack_vars->launch_leftovers = 0; @@ -1337,7 +1343,7 @@ void runner_doself1_launch_f4(struct runner *r, struct scheduler *s, struct pack /*schedule my dependencies (Only unpacks really)*/ enqueue_dependencies(s, tii); /*Signal sleeping runners*/ - signal_sleeping_runners(s, tii); +// signal_sleeping_runners(s, tii); tii->gpu_done = 1; @@ -1701,7 +1707,7 @@ void runner_doself1_launch_f4_g(struct runner *r, struct scheduler *s, struct pa /*schedule my dependencies (Only unpacks really)*/ enqueue_dependencies(s, tii); /*Signal sleeping runners*/ - signal_sleeping_runners(s, tii); +// signal_sleeping_runners(s, tii); tii->gpu_done = 1; @@ -2069,7 +2075,7 @@ void runner_doself1_launch_f4_f(struct runner *r, struct scheduler *s, struct pa /*schedule my dependencies (Only unpacks really)*/ enqueue_dependencies(s, tii); /*Signal sleeping runners*/ - signal_sleeping_runners(s, tii); +// signal_sleeping_runners(s, tii); tii->gpu_done = 1; } @@ -2642,7 +2648,7 @@ void runner_dopair1_launch_f4_one_memcpy(struct runner *r, struct scheduler *s, /*schedule my dependencies (Only unpacks really)*/ enqueue_dependencies(s, tii); /*Signal sleeping runners*/ - signal_sleeping_runners(s, tii); +// signal_sleeping_runners(s, tii); tii->gpu_done = 1; @@ -3293,7 +3299,7 @@ void runner_dopair1_launch_f4_g_one_memcpy(struct runner *r, struct scheduler *s /*schedule my dependencies (Only unpacks really)*/ enqueue_dependencies(s, tii); /*Signal sleeping runners*/ - signal_sleeping_runners(s, tii); +// signal_sleeping_runners(s, tii); tii->gpu_done = 1; @@ -3972,7 +3978,7 @@ void runner_dopair1_launch_f4_f_one_memcpy(struct runner *r, struct scheduler *s /*schedule my dependencies (Only unpacks really)*/ enqueue_dependencies(s, tii); /*Signal sleeping runners*/ - signal_sleeping_runners(s, tii); +// signal_sleeping_runners(s, tii); tii->gpu_done = 1; diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index caabc163dc..778790ac54 100755 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -19,9 +19,9 @@ * ******************************************************************************/ /* Config parameters. */ -#define GPUOFFLOAD 1 //off-load hydro to GPU -#define DO_CORNERS 1 //do corner pair tasks on CPU -#define DUMP_TIMINGS 1 +//#define GPUOFFLOAD 1 //off-load hydro to GPU +//#define DO_CORNERS 1 //do corner pair tasks on CPU +//#define DUMP_TIMINGS 1 #include "../config.h" /* MPI headers. 
*/ @@ -1861,7 +1861,6 @@ void *runner_main2(void *data) { density = 0; density_sub = 0; unpacked = 0; - message("reached end of runner_main2\n"); // if(step == 2)cudaProfilerStop(); // if(step == 2)exit(0); // size_t free_byte ; From 4d6fe3d5b3c97d6dc7a979108361ab307a0f2cbe Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Fri, 1 Nov 2024 11:39:17 +0000 Subject: [PATCH 030/217] Testing to see if code still hangs when making deps on pack tasks instead of unpack tasks --- src/engine_maketasks.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/engine_maketasks.c b/src/engine_maketasks.c index bec401e284..b16756d5c8 100644 --- a/src/engine_maketasks.c +++ b/src/engine_maketasks.c @@ -4906,7 +4906,7 @@ void engine_maketasks(struct engine *e) { continue; // if(t->ci->hydro.super == t->ci && t->ci->nodeID == e->nodeID) // scheduler_addunlock(sched, t->ci->hydro.d_unpack, t); - for (struct link *l = t->ci->hydro.density_unpack; l != NULL; l = l->next) { + for (struct link *l = t->ci->hydro.density_pack; l != NULL; l = l->next) { if(l->t->type == task_type_pair)scheduler_addunlock(sched, l->t, t); } } @@ -4993,7 +4993,7 @@ void engine_maketasks(struct engine *e) { continue; // if(t->ci->nodeID == e->nodeID) // scheduler_addunlock(sched, t->ci->hydro.g_unpack, t); - for (struct link *l = t->ci->hydro.gradient_unpack; l != NULL; l = l->next) { + for (struct link *l = t->ci->hydro.gradient_pack; l != NULL; l = l->next) { if(l->t->type == task_type_pair)scheduler_addunlock(sched, l->t, t); } } @@ -5065,7 +5065,7 @@ void engine_maketasks(struct engine *e) { continue; // if(t->ci->nodeID == e->nodeID) // scheduler_addunlock(sched, t->ci->hydro.f_unpack, t); - for (struct link *l = t->ci->hydro.force_unpack; l != NULL; l = l->next) { + for (struct link *l = t->ci->hydro.force_pack; l != NULL; l = l->next) { if(l->t->type == task_type_pair)scheduler_addunlock(sched, l->t, t); } } From 57d77a2e6d93cf8a30acee3289df46edbdcf8aa7 Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Fri, 1 Nov 2024 14:14:57 +0000 Subject: [PATCH 031/217] Added scheduler_done to runner_doiact_functions_hydro_gpu.h. Also commented out code for packing in runner_main_clean.cu so we only offload self density tasks for testing --- src/cell_unskip.c | 43 +- src/engine_maketasks.c | 6 +- src/runner_doiact_functions_hydro_gpu.h | 64 +-- src/runner_main_clean.cu | 500 ++++++++++++------------ 4 files changed, 313 insertions(+), 300 deletions(-) diff --git a/src/cell_unskip.c b/src/cell_unskip.c index f5b910d79d..071f3c212e 100644 --- a/src/cell_unskip.c +++ b/src/cell_unskip.c @@ -1657,7 +1657,6 @@ int cell_unskip_hydro_tasks(struct cell *c, struct scheduler *s) { if ((ci_active && ci_nodeID == nodeID) || (cj_active && cj_nodeID == nodeID)) { scheduler_activate(s, t); - /* Activate hydro drift */ if (t->type == task_type_self) { if (ci_nodeID == nodeID) cell_activate_drift_part(ci, s); @@ -1903,10 +1902,28 @@ int cell_unskip_hydro_tasks(struct cell *c, struct scheduler *s) { #endif } } - /* Unskip all the other task types. */ int c_active = cell_is_active_hydro(c, e); if (c->nodeID == nodeID && c_active) { for (struct link *l = c->hydro.density_pack; l != NULL; l = l->next) { /* A.
Nasar */ + scheduler_activate(s, l->t); +// message("activating pair pack\n"); + if (l->t->ci != NULL){ + l->t->ci->pack_done = 0; + l->t->ci->gpu_done = 0; + l->t->ci->unpack_done = 0; + } + if (l->t->cj != NULL){ + l->t->cj->pack_done = 0; + l->t->cj->gpu_done = 0; + l->t->cj->unpack_done = 0; + } + } + for (struct link *l = c->hydro.density_unpack; l != NULL; l = l->next) { + scheduler_activate(s, l->t); +// message("activating pair UN-pack\n"); + l->t->gpu_done = 0; + } for (struct link *l = c->hydro.gradient; l != NULL; l = l->next) { scheduler_activate(s, l->t); } @@ -1915,25 +1932,10 @@ int cell_unskip_hydro_tasks(struct cell *c, struct scheduler *s) { } for (struct link *l = c->hydro.limiter; l != NULL; l = l->next) scheduler_activate(s, l->t); - for (struct link *l = c->hydro.density_pack; l != NULL; l = l->next) { /* A. Nasar */ - scheduler_activate(s, l->t); - if (l->t->ci != NULL){ - l->t->ci->pack_done = 0; - l->t->ci->gpu_done = 0; - l->t->ci->unpack_done = 0; - } - if (l->t->cj != NULL){ - l->t->cj->pack_done = 0; - l->t->cj->gpu_done = 0; - l->t->cj->unpack_done = 0; - } - } - for (struct link *l = c->hydro.density_unpack; l != NULL; l = l->next) { - scheduler_activate(s, l->t); - l->t->gpu_done = 0; - } + // A. Nasar activate force and gradient packing tasks for (struct link *l = c->hydro.force_pack; l != NULL; l = l->next) { scheduler_activate(s, l->t); +// message("activating pair pack force\n"); if (l->t->ci != NULL){ l->t->ci->pack_done_f = 0; l->t->ci->gpu_done_f = 0; @@ -1947,12 +1949,14 @@ int cell_unskip_hydro_tasks(struct cell *c, struct scheduler *s) { } for (struct link *l = c->hydro.force_unpack; l != NULL; l = l->next) { scheduler_activate(s, l->t); +// message("activating pair UN-pack force\n"); l->t->gpu_done = 0; } #ifdef EXTRA_HYDRO_LOOP for (struct link *l = c->hydro.gradient_pack; l != NULL; l = l->next) { scheduler_activate(s, l->t); +// message("activating pair pack gradient\n"); if (l->t->ci != NULL){ l->t->ci->pack_done_g = 0; l->t->ci->gpu_done_g = 0; @@ -1966,6 +1970,7 @@ int cell_unskip_hydro_tasks(struct cell *c, struct scheduler *s) { } for (struct link *l = c->hydro.gradient_unpack; l != NULL; l = l->next) { scheduler_activate(s, l->t); +// message("activating pair UN-pack gradient\n"); l->t->gpu_done = 0; } #endif diff --git a/src/engine_maketasks.c b/src/engine_maketasks.c index b16756d5c8..bec401e284 100644 --- a/src/engine_maketasks.c +++ b/src/engine_maketasks.c @@ -4906,7 +4906,7 @@ void engine_maketasks(struct engine *e) { continue; // if(t->ci->hydro.super == t->ci && t->ci->nodeID == e->nodeID) // scheduler_addunlock(sched, t->ci->hydro.d_unpack, t); - for (struct link *l = t->ci->hydro.density_pack; l != NULL; l = l->next) { + for (struct link *l = t->ci->hydro.density_unpack; l != NULL; l = l->next) { if(l->t->type == task_type_pair)scheduler_addunlock(sched, l->t, t); } } @@ -4993,7 +4993,7 @@ void engine_maketasks(struct engine *e) { continue; // if(t->ci->nodeID == e->nodeID) // scheduler_addunlock(sched, t->ci->hydro.g_unpack, t); - for (struct link *l = t->ci->hydro.gradient_pack; l != NULL; l = l->next) { + for (struct link *l = t->ci->hydro.gradient_unpack; l != NULL; l = l->next) { if(l->t->type == task_type_pair)scheduler_addunlock(sched, l->t, t); } } @@ -5065,7 +5065,7 @@ void engine_maketasks(struct engine *e) { continue; // if(t->ci->nodeID == e->nodeID) // scheduler_addunlock(sched, t->ci->hydro.f_unpack, t); - for (struct link *l = t->ci->hydro.force_pack; l != NULL; l = l->next) { + for (struct link *l = 
t->ci->hydro.force_unpack; l != NULL; l = l->next) { if(l->t->type == task_type_pair)scheduler_addunlock(sched, l->t, t); } } diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index 2df38b122b..b090637308 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -205,7 +205,7 @@ double runner_doself1_pack_f4(struct runner *r, struct scheduler *s, struct pack /* Release the lock on the cell */ // task_unlock(t); cell_unlocktree(ci); - signal_sleeping_runners(s, t); +// signal_sleeping_runners(s, t); return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; @@ -317,7 +317,7 @@ double runner_doself1_pack_f4_g(struct runner *r, struct scheduler *s, struct pa /* Release the lock on the cell */ // task_unlock(t); cell_unlocktree(ci); - signal_sleeping_runners(s, t); +// signal_sleeping_runners(s, t); return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; } @@ -427,7 +427,7 @@ double runner_doself1_pack_f4_f(struct runner *r, struct scheduler *s, struct pa /* Release the lock on the cell */ // task_unlock(t); cell_unlocktree(ci); - signal_sleeping_runners(s, t); +// signal_sleeping_runners(s, t); return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; } @@ -588,7 +588,7 @@ double runner_dopair1_pack_f4(struct runner *r, struct scheduler *s, struct pack t->done = 1; /* Copies done. Release the lock ! */ task_unlock(t); - signal_sleeping_runners(s, t); +// signal_sleeping_runners(s, t); pack_vars->tasks_packed++; pack_vars->launch = 0; pack_vars->launch_leftovers = 0; @@ -769,7 +769,7 @@ double runner_dopair1_pack_f4_g(struct runner *r, struct scheduler *s, struct pa t->done = 1; /* Copies done. Release the lock ! */ task_unlock(t); - signal_sleeping_runners(s, t); +// signal_sleeping_runners(s, t); pack_vars->tasks_packed++; pack_vars->launch = 0; pack_vars->launch_leftovers = 0; @@ -950,7 +950,7 @@ double runner_dopair1_pack_f4_f(struct runner *r, struct scheduler *s, struct pa t->done = 1; /* Copies done. Release the lock ! 
*/ task_unlock(t); - signal_sleeping_runners(s, t); +// signal_sleeping_runners(s, t); pack_vars->tasks_packed++; pack_vars->launch = 0; pack_vars->launch_leftovers = 0; @@ -1337,11 +1337,12 @@ void runner_doself1_launch_f4(struct runner *r, struct scheduler *s, struct pack *unpack_time += (tp1.tv_sec - tp0.tv_sec) + (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; + scheduler_done(s, tii); /* Release the lock */ - cell_unlocktree(cii); +// cell_unlocktree(cii); /*schedule my dependencies (Only unpacks really)*/ - enqueue_dependencies(s, tii); +// enqueue_dependencies(s, tii); /*Signal sleeping runners*/ // signal_sleeping_runners(s, tii); @@ -1700,12 +1701,13 @@ void runner_doself1_launch_f4_g(struct runner *r, struct scheduler *s, struct pa /* Record things for debugging */ cii->gpu_done_g++; + scheduler_done(s, tii); /* Release the lock */ - cell_unlocktree(cii); +// cell_unlocktree(cii); /*schedule my dependencies (Only unpacks really)*/ - enqueue_dependencies(s, tii); +// enqueue_dependencies(s, tii); /*Signal sleeping runners*/ // signal_sleeping_runners(s, tii); @@ -2069,11 +2071,13 @@ void runner_doself1_launch_f4_f(struct runner *r, struct scheduler *s, struct pa clock_gettime(CLOCK_REALTIME, &tp1); *unpack_time += (tp1.tv_sec - tp0.tv_sec) + (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; + + scheduler_done(s, tii); /* Release the lock */ - cell_unlocktree(cii); +// cell_unlocktree(cii); /*schedule my dependencies (Only unpacks really)*/ - enqueue_dependencies(s, tii); +// enqueue_dependencies(s, tii); /*Signal sleeping runners*/ // signal_sleeping_runners(s, tii); @@ -2635,10 +2639,11 @@ void runner_dopair1_launch_f4_one_memcpy(struct runner *r, struct scheduler *s, cii->gpu_done_pair++; cjj->gpu_done_pair++; - /* Release the locks */ - cell_unlocktree(cii); - /* Release the locks */ - cell_unlocktree(cjj); + scheduler_done(s, tii); +// /* Release the locks */ +// cell_unlocktree(cii); +// /* Release the locks */ +// cell_unlocktree(cjj); /*Time end of unpacking*/ clock_gettime(CLOCK_REALTIME, &tp1); @@ -2646,7 +2651,7 @@ void runner_dopair1_launch_f4_one_memcpy(struct runner *r, struct scheduler *s, (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; /*schedule my dependencies (Only unpacks really)*/ - enqueue_dependencies(s, tii); +// enqueue_dependencies(s, tii); /*Signal sleeping runners*/ // signal_sleeping_runners(s, tii); @@ -3286,10 +3291,11 @@ void runner_dopair1_launch_f4_g_one_memcpy(struct runner *r, struct scheduler *s cii->gpu_done_pair_g++; cjj->gpu_done_pair_g++; - /* Release the locks */ - cell_unlocktree(cii); - /* Release the locks */ - cell_unlocktree(cjj); + scheduler_done(s, tii); +// /* Release the locks */ +// cell_unlocktree(cii); +// /* Release the locks */ +// cell_unlocktree(cjj); /*Time end of unpacking*/ clock_gettime(CLOCK_REALTIME, &tp1); @@ -3297,7 +3303,7 @@ void runner_dopair1_launch_f4_g_one_memcpy(struct runner *r, struct scheduler *s (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; /*schedule my dependencies (Only unpacks really)*/ - enqueue_dependencies(s, tii); +// enqueue_dependencies(s, tii); /*Signal sleeping runners*/ // signal_sleeping_runners(s, tii); @@ -3965,10 +3971,12 @@ void runner_dopair1_launch_f4_f_one_memcpy(struct runner *r, struct scheduler *s cii->gpu_done_pair_f++; cjj->gpu_done_pair_f++; - /* Release the locks */ - cell_unlocktree(cii); - /* Release the locks */ - cell_unlocktree(cjj); + scheduler_done(s, tii); + +// /* Release the locks */ +// cell_unlocktree(cii); +// /* Release the locks */ +// cell_unlocktree(cjj); /*Time end of 
unpacking*/ clock_gettime(CLOCK_REALTIME, &tp1); @@ -3976,8 +3984,8 @@ void runner_dopair1_launch_f4_f_one_memcpy(struct runner *r, struct scheduler *s (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; /*schedule my dependencies (Only unpacks really)*/ - enqueue_dependencies(s, tii); - /*Signal sleeping runners*/ +// enqueue_dependencies(s, tii); +// /*Signal sleeping runners*/ // signal_sleeping_runners(s, tii); tii->gpu_done = 1; diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 778790ac54..1adcb8e5f1 100755 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -19,7 +19,7 @@ * ******************************************************************************/ /* Config parameters. */ -//#define GPUOFFLOAD 1 //off-load hydro to GPU +#define GPUOFFLOAD 1 //off-load hydro to GPU //#define DO_CORNERS 1 //do corner pair tasks on CPU //#define DUMP_TIMINGS 1 #include "../config.h" @@ -991,60 +991,60 @@ void *runner_main2(void *data) { else if (t->subtype == task_subtype_gpu_pack_g){ packed_self_g++; #ifdef GPUOFFLOAD -// runner_doself1_pack_g(r, sched, pack_vars_self_grad, ci, -// t, parts_aos_grad, &packing_time_g); - packing_time_g += runner_doself1_pack_f4_g(r, sched, pack_vars_self_grad, ci, - t, parts_aos_grad_f4_send, task_first_part_f4_g); - /* No pack tasks left in queue, flag that we want to run */ - int launch_leftovers = pack_vars_self_grad->launch_leftovers; - /*Packed enough tasks let's go*/ - int launch = pack_vars_self_grad->launch; - /* Do we have enough stuff to run the GPU ? */ - if (launch || launch_leftovers) { - /*Launch GPU tasks*/ -// runner_doself1_launch_g(r, sched, pack_vars_self_grad, ci, t, parts_aos_grad, -// d_parts_aos_grad, stream, d_a, d_H, e, &packing_time_g, &time_for_gpu_g); - runner_doself1_launch_f4_g(r, sched, pack_vars_self_grad, ci, - t, parts_aos_grad_f4_send, parts_aos_grad_f4_recv, d_parts_aos_grad_f4_send, - d_parts_aos_grad_f4_recv, stream, d_a, d_H, - e, &packing_time_g, &time_for_gpu_g, task_first_part_f4_g, d_task_first_part_f4_g, - self_end_g, &unpack_time_self_g); - } /*End of GPU work Self*/ +//// runner_doself1_pack_g(r, sched, pack_vars_self_grad, ci, +//// t, parts_aos_grad, &packing_time_g); +// packing_time_g += runner_doself1_pack_f4_g(r, sched, pack_vars_self_grad, ci, +// t, parts_aos_grad_f4_send, task_first_part_f4_g); +// /* No pack tasks left in queue, flag that we want to run */ +// int launch_leftovers = pack_vars_self_grad->launch_leftovers; +// /*Packed enough tasks let's go*/ +// int launch = pack_vars_self_grad->launch; +// /* Do we have enough stuff to run the GPU ? 
*/ +// if (launch || launch_leftovers) { +// /*Launch GPU tasks*/ +//// runner_doself1_launch_g(r, sched, pack_vars_self_grad, ci, t, parts_aos_grad, +//// d_parts_aos_grad, stream, d_a, d_H, e, &packing_time_g, &time_for_gpu_g); +// runner_doself1_launch_f4_g(r, sched, pack_vars_self_grad, ci, +// t, parts_aos_grad_f4_send, parts_aos_grad_f4_recv, d_parts_aos_grad_f4_send, +// d_parts_aos_grad_f4_recv, stream, d_a, d_H, +// e, &packing_time_g, &time_for_gpu_g, task_first_part_f4_g, d_task_first_part_f4_g, +// self_end_g, &unpack_time_self_g); +// } /*End of GPU work Self*/ #endif //GPUGRADSELF } else if (t->subtype == task_subtype_gpu_pack_f){ packed_self_f++; #ifdef GPUOFFLOAD -// runner_doself1_pack_f(r, sched, pack_vars_self_forc, ci, -// t, parts_aos_forc, &packing_time_f); - packing_time_f += runner_doself1_pack_f4_f(r, sched, pack_vars_self_forc, ci, - t, parts_aos_forc_f4_send, task_first_part_f4_f); -// int count = ci->hydro.count; -// for(int i = 0; i < count; i++){ -// int pid = pack_vars_self_forc->count_parts - count + i; -// if(parts_aos_forc_f4_send[pid].ux_m.w < 1e-9)fprintf(stderr, "zero mass after packing %i %f\n", pid, parts_aos_forc_f4_send[pid].ux_m.w); -// } - /* No pack tasks left in queue, flag that we want to run */ - int launch_leftovers = pack_vars_self_forc->launch_leftovers; - /*Packed enough tasks let's go*/ - int launch = pack_vars_self_forc->launch; - /* Do we have enough stuff to run the GPU ? */ - if (launch || launch_leftovers) { - /*Launch GPU tasks*/ -// runner_doself1_launch_f(r, sched, pack_vars_self_forc, ci, t, parts_aos_forc, -// d_parts_aos_forc, stream, d_a, d_H, e, &packing_time_f, &time_for_gpu_f); - runner_doself1_launch_f4_f(r, sched, pack_vars_self_forc, ci, - t, parts_aos_forc_f4_send, parts_aos_forc_f4_recv, d_parts_aos_forc_f4_send, - d_parts_aos_forc_f4_recv, stream, d_a, d_H, - e, &packing_time_f, &time_for_gpu_f, task_first_part_f4_f, d_task_first_part_f4_f, - self_end_f, &unpack_time_self_f); - } /*End of GPU work Self*/ +//// runner_doself1_pack_f(r, sched, pack_vars_self_forc, ci, +//// t, parts_aos_forc, &packing_time_f); +// packing_time_f += runner_doself1_pack_f4_f(r, sched, pack_vars_self_forc, ci, +// t, parts_aos_forc_f4_send, task_first_part_f4_f); +//// int count = ci->hydro.count; +//// for(int i = 0; i < count; i++){ +//// int pid = pack_vars_self_forc->count_parts - count + i; +//// if(parts_aos_forc_f4_send[pid].ux_m.w < 1e-9)fprintf(stderr, "zero mass after packing %i %f\n", pid, parts_aos_forc_f4_send[pid].ux_m.w); +//// } +// /* No pack tasks left in queue, flag that we want to run */ +// int launch_leftovers = pack_vars_self_forc->launch_leftovers; +// /*Packed enough tasks let's go*/ +// int launch = pack_vars_self_forc->launch; +// /* Do we have enough stuff to run the GPU ? 
*/ +// if (launch || launch_leftovers) { +// /*Launch GPU tasks*/ +//// runner_doself1_launch_f(r, sched, pack_vars_self_forc, ci, t, parts_aos_forc, +//// d_parts_aos_forc, stream, d_a, d_H, e, &packing_time_f, &time_for_gpu_f); +// runner_doself1_launch_f4_f(r, sched, pack_vars_self_forc, ci, +// t, parts_aos_forc_f4_send, parts_aos_forc_f4_recv, d_parts_aos_forc_f4_send, +// d_parts_aos_forc_f4_recv, stream, d_a, d_H, +// e, &packing_time_f, &time_for_gpu_f, task_first_part_f4_f, d_task_first_part_f4_f, +// self_end_f, &unpack_time_self_f); +// } /*End of GPU work Self*/ #endif //GPUFORCSELF } #ifdef EXTRA_HYDRO_LOOP else if (t->subtype == task_subtype_gradient) { cpu_self_g++; -#ifndef GPUOFFLOAD +//#ifndef GPUOFFLOAD struct timespec t0, t1, dt; clock_gettime(CLOCK_REALTIME, &t0); runner_doself1_branch_gradient(r, ci); @@ -1054,12 +1054,12 @@ void *runner_main2(void *data) { (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; -#endif //GPUGRADSELF +//#endif //GPUGRADSELF } #endif else if (t->subtype == task_subtype_force) { cpu_self_f++; -#ifndef GPUOFFLOAD +//#ifndef GPUOFFLOAD struct timespec t0, t1; clock_gettime(CLOCK_REALTIME, &t0); runner_doself2_branch_force(r, ci); @@ -1069,7 +1069,7 @@ void *runner_main2(void *data) { (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; -#endif //GPUFORCSELF +//#endif //GPUFORCSELF } else if (t->subtype == task_subtype_limiter) runner_doself1_branch_limiter(r, ci); else if (t->subtype == task_subtype_grav) @@ -1115,7 +1115,7 @@ void *runner_main2(void *data) { if (t->subtype == task_subtype_density) { /* Abouzied: To be commented out when the GPU pairs have been coded up */ cpu_pair++; -#ifndef GPUOFFLOAD +//#ifndef GPUOFFLOAD struct timespec t0, t1, dt; clock_gettime(CLOCK_REALTIME, &t0); runner_dopair1_branch_density(r, ci, cj); @@ -1125,215 +1125,215 @@ void *runner_main2(void *data) { (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; -#endif +//#endif } /* GPU WORK */ else if (t->subtype == task_subtype_gpu_pack) { packed_pair++; #ifdef GPUOFFLOAD -#ifdef DO_CORNERS - struct timespec t0, t1, dt; - clock_gettime(CLOCK_REALTIME, &t0); - double shift[3] = {0.0}; - t->corner_pair = 0; - int sid = space_getsid_filter(e->s, &ci, &cj, shift); - clock_gettime(CLOCK_REALTIME, &t1); - packing_time_pair += - (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / - 1000000000.0; - if((sid == 0 || sid == 2 || sid == 6 || sid == 8) && step > 1){ -// if((sid != 4 && sid != 10 && sid == 12) && step > 1){ - clock_gettime(CLOCK_REALTIME, &t0); - runner_dopair1_branch_density(r, ci, cj); - t->corner_pair = 1; - int qid = r->qid; - atomic_dec(&(sched->queues[qid].n_packs_pair_left)); - /* Tell the cells they have been packed */ - ci->pack_done++; - cj->pack_done++; - t->done = 1; - int launch = 0, launch_leftovers = 0; - if ((sched->queues[qid].n_packs_pair_left == 0)) - launch_leftovers = 1; - /* Tasks done. Release the lock ! 
*/ - task_unlock(t); - /*schedule my dependencies (Only unpacks really)*/ - enqueue_dependencies(sched, t); - /*Signal sleeping runners*/ - signal_sleeping_runners(sched, t); - clock_gettime(CLOCK_REALTIME, &t1); - packing_time_pair += - (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / - 1000000000.0; - if (launch_leftovers) { - pack_vars_pair_dens->launch_leftovers = 1; - runner_dopair1_launch_f4_one_memcpy(r, sched, pack_vars_pair_dens, t, parts_aos_pair_f4_send, parts_aos_pair_f4_recv, - d_parts_aos_pair_f4_send, d_parts_aos_pair_f4_recv, stream_pairs, d_a, d_H, e, &packing_time_pair, &time_for_density_gpu_pair, - &unpacking_time_pair, fparti_fpartj_lparti_lpartj_dens, pair_end); - } - } - else{ -#endif //DO_CORNERS - packing_time_pair += runner_dopair1_pack_f4(r, sched, pack_vars_pair_dens, ci, - cj, t, parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens); - /* Packed enough tasks or no pack tasks left in queue, flag that we want to run */ - int launch = pack_vars_pair_dens->launch; - int launch_leftovers = pack_vars_pair_dens->launch_leftovers; - /* Do we have enough stuff to run the GPU ? */ - if(launch)n_full_p_d_bundles++; - if(launch_leftovers)n_partial_p_d_bundles++; - if(launch || launch_leftovers) { - /*Launch GPU tasks*/ -// runner_dopair1_launch(r, sched, pack_vars_pair_dens, ci, t, parts_aos_pair_dens, -// d_parts_aos_pair_dens, stream, d_a, d_H, e, &packing_time_pair, &time_for_density_gpu_pair); - runner_dopair1_launch_f4_one_memcpy(r, sched, pack_vars_pair_dens, t, parts_aos_pair_f4_send, parts_aos_pair_f4_recv, - d_parts_aos_pair_f4_send, d_parts_aos_pair_f4_recv, stream_pairs, d_a, d_H, e, &packing_time_pair, &time_for_density_gpu_pair, - &unpacking_time_pair, fparti_fpartj_lparti_lpartj_dens, pair_end); - } -#ifdef DO_CORNERS - } /* End of GPU work Pairs */ -#endif //DO_CORNERS +//#ifdef DO_CORNERS +// struct timespec t0, t1, dt; +// clock_gettime(CLOCK_REALTIME, &t0); +// double shift[3] = {0.0}; +// t->corner_pair = 0; +// int sid = space_getsid_filter(e->s, &ci, &cj, shift); +// clock_gettime(CLOCK_REALTIME, &t1); +// packing_time_pair += +// (t1.tv_sec - t0.tv_sec) + +// (t1.tv_nsec - t0.tv_nsec) / +// 1000000000.0; +// if((sid == 0 || sid == 2 || sid == 6 || sid == 8) && step > 1){ +//// if((sid != 4 && sid != 10 && sid == 12) && step > 1){ +// clock_gettime(CLOCK_REALTIME, &t0); +// runner_dopair1_branch_density(r, ci, cj); +// t->corner_pair = 1; +// int qid = r->qid; +// atomic_dec(&(sched->queues[qid].n_packs_pair_left)); +// /* Tell the cells they have been packed */ +// ci->pack_done++; +// cj->pack_done++; +// t->done = 1; +// int launch = 0, launch_leftovers = 0; +// if ((sched->queues[qid].n_packs_pair_left == 0)) +// launch_leftovers = 1; +// /* Tasks done. Release the lock ! 
*/ +// task_unlock(t); +// /*schedule my dependencies (Only unpacks really)*/ +// enqueue_dependencies(sched, t); +// /*Signal sleeping runners*/ +// signal_sleeping_runners(sched, t); +// clock_gettime(CLOCK_REALTIME, &t1); +// packing_time_pair += +// (t1.tv_sec - t0.tv_sec) + +// (t1.tv_nsec - t0.tv_nsec) / +// 1000000000.0; +// if (launch_leftovers) { +// pack_vars_pair_dens->launch_leftovers = 1; +// runner_dopair1_launch_f4_one_memcpy(r, sched, pack_vars_pair_dens, t, parts_aos_pair_f4_send, parts_aos_pair_f4_recv, +// d_parts_aos_pair_f4_send, d_parts_aos_pair_f4_recv, stream_pairs, d_a, d_H, e, &packing_time_pair, &time_for_density_gpu_pair, +// &unpacking_time_pair, fparti_fpartj_lparti_lpartj_dens, pair_end); +// } +// } +// else{ +//#endif //DO_CORNERS +// packing_time_pair += runner_dopair1_pack_f4(r, sched, pack_vars_pair_dens, ci, +// cj, t, parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens); +// /* Packed enough tasks or no pack tasks left in queue, flag that we want to run */ +// int launch = pack_vars_pair_dens->launch; +// int launch_leftovers = pack_vars_pair_dens->launch_leftovers; +// /* Do we have enough stuff to run the GPU ? */ +// if(launch)n_full_p_d_bundles++; +// if(launch_leftovers)n_partial_p_d_bundles++; +// if(launch || launch_leftovers) { +// /*Launch GPU tasks*/ +//// runner_dopair1_launch(r, sched, pack_vars_pair_dens, ci, t, parts_aos_pair_dens, +//// d_parts_aos_pair_dens, stream, d_a, d_H, e, &packing_time_pair, &time_for_density_gpu_pair); +// runner_dopair1_launch_f4_one_memcpy(r, sched, pack_vars_pair_dens, t, parts_aos_pair_f4_send, parts_aos_pair_f4_recv, +// d_parts_aos_pair_f4_send, d_parts_aos_pair_f4_recv, stream_pairs, d_a, d_H, e, &packing_time_pair, &time_for_density_gpu_pair, +// &unpacking_time_pair, fparti_fpartj_lparti_lpartj_dens, pair_end); +// } +//#ifdef DO_CORNERS +// } /* End of GPU work Pairs */ +//#endif //DO_CORNERS #endif //GPUDENS } /* pair / pack */ else if (t->subtype == task_subtype_gpu_pack_g){ packed_pair_g++; #ifdef GPUOFFLOAD -#ifdef DO_CORNERS - struct timespec t0, t1, dt; - clock_gettime(CLOCK_REALTIME, &t0); - double shift[3] = {0.0}; - t->corner_pair = 0; - int sid = space_getsid_filter(e->s, &ci, &cj, shift); - clock_gettime(CLOCK_REALTIME, &t1); - packing_time_pair += - (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / - 1000000000.0; - if((sid == 0 || sid == 2 || sid == 6 || sid == 8) && step > 1){ -// if((sid != 4 && sid != 10 && sid == 12) && step > 1){ - clock_gettime(CLOCK_REALTIME, &t0); - runner_dopair1_branch_gradient(r, ci, cj); - t->corner_pair = 1; - int qid = r->qid; - atomic_dec(&(sched->queues[qid].n_packs_pair_left_g)); - /* Tell the cells they have been packed */ - ci->pack_done++; - cj->pack_done++; - t->done = 1; - int launch = 0, launch_leftovers = 0; - if ((sched->queues[qid].n_packs_pair_left_g == 0)) - launch_leftovers = 1; - /* Tasks done. Release the lock ! 
*/ - task_unlock(t); - /*schedule my dependencies (Only unpacks really)*/ - enqueue_dependencies(sched, t); - /*Signal sleeping runners*/ - signal_sleeping_runners(sched, t); - clock_gettime(CLOCK_REALTIME, &t1); - packing_time_pair_g += - (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / - 1000000000.0; - if (launch_leftovers) { - pack_vars_pair_grad->launch_leftovers = 1; - runner_dopair1_launch_f4_g_one_memcpy(r, sched, pack_vars_pair_grad, t, parts_aos_pair_f4_g_send, parts_aos_pair_f4_g_recv, - d_parts_aos_pair_f4_g_send, d_parts_aos_pair_f4_g_recv, stream_pairs, d_a, d_H, e, &packing_time_pair_g, &time_for_gpu_pair_g, - &unpacking_time_pair_g, fparti_fpartj_lparti_lpartj_grad, pair_end_g); - } - } - else{ -#endif //DO_CORNERS -// runner_dopair1_pack_g(r, sched, pack_vars_pair_grad, ci, -// cj, t, parts_aos_pair_grad, e, &packing_time_g); - packing_time_pair_g += runner_dopair1_pack_f4_g(r, sched, pack_vars_pair_grad, ci, - cj, t, parts_aos_pair_f4_g_send, e, fparti_fpartj_lparti_lpartj_grad); - /* No pack tasks left in queue, flag that we want to run */ - int launch_leftovers = pack_vars_pair_grad->launch_leftovers; - /*Packed enough tasks let's go*/ - int launch = pack_vars_pair_grad->launch; - /* Do we have enough stuff to run the GPU ? */ - if (launch || launch_leftovers) { - /*Launch GPU tasks*/ -// runner_dopair1_launch_g(r, sched, pack_vars_pair_grad, ci, t, parts_aos_pair_grad, -// d_parts_aos_pair_grad, stream, d_a, d_H, e, &packing_time_pair_g, &time_for_gpu_pair_g); - runner_dopair1_launch_f4_g_one_memcpy(r, sched, pack_vars_pair_grad, t, parts_aos_pair_f4_g_send, parts_aos_pair_f4_g_recv, - d_parts_aos_pair_f4_g_send, d_parts_aos_pair_f4_g_recv, stream_pairs, d_a, d_H, e, &packing_time_pair_g, &time_for_gpu_pair_g, - &unpacking_time_pair_g, fparti_fpartj_lparti_lpartj_grad, pair_end_g); - } -#ifdef DO_CORNERS - }/* End of GPU work Pairs */ -#endif //DO_CORNERS +//#ifdef DO_CORNERS +// struct timespec t0, t1, dt; +// clock_gettime(CLOCK_REALTIME, &t0); +// double shift[3] = {0.0}; +// t->corner_pair = 0; +// int sid = space_getsid_filter(e->s, &ci, &cj, shift); +// clock_gettime(CLOCK_REALTIME, &t1); +// packing_time_pair += +// (t1.tv_sec - t0.tv_sec) + +// (t1.tv_nsec - t0.tv_nsec) / +// 1000000000.0; +// if((sid == 0 || sid == 2 || sid == 6 || sid == 8) && step > 1){ +//// if((sid != 4 && sid != 10 && sid == 12) && step > 1){ +// clock_gettime(CLOCK_REALTIME, &t0); +// runner_dopair1_branch_gradient(r, ci, cj); +// t->corner_pair = 1; +// int qid = r->qid; +// atomic_dec(&(sched->queues[qid].n_packs_pair_left_g)); +// /* Tell the cells they have been packed */ +// ci->pack_done++; +// cj->pack_done++; +// t->done = 1; +// int launch = 0, launch_leftovers = 0; +// if ((sched->queues[qid].n_packs_pair_left_g == 0)) +// launch_leftovers = 1; +// /* Tasks done. Release the lock ! 
*/ +// task_unlock(t); +// /*schedule my dependencies (Only unpacks really)*/ +// enqueue_dependencies(sched, t); +// /*Signal sleeping runners*/ +// signal_sleeping_runners(sched, t); +// clock_gettime(CLOCK_REALTIME, &t1); +// packing_time_pair_g += +// (t1.tv_sec - t0.tv_sec) + +// (t1.tv_nsec - t0.tv_nsec) / +// 1000000000.0; +// if (launch_leftovers) { +// pack_vars_pair_grad->launch_leftovers = 1; +// runner_dopair1_launch_f4_g_one_memcpy(r, sched, pack_vars_pair_grad, t, parts_aos_pair_f4_g_send, parts_aos_pair_f4_g_recv, +// d_parts_aos_pair_f4_g_send, d_parts_aos_pair_f4_g_recv, stream_pairs, d_a, d_H, e, &packing_time_pair_g, &time_for_gpu_pair_g, +// &unpacking_time_pair_g, fparti_fpartj_lparti_lpartj_grad, pair_end_g); +// } +// } +// else{ +//#endif //DO_CORNERS +//// runner_dopair1_pack_g(r, sched, pack_vars_pair_grad, ci, +//// cj, t, parts_aos_pair_grad, e, &packing_time_g); +// packing_time_pair_g += runner_dopair1_pack_f4_g(r, sched, pack_vars_pair_grad, ci, +// cj, t, parts_aos_pair_f4_g_send, e, fparti_fpartj_lparti_lpartj_grad); +// /* No pack tasks left in queue, flag that we want to run */ +// int launch_leftovers = pack_vars_pair_grad->launch_leftovers; +// /*Packed enough tasks let's go*/ +// int launch = pack_vars_pair_grad->launch; +// /* Do we have enough stuff to run the GPU ? */ +// if (launch || launch_leftovers) { +// /*Launch GPU tasks*/ +//// runner_dopair1_launch_g(r, sched, pack_vars_pair_grad, ci, t, parts_aos_pair_grad, +//// d_parts_aos_pair_grad, stream, d_a, d_H, e, &packing_time_pair_g, &time_for_gpu_pair_g); +// runner_dopair1_launch_f4_g_one_memcpy(r, sched, pack_vars_pair_grad, t, parts_aos_pair_f4_g_send, parts_aos_pair_f4_g_recv, +// d_parts_aos_pair_f4_g_send, d_parts_aos_pair_f4_g_recv, stream_pairs, d_a, d_H, e, &packing_time_pair_g, &time_for_gpu_pair_g, +// &unpacking_time_pair_g, fparti_fpartj_lparti_lpartj_grad, pair_end_g); +// } +//#ifdef DO_CORNERS +// }/* End of GPU work Pairs */ +//#endif //DO_CORNERS #endif //GPUGRADPAIR } else if (t->subtype == task_subtype_gpu_pack_f){ packed_pair_f++; #ifdef GPUOFFLOAD -#ifdef DO_CORNERS - struct timespec t0, t1, dt; - clock_gettime(CLOCK_REALTIME, &t0); - double shift[3] = {0.0}; - t->corner_pair = 0; - int sid = space_getsid_filter(e->s, &ci, &cj, shift); - clock_gettime(CLOCK_REALTIME, &t1); - packing_time_pair += - (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / - 1000000000.0; - if((sid == 0 || sid == 2 || sid == 6 || sid == 8) && step > 1){ -// if((sid != 4 && sid != 10 && sid == 12) && step > 1){ - runner_dopair1_branch_force(r, ci, cj); - t->corner_pair = 1; - int qid = r->qid; - atomic_dec(&(sched->queues[qid].n_packs_pair_left_f)); - /* Tell the cells they have been packed */ - ci->pack_done++; - cj->pack_done++; - t->done = 1; - int launch = 0, launch_leftovers = 0; - if ((sched->queues[qid].n_packs_pair_left_f == 0)) - launch_leftovers = 1; - /* Tasks done. Release the lock ! 
*/ - task_unlock(t); - /*schedule my dependencies (Only unpacks really)*/ - enqueue_dependencies(sched, t); - /*Signal sleeping runners*/ - signal_sleeping_runners(sched, t); - clock_gettime(CLOCK_REALTIME, &t1); - packing_time_pair_f += - (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / - 1000000000.0; - if (launch_leftovers) { - pack_vars_pair_forc->launch_leftovers = 1; - runner_dopair1_launch_f4_f_one_memcpy(r, sched, pack_vars_pair_forc, t, parts_aos_pair_f4_f_send, parts_aos_pair_f4_f_recv, - d_parts_aos_pair_f4_f_send, d_parts_aos_pair_f4_f_recv, stream_pairs, d_a, d_H, e, &packing_time_pair_f, &time_for_gpu_pair_f, - &unpacking_time_pair_f, fparti_fpartj_lparti_lpartj_forc, pair_end_f); - } - } - else{ -#endif //DO_CORNERS -// runner_dopair1_pack_f(r, sched, pack_vars_pair_forc, ci, -// cj, t, parts_aos_pair_forc, e, &packing_time_f); - packing_time_pair_f += runner_dopair1_pack_f4_f(r, sched, pack_vars_pair_forc, ci, - cj, t, parts_aos_pair_f4_f_send, e, fparti_fpartj_lparti_lpartj_forc); - /* No pack tasks left in queue, flag that we want to run */ - int launch_leftovers = pack_vars_pair_forc->launch_leftovers; - /*Packed enough tasks let's go*/ - int launch = pack_vars_pair_forc->launch; - /* Do we have enough stuff to run the GPU ? */ - if (launch || launch_leftovers) { - /*Launch GPU tasks*/ -// runner_dopair1_launch_f(r, sched, pack_vars_pair_forc, ci, t, parts_aos_pair_forc, -// d_parts_aos_pair_forc, stream, d_a, d_H, e, &packing_time_pair_f, &time_for_gpu_pair_f); - runner_dopair1_launch_f4_f_one_memcpy(r, sched, pack_vars_pair_forc, t, parts_aos_pair_f4_f_send, parts_aos_pair_f4_f_recv, - d_parts_aos_pair_f4_f_send, d_parts_aos_pair_f4_f_recv, stream_pairs, d_a, d_H, e, &packing_time_pair_f, &time_for_gpu_pair_f, - &unpacking_time_pair_f, fparti_fpartj_lparti_lpartj_forc, pair_end_f); - } /* End of GPU work Pairs */ -#ifdef DO_CORNERS - } -#endif //DO_CORNERS +//#ifdef DO_CORNERS +// struct timespec t0, t1, dt; +// clock_gettime(CLOCK_REALTIME, &t0); +// double shift[3] = {0.0}; +// t->corner_pair = 0; +// int sid = space_getsid_filter(e->s, &ci, &cj, shift); +// clock_gettime(CLOCK_REALTIME, &t1); +// packing_time_pair += +// (t1.tv_sec - t0.tv_sec) + +// (t1.tv_nsec - t0.tv_nsec) / +// 1000000000.0; +// if((sid == 0 || sid == 2 || sid == 6 || sid == 8) && step > 1){ +//// if((sid != 4 && sid != 10 && sid == 12) && step > 1){ +// runner_dopair1_branch_force(r, ci, cj); +// t->corner_pair = 1; +// int qid = r->qid; +// atomic_dec(&(sched->queues[qid].n_packs_pair_left_f)); +// /* Tell the cells they have been packed */ +// ci->pack_done++; +// cj->pack_done++; +// t->done = 1; +// int launch = 0, launch_leftovers = 0; +// if ((sched->queues[qid].n_packs_pair_left_f == 0)) +// launch_leftovers = 1; +// /* Tasks done. Release the lock ! 
*/ +// task_unlock(t); +// /*schedule my dependencies (Only unpacks really)*/ +// enqueue_dependencies(sched, t); +// /*Signal sleeping runners*/ +// signal_sleeping_runners(sched, t); +// clock_gettime(CLOCK_REALTIME, &t1); +// packing_time_pair_f += +// (t1.tv_sec - t0.tv_sec) + +// (t1.tv_nsec - t0.tv_nsec) / +// 1000000000.0; +// if (launch_leftovers) { +// pack_vars_pair_forc->launch_leftovers = 1; +// runner_dopair1_launch_f4_f_one_memcpy(r, sched, pack_vars_pair_forc, t, parts_aos_pair_f4_f_send, parts_aos_pair_f4_f_recv, +// d_parts_aos_pair_f4_f_send, d_parts_aos_pair_f4_f_recv, stream_pairs, d_a, d_H, e, &packing_time_pair_f, &time_for_gpu_pair_f, +// &unpacking_time_pair_f, fparti_fpartj_lparti_lpartj_forc, pair_end_f); +// } +// } +// else{ +//#endif //DO_CORNERS +//// runner_dopair1_pack_f(r, sched, pack_vars_pair_forc, ci, +//// cj, t, parts_aos_pair_forc, e, &packing_time_f); +// packing_time_pair_f += runner_dopair1_pack_f4_f(r, sched, pack_vars_pair_forc, ci, +// cj, t, parts_aos_pair_f4_f_send, e, fparti_fpartj_lparti_lpartj_forc); +// /* No pack tasks left in queue, flag that we want to run */ +// int launch_leftovers = pack_vars_pair_forc->launch_leftovers; +// /*Packed enough tasks let's go*/ +// int launch = pack_vars_pair_forc->launch; +// /* Do we have enough stuff to run the GPU ? */ +// if (launch || launch_leftovers) { +// /*Launch GPU tasks*/ +//// runner_dopair1_launch_f(r, sched, pack_vars_pair_forc, ci, t, parts_aos_pair_forc, +//// d_parts_aos_pair_forc, stream, d_a, d_H, e, &packing_time_pair_f, &time_for_gpu_pair_f); +// runner_dopair1_launch_f4_f_one_memcpy(r, sched, pack_vars_pair_forc, t, parts_aos_pair_f4_f_send, parts_aos_pair_f4_f_recv, +// d_parts_aos_pair_f4_f_send, d_parts_aos_pair_f4_f_recv, stream_pairs, d_a, d_H, e, &packing_time_pair_f, &time_for_gpu_pair_f, +// &unpacking_time_pair_f, fparti_fpartj_lparti_lpartj_forc, pair_end_f); +// } /* End of GPU work Pairs */ +//#ifdef DO_CORNERS +// } +//#endif //DO_CORNERS #endif //GPUFORCPAIR } else if (t->subtype == task_subtype_gpu_unpack) { @@ -1348,7 +1348,7 @@ void *runner_main2(void *data) { #ifdef EXTRA_HYDRO_LOOP else if (t->subtype == task_subtype_gradient){ int Do_nothing = 0; -#ifndef GPUOFFLOAD +//#ifndef GPUOFFLOAD struct timespec t0, t1, dt; clock_gettime(CLOCK_REALTIME, &t0); runner_dopair1_branch_gradient(r, ci, cj); @@ -1358,12 +1358,12 @@ void *runner_main2(void *data) { (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; -#endif //GPUGRADPAIR +//#endif //GPUGRADPAIR } #endif //EXTRA_HYDRO_LOOP else if (t->subtype == task_subtype_force){ int Do_nothing = 0; -#ifndef GPUOFFLOAD +//#ifndef GPUOFFLOAD struct timespec t0, t1, dt; clock_gettime(CLOCK_REALTIME, &t0); runner_dopair2_branch_force(r, ci, cj); @@ -1373,7 +1373,7 @@ void *runner_main2(void *data) { (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; -#endif //GPUFORCPAIR +//#endif //GPUFORCPAIR } else if (t->subtype == task_subtype_limiter) runner_dopair1_branch_limiter(r, ci, cj); @@ -1763,9 +1763,9 @@ void *runner_main2(void *data) { prev = t; #ifdef GPUOFFLOAD // if (t->type == task_type_self && t->subtype == task_subtype_gpu_pack){ - if (t->subtype == task_subtype_gpu_pack || - t->subtype == task_subtype_gpu_pack_g || - t->subtype == task_subtype_gpu_pack_f){ + if (t->type == task_type_self && t->subtype == task_subtype_gpu_pack){// || +// t->subtype == task_subtype_gpu_pack_g || +// t->subtype == task_subtype_gpu_pack_f){ /* Don't enqueue unpacks yet. 
Just signal the runners */ t->skip = 1; t = NULL; From d952965874a1633eca5fec9f35ba6d134d035f11 Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Fri, 1 Nov 2024 17:26:08 +0000 Subject: [PATCH 032/217] Checked engine_maketasks.c and things seem reasonable with nothing missing. Issue maybe in cell_unskip.c --- .../HydroTests/GreshoVortex_3D/gresho.yml | 1 - src/engine_maketasks.c | 152 ++++++++++-------- src/runner_doiact_functions_hydro_gpu.h | 8 +- 3 files changed, 93 insertions(+), 68 deletions(-) diff --git a/examples/HydroTests/GreshoVortex_3D/gresho.yml b/examples/HydroTests/GreshoVortex_3D/gresho.yml index 4325c9b9c3..d3e392817b 100644 --- a/examples/HydroTests/GreshoVortex_3D/gresho.yml +++ b/examples/HydroTests/GreshoVortex_3D/gresho.yml @@ -9,7 +9,6 @@ InternalUnitSystem: Scheduler: max_top_level_cells: 16 cell_split_size: 1000 - dependency_graph_cell: 10 dependency_graph_frequency: 1 # Parameters governing the time integration diff --git a/src/engine_maketasks.c b/src/engine_maketasks.c index bec401e284..41f3980936 100644 --- a/src/engine_maketasks.c +++ b/src/engine_maketasks.c @@ -2119,7 +2119,9 @@ void engine_count_and_link_tasks_mapper(void *map_data, int num_elements, for (int ind = 0; ind < num_elements; ind++) { struct task *t = &((struct task *)map_data)[ind]; - + if (t->ci == NULL) { //Possible fix missing when moving code over. Prevents unpack tasks continuing past here + break; + } struct cell *ci = t->ci; struct cell *cj = t->cj; const enum task_types t_type = t->type; @@ -2527,24 +2529,10 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, } /*Make packing depend on sorts and drift A. Nasar */ - else if (t_type == task_type_self && t_subtype == task_subtype_gpu_pack && ci->nodeID == nodeID) { + else if (t_type == task_type_self && t_subtype == task_subtype_gpu_pack) { scheduler_addunlock(sched, ci->hydro.super->hydro.drift, t); } - /*Make packing depend on sorts and drift A. Nasar */ - else if (t_type == task_type_pair && t_subtype == task_subtype_gpu_pack) { - scheduler_addunlock(sched, ci->hydro.super->hydro.sorts, t); - if (ci->hydro.super != cj->hydro.super){ - scheduler_addunlock(sched, cj->hydro.super->hydro.sorts, t); - } - if(ci->nodeID == nodeID){ - scheduler_addunlock(sched, ci->hydro.super->hydro.drift, t); - } - if((cj->nodeID == nodeID) && (ci->hydro.super != cj->hydro.super)){ - scheduler_addunlock(sched, cj->hydro.super->hydro.drift, t); - } - } - /* Sort tasks depend on the drift of the cell (stars version). */ else if (t_type == task_type_stars_sort && ci->nodeID == nodeID) { scheduler_addunlock(sched, ci->hydro.super->stars.drift, t); @@ -2564,7 +2552,7 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, /* Task for the second GPU hydro loop A. Nasar */ t_force_gpu = scheduler_addtask(sched, task_type_self, task_subtype_gpu_pack_f, 0, 0, ci, NULL); - engine_addlink(e, &ci->hydro.force_pack, t_force_gpu); + /* the task for the time-step limiter */ if (with_timestep_limiter) { t_limiter = scheduler_addtask(sched, task_type_self, @@ -2629,8 +2617,10 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, flags, 0, ci, NULL); } - /* Link the tasks to the cells */ + /* Link the tasks to the cells. Do the same for GPU tasks A. 
Nasar */ engine_addlink(e, &ci->hydro.force, t_force); + engine_addlink(e, &ci->hydro.force_pack, t_force_gpu); + if (with_timestep_limiter) { engine_addlink(e, &ci->hydro.limiter, t_limiter); } @@ -2664,20 +2654,19 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, /* Same work for the additional hydro loop */ t_gradient = scheduler_addtask(sched, task_type_self, task_subtype_gradient, flags, 0, ci, NULL); - /* Same work for the additional GPU hydro loop A. Nasar */ t_gradient_gpu = scheduler_addtask(sched, task_type_self, task_subtype_gpu_pack_g, 0, 0, ci, NULL); - /* Add the link between the new loops and the cell */ + /* Add the link between the new loops and the cell. Same for GPU task A. Nasar */ engine_addlink(e, &ci->hydro.gradient, t_gradient); - engine_addlink(e, &ci->hydro.gradient_pack, t_gradient_gpu); /* Now, build all the dependencies for the hydro */ engine_make_hydro_loops_dependencies(sched, t, t_gradient, t_force, t_limiter, ci, with_cooling, with_timestep_limiter); + // A. Nasar add unlocks for pack tasks here. Unpacks depend on packs and will be used to create downstream deps later scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, t_gradient_gpu); scheduler_addunlock(sched, ci->hydro.super->hydro.extra_ghost, t_force_gpu); @@ -2686,7 +2675,7 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, /* Now, build all the dependencies for the hydro */ engine_make_hydro_loops_dependencies(sched, t, t_force, t_limiter, ci, with_cooling, with_timestep_limiter); - scheduler_addunlock(sched, ci->hydro.super->hydro.extra_ghost, t_force_gpu); + scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, t_force_gpu); #endif /* Create the task dependencies */ @@ -2819,6 +2808,22 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, } } + /*Make packing depend on sorts and drift A. Nasar */ + else if (t_type == task_type_pair && t_subtype == task_subtype_gpu_pack) { + /* Make all density tasks depend on the drift */ + if (ci->nodeID == nodeID) { + scheduler_addunlock(sched, ci->hydro.super->hydro.drift, t); + } + if ((cj->nodeID == nodeID) && (ci->hydro.super != cj->hydro.super)) { + scheduler_addunlock(sched, cj->hydro.super->hydro.drift, t); + } + /* Make all density tasks depend on the sorts */ + scheduler_addunlock(sched, ci->hydro.super->hydro.sorts, t); + if (ci->hydro.super != cj->hydro.super) { + scheduler_addunlock(sched, cj->hydro.super->hydro.sorts, t); + } + } + /* Otherwise, pair interaction? */ else if (t_type == task_type_pair && t_subtype == task_subtype_density) { @@ -2845,9 +2850,6 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, /* New task for the force A. Nasar */ t_force_gpu = scheduler_addtask(sched, task_type_pair, task_subtype_gpu_pack_f, 0, 0, ci, cj); -// /* Add the link between the new loop and both cells */ - engine_addlink(e, &ci->hydro.force_pack, t_force_gpu); - engine_addlink(e, &cj->hydro.force_pack, t_force_gpu); #ifdef MPI_SYMMETRIC_FORCE_INTERACTION /* The order of operations for an inactive local cell interacting @@ -2950,6 +2952,9 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, engine_addlink(e, &ci->hydro.force, t_force); engine_addlink(e, &cj->hydro.force, t_force); + /* Do teh same for GPU tasks A. 
Nasar*/ + engine_addlink(e, &ci->hydro.force_pack, t_force_gpu); + engine_addlink(e, &cj->hydro.force_pack, t_force_gpu); if (with_timestep_limiter) { engine_addlink(e, &ci->hydro.limiter, t_limiter); engine_addlink(e, &cj->hydro.limiter, t_limiter); @@ -3039,14 +3044,14 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, with_cooling, with_timestep_limiter); // GPU tasks A. Nasar - scheduler_addunlock(sched, ci->hydro.super->hydro.extra_ghost, t_force_gpu); + scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, t_force_gpu); } if ((cj->nodeID == nodeID) && (ci->hydro.super != cj->hydro.super)) { engine_make_hydro_loops_dependencies(sched, t, t_force, t_limiter, cj, with_cooling, with_timestep_limiter); // GPU tasks A. Nasar - scheduler_addunlock(sched, cj->hydro.super->hydro.extra_ghost, t_force_gpu); + scheduler_addunlock(sched, cj->hydro.super->hydro.ghost_out, t_force_gpu); } #endif @@ -4264,9 +4269,9 @@ void engine_make_hydroloop_tasks_mapper(void *map_data, int num_elements, if (ci->nodeID == nodeID) { scheduler_addtask(sched, task_type_self, task_subtype_density, 0, 0, ci, NULL); - struct task *t_pack = scheduler_addtask( - sched, task_type_self, task_subtype_gpu_pack, 0, 0, ci, NULL); // A. Nasar also add a pack task for GPU - t_pack_self = t_pack; + // A. Nasar also add a pack task for GPU + scheduler_addtask(sched, task_type_self, task_subtype_gpu_pack, 0, 0, ci, + NULL); } /* Now loop over all the neighbours of this cell */ @@ -4300,8 +4305,8 @@ void engine_make_hydroloop_tasks_mapper(void *map_data, int num_elements, const int sid = sortlistID[(kk + 1) + 3 * ((jj + 1) + 3 * (ii + 1))]; scheduler_addtask(sched, task_type_pair, task_subtype_density, sid, 0, ci, cj); - struct task *t_pack = scheduler_addtask( - sched, task_type_pair, task_subtype_gpu_pack, sid, 0, ci, cj); // A. Nasar + scheduler_addtask(sched, task_type_pair, task_subtype_gpu_pack, sid, 0, + ci, cj); // A. 
Nasar #ifdef SWIFT_DEBUG_CHECKS #ifdef WITH_MPI @@ -4856,7 +4861,7 @@ void engine_maketasks(struct engine *e) { /* pack -> unpack -> ghost_in */ scheduler_addunlock(sched, t, last_created_self_unpack); scheduler_addunlock(sched, last_created_self_unpack, - t->ci->hydro.super->hydro.ghost_in); + t->ci->hydro.super->hydro.ghost_in); //Keep self_unpack dependency here, pairs added later using links /*Creating links between each cell and its unpack task*/ engine_addlink(e, &t->ci->hydro.density_unpack, last_created_self_unpack); t->ci->hydro.d_unpack = last_created_self_unpack; @@ -4876,6 +4881,12 @@ void engine_maketasks(struct engine *e) { fprintf(stderr, "Ghost in for cell j is NULL\n"); scheduler_addunlock(sched, t, last_created_pair_unpack); + if(t->ci->nodeID == e->nodeID) + scheduler_addunlock(sched, last_created_pair_unpack, + t->ci->hydro.super->hydro.ghost_in); + if((t->cj->nodeID == e->nodeID) && (t->ci->hydro.super != t->cj->hydro.super)) + scheduler_addunlock(sched, last_created_pair_unpack, + t->cj->hydro.super->hydro.ghost_in); engine_addlink(e, &t->ci->hydro.density_unpack, last_created_pair_unpack); engine_addlink(e, &t->cj->hydro.density_unpack, last_created_pair_unpack); @@ -4900,16 +4911,17 @@ void engine_maketasks(struct engine *e) { error("We did not find the correct number of pair pack tasks!!"); #endif /* Loop over all the currently existing ghost_in tasks to add unpack dependency*/ - for (int i = 0; i < sched->nr_tasks; i++) { - struct task *t = &sched->tasks[i]; - if (t->type != task_type_ghost_in) - continue; -// if(t->ci->hydro.super == t->ci && t->ci->nodeID == e->nodeID) -// scheduler_addunlock(sched, t->ci->hydro.d_unpack, t); - for (struct link *l = t->ci->hydro.density_unpack; l != NULL; l = l->next) { - if(l->t->type == task_type_pair)scheduler_addunlock(sched, l->t, t); - } - } +// for (int i = 0; i < sched->nr_tasks; i++) { +// struct task *t = &sched->tasks[i]; +// if (t->type != task_type_ghost_in) +// continue; +//// if(t->ci->hydro.super == t->ci && t->ci->nodeID == e->nodeID) +//// scheduler_addunlock(sched, t->ci->hydro.d_unpack, t); +// for (struct link *l = t->ci->hydro.density_unpack; l != NULL; l = l->next) { +//// if(l->t->type == task_type_pair) +// scheduler_addunlock(sched, l->t, t); +// } +// } /* Run through the tasks and make force tasks for each density task. Each force task depends on the cell ghosts and unlocks the kick task of its super-cell. */ @@ -4966,6 +4978,12 @@ void engine_maketasks(struct engine *e) { /* pack -> unpack -> ghost_in */ scheduler_addunlock(sched, t, last_created_pair_unpack); + if(t->ci->nodeID == e->nodeID) + scheduler_addunlock(sched, last_created_pair_unpack, + t->ci->hydro.super->hydro.extra_ghost); + if((t->cj->nodeID == e->nodeID) && (t->ci->hydro.super != t->cj->hydro.super)) + scheduler_addunlock(sched, last_created_pair_unpack, + t->cj->hydro.super->hydro.extra_ghost); engine_addlink(e, &t->ci->hydro.gradient_unpack, last_created_pair_unpack); engine_addlink(e, &t->cj->hydro.gradient_unpack, last_created_pair_unpack); @@ -4987,16 +5005,17 @@ void engine_maketasks(struct engine *e) { error("We did not find the correct number of G pair pack tasks!! 
count %i what it shoudl be %i", count_current_pair, sched->nr_pair_pack_tasks_g); #endif /* Loop over all the currently existing extra_ghost tasks to add unpack dependency*/ - for (int i = 0; i < sched->nr_tasks; i++) { - struct task *t = &sched->tasks[i]; - if (t->type != task_type_extra_ghost) - continue; -// if(t->ci->nodeID == e->nodeID) -// scheduler_addunlock(sched, t->ci->hydro.g_unpack, t); - for (struct link *l = t->ci->hydro.gradient_unpack; l != NULL; l = l->next) { - if(l->t->type == task_type_pair)scheduler_addunlock(sched, l->t, t); - } - } +// for (int i = 0; i < sched->nr_tasks; i++) { +// struct task *t = &sched->tasks[i]; +// if (t->type != task_type_extra_ghost) +// continue; +//// if(t->ci->nodeID == e->nodeID) +//// scheduler_addunlock(sched, t->ci->hydro.g_unpack, t); +// for (struct link *l = t->ci->hydro.gradient_unpack; l != NULL; l = l->next) { +//// if(l->t->type == task_type_pair) +// scheduler_addunlock(sched, l->t, t); +// } +// } /*Now create unpacks for all gpu_pack_f (force) tasks*/ count_current_self = 0; count_current_pair = 0; @@ -5038,6 +5057,12 @@ void engine_maketasks(struct engine *e) { /* pack -> unpack -> ghost_in */ scheduler_addunlock(sched, t, last_created_pair_unpack); + if(t->ci->nodeID == e->nodeID) + scheduler_addunlock(sched, last_created_pair_unpack, + t->ci->hydro.super->hydro.end_force); + if((t->cj->nodeID == e->nodeID) && (t->ci->hydro.super != t->cj->hydro.super)) + scheduler_addunlock(sched, last_created_pair_unpack, + t->cj->hydro.super->hydro.end_force); engine_addlink(e, &t->ci->hydro.force_unpack, last_created_pair_unpack); engine_addlink(e, &t->cj->hydro.force_unpack, last_created_pair_unpack); @@ -5059,16 +5084,17 @@ void engine_maketasks(struct engine *e) { error("We did not find the correct number of F pair pack tasks!!"); #endif /* Loop over all the currently existing end_force tasks to add unpack dependency*/ - for (int i = 0; i < sched->nr_tasks; i++) { - struct task *t = &sched->tasks[i]; - if (t->type != task_type_end_hydro_force) - continue; -// if(t->ci->nodeID == e->nodeID) -// scheduler_addunlock(sched, t->ci->hydro.f_unpack, t); - for (struct link *l = t->ci->hydro.force_unpack; l != NULL; l = l->next) { - if(l->t->type == task_type_pair)scheduler_addunlock(sched, l->t, t); - } - } +// for (int i = 0; i < sched->nr_tasks; i++) { +// struct task *t = &sched->tasks[i]; +// if (t->type != task_type_end_hydro_force) +// continue; +//// if(t->ci->nodeID == e->nodeID) +//// scheduler_addunlock(sched, t->ci->hydro.f_unpack, t); +// for (struct link *l = t->ci->hydro.force_unpack; l != NULL; l = l->next) { +//// if(l->t->type == task_type_pair) +// scheduler_addunlock(sched, l->t, t); +// } +// } if (e->verbose) message("Making extra hydroloop tasks took %.3f %s.", clocks_from_ticks(getticks() - tic2), clocks_getunit()); diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index b090637308..632dc08597 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -1337,14 +1337,14 @@ void runner_doself1_launch_f4(struct runner *r, struct scheduler *s, struct pack *unpack_time += (tp1.tv_sec - tp0.tv_sec) + (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; - scheduler_done(s, tii); +// scheduler_done(s, tii); /* Release the lock */ -// cell_unlocktree(cii); + cell_unlocktree(cii); /*schedule my dependencies (Only unpacks really)*/ -// enqueue_dependencies(s, tii); + enqueue_dependencies(s, tii); /*Signal sleeping runners*/ -// 
signal_sleeping_runners(s, tii); + signal_sleeping_runners(s, tii); tii->gpu_done = 1; From 13912cecf754db4c24a8552c3c78bc21f4c4fa03 Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Fri, 1 Nov 2024 17:49:29 +0000 Subject: [PATCH 033/217] Fixed a bug in scheduler_enqueue() where I had a break before where it should've been. Replaced with if statement --- .../HydroTests/GreshoVortex_3D/gresho.yml | 1 - src/scheduler.c | 27 ++++++++++++------- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/examples/HydroTests/GreshoVortex_3D/gresho.yml b/examples/HydroTests/GreshoVortex_3D/gresho.yml index d3e392817b..a20cbf38c0 100644 --- a/examples/HydroTests/GreshoVortex_3D/gresho.yml +++ b/examples/HydroTests/GreshoVortex_3D/gresho.yml @@ -9,7 +9,6 @@ InternalUnitSystem: Scheduler: max_top_level_cells: 16 cell_split_size: 1000 - dependency_graph_frequency: 1 # Parameters governing the time integration TimeIntegration: diff --git a/src/scheduler.c b/src/scheduler.c index d296f44687..8d2a8e7380 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -2580,14 +2580,17 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) { owner = &t->ci->grav.super->owner; } else if (t->subtype == task_subtype_gpu_pack) { // A. Nasar qid = t->ci->hydro.super->owner; + owner = &t->ci->hydro.super->owner; // fprintf(stderr,"nqueues %i waiting %i active_count %i\n", // s->nr_queues, s->waiting, s->active_count); // if(qid==-1)fprintf(stderr,"queue id is negative\n"); // else fprintf(stderr,"queue id is %i\n", qid); } else if (t->subtype == task_subtype_gpu_pack_f) { qid = t->ci->hydro.super->owner; + owner = &t->ci->hydro.super->owner; } else if (t->subtype == task_subtype_gpu_pack_g) { qid = t->ci->hydro.super->owner; + owner = &t->ci->hydro.super->owner; } else if (t->subtype == task_subtype_gpu_unpack) { //// qid = t->ci->owner; qid = -1; @@ -2622,17 +2625,21 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) { case task_type_pair: case task_type_sub_pair: if(t->subtype == task_subtype_gpu_unpack || - t->subtype == task_subtype_gpu_unpack_f || - t->subtype == task_subtype_gpu_unpack_g) qid = -1; - break; - qid = t->ci->super->owner; - owner = &t->ci->super->owner; - if ((qid < 0) || - ((t->cj->super->owner > -1) && - (s->queues[qid].count > s->queues[t->cj->super->owner].count))) { - qid = t->cj->super->owner; - owner = &t->cj->super->owner; + t->subtype == task_subtype_gpu_unpack_f || + t->subtype == task_subtype_gpu_unpack_g){ + qid = -1; + } + else{ + qid = t->ci->super->owner; + owner = &t->ci->super->owner; + if ((qid < 0) || + ((t->cj->super->owner > -1) && + (s->queues[qid].count > s->queues[t->cj->super->owner].count))) { + qid = t->cj->super->owner; + owner = &t->cj->super->owner; + } } + break; case task_type_recv: #ifdef WITH_MPI { From 3632ba20fb66e87c6cf3c40e04a5e31bf040e591 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Mon, 4 Nov 2024 12:29:30 +0000 Subject: [PATCH 034/217] Removed all GPU tasks aside from density self pack tasks. 
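Note on PATCH 033 above: the bug is a switch/case control-flow slip. In the pair/sub_pair case an unconditional break sat directly after the gpu_unpack check, so the owner-based queue selection that followed it was dead code and every pair task lost its queue affinity, not just the unpack subtypes. The fix keeps the unpack short-circuit, moves the fallback into an else, and places the break at the end of the case. A minimal standalone sketch of the fixed logic (is_gpu_unpack, owner_i, owner_j and queue_count are illustrative stand-ins for the real task and scheduler fields, not SWIFT's API):

  /* Sketch only: queue selection for a pair task after the PATCH 033 fix. */
  static int pick_pair_queue(int is_gpu_unpack, int owner_i, int owner_j,
                             const int *queue_count) {
    int qid;
    if (is_gpu_unpack) {
      /* GPU unpack tasks have no preferred queue. */
      qid = -1;
    } else {
      /* Otherwise prefer the less loaded of the two owning queues. */
      qid = owner_i;
      if (qid < 0 ||
          (owner_j > -1 && queue_count[qid] > queue_count[owner_j]))
        qid = owner_j;
    }
    /* In the buggy version the whole else branch was unreachable because of
     * the early, unconditional break. */
    return qid;
  }
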
COde still hangs --- src/cuda/part_gpu.h | 2 +- src/engine_maketasks.c | 164 ++++++++++++++++++++--------------------- 2 files changed, 83 insertions(+), 83 deletions(-) diff --git a/src/cuda/part_gpu.h b/src/cuda/part_gpu.h index 1af9029416..47e5da4f29 100755 --- a/src/cuda/part_gpu.h +++ b/src/cuda/part_gpu.h @@ -9,7 +9,7 @@ typedef int8_t timebin_t; extern "C" { #endif -#include "/usr/local/cuda-12.3/targets/x86_64-linux/include/vector_types.h" +#include "/usr/local/cuda-12.2/targets/x86_64-linux/include/vector_types.h" typedef struct part_soa { /*Task ID*/ diff --git a/src/engine_maketasks.c b/src/engine_maketasks.c index 41f3980936..7439b83dfe 100644 --- a/src/engine_maketasks.c +++ b/src/engine_maketasks.c @@ -2550,8 +2550,8 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, t_force = scheduler_addtask(sched, task_type_self, task_subtype_force, flags, 0, ci, NULL); /* Task for the second GPU hydro loop A. Nasar */ - t_force_gpu = scheduler_addtask(sched, task_type_self, task_subtype_gpu_pack_f, - 0, 0, ci, NULL); +// t_force_gpu = scheduler_addtask(sched, task_type_self, task_subtype_gpu_pack_f, +// 0, 0, ci, NULL); /* the task for the time-step limiter */ if (with_timestep_limiter) { @@ -2619,7 +2619,7 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, /* Link the tasks to the cells. Do the same for GPU tasks A. Nasar */ engine_addlink(e, &ci->hydro.force, t_force); - engine_addlink(e, &ci->hydro.force_pack, t_force_gpu); +// engine_addlink(e, &ci->hydro.force_pack, t_force_gpu); if (with_timestep_limiter) { engine_addlink(e, &ci->hydro.limiter, t_limiter); @@ -2655,12 +2655,12 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, t_gradient = scheduler_addtask(sched, task_type_self, task_subtype_gradient, flags, 0, ci, NULL); /* Same work for the additional GPU hydro loop A. Nasar */ - t_gradient_gpu = scheduler_addtask(sched, task_type_self, - task_subtype_gpu_pack_g, 0, 0, ci, NULL); +// t_gradient_gpu = scheduler_addtask(sched, task_type_self, +// task_subtype_gpu_pack_g, 0, 0, ci, NULL); /* Add the link between the new loops and the cell. Same for GPU task A. Nasar */ engine_addlink(e, &ci->hydro.gradient, t_gradient); - engine_addlink(e, &ci->hydro.gradient_pack, t_gradient_gpu); +// engine_addlink(e, &ci->hydro.gradient_pack, t_gradient_gpu); /* Now, build all the dependencies for the hydro */ engine_make_hydro_loops_dependencies(sched, t, t_gradient, t_force, @@ -2668,14 +2668,14 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, with_timestep_limiter); // A. Nasar add unlocks for pack tasks here. 
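Note on the part_gpu.h hunk above: the change only swaps one hard-coded CUDA toolkit path (12.3) for another (12.2), so the header stays tied to a specific installation. vector_types.h is presumably needed for the float4-style vector types used by the packed particle structs, so a more portable sketch, assuming the CUDA include directory is already on the compiler's search path (nvcc finds its own headers implicitly; a host compiler would need the matching -I flag from the build system), would be:

  /* Sketch: let the toolchain locate the CUDA headers rather than pinning an
   * absolute, version-specific path. */
  #include <vector_types.h>   /* float4, float3, int4, ... */
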
Unpacks depend on packs and will be used to create downstream deps later - scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, t_gradient_gpu); - scheduler_addunlock(sched, ci->hydro.super->hydro.extra_ghost, t_force_gpu); +// scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, t_gradient_gpu); +// scheduler_addunlock(sched, ci->hydro.super->hydro.extra_ghost, t_force_gpu); #else /* Now, build all the dependencies for the hydro */ engine_make_hydro_loops_dependencies(sched, t, t_force, t_limiter, ci, with_cooling, with_timestep_limiter); - scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, t_force_gpu); +// scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, t_force_gpu); #endif /* Create the task dependencies */ @@ -2848,8 +2848,8 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, t_force = scheduler_addtask(sched, task_type_pair, task_subtype_force, flags, 0, ci, cj); /* New task for the force A. Nasar */ - t_force_gpu = scheduler_addtask(sched, task_type_pair, task_subtype_gpu_pack_f, - 0, 0, ci, cj); +// t_force_gpu = scheduler_addtask(sched, task_type_pair, task_subtype_gpu_pack_f, +// 0, 0, ci, cj); #ifdef MPI_SYMMETRIC_FORCE_INTERACTION /* The order of operations for an inactive local cell interacting @@ -2860,10 +2860,10 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, /* Make all force tasks depend on the sorts */ scheduler_addunlock(sched, ci->hydro.super->hydro.sorts, t_force); /* Make GPU force tasks depend on the sorts A. Nasar */ - scheduler_addunlock(sched, ci->hydro.super->hydro.sorts, t_force_gpu); +// scheduler_addunlock(sched, ci->hydro.super->hydro.sorts, t_force_gpu); if (ci->hydro.super != cj->hydro.super) { scheduler_addunlock(sched, cj->hydro.super->hydro.sorts, t_force); - scheduler_addunlock(sched, cj->hydro.super->hydro.sorts, t_force_gpu); +// scheduler_addunlock(sched, cj->hydro.super->hydro.sorts, t_force_gpu); } #endif @@ -2953,8 +2953,8 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, engine_addlink(e, &ci->hydro.force, t_force); engine_addlink(e, &cj->hydro.force, t_force); /* Do teh same for GPU tasks A. Nasar*/ - engine_addlink(e, &ci->hydro.force_pack, t_force_gpu); - engine_addlink(e, &cj->hydro.force_pack, t_force_gpu); +// engine_addlink(e, &ci->hydro.force_pack, t_force_gpu); +// engine_addlink(e, &cj->hydro.force_pack, t_force_gpu); if (with_timestep_limiter) { engine_addlink(e, &ci->hydro.limiter, t_limiter); engine_addlink(e, &cj->hydro.limiter, t_limiter); @@ -3007,15 +3007,15 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, t_gradient = scheduler_addtask(sched, task_type_pair, task_subtype_gradient, flags, 0, ci, cj); /* Start by constructing the task for the second and third GPU hydro loop A. 
Nasar */ - t_gradient_gpu = scheduler_addtask(sched, task_type_pair, - task_subtype_gpu_pack_g, 0, 0, ci, cj); +// t_gradient_gpu = scheduler_addtask(sched, task_type_pair, +// task_subtype_gpu_pack_g, 0, 0, ci, cj); /* Add the link between the new loop and both cells */ engine_addlink(e, &ci->hydro.gradient, t_gradient); engine_addlink(e, &cj->hydro.gradient, t_gradient); // /* Add the link between the new loop and both cells */ - engine_addlink(e, &ci->hydro.gradient_pack, t_gradient_gpu); - engine_addlink(e, &cj->hydro.gradient_pack, t_gradient_gpu); +// engine_addlink(e, &ci->hydro.gradient_pack, t_gradient_gpu); +// engine_addlink(e, &cj->hydro.gradient_pack, t_gradient_gpu); /* Now, build all the dependencies for the hydro for the cells */ /* that are local and are not descendant of the same super_hydro-cells */ @@ -3024,16 +3024,16 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, t_limiter, ci, with_cooling, with_timestep_limiter); /*Same for GPU tasks*/ - scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, t_gradient_gpu); - scheduler_addunlock(sched, ci->hydro.super->hydro.extra_ghost, t_force_gpu); +// scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, t_gradient_gpu); +// scheduler_addunlock(sched, ci->hydro.super->hydro.extra_ghost, t_force_gpu); } if ((cj->nodeID == nodeID) && (ci->hydro.super != cj->hydro.super)) { engine_make_hydro_loops_dependencies(sched, t, t_gradient, t_force, t_limiter, cj, with_cooling, with_timestep_limiter); /*Same for GPU tasks*/ - scheduler_addunlock(sched, cj->hydro.super->hydro.ghost_out, t_gradient_gpu); - scheduler_addunlock(sched, cj->hydro.super->hydro.extra_ghost, t_force_gpu); +// scheduler_addunlock(sched, cj->hydro.super->hydro.ghost_out, t_gradient_gpu); +// scheduler_addunlock(sched, cj->hydro.super->hydro.extra_ghost, t_force_gpu); } #else @@ -3044,14 +3044,14 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, with_cooling, with_timestep_limiter); // GPU tasks A. Nasar - scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, t_force_gpu); +// scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, t_force_gpu); } if ((cj->nodeID == nodeID) && (ci->hydro.super != cj->hydro.super)) { engine_make_hydro_loops_dependencies(sched, t, t_force, t_limiter, cj, with_cooling, with_timestep_limiter); // GPU tasks A. Nasar - scheduler_addunlock(sched, cj->hydro.super->hydro.ghost_out, t_force_gpu); +// scheduler_addunlock(sched, cj->hydro.super->hydro.ghost_out, t_force_gpu); } #endif @@ -4305,8 +4305,8 @@ void engine_make_hydroloop_tasks_mapper(void *map_data, int num_elements, const int sid = sortlistID[(kk + 1) + 3 * ((jj + 1) + 3 * (ii + 1))]; scheduler_addtask(sched, task_type_pair, task_subtype_density, sid, 0, ci, cj); - scheduler_addtask(sched, task_type_pair, task_subtype_gpu_pack, sid, 0, - ci, cj); // A. Nasar +// scheduler_addtask(sched, task_type_pair, task_subtype_gpu_pack, sid, 0, +// ci, cj); // A. 
Nasar #ifdef SWIFT_DEBUG_CHECKS #ifdef WITH_MPI @@ -4870,8 +4870,8 @@ void engine_maketasks(struct engine *e) { else if (t->type == task_type_pair) { if (count_current_pair % pack_size == 0) { - last_created_pair_unpack = scheduler_addtask( - sched, task_type_pair, task_subtype_gpu_unpack, 0, 0, NULL, NULL); +// last_created_pair_unpack = scheduler_addtask( +// sched, task_type_pair, task_subtype_gpu_unpack, 0, 0, NULL, NULL); } /* pack -> unpack -> ghost_in */ @@ -4880,16 +4880,16 @@ void engine_maketasks(struct engine *e) { if(t->cj->hydro.ghost_in == NULL) fprintf(stderr, "Ghost in for cell j is NULL\n"); - scheduler_addunlock(sched, t, last_created_pair_unpack); - if(t->ci->nodeID == e->nodeID) - scheduler_addunlock(sched, last_created_pair_unpack, - t->ci->hydro.super->hydro.ghost_in); - if((t->cj->nodeID == e->nodeID) && (t->ci->hydro.super != t->cj->hydro.super)) - scheduler_addunlock(sched, last_created_pair_unpack, - t->cj->hydro.super->hydro.ghost_in); - - engine_addlink(e, &t->ci->hydro.density_unpack, last_created_pair_unpack); - engine_addlink(e, &t->cj->hydro.density_unpack, last_created_pair_unpack); +// scheduler_addunlock(sched, t, last_created_pair_unpack); +// if(t->ci->nodeID == e->nodeID) +// scheduler_addunlock(sched, last_created_pair_unpack, +// t->ci->hydro.super->hydro.ghost_in); +// if((t->cj->nodeID == e->nodeID) && (t->ci->hydro.super != t->cj->hydro.super)) +// scheduler_addunlock(sched, last_created_pair_unpack, +// t->cj->hydro.super->hydro.ghost_in); +// +// engine_addlink(e, &t->ci->hydro.density_unpack, last_created_pair_unpack); +// engine_addlink(e, &t->cj->hydro.density_unpack, last_created_pair_unpack); /*Useless as this ends up only setting one pair unpack as the unpack task for this cell whilst the cell * interacts with many other cells and can be linked to another unpack task. 
Rely on links instead*/ @@ -4954,39 +4954,39 @@ void engine_maketasks(struct engine *e) { if (t->type == task_type_self) { if (count_current_self % pack_size == 0) { - last_created_self_unpack = scheduler_addtask( - sched, task_type_self, task_subtype_gpu_unpack_g, 0, 0, NULL, NULL); - last_created_self_unpack->gpu_done = 0; +// last_created_self_unpack = scheduler_addtask( +// sched, task_type_self, task_subtype_gpu_unpack_g, 0, 0, NULL, NULL); +// last_created_self_unpack->gpu_done = 0; } /* pack -> unpack -> ghost_in */ - scheduler_addunlock(sched, t, last_created_self_unpack); - scheduler_addunlock(sched, last_created_self_unpack, - t->ci->hydro.super->hydro.extra_ghost); - /*Creating links between a each cell and its unpack task*/ - engine_addlink(e, &t->ci->hydro.gradient_unpack, last_created_self_unpack); - t->ci->hydro.g_unpack = last_created_self_unpack; +// scheduler_addunlock(sched, t, last_created_self_unpack); +// scheduler_addunlock(sched, last_created_self_unpack, +// t->ci->hydro.super->hydro.extra_ghost); +// /*Creating links between a each cell and its unpack task*/ +// engine_addlink(e, &t->ci->hydro.gradient_unpack, last_created_self_unpack); +// t->ci->hydro.g_unpack = last_created_self_unpack; ++count_current_self; } else if (t->type == task_type_pair) { if (count_current_pair % pack_size == 0) { - last_created_pair_unpack = scheduler_addtask( - sched, task_type_pair, task_subtype_gpu_unpack_g, 0, 0, NULL, NULL); +// last_created_pair_unpack = scheduler_addtask( +// sched, task_type_pair, task_subtype_gpu_unpack_g, 0, 0, NULL, NULL); } /* pack -> unpack -> ghost_in */ - scheduler_addunlock(sched, t, last_created_pair_unpack); - if(t->ci->nodeID == e->nodeID) - scheduler_addunlock(sched, last_created_pair_unpack, - t->ci->hydro.super->hydro.extra_ghost); - if((t->cj->nodeID == e->nodeID) && (t->ci->hydro.super != t->cj->hydro.super)) - scheduler_addunlock(sched, last_created_pair_unpack, - t->cj->hydro.super->hydro.extra_ghost); - - engine_addlink(e, &t->ci->hydro.gradient_unpack, last_created_pair_unpack); - engine_addlink(e, &t->cj->hydro.gradient_unpack, last_created_pair_unpack); +// scheduler_addunlock(sched, t, last_created_pair_unpack); +// if(t->ci->nodeID == e->nodeID) +// scheduler_addunlock(sched, last_created_pair_unpack, +// t->ci->hydro.super->hydro.extra_ghost); +// if((t->cj->nodeID == e->nodeID) && (t->ci->hydro.super != t->cj->hydro.super)) +// scheduler_addunlock(sched, last_created_pair_unpack, +// t->cj->hydro.super->hydro.extra_ghost); +// +// engine_addlink(e, &t->ci->hydro.gradient_unpack, last_created_pair_unpack); +// engine_addlink(e, &t->cj->hydro.gradient_unpack, last_created_pair_unpack); /*Useless as this ends up only setting one pair unpack as the unpack task for this cell whilst the cell * interacts with many other cells and can be linked to another unpack task. 
Rely on links instead*/ // t->ci->hydro.g_unpack = last_created_pair_unpack; @@ -5032,40 +5032,40 @@ void engine_maketasks(struct engine *e) { if (t->type == task_type_self) { if (count_current_self % pack_size == 0) { - last_created_self_unpack = scheduler_addtask( - sched, task_type_self, task_subtype_gpu_unpack_f, 0, 0, NULL, NULL); - last_created_self_unpack->gpu_done = 0; +// last_created_self_unpack = scheduler_addtask( +// sched, task_type_self, task_subtype_gpu_unpack_f, 0, 0, NULL, NULL); +// last_created_self_unpack->gpu_done = 0; } /* pack -> unpack -> ghost_in */ - scheduler_addunlock(sched, t, last_created_self_unpack); - scheduler_addunlock(sched, last_created_self_unpack, - t->ci->hydro.super->hydro.end_force); - /*Creating links between a each cell and its unpack task*/ - engine_addlink(e, &t->ci->hydro.force_unpack, last_created_self_unpack); - - t->ci->hydro.f_unpack = last_created_self_unpack; +// scheduler_addunlock(sched, t, last_created_self_unpack); +// scheduler_addunlock(sched, last_created_self_unpack, +// t->ci->hydro.super->hydro.end_force); +// /*Creating links between a each cell and its unpack task*/ +// engine_addlink(e, &t->ci->hydro.force_unpack, last_created_self_unpack); +// +// t->ci->hydro.f_unpack = last_created_self_unpack; ++count_current_self; } else if (t->type == task_type_pair) { if (count_current_pair % pack_size == 0) { - last_created_pair_unpack = scheduler_addtask( - sched, task_type_pair, task_subtype_gpu_unpack_f, 0, 0, NULL, NULL); +// last_created_pair_unpack = scheduler_addtask( +// sched, task_type_pair, task_subtype_gpu_unpack_f, 0, 0, NULL, NULL); } /* pack -> unpack -> ghost_in */ - scheduler_addunlock(sched, t, last_created_pair_unpack); - if(t->ci->nodeID == e->nodeID) - scheduler_addunlock(sched, last_created_pair_unpack, - t->ci->hydro.super->hydro.end_force); - if((t->cj->nodeID == e->nodeID) && (t->ci->hydro.super != t->cj->hydro.super)) - scheduler_addunlock(sched, last_created_pair_unpack, - t->cj->hydro.super->hydro.end_force); - - engine_addlink(e, &t->ci->hydro.force_unpack, last_created_pair_unpack); - engine_addlink(e, &t->cj->hydro.force_unpack, last_created_pair_unpack); +// scheduler_addunlock(sched, t, last_created_pair_unpack); +// if(t->ci->nodeID == e->nodeID) +// scheduler_addunlock(sched, last_created_pair_unpack, +// t->ci->hydro.super->hydro.end_force); +// if((t->cj->nodeID == e->nodeID) && (t->ci->hydro.super != t->cj->hydro.super)) +// scheduler_addunlock(sched, last_created_pair_unpack, +// t->cj->hydro.super->hydro.end_force); +// +// engine_addlink(e, &t->ci->hydro.force_unpack, last_created_pair_unpack); +// engine_addlink(e, &t->cj->hydro.force_unpack, last_created_pair_unpack); /*Useless as this ends up only setting one pair unpack as the unpack task for this cell whilst the cell * interacts with many other cells and can be linked to another unpack task. 
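Note on the recurring "rely on links instead" comment above: a cell takes part in many pair interactions, so a single per-cell pointer such as hydro.d_unpack / g_unpack / f_unpack can only remember one of the unpack tasks the cell feeds, while the engine_addlink() calls accumulate every unpack in a linked list hanging off the cell, which later code can walk to add dependencies. A minimal sketch of that pattern (struct link and add_link here are simplified stand-ins, not SWIFT's actual definitions):

  #include <stdlib.h>

  struct task;                       /* opaque in this sketch */
  struct link {
    struct task *t;
    struct link *next;
  };

  /* Conceptually what engine_addlink() does: prepend a task to the cell's
   * list so that *all* unpack tasks touching the cell stay reachable. */
  static void add_link(struct link **head, struct task *t) {
    struct link *l = malloc(sizeof(*l));
    if (l == NULL) return;           /* sketch: the real code would abort */
    l->t = t;
    l->next = *head;
    *head = l;
  }

  /* Downstream dependencies can then be added by walking the list, e.g.
   *   for (struct link *l = ci->hydro.density_unpack; l != NULL; l = l->next)
   *     scheduler_addunlock(sched, l->t, ghost_in);
   * instead of trusting a single per-cell unpack pointer. */
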
Rely on links instead*/ // t->ci->hydro.f_unpack = last_created_pair_unpack; From d77dd2fe8e7561f4ad19cdc36710970884bbcf12 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Mon, 4 Nov 2024 16:18:11 +0000 Subject: [PATCH 035/217] Found bug in how we set n_tasks_left* in scheduler_rewait it should be in scheduler_enqueue --- .../HydroTests/GreshoVortex_3D/gresho.yml | 1 + src/engine.c | 34 ++++++++++- src/engine_maketasks.c | 28 ++++----- src/runner_doiact_functions_hydro_gpu.h | 24 ++++---- src/runner_main_clean.cu | 39 ++++++------ src/scheduler.c | 61 ++++++++++++++----- 6 files changed, 127 insertions(+), 60 deletions(-) diff --git a/examples/HydroTests/GreshoVortex_3D/gresho.yml b/examples/HydroTests/GreshoVortex_3D/gresho.yml index a20cbf38c0..ff5ede11dd 100644 --- a/examples/HydroTests/GreshoVortex_3D/gresho.yml +++ b/examples/HydroTests/GreshoVortex_3D/gresho.yml @@ -9,6 +9,7 @@ InternalUnitSystem: Scheduler: max_top_level_cells: 16 cell_split_size: 1000 + deadlock_waiting_time_s: 10. # Parameters governing the time integration TimeIntegration: diff --git a/src/engine.c b/src/engine.c index b353dd4496..b0d632ccb5 100644 --- a/src/engine.c +++ b/src/engine.c @@ -2204,8 +2204,24 @@ void engine_init_particles(struct engine *e, int flag_entropy_ICs, } #endif - scheduler_write_dependencies(&e->sched, e->verbose, e->step); // A. Nasar write deps before running first step +// scheduler_write_dependencies(&e->sched, e->verbose, e->step); // A. Nasar write deps before running first step /* Now, launch the calculation */ +// message("n tasks %i", e->sched.nr_tasks); +// for (int i = 0; i < e->sched.nr_tasks; i++){ +// struct task *tmp_t = &e->sched.tasks[i]; +// if(tmp_t->subtype == task_subtype_density){ +// if(tmp_t->skip == 1)error("inactive density task"); +// } +//// if(tmp_t->subtype == task_subtype_force){ +//// if(tmp_t->skip == 1)error("inactive force task"); +//// } +// if(tmp_t->subtype == task_subtype_gpu_pack){ +// if(tmp_t->skip == 1)error("inactive pack task"); +// } +// if(tmp_t->subtype == task_subtype_gpu_unpack){ +// if(tmp_t->skip == 1)error("inactive unpack task"); +// } +// } TIMER_TIC; engine_launch(e, "tasks"); TIMER_TOC(timer_runners); @@ -2293,6 +2309,22 @@ void engine_init_particles(struct engine *e, int flag_entropy_ICs, scheduler_write_cell_dependencies(&e->sched, e->verbose, e->step); if (e->nodeID == 0) scheduler_write_task_level(&e->sched, e->step); +// for (int i = 0; i < e->sched.nr_tasks; i++){ +// struct task *tmp_t = &e->sched.tasks[i]; +// if(tmp_t->subtype == task_subtype_density){ +// if(tmp_t->skip == 1)error("inactive density task"); +// } +// if(tmp_t->subtype == task_subtype_force){ +// if(tmp_t->skip == 1)error("inactive force task"); +// } +// if(tmp_t->subtype == task_subtype_gpu_pack){ +// if(tmp_t->skip == 1)error("inactive pack task"); +// } +// if(tmp_t->subtype == task_subtype_gpu_unpack){ +// if(tmp_t->skip == 1)error("inactive unpack task"); +// } +// } + /* Run the 0th time-step */ TIMER_TIC2; engine_launch(e, "tasks"); diff --git a/src/engine_maketasks.c b/src/engine_maketasks.c index 7439b83dfe..c0e8627d50 100644 --- a/src/engine_maketasks.c +++ b/src/engine_maketasks.c @@ -4305,8 +4305,8 @@ void engine_make_hydroloop_tasks_mapper(void *map_data, int num_elements, const int sid = sortlistID[(kk + 1) + 3 * ((jj + 1) + 3 * (ii + 1))]; scheduler_addtask(sched, task_type_pair, task_subtype_density, sid, 0, ci, cj); -// scheduler_addtask(sched, task_type_pair, task_subtype_gpu_pack, sid, 0, -// ci, cj); // A. 
Nasar + scheduler_addtask(sched, task_type_pair, task_subtype_gpu_pack, sid, 0, + ci, cj); // A. Nasar #ifdef SWIFT_DEBUG_CHECKS #ifdef WITH_MPI @@ -4870,8 +4870,8 @@ void engine_maketasks(struct engine *e) { else if (t->type == task_type_pair) { if (count_current_pair % pack_size == 0) { -// last_created_pair_unpack = scheduler_addtask( -// sched, task_type_pair, task_subtype_gpu_unpack, 0, 0, NULL, NULL); + last_created_pair_unpack = scheduler_addtask( + sched, task_type_pair, task_subtype_gpu_unpack, 0, 0, NULL, NULL); } /* pack -> unpack -> ghost_in */ @@ -4880,16 +4880,16 @@ void engine_maketasks(struct engine *e) { if(t->cj->hydro.ghost_in == NULL) fprintf(stderr, "Ghost in for cell j is NULL\n"); -// scheduler_addunlock(sched, t, last_created_pair_unpack); -// if(t->ci->nodeID == e->nodeID) -// scheduler_addunlock(sched, last_created_pair_unpack, -// t->ci->hydro.super->hydro.ghost_in); -// if((t->cj->nodeID == e->nodeID) && (t->ci->hydro.super != t->cj->hydro.super)) -// scheduler_addunlock(sched, last_created_pair_unpack, -// t->cj->hydro.super->hydro.ghost_in); -// -// engine_addlink(e, &t->ci->hydro.density_unpack, last_created_pair_unpack); -// engine_addlink(e, &t->cj->hydro.density_unpack, last_created_pair_unpack); + scheduler_addunlock(sched, t, last_created_pair_unpack); + if(t->ci->nodeID == e->nodeID) + scheduler_addunlock(sched, last_created_pair_unpack, + t->ci->hydro.super->hydro.ghost_in); + if((t->cj->nodeID == e->nodeID) && (t->ci->hydro.super != t->cj->hydro.super)) + scheduler_addunlock(sched, last_created_pair_unpack, + t->cj->hydro.super->hydro.ghost_in); + + engine_addlink(e, &t->ci->hydro.density_unpack, last_created_pair_unpack); + engine_addlink(e, &t->cj->hydro.density_unpack, last_created_pair_unpack); /*Useless as this ends up only setting one pair unpack as the unpack task for this cell whilst the cell * interacts with many other cells and can be linked to another unpack task. Rely on links instead*/ diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index 632dc08597..3c6c8792a3 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -203,8 +203,9 @@ double runner_doself1_pack_f4(struct runner *r, struct scheduler *s, struct pack /*Add time to packing_time. Timer for end of GPU work after the if(launch || launch_leftovers statement)*/ clock_gettime(CLOCK_REALTIME, &t1); /* Release the lock on the cell */ -// task_unlock(t); - cell_unlocktree(ci); + task_unlock(t); + t->gpu_done = 1; +// cell_unlocktree(ci); // signal_sleeping_runners(s, t); return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; @@ -588,7 +589,6 @@ double runner_dopair1_pack_f4(struct runner *r, struct scheduler *s, struct pack t->done = 1; /* Copies done. Release the lock ! 
*/ task_unlock(t); -// signal_sleeping_runners(s, t); pack_vars->tasks_packed++; pack_vars->launch = 0; pack_vars->launch_leftovers = 0; @@ -2072,14 +2072,14 @@ void runner_doself1_launch_f4_f(struct runner *r, struct scheduler *s, struct pa *unpack_time += (tp1.tv_sec - tp0.tv_sec) + (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; - scheduler_done(s, tii); +// scheduler_done(s, tii); /* Release the lock */ -// cell_unlocktree(cii); + cell_unlocktree(cii); /*schedule my dependencies (Only unpacks really)*/ -// enqueue_dependencies(s, tii); + enqueue_dependencies(s, tii); /*Signal sleeping runners*/ -// signal_sleeping_runners(s, tii); + signal_sleeping_runners(s, tii); tii->gpu_done = 1; } @@ -2639,11 +2639,11 @@ void runner_dopair1_launch_f4_one_memcpy(struct runner *r, struct scheduler *s, cii->gpu_done_pair++; cjj->gpu_done_pair++; - scheduler_done(s, tii); +// scheduler_done(s, tii); // /* Release the locks */ -// cell_unlocktree(cii); + cell_unlocktree(cii); // /* Release the locks */ -// cell_unlocktree(cjj); + cell_unlocktree(cjj); /*Time end of unpacking*/ clock_gettime(CLOCK_REALTIME, &tp1); @@ -2651,9 +2651,9 @@ void runner_dopair1_launch_f4_one_memcpy(struct runner *r, struct scheduler *s, (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; /*schedule my dependencies (Only unpacks really)*/ -// enqueue_dependencies(s, tii); + enqueue_dependencies(s, tii); /*Signal sleeping runners*/ -// signal_sleeping_runners(s, tii); + signal_sleeping_runners(s, tii); tii->gpu_done = 1; diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 1adcb8e5f1..2f42c80dba 100755 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -1176,24 +1176,24 @@ void *runner_main2(void *data) { // } // else{ //#endif //DO_CORNERS -// packing_time_pair += runner_dopair1_pack_f4(r, sched, pack_vars_pair_dens, ci, -// cj, t, parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens); -// /* Packed enough tasks or no pack tasks left in queue, flag that we want to run */ -// int launch = pack_vars_pair_dens->launch; -// int launch_leftovers = pack_vars_pair_dens->launch_leftovers; -// /* Do we have enough stuff to run the GPU ? */ -// if(launch)n_full_p_d_bundles++; -// if(launch_leftovers)n_partial_p_d_bundles++; -// if(launch || launch_leftovers) { -// /*Launch GPU tasks*/ -//// runner_dopair1_launch(r, sched, pack_vars_pair_dens, ci, t, parts_aos_pair_dens, -//// d_parts_aos_pair_dens, stream, d_a, d_H, e, &packing_time_pair, &time_for_density_gpu_pair); -// runner_dopair1_launch_f4_one_memcpy(r, sched, pack_vars_pair_dens, t, parts_aos_pair_f4_send, parts_aos_pair_f4_recv, -// d_parts_aos_pair_f4_send, d_parts_aos_pair_f4_recv, stream_pairs, d_a, d_H, e, &packing_time_pair, &time_for_density_gpu_pair, -// &unpacking_time_pair, fparti_fpartj_lparti_lpartj_dens, pair_end); + packing_time_pair += runner_dopair1_pack_f4(r, sched, pack_vars_pair_dens, ci, + cj, t, parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens); + /* Packed enough tasks or no pack tasks left in queue, flag that we want to run */ + int launch = pack_vars_pair_dens->launch; + int launch_leftovers = pack_vars_pair_dens->launch_leftovers; + /* Do we have enough stuff to run the GPU ? 
*/ + if(launch)n_full_p_d_bundles++; + if(launch_leftovers)n_partial_p_d_bundles++; + if(launch || launch_leftovers) { + /*Launch GPU tasks*/ +// runner_dopair1_launch(r, sched, pack_vars_pair_dens, ci, t, parts_aos_pair_dens, +// d_parts_aos_pair_dens, stream, d_a, d_H, e, &packing_time_pair, &time_for_density_gpu_pair); + runner_dopair1_launch_f4_one_memcpy(r, sched, pack_vars_pair_dens, t, parts_aos_pair_f4_send, parts_aos_pair_f4_recv, + d_parts_aos_pair_f4_send, d_parts_aos_pair_f4_recv, stream_pairs, d_a, d_H, e, &packing_time_pair, &time_for_density_gpu_pair, + &unpacking_time_pair, fparti_fpartj_lparti_lpartj_dens, pair_end); // } //#ifdef DO_CORNERS -// } /* End of GPU work Pairs */ + } /* End of GPU work Pairs */ //#endif //DO_CORNERS #endif //GPUDENS } /* pair / pack */ @@ -1348,7 +1348,7 @@ void *runner_main2(void *data) { #ifdef EXTRA_HYDRO_LOOP else if (t->subtype == task_subtype_gradient){ int Do_nothing = 0; -//#ifndef GPUOFFLOAD +#ifndef GPUOFFLOAD struct timespec t0, t1, dt; clock_gettime(CLOCK_REALTIME, &t0); runner_dopair1_branch_gradient(r, ci, cj); @@ -1358,7 +1358,7 @@ void *runner_main2(void *data) { (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; -//#endif //GPUGRADPAIR +#endif //GPUGRADPAIR } #endif //EXTRA_HYDRO_LOOP else if (t->subtype == task_subtype_force){ @@ -1763,12 +1763,13 @@ void *runner_main2(void *data) { prev = t; #ifdef GPUOFFLOAD // if (t->type == task_type_self && t->subtype == task_subtype_gpu_pack){ - if (t->type == task_type_self && t->subtype == task_subtype_gpu_pack){// || + if (t->subtype == task_subtype_gpu_pack){// || // t->subtype == task_subtype_gpu_pack_g || // t->subtype == task_subtype_gpu_pack_f){ /* Don't enqueue unpacks yet. Just signal the runners */ t->skip = 1; t = NULL; +// if(t->gpu_done == 0)message("Missed packing a GPU tasks\n"); } else{ /* Mark task as done, as per usual */ t = scheduler_done(sched, t); diff --git a/src/scheduler.c b/src/scheduler.c index 8d2a8e7380..0dff9be0f5 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -2440,25 +2440,30 @@ void scheduler_rewait_mapper(void *map_data, int num_elements, /* Ignore skipped tasks. */ if (t->skip) continue; - if (t->type == task_type_self && t->subtype == task_subtype_gpu_pack) // A. Nasar - atomic_inc(&s->queues[t->ci->hydro.super->owner].n_packs_self_left); - if (t->type == task_type_self && t->subtype == task_subtype_gpu_pack_f) - atomic_inc(&s->queues[t->ci->hydro.super->owner].n_packs_self_left_f); - if (t->type == task_type_self && t->subtype == task_subtype_gpu_pack_g) - atomic_inc(&s->queues[t->ci->hydro.super->owner].n_packs_self_left_g); - - if (t->type == task_type_pair && t->subtype == task_subtype_gpu_pack) - atomic_inc(&s->queues[t->ci->hydro.super->owner].n_packs_pair_left); - if (t->type == task_type_pair && t->subtype == task_subtype_gpu_pack_f) - atomic_inc(&s->queues[t->ci->hydro.super->owner].n_packs_pair_left_f); - if (t->type == task_type_pair && t->subtype == task_subtype_gpu_pack_g) - atomic_inc(&s->queues[t->ci->hydro.super->owner].n_packs_pair_left_g); /* Increment the task's own wait counter for the enqueueing. */ atomic_inc(&t->wait); t->done = 0; t->gpu_done = 0; +// if (t->type == task_type_self){ // A. 
Nasar increment number of waiting tasks +// if(t->subtype == task_subtype_gpu_pack) +// atomic_inc(&s->queues[t->ci->hydro.super->owner].n_packs_self_left); +// if (t->subtype == task_subtype_gpu_pack_f) +// atomic_inc(&s->queues[t->ci->hydro.super->owner].n_packs_self_left_f); +// if (t->subtype == task_subtype_gpu_pack_g) +// atomic_inc(&s->queues[t->ci->hydro.super->owner].n_packs_self_left_g); +// } +// +// if (t->type == task_type_pair){ +// if(t->subtype == task_subtype_gpu_pack) +// atomic_inc(&s->queues[t->ci->hydro.super->owner].n_packs_pair_left); +// if (t->subtype == task_subtype_gpu_pack_f) +// atomic_inc(&s->queues[t->ci->hydro.super->owner].n_packs_pair_left_f); +// if (t->subtype == task_subtype_gpu_pack_g) +// atomic_inc(&s->queues[t->ci->hydro.super->owner].n_packs_pair_left_g); +// } + #ifdef SWIFT_DEBUG_CHECKS /* Check that we don't have more waits that what can be stored. */ if (t->wait < 0) @@ -2850,7 +2855,35 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) { /* Increase the waiting counter. */ atomic_inc(&s->waiting); - + //A. Nasar Do the same for the pack tasks + if (t->type == task_type_self){ + if(t->subtype == task_subtype_gpu_pack) + atomic_inc(&s->queues[qid].n_packs_self_left); + if (t->subtype == task_subtype_gpu_pack_f) + atomic_inc(&s->queues[qid].n_packs_self_left_f); + if (t->subtype == task_subtype_gpu_pack_g) + atomic_inc(&s->queues[qid].n_packs_self_left_g); + } + if (t->type == task_type_pair){ // A. Nasar NEED to think about how to do this with MPI where ci may not be on this node/rank + if(t->subtype == task_subtype_gpu_pack){ + if(t->ci->nodeID == s->nodeID) + atomic_inc(&s->queues[qid].n_packs_pair_left); + else + atomic_inc(&s->queues[qid].n_packs_pair_left); + } + if (t->subtype == task_subtype_gpu_pack_f){ + if(t->ci->nodeID == s->nodeID) + atomic_inc(&s->queues[qid].n_packs_pair_left_f); + else + atomic_inc(&s->queues[qid].n_packs_pair_left_f); + } + if (t->subtype == task_subtype_gpu_pack_g){ + if(t->ci->nodeID == s->nodeID) + atomic_inc(&s->queues[qid].n_packs_pair_left_g); + else + atomic_inc(&s->queues[qid].n_packs_pair_left_g); + } + } /* Insert the task into that queue. */ queue_insert(&s->queues[qid], t); } From db4372787d171ced2f9396189218916c40b25ae1 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Mon, 4 Nov 2024 17:09:17 +0000 Subject: [PATCH 036/217] Fix is in for force and gradient pack tasks but needs de-bugging as code is not giving correct results --- src/engine_maketasks.c | 131 ++++++++++--------- src/runner_doiact_functions_hydro_gpu.h | 37 ++---- src/runner_main_clean.cu | 164 ++++++++++++------------ src/scheduler.c | 8 +- 4 files changed, 167 insertions(+), 173 deletions(-) diff --git a/src/engine_maketasks.c b/src/engine_maketasks.c index c0e8627d50..0033f30150 100644 --- a/src/engine_maketasks.c +++ b/src/engine_maketasks.c @@ -2550,8 +2550,8 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, t_force = scheduler_addtask(sched, task_type_self, task_subtype_force, flags, 0, ci, NULL); /* Task for the second GPU hydro loop A. Nasar */ -// t_force_gpu = scheduler_addtask(sched, task_type_self, task_subtype_gpu_pack_f, -// 0, 0, ci, NULL); + t_force_gpu = scheduler_addtask(sched, task_type_self, task_subtype_gpu_pack_f, + 0, 0, ci, NULL); /* the task for the time-step limiter */ if (with_timestep_limiter) { @@ -2619,7 +2619,7 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, /* Link the tasks to the cells. Do the same for GPU tasks A. 
Nasar */ engine_addlink(e, &ci->hydro.force, t_force); -// engine_addlink(e, &ci->hydro.force_pack, t_force_gpu); + engine_addlink(e, &ci->hydro.force_pack, t_force_gpu); if (with_timestep_limiter) { engine_addlink(e, &ci->hydro.limiter, t_limiter); @@ -2655,12 +2655,12 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, t_gradient = scheduler_addtask(sched, task_type_self, task_subtype_gradient, flags, 0, ci, NULL); /* Same work for the additional GPU hydro loop A. Nasar */ -// t_gradient_gpu = scheduler_addtask(sched, task_type_self, -// task_subtype_gpu_pack_g, 0, 0, ci, NULL); + t_gradient_gpu = scheduler_addtask(sched, task_type_self, + task_subtype_gpu_pack_g, 0, 0, ci, NULL); /* Add the link between the new loops and the cell. Same for GPU task A. Nasar */ engine_addlink(e, &ci->hydro.gradient, t_gradient); -// engine_addlink(e, &ci->hydro.gradient_pack, t_gradient_gpu); + engine_addlink(e, &ci->hydro.gradient_pack, t_gradient_gpu); /* Now, build all the dependencies for the hydro */ engine_make_hydro_loops_dependencies(sched, t, t_gradient, t_force, @@ -2668,14 +2668,14 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, with_timestep_limiter); // A. Nasar add unlocks for pack tasks here. Unpacks depend on packs and will be used to create downstream deps later -// scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, t_gradient_gpu); -// scheduler_addunlock(sched, ci->hydro.super->hydro.extra_ghost, t_force_gpu); + scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, t_gradient_gpu); + scheduler_addunlock(sched, ci->hydro.super->hydro.extra_ghost, t_force_gpu); #else /* Now, build all the dependencies for the hydro */ engine_make_hydro_loops_dependencies(sched, t, t_force, t_limiter, ci, with_cooling, with_timestep_limiter); -// scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, t_force_gpu); + scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, t_force_gpu); #endif /* Create the task dependencies */ @@ -2848,8 +2848,8 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, t_force = scheduler_addtask(sched, task_type_pair, task_subtype_force, flags, 0, ci, cj); /* New task for the force A. Nasar */ -// t_force_gpu = scheduler_addtask(sched, task_type_pair, task_subtype_gpu_pack_f, -// 0, 0, ci, cj); + t_force_gpu = scheduler_addtask(sched, task_type_pair, task_subtype_gpu_pack_f, + 0, 0, ci, cj); #ifdef MPI_SYMMETRIC_FORCE_INTERACTION /* The order of operations for an inactive local cell interacting @@ -2860,10 +2860,10 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, /* Make all force tasks depend on the sorts */ scheduler_addunlock(sched, ci->hydro.super->hydro.sorts, t_force); /* Make GPU force tasks depend on the sorts A. Nasar */ -// scheduler_addunlock(sched, ci->hydro.super->hydro.sorts, t_force_gpu); + scheduler_addunlock(sched, ci->hydro.super->hydro.sorts, t_force_gpu); if (ci->hydro.super != cj->hydro.super) { scheduler_addunlock(sched, cj->hydro.super->hydro.sorts, t_force); -// scheduler_addunlock(sched, cj->hydro.super->hydro.sorts, t_force_gpu); + scheduler_addunlock(sched, cj->hydro.super->hydro.sorts, t_force_gpu); } #endif @@ -2953,8 +2953,8 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, engine_addlink(e, &ci->hydro.force, t_force); engine_addlink(e, &cj->hydro.force, t_force); /* Do teh same for GPU tasks A. 
Nasar*/ -// engine_addlink(e, &ci->hydro.force_pack, t_force_gpu); -// engine_addlink(e, &cj->hydro.force_pack, t_force_gpu); + engine_addlink(e, &ci->hydro.force_pack, t_force_gpu); + engine_addlink(e, &cj->hydro.force_pack, t_force_gpu); if (with_timestep_limiter) { engine_addlink(e, &ci->hydro.limiter, t_limiter); engine_addlink(e, &cj->hydro.limiter, t_limiter); @@ -3007,15 +3007,15 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, t_gradient = scheduler_addtask(sched, task_type_pair, task_subtype_gradient, flags, 0, ci, cj); /* Start by constructing the task for the second and third GPU hydro loop A. Nasar */ -// t_gradient_gpu = scheduler_addtask(sched, task_type_pair, -// task_subtype_gpu_pack_g, 0, 0, ci, cj); + t_gradient_gpu = scheduler_addtask(sched, task_type_pair, + task_subtype_gpu_pack_g, 0, 0, ci, cj); /* Add the link between the new loop and both cells */ engine_addlink(e, &ci->hydro.gradient, t_gradient); engine_addlink(e, &cj->hydro.gradient, t_gradient); // /* Add the link between the new loop and both cells */ -// engine_addlink(e, &ci->hydro.gradient_pack, t_gradient_gpu); -// engine_addlink(e, &cj->hydro.gradient_pack, t_gradient_gpu); + engine_addlink(e, &ci->hydro.gradient_pack, t_gradient_gpu); + engine_addlink(e, &cj->hydro.gradient_pack, t_gradient_gpu); /* Now, build all the dependencies for the hydro for the cells */ /* that are local and are not descendant of the same super_hydro-cells */ @@ -3024,16 +3024,16 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, t_limiter, ci, with_cooling, with_timestep_limiter); /*Same for GPU tasks*/ -// scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, t_gradient_gpu); -// scheduler_addunlock(sched, ci->hydro.super->hydro.extra_ghost, t_force_gpu); + scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, t_gradient_gpu); + scheduler_addunlock(sched, ci->hydro.super->hydro.extra_ghost, t_force_gpu); } if ((cj->nodeID == nodeID) && (ci->hydro.super != cj->hydro.super)) { engine_make_hydro_loops_dependencies(sched, t, t_gradient, t_force, t_limiter, cj, with_cooling, with_timestep_limiter); /*Same for GPU tasks*/ -// scheduler_addunlock(sched, cj->hydro.super->hydro.ghost_out, t_gradient_gpu); -// scheduler_addunlock(sched, cj->hydro.super->hydro.extra_ghost, t_force_gpu); + scheduler_addunlock(sched, cj->hydro.super->hydro.ghost_out, t_gradient_gpu); + scheduler_addunlock(sched, cj->hydro.super->hydro.extra_ghost, t_force_gpu); } #else @@ -3044,14 +3044,14 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, with_cooling, with_timestep_limiter); // GPU tasks A. Nasar -// scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, t_force_gpu); + scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, t_force_gpu); } if ((cj->nodeID == nodeID) && (ci->hydro.super != cj->hydro.super)) { engine_make_hydro_loops_dependencies(sched, t, t_force, t_limiter, cj, with_cooling, with_timestep_limiter); // GPU tasks A. 
Nasar -// scheduler_addunlock(sched, cj->hydro.super->hydro.ghost_out, t_force_gpu); + scheduler_addunlock(sched, cj->hydro.super->hydro.ghost_out, t_force_gpu); } #endif @@ -4954,39 +4954,39 @@ void engine_maketasks(struct engine *e) { if (t->type == task_type_self) { if (count_current_self % pack_size == 0) { -// last_created_self_unpack = scheduler_addtask( -// sched, task_type_self, task_subtype_gpu_unpack_g, 0, 0, NULL, NULL); -// last_created_self_unpack->gpu_done = 0; + last_created_self_unpack = scheduler_addtask( + sched, task_type_self, task_subtype_gpu_unpack_g, 0, 0, NULL, NULL); + last_created_self_unpack->gpu_done = 0; } /* pack -> unpack -> ghost_in */ -// scheduler_addunlock(sched, t, last_created_self_unpack); -// scheduler_addunlock(sched, last_created_self_unpack, -// t->ci->hydro.super->hydro.extra_ghost); -// /*Creating links between a each cell and its unpack task*/ -// engine_addlink(e, &t->ci->hydro.gradient_unpack, last_created_self_unpack); -// t->ci->hydro.g_unpack = last_created_self_unpack; + scheduler_addunlock(sched, t, last_created_self_unpack); + scheduler_addunlock(sched, last_created_self_unpack, + t->ci->hydro.super->hydro.extra_ghost); + /*Creating links between a each cell and its unpack task*/ + engine_addlink(e, &t->ci->hydro.gradient_unpack, last_created_self_unpack); + t->ci->hydro.g_unpack = last_created_self_unpack; ++count_current_self; } else if (t->type == task_type_pair) { if (count_current_pair % pack_size == 0) { -// last_created_pair_unpack = scheduler_addtask( -// sched, task_type_pair, task_subtype_gpu_unpack_g, 0, 0, NULL, NULL); + last_created_pair_unpack = scheduler_addtask( + sched, task_type_pair, task_subtype_gpu_unpack_g, 0, 0, NULL, NULL); } /* pack -> unpack -> ghost_in */ -// scheduler_addunlock(sched, t, last_created_pair_unpack); -// if(t->ci->nodeID == e->nodeID) -// scheduler_addunlock(sched, last_created_pair_unpack, -// t->ci->hydro.super->hydro.extra_ghost); -// if((t->cj->nodeID == e->nodeID) && (t->ci->hydro.super != t->cj->hydro.super)) -// scheduler_addunlock(sched, last_created_pair_unpack, -// t->cj->hydro.super->hydro.extra_ghost); -// -// engine_addlink(e, &t->ci->hydro.gradient_unpack, last_created_pair_unpack); -// engine_addlink(e, &t->cj->hydro.gradient_unpack, last_created_pair_unpack); + scheduler_addunlock(sched, t, last_created_pair_unpack); + if(t->ci->nodeID == e->nodeID) + scheduler_addunlock(sched, last_created_pair_unpack, + t->ci->hydro.super->hydro.extra_ghost); + if((t->cj->nodeID == e->nodeID) && (t->ci->hydro.super != t->cj->hydro.super)) + scheduler_addunlock(sched, last_created_pair_unpack, + t->cj->hydro.super->hydro.extra_ghost); + + engine_addlink(e, &t->ci->hydro.gradient_unpack, last_created_pair_unpack); + engine_addlink(e, &t->cj->hydro.gradient_unpack, last_created_pair_unpack); /*Useless as this ends up only setting one pair unpack as the unpack task for this cell whilst the cell * interacts with many other cells and can be linked to another unpack task. 
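Note on the unpack bundling above: one gpu_unpack task is created for every pack_size pack tasks (the count_current_* % pack_size == 0 test), each pack in the bundle unlocks that shared unpack, and the unpack then unlocks the downstream hydro task of every local cell it touches (ghost_in after the density loop, extra_ghost after the gradient loop, end_force after the force loop), with the extra guard that cj only gets its own unlock when it is local and does not share ci's super cell. A toy model of the bundling arithmetic (names and sizes are illustrative, not SWIFT's):

  #include <stdio.h>

  #define N_PACKS   10   /* illustrative number of pack tasks */
  #define PACK_SIZE 4    /* illustrative bundle size          */

  int main(void) {
    int bundle_of_pack[N_PACKS];  /* which shared unpack each pack unlocks */
    int n_unpacks = 0;

    for (int i = 0; i < N_PACKS; i++) {
      if (i % PACK_SIZE == 0) n_unpacks++;  /* scheduler_addtask(..gpu_unpack..) */
      bundle_of_pack[i] = n_unpacks - 1;    /* scheduler_addunlock(pack, unpack) */
    }

    /* Each unpack would in turn unlock ghost_in / extra_ghost / end_force of
     * the super cells of the (local) cells in its bundle. */
    for (int i = 0; i < N_PACKS; i++)
      printf("pack %d -> unpack %d\n", i, bundle_of_pack[i]);
    return 0;   /* 10 packs with bundles of 4 give 3 unpack tasks */
  }
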
Rely on links instead*/ // t->ci->hydro.g_unpack = last_created_pair_unpack; @@ -5032,17 +5032,16 @@ void engine_maketasks(struct engine *e) { if (t->type == task_type_self) { if (count_current_self % pack_size == 0) { -// last_created_self_unpack = scheduler_addtask( -// sched, task_type_self, task_subtype_gpu_unpack_f, 0, 0, NULL, NULL); -// last_created_self_unpack->gpu_done = 0; + last_created_self_unpack = scheduler_addtask( + sched, task_type_self, task_subtype_gpu_unpack_f, 0, 0, NULL, NULL); } /* pack -> unpack -> ghost_in */ -// scheduler_addunlock(sched, t, last_created_self_unpack); -// scheduler_addunlock(sched, last_created_self_unpack, -// t->ci->hydro.super->hydro.end_force); -// /*Creating links between a each cell and its unpack task*/ -// engine_addlink(e, &t->ci->hydro.force_unpack, last_created_self_unpack); + scheduler_addunlock(sched, t, last_created_self_unpack); + scheduler_addunlock(sched, last_created_self_unpack, + t->ci->hydro.super->hydro.end_force); + /*Creating links between a each cell and its unpack task*/ + engine_addlink(e, &t->ci->hydro.force_unpack, last_created_self_unpack); // // t->ci->hydro.f_unpack = last_created_self_unpack; @@ -5051,21 +5050,21 @@ void engine_maketasks(struct engine *e) { else if (t->type == task_type_pair) { if (count_current_pair % pack_size == 0) { -// last_created_pair_unpack = scheduler_addtask( -// sched, task_type_pair, task_subtype_gpu_unpack_f, 0, 0, NULL, NULL); + last_created_pair_unpack = scheduler_addtask( + sched, task_type_pair, task_subtype_gpu_unpack_f, 0, 0, NULL, NULL); } /* pack -> unpack -> ghost_in */ -// scheduler_addunlock(sched, t, last_created_pair_unpack); -// if(t->ci->nodeID == e->nodeID) -// scheduler_addunlock(sched, last_created_pair_unpack, -// t->ci->hydro.super->hydro.end_force); -// if((t->cj->nodeID == e->nodeID) && (t->ci->hydro.super != t->cj->hydro.super)) -// scheduler_addunlock(sched, last_created_pair_unpack, -// t->cj->hydro.super->hydro.end_force); -// -// engine_addlink(e, &t->ci->hydro.force_unpack, last_created_pair_unpack); -// engine_addlink(e, &t->cj->hydro.force_unpack, last_created_pair_unpack); + scheduler_addunlock(sched, t, last_created_pair_unpack); + if(t->ci->nodeID == e->nodeID) + scheduler_addunlock(sched, last_created_pair_unpack, + t->ci->hydro.super->hydro.end_force); + if((t->cj->nodeID == e->nodeID) && (t->ci->hydro.super != t->cj->hydro.super)) + scheduler_addunlock(sched, last_created_pair_unpack, + t->cj->hydro.super->hydro.end_force); + + engine_addlink(e, &t->ci->hydro.force_unpack, last_created_pair_unpack); + engine_addlink(e, &t->cj->hydro.force_unpack, last_created_pair_unpack); /*Useless as this ends up only setting one pair unpack as the unpack task for this cell whilst the cell * interacts with many other cells and can be linked to another unpack task. Rely on links instead*/ // t->ci->hydro.f_unpack = last_created_pair_unpack; diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index 3c6c8792a3..d525ed84eb 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -769,7 +769,6 @@ double runner_dopair1_pack_f4_g(struct runner *r, struct scheduler *s, struct pa t->done = 1; /* Copies done. Release the lock ! 
*/ task_unlock(t); -// signal_sleeping_runners(s, t); pack_vars->tasks_packed++; pack_vars->launch = 0; pack_vars->launch_leftovers = 0; @@ -950,7 +949,6 @@ double runner_dopair1_pack_f4_f(struct runner *r, struct scheduler *s, struct pa t->done = 1; /* Copies done. Release the lock ! */ task_unlock(t); -// signal_sleeping_runners(s, t); pack_vars->tasks_packed++; pack_vars->launch = 0; pack_vars->launch_leftovers = 0; @@ -1337,7 +1335,6 @@ void runner_doself1_launch_f4(struct runner *r, struct scheduler *s, struct pack *unpack_time += (tp1.tv_sec - tp0.tv_sec) + (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; -// scheduler_done(s, tii); /* Release the lock */ cell_unlocktree(cii); @@ -1701,15 +1698,14 @@ void runner_doself1_launch_f4_g(struct runner *r, struct scheduler *s, struct pa /* Record things for debugging */ cii->gpu_done_g++; - scheduler_done(s, tii); /* Release the lock */ -// cell_unlocktree(cii); + cell_unlocktree(cii); /*schedule my dependencies (Only unpacks really)*/ -// enqueue_dependencies(s, tii); + enqueue_dependencies(s, tii); /*Signal sleeping runners*/ -// signal_sleeping_runners(s, tii); + signal_sleeping_runners(s, tii); tii->gpu_done = 1; @@ -2072,7 +2068,6 @@ void runner_doself1_launch_f4_f(struct runner *r, struct scheduler *s, struct pa *unpack_time += (tp1.tv_sec - tp0.tv_sec) + (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; -// scheduler_done(s, tii); /* Release the lock */ cell_unlocktree(cii); @@ -2639,7 +2634,6 @@ void runner_dopair1_launch_f4_one_memcpy(struct runner *r, struct scheduler *s, cii->gpu_done_pair++; cjj->gpu_done_pair++; -// scheduler_done(s, tii); // /* Release the locks */ cell_unlocktree(cii); // /* Release the locks */ @@ -3291,11 +3285,10 @@ void runner_dopair1_launch_f4_g_one_memcpy(struct runner *r, struct scheduler *s cii->gpu_done_pair_g++; cjj->gpu_done_pair_g++; - scheduler_done(s, tii); -// /* Release the locks */ -// cell_unlocktree(cii); -// /* Release the locks */ -// cell_unlocktree(cjj); + /* Release the locks */ + cell_unlocktree(cii); + /* Release the locks */ + cell_unlocktree(cjj); /*Time end of unpacking*/ clock_gettime(CLOCK_REALTIME, &tp1); @@ -3303,9 +3296,9 @@ void runner_dopair1_launch_f4_g_one_memcpy(struct runner *r, struct scheduler *s (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; /*schedule my dependencies (Only unpacks really)*/ -// enqueue_dependencies(s, tii); + enqueue_dependencies(s, tii); /*Signal sleeping runners*/ -// signal_sleeping_runners(s, tii); + signal_sleeping_runners(s, tii); tii->gpu_done = 1; @@ -3971,12 +3964,10 @@ void runner_dopair1_launch_f4_f_one_memcpy(struct runner *r, struct scheduler *s cii->gpu_done_pair_f++; cjj->gpu_done_pair_f++; - scheduler_done(s, tii); - // /* Release the locks */ -// cell_unlocktree(cii); + cell_unlocktree(cii); // /* Release the locks */ -// cell_unlocktree(cjj); + cell_unlocktree(cjj); /*Time end of unpacking*/ clock_gettime(CLOCK_REALTIME, &tp1); @@ -3984,9 +3975,9 @@ void runner_dopair1_launch_f4_f_one_memcpy(struct runner *r, struct scheduler *s (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; /*schedule my dependencies (Only unpacks really)*/ -// enqueue_dependencies(s, tii); -// /*Signal sleeping runners*/ -// signal_sleeping_runners(s, tii); + enqueue_dependencies(s, tii); + /*Signal sleeping runners*/ + signal_sleeping_runners(s, tii); tii->gpu_done = 1; diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 2f42c80dba..37a64152a9 100755 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -991,54 +991,54 @@ void *runner_main2(void 
*data) { else if (t->subtype == task_subtype_gpu_pack_g){ packed_self_g++; #ifdef GPUOFFLOAD -//// runner_doself1_pack_g(r, sched, pack_vars_self_grad, ci, -//// t, parts_aos_grad, &packing_time_g); -// packing_time_g += runner_doself1_pack_f4_g(r, sched, pack_vars_self_grad, ci, -// t, parts_aos_grad_f4_send, task_first_part_f4_g); -// /* No pack tasks left in queue, flag that we want to run */ -// int launch_leftovers = pack_vars_self_grad->launch_leftovers; -// /*Packed enough tasks let's go*/ -// int launch = pack_vars_self_grad->launch; -// /* Do we have enough stuff to run the GPU ? */ -// if (launch || launch_leftovers) { -// /*Launch GPU tasks*/ -//// runner_doself1_launch_g(r, sched, pack_vars_self_grad, ci, t, parts_aos_grad, -//// d_parts_aos_grad, stream, d_a, d_H, e, &packing_time_g, &time_for_gpu_g); -// runner_doself1_launch_f4_g(r, sched, pack_vars_self_grad, ci, -// t, parts_aos_grad_f4_send, parts_aos_grad_f4_recv, d_parts_aos_grad_f4_send, -// d_parts_aos_grad_f4_recv, stream, d_a, d_H, -// e, &packing_time_g, &time_for_gpu_g, task_first_part_f4_g, d_task_first_part_f4_g, -// self_end_g, &unpack_time_self_g); -// } /*End of GPU work Self*/ +// runner_doself1_pack_g(r, sched, pack_vars_self_grad, ci, +// t, parts_aos_grad, &packing_time_g); + packing_time_g += runner_doself1_pack_f4_g(r, sched, pack_vars_self_grad, ci, + t, parts_aos_grad_f4_send, task_first_part_f4_g); + /* No pack tasks left in queue, flag that we want to run */ + int launch_leftovers = pack_vars_self_grad->launch_leftovers; + /*Packed enough tasks let's go*/ + int launch = pack_vars_self_grad->launch; + /* Do we have enough stuff to run the GPU ? */ + if (launch || launch_leftovers) { + /*Launch GPU tasks*/ +// runner_doself1_launch_g(r, sched, pack_vars_self_grad, ci, t, parts_aos_grad, +// d_parts_aos_grad, stream, d_a, d_H, e, &packing_time_g, &time_for_gpu_g); + runner_doself1_launch_f4_g(r, sched, pack_vars_self_grad, ci, + t, parts_aos_grad_f4_send, parts_aos_grad_f4_recv, d_parts_aos_grad_f4_send, + d_parts_aos_grad_f4_recv, stream, d_a, d_H, + e, &packing_time_g, &time_for_gpu_g, task_first_part_f4_g, d_task_first_part_f4_g, + self_end_g, &unpack_time_self_g); + } /*End of GPU work Self*/ #endif //GPUGRADSELF } else if (t->subtype == task_subtype_gpu_pack_f){ packed_self_f++; #ifdef GPUOFFLOAD -//// runner_doself1_pack_f(r, sched, pack_vars_self_forc, ci, -//// t, parts_aos_forc, &packing_time_f); -// packing_time_f += runner_doself1_pack_f4_f(r, sched, pack_vars_self_forc, ci, -// t, parts_aos_forc_f4_send, task_first_part_f4_f); -//// int count = ci->hydro.count; -//// for(int i = 0; i < count; i++){ -//// int pid = pack_vars_self_forc->count_parts - count + i; -//// if(parts_aos_forc_f4_send[pid].ux_m.w < 1e-9)fprintf(stderr, "zero mass after packing %i %f\n", pid, parts_aos_forc_f4_send[pid].ux_m.w); -//// } -// /* No pack tasks left in queue, flag that we want to run */ -// int launch_leftovers = pack_vars_self_forc->launch_leftovers; -// /*Packed enough tasks let's go*/ -// int launch = pack_vars_self_forc->launch; -// /* Do we have enough stuff to run the GPU ? 
*/ -// if (launch || launch_leftovers) { -// /*Launch GPU tasks*/ -//// runner_doself1_launch_f(r, sched, pack_vars_self_forc, ci, t, parts_aos_forc, -//// d_parts_aos_forc, stream, d_a, d_H, e, &packing_time_f, &time_for_gpu_f); -// runner_doself1_launch_f4_f(r, sched, pack_vars_self_forc, ci, -// t, parts_aos_forc_f4_send, parts_aos_forc_f4_recv, d_parts_aos_forc_f4_send, -// d_parts_aos_forc_f4_recv, stream, d_a, d_H, -// e, &packing_time_f, &time_for_gpu_f, task_first_part_f4_f, d_task_first_part_f4_f, -// self_end_f, &unpack_time_self_f); -// } /*End of GPU work Self*/ +// runner_doself1_pack_f(r, sched, pack_vars_self_forc, ci, +// t, parts_aos_forc, &packing_time_f); + packing_time_f += runner_doself1_pack_f4_f(r, sched, pack_vars_self_forc, ci, + t, parts_aos_forc_f4_send, task_first_part_f4_f); +// int count = ci->hydro.count; +// for(int i = 0; i < count; i++){ +// int pid = pack_vars_self_forc->count_parts - count + i; +// if(parts_aos_forc_f4_send[pid].ux_m.w < 1e-9)fprintf(stderr, "zero mass after packing %i %f\n", pid, parts_aos_forc_f4_send[pid].ux_m.w); +// } + /* No pack tasks left in queue, flag that we want to run */ + int launch_leftovers = pack_vars_self_forc->launch_leftovers; + /*Packed enough tasks let's go*/ + int launch = pack_vars_self_forc->launch; + /* Do we have enough stuff to run the GPU ? */ + if (launch || launch_leftovers) { + /*Launch GPU tasks*/ +// runner_doself1_launch_f(r, sched, pack_vars_self_forc, ci, t, parts_aos_forc, +// d_parts_aos_forc, stream, d_a, d_H, e, &packing_time_f, &time_for_gpu_f); + runner_doself1_launch_f4_f(r, sched, pack_vars_self_forc, ci, + t, parts_aos_forc_f4_send, parts_aos_forc_f4_recv, d_parts_aos_forc_f4_send, + d_parts_aos_forc_f4_recv, stream, d_a, d_H, + e, &packing_time_f, &time_for_gpu_f, task_first_part_f4_f, d_task_first_part_f4_f, + self_end_f, &unpack_time_self_f); + } /*End of GPU work Self*/ #endif //GPUFORCSELF } #ifdef EXTRA_HYDRO_LOOP @@ -1245,23 +1245,23 @@ void *runner_main2(void *data) { // } // else{ //#endif //DO_CORNERS -//// runner_dopair1_pack_g(r, sched, pack_vars_pair_grad, ci, -//// cj, t, parts_aos_pair_grad, e, &packing_time_g); -// packing_time_pair_g += runner_dopair1_pack_f4_g(r, sched, pack_vars_pair_grad, ci, -// cj, t, parts_aos_pair_f4_g_send, e, fparti_fpartj_lparti_lpartj_grad); -// /* No pack tasks left in queue, flag that we want to run */ -// int launch_leftovers = pack_vars_pair_grad->launch_leftovers; -// /*Packed enough tasks let's go*/ -// int launch = pack_vars_pair_grad->launch; -// /* Do we have enough stuff to run the GPU ? 
*/ -// if (launch || launch_leftovers) { -// /*Launch GPU tasks*/ -//// runner_dopair1_launch_g(r, sched, pack_vars_pair_grad, ci, t, parts_aos_pair_grad, -//// d_parts_aos_pair_grad, stream, d_a, d_H, e, &packing_time_pair_g, &time_for_gpu_pair_g); -// runner_dopair1_launch_f4_g_one_memcpy(r, sched, pack_vars_pair_grad, t, parts_aos_pair_f4_g_send, parts_aos_pair_f4_g_recv, -// d_parts_aos_pair_f4_g_send, d_parts_aos_pair_f4_g_recv, stream_pairs, d_a, d_H, e, &packing_time_pair_g, &time_for_gpu_pair_g, -// &unpacking_time_pair_g, fparti_fpartj_lparti_lpartj_grad, pair_end_g); -// } +// runner_dopair1_pack_g(r, sched, pack_vars_pair_grad, ci, +// cj, t, parts_aos_pair_grad, e, &packing_time_g); + packing_time_pair_g += runner_dopair1_pack_f4_g(r, sched, pack_vars_pair_grad, ci, + cj, t, parts_aos_pair_f4_g_send, e, fparti_fpartj_lparti_lpartj_grad); + /* No pack tasks left in queue, flag that we want to run */ + int launch_leftovers = pack_vars_pair_grad->launch_leftovers; + /*Packed enough tasks let's go*/ + int launch = pack_vars_pair_grad->launch; + /* Do we have enough stuff to run the GPU ? */ + if (launch || launch_leftovers) { + /*Launch GPU tasks*/ +// runner_dopair1_launch_g(r, sched, pack_vars_pair_grad, ci, t, parts_aos_pair_grad, +// d_parts_aos_pair_grad, stream, d_a, d_H, e, &packing_time_pair_g, &time_for_gpu_pair_g); + runner_dopair1_launch_f4_g_one_memcpy(r, sched, pack_vars_pair_grad, t, parts_aos_pair_f4_g_send, parts_aos_pair_f4_g_recv, + d_parts_aos_pair_f4_g_send, d_parts_aos_pair_f4_g_recv, stream_pairs, d_a, d_H, e, &packing_time_pair_g, &time_for_gpu_pair_g, + &unpacking_time_pair_g, fparti_fpartj_lparti_lpartj_grad, pair_end_g); + } //#ifdef DO_CORNERS // }/* End of GPU work Pairs */ //#endif //DO_CORNERS @@ -1314,23 +1314,23 @@ void *runner_main2(void *data) { // } // else{ //#endif //DO_CORNERS -//// runner_dopair1_pack_f(r, sched, pack_vars_pair_forc, ci, -//// cj, t, parts_aos_pair_forc, e, &packing_time_f); -// packing_time_pair_f += runner_dopair1_pack_f4_f(r, sched, pack_vars_pair_forc, ci, -// cj, t, parts_aos_pair_f4_f_send, e, fparti_fpartj_lparti_lpartj_forc); -// /* No pack tasks left in queue, flag that we want to run */ -// int launch_leftovers = pack_vars_pair_forc->launch_leftovers; -// /*Packed enough tasks let's go*/ -// int launch = pack_vars_pair_forc->launch; -// /* Do we have enough stuff to run the GPU ? */ -// if (launch || launch_leftovers) { -// /*Launch GPU tasks*/ -//// runner_dopair1_launch_f(r, sched, pack_vars_pair_forc, ci, t, parts_aos_pair_forc, -//// d_parts_aos_pair_forc, stream, d_a, d_H, e, &packing_time_pair_f, &time_for_gpu_pair_f); -// runner_dopair1_launch_f4_f_one_memcpy(r, sched, pack_vars_pair_forc, t, parts_aos_pair_f4_f_send, parts_aos_pair_f4_f_recv, -// d_parts_aos_pair_f4_f_send, d_parts_aos_pair_f4_f_recv, stream_pairs, d_a, d_H, e, &packing_time_pair_f, &time_for_gpu_pair_f, -// &unpacking_time_pair_f, fparti_fpartj_lparti_lpartj_forc, pair_end_f); -// } /* End of GPU work Pairs */ +// runner_dopair1_pack_f(r, sched, pack_vars_pair_forc, ci, +// cj, t, parts_aos_pair_forc, e, &packing_time_f); + packing_time_pair_f += runner_dopair1_pack_f4_f(r, sched, pack_vars_pair_forc, ci, + cj, t, parts_aos_pair_f4_f_send, e, fparti_fpartj_lparti_lpartj_forc); + /* No pack tasks left in queue, flag that we want to run */ + int launch_leftovers = pack_vars_pair_forc->launch_leftovers; + /*Packed enough tasks let's go*/ + int launch = pack_vars_pair_forc->launch; + /* Do we have enough stuff to run the GPU ? 
*/ + if (launch || launch_leftovers) { + /*Launch GPU tasks*/ +// runner_dopair1_launch_f(r, sched, pack_vars_pair_forc, ci, t, parts_aos_pair_forc, +// d_parts_aos_pair_forc, stream, d_a, d_H, e, &packing_time_pair_f, &time_for_gpu_pair_f); + runner_dopair1_launch_f4_f_one_memcpy(r, sched, pack_vars_pair_forc, t, parts_aos_pair_f4_f_send, parts_aos_pair_f4_f_recv, + d_parts_aos_pair_f4_f_send, d_parts_aos_pair_f4_f_recv, stream_pairs, d_a, d_H, e, &packing_time_pair_f, &time_for_gpu_pair_f, + &unpacking_time_pair_f, fparti_fpartj_lparti_lpartj_forc, pair_end_f); + } /* End of GPU work Pairs */ //#ifdef DO_CORNERS // } //#endif //DO_CORNERS @@ -1363,7 +1363,7 @@ void *runner_main2(void *data) { #endif //EXTRA_HYDRO_LOOP else if (t->subtype == task_subtype_force){ int Do_nothing = 0; -//#ifndef GPUOFFLOAD +#ifndef GPUOFFLOAD struct timespec t0, t1, dt; clock_gettime(CLOCK_REALTIME, &t0); runner_dopair2_branch_force(r, ci, cj); @@ -1373,7 +1373,7 @@ void *runner_main2(void *data) { (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; -//#endif //GPUFORCPAIR +#endif //GPUFORCPAIR } else if (t->subtype == task_subtype_limiter) runner_dopair1_branch_limiter(r, ci, cj); @@ -1763,9 +1763,9 @@ void *runner_main2(void *data) { prev = t; #ifdef GPUOFFLOAD // if (t->type == task_type_self && t->subtype == task_subtype_gpu_pack){ - if (t->subtype == task_subtype_gpu_pack){// || -// t->subtype == task_subtype_gpu_pack_g || -// t->subtype == task_subtype_gpu_pack_f){ + if (t->subtype == task_subtype_gpu_pack || + t->subtype == task_subtype_gpu_pack_g || + t->subtype == task_subtype_gpu_pack_f){ /* Don't enqueue unpacks yet. Just signal the runners */ t->skip = 1; t = NULL; diff --git a/src/scheduler.c b/src/scheduler.c index 0dff9be0f5..509205597c 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -1869,8 +1869,12 @@ void scheduler_set_unlocks(struct scheduler *s) { struct task *t = &s->tasks[k]; for (int i = 0; i < t->nr_unlock_tasks; i++) { for (int j = i + 1; j < t->nr_unlock_tasks; j++) { -// if (t->unlock_tasks[i] == t->unlock_tasks[j] && t->subtype != task_subtype_gpu_unpack -// && t->subtype != task_subtype_gpu_unpack_g && t->subtype != task_subtype_gpu_unpack_f) + /*Fix for the case when one unpack task works over the same cell connected to two pair pack tasks*/ + if (t->unlock_tasks[i]->subtype == task_subtype_gpu_unpack || + t->unlock_tasks[i]->subtype == task_subtype_gpu_unpack_g || + t->unlock_tasks[i]->subtype != task_subtype_gpu_unpack_f){ + continue; + } if (t->unlock_tasks[i] == t->unlock_tasks[j]) error("duplicate unlock! t->type=%s/%s unlocking type=%s/%s", taskID_names[t->type], subtaskID_names[t->subtype], From abecded7e443ba8fa4f89719cf0312b87cf6748b Mon Sep 17 00:00:00 2001 From: Abouzied Date: Tue, 5 Nov 2024 09:15:27 +0000 Subject: [PATCH 037/217] Missing brackets. FIX! 
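
Note (not part of the original commit message): the unbalanced brackets appear to come from the DO_CORNERS guards re-enabled in the diff below. The `else {` of the corner-pair branch is opened inside one #ifdef DO_CORNERS block and its matching `}` only appears inside a later #ifdef DO_CORNERS block, so the two guards and both braces have to be switched on or off together. The following is a minimal sketch of that guard/brace pairing, assuming the structure shown in the diff; DO_CORNERS, the launch/launch_leftovers flags and the corner-pair test are taken from the patch, while handle_pair(), is_corner and the printf placeholders are made up for illustration and are not the runner code.

    #include <stdio.h>

    #define DO_CORNERS

    /* Sketch only: the `else {` opened in the first DO_CORNERS block is
     * closed in the second one, so the preprocessor guards and the braces
     * must be enabled (or disabled) as a pair. */
    static void handle_pair(int is_corner, int launch, int launch_leftovers) {
    #ifdef DO_CORNERS
      if (is_corner) {
        printf("corner pair: interact on the CPU and enqueue dependencies\n");
      } else {
    #endif /* DO_CORNERS */
        printf("pack the pair for the GPU\n");
        if (launch || launch_leftovers) {
          printf("launch the GPU kernel and unpack the results\n");
        } /* End of GPU work Pairs */
    #ifdef DO_CORNERS
      } /* closes the else opened in the first DO_CORNERS block */
    #endif /* DO_CORNERS */
    }

    int main(void) {
      handle_pair(0, 1, 0);
      return 0;
    }

With DO_CORNERS defined the braces balance as if/else; with it undefined only the GPU path remains and the extra `}` disappears with the guard, which is why dropping one guard but not the other (as in the previous commit) left a brace hanging.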
--- src/runner_main_clean.cu | 296 +++++++++++++++++++-------------------- 1 file changed, 148 insertions(+), 148 deletions(-) diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 37a64152a9..76cb61a582 100755 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -1044,7 +1044,7 @@ void *runner_main2(void *data) { #ifdef EXTRA_HYDRO_LOOP else if (t->subtype == task_subtype_gradient) { cpu_self_g++; -//#ifndef GPUOFFLOAD +#ifndef GPUOFFLOAD struct timespec t0, t1, dt; clock_gettime(CLOCK_REALTIME, &t0); runner_doself1_branch_gradient(r, ci); @@ -1054,12 +1054,12 @@ void *runner_main2(void *data) { (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; -//#endif //GPUGRADSELF +#endif //GPUGRADSELF } #endif else if (t->subtype == task_subtype_force) { cpu_self_f++; -//#ifndef GPUOFFLOAD +#ifndef GPUOFFLOAD struct timespec t0, t1; clock_gettime(CLOCK_REALTIME, &t0); runner_doself2_branch_force(r, ci); @@ -1069,7 +1069,7 @@ void *runner_main2(void *data) { (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; -//#endif //GPUFORCSELF +#endif //GPUFORCSELF } else if (t->subtype == task_subtype_limiter) runner_doself1_branch_limiter(r, ci); else if (t->subtype == task_subtype_grav) @@ -1115,7 +1115,7 @@ void *runner_main2(void *data) { if (t->subtype == task_subtype_density) { /* Abouzied: To be commented out when the GPU pairs have been coded up */ cpu_pair++; -//#ifndef GPUOFFLOAD +#ifndef GPUOFFLOAD struct timespec t0, t1, dt; clock_gettime(CLOCK_REALTIME, &t0); runner_dopair1_branch_density(r, ci, cj); @@ -1125,57 +1125,57 @@ void *runner_main2(void *data) { (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; -//#endif +#endif } /* GPU WORK */ else if (t->subtype == task_subtype_gpu_pack) { packed_pair++; #ifdef GPUOFFLOAD -//#ifdef DO_CORNERS -// struct timespec t0, t1, dt; -// clock_gettime(CLOCK_REALTIME, &t0); -// double shift[3] = {0.0}; -// t->corner_pair = 0; -// int sid = space_getsid_filter(e->s, &ci, &cj, shift); -// clock_gettime(CLOCK_REALTIME, &t1); -// packing_time_pair += -// (t1.tv_sec - t0.tv_sec) + -// (t1.tv_nsec - t0.tv_nsec) / -// 1000000000.0; -// if((sid == 0 || sid == 2 || sid == 6 || sid == 8) && step > 1){ -//// if((sid != 4 && sid != 10 && sid == 12) && step > 1){ -// clock_gettime(CLOCK_REALTIME, &t0); -// runner_dopair1_branch_density(r, ci, cj); -// t->corner_pair = 1; -// int qid = r->qid; -// atomic_dec(&(sched->queues[qid].n_packs_pair_left)); -// /* Tell the cells they have been packed */ -// ci->pack_done++; -// cj->pack_done++; -// t->done = 1; -// int launch = 0, launch_leftovers = 0; -// if ((sched->queues[qid].n_packs_pair_left == 0)) -// launch_leftovers = 1; -// /* Tasks done. Release the lock ! 
*/ -// task_unlock(t); -// /*schedule my dependencies (Only unpacks really)*/ -// enqueue_dependencies(sched, t); -// /*Signal sleeping runners*/ -// signal_sleeping_runners(sched, t); -// clock_gettime(CLOCK_REALTIME, &t1); -// packing_time_pair += -// (t1.tv_sec - t0.tv_sec) + -// (t1.tv_nsec - t0.tv_nsec) / -// 1000000000.0; -// if (launch_leftovers) { -// pack_vars_pair_dens->launch_leftovers = 1; -// runner_dopair1_launch_f4_one_memcpy(r, sched, pack_vars_pair_dens, t, parts_aos_pair_f4_send, parts_aos_pair_f4_recv, -// d_parts_aos_pair_f4_send, d_parts_aos_pair_f4_recv, stream_pairs, d_a, d_H, e, &packing_time_pair, &time_for_density_gpu_pair, -// &unpacking_time_pair, fparti_fpartj_lparti_lpartj_dens, pair_end); -// } -// } -// else{ -//#endif //DO_CORNERS +#ifdef DO_CORNERS + struct timespec t0, t1, dt; + clock_gettime(CLOCK_REALTIME, &t0); + double shift[3] = {0.0}; + t->corner_pair = 0; + int sid = space_getsid_filter(e->s, &ci, &cj, shift); + clock_gettime(CLOCK_REALTIME, &t1); + packing_time_pair += + (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / + 1000000000.0; + if((sid == 0 || sid == 2 || sid == 6 || sid == 8) && step > 1){ +// if((sid != 4 && sid != 10 && sid == 12) && step > 1){ + clock_gettime(CLOCK_REALTIME, &t0); + runner_dopair1_branch_density(r, ci, cj); + t->corner_pair = 1; + int qid = r->qid; + atomic_dec(&(sched->queues[qid].n_packs_pair_left)); + /* Tell the cells they have been packed */ + ci->pack_done++; + cj->pack_done++; + t->done = 1; + int launch = 0, launch_leftovers = 0; + if ((sched->queues[qid].n_packs_pair_left == 0)) + launch_leftovers = 1; + /* Tasks done. Release the lock ! */ + task_unlock(t); + /*schedule my dependencies (Only unpacks really)*/ + enqueue_dependencies(sched, t); + /*Signal sleeping runners*/ + signal_sleeping_runners(sched, t); + clock_gettime(CLOCK_REALTIME, &t1); + packing_time_pair += + (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / + 1000000000.0; + if (launch_leftovers) { + pack_vars_pair_dens->launch_leftovers = 1; + runner_dopair1_launch_f4_one_memcpy(r, sched, pack_vars_pair_dens, t, parts_aos_pair_f4_send, parts_aos_pair_f4_recv, + d_parts_aos_pair_f4_send, d_parts_aos_pair_f4_recv, stream_pairs, d_a, d_H, e, &packing_time_pair, &time_for_density_gpu_pair, + &unpacking_time_pair, fparti_fpartj_lparti_lpartj_dens, pair_end); + } + } + else{ +#endif //DO_CORNERS packing_time_pair += runner_dopair1_pack_f4(r, sched, pack_vars_pair_dens, ci, cj, t, parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens); /* Packed enough tasks or no pack tasks left in queue, flag that we want to run */ @@ -1192,59 +1192,59 @@ void *runner_main2(void *data) { d_parts_aos_pair_f4_send, d_parts_aos_pair_f4_recv, stream_pairs, d_a, d_H, e, &packing_time_pair, &time_for_density_gpu_pair, &unpacking_time_pair, fparti_fpartj_lparti_lpartj_dens, pair_end); // } -//#ifdef DO_CORNERS +#ifdef DO_CORNERS } /* End of GPU work Pairs */ -//#endif //DO_CORNERS +#endif //DO_CORNERS #endif //GPUDENS } /* pair / pack */ else if (t->subtype == task_subtype_gpu_pack_g){ packed_pair_g++; #ifdef GPUOFFLOAD -//#ifdef DO_CORNERS -// struct timespec t0, t1, dt; -// clock_gettime(CLOCK_REALTIME, &t0); -// double shift[3] = {0.0}; -// t->corner_pair = 0; -// int sid = space_getsid_filter(e->s, &ci, &cj, shift); -// clock_gettime(CLOCK_REALTIME, &t1); -// packing_time_pair += -// (t1.tv_sec - t0.tv_sec) + -// (t1.tv_nsec - t0.tv_nsec) / -// 1000000000.0; -// if((sid == 0 || sid == 2 || sid == 6 || sid == 8) && step > 1){ -//// if((sid != 4 && sid 
!= 10 && sid == 12) && step > 1){ -// clock_gettime(CLOCK_REALTIME, &t0); -// runner_dopair1_branch_gradient(r, ci, cj); -// t->corner_pair = 1; -// int qid = r->qid; -// atomic_dec(&(sched->queues[qid].n_packs_pair_left_g)); -// /* Tell the cells they have been packed */ -// ci->pack_done++; -// cj->pack_done++; -// t->done = 1; -// int launch = 0, launch_leftovers = 0; -// if ((sched->queues[qid].n_packs_pair_left_g == 0)) -// launch_leftovers = 1; -// /* Tasks done. Release the lock ! */ -// task_unlock(t); -// /*schedule my dependencies (Only unpacks really)*/ -// enqueue_dependencies(sched, t); -// /*Signal sleeping runners*/ -// signal_sleeping_runners(sched, t); -// clock_gettime(CLOCK_REALTIME, &t1); -// packing_time_pair_g += -// (t1.tv_sec - t0.tv_sec) + -// (t1.tv_nsec - t0.tv_nsec) / -// 1000000000.0; -// if (launch_leftovers) { -// pack_vars_pair_grad->launch_leftovers = 1; -// runner_dopair1_launch_f4_g_one_memcpy(r, sched, pack_vars_pair_grad, t, parts_aos_pair_f4_g_send, parts_aos_pair_f4_g_recv, -// d_parts_aos_pair_f4_g_send, d_parts_aos_pair_f4_g_recv, stream_pairs, d_a, d_H, e, &packing_time_pair_g, &time_for_gpu_pair_g, -// &unpacking_time_pair_g, fparti_fpartj_lparti_lpartj_grad, pair_end_g); -// } -// } -// else{ -//#endif //DO_CORNERS +#ifdef DO_CORNERS + struct timespec t0, t1, dt; + clock_gettime(CLOCK_REALTIME, &t0); + double shift[3] = {0.0}; + t->corner_pair = 0; + int sid = space_getsid_filter(e->s, &ci, &cj, shift); + clock_gettime(CLOCK_REALTIME, &t1); + packing_time_pair += + (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / + 1000000000.0; + if((sid == 0 || sid == 2 || sid == 6 || sid == 8) && step > 1){ +// if((sid != 4 && sid != 10 && sid == 12) && step > 1){ + clock_gettime(CLOCK_REALTIME, &t0); + runner_dopair1_branch_gradient(r, ci, cj); + t->corner_pair = 1; + int qid = r->qid; + atomic_dec(&(sched->queues[qid].n_packs_pair_left_g)); + /* Tell the cells they have been packed */ + ci->pack_done++; + cj->pack_done++; + t->done = 1; + int launch = 0, launch_leftovers = 0; + if ((sched->queues[qid].n_packs_pair_left_g == 0)) + launch_leftovers = 1; + /* Tasks done. Release the lock ! 
*/ + task_unlock(t); + /*schedule my dependencies (Only unpacks really)*/ + enqueue_dependencies(sched, t); + /*Signal sleeping runners*/ + signal_sleeping_runners(sched, t); + clock_gettime(CLOCK_REALTIME, &t1); + packing_time_pair_g += + (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / + 1000000000.0; + if (launch_leftovers) { + pack_vars_pair_grad->launch_leftovers = 1; + runner_dopair1_launch_f4_g_one_memcpy(r, sched, pack_vars_pair_grad, t, parts_aos_pair_f4_g_send, parts_aos_pair_f4_g_recv, + d_parts_aos_pair_f4_g_send, d_parts_aos_pair_f4_g_recv, stream_pairs, d_a, d_H, e, &packing_time_pair_g, &time_for_gpu_pair_g, + &unpacking_time_pair_g, fparti_fpartj_lparti_lpartj_grad, pair_end_g); + } + } + else{ +#endif //DO_CORNERS // runner_dopair1_pack_g(r, sched, pack_vars_pair_grad, ci, // cj, t, parts_aos_pair_grad, e, &packing_time_g); packing_time_pair_g += runner_dopair1_pack_f4_g(r, sched, pack_vars_pair_grad, ci, @@ -1262,58 +1262,58 @@ void *runner_main2(void *data) { d_parts_aos_pair_f4_g_send, d_parts_aos_pair_f4_g_recv, stream_pairs, d_a, d_H, e, &packing_time_pair_g, &time_for_gpu_pair_g, &unpacking_time_pair_g, fparti_fpartj_lparti_lpartj_grad, pair_end_g); } -//#ifdef DO_CORNERS -// }/* End of GPU work Pairs */ -//#endif //DO_CORNERS +#ifdef DO_CORNERS + }/* End of GPU work Pairs */ +#endif //DO_CORNERS #endif //GPUGRADPAIR } else if (t->subtype == task_subtype_gpu_pack_f){ packed_pair_f++; #ifdef GPUOFFLOAD -//#ifdef DO_CORNERS -// struct timespec t0, t1, dt; -// clock_gettime(CLOCK_REALTIME, &t0); -// double shift[3] = {0.0}; -// t->corner_pair = 0; -// int sid = space_getsid_filter(e->s, &ci, &cj, shift); -// clock_gettime(CLOCK_REALTIME, &t1); -// packing_time_pair += -// (t1.tv_sec - t0.tv_sec) + -// (t1.tv_nsec - t0.tv_nsec) / -// 1000000000.0; -// if((sid == 0 || sid == 2 || sid == 6 || sid == 8) && step > 1){ -//// if((sid != 4 && sid != 10 && sid == 12) && step > 1){ -// runner_dopair1_branch_force(r, ci, cj); -// t->corner_pair = 1; -// int qid = r->qid; -// atomic_dec(&(sched->queues[qid].n_packs_pair_left_f)); -// /* Tell the cells they have been packed */ -// ci->pack_done++; -// cj->pack_done++; -// t->done = 1; -// int launch = 0, launch_leftovers = 0; -// if ((sched->queues[qid].n_packs_pair_left_f == 0)) -// launch_leftovers = 1; -// /* Tasks done. Release the lock ! 
*/ -// task_unlock(t); -// /*schedule my dependencies (Only unpacks really)*/ -// enqueue_dependencies(sched, t); -// /*Signal sleeping runners*/ -// signal_sleeping_runners(sched, t); -// clock_gettime(CLOCK_REALTIME, &t1); -// packing_time_pair_f += -// (t1.tv_sec - t0.tv_sec) + -// (t1.tv_nsec - t0.tv_nsec) / -// 1000000000.0; -// if (launch_leftovers) { -// pack_vars_pair_forc->launch_leftovers = 1; -// runner_dopair1_launch_f4_f_one_memcpy(r, sched, pack_vars_pair_forc, t, parts_aos_pair_f4_f_send, parts_aos_pair_f4_f_recv, -// d_parts_aos_pair_f4_f_send, d_parts_aos_pair_f4_f_recv, stream_pairs, d_a, d_H, e, &packing_time_pair_f, &time_for_gpu_pair_f, -// &unpacking_time_pair_f, fparti_fpartj_lparti_lpartj_forc, pair_end_f); -// } -// } -// else{ -//#endif //DO_CORNERS +#ifdef DO_CORNERS + struct timespec t0, t1, dt; + clock_gettime(CLOCK_REALTIME, &t0); + double shift[3] = {0.0}; + t->corner_pair = 0; + int sid = space_getsid_filter(e->s, &ci, &cj, shift); + clock_gettime(CLOCK_REALTIME, &t1); + packing_time_pair += + (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / + 1000000000.0; + if((sid == 0 || sid == 2 || sid == 6 || sid == 8) && step > 1){ +// if((sid != 4 && sid != 10 && sid == 12) && step > 1){ + runner_dopair1_branch_force(r, ci, cj); + t->corner_pair = 1; + int qid = r->qid; + atomic_dec(&(sched->queues[qid].n_packs_pair_left_f)); + /* Tell the cells they have been packed */ + ci->pack_done++; + cj->pack_done++; + t->done = 1; + int launch = 0, launch_leftovers = 0; + if ((sched->queues[qid].n_packs_pair_left_f == 0)) + launch_leftovers = 1; + /* Tasks done. Release the lock ! */ + task_unlock(t); + /*schedule my dependencies (Only unpacks really)*/ + enqueue_dependencies(sched, t); + /*Signal sleeping runners*/ + signal_sleeping_runners(sched, t); + clock_gettime(CLOCK_REALTIME, &t1); + packing_time_pair_f += + (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / + 1000000000.0; + if (launch_leftovers) { + pack_vars_pair_forc->launch_leftovers = 1; + runner_dopair1_launch_f4_f_one_memcpy(r, sched, pack_vars_pair_forc, t, parts_aos_pair_f4_f_send, parts_aos_pair_f4_f_recv, + d_parts_aos_pair_f4_f_send, d_parts_aos_pair_f4_f_recv, stream_pairs, d_a, d_H, e, &packing_time_pair_f, &time_for_gpu_pair_f, + &unpacking_time_pair_f, fparti_fpartj_lparti_lpartj_forc, pair_end_f); + } + } + else{ +#endif //DO_CORNERS // runner_dopair1_pack_f(r, sched, pack_vars_pair_forc, ci, // cj, t, parts_aos_pair_forc, e, &packing_time_f); packing_time_pair_f += runner_dopair1_pack_f4_f(r, sched, pack_vars_pair_forc, ci, @@ -1331,9 +1331,9 @@ void *runner_main2(void *data) { d_parts_aos_pair_f4_f_send, d_parts_aos_pair_f4_f_recv, stream_pairs, d_a, d_H, e, &packing_time_pair_f, &time_for_gpu_pair_f, &unpacking_time_pair_f, fparti_fpartj_lparti_lpartj_forc, pair_end_f); } /* End of GPU work Pairs */ -//#ifdef DO_CORNERS -// } -//#endif //DO_CORNERS +#ifdef DO_CORNERS + } +#endif //DO_CORNERS #endif //GPUFORCPAIR } else if (t->subtype == task_subtype_gpu_unpack) { From e8aa0aec9982fda58bd61da8d8907f78cf77ad10 Mon Sep 17 00:00:00 2001 From: Matthieu Schaller Date: Tue, 5 Nov 2024 10:16:00 +0100 Subject: [PATCH 038/217] Fix bracketting in MPI hydro recv construction --- src/engine_maketasks.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/engine_maketasks.c b/src/engine_maketasks.c index 0033f30150..bf0ed134ff 100644 --- a/src/engine_maketasks.c +++ b/src/engine_maketasks.c @@ -585,10 +585,11 @@ void engine_addtasks_recv_hydro( /* Have we reached a level where 
there are any hydro tasks ? */ #ifdef WITH_CUDA // A. Nasar - if (t_xv == NULL && c->hydro.density != NULL && c->hydro.density_pack != NULL) { + if (t_xv == NULL && c->hydro.density != NULL && c->hydro.density_pack != NULL) #else - if (t_xv == NULL && c->hydro.density != NULL) { + if (t_xv == NULL && c->hydro.density != NULL) #endif /*WITH_CUDA*/ + { #ifdef SWIFT_DEBUG_CHECKS /* Make sure this cell has a valid tag. */ From 6aa324d16ac82c33b122a364f7e39dfcb9941cc4 Mon Sep 17 00:00:00 2001 From: Matthieu Schaller Date: Tue, 5 Nov 2024 10:21:13 +0100 Subject: [PATCH 039/217] Fixed missing closing curly --- src/runner_main_clean.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 76cb61a582..70577aaf05 100755 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -1191,7 +1191,7 @@ void *runner_main2(void *data) { runner_dopair1_launch_f4_one_memcpy(r, sched, pack_vars_pair_dens, t, parts_aos_pair_f4_send, parts_aos_pair_f4_recv, d_parts_aos_pair_f4_send, d_parts_aos_pair_f4_recv, stream_pairs, d_a, d_H, e, &packing_time_pair, &time_for_density_gpu_pair, &unpacking_time_pair, fparti_fpartj_lparti_lpartj_dens, pair_end); -// } + } #ifdef DO_CORNERS } /* End of GPU work Pairs */ #endif //DO_CORNERS From 5e4eafea51610da3c9244029501b9ffc51943a8f Mon Sep 17 00:00:00 2001 From: Matthieu Schaller Date: Tue, 5 Nov 2024 10:22:55 +0100 Subject: [PATCH 040/217] Applied code formatting script blindly --- src/cell_grid.c | 2 +- src/cell_hydro.h | 3 +- src/cell_unskip.c | 51 +- src/cuda/BLOCK_SIZE.h | 2 +- src/cuda/GPU_runner_functions.cu | 3663 +++---- src/cuda/GPU_runner_functions.h | 222 +- src/cuda/device_functions.h | 22 +- src/cuda/kernel_definitions.cu | 9 +- src/cuda/part_gpu.h | 22 +- src/cuda/tester.cu | 1 + src/engine.c | 79 +- src/engine_config.c | 17 +- src/engine_maketasks.c | 297 +- src/engine_marktasks.c | 81 +- src/engine_unskip.c | 4 +- src/files_for_new_functions/arrays_malloc.cu | 542 +- src/files_for_new_functions/arrays_malloc.h | 70 +- .../host_device_data_transfer.cu | 855 +- .../host_device_data_transfer.h | 342 +- src/fof.c | 4 +- src/hip/BLOCK_SIZE.h | 2 +- src/hip/HIP_runner_functions.h | 2 +- src/hip/device_functions.h | 18 +- src/lightcone/lightcone_crossing.h | 2 +- src/lightcone/lightcone_replications.c | 4 +- src/lightcone/lightcone_shell.c | 2 +- src/power_spectrum.c | 4 +- src/queue.h | 6 +- src/runner_black_holes.c | 12 +- src/runner_doiact_functions_black_holes.h | 16 +- src/runner_doiact_functions_hydro.h | 32 +- src/runner_doiact_functions_hydro_gpu.h | 8406 +++++++++-------- src/runner_doiact_functions_limiter.h | 12 +- src/runner_doiact_functions_stars.h | 20 +- src/runner_doiact_grav.c | 2 +- src/runner_doiact_nosort.h | 2 +- src/runner_gpu_pack_functions.c | 1823 ++-- src/runner_gpu_pack_functions.h | 278 +- src/runner_main_clean.cu | 2636 +++--- src/runner_others.c | 2 +- src/runner_sinks.c | 18 +- src/scheduler.c | 236 +- src/scheduler.h | 2 +- src/space_getsid.h | 23 +- src/space_recycle.c | 2 +- src/space_regrid.c | 2 +- src/space_split.c | 2 +- src/task.c | 134 +- src/task.h | 6 +- 49 files changed, 10565 insertions(+), 9429 deletions(-) mode change 100755 => 100644 src/cuda/BLOCK_SIZE.h mode change 100755 => 100644 src/cuda/GPU_runner_functions.cu mode change 100755 => 100644 src/cuda/GPU_runner_functions.h mode change 100755 => 100644 src/cuda/device_functions.h mode change 100755 => 100644 src/cuda/kernel_definitions.cu mode change 100755 => 100644 src/cuda/part_gpu.h mode 
change 100755 => 100644 src/cuda/tester.cu mode change 100755 => 100644 src/files_for_new_functions/arrays_malloc.cu mode change 100755 => 100644 src/files_for_new_functions/arrays_malloc.h mode change 100755 => 100644 src/files_for_new_functions/host_device_data_transfer.cu mode change 100755 => 100644 src/files_for_new_functions/host_device_data_transfer.h mode change 100755 => 100644 src/hip/BLOCK_SIZE.h mode change 100755 => 100644 src/hip/HIP_runner_functions.h mode change 100755 => 100644 src/hip/device_functions.h mode change 100755 => 100644 src/runner_gpu_pack_functions.c mode change 100755 => 100644 src/runner_gpu_pack_functions.h mode change 100755 => 100644 src/runner_main_clean.cu diff --git a/src/cell_grid.c b/src/cell_grid.c index 313d3843a3..3b3d9fa130 100644 --- a/src/cell_grid.c +++ b/src/cell_grid.c @@ -353,7 +353,7 @@ void cell_set_grid_completeness_mapper(void *map_data, int num_elements, } } } /* Now loop over all the neighbours of this cell */ - } /* Loop through the elements, which are just byte offsets from NULL. */ + } /* Loop through the elements, which are just byte offsets from NULL. */ } /** diff --git a/src/cell_hydro.h b/src/cell_hydro.h index 4b9446f731..14b37dcd6d 100644 --- a/src/cell_hydro.h +++ b/src/cell_hydro.h @@ -61,7 +61,8 @@ struct cell_hydro { /*! Linked list of the tasks computing this cell's hydro density. */ struct link *density; - /*! Linked list of the tasks computing this cell's hydro density pack. A. Nasar */ + /*! Linked list of the tasks computing this cell's hydro density pack. A. + * Nasar */ struct link *density_pack; struct link *density_unpack; /*! Linked list of the tasks computing this cell's hydro force pack. */ diff --git a/src/cell_unskip.c b/src/cell_unskip.c index 071f3c212e..965128a537 100644 --- a/src/cell_unskip.c +++ b/src/cell_unskip.c @@ -1905,24 +1905,25 @@ int cell_unskip_hydro_tasks(struct cell *c, struct scheduler *s) { /* Unskip all the other task types. */ int c_active = cell_is_active_hydro(c, e); if (c->nodeID == nodeID && c_active) { - for (struct link *l = c->hydro.density_pack; l != NULL; l = l->next) { /* A. Nasar */ - scheduler_activate(s, l->t); -// message("activating pair pack\n"); - if (l->t->ci != NULL){ - l->t->ci->pack_done = 0; - l->t->ci->gpu_done = 0; - l->t->ci->unpack_done = 0; - } - if (l->t->cj != NULL){ - l->t->cj->pack_done = 0; - l->t->cj->gpu_done = 0; - l->t->cj->unpack_done = 0; - } + for (struct link *l = c->hydro.density_pack; l != NULL; + l = l->next) { /* A. Nasar */ + scheduler_activate(s, l->t); + // message("activating pair pack\n"); + if (l->t->ci != NULL) { + l->t->ci->pack_done = 0; + l->t->ci->gpu_done = 0; + l->t->ci->unpack_done = 0; + } + if (l->t->cj != NULL) { + l->t->cj->pack_done = 0; + l->t->cj->gpu_done = 0; + l->t->cj->unpack_done = 0; + } } for (struct link *l = c->hydro.density_unpack; l != NULL; l = l->next) { - scheduler_activate(s, l->t); -// message("activating pair UN-pack\n"); - l->t->gpu_done = 0; + scheduler_activate(s, l->t); + // message("activating pair UN-pack\n"); + l->t->gpu_done = 0; } for (struct link *l = c->hydro.gradient; l != NULL; l = l->next) { scheduler_activate(s, l->t); @@ -1935,13 +1936,13 @@ int cell_unskip_hydro_tasks(struct cell *c, struct scheduler *s) { // A. 
Nasar activate force and gradient packing tasks for (struct link *l = c->hydro.force_pack; l != NULL; l = l->next) { scheduler_activate(s, l->t); -// message("activating pair pack force\n"); - if (l->t->ci != NULL){ + // message("activating pair pack force\n"); + if (l->t->ci != NULL) { l->t->ci->pack_done_f = 0; l->t->ci->gpu_done_f = 0; l->t->ci->unpack_done_f = 0; } - if (l->t->cj != NULL){ + if (l->t->cj != NULL) { l->t->cj->pack_done_f = 0; l->t->cj->gpu_done_f = 0; l->t->cj->unpack_done_f = 0; @@ -1949,20 +1950,20 @@ int cell_unskip_hydro_tasks(struct cell *c, struct scheduler *s) { } for (struct link *l = c->hydro.force_unpack; l != NULL; l = l->next) { scheduler_activate(s, l->t); -// message("activating pair UN-pack force\n"); + // message("activating pair UN-pack force\n"); l->t->gpu_done = 0; } #ifdef EXTRA_HYDRO_LOOP for (struct link *l = c->hydro.gradient_pack; l != NULL; l = l->next) { scheduler_activate(s, l->t); -// message("activating pair pack gradient\n"); - if (l->t->ci != NULL){ + // message("activating pair pack gradient\n"); + if (l->t->ci != NULL) { l->t->ci->pack_done_g = 0; l->t->ci->gpu_done_g = 0; l->t->ci->unpack_done_g = 0; } - if (l->t->cj != NULL){ + if (l->t->cj != NULL) { l->t->cj->pack_done_g = 0; l->t->cj->gpu_done_g = 0; l->t->cj->unpack_done_g = 0; @@ -1970,7 +1971,7 @@ int cell_unskip_hydro_tasks(struct cell *c, struct scheduler *s) { } for (struct link *l = c->hydro.gradient_unpack; l != NULL; l = l->next) { scheduler_activate(s, l->t); -// message("activating pair UN-pack gradient\n"); + // message("activating pair UN-pack gradient\n"); l->t->gpu_done = 0; } #endif @@ -1999,7 +2000,7 @@ int cell_unskip_hydro_tasks(struct cell *c, struct scheduler *s) { * so, we have to do this now, from the active remote cell). */ else if (c->nodeID != nodeID && c_active) { #if defined(MPI_SYMMETRIC_FORCE_INTERACTION) && defined(WITH_MPI) - // A. Nasar POSSIBLE BUG HERE MISSING ACTIVATION OF PACK TASKS + // A. 
Nasar POSSIBLE BUG HERE MISSING ACTIVATION OF PACK TASKS for (struct link *l = c->hydro.force; l != NULL; l = l->next) { struct task *t = l->t; if (t->type != task_type_pair && t->type != task_type_sub_pair) continue; diff --git a/src/cuda/BLOCK_SIZE.h b/src/cuda/BLOCK_SIZE.h old mode 100755 new mode 100644 index f3897234a3..ac07782b72 --- a/src/cuda/BLOCK_SIZE.h +++ b/src/cuda/BLOCK_SIZE.h @@ -9,4 +9,4 @@ #define N_TASKS_PER_PACK_PAIR 32 #define N_TASKS_BUNDLE_PAIR 4 -#endif // BLOCK_SIZE_H +#endif // BLOCK_SIZE_H diff --git a/src/cuda/GPU_runner_functions.cu b/src/cuda/GPU_runner_functions.cu old mode 100755 new mode 100644 index a0ca64ca6a..082becaaa9 --- a/src/cuda/GPU_runner_functions.cu +++ b/src/cuda/GPU_runner_functions.cu @@ -31,6 +31,7 @@ extern "C" { #include "GPU_runner_functions.h" #include "device_functions.h" #include "part_gpu.h" + #include #ifdef WITH_CUDA @@ -57,10 +58,11 @@ void Initialise_GPU() { #ifdef WITH_CUDA extern "C" { #endif -__global__ void tester( - struct part_soa parts_soa, int *d_task_first_part, int *d_task_last_part, float d_a, float d_H, - int bid, int tid, int count_tasks, int tasksperbundle, int nBlocks_per_task, - int bundle_first_task, int max_parts, int time_bin_inhibited) { +__global__ void tester(struct part_soa parts_soa, int *d_task_first_part, + int *d_task_last_part, float d_a, float d_H, int bid, + int tid, int count_tasks, int tasksperbundle, + int nBlocks_per_task, int bundle_first_task, + int max_parts, int time_bin_inhibited) { extern __shared__ float vars[]; __shared__ int first_part_tid_0, last_part_tid_0; const int threadid = blockDim.x * blockIdx.x + threadIdx.x; @@ -76,9 +78,9 @@ __global__ void tester( if (pid < last_part_in_task_blocks) { parts_soa.tid_p[pid] = 1; } -// if(parts_soa.tid_p[pid] == 1 && pid < last_part_in_task_blocks) -// printf("tid %i last_part_in_blocks %i\n", parts_soa.tid_p[pid], last_part_in_task_blocks); - + // if(parts_soa.tid_p[pid] == 1 && pid < last_part_in_task_blocks) + // printf("tid %i last_part_in_blocks %i\n", parts_soa.tid_p[pid], + //last_part_in_task_blocks); } #ifdef WITH_CUDA } @@ -88,20 +90,20 @@ __global__ void tester( extern "C" { #endif __global__ void runner_do_self_density_GPU( - struct part_soa parts_soa, int *d_task_first_part, int *d_task_last_part, float d_a, float d_H, - int count_tasks, int tasksperbundle, int nBlocks_per_task, - int bundle_first_task, int max_parts) { + struct part_soa parts_soa, int *d_task_first_part, int *d_task_last_part, + float d_a, float d_H, int count_tasks, int tasksperbundle, + int nBlocks_per_task, int bundle_first_task, int max_parts) { extern __shared__ float vars[]; __shared__ int first_part_tid_0, last_part_tid_0; const int threadid = blockDim.x * blockIdx.x + threadIdx.x; const int task_id = bundle_first_task + blockIdx.y; // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); -// __shared__ int first_part_in_task_blocks, last_part_in_task_blocks; + // __shared__ int first_part_in_task_blocks, last_part_in_task_blocks; int first_part_in_task_blocks, last_part_in_task_blocks; first_part_in_task_blocks = d_task_first_part[task_id], last_part_in_task_blocks = d_task_last_part[task_id]; -// __syncthreads(); + // __syncthreads(); const int pid = threadid + first_part_in_task_blocks; int ttid = 0; @@ -143,11 +145,11 @@ __global__ void runner_do_self_density_GPU( piy = parts_soa.y_p[pid] - celly; piz = parts_soa.z_p[pid] - cellz; } -// if (threadIdx.x == 0) { -// first_part_tid_0 = first_part; -// last_part_tid_0 = last_part; -// } -// 
__syncthreads(); + // if (threadIdx.x == 0) { + // first_part_tid_0 = first_part; + // last_part_tid_0 = last_part; + // } + // __syncthreads(); int n_neighbours = 0; /*Here we use different pointers "x_p_tmp", etc. to point to different regions * of the single shared memory space "vars" which we allocate in kernel @@ -177,10 +179,10 @@ __global__ void runner_do_self_density_GPU( __syncthreads(); for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) { j = j_block + b; -// if ((j != pid) && (j < last_part_in_task_blocks) && -// timebin[j_block] != time_bin_inhibited) { -// if ((j < last_part_in_task_blocks) && -// timebin[j_block] != time_bin_inhibited) { + // if ((j != pid) && (j < last_part_in_task_blocks) && + // timebin[j_block] != time_bin_inhibited) { + // if ((j < last_part_in_task_blocks) && + // timebin[j_block] != time_bin_inhibited) { if (j < last_part_in_task_blocks) { /* Compute the pairwise distance. */ const float pjx = x_p_tmp[j_block] - cellx; @@ -189,18 +191,18 @@ __global__ void runner_do_self_density_GPU( const float xij = pix - pjx, yij = piy - pjy, zij = piz - pjz; const float r2 = xij * xij + yij * yij + zij * zij; // if((hi < 0.0001f || hj < 0.0001f || r2 < - //0.0000001f) && pid < last_part_in_task_blocks){ printf("very small - //value for hi %f or hj %f or r2 %f\n", hi, hj, r2); + // 0.0000001f) && pid < last_part_in_task_blocks){ + // printf("very small value for hi %f or hj %f or r2 %f\n", hi, hj, r2); // } - if (r2 < hig2 && r2 > (0.01f/128.f)*(0.01f/128.f)) { -// if (r2 < hig2 && r2 > (0.01f/256.f)*(0.01f/256.f)) { - Found_neighbours=1; + if (r2 < hig2 && r2 > (0.01f / 128.f) * (0.01f / 128.f)) { + // if (r2 < hig2 && r2 > (0.01f/256.f)*(0.01f/256.f)) { + Found_neighbours = 1; const float r = sqrt(r2); /* Recover some data */ const float mj = mass_tmp[j_block]; /* Get the kernel for hi. 
*/ - if(hi<1.f/256.f)printf("h < dx\n"); -// if(hi<1.f/256.f)printf("h < dx\n"); + if (hi < 1.f / 256.f) printf("h < dx\n"); + // if(hi<1.f/256.f)printf("h < dx\n"); const float h_inv = 1.f / hi; const float ui = r * h_inv; float wi, wi_dx; @@ -237,10 +239,11 @@ __global__ void runner_do_self_density_GPU( __syncthreads(); } if (pid < last_part_in_task_blocks) { -// float wi, wi_dx; -// d_kernel_deval(0.f, &wi, &wi_dx); -// printf("mass i %e, self rho %e sum rho %e\n", mi, mi*wi, rhoi); -// if(Found_neighbours == 0) printf("Not sure what's going on but no neighbours found in GPU loop\n"); + // float wi, wi_dx; + // d_kernel_deval(0.f, &wi, &wi_dx); + // printf("mass i %e, self rho %e sum rho %e\n", mi, mi*wi, rhoi); + // if(Found_neighbours == 0) printf("Not sure what's going on but no + // neighbours found in GPU loop\n"); parts_soa.rho[pid] = rhoi, parts_soa.rho_dh[pid] = rho_dhi; parts_soa.wcount[pid] = wcounti, parts_soa.wcount_dh[pid] = wcount_dhi; parts_soa.div_v[pid] = div_vi; @@ -255,22 +258,24 @@ __global__ void runner_do_self_density_GPU( #ifdef WITH_CUDA extern "C" { #endif -__global__ void DOSELF_GPU_AOS( - struct part_aos *parts_aos, int *d_task_first_part, int *d_task_last_part, float d_a, float d_H, - int count_tasks, int tasksperbundle, int nBlocks_per_task, - int bundle_first_task, int max_parts, double * d_cell_x, - double * d_cell_y, double * d_cell_z) { +__global__ void DOSELF_GPU_AOS(struct part_aos *parts_aos, + int *d_task_first_part, int *d_task_last_part, + float d_a, float d_H, int count_tasks, + int tasksperbundle, int nBlocks_per_task, + int bundle_first_task, int max_parts, + double *d_cell_x, double *d_cell_y, + double *d_cell_z) { extern __shared__ float vars[]; __shared__ int first_part_tid_0, last_part_tid_0; const int threadid = blockDim.x * blockIdx.x + threadIdx.x; const int task_id = bundle_first_task + blockIdx.y; // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); -// __shared__ int first_part_in_task_blocks, last_part_in_task_blocks; + // __shared__ int first_part_in_task_blocks, last_part_in_task_blocks; int first_part_in_task_blocks, last_part_in_task_blocks; first_part_in_task_blocks = d_task_first_part[task_id], last_part_in_task_blocks = d_task_last_part[task_id]; -// __syncthreads(); + // __syncthreads(); const int pid = threadid + first_part_in_task_blocks; int ttid = 0; @@ -302,8 +307,7 @@ __global__ void DOSELF_GPU_AOS( first_part = d_task_first_part[ttid]; last_part = d_task_last_part[ttid]; count = last_part - first_part; - cellx = d_cell_x[ttid], celly = d_cell_y[ttid], - cellz = d_cell_z[ttid]; + cellx = d_cell_x[ttid], celly = d_cell_y[ttid], cellz = d_cell_z[ttid]; hi = ipart.h, hig2 = hi * hi * kernel_gamma2; mi = ipart.mass; uxi = ipart.ux; @@ -313,11 +317,11 @@ __global__ void DOSELF_GPU_AOS( piy = ipart.y_p - celly; piz = ipart.z_p - cellz; } -// if (threadIdx.x == 0) { -// first_part_tid_0 = first_part; -// last_part_tid_0 = last_part; -// } -// __syncthreads(); + // if (threadIdx.x == 0) { + // first_part_tid_0 = first_part; + // last_part_tid_0 = last_part; + // } + // __syncthreads(); int n_neighbours = 0; /*Here we use different pointers "x_p_tmp", etc. 
to point to different regions * of the single shared memory space "vars" which we allocate in kernel @@ -332,7 +336,7 @@ __global__ void DOSELF_GPU_AOS( float *uz_tmp = (float *)&vars[BLOCK_SIZE * 7]; int *timebin = (int *)&vars[BLOCK_SIZE * 8]; /*Particles copied in blocks to shared memory*/ -// struct parts_aos jparts[count]; + // struct parts_aos jparts[count]; for (int b = first_part_in_task_blocks; b < last_part_in_task_blocks; b += BLOCK_SIZE) { int j = b + threadIdx.x; @@ -356,8 +360,8 @@ __global__ void DOSELF_GPU_AOS( const float pjz = z_p_tmp[j_block] - cellz; const float xij = pix - pjx, yij = piy - pjy, zij = piz - pjz; const float r2 = xij * xij + yij * yij + zij * zij; - if (r2 < hig2 && r2 > (0.01f/128.f)*(0.01f/128.f)) { - Found_neighbours=1; + if (r2 < hig2 && r2 > (0.01f / 128.f) * (0.01f / 128.f)) { + Found_neighbours = 1; const float r = sqrt(r2); /* Recover some data */ const float mj = mass_tmp[j_block]; @@ -398,10 +402,11 @@ __global__ void DOSELF_GPU_AOS( __syncthreads(); } if (pid < last_part_in_task_blocks) { -// float wi, wi_dx; -// d_kernel_deval(0.f, &wi, &wi_dx); -// printf("mass i %e, self rho %e sum rho %e\n", mi, mi*wi, rhoi); -// if(Found_neighbours == 0) printf("Not sure what's going on but no neighbours found in GPU loop\n"); + // float wi, wi_dx; + // d_kernel_deval(0.f, &wi, &wi_dx); + // printf("mass i %e, self rho %e sum rho %e\n", mi, mi*wi, rhoi); + // if(Found_neighbours == 0) printf("Not sure what's going on but no + // neighbours found in GPU loop\n"); parts_aos[pid].rho = rhoi, parts_aos[pid].rho_dh = rho_dhi; parts_aos[pid].wcount = wcounti, parts_aos[pid].wcount_dh = wcount_dhi; parts_aos[pid].div_v = div_vi; @@ -413,24 +418,25 @@ __global__ void DOSELF_GPU_AOS( } #endif -//template +// template #ifdef WITH_CUDA extern "C" { #endif -//#include +// #include __global__ void DOSELF_GPU_AOS_F4( - struct part_aos_f4_send * __restrict__ parts_send, struct part_aos_f4_recv * __restrict__ parts_recv, - const float d_a, const float d_H, - const int bundle_first_task, const int2 * __restrict__ d_task_first_part_f4) { + struct part_aos_f4_send *__restrict__ parts_send, + struct part_aos_f4_recv *__restrict__ parts_recv, const float d_a, + const float d_H, const int bundle_first_task, + const int2 *__restrict__ d_task_first_part_f4) { extern __shared__ float4 vars_f4[]; -// auto group = cooperative_groups::this_thread_block(); + // auto group = cooperative_groups::this_thread_block(); __shared__ int first_part_tid_0, last_part_tid_0; const int threadid = blockDim.x * blockIdx.x + threadIdx.x; const int task_id = bundle_first_task + blockIdx.y; -// cuda::barrier bar; + // cuda::barrier bar; int first_part_in_task_blocks, last_part_in_task_blocks; int2 first_last_parts = d_task_first_part_f4[task_id]; @@ -449,8 +455,8 @@ __global__ void DOSELF_GPU_AOS_F4( /*Here we use different pointers "x_p_tmp", etc. to point to different regions * of the single shared memory space "vars" which we allocate in kernel * invocation*/ - float4 * __restrict__ x_p_h_tmp = (float4 *)&vars_f4[0]; - float4 * __restrict__ ux_m_tmp = (float4 *)&vars_f4[BLOCK_SIZE]; + float4 *__restrict__ x_p_h_tmp = (float4 *)&vars_f4[0]; + float4 *__restrict__ ux_m_tmp = (float4 *)&vars_f4[BLOCK_SIZE]; /*Particles copied in blocks to shared memory*/ for (int b = first_part_in_task_blocks; b < last_part_in_task_blocks; b += BLOCK_SIZE) { @@ -465,11 +471,10 @@ __global__ void DOSELF_GPU_AOS_F4( /* Compute the pairwise distance. 
*/ const float4 x_p_h_j = x_p_h_tmp[j_block]; const float4 ux_m_j = ux_m_tmp[j_block]; - const float xij = x_pi.x - x_p_h_j.x, - yij = x_pi.y - x_p_h_j.y, - zij = x_pi.z - x_p_h_j.z; + const float xij = x_pi.x - x_p_h_j.x, yij = x_pi.y - x_p_h_j.y, + zij = x_pi.z - x_p_h_j.z; const float r2 = xij * xij + yij * yij + zij * zij; - if (r2 < hig2 && r2 > (0.01f/128.f)*(0.01f/128.f)) { + if (r2 < hig2 && r2 > (0.01f / 128.f) * (0.01f / 128.f)) { const float r = sqrtf(r2); /* Recover some data */ const float mj = ux_m_j.w; @@ -490,7 +495,7 @@ __global__ void DOSELF_GPU_AOS_F4( /* Compute dv dot r */ const float dvx = ux_pi.x - ux_m_j.x, dvy = ux_pi.y - ux_m_j.y, - dvz = ux_pi.z - ux_m_j.z; + dvz = ux_pi.z - ux_m_j.z; const float dvdr = dvx * xij + dvy * yij + dvz * zij; /* Compute dv cross r */ @@ -520,26 +525,25 @@ __global__ void DOSELF_GPU_AOS_F4( extern "C" { #endif void launch_density_aos(struct part_aos *parts_aos, int *d_task_first_part, - int *d_task_last_part, float d_a, float d_H, - const char *loop_type, cudaStream_t stream, - int block_size, int count_tasks, int tasksperbundle, - int numBlocks_x, int numBlocks_y, - int bundle_first_task, int max_parts, - double * d_cell_x, - double * d_cell_y, double * d_cell_z) { + int *d_task_last_part, float d_a, float d_H, + const char *loop_type, cudaStream_t stream, + int block_size, int count_tasks, int tasksperbundle, + int numBlocks_x, int numBlocks_y, int bundle_first_task, + int max_parts, double *d_cell_x, double *d_cell_y, + double *d_cell_z) { dim3 gridShape = dim3(numBlocks_x, numBlocks_y); int nBlocks_per_task = numBlocks_x; DOSELF_GPU_AOS<<>>( - parts_aos, d_task_first_part, d_task_last_part, d_a, d_H, count_tasks, tasksperbundle, - nBlocks_per_task, bundle_first_task, max_parts, d_cell_x, - d_cell_y, d_cell_z); -// runner_do_self_density_GPU_naive<<>>( -// parts_soa, d_task_first_part, d_task_last_part, d_a, d_H, bid, tid, count_tasks, tasksperbundle, -// nBlocks_per_task, bundle_first_task, max_parts, time_bin_inhibited); + 8 * BLOCK_SIZE * sizeof(float) + BLOCK_SIZE * sizeof(int), + stream>>>(parts_aos, d_task_first_part, d_task_last_part, + d_a, d_H, count_tasks, tasksperbundle, + nBlocks_per_task, bundle_first_task, max_parts, + d_cell_x, d_cell_y, d_cell_z); + // runner_do_self_density_GPU_naive<<>>( + // parts_soa, d_task_first_part, d_task_last_part, d_a, d_H, bid, tid, + // count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task, + // max_parts, time_bin_inhibited); } #ifdef WITH_CUDA } @@ -548,21 +552,24 @@ void launch_density_aos(struct part_aos *parts_aos, int *d_task_first_part, #ifdef WITH_CUDA extern "C" { #endif -struct first_part{ - int list[32]; +struct first_part { + int list[32]; }; -void launch_density_aos_f4(struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv, - float d_a, float d_H, cudaStream_t stream, int numBlocks_x, int numBlocks_y, - int bundle_first_task, int2 *d_task_first_part_f4) { +void launch_density_aos_f4(struct part_aos_f4_send *parts_send, + struct part_aos_f4_recv *parts_recv, float d_a, + float d_H, cudaStream_t stream, int numBlocks_x, + int numBlocks_y, int bundle_first_task, + int2 *d_task_first_part_f4) { dim3 gridShape = dim3(numBlocks_x, numBlocks_y); int nBlocks_per_task = numBlocks_x; - DOSELF_GPU_AOS_F4<<>>( - parts_send, parts_recv, d_a, d_H, bundle_first_task, d_task_first_part_f4); -// runner_do_self_density_GPU_naive<<>>( -// parts_soa, d_task_first_part, d_task_last_part, d_a, d_H, bid, tid, count_tasks, tasksperbundle, -// nBlocks_per_task, 
bundle_first_task, max_parts, time_bin_inhibited); + DOSELF_GPU_AOS_F4<<>>(parts_send, parts_recv, d_a, d_H, + bundle_first_task, d_task_first_part_f4); + // runner_do_self_density_GPU_naive<<>>( + // parts_soa, d_task_first_part, d_task_last_part, d_a, d_H, bid, tid, + // count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task, + // max_parts, time_bin_inhibited); } #ifdef WITH_CUDA } @@ -571,22 +578,24 @@ void launch_density_aos_f4(struct part_aos_f4_send *parts_send, struct part_aos_ #ifdef WITH_CUDA extern "C" { #endif -__global__ void DOSELF_GPU_AOS_G( - struct part_aos_g *parts_aos, int *d_task_first_part, int *d_task_last_part, float d_a, float d_H, - int count_tasks, int tasksperbundle, int nBlocks_per_task, - int bundle_first_task, int max_parts, double * d_cell_x, - double * d_cell_y, double * d_cell_z) { +__global__ void DOSELF_GPU_AOS_G(struct part_aos_g *parts_aos, + int *d_task_first_part, int *d_task_last_part, + float d_a, float d_H, int count_tasks, + int tasksperbundle, int nBlocks_per_task, + int bundle_first_task, int max_parts, + double *d_cell_x, double *d_cell_y, + double *d_cell_z) { extern __shared__ float varsg[]; __shared__ int first_part_tid_0, last_part_tid_0; const int threadid = blockDim.x * blockIdx.x + threadIdx.x; const int task_id = bundle_first_task + blockIdx.y; // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); -// __shared__ int first_part_in_task_blocks, last_part_in_task_blocks; + // __shared__ int first_part_in_task_blocks, last_part_in_task_blocks; int first_part_in_task_blocks, last_part_in_task_blocks; first_part_in_task_blocks = d_task_first_part[task_id], last_part_in_task_blocks = d_task_last_part[task_id]; -// __syncthreads(); + // __syncthreads(); const int pid = threadid + first_part_in_task_blocks; int ttid = 0; @@ -615,8 +624,7 @@ __global__ void DOSELF_GPU_AOS_G( first_part = d_task_first_part[ttid]; last_part = d_task_last_part[ttid]; count = last_part - first_part; - cellx = d_cell_x[ttid], celly = d_cell_y[ttid], - cellz = d_cell_z[ttid]; + cellx = d_cell_x[ttid], celly = d_cell_y[ttid], cellz = d_cell_z[ttid]; hi = parts_aos[pid].h, hig2 = hi * hi * kernel_gamma2; mi = parts_aos[pid].mass; uxi = parts_aos[pid].ux; @@ -631,11 +639,11 @@ __global__ void DOSELF_GPU_AOS_G( laplace_u = parts_aos[pid].laplace_u; alpha_visc_max_ngb = parts_aos[pid].alpha_visc_max_ngb; } -// if (threadIdx.x == 0) { -// first_part_tid_0 = first_part; -// last_part_tid_0 = last_part; -// } -// __syncthreads(); + // if (threadIdx.x == 0) { + // first_part_tid_0 = first_part; + // last_part_tid_0 = last_part; + // } + // __syncthreads(); int n_neighbours = 0; /*Here we use different pointers "x_p_tmp", etc. to point to different regions * of the single shared memory space "vars" which we allocate in kernel @@ -673,10 +681,10 @@ __global__ void DOSELF_GPU_AOS_G( __syncthreads(); for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) { j = j_block + b; -// if ((j != pid) && (j < last_part_in_task_blocks) && -// timebin[j_block] != time_bin_inhibited) { -// if ((j < last_part_in_task_blocks) && -// timebin[j_block] != time_bin_inhibited) { + // if ((j != pid) && (j < last_part_in_task_blocks) && + // timebin[j_block] != time_bin_inhibited) { + // if ((j < last_part_in_task_blocks) && + // timebin[j_block] != time_bin_inhibited) { if (j < last_part_in_task_blocks) { /* Compute the pairwise distance. 
*/ const float pjx = x_p_tmp[j_block] - cellx; @@ -684,8 +692,8 @@ __global__ void DOSELF_GPU_AOS_G( const float pjz = z_p_tmp[j_block] - cellz; const float xij = pix - pjx, yij = piy - pjy, zij = piz - pjz; const float r2 = xij * xij + yij * yij + zij * zij; - if (r2 < hig2 && r2 > (0.01f/128.f)*(0.01f/128.f)) { - Found_neighbours=1; + if (r2 < hig2 && r2 > (0.01f / 128.f) * (0.01f / 128.f)) { + Found_neighbours = 1; const float r = sqrt(r2); const float r_inv = 1.f / r; /* Recover some data */ @@ -704,10 +712,12 @@ __global__ void DOSELF_GPU_AOS_G( const float dvdr_Hubble = dvdr + a2_Hubble * r2; /* Are the particles moving towards each others ? */ const float omega_ij = min(dvdr_Hubble, 0.f); - const float mu_ij = fac_mu * r_inv * omega_ij; /* This is 0 or negative */ + const float mu_ij = + fac_mu * r_inv * omega_ij; /* This is 0 or negative */ /* Signal velocity */ - const float new_v_sig = ci + cj_tmp[j_block] - const_viscosity_beta * mu_ij; + const float new_v_sig = + ci + cj_tmp[j_block] - const_viscosity_beta * mu_ij; /* Update if we need to */ v_sig = max(v_sig, new_v_sig); /* Calculate Del^2 u for the thermal diffusion coefficient. */ @@ -722,7 +732,6 @@ __global__ void DOSELF_GPU_AOS_G( * (this is used to limit the diffusion in hydro_prepare_force) */ const float alpha_j = alpha_tmp[j_block]; alpha_visc_max_ngb = max(alpha_visc_max_ngb, alpha_j); - } } } @@ -741,20 +750,21 @@ __global__ void DOSELF_GPU_AOS_G( extern "C" { #endif __global__ void DOSELF_GPU_AOS_F4_G( - struct part_aos_f4_g_send * __restrict__ parts_send, struct part_aos_f4_g_recv * __restrict__ parts_recv, - const float d_a, const float d_H, - const int bundle_first_task, const int2 * __restrict__ d_task_first_part_f4) { + struct part_aos_f4_g_send *__restrict__ parts_send, + struct part_aos_f4_g_recv *__restrict__ parts_recv, const float d_a, + const float d_H, const int bundle_first_task, + const int2 *__restrict__ d_task_first_part_f4) { extern __shared__ float4 varsf4_g[]; const int threadid = blockDim.x * blockIdx.x + threadIdx.x; const int task_id = bundle_first_task + blockIdx.y; -// __shared__ int first_part_in_task_blocks, last_part_in_task_blocks; + // __shared__ int first_part_in_task_blocks, last_part_in_task_blocks; int2 first_last_parts = d_task_first_part_f4[task_id]; int first_part_in_task_blocks = first_last_parts.x; int last_part_in_task_blocks = first_last_parts.y; -// __syncthreads(); + // __syncthreads(); const int pid = threadid + first_part_in_task_blocks; /*Keep this*/ @@ -774,9 +784,9 @@ __global__ void DOSELF_GPU_AOS_F4_G( /*Here we use different pointers "x_p_tmp", etc. 
to point to different regions * of the single shared memory space "vars" which we allocate in kernel * invocation*/ - float4 * __restrict__ x_h_tmp = (float4 *)&varsf4_g[0]; - float4 * __restrict__ ux_m_tmp = (float4 *)&varsf4_g[BLOCK_SIZE]; - float4 * __restrict__ rho_avisc_u_c_tmp = (float4 *)&varsf4_g[BLOCK_SIZE * 2]; + float4 *__restrict__ x_h_tmp = (float4 *)&varsf4_g[0]; + float4 *__restrict__ ux_m_tmp = (float4 *)&varsf4_g[BLOCK_SIZE]; + float4 *__restrict__ rho_avisc_u_c_tmp = (float4 *)&varsf4_g[BLOCK_SIZE * 2]; /*Particles copied in blocks to shared memory*/ for (int b = first_part_in_task_blocks; b < last_part_in_task_blocks; @@ -793,14 +803,15 @@ __global__ void DOSELF_GPU_AOS_F4_G( for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) { j = j_block + b; if (j < last_part_in_task_blocks) { - float4 x_h_j = x_h_tmp[j_block]; - float4 ux_m_j = ux_m_tmp[j_block]; - float4 rho_avisc_u_c_j = rho_avisc_u_c_tmp[j_block]; + float4 x_h_j = x_h_tmp[j_block]; + float4 ux_m_j = ux_m_tmp[j_block]; + float4 rho_avisc_u_c_j = rho_avisc_u_c_tmp[j_block]; /* Compute the pairwise distance. */ - const float xij = x_h_i.x - x_h_j.x, yij = x_h_i.y - x_h_j.y, zij = x_h_i.z - x_h_j.z; + const float xij = x_h_i.x - x_h_j.x, yij = x_h_i.y - x_h_j.y, + zij = x_h_i.z - x_h_j.z; const float r2 = xij * xij + yij * yij + zij * zij; - if (r2 < hig2 && r2 > (0.01f/128.f)*(0.01f/128.f)) { + if (r2 < hig2 && r2 > (0.01f / 128.f) * (0.01f / 128.f)) { const float r = sqrt(r2); const float r_inv = 1.f / r; /* Recover some data */ @@ -819,10 +830,12 @@ __global__ void DOSELF_GPU_AOS_F4_G( const float dvdr_Hubble = dvdr + a2_Hubble * r2; /* Are the particles moving towards each others ? */ const float omega_ij = min(dvdr_Hubble, 0.f); - const float mu_ij = fac_mu * r_inv * omega_ij; /* This is 0 or negative */ + const float mu_ij = + fac_mu * r_inv * omega_ij; /* This is 0 or negative */ /* Signal velocity */ - const float new_v_sig = rho_avisc_u_c_i.w + rho_avisc_u_c_j.w - const_viscosity_beta * mu_ij; + const float new_v_sig = rho_avisc_u_c_i.w + rho_avisc_u_c_j.w - + const_viscosity_beta * mu_ij; /* Update if we need to */ vsig_lapu_aviscmax_i.x = fmaxf(vsig_lapu_aviscmax_i.x, new_v_sig); /* Calculate Del^2 u for the thermal diffusion coefficient. 
*/ @@ -830,21 +843,23 @@ __global__ void DOSELF_GPU_AOS_F4_G( const float ui = r * h_inv; d_kernel_deval(ui, &wi, &wi_dx); - const float delta_u_factor = (rho_avisc_u_c_i.z - rho_avisc_u_c_j.z) * r_inv; - vsig_lapu_aviscmax_i.y += mj * delta_u_factor * wi_dx / rho_avisc_u_c_j.x; + const float delta_u_factor = + (rho_avisc_u_c_i.z - rho_avisc_u_c_j.z) * r_inv; + vsig_lapu_aviscmax_i.y += + mj * delta_u_factor * wi_dx / rho_avisc_u_c_j.x; /* Set the maximal alpha from the previous step over the neighbours * (this is used to limit the diffusion in hydro_prepare_force) */ const float alpha_j = rho_avisc_u_c_j.y; vsig_lapu_aviscmax_i.z = fmaxf(vsig_lapu_aviscmax_i.z, alpha_j); - } } } __syncthreads(); } if (pid < last_part_in_task_blocks) { -// printf("v %f lap %f maxvisc %f\n", vsig_lapu_aviscmax_empty_i.x, vsig_lapu_aviscmax_empty_i.y, vsig_lapu_aviscmax_empty_i.z); + // printf("v %f lap %f maxvisc %f\n", vsig_lapu_aviscmax_empty_i.x, + //vsig_lapu_aviscmax_empty_i.y, vsig_lapu_aviscmax_empty_i.z); parts_recv[pid].vsig_lapu_aviscmax = vsig_lapu_aviscmax_i; } } @@ -855,11 +870,13 @@ __global__ void DOSELF_GPU_AOS_F4_G( #ifdef WITH_CUDA extern "C" { #endif -__global__ void DOSELF_GPU_AOS_F( - struct part_aos_f *parts_aos, int *d_task_first_part, int *d_task_last_part, float d_a, float d_H, - int count_tasks, int tasksperbundle, int nBlocks_per_task, - int bundle_first_task, int max_parts, double * d_cell_x, - double * d_cell_y, double * d_cell_z) { +__global__ void DOSELF_GPU_AOS_F(struct part_aos_f *parts_aos, + int *d_task_first_part, int *d_task_last_part, + float d_a, float d_H, int count_tasks, + int tasksperbundle, int nBlocks_per_task, + int bundle_first_task, int max_parts, + double *d_cell_x, double *d_cell_y, + double *d_cell_z) { extern __shared__ float varsf[]; __shared__ int first_part_tid_0, last_part_tid_0; const int threadid = blockDim.x * blockIdx.x + threadIdx.x; @@ -908,8 +925,7 @@ __global__ void DOSELF_GPU_AOS_F( first_part = d_task_first_part[ttid]; last_part = d_task_last_part[ttid]; count = last_part - first_part; - cellx = d_cell_x[ttid], celly = d_cell_y[ttid], - cellz = d_cell_z[ttid]; + cellx = d_cell_x[ttid], celly = d_cell_y[ttid], cellz = d_cell_z[ttid]; hi = parts_aos[pid].h, hig2 = hi * hi * kernel_gamma2; mi = parts_aos[pid].mass; uxi = parts_aos[pid].ux; @@ -928,14 +944,14 @@ __global__ void DOSELF_GPU_AOS_F( alphavisci = parts_aos[pid].alpha_visc; alphadiffi = parts_aos[pid].alpha_diff; min_ngb_time_bin = parts_aos[pid].min_ngb_time_bin; -// laplace_u = parts_aos[pid].laplace_u; -// alpha_visc_max_ngb = parts_aos[pid].alpha_visc_max_ngb; + // laplace_u = parts_aos[pid].laplace_u; + // alpha_visc_max_ngb = parts_aos[pid].alpha_visc_max_ngb; } -// if (threadIdx.x == 0) { -// first_part_tid_0 = first_part; -// last_part_tid_0 = last_part; -// } -// __syncthreads(); + // if (threadIdx.x == 0) { + // first_part_tid_0 = first_part; + // last_part_tid_0 = last_part; + // } + // __syncthreads(); int n_neighbours = 0; /*Here we use different pointers "x_p_tmp", etc. 
to point to different regions * of the single shared memory space "vars" which we allocate in kernel @@ -971,7 +987,7 @@ __global__ void DOSELF_GPU_AOS_F( uz_tmp[threadIdx.x] = parts_aos[j].uz; timebin[threadIdx.x] = parts_aos[j].time_bin; cj_tmp[threadIdx.x] = parts_aos[j].soundspeed; -// alpha_tmp[threadIdx.x] = parts_aos[j].visc_alpha; + // alpha_tmp[threadIdx.x] = parts_aos[j].visc_alpha; u_tmp[threadIdx.x] = parts_aos[j].u; rho_tmp[threadIdx.x] = parts_aos[j].rho; alphavisc_tmp[threadIdx.x] = parts_aos[j].alpha_visc; @@ -989,18 +1005,19 @@ __global__ void DOSELF_GPU_AOS_F( const float pjz = z_p_tmp[j_block] - cellz; const float xij = pix - pjx, yij = piy - pjy, zij = piz - pjz; const float r2 = xij * xij + yij * yij + zij * zij; - if (r2 < hig2 && r2 > (0.01f/128.f)*(0.01f/128.f)) { + if (r2 < hig2 && r2 > (0.01f / 128.f) * (0.01f / 128.f)) { // /* Cosmology terms for the signal velocity */ const float fac_mu = d_pow_three_gamma_minus_five_over_two(d_a); const float a2_Hubble = d_a * d_a * d_H; const float r = sqrt(r2); const float r_inv = 1.f / r; -// /* Recover some data */ + // /* Recover some data */ const float mj = mass_tmp[j_block]; -// /* Get the kernel for hi. */ + // /* Get the kernel for hi. */ const float hi_inv = 1.f / hi; - const float hid_inv = d_pow_dimension_plus_one(hi_inv); /* 1/h^(d+1) */ + const float hid_inv = + d_pow_dimension_plus_one(hi_inv); /* 1/h^(d+1) */ const float xi = r * hi_inv; float wi, wi_dx; d_kernel_deval(xi, &wi, &wi_dx); @@ -1008,23 +1025,26 @@ __global__ void DOSELF_GPU_AOS_F( /* Get the kernel for hj. */ const float hj = h_tmp[j_block]; const float hj_inv = 1.0f / hj; - const float hjd_inv = d_pow_dimension_plus_one(hj_inv); /* 1/h^(d+1) */ + const float hjd_inv = + d_pow_dimension_plus_one(hj_inv); /* 1/h^(d+1) */ const float xj = r * hj_inv; float wj, wj_dx; d_kernel_deval(xj, &wj, &wj_dx); const float wj_dr = hjd_inv * wj_dx; -// /* Compute dv dot r */ + // /* Compute dv dot r */ float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block], dvz = uzi - uz_tmp[j_block]; const float dvdr = dvx * xij + dvy * yij + dvz * zij; -// /* Add Hubble flow */ + // /* Add Hubble flow */ const float dvdr_Hubble = dvdr + a2_Hubble * r2; -// /* Are the particles moving towards each others ? */ + // /* Are the particles moving towards each others ? */ const float omega_ij = min(dvdr_Hubble, 0.f); - const float mu_ij = fac_mu * r_inv * omega_ij; /* This is 0 or negative */ -// -// /* Signal velocity */ - const float v_sig = ci + cj_tmp[j_block] - const_viscosity_beta * mu_ij; + const float mu_ij = + fac_mu * r_inv * omega_ij; /* This is 0 or negative */ + // + // /* Signal velocity */ + const float v_sig = + ci + cj_tmp[j_block] - const_viscosity_beta * mu_ij; /* Variable smoothing length term */ const float f_ij = 1.f - fi / mj; @@ -1056,7 +1076,9 @@ __global__ void DOSELF_GPU_AOS_F( ahydroxi -= mj * acc * xij; ahydroyi -= mj * acc * yij; ahydrozi -= mj * acc * zij; -// if(rhoi == 0 || rhoj == 0 || pressurei == 0 || pressurej == 0)printf("ri %f rj %f pi %f pj %f\n", rhoi, rhoj, pressurei, pressurej); + // if(rhoi == 0 || rhoj == 0 || pressurei == 0 || pressurej + // == 0)printf("ri %f rj %f pi %f pj %f\n", rhoi, rhoj, + // pressurei, pressurej); /* Get the time derivative for u. 
*/ const float sph_du_term_i = P_over_rho2_i * dvdr * r_inv * wi_dr; @@ -1064,38 +1086,42 @@ __global__ void DOSELF_GPU_AOS_F( const float visc_du_term = 0.5f * visc_acc_term * dvdr_Hubble; const float press_sum = pressurei + pressurej; /* Diffusion term */ - /* Combine the alpha_diff into a pressure-based switch -- this allows the - * alpha from the highest pressure particle to dominate, so that the - * diffusion limited particles always take precedence - another trick to - * allow the scheme to work with thermal feedback. */ + /* Combine the alpha_diff into a pressure-based switch -- this allows + * the alpha from the highest pressure particle to dominate, so that + * the diffusion limited particles always take precedence - another + * trick to allow the scheme to work with thermal feedback. */ float alpha_diff = (pressurei * alphadiffi + pressurej * alphadiff_tmp[j_block]) / (press_sum); if (fabsf(press_sum) < 1e-10) alpha_diff = 0.f; - const float v_diff = alpha_diff * 0.5f * - (sqrtf(2.f * fabsf(pressurei - pressurej) / rho_ij) + - fabsf(fac_mu * r_inv * dvdr_Hubble)); + const float v_diff = + alpha_diff * 0.5f * + (sqrtf(2.f * fabsf(pressurei - pressurej) / rho_ij) + + fabsf(fac_mu * r_inv * dvdr_Hubble)); /* wi_dx + wj_dx / 2 is F_ij */ const float diff_du_term = - v_diff * (ui - u_tmp[j_block]) * (f_ij * wi_dr / rhoi + f_ji * wj_dr / rhoj); + v_diff * (ui - u_tmp[j_block]) * + (f_ij * wi_dr / rhoi + f_ji * wj_dr / rhoj); /* Assemble the energy equation term */ const float du_dt_i = sph_du_term_i + visc_du_term + diff_du_term; /* Internal energy time derivative */ u_dti += du_dt_i * mj; - if(mj == 0.f)printf("zero mass mj %f\n", mj); + if (mj == 0.f) printf("zero mass mj %f\n", mj); /* Get the time derivative for h. */ h_dti -= mj * dvdr * r_inv / rhoj * wi_dr; - /* Update if we need to; this should be guaranteed by the gradient loop but - * due to some possible synchronisation problems this is here as a _quick - * fix_. Added: 14th August 2019. To be removed by 1st Jan 2020. (JB) */ + /* Update if we need to; this should be guaranteed by the gradient + * loop but due to some possible synchronisation problems this is here + * as a _quick fix_. Added: 14th August 2019. To be removed by 1st Jan + * 2020. 
(JB) */ v_sigi = max(v_sigi, v_sig); int time_bin_j = timebin[j_block]; - if(time_bin_j > 0)min_ngb_time_bin = min(min_ngb_time_bin, time_bin_j); -// printf("Got in\n"); + if (time_bin_j > 0) + min_ngb_time_bin = min(min_ngb_time_bin, time_bin_j); + // printf("Got in\n"); } } } @@ -1119,9 +1145,10 @@ __global__ void DOSELF_GPU_AOS_F( extern "C" { #endif __global__ void DOSELF_GPU_AOS_F4_F( - struct part_aos_f4_f_send * __restrict__ parts_send, struct part_aos_f4_f_recv * __restrict__ parts_recv, - const float d_a, const float d_H, - const int bundle_first_task, const int2 * __restrict__ d_task_first_part_f4) { + struct part_aos_f4_f_send *__restrict__ parts_send, + struct part_aos_f4_f_recv *__restrict__ parts_recv, const float d_a, + const float d_H, const int bundle_first_task, + const int2 *__restrict__ d_task_first_part_f4) { extern __shared__ float4 varsf4_f[]; @@ -1130,8 +1157,8 @@ __global__ void DOSELF_GPU_AOS_F4_F( const int task_id = bundle_first_task + blockIdx.y; int first_part_in_task_blocks, last_part_in_task_blocks; -// first_part_in_task_blocks = d_task_first_part[task_id], -// last_part_in_task_blocks = d_task_last_part[task_id]; + // first_part_in_task_blocks = d_task_first_part[task_id], + // last_part_in_task_blocks = d_task_last_part[task_id]; int2 first_last_parts = d_task_first_part_f4[task_id]; first_part_in_task_blocks = first_last_parts.x; last_part_in_task_blocks = first_last_parts.y; @@ -1164,11 +1191,12 @@ __global__ void DOSELF_GPU_AOS_F4_F( /*Here we use different pointers "x_p_tmp", etc. to point to different regions * of the single shared memory space "vars" which we allocate in kernel * invocation*/ - float4 * __restrict__ x_h_tmp = (float4 *)&varsf4_f[0]; - float4 * __restrict__ ux_m_tmp = (float4 *)&varsf4_f[BLOCK_SIZE]; - float4 * __restrict__ f_b_t_mintbinngb_tmp = (float4 *)&varsf4_f[BLOCK_SIZE * 2]; - float4 * __restrict__ rho_p_c_vsig_tmp = (float4 *)&varsf4_f[BLOCK_SIZE * 3]; - float3 * __restrict__ u_avisc_adiff_tmp = (float3 *)&varsf4_f[BLOCK_SIZE * 4]; + float4 *__restrict__ x_h_tmp = (float4 *)&varsf4_f[0]; + float4 *__restrict__ ux_m_tmp = (float4 *)&varsf4_f[BLOCK_SIZE]; + float4 *__restrict__ f_b_t_mintbinngb_tmp = + (float4 *)&varsf4_f[BLOCK_SIZE * 2]; + float4 *__restrict__ rho_p_c_vsig_tmp = (float4 *)&varsf4_f[BLOCK_SIZE * 3]; + float3 *__restrict__ u_avisc_adiff_tmp = (float3 *)&varsf4_f[BLOCK_SIZE * 4]; /*Particles copied in blocks to shared memory*/ for (int b = first_part_in_task_blocks; b < last_part_in_task_blocks; b += BLOCK_SIZE) { @@ -1178,33 +1206,33 @@ __global__ void DOSELF_GPU_AOS_F4_F( ux_m_tmp[threadIdx.x] = pj.ux_m; f_b_t_mintbinngb_tmp[threadIdx.x] = pj.f_bals_timebin_mintimebin_ngb; rho_p_c_vsig_tmp[threadIdx.x] = pj.rho_p_c_vsigi; -// alpha_tmp[threadIdx.x] = parts_aos[j].visc_alpha; + // alpha_tmp[threadIdx.x] = parts_aos[j].visc_alpha; u_avisc_adiff_tmp[threadIdx.x] = pj.u_alphavisc_alphadiff; __syncthreads(); for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) { j = j_block + b; if (j < last_part_in_task_blocks) { /* Compute the pairwise distance. 
*/ - float4 x_h_j = x_h_tmp[j_block]; - float4 ux_m_j = ux_m_tmp[j_block]; - float4 f_b_t_mintbinngb_j = f_b_t_mintbinngb_tmp[j_block]; - float4 rho_p_c_vsig_j = rho_p_c_vsig_tmp[j_block]; - float3 u_avisc_adiff_j = u_avisc_adiff_tmp[j_block]; - const float xij = x_h_i.x - x_h_j.x, - yij = x_h_i.y - x_h_j.y, - zij = x_h_i.z - x_h_j.z; + float4 x_h_j = x_h_tmp[j_block]; + float4 ux_m_j = ux_m_tmp[j_block]; + float4 f_b_t_mintbinngb_j = f_b_t_mintbinngb_tmp[j_block]; + float4 rho_p_c_vsig_j = rho_p_c_vsig_tmp[j_block]; + float3 u_avisc_adiff_j = u_avisc_adiff_tmp[j_block]; + const float xij = x_h_i.x - x_h_j.x, yij = x_h_i.y - x_h_j.y, + zij = x_h_i.z - x_h_j.z; const float r2 = xij * xij + yij * yij + zij * zij; - if (r2 < hig2 && r2 > (0.01f/128.f)*(0.01f/128.f)) { + if (r2 < hig2 && r2 > (0.01f / 128.f) * (0.01f / 128.f)) { // /* Cosmology terms for the signal velocity */ const float fac_mu = d_pow_three_gamma_minus_five_over_two(d_a); const float a2_Hubble = d_a * d_a * d_H; const float r = sqrt(r2); const float r_inv = 1.f / r; -// /* Recover some data */ + // /* Recover some data */ const float mj = ux_m_j.w; -// /* Get the kernel for hi. */ + // /* Get the kernel for hi. */ const float hi_inv = 1.f / hi; - const float hid_inv = d_pow_dimension_plus_one(hi_inv); /* 1/h^(d+1) */ + const float hid_inv = + d_pow_dimension_plus_one(hi_inv); /* 1/h^(d+1) */ const float xi = r * hi_inv; float wi, wi_dx; d_kernel_deval(xi, &wi, &wi_dx); @@ -1212,22 +1240,24 @@ __global__ void DOSELF_GPU_AOS_F4_F( /* Get the kernel for hj. */ const float hj = x_h_j.w; const float hj_inv = 1.0f / hj; - const float hjd_inv = d_pow_dimension_plus_one(hj_inv); /* 1/h^(d+1) */ + const float hjd_inv = + d_pow_dimension_plus_one(hj_inv); /* 1/h^(d+1) */ const float xj = r * hj_inv; float wj, wj_dx; d_kernel_deval(xj, &wj, &wj_dx); const float wj_dr = hjd_inv * wj_dx; -// /* Compute dv dot r */ + // /* Compute dv dot r */ float dvx = ux_m_i.x - ux_m_j.x, dvy = ux_m_i.y - ux_m_j.y, dvz = ux_m_i.z - ux_m_j.z; const float dvdr = dvx * xij + dvy * yij + dvz * zij; -// /* Add Hubble flow */ + // /* Add Hubble flow */ const float dvdr_Hubble = dvdr + a2_Hubble * r2; -// /* Are the particles moving towards each others ? */ + // /* Are the particles moving towards each others ? 
*/ const float omega_ij = min(dvdr_Hubble, 0.f); - const float mu_ij = fac_mu * r_inv * omega_ij; /* This is 0 or negative */ -// -// /* Signal velocity */ + const float mu_ij = + fac_mu * r_inv * omega_ij; /* This is 0 or negative */ + // + // /* Signal velocity */ const float cj = rho_p_c_vsig_j.z; const float v_sig = ci + cj - const_viscosity_beta * mu_ij; @@ -1239,16 +1269,17 @@ __global__ void DOSELF_GPU_AOS_F4_F( const float pressurej = rho_p_c_vsig_j.y; const float rho_ij = rho_p_c_vsig_i.x + rho_p_c_vsig_j.x; const float alpha = u_avisc_adiff_i.y + u_avisc_adiff_j.y; - const float visc = - -0.25f * alpha * v_sig * mu_ij * (f_b_t_mintbinngb_i.y + f_b_t_mintbinngb_j.y) / rho_ij; + const float visc = -0.25f * alpha * v_sig * mu_ij * + (f_b_t_mintbinngb_i.y + f_b_t_mintbinngb_j.y) / + rho_ij; /* Convolve with the kernel */ const float visc_acc_term = 0.5f * visc * (wi_dr * f_ij + wj_dr * f_ji) * r_inv; /* Compute gradient terms */ const float rhoi2 = rho_p_c_vsig_i.x * rho_p_c_vsig_i.x; const float rhoj2 = rho_p_c_vsig_j.x * rho_p_c_vsig_j.x; - const float P_over_rho2_i = pressurei / (rhoi2) * f_ij; - const float P_over_rho2_j = pressurej / (rhoj2) * f_ji; + const float P_over_rho2_i = pressurei / (rhoi2)*f_ij; + const float P_over_rho2_j = pressurej / (rhoj2)*f_ji; /* SPH acceleration term */ const float sph_acc_term = @@ -1266,21 +1297,23 @@ __global__ void DOSELF_GPU_AOS_F4_F( /* Viscosity term */ const float visc_du_term = 0.5f * visc_acc_term * dvdr_Hubble; /* Diffusion term */ - /* Combine the alpha_diff into a pressure-based switch -- this allows the - * alpha from the highest pressure particle to dominate, so that the - * diffusion limited particles always take precedence - another trick to - * allow the scheme to work with thermal feedback. */ + /* Combine the alpha_diff into a pressure-based switch -- this allows + * the alpha from the highest pressure particle to dominate, so that + * the diffusion limited particles always take precedence - another + * trick to allow the scheme to work with thermal feedback. */ float alpha_diff = (pressurei * u_avisc_adiff_i.z + pressurej * u_avisc_adiff_j.z) / (pressurei + pressurej); if (fabsf(pressurei + pressurej) < 1e-10) alpha_diff = 0.f; - const float v_diff = alpha_diff * 0.5f * - (sqrtf(2.f * fabsf(pressurei - pressurej) / rho_ij) + - fabsf(fac_mu * r_inv * dvdr_Hubble)); + const float v_diff = + alpha_diff * 0.5f * + (sqrtf(2.f * fabsf(pressurei - pressurej) / rho_ij) + + fabsf(fac_mu * r_inv * dvdr_Hubble)); /* wi_dx + wj_dx / 2 is F_ij */ - const float diff_du_term = - v_diff * (u_avisc_adiff_i.x - u_avisc_adiff_j.x) * - (f_ij * wi_dr / rho_p_c_vsig_i.x + f_ji * wj_dr / rho_p_c_vsig_j.x); + const float diff_du_term = v_diff * + (u_avisc_adiff_i.x - u_avisc_adiff_j.x) * + (f_ij * wi_dr / rho_p_c_vsig_i.x + + f_ji * wj_dr / rho_p_c_vsig_j.x); /* Assemble the energy equation term */ const float du_dt_i = sph_du_term_i + visc_du_term + diff_du_term; @@ -1289,24 +1322,25 @@ __global__ void DOSELF_GPU_AOS_F4_F( udt_hdt_vsig_mintbinngb.x += du_dt_i * mj; /* Get the time derivative for h. */ - udt_hdt_vsig_mintbinngb.y -= mj * dvdr * r_inv / rho_p_c_vsig_j.x * wi_dr; + udt_hdt_vsig_mintbinngb.y -= + mj * dvdr * r_inv / rho_p_c_vsig_j.x * wi_dr; - /* Update if we need to; this should be guaranteed by the gradient loop but - * due to some possible synchronisation problems this is here as a _quick - * fix_. Added: 14th August 2019. To be removed by 1st Jan 2020. 
(JB) */ + /* Update if we need to; this should be guaranteed by the gradient + * loop but due to some possible synchronisation problems this is here + * as a _quick fix_. Added: 14th August 2019. To be removed by 1st Jan + * 2020. (JB) */ udt_hdt_vsig_mintbinngb.z = fmaxf(udt_hdt_vsig_mintbinngb.z, v_sig); unsigned int time_bin_j = (f_b_t_mintbinngb_j.z + 0.5f); unsigned int min_tb_i = (f_b_t_mintbinngb_i.w + 0.5f); - if(time_bin_j > 0)f_b_t_mintbinngb_i.w = - min(min_tb_i, time_bin_j); -// printf("Got in\n"); + if (time_bin_j > 0) f_b_t_mintbinngb_i.w = min(min_tb_i, time_bin_j); + // printf("Got in\n"); } } } __syncthreads(); } if (pid < last_part_in_task_blocks) { - udt_hdt_vsig_mintbinngb.w = f_b_t_mintbinngb_i.w; + udt_hdt_vsig_mintbinngb.w = f_b_t_mintbinngb_i.w; parts_recv[pid].udt_hdt_vsig_mintimebin_ngb = udt_hdt_vsig_mintbinngb; parts_recv[pid].a_hydro = ahydro; } @@ -1319,12 +1353,14 @@ __global__ void DOSELF_GPU_AOS_F4_F( extern "C" { #endif __global__ void runner_do_pair_density_GPU_naive( - struct part_soa parts_soa_ci, struct part_soa parts_soa_cj, int *d_task_first_part_ci, int *d_task_first_part_cj, - int *d_task_last_part_ci, int *d_task_last_part_cj, float d_a, float d_H, int bid, int tid, int count_tasks, - int tasksperbundle, int nBlocks_per_task, int bundle_first_task, int time_bin_inhibited) { + struct part_soa parts_soa_ci, struct part_soa parts_soa_cj, + int *d_task_first_part_ci, int *d_task_first_part_cj, + int *d_task_last_part_ci, int *d_task_last_part_cj, float d_a, float d_H, + int bid, int tid, int count_tasks, int tasksperbundle, int nBlocks_per_task, + int bundle_first_task, int time_bin_inhibited) { extern __shared__ float vars[]; -// __shared__ int first_part_tid_0, last_part_tid_0; + // __shared__ int first_part_tid_0, last_part_tid_0; const int threadid = blockDim.x * blockIdx.x + threadIdx.x; const int task_id = bundle_first_task + blockIdx.y; @@ -1341,7 +1377,8 @@ __global__ void runner_do_pair_density_GPU_naive( // Now we start calculations for particles in cell i const int pid = threadid + first_part_in_task_blocks_ci; - float dx = 1.f/64.f; //Value used to avoid interacting parts with themselves + float dx = + 1.f / 64.f; // Value used to avoid interacting parts with themselves int ttid = 0; int first_part = 0; int count = 0; @@ -1381,11 +1418,11 @@ __global__ void runner_do_pair_density_GPU_naive( piy = parts_soa_ci.y_p[pid] - celly; piz = parts_soa_ci.z_p[pid] - cellz; } -// if (threadIdx.x == 0) { -// first_part_tid_0 = first_part; -// last_part_tid_0 = last_part; -// } -// __syncthreads(); + // if (threadIdx.x == 0) { + // first_part_tid_0 = first_part; + // last_part_tid_0 = last_part; + // } + // __syncthreads(); int n_neighbours = 0; /*Here we use different pointers "x_p_tmp", etc. to point to different regions * of the single shared memory space "vars" which we allocate in kernel @@ -1415,10 +1452,10 @@ __global__ void runner_do_pair_density_GPU_naive( __syncthreads(); for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) { j = j_block + b; -// if ((j != pid) && (j < last_part_in_task_blocks) && -// timebin[j_block] != time_bin_inhibited) { -// if ((j < last_part_in_task_blocks) && -// timebin[j_block] != time_bin_inhibited) { + // if ((j != pid) && (j < last_part_in_task_blocks) && + // timebin[j_block] != time_bin_inhibited) { + // if ((j < last_part_in_task_blocks) && + // timebin[j_block] != time_bin_inhibited) { if (j < last_part_in_task_blocks_cj) { /* Compute the pairwise distance. 
*/ const float pjx = x_p_tmp[j_block] - cellx; @@ -1426,20 +1463,21 @@ __global__ void runner_do_pair_density_GPU_naive( const float pjz = z_p_tmp[j_block] - cellz; const float xij = pix - pjx, yij = piy - pjy, zij = piz - pjz; const float r2 = xij * xij + yij * yij + zij * zij; -// const float hj = h_tmp[j_block], hjg2 = hj * hj * kernel_gamma2; + // const float hj = h_tmp[j_block], hjg2 = hj * hj * + // kernel_gamma2; // if((hi < 0.0001f || hj < 0.0001f || r2 < - //0.0000001f) && pid < last_part_in_task_blocks){ printf("very small - //value for hi %f or hj %f or r2 %f\n", hi, hj, r2); + // 0.0000001f) && pid < last_part_in_task_blocks){ + // printf("very small value for hi %f or hj %f or r2 %f\n", hi, hj, r2); // } -// if (r2 < hig2 && r2 > (0.01f/128.f)*(0.01f/128.f)) { - if (r2 < hig2 && r2 > (0.01f/dx)*(0.01f/dx)) { - Found_neighbours=1; + // if (r2 < hig2 && r2 > (0.01f/128.f)*(0.01f/128.f)) { + if (r2 < hig2 && r2 > (0.01f / dx) * (0.01f / dx)) { + Found_neighbours = 1; const float r = sqrt(r2); /* Recover some data */ const float mj = mass_tmp[j_block]; /* Get the kernel for hi. */ - if(hi<1.f/dx)printf("h < dx\n"); -// if(hi<1.f/256.f)printf("h < dx\n"); + if (hi < 1.f / dx) printf("h < dx\n"); + // if(hi<1.f/256.f)printf("h < dx\n"); const float h_inv = 1.f / hi; const float ui = r * h_inv; float wi, wi_dx; @@ -1477,7 +1515,8 @@ __global__ void runner_do_pair_density_GPU_naive( } if (pid < last_part_in_task_blocks_ci) { parts_soa_ci.rho[pid] = rhoi, parts_soa_ci.rho_dh[pid] = rho_dhi; - parts_soa_ci.wcount[pid] = wcounti, parts_soa_ci.wcount_dh[pid] = wcount_dhi; + parts_soa_ci.wcount[pid] = wcounti, + parts_soa_ci.wcount_dh[pid] = wcount_dhi; parts_soa_ci.div_v[pid] = div_vi; parts_soa_ci.rot_ux[pid] = rot_uxi, parts_soa_ci.rot_uy[pid] = rot_uyi; parts_soa_ci.rot_uz[pid] = rot_uzi; @@ -1490,11 +1529,14 @@ __global__ void runner_do_pair_density_GPU_naive( #ifdef WITH_CUDA extern "C" { #endif -void launch_density_pair_two_kernels(struct part_soa parts_soa_ci, struct part_soa parts_soa_cj, int *d_task_first_part_ci, - int *d_task_first_part_cj, int *d_task_last_part_ci, int *d_task_last_part_cj, float d_a, float d_H, - const char *loop_type, cudaStream_t stream, int bid, int block_size, int count_tasks, int tasksperbundle, - int max_parts_i, int max_parts_j, int numBlocks_y, int tid, int offset, int bundle_first_task, int time_bin_inhibited) { - +void launch_density_pair_two_kernels( + struct part_soa parts_soa_ci, struct part_soa parts_soa_cj, + int *d_task_first_part_ci, int *d_task_first_part_cj, + int *d_task_last_part_ci, int *d_task_last_part_cj, float d_a, float d_H, + const char *loop_type, cudaStream_t stream, int bid, int block_size, + int count_tasks, int tasksperbundle, int max_parts_i, int max_parts_j, + int numBlocks_y, int tid, int offset, int bundle_first_task, + int time_bin_inhibited) { int max_parts = max(max_parts_j, max_parts_i); int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; @@ -1503,24 +1545,24 @@ void launch_density_pair_two_kernels(struct part_soa parts_soa_ci, struct part_s /*Do ci*/ runner_do_pair_density_GPU_naive<<>>( - parts_soa_ci, parts_soa_cj, d_task_first_part_ci, d_task_first_part_cj, d_task_last_part_ci, - d_task_last_part_cj, d_a, d_H, bid, tid, count_tasks, tasksperbundle, - nBlocks_per_task, bundle_first_task, time_bin_inhibited); - -// numBlocks_x = (max_parts_i + BLOCK_SIZE - 1) / BLOCK_SIZE; -// gridShape = dim3(numBlocks_x, numBlocks_y); -// nBlocks_per_task = numBlocks_x; + 8 * BLOCK_SIZE * sizeof(float) + + BLOCK_SIZE * 
sizeof(timebin_t), + stream>>>( + parts_soa_ci, parts_soa_cj, d_task_first_part_ci, d_task_first_part_cj, + d_task_last_part_ci, d_task_last_part_cj, d_a, d_H, bid, tid, count_tasks, + tasksperbundle, nBlocks_per_task, bundle_first_task, time_bin_inhibited); + + // numBlocks_x = (max_parts_i + BLOCK_SIZE - 1) / BLOCK_SIZE; + // gridShape = dim3(numBlocks_x, numBlocks_y); + // nBlocks_per_task = numBlocks_x; /*Now do cj*/ runner_do_pair_density_GPU_naive<<>>( - parts_soa_cj, parts_soa_ci, d_task_first_part_cj, d_task_first_part_ci, d_task_last_part_cj, - d_task_last_part_ci, d_a, d_H, bid, tid, count_tasks, tasksperbundle, - nBlocks_per_task, bundle_first_task, time_bin_inhibited); + 8 * BLOCK_SIZE * sizeof(float) + + BLOCK_SIZE * sizeof(timebin_t), + stream>>>( + parts_soa_cj, parts_soa_ci, d_task_first_part_cj, d_task_first_part_ci, + d_task_last_part_cj, d_task_last_part_ci, d_a, d_H, bid, tid, count_tasks, + tasksperbundle, nBlocks_per_task, bundle_first_task, time_bin_inhibited); } #ifdef WITH_CUDA } @@ -1529,13 +1571,14 @@ void launch_density_pair_two_kernels(struct part_soa parts_soa_ci, struct part_s #ifdef WITH_CUDA extern "C" { #endif -__device__ void DOPAIRGPU( - struct part_soa parts_soa, int pid, - int last_part_in_task_blocks_ci, int first_part_in_task_blocks_cj, - int last_part_in_task_blocks_cj, float d_a, float d_H, - int time_bin_inhibited, float *vars) { +__device__ void DOPAIRGPU(struct part_soa parts_soa, int pid, + int last_part_in_task_blocks_ci, + int first_part_in_task_blocks_cj, + int last_part_in_task_blocks_cj, float d_a, float d_H, + int time_bin_inhibited, float *vars) { - float dx = 1.f/64.f; //Value used to avoid interacting parts with themselves + float dx = + 1.f / 64.f; // Value used to avoid interacting parts with themselves float cellx = 0.0, celly = 0.0, cellz = 0.0; float hi = 0.0, hig2 = hi * hi * kernel_gamma2; @@ -1557,16 +1600,16 @@ __device__ void DOPAIRGPU( int Found_neighbours = 0; if (pid < last_part_in_task_blocks_ci) { - cellx = parts_soa.locx[pid], celly = parts_soa.locy[pid], - cellz = parts_soa.locz[pid]; - hi = parts_soa.h[pid], hig2 = hi * hi * kernel_gamma2; - mi = parts_soa.mass[pid]; - uxi = parts_soa.ux[pid]; - uyi = parts_soa.uy[pid]; - uzi = parts_soa.uz[pid]; - pix = parts_soa.x_p[pid] - cellx; - piy = parts_soa.y_p[pid] - celly; - piz = parts_soa.z_p[pid] - cellz; + cellx = parts_soa.locx[pid], celly = parts_soa.locy[pid], + cellz = parts_soa.locz[pid]; + hi = parts_soa.h[pid], hig2 = hi * hi * kernel_gamma2; + mi = parts_soa.mass[pid]; + uxi = parts_soa.ux[pid]; + uyi = parts_soa.uy[pid]; + uzi = parts_soa.uz[pid]; + pix = parts_soa.x_p[pid] - cellx; + piy = parts_soa.y_p[pid] - celly; + piz = parts_soa.z_p[pid] - cellz; } int n_neighbours = 0; @@ -1584,78 +1627,77 @@ __device__ void DOPAIRGPU( timebin_t *timebin = (timebin_t *)&uz_tmp[BLOCK_SIZE]; /*Particles copied in blocks to shared memory*/ for (int b = first_part_in_task_blocks_cj; b < last_part_in_task_blocks_cj; - b += BLOCK_SIZE) { - int j = b + threadIdx.x; - x_p_tmp[threadIdx.x] = parts_soa.x_p[j]; - y_p_tmp[threadIdx.x] = parts_soa.y_p[j]; - z_p_tmp[threadIdx.x] = parts_soa.z_p[j]; - h_tmp[threadIdx.x] = parts_soa.h[j]; - mass_tmp[threadIdx.x] = parts_soa.mass[j]; - ux_tmp[threadIdx.x] = parts_soa.ux[j]; - uy_tmp[threadIdx.x] = parts_soa.uy[j]; - uz_tmp[threadIdx.x] = parts_soa.uz[j]; - timebin[threadIdx.x] = parts_soa.time_bin[j]; - __syncthreads(); - for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) { - j = j_block + b; - if (j < last_part_in_task_blocks_cj) { 
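[Editorial aside, not part of the patch.] The self- and pair-interaction kernels in this hunk all follow the same shared-memory tiling pattern: stage BLOCK_SIZE neighbour particles into shared memory, synchronise, let every thread loop over the staged tile, then synchronise again before the tile is overwritten. The sketch below is a minimal, stand-alone CUDA illustration of that pattern under assumed simplified types; simple_part, tiled_density_sketch and TILE are hypothetical stand-ins, the SPH kernel evaluation is omitted, and it is not the code this patch adds.

/* Minimal sketch of the tile-and-sync neighbour loop (assumes the
 * kernel is launched with TILE threads per block). */
#define TILE 64                               /* stand-in for BLOCK_SIZE */

struct simple_part { float x, y, z, h, mass; };   /* hypothetical type */

__global__ void tiled_density_sketch(const struct simple_part *parts,
                                     float *rho, int first, int last) {
  __shared__ struct simple_part tile[TILE];
  const int pid = first + blockIdx.x * blockDim.x + threadIdx.x;
  const struct simple_part pi = (pid < last) ? parts[pid] : parts[first];
  const float hig2 = pi.h * pi.h;             /* kernel_gamma2 omitted */
  float rho_i = 0.f;

  /* Stage neighbours one tile at a time. */
  for (int b = first; b < last; b += TILE) {
    const int j = b + threadIdx.x;
    if (j < last) tile[threadIdx.x] = parts[j];
    __syncthreads();                          /* tile fully loaded */

    for (int jb = 0; jb < TILE && b + jb < last; jb++) {
      const float dx = pi.x - tile[jb].x, dy = pi.y - tile[jb].y,
                  dz = pi.z - tile[jb].z;
      const float r2 = dx * dx + dy * dy + dz * dz;
      /* Same style of guard as above: inside the support radius but not
       * the particle itself. */
      if (r2 < hig2 && r2 > 1e-12f) rho_i += tile[jb].mass; /* W(r) omitted */
    }
    __syncthreads();                          /* safe to overwrite the tile */
  }
  if (pid < last) rho[pid] = rho_i;
}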
- /* Compute the pairwise distance. */ - const float pjx = x_p_tmp[j_block] - cellx; - const float pjy = y_p_tmp[j_block] - celly; - const float pjz = z_p_tmp[j_block] - cellz; - const float xij = pix - pjx, yij = piy - pjy, zij = piz - pjz; - const float r2 = xij * xij + yij * yij + zij * zij; - - if (r2 < hig2 && r2 > (0.01f/dx)*(0.01f/dx)) { - Found_neighbours=1; - const float r = sqrt(r2); - /* Recover some data */ - const float mj = mass_tmp[j_block]; - /* Get the kernel for hi. */ - if(hi<1.f/dx)printf("h < dx\n"); - const float h_inv = 1.f / hi; - const float ui = r * h_inv; - float wi, wi_dx; - - d_kernel_deval(ui, &wi, &wi_dx); - - rhoi += mj * wi; - rho_dhi -= mj * (hydro_dimension * wi + ui * wi_dx); - - wcounti += wi; - wcount_dhi -= (hydro_dimension * wi + ui * wi_dx); - - const float r_inv = 1.f / r; - const float faci = mj * wi_dx * r_inv; - - /* Compute dv dot r */ - float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block], - dvz = uzi - uz_tmp[j_block]; - const float dvdr = dvx * xij + dvy * yij + dvz * zij; - - div_vi -= faci * dvdr; - - /* Compute dv cross r */ - float curlvrx = dvy * zij - dvz * yij; - float curlvry = dvz * xij - dvx * zij; - float curlvrz = dvx * yij - dvy * xij; - - rot_uxi += faci * curlvrx; - rot_uyi += faci * curlvry; - rot_uzi += faci * curlvrz; - } - } - } - __syncthreads(); + b += BLOCK_SIZE) { + int j = b + threadIdx.x; + x_p_tmp[threadIdx.x] = parts_soa.x_p[j]; + y_p_tmp[threadIdx.x] = parts_soa.y_p[j]; + z_p_tmp[threadIdx.x] = parts_soa.z_p[j]; + h_tmp[threadIdx.x] = parts_soa.h[j]; + mass_tmp[threadIdx.x] = parts_soa.mass[j]; + ux_tmp[threadIdx.x] = parts_soa.ux[j]; + uy_tmp[threadIdx.x] = parts_soa.uy[j]; + uz_tmp[threadIdx.x] = parts_soa.uz[j]; + timebin[threadIdx.x] = parts_soa.time_bin[j]; + __syncthreads(); + for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) { + j = j_block + b; + if (j < last_part_in_task_blocks_cj) { + /* Compute the pairwise distance. */ + const float pjx = x_p_tmp[j_block] - cellx; + const float pjy = y_p_tmp[j_block] - celly; + const float pjz = z_p_tmp[j_block] - cellz; + const float xij = pix - pjx, yij = piy - pjy, zij = piz - pjz; + const float r2 = xij * xij + yij * yij + zij * zij; + + if (r2 < hig2 && r2 > (0.01f / dx) * (0.01f / dx)) { + Found_neighbours = 1; + const float r = sqrt(r2); + /* Recover some data */ + const float mj = mass_tmp[j_block]; + /* Get the kernel for hi. 
*/ + if (hi < 1.f / dx) printf("h < dx\n"); + const float h_inv = 1.f / hi; + const float ui = r * h_inv; + float wi, wi_dx; + + d_kernel_deval(ui, &wi, &wi_dx); + + rhoi += mj * wi; + rho_dhi -= mj * (hydro_dimension * wi + ui * wi_dx); + + wcounti += wi; + wcount_dhi -= (hydro_dimension * wi + ui * wi_dx); + + const float r_inv = 1.f / r; + const float faci = mj * wi_dx * r_inv; + + /* Compute dv dot r */ + float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block], + dvz = uzi - uz_tmp[j_block]; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + + div_vi -= faci * dvdr; + + /* Compute dv cross r */ + float curlvrx = dvy * zij - dvz * yij; + float curlvry = dvz * xij - dvx * zij; + float curlvrz = dvx * yij - dvy * xij; + + rot_uxi += faci * curlvrx; + rot_uyi += faci * curlvry; + rot_uzi += faci * curlvrz; + } + } + } + __syncthreads(); } if (pid < last_part_in_task_blocks_ci) { - parts_soa.rho[pid] = rhoi, parts_soa.rho_dh[pid] = rho_dhi; - parts_soa.wcount[pid] = wcounti, parts_soa.wcount_dh[pid] = wcount_dhi; - parts_soa.div_v[pid] = div_vi; - parts_soa.rot_ux[pid] = rot_uxi, parts_soa.rot_uy[pid] = rot_uyi; - parts_soa.rot_uz[pid] = rot_uzi; + parts_soa.rho[pid] = rhoi, parts_soa.rho_dh[pid] = rho_dhi; + parts_soa.wcount[pid] = wcounti, parts_soa.wcount_dh[pid] = wcount_dhi; + parts_soa.div_v[pid] = div_vi; + parts_soa.rot_ux[pid] = rot_uxi, parts_soa.rot_uy[pid] = rot_uyi; + parts_soa.rot_uz[pid] = rot_uzi; } - } #ifdef WITH_CUDA } @@ -1664,13 +1706,16 @@ __device__ void DOPAIRGPU( #ifdef WITH_CUDA extern "C" { #endif -__device__ void DOPAIR2NONSYMGPU( - struct part_soa parts_soa, int pid, const int ci_start, - const int ci_end, const int cj_start, - const int cj_end, float d_a, float d_H, - float *vars_pair, double *d_shift_x, double *d_shift_y, double *d_shift_z, const int task_id_tmp, int flip_order) { +__device__ void DOPAIR2NONSYMGPU(struct part_soa parts_soa, int pid, + const int ci_start, const int ci_end, + const int cj_start, const int cj_end, + float d_a, float d_H, float *vars_pair, + double *d_shift_x, double *d_shift_y, + double *d_shift_z, const int task_id_tmp, + int flip_order) { - float dx = 1.f/64.f; //Value used to avoid interacting parts with themselves + float dx = + 1.f / 64.f; // Value used to avoid interacting parts with themselves float hi = 0.0, hig2 = hi * hi * kernel_gamma2; @@ -1691,17 +1736,19 @@ __device__ void DOPAIR2NONSYMGPU( float rot_uzi = 0.0; int Found_neighbours = 0; int count_i = cj_start; -// printf("first_part_in_task_blocks_cj %i last_part_in_task_blocks_cj %i last_part_in_task_blocks_ci %i\n", -// first_part_in_task_blocks_cj, last_part_in_task_blocks_cj, last_part_in_task_blocks_ci); + // printf("first_part_in_task_blocks_cj %i last_part_in_task_blocks_cj %i + // last_part_in_task_blocks_ci %i\n", + // first_part_in_task_blocks_cj, last_part_in_task_blocks_cj, + //last_part_in_task_blocks_ci); if (pid < ci_end) { - hi = parts_soa.h[pid], hig2 = hi * hi * kernel_gamma2; - mi = parts_soa.mass[pid]; - uxi = parts_soa.ux[pid]; - uyi = parts_soa.uy[pid]; - uzi = parts_soa.uz[pid]; - pix = parts_soa.x_p[pid] - d_shift_x[task_id_tmp]; - piy = parts_soa.y_p[pid] - d_shift_y[task_id_tmp]; - piz = parts_soa.z_p[pid] - d_shift_z[task_id_tmp]; + hi = parts_soa.h[pid], hig2 = hi * hi * kernel_gamma2; + mi = parts_soa.mass[pid]; + uxi = parts_soa.ux[pid]; + uyi = parts_soa.uy[pid]; + uzi = parts_soa.uz[pid]; + pix = parts_soa.x_p[pid] - d_shift_x[task_id_tmp]; + piy = parts_soa.y_p[pid] - d_shift_y[task_id_tmp]; + piz = parts_soa.z_p[pid] - 
d_shift_z[task_id_tmp]; } /*Here we use different pointers "x_p_tmp", etc. to point to different regions @@ -1718,80 +1765,82 @@ __device__ void DOPAIR2NONSYMGPU( timebin_t *timebin = (timebin_t *)&uz_tmp[BLOCK_SIZE]; /*Particles copied in blocks to shared memory*/ - for (int b = cj_start; b < cj_end; - b += BLOCK_SIZE) { - const int tid_x = threadIdx.x; - int j = b + tid_x; - x_p_tmp[tid_x] = parts_soa.x_p[j]; - y_p_tmp[tid_x] = parts_soa.y_p[j]; - z_p_tmp[tid_x] = parts_soa.z_p[j]; -// h_tmp[tid_x] = parts_soa.h[j]; - mass_tmp[tid_x] = parts_soa.mass[j]; - ux_tmp[tid_x] = parts_soa.ux[j]; - uy_tmp[tid_x] = parts_soa.uy[j]; - uz_tmp[tid_x] = parts_soa.uz[j]; - timebin[tid_x] = parts_soa.time_bin[j]; - - __syncthreads(); - const float shift_x_j = d_shift_x[task_id_tmp + flip_order]; - const float shift_y_j = d_shift_y[task_id_tmp + flip_order]; - const float shift_z_j = d_shift_z[task_id_tmp + flip_order]; - /*j_block is the particle's index in the block. Loop through particles in shared memory one by one*/ - for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) { - int jj = b + j_block; - if (jj < cj_end && pid < ci_end && pid >= ci_start) { - - const float pjx = x_p_tmp[j_block] - shift_x_j; - const float pjy = y_p_tmp[j_block] - shift_y_j; - const float pjz = z_p_tmp[j_block] - shift_z_j; - - const float xij = pix - pjx, yij = piy - pjy, zij = piz - pjz; -// const float xij = (pix - pjx) * flip_order, yij = (piy - pjy) * flip_order, zij = (piz - pjz) * flip_order; - const float r2 = xij * xij + yij * yij + zij * zij; - if (r2 < hig2) { - /* Recover some data */ - const float mj = mass_tmp[j_block]; - const float r = sqrt(r2); - /* Get the kernel for hi. */ - const float h_inv = 1.f / hi; - const float ui = r * h_inv; - float wi, wi_dx; - - d_kernel_deval(ui, &wi, &wi_dx); - - rhoi += mj * wi; - rho_dhi -= mj * (hydro_dimension * wi + ui * wi_dx); - - wcounti += wi; - wcount_dhi -= (hydro_dimension * wi + ui * wi_dx); - - const float r_inv = 1.f / r; - const float faci = mj * wi_dx * r_inv; - /* Compute dv dot r */ - const float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block], - dvz = uzi - uz_tmp[j_block]; - const float dvdr = dvx * xij + dvy * yij + dvz * zij; - /* Compute dv cross r */ - const float curlvrx = dvy * zij - dvz * yij; - const float curlvry = dvz * xij - dvx * zij; - const float curlvrz = dvx * yij - dvy * xij; - - div_vi -= faci * dvdr; - - rot_uxi += faci * curlvrx; - rot_uyi += faci * curlvry; - rot_uzi += faci * curlvrz; - } - } /*if (jj < cj_end && pid < ci_end && pid >= ci_start)*/ - } /*End of looping through particles in shared memory---Shared arrays zero'ed for next step in outer loop*/ - __syncthreads(); + for (int b = cj_start; b < cj_end; b += BLOCK_SIZE) { + const int tid_x = threadIdx.x; + int j = b + tid_x; + x_p_tmp[tid_x] = parts_soa.x_p[j]; + y_p_tmp[tid_x] = parts_soa.y_p[j]; + z_p_tmp[tid_x] = parts_soa.z_p[j]; + // h_tmp[tid_x] = parts_soa.h[j]; + mass_tmp[tid_x] = parts_soa.mass[j]; + ux_tmp[tid_x] = parts_soa.ux[j]; + uy_tmp[tid_x] = parts_soa.uy[j]; + uz_tmp[tid_x] = parts_soa.uz[j]; + timebin[tid_x] = parts_soa.time_bin[j]; + + __syncthreads(); + const float shift_x_j = d_shift_x[task_id_tmp + flip_order]; + const float shift_y_j = d_shift_y[task_id_tmp + flip_order]; + const float shift_z_j = d_shift_z[task_id_tmp + flip_order]; + /*j_block is the particle's index in the block. 
Loop through particles in + * shared memory one by one*/ + for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) { + int jj = b + j_block; + if (jj < cj_end && pid < ci_end && pid >= ci_start) { + + const float pjx = x_p_tmp[j_block] - shift_x_j; + const float pjy = y_p_tmp[j_block] - shift_y_j; + const float pjz = z_p_tmp[j_block] - shift_z_j; + + const float xij = pix - pjx, yij = piy - pjy, zij = piz - pjz; + // const float xij = (pix - pjx) * flip_order, yij = (piy - + //pjy) * flip_order, zij = (piz - pjz) * flip_order; + const float r2 = xij * xij + yij * yij + zij * zij; + if (r2 < hig2) { + /* Recover some data */ + const float mj = mass_tmp[j_block]; + const float r = sqrt(r2); + /* Get the kernel for hi. */ + const float h_inv = 1.f / hi; + const float ui = r * h_inv; + float wi, wi_dx; + + d_kernel_deval(ui, &wi, &wi_dx); + + rhoi += mj * wi; + rho_dhi -= mj * (hydro_dimension * wi + ui * wi_dx); + + wcounti += wi; + wcount_dhi -= (hydro_dimension * wi + ui * wi_dx); + + const float r_inv = 1.f / r; + const float faci = mj * wi_dx * r_inv; + /* Compute dv dot r */ + const float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block], + dvz = uzi - uz_tmp[j_block]; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + /* Compute dv cross r */ + const float curlvrx = dvy * zij - dvz * yij; + const float curlvry = dvz * xij - dvx * zij; + const float curlvrz = dvx * yij - dvy * xij; + + div_vi -= faci * dvdr; + + rot_uxi += faci * curlvrx; + rot_uyi += faci * curlvry; + rot_uzi += faci * curlvrz; + } + } /*if (jj < cj_end && pid < ci_end && pid >= ci_start)*/ + } /*End of looping through particles in shared memory---Shared arrays + zero'ed for next step in outer loop*/ + __syncthreads(); } /*Loop through parts in cell j one BLOCK_SIZE at a time*/ if (pid >= ci_start && pid < ci_end) { - parts_soa.rho[pid] = rhoi, parts_soa.rho_dh[pid] = rho_dhi; - parts_soa.wcount[pid] = wcounti, parts_soa.wcount_dh[pid] = wcount_dhi; - parts_soa.div_v[pid] = div_vi; - parts_soa.rot_ux[pid] = rot_uxi, parts_soa.rot_uy[pid] = rot_uyi; - parts_soa.rot_uz[pid] = rot_uzi; + parts_soa.rho[pid] = rhoi, parts_soa.rho_dh[pid] = rho_dhi; + parts_soa.wcount[pid] = wcounti, parts_soa.wcount_dh[pid] = wcount_dhi; + parts_soa.div_v[pid] = div_vi; + parts_soa.rot_ux[pid] = rot_uxi, parts_soa.rot_uy[pid] = rot_uyi; + parts_soa.rot_uz[pid] = rot_uzi; } } #ifdef WITH_CUDA @@ -1801,13 +1850,16 @@ __device__ void DOPAIR2NONSYMGPU( #ifdef WITH_CUDA extern "C" { #endif -__device__ void DOPAIR2NONSYMGPUAOS( - struct part_aos *parts_aos, int pid, const int ci_start, - const int ci_end, const int cj_start, - const int cj_end, float d_a, float d_H, - float *vars_pair_aos, double *d_shift_x, double *d_shift_y, double *d_shift_z, const int task_id_tmp, int flip_order) { +__device__ void DOPAIR2NONSYMGPUAOS(struct part_aos *parts_aos, int pid, + const int ci_start, const int ci_end, + const int cj_start, const int cj_end, + float d_a, float d_H, float *vars_pair_aos, + double *d_shift_x, double *d_shift_y, + double *d_shift_z, const int task_id_tmp, + int flip_order) { - float dx = 1.f/64.f; //Value used to avoid interacting parts with themselves + float dx = + 1.f / 64.f; // Value used to avoid interacting parts with themselves float hi = 0.0, hig2 = 0.0; @@ -1828,17 +1880,19 @@ __device__ void DOPAIR2NONSYMGPUAOS( float rot_uzi = 0.0; int Found_neighbours = 0; int count_i = cj_start; -// printf("first_part_in_task_blocks_cj %i last_part_in_task_blocks_cj %i last_part_in_task_blocks_ci %i\n", -// 
first_part_in_task_blocks_cj, last_part_in_task_blocks_cj, last_part_in_task_blocks_ci); + // printf("first_part_in_task_blocks_cj %i last_part_in_task_blocks_cj %i + // last_part_in_task_blocks_ci %i\n", + // first_part_in_task_blocks_cj, last_part_in_task_blocks_cj, + //last_part_in_task_blocks_ci); if (pid < ci_end) { - hi = parts_aos[pid].h, hig2 = hi * hi * kernel_gamma2; - mi = parts_aos[pid].mass; - uxi = parts_aos[pid].ux; - uyi = parts_aos[pid].uy; - uzi = parts_aos[pid].uz; - pix = parts_aos[pid].x_p;// - d_shift_x[task_id_tmp]; - piy = parts_aos[pid].y_p;// - d_shift_y[task_id_tmp]; - piz = parts_aos[pid].z_p;// - d_shift_z[task_id_tmp]; + hi = parts_aos[pid].h, hig2 = hi * hi * kernel_gamma2; + mi = parts_aos[pid].mass; + uxi = parts_aos[pid].ux; + uyi = parts_aos[pid].uy; + uzi = parts_aos[pid].uz; + pix = parts_aos[pid].x_p; // - d_shift_x[task_id_tmp]; + piy = parts_aos[pid].y_p; // - d_shift_y[task_id_tmp]; + piz = parts_aos[pid].z_p; // - d_shift_z[task_id_tmp]; } /*Here we use different pointers "x_p_tmp", etc. to point to different regions @@ -1855,82 +1909,85 @@ __device__ void DOPAIR2NONSYMGPUAOS( int *timebin = (int *)&uz_tmp[BLOCK_SIZE]; /*Particles copied in blocks to shared memory*/ - for (int b = cj_start; b < cj_end; - b += BLOCK_SIZE) { - const int tid_x = threadIdx.x; - int j = b + tid_x; - x_p_tmp[tid_x] = parts_aos[j].x_p; - y_p_tmp[tid_x] = parts_aos[j].y_p; - z_p_tmp[tid_x] = parts_aos[j].z_p; -// h_tmp[tid_x] = parts_aos[j].h; - mass_tmp[tid_x] = parts_aos[j].mass; - ux_tmp[tid_x] = parts_aos[j].ux; - uy_tmp[tid_x] = parts_aos[j].uy; - uz_tmp[tid_x] = parts_aos[j].uz; - timebin[tid_x] = parts_aos[j].time_bin; -// const float shift_x_j = d_shift_x[task_id_tmp + flip_order]; -// const float shift_y_j = d_shift_y[task_id_tmp + flip_order]; -// const float shift_z_j = d_shift_z[task_id_tmp + flip_order]; - __syncthreads(); - /*j_block is the particle's index in the block. Loop through particles in shared memory one by one*/ - for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) { - int jj = b + j_block; - if (jj < cj_end && pid < ci_end && pid >= ci_start) { - - const float pjx = x_p_tmp[j_block];// - shift_x_j; - const float pjy = y_p_tmp[j_block];// - shift_y_j; - const float pjz = z_p_tmp[j_block];// - shift_z_j; - - const float xij = pix - pjx, yij = piy - pjy, zij = piz - pjz; -// const float xij = (pix - pjx) * flip_order, yij = (piy - pjy) * flip_order, zij = (piz - pjz) * flip_order; - const float r2 = xij * xij + yij * yij + zij * zij; - if (r2 < hig2) { - /* Recover some data */ - const float mj = mass_tmp[j_block]; - const float r = sqrt(r2); - /* Get the kernel for hi. 
*/ - const float h_inv = 1.f / hi; - const float ui = r * h_inv; - float wi, wi_dx; - - d_kernel_deval(ui, &wi, &wi_dx); - - rhoi += mj * wi; - rho_dhi -= mj * (hydro_dimension * wi + ui * wi_dx); - - wcounti += wi; - wcount_dhi -= (hydro_dimension * wi + ui * wi_dx); - - const float r_inv = 1.f / r; - const float faci = mj * wi_dx * r_inv; - /* Compute dv dot r */ - const float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block], - dvz = uzi - uz_tmp[j_block]; - const float dvdr = dvx * xij + dvy * yij + dvz * zij; - /* Compute dv cross r */ - const float curlvrx = dvy * zij - dvz * yij; - const float curlvry = dvz * xij - dvx * zij; - const float curlvrz = dvx * yij - dvy * xij; - - div_vi -= faci * dvdr; - - rot_uxi += faci * curlvrx; - rot_uyi += faci * curlvry; - rot_uzi += faci * curlvrz; -// if(timebin[j_block] != 1000 && timebin[j_block] != 20)printf("incorrect timebin %i\n", timebin[j_block]); - } - } /*if (jj < cj_end && pid < ci_end && pid >= ci_start)*/ - } /*End of looping through particles in shared memory---Shared arrays zero'ed for next step in outer loop*/ - __syncthreads(); + for (int b = cj_start; b < cj_end; b += BLOCK_SIZE) { + const int tid_x = threadIdx.x; + int j = b + tid_x; + x_p_tmp[tid_x] = parts_aos[j].x_p; + y_p_tmp[tid_x] = parts_aos[j].y_p; + z_p_tmp[tid_x] = parts_aos[j].z_p; + // h_tmp[tid_x] = parts_aos[j].h; + mass_tmp[tid_x] = parts_aos[j].mass; + ux_tmp[tid_x] = parts_aos[j].ux; + uy_tmp[tid_x] = parts_aos[j].uy; + uz_tmp[tid_x] = parts_aos[j].uz; + timebin[tid_x] = parts_aos[j].time_bin; + // const float shift_x_j = d_shift_x[task_id_tmp + flip_order]; + // const float shift_y_j = d_shift_y[task_id_tmp + flip_order]; + // const float shift_z_j = d_shift_z[task_id_tmp + flip_order]; + __syncthreads(); + /*j_block is the particle's index in the block. Loop through particles in + * shared memory one by one*/ + for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) { + int jj = b + j_block; + if (jj < cj_end && pid < ci_end && pid >= ci_start) { + + const float pjx = x_p_tmp[j_block]; // - shift_x_j; + const float pjy = y_p_tmp[j_block]; // - shift_y_j; + const float pjz = z_p_tmp[j_block]; // - shift_z_j; + + const float xij = pix - pjx, yij = piy - pjy, zij = piz - pjz; + // const float xij = (pix - pjx) * flip_order, yij = (piy - + //pjy) * flip_order, zij = (piz - pjz) * flip_order; + const float r2 = xij * xij + yij * yij + zij * zij; + if (r2 < hig2) { + /* Recover some data */ + const float mj = mass_tmp[j_block]; + const float r = sqrt(r2); + /* Get the kernel for hi. 
*/ + const float h_inv = 1.f / hi; + const float ui = r * h_inv; + float wi, wi_dx; + + d_kernel_deval(ui, &wi, &wi_dx); + + rhoi += mj * wi; + rho_dhi -= mj * (hydro_dimension * wi + ui * wi_dx); + + wcounti += wi; + wcount_dhi -= (hydro_dimension * wi + ui * wi_dx); + + const float r_inv = 1.f / r; + const float faci = mj * wi_dx * r_inv; + /* Compute dv dot r */ + const float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block], + dvz = uzi - uz_tmp[j_block]; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + /* Compute dv cross r */ + const float curlvrx = dvy * zij - dvz * yij; + const float curlvry = dvz * xij - dvx * zij; + const float curlvrz = dvx * yij - dvy * xij; + + div_vi -= faci * dvdr; + + rot_uxi += faci * curlvrx; + rot_uyi += faci * curlvry; + rot_uzi += faci * curlvrz; + // if(timebin[j_block] != 1000 && timebin[j_block] != + //20)printf("incorrect timebin %i\n", timebin[j_block]); + } + } /*if (jj < cj_end && pid < ci_end && pid >= ci_start)*/ + } /*End of looping through particles in shared memory---Shared arrays + zero'ed for next step in outer loop*/ + __syncthreads(); } /*Loop through parts in cell j one BLOCK_SIZE at a time*/ if (pid >= ci_start && pid < ci_end) { -// printf("timebin %i\n", parts_aos[pid].time_bin); - parts_aos[pid].rho = rhoi, parts_aos[pid].rho_dh = rho_dhi; - parts_aos[pid].wcount = wcounti, parts_aos[pid].wcount_dh = wcount_dhi; - parts_aos[pid].div_v = div_vi; - parts_aos[pid].rot_ux = rot_uxi, parts_aos[pid].rot_uy = rot_uyi; - parts_aos[pid].rot_uz = rot_uzi; - parts_aos[pid].time_bin = 20; + // printf("timebin %i\n", parts_aos[pid].time_bin); + parts_aos[pid].rho = rhoi, parts_aos[pid].rho_dh = rho_dhi; + parts_aos[pid].wcount = wcounti, parts_aos[pid].wcount_dh = wcount_dhi; + parts_aos[pid].div_v = div_vi; + parts_aos[pid].rot_ux = rot_uxi, parts_aos[pid].rot_uy = rot_uyi; + parts_aos[pid].rot_uz = rot_uzi; + parts_aos[pid].time_bin = 20; } } #ifdef WITH_CUDA @@ -1941,10 +1998,13 @@ __device__ void DOPAIR2NONSYMGPUAOS( extern "C" { #endif __device__ void DOPAIR2NONSYMGPUAOSF4( - struct part_aos_f4_send * __restrict__ parts_send, struct part_aos_f4_recv * __restrict__ parts_recv, int pid, - const int ci_start, const int ci_end, const int cj_start, const int cj_end, float d_a, float d_H, float4 *vars_pair_aos_f4) { + struct part_aos_f4_send *__restrict__ parts_send, + struct part_aos_f4_recv *__restrict__ parts_recv, int pid, + const int ci_start, const int ci_end, const int cj_start, const int cj_end, + float d_a, float d_H, float4 *vars_pair_aos_f4) { - float dx = 1.f/64.f; //Value used to avoid interacting parts with themselves + float dx = + 1.f / 64.f; // Value used to avoid interacting parts with themselves float hi = 0.0, hig2 = 0.0; @@ -1956,77 +2016,80 @@ __device__ void DOPAIR2NONSYMGPUAOSF4( const part_aos_f4_send pi = parts_send[pid]; const float4 x_pi = pi.x_p_h; const float4 ux_pi = pi.ux_m; -// printf("first_part_in_task_blocks_cj %i last_part_in_task_blocks_cj %i last_part_in_task_blocks_ci %i\n", -// first_part_in_task_blocks_cj, last_part_in_task_blocks_cj, last_part_in_task_blocks_ci); -// if (pid < ci_end) { - hi = x_pi.w, hig2 = hi * hi * kernel_gamma2; -// } + // printf("first_part_in_task_blocks_cj %i last_part_in_task_blocks_cj %i + // last_part_in_task_blocks_ci %i\n", + // first_part_in_task_blocks_cj, last_part_in_task_blocks_cj, + //last_part_in_task_blocks_ci); + // if (pid < ci_end) { + hi = x_pi.w, hig2 = hi * hi * kernel_gamma2; + // } /*Here we use different pointers "x_p_tmp", etc. 
to point to different regions * of the single shared memory space "vars" which we allocate in kernel * invocation*/ - float4 * __restrict__ x_p_h_tmp = (float4 *)&vars_pair_aos_f4[0]; - float4 * __restrict__ ux_m_tmp = (float4 *)&vars_pair_aos_f4[BLOCK_SIZE]; + float4 *__restrict__ x_p_h_tmp = (float4 *)&vars_pair_aos_f4[0]; + float4 *__restrict__ ux_m_tmp = (float4 *)&vars_pair_aos_f4[BLOCK_SIZE]; /*Particles copied in blocks to shared memory*/ - for (int b = cj_start; b < cj_end; - b += BLOCK_SIZE) { - const int tid_x = threadIdx.x; - int j = b + tid_x; + for (int b = cj_start; b < cj_end; b += BLOCK_SIZE) { + const int tid_x = threadIdx.x; + int j = b + tid_x; struct part_aos_f4_send pj = parts_send[j]; - x_p_h_tmp[tid_x] = pj.x_p_h; - ux_m_tmp[tid_x] = pj.ux_m; - __syncthreads(); - /*j_block is the particle's index in the block. Loop through particles in shared memory one by one*/ - for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) { - int jj = b + j_block; - if (jj < cj_end && pid < ci_end && pid >= ci_start) { - - const float4 x_p_h_j = x_p_h_tmp[j_block]; + x_p_h_tmp[tid_x] = pj.x_p_h; + ux_m_tmp[tid_x] = pj.ux_m; + __syncthreads(); + /*j_block is the particle's index in the block. Loop through particles in + * shared memory one by one*/ + for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) { + int jj = b + j_block; + if (jj < cj_end && pid < ci_end && pid >= ci_start) { + + const float4 x_p_h_j = x_p_h_tmp[j_block]; const float4 ux_m_j = ux_m_tmp[j_block]; - const float xij = x_pi.x - x_p_h_j.x, yij = x_pi.y - x_p_h_j.y, - zij = x_pi.z - x_p_h_j.z; - const float r2 = xij * xij + yij * yij + zij * zij; - if (r2 < hig2) { - /* Recover some data */ - const float mj = ux_m_j.w; - const float r = sqrt(r2); - /* Get the kernel for hi. */ - const float h_inv = 1.f / hi; - const float ui = r * h_inv; - float wi, wi_dx; - - d_kernel_deval(ui, &wi, &wi_dx); - /*Add to sums of rho, rho_dh, wcount and wcount_dh*/ - res_rho.x += mj * wi; - res_rho.y -= mj * (hydro_dimension * wi + ui * wi_dx); - res_rho.z += wi; - res_rho.w -= (hydro_dimension * wi + ui * wi_dx); - - const float r_inv = 1.f / r; - const float faci = mj * wi_dx * r_inv; - /* Compute dv dot r */ - const float dvx = ux_pi.x - ux_m_j.x, dvy = ux_pi.y - ux_m_j.y, - dvz = ux_pi.z - ux_m_j.z; - const float dvdr = dvx * xij + dvy * yij + dvz * zij; - /* Compute dv cross r */ - const float curlvrx = dvy * zij - dvz * yij; - const float curlvry = dvz * xij - dvx * zij; - const float curlvrz = dvx * yij - dvy * xij; - - res_rot.x += faci * curlvrx; - res_rot.y += faci * curlvry; - res_rot.z += faci * curlvrz; - res_rot.w -= faci * dvdr; - } - } /*if (jj < cj_end && pid < ci_end && pid >= ci_start)*/ - } /*End of looping through particles in shared memory---Shared arrays zero'ed for next step in outer loop*/ - __syncthreads(); + const float xij = x_pi.x - x_p_h_j.x, yij = x_pi.y - x_p_h_j.y, + zij = x_pi.z - x_p_h_j.z; + const float r2 = xij * xij + yij * yij + zij * zij; + if (r2 < hig2) { + /* Recover some data */ + const float mj = ux_m_j.w; + const float r = sqrt(r2); + /* Get the kernel for hi. 
*/ + const float h_inv = 1.f / hi; + const float ui = r * h_inv; + float wi, wi_dx; + + d_kernel_deval(ui, &wi, &wi_dx); + /*Add to sums of rho, rho_dh, wcount and wcount_dh*/ + res_rho.x += mj * wi; + res_rho.y -= mj * (hydro_dimension * wi + ui * wi_dx); + res_rho.z += wi; + res_rho.w -= (hydro_dimension * wi + ui * wi_dx); + + const float r_inv = 1.f / r; + const float faci = mj * wi_dx * r_inv; + /* Compute dv dot r */ + const float dvx = ux_pi.x - ux_m_j.x, dvy = ux_pi.y - ux_m_j.y, + dvz = ux_pi.z - ux_m_j.z; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + /* Compute dv cross r */ + const float curlvrx = dvy * zij - dvz * yij; + const float curlvry = dvz * xij - dvx * zij; + const float curlvrz = dvx * yij - dvy * xij; + + res_rot.x += faci * curlvrx; + res_rot.y += faci * curlvry; + res_rot.z += faci * curlvrz; + res_rot.w -= faci * dvdr; + } + } /*if (jj < cj_end && pid < ci_end && pid >= ci_start)*/ + } /*End of looping through particles in shared memory---Shared arrays + zero'ed for next step in outer loop*/ + __syncthreads(); } /*Loop through parts in cell j one BLOCK_SIZE at a time*/ if (pid >= ci_start && pid < ci_end) { parts_recv[pid].rho_dh_wcount = res_rho; - parts_recv[pid].rot_ux_div_v = res_rot; + parts_recv[pid].rot_ux_div_v = res_rot; } } #ifdef WITH_CUDA @@ -2036,11 +2099,14 @@ __device__ void DOPAIR2NONSYMGPUAOSF4( #ifdef WITH_CUDA extern "C" { #endif -__device__ void DOPAIR2NAIVEGPUAOSF4(const struct part_aos_f4_send pi, - struct part_aos_f4_send * __restrict__ parts_send, struct part_aos_f4_recv * __restrict__ parts_recv, int pid, - const int cj_start, const int cj_end, float d_a, float d_H) { +__device__ void DOPAIR2NAIVEGPUAOSF4( + const struct part_aos_f4_send pi, + struct part_aos_f4_send *__restrict__ parts_send, + struct part_aos_f4_recv *__restrict__ parts_recv, int pid, + const int cj_start, const int cj_end, float d_a, float d_H) { - float dx = 1.f/64.f; //Value used to avoid interacting parts with themselves + float dx = + 1.f / 64.f; // Value used to avoid interacting parts with themselves float hi = 0.0, hig2 = 0.0; @@ -2049,64 +2115,66 @@ __device__ void DOPAIR2NAIVEGPUAOSF4(const struct part_aos_f4_send pi, float4 res_rho = {0.0, 0.0, 0.0, 0.0}; float4 res_rot = {0.0, 0.0, 0.0, 0.0}; -// const part_aos_f4_send pi = parts_send[pid]; + // const part_aos_f4_send pi = parts_send[pid]; const float4 x_pi = pi.x_p_h; const float4 ux_pi = pi.ux_m; -// printf("first_part_in_task_blocks_cj %i last_part_in_task_blocks_cj %i last_part_in_task_blocks_ci %i\n", -// first_part_in_task_blocks_cj, last_part_in_task_blocks_cj, last_part_in_task_blocks_ci); -// if (pid < ci_end) { - hi = x_pi.w, hig2 = hi * hi * kernel_gamma2; -// } - -// printf("js %i je %i\n", cj_start, cj_end); + // printf("first_part_in_task_blocks_cj %i last_part_in_task_blocks_cj %i + // last_part_in_task_blocks_ci %i\n", + // first_part_in_task_blocks_cj, last_part_in_task_blocks_cj, + //last_part_in_task_blocks_ci); + // if (pid < ci_end) { + hi = x_pi.w, hig2 = hi * hi * kernel_gamma2; + // } + + // printf("js %i je %i\n", cj_start, cj_end); /*Particles copied in blocks to shared memory*/ - for (int j = cj_start; j < cj_end; j ++) { + for (int j = cj_start; j < cj_end; j++) { struct part_aos_f4_send pj = parts_send[j]; - const float4 x_p_h_j = pj.x_p_h; - const float4 ux_m_j = pj.ux_m; - - const float xij = x_pi.x - x_p_h_j.x, yij = x_pi.y - x_p_h_j.y, - zij = x_pi.z - x_p_h_j.z; - const float r2 = xij * xij + yij * yij + zij * zij; -// printf("r2 %f \n", r2); - if (r2 < hig2) { - /* 
Recover some data */ - const float mj = ux_m_j.w; - const float r = sqrt(r2); - /* Get the kernel for hi. */ - const float h_inv = 1.f / hi; - const float ui = r * h_inv; - float wi, wi_dx; - - d_kernel_deval(ui, &wi, &wi_dx); - /*Add to sums of rho, rho_dh, wcount and wcount_dh*/ - res_rho.x += mj * wi; - res_rho.y -= mj * (hydro_dimension * wi + ui * wi_dx); - res_rho.z += wi; - res_rho.w -= (hydro_dimension * wi + ui * wi_dx); - - const float r_inv = 1.f / r; - const float faci = mj * wi_dx * r_inv; - /* Compute dv dot r */ - const float dvx = ux_pi.x - ux_m_j.x, dvy = ux_pi.y - ux_m_j.y, - dvz = ux_pi.z - ux_m_j.z; - const float dvdr = dvx * xij + dvy * yij + dvz * zij; - /* Compute dv cross r */ - const float curlvrx = dvy * zij - dvz * yij; - const float curlvry = dvz * xij - dvx * zij; - const float curlvrz = dvx * yij - dvy * xij; - - res_rot.x += faci * curlvrx; - res_rot.y += faci * curlvry; - res_rot.z += faci * curlvrz; - res_rot.w -= faci * dvdr; - } + const float4 x_p_h_j = pj.x_p_h; + const float4 ux_m_j = pj.ux_m; + + const float xij = x_pi.x - x_p_h_j.x, yij = x_pi.y - x_p_h_j.y, + zij = x_pi.z - x_p_h_j.z; + const float r2 = xij * xij + yij * yij + zij * zij; + // printf("r2 %f \n", r2); + if (r2 < hig2) { + /* Recover some data */ + const float mj = ux_m_j.w; + const float r = sqrt(r2); + /* Get the kernel for hi. */ + const float h_inv = 1.f / hi; + const float ui = r * h_inv; + float wi, wi_dx; + + d_kernel_deval(ui, &wi, &wi_dx); + /*Add to sums of rho, rho_dh, wcount and wcount_dh*/ + res_rho.x += mj * wi; + res_rho.y -= mj * (hydro_dimension * wi + ui * wi_dx); + res_rho.z += wi; + res_rho.w -= (hydro_dimension * wi + ui * wi_dx); + + const float r_inv = 1.f / r; + const float faci = mj * wi_dx * r_inv; + /* Compute dv dot r */ + const float dvx = ux_pi.x - ux_m_j.x, dvy = ux_pi.y - ux_m_j.y, + dvz = ux_pi.z - ux_m_j.z; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + /* Compute dv cross r */ + const float curlvrx = dvy * zij - dvz * yij; + const float curlvry = dvz * xij - dvx * zij; + const float curlvrz = dvx * yij - dvy * xij; + + res_rot.x += faci * curlvrx; + res_rot.y += faci * curlvry; + res_rot.z += faci * curlvrz; + res_rot.w -= faci * dvdr; + } } /*Loop through parts in cell j one BLOCK_SIZE at a time*/ -// if (pid >= ci_start && pid < ci_end) { - parts_recv[pid].rho_dh_wcount = res_rho; - parts_recv[pid].rot_ux_div_v = res_rot; -// } + // if (pid >= ci_start && pid < ci_end) { + parts_recv[pid].rho_dh_wcount = res_rho; + parts_recv[pid].rot_ux_div_v = res_rot; + // } } #ifdef WITH_CUDA } @@ -2115,13 +2183,16 @@ __device__ void DOPAIR2NAIVEGPUAOSF4(const struct part_aos_f4_send pi, #ifdef WITH_CUDA extern "C" { #endif -__device__ void DOPAIR2NONSYMGPUAOSG( - struct part_aos_g *parts_aos, int pid, const int ci_start, - const int ci_end, const int cj_start, - const int cj_end, float d_a, float d_H, - float *vars_pair_aosg, double *d_shift_x, double *d_shift_y, double *d_shift_z, const int task_id_tmp, int flip_order) { +__device__ void DOPAIR2NONSYMGPUAOSG(struct part_aos_g *parts_aos, int pid, + const int ci_start, const int ci_end, + const int cj_start, const int cj_end, + float d_a, float d_H, + float *vars_pair_aosg, double *d_shift_x, + double *d_shift_y, double *d_shift_z, + const int task_id_tmp, int flip_order) { - float dx = 1.f/64.f; //Value used to avoid interacting parts with themselves + float dx = + 1.f / 64.f; // Value used to avoid interacting parts with themselves float hi = 0.0, hig2 = 0.0; @@ -2143,20 +2214,20 @@ __device__ void 
DOPAIR2NONSYMGPUAOSG( int count_i = cj_start; if (pid < ci_end) { - hi = parts_aos[pid].h, hig2 = hi * hi * kernel_gamma2; - mi = parts_aos[pid].mass; - uxi = parts_aos[pid].ux; - uyi = parts_aos[pid].uy; - uzi = parts_aos[pid].uz; - ci = parts_aos[pid].soundspeed; - v_sig = parts_aos[pid].v_sig; - u = parts_aos[pid].u; - laplace_u = parts_aos[pid].laplace_u; - alpha_visc_max_ngb = parts_aos[pid].alpha_visc_max_ngb; - - pix = parts_aos[pid].x_p - d_shift_x[task_id_tmp]; - piy = parts_aos[pid].y_p - d_shift_y[task_id_tmp]; - piz = parts_aos[pid].z_p - d_shift_z[task_id_tmp]; + hi = parts_aos[pid].h, hig2 = hi * hi * kernel_gamma2; + mi = parts_aos[pid].mass; + uxi = parts_aos[pid].ux; + uyi = parts_aos[pid].uy; + uzi = parts_aos[pid].uz; + ci = parts_aos[pid].soundspeed; + v_sig = parts_aos[pid].v_sig; + u = parts_aos[pid].u; + laplace_u = parts_aos[pid].laplace_u; + alpha_visc_max_ngb = parts_aos[pid].alpha_visc_max_ngb; + + pix = parts_aos[pid].x_p - d_shift_x[task_id_tmp]; + piy = parts_aos[pid].y_p - d_shift_y[task_id_tmp]; + piz = parts_aos[pid].z_p - d_shift_z[task_id_tmp]; } /*Here we use different pointers "x_p_tmp", etc. to point to different regions @@ -2177,10 +2248,9 @@ __device__ void DOPAIR2NONSYMGPUAOSG( int *timebin = (int *)&vars_pair_aosg[BLOCK_SIZE * 12]; /*Particles copied in blocks to shared memory*/ - for (int b = cj_start; b < cj_end; - b += BLOCK_SIZE) { - const int tid_x = threadIdx.x; - int j = b + tid_x; + for (int b = cj_start; b < cj_end; b += BLOCK_SIZE) { + const int tid_x = threadIdx.x; + int j = b + tid_x; x_p_tmp[threadIdx.x] = parts_aos[j].x_p; y_p_tmp[threadIdx.x] = parts_aos[j].y_p; z_p_tmp[threadIdx.x] = parts_aos[j].z_p; @@ -2194,49 +2264,52 @@ __device__ void DOPAIR2NONSYMGPUAOSG( alpha_tmp[threadIdx.x] = parts_aos[j].visc_alpha; u_tmp[threadIdx.x] = parts_aos[j].u; rho_tmp[threadIdx.x] = parts_aos[j].rho; - const float shift_x_j = d_shift_x[task_id_tmp + flip_order]; - const float shift_y_j = d_shift_y[task_id_tmp + flip_order]; - const float shift_z_j = d_shift_z[task_id_tmp + flip_order]; + const float shift_x_j = d_shift_x[task_id_tmp + flip_order]; + const float shift_y_j = d_shift_y[task_id_tmp + flip_order]; + const float shift_z_j = d_shift_z[task_id_tmp + flip_order]; __syncthreads(); - /*j_block is the particle's index in the block. Loop through particles in shared memory one by one*/ - for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) { - int jj = b + j_block; - if (jj < cj_end && pid < ci_end && pid >= ci_start) { - - const float pjx = x_p_tmp[j_block] - shift_x_j; - const float pjy = y_p_tmp[j_block] - shift_y_j; - const float pjz = z_p_tmp[j_block] - shift_z_j; - const float xij = pix - pjx, yij = piy - pjy, zij = piz - pjz; - const float r2 = xij * xij + yij * yij + zij * zij; - if (r2 < hig2) { - /* Recover some data */ - const float mj = mass_tmp[j_block]; - const float r = sqrt(r2); - const float r_inv = 1.f / r; - /* Get the kernel for hi. */ - const float h_inv = 1.f / hi; - float wi, wi_dx; - /* Cosmology terms for the signal velocity */ - const float fac_mu = d_pow_three_gamma_minus_five_over_two(d_a); - const float a2_Hubble = d_a * d_a * d_H; - /* Compute dv dot r */ - const float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block], - dvz = uzi - uz_tmp[j_block]; - const float dvdr = dvx * xij + dvy * yij + dvz * zij; - /* Add Hubble flow */ - const float dvdr_Hubble = dvdr + a2_Hubble * r2; - /* Are the particles moving towards each others ? 
*/ - const float omega_ij = min(dvdr_Hubble, 0.f); - const float mu_ij = fac_mu * r_inv * omega_ij; /* This is 0 or negative */ + /*j_block is the particle's index in the block. Loop through particles in + * shared memory one by one*/ + for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) { + int jj = b + j_block; + if (jj < cj_end && pid < ci_end && pid >= ci_start) { + + const float pjx = x_p_tmp[j_block] - shift_x_j; + const float pjy = y_p_tmp[j_block] - shift_y_j; + const float pjz = z_p_tmp[j_block] - shift_z_j; + const float xij = pix - pjx, yij = piy - pjy, zij = piz - pjz; + const float r2 = xij * xij + yij * yij + zij * zij; + if (r2 < hig2) { + /* Recover some data */ + const float mj = mass_tmp[j_block]; + const float r = sqrt(r2); + const float r_inv = 1.f / r; + /* Get the kernel for hi. */ + const float h_inv = 1.f / hi; + float wi, wi_dx; + /* Cosmology terms for the signal velocity */ + const float fac_mu = d_pow_three_gamma_minus_five_over_two(d_a); + const float a2_Hubble = d_a * d_a * d_H; + /* Compute dv dot r */ + const float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block], + dvz = uzi - uz_tmp[j_block]; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + /* Add Hubble flow */ + const float dvdr_Hubble = dvdr + a2_Hubble * r2; + /* Are the particles moving towards each others ? */ + const float omega_ij = min(dvdr_Hubble, 0.f); + const float mu_ij = + fac_mu * r_inv * omega_ij; /* This is 0 or negative */ /* Signal velocity */ - const float new_v_sig = ci + cj_tmp[j_block] - const_viscosity_beta * mu_ij; + const float new_v_sig = + ci + cj_tmp[j_block] - const_viscosity_beta * mu_ij; /* Update if we need to */ v_sig = max(v_sig, new_v_sig); /* Calculate Del^2 u for the thermal diffusion coefficient. */ /* Need to get some kernel values F_ij = wi_dx */ - const float ui = r * h_inv; - d_kernel_deval(ui, &wi, &wi_dx); + const float ui = r * h_inv; + d_kernel_deval(ui, &wi, &wi_dx); const float delta_u_factor = (u - u_tmp[j_block]) * r_inv; laplace_u += mj * delta_u_factor * wi_dx / rho_tmp[j_block]; @@ -2245,10 +2318,11 @@ __device__ void DOPAIR2NONSYMGPUAOSG( * (this is used to limit the diffusion in hydro_prepare_force) */ const float alpha_j = alpha_tmp[j_block]; alpha_visc_max_ngb = max(alpha_visc_max_ngb, alpha_j); - } - } /*if (jj < cj_end && pid < ci_end && pid >= ci_start)*/ - } /*End of looping through particles in shared memory---Shared arrays zero'ed for next step in outer loop*/ - __syncthreads(); + } + } /*if (jj < cj_end && pid < ci_end && pid >= ci_start)*/ + } /*End of looping through particles in shared memory---Shared arrays + zero'ed for next step in outer loop*/ + __syncthreads(); } /*Loop through parts in cell j one BLOCK_SIZE at a time*/ if (pid >= ci_start && pid < ci_end) { parts_aos[pid].v_sig = v_sig, parts_aos[pid].laplace_u = laplace_u; @@ -2259,15 +2333,17 @@ __device__ void DOPAIR2NONSYMGPUAOSG( } #endif - #ifdef WITH_CUDA extern "C" { #endif -__device__ void DOPAIR2NAIVEGPUAOSF4G(const struct part_aos_f4_g_send pi, - struct part_aos_f4_g_send * __restrict__ parts_send, struct part_aos_f4_g_recv * __restrict__ parts_recv, int pid, - const int cj_start, const int cj_end, float d_a, float d_H) { +__device__ void DOPAIR2NAIVEGPUAOSF4G( + const struct part_aos_f4_g_send pi, + struct part_aos_f4_g_send *__restrict__ parts_send, + struct part_aos_f4_g_recv *__restrict__ parts_recv, int pid, + const int cj_start, const int cj_end, float d_a, float d_H) { - float dx = 1.f/64.f; //Value used to avoid interacting parts with themselves 
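
Aside, not part of the patch: every gradient and force loop in this file forms the same pairwise signal velocity from the relative radial velocity, the Hubble-flow correction and the two sound speeds. A stand-alone sketch of that step; signal_velocity is an invented helper, and fac_mu, a2_Hubble and beta stand for the values the kernels derive from d_a, d_H and const_viscosity_beta.

#include <math.h>

/* Pairwise signal velocity as formed in the gradient and force loops. */
__host__ __device__ static inline float signal_velocity(
    const float dvdr,               /* dv . dx for the pair           */
    const float r2,                 /* squared separation             */
    const float ci, const float cj, /* particle sound speeds          */
    const float fac_mu,             /* cosmological velocity factor   */
    const float a2_Hubble,          /* a^2 * H                        */
    const float beta) {             /* viscosity beta constant        */
  const float r_inv = 1.f / sqrtf(r2);
  const float dvdr_Hubble = dvdr + a2_Hubble * r2; /* add the Hubble flow    */
  const float omega_ij = fminf(dvdr_Hubble, 0.f);  /* approaching pairs only */
  const float mu_ij = fac_mu * r_inv * omega_ij;   /* zero or negative       */
  return ci + cj - beta * mu_ij;
}
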
+ float dx = + 1.f / 64.f; // Value used to avoid interacting parts with themselves float hi = 0.0, hig2 = 0.0; @@ -2276,85 +2352,91 @@ __device__ void DOPAIR2NAIVEGPUAOSF4G(const struct part_aos_f4_g_send pi, float4 res_rho = {0.0, 0.0, 0.0, 0.0}; float4 res_rot = {0.0, 0.0, 0.0, 0.0}; -// const part_aos_f4_send pi = parts_send[pid]; + // const part_aos_f4_send pi = parts_send[pid]; const float4 x_h_i = pi.x_h; const float4 ux_m_i = pi.ux_m; const float4 rho_avisc_u_c_i = pi.rho_avisc_u_c; float3 vsig_lapu_aviscmax_i = {0.f, 0.f, 0.f}; -// printf("first_part_in_task_blocks_cj %i last_part_in_task_blocks_cj %i last_part_in_task_blocks_ci %i\n", -// first_part_in_task_blocks_cj, last_part_in_task_blocks_cj, last_part_in_task_blocks_ci); -// if (pid < ci_end) { - hi = x_h_i.w, hig2 = hi * hi * kernel_gamma2; -// } + // printf("first_part_in_task_blocks_cj %i last_part_in_task_blocks_cj %i + // last_part_in_task_blocks_ci %i\n", + // first_part_in_task_blocks_cj, last_part_in_task_blocks_cj, + //last_part_in_task_blocks_ci); + // if (pid < ci_end) { + hi = x_h_i.w, hig2 = hi * hi * kernel_gamma2; + // } -// printf("js %i je %i\n", cj_start, cj_end); + // printf("js %i je %i\n", cj_start, cj_end); /*Particles copied in blocks to shared memory*/ - for (int j = cj_start; j < cj_end; j ++) { + for (int j = cj_start; j < cj_end; j++) { struct part_aos_f4_g_send pj = parts_send[j]; - const float4 x_h_j = pj.x_h; - const float4 ux_m_j = pj.ux_m; + const float4 x_h_j = pj.x_h; + const float4 ux_m_j = pj.ux_m; const float4 rho_avisc_u_c_j = pj.rho_avisc_u_c; - const float xij = x_h_i.x - x_h_j.x, yij = x_h_i.y - x_h_j.y, zij = x_h_i.z - x_h_j.z; - const float r2 = xij * xij + yij * yij + zij * zij; -// printf("r2 %f \n", r2); - if (r2 < hig2) { - const float r = sqrt(r2); - const float r_inv = 1.f / r; - /* Recover some data */ - const float mj = ux_m_j.w; - /* Get the kernel for hi. */ - const float h_inv = 1.f / hi; - float wi, wi_dx; - /* Cosmology terms for the signal velocity */ - const float fac_mu = d_pow_three_gamma_minus_five_over_two(d_a); - const float a2_Hubble = d_a * d_a * d_H; - /* Compute dv dot r */ - float dvx = ux_m_i.x - ux_m_j.x, dvy = ux_m_i.y - ux_m_j.y, - dvz = ux_m_i.z - ux_m_j.z; - const float dvdr = dvx * xij + dvy * yij + dvz * zij; - /* Add Hubble flow */ - const float dvdr_Hubble = dvdr + a2_Hubble * r2; - /* Are the particles moving towards each others ? */ - const float omega_ij = min(dvdr_Hubble, 0.f); - const float mu_ij = fac_mu * r_inv * omega_ij; /* This is 0 or negative */ - - /* Signal velocity */ - const float new_v_sig = rho_avisc_u_c_i.w + rho_avisc_u_c_j.w - const_viscosity_beta * mu_ij; - /* Update if we need to */ - vsig_lapu_aviscmax_i.x = fmaxf(vsig_lapu_aviscmax_i.x, new_v_sig); - /* Calculate Del^2 u for the thermal diffusion coefficient. 
*/ - /* Need to get some kernel values F_ij = wi_dx */ - const float ui = r * h_inv; - d_kernel_deval(ui, &wi, &wi_dx); - - const float delta_u_factor = (rho_avisc_u_c_i.z - rho_avisc_u_c_j.z) * r_inv; - vsig_lapu_aviscmax_i.y += mj * delta_u_factor * wi_dx / rho_avisc_u_c_j.x; - - /* Set the maximal alpha from the previous step over the neighbours - * (this is used to limit the diffusion in hydro_prepare_force) */ - const float alpha_j = rho_avisc_u_c_j.y; - vsig_lapu_aviscmax_i.z = fmaxf(vsig_lapu_aviscmax_i.z, alpha_j); - } + const float xij = x_h_i.x - x_h_j.x, yij = x_h_i.y - x_h_j.y, + zij = x_h_i.z - x_h_j.z; + const float r2 = xij * xij + yij * yij + zij * zij; + // printf("r2 %f \n", r2); + if (r2 < hig2) { + const float r = sqrt(r2); + const float r_inv = 1.f / r; + /* Recover some data */ + const float mj = ux_m_j.w; + /* Get the kernel for hi. */ + const float h_inv = 1.f / hi; + float wi, wi_dx; + /* Cosmology terms for the signal velocity */ + const float fac_mu = d_pow_three_gamma_minus_five_over_two(d_a); + const float a2_Hubble = d_a * d_a * d_H; + /* Compute dv dot r */ + float dvx = ux_m_i.x - ux_m_j.x, dvy = ux_m_i.y - ux_m_j.y, + dvz = ux_m_i.z - ux_m_j.z; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + /* Add Hubble flow */ + const float dvdr_Hubble = dvdr + a2_Hubble * r2; + /* Are the particles moving towards each others ? */ + const float omega_ij = min(dvdr_Hubble, 0.f); + const float mu_ij = fac_mu * r_inv * omega_ij; /* This is 0 or negative */ + + /* Signal velocity */ + const float new_v_sig = + rho_avisc_u_c_i.w + rho_avisc_u_c_j.w - const_viscosity_beta * mu_ij; + /* Update if we need to */ + vsig_lapu_aviscmax_i.x = fmaxf(vsig_lapu_aviscmax_i.x, new_v_sig); + /* Calculate Del^2 u for the thermal diffusion coefficient. 
*/ + /* Need to get some kernel values F_ij = wi_dx */ + const float ui = r * h_inv; + d_kernel_deval(ui, &wi, &wi_dx); + + const float delta_u_factor = + (rho_avisc_u_c_i.z - rho_avisc_u_c_j.z) * r_inv; + vsig_lapu_aviscmax_i.y += mj * delta_u_factor * wi_dx / rho_avisc_u_c_j.x; + + /* Set the maximal alpha from the previous step over the neighbours + * (this is used to limit the diffusion in hydro_prepare_force) */ + const float alpha_j = rho_avisc_u_c_j.y; + vsig_lapu_aviscmax_i.z = fmaxf(vsig_lapu_aviscmax_i.z, alpha_j); + } } /*Loop through parts in cell j one BLOCK_SIZE at a time*/ -// if (pid >= ci_start && pid < ci_end) { + // if (pid >= ci_start && pid < ci_end) { parts_recv[pid].vsig_lapu_aviscmax = vsig_lapu_aviscmax_i; -// } + // } } #ifdef WITH_CUDA } #endif - #ifdef WITH_CUDA extern "C" { #endif -__device__ void DOPAIR2NONSYMGPUAOSF( - struct part_aos_f *parts_aos, int pid, const int ci_start, - const int ci_end, const int cj_start, - const int cj_end, float d_a, float d_H, - float *vars_pair_aosf, double *d_shift_x, double *d_shift_y, double *d_shift_z, const int task_id_tmp, int flip_order) { +__device__ void DOPAIR2NONSYMGPUAOSF(struct part_aos_f *parts_aos, int pid, + const int ci_start, const int ci_end, + const int cj_start, const int cj_end, + float d_a, float d_H, + float *vars_pair_aosf, double *d_shift_x, + double *d_shift_y, double *d_shift_z, + const int task_id_tmp, int flip_order) { float ci = 0.0, cj = 0.0; float hi = 0.0, hig2 = 0.0; @@ -2384,30 +2466,30 @@ __device__ void DOPAIR2NONSYMGPUAOSF( float h_dti = 0.0; int min_ngb_time_bin = 0; if (pid < ci_end) { - hi = parts_aos[pid].h, hig2 = hi * hi * kernel_gamma2; - mi = parts_aos[pid].mass; - uxi = parts_aos[pid].ux; - uyi = parts_aos[pid].uy; - uzi = parts_aos[pid].uz; - ci = parts_aos[pid].soundspeed; - fi = parts_aos[pid].f; - v_sigi = parts_aos[pid].v_sig; - ui = parts_aos[pid].u; - rhoi = parts_aos[pid].rho; - pressurei = parts_aos[pid].pressure; - balsarai = parts_aos[pid].balsara; - alphavisci = parts_aos[pid].alpha_visc; - alphadiffi = parts_aos[pid].alpha_diff; - min_ngb_time_bin = parts_aos[pid].min_ngb_time_bin; - pix = parts_aos[pid].x_p - d_shift_x[task_id_tmp]; - piy = parts_aos[pid].y_p - d_shift_y[task_id_tmp]; - piz = parts_aos[pid].z_p - d_shift_z[task_id_tmp]; + hi = parts_aos[pid].h, hig2 = hi * hi * kernel_gamma2; + mi = parts_aos[pid].mass; + uxi = parts_aos[pid].ux; + uyi = parts_aos[pid].uy; + uzi = parts_aos[pid].uz; + ci = parts_aos[pid].soundspeed; + fi = parts_aos[pid].f; + v_sigi = parts_aos[pid].v_sig; + ui = parts_aos[pid].u; + rhoi = parts_aos[pid].rho; + pressurei = parts_aos[pid].pressure; + balsarai = parts_aos[pid].balsara; + alphavisci = parts_aos[pid].alpha_visc; + alphadiffi = parts_aos[pid].alpha_diff; + min_ngb_time_bin = parts_aos[pid].min_ngb_time_bin; + pix = parts_aos[pid].x_p - d_shift_x[task_id_tmp]; + piy = parts_aos[pid].y_p - d_shift_y[task_id_tmp]; + piz = parts_aos[pid].z_p - d_shift_z[task_id_tmp]; } -// if (threadIdx.x == 0) { -// first_part_tid_0 = first_part; -// last_part_tid_0 = last_part; -// } -// __syncthreads(); + // if (threadIdx.x == 0) { + // first_part_tid_0 = first_part; + // last_part_tid_0 = last_part; + // } + // __syncthreads(); int n_neighbours = 0; /*Here we use different pointers "x_p_tmp", etc. 
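
Aside, not part of the patch: the gradient loops accumulate an SPH estimate of Del^2 u for the thermal-diffusion switch as laplace_u += m_j * (u_i - u_j) * wi_dx / (r * rho_j). A per-neighbour sketch of that contribution; laplace_u_term is an invented helper, and wi_dx is assumed to be the kernel derivative that d_kernel_deval returns.

/* One neighbour's contribution to the Del^2 u estimate driving the
 * thermal-diffusion switch. */
__host__ __device__ static inline float laplace_u_term(
    const float ui, const float uj,   /* internal energies of i and j */
    const float mj, const float rhoj, /* neighbour mass and density   */
    const float r,                    /* separation                   */
    const float wi_dx) {              /* kernel derivative at r / h_i */
  const float delta_u_factor = (ui - uj) / r;
  return mj * delta_u_factor * wi_dx / rhoj;
}
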
to point to different regions * of the single shared memory space "vars" which we allocate in kernel @@ -2430,162 +2512,173 @@ __device__ void DOPAIR2NONSYMGPUAOSF( float *balsara_tmp = (float *)&vars_pair_aosf[BLOCK_SIZE * 15]; int *timebin = (int *)&vars_pair_aosf[BLOCK_SIZE * 16]; /*Particles copied in blocks to shared memory*/ - for (int b = cj_start; b < cj_end; - b += BLOCK_SIZE) { - int j = b + threadIdx.x; - x_p_tmp[threadIdx.x] = parts_aos[j].x_p; - y_p_tmp[threadIdx.x] = parts_aos[j].y_p; - z_p_tmp[threadIdx.x] = parts_aos[j].z_p; - h_tmp[threadIdx.x] = parts_aos[j].h; - mass_tmp[threadIdx.x] = parts_aos[j].mass; - ux_tmp[threadIdx.x] = parts_aos[j].ux; - uy_tmp[threadIdx.x] = parts_aos[j].uy; - uz_tmp[threadIdx.x] = parts_aos[j].uz; - timebin[threadIdx.x] = parts_aos[j].time_bin; - cj_tmp[threadIdx.x] = parts_aos[j].soundspeed; -// alpha_tmp[threadIdx.x] = parts_aos[j].visc_alpha; - u_tmp[threadIdx.x] = parts_aos[j].u; - rho_tmp[threadIdx.x] = parts_aos[j].rho; - alphavisc_tmp[threadIdx.x] = parts_aos[j].alpha_visc; - alphadiff_tmp[threadIdx.x] = parts_aos[j].alpha_diff; - pressure_tmp[threadIdx.x] = parts_aos[j].pressure; - f_tmp[threadIdx.x] = parts_aos[j].f; - balsara_tmp[threadIdx.x] = parts_aos[j].balsara; - const float shift_x_j = d_shift_x[task_id_tmp + flip_order]; - const float shift_y_j = d_shift_y[task_id_tmp + flip_order]; - const float shift_z_j = d_shift_z[task_id_tmp + flip_order]; - __syncthreads(); - /*j_block is the particle's index in the block. Loop through particles in shared memory one by one*/ - for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) { - int jj = b + j_block; - if (jj < cj_end && pid < ci_end && pid >= ci_start) { - /* Compute the pairwise distance. */ - const float pjx = x_p_tmp[j_block] - shift_x_j; - const float pjy = y_p_tmp[j_block] - shift_y_j; - const float pjz = z_p_tmp[j_block] - shift_z_j; - const float xij = pix - pjx, yij = piy - pjy, zij = piz - pjz; - const float r2 = xij * xij + yij * yij + zij * zij; - if (r2 < hig2) { - - // /* Cosmology terms for the signal velocity */ - const float fac_mu = d_pow_three_gamma_minus_five_over_two(d_a); - const float a2_Hubble = d_a * d_a * d_H; - const float r = sqrt(r2); - const float r_inv = 1.f / r; -// /* Recover some data */ - const float mj = mass_tmp[j_block]; -// /* Get the kernel for hi. */ - const float hi_inv = 1.f / hi; - const float hid_inv = d_pow_dimension_plus_one(hi_inv); /* 1/h^(d+1) */ - const float xi = r * hi_inv; - float wi, wi_dx; - d_kernel_deval(xi, &wi, &wi_dx); - const float wi_dr = hid_inv * wi_dx; - /* Get the kernel for hj. */ - const float hj = h_tmp[j_block]; - const float hj_inv = 1.0f / hj; - const float hjd_inv = d_pow_dimension_plus_one(hj_inv); /* 1/h^(d+1) */ - const float xj = r * hj_inv; - float wj, wj_dx; - d_kernel_deval(xj, &wj, &wj_dx); - const float wj_dr = hjd_inv * wj_dx; -// /* Compute dv dot r */ - float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block], - dvz = uzi - uz_tmp[j_block]; - const float dvdr = dvx * xij + dvy * yij + dvz * zij; -// /* Add Hubble flow */ - const float dvdr_Hubble = dvdr + a2_Hubble * r2; -// /* Are the particles moving towards each others ? 
*/ - const float omega_ij = min(dvdr_Hubble, 0.f); - const float mu_ij = fac_mu * r_inv * omega_ij; /* This is 0 or negative */ -// -// /* Signal velocity */ - const float v_sig = ci + cj_tmp[j_block] - const_viscosity_beta * mu_ij; - - /* Variable smoothing length term */ - const float f_ij = 1.f - fi / mj; - const float f_ji = 1.f - f_tmp[j_block] / mi; - - /* Balsara term */ - const float balsaraj = balsara_tmp[j_block]; - /* Construct the full viscosity term */ - const float rhoj = rho_tmp[j_block]; - const float pressurej = pressure_tmp[j_block]; - const float rho_ij = rhoi + rhoj; - const float alpha = alphavisci + alphavisc_tmp[j_block]; - const float visc = - -0.25f * alpha * v_sig * mu_ij * (balsarai + balsaraj) / rho_ij; - /* Convolve with the kernel */ - const float visc_acc_term = - 0.5f * visc * (wi_dr * f_ij + wj_dr * f_ji) * r_inv; - /* Compute gradient terms */ - const float P_over_rho2_i = pressurei / (rhoi * rhoi) * f_ij; - const float P_over_rho2_j = pressurej / (rhoj * rhoj) * f_ji; - - /* SPH acceleration term */ - const float sph_acc_term = - (P_over_rho2_i * wi_dr + P_over_rho2_j * wj_dr) * r_inv; - - /* Assemble the acceleration */ - const float acc = sph_acc_term + visc_acc_term; - /* Use the force Luke ! */ - ahydroxi -= mj * acc * xij; - ahydroyi -= mj * acc * yij; - ahydrozi -= mj * acc * zij; -// if(rhoi == 0 || rhoj == 0 || pressurei == 0 || pressurej == 0)printf("ri %f rj %f pi %f pj %f\n", rhoi, rhoj, pressurei, pressurej); - /* Get the time derivative for u. */ - const float sph_du_term_i = P_over_rho2_i * dvdr * r_inv * wi_dr; - - /* Viscosity term */ - const float visc_du_term = 0.5f * visc_acc_term * dvdr_Hubble; - const float press_sum = pressurei + pressurej; - /* Diffusion term */ - /* Combine the alpha_diff into a pressure-based switch -- this allows the - * alpha from the highest pressure particle to dominate, so that the - * diffusion limited particles always take precedence - another trick to - * allow the scheme to work with thermal feedback. */ - float alpha_diff = - (pressurei * alphadiffi + pressurej * alphadiff_tmp[j_block]) / - (press_sum); - if (fabsf(press_sum) < 1e-10) alpha_diff = 0.f; - const float v_diff = alpha_diff * 0.5f * - (sqrtf(2.f * fabsf(pressurei - pressurej) / rho_ij) + - fabsf(fac_mu * r_inv * dvdr_Hubble)); - /* wi_dx + wj_dx / 2 is F_ij */ - const float diff_du_term = - v_diff * (ui - u_tmp[j_block]) * (f_ij * wi_dr / rhoi + f_ji * wj_dr / rhoj); - - /* Assemble the energy equation term */ - const float du_dt_i = sph_du_term_i + visc_du_term + diff_du_term; - - /* Internal energy time derivative */ - u_dti += du_dt_i * mj; - if(mj == 0.f)printf("zero mass mj %f\n", mj); - - /* Get the time derivative for h. */ - h_dti -= mj * dvdr * r_inv / rhoj * wi_dr; - - /* Update if we need to; this should be guaranteed by the gradient loop but - * due to some possible synchronisation problems this is here as a _quick - * fix_. Added: 14th August 2019. To be removed by 1st Jan 2020. 
(JB) */ - v_sigi = max(v_sigi, v_sig); - int time_bin_j = timebin[j_block]; - if(time_bin_j > 0)min_ngb_time_bin = min(min_ngb_time_bin, time_bin_j); -// printf("Got in\n"); - } - } - } - __syncthreads(); + for (int b = cj_start; b < cj_end; b += BLOCK_SIZE) { + int j = b + threadIdx.x; + x_p_tmp[threadIdx.x] = parts_aos[j].x_p; + y_p_tmp[threadIdx.x] = parts_aos[j].y_p; + z_p_tmp[threadIdx.x] = parts_aos[j].z_p; + h_tmp[threadIdx.x] = parts_aos[j].h; + mass_tmp[threadIdx.x] = parts_aos[j].mass; + ux_tmp[threadIdx.x] = parts_aos[j].ux; + uy_tmp[threadIdx.x] = parts_aos[j].uy; + uz_tmp[threadIdx.x] = parts_aos[j].uz; + timebin[threadIdx.x] = parts_aos[j].time_bin; + cj_tmp[threadIdx.x] = parts_aos[j].soundspeed; + // alpha_tmp[threadIdx.x] = parts_aos[j].visc_alpha; + u_tmp[threadIdx.x] = parts_aos[j].u; + rho_tmp[threadIdx.x] = parts_aos[j].rho; + alphavisc_tmp[threadIdx.x] = parts_aos[j].alpha_visc; + alphadiff_tmp[threadIdx.x] = parts_aos[j].alpha_diff; + pressure_tmp[threadIdx.x] = parts_aos[j].pressure; + f_tmp[threadIdx.x] = parts_aos[j].f; + balsara_tmp[threadIdx.x] = parts_aos[j].balsara; + const float shift_x_j = d_shift_x[task_id_tmp + flip_order]; + const float shift_y_j = d_shift_y[task_id_tmp + flip_order]; + const float shift_z_j = d_shift_z[task_id_tmp + flip_order]; + __syncthreads(); + /*j_block is the particle's index in the block. Loop through particles in + * shared memory one by one*/ + for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) { + int jj = b + j_block; + if (jj < cj_end && pid < ci_end && pid >= ci_start) { + /* Compute the pairwise distance. */ + const float pjx = x_p_tmp[j_block] - shift_x_j; + const float pjy = y_p_tmp[j_block] - shift_y_j; + const float pjz = z_p_tmp[j_block] - shift_z_j; + const float xij = pix - pjx, yij = piy - pjy, zij = piz - pjz; + const float r2 = xij * xij + yij * yij + zij * zij; + if (r2 < hig2) { + + // /* Cosmology terms for the signal velocity */ + const float fac_mu = d_pow_three_gamma_minus_five_over_two(d_a); + const float a2_Hubble = d_a * d_a * d_H; + const float r = sqrt(r2); + const float r_inv = 1.f / r; + // /* Recover some data */ + const float mj = mass_tmp[j_block]; + // /* Get the kernel for hi. */ + const float hi_inv = 1.f / hi; + const float hid_inv = + d_pow_dimension_plus_one(hi_inv); /* 1/h^(d+1) */ + const float xi = r * hi_inv; + float wi, wi_dx; + d_kernel_deval(xi, &wi, &wi_dx); + const float wi_dr = hid_inv * wi_dx; + /* Get the kernel for hj. */ + const float hj = h_tmp[j_block]; + const float hj_inv = 1.0f / hj; + const float hjd_inv = + d_pow_dimension_plus_one(hj_inv); /* 1/h^(d+1) */ + const float xj = r * hj_inv; + float wj, wj_dx; + d_kernel_deval(xj, &wj, &wj_dx); + const float wj_dr = hjd_inv * wj_dx; + // /* Compute dv dot r */ + float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block], + dvz = uzi - uz_tmp[j_block]; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + // /* Add Hubble flow */ + const float dvdr_Hubble = dvdr + a2_Hubble * r2; + // /* Are the particles moving towards each others ? 
*/ + const float omega_ij = min(dvdr_Hubble, 0.f); + const float mu_ij = + fac_mu * r_inv * omega_ij; /* This is 0 or negative */ + // + // /* Signal velocity */ + const float v_sig = + ci + cj_tmp[j_block] - const_viscosity_beta * mu_ij; + + /* Variable smoothing length term */ + const float f_ij = 1.f - fi / mj; + const float f_ji = 1.f - f_tmp[j_block] / mi; + + /* Balsara term */ + const float balsaraj = balsara_tmp[j_block]; + /* Construct the full viscosity term */ + const float rhoj = rho_tmp[j_block]; + const float pressurej = pressure_tmp[j_block]; + const float rho_ij = rhoi + rhoj; + const float alpha = alphavisci + alphavisc_tmp[j_block]; + const float visc = + -0.25f * alpha * v_sig * mu_ij * (balsarai + balsaraj) / rho_ij; + /* Convolve with the kernel */ + const float visc_acc_term = + 0.5f * visc * (wi_dr * f_ij + wj_dr * f_ji) * r_inv; + /* Compute gradient terms */ + const float P_over_rho2_i = pressurei / (rhoi * rhoi) * f_ij; + const float P_over_rho2_j = pressurej / (rhoj * rhoj) * f_ji; + + /* SPH acceleration term */ + const float sph_acc_term = + (P_over_rho2_i * wi_dr + P_over_rho2_j * wj_dr) * r_inv; + + /* Assemble the acceleration */ + const float acc = sph_acc_term + visc_acc_term; + /* Use the force Luke ! */ + ahydroxi -= mj * acc * xij; + ahydroyi -= mj * acc * yij; + ahydrozi -= mj * acc * zij; + // if(rhoi == 0 || rhoj == 0 || pressurei == 0 || pressurej + // == 0)printf("ri %f rj %f pi %f pj %f\n", rhoi, rhoj, + // pressurei, pressurej); + /* Get the time derivative for u. */ + const float sph_du_term_i = P_over_rho2_i * dvdr * r_inv * wi_dr; + + /* Viscosity term */ + const float visc_du_term = 0.5f * visc_acc_term * dvdr_Hubble; + const float press_sum = pressurei + pressurej; + /* Diffusion term */ + /* Combine the alpha_diff into a pressure-based switch -- this allows + * the alpha from the highest pressure particle to dominate, so that + * the diffusion limited particles always take precedence - another + * trick to allow the scheme to work with thermal feedback. */ + float alpha_diff = + (pressurei * alphadiffi + pressurej * alphadiff_tmp[j_block]) / + (press_sum); + if (fabsf(press_sum) < 1e-10) alpha_diff = 0.f; + const float v_diff = + alpha_diff * 0.5f * + (sqrtf(2.f * fabsf(pressurei - pressurej) / rho_ij) + + fabsf(fac_mu * r_inv * dvdr_Hubble)); + /* wi_dx + wj_dx / 2 is F_ij */ + const float diff_du_term = + v_diff * (ui - u_tmp[j_block]) * + (f_ij * wi_dr / rhoi + f_ji * wj_dr / rhoj); + + /* Assemble the energy equation term */ + const float du_dt_i = sph_du_term_i + visc_du_term + diff_du_term; + + /* Internal energy time derivative */ + u_dti += du_dt_i * mj; + if (mj == 0.f) printf("zero mass mj %f\n", mj); + + /* Get the time derivative for h. */ + h_dti -= mj * dvdr * r_inv / rhoj * wi_dr; + + /* Update if we need to; this should be guaranteed by the gradient + * loop but due to some possible synchronisation problems this is here + * as a _quick fix_. Added: 14th August 2019. To be removed by 1st Jan + * 2020. 
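
Aside, not part of the patch: the force loops blend the two particles' diffusion alphas through a pressure-weighted switch, so the higher-pressure particle dominates, and turn the result into a diffusion velocity built from the pressure contrast and the radial velocity. A stand-alone sketch with invented names; Pi, Pj, alpha_i and alpha_j mirror pressurei, pressurej, alphadiffi and alphadiff_tmp above.

#include <math.h>

/* Pressure-weighted diffusion switch and diffusion velocity, as assembled
 * in the force loops. Illustrative helper only. */
__host__ __device__ static inline float diffusion_velocity(
    const float Pi, const float Pj,           /* pressures                */
    const float alpha_i, const float alpha_j, /* per-particle diff alphas */
    const float rho_ij,                       /* rho_i + rho_j            */
    const float fac_mu, const float r_inv, const float dvdr_Hubble) {
  const float press_sum = Pi + Pj;
  /* Guard the vanishing-pressure case before dividing. */
  const float alpha_diff = (fabsf(press_sum) < 1e-10f)
                               ? 0.f
                               : (Pi * alpha_i + Pj * alpha_j) / press_sum;
  return alpha_diff * 0.5f *
         (sqrtf(2.f * fabsf(Pi - Pj) / rho_ij) +
          fabsf(fac_mu * r_inv * dvdr_Hubble));
}
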
(JB) */ + v_sigi = max(v_sigi, v_sig); + int time_bin_j = timebin[j_block]; + if (time_bin_j > 0) + min_ngb_time_bin = min(min_ngb_time_bin, time_bin_j); + // printf("Got in\n"); + } + } + } + __syncthreads(); } /*Loop through parts in cell j one BLOCK_SIZE at a time*/ if (pid >= ci_start && pid < ci_end) { - parts_aos[pid].v_sig = v_sigi; - parts_aos[pid].h_dt = h_dti; - parts_aos[pid].u_dt = u_dti; - parts_aos[pid].a_hydrox = ahydroxi; - parts_aos[pid].a_hydroy = ahydroyi; - parts_aos[pid].a_hydroz = ahydrozi; - parts_aos[pid].min_ngb_time_bin = min_ngb_time_bin; -// printf("%f %f %f %f %f %f\n", v_sigi, h_dti, u_dti, ahydroxi, ahydroyi, ahydrozi); + parts_aos[pid].v_sig = v_sigi; + parts_aos[pid].h_dt = h_dti; + parts_aos[pid].u_dt = u_dti; + parts_aos[pid].a_hydrox = ahydroxi; + parts_aos[pid].a_hydroy = ahydroyi; + parts_aos[pid].a_hydroz = ahydrozi; + parts_aos[pid].min_ngb_time_bin = min_ngb_time_bin; + // printf("%f %f %f %f %f %f\n", v_sigi, h_dti, u_dti, ahydroxi, + // ahydroyi, ahydrozi); } } #ifdef WITH_CUDA @@ -2595,15 +2688,18 @@ __device__ void DOPAIR2NONSYMGPUAOSF( #ifdef WITH_CUDA extern "C" { #endif -__device__ void DOPAIR2NAIVEGPUAOSF4F(const struct part_aos_f4_f_send pi, - struct part_aos_f4_f_send * __restrict__ parts_send, struct part_aos_f4_f_recv * __restrict__ parts_recv, int pid, - const int cj_start, const int cj_end, float d_a, float d_H) { +__device__ void DOPAIR2NAIVEGPUAOSF4F( + const struct part_aos_f4_f_send pi, + struct part_aos_f4_f_send *__restrict__ parts_send, + struct part_aos_f4_f_recv *__restrict__ parts_recv, int pid, + const int cj_start, const int cj_end, float d_a, float d_H) { - float dx = 1.f/64.f; //Value used to avoid interacting parts with themselves + float dx = + 1.f / 64.f; // Value used to avoid interacting parts with themselves int Found_neighbours = 0; -// const part_aos_f4_send pi = parts_send[pid]; + // const part_aos_f4_send pi = parts_send[pid]; const float4 x_h_i = pi.x_h; const float4 ux_m_i = pi.ux_m; @@ -2622,134 +2718,133 @@ __device__ void DOPAIR2NAIVEGPUAOSF4F(const struct part_aos_f4_f_send pi, const float hi = x_h_i.w; const float hig2 = hi * hi * kernel_gamma2; -// printf("js %i je %i\n", cj_start, cj_end); + // printf("js %i je %i\n", cj_start, cj_end); /*Particles copied in blocks to shared memory*/ - for (int j = cj_start; j < cj_end; j ++) { + for (int j = cj_start; j < cj_end; j++) { struct part_aos_f4_f_send pj = parts_send[j]; - const float4 x_h_j = pj.x_h; - const float4 ux_m_j = pj.ux_m; + const float4 x_h_j = pj.x_h; + const float4 ux_m_j = pj.ux_m; const float4 f_b_t_mintbinngb_j = pj.f_bals_timebin_mintimebin_ngb; const float4 rho_p_c_vsig_j = pj.rho_p_c_vsigi; -// alpha_tmp[threadIdx.x] = parts_aos[j].visc_alpha; + // alpha_tmp[threadIdx.x] = parts_aos[j].visc_alpha; const float3 u_avisc_adiff_j = pj.u_alphavisc_alphadiff; - const float xij = x_h_i.x - x_h_j.x, - yij = x_h_i.y - x_h_j.y, - zij = x_h_i.z - x_h_j.z; - const float r2 = xij * xij + yij * yij + zij * zij; -// printf("r2 %f \n", r2); - if (r2 < hig2) { - // /* Cosmology terms for the signal velocity */ - const float fac_mu = d_pow_three_gamma_minus_five_over_two(d_a); - const float a2_Hubble = d_a * d_a * d_H; - const float r = sqrt(r2); - const float r_inv = 1.f / r; -// /* Recover some data */ - const float mj = ux_m_j.w; -// /* Get the kernel for hi. 
*/ - const float hi_inv = 1.f / hi; - const float hid_inv = d_pow_dimension_plus_one(hi_inv); /* 1/h^(d+1) */ - const float xi = r * hi_inv; - float wi, wi_dx; - d_kernel_deval(xi, &wi, &wi_dx); - const float wi_dr = hid_inv * wi_dx; - /* Get the kernel for hj. */ - const float hj = x_h_j.w; - const float hj_inv = 1.0f / hj; - const float hjd_inv = d_pow_dimension_plus_one(hj_inv); /* 1/h^(d+1) */ - const float xj = r * hj_inv; - float wj, wj_dx; - d_kernel_deval(xj, &wj, &wj_dx); - const float wj_dr = hjd_inv * wj_dx; -// /* Compute dv dot r */ - float dvx = ux_m_i.x - ux_m_j.x, dvy = ux_m_i.y - ux_m_j.y, - dvz = ux_m_i.z - ux_m_j.z; - const float dvdr = dvx * xij + dvy * yij + dvz * zij; -// /* Add Hubble flow */ - const float dvdr_Hubble = dvdr + a2_Hubble * r2; -// /* Are the particles moving towards each others ? */ - const float omega_ij = min(dvdr_Hubble, 0.f); - const float mu_ij = fac_mu * r_inv * omega_ij; /* This is 0 or negative */ -// -// /* Signal velocity */ - const float cj = rho_p_c_vsig_j.z; - const float v_sig = ci + cj - const_viscosity_beta * mu_ij; - - /* Variable smoothing length term */ - const float f_ij = 1.f - f_b_t_mintbinngb_i.x / mj; - const float f_ji = 1.f - f_b_t_mintbinngb_j.x / mi; - - /* Construct the full viscosity term */ - const float pressurej = rho_p_c_vsig_j.y; - const float rho_ij = rho_p_c_vsig_i.x + rho_p_c_vsig_j.x; - const float alpha = u_avisc_adiff_i.y + u_avisc_adiff_j.y; - const float visc = - -0.25f * alpha * v_sig * mu_ij * (f_b_t_mintbinngb_i.y + f_b_t_mintbinngb_j.y) / rho_ij; - /* Convolve with the kernel */ - const float visc_acc_term = - 0.5f * visc * (wi_dr * f_ij + wj_dr * f_ji) * r_inv; - /* Compute gradient terms */ - const float rhoi2 = rho_p_c_vsig_i.x * rho_p_c_vsig_i.x; - const float rhoj2 = rho_p_c_vsig_j.x * rho_p_c_vsig_j.x; - const float P_over_rho2_i = pressurei / (rhoi2) * f_ij; - const float P_over_rho2_j = pressurej / (rhoj2) * f_ji; - - /* SPH acceleration term */ - const float sph_acc_term = - (P_over_rho2_i * wi_dr + P_over_rho2_j * wj_dr) * r_inv; - - /* Assemble the acceleration */ - const float acc = sph_acc_term + visc_acc_term; - /* Use the force Luke ! */ - ahydro.x -= mj * acc * xij; - ahydro.y -= mj * acc * yij; - ahydro.z -= mj * acc * zij; - /* Get the time derivative for u. */ - const float sph_du_term_i = P_over_rho2_i * dvdr * r_inv * wi_dr; - - /* Viscosity term */ - const float visc_du_term = 0.5f * visc_acc_term * dvdr_Hubble; - /* Diffusion term */ - /* Combine the alpha_diff into a pressure-based switch -- this allows the - * alpha from the highest pressure particle to dominate, so that the - * diffusion limited particles always take precedence - another trick to - * allow the scheme to work with thermal feedback. */ - float alpha_diff = - (pressurei * u_avisc_adiff_i.z + pressurej * u_avisc_adiff_j.z) / - (pressurei + pressurej); - if (fabsf(pressurei + pressurej) < 1e-10) alpha_diff = 0.f; - const float v_diff = alpha_diff * 0.5f * - (sqrtf(2.f * fabsf(pressurei - pressurej) / rho_ij) + - fabsf(fac_mu * r_inv * dvdr_Hubble)); - /* wi_dx + wj_dx / 2 is F_ij */ - const float diff_du_term = - v_diff * (u_avisc_adiff_i.x - u_avisc_adiff_j.x) * - (f_ij * wi_dr / rho_p_c_vsig_i.x + f_ji * wj_dr / rho_p_c_vsig_j.x); - - /* Assemble the energy equation term */ - const float du_dt_i = sph_du_term_i + visc_du_term + diff_du_term; - - /* Internal energy time derivative */ - udt_hdt_vsig_mintbinngb.x += du_dt_i * mj; - - /* Get the time derivative for h. 
*/ - udt_hdt_vsig_mintbinngb.y -= mj * dvdr * r_inv / rho_p_c_vsig_j.x * wi_dr; - - /* Update if we need to; this should be guaranteed by the gradient loop but - * due to some possible synchronisation problems this is here as a _quick - * fix_. Added: 14th August 2019. To be removed by 1st Jan 2020. (JB) */ - udt_hdt_vsig_mintbinngb.z = fmaxf(udt_hdt_vsig_mintbinngb.z, v_sig); - unsigned int time_bin_j = (f_b_t_mintbinngb_j.z + 0.5f); - unsigned int min_tb_i = (f_b_t_mintbinngb_i.w + 0.5f); - if(time_bin_j > 0)f_b_t_mintbinngb_i.w = - min(min_tb_i, time_bin_j); -// printf("Got in\n"); - } + const float xij = x_h_i.x - x_h_j.x, yij = x_h_i.y - x_h_j.y, + zij = x_h_i.z - x_h_j.z; + const float r2 = xij * xij + yij * yij + zij * zij; + // printf("r2 %f \n", r2); + if (r2 < hig2) { + // /* Cosmology terms for the signal velocity */ + const float fac_mu = d_pow_three_gamma_minus_five_over_two(d_a); + const float a2_Hubble = d_a * d_a * d_H; + const float r = sqrt(r2); + const float r_inv = 1.f / r; + // /* Recover some data */ + const float mj = ux_m_j.w; + // /* Get the kernel for hi. */ + const float hi_inv = 1.f / hi; + const float hid_inv = d_pow_dimension_plus_one(hi_inv); /* 1/h^(d+1) */ + const float xi = r * hi_inv; + float wi, wi_dx; + d_kernel_deval(xi, &wi, &wi_dx); + const float wi_dr = hid_inv * wi_dx; + /* Get the kernel for hj. */ + const float hj = x_h_j.w; + const float hj_inv = 1.0f / hj; + const float hjd_inv = d_pow_dimension_plus_one(hj_inv); /* 1/h^(d+1) */ + const float xj = r * hj_inv; + float wj, wj_dx; + d_kernel_deval(xj, &wj, &wj_dx); + const float wj_dr = hjd_inv * wj_dx; + // /* Compute dv dot r */ + float dvx = ux_m_i.x - ux_m_j.x, dvy = ux_m_i.y - ux_m_j.y, + dvz = ux_m_i.z - ux_m_j.z; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + // /* Add Hubble flow */ + const float dvdr_Hubble = dvdr + a2_Hubble * r2; + // /* Are the particles moving towards each others ? */ + const float omega_ij = min(dvdr_Hubble, 0.f); + const float mu_ij = fac_mu * r_inv * omega_ij; /* This is 0 or negative */ + // + // /* Signal velocity */ + const float cj = rho_p_c_vsig_j.z; + const float v_sig = ci + cj - const_viscosity_beta * mu_ij; + + /* Variable smoothing length term */ + const float f_ij = 1.f - f_b_t_mintbinngb_i.x / mj; + const float f_ji = 1.f - f_b_t_mintbinngb_j.x / mi; + + /* Construct the full viscosity term */ + const float pressurej = rho_p_c_vsig_j.y; + const float rho_ij = rho_p_c_vsig_i.x + rho_p_c_vsig_j.x; + const float alpha = u_avisc_adiff_i.y + u_avisc_adiff_j.y; + const float visc = -0.25f * alpha * v_sig * mu_ij * + (f_b_t_mintbinngb_i.y + f_b_t_mintbinngb_j.y) / rho_ij; + /* Convolve with the kernel */ + const float visc_acc_term = + 0.5f * visc * (wi_dr * f_ij + wj_dr * f_ji) * r_inv; + /* Compute gradient terms */ + const float rhoi2 = rho_p_c_vsig_i.x * rho_p_c_vsig_i.x; + const float rhoj2 = rho_p_c_vsig_j.x * rho_p_c_vsig_j.x; + const float P_over_rho2_i = pressurei / (rhoi2)*f_ij; + const float P_over_rho2_j = pressurej / (rhoj2)*f_ji; + + /* SPH acceleration term */ + const float sph_acc_term = + (P_over_rho2_i * wi_dr + P_over_rho2_j * wj_dr) * r_inv; + + /* Assemble the acceleration */ + const float acc = sph_acc_term + visc_acc_term; + /* Use the force Luke ! */ + ahydro.x -= mj * acc * xij; + ahydro.y -= mj * acc * yij; + ahydro.z -= mj * acc * zij; + /* Get the time derivative for u. 
*/ + const float sph_du_term_i = P_over_rho2_i * dvdr * r_inv * wi_dr; + + /* Viscosity term */ + const float visc_du_term = 0.5f * visc_acc_term * dvdr_Hubble; + /* Diffusion term */ + /* Combine the alpha_diff into a pressure-based switch -- this allows the + * alpha from the highest pressure particle to dominate, so that the + * diffusion limited particles always take precedence - another trick to + * allow the scheme to work with thermal feedback. */ + float alpha_diff = + (pressurei * u_avisc_adiff_i.z + pressurej * u_avisc_adiff_j.z) / + (pressurei + pressurej); + if (fabsf(pressurei + pressurej) < 1e-10) alpha_diff = 0.f; + const float v_diff = alpha_diff * 0.5f * + (sqrtf(2.f * fabsf(pressurei - pressurej) / rho_ij) + + fabsf(fac_mu * r_inv * dvdr_Hubble)); + /* wi_dx + wj_dx / 2 is F_ij */ + const float diff_du_term = + v_diff * (u_avisc_adiff_i.x - u_avisc_adiff_j.x) * + (f_ij * wi_dr / rho_p_c_vsig_i.x + f_ji * wj_dr / rho_p_c_vsig_j.x); + + /* Assemble the energy equation term */ + const float du_dt_i = sph_du_term_i + visc_du_term + diff_du_term; + + /* Internal energy time derivative */ + udt_hdt_vsig_mintbinngb.x += du_dt_i * mj; + + /* Get the time derivative for h. */ + udt_hdt_vsig_mintbinngb.y -= mj * dvdr * r_inv / rho_p_c_vsig_j.x * wi_dr; + + /* Update if we need to; this should be guaranteed by the gradient loop + * but due to some possible synchronisation problems this is here as a + * _quick fix_. Added: 14th August 2019. To be removed by 1st Jan 2020. + * (JB) */ + udt_hdt_vsig_mintbinngb.z = fmaxf(udt_hdt_vsig_mintbinngb.z, v_sig); + unsigned int time_bin_j = (f_b_t_mintbinngb_j.z + 0.5f); + unsigned int min_tb_i = (f_b_t_mintbinngb_i.w + 0.5f); + if (time_bin_j > 0) f_b_t_mintbinngb_i.w = min(min_tb_i, time_bin_j); + // printf("Got in\n"); + } } /*Loop through parts in cell j one BLOCK_SIZE at a time*/ -// if (pid >= ci_start && pid < ci_end) { + // if (pid >= ci_start && pid < ci_end) { udt_hdt_vsig_mintbinngb.w = f_b_t_mintbinngb_i.w; parts_recv[pid].udt_hdt_vsig_mintimebin_ngb = udt_hdt_vsig_mintbinngb; parts_recv[pid].a_hydro = ahydro; -// } + // } } #ifdef WITH_CUDA } @@ -2758,13 +2853,15 @@ __device__ void DOPAIR2NAIVEGPUAOSF4F(const struct part_aos_f4_f_send pi, #ifdef WITH_CUDA extern "C" { #endif -__device__ void DOPAIR2GPU( - struct part_soa parts_soa, int pid, const int ci_start, - const int ci_end, const int cj_start, - const int cj_end, float d_a, float d_H, - int time_bin_inhibited, float *vars_pair, double *d_shift_x, double *d_shift_y, double *d_shift_z, const int task_id_tmp) { +__device__ void DOPAIR2GPU(struct part_soa parts_soa, int pid, + const int ci_start, const int ci_end, + const int cj_start, const int cj_end, float d_a, + float d_H, int time_bin_inhibited, float *vars_pair, + double *d_shift_x, double *d_shift_y, + double *d_shift_z, const int task_id_tmp) { - float dx = 1.f/64.f; //Value used to avoid interacting parts with themselves + float dx = + 1.f / 64.f; // Value used to avoid interacting parts with themselves float cellx = 0.0, celly = 0.0, cellz = 0.0; float cellxj = 0.0, cellyj = 0.0, cellzj = 0.0; @@ -2787,25 +2884,27 @@ __device__ void DOPAIR2GPU( float rot_uzi = 0.0; int Found_neighbours = 0; int count_i = cj_start; -// printf("first_part_in_task_blocks_cj %i last_part_in_task_blocks_cj %i last_part_in_task_blocks_ci %i\n", -// first_part_in_task_blocks_cj, last_part_in_task_blocks_cj, last_part_in_task_blocks_ci); + // printf("first_part_in_task_blocks_cj %i last_part_in_task_blocks_cj %i + // last_part_in_task_blocks_ci 
%i\n", + // first_part_in_task_blocks_cj, last_part_in_task_blocks_cj, + //last_part_in_task_blocks_ci); if (pid < ci_end) { - cellx = parts_soa.locx[pid]; - celly = parts_soa.locy[pid]; - cellz = parts_soa.locz[pid]; - const int j = cj_start; - cellxj = parts_soa.locx[j]; - cellyj = parts_soa.locy[j]; - cellzj = parts_soa.locz[j]; - hi = parts_soa.h[pid], hig2 = hi * hi * kernel_gamma2; - mi = parts_soa.mass[pid]; - uxi = parts_soa.ux[pid]; - uyi = parts_soa.uy[pid]; - uzi = parts_soa.uz[pid]; - pix = parts_soa.x_p[pid] - d_shift_x[task_id_tmp]; - piy = parts_soa.y_p[pid] - d_shift_y[task_id_tmp]; - piz = parts_soa.z_p[pid] - d_shift_z[task_id_tmp]; + cellx = parts_soa.locx[pid]; + celly = parts_soa.locy[pid]; + cellz = parts_soa.locz[pid]; + const int j = cj_start; + cellxj = parts_soa.locx[j]; + cellyj = parts_soa.locy[j]; + cellzj = parts_soa.locz[j]; + hi = parts_soa.h[pid], hig2 = hi * hi * kernel_gamma2; + mi = parts_soa.mass[pid]; + uxi = parts_soa.ux[pid]; + uyi = parts_soa.uy[pid]; + uzi = parts_soa.uz[pid]; + pix = parts_soa.x_p[pid] - d_shift_x[task_id_tmp]; + piy = parts_soa.y_p[pid] - d_shift_y[task_id_tmp]; + piz = parts_soa.z_p[pid] - d_shift_z[task_id_tmp]; } int n_neighbours = 0; @@ -2836,156 +2935,163 @@ __device__ void DOPAIR2GPU( float *rot_uz_tmp = (float *)&rot_uy_tmp[BLOCK_SIZE]; /*Particles copied in blocks to shared memory*/ - for (int b = cj_start; b < cj_end; - b += BLOCK_SIZE) { - const int tid_x = threadIdx.x; - int j = b + tid_x; - x_p_tmp[tid_x] = parts_soa.x_p[j]; - y_p_tmp[tid_x] = parts_soa.y_p[j]; - z_p_tmp[tid_x] = parts_soa.z_p[j]; - h_tmp[tid_x] = parts_soa.h[j]; - mass_tmp[tid_x] = parts_soa.mass[j]; - ux_tmp[tid_x] = parts_soa.ux[j]; - uy_tmp[tid_x] = parts_soa.uy[j]; - uz_tmp[tid_x] = parts_soa.uz[j]; - timebin[tid_x] = parts_soa.time_bin[j]; - rho_tmp[tid_x] = 0.f; - rho_dh_tmp[tid_x] = 0.f; - wcount_tmp[tid_x] = 0.f; - wcount_dh_tmp[tid_x] = 0.f; - div_v_tmp[tid_x] = 0.f; - rot_ux_tmp[tid_x] = 0.f; - rot_uy_tmp[tid_x] = 0.f; - rot_uz_tmp[tid_x] = 0.f; - __syncthreads(); - const double shift_x_j = d_shift_x[task_id_tmp + 1]; - const double shift_y_j = d_shift_y[task_id_tmp + 1]; - const double shift_z_j = d_shift_z[task_id_tmp + 1]; - /*j_block is the particle's index in the block. Loop through particles in shared memory one by one*/ - for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) { - int jj = b + j_block; - if (jj < cj_end && pid < ci_end && pid >= ci_start) { - - const double pjx = x_p_tmp[j_block] - shift_x_j; - const double pjy = y_p_tmp[j_block] - shift_y_j; - const double pjz = z_p_tmp[j_block] - shift_z_j; - - const float xij = pix - pjx, yij = piy - pjy, zij = piz - pjz; -// const float xij = pjx - pix, yij = pjy - piy, zij = pjz - piz; - const float r2 = xij * xij + yij * yij + zij * zij; - const float hj = h_tmp[j_block]; - const float hjg2 = hj * hj * kernel_gamma2; -// if(r2 > 32.f * hig2 && hig2 != 0.f) printf("x %f y %f z %f r %f hig2 %f\n", xij/dx, yij/dx, zij/dx, sqrt(r2)/dx); - /* Compute dv dot r */ - const float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block], - dvz = uzi - uz_tmp[j_block]; - const float dvdr = dvx * xij + dvy * yij + dvz * zij; - /* Compute dv cross r */ - const float curlvrx = dvy * zij - dvz * yij; - const float curlvry = dvz * xij - dvx * zij; - const float curlvrz = dvx * yij - dvy * xij; - - const float r = sqrt(r2); - if (r2 < hig2) { - /* Recover some data */ - const float mj = mass_tmp[j_block]; - /* Get the kernel for hi. 
*/ -// if(hi<1.f/dx)printf("h < dx\n"); - const float h_inv = 1.f / hi; - const float ui = r * h_inv; - float wi, wi_dx; - - d_kernel_deval(ui, &wi, &wi_dx); - - rhoi += mj * wi; - rho_dhi -= mj * (hydro_dimension * wi + ui * wi_dx); - - wcounti += wi; - wcount_dhi -= (hydro_dimension * wi + ui * wi_dx); - - const float r_inv = 1.f / r; - const float faci = mj * wi_dx * r_inv; - - div_vi -= faci * dvdr; - - rot_uxi += faci * curlvrx; - rot_uyi += faci * curlvry; - rot_uzi += faci * curlvrz; -// - } - if (r2 < hjg2) { - /* Recover some data */ - /* Get the kernel for hi. */ - const float hj_inv = 1.f / hj; - const float uj = r * hj_inv; - float wj, wj_dx; - - d_kernel_deval(uj, &wj, &wj_dx); - -// atomicAdd(&rho_tmp[j_block], mi * wj); - atomicAdd(&parts_soa.rho[j], mi * wj); -// atomicAdd(&rho_dh_tmp[j_block], -mi * (hydro_dimension * wj + uj * wj_dx)); - atomicAdd(&parts_soa.rho_dh[j], -mi * (hydro_dimension * wj + uj * wj_dx)); - -// atomicAdd(&wcount_tmp[j_block], wj); - atomicAdd(&parts_soa.wcount[j], wj); -// atomicAdd(&wcount_dh_tmp[j_block], -(hydro_dimension * wj + uj * wj_dx)); - atomicAdd(&parts_soa.wcount_dh[j], -(hydro_dimension * wj + uj * wj_dx)); - - const float r_inv = 1.f / r; - const float facj = mi * wj_dx * r_inv; - -// atomicAdd(&div_v_tmp[j_block], -facj * dvdr); - atomicAdd(&parts_soa.div_v[j], -facj * dvdr); - -// atomicAdd(&rot_ux_tmp[j_block], facj * curlvrx); -// atomicAdd(&rot_uy_tmp[j_block], facj * curlvry); -// atomicAdd(&rot_uz_tmp[j_block], facj * curlvrz); - atomicAdd(&parts_soa.rot_ux[j], facj * curlvrx); - atomicAdd(&parts_soa.rot_uy[j], facj * curlvry); - atomicAdd(&parts_soa.rot_uz[j], facj * curlvrz); -// printf("rho %f rho_dh %f wcount %f wcount_dh %f div_v %f rotux %f rotuy %f rotuz %f\n" -// ,rhoi, rho_dhi, wcounti, wcount_dhi, div_vi, rot_uxi, rot_uyi, rot_uzi); - } /*if r2= ci_start)*/ - } /*End of looping through particles in shared memory---Shared arrays zero'ed for next step in outer loop*/ - __syncthreads(); -// if(j < cj_end){ -// atomicAdd(&parts_soa.rho[j], rho_tmp[threadIdx.x]); -// atomicAdd(&parts_soa.rho_dh[j], rho_dh_tmp[threadIdx.x]); -// atomicAdd(&parts_soa.wcount[j], wcount_tmp[threadIdx.x]); -// atomicAdd(&parts_soa.wcount_dh[j], wcount_dh_tmp[threadIdx.x]); -// atomicAdd(&parts_soa.div_v[j], div_v_tmp[threadIdx.x]); -// atomicAdd(&parts_soa.rot_ux[j], rot_ux_tmp[threadIdx.x]); -// atomicAdd(&parts_soa.rot_uy[j], rot_uy_tmp[threadIdx.x]); -// atomicAdd(&parts_soa.rot_uz[j], rot_uz_tmp[threadIdx.x]); -// } -// __syncthreads(); -// parts_soa.rho[j] += rho_tmp[threadIdx.x]; -// parts_soa.rho_dh[j] += rho_dh_tmp[threadIdx.x]; -// parts_soa.wcount[j] += wcount_tmp[threadIdx.x]; -// parts_soa.wcount_dh[j] =+ wcount_dh_tmp[threadIdx.x]; -// parts_soa.div_v[j] += div_v_tmp[threadIdx.x]; -// parts_soa.rot_ux[j] += rot_ux_tmp[threadIdx.x]; -// parts_soa.rot_uy[j] =+ rot_uy_tmp[threadIdx.x]; -// parts_soa.rot_uz[j] += rot_uz_tmp[threadIdx.x]; + for (int b = cj_start; b < cj_end; b += BLOCK_SIZE) { + const int tid_x = threadIdx.x; + int j = b + tid_x; + x_p_tmp[tid_x] = parts_soa.x_p[j]; + y_p_tmp[tid_x] = parts_soa.y_p[j]; + z_p_tmp[tid_x] = parts_soa.z_p[j]; + h_tmp[tid_x] = parts_soa.h[j]; + mass_tmp[tid_x] = parts_soa.mass[j]; + ux_tmp[tid_x] = parts_soa.ux[j]; + uy_tmp[tid_x] = parts_soa.uy[j]; + uz_tmp[tid_x] = parts_soa.uz[j]; + timebin[tid_x] = parts_soa.time_bin[j]; + rho_tmp[tid_x] = 0.f; + rho_dh_tmp[tid_x] = 0.f; + wcount_tmp[tid_x] = 0.f; + wcount_dh_tmp[tid_x] = 0.f; + div_v_tmp[tid_x] = 0.f; + rot_ux_tmp[tid_x] = 0.f; + 
rot_uy_tmp[tid_x] = 0.f; + rot_uz_tmp[tid_x] = 0.f; + __syncthreads(); + const double shift_x_j = d_shift_x[task_id_tmp + 1]; + const double shift_y_j = d_shift_y[task_id_tmp + 1]; + const double shift_z_j = d_shift_z[task_id_tmp + 1]; + /*j_block is the particle's index in the block. Loop through particles in + * shared memory one by one*/ + for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) { + int jj = b + j_block; + if (jj < cj_end && pid < ci_end && pid >= ci_start) { + + const double pjx = x_p_tmp[j_block] - shift_x_j; + const double pjy = y_p_tmp[j_block] - shift_y_j; + const double pjz = z_p_tmp[j_block] - shift_z_j; + + const float xij = pix - pjx, yij = piy - pjy, zij = piz - pjz; + // const float xij = pjx - pix, yij = pjy - piy, zij = pjz + //- piz; + const float r2 = xij * xij + yij * yij + zij * zij; + const float hj = h_tmp[j_block]; + const float hjg2 = hj * hj * kernel_gamma2; + // if(r2 > 32.f * hig2 && hig2 != 0.f) printf("x %f y %f z + //%f r %f hig2 %f\n", xij/dx, yij/dx, zij/dx, sqrt(r2)/dx); + /* Compute dv dot r */ + const float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block], + dvz = uzi - uz_tmp[j_block]; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + /* Compute dv cross r */ + const float curlvrx = dvy * zij - dvz * yij; + const float curlvry = dvz * xij - dvx * zij; + const float curlvrz = dvx * yij - dvy * xij; + + const float r = sqrt(r2); + if (r2 < hig2) { + /* Recover some data */ + const float mj = mass_tmp[j_block]; + /* Get the kernel for hi. */ + // if(hi<1.f/dx)printf("h < dx\n"); + const float h_inv = 1.f / hi; + const float ui = r * h_inv; + float wi, wi_dx; + + d_kernel_deval(ui, &wi, &wi_dx); + + rhoi += mj * wi; + rho_dhi -= mj * (hydro_dimension * wi + ui * wi_dx); + + wcounti += wi; + wcount_dhi -= (hydro_dimension * wi + ui * wi_dx); + + const float r_inv = 1.f / r; + const float faci = mj * wi_dx * r_inv; + + div_vi -= faci * dvdr; + + rot_uxi += faci * curlvrx; + rot_uyi += faci * curlvry; + rot_uzi += faci * curlvrz; + // + } + if (r2 < hjg2) { + /* Recover some data */ + /* Get the kernel for hi. 
*/ + const float hj_inv = 1.f / hj; + const float uj = r * hj_inv; + float wj, wj_dx; + + d_kernel_deval(uj, &wj, &wj_dx); + + // atomicAdd(&rho_tmp[j_block], mi * wj); + atomicAdd(&parts_soa.rho[j], mi * wj); + // atomicAdd(&rho_dh_tmp[j_block], -mi * (hydro_dimension + //* wj + uj * wj_dx)); + atomicAdd(&parts_soa.rho_dh[j], + -mi * (hydro_dimension * wj + uj * wj_dx)); + + // atomicAdd(&wcount_tmp[j_block], wj); + atomicAdd(&parts_soa.wcount[j], wj); + // atomicAdd(&wcount_dh_tmp[j_block], -(hydro_dimension * + //wj + uj * wj_dx)); + atomicAdd(&parts_soa.wcount_dh[j], + -(hydro_dimension * wj + uj * wj_dx)); + + const float r_inv = 1.f / r; + const float facj = mi * wj_dx * r_inv; + + // atomicAdd(&div_v_tmp[j_block], -facj * dvdr); + atomicAdd(&parts_soa.div_v[j], -facj * dvdr); + + // atomicAdd(&rot_ux_tmp[j_block], facj * curlvrx); + // atomicAdd(&rot_uy_tmp[j_block], facj * curlvry); + // atomicAdd(&rot_uz_tmp[j_block], facj * curlvrz); + atomicAdd(&parts_soa.rot_ux[j], facj * curlvrx); + atomicAdd(&parts_soa.rot_uy[j], facj * curlvry); + atomicAdd(&parts_soa.rot_uz[j], facj * curlvrz); + // printf("rho %f rho_dh %f wcount %f wcount_dh %f div_v + //%f rotux %f rotuy %f rotuz %f\n" ,rhoi, rho_dhi, wcounti, + //wcount_dhi, div_vi, rot_uxi, rot_uyi, rot_uzi); + } /*if r2= ci_start)*/ + } /*End of looping through particles in shared memory---Shared arrays + zero'ed for next step in outer loop*/ + __syncthreads(); + // if(j < cj_end){ + // atomicAdd(&parts_soa.rho[j], rho_tmp[threadIdx.x]); + // atomicAdd(&parts_soa.rho_dh[j], rho_dh_tmp[threadIdx.x]); + // atomicAdd(&parts_soa.wcount[j], wcount_tmp[threadIdx.x]); + // atomicAdd(&parts_soa.wcount_dh[j], wcount_dh_tmp[threadIdx.x]); + // atomicAdd(&parts_soa.div_v[j], div_v_tmp[threadIdx.x]); + // atomicAdd(&parts_soa.rot_ux[j], rot_ux_tmp[threadIdx.x]); + // atomicAdd(&parts_soa.rot_uy[j], rot_uy_tmp[threadIdx.x]); + // atomicAdd(&parts_soa.rot_uz[j], rot_uz_tmp[threadIdx.x]); + // } + // __syncthreads(); + // parts_soa.rho[j] += rho_tmp[threadIdx.x]; + // parts_soa.rho_dh[j] += rho_dh_tmp[threadIdx.x]; + // parts_soa.wcount[j] += wcount_tmp[threadIdx.x]; + // parts_soa.wcount_dh[j] =+ wcount_dh_tmp[threadIdx.x]; + // parts_soa.div_v[j] += div_v_tmp[threadIdx.x]; + // parts_soa.rot_ux[j] += rot_ux_tmp[threadIdx.x]; + // parts_soa.rot_uy[j] =+ rot_uy_tmp[threadIdx.x]; + // parts_soa.rot_uz[j] += rot_uz_tmp[threadIdx.x]; } /*Loop through parts in cell j one BLOCK_SIZE at a time*/ if (pid >= ci_start && pid < ci_end) { -// if(n_neighbours > 0){ -// distby2h = distby2h/n_neighbours; -// av_dist = av_dist/(n_neighbours*dx); -// } -// av_distx = av_distx/(n_neighbours*dx); -// av_disty = av_disty/(n_neighbours*dx); -// av_distz = av_distz/(n_neighbours*dx); - parts_soa.rho[pid] = rhoi, parts_soa.rho_dh[pid] = rho_dhi; - parts_soa.wcount[pid] = wcounti, parts_soa.wcount_dh[pid] = wcount_dhi; - parts_soa.div_v[pid] = div_vi; - parts_soa.rot_ux[pid] = rot_uxi, parts_soa.rot_uy[pid] = rot_uyi; - parts_soa.rot_uz[pid] = rot_uzi; -// if(rhoi != 0.f)printf("rho i %f, rho_dh i %f\n", rhoi, rho_dhi); + // if(n_neighbours > 0){ + // distby2h = distby2h/n_neighbours; + // av_dist = av_dist/(n_neighbours*dx); + // } + // av_distx = av_distx/(n_neighbours*dx); + // av_disty = av_disty/(n_neighbours*dx); + // av_distz = av_distz/(n_neighbours*dx); + parts_soa.rho[pid] = rhoi, parts_soa.rho_dh[pid] = rho_dhi; + parts_soa.wcount[pid] = wcounti, parts_soa.wcount_dh[pid] = wcount_dhi; + parts_soa.div_v[pid] = div_vi; + parts_soa.rot_ux[pid] = rot_uxi, 
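
Aside, not part of the patch: DOPAIR2GPU gathers the density sums for particle i into thread-private registers but scatters the mirror contribution onto particle j with atomicAdd, since several threads can update the same neighbour at once. A minimal sketch of that gather/scatter split; density_pair_update and its arguments are invented names.

/* Gather the i-side density privately, scatter the j-side with an atomic. */
__device__ static inline void density_pair_update(
    float *rho,                       /* global density array                  */
    const int j,                      /* neighbour index                       */
    float *rho_i,                     /* thread-private accumulator for i      */
    const float mi, const float mj,   /* masses                                */
    const float wi, const float wj) { /* kernel values at r/h_i and r/h_j      */
  *rho_i += mj * wi;           /* i side: private accumulator, no race   */
  atomicAdd(&rho[j], mi * wj); /* j side: shared target, needs an atomic */
}
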
parts_soa.rot_uy[pid] = rot_uyi; + parts_soa.rot_uz[pid] = rot_uzi; + // if(rhoi != 0.f)printf("rho i %f, rho_dh i %f\n", rhoi, rho_dhi); } - } #ifdef WITH_CUDA } @@ -2995,12 +3101,14 @@ __device__ void DOPAIR2GPU( extern "C" { #endif __global__ void runner_do_pair_density_GPU( - struct part_soa parts_soa, int *d_task_first_part_ci, int *d_task_first_part_cj, - int *d_task_last_part_ci, int *d_task_last_part_cj, float d_a, float d_H, int bid, int tid, int count_tasks, - int tasksperbundle, int nBlocks_per_task, int bundle_first_task, int time_bin_inhibited) { + struct part_soa parts_soa, int *d_task_first_part_ci, + int *d_task_first_part_cj, int *d_task_last_part_ci, + int *d_task_last_part_cj, float d_a, float d_H, int bid, int tid, + int count_tasks, int tasksperbundle, int nBlocks_per_task, + int bundle_first_task, int time_bin_inhibited) { extern __shared__ float vars[]; -// __shared__ int first_part_tid_0, last_part_tid_0; + // __shared__ int first_part_tid_0, last_part_tid_0; const int threadid = blockDim.x * blockIdx.x + threadIdx.x; const int task_id = bundle_first_task + blockIdx.y; @@ -3017,19 +3125,16 @@ __global__ void runner_do_pair_density_GPU( const int pid = threadid + first_part_in_task_blocks_ci; /*Don't ever put me in an if statement. I've got __syncthreads inside*/ - DOPAIRGPU( - parts_soa, pid, last_part_in_task_blocks_ci, - first_part_in_task_blocks_cj, last_part_in_task_blocks_cj, d_a, d_H, - time_bin_inhibited, vars); -// __syncthreads(); + DOPAIRGPU(parts_soa, pid, last_part_in_task_blocks_ci, + first_part_in_task_blocks_cj, last_part_in_task_blocks_cj, d_a, d_H, + time_bin_inhibited, vars); + // __syncthreads(); // Now we start calculations for particles in cell i const int pjd = threadid + last_part_in_task_blocks_ci; /*Don't ever put me in an if statement. 
I've got __syncthreads inside*/ - DOPAIRGPU( - parts_soa, pjd, last_part_in_task_blocks_cj, - first_part_in_task_blocks_ci, last_part_in_task_blocks_ci, d_a, d_H, - time_bin_inhibited, vars); - + DOPAIRGPU(parts_soa, pjd, last_part_in_task_blocks_cj, + first_part_in_task_blocks_ci, last_part_in_task_blocks_ci, d_a, d_H, + time_bin_inhibited, vars); } #ifdef WITH_CUDA } @@ -3039,19 +3144,20 @@ __global__ void runner_do_pair_density_GPU( extern "C" { #endif __global__ void runner_do_pair_sym_density_GPU( - struct part_soa parts_soa, int *d_task_first_parts_pair, - int *d_task_last_parts_pair, float d_a, float d_H, int bid, int tid, int count_tasks, - int tasksperbundle, int nBlocks_per_task, int bundle_first_task, int time_bin_inhibited, double *d_shift_x - , double *d_shift_y, double *d_shift_z) { + struct part_soa parts_soa, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, int bid, int tid, + int count_tasks, int tasksperbundle, int nBlocks_per_task, + int bundle_first_task, int time_bin_inhibited, double *d_shift_x, + double *d_shift_y, double *d_shift_z) { extern __shared__ float vars_pair[]; -// __shared__ int first_part_tid_0, last_part_tid_0; + // __shared__ int first_part_tid_0, last_part_tid_0; const int threadid = blockDim.x * blockIdx.x + threadIdx.x; const int task_id = bundle_first_task + blockIdx.y; // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); -// int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; -// int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; + // int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; + // int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; const int task_id_tmp = 2 * task_id; const int ci_start = d_task_first_parts_pair[task_id_tmp]; const int ci_end = d_task_last_parts_pair[task_id_tmp]; @@ -3062,12 +3168,10 @@ __global__ void runner_do_pair_sym_density_GPU( const int pid = threadid + ci_start; /*Don't ever put me in an if statement. 
I've got __syncthreads inside*/ - DOPAIR2GPU( - parts_soa, pid, ci_start, ci_end, - cj_start, cj_end, d_a, d_H, - time_bin_inhibited, vars_pair, d_shift_x, d_shift_y, d_shift_z, task_id_tmp); -// __syncthreads(); - + DOPAIR2GPU(parts_soa, pid, ci_start, ci_end, cj_start, cj_end, d_a, d_H, + time_bin_inhibited, vars_pair, d_shift_x, d_shift_y, d_shift_z, + task_id_tmp); + // __syncthreads(); } #ifdef WITH_CUDA } @@ -3077,19 +3181,20 @@ __global__ void runner_do_pair_sym_density_GPU( extern "C" { #endif __global__ void runner_do_pair_nonsym_density_GPU( - struct part_soa parts_soa, int *d_task_first_parts_pair, - int *d_task_last_parts_pair, float d_a, float d_H, int bid, int tid, int count_tasks, - int tasksperbundle, int nBlocks_per_task, int bundle_first_task, int time_bin_inhibited, double *d_shift_x - , double *d_shift_y, double *d_shift_z) { + struct part_soa parts_soa, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, int bid, int tid, + int count_tasks, int tasksperbundle, int nBlocks_per_task, + int bundle_first_task, int time_bin_inhibited, double *d_shift_x, + double *d_shift_y, double *d_shift_z) { extern __shared__ float vars_pair[]; -// __shared__ int first_part_tid_0, last_part_tid_0; + // __shared__ int first_part_tid_0, last_part_tid_0; const int threadid = blockDim.x * blockIdx.x + threadIdx.x; const int task_id = bundle_first_task + blockIdx.y; // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); -// int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; -// int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; + // int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; + // int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; const int task_id_tmp = 2 * task_id; const int ci_start = d_task_first_parts_pair[task_id_tmp]; const int ci_end = d_task_last_parts_pair[task_id_tmp]; @@ -3097,26 +3202,24 @@ __global__ void runner_do_pair_nonsym_density_GPU( const int cj_end = d_task_last_parts_pair[task_id_tmp + 1]; /* Start calculations for particles in cell i - * Don't ever put me in an if statement. I've got __syncthreads inside*/ + * Don't ever put me in an if statement. I've got __syncthreads inside*/ const int pid = threadid + ci_start; const int flip_i = 1; - DOPAIR2NONSYMGPU( - parts_soa, pid, ci_start, ci_end, - cj_start, cj_end, d_a, d_H, - vars_pair, d_shift_x, d_shift_y, d_shift_z, task_id_tmp, flip_i); + DOPAIR2NONSYMGPU(parts_soa, pid, ci_start, ci_end, cj_start, cj_end, d_a, d_H, + vars_pair, d_shift_x, d_shift_y, d_shift_z, task_id_tmp, + flip_i); - /*Necessary evil to stop parts from j and i co-existing on shared memory for sums*/ + /*Necessary evil to stop parts from j and i co-existing on shared memory for + * sums*/ __syncthreads(); /*Now do cj * Don't ever put me in an if statement. 
I've got __syncthreads inside*/ const int pjd = threadid + cj_start; const int flip_j = -1; - DOPAIR2NONSYMGPU( - parts_soa, pjd, cj_start, cj_end, - ci_start, ci_end, d_a, d_H, - vars_pair, d_shift_x, d_shift_y, d_shift_z, task_id_tmp + 1, flip_j); - + DOPAIR2NONSYMGPU(parts_soa, pjd, cj_start, cj_end, ci_start, ci_end, d_a, d_H, + vars_pair, d_shift_x, d_shift_y, d_shift_z, task_id_tmp + 1, + flip_j); } #ifdef WITH_CUDA } @@ -3126,19 +3229,20 @@ __global__ void runner_do_pair_nonsym_density_GPU( extern "C" { #endif __global__ void runner_do_pair_ci_density_GPU( - struct part_soa parts_soa, int *d_task_first_parts_pair, - int *d_task_last_parts_pair, float d_a, float d_H, int bid, int tid, int count_tasks, - int tasksperbundle, int nBlocks_per_task, int bundle_first_task, double *d_shift_x - , double *d_shift_y, double *d_shift_z) { + struct part_soa parts_soa, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, int bid, int tid, + int count_tasks, int tasksperbundle, int nBlocks_per_task, + int bundle_first_task, double *d_shift_x, double *d_shift_y, + double *d_shift_z) { extern __shared__ float vars_pair[]; -// __shared__ int first_part_tid_0, last_part_tid_0; + // __shared__ int first_part_tid_0, last_part_tid_0; const int threadid = blockDim.x * blockIdx.x + threadIdx.x; const int task_id = bundle_first_task + blockIdx.y; // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); -// int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; -// int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; + // int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; + // int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; const int task_id_tmp = 2 * task_id; const int ci_start = d_task_first_parts_pair[task_id_tmp]; const int ci_end = d_task_last_parts_pair[task_id_tmp]; @@ -3146,14 +3250,12 @@ __global__ void runner_do_pair_ci_density_GPU( const int cj_end = d_task_last_parts_pair[task_id_tmp + 1]; /* Start calculations for particles in cell i - * Don't ever put me in an if statement. I've got __syncthreads inside*/ + * Don't ever put me in an if statement. 
I've got __syncthreads inside*/ const int pid = threadid + ci_start; const int flip_i = 1; - DOPAIR2NONSYMGPU( - parts_soa, pid, ci_start, ci_end, - cj_start, cj_end, d_a, d_H, - vars_pair, d_shift_x, d_shift_y, d_shift_z, task_id_tmp, flip_i); - + DOPAIR2NONSYMGPU(parts_soa, pid, ci_start, ci_end, cj_start, cj_end, d_a, d_H, + vars_pair, d_shift_x, d_shift_y, d_shift_z, task_id_tmp, + flip_i); } #ifdef WITH_CUDA } @@ -3163,19 +3265,20 @@ __global__ void runner_do_pair_ci_density_GPU( extern "C" { #endif __global__ void runner_do_pair_cj_density_GPU( - struct part_soa parts_soa, int *d_task_first_parts_pair, - int *d_task_last_parts_pair, float d_a, float d_H, int bid, int tid, int count_tasks, - int tasksperbundle, int nBlocks_per_task, int bundle_first_task, double *d_shift_x - , double *d_shift_y, double *d_shift_z) { + struct part_soa parts_soa, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, int bid, int tid, + int count_tasks, int tasksperbundle, int nBlocks_per_task, + int bundle_first_task, double *d_shift_x, double *d_shift_y, + double *d_shift_z) { extern __shared__ float vars_pair[]; -// __shared__ int first_part_tid_0, last_part_tid_0; + // __shared__ int first_part_tid_0, last_part_tid_0; const int threadid = blockDim.x * blockIdx.x + threadIdx.x; const int task_id = bundle_first_task + blockIdx.y; // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); -// int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; -// int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; + // int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; + // int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; const int task_id_tmp = 2 * task_id; const int ci_start = d_task_first_parts_pair[task_id_tmp]; const int ci_end = d_task_last_parts_pair[task_id_tmp]; @@ -3186,11 +3289,9 @@ __global__ void runner_do_pair_cj_density_GPU( * Don't ever put me in an if statement. 
I've got __syncthreads inside*/ const int pjd = threadid + cj_start; const int flip_j = -1; - DOPAIR2NONSYMGPU( - parts_soa, pjd, cj_start, cj_end, - ci_start, ci_end, d_a, d_H, - vars_pair, d_shift_x, d_shift_y, d_shift_z, task_id_tmp + 1, flip_j); - + DOPAIR2NONSYMGPU(parts_soa, pjd, cj_start, cj_end, ci_start, ci_end, d_a, d_H, + vars_pair, d_shift_x, d_shift_y, d_shift_z, task_id_tmp + 1, + flip_j); } #ifdef WITH_CUDA } @@ -3200,19 +3301,20 @@ __global__ void runner_do_pair_cj_density_GPU( extern "C" { #endif __global__ void runner_do_pair_ci_density_GPU_aos( - struct part_aos *parts_aos, int *d_task_first_parts_pair, - int *d_task_last_parts_pair, float d_a, float d_H, int bid, int tid, int count_tasks, - int tasksperbundle, int nBlocks_per_task, int bundle_first_task, double *d_shift_x - , double *d_shift_y, double *d_shift_z) { + struct part_aos *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, int bid, int tid, + int count_tasks, int tasksperbundle, int nBlocks_per_task, + int bundle_first_task, double *d_shift_x, double *d_shift_y, + double *d_shift_z) { extern __shared__ float vars_pair_aos[]; -// __shared__ int first_part_tid_0, last_part_tid_0; + // __shared__ int first_part_tid_0, last_part_tid_0; const int threadid = blockDim.x * blockIdx.x + threadIdx.x; const int task_id = bundle_first_task + blockIdx.y; // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); -// int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; -// int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; + // int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; + // int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; const int task_id_tmp = 2 * task_id; const int ci_start = d_task_first_parts_pair[task_id_tmp]; const int ci_end = d_task_last_parts_pair[task_id_tmp]; @@ -3220,14 +3322,12 @@ __global__ void runner_do_pair_ci_density_GPU_aos( const int cj_end = d_task_last_parts_pair[task_id_tmp + 1]; /* Start calculations for particles in cell i - * Don't ever put me in an if statement. I've got __syncthreads inside*/ + * Don't ever put me in an if statement. 
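The ci/cj pair kernels above all share one indexing scheme: blockIdx.y selects a task inside the current bundle, task t keeps its ci particle range at index 2*t and its cj range at 2*t + 1 of the first/last arrays, and the same non-symmetric interaction body is reused for both directions of a pair by swapping the two ranges and flipping the sign applied to the cell shift (flip_i = 1, flip_j = -1). A minimal sketch of that mapping follows; BLOCK_SIZE, the field names and the shift handling are placeholders, and unlike the real DOPAIR2NONSYMGPU macro this loop contains no __syncthreads, so a plain bounds guard is safe here, whereas the macro must be executed unconditionally by every thread.

#include <cuda_runtime.h>

#define BLOCK_SIZE 128 /* assumed; the real value comes from BLOCK_SIZE.h */

/* Sketch of the ci-side pair kernel: one thread per particle of the active
 * cell (ci), looping over every particle of the other cell (cj).  The
 * matching cj launch swaps the two ranges and passes flip = -1 so the
 * periodic shift is applied with the opposite sign. */
__global__ void pair_ci_sketch(const float *x, float *rho, const int *first,
                               const int *last, int bundle_first_task,
                               const double *shift_x, int flip) {
  const int task_id = bundle_first_task + blockIdx.y;
  const int t2 = 2 * task_id;
  const int ci_start = first[t2], ci_end = last[t2];
  const int cj_start = first[t2 + 1], cj_end = last[t2 + 1];

  const int pid = blockDim.x * blockIdx.x + threadIdx.x + ci_start;
  if (pid >= ci_end) return; /* spare threads of the last x-block do nothing */

  const float xi = x[pid] - (float)(flip * shift_x[task_id]);
  float acc = 0.f;
  for (int j = cj_start; j < cj_end; ++j) { /* stand-in for the SPH sums */
    const float dx = xi - x[j];
    acc += dx * dx;
  }
  rho[pid] = acc;
}
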
I've got __syncthreads inside*/ const int pid = threadid + ci_start; const int flip_i = 1; - DOPAIR2NONSYMGPUAOS( - parts_aos, pid, ci_start, ci_end, - cj_start, cj_end, d_a, d_H, - vars_pair_aos, d_shift_x, d_shift_y, d_shift_z, task_id_tmp, flip_i); - + DOPAIR2NONSYMGPUAOS(parts_aos, pid, ci_start, ci_end, cj_start, cj_end, d_a, + d_H, vars_pair_aos, d_shift_x, d_shift_y, d_shift_z, + task_id_tmp, flip_i); } #ifdef WITH_CUDA } @@ -3237,19 +3337,20 @@ __global__ void runner_do_pair_ci_density_GPU_aos( extern "C" { #endif __global__ void runner_do_pair_cj_density_GPU_aos( - struct part_aos *parts_aos, int *d_task_first_parts_pair, - int *d_task_last_parts_pair, float d_a, float d_H, int bid, int tid, int count_tasks, - int tasksperbundle, int nBlocks_per_task, int bundle_first_task, double *d_shift_x - , double *d_shift_y, double *d_shift_z) { + struct part_aos *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, int bid, int tid, + int count_tasks, int tasksperbundle, int nBlocks_per_task, + int bundle_first_task, double *d_shift_x, double *d_shift_y, + double *d_shift_z) { extern __shared__ float vars_pair_aos[]; -// __shared__ int first_part_tid_0, last_part_tid_0; + // __shared__ int first_part_tid_0, last_part_tid_0; const int threadid = blockDim.x * blockIdx.x + threadIdx.x; const int task_id = bundle_first_task + blockIdx.y; // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); -// int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; -// int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; + // int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; + // int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; const int task_id_tmp = 2 * task_id; const int ci_start = d_task_first_parts_pair[task_id_tmp]; const int ci_end = d_task_last_parts_pair[task_id_tmp]; @@ -3260,11 +3361,9 @@ __global__ void runner_do_pair_cj_density_GPU_aos( * Don't ever put me in an if statement. 
I've got __syncthreads inside*/ const int pjd = threadid + cj_start; const int flip_j = -1; - DOPAIR2NONSYMGPUAOS( - parts_aos, pjd, cj_start, cj_end, - ci_start, ci_end, d_a, d_H, - vars_pair_aos, d_shift_x, d_shift_y, d_shift_z, task_id_tmp + 1, flip_j); - + DOPAIR2NONSYMGPUAOS(parts_aos, pjd, cj_start, cj_end, ci_start, ci_end, d_a, + d_H, vars_pair_aos, d_shift_x, d_shift_y, d_shift_z, + task_id_tmp + 1, flip_j); } #ifdef WITH_CUDA } @@ -3274,28 +3373,29 @@ __global__ void runner_do_pair_cj_density_GPU_aos( extern "C" { #endif __global__ void runner_do_pair_ci_density_GPU_aos_f4( - struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv, int4 *fparti_fpartj_lparti_lpartj_dens, - float d_a, float d_H, int bundle_first_task) { + struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv, + int4 *fparti_fpartj_lparti_lpartj_dens, float d_a, float d_H, + int bundle_first_task) { extern __shared__ float4 vars_pair_i_f4[]; -// __shared__ int first_part_tid_0, last_part_tid_0; + // __shared__ int first_part_tid_0, last_part_tid_0; const int threadid = blockDim.x * blockIdx.x + threadIdx.x; const int task_id = bundle_first_task + blockIdx.y; // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); -// int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; -// int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; + // int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; + // int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; const int ci_start = fparti_fpartj_lparti_lpartj_dens[task_id].x; const int cj_start = fparti_fpartj_lparti_lpartj_dens[task_id].y; const int ci_end = fparti_fpartj_lparti_lpartj_dens[task_id].z; const int cj_end = fparti_fpartj_lparti_lpartj_dens[task_id].w; /* Start calculations for particles in cell i - * Don't ever put me in an if statement. I've got __syncthreads inside*/ + * Don't ever put me in an if statement. I've got __syncthreads inside*/ const int pid = threadid + ci_start; - DOPAIR2NONSYMGPUAOSF4(parts_send, parts_recv, pid, ci_start, ci_end, cj_start, cj_end, d_a, d_H, vars_pair_i_f4); - + DOPAIR2NONSYMGPUAOSF4(parts_send, parts_recv, pid, ci_start, ci_end, cj_start, + cj_end, d_a, d_H, vars_pair_i_f4); } #ifdef WITH_CUDA } @@ -3305,24 +3405,25 @@ __global__ void runner_do_pair_ci_density_GPU_aos_f4( extern "C" { #endif __global__ void runner_do_pair_cj_density_GPU_aos_f4( - struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv, int4 *fparti_fpartj_lparti_lpartj_dens, - float d_a, float d_H, int bundle_first_task) { + struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv, + int4 *fparti_fpartj_lparti_lpartj_dens, float d_a, float d_H, + int bundle_first_task) { extern __shared__ float4 vars_pair_j_f4[]; -// __shared__ int first_part_tid_0, last_part_tid_0; + // __shared__ int first_part_tid_0, last_part_tid_0; const int threadid = blockDim.x * blockIdx.x + threadIdx.x; const int task_id = bundle_first_task + blockIdx.y; const int ci_start = fparti_fpartj_lparti_lpartj_dens[task_id].x; const int cj_start = fparti_fpartj_lparti_lpartj_dens[task_id].y; const int ci_end = fparti_fpartj_lparti_lpartj_dens[task_id].z; - const int cj_end =fparti_fpartj_lparti_lpartj_dens[task_id].w; + const int cj_end = fparti_fpartj_lparti_lpartj_dens[task_id].w; /*Now do cj * Don't ever put me in an if statement. 
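The *_aos_f4 variants take a different route: each particle's inputs are packed into one send struct (position and smoothing length in a float4, its cj range in an int2-style member) and the results go to a matching recv struct, with one thread per particle of the whole bundle. Since these naive kernels have no __syncthreads, the single guard pid < bundle_first_part + bundle_n_parts is all that is needed. The sketch below uses hypothetical struct layouts; the real part_aos_f4_send/recv definitions live elsewhere in the tree.

#include <cuda_runtime.h>

/* Hypothetical packed layouts, for illustration only. */
struct send_f4 {
  float4 x_h;   /* position (x, y, z) and smoothing length h */
  int2 cjs_cje; /* first/last index of the j-cell for this particle */
};
struct recv_f4 {
  float4 sums; /* stand-in for whatever the real kernel accumulates */
};

__global__ void pair_naive_f4_sketch(const struct send_f4 *send,
                                     struct recv_f4 *recv,
                                     int bundle_first_part,
                                     int bundle_n_parts) {
  const int pid = bundle_first_part + blockDim.x * blockIdx.x + threadIdx.x;
  if (pid >= bundle_first_part + bundle_n_parts) return; /* past the bundle */

  const struct send_f4 pi = send[pid]; /* one coalesced packed read */
  float4 acc = make_float4(0.f, 0.f, 0.f, 0.f);

  /* Loop over the j-cell range carried by the particle itself. */
  for (int j = pi.cjs_cje.x; j < pi.cjs_cje.y; ++j) {
    const float4 pj = send[j].x_h;
    const float dx = pi.x_h.x - pj.x;
    const float dy = pi.x_h.y - pj.y;
    const float dz = pi.x_h.z - pj.z;
    acc.x += dx * dx + dy * dy + dz * dz; /* stand-in for the density sums */
  }
  recv[pid].sums = acc;
}

Keeping the cj range inside the send struct removes the separate task-index arrays and turns each particle's input into a pair of coalesced loads, at the cost of duplicating the range across all particles of a cell.
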
I've got __syncthreads inside*/ const int pjd = threadid + cj_start; - DOPAIR2NONSYMGPUAOSF4(parts_send, parts_recv, pjd, cj_start, cj_end, ci_start, ci_end, d_a, d_H, vars_pair_j_f4); - + DOPAIR2NONSYMGPUAOSF4(parts_send, parts_recv, pjd, cj_start, cj_end, ci_start, + ci_end, d_a, d_H, vars_pair_j_f4); } #ifdef WITH_CUDA } @@ -3332,27 +3433,27 @@ __global__ void runner_do_pair_cj_density_GPU_aos_f4( extern "C" { #endif __global__ void runner_do_pair_density_GPU_aos_f4( - struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv, - float d_a, float d_H, int bundle_first_part, int bundle_n_parts) { + struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv, + float d_a, float d_H, int bundle_first_part, int bundle_n_parts) { -// extern __shared__ float4 vars_pair_i_f4[]; -// __shared__ int first_part_tid_0, last_part_tid_0; + // extern __shared__ float4 vars_pair_i_f4[]; + // __shared__ int first_part_tid_0, last_part_tid_0; const int threadid = blockDim.x * blockIdx.x + threadIdx.x; const int pid = bundle_first_part + threadid; -// const int task_id = bundle_first_part + blockIdx.y; + // const int task_id = bundle_first_part + blockIdx.y; // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); -// int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; -// int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; - if(pid < bundle_first_part + bundle_n_parts){ + // int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; + // int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; + if (pid < bundle_first_part + bundle_n_parts) { const struct part_aos_f4_send pi = parts_send[pid]; const int cj_start = pi.cjs_cje.x; const int cj_end = pi.cjs_cje.y; - /* Start calculations for particles in cell i*/ - DOPAIR2NAIVEGPUAOSF4(pi, parts_send, parts_recv, pid, cj_start, cj_end, d_a, d_H); + /* Start calculations for particles in cell i*/ + DOPAIR2NAIVEGPUAOSF4(pi, parts_send, parts_recv, pid, cj_start, cj_end, d_a, + d_H); } - } #ifdef WITH_CUDA } @@ -3362,19 +3463,20 @@ __global__ void runner_do_pair_density_GPU_aos_f4( extern "C" { #endif __global__ void runner_do_pair_ci_density_GPU_aos_g( - struct part_aos_g *parts_aos, int *d_task_first_parts_pair, - int *d_task_last_parts_pair, float d_a, float d_H, int bid, int tid, int count_tasks, - int tasksperbundle, int nBlocks_per_task, int bundle_first_task, double *d_shift_x - , double *d_shift_y, double *d_shift_z) { + struct part_aos_g *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, int bid, int tid, + int count_tasks, int tasksperbundle, int nBlocks_per_task, + int bundle_first_task, double *d_shift_x, double *d_shift_y, + double *d_shift_z) { extern __shared__ float vars_pair_aosg[]; -// __shared__ int first_part_tid_0, last_part_tid_0; + // __shared__ int first_part_tid_0, last_part_tid_0; const int threadid = blockDim.x * blockIdx.x + threadIdx.x; const int task_id = bundle_first_task + blockIdx.y; // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); -// int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; -// int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; + // int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; + // int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; const int task_id_tmp = 2 * task_id; const int ci_start = d_task_first_parts_pair[task_id_tmp]; const int ci_end = d_task_last_parts_pair[task_id_tmp]; @@ -3382,14 +3484,12 @@ __global__ void 
runner_do_pair_ci_density_GPU_aos_g( const int cj_end = d_task_last_parts_pair[task_id_tmp + 1]; /* Start calculations for particles in cell i - * Don't ever put me in an if statement. I've got __syncthreads inside*/ + * Don't ever put me in an if statement. I've got __syncthreads inside*/ const int pid = threadid + ci_start; const int flip_i = 1; - DOPAIR2NONSYMGPUAOSG( - parts_aos, pid, ci_start, ci_end, - cj_start, cj_end, d_a, d_H, - vars_pair_aosg, d_shift_x, d_shift_y, d_shift_z, task_id_tmp, flip_i); - + DOPAIR2NONSYMGPUAOSG(parts_aos, pid, ci_start, ci_end, cj_start, cj_end, d_a, + d_H, vars_pair_aosg, d_shift_x, d_shift_y, d_shift_z, + task_id_tmp, flip_i); } #ifdef WITH_CUDA } @@ -3399,19 +3499,20 @@ __global__ void runner_do_pair_ci_density_GPU_aos_g( extern "C" { #endif __global__ void runner_do_pair_cj_density_GPU_aos_g( - struct part_aos_g *parts_aos, int *d_task_first_parts_pair, - int *d_task_last_parts_pair, float d_a, float d_H, int bid, int tid, int count_tasks, - int tasksperbundle, int nBlocks_per_task, int bundle_first_task, double *d_shift_x - , double *d_shift_y, double *d_shift_z) { + struct part_aos_g *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, int bid, int tid, + int count_tasks, int tasksperbundle, int nBlocks_per_task, + int bundle_first_task, double *d_shift_x, double *d_shift_y, + double *d_shift_z) { extern __shared__ float vars_pair_aosg[]; -// __shared__ int first_part_tid_0, last_part_tid_0; + // __shared__ int first_part_tid_0, last_part_tid_0; const int threadid = blockDim.x * blockIdx.x + threadIdx.x; const int task_id = bundle_first_task + blockIdx.y; // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); -// int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; -// int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; + // int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; + // int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; const int task_id_tmp = 2 * task_id; const int ci_start = d_task_first_parts_pair[task_id_tmp]; const int ci_end = d_task_last_parts_pair[task_id_tmp]; @@ -3422,11 +3523,9 @@ __global__ void runner_do_pair_cj_density_GPU_aos_g( * Don't ever put me in an if statement. 
I've got __syncthreads inside*/ const int pjd = threadid + cj_start; const int flip_j = -1; - DOPAIR2NONSYMGPUAOSG( - parts_aos, pjd, cj_start, cj_end, - ci_start, ci_end, d_a, d_H, - vars_pair_aosg, d_shift_x, d_shift_y, d_shift_z, task_id_tmp + 1, flip_j); - + DOPAIR2NONSYMGPUAOSG(parts_aos, pjd, cj_start, cj_end, ci_start, ci_end, d_a, + d_H, vars_pair_aosg, d_shift_x, d_shift_y, d_shift_z, + task_id_tmp + 1, flip_j); } #ifdef WITH_CUDA } @@ -3436,24 +3535,25 @@ __global__ void runner_do_pair_cj_density_GPU_aos_g( extern "C" { #endif __global__ void runner_do_pair_gradient_GPU_aos_f4( - struct part_aos_f4_g_send *parts_send, struct part_aos_f4_g_recv *parts_recv, - float d_a, float d_H, int bundle_first_part, int bundle_n_parts) { + struct part_aos_f4_g_send *parts_send, + struct part_aos_f4_g_recv *parts_recv, float d_a, float d_H, + int bundle_first_part, int bundle_n_parts) { -// extern __shared__ float4 vars_pair_i_f4[]; -// __shared__ int first_part_tid_0, last_part_tid_0; + // extern __shared__ float4 vars_pair_i_f4[]; + // __shared__ int first_part_tid_0, last_part_tid_0; const int threadid = blockDim.x * blockIdx.x + threadIdx.x; const int pid = bundle_first_part + threadid; -// const int task_id = bundle_first_part + blockIdx.y; + // const int task_id = bundle_first_part + blockIdx.y; // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); - if(pid < bundle_first_part + bundle_n_parts){ + if (pid < bundle_first_part + bundle_n_parts) { const struct part_aos_f4_g_send pi = parts_send[pid]; const int cj_start = pi.cjs_cje.x; const int cj_end = pi.cjs_cje.y; - /* Start calculations for particles in cell i*/ - DOPAIR2NAIVEGPUAOSF4G(pi, parts_send, parts_recv, pid, cj_start, cj_end, d_a, d_H); + /* Start calculations for particles in cell i*/ + DOPAIR2NAIVEGPUAOSF4G(pi, parts_send, parts_recv, pid, cj_start, cj_end, + d_a, d_H); } - } #ifdef WITH_CUDA } @@ -3463,19 +3563,20 @@ __global__ void runner_do_pair_gradient_GPU_aos_f4( extern "C" { #endif __global__ void runner_do_pair_ci_density_GPU_aos_f( - struct part_aos_f *parts_aos, int *d_task_first_parts_pair, - int *d_task_last_parts_pair, float d_a, float d_H, int bid, int tid, int count_tasks, - int tasksperbundle, int nBlocks_per_task, int bundle_first_task, double *d_shift_x - , double *d_shift_y, double *d_shift_z) { + struct part_aos_f *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, int bid, int tid, + int count_tasks, int tasksperbundle, int nBlocks_per_task, + int bundle_first_task, double *d_shift_x, double *d_shift_y, + double *d_shift_z) { extern __shared__ float vars_pair_aosf[]; -// __shared__ int first_part_tid_0, last_part_tid_0; + // __shared__ int first_part_tid_0, last_part_tid_0; const int threadid = blockDim.x * blockIdx.x + threadIdx.x; const int task_id = bundle_first_task + blockIdx.y; // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); -// int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; -// int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; + // int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; + // int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; const int task_id_tmp = 2 * task_id; const int ci_start = d_task_first_parts_pair[task_id_tmp]; const int ci_end = d_task_last_parts_pair[task_id_tmp]; @@ -3483,14 +3584,12 @@ __global__ void runner_do_pair_ci_density_GPU_aos_f( const int cj_end = d_task_last_parts_pair[task_id_tmp + 1]; /* Start calculations for particles in cell i 
- * Don't ever put me in an if statement. I've got __syncthreads inside*/ + * Don't ever put me in an if statement. I've got __syncthreads inside*/ const int pid = threadid + ci_start; const int flip_i = 1; - DOPAIR2NONSYMGPUAOSF( - parts_aos, pid, ci_start, ci_end, - cj_start, cj_end, d_a, d_H, - vars_pair_aosf, d_shift_x, d_shift_y, d_shift_z, task_id_tmp, flip_i); - + DOPAIR2NONSYMGPUAOSF(parts_aos, pid, ci_start, ci_end, cj_start, cj_end, d_a, + d_H, vars_pair_aosf, d_shift_x, d_shift_y, d_shift_z, + task_id_tmp, flip_i); } #ifdef WITH_CUDA } @@ -3500,19 +3599,20 @@ __global__ void runner_do_pair_ci_density_GPU_aos_f( extern "C" { #endif __global__ void runner_do_pair_cj_density_GPU_aos_f( - struct part_aos_f *parts_aos, int *d_task_first_parts_pair, - int *d_task_last_parts_pair, float d_a, float d_H, int bid, int tid, int count_tasks, - int tasksperbundle, int nBlocks_per_task, int bundle_first_task, double *d_shift_x - , double *d_shift_y, double *d_shift_z) { + struct part_aos_f *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, int bid, int tid, + int count_tasks, int tasksperbundle, int nBlocks_per_task, + int bundle_first_task, double *d_shift_x, double *d_shift_y, + double *d_shift_z) { extern __shared__ float vars_pair_aosf[]; -// __shared__ int first_part_tid_0, last_part_tid_0; + // __shared__ int first_part_tid_0, last_part_tid_0; const int threadid = blockDim.x * blockIdx.x + threadIdx.x; const int task_id = bundle_first_task + blockIdx.y; // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); -// int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; -// int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; + // int first_part_in_task_blocks_ci, last_part_in_task_blocks_ci; + // int first_part_in_task_blocks_cj, last_part_in_task_blocks_cj; const int task_id_tmp = 2 * task_id; const int ci_start = d_task_first_parts_pair[task_id_tmp]; const int ci_end = d_task_last_parts_pair[task_id_tmp]; @@ -3523,11 +3623,9 @@ __global__ void runner_do_pair_cj_density_GPU_aos_f( * Don't ever put me in an if statement. 
I've got __syncthreads inside*/ const int pjd = threadid + cj_start; const int flip_j = -1; - DOPAIR2NONSYMGPUAOSF( - parts_aos, pjd, cj_start, cj_end, - ci_start, ci_end, d_a, d_H, - vars_pair_aosf, d_shift_x, d_shift_y, d_shift_z, task_id_tmp + 1, flip_j); - + DOPAIR2NONSYMGPUAOSF(parts_aos, pjd, cj_start, cj_end, ci_start, ci_end, d_a, + d_H, vars_pair_aosf, d_shift_x, d_shift_y, d_shift_z, + task_id_tmp + 1, flip_j); } #ifdef WITH_CUDA } @@ -3537,24 +3635,25 @@ __global__ void runner_do_pair_cj_density_GPU_aos_f( extern "C" { #endif __global__ void runner_do_pair_force_GPU_aos_f4( - struct part_aos_f4_f_send *parts_send, struct part_aos_f4_f_recv *parts_recv, - float d_a, float d_H, int bundle_first_part, int bundle_n_parts) { + struct part_aos_f4_f_send *parts_send, + struct part_aos_f4_f_recv *parts_recv, float d_a, float d_H, + int bundle_first_part, int bundle_n_parts) { -// extern __shared__ float4 vars_pair_i_f4[]; -// __shared__ int first_part_tid_0, last_part_tid_0; + // extern __shared__ float4 vars_pair_i_f4[]; + // __shared__ int first_part_tid_0, last_part_tid_0; const int threadid = blockDim.x * blockIdx.x + threadIdx.x; const int pid = bundle_first_part + threadid; -// const int task_id = bundle_first_part + blockIdx.y; + // const int task_id = bundle_first_part + blockIdx.y; // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); - if(pid < bundle_first_part + bundle_n_parts){ + if (pid < bundle_first_part + bundle_n_parts) { const struct part_aos_f4_f_send pi = parts_send[pid]; const int cj_start = pi.cjs_cje.x; const int cj_end = pi.cjs_cje.y; - /* Start calculations for particles in cell i */ - DOPAIR2NAIVEGPUAOSF4F(pi, parts_send, parts_recv, pid, cj_start, cj_end, d_a, d_H); + /* Start calculations for particles in cell i */ + DOPAIR2NAIVEGPUAOSF4F(pi, parts_send, parts_recv, pid, cj_start, cj_end, + d_a, d_H); } - } #ifdef WITH_CUDA } @@ -3563,42 +3662,45 @@ __global__ void runner_do_pair_force_GPU_aos_f4( #ifdef WITH_CUDA extern "C" { #endif -void runner_dopair1_branch_density_gpu(struct part_soa parts_soa, int *d_task_first_parts_pair, - int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, cudaStream_t stream, - int bid, int block_size, int count_tasks, int tasksperbundle,int max_parts_i, int max_parts_j, - int numBlocks_y, int tid, int offset, int bundle_first_task, int time_bin_inhibited, double * d_shift_x - , double * d_shift_y, double * d_shift_z) { - +void runner_dopair1_branch_density_gpu( + struct part_soa parts_soa, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, int count_tasks, + int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y, + int tid, int offset, int bundle_first_task, int time_bin_inhibited, + double *d_shift_x, double *d_shift_y, double *d_shift_z) { int max_parts = max(max_parts_j, max_parts_i); int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; dim3 gridShape = dim3(numBlocks_x, numBlocks_y); int nBlocks_per_task = numBlocks_x; -// fprintf(stderr,"max_parts %i, max_partsi %i, max_partsj %i\n, " -// "numBlocks_x %i, numBlocks_y %i, BLOCK_SIZE %i\n", max_parts, -// max_parts_i, max_parts_j, numBlocks_x, numBlocks_y, BLOCK_SIZE); + // fprintf(stderr,"max_parts %i, max_partsi %i, max_partsj %i\n, " + // "numBlocks_x %i, numBlocks_y %i, BLOCK_SIZE %i\n", max_parts, + // max_parts_i, max_parts_j, numBlocks_x, numBlocks_y, + //BLOCK_SIZE); /*Do ci & cj*/ -// fprintf(stderr, 
"BLOCK_SIZE %i max parts %i num idle threads %i\n", BLOCK_SIZE, max_parts, numBlocks_x * BLOCK_SIZE - max_parts); - -// runner_do_pair_sym_density_GPU<<>>( -// parts_soa, d_task_first_parts_pair, d_task_last_parts_pair, -// d_a, d_H, bid, tid, count_tasks, tasksperbundle, -// nBlocks_per_task, bundle_first_task, time_bin_inhibited, d_shift_x, d_shift_y, d_shift_z); + // fprintf(stderr, "BLOCK_SIZE %i max parts %i num idle threads %i\n", + // BLOCK_SIZE, max_parts, numBlocks_x * BLOCK_SIZE - max_parts); + + // runner_do_pair_sym_density_GPU<<>>( + // parts_soa, d_task_first_parts_pair, d_task_last_parts_pair, + // d_a, d_H, bid, tid, count_tasks, tasksperbundle, + // nBlocks_per_task, bundle_first_task, time_bin_inhibited, d_shift_x, + // d_shift_y, d_shift_z); runner_do_pair_nonsym_density_GPU<<>>( - parts_soa, d_task_first_parts_pair, d_task_last_parts_pair, - d_a, d_H, bid, tid, count_tasks, tasksperbundle, - nBlocks_per_task, bundle_first_task, time_bin_inhibited, d_shift_x, d_shift_y, d_shift_z); - + 5 * BLOCK_SIZE * sizeof(float) + + 3 * BLOCK_SIZE * sizeof(float) + + BLOCK_SIZE * sizeof(timebin_t), + stream>>>( + parts_soa, d_task_first_parts_pair, d_task_last_parts_pair, d_a, d_H, bid, + tid, count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task, + time_bin_inhibited, d_shift_x, d_shift_y, d_shift_z); } #ifdef WITH_CUDA } @@ -3607,28 +3709,27 @@ void runner_dopair1_branch_density_gpu(struct part_soa parts_soa, int *d_task_fi #ifdef WITH_CUDA extern "C" { #endif -void runner_dopairci_branch_density_gpu(struct part_soa parts_soa, int *d_task_first_parts_pair, - int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, cudaStream_t stream, - int bid, int block_size, int count_tasks, int tasksperbundle,int max_parts_i, int max_parts_j, - int numBlocks_y, int tid, int offset, int bundle_first_task, double * d_shift_x - , double * d_shift_y, double * d_shift_z) { - +void runner_dopairci_branch_density_gpu( + struct part_soa parts_soa, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, int count_tasks, + int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y, + int tid, int offset, int bundle_first_task, double *d_shift_x, + double *d_shift_y, double *d_shift_z) { int max_parts = max_parts_i; int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; dim3 gridShape = dim3(numBlocks_x, numBlocks_y); int nBlocks_per_task = numBlocks_x; - runner_do_pair_ci_density_GPU<<>>( - parts_soa, d_task_first_parts_pair, d_task_last_parts_pair, - d_a, d_H, bid, tid, count_tasks, tasksperbundle, - nBlocks_per_task, bundle_first_task, d_shift_x, d_shift_y, d_shift_z); - + 5 * BLOCK_SIZE * sizeof(float) + + 3 * BLOCK_SIZE * sizeof(float) + + BLOCK_SIZE * sizeof(timebin_t), + stream>>>( + parts_soa, d_task_first_parts_pair, d_task_last_parts_pair, d_a, d_H, bid, + tid, count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task, + d_shift_x, d_shift_y, d_shift_z); } #ifdef WITH_CUDA } @@ -3637,28 +3738,27 @@ void runner_dopairci_branch_density_gpu(struct part_soa parts_soa, int *d_task_f #ifdef WITH_CUDA extern "C" { #endif -void runner_dopaircj_branch_density_gpu(struct part_soa parts_soa, int *d_task_first_parts_pair, - int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, cudaStream_t stream, - int bid, int block_size, int count_tasks, int tasksperbundle,int max_parts_i, int max_parts_j, - int numBlocks_y, int tid, int offset, int 
bundle_first_task, double * d_shift_x - , double * d_shift_y, double * d_shift_z) { - +void runner_dopaircj_branch_density_gpu( + struct part_soa parts_soa, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, int count_tasks, + int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y, + int tid, int offset, int bundle_first_task, double *d_shift_x, + double *d_shift_y, double *d_shift_z) { int max_parts = max_parts_j; int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; dim3 gridShape = dim3(numBlocks_x, numBlocks_y); int nBlocks_per_task = numBlocks_x; - runner_do_pair_cj_density_GPU<<>>( - parts_soa, d_task_first_parts_pair, d_task_last_parts_pair, - d_a, d_H, bid, tid, count_tasks, tasksperbundle, - nBlocks_per_task, bundle_first_task, d_shift_x, d_shift_y, d_shift_z); - + 5 * BLOCK_SIZE * sizeof(float) + + 3 * BLOCK_SIZE * sizeof(float) + + BLOCK_SIZE * sizeof(timebin_t), + stream>>>( + parts_soa, d_task_first_parts_pair, d_task_last_parts_pair, d_a, d_H, bid, + tid, count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task, + d_shift_x, d_shift_y, d_shift_z); } #ifdef WITH_CUDA } @@ -3667,28 +3767,27 @@ void runner_dopaircj_branch_density_gpu(struct part_soa parts_soa, int *d_task_f #ifdef WITH_CUDA extern "C" { #endif -void runner_dopairci_branch_density_gpu_aos(struct part_aos *parts_aos, int *d_task_first_parts_pair, - int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, cudaStream_t stream, - int bid, int block_size, int count_tasks, int tasksperbundle,int max_parts_i, int max_parts_j, - int numBlocks_y, int tid, int offset, int bundle_first_task, double * d_shift_x - , double * d_shift_y, double * d_shift_z) { - +void runner_dopairci_branch_density_gpu_aos( + struct part_aos *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, int count_tasks, + int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y, + int tid, int offset, int bundle_first_task, double *d_shift_x, + double *d_shift_y, double *d_shift_z) { int max_parts = max_parts_i; int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; dim3 gridShape = dim3(numBlocks_x, numBlocks_y); int nBlocks_per_task = numBlocks_x; - runner_do_pair_ci_density_GPU_aos<<>>( - parts_aos, d_task_first_parts_pair, d_task_last_parts_pair, - d_a, d_H, bid, tid, count_tasks, tasksperbundle, - nBlocks_per_task, bundle_first_task, d_shift_x, d_shift_y, d_shift_z); - + 5 * BLOCK_SIZE * sizeof(float) + + 3 * BLOCK_SIZE * sizeof(float) + + BLOCK_SIZE * sizeof(int), + stream>>>( + parts_aos, d_task_first_parts_pair, d_task_last_parts_pair, d_a, d_H, bid, + tid, count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task, + d_shift_x, d_shift_y, d_shift_z); } #ifdef WITH_CUDA } @@ -3697,48 +3796,47 @@ void runner_dopairci_branch_density_gpu_aos(struct part_aos *parts_aos, int *d_t #ifdef WITH_CUDA extern "C" { #endif -void runner_dopaircj_branch_density_gpu_aos(struct part_aos *parts_aos, int *d_task_first_parts_pair, - int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, cudaStream_t stream, - int bid, int block_size, int count_tasks, int tasksperbundle,int max_parts_i, int max_parts_j, - int numBlocks_y, int tid, int offset, int bundle_first_task, double * d_shift_x - , double * d_shift_y, double * d_shift_z) { - +void 
runner_dopaircj_branch_density_gpu_aos( + struct part_aos *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, int count_tasks, + int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y, + int tid, int offset, int bundle_first_task, double *d_shift_x, + double *d_shift_y, double *d_shift_z) { int max_parts = max_parts_j; int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; dim3 gridShape = dim3(numBlocks_x, numBlocks_y); int nBlocks_per_task = numBlocks_x; - runner_do_pair_cj_density_GPU_aos<<>>( - parts_aos, d_task_first_parts_pair, d_task_last_parts_pair, - d_a, d_H, bid, tid, count_tasks, tasksperbundle, - nBlocks_per_task, bundle_first_task, d_shift_x, d_shift_y, d_shift_z); - + 5 * BLOCK_SIZE * sizeof(float) + + 3 * BLOCK_SIZE * sizeof(float) + + BLOCK_SIZE * sizeof(int), + stream>>>( + parts_aos, d_task_first_parts_pair, d_task_last_parts_pair, d_a, d_H, bid, + tid, count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task, + d_shift_x, d_shift_y, d_shift_z); } #ifdef WITH_CUDA } #endif - #ifdef WITH_CUDA extern "C" { #endif -void runner_dopairci_branch_density_gpu_aos_f4(struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv, - float d_a, float d_H, cudaStream_t stream, - int numBlocks_x, int numBlocks_y, int bundle_first_task, int4 *fparti_fpartj_lparti_lpartj_dens){ - - dim3 gridShape = dim3(numBlocks_x, numBlocks_y); - int nBlocks_per_task = numBlocks_x; - +void runner_dopairci_branch_density_gpu_aos_f4( + struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv, + float d_a, float d_H, cudaStream_t stream, int numBlocks_x, int numBlocks_y, + int bundle_first_task, int4 *fparti_fpartj_lparti_lpartj_dens) { - runner_do_pair_ci_density_GPU_aos_f4<<>>( - parts_send, parts_recv, fparti_fpartj_lparti_lpartj_dens, d_a, d_H, bundle_first_task); + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + runner_do_pair_ci_density_GPU_aos_f4<<< + gridShape, BLOCK_SIZE, 2 * BLOCK_SIZE * sizeof(float4), stream>>>( + parts_send, parts_recv, fparti_fpartj_lparti_lpartj_dens, d_a, d_H, + bundle_first_task); } #ifdef WITH_CUDA } @@ -3747,16 +3845,18 @@ void runner_dopairci_branch_density_gpu_aos_f4(struct part_aos_f4_send *parts_se #ifdef WITH_CUDA extern "C" { #endif -void runner_dopaircj_branch_density_gpu_aos_f4(struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv, - float d_a, float d_H, cudaStream_t stream, - int numBlocks_x, int numBlocks_y, int bundle_first_task, int4 *fparti_fpartj_lparti_lpartj_dens) { +void runner_dopaircj_branch_density_gpu_aos_f4( + struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv, + float d_a, float d_H, cudaStream_t stream, int numBlocks_x, int numBlocks_y, + int bundle_first_task, int4 *fparti_fpartj_lparti_lpartj_dens) { dim3 gridShape = dim3(numBlocks_x, numBlocks_y); int nBlocks_per_task = numBlocks_x; - runner_do_pair_cj_density_GPU_aos_f4<<>>( - parts_send, parts_recv, fparti_fpartj_lparti_lpartj_dens, d_a, d_H, bundle_first_task); - + runner_do_pair_cj_density_GPU_aos_f4<<< + gridShape, BLOCK_SIZE, 2 * BLOCK_SIZE * sizeof(float4), stream>>>( + parts_send, parts_recv, fparti_fpartj_lparti_lpartj_dens, d_a, d_H, + bundle_first_task); } #ifdef WITH_CUDA } @@ -3765,17 +3865,17 @@ void runner_dopaircj_branch_density_gpu_aos_f4(struct part_aos_f4_send *parts_se #ifdef WITH_CUDA extern "C" { #endif -void 
runner_dopair_branch_density_gpu_aos_f4(struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv, - float d_a, float d_H, cudaStream_t stream, - int numBlocks_x, int numBlocks_y, int bundle_first_part, int bundle_n_parts){ - - dim3 gridShape = dim3(numBlocks_x, numBlocks_y); - int nBlocks_per_task = numBlocks_x; +void runner_dopair_branch_density_gpu_aos_f4( + struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv, + float d_a, float d_H, cudaStream_t stream, int numBlocks_x, int numBlocks_y, + int bundle_first_part, int bundle_n_parts) { -// fprintf(stderr, "nblocks %i\n", numBlocks_x); - runner_do_pair_density_GPU_aos_f4<<>>( - parts_send, parts_recv, d_a, d_H, bundle_first_part, bundle_n_parts); + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + // fprintf(stderr, "nblocks %i\n", numBlocks_x); + runner_do_pair_density_GPU_aos_f4<<>>( + parts_send, parts_recv, d_a, d_H, bundle_first_part, bundle_n_parts); } #ifdef WITH_CUDA } @@ -3784,27 +3884,25 @@ void runner_dopair_branch_density_gpu_aos_f4(struct part_aos_f4_send *parts_send #ifdef WITH_CUDA extern "C" { #endif -void runner_dopairci_branch_density_gpu_aos_g(struct part_aos_g *parts_aos, int *d_task_first_parts_pair, - int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, cudaStream_t stream, - int bid, int block_size, int count_tasks, int tasksperbundle,int max_parts_i, int max_parts_j, - int numBlocks_y, int tid, int offset, int bundle_first_task, double * d_shift_x - , double * d_shift_y, double * d_shift_z) { - +void runner_dopairci_branch_density_gpu_aos_g( + struct part_aos_g *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, int count_tasks, + int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y, + int tid, int offset, int bundle_first_task, double *d_shift_x, + double *d_shift_y, double *d_shift_z) { int max_parts = max_parts_i; int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; dim3 gridShape = dim3(numBlocks_x, numBlocks_y); int nBlocks_per_task = numBlocks_x; - - runner_do_pair_ci_density_GPU_aos_g<<>>( - parts_aos, d_task_first_parts_pair, d_task_last_parts_pair, - d_a, d_H, bid, tid, count_tasks, tasksperbundle, - nBlocks_per_task, bundle_first_task, d_shift_x, d_shift_y, d_shift_z); - + runner_do_pair_ci_density_GPU_aos_g<<< + gridShape, BLOCK_SIZE, + 12 * BLOCK_SIZE * sizeof(float) + BLOCK_SIZE * sizeof(int), stream>>>( + parts_aos, d_task_first_parts_pair, d_task_last_parts_pair, d_a, d_H, bid, + tid, count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task, + d_shift_x, d_shift_y, d_shift_z); } #ifdef WITH_CUDA } @@ -3813,27 +3911,25 @@ void runner_dopairci_branch_density_gpu_aos_g(struct part_aos_g *parts_aos, int #ifdef WITH_CUDA extern "C" { #endif -void runner_dopaircj_branch_density_gpu_aos_g(struct part_aos_g *parts_aos, int *d_task_first_parts_pair, - int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, cudaStream_t stream, - int bid, int block_size, int count_tasks, int tasksperbundle,int max_parts_i, int max_parts_j, - int numBlocks_y, int tid, int offset, int bundle_first_task, double * d_shift_x - , double * d_shift_y, double * d_shift_z) { - +void runner_dopaircj_branch_density_gpu_aos_g( + struct part_aos_g *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, + cudaStream_t stream, 
int bid, int block_size, int count_tasks, + int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y, + int tid, int offset, int bundle_first_task, double *d_shift_x, + double *d_shift_y, double *d_shift_z) { int max_parts = max_parts_j; int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; dim3 gridShape = dim3(numBlocks_x, numBlocks_y); int nBlocks_per_task = numBlocks_x; - - runner_do_pair_cj_density_GPU_aos_g<<>>( - parts_aos, d_task_first_parts_pair, d_task_last_parts_pair, - d_a, d_H, bid, tid, count_tasks, tasksperbundle, - nBlocks_per_task, bundle_first_task, d_shift_x, d_shift_y, d_shift_z); - + runner_do_pair_cj_density_GPU_aos_g<<< + gridShape, BLOCK_SIZE, + 12 * BLOCK_SIZE * sizeof(float) + BLOCK_SIZE * sizeof(int), stream>>>( + parts_aos, d_task_first_parts_pair, d_task_last_parts_pair, d_a, d_H, bid, + tid, count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task, + d_shift_x, d_shift_y, d_shift_z); } #ifdef WITH_CUDA } @@ -3842,17 +3938,18 @@ void runner_dopaircj_branch_density_gpu_aos_g(struct part_aos_g *parts_aos, int #ifdef WITH_CUDA extern "C" { #endif -void runner_dopair_branch_gradient_gpu_aos_f4(struct part_aos_f4_g_send *parts_send, struct part_aos_f4_g_recv *parts_recv, - float d_a, float d_H, cudaStream_t stream, - int numBlocks_x, int numBlocks_y, int bundle_first_part, int bundle_n_parts){ +void runner_dopair_branch_gradient_gpu_aos_f4( + struct part_aos_f4_g_send *parts_send, + struct part_aos_f4_g_recv *parts_recv, float d_a, float d_H, + cudaStream_t stream, int numBlocks_x, int numBlocks_y, + int bundle_first_part, int bundle_n_parts) { - dim3 gridShape = dim3(numBlocks_x, numBlocks_y); - int nBlocks_per_task = numBlocks_x; - -// fprintf(stderr, "nblocks %i\n", numBlocks_x); - runner_do_pair_gradient_GPU_aos_f4<<>>( - parts_send, parts_recv, d_a, d_H, bundle_first_part, bundle_n_parts); + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + // fprintf(stderr, "nblocks %i\n", numBlocks_x); + runner_do_pair_gradient_GPU_aos_f4<<>>( + parts_send, parts_recv, d_a, d_H, bundle_first_part, bundle_n_parts); } #ifdef WITH_CUDA } @@ -3861,27 +3958,25 @@ void runner_dopair_branch_gradient_gpu_aos_f4(struct part_aos_f4_g_send *parts_s #ifdef WITH_CUDA extern "C" { #endif -void runner_dopairci_branch_density_gpu_aos_f(struct part_aos_f *parts_aos, int *d_task_first_parts_pair, - int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, cudaStream_t stream, - int bid, int block_size, int count_tasks, int tasksperbundle,int max_parts_i, int max_parts_j, - int numBlocks_y, int tid, int offset, int bundle_first_task, double * d_shift_x - , double * d_shift_y, double * d_shift_z) { - +void runner_dopairci_branch_density_gpu_aos_f( + struct part_aos_f *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, int count_tasks, + int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y, + int tid, int offset, int bundle_first_task, double *d_shift_x, + double *d_shift_y, double *d_shift_z) { int max_parts = max_parts_i; int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; dim3 gridShape = dim3(numBlocks_x, numBlocks_y); int nBlocks_per_task = numBlocks_x; - - runner_do_pair_ci_density_GPU_aos_f<<>>( - parts_aos, d_task_first_parts_pair, d_task_last_parts_pair, - d_a, d_H, bid, tid, count_tasks, tasksperbundle, - nBlocks_per_task, bundle_first_task, d_shift_x, d_shift_y, 
d_shift_z); - + runner_do_pair_ci_density_GPU_aos_f<<< + gridShape, BLOCK_SIZE, + 17 * BLOCK_SIZE * sizeof(float) + BLOCK_SIZE * sizeof(int), stream>>>( + parts_aos, d_task_first_parts_pair, d_task_last_parts_pair, d_a, d_H, bid, + tid, count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task, + d_shift_x, d_shift_y, d_shift_z); } #ifdef WITH_CUDA } @@ -3890,27 +3985,25 @@ void runner_dopairci_branch_density_gpu_aos_f(struct part_aos_f *parts_aos, int #ifdef WITH_CUDA extern "C" { #endif -void runner_dopaircj_branch_density_gpu_aos_f(struct part_aos_f *parts_aos, int *d_task_first_parts_pair, - int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, cudaStream_t stream, - int bid, int block_size, int count_tasks, int tasksperbundle,int max_parts_i, int max_parts_j, - int numBlocks_y, int tid, int offset, int bundle_first_task, double * d_shift_x - , double * d_shift_y, double * d_shift_z) { - +void runner_dopaircj_branch_density_gpu_aos_f( + struct part_aos_f *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, int count_tasks, + int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y, + int tid, int offset, int bundle_first_task, double *d_shift_x, + double *d_shift_y, double *d_shift_z) { int max_parts = max_parts_j; int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; dim3 gridShape = dim3(numBlocks_x, numBlocks_y); int nBlocks_per_task = numBlocks_x; - - runner_do_pair_cj_density_GPU_aos_f<<>>( - parts_aos, d_task_first_parts_pair, d_task_last_parts_pair, - d_a, d_H, bid, tid, count_tasks, tasksperbundle, - nBlocks_per_task, bundle_first_task, d_shift_x, d_shift_y, d_shift_z); - + runner_do_pair_cj_density_GPU_aos_f<<< + gridShape, BLOCK_SIZE, + 17 * BLOCK_SIZE * sizeof(float) + BLOCK_SIZE * sizeof(int), stream>>>( + parts_aos, d_task_first_parts_pair, d_task_last_parts_pair, d_a, d_H, bid, + tid, count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task, + d_shift_x, d_shift_y, d_shift_z); } #ifdef WITH_CUDA } @@ -3919,17 +4012,18 @@ void runner_dopaircj_branch_density_gpu_aos_f(struct part_aos_f *parts_aos, int #ifdef WITH_CUDA extern "C" { #endif -void runner_dopair_branch_force_gpu_aos_f4(struct part_aos_f4_f_send *parts_send, struct part_aos_f4_f_recv *parts_recv, - float d_a, float d_H, cudaStream_t stream, - int numBlocks_x, int numBlocks_y, int bundle_first_part, int bundle_n_parts){ - - dim3 gridShape = dim3(numBlocks_x, numBlocks_y); - int nBlocks_per_task = numBlocks_x; +void runner_dopair_branch_force_gpu_aos_f4( + struct part_aos_f4_f_send *parts_send, + struct part_aos_f4_f_recv *parts_recv, float d_a, float d_H, + cudaStream_t stream, int numBlocks_x, int numBlocks_y, + int bundle_first_part, int bundle_n_parts) { -// fprintf(stderr, "nblocks %i\n", numBlocks_x); - runner_do_pair_force_GPU_aos_f4<<>>( - parts_send, parts_recv, d_a, d_H, bundle_first_part, bundle_n_parts); + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + // fprintf(stderr, "nblocks %i\n", numBlocks_x); + runner_do_pair_force_GPU_aos_f4<<>>( + parts_send, parts_recv, d_a, d_H, bundle_first_part, bundle_n_parts); } #ifdef WITH_CUDA } @@ -3940,9 +4034,10 @@ extern "C" { #endif __global__ void runner_do_self_density_GPU_naive( - struct part_soa parts_soa, int *d_task_first_part, int *d_task_last_part, float d_a, float d_H, - int bid, int tid, int count_tasks, int tasksperbundle, int nBlocks_per_task, - 
int bundle_first_task, int max_parts, int time_bin_inhibited) { + struct part_soa parts_soa, int *d_task_first_part, int *d_task_last_part, + float d_a, float d_H, int bid, int tid, int count_tasks, int tasksperbundle, + int nBlocks_per_task, int bundle_first_task, int max_parts, + int time_bin_inhibited) { const int threadid = blockDim.x * blockIdx.x + threadIdx.x; const int task_id = bundle_first_task + blockIdx.y; @@ -3978,90 +4073,90 @@ __global__ void runner_do_self_density_GPU_naive( int Found_neighbours = 0; if (pid < last_part_in_task_blocks) { - ttid = parts_soa.tid_p[pid]; - first_part = d_task_first_part[ttid]; - last_part = d_task_last_part[ttid]; - count = last_part - first_part; - cellx = parts_soa.locx[pid], celly = parts_soa.locy[pid], - cellz = parts_soa.locz[pid]; - hi = parts_soa.h[pid], hig2 = hi * hi * kernel_gamma2; - mi = parts_soa.mass[pid]; - uxi = parts_soa.ux[pid]; - uyi = parts_soa.uy[pid]; - uzi = parts_soa.uz[pid]; - pix = parts_soa.x_p[pid] - cellx; - piy = parts_soa.y_p[pid] - celly; - piz = parts_soa.z_p[pid] - cellz; + ttid = parts_soa.tid_p[pid]; + first_part = d_task_first_part[ttid]; + last_part = d_task_last_part[ttid]; + count = last_part - first_part; + cellx = parts_soa.locx[pid], celly = parts_soa.locy[pid], + cellz = parts_soa.locz[pid]; + hi = parts_soa.h[pid], hig2 = hi * hi * kernel_gamma2; + mi = parts_soa.mass[pid]; + uxi = parts_soa.ux[pid]; + uyi = parts_soa.uy[pid]; + uzi = parts_soa.uz[pid]; + pix = parts_soa.x_p[pid] - cellx; + piy = parts_soa.y_p[pid] - celly; + piz = parts_soa.z_p[pid] - cellz; int n_neighbours = 0; /*Naive loop over neighbours*/ for (int b = first_part_in_task_blocks; b < last_part_in_task_blocks; - b += BLOCK_SIZE) { + b += BLOCK_SIZE) { for (int j_block = 0; j_block < BLOCK_SIZE; j_block++) { - int j = j_block + b; - if (j < last_part_in_task_blocks) { - const float x_p_tmp = parts_soa.x_p[j]; - const float y_p_tmp = parts_soa.y_p[j]; - const float z_p_tmp = parts_soa.z_p[j]; - const float h_tmp = parts_soa.h[j]; - const float mass_tmp = parts_soa.mass[j]; - const float ux_tmp = parts_soa.ux[j]; - const float uy_tmp = parts_soa.uy[j]; - const float uz_tmp = parts_soa.uz[j]; - const timebin_t timebin = parts_soa.time_bin[j]; - - /* Compute the pairwise distance. */ - const float pjx = x_p_tmp - cellx; - const float pjy = y_p_tmp - celly; - const float pjz = z_p_tmp - cellz; - const float xij = pix - pjx, yij = piy - pjy, zij = piz - pjz; - const float r2 = xij * xij + yij * yij + zij * zij; - const float hj = h_tmp, hjg2 = hj * hj * kernel_gamma2; - if (r2 < hig2 && r2 > (0.01f/128.f)*(0.01f/128.f)) { - Found_neighbours=1; - const float r = sqrt(r2); - /* Recover some data */ - const float mj = mass_tmp; - /* Get the kernel for hi. 
*/ - if(hi<1.f/128.f)printf("h < dx\n"); - const float h_inv = 1.f / hi; - const float ui = r * h_inv; - float wi, wi_dx; - - d_kernel_deval(ui, &wi, &wi_dx); - - rhoi += mj * wi; - rho_dhi -= mj * (hydro_dimension * wi + ui * wi_dx); - - wcounti += wi; - wcount_dhi -= (hydro_dimension * wi + ui * wi_dx); - - const float r_inv = 1.f / r; - const float faci = mj * wi_dx * r_inv; - - /* Compute dv dot r */ - float dvx = uxi - ux_tmp, dvy = uyi - uy_tmp, - dvz = uzi - uz_tmp; - const float dvdr = dvx * xij + dvy * yij + dvz * zij; - - div_vi -= faci * dvdr; - - /* Compute dv cross r */ - float curlvrx = dvy * zij - dvz * yij; - float curlvry = dvz * xij - dvx * zij; - float curlvrz = dvx * yij - dvy * xij; - - rot_uxi += faci * curlvrx; - rot_uyi += faci * curlvry; - rot_uzi += faci * curlvrz; - } - } + int j = j_block + b; + if (j < last_part_in_task_blocks) { + const float x_p_tmp = parts_soa.x_p[j]; + const float y_p_tmp = parts_soa.y_p[j]; + const float z_p_tmp = parts_soa.z_p[j]; + const float h_tmp = parts_soa.h[j]; + const float mass_tmp = parts_soa.mass[j]; + const float ux_tmp = parts_soa.ux[j]; + const float uy_tmp = parts_soa.uy[j]; + const float uz_tmp = parts_soa.uz[j]; + const timebin_t timebin = parts_soa.time_bin[j]; + + /* Compute the pairwise distance. */ + const float pjx = x_p_tmp - cellx; + const float pjy = y_p_tmp - celly; + const float pjz = z_p_tmp - cellz; + const float xij = pix - pjx, yij = piy - pjy, zij = piz - pjz; + const float r2 = xij * xij + yij * yij + zij * zij; + const float hj = h_tmp, hjg2 = hj * hj * kernel_gamma2; + if (r2 < hig2 && r2 > (0.01f / 128.f) * (0.01f / 128.f)) { + Found_neighbours = 1; + const float r = sqrt(r2); + /* Recover some data */ + const float mj = mass_tmp; + /* Get the kernel for hi. */ + if (hi < 1.f / 128.f) printf("h < dx\n"); + const float h_inv = 1.f / hi; + const float ui = r * h_inv; + float wi, wi_dx; + + d_kernel_deval(ui, &wi, &wi_dx); + + rhoi += mj * wi; + rho_dhi -= mj * (hydro_dimension * wi + ui * wi_dx); + + wcounti += wi; + wcount_dhi -= (hydro_dimension * wi + ui * wi_dx); + + const float r_inv = 1.f / r; + const float faci = mj * wi_dx * r_inv; + + /* Compute dv dot r */ + float dvx = uxi - ux_tmp, dvy = uyi - uy_tmp, dvz = uzi - uz_tmp; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + + div_vi -= faci * dvdr; + + /* Compute dv cross r */ + float curlvrx = dvy * zij - dvz * yij; + float curlvry = dvz * xij - dvx * zij; + float curlvrz = dvx * yij - dvy * xij; + + rot_uxi += faci * curlvrx; + rot_uyi += faci * curlvry; + rot_uzi += faci * curlvrz; + } + } } } -// float wi, wi_dx; -// d_kernel_deval(0.f, &wi, &wi_dx); - if(Found_neighbours == 0) printf("Not sure what's going on but no neighbours found in GPU loop\n"); + // float wi, wi_dx; + // d_kernel_deval(0.f, &wi, &wi_dx); + if (Found_neighbours == 0) + printf("Not sure what's going on but no neighbours found in GPU loop\n"); parts_soa.rho[pid] = rhoi, parts_soa.rho_dh[pid] = rho_dhi; parts_soa.wcount[pid] = wcounti, parts_soa.wcount_dh[pid] = wcount_dhi; parts_soa.div_v[pid] = div_vi; @@ -4077,21 +4172,20 @@ __global__ void runner_do_self_density_GPU_naive( extern "C" { #endif void launch_tester_kernel(struct part_soa parts_soa, int *d_task_first_part, - int *d_task_last_part, float d_a, float d_H, - const char *loop_type, cudaStream_t stream, int bid, - int block_size, int count_tasks, int tasksperbundle, - int numBlocks_x, int numBlocks_y, int tid, - int offset, int bundle_first_task, int max_parts, - int time_bin_inhibited) { + int 
*d_task_last_part, float d_a, float d_H, + const char *loop_type, cudaStream_t stream, int bid, + int block_size, int count_tasks, int tasksperbundle, + int numBlocks_x, int numBlocks_y, int tid, int offset, + int bundle_first_task, int max_parts, + int time_bin_inhibited) { dim3 gridShape = dim3(numBlocks_x, numBlocks_y); int nBlocks_per_task = numBlocks_x; tester<<>>( - parts_soa, d_task_first_part, d_task_last_part, d_a, d_H, bid, tid, count_tasks, tasksperbundle, - nBlocks_per_task, bundle_first_task, max_parts, time_bin_inhibited); + 8 * BLOCK_SIZE * sizeof(float) + BLOCK_SIZE * sizeof(timebin_t), + stream>>>(parts_soa, d_task_first_part, d_task_last_part, d_a, d_H, + bid, tid, count_tasks, tasksperbundle, nBlocks_per_task, + bundle_first_task, max_parts, time_bin_inhibited); } #ifdef WITH_CUDA } @@ -4113,11 +4207,12 @@ void launch_density_kernel(struct part_soa parts_soa, int *d_task_first_part, 8 * BLOCK_SIZE * sizeof(float) + BLOCK_SIZE * sizeof(timebin_t), stream>>>( - parts_soa, d_task_first_part, d_task_last_part, d_a, d_H, count_tasks, tasksperbundle, - nBlocks_per_task, bundle_first_task, max_parts); -// runner_do_self_density_GPU_naive<<>>( -// parts_soa, d_task_first_part, d_task_last_part, d_a, d_H, bid, tid, count_tasks, tasksperbundle, -// nBlocks_per_task, bundle_first_task, max_parts, time_bin_inhibited); + parts_soa, d_task_first_part, d_task_last_part, d_a, d_H, count_tasks, + tasksperbundle, nBlocks_per_task, bundle_first_task, max_parts); + // runner_do_self_density_GPU_naive<<>>( + // parts_soa, d_task_first_part, d_task_last_part, d_a, d_H, bid, tid, + // count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task, + // max_parts, time_bin_inhibited); } #ifdef WITH_CUDA } @@ -4127,26 +4222,25 @@ void launch_density_kernel(struct part_soa parts_soa, int *d_task_first_part, extern "C" { #endif void launch_gradient_aos(struct part_aos_g *parts_aos, int *d_task_first_part, - int *d_task_last_part, float d_a, float d_H, - const char *loop_type, cudaStream_t stream, - int block_size, int count_tasks, int tasksperbundle, - int numBlocks_x, int numBlocks_y, - int bundle_first_task, int max_parts, - double * d_cell_x, - double * d_cell_y, double * d_cell_z) { + int *d_task_last_part, float d_a, float d_H, + const char *loop_type, cudaStream_t stream, + int block_size, int count_tasks, int tasksperbundle, + int numBlocks_x, int numBlocks_y, + int bundle_first_task, int max_parts, double *d_cell_x, + double *d_cell_y, double *d_cell_z) { dim3 gridShape = dim3(numBlocks_x, numBlocks_y); int nBlocks_per_task = numBlocks_x; DOSELF_GPU_AOS_G<<>>( - parts_aos, d_task_first_part, d_task_last_part, d_a, d_H, count_tasks, tasksperbundle, - nBlocks_per_task, bundle_first_task, max_parts, d_cell_x, - d_cell_y, d_cell_z); -// runner_do_self_density_GPU_naive<<>>( -// parts_soa, d_task_first_part, d_task_last_part, d_a, d_H, bid, tid, count_tasks, tasksperbundle, -// nBlocks_per_task, bundle_first_task, max_parts, time_bin_inhibited); + 12 * BLOCK_SIZE * sizeof(float) + BLOCK_SIZE * sizeof(int), + stream>>>(parts_aos, d_task_first_part, d_task_last_part, + d_a, d_H, count_tasks, tasksperbundle, + nBlocks_per_task, bundle_first_task, max_parts, + d_cell_x, d_cell_y, d_cell_z); + // runner_do_self_density_GPU_naive<<>>( + // parts_soa, d_task_first_part, d_task_last_part, d_a, d_H, bid, tid, + // count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task, + // max_parts, time_bin_inhibited); } #ifdef WITH_CUDA } @@ -4155,18 +4249,21 @@ void launch_gradient_aos(struct part_aos_g 
*parts_aos, int *d_task_first_part, #ifdef WITH_CUDA extern "C" { #endif -void launch_gradient_aos_f4(struct part_aos_f4_g_send *parts_send, struct part_aos_f4_g_recv *parts_recv, float d_a, float d_H, - cudaStream_t stream, int numBlocks_x, int numBlocks_y, - int bundle_first_task, int2 * d_task_first_part_f4) { - - dim3 gridShape = dim3(numBlocks_x, numBlocks_y); - int nBlocks_per_task = numBlocks_x; - DOSELF_GPU_AOS_F4_G<<>>( - parts_send, parts_recv, d_a, d_H, bundle_first_task, d_task_first_part_f4); -// runner_do_self_density_GPU_naive<<>>( -// parts_soa, d_task_first_part, d_task_last_part, d_a, d_H, bid, tid, count_tasks, tasksperbundle, -// nBlocks_per_task, bundle_first_task, max_parts, time_bin_inhibited); +void launch_gradient_aos_f4(struct part_aos_f4_g_send *parts_send, + struct part_aos_f4_g_recv *parts_recv, float d_a, + float d_H, cudaStream_t stream, int numBlocks_x, + int numBlocks_y, int bundle_first_task, + int2 *d_task_first_part_f4) { + + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + DOSELF_GPU_AOS_F4_G<<>>(parts_send, parts_recv, d_a, d_H, + bundle_first_task, d_task_first_part_f4); + // runner_do_self_density_GPU_naive<<>>( + // parts_soa, d_task_first_part, d_task_last_part, d_a, d_H, bid, tid, + // count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task, + // max_parts, time_bin_inhibited); } #ifdef WITH_CUDA } @@ -4176,26 +4273,25 @@ void launch_gradient_aos_f4(struct part_aos_f4_g_send *parts_send, struct part_a extern "C" { #endif void launch_force_aos(struct part_aos_f *parts_aos, int *d_task_first_part, - int *d_task_last_part, float d_a, float d_H, - const char *loop_type, cudaStream_t stream, - int block_size, int count_tasks, int tasksperbundle, - int numBlocks_x, int numBlocks_y, - int bundle_first_task, int max_parts, - double * d_cell_x, - double * d_cell_y, double * d_cell_z) { + int *d_task_last_part, float d_a, float d_H, + const char *loop_type, cudaStream_t stream, + int block_size, int count_tasks, int tasksperbundle, + int numBlocks_x, int numBlocks_y, int bundle_first_task, + int max_parts, double *d_cell_x, double *d_cell_y, + double *d_cell_z) { dim3 gridShape = dim3(numBlocks_x, numBlocks_y); int nBlocks_per_task = numBlocks_x; DOSELF_GPU_AOS_F<<>>( - parts_aos, d_task_first_part, d_task_last_part, d_a, d_H, count_tasks, tasksperbundle, - nBlocks_per_task, bundle_first_task, max_parts, d_cell_x, - d_cell_y, d_cell_z); -// runner_do_self_density_GPU_naive<<>>( -// parts_soa, d_task_first_part, d_task_last_part, d_a, d_H, bid, tid, count_tasks, tasksperbundle, -// nBlocks_per_task, bundle_first_task, max_parts, time_bin_inhibited); + 16 * BLOCK_SIZE * sizeof(float) + BLOCK_SIZE * sizeof(int), + stream>>>(parts_aos, d_task_first_part, d_task_last_part, + d_a, d_H, count_tasks, tasksperbundle, + nBlocks_per_task, bundle_first_task, max_parts, + d_cell_x, d_cell_y, d_cell_z); + // runner_do_self_density_GPU_naive<<>>( + // parts_soa, d_task_first_part, d_task_last_part, d_a, d_H, bid, tid, + // count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task, + // max_parts, time_bin_inhibited); } #ifdef WITH_CUDA } @@ -4204,20 +4300,23 @@ void launch_force_aos(struct part_aos_f *parts_aos, int *d_task_first_part, #ifdef WITH_CUDA extern "C" { #endif -void launch_force_aos_f4(struct part_aos_f4_f_send *d_parts_send, struct part_aos_f4_f_recv *d_parts_recv, float d_a, float d_H, - cudaStream_t stream, int numBlocks_x, int numBlocks_y, - int bundle_first_task, int2 * d_task_first_part_f4) { +void 
launch_force_aos_f4(struct part_aos_f4_f_send *d_parts_send, + struct part_aos_f4_f_recv *d_parts_recv, float d_a, + float d_H, cudaStream_t stream, int numBlocks_x, + int numBlocks_y, int bundle_first_task, + int2 *d_task_first_part_f4) { dim3 gridShape = dim3(numBlocks_x, numBlocks_y); int nBlocks_per_task = numBlocks_x; - DOSELF_GPU_AOS_F4_F<<>>( - d_parts_send, d_parts_recv, d_a, d_H, bundle_first_task, d_task_first_part_f4); -// runner_do_self_density_GPU_naive<<>>( -// parts_soa, d_task_first_part, d_task_last_part, d_a, d_H, bid, tid, count_tasks, tasksperbundle, -// nBlocks_per_task, bundle_first_task, max_parts, time_bin_inhibited); + DOSELF_GPU_AOS_F4_F<<< + gridShape, BLOCK_SIZE, + 4 * BLOCK_SIZE * sizeof(float4) + BLOCK_SIZE * sizeof(float3), stream>>>( + d_parts_send, d_parts_recv, d_a, d_H, bundle_first_task, + d_task_first_part_f4); + // runner_do_self_density_GPU_naive<<>>( + // parts_soa, d_task_first_part, d_task_last_part, d_a, d_H, bid, tid, + // count_tasks, tasksperbundle, nBlocks_per_task, bundle_first_task, + // max_parts, time_bin_inhibited); } #ifdef WITH_CUDA } diff --git a/src/cuda/GPU_runner_functions.h b/src/cuda/GPU_runner_functions.h old mode 100755 new mode 100644 index d43fc6f2ff..27bbecdd92 --- a/src/cuda/GPU_runner_functions.h +++ b/src/cuda/GPU_runner_functions.h @@ -13,104 +13,136 @@ void launch_density_kernel(struct part_soa parts_soa, int *d_task_first_part, int numBlocks_x, int numBlocks_y, int bundle_first_task, int max_parts); void launch_density_aos(struct part_aos *parts_aos, int *d_task_first_part, - int *d_task_last_part, float d_a, float d_H, - const char *loop_type, cudaStream_t stream, - int block_size, int count_tasks, int tasksperbundle, - int numBlocks_x, int numBlocks_y, - int bundle_first_task, int max_parts, - double * d_cell_x, - double * d_cell_y, double * d_cell_z); -void launch_density_aos_f4(struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv, float d_a, float d_H, - cudaStream_t stream, int numBlocks_x, int numBlocks_y, - int bundle_first_task, int2 *d_task_first_part_f4); + int *d_task_last_part, float d_a, float d_H, + const char *loop_type, cudaStream_t stream, + int block_size, int count_tasks, int tasksperbundle, + int numBlocks_x, int numBlocks_y, int bundle_first_task, + int max_parts, double *d_cell_x, double *d_cell_y, + double *d_cell_z); +void launch_density_aos_f4(struct part_aos_f4_send *parts_send, + struct part_aos_f4_recv *parts_recv, float d_a, + float d_H, cudaStream_t stream, int numBlocks_x, + int numBlocks_y, int bundle_first_task, + int2 *d_task_first_part_f4); void launch_gradient_aos(struct part_aos_g *parts_aos, int *d_task_first_part, - int *d_task_last_part, float d_a, float d_H, - const char *loop_type, cudaStream_t stream, - int block_size, int count_tasks, int tasksperbundle, - int numBlocks_x, int numBlocks_y, - int bundle_first_task, int max_parts, - double * d_cell_x, - double * d_cell_y, double * d_cell_z); -void launch_gradient_aos_f4(struct part_aos_f4_g_send *parts_send, struct part_aos_f4_g_recv *parts_recv, float d_a, float d_H, - cudaStream_t stream, int numBlocks_x, int numBlocks_y, - int bundle_first_task, int2 * d_task_first_part_f4); + int *d_task_last_part, float d_a, float d_H, + const char *loop_type, cudaStream_t stream, + int block_size, int count_tasks, int tasksperbundle, + int numBlocks_x, int numBlocks_y, + int bundle_first_task, int max_parts, double *d_cell_x, + double *d_cell_y, double *d_cell_z); +void launch_gradient_aos_f4(struct part_aos_f4_g_send 
*parts_send, + struct part_aos_f4_g_recv *parts_recv, float d_a, + float d_H, cudaStream_t stream, int numBlocks_x, + int numBlocks_y, int bundle_first_task, + int2 *d_task_first_part_f4); void launch_force_aos(struct part_aos_f *parts_aos, int *d_task_first_part, - int *d_task_last_part, float d_a, float d_H, - const char *loop_type, cudaStream_t stream, - int block_size, int count_tasks, int tasksperbundle, - int numBlocks_x, int numBlocks_y, - int bundle_first_task, int max_parts, - double * d_cell_x, - double * d_cell_y, double * d_cell_z); -void launch_force_aos_f4(struct part_aos_f4_f_send *parts_send, struct part_aos_f4_f_recv *parts_recv, float d_a, float d_H, - cudaStream_t stream, int numBlocks_x, int numBlocks_y, - int bundle_first_task, int2 * d_task_first_part_f4); -void launch_density_pair_two_kernels(struct part_soa parts_soa_ci, struct part_soa parts_soa_cj, int *d_task_first_part_ci, - int *d_task_first_part_cj, int *d_task_last_part_ci, int *d_task_last_part_cj, float d_a, float d_H, - const char *loop_type, cudaStream_t stream, int bid, int block_size, int count_tasks, int tasksperbundle, - int max_parts_i, int max_parts_j, int numBlocks_y, int tid, int offset, int bundle_first_task, int max_active_bin); -void runner_dopair1_branch_density_gpu(struct part_soa parts_soa, int *d_task_first_parts_pair, - int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, cudaStream_t stream, - int bid, int block_size, int count_tasks, int tasksperbundle,int max_parts_i, int max_parts_j, - int numBlocks_y, int tid, int offset, int bundle_first_task, int max_active_bin, double * d_shift_x, - double * d_shift_y, double * d_shift_z); -void runner_dopairci_branch_density_gpu(struct part_soa parts_soa, int *d_task_first_parts_pair, - int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, cudaStream_t stream, - int bid, int block_size, int count_tasks, int tasksperbundle,int max_parts_i, int max_parts_j, - int numBlocks_y, int tid, int offset, int bundle_first_task, double * d_shift_x - , double * d_shift_y, double * d_shift_z); -void runner_dopaircj_branch_density_gpu(struct part_soa parts_soa, int *d_task_first_parts_pair, - int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, cudaStream_t stream, - int bid, int block_size, int count_tasks, int tasksperbundle,int max_parts_i, int max_parts_j, - int numBlocks_y, int tid, int offset, int bundle_first_task, double * d_shift_x - , double * d_shift_y, double * d_shift_z); -void runner_dopairci_branch_density_gpu_aos(struct part_aos *parts_aos, int *d_task_first_parts_pair, - int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, cudaStream_t stream, - int bid, int block_size, int count_tasks, int tasksperbundle,int max_parts_i, int max_parts_j, - int numBlocks_y, int tid, int offset, int bundle_first_task, double * d_shift_x - , double * d_shift_y, double * d_shift_z); -void runner_dopairci_branch_density_gpu_aos_f4(struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv, - float d_a, float d_H, cudaStream_t stream, - int numBlocks_x, int numBlocks_y, int bundle_first_task, int4 *fparti_fpartj_lparti_lpartj_dens); -void runner_dopaircj_branch_density_gpu_aos_f4(struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv, - float d_a, float d_H, cudaStream_t stream, - int numBlocks_x, int numBlocks_y, int bundle_first_task, int4 *fparti_fpartj_lparti_lpartj_dens); -void runner_dopair_branch_density_gpu_aos_f4(struct part_aos_f4_send 
*parts_send, struct part_aos_f4_recv *parts_recv, - float d_a, float d_H, cudaStream_t stream, - int numBlocks_x, int numBlocks_y, int bundle_first_part, int bundle_n_parts); -void runner_dopaircj_branch_density_gpu_aos(struct part_aos *parts_aos, int *d_task_first_parts_pair, - int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, cudaStream_t stream, - int bid, int block_size, int count_tasks, int tasksperbundle,int max_parts_i, int max_parts_j, - int numBlocks_y, int tid, int offset, int bundle_first_task, double * d_shift_x - , double * d_shift_y, double * d_shift_z); -void runner_dopairci_branch_density_gpu_aos_g(struct part_aos_g *parts_aos, int *d_task_first_parts_pair, - int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, cudaStream_t stream, - int bid, int block_size, int count_tasks, int tasksperbundle,int max_parts_i, int max_parts_j, - int numBlocks_y, int tid, int offset, int bundle_first_task, double * d_shift_x - , double * d_shift_y, double * d_shift_z); -void runner_dopaircj_branch_density_gpu_aos_g(struct part_aos_g *parts_aos, int *d_task_first_parts_pair, - int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, cudaStream_t stream, - int bid, int block_size, int count_tasks, int tasksperbundle,int max_parts_i, int max_parts_j, - int numBlocks_y, int tid, int offset, int bundle_first_task, double * d_shift_x - , double * d_shift_y, double * d_shift_z); -void runner_dopair_branch_gradient_gpu_aos_f4(struct part_aos_f4_g_send *parts_send, struct part_aos_f4_g_recv *parts_recv, - float d_a, float d_H, cudaStream_t stream, - int numBlocks_x, int numBlocks_y, int bundle_first_part, int bundle_n_parts); -void runner_dopairci_branch_density_gpu_aos_f(struct part_aos_f *parts_aos, int *d_task_first_parts_pair, - int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, cudaStream_t stream, - int bid, int block_size, int count_tasks, int tasksperbundle,int max_parts_i, int max_parts_j, - int numBlocks_y, int tid, int offset, int bundle_first_task, double * d_shift_x - , double * d_shift_y, double * d_shift_z); -void runner_dopaircj_branch_density_gpu_aos_f(struct part_aos_f *parts_aos, int *d_task_first_parts_pair, - int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, cudaStream_t stream, - int bid, int block_size, int count_tasks, int tasksperbundle,int max_parts_i, int max_parts_j, - int numBlocks_y, int tid, int offset, int bundle_first_task, double * d_shift_x - , double * d_shift_y, double * d_shift_z); -void runner_dopair_branch_force_gpu_aos_f4(struct part_aos_f4_f_send *parts_send, struct part_aos_f4_f_recv *parts_recv, - float d_a, float d_H, cudaStream_t stream, - int numBlocks_x, int numBlocks_y, int bundle_first_part, int bundle_n_parts); + int *d_task_last_part, float d_a, float d_H, + const char *loop_type, cudaStream_t stream, + int block_size, int count_tasks, int tasksperbundle, + int numBlocks_x, int numBlocks_y, int bundle_first_task, + int max_parts, double *d_cell_x, double *d_cell_y, + double *d_cell_z); +void launch_force_aos_f4(struct part_aos_f4_f_send *parts_send, + struct part_aos_f4_f_recv *parts_recv, float d_a, + float d_H, cudaStream_t stream, int numBlocks_x, + int numBlocks_y, int bundle_first_task, + int2 *d_task_first_part_f4); +void launch_density_pair_two_kernels( + struct part_soa parts_soa_ci, struct part_soa parts_soa_cj, + int *d_task_first_part_ci, int *d_task_first_part_cj, + int *d_task_last_part_ci, int *d_task_last_part_cj, float d_a, 
float d_H, + const char *loop_type, cudaStream_t stream, int bid, int block_size, + int count_tasks, int tasksperbundle, int max_parts_i, int max_parts_j, + int numBlocks_y, int tid, int offset, int bundle_first_task, + int max_active_bin); +void runner_dopair1_branch_density_gpu( + struct part_soa parts_soa, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, int count_tasks, + int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y, + int tid, int offset, int bundle_first_task, int max_active_bin, + double *d_shift_x, double *d_shift_y, double *d_shift_z); +void runner_dopairci_branch_density_gpu( + struct part_soa parts_soa, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, int count_tasks, + int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y, + int tid, int offset, int bundle_first_task, double *d_shift_x, + double *d_shift_y, double *d_shift_z); +void runner_dopaircj_branch_density_gpu( + struct part_soa parts_soa, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, int count_tasks, + int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y, + int tid, int offset, int bundle_first_task, double *d_shift_x, + double *d_shift_y, double *d_shift_z); +void runner_dopairci_branch_density_gpu_aos( + struct part_aos *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, int count_tasks, + int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y, + int tid, int offset, int bundle_first_task, double *d_shift_x, + double *d_shift_y, double *d_shift_z); +void runner_dopairci_branch_density_gpu_aos_f4( + struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv, + float d_a, float d_H, cudaStream_t stream, int numBlocks_x, int numBlocks_y, + int bundle_first_task, int4 *fparti_fpartj_lparti_lpartj_dens); +void runner_dopaircj_branch_density_gpu_aos_f4( + struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv, + float d_a, float d_H, cudaStream_t stream, int numBlocks_x, int numBlocks_y, + int bundle_first_task, int4 *fparti_fpartj_lparti_lpartj_dens); +void runner_dopair_branch_density_gpu_aos_f4( + struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv, + float d_a, float d_H, cudaStream_t stream, int numBlocks_x, int numBlocks_y, + int bundle_first_part, int bundle_n_parts); +void runner_dopaircj_branch_density_gpu_aos( + struct part_aos *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, int count_tasks, + int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y, + int tid, int offset, int bundle_first_task, double *d_shift_x, + double *d_shift_y, double *d_shift_z); +void runner_dopairci_branch_density_gpu_aos_g( + struct part_aos_g *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, int count_tasks, + int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y, + int tid, int offset, int bundle_first_task, double *d_shift_x, + 
double *d_shift_y, double *d_shift_z); +void runner_dopaircj_branch_density_gpu_aos_g( + struct part_aos_g *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, int count_tasks, + int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y, + int tid, int offset, int bundle_first_task, double *d_shift_x, + double *d_shift_y, double *d_shift_z); +void runner_dopair_branch_gradient_gpu_aos_f4( + struct part_aos_f4_g_send *parts_send, + struct part_aos_f4_g_recv *parts_recv, float d_a, float d_H, + cudaStream_t stream, int numBlocks_x, int numBlocks_y, + int bundle_first_part, int bundle_n_parts); +void runner_dopairci_branch_density_gpu_aos_f( + struct part_aos_f *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, int count_tasks, + int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y, + int tid, int offset, int bundle_first_task, double *d_shift_x, + double *d_shift_y, double *d_shift_z); +void runner_dopaircj_branch_density_gpu_aos_f( + struct part_aos_f *parts_aos, int *d_task_first_parts_pair, + int *d_task_last_parts_pair, float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, int count_tasks, + int tasksperbundle, int max_parts_i, int max_parts_j, int numBlocks_y, + int tid, int offset, int bundle_first_task, double *d_shift_x, + double *d_shift_y, double *d_shift_z); +void runner_dopair_branch_force_gpu_aos_f4( + struct part_aos_f4_f_send *parts_send, + struct part_aos_f4_f_recv *parts_recv, float d_a, float d_H, + cudaStream_t stream, int numBlocks_x, int numBlocks_y, + int bundle_first_part, int bundle_n_parts); #ifdef __cplusplus } #endif -#endif // CUDA_HEADER_H +#endif // CUDA_HEADER_H diff --git a/src/cuda/device_functions.h b/src/cuda/device_functions.h old mode 100755 new mode 100644 index 6e4edac6ac..afc4a1a5d8 --- a/src/cuda/device_functions.h +++ b/src/cuda/device_functions.h @@ -3,11 +3,11 @@ #include "../../config.h" /* Local headers. */ -//#include "../dimension.h" -//#include "../error.h" -//#include "../inline.h" -//#include "../minmax.h" -//#include "../vector.h" +// #include "../dimension.h" +// #include "../error.h" +// #include "../inline.h" +// #include "../minmax.h" +// #include "../vector.h" // Is this even necessary? Probably not as our code will operate differently #define num_cuda_threads 128 @@ -22,11 +22,11 @@ #define kernel_ivals 2 #define kernel_degree 3 /*!< Degree of the polynomial */ #define kernel_gamma_dim ((float)(kernel_gamma * kernel_gamma * kernel_gamma)) -#define kernel_gamma_dim_plus_one \ +#define kernel_gamma_dim_plus_one \ ((float)(kernel_gamma * kernel_gamma * kernel_gamma * kernel_gamma)) -#define kernel_gamma_inv_dim \ +#define kernel_gamma_inv_dim \ ((float)(1. / (kernel_gamma * kernel_gamma * kernel_gamma))) -#define kernel_gamma_inv_dim_plus_one \ +#define kernel_gamma_inv_dim_plus_one \ ((float)(1. / (kernel_gamma * kernel_gamma * kernel_gamma * kernel_gamma))) #define kernel_ivals_f ((float)kernel_ivals) /*!< Number of branches */ #define kernel_constant ((float)(16. * M_1_PI)) @@ -110,8 +110,8 @@ __device__ float d_pow_three_gamma_minus_five_over_two(float x) { * @param W (return) The value of the kernel function \f$W(x,h)\f$. * @param dW_dx (return) The norm of the gradient of \f$|\nabla W(x,h)|\f$. 
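 *
 * A minimal usage sketch from device code (hypothetical local variables r, hi,
 * ui, wi and wi_dx; it assumes, as for the CPU-side kernel_deval(), that the
 * first argument is the ratio of the inter-particle distance to the smoothing
 * length):
 * @code
 *   float wi, wi_dx;
 *   const float ui = r / hi;
 *   d_kernel_deval(ui, &wi, &wi_dx); /* wi = W(r, hi), wi_dx = |grad W| */
 * @endcode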
*/ -__device__ void d_kernel_deval(float u, float * __restrict__ W, - float * __restrict__ dW_dx) { +__device__ void d_kernel_deval(float u, float *__restrict__ W, + float *__restrict__ dW_dx) { /* Go to the range [0,1[ from [0,H[ */ const float x = u * kernel_gamma_inv; @@ -146,4 +146,4 @@ __device__ void d_kernel_deval(float u, float * __restrict__ W, } #endif -#endif // DEVICE_FUNCTIONS_H +#endif // DEVICE_FUNCTIONS_H diff --git a/src/cuda/kernel_definitions.cu b/src/cuda/kernel_definitions.cu old mode 100755 new mode 100644 index 82e749725d..a272b7beee --- a/src/cuda/kernel_definitions.cu +++ b/src/cuda/kernel_definitions.cu @@ -7,9 +7,9 @@ #ifndef static #define static #endif -//#ifndef restrict -//#define restrict __restrict__ -//#endif +// #ifndef restrict +// #define restrict __restrict__ +// #endif #endif /* Required header files */ @@ -105,8 +105,7 @@ __device__ void SPH_Sum_Self(cell_gpu *d_ci_gpu) { q4 = q4 * q4 * q4 * q4; float w = q4 * (2.0f * q + 1.0f); float v = mass / rho; - if (q < 2.0f) - sumLoc = sumLoc + w * v * 7.0 * 7.0 / (4.0 * 22.0 * h * h); + if (q < 2.0f) sumLoc = sumLoc + w * v * 7.0 * 7.0 / (4.0 * 22.0 * h * h); } // d_Particles[i].ker_sum=sumLoc; } diff --git a/src/cuda/part_gpu.h b/src/cuda/part_gpu.h old mode 100755 new mode 100644 index 47e5da4f29..92d12f45bd --- a/src/cuda/part_gpu.h +++ b/src/cuda/part_gpu.h @@ -182,18 +182,19 @@ typedef struct part_aos_f4_send { /*! Particle predicted velocity and mass -> ux, uy, uz, m */ float4 ux_m; - /*Markers for where neighbour cell j starts and stops in array indices for pair tasks*/ + /*Markers for where neighbour cell j starts and stops in array indices for + * pair tasks*/ int2 cjs_cje; -}part_aos_f4_send __attribute__((aligned(SWIFT_STRUCT_ALIGNMENT))); +} part_aos_f4_send __attribute__((aligned(SWIFT_STRUCT_ALIGNMENT))); -typedef struct part_aos_f4_recv{ +typedef struct part_aos_f4_recv { /* Density information; rho */ /*! Derivative of density with respect to h; rho_dh, - * Neighbour number count; w_count - * * Derivative of the neighbour number with respect to h; w_count_dh */ + * Neighbour number count; w_count + * * Derivative of the neighbour number with respect to h; w_count_dh */ float4 rho_dh_wcount; /*! Particle velocity curl; rot_ux and - * velocity divergence; div_v */ + * velocity divergence; div_v */ float4 rot_ux_div_v; } part_aos_f4_recv; @@ -206,12 +207,12 @@ typedef struct part_aos_f4 { float4 ux_m; /* Density information; rho */ /*! Derivative of density with respect to h; rho_dh, - * Neighbour number count; w_count - * * Derivative of the neighbour number with respect to h; w_count_dh */ + * Neighbour number count; w_count + * * Derivative of the neighbour number with respect to h; w_count_dh */ float4 rho_dh_wcount; /*! Particle velocity curl; rot_ux and - * velocity divergence; div_v */ + * velocity divergence; div_v */ float4 rot_ux_div_v; } part_aos_f4; @@ -403,9 +404,8 @@ typedef struct part_aos_f4_g_recv { } part_aos_f4_g_recv; - #ifdef __WITH_CUDA } #endif -#endif // PART_GPU_H +#endif // PART_GPU_H diff --git a/src/cuda/tester.cu b/src/cuda/tester.cu old mode 100755 new mode 100644 index 5e70230211..3ffaf9e10c --- a/src/cuda/tester.cu +++ b/src/cuda/tester.cu @@ -1,4 +1,5 @@ #include "tester.h" + #include #include #ifdef __cplusplus diff --git a/src/engine.c b/src/engine.c index b0d632ccb5..606e246a09 100644 --- a/src/engine.c +++ b/src/engine.c @@ -1092,9 +1092,11 @@ int engine_estimate_nr_tasks(const struct engine *e) { */ n1 += 38; n2 += 2; -#ifdef WITH_CUDA // A. 
Nasar - n1 += 4; //Self force and density packs should be 2 but doubled to prevent code crash due to unpack tasks - n1 += 52; //Pair force and density packs should be 26 but doubled to prevent code crash due to unpack tasks +#ifdef WITH_CUDA // A. Nasar + n1 += 4; // Self force and density packs should be 2 but doubled to prevent + // code crash due to unpack tasks + n1 += 52; // Pair force and density packs should be 26 but doubled to + // prevent code crash due to unpack tasks #endif #ifdef WITH_MPI n1 += 6; @@ -1103,8 +1105,8 @@ int engine_estimate_nr_tasks(const struct engine *e) { #ifdef EXTRA_HYDRO_LOOP n1 += 15; #ifdef WITH_CUDA - n1 += 1; //Self gradient packs - n1 += 13; //Pair gradient packs + n1 += 1; // Self gradient packs + n1 += 13; // Pair gradient packs #endif #ifdef WITH_MPI n1 += 2; @@ -1758,7 +1760,7 @@ void engine_skip_force_and_kick(struct engine *e) { t->type == task_type_rt_ghost2 || t->type == task_type_rt_tchem || t->type == task_type_rt_advance_cell_time || t->type == task_type_neutrino_weight || t->type == task_type_csds || - t->subtype == task_subtype_force || // A. Nasar + t->subtype == task_subtype_force || // A. Nasar t->subtype == task_subtype_gpu_pack_f || t->subtype == task_subtype_gpu_unpack_f || t->subtype == task_subtype_limiter || @@ -2204,24 +2206,25 @@ void engine_init_particles(struct engine *e, int flag_entropy_ICs, } #endif -// scheduler_write_dependencies(&e->sched, e->verbose, e->step); // A. Nasar write deps before running first step + // scheduler_write_dependencies(&e->sched, e->verbose, e->step); // A. Nasar + // write deps before running first step /* Now, launch the calculation */ -// message("n tasks %i", e->sched.nr_tasks); -// for (int i = 0; i < e->sched.nr_tasks; i++){ -// struct task *tmp_t = &e->sched.tasks[i]; -// if(tmp_t->subtype == task_subtype_density){ -// if(tmp_t->skip == 1)error("inactive density task"); -// } -//// if(tmp_t->subtype == task_subtype_force){ -//// if(tmp_t->skip == 1)error("inactive force task"); -//// } -// if(tmp_t->subtype == task_subtype_gpu_pack){ -// if(tmp_t->skip == 1)error("inactive pack task"); -// } -// if(tmp_t->subtype == task_subtype_gpu_unpack){ -// if(tmp_t->skip == 1)error("inactive unpack task"); -// } -// } + // message("n tasks %i", e->sched.nr_tasks); + // for (int i = 0; i < e->sched.nr_tasks; i++){ + // struct task *tmp_t = &e->sched.tasks[i]; + // if(tmp_t->subtype == task_subtype_density){ + // if(tmp_t->skip == 1)error("inactive density task"); + // } + //// if(tmp_t->subtype == task_subtype_force){ + //// if(tmp_t->skip == 1)error("inactive force task"); + //// } + // if(tmp_t->subtype == task_subtype_gpu_pack){ + // if(tmp_t->skip == 1)error("inactive pack task"); + // } + // if(tmp_t->subtype == task_subtype_gpu_unpack){ + // if(tmp_t->skip == 1)error("inactive unpack task"); + // } + // } TIMER_TIC; engine_launch(e, "tasks"); TIMER_TOC(timer_runners); @@ -2309,21 +2312,21 @@ void engine_init_particles(struct engine *e, int flag_entropy_ICs, scheduler_write_cell_dependencies(&e->sched, e->verbose, e->step); if (e->nodeID == 0) scheduler_write_task_level(&e->sched, e->step); -// for (int i = 0; i < e->sched.nr_tasks; i++){ -// struct task *tmp_t = &e->sched.tasks[i]; -// if(tmp_t->subtype == task_subtype_density){ -// if(tmp_t->skip == 1)error("inactive density task"); -// } -// if(tmp_t->subtype == task_subtype_force){ -// if(tmp_t->skip == 1)error("inactive force task"); -// } -// if(tmp_t->subtype == task_subtype_gpu_pack){ -// if(tmp_t->skip == 1)error("inactive pack task"); 
-// } -// if(tmp_t->subtype == task_subtype_gpu_unpack){ -// if(tmp_t->skip == 1)error("inactive unpack task"); -// } -// } + // for (int i = 0; i < e->sched.nr_tasks; i++){ + // struct task *tmp_t = &e->sched.tasks[i]; + // if(tmp_t->subtype == task_subtype_density){ + // if(tmp_t->skip == 1)error("inactive density task"); + // } + // if(tmp_t->subtype == task_subtype_force){ + // if(tmp_t->skip == 1)error("inactive force task"); + // } + // if(tmp_t->subtype == task_subtype_gpu_pack){ + // if(tmp_t->skip == 1)error("inactive pack task"); + // } + // if(tmp_t->subtype == task_subtype_gpu_unpack){ + // if(tmp_t->skip == 1)error("inactive unpack task"); + // } + // } /* Run the 0th time-step */ TIMER_TIC2; diff --git a/src/engine_config.c b/src/engine_config.c index 336e3d155d..28cbb3f671 100644 --- a/src/engine_config.c +++ b/src/engine_config.c @@ -33,14 +33,16 @@ #endif #ifdef WITH_CUDA -#include /* A. Nasar */ #include "runner_main_clean.cu" + +#include /* A. Nasar */ #endif #ifdef WITH_HIP -//#include "/opt/rocm-5.1.0/hip/include/hip/hip_runtime.h" -#include +// #include "/opt/rocm-5.1.0/hip/include/hip/hip_runtime.h" #include "runner_main_clean.hip" + +#include #endif /* This object's header. */ @@ -921,11 +923,12 @@ void engine_config(int restart, int fof, struct engine *e, parser_get_opt_param_float(params, "Scheduler:links_per_tasks", 25.); /* Init the scheduler. Allow stealing*/ -// scheduler_init(&e->sched, e->s, maxtasks, nr_queues, -// (e->policy & scheduler_flag_steal), e->nodeID, &e->threadpool); + // scheduler_init(&e->sched, e->s, maxtasks, nr_queues, + // (e->policy & scheduler_flag_steal), e->nodeID, + // &e->threadpool); /* Init the scheduler. NO stealing A. Nasar */ - scheduler_init(&e->sched, e->s, maxtasks, nr_queues, - 0, e->nodeID, &e->threadpool); + scheduler_init(&e->sched, e->s, maxtasks, nr_queues, 0, e->nodeID, + &e->threadpool); /* Maximum size of MPI task messages, in KB, that should not be buffered, * that is sent using MPI_Issend, not MPI_Isend. 4Mb by default. Can be diff --git a/src/engine_maketasks.c b/src/engine_maketasks.c index bf0ed134ff..58dd8bc453 100644 --- a/src/engine_maketasks.c +++ b/src/engine_maketasks.c @@ -583,13 +583,13 @@ void engine_addtasks_recv_hydro( /* Early abort (are we below the level where tasks are)? */ if (!cell_get_flag(c, cell_flag_has_tasks)) return; - /* Have we reached a level where there are any hydro tasks ? */ -#ifdef WITH_CUDA // A. Nasar - if (t_xv == NULL && c->hydro.density != NULL && c->hydro.density_pack != NULL) + /* Have we reached a level where there are any hydro tasks ? */ +#ifdef WITH_CUDA // A. Nasar + if (t_xv == NULL && c->hydro.density != NULL && c->hydro.density_pack != NULL) #else if (t_xv == NULL && c->hydro.density != NULL) #endif /*WITH_CUDA*/ - { + { #ifdef SWIFT_DEBUG_CHECKS /* Make sure this cell has a valid tag. 
*/ @@ -735,17 +735,17 @@ void engine_addtasks_recv_hydro( #ifdef WITH_CUDA for (struct link *l = c->hydro.gradient_pack; l != NULL; l = l->next) { scheduler_addunlock(s, t_rho, l->t); -// scheduler_addunlock(s, l->t, t_gradient); + // scheduler_addunlock(s, l->t, t_gradient); } scheduler_addunlock(s, c->hydro.super->hydro.g_unpack, t_gradient); for (struct link *l = c->hydro.force_pack; l != NULL; l = l->next) { scheduler_addunlock(s, t_gradient, l->t); -// scheduler_addunlock(s, l->t, t_ti); + // scheduler_addunlock(s, l->t, t_ti); } scheduler_addunlock(s, c->hydro.super->hydro.f_unpack, tend); -#endif /*WITH_CUDA*/ -#else /*EXTRA_HYDRO_LOOP*/ +#endif /*WITH_CUDA*/ +#else /*EXTRA_HYDRO_LOOP*/ for (struct link *l = c->hydro.force; l != NULL; l = l->next) { scheduler_addunlock(s, t_rho, l->t); scheduler_addunlock(s, l->t, tend); @@ -753,11 +753,11 @@ void engine_addtasks_recv_hydro( #ifdef WITH_CUDA for (struct link *l = c->hydro.force_pack; l != NULL; l = l->next) { scheduler_addunlock(s, t_rho, l->t); -// scheduler_addunlock(s, l->t, t_ti); + // scheduler_addunlock(s, l->t, t_ti); } scheduler_addunlock(s, c->hydro.super->hydro.f_unpack, tend); -#endif /*WITH_CUDA*/ -#endif/*EXTRA_HYDRO_LOOP*/ +#endif /*WITH_CUDA*/ +#endif /*EXTRA_HYDRO_LOOP*/ if (with_limiter) { for (struct link *l = c->hydro.limiter; l != NULL; l = l->next) { @@ -2120,7 +2120,8 @@ void engine_count_and_link_tasks_mapper(void *map_data, int num_elements, for (int ind = 0; ind < num_elements; ind++) { struct task *t = &((struct task *)map_data)[ind]; - if (t->ci == NULL) { //Possible fix missing when moving code over. Prevents unpack tasks continuing past here + if (t->ci == NULL) { // Possible fix missing when moving code over. + // Prevents unpack tasks continuing past here break; } struct cell *ci = t->ci; @@ -2150,12 +2151,12 @@ void engine_count_and_link_tasks_mapper(void *map_data, int num_elements, if (t_subtype == task_subtype_density) { engine_addlink(e, &ci->hydro.density, t); - } else if (t_subtype == task_subtype_gpu_pack) { // A. Nasar + } else if (t_subtype == task_subtype_gpu_pack) { // A. Nasar engine_addlink(e, &ci->hydro.density_pack, t); -// } else if (t_subtype == task_subtype_gpu_pack_f) { -// engine_addlink(e, &ci->hydro.force_pack, t); -// } else if (t_subtype == task_subtype_gpu_pack_g) { -// engine_addlink(e, &ci->hydro.gradient_pack, t); + // } else if (t_subtype == task_subtype_gpu_pack_f) { + // engine_addlink(e, &ci->hydro.force_pack, t); + // } else if (t_subtype == task_subtype_gpu_pack_g) { + // engine_addlink(e, &ci->hydro.gradient_pack, t); } else if (t_subtype == task_subtype_grav) { engine_addlink(e, &ci->grav.grav, t); } else if (t_subtype == task_subtype_external_grav) { @@ -2170,15 +2171,15 @@ void engine_count_and_link_tasks_mapper(void *map_data, int num_elements, if (t_subtype == task_subtype_density) { engine_addlink(e, &ci->hydro.density, t); engine_addlink(e, &cj->hydro.density, t); - } else if (t_subtype == task_subtype_gpu_pack) { // A. Nasar + } else if (t_subtype == task_subtype_gpu_pack) { // A. 
Nasar engine_addlink(e, &ci->hydro.density_pack, t); engine_addlink(e, &cj->hydro.density_pack, t); -// } else if (t_subtype == task_subtype_gpu_pack_f) { -// engine_addlink(e, &ci->hydro.force_pack, t); -// engine_addlink(e, &cj->hydro.force_pack, t); -// } else if (t_subtype == task_subtype_gpu_pack_g) { -// engine_addlink(e, &ci->hydro.gradient_pack, t); -// engine_addlink(e, &cj->hydro.gradient_pack, t); + // } else if (t_subtype == task_subtype_gpu_pack_f) { + // engine_addlink(e, &ci->hydro.force_pack, t); + // engine_addlink(e, &cj->hydro.force_pack, t); + // } else if (t_subtype == task_subtype_gpu_pack_g) { + // engine_addlink(e, &ci->hydro.gradient_pack, t); + // engine_addlink(e, &cj->hydro.gradient_pack, t); } else if (t_subtype == task_subtype_grav) { engine_addlink(e, &ci->grav.grav, t); engine_addlink(e, &cj->grav.grav, t); @@ -2195,7 +2196,7 @@ void engine_count_and_link_tasks_mapper(void *map_data, int num_elements, if (t_subtype == task_subtype_density) { engine_addlink(e, &ci->hydro.density, t); - } else if (t_subtype == task_subtype_gpu_pack) { // A. Nasar + } else if (t_subtype == task_subtype_gpu_pack) { // A. Nasar error("Abouzied: you need to code this up!"); } else if (t_subtype == task_subtype_gpu_pack_f) { error("Abouzied: you need to code this up!"); @@ -2486,7 +2487,7 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, const int with_sink = (e->policy & engine_policy_sinks); #ifdef EXTRA_HYDRO_LOOP struct task *t_gradient = NULL; - struct task *t_gradient_gpu = NULL; // A. Nasar + struct task *t_gradient_gpu = NULL; // A. Nasar #endif #ifdef EXTRA_STAR_LOOPS struct task *t_star_prep1 = NULL; @@ -2551,8 +2552,8 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, t_force = scheduler_addtask(sched, task_type_self, task_subtype_force, flags, 0, ci, NULL); /* Task for the second GPU hydro loop A. Nasar */ - t_force_gpu = scheduler_addtask(sched, task_type_self, task_subtype_gpu_pack_f, - 0, 0, ci, NULL); + t_force_gpu = scheduler_addtask(sched, task_type_self, + task_subtype_gpu_pack_f, 0, 0, ci, NULL); /* the task for the time-step limiter */ if (with_timestep_limiter) { @@ -2656,10 +2657,11 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, t_gradient = scheduler_addtask(sched, task_type_self, task_subtype_gradient, flags, 0, ci, NULL); /* Same work for the additional GPU hydro loop A. Nasar */ - t_gradient_gpu = scheduler_addtask(sched, task_type_self, - task_subtype_gpu_pack_g, 0, 0, ci, NULL); + t_gradient_gpu = scheduler_addtask( + sched, task_type_self, task_subtype_gpu_pack_g, 0, 0, ci, NULL); - /* Add the link between the new loops and the cell. Same for GPU task A. Nasar */ + /* Add the link between the new loops and the cell. Same for GPU task A. + * Nasar */ engine_addlink(e, &ci->hydro.gradient, t_gradient); engine_addlink(e, &ci->hydro.gradient_pack, t_gradient_gpu); @@ -2668,9 +2670,12 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, t_limiter, ci, with_cooling, with_timestep_limiter); - // A. Nasar add unlocks for pack tasks here. Unpacks depend on packs and will be used to create downstream deps later - scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, t_gradient_gpu); - scheduler_addunlock(sched, ci->hydro.super->hydro.extra_ghost, t_force_gpu); + // A. Nasar add unlocks for pack tasks here. 
Unpacks depend on packs and + // will be used to create downstream deps later + scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, + t_gradient_gpu); + scheduler_addunlock(sched, ci->hydro.super->hydro.extra_ghost, + t_force_gpu); #else /* Now, build all the dependencies for the hydro */ @@ -2849,8 +2854,8 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, t_force = scheduler_addtask(sched, task_type_pair, task_subtype_force, flags, 0, ci, cj); /* New task for the force A. Nasar */ - t_force_gpu = scheduler_addtask(sched, task_type_pair, task_subtype_gpu_pack_f, - 0, 0, ci, cj); + t_force_gpu = scheduler_addtask(sched, task_type_pair, + task_subtype_gpu_pack_f, 0, 0, ci, cj); #ifdef MPI_SYMMETRIC_FORCE_INTERACTION /* The order of operations for an inactive local cell interacting @@ -3007,14 +3012,15 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, /* Start by constructing the task for the second and third hydro loop */ t_gradient = scheduler_addtask(sched, task_type_pair, task_subtype_gradient, flags, 0, ci, cj); - /* Start by constructing the task for the second and third GPU hydro loop A. Nasar */ + /* Start by constructing the task for the second and third GPU hydro loop + * A. Nasar */ t_gradient_gpu = scheduler_addtask(sched, task_type_pair, - task_subtype_gpu_pack_g, 0, 0, ci, cj); + task_subtype_gpu_pack_g, 0, 0, ci, cj); /* Add the link between the new loop and both cells */ engine_addlink(e, &ci->hydro.gradient, t_gradient); engine_addlink(e, &cj->hydro.gradient, t_gradient); -// /* Add the link between the new loop and both cells */ + // /* Add the link between the new loop and both cells */ engine_addlink(e, &ci->hydro.gradient_pack, t_gradient_gpu); engine_addlink(e, &cj->hydro.gradient_pack, t_gradient_gpu); @@ -3025,16 +3031,20 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, t_limiter, ci, with_cooling, with_timestep_limiter); /*Same for GPU tasks*/ - scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, t_gradient_gpu); - scheduler_addunlock(sched, ci->hydro.super->hydro.extra_ghost, t_force_gpu); + scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, + t_gradient_gpu); + scheduler_addunlock(sched, ci->hydro.super->hydro.extra_ghost, + t_force_gpu); } if ((cj->nodeID == nodeID) && (ci->hydro.super != cj->hydro.super)) { engine_make_hydro_loops_dependencies(sched, t, t_gradient, t_force, t_limiter, cj, with_cooling, with_timestep_limiter); /*Same for GPU tasks*/ - scheduler_addunlock(sched, cj->hydro.super->hydro.ghost_out, t_gradient_gpu); - scheduler_addunlock(sched, cj->hydro.super->hydro.extra_ghost, t_force_gpu); + scheduler_addunlock(sched, cj->hydro.super->hydro.ghost_out, + t_gradient_gpu); + scheduler_addunlock(sched, cj->hydro.super->hydro.extra_ghost, + t_force_gpu); } #else @@ -3045,14 +3055,16 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, with_cooling, with_timestep_limiter); // GPU tasks A. Nasar - scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, t_force_gpu); + scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, + t_force_gpu); } if ((cj->nodeID == nodeID) && (ci->hydro.super != cj->hydro.super)) { engine_make_hydro_loops_dependencies(sched, t, t_force, t_limiter, cj, with_cooling, with_timestep_limiter); // GPU tasks A. 
Nasar - scheduler_addunlock(sched, cj->hydro.super->hydro.ghost_out, t_force_gpu); + scheduler_addunlock(sched, cj->hydro.super->hydro.ghost_out, + t_force_gpu); } #endif @@ -4266,13 +4278,13 @@ void engine_make_hydroloop_tasks_mapper(void *map_data, int num_elements, continue; /* If the cell is local build a self-interaction */ - struct task *t_pack_self;// A. Nasar + struct task *t_pack_self; // A. Nasar if (ci->nodeID == nodeID) { scheduler_addtask(sched, task_type_self, task_subtype_density, 0, 0, ci, NULL); // A. Nasar also add a pack task for GPU scheduler_addtask(sched, task_type_self, task_subtype_gpu_pack, 0, 0, ci, - NULL); + NULL); } /* Now loop over all the neighbours of this cell */ @@ -4306,8 +4318,8 @@ void engine_make_hydroloop_tasks_mapper(void *map_data, int num_elements, const int sid = sortlistID[(kk + 1) + 3 * ((jj + 1) + 3 * (ii + 1))]; scheduler_addtask(sched, task_type_pair, task_subtype_density, sid, 0, ci, cj); - scheduler_addtask(sched, task_type_pair, task_subtype_gpu_pack, sid, 0, - ci, cj); // A. Nasar + scheduler_addtask(sched, task_type_pair, task_subtype_gpu_pack, sid, + 0, ci, cj); // A. Nasar #ifdef SWIFT_DEBUG_CHECKS #ifdef WITH_MPI @@ -4735,7 +4747,7 @@ void engine_maketasks(struct engine *e) { ticks tic2 = getticks(); /*Initialise GPU task size in prep. for creation A. Nasar */ - sched->target_gpu_tasks = s->nr_cells; // OK AS LONG AS NOT SPLITTING + sched->target_gpu_tasks = s->nr_cells; // OK AS LONG AS NOT SPLITTING const int target_gpu_tasks = sched->target_gpu_tasks; /* Construct the first hydro loop over neighbours */ @@ -4848,8 +4860,7 @@ void engine_maketasks(struct engine *e) { for (int i = 0; i < sched->nr_tasks; i++) { struct task *t = &sched->tasks[i]; - if (t->subtype != task_subtype_gpu_pack) - continue; + if (t->subtype != task_subtype_gpu_pack) continue; if (t->type == task_type_self) { @@ -4862,7 +4873,9 @@ void engine_maketasks(struct engine *e) { /* pack -> unpack -> ghost_in */ scheduler_addunlock(sched, t, last_created_self_unpack); scheduler_addunlock(sched, last_created_self_unpack, - t->ci->hydro.super->hydro.ghost_in); //Keep self_unpack dependency here, pairs added later using links + t->ci->hydro.super->hydro + .ghost_in); // Keep self_unpack dependency here, + // pairs added later using links /*Creating links between each cell and its unpack task*/ engine_addlink(e, &t->ci->hydro.density_unpack, last_created_self_unpack); t->ci->hydro.d_unpack = last_created_self_unpack; @@ -4876,28 +4889,30 @@ void engine_maketasks(struct engine *e) { } /* pack -> unpack -> ghost_in */ - if(t->ci->hydro.ghost_in == NULL) + if (t->ci->hydro.ghost_in == NULL) fprintf(stderr, "Ghost in for cell i is NULL\n"); - if(t->cj->hydro.ghost_in == NULL) + if (t->cj->hydro.ghost_in == NULL) fprintf(stderr, "Ghost in for cell j is NULL\n"); scheduler_addunlock(sched, t, last_created_pair_unpack); - if(t->ci->nodeID == e->nodeID) - scheduler_addunlock(sched, last_created_pair_unpack, - t->ci->hydro.super->hydro.ghost_in); - if((t->cj->nodeID == e->nodeID) && (t->ci->hydro.super != t->cj->hydro.super)) - scheduler_addunlock(sched, last_created_pair_unpack, - t->cj->hydro.super->hydro.ghost_in); + if (t->ci->nodeID == e->nodeID) + scheduler_addunlock(sched, last_created_pair_unpack, + t->ci->hydro.super->hydro.ghost_in); + if ((t->cj->nodeID == e->nodeID) && + (t->ci->hydro.super != t->cj->hydro.super)) + scheduler_addunlock(sched, last_created_pair_unpack, + t->cj->hydro.super->hydro.ghost_in); engine_addlink(e, &t->ci->hydro.density_unpack, 
last_created_pair_unpack); engine_addlink(e, &t->cj->hydro.density_unpack, last_created_pair_unpack); - /*Useless as this ends up only setting one pair unpack as the unpack task for this cell whilst the cell - * interacts with many other cells and can be linked to another unpack task. Rely on links instead*/ -// t->ci->hydro.d_unpack = last_created_pair_unpack; -// t->cj->hydro.d_unpack = last_created_pair_unpack; + /*Useless as this ends up only setting one pair unpack as the unpack task + * for this cell whilst the cell interacts with many other cells and can + * be linked to another unpack task. Rely on links instead*/ + // t->ci->hydro.d_unpack = last_created_pair_unpack; + // t->cj->hydro.d_unpack = last_created_pair_unpack; -// t->ci->hydro.super->hydro.d_unpack = last_created_self_unpack; + // t->ci->hydro.super->hydro.d_unpack = last_created_self_unpack; ++count_current_pair; } else { @@ -4911,18 +4926,20 @@ void engine_maketasks(struct engine *e) { if (count_current_pair != sched->nr_pair_pack_tasks) error("We did not find the correct number of pair pack tasks!!"); #endif - /* Loop over all the currently existing ghost_in tasks to add unpack dependency*/ -// for (int i = 0; i < sched->nr_tasks; i++) { -// struct task *t = &sched->tasks[i]; -// if (t->type != task_type_ghost_in) -// continue; -//// if(t->ci->hydro.super == t->ci && t->ci->nodeID == e->nodeID) -//// scheduler_addunlock(sched, t->ci->hydro.d_unpack, t); -// for (struct link *l = t->ci->hydro.density_unpack; l != NULL; l = l->next) { -//// if(l->t->type == task_type_pair) -// scheduler_addunlock(sched, l->t, t); -// } -// } + /* Loop over all the currently existing ghost_in tasks to add unpack + * dependency*/ + // for (int i = 0; i < sched->nr_tasks; i++) { + // struct task *t = &sched->tasks[i]; + // if (t->type != task_type_ghost_in) + // continue; + //// if(t->ci->hydro.super == t->ci && t->ci->nodeID == e->nodeID) + //// scheduler_addunlock(sched, t->ci->hydro.d_unpack, t); + // for (struct link *l = t->ci->hydro.density_unpack; l != NULL; l = + // l->next) { + //// if(l->t->type == task_type_pair) + // scheduler_addunlock(sched, l->t, t); + // } + // } /* Run through the tasks and make force tasks for each density task. Each force task depends on the cell ghosts and unlocks the kick task of its super-cell. */ @@ -4938,7 +4955,6 @@ void engine_maketasks(struct engine *e) { * threadpool_auto_chunk_size, e); */ } - /*Now create unpacks for all gpu_pack_g (gradient) tasks A. 
Nasar */ count_current_self = 0; count_current_pair = 0; @@ -4949,8 +4965,7 @@ for (int i = 0; i < sched->nr_tasks; i++) { struct task *t = &sched->tasks[i]; - if (t->subtype != task_subtype_gpu_pack_g) - continue; + if (t->subtype != task_subtype_gpu_pack_g) continue; if (t->type == task_type_self) { @@ -4965,7 +4980,8 @@ scheduler_addunlock(sched, last_created_self_unpack, t->ci->hydro.super->hydro.extra_ghost); /*Creating links between a each cell and its unpack task*/ - engine_addlink(e, &t->ci->hydro.gradient_unpack, last_created_self_unpack); + engine_addlink(e, &t->ci->hydro.gradient_unpack, + last_created_self_unpack); t->ci->hydro.g_unpack = last_created_self_unpack; ++count_current_self; @@ -4979,19 +4995,23 @@ /* pack -> unpack -> ghost_in */ scheduler_addunlock(sched, t, last_created_pair_unpack); - if(t->ci->nodeID == e->nodeID) + if (t->ci->nodeID == e->nodeID) scheduler_addunlock(sched, last_created_pair_unpack, - t->ci->hydro.super->hydro.extra_ghost); - if((t->cj->nodeID == e->nodeID) && (t->ci->hydro.super != t->cj->hydro.super)) + t->ci->hydro.super->hydro.extra_ghost); + if ((t->cj->nodeID == e->nodeID) && + (t->ci->hydro.super != t->cj->hydro.super)) scheduler_addunlock(sched, last_created_pair_unpack, - t->cj->hydro.super->hydro.extra_ghost); - - engine_addlink(e, &t->ci->hydro.gradient_unpack, last_created_pair_unpack); - engine_addlink(e, &t->cj->hydro.gradient_unpack, last_created_pair_unpack); - /*Useless as this ends up only setting one pair unpack as the unpack task for this cell whilst the cell - * interacts with many other cells and can be linked to another unpack task. Rely on links instead*/ -// t->ci->hydro.g_unpack = last_created_pair_unpack; -// t->cj->hydro.g_unpack = last_created_pair_unpack; + t->cj->hydro.super->hydro.extra_ghost); + + engine_addlink(e, &t->ci->hydro.gradient_unpack, + last_created_pair_unpack); + engine_addlink(e, &t->cj->hydro.gradient_unpack, + last_created_pair_unpack); + /*Useless as this ends up only setting one pair unpack as the unpack task + * for this cell whilst the cell interacts with many other cells and can + * be linked to another unpack task. Rely on links instead*/ + // t->ci->hydro.g_unpack = last_created_pair_unpack; + // t->cj->hydro.g_unpack = last_created_pair_unpack; ++count_current_pair; } else { @@ -5001,22 +5021,30 @@ } #ifdef SWIFT_DEBUG_CHECKS if (count_current_self != sched->nr_self_pack_tasks_g) - error("We did not find the correct number of G self pack tasks!! count %i what it shoudl be %i", count_current_self, sched->nr_self_pack_tasks_g); + error( + "We did not find the correct number of G self pack tasks!! count %i " + "what it should be %i", + count_current_self, sched->nr_self_pack_tasks_g); if (count_current_pair != sched->nr_pair_pack_tasks_g) - error("We did not find the correct number of G pair pack tasks!! count %i what it shoudl be %i", count_current_pair, sched->nr_pair_pack_tasks_g); + error( + "We did not find the correct number of G pair pack tasks!!
count %i " + "what it shoudl be %i", + count_current_pair, sched->nr_pair_pack_tasks_g); #endif - /* Loop over all the currently existing extra_ghost tasks to add unpack dependency*/ -// for (int i = 0; i < sched->nr_tasks; i++) { -// struct task *t = &sched->tasks[i]; -// if (t->type != task_type_extra_ghost) -// continue; -//// if(t->ci->nodeID == e->nodeID) -//// scheduler_addunlock(sched, t->ci->hydro.g_unpack, t); -// for (struct link *l = t->ci->hydro.gradient_unpack; l != NULL; l = l->next) { -//// if(l->t->type == task_type_pair) -// scheduler_addunlock(sched, l->t, t); -// } -// } + /* Loop over all the currently existing extra_ghost tasks to add unpack + * dependency*/ + // for (int i = 0; i < sched->nr_tasks; i++) { + // struct task *t = &sched->tasks[i]; + // if (t->type != task_type_extra_ghost) + // continue; + //// if(t->ci->nodeID == e->nodeID) + //// scheduler_addunlock(sched, t->ci->hydro.g_unpack, t); + // for (struct link *l = t->ci->hydro.gradient_unpack; l != NULL; l = + // l->next) { + //// if(l->t->type == task_type_pair) + // scheduler_addunlock(sched, l->t, t); + // } + // } /*Now create unpacks for all gpu_pack_f (force) tasks*/ count_current_self = 0; count_current_pair = 0; @@ -5027,14 +5055,13 @@ void engine_maketasks(struct engine *e) { for (int i = 0; i < sched->nr_tasks; i++) { struct task *t = &sched->tasks[i]; - if (t->subtype != task_subtype_gpu_pack_f) - continue; + if (t->subtype != task_subtype_gpu_pack_f) continue; if (t->type == task_type_self) { if (count_current_self % pack_size == 0) { - last_created_self_unpack = scheduler_addtask( - sched, task_type_self, task_subtype_gpu_unpack_f, 0, 0, NULL, NULL); + last_created_self_unpack = scheduler_addtask( + sched, task_type_self, task_subtype_gpu_unpack_f, 0, 0, NULL, NULL); } /* pack -> unpack -> ghost_in */ @@ -5043,8 +5070,8 @@ void engine_maketasks(struct engine *e) { t->ci->hydro.super->hydro.end_force); /*Creating links between a each cell and its unpack task*/ engine_addlink(e, &t->ci->hydro.force_unpack, last_created_self_unpack); -// -// t->ci->hydro.f_unpack = last_created_self_unpack; + // + // t->ci->hydro.f_unpack = last_created_self_unpack; ++count_current_self; } @@ -5057,19 +5084,21 @@ void engine_maketasks(struct engine *e) { /* pack -> unpack -> ghost_in */ scheduler_addunlock(sched, t, last_created_pair_unpack); - if(t->ci->nodeID == e->nodeID) + if (t->ci->nodeID == e->nodeID) scheduler_addunlock(sched, last_created_pair_unpack, - t->ci->hydro.super->hydro.end_force); - if((t->cj->nodeID == e->nodeID) && (t->ci->hydro.super != t->cj->hydro.super)) + t->ci->hydro.super->hydro.end_force); + if ((t->cj->nodeID == e->nodeID) && + (t->ci->hydro.super != t->cj->hydro.super)) scheduler_addunlock(sched, last_created_pair_unpack, - t->cj->hydro.super->hydro.end_force); + t->cj->hydro.super->hydro.end_force); engine_addlink(e, &t->ci->hydro.force_unpack, last_created_pair_unpack); engine_addlink(e, &t->cj->hydro.force_unpack, last_created_pair_unpack); - /*Useless as this ends up only setting one pair unpack as the unpack task for this cell whilst the cell - * interacts with many other cells and can be linked to another unpack task. Rely on links instead*/ -// t->ci->hydro.f_unpack = last_created_pair_unpack; -// t->cj->hydro.f_unpack = last_created_pair_unpack; + /*Useless as this ends up only setting one pair unpack as the unpack task + * for this cell whilst the cell interacts with many other cells and can + * be linked to another unpack task. 
Rely on links instead*/ + // t->ci->hydro.f_unpack = last_created_pair_unpack; + // t->cj->hydro.f_unpack = last_created_pair_unpack; ++count_current_pair; } else { @@ -5083,18 +5112,20 @@ void engine_maketasks(struct engine *e) { if (count_current_pair != sched->nr_pair_pack_tasks_f) error("We did not find the correct number of F pair pack tasks!!"); #endif - /* Loop over all the currently existing end_force tasks to add unpack dependency*/ -// for (int i = 0; i < sched->nr_tasks; i++) { -// struct task *t = &sched->tasks[i]; -// if (t->type != task_type_end_hydro_force) -// continue; -//// if(t->ci->nodeID == e->nodeID) -//// scheduler_addunlock(sched, t->ci->hydro.f_unpack, t); -// for (struct link *l = t->ci->hydro.force_unpack; l != NULL; l = l->next) { -//// if(l->t->type == task_type_pair) -// scheduler_addunlock(sched, l->t, t); -// } -// } + /* Loop over all the currently existing end_force tasks to add unpack + * dependency*/ + // for (int i = 0; i < sched->nr_tasks; i++) { + // struct task *t = &sched->tasks[i]; + // if (t->type != task_type_end_hydro_force) + // continue; + //// if(t->ci->nodeID == e->nodeID) + //// scheduler_addunlock(sched, t->ci->hydro.f_unpack, t); + // for (struct link *l = t->ci->hydro.force_unpack; l != NULL; l = l->next) + // { + //// if(l->t->type == task_type_pair) + // scheduler_addunlock(sched, l->t, t); + // } + // } if (e->verbose) message("Making extra hydroloop tasks took %.3f %s.", clocks_from_ticks(getticks() - tic2), clocks_getunit()); diff --git a/src/engine_marktasks.c b/src/engine_marktasks.c index 11d8b46d92..26bdc1333f 100644 --- a/src/engine_marktasks.c +++ b/src/engine_marktasks.c @@ -86,20 +86,23 @@ void engine_marktasks_mapper(void *map_data, int num_elements, const enum task_types t_type = t->type; const enum task_subtypes t_subtype = t->subtype; - //Activate GPU unpack tasks (cell-less dummy tasks so need activating separately) - if (t_type == task_type_self && (t_subtype == task_subtype_gpu_unpack || - t_subtype == task_subtype_gpu_unpack_g || - t_subtype == task_subtype_gpu_unpack_f)){ // A. Nasar + // Activate GPU unpack tasks (cell-less dummy tasks so need activating + // separately) + if (t_type == task_type_self && + (t_subtype == task_subtype_gpu_unpack || + t_subtype == task_subtype_gpu_unpack_g || + t_subtype == task_subtype_gpu_unpack_f)) { // A. Nasar scheduler_activate(s, t); continue; } - if (t_type == task_type_pair && (t_subtype == task_subtype_gpu_unpack || - t_subtype == task_subtype_gpu_unpack_g || - t_subtype == task_subtype_gpu_unpack_f)){ // A. Nasar + if (t_type == task_type_pair && + (t_subtype == task_subtype_gpu_unpack || + t_subtype == task_subtype_gpu_unpack_g || + t_subtype == task_subtype_gpu_unpack_f)) { // A. Nasar scheduler_activate(s, t); continue; -// fprintf(stderr,"activated pair unpack in marktasks\n"); + // fprintf(stderr,"activated pair unpack in marktasks\n"); } /* Single-cell task? */ @@ -109,14 +112,15 @@ void engine_marktasks_mapper(void *map_data, int num_elements, struct cell *ci = t->ci; #ifdef SWIFT_DEBUG_CHECKS -#ifndef WITH_CUDA // A. Nasar +#ifndef WITH_CUDA // A. Nasar if (ci->nodeID != nodeID) error("Non-local self task found"); #else if (ci->nodeID != nodeID && t_subtype != task_subtype_gpu_unpack && - t_subtype != task_subtype_gpu_unpack_f && - t_subtype != task_subtype_gpu_unpack_g){ - fprintf(stderr, "task is %i\n", subtaskID_names[t->subtype]); - error("Non-local self task found. 
Task is subtaskID_names[%s]", subtaskID_names[t->subtype]); + t_subtype != task_subtype_gpu_unpack_f && + t_subtype != task_subtype_gpu_unpack_g) { + fprintf(stderr, "task is %i\n", subtaskID_names[t->subtype]); + error("Non-local self task found. Task is subtaskID_names[%s]", + subtaskID_names[t->subtype]); } #endif #endif @@ -151,7 +155,8 @@ void engine_marktasks_mapper(void *map_data, int num_elements, } /* Activate packing for GPU */ - else if (t_type == task_type_self && t_subtype == task_subtype_gpu_pack_g) { + else if (t_type == task_type_self && + t_subtype == task_subtype_gpu_pack_g) { if (ci_active_hydro) { scheduler_activate(s, t); ci->pack_done_g = 0; @@ -161,7 +166,8 @@ void engine_marktasks_mapper(void *map_data, int num_elements, } /* Activate packing for GPU */ - else if (t_type == task_type_self && t_subtype == task_subtype_gpu_pack_f) { + else if (t_type == task_type_self && + t_subtype == task_subtype_gpu_pack_f) { if (ci_active_hydro) { scheduler_activate(s, t); ci->pack_done_f = 0; @@ -465,29 +471,28 @@ void engine_marktasks_mapper(void *map_data, int num_elements, const int cj_active_rt = cell_is_rt_active(cj, e); /* Activate packing for GPU A. Nasar */ - if(t_subtype == task_subtype_gpu_pack && - ((ci_active_hydro && ci_nodeID == nodeID) || - (cj_active_hydro && cj_nodeID == nodeID))) { - scheduler_activate(s, t); - ci->gpu_done_pair = 0; - cj->gpu_done_pair = 0; - } - else if (t_subtype == task_subtype_gpu_pack_g && - ((ci_active_hydro && ci_nodeID == nodeID) || - (cj_active_hydro && cj_nodeID == nodeID))) { - scheduler_activate(s, t); - ci->gpu_done_pair_g = 0; - cj->gpu_done_pair_g = 0; - } - else if (t_subtype == task_subtype_gpu_pack_f && - ((ci_active_hydro && ci_nodeID == nodeID) || - (cj_active_hydro && cj_nodeID == nodeID))) { - scheduler_activate(s, t); - ci->gpu_done_pair_f = 0; - cj->gpu_done_pair_f = 0; - } - - /* Only activate tasks that involve a local active cell. A. Nasar THIS COULD BE SOURCE OF BUG */ + if (t_subtype == task_subtype_gpu_pack && + ((ci_active_hydro && ci_nodeID == nodeID) || + (cj_active_hydro && cj_nodeID == nodeID))) { + scheduler_activate(s, t); + ci->gpu_done_pair = 0; + cj->gpu_done_pair = 0; + } else if (t_subtype == task_subtype_gpu_pack_g && + ((ci_active_hydro && ci_nodeID == nodeID) || + (cj_active_hydro && cj_nodeID == nodeID))) { + scheduler_activate(s, t); + ci->gpu_done_pair_g = 0; + cj->gpu_done_pair_g = 0; + } else if (t_subtype == task_subtype_gpu_pack_f && + ((ci_active_hydro && ci_nodeID == nodeID) || + (cj_active_hydro && cj_nodeID == nodeID))) { + scheduler_activate(s, t); + ci->gpu_done_pair_f = 0; + cj->gpu_done_pair_f = 0; + } + + /* Only activate tasks that involve a local active cell. A. Nasar THIS + * COULD BE SOURCE OF BUG */ if ((t_subtype == task_subtype_density || t_subtype == task_subtype_gradient || t_subtype == task_subtype_limiter || diff --git a/src/engine_unskip.c b/src/engine_unskip.c index d14c29cffe..0d0ac13f74 100644 --- a/src/engine_unskip.c +++ b/src/engine_unskip.c @@ -79,8 +79,8 @@ struct unskip_data { */ static void engine_do_unskip_hydro(struct cell *c, struct engine *e) { -// scheduler_activate(&e->sched, c->hydro.super->hydro.gpu_pack); A. Nasar -// scheduler_activate(&e->sched, c->hydro.super->hydro.gpu_unpack); + // scheduler_activate(&e->sched, c->hydro.super->hydro.gpu_pack); A. Nasar + // scheduler_activate(&e->sched, c->hydro.super->hydro.gpu_unpack); /* Early abort (are we below the level where tasks are)? 
*/ if (!cell_get_flag(c, cell_flag_has_tasks)) return; diff --git a/src/files_for_new_functions/arrays_malloc.cu b/src/files_for_new_functions/arrays_malloc.cu old mode 100755 new mode 100644 index 11726e8528..3bbf998231 --- a/src/files_for_new_functions/arrays_malloc.cu +++ b/src/files_for_new_functions/arrays_malloc.cu @@ -1,4 +1,5 @@ #include "cuda/part_gpu.h" + #include #include #include @@ -10,284 +11,351 @@ extern "C" { #include "arrays_malloc.h" -void allocate_host(struct part_soa *parts_soa, int count_max_parts_tmp){ - ///////////Malloc Host arrays - cudaMallocHost((void **)&parts_soa->tid_p, count_max_parts_tmp * sizeof(int)); - cudaMallocHost((void **)&parts_soa->id, count_max_parts_tmp * sizeof(long long)); - cudaMallocHost((void **)&parts_soa->mass, count_max_parts_tmp * sizeof(float)); - cudaMallocHost((void **)&parts_soa->h, count_max_parts_tmp * sizeof(float)); - cudaMallocHost((void **)&parts_soa->u, count_max_parts_tmp * sizeof(float)); - cudaMallocHost((void **)&parts_soa->u_dt, count_max_parts_tmp * sizeof(float)); - cudaMallocHost((void **)&parts_soa->rho, count_max_parts_tmp * sizeof(float)); - cudaMallocHost((void **)&parts_soa->SPH_sum, count_max_parts_tmp * sizeof(float)); - cudaMallocHost((void **)&parts_soa->x_p, count_max_parts_tmp * sizeof(double)); - cudaMallocHost((void **)&parts_soa->y_p, count_max_parts_tmp * sizeof(double)); - cudaMallocHost((void **)&parts_soa->z_p, count_max_parts_tmp * sizeof(double)); - cudaMallocHost((void **)&parts_soa->ux, count_max_parts_tmp * sizeof(float)); - cudaMallocHost((void **)&parts_soa->uy, count_max_parts_tmp * sizeof(float)); - cudaMallocHost((void **)&parts_soa->uz, count_max_parts_tmp * sizeof(float)); - cudaMallocHost((void **)&parts_soa->a_hydrox, count_max_parts_tmp * sizeof(float)); - cudaMallocHost((void **)&parts_soa->a_hydroy, count_max_parts_tmp * sizeof(float)); - cudaMallocHost((void **)&parts_soa->a_hydroz, count_max_parts_tmp * sizeof(float)); - cudaMallocHost((void **)&parts_soa->locx, count_max_parts_tmp * sizeof(float)); - cudaMallocHost((void **)&parts_soa->locy, count_max_parts_tmp * sizeof(float)); - cudaMallocHost((void **)&parts_soa->locz, count_max_parts_tmp * sizeof(float)); - cudaMallocHost((void **)&parts_soa->widthx, count_max_parts_tmp * sizeof(float)); - cudaMallocHost((void **)&parts_soa->widthy, count_max_parts_tmp * sizeof(float)); - cudaMallocHost((void **)&parts_soa->widthz, count_max_parts_tmp * sizeof(float)); - cudaMallocHost((void **)&parts_soa->h_max, count_max_parts_tmp * sizeof(float)); - cudaMallocHost((void **)&parts_soa->count_p, count_max_parts_tmp * sizeof(int)); - cudaMallocHost((void **)&parts_soa->wcount, count_max_parts_tmp * sizeof(float)); - cudaMallocHost((void **)&parts_soa->wcount_dh, count_max_parts_tmp * sizeof(float)); - cudaMallocHost((void **)&parts_soa->rho_dh, count_max_parts_tmp * sizeof(float)); - cudaMallocHost((void **)&parts_soa->rot_ux, count_max_parts_tmp * sizeof(float)); - cudaMallocHost((void **)&parts_soa->rot_uy, count_max_parts_tmp * sizeof(float)); - cudaMallocHost((void **)&parts_soa->rot_uz, count_max_parts_tmp * sizeof(float)); - cudaMallocHost((void **)&parts_soa->div_v, count_max_parts_tmp * sizeof(float)); - cudaMallocHost((void **)&parts_soa->div_v_previous_step, - count_max_parts_tmp * sizeof(float)); - cudaMallocHost((void **)&parts_soa->alpha_visc, count_max_parts_tmp * sizeof(float)); - cudaMallocHost((void **)&parts_soa->v_sig, count_max_parts_tmp * sizeof(float)); - cudaMallocHost((void **)&parts_soa->laplace_u, count_max_parts_tmp * 
sizeof(float)); - cudaMallocHost((void **)&parts_soa->alpha_diff, count_max_parts_tmp * sizeof(float)); - cudaMallocHost((void **)&parts_soa->f, count_max_parts_tmp * sizeof(float)); - cudaMallocHost((void **)&parts_soa->soundspeed, count_max_parts_tmp * sizeof(float)); - cudaMallocHost((void **)&parts_soa->h_dt, count_max_parts_tmp * sizeof(float)); - cudaMallocHost((void **)&parts_soa->balsara, count_max_parts_tmp * sizeof(float)); - cudaMallocHost((void **)&parts_soa->pressure, count_max_parts_tmp * sizeof(float)); - cudaMallocHost((void **)&parts_soa->alpha_visc_max_ngb, - count_max_parts_tmp * sizeof(float)); - /* timestep stuff */ - cudaMallocHost((void **)&parts_soa->time_bin, count_max_parts_tmp * sizeof(timebin_t)); - cudaMallocHost((void **)&parts_soa->wakeup, count_max_parts_tmp * sizeof(timebin_t)); - cudaMallocHost((void **)&parts_soa->min_ngb_time_bin, - count_max_parts_tmp * sizeof(timebin_t)); - cudaMallocHost((void **)&parts_soa->to_be_synchronized, - count_max_parts_tmp * sizeof(char)); +void allocate_host(struct part_soa *parts_soa, int count_max_parts_tmp) { + ///////////Malloc Host arrays + cudaMallocHost((void **)&parts_soa->tid_p, count_max_parts_tmp * sizeof(int)); + cudaMallocHost((void **)&parts_soa->id, + count_max_parts_tmp * sizeof(long long)); + cudaMallocHost((void **)&parts_soa->mass, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->h, count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->u, count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->u_dt, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->rho, count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->SPH_sum, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->x_p, + count_max_parts_tmp * sizeof(double)); + cudaMallocHost((void **)&parts_soa->y_p, + count_max_parts_tmp * sizeof(double)); + cudaMallocHost((void **)&parts_soa->z_p, + count_max_parts_tmp * sizeof(double)); + cudaMallocHost((void **)&parts_soa->ux, count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->uy, count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->uz, count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->a_hydrox, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->a_hydroy, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->a_hydroz, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->locx, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->locy, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->locz, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->widthx, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->widthy, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->widthz, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->h_max, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->count_p, + count_max_parts_tmp * sizeof(int)); + cudaMallocHost((void **)&parts_soa->wcount, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->wcount_dh, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->rho_dh, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->rot_ux, + count_max_parts_tmp * 
sizeof(float)); + cudaMallocHost((void **)&parts_soa->rot_uy, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->rot_uz, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->div_v, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->div_v_previous_step, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->alpha_visc, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->v_sig, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->laplace_u, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->alpha_diff, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->f, count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->soundspeed, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->h_dt, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->balsara, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->pressure, + count_max_parts_tmp * sizeof(float)); + cudaMallocHost((void **)&parts_soa->alpha_visc_max_ngb, + count_max_parts_tmp * sizeof(float)); + /* timestep stuff */ + cudaMallocHost((void **)&parts_soa->time_bin, + count_max_parts_tmp * sizeof(timebin_t)); + cudaMallocHost((void **)&parts_soa->wakeup, + count_max_parts_tmp * sizeof(timebin_t)); + cudaMallocHost((void **)&parts_soa->min_ngb_time_bin, + count_max_parts_tmp * sizeof(timebin_t)); + cudaMallocHost((void **)&parts_soa->to_be_synchronized, + count_max_parts_tmp * sizeof(char)); } -void allocate_device(struct part_soa d_parts_soa, int count_max_parts_tmp){ - ////////now malloc variables for particle data on the GPU. 
Sheesh - fprintf(stderr, "before malloc\n"); - cudaMalloc((void **)&(d_parts_soa.tid_p), sizeof(int) * count_max_parts_tmp); - fprintf(stderr, "after malloc\n"); - cudaMalloc((void **)&(d_parts_soa.id), sizeof(long long) * count_max_parts_tmp); - cudaMalloc((void **)&(d_parts_soa.x_p), sizeof(double) * count_max_parts_tmp); - cudaMalloc((void **)&(d_parts_soa.y_p), sizeof(double) * count_max_parts_tmp); - cudaMalloc((void **)&(d_parts_soa.z_p), sizeof(double) * count_max_parts_tmp); - cudaMalloc((void **)&(d_parts_soa.ux), sizeof(float) * count_max_parts_tmp); - cudaMalloc((void **)&(d_parts_soa.uy), sizeof(float) * count_max_parts_tmp); - cudaMalloc((void **)&(d_parts_soa.uz), sizeof(float) * count_max_parts_tmp); - cudaMalloc((void **)&(d_parts_soa.a_hydrox), sizeof(float) * count_max_parts_tmp); - cudaMalloc((void **)&(d_parts_soa.a_hydroy), sizeof(float) * count_max_parts_tmp); - cudaMalloc((void **)&(d_parts_soa.a_hydroz), sizeof(float) * count_max_parts_tmp); - cudaMalloc((void **)&(d_parts_soa.mass), sizeof(float) * count_max_parts_tmp); - cudaMalloc((void **)&(d_parts_soa.h), sizeof(float) * count_max_parts_tmp); - cudaMalloc((void **)&(d_parts_soa.u), sizeof(float) * count_max_parts_tmp); - cudaMalloc((void **)&(d_parts_soa.u_dt), sizeof(float) * count_max_parts_tmp); - cudaMalloc((void **)&(d_parts_soa.rho), sizeof(float) * count_max_parts_tmp); - cudaMalloc((void **)&(d_parts_soa.SPH_sum), sizeof(float) * count_max_parts_tmp); - cudaMalloc((void **)&(d_parts_soa.locx), sizeof(float) * count_max_parts_tmp); - cudaMalloc((void **)&(d_parts_soa.locy), sizeof(float) * count_max_parts_tmp); - cudaMalloc((void **)&(d_parts_soa.locz), sizeof(float) * count_max_parts_tmp); - cudaMalloc((void **)&(d_parts_soa.widthx), sizeof(float) * count_max_parts_tmp); - cudaMalloc((void **)&(d_parts_soa.widthy), sizeof(float) * count_max_parts_tmp); - cudaMalloc((void **)&(d_parts_soa.widthz), sizeof(float) * count_max_parts_tmp); - cudaMalloc((void **)&(d_parts_soa.h_max), sizeof(float) * count_max_parts_tmp); - cudaMalloc((void **)&(d_parts_soa.count_p), sizeof(int) * count_max_parts_tmp); - cudaMalloc((void **)&(d_parts_soa.wcount), sizeof(float) * count_max_parts_tmp); - cudaMalloc((void **)&(d_parts_soa.wcount_dh), sizeof(float) * count_max_parts_tmp); - cudaMalloc((void **)&(d_parts_soa.rho_dh), sizeof(float) * count_max_parts_tmp); - cudaMalloc((void **)&(d_parts_soa.rot_ux), sizeof(float) * count_max_parts_tmp); - cudaMalloc((void **)&(d_parts_soa.rot_uy), sizeof(float) * count_max_parts_tmp); - cudaMalloc((void **)&(d_parts_soa.rot_uz), sizeof(float) * count_max_parts_tmp); - cudaMalloc((void **)&(d_parts_soa.div_v), sizeof(float) * count_max_parts_tmp); - cudaMalloc((void **)&(d_parts_soa.div_v_previous_step), - sizeof(float) * count_max_parts_tmp); - cudaMalloc((void **)&(d_parts_soa.alpha_visc), sizeof(float) * count_max_parts_tmp); - cudaMalloc((void **)&(d_parts_soa.v_sig), sizeof(float) * count_max_parts_tmp); - cudaMalloc((void **)&(d_parts_soa.laplace_u), sizeof(float) * count_max_parts_tmp); - cudaMalloc((void **)&(d_parts_soa.alpha_diff), sizeof(float) * count_max_parts_tmp); - cudaMalloc((void **)&(d_parts_soa.f), sizeof(float) * count_max_parts_tmp); - cudaMalloc((void **)&(d_parts_soa.soundspeed), sizeof(float) * count_max_parts_tmp); - cudaMalloc((void **)&(d_parts_soa.h_dt), sizeof(float) * count_max_parts_tmp); - cudaMalloc((void **)&(d_parts_soa.balsara), sizeof(float) * count_max_parts_tmp); - cudaMalloc((void **)&(d_parts_soa.pressure), sizeof(float) * 
count_max_parts_tmp); - cudaMalloc((void **)&(d_parts_soa.alpha_visc_max_ngb), - sizeof(float) * count_max_parts_tmp); - /* timestep stuff */ - cudaMalloc((void **)&(d_parts_soa.time_bin), sizeof(timebin_t) * count_max_parts_tmp); - cudaMalloc((void **)&(d_parts_soa.wakeup), sizeof(timebin_t) * count_max_parts_tmp); - cudaMalloc((void **)&(d_parts_soa.min_ngb_time_bin), - sizeof(timebin_t) * count_max_parts_tmp); - cudaMalloc((void **)&(d_parts_soa.to_be_synchronized), - sizeof(char) * count_max_parts_tmp); +void allocate_device(struct part_soa d_parts_soa, int count_max_parts_tmp) { + ////////now malloc variables for particle data on the GPU. Sheesh + fprintf(stderr, "before malloc\n"); + cudaMalloc((void **)&(d_parts_soa.tid_p), sizeof(int) * count_max_parts_tmp); + fprintf(stderr, "after malloc\n"); + cudaMalloc((void **)&(d_parts_soa.id), + sizeof(long long) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.x_p), sizeof(double) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.y_p), sizeof(double) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.z_p), sizeof(double) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.ux), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.uy), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.uz), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.a_hydrox), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.a_hydroy), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.a_hydroz), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.mass), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.h), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.u), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.u_dt), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.rho), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.SPH_sum), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.locx), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.locy), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.locz), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.widthx), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.widthy), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.widthz), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.h_max), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.count_p), + sizeof(int) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.wcount), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.wcount_dh), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.rho_dh), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.rot_ux), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.rot_uy), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.rot_uz), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.div_v), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.div_v_previous_step), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.alpha_visc), + sizeof(float) * 
count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.v_sig), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.laplace_u), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.alpha_diff), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.f), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.soundspeed), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.h_dt), sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.balsara), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.pressure), + sizeof(float) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.alpha_visc_max_ngb), + sizeof(float) * count_max_parts_tmp); + /* timestep stuff */ + cudaMalloc((void **)&(d_parts_soa.time_bin), + sizeof(timebin_t) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.wakeup), + sizeof(timebin_t) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.min_ngb_time_bin), + sizeof(timebin_t) * count_max_parts_tmp); + cudaMalloc((void **)&(d_parts_soa.to_be_synchronized), + sizeof(char) * count_max_parts_tmp); } -cudaError_t cudaAllocInt(int ** d_var, int elements){ - return cudaMalloc((void**)d_var, elements * sizeof(int)); +cudaError_t cudaAllocInt(int **d_var, int elements) { + return cudaMalloc((void **)d_var, elements * sizeof(int)); } -cudaError_t cudaAllocFloat(float ** d_var, int elements){ - return cudaMalloc((void**)d_var, elements * sizeof(float)); +cudaError_t cudaAllocFloat(float **d_var, int elements) { + return cudaMalloc((void **)d_var, elements * sizeof(float)); } -cudaError_t cudaAllocDouble(double ** d_var, int elements){ - return cudaMalloc((void**)d_var, elements * sizeof(double)); +cudaError_t cudaAllocDouble(double **d_var, int elements) { + return cudaMalloc((void **)d_var, elements * sizeof(double)); } -cudaError_t cudaAllocLonglong(long long ** d_var, int elements){ - return cudaMalloc((void**)d_var, elements * sizeof(long long)); +cudaError_t cudaAllocLonglong(long long **d_var, int elements) { + return cudaMalloc((void **)d_var, elements * sizeof(long long)); } -cudaError_t cudaAllocChar(char ** d_var, int elements){ - return cudaMalloc((void**)d_var, elements * sizeof(char)); +cudaError_t cudaAllocChar(char **d_var, int elements) { + return cudaMalloc((void **)d_var, elements * sizeof(char)); } -cudaError_t cudaAllocTimebin(timebin_t ** d_var, int elements){ - return cudaMalloc((void**)d_var, elements * sizeof(timebin_t)); +cudaError_t cudaAllocTimebin(timebin_t **d_var, int elements) { + return cudaMalloc((void **)d_var, elements * sizeof(timebin_t)); } +void allocate_device_dirty( + int **d_tid_p, long long **d_id, double **d_x_p, double **d_y_p, + double **d_z_p, float **d_ux, float **d_uy, float **d_uz, + float **d_a_hydrox, float **d_a_hydroy, float **d_a_hydroz, float **d_mass, + float **d_h, float **d_u, float **d_u_dt, float **d_rho, float **d_locx, + float **d_locy, float **d_locz, float **d_widthx, float **d_widthy, + float **d_widthz, float **d_h_max, int **d_count_p, float **d_wcount, + float **d_wcount_dh, float **d_rho_dh, float **d_rot_ux, float **d_rot_uy, + float **d_rot_uz, float **d_div_v, float **d_div_v_previous_step, + float **d_alpha_visc, float **d_v_sig, float **d_laplace_u, + float **d_alpha_diff, float **d_f, float **d_soundspeed, float **d_h_dt, + float **d_balsara, float **d_pressure, float **d_alpha_visc_max_ngb, + timebin_t **d_time_bin, timebin_t **d_wakeup, + 
timebin_t **d_min_ngb_time_bin, char **d_to_be_synchronized, + int count_max_parts_tmp) { + ////////Malloc variables for particle data on the GPU. Sheesh, that's a lot -void allocate_device_dirty(int **d_tid_p, long long **d_id, double **d_x_p, double **d_y_p, double **d_z_p, - float **d_ux, float **d_uy, float **d_uz, float **d_a_hydrox, float **d_a_hydroy, float **d_a_hydroz, - float **d_mass, float **d_h ,float **d_u, float **d_u_dt, float **d_rho, float **d_locx, float **d_locy, - float **d_locz, float **d_widthx, float **d_widthy, float **d_widthz, float **d_h_max, int **d_count_p, - float **d_wcount, float **d_wcount_dh, float **d_rho_dh, float **d_rot_ux, float **d_rot_uy, float **d_rot_uz, - float **d_div_v, float **d_div_v_previous_step, float **d_alpha_visc, float **d_v_sig, float **d_laplace_u, - float **d_alpha_diff, float **d_f, float **d_soundspeed, float **d_h_dt, float **d_balsara,float **d_pressure, - float **d_alpha_visc_max_ngb, timebin_t **d_time_bin, timebin_t **d_wakeup, timebin_t **d_min_ngb_time_bin, - char **d_to_be_synchronized, int count_max_parts_tmp){ - ////////Malloc variables for particle data on the GPU. Sheesh, that's a lot - - size_t free_byte ; - size_t total_byte ; + size_t free_byte; + size_t total_byte; - cudaError_t cuda_status = cudaMemGetInfo( &free_byte, &total_byte ) ; - double free = (double)free_byte; - double available = (double)total_byte; - double used = (available - free); -// message("free %lf used %lf", free/10.E8, used/10.E8); + cudaError_t cuda_status = cudaMemGetInfo(&free_byte, &total_byte); + double free = (double)free_byte; + double available = (double)total_byte; + double used = (available - free); + // message("free %lf used %lf", free/10.E8, used/10.E8); - cudaError_t cu_error = cudaAllocInt(d_tid_p, count_max_parts_tmp); - cu_error = cudaAllocLonglong(d_id, count_max_parts_tmp); - cu_error = cudaAllocDouble(d_x_p, count_max_parts_tmp); - cu_error = cudaAllocDouble(d_y_p, count_max_parts_tmp); - cu_error = cudaAllocDouble(d_z_p, count_max_parts_tmp); - cu_error = cudaAllocFloat(d_ux, count_max_parts_tmp); - cu_error = cudaAllocFloat(d_uy, count_max_parts_tmp); - cu_error = cudaAllocFloat(d_uz, count_max_parts_tmp); - cu_error = cudaAllocFloat(d_a_hydrox, count_max_parts_tmp); - cu_error = cudaAllocFloat(d_a_hydroy, count_max_parts_tmp); - cu_error = cudaAllocFloat(d_a_hydroz, count_max_parts_tmp); - cu_error = cudaAllocFloat(d_mass, count_max_parts_tmp); - cu_error = cudaAllocFloat(d_h, count_max_parts_tmp); - cu_error = cudaAllocFloat(d_u, count_max_parts_tmp); - cu_error = cudaAllocFloat(d_u_dt, count_max_parts_tmp); - cu_error = cudaAllocFloat(d_rho, count_max_parts_tmp); - cu_error = cudaAllocFloat(d_locx, count_max_parts_tmp); - cu_error = cudaAllocFloat(d_locy, count_max_parts_tmp); - cu_error = cudaAllocFloat(d_locz, count_max_parts_tmp); - cu_error = cudaAllocFloat(d_widthx, count_max_parts_tmp); - cu_error = cudaAllocFloat(d_widthy, count_max_parts_tmp); - cu_error = cudaAllocFloat(d_widthz, count_max_parts_tmp); - cu_error = cudaAllocFloat(d_h_max, count_max_parts_tmp); - cu_error = cudaAllocInt(d_count_p, count_max_parts_tmp); - cu_error = cudaAllocFloat(d_wcount, count_max_parts_tmp); - cu_error = cudaAllocFloat(d_wcount_dh, count_max_parts_tmp); - cu_error = cudaAllocFloat(d_rho_dh, count_max_parts_tmp); - cu_error = cudaAllocFloat(d_rot_ux, count_max_parts_tmp); - cu_error = cudaAllocFloat(d_rot_uy, count_max_parts_tmp); - cu_error = cudaAllocFloat(d_rot_uz, count_max_parts_tmp); - cu_error = cudaAllocFloat(d_div_v, 
count_max_parts_tmp); - cu_error = cudaAllocFloat(d_div_v_previous_step, count_max_parts_tmp); - cu_error = cudaAllocFloat(d_alpha_visc, count_max_parts_tmp); - cu_error = cudaAllocFloat(d_v_sig, count_max_parts_tmp); - cu_error = cudaAllocFloat(d_laplace_u, count_max_parts_tmp); - cu_error = cudaAllocFloat(d_alpha_diff, count_max_parts_tmp); - cu_error = cudaAllocFloat(d_f, count_max_parts_tmp); - cu_error = cudaAllocFloat(d_soundspeed, count_max_parts_tmp); - cu_error = cudaAllocFloat(d_h_dt, count_max_parts_tmp); - cu_error = cudaAllocFloat(d_balsara, count_max_parts_tmp); - cu_error = cudaAllocFloat(d_pressure, count_max_parts_tmp); - cu_error = cudaAllocFloat(d_alpha_visc_max_ngb, count_max_parts_tmp); - /* timestep stuff */ - cu_error = cudaAllocTimebin(d_time_bin, count_max_parts_tmp); - cu_error = cudaAllocTimebin(d_wakeup, count_max_parts_tmp); - cu_error = cudaAllocTimebin(d_min_ngb_time_bin, count_max_parts_tmp); - cu_error = cudaAllocChar(d_to_be_synchronized, count_max_parts_tmp); + cudaError_t cu_error = cudaAllocInt(d_tid_p, count_max_parts_tmp); + cu_error = cudaAllocLonglong(d_id, count_max_parts_tmp); + cu_error = cudaAllocDouble(d_x_p, count_max_parts_tmp); + cu_error = cudaAllocDouble(d_y_p, count_max_parts_tmp); + cu_error = cudaAllocDouble(d_z_p, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_ux, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_uy, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_uz, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_a_hydrox, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_a_hydroy, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_a_hydroz, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_mass, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_h, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_u, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_u_dt, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_rho, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_locx, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_locy, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_locz, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_widthx, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_widthy, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_widthz, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_h_max, count_max_parts_tmp); + cu_error = cudaAllocInt(d_count_p, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_wcount, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_wcount_dh, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_rho_dh, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_rot_ux, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_rot_uy, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_rot_uz, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_div_v, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_div_v_previous_step, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_alpha_visc, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_v_sig, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_laplace_u, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_alpha_diff, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_f, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_soundspeed, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_h_dt, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_balsara, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_pressure, count_max_parts_tmp); + cu_error = cudaAllocFloat(d_alpha_visc_max_ngb, count_max_parts_tmp); + /* timestep 
stuff */ + cu_error = cudaAllocTimebin(d_time_bin, count_max_parts_tmp); + cu_error = cudaAllocTimebin(d_wakeup, count_max_parts_tmp); + cu_error = cudaAllocTimebin(d_min_ngb_time_bin, count_max_parts_tmp); + cu_error = cudaAllocChar(d_to_be_synchronized, count_max_parts_tmp); // cuda_status = cudaMemGetInfo( &free_byte, &total_byte ) ; // double free_end = (double)free_byte; // available = (double)total_byte; // double used_end = (available - free_end); -// message("cuda malloc self free %lf GB used %lf GB used to allocate self" -// " data %lf MB", free_end/10.E8, used_end/10.E8, (used_end - used)/10.E5); +// message("cuda malloc self free %lf GB used %lf GB used to allocate +// self" +// " data %lf MB", free_end/10.E8, used_end/10.E8, +// (used_end - used)/10.E5); // message("at end of malloc dirty: %s", // cudaGetErrorString(cu_error)); #ifdef CUDA_DEBUG - if (cu_error != cudaSuccess) { - fprintf(stderr, - "CUDA error at end of malloc dirty: %s\n", - cudaGetErrorString(cu_error)); - exit(0); - } + if (cu_error != cudaSuccess) { + fprintf(stderr, "CUDA error at end of malloc dirty: %s\n", + cudaGetErrorString(cu_error)); + exit(0); + } #endif - - } -void allocate_device_test(int **tid_test, int count_max_parts_tmp){ - ////////now malloc variables for particle data on the GPU. Sheesh +void allocate_device_test(int **tid_test, int count_max_parts_tmp) { + ////////now malloc variables for particle data on the GPU. Sheesh - cudaMalloc((void **) tid_test, sizeof(int) * count_max_parts_tmp); + cudaMalloc((void **)tid_test, sizeof(int) * count_max_parts_tmp); - cudaError_t cu_error = cudaPeekAtLastError(); // Get error code - fprintf(stderr, - "malloc tid: %s\n", - cudaGetErrorString(cu_error)); - - if (cu_error != cudaSuccess) { - fprintf(stderr, - "CUDA error with malloc tid: %s\n", - cudaGetErrorString(cu_error)); - exit(0); - } + cudaError_t cu_error = cudaPeekAtLastError(); // Get error code + fprintf(stderr, "malloc tid: %s\n", cudaGetErrorString(cu_error)); + if (cu_error != cudaSuccess) { + fprintf(stderr, "CUDA error with malloc tid: %s\n", + cudaGetErrorString(cu_error)); + exit(0); + } } /*Function to be overloaded using different part_soa structs * and allocate their internal arrays * alloc_type 0 for density, 1 for force, 2 for gradient*/ -void host_malloc(struct part_soa *parts_soa, int alloc_type, int count_max_parts_tmp){ - allocate_host(parts_soa, count_max_parts_tmp); +void host_malloc(struct part_soa *parts_soa, int alloc_type, + int count_max_parts_tmp) { + allocate_host(parts_soa, count_max_parts_tmp); } /*Function to be overloaded using different part_soa structs * and allocate their internal arrays * alloc_type 0 for density, 1 for force, 2 for gradient*/ -void device_malloc(struct part_soa d_parts_soa, int alloc_type, int count_max_parts_tmp){ - allocate_device(d_parts_soa, count_max_parts_tmp); +void device_malloc(struct part_soa d_parts_soa, int alloc_type, + int count_max_parts_tmp) { + allocate_device(d_parts_soa, count_max_parts_tmp); } -void device_malloc_dirty(int **d_tid_p, long long **d_id, double **d_x_p, double **d_y_p, double **d_z_p, - float **d_ux, float **d_uy, float **d_uz, float **d_a_hydrox, float **d_a_hydroy, float **d_a_hydroz, - float **d_mass, float **d_h ,float **d_u, float **d_u_dt, float **d_rho, float **d_locx, float **d_locy, - float **d_locz, float **d_widthx, float **d_widthy, float **d_widthz, float **d_h_max, int **d_count_p, - float **d_wcount, float **d_wcount_dh, float **d_rho_dh, float **d_rot_ux, float **d_rot_uy, float **d_rot_uz, - 
float **d_div_v, float **d_div_v_previous_step, float **d_alpha_visc, float **d_v_sig, float **d_laplace_u, - float **d_alpha_diff, float **d_f, float **d_soundspeed, float **d_h_dt, float **d_balsara,float **d_pressure, - float **d_alpha_visc_max_ngb, timebin_t **d_time_bin, timebin_t **d_wakeup, timebin_t **d_min_ngb_time_bin, - char **d_to_be_synchronized, int count_max_parts_tmp){ - - allocate_device_dirty(d_tid_p, d_id, d_x_p, d_y_p, d_z_p, - d_ux, d_uy, d_uz, d_a_hydrox, d_a_hydroy, d_a_hydroz, - d_mass, d_h , d_u, d_u_dt, d_rho, d_locx, d_locy, - d_locz, d_widthx, d_widthy, d_widthz, d_h_max, d_count_p, - d_wcount, d_wcount_dh, d_rho_dh, d_rot_ux, d_rot_uy, d_rot_uz, - d_div_v, d_div_v_previous_step, d_alpha_visc, d_v_sig, d_laplace_u, - d_alpha_diff, d_f, d_soundspeed, d_h_dt, d_balsara, d_pressure, - d_alpha_visc_max_ngb, d_time_bin, d_wakeup, d_min_ngb_time_bin, - d_to_be_synchronized, count_max_parts_tmp); +void device_malloc_dirty( + int **d_tid_p, long long **d_id, double **d_x_p, double **d_y_p, + double **d_z_p, float **d_ux, float **d_uy, float **d_uz, + float **d_a_hydrox, float **d_a_hydroy, float **d_a_hydroz, float **d_mass, + float **d_h, float **d_u, float **d_u_dt, float **d_rho, float **d_locx, + float **d_locy, float **d_locz, float **d_widthx, float **d_widthy, + float **d_widthz, float **d_h_max, int **d_count_p, float **d_wcount, + float **d_wcount_dh, float **d_rho_dh, float **d_rot_ux, float **d_rot_uy, + float **d_rot_uz, float **d_div_v, float **d_div_v_previous_step, + float **d_alpha_visc, float **d_v_sig, float **d_laplace_u, + float **d_alpha_diff, float **d_f, float **d_soundspeed, float **d_h_dt, + float **d_balsara, float **d_pressure, float **d_alpha_visc_max_ngb, + timebin_t **d_time_bin, timebin_t **d_wakeup, + timebin_t **d_min_ngb_time_bin, char **d_to_be_synchronized, + int count_max_parts_tmp) { + allocate_device_dirty( + d_tid_p, d_id, d_x_p, d_y_p, d_z_p, d_ux, d_uy, d_uz, d_a_hydrox, + d_a_hydroy, d_a_hydroz, d_mass, d_h, d_u, d_u_dt, d_rho, d_locx, d_locy, + d_locz, d_widthx, d_widthy, d_widthz, d_h_max, d_count_p, d_wcount, + d_wcount_dh, d_rho_dh, d_rot_ux, d_rot_uy, d_rot_uz, d_div_v, + d_div_v_previous_step, d_alpha_visc, d_v_sig, d_laplace_u, d_alpha_diff, + d_f, d_soundspeed, d_h_dt, d_balsara, d_pressure, d_alpha_visc_max_ngb, + d_time_bin, d_wakeup, d_min_ngb_time_bin, d_to_be_synchronized, + count_max_parts_tmp); } -void device_malloc_test(int **tid_test, int count_max_parts_tmp){ - - allocate_device_test(tid_test, count_max_parts_tmp); +void device_malloc_test(int **tid_test, int count_max_parts_tmp) { + allocate_device_test(tid_test, count_max_parts_tmp); } #ifdef WITH_CUDA diff --git a/src/files_for_new_functions/arrays_malloc.h b/src/files_for_new_functions/arrays_malloc.h old mode 100755 new mode 100644 index 798dc7895b..1107b51444 --- a/src/files_for_new_functions/arrays_malloc.h +++ b/src/files_for_new_functions/arrays_malloc.h @@ -1,50 +1,64 @@ #include "cuda/part_gpu.h" + #include #include #include #include -cudaError_t cudaAllocInt(int ** d_var, int elements); -cudaError_t cudaAllocFloat(float ** d_var, int elements); -cudaError_t cudaAllocDouble(double ** d_var, int elements); -cudaError_t cudaAllocLonglong(long long ** d_var, int elements); -cudaError_t cudaAllocChar(char ** d_var, int elements); -cudaError_t cudaAllocTimebin(timebin_t ** d_var, int elements); +cudaError_t cudaAllocInt(int **d_var, int elements); +cudaError_t cudaAllocFloat(float **d_var, int elements); +cudaError_t cudaAllocDouble(double **d_var, int 
elements); +cudaError_t cudaAllocLonglong(long long **d_var, int elements); +cudaError_t cudaAllocChar(char **d_var, int elements); +cudaError_t cudaAllocTimebin(timebin_t **d_var, int elements); void allocate_host(struct part_soa *parts_soa, int count_max_parts_tmp); void allocate_device(struct part_soa d_parts_soa, int count_max_parts_tmp); -void allocate_device_dirty(int **d_tid_p, long long **d_id, double **d_x_p, double **d_y_p, double **d_z_p, - float **d_ux, float **d_uy, float **d_uz, float **d_a_hydrox, float **d_a_hydroy, float **d_a_hydroz, - float **d_mass, float **d_h ,float **d_u, float **d_u_dt, float **d_rho, float **d_locx, float **d_locy, - float **d_locz, float **d_widthx, float **d_widthy, float **d_widthz, float **d_h_max, int **d_count_p, - float **d_wcount, float **d_wcount_dh, float **d_rho_dh, float **d_rot_ux, float **d_rot_uy, float **d_rot_uz, - float **d_div_v, float **d_div_v_previous_step, float **d_alpha_visc, float **d_v_sig, float **d_laplace_u, - float **d_alpha_diff, float **d_f, float **d_soundspeed, float **d_h_dt, float **d_balsara,float **d_pressure, - float **d_alpha_visc_max_ngb, timebin_t **d_time_bin, timebin_t **d_wakeup, timebin_t **d_min_ngb_time_bin, - char **d_to_be_synchronized, int count_max_parts_tmp); +void allocate_device_dirty( + int **d_tid_p, long long **d_id, double **d_x_p, double **d_y_p, + double **d_z_p, float **d_ux, float **d_uy, float **d_uz, + float **d_a_hydrox, float **d_a_hydroy, float **d_a_hydroz, float **d_mass, + float **d_h, float **d_u, float **d_u_dt, float **d_rho, float **d_locx, + float **d_locy, float **d_locz, float **d_widthx, float **d_widthy, + float **d_widthz, float **d_h_max, int **d_count_p, float **d_wcount, + float **d_wcount_dh, float **d_rho_dh, float **d_rot_ux, float **d_rot_uy, + float **d_rot_uz, float **d_div_v, float **d_div_v_previous_step, + float **d_alpha_visc, float **d_v_sig, float **d_laplace_u, + float **d_alpha_diff, float **d_f, float **d_soundspeed, float **d_h_dt, + float **d_balsara, float **d_pressure, float **d_alpha_visc_max_ngb, + timebin_t **d_time_bin, timebin_t **d_wakeup, + timebin_t **d_min_ngb_time_bin, char **d_to_be_synchronized, + int count_max_parts_tmp); void allocate_device_test(int **tid_test, int count_max_parts_tmp); /*Function to be overloaded using different part_soa structs * and allocate their internal arrays * alloc_type 0 for density, 1 for force, 2 for gradient*/ -void host_malloc(struct part_soa *parts_soa, int alloc_type, int count_max_parts_tmp); +void host_malloc(struct part_soa *parts_soa, int alloc_type, + int count_max_parts_tmp); /*Function to be overloaded using different part_soa structs * and allocate their internal arrays * alloc_type 0 for density, 1 for force, 2 for gradient*/ -void device_malloc(struct part_soa d_parts_soa, int alloc_type, int count_max_parts_tmp); - -void device_malloc_dirty(int **d_tid_p, long long **d_id, double **d_x_p, double **d_y_p, double **d_z_p, - float **d_ux, float **d_uy, float **d_uz, float **d_a_hydrox, float **d_a_hydroy, float **d_a_hydroz, - float **d_mass, float **d_h ,float **d_u, float **d_u_dt, float **d_rho, float **d_locx, float **d_locy, - float **d_locz, float **d_widthx, float **d_widthy, float **d_widthz, float **d_h_max, int **d_count_p, - float **d_wcount, float **d_wcount_dh, float **d_rho_dh, float **d_rot_ux, float **d_rot_uy, float **d_rot_uz, - float **d_div_v, float **d_div_v_previous_step, float **d_alpha_visc, float **d_v_sig, float **d_laplace_u, - float **d_alpha_diff, float **d_f, 
float **d_soundspeed, float **d_h_dt, float **d_balsara,float **d_pressure, - float **d_alpha_visc_max_ngb, timebin_t **d_time_bin, timebin_t **d_wakeup, timebin_t **d_min_ngb_time_bin, - char **d_to_be_synchronized, int count_max_parts_tmp); +void device_malloc(struct part_soa d_parts_soa, int alloc_type, + int count_max_parts_tmp); -void device_malloc_test(int **tid_test, int count_max_parts_tmp); +void device_malloc_dirty( + int **d_tid_p, long long **d_id, double **d_x_p, double **d_y_p, + double **d_z_p, float **d_ux, float **d_uy, float **d_uz, + float **d_a_hydrox, float **d_a_hydroy, float **d_a_hydroz, float **d_mass, + float **d_h, float **d_u, float **d_u_dt, float **d_rho, float **d_locx, + float **d_locy, float **d_locz, float **d_widthx, float **d_widthy, + float **d_widthz, float **d_h_max, int **d_count_p, float **d_wcount, + float **d_wcount_dh, float **d_rho_dh, float **d_rot_ux, float **d_rot_uy, + float **d_rot_uz, float **d_div_v, float **d_div_v_previous_step, + float **d_alpha_visc, float **d_v_sig, float **d_laplace_u, + float **d_alpha_diff, float **d_f, float **d_soundspeed, float **d_h_dt, + float **d_balsara, float **d_pressure, float **d_alpha_visc_max_ngb, + timebin_t **d_time_bin, timebin_t **d_wakeup, + timebin_t **d_min_ngb_time_bin, char **d_to_be_synchronized, + int count_max_parts_tmp); +void device_malloc_test(int **tid_test, int count_max_parts_tmp); diff --git a/src/files_for_new_functions/host_device_data_transfer.cu b/src/files_for_new_functions/host_device_data_transfer.cu old mode 100755 new mode 100644 index 238e345e5f..63748e7e18 --- a/src/files_for_new_functions/host_device_data_transfer.cu +++ b/src/files_for_new_functions/host_device_data_transfer.cu @@ -1,4 +1,5 @@ #include "cuda/part_gpu.h" + #include #include #include @@ -8,520 +9,556 @@ extern "C" { #endif -void host2device_test(int *d_tid_p, int *tid_h, int count_max_parts_tmp){ -// int * tid_h; -// cudaMallocHost((void **)&tid_h, -// count_max_parts_tmp * sizeof(int)); - for (int i =0; i< count_max_parts_tmp; i++){ - tid_h[i] = 100; -// fprintf(stderr,"tid_h %i\n", tid_h[i]); - } - - cudaMemcpy(d_tid_p, tid_h, count_max_parts_tmp * sizeof(int), cudaMemcpyHostToDevice); - cudaDeviceSynchronize(); -// cudaFree(tid_h); +void host2device_test(int *d_tid_p, int *tid_h, int count_max_parts_tmp) { + // int * tid_h; + // cudaMallocHost((void **)&tid_h, + // count_max_parts_tmp * sizeof(int)); + for (int i = 0; i < count_max_parts_tmp; i++) { + tid_h[i] = 100; + // fprintf(stderr,"tid_h %i\n", tid_h[i]); + } + + cudaMemcpy(d_tid_p, tid_h, count_max_parts_tmp * sizeof(int), + cudaMemcpyHostToDevice); + cudaDeviceSynchronize(); + // cudaFree(tid_h); } -void device2host_test(struct part_soa parts_soa, int *tid_h, int count_max_parts_tmp){ - int *tid_p = parts_soa.tid_p; - cudaMemcpy(tid_h, tid_p, count_max_parts_tmp * sizeof(int), cudaMemcpyDeviceToHost); - for (int i =0; i< count_max_parts_tmp; i++){ - fprintf(stderr,"tid is %i\n", tid_h[i]); - } +void device2host_test(struct part_soa parts_soa, int *tid_h, + int count_max_parts_tmp) { + int *tid_p = parts_soa.tid_p; + cudaMemcpy(tid_h, tid_p, count_max_parts_tmp * sizeof(int), + cudaMemcpyDeviceToHost); + for (int i = 0; i < count_max_parts_tmp; i++) { + fprintf(stderr, "tid is %i\n", tid_h[i]); + } } -void device2device_test(int *tid_p, struct part_soa parts_soa, int count_max_parts_tmp){ - cudaMemcpy(tid_p, parts_soa.tid_p, sizeof(int *), cudaMemcpyHostToDevice); +void device2device_test(int *tid_p, struct part_soa parts_soa, + int 
count_max_parts_tmp) { + cudaMemcpy(tid_p, parts_soa.tid_p, sizeof(int *), cudaMemcpyHostToDevice); } /*Function to be overloaded using different part_soa structs * and allocate their internal arrays * alloc_type 0 for density, 1 for force, 2 for gradient*/ -void host_device_test(int *d_tid_p, int *tid_h, int count_max_parts_tmp){ - - host2device_test(d_tid_p, tid_h, count_max_parts_tmp); +void host_device_test(int *d_tid_p, int *tid_h, int count_max_parts_tmp) { + host2device_test(d_tid_p, tid_h, count_max_parts_tmp); } -void device_host_test(struct part_soa parts_soa, int *tid_h, int count_max_parts_tmp){ - - device2host_test(parts_soa, tid_h, count_max_parts_tmp); +void device_host_test(struct part_soa parts_soa, int *tid_h, + int count_max_parts_tmp) { + device2host_test(parts_soa, tid_h, count_max_parts_tmp); } -void device_device_test(int *tid_p, struct part_soa parts_soa, int count_max_parts_tmp){ - - device2device_test(tid_p, parts_soa, count_max_parts_tmp); +void device_device_test(int *tid_p, struct part_soa parts_soa, + int count_max_parts_tmp) { + device2device_test(tid_p, parts_soa, count_max_parts_tmp); } -void device2host_density(struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p, - float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, - float *mass, float *h ,float *u, float *u_dt, float *rho, float *locx, float *locy, - float *locz, float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, - float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, float *rot_uy, float *rot_uz, - float *div_v, float *div_v_previous_step, float *alpha_visc, float *v_sig, float *laplace_u, - float *alpha_diff, float *f, float *soundspeed, float *h_dt, float *balsara,float *pressure, - float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, - char *to_be_synchronized, int count_max_parts_tmp){ - cudaMemcpy(parts_soa_buffer.tid_p, tid_p, count_max_parts_tmp * sizeof(int), cudaMemcpyDeviceToHost); +void device2host_density( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, + int count_max_parts_tmp) { + cudaMemcpy(parts_soa_buffer.tid_p, tid_p, count_max_parts_tmp * sizeof(int), + cudaMemcpyDeviceToHost); } -void device_host_cpy(struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p, - float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, - float *mass, float *h ,float *u, float *u_dt, float *rho, float *locx, float *locy, - float *locz, float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, - float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, float *rot_uy, float *rot_uz, - float *div_v, float *div_v_previous_step, float *alpha_visc, float 
*v_sig, float *laplace_u, - float *alpha_diff, float *f, float *soundspeed, float *h_dt, float *balsara,float *pressure, - float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, - char *to_be_synchronized, int count_max_parts_tmp){ - - device2host_density(parts_soa_buffer, tid_p, id, x_p, y_p, z_p, - ux, uy, uz, a_hydrox, a_hydroy, a_hydroz, - mass, h , u, u_dt, rho, locx, locy, - locz, widthx, widthy, widthz, h_max, count_p, - wcount, wcount_dh, rho_dh, rot_ux, rot_uy, rot_uz, - div_v, div_v_previous_step, alpha_visc, v_sig, laplace_u, - alpha_diff, f, soundspeed, h_dt, balsara, pressure, - alpha_visc_max_ngb, time_bin, wakeup, min_ngb_time_bin, - to_be_synchronized, count_max_parts_tmp); - +void device_host_cpy( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, + int count_max_parts_tmp) { + + device2host_density(parts_soa_buffer, tid_p, id, x_p, y_p, z_p, ux, uy, uz, + a_hydrox, a_hydroy, a_hydroz, mass, h, u, u_dt, rho, locx, + locy, locz, widthx, widthy, widthz, h_max, count_p, + wcount, wcount_dh, rho_dh, rot_ux, rot_uy, rot_uz, div_v, + div_v_previous_step, alpha_visc, v_sig, laplace_u, + alpha_diff, f, soundspeed, h_dt, balsara, pressure, + alpha_visc_max_ngb, time_bin, wakeup, min_ngb_time_bin, + to_be_synchronized, count_max_parts_tmp); } -void device2device_density(struct part_soa *parts_soa_buffer, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p, - float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, - float *mass, float *h ,float *u, float *u_dt, float *rho, float *locx, float *locy, - float *locz, float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, - float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, float *rot_uy, float *rot_uz, - float *div_v, float *div_v_previous_step, float *alpha_visc, float *v_sig, float *laplace_u, - float *alpha_diff, float *f, float *soundspeed, float *h_dt, float *balsara,float *pressure, - float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, - char *to_be_synchronized, int count_max_parts_tmp, cudaStream_t stream){ - - cudaMemcpyAsync(&(parts_soa_buffer->tid_p), &tid_p, - sizeof(int *), cudaMemcpyHostToDevice, stream); - cudaMemcpyAsync(&(parts_soa_buffer->locx), &locx, - sizeof(float *), cudaMemcpyHostToDevice, stream); - cudaMemcpyAsync(&(parts_soa_buffer->locy), &locy, - sizeof(float *), cudaMemcpyHostToDevice, stream); - cudaMemcpyAsync(&(parts_soa_buffer->locz), &locz, - sizeof(float *), cudaMemcpyHostToDevice, stream); - cudaMemcpyAsync(&(parts_soa_buffer->h), &h, - sizeof(float *), cudaMemcpyHostToDevice, stream); - cudaMemcpyAsync(&(parts_soa_buffer->mass), &mass, - sizeof(float *), cudaMemcpyHostToDevice, stream); - cudaMemcpyAsync(&(parts_soa_buffer->x_p), &x_p, 
- sizeof(double *), cudaMemcpyHostToDevice, stream); - cudaMemcpyAsync(&(parts_soa_buffer->y_p), &y_p, - sizeof(double *), cudaMemcpyHostToDevice, stream); - cudaMemcpyAsync(&(parts_soa_buffer->z_p), &z_p, - sizeof(double *), cudaMemcpyHostToDevice, stream); - cudaMemcpyAsync(&(parts_soa_buffer->ux), &ux, - sizeof(float *), cudaMemcpyHostToDevice, stream); - cudaMemcpyAsync(&(parts_soa_buffer->uy), &uy, - sizeof(float *), cudaMemcpyHostToDevice, stream); - cudaMemcpyAsync(&(parts_soa_buffer->uz), &uz, - sizeof(float *), cudaMemcpyHostToDevice, stream); - cudaMemcpyAsync(&(parts_soa_buffer->time_bin), &time_bin, - sizeof(timebin_t *), cudaMemcpyHostToDevice, stream); - +void device2device_density( + struct part_soa *parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, + int count_max_parts_tmp, cudaStream_t stream) { + + cudaMemcpyAsync(&(parts_soa_buffer->tid_p), &tid_p, sizeof(int *), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&(parts_soa_buffer->locx), &locx, sizeof(float *), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&(parts_soa_buffer->locy), &locy, sizeof(float *), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&(parts_soa_buffer->locz), &locz, sizeof(float *), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&(parts_soa_buffer->h), &h, sizeof(float *), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&(parts_soa_buffer->mass), &mass, sizeof(float *), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&(parts_soa_buffer->x_p), &x_p, sizeof(double *), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&(parts_soa_buffer->y_p), &y_p, sizeof(double *), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&(parts_soa_buffer->z_p), &z_p, sizeof(double *), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&(parts_soa_buffer->ux), &ux, sizeof(float *), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&(parts_soa_buffer->uy), &uy, sizeof(float *), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&(parts_soa_buffer->uz), &uz, sizeof(float *), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&(parts_soa_buffer->time_bin), &time_bin, sizeof(timebin_t *), + cudaMemcpyHostToDevice, stream); } - -void host2device_density(struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p, - float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, - float *mass, float *h ,float *u, float *u_dt, float *rho, float *locx, float *locy, - float *locz, float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, - float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, float *rot_uy, float *rot_uz, - float *div_v, float *div_v_previous_step, float *alpha_visc, float *v_sig, float *laplace_u, - float *alpha_diff, float *f, float *soundspeed, float *h_dt, float *balsara,float *pressure, 
- float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, - char *to_be_synchronized, int count_max_parts_tmp){ +void host2device_density( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, + int count_max_parts_tmp) { cudaError_t cu_error; cudaMemcpy(&tid_p, &(parts_soa_buffer.tid_p), - count_max_parts_tmp * sizeof(int), cudaMemcpyHostToDevice); + count_max_parts_tmp * sizeof(int), cudaMemcpyHostToDevice); } /*Function to be overloaded using different part_soa structs * and allocate their internal arrays * alloc_type 0 for density, 1 for force, 2 for gradient*/ -void host_device_cpy(struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p, - float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, - float *mass, float *h ,float *u, float *u_dt, float *rho, float *locx, float *locy, - float *locz, float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, - float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, float *rot_uy, float *rot_uz, - float *div_v, float *div_v_previous_step, float *alpha_visc, float *v_sig, float *laplace_u, - float *alpha_diff, float *f, float *soundspeed, float *h_dt, float *balsara,float *pressure, - float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, - char *to_be_synchronized, int count_max_parts_tmp){ - - host2device_density(parts_soa_buffer, tid_p, id, x_p, y_p, z_p, - ux, uy, uz, a_hydrox, a_hydroy, a_hydroz, - mass, h , u, u_dt, rho, locx, locy, - locz, widthx, widthy, widthz, h_max, count_p, - wcount, wcount_dh, rho_dh, rot_ux, rot_uy, rot_uz, - div_v, div_v_previous_step, alpha_visc, v_sig, laplace_u, - alpha_diff, f, soundspeed, h_dt, balsara, pressure, - alpha_visc_max_ngb, time_bin, wakeup, min_ngb_time_bin, - to_be_synchronized, count_max_parts_tmp); - +void host_device_cpy( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, + int count_max_parts_tmp) { + + host2device_density(parts_soa_buffer, tid_p, id, x_p, y_p, z_p, ux, uy, uz, + a_hydrox, 
a_hydroy, a_hydroz, mass, h, u, u_dt, rho, locx, + locy, locz, widthx, widthy, widthz, h_max, count_p, + wcount, wcount_dh, rho_dh, rot_ux, rot_uy, rot_uz, div_v, + div_v_previous_step, alpha_visc, v_sig, laplace_u, + alpha_diff, f, soundspeed, h_dt, balsara, pressure, + alpha_visc_max_ngb, time_bin, wakeup, min_ngb_time_bin, + to_be_synchronized, count_max_parts_tmp); } /*Function to be overloaded using different part_soa structs * and allocate their internal arrays * alloc_type 0 for density, 1 for force, 2 for gradient*/ -void device_device_bind(struct part_soa *parts_soa_buffer, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p, - float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, - float *mass, float *h ,float *u, float *u_dt, float *rho, float *locx, float *locy, - float *locz, float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, - float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, float *rot_uy, float *rot_uz, - float *div_v, float *div_v_previous_step, float *alpha_visc, float *v_sig, float *laplace_u, - float *alpha_diff, float *f, float *soundspeed, float *h_dt, float *balsara,float *pressure, - float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, - char *to_be_synchronized, int count_max_parts_tmp, cudaStream_t stream){ - - device2device_density(parts_soa_buffer, tid_p, id, x_p, y_p, z_p, - ux, uy, uz, a_hydrox, a_hydroy, a_hydroz, - mass, h , u, u_dt, rho, locx, locy, - locz, widthx, widthy, widthz, h_max, count_p, - wcount, wcount_dh, rho_dh, rot_ux, rot_uy, rot_uz, - div_v, div_v_previous_step, alpha_visc, v_sig, laplace_u, - alpha_diff, f, soundspeed, h_dt, balsara, pressure, - alpha_visc_max_ngb, time_bin, wakeup, min_ngb_time_bin, - to_be_synchronized, count_max_parts_tmp, stream); - +void device_device_bind( + struct part_soa *parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, + int count_max_parts_tmp, cudaStream_t stream) { + + device2device_density( + parts_soa_buffer, tid_p, id, x_p, y_p, z_p, ux, uy, uz, a_hydrox, + a_hydroy, a_hydroz, mass, h, u, u_dt, rho, locx, locy, locz, widthx, + widthy, widthz, h_max, count_p, wcount, wcount_dh, rho_dh, rot_ux, rot_uy, + rot_uz, div_v, div_v_previous_step, alpha_visc, v_sig, laplace_u, + alpha_diff, f, soundspeed, h_dt, balsara, pressure, alpha_visc_max_ngb, + time_bin, wakeup, min_ngb_time_bin, to_be_synchronized, + count_max_parts_tmp, stream); } -void host2device_async_density(struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p, - float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, - float *mass, float *h ,float *u, float *u_dt, float *rho, float *locx, float *locy, - float *locz, float *widthx, float *widthy, float *widthz, float 
*h_max, int *count_p, - float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, float *rot_uy, float *rot_uz, - float *div_v, float *div_v_previous_step, float *alpha_visc, float *v_sig, float *laplace_u, - float *alpha_diff, float *f, float *soundspeed, float *h_dt, float *balsara,float *pressure, - float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, - char *to_be_synchronized, int first_part_tmp, int bundle_n_parts, cudaStream_t stream){ +void host2device_async_density( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, int first_part_tmp, + int bundle_n_parts, cudaStream_t stream) { cudaError_t cu_error; - cudaMemcpyAsync(&tid_p[first_part_tmp], &(parts_soa_buffer.tid_p[first_part_tmp]), - bundle_n_parts * sizeof(int), cudaMemcpyHostToDevice, - stream); - cudaMemcpyAsync(&locx[first_part_tmp], &(parts_soa_buffer.locx[first_part_tmp]), - bundle_n_parts * sizeof(float), - cudaMemcpyHostToDevice, stream); - cudaMemcpyAsync(&locy[first_part_tmp], &(parts_soa_buffer.locy[first_part_tmp]), - bundle_n_parts * sizeof(float), - cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(&tid_p[first_part_tmp], + &(parts_soa_buffer.tid_p[first_part_tmp]), + bundle_n_parts * sizeof(int), cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync( + &locx[first_part_tmp], &(parts_soa_buffer.locx[first_part_tmp]), + bundle_n_parts * sizeof(float), cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync( + &locy[first_part_tmp], &(parts_soa_buffer.locy[first_part_tmp]), + bundle_n_parts * sizeof(float), cudaMemcpyHostToDevice, stream); cudaMemcpyAsync(&locz[first_part_tmp], &parts_soa_buffer.locz[first_part_tmp], - bundle_n_parts * sizeof(float), - cudaMemcpyHostToDevice, stream); + bundle_n_parts * sizeof(float), cudaMemcpyHostToDevice, + stream); cudaMemcpyAsync(&h[first_part_tmp], &parts_soa_buffer.h[first_part_tmp], - bundle_n_parts * sizeof(float), - cudaMemcpyHostToDevice, stream); + bundle_n_parts * sizeof(float), cudaMemcpyHostToDevice, + stream); cudaMemcpyAsync(&mass[first_part_tmp], &parts_soa_buffer.mass[first_part_tmp], - bundle_n_parts * sizeof(float), - cudaMemcpyHostToDevice, stream); + bundle_n_parts * sizeof(float), cudaMemcpyHostToDevice, + stream); cudaMemcpyAsync(&x_p[first_part_tmp], &parts_soa_buffer.x_p[first_part_tmp], - bundle_n_parts * sizeof(double), - cudaMemcpyHostToDevice, stream); + bundle_n_parts * sizeof(double), cudaMemcpyHostToDevice, + stream); cudaMemcpyAsync(&y_p[first_part_tmp], &parts_soa_buffer.y_p[first_part_tmp], - bundle_n_parts * sizeof(double), - cudaMemcpyHostToDevice, stream); + bundle_n_parts * sizeof(double), cudaMemcpyHostToDevice, + stream); cudaMemcpyAsync(&z_p[first_part_tmp], &parts_soa_buffer.z_p[first_part_tmp], - bundle_n_parts * sizeof(double), - cudaMemcpyHostToDevice, stream); + bundle_n_parts * 
sizeof(double), cudaMemcpyHostToDevice, + stream); cudaMemcpyAsync(&ux[first_part_tmp], &parts_soa_buffer.ux[first_part_tmp], - bundle_n_parts * sizeof(float), - cudaMemcpyHostToDevice, stream); + bundle_n_parts * sizeof(float), cudaMemcpyHostToDevice, + stream); cudaMemcpyAsync(&uy[first_part_tmp], &parts_soa_buffer.uy[first_part_tmp], - bundle_n_parts * sizeof(float), - cudaMemcpyHostToDevice, stream); + bundle_n_parts * sizeof(float), cudaMemcpyHostToDevice, + stream); cudaMemcpyAsync(&uz[first_part_tmp], &parts_soa_buffer.uz[first_part_tmp], - bundle_n_parts * sizeof(float), - cudaMemcpyHostToDevice, stream); - cudaMemcpyAsync(&time_bin[first_part_tmp], - &parts_soa_buffer.time_bin[first_part_tmp], - bundle_n_parts * sizeof(timebin_t), - cudaMemcpyHostToDevice, stream); + bundle_n_parts * sizeof(float), cudaMemcpyHostToDevice, + stream); + cudaMemcpyAsync( + &time_bin[first_part_tmp], &parts_soa_buffer.time_bin[first_part_tmp], + bundle_n_parts * sizeof(timebin_t), cudaMemcpyHostToDevice, stream); } -void host2device_async_density_pair(struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p, - float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, - float *mass, float *h ,float *u, float *u_dt, float *rho, float *locx, float *locy, - float *locz, float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, - float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, float *rot_uy, float *rot_uz, - float *div_v, float *div_v_previous_step, float *alpha_visc, float *v_sig, float *laplace_u, - float *alpha_diff, float *f, float *soundspeed, float *h_dt, float *balsara,float *pressure, - float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, - char *to_be_synchronized, int first_part_tmp, int bundle_n_parts, cudaStream_t stream){ - -// int bundle_n_parts = bundle_n_parts_i + bundle_n_parts_j; +void host2device_async_density_pair( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, int first_part_tmp, + int bundle_n_parts, cudaStream_t stream) { + + // int bundle_n_parts = bundle_n_parts_i + bundle_n_parts_j; cudaError_t cu_error; -// cudaMemcpyAsync(&tid_p[first_part_tmp], &(parts_soa_buffer.tid_p[first_part_tmp]), -// bundle_n_parts * sizeof(int), cudaMemcpyHostToDevice, -// stream); -// cudaMemcpyAsync(&locx[first_part_tmp], &(parts_soa_buffer.locx[first_part_tmp]), -// bundle_n_parts * sizeof(float), -// cudaMemcpyHostToDevice, stream); -// cudaMemcpyAsync(&locy[first_part_tmp], &(parts_soa_buffer.locy[first_part_tmp]), -// bundle_n_parts * sizeof(float), -// cudaMemcpyHostToDevice, stream); -// cudaMemcpyAsync(&locz[first_part_tmp], &parts_soa_buffer.locz[first_part_tmp], -// bundle_n_parts * sizeof(float), -// cudaMemcpyHostToDevice, stream); + 
// cudaMemcpyAsync(&tid_p[first_part_tmp], + // &(parts_soa_buffer.tid_p[first_part_tmp]), + // bundle_n_parts * sizeof(int), + //cudaMemcpyHostToDevice, stream); + // cudaMemcpyAsync(&locx[first_part_tmp], + // &(parts_soa_buffer.locx[first_part_tmp]), + // bundle_n_parts * sizeof(float), + // cudaMemcpyHostToDevice, stream); + // cudaMemcpyAsync(&locy[first_part_tmp], + // &(parts_soa_buffer.locy[first_part_tmp]), + // bundle_n_parts * sizeof(float), + // cudaMemcpyHostToDevice, stream); + // cudaMemcpyAsync(&locz[first_part_tmp], + // &parts_soa_buffer.locz[first_part_tmp], + // bundle_n_parts * sizeof(float), + // cudaMemcpyHostToDevice, stream); cudaMemcpyAsync(&h[first_part_tmp], &parts_soa_buffer.h[first_part_tmp], - bundle_n_parts * sizeof(float), - cudaMemcpyHostToDevice, stream); + bundle_n_parts * sizeof(float), cudaMemcpyHostToDevice, + stream); cudaMemcpyAsync(&mass[first_part_tmp], &parts_soa_buffer.mass[first_part_tmp], - bundle_n_parts * sizeof(float), - cudaMemcpyHostToDevice, stream); + bundle_n_parts * sizeof(float), cudaMemcpyHostToDevice, + stream); cudaMemcpyAsync(&x_p[first_part_tmp], &parts_soa_buffer.x_p[first_part_tmp], - bundle_n_parts * sizeof(double), - cudaMemcpyHostToDevice, stream); + bundle_n_parts * sizeof(double), cudaMemcpyHostToDevice, + stream); cudaMemcpyAsync(&y_p[first_part_tmp], &parts_soa_buffer.y_p[first_part_tmp], - bundle_n_parts * sizeof(double), - cudaMemcpyHostToDevice, stream); + bundle_n_parts * sizeof(double), cudaMemcpyHostToDevice, + stream); cudaMemcpyAsync(&z_p[first_part_tmp], &parts_soa_buffer.z_p[first_part_tmp], - bundle_n_parts * sizeof(double), - cudaMemcpyHostToDevice, stream); + bundle_n_parts * sizeof(double), cudaMemcpyHostToDevice, + stream); cudaMemcpyAsync(&ux[first_part_tmp], &parts_soa_buffer.ux[first_part_tmp], - bundle_n_parts * sizeof(float), - cudaMemcpyHostToDevice, stream); + bundle_n_parts * sizeof(float), cudaMemcpyHostToDevice, + stream); cudaMemcpyAsync(&uy[first_part_tmp], &parts_soa_buffer.uy[first_part_tmp], - bundle_n_parts * sizeof(float), - cudaMemcpyHostToDevice, stream); + bundle_n_parts * sizeof(float), cudaMemcpyHostToDevice, + stream); cudaMemcpyAsync(&uz[first_part_tmp], &parts_soa_buffer.uz[first_part_tmp], - bundle_n_parts * sizeof(float), - cudaMemcpyHostToDevice, stream); - cudaMemcpyAsync(&time_bin[first_part_tmp], - &parts_soa_buffer.time_bin[first_part_tmp], - bundle_n_parts * sizeof(timebin_t), - cudaMemcpyHostToDevice, stream); + bundle_n_parts * sizeof(float), cudaMemcpyHostToDevice, + stream); + cudaMemcpyAsync( + &time_bin[first_part_tmp], &parts_soa_buffer.time_bin[first_part_tmp], + bundle_n_parts * sizeof(timebin_t), cudaMemcpyHostToDevice, stream); } /*Function to be overloaded using different part_soa structs * and allocate their internal arrays * alloc_type 0 for density, 1 for force, 2 for gradient*/ -void host_device_async_cpy(struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p, - float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, - float *mass, float *h ,float *u, float *u_dt, float *rho, float *locx, float *locy, - float *locz, float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, - float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, float *rot_uy, float *rot_uz, - float *div_v, float *div_v_previous_step, float *alpha_visc, float *v_sig, float *laplace_u, - float *alpha_diff, float *f, float *soundspeed, float *h_dt, float *balsara,float *pressure, - float 
*alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, - char *to_be_synchronized, int first_part_tmp, int bundle_n_parts, cudaStream_t stream){ - - host2device_async_density(parts_soa_buffer, tid_p, id, x_p, y_p, z_p, - ux, uy, uz, a_hydrox, a_hydroy, a_hydroz, - mass, h , u, u_dt, rho, locx, locy, - locz, widthx, widthy, widthz, h_max, count_p, - wcount, wcount_dh, rho_dh, rot_ux, rot_uy, rot_uz, - div_v, div_v_previous_step, alpha_visc, v_sig, laplace_u, - alpha_diff, f, soundspeed, h_dt, balsara, pressure, - alpha_visc_max_ngb, time_bin, wakeup, min_ngb_time_bin, - to_be_synchronized, first_part_tmp, bundle_n_parts, stream); - +void host_device_async_cpy( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, int first_part_tmp, + int bundle_n_parts, cudaStream_t stream) { + + host2device_async_density( + parts_soa_buffer, tid_p, id, x_p, y_p, z_p, ux, uy, uz, a_hydrox, + a_hydroy, a_hydroz, mass, h, u, u_dt, rho, locx, locy, locz, widthx, + widthy, widthz, h_max, count_p, wcount, wcount_dh, rho_dh, rot_ux, rot_uy, + rot_uz, div_v, div_v_previous_step, alpha_visc, v_sig, laplace_u, + alpha_diff, f, soundspeed, h_dt, balsara, pressure, alpha_visc_max_ngb, + time_bin, wakeup, min_ngb_time_bin, to_be_synchronized, first_part_tmp, + bundle_n_parts, stream); } /*Function to be overloaded using different part_soa structs * and allocate their internal arrays * alloc_type 0 for density, 1 for force, 2 for gradient*/ -void host_device_async_cpy_pair(struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p, - float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, - float *mass, float *h ,float *u, float *u_dt, float *rho, float *locx, float *locy, - float *locz, float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, - float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, float *rot_uy, float *rot_uz, - float *div_v, float *div_v_previous_step, float *alpha_visc, float *v_sig, float *laplace_u, - float *alpha_diff, float *f, float *soundspeed, float *h_dt, float *balsara,float *pressure, - float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, - char *to_be_synchronized, int first_part_tmp_i, int bundle_n_parts, cudaStream_t stream){ - - host2device_async_density_pair(parts_soa_buffer, tid_p, id, x_p, y_p, z_p, - ux, uy, uz, a_hydrox, a_hydroy, a_hydroz, - mass, h , u, u_dt, rho, locx, locy, - locz, widthx, widthy, widthz, h_max, count_p, - wcount, wcount_dh, rho_dh, rot_ux, rot_uy, rot_uz, - div_v, div_v_previous_step, alpha_visc, v_sig, laplace_u, - alpha_diff, f, soundspeed, h_dt, balsara, pressure, - alpha_visc_max_ngb, time_bin, wakeup, min_ngb_time_bin, - to_be_synchronized, first_part_tmp_i, bundle_n_parts, - 
stream); - +void host_device_async_cpy_pair( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, int first_part_tmp_i, + int bundle_n_parts, cudaStream_t stream) { + + host2device_async_density_pair( + parts_soa_buffer, tid_p, id, x_p, y_p, z_p, ux, uy, uz, a_hydrox, + a_hydroy, a_hydroz, mass, h, u, u_dt, rho, locx, locy, locz, widthx, + widthy, widthz, h_max, count_p, wcount, wcount_dh, rho_dh, rot_ux, rot_uy, + rot_uz, div_v, div_v_previous_step, alpha_visc, v_sig, laplace_u, + alpha_diff, f, soundspeed, h_dt, balsara, pressure, alpha_visc_max_ngb, + time_bin, wakeup, min_ngb_time_bin, to_be_synchronized, first_part_tmp_i, + bundle_n_parts, stream); } -void device2host_async_density(struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p, - float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, - float *mass, float *h ,float *u, float *u_dt, float *rho, float *locx, float *locy, - float *locz, float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, - float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, float *rot_uy, float *rot_uz, - float *div_v, float *div_v_previous_step, float *alpha_visc, float *v_sig, float *laplace_u, - float *alpha_diff, float *f, float *soundspeed, float *h_dt, float *balsara,float *pressure, - float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, - char *to_be_synchronized, int first_part_tmp, int bundle_n_parts, cudaStream_t stream){ +void device2host_async_density( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, int first_part_tmp, + int bundle_n_parts, cudaStream_t stream) { cudaError_t cu_error; cudaMemcpyAsync(&parts_soa_buffer.rho[first_part_tmp], &rho[first_part_tmp], - bundle_n_parts * sizeof(float), - cudaMemcpyDeviceToHost, stream); + bundle_n_parts * sizeof(float), cudaMemcpyDeviceToHost, + stream); cudaMemcpyAsync(&parts_soa_buffer.rho_dh[first_part_tmp], - &rho_dh[first_part_tmp], - bundle_n_parts * sizeof(float), - cudaMemcpyDeviceToHost, stream); + &rho_dh[first_part_tmp], 
bundle_n_parts * sizeof(float), + cudaMemcpyDeviceToHost, stream); cudaMemcpyAsync(&parts_soa_buffer.wcount[first_part_tmp], - &wcount[first_part_tmp], - bundle_n_parts * sizeof(float), - cudaMemcpyDeviceToHost, stream); + &wcount[first_part_tmp], bundle_n_parts * sizeof(float), + cudaMemcpyDeviceToHost, stream); cudaMemcpyAsync(&parts_soa_buffer.wcount_dh[first_part_tmp], - &wcount_dh[first_part_tmp], - bundle_n_parts * sizeof(float), - cudaMemcpyDeviceToHost, stream); - cudaMemcpyAsync(&parts_soa_buffer.div_v[first_part_tmp], &div_v[first_part_tmp], - bundle_n_parts * sizeof(float), - cudaMemcpyDeviceToHost, stream); + &wcount_dh[first_part_tmp], bundle_n_parts * sizeof(float), + cudaMemcpyDeviceToHost, stream); + cudaMemcpyAsync(&parts_soa_buffer.div_v[first_part_tmp], + &div_v[first_part_tmp], bundle_n_parts * sizeof(float), + cudaMemcpyDeviceToHost, stream); cudaMemcpyAsync(&parts_soa_buffer.rot_ux[first_part_tmp], - &rot_ux[first_part_tmp], - bundle_n_parts * sizeof(float), - cudaMemcpyDeviceToHost, stream); + &rot_ux[first_part_tmp], bundle_n_parts * sizeof(float), + cudaMemcpyDeviceToHost, stream); cudaMemcpyAsync(&parts_soa_buffer.rot_uy[first_part_tmp], - &rot_uy[first_part_tmp], - bundle_n_parts * sizeof(float), - cudaMemcpyDeviceToHost, stream); + &rot_uy[first_part_tmp], bundle_n_parts * sizeof(float), + cudaMemcpyDeviceToHost, stream); cudaMemcpyAsync(&parts_soa_buffer.rot_uz[first_part_tmp], - &rot_uz[first_part_tmp], - bundle_n_parts * sizeof(float), - cudaMemcpyDeviceToHost, stream); + &rot_uz[first_part_tmp], bundle_n_parts * sizeof(float), + cudaMemcpyDeviceToHost, stream); } -void device2host_async_density_pair(struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p, - float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, - float *mass, float *h ,float *u, float *u_dt, float *rho, float *locx, float *locy, - float *locz, float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, - float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, float *rot_uy, float *rot_uz, - float *div_v, float *div_v_previous_step, float *alpha_visc, float *v_sig, float *laplace_u, - float *alpha_diff, float *f, float *soundspeed, float *h_dt, float *balsara,float *pressure, - float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, - char *to_be_synchronized, int first_part_tmp, int bundle_n_parts, cudaStream_t stream){ +void device2host_async_density_pair( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, int first_part_tmp, + int bundle_n_parts, cudaStream_t stream) { cudaError_t cu_error; -// fprintf(stderr, "parts i %i parts j %i\n", bundle_n_parts_i, bundle_n_parts_j); -// int bundle_n_parts = bundle_n_parts_i + bundle_n_parts_j; + // 
fprintf(stderr, "parts i %i parts j %i\n", bundle_n_parts_i, + // bundle_n_parts_j); int bundle_n_parts = bundle_n_parts_i + + // bundle_n_parts_j; cudaMemcpyAsync(&parts_soa_buffer.rho[first_part_tmp], &rho[first_part_tmp], - bundle_n_parts * sizeof(float), - cudaMemcpyDeviceToHost, stream); + bundle_n_parts * sizeof(float), cudaMemcpyDeviceToHost, + stream); cudaMemcpyAsync(&parts_soa_buffer.rho_dh[first_part_tmp], - &rho_dh[first_part_tmp], - bundle_n_parts * sizeof(float), - cudaMemcpyDeviceToHost, stream); + &rho_dh[first_part_tmp], bundle_n_parts * sizeof(float), + cudaMemcpyDeviceToHost, stream); cudaMemcpyAsync(&parts_soa_buffer.wcount[first_part_tmp], - &wcount[first_part_tmp], - bundle_n_parts * sizeof(float), - cudaMemcpyDeviceToHost, stream); + &wcount[first_part_tmp], bundle_n_parts * sizeof(float), + cudaMemcpyDeviceToHost, stream); cudaMemcpyAsync(&parts_soa_buffer.wcount_dh[first_part_tmp], - &wcount_dh[first_part_tmp], - bundle_n_parts * sizeof(float), - cudaMemcpyDeviceToHost, stream); - cudaMemcpyAsync(&parts_soa_buffer.div_v[first_part_tmp], &div_v[first_part_tmp], - bundle_n_parts * sizeof(float), - cudaMemcpyDeviceToHost, stream); + &wcount_dh[first_part_tmp], bundle_n_parts * sizeof(float), + cudaMemcpyDeviceToHost, stream); + cudaMemcpyAsync(&parts_soa_buffer.div_v[first_part_tmp], + &div_v[first_part_tmp], bundle_n_parts * sizeof(float), + cudaMemcpyDeviceToHost, stream); cudaMemcpyAsync(&parts_soa_buffer.rot_ux[first_part_tmp], - &rot_ux[first_part_tmp], - bundle_n_parts * sizeof(float), - cudaMemcpyDeviceToHost, stream); + &rot_ux[first_part_tmp], bundle_n_parts * sizeof(float), + cudaMemcpyDeviceToHost, stream); cudaMemcpyAsync(&parts_soa_buffer.rot_uy[first_part_tmp], - &rot_uy[first_part_tmp], - bundle_n_parts * sizeof(float), - cudaMemcpyDeviceToHost, stream); + &rot_uy[first_part_tmp], bundle_n_parts * sizeof(float), + cudaMemcpyDeviceToHost, stream); cudaMemcpyAsync(&parts_soa_buffer.rot_uz[first_part_tmp], - &rot_uz[first_part_tmp], - bundle_n_parts * sizeof(float), - cudaMemcpyDeviceToHost, stream); + &rot_uz[first_part_tmp], bundle_n_parts * sizeof(float), + cudaMemcpyDeviceToHost, stream); } /*Function to be overloaded using different part_soa structs * and allocate their internal arrays * alloc_type 0 for density, 1 for force, 2 for gradient*/ -void device_host_async_cpy(struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p, - float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, - float *mass, float *h ,float *u, float *u_dt, float *rho, float *locx, float *locy, - float *locz, float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, - float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, float *rot_uy, float *rot_uz, - float *div_v, float *div_v_previous_step, float *alpha_visc, float *v_sig, float *laplace_u, - float *alpha_diff, float *f, float *soundspeed, float *h_dt, float *balsara,float *pressure, - float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, - char *to_be_synchronized, int first_part_tmp, int bundle_n_parts, cudaStream_t stream){ - - device2host_async_density(parts_soa_buffer, tid_p, id, x_p, y_p, z_p, - ux, uy, uz, a_hydrox, a_hydroy, a_hydroz, - mass, h , u, u_dt, rho, locx, locy, - locz, widthx, widthy, widthz, h_max, count_p, - wcount, wcount_dh, rho_dh, rot_ux, rot_uy, rot_uz, - div_v, div_v_previous_step, alpha_visc, v_sig, laplace_u, - alpha_diff, f, soundspeed, h_dt, balsara, pressure, 
- alpha_visc_max_ngb, time_bin, wakeup, min_ngb_time_bin, - to_be_synchronized, first_part_tmp, bundle_n_parts, stream); - +void device_host_async_cpy( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, int first_part_tmp, + int bundle_n_parts, cudaStream_t stream) { + + device2host_async_density( + parts_soa_buffer, tid_p, id, x_p, y_p, z_p, ux, uy, uz, a_hydrox, + a_hydroy, a_hydroz, mass, h, u, u_dt, rho, locx, locy, locz, widthx, + widthy, widthz, h_max, count_p, wcount, wcount_dh, rho_dh, rot_ux, rot_uy, + rot_uz, div_v, div_v_previous_step, alpha_visc, v_sig, laplace_u, + alpha_diff, f, soundspeed, h_dt, balsara, pressure, alpha_visc_max_ngb, + time_bin, wakeup, min_ngb_time_bin, to_be_synchronized, first_part_tmp, + bundle_n_parts, stream); } /*Function to be overloaded using different part_soa structs * and allocate their internal arrays * alloc_type 0 for density, 1 for force, 2 for gradient*/ -void device_host_async_cpy_pair(struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p, - float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, - float *mass, float *h ,float *u, float *u_dt, float *rho, float *locx, float *locy, - float *locz, float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, - float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, float *rot_uy, float *rot_uz, - float *div_v, float *div_v_previous_step, float *alpha_visc, float *v_sig, float *laplace_u, - float *alpha_diff, float *f, float *soundspeed, float *h_dt, float *balsara,float *pressure, - float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, - char *to_be_synchronized, int first_part_tmp, int bundle_n_parts, cudaStream_t stream){ - - device2host_async_density_pair(parts_soa_buffer, tid_p, id, x_p, y_p, z_p, - ux, uy, uz, a_hydrox, a_hydroy, a_hydroz, - mass, h , u, u_dt, rho, locx, locy, - locz, widthx, widthy, widthz, h_max, count_p, - wcount, wcount_dh, rho_dh, rot_ux, rot_uy, rot_uz, - div_v, div_v_previous_step, alpha_visc, v_sig, laplace_u, - alpha_diff, f, soundspeed, h_dt, balsara, pressure, - alpha_visc_max_ngb, time_bin, wakeup, min_ngb_time_bin, - to_be_synchronized, first_part_tmp, bundle_n_parts, stream); - +void device_host_async_cpy_pair( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float 
*alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, int first_part_tmp, + int bundle_n_parts, cudaStream_t stream) { + + device2host_async_density_pair( + parts_soa_buffer, tid_p, id, x_p, y_p, z_p, ux, uy, uz, a_hydrox, + a_hydroy, a_hydroz, mass, h, u, u_dt, rho, locx, locy, locz, widthx, + widthy, widthz, h_max, count_p, wcount, wcount_dh, rho_dh, rot_ux, rot_uy, + rot_uz, div_v, div_v_previous_step, alpha_visc, v_sig, laplace_u, + alpha_diff, f, soundspeed, h_dt, balsara, pressure, alpha_visc_max_ngb, + time_bin, wakeup, min_ngb_time_bin, to_be_synchronized, first_part_tmp, + bundle_n_parts, stream); } /*Function to be overloaded using different part_soa structs * and allocate their internal arrays * alloc_type 0 for density, 1 for force, 2 for gradient*/ -void device_device_async_bind(struct part_soa *parts_soa, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p, - float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, - float *mass, float *h ,float *u, float *u_dt, float *rho, float *locx, float *locy, - float *locz, float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, - float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, float *rot_uy, float *rot_uz, - float *div_v, float *div_v_previous_step, float *alpha_visc, float *v_sig, float *laplace_u, - float *alpha_diff, float *f, float *soundspeed, float *h_dt, float *balsara,float *pressure, - float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, - char *to_be_synchronized){ - - parts_soa->tid_p = tid_p; - parts_soa->locx = locx; - parts_soa->locy = locy; - parts_soa->locz = locz; - parts_soa->h = h; - parts_soa->mass = mass; - parts_soa->x_p = x_p; - parts_soa->y_p = y_p; - parts_soa->z_p = z_p; - parts_soa->rho = rho; - parts_soa->rho_dh = rho_dh; - parts_soa->wcount = wcount; - parts_soa->wcount_dh = wcount_dh; - parts_soa->ux = ux; - parts_soa->uy = uy; - parts_soa->uz = uz; - parts_soa->div_v = div_v; - parts_soa->rot_ux = rot_ux; - parts_soa->rot_uy = rot_uy; - parts_soa->rot_uz = rot_uz; - parts_soa->time_bin = time_bin; - +void device_device_async_bind( + struct part_soa *parts_soa, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized) { + + parts_soa->tid_p = tid_p; + parts_soa->locx = locx; + parts_soa->locy = locy; + parts_soa->locz = locz; + parts_soa->h = h; + parts_soa->mass = mass; + parts_soa->x_p = x_p; + parts_soa->y_p = y_p; + parts_soa->z_p = z_p; + parts_soa->rho = rho; + parts_soa->rho_dh = rho_dh; + parts_soa->wcount = wcount; + parts_soa->wcount_dh = wcount_dh; + parts_soa->ux = ux; + parts_soa->uy = uy; + 
parts_soa->uz = uz; + parts_soa->div_v = div_v; + parts_soa->rot_ux = rot_ux; + parts_soa->rot_uy = rot_uy; + parts_soa->rot_uz = rot_uz; + parts_soa->time_bin = time_bin; } #ifdef WITH_CUDA diff --git a/src/files_for_new_functions/host_device_data_transfer.h b/src/files_for_new_functions/host_device_data_transfer.h old mode 100755 new mode 100644 index c97b4a5d49..204afd51fa --- a/src/files_for_new_functions/host_device_data_transfer.h +++ b/src/files_for_new_functions/host_device_data_transfer.h @@ -1,4 +1,5 @@ #include "cuda/part_gpu.h" + #include #include #include @@ -6,171 +7,228 @@ void host2device_test(int *d_tid_p, int *tid_h, int count_max_parts_tmp); -void device2host_test(struct part_soa parts_soa, int *tid_h, int count_max_parts_tmp); +void device2host_test(struct part_soa parts_soa, int *tid_h, + int count_max_parts_tmp); -void device2device_test(int *tid_p, struct part_soa parts_soa, int count_max_parts_tmp); +void device2device_test(int *tid_p, struct part_soa parts_soa, + int count_max_parts_tmp); /*Function to be overloaded using different part_soa structs * and allocate their internal arrays * alloc_type 0 for density, 1 for force, 2 for gradient*/ void host_device_test(int *d_tid_p, int *tid_h, int count_max_parts_tmp); -void device_host_test(struct part_soa parts_soa, int *tid_h, int count_max_parts_tmp); - -void device_device_test(int *tid_p, struct part_soa parts_soa, int count_max_parts_tmp); - -void device2host_density(struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p, - float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, - float *mass, float *h ,float *u, float *u_dt, float *rho, float *locx, float *locy, - float *locz, float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, - float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, float *rot_uy, float *rot_uz, - float *div_v, float *div_v_previous_step, float *alpha_visc, float *v_sig, float *laplace_u, - float *alpha_diff, float *f, float *soundspeed, float *h_dt, float *balsara,float *pressure, - float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, - char *to_be_synchronized, int count_max_parts_tmp); - -void device_host_cpy(struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p, - float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, - float *mass, float *h ,float *u, float *u_dt, float *rho, float *locx, float *locy, - float *locz, float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, - float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, float *rot_uy, float *rot_uz, - float *div_v, float *div_v_previous_step, float *alpha_visc, float *v_sig, float *laplace_u, - float *alpha_diff, float *f, float *soundspeed, float *h_dt, float *balsara,float *pressure, - float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, - char *to_be_synchronized, int count_max_parts_tmp); - -void device2device_density(struct part_soa *parts_soa_buffer, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p, - float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, - float *mass, float *h ,float *u, float *u_dt, float *rho, float *locx, float *locy, - float *locz, float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, - float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, float 
*rot_uy, float *rot_uz, - float *div_v, float *div_v_previous_step, float *alpha_visc, float *v_sig, float *laplace_u, - float *alpha_diff, float *f, float *soundspeed, float *h_dt, float *balsara,float *pressure, - float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, - char *to_be_synchronized, int count_max_parts_tmp, cudaStream_t stream); - - -void host2device_density(struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p, - float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, - float *mass, float *h ,float *u, float *u_dt, float *rho, float *locx, float *locy, - float *locz, float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, - float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, float *rot_uy, float *rot_uz, - float *div_v, float *div_v_previous_step, float *alpha_visc, float *v_sig, float *laplace_u, - float *alpha_diff, float *f, float *soundspeed, float *h_dt, float *balsara,float *pressure, - float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, - char *to_be_synchronized, int count_max_parts_tmp); +void device_host_test(struct part_soa parts_soa, int *tid_h, + int count_max_parts_tmp); + +void device_device_test(int *tid_p, struct part_soa parts_soa, + int count_max_parts_tmp); + +void device2host_density( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, + int count_max_parts_tmp); + +void device_host_cpy( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, + int count_max_parts_tmp); + +void device2device_density( + struct part_soa *parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float 
*rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, + int count_max_parts_tmp, cudaStream_t stream); + +void host2device_density( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, + int count_max_parts_tmp); /*Function to be overloaded using different part_soa structs * and allocate their internal arrays * alloc_type 0 for density, 1 for force, 2 for gradient*/ -void host_device_cpy(struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p, - float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, - float *mass, float *h ,float *u, float *u_dt, float *rho, float *locx, float *locy, - float *locz, float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, - float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, float *rot_uy, float *rot_uz, - float *div_v, float *div_v_previous_step, float *alpha_visc, float *v_sig, float *laplace_u, - float *alpha_diff, float *f, float *soundspeed, float *h_dt, float *balsara,float *pressure, - float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, - char *to_be_synchronized, int count_max_parts_tmp); +void host_device_cpy( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, + int count_max_parts_tmp); /*Function to be overloaded using different part_soa structs * and allocate their internal arrays * alloc_type 0 for density, 1 for force, 2 for gradient*/ -void device_device_bind(struct part_soa *parts_soa_buffer, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p, - float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, - float *mass, float *h ,float *u, float *u_dt, float *rho, float *locx, float *locy, - float *locz, float *widthx, float *widthy, float *widthz, float *h_max, int 
*count_p, - float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, float *rot_uy, float *rot_uz, - float *div_v, float *div_v_previous_step, float *alpha_visc, float *v_sig, float *laplace_u, - float *alpha_diff, float *f, float *soundspeed, float *h_dt, float *balsara,float *pressure, - float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, - char *to_be_synchronized, int count_max_parts_tmp, cudaStream_t stream); - -void host2device_async_density(struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p, - float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, - float *mass, float *h ,float *u, float *u_dt, float *rho, float *locx, float *locy, - float *locz, float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, - float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, float *rot_uy, float *rot_uz, - float *div_v, float *div_v_previous_step, float *alpha_visc, float *v_sig, float *laplace_u, - float *alpha_diff, float *f, float *soundspeed, float *h_dt, float *balsara,float *pressure, - float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, - char *to_be_synchronized, int first_part_tmp, int bundle_n_parts, cudaStream_t stream); +void device_device_bind( + struct part_soa *parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, + int count_max_parts_tmp, cudaStream_t stream); + +void host2device_async_density( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, int first_part_tmp, + int bundle_n_parts, cudaStream_t stream); /*Function to be overloaded using different part_soa structs * and allocate their internal arrays * alloc_type 0 for density, 1 for force, 2 for gradient*/ -void host_device_async_cpy(struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p, - float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, - float *mass, float *h ,float *u, float *u_dt, float *rho, float *locx, float *locy, - float *locz, float *widthx, 
float *widthy, float *widthz, float *h_max, int *count_p, - float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, float *rot_uy, float *rot_uz, - float *div_v, float *div_v_previous_step, float *alpha_visc, float *v_sig, float *laplace_u, - float *alpha_diff, float *f, float *soundspeed, float *h_dt, float *balsara,float *pressure, - float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, - char *to_be_synchronized, int first_part_tmp, int bundle_n_parts, cudaStream_t stream); - -void device2host_async_density(struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p, - float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, - float *mass, float *h ,float *u, float *u_dt, float *rho, float *locx, float *locy, - float *locz, float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, - float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, float *rot_uy, float *rot_uz, - float *div_v, float *div_v_previous_step, float *alpha_visc, float *v_sig, float *laplace_u, - float *alpha_diff, float *f, float *soundspeed, float *h_dt, float *balsara,float *pressure, - float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, - char *to_be_synchronized, int first_part_tmp, int bundle_n_parts, cudaStream_t stream); +void host_device_async_cpy( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, int first_part_tmp, + int bundle_n_parts, cudaStream_t stream); + +void device2host_async_density( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, int first_part_tmp, + int bundle_n_parts, cudaStream_t stream); /*Function to be overloaded using different part_soa structs * and allocate their internal arrays * alloc_type 0 for density, 1 for force, 2 for gradient*/ -void device_host_async_cpy(struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p, - float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, - float *mass, float *h ,float *u, 
float *u_dt, float *rho, float *locx, float *locy, - float *locz, float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, - float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, float *rot_uy, float *rot_uz, - float *div_v, float *div_v_previous_step, float *alpha_visc, float *v_sig, float *laplace_u, - float *alpha_diff, float *f, float *soundspeed, float *h_dt, float *balsara,float *pressure, - float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, - char *to_be_synchronized, int first_part_tmp, int bundle_n_parts, cudaStream_t stream); +void device_host_async_cpy( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, int first_part_tmp, + int bundle_n_parts, cudaStream_t stream); /*Function to be overloaded using different part_soa structs * and allocate their internal arrays * alloc_type 0 for density, 1 for force, 2 for gradient*/ -void device_device_async_bind(struct part_soa *parts_soa, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p, - float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, - float *mass, float *h ,float *u, float *u_dt, float *rho, float *locx, float *locy, - float *locz, float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, - float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, float *rot_uy, float *rot_uz, - float *div_v, float *div_v_previous_step, float *alpha_visc, float *v_sig, float *laplace_u, - float *alpha_diff, float *f, float *soundspeed, float *h_dt, float *balsara,float *pressure, - float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, - char *to_be_synchronized); - -void host_device_async_cpy_pair(struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p, - float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, - float *mass, float *h ,float *u, float *u_dt, float *rho, float *locx, float *locy, - float *locz, float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, - float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, float *rot_uy, float *rot_uz, - float *div_v, float *div_v_previous_step, float *alpha_visc, float *v_sig, float *laplace_u, - float *alpha_diff, float *f, float *soundspeed, float *h_dt, float *balsara,float *pressure, - float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, - char *to_be_synchronized, int first_part_tmp, int bundle_n_parts, cudaStream_t stream); - -void device_host_async_cpy_pair(struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p, - float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, - float *mass, float *h 
,float *u, float *u_dt, float *rho, float *locx, float *locy, - float *locz, float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, - float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, float *rot_uy, float *rot_uz, - float *div_v, float *div_v_previous_step, float *alpha_visc, float *v_sig, float *laplace_u, - float *alpha_diff, float *f, float *soundspeed, float *h_dt, float *balsara,float *pressure, - float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, - char *to_be_synchronized, int first_part_tmp, int bundle_n_parts, cudaStream_t stream); - -void device2host_async_density_pair(struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p, - float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, - float *mass, float *h ,float *u, float *u_dt, float *rho, float *locx, float *locy, - float *locz, float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, - float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, float *rot_uy, float *rot_uz, - float *div_v, float *div_v_previous_step, float *alpha_visc, float *v_sig, float *laplace_u, - float *alpha_diff, float *f, float *soundspeed, float *h_dt, float *balsara,float *pressure, - float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, - char *to_be_synchronized, int first_part_tmp, int bundle_n_parts_i, int bundle_n_parts_j, - cudaStream_t stream); +void device_device_async_bind( + struct part_soa *parts_soa, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized); + +void host_device_async_cpy_pair( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, int first_part_tmp, + int bundle_n_parts, cudaStream_t stream); + +void device_host_async_cpy_pair( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float 
*widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, int first_part_tmp, + int bundle_n_parts, cudaStream_t stream); + +void device2host_async_density_pair( + struct part_soa parts_soa_buffer, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, float *a_hydrox, + float *a_hydroy, float *a_hydroz, float *mass, float *h, float *u, + float *u_dt, float *rho, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, int *count_p, + float *wcount, float *wcount_dh, float *rho_dh, float *rot_ux, + float *rot_uy, float *rot_uz, float *div_v, float *div_v_previous_step, + float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, + float *f, float *soundspeed, float *h_dt, float *balsara, float *pressure, + float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized, int first_part_tmp, + int bundle_n_parts_i, int bundle_n_parts_j, cudaStream_t stream); diff --git a/src/fof.c b/src/fof.c index 6594b2b60e..e9f69932cf 100644 --- a/src/fof.c +++ b/src/fof.c @@ -2350,8 +2350,8 @@ void fof_calc_group_mass(struct fof_props *props, const struct space *s, } } /* Foreign root */ - } /* Particle is in a group */ - } /* Loop over particles */ + } /* Particle is in a group */ + } /* Loop over particles */ size_t nsend = map.size; struct fof_mass_send_hashmap hashmap_mass_send = {NULL, 0}; diff --git a/src/hip/BLOCK_SIZE.h b/src/hip/BLOCK_SIZE.h old mode 100755 new mode 100644 index b476b4d766..d36e10b99b --- a/src/hip/BLOCK_SIZE.h +++ b/src/hip/BLOCK_SIZE.h @@ -7,4 +7,4 @@ #ifdef WITH_CUDA //} #endif -#endif // BLOCK_SIZE_H +#endif // BLOCK_SIZE_H diff --git a/src/hip/HIP_runner_functions.h b/src/hip/HIP_runner_functions.h old mode 100755 new mode 100644 index b85772f6b0..43a52f96ed --- a/src/hip/HIP_runner_functions.h +++ b/src/hip/HIP_runner_functions.h @@ -19,4 +19,4 @@ void launch_density_kernel(struct part_soa parts_soa, int *d_task_first_part, } #endif -#endif // CUDA_HEADER_H +#endif // CUDA_HEADER_H diff --git a/src/hip/device_functions.h b/src/hip/device_functions.h old mode 100755 new mode 100644 index 2cba0e9829..237c87dec1 --- a/src/hip/device_functions.h +++ b/src/hip/device_functions.h @@ -3,11 +3,11 @@ #include "../../config.h" /* Local headers. */ -//#include "../dimension.h" -//#include "../error.h" -//#include "../inline.h" -//#include "../minmax.h" -//#include "../vector.h" +// #include "../dimension.h" +// #include "../error.h" +// #include "../inline.h" +// #include "../minmax.h" +// #include "../vector.h" // Is this even necessary? Probably not as our code will operate differently #define num_cuda_threads 128 @@ -22,11 +22,11 @@ #define kernel_ivals 2 #define kernel_degree 3 /*!< Degree of the polynomial */ #define kernel_gamma_dim ((float)(kernel_gamma * kernel_gamma * kernel_gamma)) -#define kernel_gamma_dim_plus_one \ +#define kernel_gamma_dim_plus_one \ ((float)(kernel_gamma * kernel_gamma * kernel_gamma * kernel_gamma)) -#define kernel_gamma_inv_dim \ +#define kernel_gamma_inv_dim \ ((float)(1. 
/ (kernel_gamma * kernel_gamma * kernel_gamma))) -#define kernel_gamma_inv_dim_plus_one \ +#define kernel_gamma_inv_dim_plus_one \ ((float)(1. / (kernel_gamma * kernel_gamma * kernel_gamma * kernel_gamma))) #define kernel_ivals_f ((float)kernel_ivals) /*!< Number of branches */ #define kernel_constant ((float)(16. * M_1_PI)) @@ -146,4 +146,4 @@ __device__ void d_kernel_deval(float u, float *restrict W, } #endif -#endif // DEVICE_FUNCTIONS_H +#endif // DEVICE_FUNCTIONS_H diff --git a/src/lightcone/lightcone_crossing.h b/src/lightcone/lightcone_crossing.h index 226fbfa3b8..78777f1ff0 100644 --- a/src/lightcone/lightcone_crossing.h +++ b/src/lightcone/lightcone_crossing.h @@ -246,7 +246,7 @@ lightcone_check_particle_crosses( lightcone_buffer_map_update(props, e, gp, a_cross, x_cross); } /* Next periodic replication*/ - } /* Next lightcone */ + } /* Next lightcone */ } #endif /* SWIFT_LIGHTCONE_CROSSING_H */ diff --git a/src/lightcone/lightcone_replications.c b/src/lightcone/lightcone_replications.c index 22ec701d92..f65044814c 100644 --- a/src/lightcone/lightcone_replications.c +++ b/src/lightcone/lightcone_replications.c @@ -128,8 +128,8 @@ void replication_list_init(struct replication_list *replication_list, replication_list->nrep += 1; } } /* Next replication in z */ - } /* Next replication in y */ - } /* Next replication in x */ + } /* Next replication in y */ + } /* Next replication in x */ /* Allocate storage after first pass */ if (ipass == 0) { diff --git a/src/lightcone/lightcone_shell.c b/src/lightcone/lightcone_shell.c index 51d07bd8b7..e3ac45885f 100644 --- a/src/lightcone/lightcone_shell.c +++ b/src/lightcone/lightcone_shell.c @@ -762,7 +762,7 @@ void healpix_smoothing_mapper(void *map_data, int num_elements, } /* Next smoothed map */ } } /* Next pixel in this range */ - } /* Next range of pixels */ + } /* Next range of pixels */ /* Free array of pixel ranges */ free(range); diff --git a/src/power_spectrum.c b/src/power_spectrum.c index f6693c7c5a..2f252fca48 100644 --- a/src/power_spectrum.c +++ b/src/power_spectrum.c @@ -804,8 +804,8 @@ void pow_from_grid_mapper(void* map_data, const int num, void* extra) { (powgridft[index][0] * powgridft2[index][0] + powgridft[index][1] * powgridft2[index][1])); } /* Loop over z */ - } /* Loop over y */ - } /* Loop over z */ + } /* Loop over y */ + } /* Loop over z */ } /** diff --git a/src/queue.h b/src/queue.h index 9ff0787cfa..9ba441d55e 100644 --- a/src/queue.h +++ b/src/queue.h @@ -76,9 +76,9 @@ struct queue { volatile unsigned int first_incoming, last_incoming, count_incoming; /*Number of pack tasks left in queue A. Nasar */ - int n_packs_self_left; /*Number of density pack tasks left in queue*/ - int n_packs_self_left_f; /*Number of force pack tasks left in queue*/ - int n_packs_self_left_g; /*Number of gradient pack tasks left in queue*/ + int n_packs_self_left; /*Number of density pack tasks left in queue*/ + int n_packs_self_left_f; /*Number of force pack tasks left in queue*/ + int n_packs_self_left_g; /*Number of gradient pack tasks left in queue*/ int n_packs_pair_left; int n_packs_pair_left_f; diff --git a/src/runner_black_holes.c b/src/runner_black_holes.c index ca5dc32461..aebef16591 100644 --- a/src/runner_black_holes.c +++ b/src/runner_black_holes.c @@ -211,7 +211,7 @@ void runner_do_gas_swallow(struct runner *r, struct cell *c, int timer) { break; } } /* Loop over foreign BHs */ - } /* Is the cell local? */ + } /* Is the cell local? 
*/ #endif /* If we have a local particle, we must have found the BH in one @@ -221,8 +221,8 @@ void runner_do_gas_swallow(struct runner *r, struct cell *c, int timer) { p->id, swallow_id); } } /* Part was flagged for swallowing */ - } /* Loop over the parts */ - } /* Cell is not split */ + } /* Loop over the parts */ + } /* Cell is not split */ } /** @@ -449,7 +449,7 @@ void runner_do_bh_swallow(struct runner *r, struct cell *c, int timer) { break; } } /* Loop over foreign BHs */ - } /* Is the cell local? */ + } /* Is the cell local? */ #endif /* If we have a local particle, we must have found the BH in one @@ -460,8 +460,8 @@ void runner_do_bh_swallow(struct runner *r, struct cell *c, int timer) { } } /* Part was flagged for swallowing */ - } /* Loop over the parts */ - } /* Cell is not split */ + } /* Loop over the parts */ + } /* Cell is not split */ } /** diff --git a/src/runner_doiact_functions_black_holes.h b/src/runner_doiact_functions_black_holes.h index cbfa9c78ed..09ef41e852 100644 --- a/src/runner_doiact_functions_black_holes.h +++ b/src/runner_doiact_functions_black_holes.h @@ -116,8 +116,8 @@ void DOSELF1_BH(struct runner *r, struct cell *c, int timer) { } } } /* loop over the parts in ci. */ - } /* loop over the bparts in ci. */ - } /* Do we have gas particles in the cell? */ + } /* loop over the bparts in ci. */ + } /* Do we have gas particles in the cell? */ /* When doing BH swallowing, we need a quick loop also over the BH * neighbours */ @@ -177,7 +177,7 @@ void DOSELF1_BH(struct runner *r, struct cell *c, int timer) { } } } /* loop over the bparts in ci. */ - } /* loop over the bparts in ci. */ + } /* loop over the bparts in ci. */ #endif /* (FUNCTION_TASK_LOOP == TASK_LOOP_SWALLOW) */ @@ -286,8 +286,8 @@ void DO_NONSYM_PAIR1_BH_NAIVE(struct runner *r, struct cell *restrict ci, } } } /* loop over the parts in cj. */ - } /* loop over the bparts in ci. */ - } /* Do we have gas particles in the cell? */ + } /* loop over the bparts in ci. */ + } /* Do we have gas particles in the cell? */ /* When doing BH swallowing, we need a quick loop also over the BH * neighbours */ @@ -347,7 +347,7 @@ void DO_NONSYM_PAIR1_BH_NAIVE(struct runner *r, struct cell *restrict ci, } } } /* loop over the bparts in cj. */ - } /* loop over the bparts in ci. */ + } /* loop over the bparts in ci. */ #endif /* (FUNCTION_TASK_LOOP == TASK_LOOP_SWALLOW) */ } @@ -469,7 +469,7 @@ void DOPAIR1_SUBSET_BH_NAIVE(struct runner *r, struct cell *restrict ci, } } } /* loop over the parts in cj. */ - } /* loop over the parts in ci. */ + } /* loop over the parts in ci. */ } /** @@ -557,7 +557,7 @@ void DOSELF1_SUBSET_BH(struct runner *r, struct cell *restrict ci, } } } /* loop over the parts in cj. */ - } /* loop over the parts in ci. */ + } /* loop over the parts in ci. */ } /** diff --git a/src/runner_doiact_functions_hydro.h b/src/runner_doiact_functions_hydro.h index 1bcf1af207..eff6702b82 100644 --- a/src/runner_doiact_functions_hydro.h +++ b/src/runner_doiact_functions_hydro.h @@ -156,7 +156,7 @@ void DOPAIR1_NAIVE(struct runner *r, struct cell *restrict ci, #endif } } /* loop over the parts in cj. */ - } /* loop over the parts in ci. */ + } /* loop over the parts in ci. */ TIMER_TOC(TIMER_DOPAIR); } @@ -310,7 +310,7 @@ void DOPAIR2_NAIVE(struct runner *r, struct cell *restrict ci, } } } /* loop over the parts in cj. */ - } /* loop over the parts in ci. */ + } /* loop over the parts in ci. 
*/ TIMER_TOC(TIMER_DOPAIR); } @@ -451,7 +451,7 @@ void DOSELF1_NAIVE(struct runner *r, struct cell *restrict c) { #endif } } /* loop over the parts in cj. */ - } /* loop over the parts in ci. */ + } /* loop over the parts in ci. */ TIMER_TOC(TIMER_DOSELF); } @@ -592,7 +592,7 @@ void DOSELF2_NAIVE(struct runner *r, struct cell *restrict c) { #endif } } /* loop over the parts in cj. */ - } /* loop over the parts in ci. */ + } /* loop over the parts in ci. */ TIMER_TOC(TIMER_DOSELF); } @@ -694,7 +694,7 @@ void DOPAIR_SUBSET_NAIVE(struct runner *r, struct cell *restrict ci, #endif } } /* loop over the parts in cj. */ - } /* loop over the parts in ci. */ + } /* loop over the parts in ci. */ TIMER_TOC(timer_dopair_subset_naive); } @@ -803,7 +803,7 @@ void DOPAIR_SUBSET(struct runner *r, struct cell *restrict ci, #endif } } /* loop over the parts in cj. */ - } /* loop over the parts in ci. */ + } /* loop over the parts in ci. */ } /* Parts are on the right. */ @@ -869,7 +869,7 @@ void DOPAIR_SUBSET(struct runner *r, struct cell *restrict ci, #endif } } /* loop over the parts in cj. */ - } /* loop over the parts in ci. */ + } /* loop over the parts in ci. */ } TIMER_TOC(timer_dopair_subset); @@ -1036,7 +1036,7 @@ void DOSELF_SUBSET(struct runner *r, struct cell *restrict ci, #endif } } /* loop over the parts in cj. */ - } /* loop over the parts in ci. */ + } /* loop over the parts in ci. */ TIMER_TOC(timer_doself_subset); } @@ -1219,8 +1219,8 @@ void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj, const int sid, #endif } } /* loop over the parts in cj. */ - } /* loop over the parts in ci. */ - } /* Cell ci is active */ + } /* loop over the parts in ci. */ + } /* Cell ci is active */ if (CELL_IS_ACTIVE(cj, e)) { @@ -1319,8 +1319,8 @@ void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj, const int sid, #endif } } /* loop over the parts in ci. */ - } /* loop over the parts in cj. */ - } /* Cell cj is active */ + } /* loop over the parts in cj. */ + } /* Cell cj is active */ TIMER_TOC(TIMER_DOPAIR); } @@ -1741,8 +1741,8 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj, const int sid, } } } /* loop over the parts in cj. */ - } /* Is pi active? */ - } /* Loop over all ci */ + } /* Is pi active? */ + } /* Loop over all ci */ /* Loop over *all* the parts in cj starting from the centre until we are out of range of anything in ci (using the maximal hj). */ @@ -1958,8 +1958,8 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj, const int sid, } } } /* loop over the parts in ci. */ - } /* Is pj active? */ - } /* Loop over all cj */ + } /* Is pj active? 
*/ + } /* Loop over all cj */ /* Clean-up if necessary */ // MATTHIEU: temporary disable this optimization if (CELL_IS_ACTIVE(ci, e)) // && !cell_is_all_active_hydro(ci, e)) diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index d525ed84eb..ca49e2aa45 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -1,4254 +1,4618 @@ #include "scheduler.h" -struct pack_vars_self{ - /*List of tasks and respective cells to be packed*/ - struct task **task_list; - struct cell **cell_list; - /*List of cell positions*/ - double *cellx; - double *celly; - double *cellz; - /*List of cell positions*/ - double *d_cellx; - double *d_celly; - double *d_cellz; - int bundle_size; - /*How many particles in a bundle*/ - int count_parts; - /**/ - int tasks_packed; - int *task_first_part; - int *task_last_part; - int *d_task_first_part; - int *d_task_last_part; - int * bundle_first_part; - int * bundle_last_part; - int * bundle_first_task_list; - int count_max_parts; - int launch; - int launch_leftovers; - int target_n_tasks; - int nBundles; - int tasksperbundle; - -}pack_vars_self; - -struct pack_vars_pair{ - /*List of tasks and respective cells to be packed*/ - struct task **task_list; - struct cell **ci_list; - struct cell **cj_list; - /*List of cell shifts*/ - double *shiftx; - double *shifty; - double *shiftz; - /*List of cell shifts*/ - double *d_shiftx; - double *d_shifty; - double *d_shiftz; - int bundle_size; - /*How many particles in a bundle*/ - int count_parts; - /**/ - int tasks_packed; - int *task_first_part; - int *task_last_part; - int *d_task_first_part; - int *d_task_last_part; - int * bundle_first_part; - int * bundle_last_part; - int * bundle_first_task_list; - int count_max_parts; - int launch; - int launch_leftovers; - int target_n_tasks; - int nBundles; - int tasksperbundle; - -}pack_vars_pair; - -struct pack_vars_pair_f4{ - /*List of tasks and respective cells to be packed*/ - struct task **task_list; - struct cell **ci_list; - struct cell **cj_list; - /*List of cell shifts*/ - float3 *shift; - /*List of cell shifts*/ - float3 *d_shift; - int bundle_size; - /*How many particles in a bundle*/ - int count_parts; - /**/ - int tasks_packed; - int4 *fparti_fpartj_lparti_lpartj; - int4 *d_fparti_fpartj_lparti_lpartj; - int * bundle_first_part; - int * bundle_last_part; - int * bundle_first_task_list; - int count_max_parts; - int launch; - int launch_leftovers; - int target_n_tasks; - int nBundles; - int tasksperbundle; - -}pack_vars_pair_f4; +struct pack_vars_self { + /*List of tasks and respective cells to be packed*/ + struct task **task_list; + struct cell **cell_list; + /*List of cell positions*/ + double *cellx; + double *celly; + double *cellz; + /*List of cell positions*/ + double *d_cellx; + double *d_celly; + double *d_cellz; + int bundle_size; + /*How many particles in a bundle*/ + int count_parts; + /**/ + int tasks_packed; + int *task_first_part; + int *task_last_part; + int *d_task_first_part; + int *d_task_last_part; + int *bundle_first_part; + int *bundle_last_part; + int *bundle_first_task_list; + int count_max_parts; + int launch; + int launch_leftovers; + int target_n_tasks; + int nBundles; + int tasksperbundle; + +} pack_vars_self; + +struct pack_vars_pair { + /*List of tasks and respective cells to be packed*/ + struct task **task_list; + struct cell **ci_list; + struct cell **cj_list; + /*List of cell shifts*/ + double *shiftx; + double *shifty; + double *shiftz; + /*List of cell 
shifts*/ + double *d_shiftx; + double *d_shifty; + double *d_shiftz; + int bundle_size; + /*How many particles in a bundle*/ + int count_parts; + /**/ + int tasks_packed; + int *task_first_part; + int *task_last_part; + int *d_task_first_part; + int *d_task_last_part; + int *bundle_first_part; + int *bundle_last_part; + int *bundle_first_task_list; + int count_max_parts; + int launch; + int launch_leftovers; + int target_n_tasks; + int nBundles; + int tasksperbundle; + +} pack_vars_pair; + +struct pack_vars_pair_f4 { + /*List of tasks and respective cells to be packed*/ + struct task **task_list; + struct cell **ci_list; + struct cell **cj_list; + /*List of cell shifts*/ + float3 *shift; + /*List of cell shifts*/ + float3 *d_shift; + int bundle_size; + /*How many particles in a bundle*/ + int count_parts; + /**/ + int tasks_packed; + int4 *fparti_fpartj_lparti_lpartj; + int4 *d_fparti_fpartj_lparti_lpartj; + int *bundle_first_part; + int *bundle_last_part; + int *bundle_first_task_list; + int count_max_parts; + int launch; + int launch_leftovers; + int target_n_tasks; + int nBundles; + int tasksperbundle; + +} pack_vars_pair_f4; -#include "task.h" -#include "runner_gpu_pack_functions.h" #include "cuda/BLOCK_SIZE.h" #include "cuda/GPU_runner_functions.h" +#include "runner_gpu_pack_functions.h" +#include "task.h" #define CUDA_DEBUG -void runner_doself1_pack(struct runner *r, struct scheduler *s, struct pack_vars_self *pack_vars, struct cell *ci, - struct task *t, struct part_aos *parts_aos, int * packing_time){ - /* Timers for how long this all takes. - * t0 and t1 are from start to finish including GPU calcs - * tp0 and tp1 only time packing and unpacking*/ - struct timespec t0, t1; // - clock_gettime(CLOCK_REALTIME, &t0); - - int tasks_packed = pack_vars->tasks_packed; - pack_vars->cellx[tasks_packed] = ci->loc[0]; - pack_vars->celly[tasks_packed] = ci->loc[1]; - pack_vars->cellz[tasks_packed] = ci->loc[2]; - /*Get pointers to the list of tasks and cells packed*/ - pack_vars->task_list[tasks_packed] = t; - pack_vars->cell_list[tasks_packed] = ci; - // /* Identify row in particle arrays where this task starts*/ - pack_vars->task_first_part[tasks_packed] = pack_vars->count_parts; - int *count_parts_self = &pack_vars->count_parts; - /* This re-arranges the particle data from cell->hydro->parts into a - long array of part structs*/ - runner_doself1_gpu_pack_neat_aos(r, ci, parts_aos, 0/*timer. 0 no timing, 1 for timing*/, - count_parts_self, tasks_packed, pack_vars->count_max_parts); - // // identify the row in the array where this task ends (row id of its last particle) - pack_vars->task_last_part[tasks_packed] = pack_vars->count_parts; - /* Identify first particle for each bundle of tasks */ - const int bundle_size = pack_vars->bundle_size; - if (tasks_packed % bundle_size == 0) { - int bid = tasks_packed / bundle_size; - pack_vars->bundle_first_part[bid] = pack_vars->task_first_part[tasks_packed]; - pack_vars->bundle_first_task_list[bid] = tasks_packed; - } - /* Tell the cell it has been packed */ - ci->pack_done++; - /* Record that we have now done a packing (self) */ - int qid = r->qid; - atomic_dec(&(s->queues[qid].n_packs_self_left)); - t->done = 1; - /* Release the lock on the cell */ - task_unlock(t); - pack_vars->tasks_packed++; - pack_vars->launch = 0; - pack_vars->launch_leftovers = 0; - if ((s->queues[qid].n_packs_self_left == 0)) - pack_vars->launch_leftovers = 1; - if(pack_vars->tasks_packed == pack_vars->target_n_tasks) - pack_vars->launch = 1; - /*Add time to packing_time. 
Timer for end of GPU work after the if(launch || launch_leftovers statement)*/ - clock_gettime(CLOCK_REALTIME, &t1); - *packing_time += (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - +void runner_doself1_pack(struct runner *r, struct scheduler *s, + struct pack_vars_self *pack_vars, struct cell *ci, + struct task *t, struct part_aos *parts_aos, + int *packing_time) { + /* Timers for how long this all takes. + * t0 and t1 are from start to finish including GPU calcs + * tp0 and tp1 only time packing and unpacking*/ + struct timespec t0, t1; // + clock_gettime(CLOCK_REALTIME, &t0); + + int tasks_packed = pack_vars->tasks_packed; + pack_vars->cellx[tasks_packed] = ci->loc[0]; + pack_vars->celly[tasks_packed] = ci->loc[1]; + pack_vars->cellz[tasks_packed] = ci->loc[2]; + /*Get pointers to the list of tasks and cells packed*/ + pack_vars->task_list[tasks_packed] = t; + pack_vars->cell_list[tasks_packed] = ci; + // /* Identify row in particle arrays where this task starts*/ + pack_vars->task_first_part[tasks_packed] = pack_vars->count_parts; + int *count_parts_self = &pack_vars->count_parts; + /* This re-arranges the particle data from cell->hydro->parts into a + long array of part structs*/ + runner_doself1_gpu_pack_neat_aos( + r, ci, parts_aos, 0 /*timer. 0 no timing, 1 for timing*/, + count_parts_self, tasks_packed, pack_vars->count_max_parts); + // // identify the row in the array where this task ends (row id of its + // last particle) + pack_vars->task_last_part[tasks_packed] = pack_vars->count_parts; + /* Identify first particle for each bundle of tasks */ + const int bundle_size = pack_vars->bundle_size; + if (tasks_packed % bundle_size == 0) { + int bid = tasks_packed / bundle_size; + pack_vars->bundle_first_part[bid] = + pack_vars->task_first_part[tasks_packed]; + pack_vars->bundle_first_task_list[bid] = tasks_packed; + } + /* Tell the cell it has been packed */ + ci->pack_done++; + /* Record that we have now done a packing (self) */ + int qid = r->qid; + atomic_dec(&(s->queues[qid].n_packs_self_left)); + t->done = 1; + /* Release the lock on the cell */ + task_unlock(t); + pack_vars->tasks_packed++; + pack_vars->launch = 0; + pack_vars->launch_leftovers = 0; + if ((s->queues[qid].n_packs_self_left == 0)) pack_vars->launch_leftovers = 1; + if (pack_vars->tasks_packed == pack_vars->target_n_tasks) + pack_vars->launch = 1; + /*Add time to packing_time. Timer for end of GPU work after the if(launch || + * launch_leftovers statement)*/ + clock_gettime(CLOCK_REALTIME, &t1); + *packing_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; } -double runner_doself1_pack_f4(struct runner *r, struct scheduler *s, struct pack_vars_self *pack_vars, struct cell *ci, - struct task *t, struct part_aos_f4_send *parts_send, int2 *task_first_part_f4){ - /* Timers for how long this all takes. 
- * t0 and t1 are from start to finish including GPU calcs - * tp0 and tp1 only time packing and unpacking*/ - struct timespec t0, t1; // - clock_gettime(CLOCK_REALTIME, &t0); - - int tasks_packed = pack_vars->tasks_packed; -// pack_vars->cellx[tasks_packed] = ci->loc[0]; -// pack_vars->celly[tasks_packed] = ci->loc[1]; -// pack_vars->cellz[tasks_packed] = ci->loc[2]; - /*Get pointers to the list of tasks and cells packed*/ - pack_vars->task_list[tasks_packed] = t; - pack_vars->cell_list[tasks_packed] = ci; - // /* Identify row in particle arrays where this task starts*/ -// pack_vars->task_first_part[tasks_packed] = pack_vars->count_parts; -// d_task_first_part_self_dens_f4[tasks_packed].x = pack_vars->count_parts; - task_first_part_f4[tasks_packed].x = pack_vars->count_parts; - int *count_parts_self = &pack_vars->count_parts; - /* This re-arranges the particle data from cell->hydro->parts into a - long array of part structs*/ - runner_doself1_gpu_pack_neat_aos_f4(r, ci, parts_send, 0/*timer. 0 no timing, 1 for timing*/, - count_parts_self, tasks_packed, pack_vars->count_max_parts); - // // identify the row in the array where this task ends (row id of its last particle) -// pack_vars->task_last_part[tasks_packed] = pack_vars->count_parts; -// d_task_first_part_self_dens_f4[tasks_packed].y = pack_vars->count_parts; - task_first_part_f4[tasks_packed].y = pack_vars->count_parts; - /* Identify first particle for each bundle of tasks */ - const int bundle_size = pack_vars->bundle_size; - if (tasks_packed % bundle_size == 0) { - int bid = tasks_packed / bundle_size; - pack_vars->bundle_first_part[bid] = task_first_part_f4[tasks_packed].x; - pack_vars->bundle_first_task_list[bid] = tasks_packed; - } - /* Tell the cell it has been packed */ - ci->pack_done++; - /* Record that we have now done a packing (self) */ - int qid = r->qid; - atomic_dec(&(s->queues[qid].n_packs_self_left)); - t->done = 1; - pack_vars->tasks_packed++; - pack_vars->launch = 0; - pack_vars->launch_leftovers = 0; - if ((s->queues[qid].n_packs_self_left == 0)) - pack_vars->launch_leftovers = 1; - if(pack_vars->tasks_packed == pack_vars->target_n_tasks) - pack_vars->launch = 1; - /*Add time to packing_time. Timer for end of GPU work after the if(launch || launch_leftovers statement)*/ - clock_gettime(CLOCK_REALTIME, &t1); - /* Release the lock on the cell */ - task_unlock(t); - t->gpu_done = 1; -// cell_unlocktree(ci); -// signal_sleeping_runners(s, t); - return (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - +double runner_doself1_pack_f4(struct runner *r, struct scheduler *s, + struct pack_vars_self *pack_vars, struct cell *ci, + struct task *t, + struct part_aos_f4_send *parts_send, + int2 *task_first_part_f4) { + /* Timers for how long this all takes. 
+ * t0 and t1 are from start to finish including GPU calcs + * tp0 and tp1 only time packing and unpacking*/ + struct timespec t0, t1; // + clock_gettime(CLOCK_REALTIME, &t0); + + int tasks_packed = pack_vars->tasks_packed; + // pack_vars->cellx[tasks_packed] = ci->loc[0]; + // pack_vars->celly[tasks_packed] = ci->loc[1]; + // pack_vars->cellz[tasks_packed] = ci->loc[2]; + /*Get pointers to the list of tasks and cells packed*/ + pack_vars->task_list[tasks_packed] = t; + pack_vars->cell_list[tasks_packed] = ci; + // /* Identify row in particle arrays where this task starts*/ + // pack_vars->task_first_part[tasks_packed] = pack_vars->count_parts; + // d_task_first_part_self_dens_f4[tasks_packed].x = pack_vars->count_parts; + task_first_part_f4[tasks_packed].x = pack_vars->count_parts; + int *count_parts_self = &pack_vars->count_parts; + /* This re-arranges the particle data from cell->hydro->parts into a + long array of part structs*/ + runner_doself1_gpu_pack_neat_aos_f4( + r, ci, parts_send, 0 /*timer. 0 no timing, 1 for timing*/, + count_parts_self, tasks_packed, pack_vars->count_max_parts); + // // identify the row in the array where this task ends (row id of its + // last particle) + // pack_vars->task_last_part[tasks_packed] = pack_vars->count_parts; + // d_task_first_part_self_dens_f4[tasks_packed].y = pack_vars->count_parts; + task_first_part_f4[tasks_packed].y = pack_vars->count_parts; + /* Identify first particle for each bundle of tasks */ + const int bundle_size = pack_vars->bundle_size; + if (tasks_packed % bundle_size == 0) { + int bid = tasks_packed / bundle_size; + pack_vars->bundle_first_part[bid] = task_first_part_f4[tasks_packed].x; + pack_vars->bundle_first_task_list[bid] = tasks_packed; + } + /* Tell the cell it has been packed */ + ci->pack_done++; + /* Record that we have now done a packing (self) */ + int qid = r->qid; + atomic_dec(&(s->queues[qid].n_packs_self_left)); + t->done = 1; + pack_vars->tasks_packed++; + pack_vars->launch = 0; + pack_vars->launch_leftovers = 0; + if ((s->queues[qid].n_packs_self_left == 0)) pack_vars->launch_leftovers = 1; + if (pack_vars->tasks_packed == pack_vars->target_n_tasks) + pack_vars->launch = 1; + /*Add time to packing_time. Timer for end of GPU work after the if(launch || + * launch_leftovers statement)*/ + clock_gettime(CLOCK_REALTIME, &t1); + /* Release the lock on the cell */ + task_unlock(t); + t->gpu_done = 1; + // cell_unlocktree(ci); + // signal_sleeping_runners(s, t); + return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; } -void runner_doself1_pack_g(struct runner *r, struct scheduler *s, struct pack_vars_self *pack_vars, struct cell *ci, - struct task *t, struct part_aos_g *parts_aos, double *packing_time){ - - /* Timers for how long this all takes. 
- * t0 and t1 are from start to finish including GPU calcs - * tp0 and tp1 only time packing and unpacking*/ - struct timespec t0, t1; // - clock_gettime(CLOCK_REALTIME, &t0); - - int tasks_packed = pack_vars->tasks_packed; - pack_vars->cellx[tasks_packed] = ci->loc[0]; - pack_vars->celly[tasks_packed] = ci->loc[1]; - pack_vars->cellz[tasks_packed] = ci->loc[2]; - /*Get pointers to the list of tasks and cells packed*/ - pack_vars->task_list[tasks_packed] = t; - pack_vars->cell_list[tasks_packed] = ci; - // /* Identify row in particle arrays where this task starts*/ - pack_vars->task_first_part[tasks_packed] = pack_vars->count_parts; - int *count_parts_self = &pack_vars->count_parts; - /* This re-arranges the particle data from cell->hydro->parts into a - long array of part structs*/ - runner_doself1_gpu_pack_neat_aos_g(r, ci, parts_aos, 0/*timer. 0 no timing, 1 for timing*/, - count_parts_self, tasks_packed, pack_vars->count_max_parts); - // // identify the row in the array where this task ends (row id of its last particle) - pack_vars->task_last_part[tasks_packed] = pack_vars->count_parts; - /* Identify first particle for each bundle of tasks */ - const int bundle_size = pack_vars->bundle_size; - if (tasks_packed % bundle_size == 0) { - int bid = tasks_packed / bundle_size; - pack_vars->bundle_first_part[bid] = pack_vars->task_first_part[tasks_packed]; - pack_vars->bundle_first_task_list[bid] = tasks_packed; - } - /* Tell the cell it has been packed */ - ci->pack_done_g++; - /* Record that we have now done a packing (self) */ - int qid = r->qid; - atomic_dec(&(s->queues[qid].n_packs_self_left_g)); - t->done = 1; - /* Release the lock on the cell */ - task_unlock(t); - pack_vars->tasks_packed++; - pack_vars->launch = 0; - pack_vars->launch_leftovers = 0; - if ((s->queues[qid].n_packs_self_left_g == 0)) - pack_vars->launch_leftovers = 1; - if(pack_vars->tasks_packed == pack_vars->target_n_tasks) - pack_vars->launch = 1; - /*Add time to packing_time. Timer for end of GPU work after the if(launch || launch_leftovers statement)*/ - clock_gettime(CLOCK_REALTIME, &t1); - *packing_time += (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - +void runner_doself1_pack_g(struct runner *r, struct scheduler *s, + struct pack_vars_self *pack_vars, struct cell *ci, + struct task *t, struct part_aos_g *parts_aos, + double *packing_time) { + + /* Timers for how long this all takes. + * t0 and t1 are from start to finish including GPU calcs + * tp0 and tp1 only time packing and unpacking*/ + struct timespec t0, t1; // + clock_gettime(CLOCK_REALTIME, &t0); + + int tasks_packed = pack_vars->tasks_packed; + pack_vars->cellx[tasks_packed] = ci->loc[0]; + pack_vars->celly[tasks_packed] = ci->loc[1]; + pack_vars->cellz[tasks_packed] = ci->loc[2]; + /*Get pointers to the list of tasks and cells packed*/ + pack_vars->task_list[tasks_packed] = t; + pack_vars->cell_list[tasks_packed] = ci; + // /* Identify row in particle arrays where this task starts*/ + pack_vars->task_first_part[tasks_packed] = pack_vars->count_parts; + int *count_parts_self = &pack_vars->count_parts; + /* This re-arranges the particle data from cell->hydro->parts into a + long array of part structs*/ + runner_doself1_gpu_pack_neat_aos_g( + r, ci, parts_aos, 0 /*timer. 
0 no timing, 1 for timing*/, + count_parts_self, tasks_packed, pack_vars->count_max_parts); + // // identify the row in the array where this task ends (row id of its + // last particle) + pack_vars->task_last_part[tasks_packed] = pack_vars->count_parts; + /* Identify first particle for each bundle of tasks */ + const int bundle_size = pack_vars->bundle_size; + if (tasks_packed % bundle_size == 0) { + int bid = tasks_packed / bundle_size; + pack_vars->bundle_first_part[bid] = + pack_vars->task_first_part[tasks_packed]; + pack_vars->bundle_first_task_list[bid] = tasks_packed; + } + /* Tell the cell it has been packed */ + ci->pack_done_g++; + /* Record that we have now done a packing (self) */ + int qid = r->qid; + atomic_dec(&(s->queues[qid].n_packs_self_left_g)); + t->done = 1; + /* Release the lock on the cell */ + task_unlock(t); + pack_vars->tasks_packed++; + pack_vars->launch = 0; + pack_vars->launch_leftovers = 0; + if ((s->queues[qid].n_packs_self_left_g == 0)) + pack_vars->launch_leftovers = 1; + if (pack_vars->tasks_packed == pack_vars->target_n_tasks) + pack_vars->launch = 1; + /*Add time to packing_time. Timer for end of GPU work after the if(launch || + * launch_leftovers statement)*/ + clock_gettime(CLOCK_REALTIME, &t1); + *packing_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; } -double runner_doself1_pack_f4_g(struct runner *r, struct scheduler *s, struct pack_vars_self *pack_vars, struct cell *ci, - struct task *t, struct part_aos_f4_g_send *parts_send, int2 * task_first_part_f4){ - - /* Timers for how long this all takes. - * t0 and t1 are from start to finish including GPU calcs - * tp0 and tp1 only time packing and unpacking*/ - struct timespec t0, t1; // - clock_gettime(CLOCK_REALTIME, &t0); - - int tasks_packed = pack_vars->tasks_packed; -// pack_vars->cellx[tasks_packed] = ci->loc[0]; -// pack_vars->celly[tasks_packed] = ci->loc[1]; -// pack_vars->cellz[tasks_packed] = ci->loc[2]; - /*Get pointers to the list of tasks and cells packed*/ - pack_vars->task_list[tasks_packed] = t; - pack_vars->cell_list[tasks_packed] = ci; - // /* Identify row in particle arrays where this task starts*/ -// pack_vars->task_first_part[tasks_packed] = pack_vars->count_parts; - task_first_part_f4[tasks_packed].x = pack_vars->count_parts; - int *count_parts_self = &pack_vars->count_parts; - /* This re-arranges the particle data from cell->hydro->parts into a - long array of part structs*/ - runner_doself1_gpu_pack_neat_aos_f4_g(r, ci, parts_send, 0/*timer. 
0 no timing, 1 for timing*/, - count_parts_self, tasks_packed, pack_vars->count_max_parts); - // // identify the row in the array where this task ends (row id of its last particle) -// pack_vars->task_last_part[tasks_packed] = pack_vars->count_parts; - task_first_part_f4[tasks_packed].y = pack_vars->count_parts; - /* Identify first particle for each bundle of tasks */ - const int bundle_size = pack_vars->bundle_size; - if (tasks_packed % bundle_size == 0) { - int bid = tasks_packed / bundle_size; - pack_vars->bundle_first_part[bid] = task_first_part_f4[tasks_packed].x; - pack_vars->bundle_first_task_list[bid] = tasks_packed; - } - /* Tell the cell it has been packed */ - ci->pack_done_g++; - /* Record that we have now done a packing (self) */ - int qid = r->qid; - atomic_dec(&(s->queues[qid].n_packs_self_left_g)); - t->done = 1; - pack_vars->tasks_packed++; - pack_vars->launch = 0; - pack_vars->launch_leftovers = 0; - if ((s->queues[qid].n_packs_self_left_g == 0)) - pack_vars->launch_leftovers = 1; - if(pack_vars->tasks_packed == pack_vars->target_n_tasks) - pack_vars->launch = 1; - /*Add time to packing_time. Timer for end of GPU work after the if(launch || launch_leftovers statement)*/ - clock_gettime(CLOCK_REALTIME, &t1); - /* Release the lock on the cell */ -// task_unlock(t); - cell_unlocktree(ci); -// signal_sleeping_runners(s, t); - return (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +double runner_doself1_pack_f4_g(struct runner *r, struct scheduler *s, + struct pack_vars_self *pack_vars, + struct cell *ci, struct task *t, + struct part_aos_f4_g_send *parts_send, + int2 *task_first_part_f4) { + + /* Timers for how long this all takes. + * t0 and t1 are from start to finish including GPU calcs + * tp0 and tp1 only time packing and unpacking*/ + struct timespec t0, t1; // + clock_gettime(CLOCK_REALTIME, &t0); + + int tasks_packed = pack_vars->tasks_packed; + // pack_vars->cellx[tasks_packed] = ci->loc[0]; + // pack_vars->celly[tasks_packed] = ci->loc[1]; + // pack_vars->cellz[tasks_packed] = ci->loc[2]; + /*Get pointers to the list of tasks and cells packed*/ + pack_vars->task_list[tasks_packed] = t; + pack_vars->cell_list[tasks_packed] = ci; + // /* Identify row in particle arrays where this task starts*/ + // pack_vars->task_first_part[tasks_packed] = pack_vars->count_parts; + task_first_part_f4[tasks_packed].x = pack_vars->count_parts; + int *count_parts_self = &pack_vars->count_parts; + /* This re-arranges the particle data from cell->hydro->parts into a + long array of part structs*/ + runner_doself1_gpu_pack_neat_aos_f4_g( + r, ci, parts_send, 0 /*timer. 
0 no timing, 1 for timing*/, + count_parts_self, tasks_packed, pack_vars->count_max_parts); + // // identify the row in the array where this task ends (row id of its + // last particle) + // pack_vars->task_last_part[tasks_packed] = pack_vars->count_parts; + task_first_part_f4[tasks_packed].y = pack_vars->count_parts; + /* Identify first particle for each bundle of tasks */ + const int bundle_size = pack_vars->bundle_size; + if (tasks_packed % bundle_size == 0) { + int bid = tasks_packed / bundle_size; + pack_vars->bundle_first_part[bid] = task_first_part_f4[tasks_packed].x; + pack_vars->bundle_first_task_list[bid] = tasks_packed; + } + /* Tell the cell it has been packed */ + ci->pack_done_g++; + /* Record that we have now done a packing (self) */ + int qid = r->qid; + atomic_dec(&(s->queues[qid].n_packs_self_left_g)); + t->done = 1; + pack_vars->tasks_packed++; + pack_vars->launch = 0; + pack_vars->launch_leftovers = 0; + if ((s->queues[qid].n_packs_self_left_g == 0)) + pack_vars->launch_leftovers = 1; + if (pack_vars->tasks_packed == pack_vars->target_n_tasks) + pack_vars->launch = 1; + /*Add time to packing_time. Timer for end of GPU work after the if(launch || + * launch_leftovers statement)*/ + clock_gettime(CLOCK_REALTIME, &t1); + /* Release the lock on the cell */ + // task_unlock(t); + cell_unlocktree(ci); + // signal_sleeping_runners(s, t); + return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; } -void runner_doself1_pack_f(struct runner *r, struct scheduler *s, struct pack_vars_self *pack_vars, struct cell *ci, - struct task *t, struct part_aos_f *parts_aos, double *packing_time){ - - /* Timers for how long this all takes. - * t0 and t1 are from start to finish including GPU calcs - * tp0 and tp1 only time packing and unpacking*/ - struct timespec t0, t1; // - clock_gettime(CLOCK_REALTIME, &t0); - - int tasks_packed = pack_vars->tasks_packed; - pack_vars->cellx[tasks_packed] = ci->loc[0]; - pack_vars->celly[tasks_packed] = ci->loc[1]; - pack_vars->cellz[tasks_packed] = ci->loc[2]; - /*Get pointers to the list of tasks and cells packed*/ - pack_vars->task_list[tasks_packed] = t; - pack_vars->cell_list[tasks_packed] = ci; - // /* Identify row in particle arrays where this task starts*/ - pack_vars->task_first_part[tasks_packed] = pack_vars->count_parts; - int *count_parts_self = &pack_vars->count_parts; - /* This re-arranges the particle data from cell->hydro->parts into a - long array of part structs*/ - runner_doself1_gpu_pack_neat_aos_f(r, ci, parts_aos, 0/*timer. 
0 no timing, 1 for timing*/, - count_parts_self, tasks_packed, pack_vars->count_max_parts); - // // identify the row in the array where this task ends (row id of its last particle) - pack_vars->task_last_part[tasks_packed] = pack_vars->count_parts; - /* Identify first particle for each bundle of tasks */ - const int bundle_size = pack_vars->bundle_size; - if (tasks_packed % bundle_size == 0) { - int bid = tasks_packed / bundle_size; - pack_vars->bundle_first_part[bid] = pack_vars->task_first_part[tasks_packed]; - pack_vars->bundle_first_task_list[bid] = tasks_packed; - } - /* Tell the cell it has been packed */ - ci->pack_done_f++; - /* Record that we have now done a packing (self) */ - int qid = r->qid; - atomic_dec(&(s->queues[qid].n_packs_self_left_f)); - t->done = 1; - /* Release the lock on the cell */ - task_unlock(t); - pack_vars->tasks_packed++; - pack_vars->launch = 0; - pack_vars->launch_leftovers = 0; - if ((s->queues[qid].n_packs_self_left_f == 0)) - pack_vars->launch_leftovers = 1; - if(pack_vars->tasks_packed == pack_vars->target_n_tasks) - pack_vars->launch = 1; - /*Add time to packing_time. Timer for end of GPU work after the if(launch || launch_leftovers statement)*/ - clock_gettime(CLOCK_REALTIME, &t1); - *packing_time += (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +void runner_doself1_pack_f(struct runner *r, struct scheduler *s, + struct pack_vars_self *pack_vars, struct cell *ci, + struct task *t, struct part_aos_f *parts_aos, + double *packing_time) { + + /* Timers for how long this all takes. + * t0 and t1 are from start to finish including GPU calcs + * tp0 and tp1 only time packing and unpacking*/ + struct timespec t0, t1; // + clock_gettime(CLOCK_REALTIME, &t0); + + int tasks_packed = pack_vars->tasks_packed; + pack_vars->cellx[tasks_packed] = ci->loc[0]; + pack_vars->celly[tasks_packed] = ci->loc[1]; + pack_vars->cellz[tasks_packed] = ci->loc[2]; + /*Get pointers to the list of tasks and cells packed*/ + pack_vars->task_list[tasks_packed] = t; + pack_vars->cell_list[tasks_packed] = ci; + // /* Identify row in particle arrays where this task starts*/ + pack_vars->task_first_part[tasks_packed] = pack_vars->count_parts; + int *count_parts_self = &pack_vars->count_parts; + /* This re-arranges the particle data from cell->hydro->parts into a + long array of part structs*/ + runner_doself1_gpu_pack_neat_aos_f( + r, ci, parts_aos, 0 /*timer. 
0 no timing, 1 for timing*/, + count_parts_self, tasks_packed, pack_vars->count_max_parts); + // // identify the row in the array where this task ends (row id of its + // last particle) + pack_vars->task_last_part[tasks_packed] = pack_vars->count_parts; + /* Identify first particle for each bundle of tasks */ + const int bundle_size = pack_vars->bundle_size; + if (tasks_packed % bundle_size == 0) { + int bid = tasks_packed / bundle_size; + pack_vars->bundle_first_part[bid] = + pack_vars->task_first_part[tasks_packed]; + pack_vars->bundle_first_task_list[bid] = tasks_packed; + } + /* Tell the cell it has been packed */ + ci->pack_done_f++; + /* Record that we have now done a packing (self) */ + int qid = r->qid; + atomic_dec(&(s->queues[qid].n_packs_self_left_f)); + t->done = 1; + /* Release the lock on the cell */ + task_unlock(t); + pack_vars->tasks_packed++; + pack_vars->launch = 0; + pack_vars->launch_leftovers = 0; + if ((s->queues[qid].n_packs_self_left_f == 0)) + pack_vars->launch_leftovers = 1; + if (pack_vars->tasks_packed == pack_vars->target_n_tasks) + pack_vars->launch = 1; + /*Add time to packing_time. Timer for end of GPU work after the if(launch || + * launch_leftovers statement)*/ + clock_gettime(CLOCK_REALTIME, &t1); + *packing_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; } -double runner_doself1_pack_f4_f(struct runner *r, struct scheduler *s, struct pack_vars_self *pack_vars, struct cell *ci, - struct task *t, struct part_aos_f4_f_send *parts_send, int2 * task_first_part_f4){ - - /* Timers for how long this all takes. - * t0 and t1 are from start to finish including GPU calcs - * tp0 and tp1 only time packing and unpacking*/ - struct timespec t0, t1; // - clock_gettime(CLOCK_REALTIME, &t0); - - int tasks_packed = pack_vars->tasks_packed; -// pack_vars->cellx[tasks_packed] = ci->loc[0]; -// pack_vars->celly[tasks_packed] = ci->loc[1]; -// pack_vars->cellz[tasks_packed] = ci->loc[2]; - /*Get pointers to the list of tasks and cells packed*/ - pack_vars->task_list[tasks_packed] = t; - pack_vars->cell_list[tasks_packed] = ci; - // /* Identify row in particle arrays where this task starts*/ -// pack_vars->task_first_part[tasks_packed] = pack_vars->count_parts; - task_first_part_f4[tasks_packed].x = pack_vars->count_parts; - int *count_parts_self = &pack_vars->count_parts; - /* This re-arranges the particle data from cell->hydro->parts into a - long array of part structs*/ - runner_doself1_gpu_pack_neat_aos_f4_f(r, ci, parts_send, 0/*timer. 
0 no timing, 1 for timing*/, - count_parts_self, tasks_packed, pack_vars->count_max_parts); - // // identify the row in the array where this task ends (row id of its last particle) -// pack_vars->task_last_part[tasks_packed] = pack_vars->count_parts; - task_first_part_f4[tasks_packed].y = pack_vars->count_parts; - /* Identify first particle for each bundle of tasks */ - const int bundle_size = pack_vars->bundle_size; - if (tasks_packed % bundle_size == 0) { - int bid = tasks_packed / bundle_size; - pack_vars->bundle_first_part[bid] = task_first_part_f4[tasks_packed].x; - pack_vars->bundle_first_task_list[bid] = tasks_packed; - } - /* Tell the cell it has been packed */ - ci->pack_done_f++; - /* Record that we have now done a packing (self) */ - int qid = r->qid; - atomic_dec(&(s->queues[qid].n_packs_self_left_f)); - t->done = 1; - pack_vars->tasks_packed++; - pack_vars->launch = 0; - pack_vars->launch_leftovers = 0; - if ((s->queues[qid].n_packs_self_left_f == 0)) - pack_vars->launch_leftovers = 1; - if(pack_vars->tasks_packed == pack_vars->target_n_tasks) - pack_vars->launch = 1; - /*Add time to packing_time. Timer for end of GPU work after the if(launch || launch_leftovers statement)*/ - clock_gettime(CLOCK_REALTIME, &t1); - /* Release the lock on the cell */ -// task_unlock(t); - cell_unlocktree(ci); -// signal_sleeping_runners(s, t); - return (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +double runner_doself1_pack_f4_f(struct runner *r, struct scheduler *s, + struct pack_vars_self *pack_vars, + struct cell *ci, struct task *t, + struct part_aos_f4_f_send *parts_send, + int2 *task_first_part_f4) { + + /* Timers for how long this all takes. + * t0 and t1 are from start to finish including GPU calcs + * tp0 and tp1 only time packing and unpacking*/ + struct timespec t0, t1; // + clock_gettime(CLOCK_REALTIME, &t0); + + int tasks_packed = pack_vars->tasks_packed; + // pack_vars->cellx[tasks_packed] = ci->loc[0]; + // pack_vars->celly[tasks_packed] = ci->loc[1]; + // pack_vars->cellz[tasks_packed] = ci->loc[2]; + /*Get pointers to the list of tasks and cells packed*/ + pack_vars->task_list[tasks_packed] = t; + pack_vars->cell_list[tasks_packed] = ci; + // /* Identify row in particle arrays where this task starts*/ + // pack_vars->task_first_part[tasks_packed] = pack_vars->count_parts; + task_first_part_f4[tasks_packed].x = pack_vars->count_parts; + int *count_parts_self = &pack_vars->count_parts; + /* This re-arranges the particle data from cell->hydro->parts into a + long array of part structs*/ + runner_doself1_gpu_pack_neat_aos_f4_f( + r, ci, parts_send, 0 /*timer. 
0 no timing, 1 for timing*/, + count_parts_self, tasks_packed, pack_vars->count_max_parts); + // // identify the row in the array where this task ends (row id of its + // last particle) + // pack_vars->task_last_part[tasks_packed] = pack_vars->count_parts; + task_first_part_f4[tasks_packed].y = pack_vars->count_parts; + /* Identify first particle for each bundle of tasks */ + const int bundle_size = pack_vars->bundle_size; + if (tasks_packed % bundle_size == 0) { + int bid = tasks_packed / bundle_size; + pack_vars->bundle_first_part[bid] = task_first_part_f4[tasks_packed].x; + pack_vars->bundle_first_task_list[bid] = tasks_packed; + } + /* Tell the cell it has been packed */ + ci->pack_done_f++; + /* Record that we have now done a packing (self) */ + int qid = r->qid; + atomic_dec(&(s->queues[qid].n_packs_self_left_f)); + t->done = 1; + pack_vars->tasks_packed++; + pack_vars->launch = 0; + pack_vars->launch_leftovers = 0; + if ((s->queues[qid].n_packs_self_left_f == 0)) + pack_vars->launch_leftovers = 1; + if (pack_vars->tasks_packed == pack_vars->target_n_tasks) + pack_vars->launch = 1; + /*Add time to packing_time. Timer for end of GPU work after the if(launch || + * launch_leftovers statement)*/ + clock_gettime(CLOCK_REALTIME, &t1); + /* Release the lock on the cell */ + // task_unlock(t); + cell_unlocktree(ci); + // signal_sleeping_runners(s, t); + return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; } -void runner_dopair1_pack(struct runner *r, struct scheduler *s, struct pack_vars_pair *pack_vars, struct cell *ci, - struct cell *cj, struct task *t, struct part_aos *parts_aos, struct engine *e, double *packing_time){ - - /* Timers for how long this all takes. - * t0 and t1 are from start to finish including GPU calcs - * tp0 and tp1 only time packing and unpacking*/ - struct timespec t0, t1; // - clock_gettime(CLOCK_REALTIME, &t0); - int tasks_packed = pack_vars->tasks_packed; - - double x_tmp = 0.0, y_tmp = 0.0, z_tmp = 0.0; - /*Get the shifts in case of periodics*/ - space_getsid_GPU(e->s, &ci, &cj, &x_tmp, &y_tmp, &z_tmp); - - /*Get pointers to the list of tasks and cells packed*/ - pack_vars->task_list[tasks_packed] = t; - pack_vars->ci_list[tasks_packed] = ci; - pack_vars->cj_list[tasks_packed] = cj; - - float3 shift_tmp = {x_tmp, y_tmp, z_tmp}; - - const int count_ci = ci->hydro.count; - const int count_cj = cj->hydro.count; - - /*Assign an id for this task*/ - const int tid = tasks_packed; - /*Indexing increment per task is 2 fot these arrays*/ - const int packed_tmp = tasks_packed * 2; - - /* Find first parts in task for ci and cj. Packed_tmp is index for cell i. packed_tmp+1 is index for cell j */ - pack_vars->task_first_part[packed_tmp] = pack_vars->count_parts; - pack_vars->task_first_part[packed_tmp + 1] = pack_vars->count_parts + count_ci; - - int *count_parts = &pack_vars->count_parts; -// if(r->cpuid == 0)fprintf(stderr, "cpu %i before count %i\n", r->cpuid, pack_vars->count_parts); - /* This re-arranges the particle data from cell->hydro->parts into a - long array of part structs*/ - runner_do_ci_cj_gpu_pack_neat_aos(r, ci, cj, parts_aos, 0/*timer. 0 no timing, 1 for timing*/, - count_parts, tid, pack_vars->count_max_parts, count_ci, count_cj, shift_tmp); -// runner_doself1_gpu_pack_neat_aos(r, ci, parts_aos, 0/*timer. 0 no timing, 1 for timing*/, -// count_parts, tasks_packed, pack_vars->count_max_parts); //This may cause an issue. 
- //Be sure to test that pack_vars->count_parts - //is actually increment here - /* Find last parts in task for ci and cj. Packed_tmp is index for cell i. packed_tmp+1 is index for cell j */ - -// if(r->cpuid == 0)fprintf(stderr, "cpu %i after count %i pack_vars_count %i\n", r->cpuid, *count_parts, -// pack_vars->count_parts); - pack_vars->task_last_part[packed_tmp] = pack_vars->count_parts - count_cj; - pack_vars->task_last_part[packed_tmp + 1] = pack_vars->count_parts; - - /* Tell the cells they have been packed */ - ci->pack_done++; - cj->pack_done++; - - /* Identify first particle for each bundle of tasks */ - const int bundle_size = pack_vars->bundle_size; - if (tasks_packed % bundle_size == 0) { - int bid = tasks_packed / bundle_size; - pack_vars->bundle_first_part[bid] = pack_vars->task_first_part[packed_tmp]; - pack_vars->bundle_first_task_list[bid] = tasks_packed; - } - - /* Record that we have now done a packing (self) */ - int qid = r->qid; - atomic_dec(&(s->queues[qid].n_packs_pair_left)); - t->done = 1; - - pack_vars->tasks_packed++; - pack_vars->launch = 0; - pack_vars->launch_leftovers = 0; - if ((s->queues[qid].n_packs_pair_left == 0)) - pack_vars->launch_leftovers = 1; - if(pack_vars->tasks_packed == pack_vars->target_n_tasks) - pack_vars->launch = 1; - /*Add time to packing_time. Timer for end of GPU work after the if(launch || launch_leftovers statement)*/ - clock_gettime(CLOCK_REALTIME, &t1); - *packing_time += (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - /* Copies done. Release the lock ! */ -// task_unlock(t); - cell_unlocktree(ci); - cell_unlocktree(cj); +void runner_dopair1_pack(struct runner *r, struct scheduler *s, + struct pack_vars_pair *pack_vars, struct cell *ci, + struct cell *cj, struct task *t, + struct part_aos *parts_aos, struct engine *e, + double *packing_time) { + + /* Timers for how long this all takes. + * t0 and t1 are from start to finish including GPU calcs + * tp0 and tp1 only time packing and unpacking*/ + struct timespec t0, t1; // + clock_gettime(CLOCK_REALTIME, &t0); + int tasks_packed = pack_vars->tasks_packed; + + double x_tmp = 0.0, y_tmp = 0.0, z_tmp = 0.0; + /*Get the shifts in case of periodics*/ + space_getsid_GPU(e->s, &ci, &cj, &x_tmp, &y_tmp, &z_tmp); + + /*Get pointers to the list of tasks and cells packed*/ + pack_vars->task_list[tasks_packed] = t; + pack_vars->ci_list[tasks_packed] = ci; + pack_vars->cj_list[tasks_packed] = cj; + + float3 shift_tmp = {x_tmp, y_tmp, z_tmp}; + + const int count_ci = ci->hydro.count; + const int count_cj = cj->hydro.count; + + /*Assign an id for this task*/ + const int tid = tasks_packed; + /*Indexing increment per task is 2 fot these arrays*/ + const int packed_tmp = tasks_packed * 2; + + /* Find first parts in task for ci and cj. Packed_tmp is index for cell i. + * packed_tmp+1 is index for cell j */ + pack_vars->task_first_part[packed_tmp] = pack_vars->count_parts; + pack_vars->task_first_part[packed_tmp + 1] = + pack_vars->count_parts + count_ci; + + int *count_parts = &pack_vars->count_parts; + // if(r->cpuid == 0)fprintf(stderr, "cpu %i before count %i\n", r->cpuid, + // pack_vars->count_parts); + /* This re-arranges the particle data from cell->hydro->parts into a + long array of part structs*/ + runner_do_ci_cj_gpu_pack_neat_aos( + r, ci, cj, parts_aos, 0 /*timer. 0 no timing, 1 for timing*/, count_parts, + tid, pack_vars->count_max_parts, count_ci, count_cj, shift_tmp); + // runner_doself1_gpu_pack_neat_aos(r, ci, parts_aos, 0/*timer. 
0 no + //timing, 1 for timing*/, count_parts, tasks_packed, + //pack_vars->count_max_parts); //This may cause an issue. Be sure to test that + // pack_vars->count_parts is actually increment here + /* Find last parts in task for ci and cj. Packed_tmp is index for cell i. + * packed_tmp+1 is index for cell j */ + + // if(r->cpuid == 0)fprintf(stderr, "cpu %i after count %i pack_vars_count + // %i\n", r->cpuid, *count_parts, pack_vars->count_parts); + pack_vars->task_last_part[packed_tmp] = pack_vars->count_parts - count_cj; + pack_vars->task_last_part[packed_tmp + 1] = pack_vars->count_parts; + + /* Tell the cells they have been packed */ + ci->pack_done++; + cj->pack_done++; + + /* Identify first particle for each bundle of tasks */ + const int bundle_size = pack_vars->bundle_size; + if (tasks_packed % bundle_size == 0) { + int bid = tasks_packed / bundle_size; + pack_vars->bundle_first_part[bid] = pack_vars->task_first_part[packed_tmp]; + pack_vars->bundle_first_task_list[bid] = tasks_packed; + } + + /* Record that we have now done a packing (self) */ + int qid = r->qid; + atomic_dec(&(s->queues[qid].n_packs_pair_left)); + t->done = 1; + + pack_vars->tasks_packed++; + pack_vars->launch = 0; + pack_vars->launch_leftovers = 0; + if ((s->queues[qid].n_packs_pair_left == 0)) pack_vars->launch_leftovers = 1; + if (pack_vars->tasks_packed == pack_vars->target_n_tasks) + pack_vars->launch = 1; + /*Add time to packing_time. Timer for end of GPU work after the if(launch || + * launch_leftovers statement)*/ + clock_gettime(CLOCK_REALTIME, &t1); + *packing_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + /* Copies done. Release the lock ! */ + // task_unlock(t); + cell_unlocktree(ci); + cell_unlocktree(cj); } -double runner_dopair1_pack_f4(struct runner *r, struct scheduler *s, struct pack_vars_pair * restrict pack_vars, struct cell * ci, - struct cell * cj, struct task *t, struct part_aos_f4_send *parts_send, struct engine *e, int4 *fparti_fpartj_lparti_lpartj){ - - /* Timers for how long this all takes. - * t0 and t1 are from start to finish including GPU calcs - * tp0 and tp1 only time packing and unpacking*/ - struct timespec t0, t1; // - clock_gettime(CLOCK_REALTIME, &t0); - int tasks_packed = pack_vars->tasks_packed; - - double x_tmp = 0.0, y_tmp = 0.0, z_tmp = 0.0; - /*Get the shifts in case of periodics*/ - space_getsid_GPU(e->s, &ci, &cj, &x_tmp, &y_tmp, &z_tmp); - - /*Get pointers to the list of tasks and cells packed*/ - pack_vars->task_list[tasks_packed] = t; - pack_vars->ci_list[tasks_packed] = ci; - pack_vars->cj_list[tasks_packed] = cj; - - - float3 shift_tmp = {x_tmp, y_tmp, z_tmp}; - - const int count_ci = ci->hydro.count; - const int count_cj = cj->hydro.count; - - /*Assign an id for this task*/ - const int tid = tasks_packed; - - /* Find first parts in task for ci and cj. Packed_tmp is index for cell i. 
packed_tmp+1 is index for cell j */ -// pack_vars->task_first_part[packed_tmp] = pack_vars->count_parts; -// pack_vars->task_first_part[packed_tmp + 1] = pack_vars->count_parts + count_ci; - - fparti_fpartj_lparti_lpartj[tasks_packed].x = pack_vars->count_parts; - fparti_fpartj_lparti_lpartj[tasks_packed].y = pack_vars->count_parts + count_ci; - - int *count_parts = &pack_vars->count_parts; -// if(r->cpuid == 0)fprintf(stderr, "cpu %i before count %i\n", r->cpuid, pack_vars->count_parts); - /* This re-arranges the particle data from cell->hydro->parts into a - long array of part structs*/ - runner_do_ci_cj_gpu_pack_neat_aos_f4(r, ci, cj, parts_send, 0/*timer. 0 no timing, 1 for timing*/, - count_parts, tid, pack_vars->count_max_parts, count_ci, count_cj, shift_tmp); -// runner_doself1_gpu_pack_neat_aos(r, ci, parts_aos, 0/*timer. 0 no timing, 1 for timing*/, -// count_parts, tasks_packed, pack_vars->count_max_parts); //This may cause an issue. - //Be sure to test that pack_vars->count_parts - //is actually increment here - /* Find last parts in task for ci and cj. Packed_tmp is index for cell i. packed_tmp+1 is index for cell j */ - -// if(r->cpuid == 0)fprintf(stderr, "cpu %i after count %i pack_vars_count %i\n", r->cpuid, *count_parts, -// pack_vars->count_parts); - fparti_fpartj_lparti_lpartj[tasks_packed].z = pack_vars->count_parts - count_cj; - fparti_fpartj_lparti_lpartj[tasks_packed].w = pack_vars->count_parts; -// pack_vars->task_last_part[packed_tmp] = pack_vars->count_parts - count_cj; -// pack_vars->task_last_part[packed_tmp + 1] = pack_vars->count_parts; - - /* Tell the cells they have been packed */ - ci->pack_done++; - cj->pack_done++; - - /* Identify first particle for each bundle of tasks */ - const int bundle_size = pack_vars->bundle_size; - if (tasks_packed % bundle_size == 0) { - int bid = tasks_packed / bundle_size; - pack_vars->bundle_first_part[bid] = fparti_fpartj_lparti_lpartj[tasks_packed].x; - pack_vars->bundle_first_task_list[bid] = tasks_packed; - } - - /* Record that we have now done a packing (self) */ - int qid = r->qid; - atomic_dec(&(s->queues[qid].n_packs_pair_left)); - t->done = 1; - /* Copies done. Release the lock ! */ - task_unlock(t); - pack_vars->tasks_packed++; - pack_vars->launch = 0; - pack_vars->launch_leftovers = 0; - if ((s->queues[qid].n_packs_pair_left == 0)) - pack_vars->launch_leftovers = 1; - if(pack_vars->tasks_packed == pack_vars->target_n_tasks) - pack_vars->launch = 1; - /*Add time to packing_time. Timer for end of GPU work after the if(launch || launch_leftovers statement)*/ - clock_gettime(CLOCK_REALTIME, &t1); - return (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +double runner_dopair1_pack_f4(struct runner *r, struct scheduler *s, + struct pack_vars_pair *restrict pack_vars, + struct cell *ci, struct cell *cj, struct task *t, + struct part_aos_f4_send *parts_send, + struct engine *e, + int4 *fparti_fpartj_lparti_lpartj) { + + /* Timers for how long this all takes. 
+ * t0 and t1 are from start to finish including GPU calcs + * tp0 and tp1 only time packing and unpacking*/ + struct timespec t0, t1; // + clock_gettime(CLOCK_REALTIME, &t0); + int tasks_packed = pack_vars->tasks_packed; + + double x_tmp = 0.0, y_tmp = 0.0, z_tmp = 0.0; + /*Get the shifts in case of periodics*/ + space_getsid_GPU(e->s, &ci, &cj, &x_tmp, &y_tmp, &z_tmp); + + /*Get pointers to the list of tasks and cells packed*/ + pack_vars->task_list[tasks_packed] = t; + pack_vars->ci_list[tasks_packed] = ci; + pack_vars->cj_list[tasks_packed] = cj; + + float3 shift_tmp = {x_tmp, y_tmp, z_tmp}; + + const int count_ci = ci->hydro.count; + const int count_cj = cj->hydro.count; + + /*Assign an id for this task*/ + const int tid = tasks_packed; + + /* Find first parts in task for ci and cj. Packed_tmp is index for cell i. + * packed_tmp+1 is index for cell j */ + // pack_vars->task_first_part[packed_tmp] = pack_vars->count_parts; + // pack_vars->task_first_part[packed_tmp + 1] = pack_vars->count_parts + + // count_ci; + + fparti_fpartj_lparti_lpartj[tasks_packed].x = pack_vars->count_parts; + fparti_fpartj_lparti_lpartj[tasks_packed].y = + pack_vars->count_parts + count_ci; + + int *count_parts = &pack_vars->count_parts; + // if(r->cpuid == 0)fprintf(stderr, "cpu %i before count %i\n", r->cpuid, + // pack_vars->count_parts); + /* This re-arranges the particle data from cell->hydro->parts into a + long array of part structs*/ + runner_do_ci_cj_gpu_pack_neat_aos_f4( + r, ci, cj, parts_send, 0 /*timer. 0 no timing, 1 for timing*/, + count_parts, tid, pack_vars->count_max_parts, count_ci, count_cj, + shift_tmp); + // runner_doself1_gpu_pack_neat_aos(r, ci, parts_aos, 0/*timer. 0 no + //timing, 1 for timing*/, count_parts, tasks_packed, + //pack_vars->count_max_parts); //This may cause an issue. Be sure to test that + // pack_vars->count_parts is actually increment here + /* Find last parts in task for ci and cj. Packed_tmp is index for cell i. + * packed_tmp+1 is index for cell j */ + + // if(r->cpuid == 0)fprintf(stderr, "cpu %i after count %i pack_vars_count + // %i\n", r->cpuid, *count_parts, pack_vars->count_parts); + fparti_fpartj_lparti_lpartj[tasks_packed].z = + pack_vars->count_parts - count_cj; + fparti_fpartj_lparti_lpartj[tasks_packed].w = pack_vars->count_parts; + // pack_vars->task_last_part[packed_tmp] = pack_vars->count_parts - + // count_cj; pack_vars->task_last_part[packed_tmp + 1] = + // pack_vars->count_parts; + + /* Tell the cells they have been packed */ + ci->pack_done++; + cj->pack_done++; + + /* Identify first particle for each bundle of tasks */ + const int bundle_size = pack_vars->bundle_size; + if (tasks_packed % bundle_size == 0) { + int bid = tasks_packed / bundle_size; + pack_vars->bundle_first_part[bid] = + fparti_fpartj_lparti_lpartj[tasks_packed].x; + pack_vars->bundle_first_task_list[bid] = tasks_packed; + } + + /* Record that we have now done a packing (self) */ + int qid = r->qid; + atomic_dec(&(s->queues[qid].n_packs_pair_left)); + t->done = 1; + /* Copies done. Release the lock ! */ + task_unlock(t); + pack_vars->tasks_packed++; + pack_vars->launch = 0; + pack_vars->launch_leftovers = 0; + if ((s->queues[qid].n_packs_pair_left == 0)) pack_vars->launch_leftovers = 1; + if (pack_vars->tasks_packed == pack_vars->target_n_tasks) + pack_vars->launch = 1; + /*Add time to packing_time. 
Timer for end of GPU work after the if(launch || + * launch_leftovers statement)*/ + clock_gettime(CLOCK_REALTIME, &t1); + return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; } -void runner_dopair1_pack_g(struct runner *r, struct scheduler *s, struct pack_vars_pair *pack_vars, struct cell *ci, - struct cell *cj, struct task *t, struct part_aos_g *parts_aos, struct engine *e, double * packing_time){ - - /* Timers for how long this all takes. - * t0 and t1 are from start to finish including GPU calcs - * tp0 and tp1 only time packing and unpacking*/ - struct timespec t0, t1; // - clock_gettime(CLOCK_REALTIME, &t0); - int tasks_packed = pack_vars->tasks_packed; - const int tid_tmp = 2 * tasks_packed; - /*shifts for ci*/ - pack_vars->shiftx[tid_tmp] = 0.0; - pack_vars->shifty[tid_tmp] = 0.0; - pack_vars->shiftz[tid_tmp] = 0.0; - /*shifts for cj. Stored using strided indexing (stride of two per task)*/ - pack_vars->shiftx[tid_tmp + 1] = 0.0; - pack_vars->shifty[tid_tmp + 1] = 0.0; - pack_vars->shiftz[tid_tmp + 1] = 0.0; - - double x_tmp = 0.0, y_tmp = 0.0, z_tmp = 0.0; - /*Get the shifts in case of periodics*/ - space_getsid_GPU(e->s, &ci, &cj, &x_tmp, &y_tmp, &z_tmp); - - /*Get pointers to the list of tasks and cells packed*/ - pack_vars->task_list[tasks_packed] = t; - pack_vars->ci_list[tasks_packed] = ci; - pack_vars->cj_list[tasks_packed] = cj; - - const double cjx = cj->loc[0]; - const double cjy = cj->loc[1]; - const double cjz = cj->loc[2]; - - /*Correct the shifts for cell i*/ - pack_vars->shiftx[tid_tmp] = x_tmp + cjx; - pack_vars->shifty[tid_tmp] = y_tmp + cjy; - pack_vars->shiftz[tid_tmp] = z_tmp + cjz; - /*Shift for cell j is it's position. Stored using strided indexing (stride of two per task)*/ - pack_vars->shiftx[tid_tmp + 1] = cjx; - pack_vars->shifty[tid_tmp + 1] = cjy; - pack_vars->shiftz[tid_tmp + 1] = cjz; - - const int count_ci = ci->hydro.count; - const int count_cj = cj->hydro.count; - - /*Assign an id for this task*/ - const int tid = tasks_packed; - /*Indexing increment per task is 2 fot these arrays*/ - const int packed_tmp = tasks_packed * 2; - - /* Find first parts in task for ci and cj. Packed_tmp is index for cell i. packed_tmp+1 is index for cell j */ - pack_vars->task_first_part[packed_tmp] = pack_vars->count_parts; - pack_vars->task_first_part[packed_tmp + 1] = pack_vars->count_parts + count_ci; - - int *count_parts = &pack_vars->count_parts; - /* This re-arranges the particle data from cell->hydro->parts into a - long array of part structs*/ - runner_do_ci_cj_gpu_pack_neat_aos_g(r, ci, cj, parts_aos, 0/*timer. 0 no timing, 1 for timing*/, - count_parts, tid, pack_vars->count_max_parts, count_ci, count_cj); - /* Find last parts in task for ci and cj. Packed_tmp is index for cell i. packed_tmp+1 is index for cell j */ - - pack_vars->task_last_part[packed_tmp] = pack_vars->count_parts - count_cj; - pack_vars->task_last_part[packed_tmp + 1] = pack_vars->count_parts; - - /* Tell the cells they have been packed */ - ci->pack_done_g++; - cj->pack_done_g++; - - /* Identify first particle for each bundle of tasks */ - const int bundle_size = pack_vars->bundle_size; - if (tasks_packed % bundle_size == 0) { - int bid = tasks_packed / bundle_size; - pack_vars->bundle_first_part[bid] = pack_vars->task_first_part[packed_tmp]; - pack_vars->bundle_first_task_list[bid] = tasks_packed; - } - - /* Record that we have now done a packing (self) */ - int qid = r->qid; - atomic_dec(&(s->queues[qid].n_packs_pair_left_g)); - t->done = 1; - /* Copies done. 
Release the lock ! */
-  task_unlock(t);
-  pack_vars->tasks_packed++;
-  pack_vars->launch = 0;
-  pack_vars->launch_leftovers = 0;
-  if ((s->queues[qid].n_packs_pair_left_g == 0))
-    pack_vars->launch_leftovers = 1;
-  if(pack_vars->tasks_packed == pack_vars->target_n_tasks)
-    pack_vars->launch = 1;
-  /*Add time to packing_time. Timer for end of GPU work after the if(launch || launch_leftovers statement)*/
-  clock_gettime(CLOCK_REALTIME, &t1);
-  *packing_time += (t1.tv_sec - t0.tv_sec) +
-                   (t1.tv_nsec - t0.tv_nsec) / 1000000000.0;
+void runner_dopair1_pack_g(struct runner *r, struct scheduler *s,
+                           struct pack_vars_pair *pack_vars, struct cell *ci,
+                           struct cell *cj, struct task *t,
+                           struct part_aos_g *parts_aos, struct engine *e,
+                           double *packing_time) {
+
+  /* Timers for how long this all takes.
+   * t0 and t1 are from start to finish including GPU calcs
+   * tp0 and tp1 only time packing and unpacking*/
+  struct timespec t0, t1;  //
+  clock_gettime(CLOCK_REALTIME, &t0);
+  int tasks_packed = pack_vars->tasks_packed;
+  const int tid_tmp = 2 * tasks_packed;
+  /*shifts for ci*/
+  pack_vars->shiftx[tid_tmp] = 0.0;
+  pack_vars->shifty[tid_tmp] = 0.0;
+  pack_vars->shiftz[tid_tmp] = 0.0;
+  /*shifts for cj. Stored using strided indexing (stride of two per task)*/
+  pack_vars->shiftx[tid_tmp + 1] = 0.0;
+  pack_vars->shifty[tid_tmp + 1] = 0.0;
+  pack_vars->shiftz[tid_tmp + 1] = 0.0;
+
+  double x_tmp = 0.0, y_tmp = 0.0, z_tmp = 0.0;
+  /*Get the shifts in case of periodics*/
+  space_getsid_GPU(e->s, &ci, &cj, &x_tmp, &y_tmp, &z_tmp);
+
+  /*Get pointers to the list of tasks and cells packed*/
+  pack_vars->task_list[tasks_packed] = t;
+  pack_vars->ci_list[tasks_packed] = ci;
+  pack_vars->cj_list[tasks_packed] = cj;
+
+  const double cjx = cj->loc[0];
+  const double cjy = cj->loc[1];
+  const double cjz = cj->loc[2];
+
+  /*Correct the shifts for cell i*/
+  pack_vars->shiftx[tid_tmp] = x_tmp + cjx;
+  pack_vars->shifty[tid_tmp] = y_tmp + cjy;
+  pack_vars->shiftz[tid_tmp] = z_tmp + cjz;
+  /*Shift for cell j is its position. Stored using strided indexing (stride of
+   * two per task)*/
+  pack_vars->shiftx[tid_tmp + 1] = cjx;
+  pack_vars->shifty[tid_tmp + 1] = cjy;
+  pack_vars->shiftz[tid_tmp + 1] = cjz;
+
+  const int count_ci = ci->hydro.count;
+  const int count_cj = cj->hydro.count;
+
+  /*Assign an id for this task*/
+  const int tid = tasks_packed;
+  /*Indexing increment per task is 2 for these arrays*/
+  const int packed_tmp = tasks_packed * 2;
+
+  /* Find first parts in task for ci and cj. Packed_tmp is index for cell i.
+   * packed_tmp+1 is index for cell j */
+  pack_vars->task_first_part[packed_tmp] = pack_vars->count_parts;
+  pack_vars->task_first_part[packed_tmp + 1] =
+      pack_vars->count_parts + count_ci;
+
+  int *count_parts = &pack_vars->count_parts;
+  /* This re-arranges the particle data from cell->hydro->parts into a
+  long array of part structs*/
+  runner_do_ci_cj_gpu_pack_neat_aos_g(
+      r, ci, cj, parts_aos, 0 /*timer. 0 no timing, 1 for timing*/, count_parts,
+      tid, pack_vars->count_max_parts, count_ci, count_cj);
+  /* Find last parts in task for ci and cj. Packed_tmp is index for cell i.
+ * packed_tmp+1 is index for cell j */ + + pack_vars->task_last_part[packed_tmp] = pack_vars->count_parts - count_cj; + pack_vars->task_last_part[packed_tmp + 1] = pack_vars->count_parts; + + /* Tell the cells they have been packed */ + ci->pack_done_g++; + cj->pack_done_g++; + + /* Identify first particle for each bundle of tasks */ + const int bundle_size = pack_vars->bundle_size; + if (tasks_packed % bundle_size == 0) { + int bid = tasks_packed / bundle_size; + pack_vars->bundle_first_part[bid] = pack_vars->task_first_part[packed_tmp]; + pack_vars->bundle_first_task_list[bid] = tasks_packed; + } + + /* Record that we have now done a packing (self) */ + int qid = r->qid; + atomic_dec(&(s->queues[qid].n_packs_pair_left_g)); + t->done = 1; + /* Copies done. Release the lock ! */ + task_unlock(t); + pack_vars->tasks_packed++; + pack_vars->launch = 0; + pack_vars->launch_leftovers = 0; + if ((s->queues[qid].n_packs_pair_left_g == 0)) + pack_vars->launch_leftovers = 1; + if (pack_vars->tasks_packed == pack_vars->target_n_tasks) + pack_vars->launch = 1; + /*Add time to packing_time. Timer for end of GPU work after the if(launch || + * launch_leftovers statement)*/ + clock_gettime(CLOCK_REALTIME, &t1); + *packing_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; } +double runner_dopair1_pack_f4_g(struct runner *r, struct scheduler *s, + struct pack_vars_pair *restrict pack_vars, + struct cell *ci, struct cell *cj, + struct task *t, + struct part_aos_f4_g_send *parts_send, + struct engine *e, + int4 *fparti_fpartj_lparti_lpartj) { + + /* Timers for how long this all takes. + * t0 and t1 are from start to finish including GPU calcs + * tp0 and tp1 only time packing and unpacking*/ + struct timespec t0, t1; // + clock_gettime(CLOCK_REALTIME, &t0); + int tasks_packed = pack_vars->tasks_packed; + + double x_tmp = 0.0, y_tmp = 0.0, z_tmp = 0.0; + /*Get the shifts in case of periodics*/ + space_getsid_GPU(e->s, &ci, &cj, &x_tmp, &y_tmp, &z_tmp); + + /*Get pointers to the list of tasks and cells packed*/ + pack_vars->task_list[tasks_packed] = t; + pack_vars->ci_list[tasks_packed] = ci; + pack_vars->cj_list[tasks_packed] = cj; + + float3 shift_tmp = {x_tmp, y_tmp, z_tmp}; + + const int count_ci = ci->hydro.count; + const int count_cj = cj->hydro.count; + + /*Assign an id for this task*/ + const int tid = tasks_packed; + + /* Find first parts in task for ci and cj. Packed_tmp is index for cell i. + * packed_tmp+1 is index for cell j */ + // pack_vars->task_first_part[packed_tmp] = pack_vars->count_parts; + // pack_vars->task_first_part[packed_tmp + 1] = pack_vars->count_parts + + // count_ci; + + fparti_fpartj_lparti_lpartj[tasks_packed].x = pack_vars->count_parts; + fparti_fpartj_lparti_lpartj[tasks_packed].y = + pack_vars->count_parts + count_ci; + + int *count_parts = &pack_vars->count_parts; + // if(r->cpuid == 0)fprintf(stderr, "cpu %i before count %i\n", r->cpuid, + // pack_vars->count_parts); + /* This re-arranges the particle data from cell->hydro->parts into a + long array of part structs*/ + runner_do_ci_cj_gpu_pack_neat_aos_f4_g( + r, ci, cj, parts_send, 0 /*timer. 0 no timing, 1 for timing*/, + count_parts, tid, pack_vars->count_max_parts, count_ci, count_cj, + shift_tmp); + // runner_doself1_gpu_pack_neat_aos(r, ci, parts_aos, 0/*timer. 0 no + //timing, 1 for timing*/, count_parts, tasks_packed, + //pack_vars->count_max_parts); //This may cause an issue. 
Be sure to test that + // pack_vars->count_parts is actually increment here + /* Find last parts in task for ci and cj. Packed_tmp is index for cell i. + * packed_tmp+1 is index for cell j */ + + // if(r->cpuid == 0)fprintf(stderr, "cpu %i after count %i pack_vars_count + // %i\n", r->cpuid, *count_parts, pack_vars->count_parts); + fparti_fpartj_lparti_lpartj[tasks_packed].z = + pack_vars->count_parts - count_cj; + fparti_fpartj_lparti_lpartj[tasks_packed].w = pack_vars->count_parts; + // pack_vars->task_last_part[packed_tmp] = pack_vars->count_parts - + // count_cj; pack_vars->task_last_part[packed_tmp + 1] = + // pack_vars->count_parts; + + /* Tell the cells they have been packed */ + ci->pack_done_g++; + cj->pack_done_g++; + + /* Identify first particle for each bundle of tasks */ + const int bundle_size = pack_vars->bundle_size; + if (tasks_packed % bundle_size == 0) { + int bid = tasks_packed / bundle_size; + pack_vars->bundle_first_part[bid] = + fparti_fpartj_lparti_lpartj[tasks_packed].x; + pack_vars->bundle_first_task_list[bid] = tasks_packed; + } + + /* Record that we have now done a packing (self) */ + int qid = r->qid; + atomic_dec(&(s->queues[qid].n_packs_pair_left_g)); + t->done = 1; + /* Copies done. Release the lock ! */ + task_unlock(t); + pack_vars->tasks_packed++; + pack_vars->launch = 0; + pack_vars->launch_leftovers = 0; + if ((s->queues[qid].n_packs_pair_left_g == 0)) + pack_vars->launch_leftovers = 1; + if (pack_vars->tasks_packed == pack_vars->target_n_tasks) + pack_vars->launch = 1; + /*Add time to packing_time. Timer for end of GPU work after the if(launch || + * launch_leftovers statement)*/ + clock_gettime(CLOCK_REALTIME, &t1); + return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +} -double runner_dopair1_pack_f4_g(struct runner *r, struct scheduler *s, struct pack_vars_pair * restrict pack_vars, struct cell * ci, - struct cell * cj, struct task *t, struct part_aos_f4_g_send *parts_send, struct engine *e, int4 *fparti_fpartj_lparti_lpartj){ +void runner_dopair1_pack_f(struct runner *r, struct scheduler *s, + struct pack_vars_pair *pack_vars, struct cell *ci, + struct cell *cj, struct task *t, + struct part_aos_f *parts_aos, struct engine *e, + double *packing_time) { + + /* Timers for how long this all takes. + * t0 and t1 are from start to finish including GPU calcs + * tp0 and tp1 only time packing and unpacking*/ + struct timespec t0, t1; // + clock_gettime(CLOCK_REALTIME, &t0); + int tasks_packed = + pack_vars->tasks_packed; // Copy pasted this code again. Issue isn't here + const int tid_tmp = 2 * tasks_packed; + /*shifts for ci*/ + pack_vars->shiftx[tid_tmp] = 0.0; + pack_vars->shifty[tid_tmp] = 0.0; + pack_vars->shiftz[tid_tmp] = 0.0; + /*shifts for cj. 
Stored using strided indexing (stride of two per task)*/ + pack_vars->shiftx[tid_tmp + 1] = 0.0; + pack_vars->shifty[tid_tmp + 1] = 0.0; + pack_vars->shiftz[tid_tmp + 1] = 0.0; + + double x_tmp = 0.0, y_tmp = 0.0, z_tmp = 0.0; + /*Get the shifts in case of periodics*/ + space_getsid_GPU(e->s, &ci, &cj, &x_tmp, &y_tmp, &z_tmp); + + /*Get pointers to the list of tasks and cells packed*/ + pack_vars->task_list[tasks_packed] = t; + pack_vars->ci_list[tasks_packed] = ci; + pack_vars->cj_list[tasks_packed] = cj; + + const double cjx = cj->loc[0]; + const double cjy = cj->loc[1]; + const double cjz = cj->loc[2]; + + /*Correct the shifts for cell i*/ + pack_vars->shiftx[tid_tmp] = x_tmp + cjx; + pack_vars->shifty[tid_tmp] = y_tmp + cjy; + pack_vars->shiftz[tid_tmp] = z_tmp + cjz; + /*Shift for cell j is it's position. Stored using strided indexing (stride of + * two per task)*/ + pack_vars->shiftx[tid_tmp + 1] = cjx; + pack_vars->shifty[tid_tmp + 1] = cjy; + pack_vars->shiftz[tid_tmp + 1] = cjz; + + const int count_ci = ci->hydro.count; + const int count_cj = cj->hydro.count; + + /*Assign an id for this task*/ + const int tid = tasks_packed; + /*Indexing increment per task is 2 fot these arrays*/ + const int packed_tmp = tasks_packed * 2; + + /* Find first parts in task for ci and cj. Packed_tmp is index for cell i. + * packed_tmp+1 is index for cell j */ + pack_vars->task_first_part[packed_tmp] = pack_vars->count_parts; + pack_vars->task_first_part[packed_tmp + 1] = + pack_vars->count_parts + count_ci; + + int *count_parts = &pack_vars->count_parts; + /* This re-arranges the particle data from cell->hydro->parts into a + long array of part structs*/ + runner_do_ci_cj_gpu_pack_neat_aos_f( + r, ci, cj, parts_aos, 0 /*timer. 0 no timing, 1 for timing*/, count_parts, + tid, pack_vars->count_max_parts, count_ci, count_cj); + /* Find last parts in task for ci and cj. Packed_tmp is index for cell i. + * packed_tmp+1 is index for cell j */ + + pack_vars->task_last_part[packed_tmp] = pack_vars->count_parts - count_cj; + pack_vars->task_last_part[packed_tmp + 1] = pack_vars->count_parts; + + /* Tell the cells they have been packed */ + ci->pack_done_f++; + cj->pack_done_f++; + + /* Identify first particle for each bundle of tasks */ + const int bundle_size = pack_vars->bundle_size; + if (tasks_packed % bundle_size == 0) { + int bid = tasks_packed / bundle_size; + pack_vars->bundle_first_part[bid] = pack_vars->task_first_part[packed_tmp]; + pack_vars->bundle_first_task_list[bid] = tasks_packed; + } + + /* Record that we have now done a packing (self) */ + int qid = r->qid; + atomic_dec(&(s->queues[qid].n_packs_pair_left_f)); + t->done = 1; + /* Copies done. Release the lock ! */ + task_unlock(t); + pack_vars->tasks_packed++; + pack_vars->launch = 0; + pack_vars->launch_leftovers = 0; + if ((s->queues[qid].n_packs_pair_left_f == 0)) + pack_vars->launch_leftovers = 1; + if (pack_vars->tasks_packed == pack_vars->target_n_tasks) + pack_vars->launch = 1; + /*Add time to packing_time. Timer for end of GPU work after the if(launch || + * launch_leftovers statement)*/ + clock_gettime(CLOCK_REALTIME, &t1); + *packing_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +} - /* Timers for how long this all takes. 
- * t0 and t1 are from start to finish including GPU calcs - * tp0 and tp1 only time packing and unpacking*/ - struct timespec t0, t1; // - clock_gettime(CLOCK_REALTIME, &t0); - int tasks_packed = pack_vars->tasks_packed; - - double x_tmp = 0.0, y_tmp = 0.0, z_tmp = 0.0; - /*Get the shifts in case of periodics*/ - space_getsid_GPU(e->s, &ci, &cj, &x_tmp, &y_tmp, &z_tmp); - - /*Get pointers to the list of tasks and cells packed*/ - pack_vars->task_list[tasks_packed] = t; - pack_vars->ci_list[tasks_packed] = ci; - pack_vars->cj_list[tasks_packed] = cj; - - - float3 shift_tmp = {x_tmp, y_tmp, z_tmp}; - - const int count_ci = ci->hydro.count; - const int count_cj = cj->hydro.count; - - /*Assign an id for this task*/ - const int tid = tasks_packed; - - /* Find first parts in task for ci and cj. Packed_tmp is index for cell i. packed_tmp+1 is index for cell j */ -// pack_vars->task_first_part[packed_tmp] = pack_vars->count_parts; -// pack_vars->task_first_part[packed_tmp + 1] = pack_vars->count_parts + count_ci; - - fparti_fpartj_lparti_lpartj[tasks_packed].x = pack_vars->count_parts; - fparti_fpartj_lparti_lpartj[tasks_packed].y = pack_vars->count_parts + count_ci; - - int *count_parts = &pack_vars->count_parts; -// if(r->cpuid == 0)fprintf(stderr, "cpu %i before count %i\n", r->cpuid, pack_vars->count_parts); - /* This re-arranges the particle data from cell->hydro->parts into a - long array of part structs*/ - runner_do_ci_cj_gpu_pack_neat_aos_f4_g(r, ci, cj, parts_send, 0/*timer. 0 no timing, 1 for timing*/, - count_parts, tid, pack_vars->count_max_parts, count_ci, count_cj, shift_tmp); -// runner_doself1_gpu_pack_neat_aos(r, ci, parts_aos, 0/*timer. 0 no timing, 1 for timing*/, -// count_parts, tasks_packed, pack_vars->count_max_parts); //This may cause an issue. - //Be sure to test that pack_vars->count_parts - //is actually increment here - /* Find last parts in task for ci and cj. Packed_tmp is index for cell i. packed_tmp+1 is index for cell j */ - -// if(r->cpuid == 0)fprintf(stderr, "cpu %i after count %i pack_vars_count %i\n", r->cpuid, *count_parts, -// pack_vars->count_parts); - fparti_fpartj_lparti_lpartj[tasks_packed].z = pack_vars->count_parts - count_cj; - fparti_fpartj_lparti_lpartj[tasks_packed].w = pack_vars->count_parts; -// pack_vars->task_last_part[packed_tmp] = pack_vars->count_parts - count_cj; -// pack_vars->task_last_part[packed_tmp + 1] = pack_vars->count_parts; - - /* Tell the cells they have been packed */ - ci->pack_done_g++; - cj->pack_done_g++; - - /* Identify first particle for each bundle of tasks */ - const int bundle_size = pack_vars->bundle_size; - if (tasks_packed % bundle_size == 0) { - int bid = tasks_packed / bundle_size; - pack_vars->bundle_first_part[bid] = fparti_fpartj_lparti_lpartj[tasks_packed].x; - pack_vars->bundle_first_task_list[bid] = tasks_packed; - } - - /* Record that we have now done a packing (self) */ - int qid = r->qid; - atomic_dec(&(s->queues[qid].n_packs_pair_left_g)); - t->done = 1; - /* Copies done. Release the lock ! */ - task_unlock(t); - pack_vars->tasks_packed++; - pack_vars->launch = 0; - pack_vars->launch_leftovers = 0; - if ((s->queues[qid].n_packs_pair_left_g == 0)) - pack_vars->launch_leftovers = 1; - if(pack_vars->tasks_packed == pack_vars->target_n_tasks) - pack_vars->launch = 1; - /*Add time to packing_time. 
Timer for end of GPU work after the if(launch || launch_leftovers statement)*/ - clock_gettime(CLOCK_REALTIME, &t1); - return (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +double runner_dopair1_pack_f4_f(struct runner *r, struct scheduler *s, + struct pack_vars_pair *restrict pack_vars, + struct cell *ci, struct cell *cj, + struct task *t, + struct part_aos_f4_f_send *parts_send, + struct engine *e, + int4 *fparti_fpartj_lparti_lpartj) { + + /* Timers for how long this all takes. + * t0 and t1 are from start to finish including GPU calcs + * tp0 and tp1 only time packing and unpacking*/ + struct timespec t0, t1; // + clock_gettime(CLOCK_REALTIME, &t0); + int tasks_packed = pack_vars->tasks_packed; + + double x_tmp = 0.0, y_tmp = 0.0, z_tmp = 0.0; + /*Get the shifts in case of periodics*/ + space_getsid_GPU(e->s, &ci, &cj, &x_tmp, &y_tmp, &z_tmp); + + /*Get pointers to the list of tasks and cells packed*/ + pack_vars->task_list[tasks_packed] = t; + pack_vars->ci_list[tasks_packed] = ci; + pack_vars->cj_list[tasks_packed] = cj; + + float3 shift_tmp = {x_tmp, y_tmp, z_tmp}; + + const int count_ci = ci->hydro.count; + const int count_cj = cj->hydro.count; + + /*Assign an id for this task*/ + const int tid = tasks_packed; + + /* Find first parts in task for ci and cj. Packed_tmp is index for cell i. + * packed_tmp+1 is index for cell j */ + // pack_vars->task_first_part[packed_tmp] = pack_vars->count_parts; + // pack_vars->task_first_part[packed_tmp + 1] = pack_vars->count_parts + + // count_ci; + + fparti_fpartj_lparti_lpartj[tasks_packed].x = pack_vars->count_parts; + fparti_fpartj_lparti_lpartj[tasks_packed].y = + pack_vars->count_parts + count_ci; + + int *count_parts = &pack_vars->count_parts; + // if(r->cpuid == 0)fprintf(stderr, "cpu %i before count %i\n", r->cpuid, + // pack_vars->count_parts); + /* This re-arranges the particle data from cell->hydro->parts into a + long array of part structs*/ + runner_do_ci_cj_gpu_pack_neat_aos_f4_f( + r, ci, cj, parts_send, 0 /*timer. 0 no timing, 1 for timing*/, + count_parts, tid, pack_vars->count_max_parts, count_ci, count_cj, + shift_tmp); + // runner_doself1_gpu_pack_neat_aos(r, ci, parts_aos, 0/*timer. 0 no + //timing, 1 for timing*/, count_parts, tasks_packed, + //pack_vars->count_max_parts); //This may cause an issue. Be sure to test that + // pack_vars->count_parts is actually increment here + /* Find last parts in task for ci and cj. Packed_tmp is index for cell i. 
+ * packed_tmp+1 is index for cell j */ + + // if(r->cpuid == 0)fprintf(stderr, "cpu %i after count %i pack_vars_count + // %i\n", r->cpuid, *count_parts, pack_vars->count_parts); + fparti_fpartj_lparti_lpartj[tasks_packed].z = + pack_vars->count_parts - count_cj; + fparti_fpartj_lparti_lpartj[tasks_packed].w = pack_vars->count_parts; + // pack_vars->task_last_part[packed_tmp] = pack_vars->count_parts - + // count_cj; pack_vars->task_last_part[packed_tmp + 1] = + // pack_vars->count_parts; + + /* Tell the cells they have been packed */ + ci->pack_done_f++; + cj->pack_done_f++; + + /* Identify first particle for each bundle of tasks */ + const int bundle_size = pack_vars->bundle_size; + if (tasks_packed % bundle_size == 0) { + int bid = tasks_packed / bundle_size; + pack_vars->bundle_first_part[bid] = + fparti_fpartj_lparti_lpartj[tasks_packed].x; + pack_vars->bundle_first_task_list[bid] = tasks_packed; + } + + /* Record that we have now done a packing (self) */ + int qid = r->qid; + atomic_dec(&(s->queues[qid].n_packs_pair_left_f)); + t->done = 1; + /* Copies done. Release the lock ! */ + task_unlock(t); + pack_vars->tasks_packed++; + pack_vars->launch = 0; + pack_vars->launch_leftovers = 0; + if ((s->queues[qid].n_packs_pair_left_f == 0)) + pack_vars->launch_leftovers = 1; + if (pack_vars->tasks_packed == pack_vars->target_n_tasks) + pack_vars->launch = 1; + /*Add time to packing_time. Timer for end of GPU work after the if(launch || + * launch_leftovers statement)*/ + clock_gettime(CLOCK_REALTIME, &t1); + return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; } +void runner_doself1_launch(struct runner *r, struct scheduler *s, + struct pack_vars_self *pack_vars, struct cell *ci, + struct task *t, struct part_aos *parts_aos, + struct part_aos *d_parts_aos, cudaStream_t *stream, + float d_a, float d_H, struct engine *e, + double *packing_time, double *gpu_time, + double *hmemcpy_time) { + + struct timespec t0, t1, t0hmemcpy, t1hmemcpy, tp0, tp1; // + clock_gettime(CLOCK_REALTIME, &t0); + + /* Identify the number of GPU bundles to run in ideal case*/ + int nBundles_temp = pack_vars->nBundles; + + /*How many tasks have we packed?*/ + const int tasks_packed = pack_vars->tasks_packed; + + /*How many tasks should be in a bundle?*/ + const int bundle_size = pack_vars->bundle_size; + + /* Special case for incomplete bundles (when having leftover tasks not enough + * to fill a bundle) */ + if (pack_vars->launch_leftovers) { + nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; + if (tasks_packed == 0) + error("zero tasks packed but somehow got into GPU loop"); + pack_vars->bundle_first_part[nBundles_temp] = + pack_vars->task_first_part[tasks_packed - 1]; + } + /* Identify the last particle for each bundle of tasks */ + for (int bid = 0; bid < nBundles_temp - 1; bid++) { + pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; + } + /* special treatment for the last bundle */ + if (nBundles_temp > 1) + pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; + else + pack_vars->bundle_last_part[0] = pack_vars->count_parts; + clock_gettime(CLOCK_REALTIME, &t0hmemcpy); + /*Copy arrays containing first and last part for each task to GPU*/ + cudaMemcpy(pack_vars->d_task_first_part, pack_vars->task_first_part, + tasks_packed * sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(pack_vars->d_task_last_part, pack_vars->task_last_part, + tasks_packed * sizeof(int), cudaMemcpyHostToDevice); + + /*Copy cell shifts to device*/ + 
cudaMemcpy(pack_vars->d_cellx, pack_vars->cellx,
+             tasks_packed * sizeof(double), cudaMemcpyHostToDevice);
+  cudaMemcpy(pack_vars->d_celly, pack_vars->celly,
+             tasks_packed * sizeof(double), cudaMemcpyHostToDevice);
+  cudaMemcpy(pack_vars->d_cellz, pack_vars->cellz,
+             tasks_packed * sizeof(double), cudaMemcpyHostToDevice);
+  clock_gettime(CLOCK_REALTIME, &t1hmemcpy);
+  *hmemcpy_time += (t1hmemcpy.tv_sec - t0hmemcpy.tv_sec) +
+                   (t1hmemcpy.tv_nsec - t0hmemcpy.tv_nsec) / 1000000000.0;
+  /* Launch the copies for each bundle and run the GPU kernel */
+  /* We never enter this loop if tasks_left_self == 1, as nBundles_temp
+   * would be zero in that case */
+  int max_parts = 0;
+  for (int bid = 0; bid < nBundles_temp; bid++) {
+
+    max_parts = 0;
+    int parts_in_bundle = 0;
+    for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) {
+      if (tid < tasks_packed) {
+        /*Get an estimate for the max number of parts per cell in the bundle.
+         * Used for determining the number of GPU CUDA blocks*/
+        int count =
+            pack_vars->task_last_part[tid] - pack_vars->task_first_part[tid];
+        parts_in_bundle += count;
+        max_parts = max(max_parts, count);
+      }
+    }
+
+    const int first_part_tmp = pack_vars->bundle_first_part[bid];
+    const int bundle_n_parts =
+        pack_vars->bundle_last_part[bid] - first_part_tmp;
+
+    cudaMemcpyAsync(&d_parts_aos[first_part_tmp], &parts_aos[first_part_tmp],
+                    bundle_n_parts * sizeof(struct part_aos),
+                    cudaMemcpyHostToDevice, stream[bid]);
+
+    // #ifdef CUDA_DEBUG
+    //       cudaError_t cu_error = cudaPeekAtLastError(); // cudaGetLastError();
+    ////
+    //       //
+    //Get error code if (cu_error != cudaSuccess) { fprintf( stderr, "CUDA error
+    //in density self host 2 device memcpy: %s cpuid id is: %i\n ",
+    //       cudaGetErrorString(cu_error), r->cpuid);
+    //       exit(0);
+    //     }
+    // #endif
+    const int tasksperbundle = pack_vars->tasksperbundle;
+    int tasks_left = tasksperbundle;
+    if (bid == nBundles_temp - 1) {
+      tasks_left = tasks_packed - (nBundles_temp - 1) * tasksperbundle;
+    }
+    // Will launch a 2d grid of GPU thread blocks (number of tasks is
+    // the y dimension and max_parts is the x dimension)
+    int numBlocks_y = tasks_left;
+    int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE;
+    int bundle_first_task = pack_vars->bundle_first_task_list[bid];
+    const char *loop_type = "density";
+    //      fprintf(stderr, "Launching kernel with %i tasks leftovers %i\n",
+    //      tasks_packed, pack_vars->launch_leftovers);
+    // Launch the kernel
+    launch_density_aos(
+        d_parts_aos, pack_vars->d_task_first_part, pack_vars->d_task_last_part,
+        d_a, d_H, loop_type, stream[bid], BLOCK_SIZE, tasks_packed,
+        tasksperbundle, numBlocks_x, numBlocks_y, bundle_first_task, max_parts,
+        pack_vars->d_cellx, pack_vars->d_celly, pack_vars->d_cellz);
+    // #ifdef CUDA_DEBUG
+    //       cu_error = cudaPeekAtLastError(); // Get error code
+    //       if (cu_error != cudaSuccess) {
+    //         fprintf(stderr,
+    //                 "CUDA error with self density kernel launch: %s
+    //cpuid id is: %i\n ", cudaGetErrorString(cu_error), r->cpuid); exit(0);
+    //       }
+    // #endif
+    cudaMemcpyAsync(&parts_aos[first_part_tmp], &d_parts_aos[first_part_tmp],
+                    bundle_n_parts * sizeof(struct part_aos),
+                    cudaMemcpyDeviceToHost, stream[bid]);
+
+    // #ifdef CUDA_DEBUG
+    //       cu_error = cudaPeekAtLastError(); // cudaGetLastError(); //
+    //       //
+    //Get error code if (cu_error != cudaSuccess) { fprintf(stderr, "CUDA error
+    //with self density D2H memcpy: %s cpuid id is: %i\n ",
+    //       cudaGetErrorString(cu_error), r->cpuid);
+    //       error("Something's up with your cuda code");
+    //     }
+
// #endif + } /*End of looping over bundles to launch in streams*/ + /* Make sure all the kernels and copies back are finished */ + cudaDeviceSynchronize(); + + /*Time end of GPU work*/ + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + /*Time unpacking*/ + clock_gettime(CLOCK_REALTIME, &tp0); + /* Now copy the data back from the CPU thread-local buffers to the cells */ + /* Pack length counter for use in unpacking */ + int pack_length_unpack = 0; + for (int tid = 0; tid < tasks_packed; tid++) { + + struct cell *cii = pack_vars->cell_list[tid]; + struct task *tii = pack_vars->task_list[tid]; + + // struct cell *cii = ci_list_self_dens[tid]; + // struct task *tii = task_list_self_dens[tid]; + + while (cell_locktree(cii)) { + ; /* spin until we acquire the lock */ + } + /* Do the copy */ + runner_doself1_gpu_unpack_neat_aos(r, cii, parts_aos, 0, + &pack_length_unpack, tid, + pack_vars->count_max_parts, e); + + /* Record things for debugging */ + cii->gpu_done++; + + /* Release the lock */ + cell_unlocktree(cii); + + /*schedule my dependencies (Only unpacks really)*/ + enqueue_dependencies(s, tii); + /*Signal sleeping runners*/ + signal_sleeping_runners(s, tii); + + tii->gpu_done = 1; + } + /* Zero counters for the next pack operations */ + pack_vars->count_parts = 0; + pack_vars->tasks_packed = 0; + /*Time end of unpacking*/ + clock_gettime(CLOCK_REALTIME, &tp1); + *packing_time += + (tp1.tv_sec - tp0.tv_sec) + (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; +} /*End of GPU work Self*/ -void runner_dopair1_pack_f(struct runner *r, struct scheduler *s, struct pack_vars_pair *pack_vars, struct cell *ci, - struct cell *cj, struct task *t, struct part_aos_f *parts_aos, struct engine *e, double *packing_time){ +void runner_doself1_launch_f4( + struct runner *r, struct scheduler *s, struct pack_vars_self *pack_vars, + struct cell *ci, struct task *t, struct part_aos_f4_send *parts_send, + struct part_aos_f4_recv *parts_recv, struct part_aos_f4_send *d_parts_send, + struct part_aos_f4_recv *d_parts_recv, cudaStream_t *stream, float d_a, + float d_H, struct engine *e, double *packing_time, double *gpu_time, + double *unpack_time, int2 *d_task_first_part_self_dens_f4, int devId, + int2 *task_first_part_f4, int2 *d_task_first_part_f4, + cudaEvent_t *self_end) { + + struct timespec t0, t1, tp0, tp1; // + clock_gettime(CLOCK_REALTIME, &t0); + + /* Identify the number of GPU bundles to run in ideal case*/ + int nBundles_temp = pack_vars->nBundles; + + /*How many tasks have we packed?*/ + const int tasks_packed = pack_vars->tasks_packed; + + /*How many tasks should be in a bundle?*/ + const int bundle_size = pack_vars->bundle_size; + + /* Special case for incomplete bundles (when having leftover tasks not enough + * to fill a bundle) */ + if (pack_vars->launch_leftovers) { + nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; + if (tasks_packed == 0) + error("zero tasks packed but somehow got into GPU loop"); + // pack_vars->bundle_first_part[nBundles_temp] = + //pack_vars->task_first_part[tasks_packed - 1]; + pack_vars->bundle_first_part[nBundles_temp] = + task_first_part_f4[tasks_packed - 1].x; + } + /* Identify the last particle for each bundle of tasks */ + for (int bid = 0; bid < nBundles_temp - 1; bid++) { + pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; + } + /* special treatment for the last bundle */ + if (nBundles_temp > 1) + pack_vars->bundle_last_part[nBundles_temp - 1] = 
pack_vars->count_parts;
+  else
+    pack_vars->bundle_last_part[0] = pack_vars->count_parts;
+  //  clock_gettime(CLOCK_REALTIME, &t0hmemcpy);
+  /*Copy arrays containing first and last part for each task to GPU*/
+  //  cudaMemcpy(pack_vars->d_task_first_part, pack_vars->task_first_part,
+  //             tasks_packed * sizeof(int), cudaMemcpyHostToDevice);
+  //  cudaMemcpy(pack_vars->d_task_last_part, pack_vars->task_last_part,
+  //             tasks_packed * sizeof(int), cudaMemcpyHostToDevice);
+  //  cudaMemPrefetchAsync(d_task_first_part_self_dens_f4, tasks_packed *
+  //  sizeof(int2), devId, NULL);
+  /*Copy cell shifts to device*/
+  //  cudaMemcpy(pack_vars->d_cellx, pack_vars->cellx,
+  //             tasks_packed * sizeof(double), cudaMemcpyHostToDevice);
+  //  cudaMemcpy(pack_vars->d_celly, pack_vars->celly,
+  //             tasks_packed * sizeof(double), cudaMemcpyHostToDevice);
+  //  cudaMemcpy(pack_vars->d_cellz, pack_vars->cellz,
+  //             tasks_packed * sizeof(double), cudaMemcpyHostToDevice);
+  //  clock_gettime(CLOCK_REALTIME, &t1hmemcpy);
+  //  *hmemcpy_time += (t1hmemcpy.tv_sec - t0hmemcpy.tv_sec) +
+  //                   (t1hmemcpy.tv_nsec - t0hmemcpy.tv_nsec) / 1000000000.0;
+  /* Launch the copies for each bundle and run the GPU kernel */
+  /* We never enter this loop if tasks_left_self == 1, as nBundles_temp
+   * would be zero in that case */
+  int max_parts;
+  for (int bid = 0; bid < nBundles_temp; bid++) {
+
+    max_parts = 0;
+    int parts_in_bundle = 0;
+    const int first_task = bid * bundle_size;
+    int last_task = (bid + 1) * bundle_size;
+    for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) {
+      if (tid < tasks_packed) {
+        /*Get an estimate for the max number of parts per cell in the bundle.
+         * Used for determining the number of GPU CUDA blocks*/
+        int count = task_first_part_f4[tid].y - task_first_part_f4[tid].x;
+        parts_in_bundle += count;
+        max_parts = max(max_parts, count);
+        last_task = tid;
+      }
+    }
+    //    const int n_tasks = last_task - first_task;
+
+    const int first_part_tmp = pack_vars->bundle_first_part[bid];
+    const int bundle_n_parts =
+        pack_vars->bundle_last_part[bid] - first_part_tmp;
+    //    clock_gettime(CLOCK_REALTIME, &t0hmemcpy);
+    //    cudaMemPrefetchAsync(&d_task_first_part_self_dens_f4[first_task],
+    //                         (last_task - first_task) * sizeof(int2),
+    //                         devId, stream[bid]);
+    cudaMemcpyAsync(&d_task_first_part_f4[first_task],
+                    &task_first_part_f4[first_task],
+                    (last_task + 1 - first_task) * sizeof(int2),
+                    cudaMemcpyHostToDevice, stream[bid]);
+    //    cudaError_t cu_error = cudaPeekAtLastError(); // cudaGetLastError();
+    //// if (cu_error != cudaSuccess) { fprintf( stderr, "CUDA error in density
+    //self host 2 device memcpy: %s cpuid id is: %i\n ",
+    //        cudaGetErrorString(cu_error), r->cpuid);
+    //      exit(0);
+    //    }
+    //    clock_gettime(CLOCK_REALTIME, &t1hmemcpy);
+    //    *hmemcpy_time += (t1hmemcpy.tv_sec - t0hmemcpy.tv_sec) +
+    //                     (t1hmemcpy.tv_nsec - t0hmemcpy.tv_nsec) /
+    //                     1000000000.0;
+    cudaMemcpyAsync(&d_parts_send[first_part_tmp], &parts_send[first_part_tmp],
+                    bundle_n_parts * sizeof(struct part_aos_f4_send),
+                    cudaMemcpyHostToDevice, stream[bid]);
+
+    // #ifdef CUDA_DEBUG
+    //       cudaError_t cu_error = cudaPeekAtLastError(); // cudaGetLastError();
+    ////
+    //       //
+    //Get error code if (cu_error != cudaSuccess) { fprintf( stderr, "CUDA error
+    //in density self host 2 device memcpy: %s cpuid id is: %i\n ",
+    //       cudaGetErrorString(cu_error), r->cpuid);
+    //       exit(0);
+    //     }
+    // #endif
+    const int tasksperbundle = pack_vars->tasksperbundle;
+    int tasks_left = tasksperbundle;
+    if (bid == nBundles_temp - 1) {
+      tasks_left
= tasks_packed - (nBundles_temp - 1) * tasksperbundle; + } + // Will launch a 2d grid of GPU thread blocks (number of tasks is + // the y dimension and max_parts is the x dimension + int numBlocks_y = tasks_left; + int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + int bundle_first_task = pack_vars->bundle_first_task_list[bid]; + // const char *loop_type = "density"; + // struct first_part first_parts; + // for(int i = 0; i < numBlocks_y; i++) first_parts.list[i] = + //pack_vars->task_first_part[i]; fprintf(stderr, "Launching kernel with %i + //tasks leftovers %i\n", tasks_packed, pack_vars->launch_leftovers); + // Launch the kernel + launch_density_aos_f4(d_parts_send, d_parts_recv, d_a, d_H, stream[bid], + numBlocks_x, numBlocks_y, bundle_first_task, + d_task_first_part_f4); + // #ifdef CUDA_DEBUG + // cu_error = cudaPeekAtLastError(); // Get error code + // if (cu_error != cudaSuccess) { + // fprintf(stderr, + // "CUDA error with self density kernel launch: %s + //cpuid id is: %i\n ", cudaGetErrorString(cu_error), r->cpuid); exit(0); + // } + // #endif + cudaMemcpyAsync(&parts_recv[first_part_tmp], &d_parts_recv[first_part_tmp], + bundle_n_parts * sizeof(struct part_aos_f4_recv), + cudaMemcpyDeviceToHost, stream[bid]); + cudaEventRecord(self_end[bid], stream[bid]); + // #ifdef CUDA_DEBUG + // cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // + // // + //Get error code if (cu_error != cudaSuccess) { fprintf(stderr, "CUDA error + //with self density D2H memcpy: %s cpuid id is: %i\n ", + // cudaGetErrorString(cu_error), r->cpuid); + // error("Something's up with your cuda code"); + // } + // #endif + } /*End of looping over bundles to launch in streams*/ + /* Make sure all the kernels and copies back are finished */ + // cudaDeviceSynchronize(); + + /*Time end of GPU work*/ + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + + /* Now copy the data back from the CPU thread-local buffers to the cells */ + /* Pack length counter for use in unpacking */ + int pack_length_unpack = 0; + for (int bid = 0; bid < nBundles_temp; bid++) { - /* Timers for how long this all takes. - * t0 and t1 are from start to finish including GPU calcs - * tp0 and tp1 only time packing and unpacking*/ - struct timespec t0, t1; // clock_gettime(CLOCK_REALTIME, &t0); - int tasks_packed = pack_vars->tasks_packed; //Copy pasted this code again. Issue isn't here - const int tid_tmp = 2 * tasks_packed; - /*shifts for ci*/ - pack_vars->shiftx[tid_tmp] = 0.0; - pack_vars->shifty[tid_tmp] = 0.0; - pack_vars->shiftz[tid_tmp] = 0.0; - /*shifts for cj. Stored using strided indexing (stride of two per task)*/ - pack_vars->shiftx[tid_tmp + 1] = 0.0; - pack_vars->shifty[tid_tmp + 1] = 0.0; - pack_vars->shiftz[tid_tmp + 1] = 0.0; - - double x_tmp = 0.0, y_tmp = 0.0, z_tmp = 0.0; - /*Get the shifts in case of periodics*/ - space_getsid_GPU(e->s, &ci, &cj, &x_tmp, &y_tmp, &z_tmp); - - /*Get pointers to the list of tasks and cells packed*/ - pack_vars->task_list[tasks_packed] = t; - pack_vars->ci_list[tasks_packed] = ci; - pack_vars->cj_list[tasks_packed] = cj; - - const double cjx = cj->loc[0]; - const double cjy = cj->loc[1]; - const double cjz = cj->loc[2]; - - /*Correct the shifts for cell i*/ - pack_vars->shiftx[tid_tmp] = x_tmp + cjx; - pack_vars->shifty[tid_tmp] = y_tmp + cjy; - pack_vars->shiftz[tid_tmp] = z_tmp + cjz; - /*Shift for cell j is it's position. 
Stored using strided indexing (stride of two per task)*/ - pack_vars->shiftx[tid_tmp + 1] = cjx; - pack_vars->shifty[tid_tmp + 1] = cjy; - pack_vars->shiftz[tid_tmp + 1] = cjz; - - const int count_ci = ci->hydro.count; - const int count_cj = cj->hydro.count; - - /*Assign an id for this task*/ - const int tid = tasks_packed; - /*Indexing increment per task is 2 fot these arrays*/ - const int packed_tmp = tasks_packed * 2; - - /* Find first parts in task for ci and cj. Packed_tmp is index for cell i. packed_tmp+1 is index for cell j */ - pack_vars->task_first_part[packed_tmp] = pack_vars->count_parts; - pack_vars->task_first_part[packed_tmp + 1] = pack_vars->count_parts + count_ci; - - int *count_parts = &pack_vars->count_parts; - /* This re-arranges the particle data from cell->hydro->parts into a - long array of part structs*/ - runner_do_ci_cj_gpu_pack_neat_aos_f(r, ci, cj, parts_aos, 0/*timer. 0 no timing, 1 for timing*/, - count_parts, tid, pack_vars->count_max_parts, count_ci, count_cj); - /* Find last parts in task for ci and cj. Packed_tmp is index for cell i. packed_tmp+1 is index for cell j */ - - pack_vars->task_last_part[packed_tmp] = pack_vars->count_parts - count_cj; - pack_vars->task_last_part[packed_tmp + 1] = pack_vars->count_parts; - - /* Tell the cells they have been packed */ - ci->pack_done_f++; - cj->pack_done_f++; - - /* Identify first particle for each bundle of tasks */ - const int bundle_size = pack_vars->bundle_size; - if (tasks_packed % bundle_size == 0) { - int bid = tasks_packed / bundle_size; - pack_vars->bundle_first_part[bid] = pack_vars->task_first_part[packed_tmp]; - pack_vars->bundle_first_task_list[bid] = tasks_packed; - } - - /* Record that we have now done a packing (self) */ - int qid = r->qid; - atomic_dec(&(s->queues[qid].n_packs_pair_left_f)); - t->done = 1; - /* Copies done. Release the lock ! */ - task_unlock(t); - pack_vars->tasks_packed++; - pack_vars->launch = 0; - pack_vars->launch_leftovers = 0; - if ((s->queues[qid].n_packs_pair_left_f == 0)) - pack_vars->launch_leftovers = 1; - if(pack_vars->tasks_packed == pack_vars->target_n_tasks) - pack_vars->launch = 1; - /*Add time to packing_time. Timer for end of GPU work after the if(launch || launch_leftovers statement)*/ - clock_gettime(CLOCK_REALTIME, &t1); - *packing_time += (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; -} -double runner_dopair1_pack_f4_f(struct runner *r, struct scheduler *s, struct pack_vars_pair * restrict pack_vars, struct cell * ci, - struct cell * cj, struct task *t, struct part_aos_f4_f_send *parts_send, struct engine *e, int4 *fparti_fpartj_lparti_lpartj){ + // cudaStreamSynchronize(stream[bid]); + cudaEventSynchronize(self_end[bid]); - /* Timers for how long this all takes. 
- * t0 and t1 are from start to finish including GPU calcs - * tp0 and tp1 only time packing and unpacking*/ - struct timespec t0, t1; // - clock_gettime(CLOCK_REALTIME, &t0); - int tasks_packed = pack_vars->tasks_packed; - - double x_tmp = 0.0, y_tmp = 0.0, z_tmp = 0.0; - /*Get the shifts in case of periodics*/ - space_getsid_GPU(e->s, &ci, &cj, &x_tmp, &y_tmp, &z_tmp); - - /*Get pointers to the list of tasks and cells packed*/ - pack_vars->task_list[tasks_packed] = t; - pack_vars->ci_list[tasks_packed] = ci; - pack_vars->cj_list[tasks_packed] = cj; - - - float3 shift_tmp = {x_tmp, y_tmp, z_tmp}; - - const int count_ci = ci->hydro.count; - const int count_cj = cj->hydro.count; - - /*Assign an id for this task*/ - const int tid = tasks_packed; - - /* Find first parts in task for ci and cj. Packed_tmp is index for cell i. packed_tmp+1 is index for cell j */ -// pack_vars->task_first_part[packed_tmp] = pack_vars->count_parts; -// pack_vars->task_first_part[packed_tmp + 1] = pack_vars->count_parts + count_ci; - - fparti_fpartj_lparti_lpartj[tasks_packed].x = pack_vars->count_parts; - fparti_fpartj_lparti_lpartj[tasks_packed].y = pack_vars->count_parts + count_ci; - - int *count_parts = &pack_vars->count_parts; -// if(r->cpuid == 0)fprintf(stderr, "cpu %i before count %i\n", r->cpuid, pack_vars->count_parts); - /* This re-arranges the particle data from cell->hydro->parts into a - long array of part structs*/ - runner_do_ci_cj_gpu_pack_neat_aos_f4_f(r, ci, cj, parts_send, 0/*timer. 0 no timing, 1 for timing*/, - count_parts, tid, pack_vars->count_max_parts, count_ci, count_cj, shift_tmp); -// runner_doself1_gpu_pack_neat_aos(r, ci, parts_aos, 0/*timer. 0 no timing, 1 for timing*/, -// count_parts, tasks_packed, pack_vars->count_max_parts); //This may cause an issue. - //Be sure to test that pack_vars->count_parts - //is actually increment here - /* Find last parts in task for ci and cj. Packed_tmp is index for cell i. packed_tmp+1 is index for cell j */ - -// if(r->cpuid == 0)fprintf(stderr, "cpu %i after count %i pack_vars_count %i\n", r->cpuid, *count_parts, -// pack_vars->count_parts); - fparti_fpartj_lparti_lpartj[tasks_packed].z = pack_vars->count_parts - count_cj; - fparti_fpartj_lparti_lpartj[tasks_packed].w = pack_vars->count_parts; -// pack_vars->task_last_part[packed_tmp] = pack_vars->count_parts - count_cj; -// pack_vars->task_last_part[packed_tmp + 1] = pack_vars->count_parts; - - /* Tell the cells they have been packed */ - ci->pack_done_f++; - cj->pack_done_f++; - - /* Identify first particle for each bundle of tasks */ - const int bundle_size = pack_vars->bundle_size; - if (tasks_packed % bundle_size == 0) { - int bid = tasks_packed / bundle_size; - pack_vars->bundle_first_part[bid] = fparti_fpartj_lparti_lpartj[tasks_packed].x; - pack_vars->bundle_first_task_list[bid] = tasks_packed; - } - - /* Record that we have now done a packing (self) */ - int qid = r->qid; - atomic_dec(&(s->queues[qid].n_packs_pair_left_f)); - t->done = 1; - /* Copies done. Release the lock ! */ - task_unlock(t); - pack_vars->tasks_packed++; - pack_vars->launch = 0; - pack_vars->launch_leftovers = 0; - if ((s->queues[qid].n_packs_pair_left_f == 0)) - pack_vars->launch_leftovers = 1; - if(pack_vars->tasks_packed == pack_vars->target_n_tasks) - pack_vars->launch = 1; - /*Add time to packing_time. 
Timer for end of GPU work after the if(launch || launch_leftovers statement)*/ clock_gettime(CLOCK_REALTIME, &t1); - return (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; -} + *gpu_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; -void runner_doself1_launch(struct runner *r, struct scheduler *s, struct pack_vars_self *pack_vars, struct cell *ci, - struct task *t, struct part_aos *parts_aos, struct part_aos *d_parts_aos, cudaStream_t * stream, float d_a, float d_H, - struct engine *e, double *packing_time, double *gpu_time, double *hmemcpy_time){ + /*Time unpacking*/ + // clock_gettime(CLOCK_REALTIME, &tp0); - struct timespec t0, t1, t0hmemcpy, t1hmemcpy, tp0, tp1; // - clock_gettime(CLOCK_REALTIME, &t0); + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { - /* Identify the number of GPU bundles to run in ideal case*/ - int nBundles_temp = pack_vars->nBundles; - - /*How many tasks have we packed?*/ - const int tasks_packed = pack_vars->tasks_packed; - - /*How many tasks should be in a bundle?*/ - const int bundle_size = pack_vars->bundle_size; - - /* Special case for incomplete bundles (when having leftover tasks not enough to fill a bundle) */ - if (pack_vars->launch_leftovers) { - nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; - if(tasks_packed == 0) error("zero tasks packed but somehow got into GPU loop"); - pack_vars->bundle_first_part[nBundles_temp] = pack_vars->task_first_part[tasks_packed - 1]; - } - /* Identify the last particle for each bundle of tasks */ - for (int bid = 0; bid < nBundles_temp - 1; bid++) { - pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; - } - /* special treatment for the last bundle */ - if(nBundles_temp > 1)pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; - else pack_vars->bundle_last_part[0] = pack_vars->count_parts; - clock_gettime(CLOCK_REALTIME, &t0hmemcpy); - /*Copy arrays containing first and last part for each task to GPU*/ - cudaMemcpy(pack_vars->d_task_first_part, pack_vars->task_first_part, - tasks_packed * sizeof(int), cudaMemcpyHostToDevice); - cudaMemcpy(pack_vars->d_task_last_part, pack_vars->task_last_part, - tasks_packed * sizeof(int), cudaMemcpyHostToDevice); - - /*Copy cell shifts to device*/ - cudaMemcpy(pack_vars->d_cellx, pack_vars->cellx, - tasks_packed * sizeof(double), cudaMemcpyHostToDevice); - cudaMemcpy(pack_vars->d_celly, pack_vars->celly, - tasks_packed * sizeof(double), cudaMemcpyHostToDevice); - cudaMemcpy(pack_vars->d_cellz, pack_vars->cellz, - tasks_packed * sizeof(double), cudaMemcpyHostToDevice); - clock_gettime(CLOCK_REALTIME, &t1hmemcpy); - *hmemcpy_time += (t1hmemcpy.tv_sec - t0hmemcpy.tv_sec) + - (t1hmemcpy.tv_nsec - t0hmemcpy.tv_nsec) / 1000000000.0; - /* Launch the copies for each bundle and run the GPU kernel */ - /*We don't go into this loop if tasks_left_self == 1 as - nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ - int max_parts = 0; - for (int bid = 0; bid < nBundles_temp; bid++) { - - max_parts = 0; - int parts_in_bundle = 0; - for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; - tid++) { - if (tid < tasks_packed) { - /*Get an estimate for the max number of parts per cell in the bundle. 
- * Used for determining the number of GPU CUDA blocks*/ - int count = pack_vars->task_last_part[tid] - pack_vars->task_first_part[tid]; - parts_in_bundle += count; - max_parts = max(max_parts, count); - } - } - - const int first_part_tmp = pack_vars->bundle_first_part[bid]; - const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp; - - cudaMemcpyAsync(&d_parts_aos[first_part_tmp], &parts_aos[first_part_tmp], - bundle_n_parts * sizeof(struct part_aos), cudaMemcpyHostToDevice, stream[bid]); - -//#ifdef CUDA_DEBUG -// cudaError_t cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // -// // Get error code -// if (cu_error != cudaSuccess) { -// fprintf( -// stderr, -// "CUDA error in density self host 2 device memcpy: %s cpuid id is: %i\n ", -// cudaGetErrorString(cu_error), r->cpuid); -// exit(0); -// } -//#endif - const int tasksperbundle = pack_vars->tasksperbundle; - int tasks_left = tasksperbundle; - if (bid == nBundles_temp - 1) { - tasks_left = - tasks_packed - (nBundles_temp - 1) * tasksperbundle; - } - // Will launch a 2d grid of GPU thread blocks (number of tasks is - // the y dimension and max_parts is the x dimension - int numBlocks_y = tasks_left; - int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; - int bundle_first_task = pack_vars->bundle_first_task_list[bid]; - const char *loop_type = "density"; -// fprintf(stderr, "Launching kernel with %i tasks leftovers %i\n", -// tasks_packed, pack_vars->launch_leftovers); - // Launch the kernel - launch_density_aos( - d_parts_aos, pack_vars->d_task_first_part, pack_vars->d_task_last_part, d_a, d_H, loop_type, - stream[bid], BLOCK_SIZE, tasks_packed, tasksperbundle, - numBlocks_x, numBlocks_y, bundle_first_task, - max_parts, pack_vars->d_cellx, pack_vars->d_celly, pack_vars->d_cellz); -//#ifdef CUDA_DEBUG -// cu_error = cudaPeekAtLastError(); // Get error code -// if (cu_error != cudaSuccess) { -// fprintf(stderr, -// "CUDA error with self density kernel launch: %s cpuid id is: %i\n ", -// cudaGetErrorString(cu_error), r->cpuid); -// exit(0); -// } -//#endif - cudaMemcpyAsync(&parts_aos[first_part_tmp], &d_parts_aos[first_part_tmp], - bundle_n_parts * sizeof(struct part_aos), cudaMemcpyDeviceToHost, stream[bid]); - -//#ifdef CUDA_DEBUG -// cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // -// // Get error code -// if (cu_error != cudaSuccess) { -// fprintf(stderr, -// "CUDA error with self density D2H memcpy: %s cpuid id is: %i\n ", -// cudaGetErrorString(cu_error), r->cpuid); -// error("Something's up with your cuda code"); -// } -//#endif - }/*End of looping over bundles to launch in streams*/ - /* Make sure all the kernels and copies back are finished */ - cudaDeviceSynchronize(); - - /*Time end of GPU work*/ - clock_gettime(CLOCK_REALTIME, &t1); - *gpu_time += (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - /*Time unpacking*/ - clock_gettime(CLOCK_REALTIME, &tp0); - /* Now copy the data back from the CPU thread-local buffers to the cells */ - /* Pack length counter for use in unpacking */ - int pack_length_unpack=0; - for (int tid = 0; tid < tasks_packed; tid++) { - - struct cell *cii = pack_vars->cell_list[tid]; - struct task *tii = pack_vars->task_list[tid]; - -// struct cell *cii = ci_list_self_dens[tid]; -// struct task *tii = task_list_self_dens[tid]; - - while(cell_locktree(cii)) { - ; /* spin until we acquire the lock */ - } - /* Do the copy */ - runner_doself1_gpu_unpack_neat_aos(r, cii, parts_aos, 0, &pack_length_unpack, tid, pack_vars->count_max_parts, 
e); - - /* Record things for debugging */ - cii->gpu_done++; - - /* Release the lock */ - cell_unlocktree(cii); - - /*schedule my dependencies (Only unpacks really)*/ - enqueue_dependencies(s, tii); - /*Signal sleeping runners*/ - signal_sleeping_runners(s, tii); - - tii->gpu_done = 1; - } - /* Zero counters for the next pack operations */ - pack_vars->count_parts = 0; - pack_vars->tasks_packed = 0; - /*Time end of unpacking*/ - clock_gettime(CLOCK_REALTIME, &tp1); - *packing_time += (tp1.tv_sec - tp0.tv_sec) + - (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; -} /*End of GPU work Self*/ + if (tid < tasks_packed) { + struct cell *cii = pack_vars->cell_list[tid]; + struct task *tii = pack_vars->task_list[tid]; -void runner_doself1_launch_f4(struct runner *r, struct scheduler *s, struct pack_vars_self *pack_vars, struct cell *ci, - struct task *t, struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv, struct part_aos_f4_send *d_parts_send, - struct part_aos_f4_recv *d_parts_recv, cudaStream_t * stream, float d_a, float d_H, - struct engine *e, double *packing_time, double *gpu_time, double *unpack_time, int2 * d_task_first_part_self_dens_f4, - int devId, int2 * task_first_part_f4, int2 * d_task_first_part_f4, cudaEvent_t * self_end){ + // struct cell *cii = ci_list_self_dens[tid]; + // struct task *tii = task_list_self_dens[tid]; - struct timespec t0, t1, tp0, tp1; // - clock_gettime(CLOCK_REALTIME, &t0); + clock_gettime(CLOCK_REALTIME, &tp0); - /* Identify the number of GPU bundles to run in ideal case*/ - int nBundles_temp = pack_vars->nBundles; - - /*How many tasks have we packed?*/ - const int tasks_packed = pack_vars->tasks_packed; - - /*How many tasks should be in a bundle?*/ - const int bundle_size = pack_vars->bundle_size; - - /* Special case for incomplete bundles (when having leftover tasks not enough to fill a bundle) */ - if (pack_vars->launch_leftovers) { - nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; - if(tasks_packed == 0) error("zero tasks packed but somehow got into GPU loop"); -// pack_vars->bundle_first_part[nBundles_temp] = pack_vars->task_first_part[tasks_packed - 1]; - pack_vars->bundle_first_part[nBundles_temp] = task_first_part_f4[tasks_packed - 1].x; - } - /* Identify the last particle for each bundle of tasks */ - for (int bid = 0; bid < nBundles_temp - 1; bid++) { - pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; - } - /* special treatment for the last bundle */ - if(nBundles_temp > 1)pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; - else pack_vars->bundle_last_part[0] = pack_vars->count_parts; -// clock_gettime(CLOCK_REALTIME, &t0hmemcpy); - /*Copy arrays containing first and last part for each task to GPU*/ -// cudaMemcpy(pack_vars->d_task_first_part, pack_vars->task_first_part, -// tasks_packed * sizeof(int), cudaMemcpyHostToDevice); -// cudaMemcpy(pack_vars->d_task_last_part, pack_vars->task_last_part, -// tasks_packed * sizeof(int), cudaMemcpyHostToDevice); -// cudaMemPrefetchAsync(d_task_first_part_self_dens_f4, tasks_packed * sizeof(int2), devId, NULL); - /*Copy cell shifts to device*/ -// cudaMemcpy(pack_vars->d_cellx, pack_vars->cellx, -// tasks_packed * sizeof(double), cudaMemcpyHostToDevice); -// cudaMemcpy(pack_vars->d_celly, pack_vars->celly, -// tasks_packed * sizeof(double), cudaMemcpyHostToDevice); -// cudaMemcpy(pack_vars->d_cellz, pack_vars->cellz, -// tasks_packed * sizeof(double), cudaMemcpyHostToDevice); -// clock_gettime(CLOCK_REALTIME, &t1hmemcpy); -// 
*hmemcpy_time += (t1hmemcpy.tv_sec - t0hmemcpy.tv_sec) + -// (t1hmemcpy.tv_nsec - t0hmemcpy.tv_nsec) / 1000000000.0; - /* Launch the copies for each bundle and run the GPU kernel */ - /*We don't go into this loop if tasks_left_self == 1 as - nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ - int max_parts; - for (int bid = 0; bid < nBundles_temp; bid++) { - - max_parts = 0; - int parts_in_bundle = 0; - const int first_task = bid * bundle_size; - int last_task = (bid + 1) * bundle_size; - for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; - tid++) { - if (tid < tasks_packed) { - /*Get an estimate for the max number of parts per cell in the bundle. - * Used for determining the number of GPU CUDA blocks*/ - int count = task_first_part_f4[tid].y - task_first_part_f4[tid].x; - parts_in_bundle += count; - max_parts = max(max_parts, count); - last_task = tid; - } - } -// const int n_tasks = last_task - first_task; - - const int first_part_tmp = pack_vars->bundle_first_part[bid]; - const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp; -// clock_gettime(CLOCK_REALTIME, &t0hmemcpy); -// cudaMemPrefetchAsync(&d_task_first_part_self_dens_f4[first_task], (last_task - first_task) * sizeof(int2), -// devId, stream[bid]); - cudaMemcpyAsync(&d_task_first_part_f4[first_task], &task_first_part_f4[first_task], - (last_task + 1 - first_task) * sizeof(int2), cudaMemcpyHostToDevice, stream[bid]); -// cudaError_t cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // -// if (cu_error != cudaSuccess) { -// fprintf( -// stderr, -// "CUDA error in density self host 2 device memcpy: %s cpuid id is: %i\n ", -// cudaGetErrorString(cu_error), r->cpuid); -// exit(0); -// } -// clock_gettime(CLOCK_REALTIME, &t1hmemcpy); -// *hmemcpy_time += (t1hmemcpy.tv_sec - t0hmemcpy.tv_sec) + -// (t1hmemcpy.tv_nsec - t0hmemcpy.tv_nsec) / 1000000000.0; - cudaMemcpyAsync(&d_parts_send[first_part_tmp], &parts_send[first_part_tmp], - bundle_n_parts * sizeof(struct part_aos_f4_send), cudaMemcpyHostToDevice, stream[bid]); - -//#ifdef CUDA_DEBUG -// cudaError_t cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // -// // Get error code -// if (cu_error != cudaSuccess) { -// fprintf( -// stderr, -// "CUDA error in density self host 2 device memcpy: %s cpuid id is: %i\n ", -// cudaGetErrorString(cu_error), r->cpuid); -// exit(0); -// } -//#endif - const int tasksperbundle = pack_vars->tasksperbundle; - int tasks_left = tasksperbundle; - if (bid == nBundles_temp - 1) { - tasks_left = - tasks_packed - (nBundles_temp - 1) * tasksperbundle; - } - // Will launch a 2d grid of GPU thread blocks (number of tasks is - // the y dimension and max_parts is the x dimension - int numBlocks_y = tasks_left; - int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; - int bundle_first_task = pack_vars->bundle_first_task_list[bid]; -// const char *loop_type = "density"; -// struct first_part first_parts; -// for(int i = 0; i < numBlocks_y; i++) first_parts.list[i] = pack_vars->task_first_part[i]; -// fprintf(stderr, "Launching kernel with %i tasks leftovers %i\n", -// tasks_packed, pack_vars->launch_leftovers); - // Launch the kernel - launch_density_aos_f4( - d_parts_send, d_parts_recv, d_a, d_H, stream[bid], numBlocks_x, numBlocks_y, bundle_first_task, - d_task_first_part_f4); -//#ifdef CUDA_DEBUG -// cu_error = cudaPeekAtLastError(); // Get error code -// if (cu_error != cudaSuccess) { -// fprintf(stderr, -// "CUDA error with self density kernel launch: %s cpuid id is: %i\n ", -// 
cudaGetErrorString(cu_error), r->cpuid); -// exit(0); -// } -//#endif - cudaMemcpyAsync(&parts_recv[first_part_tmp], &d_parts_recv[first_part_tmp], - bundle_n_parts * sizeof(struct part_aos_f4_recv), cudaMemcpyDeviceToHost, stream[bid]); - cudaEventRecord(self_end[bid], stream[bid]); -//#ifdef CUDA_DEBUG -// cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // -// // Get error code -// if (cu_error != cudaSuccess) { -// fprintf(stderr, -// "CUDA error with self density D2H memcpy: %s cpuid id is: %i\n ", -// cudaGetErrorString(cu_error), r->cpuid); -// error("Something's up with your cuda code"); -// } -//#endif - }/*End of looping over bundles to launch in streams*/ - /* Make sure all the kernels and copies back are finished */ -// cudaDeviceSynchronize(); - - /*Time end of GPU work*/ - clock_gettime(CLOCK_REALTIME, &t1); - *gpu_time += (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - - /* Now copy the data back from the CPU thread-local buffers to the cells */ - /* Pack length counter for use in unpacking */ - int pack_length_unpack=0; - for (int bid = 0; bid < nBundles_temp; bid++){ - - clock_gettime(CLOCK_REALTIME, &t0); - -// cudaStreamSynchronize(stream[bid]); - cudaEventSynchronize(self_end[bid]); - - clock_gettime(CLOCK_REALTIME, &t1); - *gpu_time += (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - - /*Time unpacking*/ -// clock_gettime(CLOCK_REALTIME, &tp0); - - for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { - - if(tid < tasks_packed){ - struct cell *cii = pack_vars->cell_list[tid]; - struct task *tii = pack_vars->task_list[tid]; - - // struct cell *cii = ci_list_self_dens[tid]; - // struct task *tii = task_list_self_dens[tid]; - - clock_gettime(CLOCK_REALTIME, &tp0); - -// clock_gettime(CLOCK_REALTIME, &t0hmemcpy); - while(cell_locktree(cii)) { - ; /* spin until we acquire the lock */ - } -// clock_gettime(CLOCK_REALTIME, &t1hmemcpy); -// *hmemcpy_time += (t1hmemcpy.tv_sec - t0hmemcpy.tv_sec) + -// (t1hmemcpy.tv_nsec - t0hmemcpy.tv_nsec) / 1000000000.0; - /* Do the copy */ - runner_doself1_gpu_unpack_neat_aos_f4(r, cii, parts_recv, 0, &pack_length_unpack, tid, pack_vars->count_max_parts, e); - - /* Record things for debugging */ - cii->gpu_done++; - /*Time end of unpacking*/ - clock_gettime(CLOCK_REALTIME, &tp1); - *unpack_time += (tp1.tv_sec - tp0.tv_sec) + - (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; - - /* Release the lock */ - cell_unlocktree(cii); - - /*schedule my dependencies (Only unpacks really)*/ - enqueue_dependencies(s, tii); - /*Signal sleeping runners*/ - signal_sleeping_runners(s, tii); - - tii->gpu_done = 1; - - } - - } - /*Time end of unpacking*/ -// clock_gettime(CLOCK_REALTIME, &tp1); -// *hmemcpy_time += (tp1.tv_sec - tp0.tv_sec) + -// (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; -// *packing_time += (tp1.tv_sec - tp0.tv_sec) + -// (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; - - } - /* Zero counters for the next pack operations */ - pack_vars->count_parts = 0; - pack_vars->tasks_packed = 0; + // clock_gettime(CLOCK_REALTIME, &t0hmemcpy); + while (cell_locktree(cii)) { + ; /* spin until we acquire the lock */ + } + // clock_gettime(CLOCK_REALTIME, &t1hmemcpy); + // *hmemcpy_time += (t1hmemcpy.tv_sec - + //t0hmemcpy.tv_sec) + (t1hmemcpy.tv_nsec - t0hmemcpy.tv_nsec) / + //1000000000.0; + /* Do the copy */ + runner_doself1_gpu_unpack_neat_aos_f4(r, cii, parts_recv, 0, + &pack_length_unpack, tid, + pack_vars->count_max_parts, e); + + /* Record things for debugging */ + cii->gpu_done++; 
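
The f4 path above drops the blocking cudaDeviceSynchronize() after the bundle loop: an event is recorded on each bundle's stream after its device-to-host copy, and the unpack loop then waits on that event per bundle. A rough sketch of the pattern, with placeholder buffer and size names rather than the real SWIFT symbols:

#include <cuda_runtime.h>

/* Sketch only: wait per bundle instead of per device, so bundle 0 can be
 * unpacked on the host while later bundles are still copying or running. */
void drain_bundles(float *h_recv, float *d_recv, const int *first,
                   const int *count, int n_bundles, cudaStream_t *stream,
                   cudaEvent_t *bundle_done) {
  for (int bid = 0; bid < n_bundles; bid++) {
    cudaMemcpyAsync(&h_recv[first[bid]], &d_recv[first[bid]],
                    count[bid] * sizeof(float), cudaMemcpyDeviceToHost,
                    stream[bid]);
    cudaEventRecord(bundle_done[bid], stream[bid]);
  }
  for (int bid = 0; bid < n_bundles; bid++) {
    /* Blocks only until this bundle's copy has landed. */
    cudaEventSynchronize(bundle_done[bid]);
    /* ... unpack the tasks of bundle bid here ... */
  }
}
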
+ /*Time end of unpacking*/ + clock_gettime(CLOCK_REALTIME, &tp1); + *unpack_time += (tp1.tv_sec - tp0.tv_sec) + + (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; + + /* Release the lock */ + cell_unlocktree(cii); + + /*schedule my dependencies (Only unpacks really)*/ + enqueue_dependencies(s, tii); + /*Signal sleeping runners*/ + signal_sleeping_runners(s, tii); + + tii->gpu_done = 1; + } + } + /*Time end of unpacking*/ + // clock_gettime(CLOCK_REALTIME, &tp1); + // *hmemcpy_time += (tp1.tv_sec - tp0.tv_sec) + + // (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; + // *packing_time += (tp1.tv_sec - tp0.tv_sec) + + // (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; + } + /* Zero counters for the next pack operations */ + pack_vars->count_parts = 0; + pack_vars->tasks_packed = 0; } /*End of GPU work Self*/ -void runner_doself1_launch_g(struct runner *r, struct scheduler *s, struct pack_vars_self *pack_vars, struct cell *ci, - struct task *t, struct part_aos_g *parts_aos, struct part_aos_g *d_parts_aos, cudaStream_t * stream, float d_a, float d_H, - struct engine *e, double *packing_time, double *gpu_time){ +void runner_doself1_launch_g(struct runner *r, struct scheduler *s, + struct pack_vars_self *pack_vars, struct cell *ci, + struct task *t, struct part_aos_g *parts_aos, + struct part_aos_g *d_parts_aos, + cudaStream_t *stream, float d_a, float d_H, + struct engine *e, double *packing_time, + double *gpu_time) { + + struct timespec t0, t1; // + clock_gettime(CLOCK_REALTIME, &t0); + + /* Identify the number of GPU bundles to run in ideal case*/ + int nBundles_temp = pack_vars->nBundles; + + /*How many tasks have we packed?*/ + const int tasks_packed = pack_vars->tasks_packed; + + /*How many tasks should be in a bundle?*/ + const int bundle_size = pack_vars->bundle_size; + + /* Special case for incomplete bundles (when having leftover tasks not enough + * to fill a bundle) */ + if (pack_vars->launch_leftovers) { + nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; + if (tasks_packed == 0) + error("zero tasks packed but somehow got into GPU loop"); + pack_vars->bundle_first_part[nBundles_temp] = + pack_vars->task_first_part[tasks_packed - 1]; + } + /* Identify the last particle for each bundle of tasks */ + for (int bid = 0; bid < nBundles_temp - 1; bid++) { + pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; + } + /* special treatment for the last bundle */ + if (nBundles_temp > 1) + pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; + else + pack_vars->bundle_last_part[0] = pack_vars->count_parts; + /*Copy arrays containing first and last part for each task to GPU*/ + cudaMemcpy(pack_vars->d_task_first_part, pack_vars->task_first_part, + tasks_packed * sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(pack_vars->d_task_last_part, pack_vars->task_last_part, + tasks_packed * sizeof(int), cudaMemcpyHostToDevice); + + /*Copy cell shifts to device*/ + cudaMemcpy(pack_vars->d_cellx, pack_vars->cellx, + tasks_packed * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(pack_vars->d_celly, pack_vars->celly, + tasks_packed * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(pack_vars->d_cellz, pack_vars->cellz, + tasks_packed * sizeof(double), cudaMemcpyHostToDevice); + + /* Launch the copies for each bundle and run the GPU kernel */ + /*We don't go into this loop if tasks_left_self == 1 as + nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ + int max_parts = 0; + for (int bid = 0; bid < nBundles_temp; bid++) { + + max_parts = 0; + int 
parts_in_bundle = 0; + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { + if (tid < tasks_packed) { + /*Get an estimate for the max number of parts per cell in the bundle. + * Used for determining the number of GPU CUDA blocks*/ + int count = + pack_vars->task_last_part[tid] - pack_vars->task_first_part[tid]; + parts_in_bundle += count; + max_parts = max(max_parts, count); + } + } + const int first_part_tmp = pack_vars->bundle_first_part[bid]; + const int bundle_n_parts = + pack_vars->bundle_last_part[bid] - first_part_tmp; - struct timespec t0, t1; // - clock_gettime(CLOCK_REALTIME, &t0); - - /* Identify the number of GPU bundles to run in ideal case*/ - int nBundles_temp = pack_vars->nBundles; - - /*How many tasks have we packed?*/ - const int tasks_packed = pack_vars->tasks_packed; - - /*How many tasks should be in a bundle?*/ - const int bundle_size = pack_vars->bundle_size; - - /* Special case for incomplete bundles (when having leftover tasks not enough to fill a bundle) */ - if (pack_vars->launch_leftovers) { - nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; - if(tasks_packed == 0) error("zero tasks packed but somehow got into GPU loop"); - pack_vars->bundle_first_part[nBundles_temp] = pack_vars->task_first_part[tasks_packed - 1]; - } - /* Identify the last particle for each bundle of tasks */ - for (int bid = 0; bid < nBundles_temp - 1; bid++) { - pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; - } - /* special treatment for the last bundle */ - if(nBundles_temp > 1)pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; - else pack_vars->bundle_last_part[0] = pack_vars->count_parts; - /*Copy arrays containing first and last part for each task to GPU*/ - cudaMemcpy(pack_vars->d_task_first_part, pack_vars->task_first_part, - tasks_packed * sizeof(int), cudaMemcpyHostToDevice); - cudaMemcpy(pack_vars->d_task_last_part, pack_vars->task_last_part, - tasks_packed * sizeof(int), cudaMemcpyHostToDevice); - - /*Copy cell shifts to device*/ - cudaMemcpy(pack_vars->d_cellx, pack_vars->cellx, - tasks_packed * sizeof(double), cudaMemcpyHostToDevice); - cudaMemcpy(pack_vars->d_celly, pack_vars->celly, - tasks_packed * sizeof(double), cudaMemcpyHostToDevice); - cudaMemcpy(pack_vars->d_cellz, pack_vars->cellz, - tasks_packed * sizeof(double), cudaMemcpyHostToDevice); - - /* Launch the copies for each bundle and run the GPU kernel */ - /*We don't go into this loop if tasks_left_self == 1 as - nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ - int max_parts = 0; - for (int bid = 0; bid < nBundles_temp; bid++) { - - max_parts = 0; - int parts_in_bundle = 0; - for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; - tid++) { - if (tid < tasks_packed) { - /*Get an estimate for the max number of parts per cell in the bundle. 
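
The max_parts value computed in this loop fixes the x-dimension of the launch grid a few lines further down, while the number of tasks left in the bundle gives the y-dimension. A short sketch of that sizing, assuming BLOCK_SIZE threads per block (the helper name is illustrative):

/* Sketch: one row of thread blocks per task, enough blocks in x to cover
 * the largest cell in the bundle. block_size is the threads per block. */
static void grid_for_bundle(int max_parts, int tasks_in_bundle, int block_size,
                            int *numBlocks_x, int *numBlocks_y) {
  *numBlocks_x = (max_parts + block_size - 1) / block_size; /* ceil */
  *numBlocks_y = tasks_in_bundle;
}

/* The kernel is then launched on the bundle's stream with this 2-D grid,
 * e.g. dim3 grid(numBlocks_x, numBlocks_y);
 *      kernel<<<grid, BLOCK_SIZE, 0, stream[bid]>>>(...); */
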
- * Used for determining the number of GPU CUDA blocks*/ - int count = pack_vars->task_last_part[tid] - pack_vars->task_first_part[tid]; - parts_in_bundle += count; - max_parts = max(max_parts, count); - } - } - - const int first_part_tmp = pack_vars->bundle_first_part[bid]; - const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp; - - cudaMemcpyAsync(&d_parts_aos[first_part_tmp], &parts_aos[first_part_tmp], - bundle_n_parts * sizeof(struct part_aos_g), cudaMemcpyHostToDevice, stream[bid]); + cudaMemcpyAsync(&d_parts_aos[first_part_tmp], &parts_aos[first_part_tmp], + bundle_n_parts * sizeof(struct part_aos_g), + cudaMemcpyHostToDevice, stream[bid]); #ifdef CUDA_DEBUG - cudaError_t cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // - // Get error code - if (cu_error != cudaSuccess) { - fprintf( - stderr, - "CUDA error in density self host 2 device memcpy: %s cpuid id is: %i\n ", - cudaGetErrorString(cu_error), r->cpuid); - exit(0); - } + cudaError_t cu_error = + cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error in density self host 2 device memcpy: %s cpuid id " + "is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + exit(0); + } #endif - const int tasksperbundle = pack_vars->tasksperbundle; - int tasks_left = tasksperbundle; - if (bid == nBundles_temp - 1) { - tasks_left = - tasks_packed - (nBundles_temp - 1) * tasksperbundle; - } - // Will launch a 2d grid of GPU thread blocks (number of tasks is - // the y dimension and max_parts is the x dimension - int numBlocks_y = tasks_left; - int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; - int bundle_first_task = pack_vars->bundle_first_task_list[bid]; - const char *loop_type = "density"; - // Launch the kernel - launch_gradient_aos( - d_parts_aos, pack_vars->d_task_first_part, pack_vars->d_task_last_part, d_a, d_H, loop_type, - stream[bid], BLOCK_SIZE, tasks_packed, tasksperbundle, - numBlocks_x, numBlocks_y, bundle_first_task, - max_parts, pack_vars->d_cellx, pack_vars->d_celly, pack_vars->d_cellz); + const int tasksperbundle = pack_vars->tasksperbundle; + int tasks_left = tasksperbundle; + if (bid == nBundles_temp - 1) { + tasks_left = tasks_packed - (nBundles_temp - 1) * tasksperbundle; + } + // Will launch a 2d grid of GPU thread blocks (number of tasks is + // the y dimension and max_parts is the x dimension + int numBlocks_y = tasks_left; + int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + int bundle_first_task = pack_vars->bundle_first_task_list[bid]; + const char *loop_type = "density"; + // Launch the kernel + launch_gradient_aos( + d_parts_aos, pack_vars->d_task_first_part, pack_vars->d_task_last_part, + d_a, d_H, loop_type, stream[bid], BLOCK_SIZE, tasks_packed, + tasksperbundle, numBlocks_x, numBlocks_y, bundle_first_task, max_parts, + pack_vars->d_cellx, pack_vars->d_celly, pack_vars->d_cellz); #ifdef CUDA_DEBUG - cu_error = cudaPeekAtLastError(); // Get error code - if (cu_error != cudaSuccess) { - fprintf(stderr, - "CUDA error with self density kernel launch: %s cpuid id is: %i\n ", - cudaGetErrorString(cu_error), r->cpuid); - exit(0); - } + cu_error = cudaPeekAtLastError(); // Get error code + if (cu_error != cudaSuccess) { + fprintf( + stderr, + "CUDA error with self density kernel launch: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + exit(0); + } #endif - cudaMemcpyAsync(&parts_aos[first_part_tmp], &d_parts_aos[first_part_tmp], - bundle_n_parts * sizeof(struct 
part_aos_g), cudaMemcpyDeviceToHost, stream[bid]); + cudaMemcpyAsync(&parts_aos[first_part_tmp], &d_parts_aos[first_part_tmp], + bundle_n_parts * sizeof(struct part_aos_g), + cudaMemcpyDeviceToHost, stream[bid]); #ifdef CUDA_DEBUG - cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // - // Get error code - if (cu_error != cudaSuccess) { - fprintf(stderr, - "CUDA error with self density D2H memcpy: %s cpuid id is: %i\n ", - cudaGetErrorString(cu_error), r->cpuid); - error("Something's up with your cuda code"); - } + cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with self density D2H memcpy: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + error("Something's up with your cuda code"); + } #endif - }/*End of looping over bundles to launch in streams*/ + } /*End of looping over bundles to launch in streams*/ + + /* Make sure all the kernels and copies back are finished */ + cudaDeviceSynchronize(); + + /*Time end of GPU work*/ + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + /*Time unpacking*/ + clock_gettime(CLOCK_REALTIME, &t0); + /* Now copy the data back from the CPU thread-local buffers to the cells */ + /* Pack length counter for use in unpacking */ + int pack_length_unpack = 0; + for (int tid = 0; tid < tasks_packed; tid++) { + + struct cell *cii = pack_vars->cell_list[tid]; + struct task *tii = pack_vars->task_list[tid]; + + // struct cell *cii = ci_list_self_dens[tid]; + // struct task *tii = task_list_self_dens[tid]; + + while (cell_locktree(cii)) { + ; /* spin until we acquire the lock */ + } + /* Do the copy */ + runner_doself1_gpu_unpack_neat_aos_g(r, cii, parts_aos, 0, + &pack_length_unpack, tid, + pack_vars->count_max_parts, e); + + /* Record things for debugging */ + cii->gpu_done_g++; + + /* Release the lock */ + cell_unlocktree(cii); + + /*schedule my dependencies (Only unpacks really)*/ + enqueue_dependencies(s, tii); + /*Signal sleeping runners*/ + signal_sleeping_runners(s, tii); + + tii->gpu_done = 1; + } + /* Zero counters for the next pack operations */ + pack_vars->count_parts = 0; + pack_vars->tasks_packed = 0; + /*Time end of unpacking*/ + clock_gettime(CLOCK_REALTIME, &t1); + *packing_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +} /*End of GPU work Self Gradient*/ - /* Make sure all the kernels and copies back are finished */ - cudaDeviceSynchronize(); +void runner_doself1_launch_f4_g( + struct runner *r, struct scheduler *s, struct pack_vars_self *pack_vars, + struct cell *ci, struct task *t, struct part_aos_f4_g_send *parts_send, + struct part_aos_f4_g_recv *parts_recv, + struct part_aos_f4_g_send *d_parts_send, + struct part_aos_f4_g_recv *d_parts_recv, cudaStream_t *stream, float d_a, + float d_H, struct engine *e, double *packing_time, double *gpu_time, + int2 *task_first_part_f4, int2 *d_task_first_part_f4, cudaEvent_t *self_end, + double *unpack_time) { + + struct timespec t0, t1, tp0, tp1; + clock_gettime(CLOCK_REALTIME, &t0); + + /* Identify the number of GPU bundles to run in ideal case*/ + int nBundles_temp = pack_vars->nBundles; + + /*How many tasks have we packed?*/ + const int tasks_packed = pack_vars->tasks_packed; + + /*How many tasks should be in a bundle?*/ + const int bundle_size = pack_vars->bundle_size; + + /* Special case for incomplete bundles (when having leftover tasks not enough + * to fill a bundle) */ + if 
(pack_vars->launch_leftovers) { + nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; + // if(tasks_packed == 0) error("zero tasks packed but somehow got into + //GPU loop"); + pack_vars->bundle_first_part[nBundles_temp] = + task_first_part_f4[tasks_packed - 1].x; + } + /* Identify the last particle for each bundle of tasks */ + for (int bid = 0; bid < nBundles_temp - 1; bid++) { + pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; + } + /* special treatment for the last bundle */ + if (nBundles_temp > 1) + pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; + else + pack_vars->bundle_last_part[0] = pack_vars->count_parts; + + /* Launch the copies for each bundle and run the GPU kernel */ + /*We don't go into this loop if tasks_left_self == 1 as + nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ + int max_parts; + for (int bid = 0; bid < nBundles_temp; bid++) { + + max_parts = 0; + int parts_in_bundle = 0; + const int first_task = bid * bundle_size; + int last_task = (bid + 1) * bundle_size; + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { + if (tid < tasks_packed) { + /*Get an estimate for the max number of parts per cell in the bundle. + * Used for determining the number of GPU CUDA blocks*/ + int count = task_first_part_f4[tid].y - task_first_part_f4[tid].x; + parts_in_bundle += count; + max_parts = max(max_parts, count); + last_task = tid; + } + } - /*Time end of GPU work*/ - clock_gettime(CLOCK_REALTIME, &t1); - *gpu_time += (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - /*Time unpacking*/ - clock_gettime(CLOCK_REALTIME, &t0); - /* Now copy the data back from the CPU thread-local buffers to the cells */ - /* Pack length counter for use in unpacking */ - int pack_length_unpack=0; - for (int tid = 0; tid < tasks_packed; tid++) { - - struct cell *cii = pack_vars->cell_list[tid]; - struct task *tii = pack_vars->task_list[tid]; - -// struct cell *cii = ci_list_self_dens[tid]; -// struct task *tii = task_list_self_dens[tid]; - - while(cell_locktree(cii)) { - ; /* spin until we acquire the lock */ - } - /* Do the copy */ - runner_doself1_gpu_unpack_neat_aos_g(r, cii, parts_aos, 0, &pack_length_unpack, tid, pack_vars->count_max_parts, e); - - /* Record things for debugging */ - cii->gpu_done_g++; - - /* Release the lock */ - cell_unlocktree(cii); - - /*schedule my dependencies (Only unpacks really)*/ - enqueue_dependencies(s, tii); - /*Signal sleeping runners*/ - signal_sleeping_runners(s, tii); - - tii->gpu_done = 1; - } - /* Zero counters for the next pack operations */ - pack_vars->count_parts = 0; - pack_vars->tasks_packed = 0; - /*Time end of unpacking*/ - clock_gettime(CLOCK_REALTIME, &t1); - *packing_time += (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - } /*End of GPU work Self Gradient*/ - -void runner_doself1_launch_f4_g(struct runner *r, struct scheduler *s, struct pack_vars_self *pack_vars, struct cell *ci, - struct task *t, struct part_aos_f4_g_send *parts_send, struct part_aos_f4_g_recv *parts_recv, struct part_aos_f4_g_send *d_parts_send, - struct part_aos_f4_g_recv *d_parts_recv, cudaStream_t * stream, float d_a, float d_H, - struct engine *e, double *packing_time, double *gpu_time, int2 *task_first_part_f4, int2 *d_task_first_part_f4, - cudaEvent_t *self_end, double *unpack_time){ - - - struct timespec t0, t1, tp0, tp1; - clock_gettime(CLOCK_REALTIME, &t0); + const int first_part_tmp = pack_vars->bundle_first_part[bid]; + const int 
bundle_n_parts = + pack_vars->bundle_last_part[bid] - first_part_tmp; + + cudaMemcpyAsync(&d_task_first_part_f4[first_task], + &task_first_part_f4[first_task], + (last_task + 1 - first_task) * sizeof(int2), + cudaMemcpyHostToDevice, stream[bid]); - /* Identify the number of GPU bundles to run in ideal case*/ - int nBundles_temp = pack_vars->nBundles; - - /*How many tasks have we packed?*/ - const int tasks_packed = pack_vars->tasks_packed; - - /*How many tasks should be in a bundle?*/ - const int bundle_size = pack_vars->bundle_size; - - /* Special case for incomplete bundles (when having leftover tasks not enough to fill a bundle) */ - if (pack_vars->launch_leftovers) { - nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; -// if(tasks_packed == 0) error("zero tasks packed but somehow got into GPU loop"); - pack_vars->bundle_first_part[nBundles_temp] = task_first_part_f4[tasks_packed - 1].x; - } - /* Identify the last particle for each bundle of tasks */ - for (int bid = 0; bid < nBundles_temp - 1; bid++) { - pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; - } - /* special treatment for the last bundle */ - if(nBundles_temp > 1)pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; - else pack_vars->bundle_last_part[0] = pack_vars->count_parts; - - /* Launch the copies for each bundle and run the GPU kernel */ - /*We don't go into this loop if tasks_left_self == 1 as - nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ - int max_parts; - for (int bid = 0; bid < nBundles_temp; bid++) { - - max_parts = 0; - int parts_in_bundle = 0; - const int first_task = bid * bundle_size; - int last_task = (bid + 1) * bundle_size; - for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; - tid++) { - if (tid < tasks_packed) { - /*Get an estimate for the max number of parts per cell in the bundle. 
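
The f4 variants keep each task's particle range in a single int2 (x holds the first particle, y one past the last) and ship only the slice of metadata belonging to the current bundle, rather than synchronously copying whole int arrays per launch as the older path does. A minimal sketch of that slice copy, with illustrative variable names:

#include <cuda_runtime.h>

/* Sketch only: copy the int2 task ranges for tasks [first_task, last_task]
 * (inclusive) of one bundle onto the device, on that bundle's stream. */
void copy_bundle_metadata(const int2 *task_range, int2 *d_task_range,
                          int first_task, int last_task, cudaStream_t stream) {
  const int n_tasks = last_task + 1 - first_task;
  cudaMemcpyAsync(&d_task_range[first_task], &task_range[first_task],
                  n_tasks * sizeof(int2), cudaMemcpyHostToDevice, stream);
}

/* A task's particle count is then simply task_range[tid].y - task_range[tid].x. */
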
- * Used for determining the number of GPU CUDA blocks*/ - int count = task_first_part_f4[tid].y - task_first_part_f4[tid].x; - parts_in_bundle += count; - max_parts = max(max_parts, count); - last_task = tid; - } - } - - const int first_part_tmp = pack_vars->bundle_first_part[bid]; - const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp; - - cudaMemcpyAsync(&d_task_first_part_f4[first_task], &task_first_part_f4[first_task], - (last_task + 1 - first_task) * sizeof(int2), cudaMemcpyHostToDevice, stream[bid]); - - cudaMemcpyAsync(&d_parts_send[first_part_tmp], &parts_send[first_part_tmp], - bundle_n_parts * sizeof(struct part_aos_f4_g_send), cudaMemcpyHostToDevice, stream[bid]); -// fprintf(stderr, "bid %i first_part %i nparts %i\n", bid, first_part_tmp, bundle_n_parts); + cudaMemcpyAsync(&d_parts_send[first_part_tmp], &parts_send[first_part_tmp], + bundle_n_parts * sizeof(struct part_aos_f4_g_send), + cudaMemcpyHostToDevice, stream[bid]); + // fprintf(stderr, "bid %i first_part %i nparts %i\n", bid, + //first_part_tmp, bundle_n_parts); #ifdef CUDA_DEBUG - cudaError_t cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // - // Get error code - if (cu_error != cudaSuccess) { - fprintf( - stderr, - "CUDA error in gradient self host 2 device memcpy: %s cpuid id is: %i\n ", - cudaGetErrorString(cu_error), r->cpuid); - exit(0); - } + cudaError_t cu_error = + cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error in gradient self host 2 device memcpy: %s cpuid id " + "is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + exit(0); + } #endif - const int tasksperbundle = pack_vars->tasksperbundle; - int tasks_left = tasksperbundle; - if (bid == nBundles_temp - 1) { - tasks_left = - tasks_packed - (nBundles_temp - 1) * tasksperbundle; - } - // Will launch a 2d grid of GPU thread blocks (number of tasks is - // the y dimension and max_parts is the x dimension - int numBlocks_y = tasks_left; - int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; - int bundle_first_task = pack_vars->bundle_first_task_list[bid]; -// const char *loop_type = "density"; - // Launch the kernel - launch_gradient_aos_f4( - d_parts_send, d_parts_recv, d_a, d_H, stream[bid], numBlocks_x, numBlocks_y, bundle_first_task, - d_task_first_part_f4); + const int tasksperbundle = pack_vars->tasksperbundle; + int tasks_left = tasksperbundle; + if (bid == nBundles_temp - 1) { + tasks_left = tasks_packed - (nBundles_temp - 1) * tasksperbundle; + } + // Will launch a 2d grid of GPU thread blocks (number of tasks is + // the y dimension and max_parts is the x dimension + int numBlocks_y = tasks_left; + int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + int bundle_first_task = pack_vars->bundle_first_task_list[bid]; + // const char *loop_type = "density"; + // Launch the kernel + launch_gradient_aos_f4(d_parts_send, d_parts_recv, d_a, d_H, stream[bid], + numBlocks_x, numBlocks_y, bundle_first_task, + d_task_first_part_f4); #ifdef CUDA_DEBUG - cu_error = cudaPeekAtLastError(); // Get error code - if (cu_error != cudaSuccess) { - fprintf(stderr, - "CUDA error with self gradient kernel launch: %s cpuid id is: %i\n ", - cudaGetErrorString(cu_error), r->cpuid); - exit(0); - } + cu_error = cudaPeekAtLastError(); // Get error code + if (cu_error != cudaSuccess) { + fprintf( + stderr, + "CUDA error with self gradient kernel launch: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + exit(0); + } #endif - 
cudaMemcpyAsync(&parts_recv[first_part_tmp], &d_parts_recv[first_part_tmp], - bundle_n_parts * sizeof(struct part_aos_f4_g_recv), cudaMemcpyDeviceToHost, stream[bid]); - cudaEventRecord(self_end[bid], stream[bid]); + cudaMemcpyAsync(&parts_recv[first_part_tmp], &d_parts_recv[first_part_tmp], + bundle_n_parts * sizeof(struct part_aos_f4_g_recv), + cudaMemcpyDeviceToHost, stream[bid]); + cudaEventRecord(self_end[bid], stream[bid]); #ifdef CUDA_DEBUG - cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // - // Get error code - if (cu_error != cudaSuccess) { - fprintf(stderr, - "CUDA error with self gradient D2H memcpy: %s cpuid id is: %i\n ", - cudaGetErrorString(cu_error), r->cpuid); - error("Something's up with your cuda code"); - } + cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with self gradient D2H memcpy: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + error("Something's up with your cuda code"); + } #endif - }/*End of looping over bundles to launch in streams*/ -// exit(0); - /* Make sure all the kernels and copies back are finished */ -// cudaDeviceSynchronize(); - - /*Time end of GPU work*/ - clock_gettime(CLOCK_REALTIME, &t1); - *gpu_time += (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - - /* Now copy the data back from the CPU thread-local buffers to the cells */ - /* Pack length counter for use in unpacking */ - int pack_length_unpack=0; - for (int bid = 0; bid < nBundles_temp; bid++){ - - clock_gettime(CLOCK_REALTIME, &t0); - -// cudaStreamSynchronize(stream[bid]); - cudaEventSynchronize(self_end[bid]); - - clock_gettime(CLOCK_REALTIME, &t1); - *gpu_time += (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - - /*Time unpacking*/ -// clock_gettime(CLOCK_REALTIME, &tp0); - - for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { - - if(tid < tasks_packed){ - - struct cell *cii = pack_vars->cell_list[tid]; - struct task *tii = pack_vars->task_list[tid]; - - // struct cell *cii = ci_list_self_dens[tid]; - // struct task *tii = task_list_self_dens[tid]; - - while(cell_locktree(cii)) { - ; /* spin until we acquire the lock */ - } - /*Time unpacking*/ - clock_gettime(CLOCK_REALTIME, &tp0); - /* Do the copy */ - runner_doself1_gpu_unpack_neat_aos_f4_g(r, cii, parts_recv, 0, &pack_length_unpack, tid, pack_vars->count_max_parts, e); - /*Time end of unpacking*/ - clock_gettime(CLOCK_REALTIME, &tp1); - *unpack_time += (tp1.tv_sec - tp0.tv_sec) + - (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; - - /* Record things for debugging */ - cii->gpu_done_g++; - - /* Release the lock */ - cell_unlocktree(cii); - - /*schedule my dependencies (Only unpacks really)*/ - enqueue_dependencies(s, tii); - /*Signal sleeping runners*/ - signal_sleeping_runners(s, tii); - - tii->gpu_done = 1; - - } - } - /*Time end of unpacking*/ -// clock_gettime(CLOCK_REALTIME, &tp1); -// *unpack_time += (tp1.tv_sec - tp0.tv_sec) + -// (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; -// *packing_time += (tp1.tv_sec - tp0.tv_sec) + -// (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; - - } - /* Zero counters for the next pack operations */ - pack_vars->count_parts = 0; - pack_vars->tasks_packed = 0; - -} /*End of GPU work Self Gradient*/ + } /*End of looping over bundles to launch in streams*/ + // exit(0); + /* Make sure all the kernels and copies back are finished */ + // cudaDeviceSynchronize(); + /*Time end of GPU work*/ + 
clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; -void runner_doself1_launch_f(struct runner *r, struct scheduler *s, struct pack_vars_self *pack_vars, struct cell *ci, - struct task *t, struct part_aos_f *parts_aos, struct part_aos_f *d_parts_aos, cudaStream_t * stream, float d_a, float d_H, - struct engine *e, double *packing_time, double *gpu_time){ + /* Now copy the data back from the CPU thread-local buffers to the cells */ + /* Pack length counter for use in unpacking */ + int pack_length_unpack = 0; + for (int bid = 0; bid < nBundles_temp; bid++) { - struct timespec t0, t1; // clock_gettime(CLOCK_REALTIME, &t0); - /* Identify the number of GPU bundles to run in ideal case*/ - int nBundles_temp = pack_vars->nBundles; - - /*How many tasks have we packed?*/ - const int tasks_packed = pack_vars->tasks_packed; - - /*How many tasks should be in a bundle?*/ - const int bundle_size = pack_vars->bundle_size; - - /* Special case for incomplete bundles (when having leftover tasks not enough to fill a bundle) */ - if (pack_vars->launch_leftovers) { - nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; - if(tasks_packed == 0) error("zero tasks packed but somehow got into GPU loop"); - pack_vars->bundle_first_part[nBundles_temp] = pack_vars->task_first_part[tasks_packed - 1]; - } - /* Identify the last particle for each bundle of tasks */ - for (int bid = 0; bid < nBundles_temp - 1; bid++) { - pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; - } - /* special treatment for the last bundle */ - if(nBundles_temp > 1)pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; - else pack_vars->bundle_last_part[0] = pack_vars->count_parts; - /*Copy arrays containing first and last part for each task to GPU*/ - cudaMemcpy(pack_vars->d_task_first_part, pack_vars->task_first_part, - tasks_packed * sizeof(int), cudaMemcpyHostToDevice); - cudaMemcpy(pack_vars->d_task_last_part, pack_vars->task_last_part, - tasks_packed * sizeof(int), cudaMemcpyHostToDevice); - - /*Copy cell shifts to device*/ - cudaMemcpy(pack_vars->d_cellx, pack_vars->cellx, - tasks_packed * sizeof(double), cudaMemcpyHostToDevice); - cudaMemcpy(pack_vars->d_celly, pack_vars->celly, - tasks_packed * sizeof(double), cudaMemcpyHostToDevice); - cudaMemcpy(pack_vars->d_cellz, pack_vars->cellz, - tasks_packed * sizeof(double), cudaMemcpyHostToDevice); - - /* Launch the copies for each bundle and run the GPU kernel */ - /*We don't go into this loop if tasks_left_self == 1 as - nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ - int max_parts = 0; - for (int bid = 0; bid < nBundles_temp; bid++) { - - max_parts = 0; - int parts_in_bundle = 0; - for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; - tid++) { - if (tid < tasks_packed) { - /*Get an estimate for the max number of parts per cell in the bundle. 
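
The *gpu_time, *packing_time and *unpack_time accumulations above all repeat the same CLOCK_REALTIME difference. A small helper equivalent to those blocks, shown here only to make the unit explicit (seconds, with nanoseconds folded in):

#include <time.h>

/* Accumulate the elapsed wall-clock time between two CLOCK_REALTIME
 * samples, in seconds, into one of the timing counters. */
static void accumulate_elapsed(const struct timespec *t0,
                               const struct timespec *t1, double *acc) {
  *acc += (double)(t1->tv_sec - t0->tv_sec) +
          (double)(t1->tv_nsec - t0->tv_nsec) / 1e9;
}

/* Usage, mirroring the pattern in the launch functions:
 *   clock_gettime(CLOCK_REALTIME, &t0);
 *   ... GPU work ...
 *   clock_gettime(CLOCK_REALTIME, &t1);
 *   accumulate_elapsed(&t0, &t1, gpu_time); */
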
- * Used for determining the number of GPU CUDA blocks*/ - int count = pack_vars->task_last_part[tid] - pack_vars->task_first_part[tid]; - parts_in_bundle += count; - max_parts = max(max_parts, count); - } - } - - const int first_part_tmp = pack_vars->bundle_first_part[bid]; - const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp; - - cudaMemcpyAsync(&d_parts_aos[first_part_tmp], &parts_aos[first_part_tmp], - bundle_n_parts * sizeof(struct part_aos_f), cudaMemcpyHostToDevice, stream[bid]); + // cudaStreamSynchronize(stream[bid]); + cudaEventSynchronize(self_end[bid]); + + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + + /*Time unpacking*/ + // clock_gettime(CLOCK_REALTIME, &tp0); + + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { + + if (tid < tasks_packed) { + + struct cell *cii = pack_vars->cell_list[tid]; + struct task *tii = pack_vars->task_list[tid]; + + // struct cell *cii = ci_list_self_dens[tid]; + // struct task *tii = task_list_self_dens[tid]; + + while (cell_locktree(cii)) { + ; /* spin until we acquire the lock */ + } + /*Time unpacking*/ + clock_gettime(CLOCK_REALTIME, &tp0); + /* Do the copy */ + runner_doself1_gpu_unpack_neat_aos_f4_g(r, cii, parts_recv, 0, + &pack_length_unpack, tid, + pack_vars->count_max_parts, e); + /*Time end of unpacking*/ + clock_gettime(CLOCK_REALTIME, &tp1); + *unpack_time += (tp1.tv_sec - tp0.tv_sec) + + (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; + + /* Record things for debugging */ + cii->gpu_done_g++; + + /* Release the lock */ + cell_unlocktree(cii); + + /*schedule my dependencies (Only unpacks really)*/ + enqueue_dependencies(s, tii); + /*Signal sleeping runners*/ + signal_sleeping_runners(s, tii); + + tii->gpu_done = 1; + } + } + /*Time end of unpacking*/ + // clock_gettime(CLOCK_REALTIME, &tp1); + // *unpack_time += (tp1.tv_sec - tp0.tv_sec) + + // (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; + // *packing_time += (tp1.tv_sec - tp0.tv_sec) + + // (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; + } + /* Zero counters for the next pack operations */ + pack_vars->count_parts = 0; + pack_vars->tasks_packed = 0; + +} /*End of GPU work Self Gradient*/ + +void runner_doself1_launch_f(struct runner *r, struct scheduler *s, + struct pack_vars_self *pack_vars, struct cell *ci, + struct task *t, struct part_aos_f *parts_aos, + struct part_aos_f *d_parts_aos, + cudaStream_t *stream, float d_a, float d_H, + struct engine *e, double *packing_time, + double *gpu_time) { + + struct timespec t0, t1; // + clock_gettime(CLOCK_REALTIME, &t0); + + /* Identify the number of GPU bundles to run in ideal case*/ + int nBundles_temp = pack_vars->nBundles; + + /*How many tasks have we packed?*/ + const int tasks_packed = pack_vars->tasks_packed; + + /*How many tasks should be in a bundle?*/ + const int bundle_size = pack_vars->bundle_size; + + /* Special case for incomplete bundles (when having leftover tasks not enough + * to fill a bundle) */ + if (pack_vars->launch_leftovers) { + nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; + if (tasks_packed == 0) + error("zero tasks packed but somehow got into GPU loop"); + pack_vars->bundle_first_part[nBundles_temp] = + pack_vars->task_first_part[tasks_packed - 1]; + } + /* Identify the last particle for each bundle of tasks */ + for (int bid = 0; bid < nBundles_temp - 1; bid++) { + pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; + } + /* special treatment for the 
last bundle */ + if (nBundles_temp > 1) + pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; + else + pack_vars->bundle_last_part[0] = pack_vars->count_parts; + /*Copy arrays containing first and last part for each task to GPU*/ + cudaMemcpy(pack_vars->d_task_first_part, pack_vars->task_first_part, + tasks_packed * sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(pack_vars->d_task_last_part, pack_vars->task_last_part, + tasks_packed * sizeof(int), cudaMemcpyHostToDevice); + + /*Copy cell shifts to device*/ + cudaMemcpy(pack_vars->d_cellx, pack_vars->cellx, + tasks_packed * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(pack_vars->d_celly, pack_vars->celly, + tasks_packed * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(pack_vars->d_cellz, pack_vars->cellz, + tasks_packed * sizeof(double), cudaMemcpyHostToDevice); + + /* Launch the copies for each bundle and run the GPU kernel */ + /*We don't go into this loop if tasks_left_self == 1 as + nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ + int max_parts = 0; + for (int bid = 0; bid < nBundles_temp; bid++) { + + max_parts = 0; + int parts_in_bundle = 0; + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { + if (tid < tasks_packed) { + /*Get an estimate for the max number of parts per cell in the bundle. + * Used for determining the number of GPU CUDA blocks*/ + int count = + pack_vars->task_last_part[tid] - pack_vars->task_first_part[tid]; + parts_in_bundle += count; + max_parts = max(max_parts, count); + } + } + + const int first_part_tmp = pack_vars->bundle_first_part[bid]; + const int bundle_n_parts = + pack_vars->bundle_last_part[bid] - first_part_tmp; + + cudaMemcpyAsync(&d_parts_aos[first_part_tmp], &parts_aos[first_part_tmp], + bundle_n_parts * sizeof(struct part_aos_f), + cudaMemcpyHostToDevice, stream[bid]); #ifdef CUDA_DEBUG - cudaError_t cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // - // Get error code - if (cu_error != cudaSuccess) { - fprintf( - stderr, - "CUDA error in density self host 2 device memcpy: %s cpuid id is: %i\n ", - cudaGetErrorString(cu_error), r->cpuid); - exit(0); - } + cudaError_t cu_error = + cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error in density self host 2 device memcpy: %s cpuid id " + "is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + exit(0); + } #endif - const int tasksperbundle = pack_vars->tasksperbundle; - int tasks_left = tasksperbundle; - if (bid == nBundles_temp - 1) { - tasks_left = - tasks_packed - (nBundles_temp - 1) * tasksperbundle; - } - // Will launch a 2d grid of GPU thread blocks (number of tasks is - // the y dimension and max_parts is the x dimension - int numBlocks_y = tasks_left; - int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; - int bundle_first_task = pack_vars->bundle_first_task_list[bid]; - const char *loop_type = "density"; - // Launch the kernel - launch_force_aos( - d_parts_aos, pack_vars->d_task_first_part, pack_vars->d_task_last_part, d_a, d_H, loop_type, - stream[bid], BLOCK_SIZE, tasks_packed, tasksperbundle, - numBlocks_x, numBlocks_y, bundle_first_task, - max_parts, pack_vars->d_cellx, pack_vars->d_celly, pack_vars->d_cellz); + const int tasksperbundle = pack_vars->tasksperbundle; + int tasks_left = tasksperbundle; + if (bid == nBundles_temp - 1) { + tasks_left = tasks_packed - (nBundles_temp - 1) * tasksperbundle; + } + // Will launch a 2d grid of GPU thread blocks (number of tasks is + 
// the y dimension and max_parts is the x dimension + int numBlocks_y = tasks_left; + int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + int bundle_first_task = pack_vars->bundle_first_task_list[bid]; + const char *loop_type = "density"; + // Launch the kernel + launch_force_aos( + d_parts_aos, pack_vars->d_task_first_part, pack_vars->d_task_last_part, + d_a, d_H, loop_type, stream[bid], BLOCK_SIZE, tasks_packed, + tasksperbundle, numBlocks_x, numBlocks_y, bundle_first_task, max_parts, + pack_vars->d_cellx, pack_vars->d_celly, pack_vars->d_cellz); #ifdef CUDA_DEBUG - cu_error = cudaPeekAtLastError(); // Get error code - if (cu_error != cudaSuccess) { - fprintf(stderr, - "CUDA error with self force kernel launch: %s cpuid id is: %i\n ", - cudaGetErrorString(cu_error), r->cpuid); - exit(0); - } + cu_error = cudaPeekAtLastError(); // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with self force kernel launch: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + exit(0); + } #endif - cudaMemcpyAsync(&parts_aos[first_part_tmp], &d_parts_aos[first_part_tmp], - bundle_n_parts * sizeof(struct part_aos_f), cudaMemcpyDeviceToHost, stream[bid]); + cudaMemcpyAsync(&parts_aos[first_part_tmp], &d_parts_aos[first_part_tmp], + bundle_n_parts * sizeof(struct part_aos_f), + cudaMemcpyDeviceToHost, stream[bid]); #ifdef CUDA_DEBUG - cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // - // Get error code - if (cu_error != cudaSuccess) { - fprintf(stderr, - "CUDA error with self firce D2H memcpy: %s cpuid id is: %i\n ", - cudaGetErrorString(cu_error), r->cpuid); - error("Something's up with your cuda code"); - } + cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with self firce D2H memcpy: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + error("Something's up with your cuda code"); + } #endif - }/*End of looping over bundles to launch in streams*/ + } /*End of looping over bundles to launch in streams*/ + + /* Make sure all the kernels and copies back are finished */ + cudaDeviceSynchronize(); + + /*Time end of GPU work*/ + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + /*Time unpacking*/ + clock_gettime(CLOCK_REALTIME, &t0); + /* Now copy the data back from the CPU thread-local buffers to the cells */ + /* Pack length counter for use in unpacking */ + int pack_length_unpack = 0; + for (int tid = 0; tid < tasks_packed; tid++) { + + struct cell *cii = pack_vars->cell_list[tid]; + struct task *tii = pack_vars->task_list[tid]; + + // struct cell *cii = ci_list_self_dens[tid]; + // struct task *tii = task_list_self_dens[tid]; + + while (cell_locktree(cii)) { + ; /* spin until we acquire the lock */ + } + /* Do the copy */ + runner_doself1_gpu_unpack_neat_aos_f(r, cii, parts_aos, 0, + &pack_length_unpack, tid, + pack_vars->count_max_parts, e); + + /* Record things for debugging */ + cii->gpu_done_f++; + + /* Release the lock */ + cell_unlocktree(cii); + + /*schedule my dependencies (Only unpacks really)*/ + enqueue_dependencies(s, tii); + /*Signal sleeping runners*/ + signal_sleeping_runners(s, tii); + + tii->gpu_done = 1; + } + /* Zero counters for the next pack operations */ + pack_vars->count_parts = 0; + pack_vars->tasks_packed = 0; + /*Time end of unpacking*/ + clock_gettime(CLOCK_REALTIME, &t1); + *packing_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec 
- t0.tv_nsec) / 1000000000.0; +} /*End of GPU work Self Gradient*/ - /* Make sure all the kernels and copies back are finished */ - cudaDeviceSynchronize(); +void runner_doself1_launch_f4_f( + struct runner *r, struct scheduler *s, struct pack_vars_self *pack_vars, + struct cell *ci, struct task *t, struct part_aos_f4_f_send *parts_send, + struct part_aos_f4_f_recv *parts_recv, + struct part_aos_f4_f_send *d_parts_send, + struct part_aos_f4_f_recv *d_parts_recv, cudaStream_t *stream, float d_a, + float d_H, struct engine *e, double *packing_time, double *gpu_time, + int2 *task_first_part_f4_f, int2 *d_task_first_part_f4_f, + cudaEvent_t *self_end, double *unpack_time) { + + struct timespec t0, t1, tp0, tp1; // + clock_gettime(CLOCK_REALTIME, &t0); + + /* Identify the number of GPU bundles to run in ideal case*/ + int nBundles_temp = pack_vars->nBundles; + + /*How many tasks have we packed?*/ + const int tasks_packed = pack_vars->tasks_packed; + + /*How many tasks should be in a bundle?*/ + const int bundle_size = pack_vars->bundle_size; + + /* Special case for incomplete bundles (when having leftover tasks not enough + * to fill a bundle) */ + if (pack_vars->launch_leftovers) { + nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; + if (tasks_packed == 0) + error("zero tasks packed but somehow got into GPU loop"); + pack_vars->bundle_first_part[nBundles_temp] = + task_first_part_f4_f[tasks_packed - 1].x; + } + /* Identify the last particle for each bundle of tasks */ + for (int bid = 0; bid < nBundles_temp - 1; bid++) { + pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; + } + /* special treatment for the last bundle */ + if (nBundles_temp > 1) + pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; + else + pack_vars->bundle_last_part[0] = pack_vars->count_parts; + /*Copy arrays containing first and last part for each task to GPU*/ + // cudaMemcpy(pack_vars->d_task_first_part, pack_vars->task_first_part, + // tasks_packed * sizeof(int), cudaMemcpyHostToDevice); + // cudaMemcpy(pack_vars->d_task_last_part, pack_vars->task_last_part, + // tasks_packed * sizeof(int), cudaMemcpyHostToDevice); + + /*Copy cell shifts to device*/ + // cudaMemcpy(pack_vars->d_cellx, pack_vars->cellx, + // tasks_packed * sizeof(double), cudaMemcpyHostToDevice); + // cudaMemcpy(pack_vars->d_celly, pack_vars->celly, + // tasks_packed * sizeof(double), cudaMemcpyHostToDevice); + // cudaMemcpy(pack_vars->d_cellz, pack_vars->cellz, + // tasks_packed * sizeof(double), cudaMemcpyHostToDevice); + + /* Launch the copies for each bundle and run the GPU kernel */ + /*We don't go into this loop if tasks_left_self == 1 as + nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ + int max_parts = 0; + for (int bid = 0; bid < nBundles_temp; bid++) { + + max_parts = 0; + int parts_in_bundle = 0; + const int first_task = bid * bundle_size; + int last_task = (bid + 1) * bundle_size; + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { + if (tid < tasks_packed) { + /*Get an estimate for the max number of parts per cell in the bundle. 
+ * Used for determining the number of GPU CUDA blocks*/ + int count = task_first_part_f4_f[tid].y - task_first_part_f4_f[tid].x; + parts_in_bundle += count; + max_parts = max(max_parts, count); + last_task = tid; + } + } - /*Time end of GPU work*/ - clock_gettime(CLOCK_REALTIME, &t1); - *gpu_time += (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - /*Time unpacking*/ - clock_gettime(CLOCK_REALTIME, &t0); - /* Now copy the data back from the CPU thread-local buffers to the cells */ - /* Pack length counter for use in unpacking */ - int pack_length_unpack=0; - for (int tid = 0; tid < tasks_packed; tid++) { - - struct cell *cii = pack_vars->cell_list[tid]; - struct task *tii = pack_vars->task_list[tid]; - -// struct cell *cii = ci_list_self_dens[tid]; -// struct task *tii = task_list_self_dens[tid]; - - while(cell_locktree(cii)) { - ; /* spin until we acquire the lock */ - } - /* Do the copy */ - runner_doself1_gpu_unpack_neat_aos_f(r, cii, parts_aos, 0, &pack_length_unpack, tid, pack_vars->count_max_parts, e); - - /* Record things for debugging */ - cii->gpu_done_f++; - - /* Release the lock */ - cell_unlocktree(cii); - - /*schedule my dependencies (Only unpacks really)*/ - enqueue_dependencies(s, tii); - /*Signal sleeping runners*/ - signal_sleeping_runners(s, tii); - - tii->gpu_done = 1; - } - /* Zero counters for the next pack operations */ - pack_vars->count_parts = 0; - pack_vars->tasks_packed = 0; - /*Time end of unpacking*/ - clock_gettime(CLOCK_REALTIME, &t1); - *packing_time += (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - } /*End of GPU work Self Gradient*/ - -void runner_doself1_launch_f4_f(struct runner *r, struct scheduler *s, struct pack_vars_self *pack_vars, struct cell *ci, - struct task *t, struct part_aos_f4_f_send *parts_send, struct part_aos_f4_f_recv *parts_recv, - struct part_aos_f4_f_send *d_parts_send, struct part_aos_f4_f_recv *d_parts_recv, cudaStream_t * stream, float d_a, float d_H, - struct engine *e, double *packing_time, double *gpu_time, int2 *task_first_part_f4_f, int2 *d_task_first_part_f4_f, cudaEvent_t * self_end, - double *unpack_time){ - - struct timespec t0, t1, tp0, tp1; // - clock_gettime(CLOCK_REALTIME, &t0); + const int first_part_tmp = pack_vars->bundle_first_part[bid]; + const int bundle_n_parts = + pack_vars->bundle_last_part[bid] - first_part_tmp; + cudaMemcpyAsync(&d_task_first_part_f4_f[first_task], + &task_first_part_f4_f[first_task], + (last_task + 1 - first_task) * sizeof(int2), + cudaMemcpyHostToDevice, stream[bid]); - /* Identify the number of GPU bundles to run in ideal case*/ - int nBundles_temp = pack_vars->nBundles; - - /*How many tasks have we packed?*/ - const int tasks_packed = pack_vars->tasks_packed; - - /*How many tasks should be in a bundle?*/ - const int bundle_size = pack_vars->bundle_size; - - /* Special case for incomplete bundles (when having leftover tasks not enough to fill a bundle) */ - if (pack_vars->launch_leftovers) { - nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; - if(tasks_packed == 0) error("zero tasks packed but somehow got into GPU loop"); - pack_vars->bundle_first_part[nBundles_temp] = task_first_part_f4_f[tasks_packed - 1].x; - } - /* Identify the last particle for each bundle of tasks */ - for (int bid = 0; bid < nBundles_temp - 1; bid++) { - pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; - } - /* special treatment for the last bundle */ - if(nBundles_temp > 1)pack_vars->bundle_last_part[nBundles_temp - 1] = 
pack_vars->count_parts; - else pack_vars->bundle_last_part[0] = pack_vars->count_parts; - /*Copy arrays containing first and last part for each task to GPU*/ -// cudaMemcpy(pack_vars->d_task_first_part, pack_vars->task_first_part, -// tasks_packed * sizeof(int), cudaMemcpyHostToDevice); -// cudaMemcpy(pack_vars->d_task_last_part, pack_vars->task_last_part, -// tasks_packed * sizeof(int), cudaMemcpyHostToDevice); - - /*Copy cell shifts to device*/ -// cudaMemcpy(pack_vars->d_cellx, pack_vars->cellx, -// tasks_packed * sizeof(double), cudaMemcpyHostToDevice); -// cudaMemcpy(pack_vars->d_celly, pack_vars->celly, -// tasks_packed * sizeof(double), cudaMemcpyHostToDevice); -// cudaMemcpy(pack_vars->d_cellz, pack_vars->cellz, -// tasks_packed * sizeof(double), cudaMemcpyHostToDevice); - - /* Launch the copies for each bundle and run the GPU kernel */ - /*We don't go into this loop if tasks_left_self == 1 as - nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ - int max_parts = 0; - for (int bid = 0; bid < nBundles_temp; bid++) { - - max_parts = 0; - int parts_in_bundle = 0; - const int first_task = bid * bundle_size; - int last_task = (bid + 1) * bundle_size; - for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; - tid++) { - if (tid < tasks_packed) { - /*Get an estimate for the max number of parts per cell in the bundle. - * Used for determining the number of GPU CUDA blocks*/ - int count = task_first_part_f4_f[tid].y - task_first_part_f4_f[tid].x; - parts_in_bundle += count; - max_parts = max(max_parts, count); - last_task = tid; - } - } - - const int first_part_tmp = pack_vars->bundle_first_part[bid]; - const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp; - cudaMemcpyAsync(&d_task_first_part_f4_f[first_task], &task_first_part_f4_f[first_task], - (last_task + 1 - first_task) * sizeof(int2), cudaMemcpyHostToDevice, stream[bid]); - - cudaMemcpyAsync(&d_parts_send[first_part_tmp], &parts_send[first_part_tmp], - bundle_n_parts * sizeof(struct part_aos_f4_f_send), cudaMemcpyHostToDevice, stream[bid]); + cudaMemcpyAsync(&d_parts_send[first_part_tmp], &parts_send[first_part_tmp], + bundle_n_parts * sizeof(struct part_aos_f4_f_send), + cudaMemcpyHostToDevice, stream[bid]); #ifdef CUDA_DEBUG - cudaError_t cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // - // Get error code - if (cu_error != cudaSuccess) { - fprintf( - stderr, - "CUDA error in density self host 2 device memcpy: %s cpuid id is: %i\n ", - cudaGetErrorString(cu_error), r->cpuid); - exit(0); - } + cudaError_t cu_error = + cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error in density self host 2 device memcpy: %s cpuid id " + "is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + exit(0); + } #endif - const int tasksperbundle = pack_vars->tasksperbundle; - int tasks_left = tasksperbundle; - if (bid == nBundles_temp - 1) { - tasks_left = - tasks_packed - (nBundles_temp - 1) * tasksperbundle; - } - // Will launch a 2d grid of GPU thread blocks (number of tasks is - // the y dimension and max_parts is the x dimension - int numBlocks_y = tasks_left; - int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; - int bundle_first_task = pack_vars->bundle_first_task_list[bid]; - // Launch the kernel - launch_force_aos_f4( - d_parts_send, d_parts_recv, d_a, d_H, stream[bid], numBlocks_x, numBlocks_y, bundle_first_task, - d_task_first_part_f4_f); + const int tasksperbundle = pack_vars->tasksperbundle; + int 
tasks_left = tasksperbundle; + if (bid == nBundles_temp - 1) { + tasks_left = tasks_packed - (nBundles_temp - 1) * tasksperbundle; + } + // Will launch a 2d grid of GPU thread blocks (number of tasks is + // the y dimension and max_parts is the x dimension + int numBlocks_y = tasks_left; + int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + int bundle_first_task = pack_vars->bundle_first_task_list[bid]; + // Launch the kernel + launch_force_aos_f4(d_parts_send, d_parts_recv, d_a, d_H, stream[bid], + numBlocks_x, numBlocks_y, bundle_first_task, + d_task_first_part_f4_f); #ifdef CUDA_DEBUG - cu_error = cudaPeekAtLastError(); // Get error code - if (cu_error != cudaSuccess) { - fprintf(stderr, - "CUDA error with self force kernel launch: %s cpuid id is: %i\n ", - cudaGetErrorString(cu_error), r->cpuid); - exit(0); - } + cu_error = cudaPeekAtLastError(); // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with self force kernel launch: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + exit(0); + } #endif - cudaMemcpyAsync(&parts_recv[first_part_tmp], &d_parts_recv[first_part_tmp], - bundle_n_parts * sizeof(struct part_aos_f4_f_recv), cudaMemcpyDeviceToHost, stream[bid]); - cudaEventRecord(self_end[bid], stream[bid]); + cudaMemcpyAsync(&parts_recv[first_part_tmp], &d_parts_recv[first_part_tmp], + bundle_n_parts * sizeof(struct part_aos_f4_f_recv), + cudaMemcpyDeviceToHost, stream[bid]); + cudaEventRecord(self_end[bid], stream[bid]); #ifdef CUDA_DEBUG - cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // - // Get error code - if (cu_error != cudaSuccess) { - fprintf(stderr, - "CUDA error with self firce D2H memcpy: %s cpuid id is: %i\n ", - cudaGetErrorString(cu_error), r->cpuid); - error("Something's up with your cuda code"); - } + cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with self firce D2H memcpy: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + error("Something's up with your cuda code"); + } #endif - }/*End of looping over bundles to launch in streams*/ - - /* Make sure all the kernels and copies back are finished */ -// cudaDeviceSynchronize(); - - /*Time end of GPU work*/ - clock_gettime(CLOCK_REALTIME, &t1); - *gpu_time += (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - - /* Now copy the data back from the CPU thread-local buffers to the cells */ - /* Pack length counter for use in unpacking */ - int pack_length_unpack=0; - for (int bid = 0; bid < nBundles_temp; bid++){ - - clock_gettime(CLOCK_REALTIME, &t0); - -// cudaStreamSynchronize(stream[bid]); - cudaEventSynchronize(self_end[bid]); - - clock_gettime(CLOCK_REALTIME, &t1); - *gpu_time += (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - - /*Time unpacking*/ -// clock_gettime(CLOCK_REALTIME, &tp0); - - for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { - - if(tid < tasks_packed){ - struct cell *cii = pack_vars->cell_list[tid]; - struct task *tii = pack_vars->task_list[tid]; - - // struct cell *cii = ci_list_self_dens[tid]; - // struct task *tii = task_list_self_dens[tid]; - - while(cell_locktree(cii)) { - ; /* spin until we acquire the lock */ - } - clock_gettime(CLOCK_REALTIME, &tp0); - /* Do the copy */ - runner_doself1_gpu_unpack_neat_aos_f4_f(r, cii, parts_recv, 0, &pack_length_unpack, tid, pack_vars->count_max_parts, e); - - /* Record things for debugging */ - 
cii->gpu_done_f++; - clock_gettime(CLOCK_REALTIME, &tp1); - *unpack_time += (tp1.tv_sec - tp0.tv_sec) + - (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; - - /* Release the lock */ - cell_unlocktree(cii); - - /*schedule my dependencies (Only unpacks really)*/ - enqueue_dependencies(s, tii); - /*Signal sleeping runners*/ - signal_sleeping_runners(s, tii); - - tii->gpu_done = 1; - } - } - /*Time end of unpacking*/ -// clock_gettime(CLOCK_REALTIME, &tp1); -// *unpack_time += (tp1.tv_sec - tp0.tv_sec) + -// (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; - } - - /* Zero counters for the next pack operations */ - pack_vars->count_parts = 0; - pack_vars->tasks_packed = 0; - } /*End of GPU work Self Gradient*/ - -void runner_dopair1_launch(struct runner *r, struct scheduler *s, struct pack_vars_pair *pack_vars, struct cell *ci, - struct task *t, struct part_aos *parts_aos, struct part_aos *d_parts_aos, cudaStream_t * stream, float d_a, float d_H, - struct engine *e, double *packing_time, double *gpu_time){ - - struct timespec t0, t1; // + } /*End of looping over bundles to launch in streams*/ + + /* Make sure all the kernels and copies back are finished */ + // cudaDeviceSynchronize(); + + /*Time end of GPU work*/ + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + + /* Now copy the data back from the CPU thread-local buffers to the cells */ + /* Pack length counter for use in unpacking */ + int pack_length_unpack = 0; + for (int bid = 0; bid < nBundles_temp; bid++) { + clock_gettime(CLOCK_REALTIME, &t0); - /* Identify the number of GPU bundles to run in ideal case*/ - int nBundles_temp = pack_vars->nBundles; - /*How many tasks have we packed?*/ - const int tasks_packed = pack_vars->tasks_packed; - - /*How many tasks should be in a bundle?*/ - const int bundle_size = pack_vars->bundle_size; - - /*tasks-packed needs decrementing before calculating packed_tmp as it was incremented in runner_dopair1_pack*/ - const int packed_tmp = 2 * (tasks_packed - 1); - - /* Special case for incomplete bundles (when having leftover tasks not enough to fill a bundle) */ - if (pack_vars->launch_leftovers) { - nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; - if(tasks_packed == 0) error("zero pair tasks packed but somehow got into GPU loop"); - pack_vars->bundle_first_part[nBundles_temp] = pack_vars->task_first_part[packed_tmp - 2]; - } - /* Identify the last particle for each bundle of tasks */ - for (int bid = 0; bid < nBundles_temp - 1; bid++) { - pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; - } - /* special treatment for the last bundle */ - if(nBundles_temp > 1)pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; - else pack_vars->bundle_last_part[0] = pack_vars->count_parts; - /*Copy arrays containing first and last part for each task to GPU*/ - cudaMemcpy(pack_vars->d_task_first_part, pack_vars->task_first_part, - 2 * tasks_packed * sizeof(int), cudaMemcpyHostToDevice); - cudaMemcpy(pack_vars->d_task_last_part, pack_vars->task_last_part, - 2 * tasks_packed * sizeof(int), cudaMemcpyHostToDevice); - - /*Copy cell shifts to device*/ -// cudaMemcpy(pack_vars->d_shiftx, pack_vars->shiftx, -// 2 * tasks_packed * sizeof(double), cudaMemcpyHostToDevice); -// cudaMemcpy(pack_vars->d_shifty, pack_vars->shifty, -// 2 * tasks_packed * sizeof(double), cudaMemcpyHostToDevice); -// cudaMemcpy(pack_vars->d_shiftz, pack_vars->shiftz, -// 2 * tasks_packed * sizeof(double), cudaMemcpyHostToDevice); - 
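
[Editor's note] Every launch routine in this patch repeats the same bundle bookkeeping: the packed tasks are split into bundles of bundle_size, a leftover launch rounds the bundle count up, each bundle's last particle is the next bundle's first, and the final bundle is capped at count_parts. The following standalone sketch only illustrates that index arithmetic; it is not part of the patch, and the array sizes and numeric values are assumptions.

/* Sketch of the bundle index arithmetic used by the launch functions
 * (illustrative values only; compile as plain C). */
#include <stdio.h>

int main(void) {
  const int tasks_packed = 10;  /* assumed example value */
  const int bundle_size = 4;    /* tasks per bundle */
  const int count_parts = 1000; /* total packed particles */

  /* Example first-particle offsets, one per bundle (assumed values). */
  int bundle_first_part[4] = {0, 400, 800, 0};
  int bundle_last_part[4];

  /* Leftover tasks round the bundle count up, as in launch_leftovers. */
  const int nBundles = (tasks_packed + bundle_size - 1) / bundle_size;

  /* Each bundle ends where the next one starts... */
  for (int bid = 0; bid < nBundles - 1; bid++)
    bundle_last_part[bid] = bundle_first_part[bid + 1];
  /* ...and the final bundle is capped by the total particle count. */
  bundle_last_part[nBundles - 1] = count_parts;

  for (int bid = 0; bid < nBundles; bid++)
    printf("bundle %d: parts [%d, %d)\n", bid, bundle_first_part[bid],
           bundle_last_part[bid]);
  return 0;
}
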
- /* Launch the copies for each bundle and run the GPU kernel */ - /*We don't go into this loop if tasks_left_self == 1 as - nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ - for (int bid = 0; bid < nBundles_temp; bid++) { - - int max_parts_i = 0; - int max_parts_j = 0; - int parts_in_bundle_ci = 0; - int parts_in_bundle_cj = 0; - for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; - tid++) { - if (tid < tasks_packed) { - /*Get an estimate for the max number of parts per cell in each bundle. - * Used for determining the number of GPU CUDA blocks*/ - const int tid_tmp = 2 * tid; - int count_i = pack_vars->task_last_part[tid_tmp] - - pack_vars->task_first_part[tid_tmp]; - parts_in_bundle_ci += count_i; - max_parts_i = max(max_parts_i, count_i); - int count_j = pack_vars->task_last_part[tid_tmp + 1] - - pack_vars->task_first_part[tid_tmp + 1]; - parts_in_bundle_cj += count_j; - max_parts_j = max(max_parts_j, count_j); - } - } - const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; - const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp_i; - cudaMemcpyAsync(&d_parts_aos[first_part_tmp_i], &parts_aos[first_part_tmp_i], - bundle_n_parts * sizeof(struct part_aos), cudaMemcpyHostToDevice, stream[bid]); - -//#ifdef CUDA_DEBUG -// cudaError_t cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // -// // Get error code -// if (cu_error != cudaSuccess) { -// fprintf(stderr, -// "CUDA error with pair density H2D async memcpy ci: %s cpuid id is: %i\n ", -// cudaGetErrorString(cu_error), r->cpuid); -// error("Something's up with your cuda code"); -// } -//#endif - - const int tasksperbundle = pack_vars->tasksperbundle; - /* LAUNCH THE GPU KERNELS for ci & cj */ - int tid = 0; - int offset = bid * tasksperbundle; - int tasks_left = tasksperbundle; - if (bid == nBundles_temp - 1) { - tasks_left = - tasks_packed - (nBundles_temp - 1) * tasksperbundle; - } + // cudaStreamSynchronize(stream[bid]); + cudaEventSynchronize(self_end[bid]); + + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - // Setup 2d grid of GPU thread blocks for ci (number of tasks is - // the y dimension and max_parts is the x dimension - int numBlocks_y = tasks_left; - int bundle_first_task = pack_vars->bundle_first_task_list[bid]; - const char *loop_type = "density"; - - /* Launch the kernel for ci using data for ci and cj */ - runner_dopairci_branch_density_gpu_aos(d_parts_aos, - pack_vars->d_task_first_part, pack_vars->d_task_last_part, - d_a, d_H, loop_type, stream[bid], bid, BLOCK_SIZE, - tasks_packed, tasksperbundle, max_parts_i, max_parts_j, numBlocks_y, tid, - offset, bundle_first_task, pack_vars->d_shiftx, pack_vars->d_shifty, pack_vars->d_shiftz); - - /* Launch the kernel for ci using data for ci and cj */ - runner_dopaircj_branch_density_gpu_aos(d_parts_aos, - pack_vars->d_task_first_part, pack_vars->d_task_last_part, - d_a, d_H, loop_type, stream[bid], bid, BLOCK_SIZE, - tasks_packed, tasksperbundle, max_parts_i, max_parts_j, numBlocks_y, tid, - offset, bundle_first_task, pack_vars->d_shiftx, pack_vars->d_shifty, pack_vars->d_shiftz); - -//#ifdef CUDA_DEBUG -// cu_error = cudaPeekAtLastError(); // Get error code -// if (cu_error != cudaSuccess) { -// fprintf(stderr, -// "CUDA error with self density kernel launch: %s cpuid id is: %i\n ", -// cudaGetErrorString(cu_error), r->cpuid); -// exit(0); -// } -//#endif - - // Copy results back to CPU BUFFERS - cudaMemcpyAsync(&parts_aos[first_part_tmp_i], 
&d_parts_aos[first_part_tmp_i], - bundle_n_parts * sizeof(struct part_aos), cudaMemcpyDeviceToHost, stream[bid]); - -//#ifdef CUDA_DEBUG -// cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // -// // Get error code -// if (cu_error != cudaSuccess) { -// fprintf(stderr, -// "CUDA error with self density D2H memcpy: %s cpuid id is: %i\n ", -// cudaGetErrorString(cu_error), r->cpuid); -// error("Something's up with your cuda code"); -// } -//#endif - }/*End of looping over bundles to launch in streams*/ - - /* Make sure all the kernels and copies back are finished */ - cudaDeviceSynchronize(); - - /*Time end of GPU work*/ - clock_gettime(CLOCK_REALTIME, &t1); - *gpu_time += (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; /*Time unpacking*/ - clock_gettime(CLOCK_REALTIME, &t0); - /* Now copy the data back from the CPU thread-local buffers to the cells */ - /* Pack length counter for use in unpacking */ - int pack_length_unpack=0; - for (int tid = 0; tid < tasks_packed; tid++) { - - /*grab cell and task pointers*/ - struct cell *cii = pack_vars->ci_list[tid]; - struct cell *cjj = pack_vars->cj_list[tid]; - struct task *tii = pack_vars->task_list[tid]; - - /*Let's lock ci*/ - while(cell_locktree(cii)) { - ; /* spin until we acquire the lock */ - } - /*Let's lock cj*/ - while(cell_locktree(cjj)) { - ; /* spin until we acquire the lock */ - } - /* Do the copy */ - runner_do_ci_cj_gpu_unpack_neat_aos(r, cii, cjj, parts_aos, 0, - &pack_length_unpack, tid, 2 * pack_vars->count_max_parts, e); - - /* Record things for debugging */ - cii->gpu_done_pair++; - cjj->gpu_done_pair++; - - tii->gpu_done = 1; - - /*schedule my dependencies (Only unpacks really)*/ - enqueue_dependencies(s, tii); - /*Signal sleeping runners*/ - signal_sleeping_runners(s, tii); - - /* Release the locks */ - cell_unlocktree(cii); - /* Release the locks */ - cell_unlocktree(cjj); - } - /* Zero counters for the next pack operations */ - pack_vars->count_parts = 0; - pack_vars->tasks_packed = 0; - /*Time end of unpacking*/ - clock_gettime(CLOCK_REALTIME, &t1); - *packing_time += (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - } /*End of GPU work*/ - -void runner_dopair1_launch_f4(struct runner *r, struct scheduler *s, struct pack_vars_pair *pack_vars, - struct task *t, struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv, struct part_aos_f4_send *d_parts_send, - struct part_aos_f4_recv *d_parts_recv, cudaStream_t * stream, float d_a, float d_H, - struct engine *e, double *packing_time, double *gpu_time, double *unpack_time, int4 *fparti_fpartj_lparti_lpartj_dens, - int4 *d_fparti_fpartj_lparti_lpartj_dens, cudaEvent_t * pair_end){ - - struct timespec t0, t1; // - clock_gettime(CLOCK_REALTIME, &t0); + // clock_gettime(CLOCK_REALTIME, &tp0); + + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { + + if (tid < tasks_packed) { + struct cell *cii = pack_vars->cell_list[tid]; + struct task *tii = pack_vars->task_list[tid]; + + // struct cell *cii = ci_list_self_dens[tid]; + // struct task *tii = task_list_self_dens[tid]; - /* Identify the number of GPU bundles to run in ideal case*/ - int nBundles_temp = pack_vars->nBundles; - /*How many tasks have we packed?*/ - const int tasks_packed = pack_vars->tasks_packed; - - /*How many tasks should be in a bundle?*/ - const int bundle_size = pack_vars->bundle_size; - - /* Special case for incomplete bundles (when having leftover tasks not enough to fill a bundle) */ - if (pack_vars->launch_leftovers) { - 
nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; - if(tasks_packed == 0) error("zero pair tasks packed but somehow got into GPU loop"); -// pack_vars->bundle_first_part[nBundles_temp] = pack_vars->task_first_part[packed_tmp - 2]; - pack_vars->bundle_first_part[nBundles_temp] = fparti_fpartj_lparti_lpartj_dens[tasks_packed - 1].x; - } - /* Identify the last particle for each bundle of tasks */ - for (int bid = 0; bid < nBundles_temp - 1; bid++) { - pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; - } - /* special treatment for the last bundle */ - if(nBundles_temp > 1)pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; - else pack_vars->bundle_last_part[0] = pack_vars->count_parts; - - /* Launch the copies for each bundle and run the GPU kernel */ - /*We don't go into this loop if tasks_left_self == 1 as - nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ - for (int bid = 0; bid < nBundles_temp; bid++) { - - int max_parts_i = 0; - int max_parts_j = 0; - int parts_in_bundle_ci = 0; - int parts_in_bundle_cj = 0; - const int first_task = bid * pack_vars->bundle_size; - int last_task = (bid + 1) * bundle_size; - for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; - tid++) { - if (tid < tasks_packed) { - /*Get an estimate for the max number of parts per cell in each bundle. - * Used for determining the number of GPU CUDA blocks*/ - int count_i = fparti_fpartj_lparti_lpartj_dens[tid].z - - fparti_fpartj_lparti_lpartj_dens[tid].x; - parts_in_bundle_ci += count_i; - max_parts_i = max(max_parts_i, count_i); - int count_j = fparti_fpartj_lparti_lpartj_dens[tid].w - - fparti_fpartj_lparti_lpartj_dens[tid].y; - parts_in_bundle_cj += count_j; - max_parts_j = max(max_parts_j, count_j); - - last_task = tid; + while (cell_locktree(cii)) { + ; /* spin until we acquire the lock */ } + clock_gettime(CLOCK_REALTIME, &tp0); + /* Do the copy */ + runner_doself1_gpu_unpack_neat_aos_f4_f(r, cii, parts_recv, 0, + &pack_length_unpack, tid, + pack_vars->count_max_parts, e); + + /* Record things for debugging */ + cii->gpu_done_f++; + clock_gettime(CLOCK_REALTIME, &tp1); + *unpack_time += (tp1.tv_sec - tp0.tv_sec) + + (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; + + /* Release the lock */ + cell_unlocktree(cii); + + /*schedule my dependencies (Only unpacks really)*/ + enqueue_dependencies(s, tii); + /*Signal sleeping runners*/ + signal_sleeping_runners(s, tii); + + tii->gpu_done = 1; + } + } + /*Time end of unpacking*/ + // clock_gettime(CLOCK_REALTIME, &tp1); + // *unpack_time += (tp1.tv_sec - tp0.tv_sec) + + // (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; + } + + /* Zero counters for the next pack operations */ + pack_vars->count_parts = 0; + pack_vars->tasks_packed = 0; +} /*End of GPU work Self Gradient*/ + +void runner_dopair1_launch(struct runner *r, struct scheduler *s, + struct pack_vars_pair *pack_vars, struct cell *ci, + struct task *t, struct part_aos *parts_aos, + struct part_aos *d_parts_aos, cudaStream_t *stream, + float d_a, float d_H, struct engine *e, + double *packing_time, double *gpu_time) { + + struct timespec t0, t1; // + clock_gettime(CLOCK_REALTIME, &t0); + + /* Identify the number of GPU bundles to run in ideal case*/ + int nBundles_temp = pack_vars->nBundles; + /*How many tasks have we packed?*/ + const int tasks_packed = pack_vars->tasks_packed; + + /*How many tasks should be in a bundle?*/ + const int bundle_size = pack_vars->bundle_size; + + /*tasks-packed needs decrementing before calculating packed_tmp as it was + 
* incremented in runner_dopair1_pack*/ + const int packed_tmp = 2 * (tasks_packed - 1); + + /* Special case for incomplete bundles (when having leftover tasks not enough + * to fill a bundle) */ + if (pack_vars->launch_leftovers) { + nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; + if (tasks_packed == 0) + error("zero pair tasks packed but somehow got into GPU loop"); + pack_vars->bundle_first_part[nBundles_temp] = + pack_vars->task_first_part[packed_tmp - 2]; + } + /* Identify the last particle for each bundle of tasks */ + for (int bid = 0; bid < nBundles_temp - 1; bid++) { + pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; + } + /* special treatment for the last bundle */ + if (nBundles_temp > 1) + pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; + else + pack_vars->bundle_last_part[0] = pack_vars->count_parts; + /*Copy arrays containing first and last part for each task to GPU*/ + cudaMemcpy(pack_vars->d_task_first_part, pack_vars->task_first_part, + 2 * tasks_packed * sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(pack_vars->d_task_last_part, pack_vars->task_last_part, + 2 * tasks_packed * sizeof(int), cudaMemcpyHostToDevice); + + /*Copy cell shifts to device*/ + // cudaMemcpy(pack_vars->d_shiftx, pack_vars->shiftx, + // 2 * tasks_packed * sizeof(double), + // cudaMemcpyHostToDevice); cudaMemcpy(pack_vars->d_shifty, + // pack_vars->shifty, 2 * tasks_packed * sizeof(double), + // cudaMemcpyHostToDevice); cudaMemcpy(pack_vars->d_shiftz, + // pack_vars->shiftz, 2 * tasks_packed * sizeof(double), + // cudaMemcpyHostToDevice); + + /* Launch the copies for each bundle and run the GPU kernel */ + /*We don't go into this loop if tasks_left_self == 1 as + nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ + for (int bid = 0; bid < nBundles_temp; bid++) { + + int max_parts_i = 0; + int max_parts_j = 0; + int parts_in_bundle_ci = 0; + int parts_in_bundle_cj = 0; + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { + if (tid < tasks_packed) { + /*Get an estimate for the max number of parts per cell in each bundle. 
+ * Used for determining the number of GPU CUDA blocks*/ + const int tid_tmp = 2 * tid; + int count_i = pack_vars->task_last_part[tid_tmp] - + pack_vars->task_first_part[tid_tmp]; + parts_in_bundle_ci += count_i; + max_parts_i = max(max_parts_i, count_i); + int count_j = pack_vars->task_last_part[tid_tmp + 1] - + pack_vars->task_first_part[tid_tmp + 1]; + parts_in_bundle_cj += count_j; + max_parts_j = max(max_parts_j, count_j); + } + } + const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; + const int bundle_n_parts = + pack_vars->bundle_last_part[bid] - first_part_tmp_i; + cudaMemcpyAsync(&d_parts_aos[first_part_tmp_i], + &parts_aos[first_part_tmp_i], + bundle_n_parts * sizeof(struct part_aos), + cudaMemcpyHostToDevice, stream[bid]); + + // #ifdef CUDA_DEBUG + // cudaError_t cu_error = cudaPeekAtLastError(); // + // cudaGetLastError(); // + // // Get error code + // if (cu_error != cudaSuccess) { + // fprintf(stderr, + // "CUDA error with pair density H2D async memcpy ci: %s cpuid id + // is: %i\n ", cudaGetErrorString(cu_error), r->cpuid); + // error("Something's up with your cuda code"); + // } + // #endif + + const int tasksperbundle = pack_vars->tasksperbundle; + /* LAUNCH THE GPU KERNELS for ci & cj */ + int tid = 0; + int offset = bid * tasksperbundle; + int tasks_left = tasksperbundle; + if (bid == nBundles_temp - 1) { + tasks_left = tasks_packed - (nBundles_temp - 1) * tasksperbundle; + } + + // Setup 2d grid of GPU thread blocks for ci (number of tasks is + // the y dimension and max_parts is the x dimension + int numBlocks_y = tasks_left; + int bundle_first_task = pack_vars->bundle_first_task_list[bid]; + const char *loop_type = "density"; + + /* Launch the kernel for ci using data for ci and cj */ + runner_dopairci_branch_density_gpu_aos( + d_parts_aos, pack_vars->d_task_first_part, pack_vars->d_task_last_part, + d_a, d_H, loop_type, stream[bid], bid, BLOCK_SIZE, tasks_packed, + tasksperbundle, max_parts_i, max_parts_j, numBlocks_y, tid, offset, + bundle_first_task, pack_vars->d_shiftx, pack_vars->d_shifty, + pack_vars->d_shiftz); + + /* Launch the kernel for ci using data for ci and cj */ + runner_dopaircj_branch_density_gpu_aos( + d_parts_aos, pack_vars->d_task_first_part, pack_vars->d_task_last_part, + d_a, d_H, loop_type, stream[bid], bid, BLOCK_SIZE, tasks_packed, + tasksperbundle, max_parts_i, max_parts_j, numBlocks_y, tid, offset, + bundle_first_task, pack_vars->d_shiftx, pack_vars->d_shifty, + pack_vars->d_shiftz); + + // #ifdef CUDA_DEBUG + // cu_error = cudaPeekAtLastError(); // Get error code + // if (cu_error != cudaSuccess) { + // fprintf(stderr, + // "CUDA error with self density kernel launch: %s + //cpuid id is: %i\n ", cudaGetErrorString(cu_error), r->cpuid); exit(0); + // } + // #endif + + // Copy results back to CPU BUFFERS + cudaMemcpyAsync(&parts_aos[first_part_tmp_i], + &d_parts_aos[first_part_tmp_i], + bundle_n_parts * sizeof(struct part_aos), + cudaMemcpyDeviceToHost, stream[bid]); + + // #ifdef CUDA_DEBUG + // cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // + // // + //Get error code if (cu_error != cudaSuccess) { fprintf(stderr, "CUDA error + //with self density D2H memcpy: %s cpuid id is: %i\n ", + // cudaGetErrorString(cu_error), r->cpuid); + // error("Something's up with your cuda code"); + // } + // #endif + } /*End of looping over bundles to launch in streams*/ + + /* Make sure all the kernels and copies back are finished */ + cudaDeviceSynchronize(); + + /*Time end of GPU work*/ + clock_gettime(CLOCK_REALTIME, &t1); + 
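
[Editor's note] The packing_time, gpu_time and unpack_time counters in these routines all accumulate wall-clock intervals the same way, by differencing two CLOCK_REALTIME timespecs. Below is a minimal self-contained sketch of that pattern; the helper name elapsed_s is an assumption for illustration and does not appear in the patch.

/* Sketch of the CLOCK_REALTIME interval accumulation used for
 * *packing_time, *gpu_time and *unpack_time. */
#include <stdio.h>
#include <time.h>

static double elapsed_s(const struct timespec *t0, const struct timespec *t1) {
  /* Same arithmetic as in the patch: seconds plus nanoseconds / 1e9. */
  return (t1->tv_sec - t0->tv_sec) + (t1->tv_nsec - t0->tv_nsec) / 1e9;
}

int main(void) {
  struct timespec t0, t1;
  double gpu_time = 0.0;

  clock_gettime(CLOCK_REALTIME, &t0);
  /* ... the work being timed would go here ... */
  clock_gettime(CLOCK_REALTIME, &t1);

  gpu_time += elapsed_s(&t0, &t1);
  printf("accumulated: %g s\n", gpu_time);
  return 0;
}
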
*gpu_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + /*Time unpacking*/ + clock_gettime(CLOCK_REALTIME, &t0); + /* Now copy the data back from the CPU thread-local buffers to the cells */ + /* Pack length counter for use in unpacking */ + int pack_length_unpack = 0; + for (int tid = 0; tid < tasks_packed; tid++) { + + /*grab cell and task pointers*/ + struct cell *cii = pack_vars->ci_list[tid]; + struct cell *cjj = pack_vars->cj_list[tid]; + struct task *tii = pack_vars->task_list[tid]; + + /*Let's lock ci*/ + while (cell_locktree(cii)) { + ; /* spin until we acquire the lock */ + } + /*Let's lock cj*/ + while (cell_locktree(cjj)) { + ; /* spin until we acquire the lock */ + } + /* Do the copy */ + runner_do_ci_cj_gpu_unpack_neat_aos(r, cii, cjj, parts_aos, 0, + &pack_length_unpack, tid, + 2 * pack_vars->count_max_parts, e); + + /* Record things for debugging */ + cii->gpu_done_pair++; + cjj->gpu_done_pair++; + + tii->gpu_done = 1; + + /*schedule my dependencies (Only unpacks really)*/ + enqueue_dependencies(s, tii); + /*Signal sleeping runners*/ + signal_sleeping_runners(s, tii); + + /* Release the locks */ + cell_unlocktree(cii); + /* Release the locks */ + cell_unlocktree(cjj); + } + /* Zero counters for the next pack operations */ + pack_vars->count_parts = 0; + pack_vars->tasks_packed = 0; + /*Time end of unpacking*/ + clock_gettime(CLOCK_REALTIME, &t1); + *packing_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +} /*End of GPU work*/ + +void runner_dopair1_launch_f4( + struct runner *r, struct scheduler *s, struct pack_vars_pair *pack_vars, + struct task *t, struct part_aos_f4_send *parts_send, + struct part_aos_f4_recv *parts_recv, struct part_aos_f4_send *d_parts_send, + struct part_aos_f4_recv *d_parts_recv, cudaStream_t *stream, float d_a, + float d_H, struct engine *e, double *packing_time, double *gpu_time, + double *unpack_time, int4 *fparti_fpartj_lparti_lpartj_dens, + int4 *d_fparti_fpartj_lparti_lpartj_dens, cudaEvent_t *pair_end) { + + struct timespec t0, t1; // + clock_gettime(CLOCK_REALTIME, &t0); + + /* Identify the number of GPU bundles to run in ideal case*/ + int nBundles_temp = pack_vars->nBundles; + /*How many tasks have we packed?*/ + const int tasks_packed = pack_vars->tasks_packed; + + /*How many tasks should be in a bundle?*/ + const int bundle_size = pack_vars->bundle_size; + + /* Special case for incomplete bundles (when having leftover tasks not enough + * to fill a bundle) */ + if (pack_vars->launch_leftovers) { + nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; + if (tasks_packed == 0) + error("zero pair tasks packed but somehow got into GPU loop"); + // pack_vars->bundle_first_part[nBundles_temp] = + //pack_vars->task_first_part[packed_tmp - 2]; + pack_vars->bundle_first_part[nBundles_temp] = + fparti_fpartj_lparti_lpartj_dens[tasks_packed - 1].x; + } + /* Identify the last particle for each bundle of tasks */ + for (int bid = 0; bid < nBundles_temp - 1; bid++) { + pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; + } + /* special treatment for the last bundle */ + if (nBundles_temp > 1) + pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; + else + pack_vars->bundle_last_part[0] = pack_vars->count_parts; + + /* Launch the copies for each bundle and run the GPU kernel */ + /*We don't go into this loop if tasks_left_self == 1 as + nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ + for (int bid = 0; bid < nBundles_temp; bid++) { + + 
int max_parts_i = 0; + int max_parts_j = 0; + int parts_in_bundle_ci = 0; + int parts_in_bundle_cj = 0; + const int first_task = bid * pack_vars->bundle_size; + int last_task = (bid + 1) * bundle_size; + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { + if (tid < tasks_packed) { + /*Get an estimate for the max number of parts per cell in each bundle. + * Used for determining the number of GPU CUDA blocks*/ + int count_i = fparti_fpartj_lparti_lpartj_dens[tid].z - + fparti_fpartj_lparti_lpartj_dens[tid].x; + parts_in_bundle_ci += count_i; + max_parts_i = max(max_parts_i, count_i); + int count_j = fparti_fpartj_lparti_lpartj_dens[tid].w - + fparti_fpartj_lparti_lpartj_dens[tid].y; + parts_in_bundle_cj += count_j; + max_parts_j = max(max_parts_j, count_j); + + last_task = tid; } - const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; - const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp_i; + } + const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; + const int bundle_n_parts = + pack_vars->bundle_last_part[bid] - first_part_tmp_i; - cudaMemcpyAsync(&d_fparti_fpartj_lparti_lpartj_dens[first_task], &fparti_fpartj_lparti_lpartj_dens[first_task], - (last_task + 1 - first_task) * sizeof(int4), cudaMemcpyHostToDevice, stream[bid]); + cudaMemcpyAsync(&d_fparti_fpartj_lparti_lpartj_dens[first_task], + &fparti_fpartj_lparti_lpartj_dens[first_task], + (last_task + 1 - first_task) * sizeof(int4), + cudaMemcpyHostToDevice, stream[bid]); - cudaMemcpyAsync(&d_parts_send[first_part_tmp_i], &parts_send[first_part_tmp_i], - bundle_n_parts * sizeof(struct part_aos_f4_send), cudaMemcpyHostToDevice, stream[bid]); + cudaMemcpyAsync(&d_parts_send[first_part_tmp_i], + &parts_send[first_part_tmp_i], + bundle_n_parts * sizeof(struct part_aos_f4_send), + cudaMemcpyHostToDevice, stream[bid]); #ifdef CUDA_DEBUG - cudaError_t cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // - // Get error code - if (cu_error != cudaSuccess) { - fprintf(stderr, - "CUDA error with pair density H2D async memcpy ci: %s cpuid id is: %i\n ", - cudaGetErrorString(cu_error), r->cpuid); - error("Something's up with your cuda code"); - } + cudaError_t cu_error = + cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with pair density H2D async memcpy ci: %s cpuid id " + "is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + error("Something's up with your cuda code"); + } #endif - const int tasksperbundle = pack_vars->tasksperbundle; - /* LAUNCH THE GPU KERNELS for ci & cj */ - int tasks_left = tasksperbundle; - if (bid == nBundles_temp - 1) { - tasks_left = - tasks_packed - (nBundles_temp - 1) * tasksperbundle; - } + const int tasksperbundle = pack_vars->tasksperbundle; + /* LAUNCH THE GPU KERNELS for ci & cj */ + int tasks_left = tasksperbundle; + if (bid == nBundles_temp - 1) { + tasks_left = tasks_packed - (nBundles_temp - 1) * tasksperbundle; + } - // Setup 2d grid of GPU thread blocks for ci (number of tasks is - // the y dimension and max_parts is the x dimension - int numBlocks_y = tasks_left; - int numBlocks_x = (max_parts_i + BLOCK_SIZE - 1) / BLOCK_SIZE; - int bundle_first_task = pack_vars->bundle_first_task_list[bid]; + // Setup 2d grid of GPU thread blocks for ci (number of tasks is + // the y dimension and max_parts is the x dimension + int numBlocks_y = tasks_left; + int numBlocks_x = (max_parts_i + BLOCK_SIZE - 1) / BLOCK_SIZE; + int bundle_first_task = 
pack_vars->bundle_first_task_list[bid]; - /* Launch the kernel for ci using data for ci and cj */ - runner_dopairci_branch_density_gpu_aos_f4(d_parts_send, d_parts_recv, - d_a, d_H, stream[bid], numBlocks_x, numBlocks_y, bundle_first_task, d_fparti_fpartj_lparti_lpartj_dens); + /* Launch the kernel for ci using data for ci and cj */ + runner_dopairci_branch_density_gpu_aos_f4( + d_parts_send, d_parts_recv, d_a, d_H, stream[bid], numBlocks_x, + numBlocks_y, bundle_first_task, d_fparti_fpartj_lparti_lpartj_dens); - numBlocks_x = (max_parts_j + BLOCK_SIZE - 1) / BLOCK_SIZE; + numBlocks_x = (max_parts_j + BLOCK_SIZE - 1) / BLOCK_SIZE; - /* Launch the kernel for ci using data for ci and cj */ - runner_dopaircj_branch_density_gpu_aos_f4(d_parts_send, d_parts_recv, - d_a, d_H, stream[bid], numBlocks_x, numBlocks_y, bundle_first_task, d_fparti_fpartj_lparti_lpartj_dens); + /* Launch the kernel for ci using data for ci and cj */ + runner_dopaircj_branch_density_gpu_aos_f4( + d_parts_send, d_parts_recv, d_a, d_H, stream[bid], numBlocks_x, + numBlocks_y, bundle_first_task, d_fparti_fpartj_lparti_lpartj_dens); #ifdef CUDA_DEBUG - cu_error = cudaPeekAtLastError(); // Get error code - if (cu_error != cudaSuccess) { - fprintf(stderr, - "CUDA error with pair density kernel launch: %s cpuid id is: %i\n " - "nbx %i nby %i max_parts_i %i max_parts_j %i\n", - cudaGetErrorString(cu_error), r->cpuid, numBlocks_x, numBlocks_y, max_parts_i, max_parts_j); - exit(0); - } + cu_error = cudaPeekAtLastError(); // Get error code + if (cu_error != cudaSuccess) { + fprintf( + stderr, + "CUDA error with pair density kernel launch: %s cpuid id is: %i\n " + "nbx %i nby %i max_parts_i %i max_parts_j %i\n", + cudaGetErrorString(cu_error), r->cpuid, numBlocks_x, numBlocks_y, + max_parts_i, max_parts_j); + exit(0); + } #endif - // Copy results back to CPU BUFFERS - cudaMemcpyAsync(&parts_recv[first_part_tmp_i], &d_parts_recv[first_part_tmp_i], - bundle_n_parts * sizeof(struct part_aos_f4_recv), cudaMemcpyDeviceToHost, stream[bid]); + // Copy results back to CPU BUFFERS + cudaMemcpyAsync(&parts_recv[first_part_tmp_i], + &d_parts_recv[first_part_tmp_i], + bundle_n_parts * sizeof(struct part_aos_f4_recv), + cudaMemcpyDeviceToHost, stream[bid]); #ifdef CUDA_DEBUG - cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // - // Get error code - if (cu_error != cudaSuccess) { - fprintf(stderr, - "CUDA error with self density D2H memcpy: %s cpuid id is: %i\n ", - cudaGetErrorString(cu_error), r->cpuid); - error("Something's up with your cuda code"); - } + cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with self density D2H memcpy: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + error("Something's up with your cuda code"); + } #endif - }/*End of looping over bundles to launch in streams*/ - - /* Make sure all the kernels and copies back are finished */ - cudaDeviceSynchronize(); - - /*Time end of GPU work*/ - clock_gettime(CLOCK_REALTIME, &t1); - *gpu_time += (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - /*Time unpacking*/ - clock_gettime(CLOCK_REALTIME, &t0); - /* Now copy the data back from the CPU thread-local buffers to the cells */ - /* Pack length counter for use in unpacking */ - int pack_length_unpack=0; - for (int tid = 0; tid < tasks_packed; tid++) { - - /*grab cell and task pointers*/ - struct cell *cii = pack_vars->ci_list[tid]; - struct cell *cjj = pack_vars->cj_list[tid]; - 
struct task *tii = pack_vars->task_list[tid]; - - /*Let's lock ci*/ - while(cell_locktree(cii)) { - ; /* spin until we acquire the lock */ - } - /*Let's lock cj*/ - while(cell_locktree(cjj)) { - ; /* spin until we acquire the lock */ - } - /* Do the copy */ - runner_do_ci_cj_gpu_unpack_neat_aos_f4(r, cii, cjj, parts_recv, 0, - &pack_length_unpack, tid, 2 * pack_vars->count_max_parts, e); - - /* Record things for debugging */ - cii->gpu_done_pair++; - cjj->gpu_done_pair++; - - tii->gpu_done = 1; - - /*schedule my dependencies (Only unpacks really)*/ - enqueue_dependencies(s, tii); - /*Signal sleeping runners*/ - signal_sleeping_runners(s, tii); - - /* Release the locks */ - cell_unlocktree(cii); - /* Release the locks */ - cell_unlocktree(cjj); - } - /* Zero counters for the next pack operations */ - pack_vars->count_parts = 0; - pack_vars->tasks_packed = 0; - /*Time end of unpacking*/ - clock_gettime(CLOCK_REALTIME, &t1); - *packing_time += (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - } /*End of GPU work*/ -void runner_dopair1_launch_f4_one_memcpy(struct runner *r, struct scheduler *s, struct pack_vars_pair *pack_vars, - struct task *t, struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv, struct part_aos_f4_send *d_parts_send, - struct part_aos_f4_recv *d_parts_recv, cudaStream_t * stream, float d_a, float d_H, - struct engine *e, double *packing_time, double *gpu_time, double *unpack_time, int4 *fparti_fpartj_lparti_lpartj_dens, - cudaEvent_t * pair_end){ - - struct timespec t0, t1, tp0, tp1; // - clock_gettime(CLOCK_REALTIME, &t0); - - /* Identify the number of GPU bundles to run in ideal case*/ - int nBundles_temp = pack_vars->nBundles; - /*How many tasks have we packed?*/ - const int tasks_packed = pack_vars->tasks_packed; - - /*How many tasks should be in a bundle?*/ - const int bundle_size = pack_vars->bundle_size; - - /* Special case for incomplete bundles (when having leftover tasks not enough to fill a bundle) */ - if (pack_vars->launch_leftovers) { - nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; - if(tasks_packed == 0) error("zero pair tasks packed but somehow got into GPU loop"); -// pack_vars->bundle_first_part[nBundles_temp] = pack_vars->task_first_part[packed_tmp - 2]; - pack_vars->bundle_first_part[nBundles_temp] = fparti_fpartj_lparti_lpartj_dens[tasks_packed - 1].x; - } - /* Identify the last particle for each bundle of tasks */ - for (int bid = 0; bid < nBundles_temp - 1; bid++) { - pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; - } - /* special treatment for the last bundle */ - if(nBundles_temp > 1)pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; - else pack_vars->bundle_last_part[0] = pack_vars->count_parts; - - /* Launch the copies for each bundle and run the GPU kernel */ - /*We don't go into this loop if tasks_left_self == 1 as - nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ - for (int bid = 0; bid < nBundles_temp; bid++) { - - int max_parts_i = 0; - int max_parts_j = 0; - int parts_in_bundle_ci = 0; - int parts_in_bundle_cj = 0; - for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; - tid++) { - if (tid < tasks_packed) { - /*Get an estimate for the max number of parts per cell in each bundle. 
- * Used for determining the number of GPU CUDA blocks*/ - int count_i = fparti_fpartj_lparti_lpartj_dens[tid].z - - fparti_fpartj_lparti_lpartj_dens[tid].x; - parts_in_bundle_ci += count_i; - max_parts_i = max(max_parts_i, count_i); - int count_j = fparti_fpartj_lparti_lpartj_dens[tid].w - - fparti_fpartj_lparti_lpartj_dens[tid].y; - parts_in_bundle_cj += count_j; - max_parts_j = max(max_parts_j, count_j); - } + } /*End of looping over bundles to launch in streams*/ + + /* Make sure all the kernels and copies back are finished */ + cudaDeviceSynchronize(); + + /*Time end of GPU work*/ + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + /*Time unpacking*/ + clock_gettime(CLOCK_REALTIME, &t0); + /* Now copy the data back from the CPU thread-local buffers to the cells */ + /* Pack length counter for use in unpacking */ + int pack_length_unpack = 0; + for (int tid = 0; tid < tasks_packed; tid++) { + + /*grab cell and task pointers*/ + struct cell *cii = pack_vars->ci_list[tid]; + struct cell *cjj = pack_vars->cj_list[tid]; + struct task *tii = pack_vars->task_list[tid]; + + /*Let's lock ci*/ + while (cell_locktree(cii)) { + ; /* spin until we acquire the lock */ + } + /*Let's lock cj*/ + while (cell_locktree(cjj)) { + ; /* spin until we acquire the lock */ + } + /* Do the copy */ + runner_do_ci_cj_gpu_unpack_neat_aos_f4(r, cii, cjj, parts_recv, 0, + &pack_length_unpack, tid, + 2 * pack_vars->count_max_parts, e); + + /* Record things for debugging */ + cii->gpu_done_pair++; + cjj->gpu_done_pair++; + + tii->gpu_done = 1; + + /*schedule my dependencies (Only unpacks really)*/ + enqueue_dependencies(s, tii); + /*Signal sleeping runners*/ + signal_sleeping_runners(s, tii); + + /* Release the locks */ + cell_unlocktree(cii); + /* Release the locks */ + cell_unlocktree(cjj); + } + /* Zero counters for the next pack operations */ + pack_vars->count_parts = 0; + pack_vars->tasks_packed = 0; + /*Time end of unpacking*/ + clock_gettime(CLOCK_REALTIME, &t1); + *packing_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +} /*End of GPU work*/ +void runner_dopair1_launch_f4_one_memcpy( + struct runner *r, struct scheduler *s, struct pack_vars_pair *pack_vars, + struct task *t, struct part_aos_f4_send *parts_send, + struct part_aos_f4_recv *parts_recv, struct part_aos_f4_send *d_parts_send, + struct part_aos_f4_recv *d_parts_recv, cudaStream_t *stream, float d_a, + float d_H, struct engine *e, double *packing_time, double *gpu_time, + double *unpack_time, int4 *fparti_fpartj_lparti_lpartj_dens, + cudaEvent_t *pair_end) { + + struct timespec t0, t1, tp0, tp1; // + clock_gettime(CLOCK_REALTIME, &t0); + + /* Identify the number of GPU bundles to run in ideal case*/ + int nBundles_temp = pack_vars->nBundles; + /*How many tasks have we packed?*/ + const int tasks_packed = pack_vars->tasks_packed; + + /*How many tasks should be in a bundle?*/ + const int bundle_size = pack_vars->bundle_size; + + /* Special case for incomplete bundles (when having leftover tasks not enough + * to fill a bundle) */ + if (pack_vars->launch_leftovers) { + nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; + if (tasks_packed == 0) + error("zero pair tasks packed but somehow got into GPU loop"); + // pack_vars->bundle_first_part[nBundles_temp] = + //pack_vars->task_first_part[packed_tmp - 2]; + pack_vars->bundle_first_part[nBundles_temp] = + fparti_fpartj_lparti_lpartj_dens[tasks_packed - 1].x; + } + /* Identify the last 
particle for each bundle of tasks */ + for (int bid = 0; bid < nBundles_temp - 1; bid++) { + pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; + } + /* special treatment for the last bundle */ + if (nBundles_temp > 1) + pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; + else + pack_vars->bundle_last_part[0] = pack_vars->count_parts; + + /* Launch the copies for each bundle and run the GPU kernel */ + /*We don't go into this loop if tasks_left_self == 1 as + nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ + for (int bid = 0; bid < nBundles_temp; bid++) { + + int max_parts_i = 0; + int max_parts_j = 0; + int parts_in_bundle_ci = 0; + int parts_in_bundle_cj = 0; + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { + if (tid < tasks_packed) { + /*Get an estimate for the max number of parts per cell in each bundle. + * Used for determining the number of GPU CUDA blocks*/ + int count_i = fparti_fpartj_lparti_lpartj_dens[tid].z - + fparti_fpartj_lparti_lpartj_dens[tid].x; + parts_in_bundle_ci += count_i; + max_parts_i = max(max_parts_i, count_i); + int count_j = fparti_fpartj_lparti_lpartj_dens[tid].w - + fparti_fpartj_lparti_lpartj_dens[tid].y; + parts_in_bundle_cj += count_j; + max_parts_j = max(max_parts_j, count_j); } - const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; - const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp_i; + } + const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; + const int bundle_n_parts = + pack_vars->bundle_last_part[bid] - first_part_tmp_i; - cudaMemcpyAsync(&d_parts_send[first_part_tmp_i], &parts_send[first_part_tmp_i], - bundle_n_parts * sizeof(struct part_aos_f4_send), cudaMemcpyHostToDevice, stream[bid]); + cudaMemcpyAsync(&d_parts_send[first_part_tmp_i], + &parts_send[first_part_tmp_i], + bundle_n_parts * sizeof(struct part_aos_f4_send), + cudaMemcpyHostToDevice, stream[bid]); #ifdef CUDA_DEBUG - cudaError_t cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // - // Get error code - if (cu_error != cudaSuccess) { - fprintf(stderr, - "CUDA error with pair density H2D async memcpy ci: %s cpuid id is: %i\n ", - cudaGetErrorString(cu_error), r->cpuid); - error("Something's up with your cuda code"); - } + cudaError_t cu_error = + cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with pair density H2D async memcpy ci: %s cpuid id " + "is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + error("Something's up with your cuda code"); + } #endif - /* LAUNCH THE GPU KERNELS for ci & cj */ - // Setup 2d grid of GPU thread blocks for ci (number of tasks is - // the y dimension and max_parts is the x dimension - int numBlocks_y = 0;//tasks_left; - int numBlocks_x = (bundle_n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; - int bundle_part_0 = pack_vars->bundle_first_part[bid]; - /* Launch the kernel for ci using data for ci and cj */ - runner_dopair_branch_density_gpu_aos_f4(d_parts_send, d_parts_recv, - d_a, d_H, stream[bid], numBlocks_x, numBlocks_y, bundle_part_0, bundle_n_parts); + /* LAUNCH THE GPU KERNELS for ci & cj */ + // Setup 2d grid of GPU thread blocks for ci (number of tasks is + // the y dimension and max_parts is the x dimension + int numBlocks_y = 0; // tasks_left; + int numBlocks_x = (bundle_n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + int bundle_part_0 = pack_vars->bundle_first_part[bid]; + /* Launch the kernel for ci using data for ci and cj */ + 
runner_dopair_branch_density_gpu_aos_f4( + d_parts_send, d_parts_recv, d_a, d_H, stream[bid], numBlocks_x, + numBlocks_y, bundle_part_0, bundle_n_parts); #ifdef CUDA_DEBUG - cu_error = cudaPeekAtLastError(); // Get error code - if (cu_error != cudaSuccess) { - fprintf(stderr, - "CUDA error with pair density kernel launch: %s cpuid id is: %i\n " - "nbx %i nby %i max_parts_i %i max_parts_j %i\n", - cudaGetErrorString(cu_error), r->cpuid, numBlocks_x, numBlocks_y, max_parts_i, max_parts_j); - exit(0); - } + cu_error = cudaPeekAtLastError(); // Get error code + if (cu_error != cudaSuccess) { + fprintf( + stderr, + "CUDA error with pair density kernel launch: %s cpuid id is: %i\n " + "nbx %i nby %i max_parts_i %i max_parts_j %i\n", + cudaGetErrorString(cu_error), r->cpuid, numBlocks_x, numBlocks_y, + max_parts_i, max_parts_j); + exit(0); + } #endif - // Copy results back to CPU BUFFERS - cudaMemcpyAsync(&parts_recv[first_part_tmp_i], &d_parts_recv[first_part_tmp_i], - bundle_n_parts * sizeof(struct part_aos_f4_recv), cudaMemcpyDeviceToHost, stream[bid]); - cudaEventRecord(pair_end[bid], stream[bid]); + // Copy results back to CPU BUFFERS + cudaMemcpyAsync(&parts_recv[first_part_tmp_i], + &d_parts_recv[first_part_tmp_i], + bundle_n_parts * sizeof(struct part_aos_f4_recv), + cudaMemcpyDeviceToHost, stream[bid]); + cudaEventRecord(pair_end[bid], stream[bid]); #ifdef CUDA_DEBUG - cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // - // Get error code - if (cu_error != cudaSuccess) { - fprintf(stderr, - "CUDA error with self density D2H memcpy: %s cpuid id is: %i\n ", - cudaGetErrorString(cu_error), r->cpuid); - error("Something's up with your cuda code"); - } + cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with self density D2H memcpy: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + error("Something's up with your cuda code"); + } #endif - }/*End of looping over bundles to launch in streams*/ - - /* Make sure all the kernels and copies back are finished */ -// cudaDeviceSynchronize(); - - /*Time end of GPU work*/ - clock_gettime(CLOCK_REALTIME, &t1); - *gpu_time += (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - /* Now copy the data back from the CPU thread-local buffers to the cells */ - /* Pack length counter for use in unpacking */ - int pack_length_unpack=0; - for (int bid = 0; bid < nBundles_temp; bid++){ - /*Time unpacking*/ - clock_gettime(CLOCK_REALTIME, &t0); - -// cudaStreamSynchronize(stream[bid]); - cudaEventSynchronize(pair_end[bid]); - - clock_gettime(CLOCK_REALTIME, &t1); - *gpu_time += (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - - /*Time unpacking*/ -// clock_gettime(CLOCK_REALTIME, &tp0); - - for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { - - if(tid < tasks_packed){ - clock_gettime(CLOCK_REALTIME, &tp0); - /*grab cell and task pointers*/ - struct cell *cii = pack_vars->ci_list[tid]; - struct cell *cjj = pack_vars->cj_list[tid]; - struct task *tii = pack_vars->task_list[tid]; - - /*Let's lock ci*/ - if(tii->corner_pair == 1)fprintf(stderr, "Corner task\n"); - while(cell_locktree(cii)) { - ; /* spin until we acquire the lock */ - } - /*Let's lock cj*/ - while(cell_locktree(cjj)) { - ; /* spin until we acquire the lock */ - } - /* Do the copy */ - runner_do_ci_cj_gpu_unpack_neat_aos_f4(r, cii, cjj, parts_recv, 0, - &pack_length_unpack, tid, 2 * pack_vars->count_max_parts, e); - - 
/* Record things for debugging */ - cii->gpu_done_pair++; - cjj->gpu_done_pair++; - -// /* Release the locks */ - cell_unlocktree(cii); -// /* Release the locks */ - cell_unlocktree(cjj); - - /*Time end of unpacking*/ - clock_gettime(CLOCK_REALTIME, &tp1); - *unpack_time += (tp1.tv_sec - tp0.tv_sec) + - (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; - - /*schedule my dependencies (Only unpacks really)*/ - enqueue_dependencies(s, tii); - /*Signal sleeping runners*/ - signal_sleeping_runners(s, tii); - - tii->gpu_done = 1; - - - } - } - } - /* Zero counters for the next pack operations */ - pack_vars->count_parts = 0; - pack_vars->tasks_packed = 0; -// /*Time end of unpacking*/ -// clock_gettime(CLOCK_REALTIME, &t1); -// *packing_time += (t1.tv_sec - t0.tv_sec) + -// (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - } /*End of GPU work*/ - -void runner_dopair1_launch_f4_mcpy_Ker_mcpy(struct runner *r, struct scheduler *s, struct pack_vars_pair *pack_vars, - struct task *t, struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv, struct part_aos_f4_send *d_parts_send, - struct part_aos_f4_recv *d_parts_recv, cudaStream_t * stream, float d_a, float d_H, - struct engine *e, double *packing_time, double *gpu_time, double *unpack_time, int4 *fparti_fpartj_lparti_lpartj_dens, - cudaEvent_t * pair_end){ - - struct timespec t0, t1, tp0, tp1; // + } /*End of looping over bundles to launch in streams*/ + + /* Make sure all the kernels and copies back are finished */ + // cudaDeviceSynchronize(); + + /*Time end of GPU work*/ + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + /* Now copy the data back from the CPU thread-local buffers to the cells */ + /* Pack length counter for use in unpacking */ + int pack_length_unpack = 0; + for (int bid = 0; bid < nBundles_temp; bid++) { + /*Time unpacking*/ clock_gettime(CLOCK_REALTIME, &t0); - /* Identify the number of GPU bundles to run in ideal case*/ - int nBundles_temp = pack_vars->nBundles; - /*How many tasks have we packed?*/ - const int tasks_packed = pack_vars->tasks_packed; - - /*How many tasks should be in a bundle?*/ - const int bundle_size = pack_vars->bundle_size; - - /*tasks-packed needs decrementing before calculating packed_tmp as it was incremented in runner_dopair1_pack*/ -// const int packed_tmp = 2 * (tasks_packed - 1); - - /* Special case for incomplete bundles (when having leftover tasks not enough to fill a bundle) */ - if (pack_vars->launch_leftovers) { - nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; - if(tasks_packed == 0) error("zero pair tasks packed but somehow got into GPU loop"); -// pack_vars->bundle_first_part[nBundles_temp] = pack_vars->task_first_part[packed_tmp - 2]; - pack_vars->bundle_first_part[nBundles_temp] = fparti_fpartj_lparti_lpartj_dens[tasks_packed - 1].x; - } - /* Identify the last particle for each bundle of tasks */ - for (int bid = 0; bid < nBundles_temp - 1; bid++) { - pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; - } - /* special treatment for the last bundle */ - if(nBundles_temp > 1)pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; - else pack_vars->bundle_last_part[0] = pack_vars->count_parts; - - /* Launch the copies for each bundle and run the GPU kernel */ - /*We don't go into this loop if tasks_left_self == 1 as - nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ -// int max_parts = 0; - for (int bid = 0; bid < nBundles_temp; bid++) { - - int max_parts_i 
= 0; - int max_parts_j = 0; - int parts_in_bundle_ci = 0; - int parts_in_bundle_cj = 0; - for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; - tid++) { - if (tid < tasks_packed) { - /*Get an estimate for the max number of parts per cell in each bundle. - * Used for determining the number of GPU CUDA blocks*/ - int count_i = fparti_fpartj_lparti_lpartj_dens[tid].z - - fparti_fpartj_lparti_lpartj_dens[tid].x; - parts_in_bundle_ci += count_i; - max_parts_i = max(max_parts_i, count_i); - int count_j = fparti_fpartj_lparti_lpartj_dens[tid].w - - fparti_fpartj_lparti_lpartj_dens[tid].y; - parts_in_bundle_cj += count_j; - max_parts_j = max(max_parts_j, count_j); + // cudaStreamSynchronize(stream[bid]); + cudaEventSynchronize(pair_end[bid]); + + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + + /*Time unpacking*/ + // clock_gettime(CLOCK_REALTIME, &tp0); + + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { + + if (tid < tasks_packed) { + clock_gettime(CLOCK_REALTIME, &tp0); + /*grab cell and task pointers*/ + struct cell *cii = pack_vars->ci_list[tid]; + struct cell *cjj = pack_vars->cj_list[tid]; + struct task *tii = pack_vars->task_list[tid]; + + /*Let's lock ci*/ + if (tii->corner_pair == 1) fprintf(stderr, "Corner task\n"); + while (cell_locktree(cii)) { + ; /* spin until we acquire the lock */ + } + /*Let's lock cj*/ + while (cell_locktree(cjj)) { + ; /* spin until we acquire the lock */ } + /* Do the copy */ + runner_do_ci_cj_gpu_unpack_neat_aos_f4( + r, cii, cjj, parts_recv, 0, &pack_length_unpack, tid, + 2 * pack_vars->count_max_parts, e); + + /* Record things for debugging */ + cii->gpu_done_pair++; + cjj->gpu_done_pair++; + + // /* Release the locks */ + cell_unlocktree(cii); + // /* Release the locks */ + cell_unlocktree(cjj); + + /*Time end of unpacking*/ + clock_gettime(CLOCK_REALTIME, &tp1); + *unpack_time += (tp1.tv_sec - tp0.tv_sec) + + (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; + + /*schedule my dependencies (Only unpacks really)*/ + enqueue_dependencies(s, tii); + /*Signal sleeping runners*/ + signal_sleeping_runners(s, tii); + + tii->gpu_done = 1; } - const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; - const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp_i; + } + } + /* Zero counters for the next pack operations */ + pack_vars->count_parts = 0; + pack_vars->tasks_packed = 0; + // /*Time end of unpacking*/ + // clock_gettime(CLOCK_REALTIME, &t1); + // *packing_time += (t1.tv_sec - t0.tv_sec) + + // (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +} /*End of GPU work*/ - cudaMemcpyAsync(&d_parts_send[first_part_tmp_i], &parts_send[first_part_tmp_i], - bundle_n_parts * sizeof(struct part_aos_f4_send), cudaMemcpyHostToDevice, stream[bid]); +void runner_dopair1_launch_f4_mcpy_Ker_mcpy( + struct runner *r, struct scheduler *s, struct pack_vars_pair *pack_vars, + struct task *t, struct part_aos_f4_send *parts_send, + struct part_aos_f4_recv *parts_recv, struct part_aos_f4_send *d_parts_send, + struct part_aos_f4_recv *d_parts_recv, cudaStream_t *stream, float d_a, + float d_H, struct engine *e, double *packing_time, double *gpu_time, + double *unpack_time, int4 *fparti_fpartj_lparti_lpartj_dens, + cudaEvent_t *pair_end) { + + struct timespec t0, t1, tp0, tp1; // + clock_gettime(CLOCK_REALTIME, &t0); + + /* Identify the number of GPU bundles to run in ideal case*/ + int nBundles_temp = pack_vars->nBundles; + /*How many tasks have we packed?*/ 
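
[Editor's note] The pair-density routines above pack each task's particle ranges for ci and cj into a single int4: x and y hold the first particles of ci and cj, z and w the corresponding last particles, so the per-cell counts and the CUDA block counts follow from simple differences and a rounded-up division by BLOCK_SIZE. The sketch below (host-side, compiled with nvcc so int4 and make_int4 are available) uses assumed example values and an assumed BLOCK_SIZE; the real value comes from BLOCK_SIZE.h.

/* Sketch of the int4 range packing used by the pair-density launches. */
#include <cstdio>

#define BLOCK_SIZE 128 /* assumed for illustration */

int main(void) {
  /* x = first part of ci, y = first part of cj,
   * z = last part of ci,  w = last part of cj  (one entry per packed task). */
  int4 range = make_int4(0, 256, 200, 500);

  const int count_i = range.z - range.x; /* parts in ci */
  const int count_j = range.w - range.y; /* parts in cj */

  /* Rounded-up block counts, as used for numBlocks_x in the kernel launches. */
  const int numBlocks_i = (count_i + BLOCK_SIZE - 1) / BLOCK_SIZE;
  const int numBlocks_j = (count_j + BLOCK_SIZE - 1) / BLOCK_SIZE;

  printf("ci: %d parts -> %d blocks, cj: %d parts -> %d blocks\n", count_i,
         numBlocks_i, count_j, numBlocks_j);
  return 0;
}
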
+ const int tasks_packed = pack_vars->tasks_packed; + + /*How many tasks should be in a bundle?*/ + const int bundle_size = pack_vars->bundle_size; + + /*tasks-packed needs decrementing before calculating packed_tmp as it was + * incremented in runner_dopair1_pack*/ + // const int packed_tmp = 2 * (tasks_packed - 1); + + /* Special case for incomplete bundles (when having leftover tasks not enough + * to fill a bundle) */ + if (pack_vars->launch_leftovers) { + nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; + if (tasks_packed == 0) + error("zero pair tasks packed but somehow got into GPU loop"); + // pack_vars->bundle_first_part[nBundles_temp] = + //pack_vars->task_first_part[packed_tmp - 2]; + pack_vars->bundle_first_part[nBundles_temp] = + fparti_fpartj_lparti_lpartj_dens[tasks_packed - 1].x; + } + /* Identify the last particle for each bundle of tasks */ + for (int bid = 0; bid < nBundles_temp - 1; bid++) { + pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; + } + /* special treatment for the last bundle */ + if (nBundles_temp > 1) + pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; + else + pack_vars->bundle_last_part[0] = pack_vars->count_parts; + + /* Launch the copies for each bundle and run the GPU kernel */ + /*We don't go into this loop if tasks_left_self == 1 as + nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ + // int max_parts = 0; + for (int bid = 0; bid < nBundles_temp; bid++) { + + int max_parts_i = 0; + int max_parts_j = 0; + int parts_in_bundle_ci = 0; + int parts_in_bundle_cj = 0; + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { + if (tid < tasks_packed) { + /*Get an estimate for the max number of parts per cell in each bundle. + * Used for determining the number of GPU CUDA blocks*/ + int count_i = fparti_fpartj_lparti_lpartj_dens[tid].z - + fparti_fpartj_lparti_lpartj_dens[tid].x; + parts_in_bundle_ci += count_i; + max_parts_i = max(max_parts_i, count_i); + int count_j = fparti_fpartj_lparti_lpartj_dens[tid].w - + fparti_fpartj_lparti_lpartj_dens[tid].y; + parts_in_bundle_cj += count_j; + max_parts_j = max(max_parts_j, count_j); + } + } + const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; + const int bundle_n_parts = + pack_vars->bundle_last_part[bid] - first_part_tmp_i; + + cudaMemcpyAsync(&d_parts_send[first_part_tmp_i], + &parts_send[first_part_tmp_i], + bundle_n_parts * sizeof(struct part_aos_f4_send), + cudaMemcpyHostToDevice, stream[bid]); #ifdef CUDA_DEBUG - cudaError_t cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // - // Get error code - if (cu_error != cudaSuccess) { - fprintf(stderr, - "CUDA error with pair density H2D async memcpy ci: %s cpuid id is: %i\n ", - cudaGetErrorString(cu_error), r->cpuid); - error("Something's up with your cuda code"); - } + cudaError_t cu_error = + cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with pair density H2D async memcpy ci: %s cpuid id " + "is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + error("Something's up with your cuda code"); + } #endif - } - for (int bid = 0; bid < nBundles_temp; bid++) { - - int max_parts_i = 0; - int max_parts_j = 0; - int parts_in_bundle_ci = 0; - int parts_in_bundle_cj = 0; - for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; - tid++) { - if (tid < tasks_packed) { - /*Get an estimate for the max number of parts per cell in each bundle. 
- * Used for determining the number of GPU CUDA blocks*/ - int count_i = fparti_fpartj_lparti_lpartj_dens[tid].z - - fparti_fpartj_lparti_lpartj_dens[tid].x; - parts_in_bundle_ci += count_i; - max_parts_i = max(max_parts_i, count_i); - int count_j = fparti_fpartj_lparti_lpartj_dens[tid].w - - fparti_fpartj_lparti_lpartj_dens[tid].y; - parts_in_bundle_cj += count_j; - max_parts_j = max(max_parts_j, count_j); - } - } - const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; - const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp_i; -////////////////////////////////// -// const int tasksperbundle = pack_vars->tasksperbundle; - /* LAUNCH THE GPU KERNELS for ci & cj */ - // Setup 2d grid of GPU thread blocks for ci (number of tasks is - // the y dimension and max_parts is the x dimension - int numBlocks_y = 0;//tasks_left; - int numBlocks_x = (bundle_n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; - int bundle_part_0 = pack_vars->bundle_first_part[bid]; -// int bundle_first_task = pack_vars->bundle_first_task_list[bid]; -// fprintf(stderr, "bundle_part_0 %i bundle_first_task %i\n", bundle_part_0, bundle_first_task); - - /* Launch the kernel for ci using data for ci and cj */ - runner_dopair_branch_density_gpu_aos_f4(d_parts_send, d_parts_recv, - d_a, d_H, stream[bid], numBlocks_x, numBlocks_y, bundle_part_0, bundle_n_parts); + } + for (int bid = 0; bid < nBundles_temp; bid++) { + + int max_parts_i = 0; + int max_parts_j = 0; + int parts_in_bundle_ci = 0; + int parts_in_bundle_cj = 0; + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { + if (tid < tasks_packed) { + /*Get an estimate for the max number of parts per cell in each bundle. + * Used for determining the number of GPU CUDA blocks*/ + int count_i = fparti_fpartj_lparti_lpartj_dens[tid].z - + fparti_fpartj_lparti_lpartj_dens[tid].x; + parts_in_bundle_ci += count_i; + max_parts_i = max(max_parts_i, count_i); + int count_j = fparti_fpartj_lparti_lpartj_dens[tid].w - + fparti_fpartj_lparti_lpartj_dens[tid].y; + parts_in_bundle_cj += count_j; + max_parts_j = max(max_parts_j, count_j); + } + } + const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; + const int bundle_n_parts = + pack_vars->bundle_last_part[bid] - first_part_tmp_i; + ////////////////////////////////// + // const int tasksperbundle = pack_vars->tasksperbundle; + /* LAUNCH THE GPU KERNELS for ci & cj */ + // Setup 2d grid of GPU thread blocks for ci (number of tasks is + // the y dimension and max_parts is the x dimension + int numBlocks_y = 0; // tasks_left; + int numBlocks_x = (bundle_n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + int bundle_part_0 = pack_vars->bundle_first_part[bid]; + // int bundle_first_task = pack_vars->bundle_first_task_list[bid]; + // fprintf(stderr, "bundle_part_0 %i bundle_first_task %i\n", + // bundle_part_0, bundle_first_task); + + /* Launch the kernel for ci using data for ci and cj */ + runner_dopair_branch_density_gpu_aos_f4( + d_parts_send, d_parts_recv, d_a, d_H, stream[bid], numBlocks_x, + numBlocks_y, bundle_part_0, bundle_n_parts); #ifdef CUDA_DEBUG - cudaError_t cu_error = cudaPeekAtLastError(); // Get error code - if (cu_error != cudaSuccess) { - fprintf(stderr, - "CUDA error with pair density kernel launch: %s cpuid id is: %i\n " - "nbx %i nby %i max_parts_i %i max_parts_j %i\n", - cudaGetErrorString(cu_error), r->cpuid, numBlocks_x, numBlocks_y, max_parts_i, max_parts_j); - exit(0); - } + cudaError_t cu_error = cudaPeekAtLastError(); // Get error code + if (cu_error != cudaSuccess) { + 
fprintf( + stderr, + "CUDA error with pair density kernel launch: %s cpuid id is: %i\n " + "nbx %i nby %i max_parts_i %i max_parts_j %i\n", + cudaGetErrorString(cu_error), r->cpuid, numBlocks_x, numBlocks_y, + max_parts_i, max_parts_j); + exit(0); + } #endif - } - - for (int bid = 0; bid < nBundles_temp; bid++) { - - int max_parts_i = 0; - int max_parts_j = 0; - int parts_in_bundle_ci = 0; - int parts_in_bundle_cj = 0; - for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; - tid++) { - if (tid < tasks_packed) { - /*Get an estimate for the max number of parts per cell in each bundle. - * Used for determining the number of GPU CUDA blocks*/ - int count_i = fparti_fpartj_lparti_lpartj_dens[tid].z - - fparti_fpartj_lparti_lpartj_dens[tid].x; - parts_in_bundle_ci += count_i; - max_parts_i = max(max_parts_i, count_i); - int count_j = fparti_fpartj_lparti_lpartj_dens[tid].w - - fparti_fpartj_lparti_lpartj_dens[tid].y; - parts_in_bundle_cj += count_j; - max_parts_j = max(max_parts_j, count_j); - } - } - const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; - const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp_i; -/////////////////////////////////////////////////////////////////// - // Copy results back to CPU BUFFERS - cudaMemcpyAsync(&parts_recv[first_part_tmp_i], &d_parts_recv[first_part_tmp_i], - bundle_n_parts * sizeof(struct part_aos_f4_recv), cudaMemcpyDeviceToHost, stream[bid]); - cudaEventRecord(pair_end[bid], stream[bid]); + } + + for (int bid = 0; bid < nBundles_temp; bid++) { + + int max_parts_i = 0; + int max_parts_j = 0; + int parts_in_bundle_ci = 0; + int parts_in_bundle_cj = 0; + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { + if (tid < tasks_packed) { + /*Get an estimate for the max number of parts per cell in each bundle. 
+ * Used for determining the number of GPU CUDA blocks*/ + int count_i = fparti_fpartj_lparti_lpartj_dens[tid].z - + fparti_fpartj_lparti_lpartj_dens[tid].x; + parts_in_bundle_ci += count_i; + max_parts_i = max(max_parts_i, count_i); + int count_j = fparti_fpartj_lparti_lpartj_dens[tid].w - + fparti_fpartj_lparti_lpartj_dens[tid].y; + parts_in_bundle_cj += count_j; + max_parts_j = max(max_parts_j, count_j); + } + } + const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; + const int bundle_n_parts = + pack_vars->bundle_last_part[bid] - first_part_tmp_i; + /////////////////////////////////////////////////////////////////// + // Copy results back to CPU BUFFERS + cudaMemcpyAsync(&parts_recv[first_part_tmp_i], + &d_parts_recv[first_part_tmp_i], + bundle_n_parts * sizeof(struct part_aos_f4_recv), + cudaMemcpyDeviceToHost, stream[bid]); + cudaEventRecord(pair_end[bid], stream[bid]); #ifdef CUDA_DEBUG - cudaError_t cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // - // Get error code - if (cu_error != cudaSuccess) { - fprintf(stderr, - "CUDA error with self density D2H memcpy: %s cpuid id is: %i\n ", - cudaGetErrorString(cu_error), r->cpuid); - error("Something's up with your cuda code"); - } + cudaError_t cu_error = + cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with self density D2H memcpy: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + error("Something's up with your cuda code"); + } #endif - }/*End of looping over bundles to launch in streams*/ + } /*End of looping over bundles to launch in streams*/ + + /* Make sure all the kernels and copies back are finished */ + // cudaDeviceSynchronize(); + + /*Time end of GPU work*/ + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + /*Time unpacking*/ + // clock_gettime(CLOCK_REALTIME, &t0); + /* Now copy the data back from the CPU thread-local buffers to the cells */ + /* Pack length counter for use in unpacking */ + int pack_length_unpack = 0; + for (int bid = 0; bid < nBundles_temp; bid++) { + clock_gettime(CLOCK_REALTIME, &t0); - /* Make sure all the kernels and copies back are finished */ -// cudaDeviceSynchronize(); + // cudaStreamSynchronize(stream[bid]); + cudaEventSynchronize(pair_end[bid]); + + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - /*Time end of GPU work*/ - clock_gettime(CLOCK_REALTIME, &t1); - *gpu_time += (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; /*Time unpacking*/ -// clock_gettime(CLOCK_REALTIME, &t0); - /* Now copy the data back from the CPU thread-local buffers to the cells */ - /* Pack length counter for use in unpacking */ - int pack_length_unpack=0; - for (int bid = 0; bid < nBundles_temp; bid++){ - clock_gettime(CLOCK_REALTIME, &t0); - -// cudaStreamSynchronize(stream[bid]); - cudaEventSynchronize(pair_end[bid]); - - clock_gettime(CLOCK_REALTIME, &t1); - *gpu_time += (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - - /*Time unpacking*/ -// clock_gettime(CLOCK_REALTIME, &tp0); -// int bundle_first_task = pack_vars->bundle_first_task_list[bid]; - - for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { - - if(tid < tasks_packed){ - - /*grab cell and task pointers*/ - struct cell *cii = pack_vars->ci_list[tid]; - struct cell *cjj = pack_vars->cj_list[tid]; - struct task *tii = 
pack_vars->task_list[tid]; - - /*Let's lock ci*/ - while(cell_locktree(cii)) { - ; /* spin until we acquire the lock */ - } - /*Let's lock cj*/ - while(cell_locktree(cjj)) { - ; /* spin until we acquire the lock */ - } - /* Do the copy */ - /*Time unpacking*/ - clock_gettime(CLOCK_REALTIME, &tp0); - runner_do_ci_cj_gpu_unpack_neat_aos_f4(r, cii, cjj, parts_recv, 0, - &pack_length_unpack, tid, 2 * pack_vars->count_max_parts, e); - - /* Record things for debugging */ - cii->gpu_done_pair++; - cjj->gpu_done_pair++; - - tii->gpu_done = 1; - /*Time end of unpacking*/ - clock_gettime(CLOCK_REALTIME, &tp1); - *unpack_time += (tp1.tv_sec - tp0.tv_sec) + - (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; - /*schedule my dependencies (Only unpacks really)*/ - enqueue_dependencies(s, tii); - /*Signal sleeping runners*/ - signal_sleeping_runners(s, tii); - - /* Release the locks */ - cell_unlocktree(cii); - /* Release the locks */ - cell_unlocktree(cjj); - - } - } - /*Time end of unpacking*/ -// clock_gettime(CLOCK_REALTIME, &tp1); -// *unpack_time += (tp1.tv_sec - tp0.tv_sec) + -// (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; -// *packing_time += (tp1.tv_sec - tp0.tv_sec) + -// (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; -// *packing_time += (tp1.tv_sec - tp0.tv_sec) + -// (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; - } - /* Zero counters for the next pack operations */ - pack_vars->count_parts = 0; - pack_vars->tasks_packed = 0; - /*Time end of unpacking*/ -// clock_gettime(CLOCK_REALTIME, &t1); -// *packing_time += (t1.tv_sec - t0.tv_sec) + -// (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - } /*End of GPU work*/ - -void runner_dopair1_launch_g(struct runner *r, struct scheduler *s, struct pack_vars_pair *pack_vars, struct cell *ci, - struct task *t, struct part_aos_g *parts_aos, struct part_aos_g *d_parts_aos, cudaStream_t * stream, float d_a, float d_H, - struct engine *e, double *packing_time, double *gpu_time){ - - struct timespec t0, t1; // - clock_gettime(CLOCK_REALTIME, &t0); + // clock_gettime(CLOCK_REALTIME, &tp0); + // int bundle_first_task = pack_vars->bundle_first_task_list[bid]; - /* Identify the number of GPU bundles to run in ideal case*/ - int nBundles_temp = pack_vars->nBundles; - /*How many tasks have we packed?*/ - const int tasks_packed = pack_vars->tasks_packed; - - /*How many tasks should be in a bundle?*/ - const int bundle_size = pack_vars->bundle_size; - - /*tasks-packed needs decrementing before calculating packed_tmp as it was incremented in runner_dopair1_pack*/ - const int packed_tmp = 2 * (tasks_packed - 1); - - /* Special case for incomplete bundles (when having leftover tasks not enough to fill a bundle) */ - if (pack_vars->launch_leftovers) { - nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; - if(tasks_packed == 0) error("zero pair tasks packed but somehow got into GPU loop"); - pack_vars->bundle_first_part[nBundles_temp] = pack_vars->task_first_part[packed_tmp - 2]; - } - /* Identify the last particle for each bundle of tasks */ - for (int bid = 0; bid < nBundles_temp - 1; bid++) { - pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; - } - /* special treatment for the last bundle */ - if(nBundles_temp > 1)pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; - else pack_vars->bundle_last_part[0] = pack_vars->count_parts; - /*Copy arrays containing first and last part for each task to GPU*/ - cudaMemcpy(pack_vars->d_task_first_part, pack_vars->task_first_part, - 2 * tasks_packed * sizeof(int), cudaMemcpyHostToDevice); 
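/* For reference, a minimal sketch of the wall-clock accumulation used for
 * *packing_time, *gpu_time and *unpack_time in these launch functions: two
 * CLOCK_REALTIME samples (struct timespec, from <time.h>) are reduced to
 * elapsed seconds.  The helper name is hypothetical and not part of this
 * code. */
static inline double elapsed_seconds(const struct timespec t0,
                                      const struct timespec t1) {
  /* seconds plus nanoseconds expressed in seconds */
  return (double)(t1.tv_sec - t0.tv_sec) +
         (double)(t1.tv_nsec - t0.tv_nsec) / 1000000000.0;
}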
- cudaMemcpy(pack_vars->d_task_last_part, pack_vars->task_last_part, - 2 * tasks_packed * sizeof(int), cudaMemcpyHostToDevice); - - /*Copy cell shifts to device*/ - cudaMemcpy(pack_vars->d_shiftx, pack_vars->shiftx, - 2 * tasks_packed * sizeof(double), cudaMemcpyHostToDevice); - cudaMemcpy(pack_vars->d_shifty, pack_vars->shifty, - 2 * tasks_packed * sizeof(double), cudaMemcpyHostToDevice); - cudaMemcpy(pack_vars->d_shiftz, pack_vars->shiftz, - 2 * tasks_packed * sizeof(double), cudaMemcpyHostToDevice); - - /* Launch the copies for each bundle and run the GPU kernel */ - /*We don't go into this loop if tasks_left_self == 1 as - nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ - for (int bid = 0; bid < nBundles_temp; bid++) { - - int max_parts_i = 0; - int max_parts_j = 0; - int parts_in_bundle_ci = 0; - int parts_in_bundle_cj = 0; - for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; - tid++) { - if (tid < tasks_packed) { - /*Get an estimate for the max number of parts per cell in each bundle. - * Used for determining the number of GPU CUDA blocks*/ - const int tid_tmp = 2 * tid; - int count_i = pack_vars->task_last_part[tid_tmp] - - pack_vars->task_first_part[tid_tmp]; - parts_in_bundle_ci += count_i; - max_parts_i = max(max_parts_i, count_i); - int count_j = pack_vars->task_last_part[tid_tmp + 1] - - pack_vars->task_first_part[tid_tmp + 1]; - parts_in_bundle_cj += count_j; - max_parts_j = max(max_parts_j, count_j); - } - } - const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; - const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp_i; - cudaMemcpyAsync(&d_parts_aos[first_part_tmp_i], &parts_aos[first_part_tmp_i], - bundle_n_parts * sizeof(struct part_aos_g), cudaMemcpyHostToDevice, stream[bid]); + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { -#ifdef CUDA_DEBUG - cudaError_t cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // - // Get error code - if (cu_error != cudaSuccess) { - fprintf(stderr, - "CUDA error with pair density H2D async memcpy ci: %s cpuid id is: %i\n ", - cudaGetErrorString(cu_error), r->cpuid); - error("Something's up with your cuda code"); - } -#endif + if (tid < tasks_packed) { + + /*grab cell and task pointers*/ + struct cell *cii = pack_vars->ci_list[tid]; + struct cell *cjj = pack_vars->cj_list[tid]; + struct task *tii = pack_vars->task_list[tid]; - const int tasksperbundle = pack_vars->tasksperbundle; - /* LAUNCH THE GPU KERNELS for ci & cj */ - int tid = 0; - int offset = bid * tasksperbundle; - int tasks_left = tasksperbundle; - if (bid == nBundles_temp - 1) { - tasks_left = - tasks_packed - (nBundles_temp - 1) * tasksperbundle; + /*Let's lock ci*/ + while (cell_locktree(cii)) { + ; /* spin until we acquire the lock */ + } + /*Let's lock cj*/ + while (cell_locktree(cjj)) { + ; /* spin until we acquire the lock */ + } + /* Do the copy */ + /*Time unpacking*/ + clock_gettime(CLOCK_REALTIME, &tp0); + runner_do_ci_cj_gpu_unpack_neat_aos_f4( + r, cii, cjj, parts_recv, 0, &pack_length_unpack, tid, + 2 * pack_vars->count_max_parts, e); + + /* Record things for debugging */ + cii->gpu_done_pair++; + cjj->gpu_done_pair++; + + tii->gpu_done = 1; + /*Time end of unpacking*/ + clock_gettime(CLOCK_REALTIME, &tp1); + *unpack_time += (tp1.tv_sec - tp0.tv_sec) + + (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; + /*schedule my dependencies (Only unpacks really)*/ + enqueue_dependencies(s, tii); + /*Signal sleeping runners*/ + signal_sleeping_runners(s, tii); + + /* Release the locks */ + 
cell_unlocktree(cii); + /* Release the locks */ + cell_unlocktree(cjj); } + } + /*Time end of unpacking*/ + // clock_gettime(CLOCK_REALTIME, &tp1); + // *unpack_time += (tp1.tv_sec - tp0.tv_sec) + + // (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; + // *packing_time += (tp1.tv_sec - tp0.tv_sec) + + // (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; + // *packing_time += (tp1.tv_sec - tp0.tv_sec) + + // (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; + } + /* Zero counters for the next pack operations */ + pack_vars->count_parts = 0; + pack_vars->tasks_packed = 0; + /*Time end of unpacking*/ + // clock_gettime(CLOCK_REALTIME, &t1); + // *packing_time += (t1.tv_sec - t0.tv_sec) + + // (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +} /*End of GPU work*/ - // Setup 2d grid of GPU thread blocks for ci (number of tasks is - // the y dimension and max_parts is the x dimension - int numBlocks_y = tasks_left; - int bundle_first_task = pack_vars->bundle_first_task_list[bid]; - const char *loop_type = "density"; - - /* Launch the kernel for ci using data for ci and cj */ - runner_dopairci_branch_density_gpu_aos_g(d_parts_aos, - pack_vars->d_task_first_part, pack_vars->d_task_last_part, - d_a, d_H, loop_type, stream[bid], bid, BLOCK_SIZE, - tasks_packed, tasksperbundle, max_parts_i, max_parts_j, numBlocks_y, tid, - offset, bundle_first_task, pack_vars->d_shiftx, pack_vars->d_shifty, pack_vars->d_shiftz); - - /* Launch the kernel for ci using data for ci and cj */ - runner_dopaircj_branch_density_gpu_aos_g(d_parts_aos, - pack_vars->d_task_first_part, pack_vars->d_task_last_part, - d_a, d_H, loop_type, stream[bid], bid, BLOCK_SIZE, - tasks_packed, tasksperbundle, max_parts_i, max_parts_j, numBlocks_y, tid, - offset, bundle_first_task, pack_vars->d_shiftx, pack_vars->d_shifty, pack_vars->d_shiftz); +void runner_dopair1_launch_g(struct runner *r, struct scheduler *s, + struct pack_vars_pair *pack_vars, struct cell *ci, + struct task *t, struct part_aos_g *parts_aos, + struct part_aos_g *d_parts_aos, + cudaStream_t *stream, float d_a, float d_H, + struct engine *e, double *packing_time, + double *gpu_time) { + + struct timespec t0, t1; // + clock_gettime(CLOCK_REALTIME, &t0); + + /* Identify the number of GPU bundles to run in ideal case*/ + int nBundles_temp = pack_vars->nBundles; + /*How many tasks have we packed?*/ + const int tasks_packed = pack_vars->tasks_packed; + + /*How many tasks should be in a bundle?*/ + const int bundle_size = pack_vars->bundle_size; + + /*tasks-packed needs decrementing before calculating packed_tmp as it was + * incremented in runner_dopair1_pack*/ + const int packed_tmp = 2 * (tasks_packed - 1); + + /* Special case for incomplete bundles (when having leftover tasks not enough + * to fill a bundle) */ + if (pack_vars->launch_leftovers) { + nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; + if (tasks_packed == 0) + error("zero pair tasks packed but somehow got into GPU loop"); + pack_vars->bundle_first_part[nBundles_temp] = + pack_vars->task_first_part[packed_tmp - 2]; + } + /* Identify the last particle for each bundle of tasks */ + for (int bid = 0; bid < nBundles_temp - 1; bid++) { + pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; + } + /* special treatment for the last bundle */ + if (nBundles_temp > 1) + pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; + else + pack_vars->bundle_last_part[0] = pack_vars->count_parts; + /*Copy arrays containing first and last part for each task to GPU*/ + 
cudaMemcpy(pack_vars->d_task_first_part, pack_vars->task_first_part, + 2 * tasks_packed * sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(pack_vars->d_task_last_part, pack_vars->task_last_part, + 2 * tasks_packed * sizeof(int), cudaMemcpyHostToDevice); + + /*Copy cell shifts to device*/ + cudaMemcpy(pack_vars->d_shiftx, pack_vars->shiftx, + 2 * tasks_packed * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(pack_vars->d_shifty, pack_vars->shifty, + 2 * tasks_packed * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(pack_vars->d_shiftz, pack_vars->shiftz, + 2 * tasks_packed * sizeof(double), cudaMemcpyHostToDevice); + + /* Launch the copies for each bundle and run the GPU kernel */ + /*We don't go into this loop if tasks_left_self == 1 as + nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ + for (int bid = 0; bid < nBundles_temp; bid++) { + + int max_parts_i = 0; + int max_parts_j = 0; + int parts_in_bundle_ci = 0; + int parts_in_bundle_cj = 0; + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { + if (tid < tasks_packed) { + /*Get an estimate for the max number of parts per cell in each bundle. + * Used for determining the number of GPU CUDA blocks*/ + const int tid_tmp = 2 * tid; + int count_i = pack_vars->task_last_part[tid_tmp] - + pack_vars->task_first_part[tid_tmp]; + parts_in_bundle_ci += count_i; + max_parts_i = max(max_parts_i, count_i); + int count_j = pack_vars->task_last_part[tid_tmp + 1] - + pack_vars->task_first_part[tid_tmp + 1]; + parts_in_bundle_cj += count_j; + max_parts_j = max(max_parts_j, count_j); + } + } + const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; + const int bundle_n_parts = + pack_vars->bundle_last_part[bid] - first_part_tmp_i; + cudaMemcpyAsync(&d_parts_aos[first_part_tmp_i], + &parts_aos[first_part_tmp_i], + bundle_n_parts * sizeof(struct part_aos_g), + cudaMemcpyHostToDevice, stream[bid]); #ifdef CUDA_DEBUG - cu_error = cudaPeekAtLastError(); // Get error code - if (cu_error != cudaSuccess) { - fprintf(stderr, - "CUDA error with self density kernel launch: %s cpuid id is: %i\n ", - cudaGetErrorString(cu_error), r->cpuid); - exit(0); - } + cudaError_t cu_error = + cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with pair density H2D async memcpy ci: %s cpuid id " + "is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + error("Something's up with your cuda code"); + } #endif - // Copy results back to CPU BUFFERS - cudaMemcpyAsync(&parts_aos[first_part_tmp_i], &d_parts_aos[first_part_tmp_i], - bundle_n_parts * sizeof(struct part_aos_g), cudaMemcpyDeviceToHost, stream[bid]); + const int tasksperbundle = pack_vars->tasksperbundle; + /* LAUNCH THE GPU KERNELS for ci & cj */ + int tid = 0; + int offset = bid * tasksperbundle; + int tasks_left = tasksperbundle; + if (bid == nBundles_temp - 1) { + tasks_left = tasks_packed - (nBundles_temp - 1) * tasksperbundle; + } + + // Setup 2d grid of GPU thread blocks for ci (number of tasks is + // the y dimension and max_parts is the x dimension + int numBlocks_y = tasks_left; + int bundle_first_task = pack_vars->bundle_first_task_list[bid]; + const char *loop_type = "density"; + + /* Launch the kernel for ci using data for ci and cj */ + runner_dopairci_branch_density_gpu_aos_g( + d_parts_aos, pack_vars->d_task_first_part, pack_vars->d_task_last_part, + d_a, d_H, loop_type, stream[bid], bid, BLOCK_SIZE, tasks_packed, + tasksperbundle, max_parts_i, max_parts_j, numBlocks_y, tid, 
offset, + bundle_first_task, pack_vars->d_shiftx, pack_vars->d_shifty, + pack_vars->d_shiftz); + + /* Launch the kernel for ci using data for ci and cj */ + runner_dopaircj_branch_density_gpu_aos_g( + d_parts_aos, pack_vars->d_task_first_part, pack_vars->d_task_last_part, + d_a, d_H, loop_type, stream[bid], bid, BLOCK_SIZE, tasks_packed, + tasksperbundle, max_parts_i, max_parts_j, numBlocks_y, tid, offset, + bundle_first_task, pack_vars->d_shiftx, pack_vars->d_shifty, + pack_vars->d_shiftz); #ifdef CUDA_DEBUG - cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // - // Get error code - if (cu_error != cudaSuccess) { - fprintf(stderr, - "CUDA error with self density D2H memcpy: %s cpuid id is: %i\n ", - cudaGetErrorString(cu_error), r->cpuid); - error("Something's up with your cuda code"); - } + cu_error = cudaPeekAtLastError(); // Get error code + if (cu_error != cudaSuccess) { + fprintf( + stderr, + "CUDA error with self density kernel launch: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + exit(0); + } #endif - }/*End of looping over bundles to launch in streams*/ - /* Make sure all the kernels and copies back are finished */ - cudaDeviceSynchronize(); + // Copy results back to CPU BUFFERS + cudaMemcpyAsync(&parts_aos[first_part_tmp_i], + &d_parts_aos[first_part_tmp_i], + bundle_n_parts * sizeof(struct part_aos_g), + cudaMemcpyDeviceToHost, stream[bid]); - /*Time end of GPU work*/ - clock_gettime(CLOCK_REALTIME, &t1); - *gpu_time += (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - /*Time unpacking*/ - clock_gettime(CLOCK_REALTIME, &t0); - /* Now copy the data back from the CPU thread-local buffers to the cells */ - /* Pack length counter for use in unpacking */ - int pack_length_unpack=0; - for (int tid = 0; tid < tasks_packed; tid++) { - - /*grab cell and task pointers*/ - struct cell *cii = pack_vars->ci_list[tid]; - struct cell *cjj = pack_vars->cj_list[tid]; - struct task *tii = pack_vars->task_list[tid]; - - /*Let's lock ci*/ - while(cell_locktree(cii)) { - ; /* spin until we acquire the lock */ - } - /*Let's lock cj*/ - while(cell_locktree(cjj)) { - ; /* spin until we acquire the lock */ - } - /* Do the copy */ - runner_do_ci_cj_gpu_unpack_neat_aos_g(r, cii, cjj, parts_aos, 0, - &pack_length_unpack, tid, 2 * pack_vars->count_max_parts, e); - - /* Record things for debugging */ - cii->gpu_done_pair_g++; - cjj->gpu_done_pair_g++; - - tii->gpu_done = 1; - - /*schedule my dependencies (Only unpacks really)*/ - enqueue_dependencies(s, tii); - /*Signal sleeping runners*/ - signal_sleeping_runners(s, tii); - - /* Release the locks */ - cell_unlocktree(cii); - /* Release the locks */ - cell_unlocktree(cjj); - } - /* Zero counters for the next pack operations */ - pack_vars->count_parts = 0; - pack_vars->tasks_packed = 0; - /*Time end of unpacking*/ - clock_gettime(CLOCK_REALTIME, &t1); - *packing_time += (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +#ifdef CUDA_DEBUG + cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with self density D2H memcpy: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + error("Something's up with your cuda code"); + } +#endif + } /*End of looping over bundles to launch in streams*/ + + /* Make sure all the kernels and copies back are finished */ + cudaDeviceSynchronize(); + + /*Time end of GPU work*/ + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += + (t1.tv_sec - t0.tv_sec) 
+ (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + /*Time unpacking*/ + clock_gettime(CLOCK_REALTIME, &t0); + /* Now copy the data back from the CPU thread-local buffers to the cells */ + /* Pack length counter for use in unpacking */ + int pack_length_unpack = 0; + for (int tid = 0; tid < tasks_packed; tid++) { + + /*grab cell and task pointers*/ + struct cell *cii = pack_vars->ci_list[tid]; + struct cell *cjj = pack_vars->cj_list[tid]; + struct task *tii = pack_vars->task_list[tid]; + + /*Let's lock ci*/ + while (cell_locktree(cii)) { + ; /* spin until we acquire the lock */ + } + /*Let's lock cj*/ + while (cell_locktree(cjj)) { + ; /* spin until we acquire the lock */ + } + /* Do the copy */ + runner_do_ci_cj_gpu_unpack_neat_aos_g(r, cii, cjj, parts_aos, 0, + &pack_length_unpack, tid, + 2 * pack_vars->count_max_parts, e); + + /* Record things for debugging */ + cii->gpu_done_pair_g++; + cjj->gpu_done_pair_g++; + + tii->gpu_done = 1; + + /*schedule my dependencies (Only unpacks really)*/ + enqueue_dependencies(s, tii); + /*Signal sleeping runners*/ + signal_sleeping_runners(s, tii); + + /* Release the locks */ + cell_unlocktree(cii); + /* Release the locks */ + cell_unlocktree(cjj); + } + /* Zero counters for the next pack operations */ + pack_vars->count_parts = 0; + pack_vars->tasks_packed = 0; + /*Time end of unpacking*/ + clock_gettime(CLOCK_REALTIME, &t1); + *packing_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; } /*End of GPU work*/ -void runner_dopair1_launch_f4_g_one_memcpy(struct runner *r, struct scheduler *s, struct pack_vars_pair *pack_vars, - struct task *t, struct part_aos_f4_g_send *parts_send, struct part_aos_f4_g_recv *parts_recv, struct part_aos_f4_g_send *d_parts_send, - struct part_aos_f4_g_recv *d_parts_recv, cudaStream_t * stream, float d_a, float d_H, - struct engine *e, double *packing_time, double *gpu_time, double *unpack_time, int4 *fparti_fpartj_lparti_lpartj, - cudaEvent_t * pair_end){ - - struct timespec t0, t1, tp0, tp1; // - clock_gettime(CLOCK_REALTIME, &t0); - - /* Identify the number of GPU bundles to run in ideal case*/ - int nBundles_temp = pack_vars->nBundles; - /*How many tasks have we packed?*/ - const int tasks_packed = pack_vars->tasks_packed; - - /*How many tasks should be in a bundle?*/ - const int bundle_size = pack_vars->bundle_size; - - /*tasks-packed needs decrementing before calculating packed_tmp as it was incremented in runner_dopair1_pack*/ -// const int packed_tmp = 2 * (tasks_packed - 1); - - /* Special case for incomplete bundles (when having leftover tasks not enough to fill a bundle) */ - if (pack_vars->launch_leftovers) { - nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; - if(tasks_packed == 0) error("zero pair tasks packed but somehow got into GPU loop"); -// pack_vars->bundle_first_part[nBundles_temp] = pack_vars->task_first_part[packed_tmp - 2]; - pack_vars->bundle_first_part[nBundles_temp] = fparti_fpartj_lparti_lpartj[tasks_packed - 1].x; - } - /* Identify the last particle for each bundle of tasks */ - for (int bid = 0; bid < nBundles_temp - 1; bid++) { - pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; - } - /* special treatment for the last bundle */ - if(nBundles_temp > 1)pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; - else pack_vars->bundle_last_part[0] = pack_vars->count_parts; - - /* Launch the copies for each bundle and run the GPU kernel */ - /*We don't go into this loop if tasks_left_self == 1 as - nBundles_temp will be 
zero DUHDUHDUHDUHHHHHH!!!!!*/ -// int max_parts = 0; - for (int bid = 0; bid < nBundles_temp; bid++) { - - int max_parts_i = 0; - int max_parts_j = 0; - int parts_in_bundle_ci = 0; - int parts_in_bundle_cj = 0; - for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; - tid++) { - if (tid < tasks_packed) { - /*Get an estimate for the max number of parts per cell in each bundle. - * Used for determining the number of GPU CUDA blocks*/ - int count_i = fparti_fpartj_lparti_lpartj[tid].z - - fparti_fpartj_lparti_lpartj[tid].x; - parts_in_bundle_ci += count_i; - max_parts_i = max(max_parts_i, count_i); - int count_j = fparti_fpartj_lparti_lpartj[tid].w - - fparti_fpartj_lparti_lpartj[tid].y; - parts_in_bundle_cj += count_j; - max_parts_j = max(max_parts_j, count_j); - } +void runner_dopair1_launch_f4_g_one_memcpy( + struct runner *r, struct scheduler *s, struct pack_vars_pair *pack_vars, + struct task *t, struct part_aos_f4_g_send *parts_send, + struct part_aos_f4_g_recv *parts_recv, + struct part_aos_f4_g_send *d_parts_send, + struct part_aos_f4_g_recv *d_parts_recv, cudaStream_t *stream, float d_a, + float d_H, struct engine *e, double *packing_time, double *gpu_time, + double *unpack_time, int4 *fparti_fpartj_lparti_lpartj, + cudaEvent_t *pair_end) { + + struct timespec t0, t1, tp0, tp1; // + clock_gettime(CLOCK_REALTIME, &t0); + + /* Identify the number of GPU bundles to run in ideal case*/ + int nBundles_temp = pack_vars->nBundles; + /*How many tasks have we packed?*/ + const int tasks_packed = pack_vars->tasks_packed; + + /*How many tasks should be in a bundle?*/ + const int bundle_size = pack_vars->bundle_size; + + /*tasks-packed needs decrementing before calculating packed_tmp as it was + * incremented in runner_dopair1_pack*/ + // const int packed_tmp = 2 * (tasks_packed - 1); + + /* Special case for incomplete bundles (when having leftover tasks not enough + * to fill a bundle) */ + if (pack_vars->launch_leftovers) { + nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; + if (tasks_packed == 0) + error("zero pair tasks packed but somehow got into GPU loop"); + // pack_vars->bundle_first_part[nBundles_temp] = + //pack_vars->task_first_part[packed_tmp - 2]; + pack_vars->bundle_first_part[nBundles_temp] = + fparti_fpartj_lparti_lpartj[tasks_packed - 1].x; + } + /* Identify the last particle for each bundle of tasks */ + for (int bid = 0; bid < nBundles_temp - 1; bid++) { + pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; + } + /* special treatment for the last bundle */ + if (nBundles_temp > 1) + pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; + else + pack_vars->bundle_last_part[0] = pack_vars->count_parts; + + /* Launch the copies for each bundle and run the GPU kernel */ + /*We don't go into this loop if tasks_left_self == 1 as + nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ + // int max_parts = 0; + for (int bid = 0; bid < nBundles_temp; bid++) { + + int max_parts_i = 0; + int max_parts_j = 0; + int parts_in_bundle_ci = 0; + int parts_in_bundle_cj = 0; + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { + if (tid < tasks_packed) { + /*Get an estimate for the max number of parts per cell in each bundle. 
+ * Used for determining the number of GPU CUDA blocks*/ + int count_i = fparti_fpartj_lparti_lpartj[tid].z - + fparti_fpartj_lparti_lpartj[tid].x; + parts_in_bundle_ci += count_i; + max_parts_i = max(max_parts_i, count_i); + int count_j = fparti_fpartj_lparti_lpartj[tid].w - + fparti_fpartj_lparti_lpartj[tid].y; + parts_in_bundle_cj += count_j; + max_parts_j = max(max_parts_j, count_j); } - const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; - const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp_i; + } + const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; + const int bundle_n_parts = + pack_vars->bundle_last_part[bid] - first_part_tmp_i; - cudaMemcpyAsync(&d_parts_send[first_part_tmp_i], &parts_send[first_part_tmp_i], - bundle_n_parts * sizeof(struct part_aos_f4_g_send), cudaMemcpyHostToDevice, stream[bid]); + cudaMemcpyAsync(&d_parts_send[first_part_tmp_i], + &parts_send[first_part_tmp_i], + bundle_n_parts * sizeof(struct part_aos_f4_g_send), + cudaMemcpyHostToDevice, stream[bid]); #ifdef CUDA_DEBUG - cudaError_t cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // - // Get error code - if (cu_error != cudaSuccess) { - fprintf(stderr, - "CUDA error with pair density H2D async memcpy ci: %s cpuid id is: %i\n ", - cudaGetErrorString(cu_error), r->cpuid); - error("Something's up with your cuda code"); - } + cudaError_t cu_error = + cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with pair density H2D async memcpy ci: %s cpuid id " + "is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + error("Something's up with your cuda code"); + } #endif -// const int tasksperbundle = pack_vars->tasksperbundle; - /* LAUNCH THE GPU KERNELS for ci & cj */ - // Setup 2d grid of GPU thread blocks for ci (number of tasks is - // the y dimension and max_parts is the x dimension - int numBlocks_y = 0;//tasks_left; - int numBlocks_x = (bundle_n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; - int bundle_part_0 = pack_vars->bundle_first_part[bid]; -// fprintf(stderr, "bundle_part_0 %i bundle_first_task %i\n", bundle_part_0, bundle_first_task); - - /* Launch the kernel for ci using data for ci and cj */ - runner_dopair_branch_gradient_gpu_aos_f4(d_parts_send, d_parts_recv, - d_a, d_H, stream[bid], numBlocks_x, numBlocks_y, bundle_part_0, bundle_n_parts); + // const int tasksperbundle = pack_vars->tasksperbundle; + /* LAUNCH THE GPU KERNELS for ci & cj */ + // Setup 2d grid of GPU thread blocks for ci (number of tasks is + // the y dimension and max_parts is the x dimension + int numBlocks_y = 0; // tasks_left; + int numBlocks_x = (bundle_n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + int bundle_part_0 = pack_vars->bundle_first_part[bid]; + // fprintf(stderr, "bundle_part_0 %i bundle_first_task %i\n", + // bundle_part_0, bundle_first_task); + + /* Launch the kernel for ci using data for ci and cj */ + runner_dopair_branch_gradient_gpu_aos_f4( + d_parts_send, d_parts_recv, d_a, d_H, stream[bid], numBlocks_x, + numBlocks_y, bundle_part_0, bundle_n_parts); #ifdef CUDA_DEBUG - cu_error = cudaPeekAtLastError(); // Get error code - if (cu_error != cudaSuccess) { - fprintf(stderr, - "CUDA error with pair density kernel launch: %s cpuid id is: %i\n " - "nbx %i nby %i max_parts_i %i max_parts_j %i\n", - cudaGetErrorString(cu_error), r->cpuid, numBlocks_x, numBlocks_y, max_parts_i, max_parts_j); - exit(0); - } + cu_error = cudaPeekAtLastError(); // Get error code + if (cu_error != cudaSuccess) { 
+ fprintf( + stderr, + "CUDA error with pair density kernel launch: %s cpuid id is: %i\n " + "nbx %i nby %i max_parts_i %i max_parts_j %i\n", + cudaGetErrorString(cu_error), r->cpuid, numBlocks_x, numBlocks_y, + max_parts_i, max_parts_j); + exit(0); + } #endif - // Copy results back to CPU BUFFERS - cudaMemcpyAsync(&parts_recv[first_part_tmp_i], &d_parts_recv[first_part_tmp_i], - bundle_n_parts * sizeof(struct part_aos_f4_g_recv), cudaMemcpyDeviceToHost, stream[bid]); - cudaEventRecord(pair_end[bid], stream[bid]); + // Copy results back to CPU BUFFERS + cudaMemcpyAsync(&parts_recv[first_part_tmp_i], + &d_parts_recv[first_part_tmp_i], + bundle_n_parts * sizeof(struct part_aos_f4_g_recv), + cudaMemcpyDeviceToHost, stream[bid]); + cudaEventRecord(pair_end[bid], stream[bid]); #ifdef CUDA_DEBUG - cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // - // Get error code - if (cu_error != cudaSuccess) { - fprintf(stderr, - "CUDA error with self density D2H memcpy: %s cpuid id is: %i\n ", - cudaGetErrorString(cu_error), r->cpuid); - error("Something's up with your cuda code"); - } + cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with self density D2H memcpy: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + error("Something's up with your cuda code"); + } #endif - }/*End of looping over bundles to launch in streams*/ - - /* Make sure all the kernels and copies back are finished */ -// cudaDeviceSynchronize(); - - /*Time end of GPU work*/ - clock_gettime(CLOCK_REALTIME, &t1); - *gpu_time += (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - /* Now copy the data back from the CPU thread-local buffers to the cells */ - /* Pack length counter for use in unpacking */ - int pack_length_unpack=0; - for (int bid = 0; bid < nBundles_temp; bid++){ - /*Time unpacking*/ - clock_gettime(CLOCK_REALTIME, &t0); - -// cudaStreamSynchronize(stream[bid]); - cudaEventSynchronize(pair_end[bid]); - - clock_gettime(CLOCK_REALTIME, &t1); - *gpu_time += (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - - /*Time unpacking*/ -// clock_gettime(CLOCK_REALTIME, &tp0); -// int bundle_first_task = pack_vars->bundle_first_task_list[bid]; - - for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { - - if(tid < tasks_packed){ - clock_gettime(CLOCK_REALTIME, &tp0); - /*grab cell and task pointers*/ - struct cell *cii = pack_vars->ci_list[tid]; - struct cell *cjj = pack_vars->cj_list[tid]; - struct task *tii = pack_vars->task_list[tid]; - /*Let's lock ci*/ - while(cell_locktree(cii)) { - ; /* spin until we acquire the lock */ - } - /*Let's lock cj*/ - while(cell_locktree(cjj)) { - ; /* spin until we acquire the lock */ - } - /* Do the copy */ - runner_do_ci_cj_gpu_unpack_neat_aos_f4_g(r, cii, cjj, parts_recv, 0, - &pack_length_unpack, tid, 2 * pack_vars->count_max_parts, e); - - /* Record things for debugging */ - cii->gpu_done_pair_g++; - cjj->gpu_done_pair_g++; - - /* Release the locks */ - cell_unlocktree(cii); - /* Release the locks */ - cell_unlocktree(cjj); - - /*Time end of unpacking*/ - clock_gettime(CLOCK_REALTIME, &tp1); - *unpack_time += (tp1.tv_sec - tp0.tv_sec) + - (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; - - /*schedule my dependencies (Only unpacks really)*/ - enqueue_dependencies(s, tii); - /*Signal sleeping runners*/ - signal_sleeping_runners(s, tii); - - tii->gpu_done = 1; - - - } - } - } - /* Zero counters for the next pack 
operations */ - pack_vars->count_parts = 0; - pack_vars->tasks_packed = 0; -// /*Time end of unpacking*/ -// clock_gettime(CLOCK_REALTIME, &t1); -// *packing_time += (t1.tv_sec - t0.tv_sec) + -// (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - } /*End of GPU work*/ - -void runner_dopair1_launch_f4_g_mcpy_Ker_mcpy(struct runner *r, struct scheduler *s, struct pack_vars_pair *pack_vars, - struct task *t, struct part_aos_f4_g_send *parts_send, struct part_aos_f4_g_recv *parts_recv, struct part_aos_f4_g_send *d_parts_send, - struct part_aos_f4_g_recv *d_parts_recv, cudaStream_t * stream, float d_a, float d_H, - struct engine *e, double *packing_time, double *gpu_time, double *unpack_time, int4 *fparti_fpartj_lparti_lpartj, - cudaEvent_t * pair_end){ - - struct timespec t0, t1, tp0, tp1; // + } /*End of looping over bundles to launch in streams*/ + + /* Make sure all the kernels and copies back are finished */ + // cudaDeviceSynchronize(); + + /*Time end of GPU work*/ + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + /* Now copy the data back from the CPU thread-local buffers to the cells */ + /* Pack length counter for use in unpacking */ + int pack_length_unpack = 0; + for (int bid = 0; bid < nBundles_temp; bid++) { + /*Time unpacking*/ clock_gettime(CLOCK_REALTIME, &t0); - /* Identify the number of GPU bundles to run in ideal case*/ - int nBundles_temp = pack_vars->nBundles; - /*How many tasks have we packed?*/ - const int tasks_packed = pack_vars->tasks_packed; - - /*How many tasks should be in a bundle?*/ - const int bundle_size = pack_vars->bundle_size; - - /*tasks-packed needs decrementing before calculating packed_tmp as it was incremented in runner_dopair1_pack*/ -// const int packed_tmp = 2 * (tasks_packed - 1); - - /* Special case for incomplete bundles (when having leftover tasks not enough to fill a bundle) */ - if (pack_vars->launch_leftovers) { - nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; - if(tasks_packed == 0) error("zero pair tasks packed but somehow got into GPU loop"); -// pack_vars->bundle_first_part[nBundles_temp] = pack_vars->task_first_part[packed_tmp - 2]; - pack_vars->bundle_first_part[nBundles_temp] = fparti_fpartj_lparti_lpartj[tasks_packed - 1].x; - } - /* Identify the last particle for each bundle of tasks */ - for (int bid = 0; bid < nBundles_temp - 1; bid++) { - pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; - } - /* special treatment for the last bundle */ - if(nBundles_temp > 1)pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; - else pack_vars->bundle_last_part[0] = pack_vars->count_parts; - - /* Launch the copies for each bundle and run the GPU kernel */ - /*We don't go into this loop if tasks_left_self == 1 as - nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ -// int max_parts = 0; - for (int bid = 0; bid < nBundles_temp; bid++) { - - int max_parts_i = 0; - int max_parts_j = 0; - int parts_in_bundle_ci = 0; - int parts_in_bundle_cj = 0; -// const int first_task = bid * pack_vars->bundle_size; -// int last_task = (bid + 1) * bundle_size; - for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; - tid++) { - if (tid < tasks_packed) { - /*Get an estimate for the max number of parts per cell in each bundle. 
- * Used for determining the number of GPU CUDA blocks*/ - int count_i = fparti_fpartj_lparti_lpartj[tid].z - - fparti_fpartj_lparti_lpartj[tid].x; - parts_in_bundle_ci += count_i; - max_parts_i = max(max_parts_i, count_i); - int count_j = fparti_fpartj_lparti_lpartj[tid].w - - fparti_fpartj_lparti_lpartj[tid].y; - parts_in_bundle_cj += count_j; - max_parts_j = max(max_parts_j, count_j); - -// last_task = tid; + // cudaStreamSynchronize(stream[bid]); + cudaEventSynchronize(pair_end[bid]); + + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + + /*Time unpacking*/ + // clock_gettime(CLOCK_REALTIME, &tp0); + // int bundle_first_task = pack_vars->bundle_first_task_list[bid]; + + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { + + if (tid < tasks_packed) { + clock_gettime(CLOCK_REALTIME, &tp0); + /*grab cell and task pointers*/ + struct cell *cii = pack_vars->ci_list[tid]; + struct cell *cjj = pack_vars->cj_list[tid]; + struct task *tii = pack_vars->task_list[tid]; + /*Let's lock ci*/ + while (cell_locktree(cii)) { + ; /* spin until we acquire the lock */ + } + /*Let's lock cj*/ + while (cell_locktree(cjj)) { + ; /* spin until we acquire the lock */ } + /* Do the copy */ + runner_do_ci_cj_gpu_unpack_neat_aos_f4_g( + r, cii, cjj, parts_recv, 0, &pack_length_unpack, tid, + 2 * pack_vars->count_max_parts, e); + + /* Record things for debugging */ + cii->gpu_done_pair_g++; + cjj->gpu_done_pair_g++; + + /* Release the locks */ + cell_unlocktree(cii); + /* Release the locks */ + cell_unlocktree(cjj); + + /*Time end of unpacking*/ + clock_gettime(CLOCK_REALTIME, &tp1); + *unpack_time += (tp1.tv_sec - tp0.tv_sec) + + (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; + + /*schedule my dependencies (Only unpacks really)*/ + enqueue_dependencies(s, tii); + /*Signal sleeping runners*/ + signal_sleeping_runners(s, tii); + + tii->gpu_done = 1; } - const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; - const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp_i; + } + } + /* Zero counters for the next pack operations */ + pack_vars->count_parts = 0; + pack_vars->tasks_packed = 0; + // /*Time end of unpacking*/ + // clock_gettime(CLOCK_REALTIME, &t1); + // *packing_time += (t1.tv_sec - t0.tv_sec) + + // (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +} /*End of GPU work*/ - cudaMemcpyAsync(&d_parts_send[first_part_tmp_i], &parts_send[first_part_tmp_i], - bundle_n_parts * sizeof(struct part_aos_f4_g_send), cudaMemcpyHostToDevice, stream[bid]); +void runner_dopair1_launch_f4_g_mcpy_Ker_mcpy( + struct runner *r, struct scheduler *s, struct pack_vars_pair *pack_vars, + struct task *t, struct part_aos_f4_g_send *parts_send, + struct part_aos_f4_g_recv *parts_recv, + struct part_aos_f4_g_send *d_parts_send, + struct part_aos_f4_g_recv *d_parts_recv, cudaStream_t *stream, float d_a, + float d_H, struct engine *e, double *packing_time, double *gpu_time, + double *unpack_time, int4 *fparti_fpartj_lparti_lpartj, + cudaEvent_t *pair_end) { + + struct timespec t0, t1, tp0, tp1; // + clock_gettime(CLOCK_REALTIME, &t0); + + /* Identify the number of GPU bundles to run in ideal case*/ + int nBundles_temp = pack_vars->nBundles; + /*How many tasks have we packed?*/ + const int tasks_packed = pack_vars->tasks_packed; + + /*How many tasks should be in a bundle?*/ + const int bundle_size = pack_vars->bundle_size; + + /*tasks-packed needs decrementing before calculating packed_tmp as it was + * incremented in 
runner_dopair1_pack*/ + // const int packed_tmp = 2 * (tasks_packed - 1); + + /* Special case for incomplete bundles (when having leftover tasks not enough + * to fill a bundle) */ + if (pack_vars->launch_leftovers) { + nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; + if (tasks_packed == 0) + error("zero pair tasks packed but somehow got into GPU loop"); + // pack_vars->bundle_first_part[nBundles_temp] = + //pack_vars->task_first_part[packed_tmp - 2]; + pack_vars->bundle_first_part[nBundles_temp] = + fparti_fpartj_lparti_lpartj[tasks_packed - 1].x; + } + /* Identify the last particle for each bundle of tasks */ + for (int bid = 0; bid < nBundles_temp - 1; bid++) { + pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; + } + /* special treatment for the last bundle */ + if (nBundles_temp > 1) + pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; + else + pack_vars->bundle_last_part[0] = pack_vars->count_parts; + + /* Launch the copies for each bundle and run the GPU kernel */ + /*We don't go into this loop if tasks_left_self == 1 as + nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ + // int max_parts = 0; + for (int bid = 0; bid < nBundles_temp; bid++) { + + int max_parts_i = 0; + int max_parts_j = 0; + int parts_in_bundle_ci = 0; + int parts_in_bundle_cj = 0; + // const int first_task = bid * pack_vars->bundle_size; + // int last_task = (bid + 1) * bundle_size; + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { + if (tid < tasks_packed) { + /*Get an estimate for the max number of parts per cell in each bundle. + * Used for determining the number of GPU CUDA blocks*/ + int count_i = fparti_fpartj_lparti_lpartj[tid].z - + fparti_fpartj_lparti_lpartj[tid].x; + parts_in_bundle_ci += count_i; + max_parts_i = max(max_parts_i, count_i); + int count_j = fparti_fpartj_lparti_lpartj[tid].w - + fparti_fpartj_lparti_lpartj[tid].y; + parts_in_bundle_cj += count_j; + max_parts_j = max(max_parts_j, count_j); + + // last_task = tid; + } + } + const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; + const int bundle_n_parts = + pack_vars->bundle_last_part[bid] - first_part_tmp_i; + + cudaMemcpyAsync(&d_parts_send[first_part_tmp_i], + &parts_send[first_part_tmp_i], + bundle_n_parts * sizeof(struct part_aos_f4_g_send), + cudaMemcpyHostToDevice, stream[bid]); #ifdef CUDA_DEBUG - cudaError_t cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // - // Get error code - if (cu_error != cudaSuccess) { - fprintf(stderr, - "CUDA error with pair density H2D async memcpy ci: %s cpuid id is: %i\n ", - cudaGetErrorString(cu_error), r->cpuid); - error("Something's up with your cuda code"); - } + cudaError_t cu_error = + cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with pair density H2D async memcpy ci: %s cpuid id " + "is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + error("Something's up with your cuda code"); + } #endif - } - for (int bid = 0; bid < nBundles_temp; bid++) { - - int max_parts_i = 0; - int max_parts_j = 0; - int parts_in_bundle_ci = 0; - int parts_in_bundle_cj = 0; -// const int first_task = bid * pack_vars->bundle_size; -// int last_task = (bid + 1) * bundle_size; - for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; - tid++) { - if (tid < tasks_packed) { - /*Get an estimate for the max number of parts per cell in each bundle. 
- * Used for determining the number of GPU CUDA blocks*/ - int count_i = fparti_fpartj_lparti_lpartj[tid].z - - fparti_fpartj_lparti_lpartj[tid].x; - parts_in_bundle_ci += count_i; - max_parts_i = max(max_parts_i, count_i); - int count_j = fparti_fpartj_lparti_lpartj[tid].w - - fparti_fpartj_lparti_lpartj[tid].y; - parts_in_bundle_cj += count_j; - max_parts_j = max(max_parts_j, count_j); - -// last_task = tid; - } - } - const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; - const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp_i; -////////////////////////////////// -// const int tasksperbundle = pack_vars->tasksperbundle; - /* LAUNCH THE GPU KERNELS for ci & cj */ - // Setup 2d grid of GPU thread blocks for ci (number of tasks is - // the y dimension and max_parts is the x dimension - int numBlocks_y = 0;//tasks_left; - int numBlocks_x = (bundle_n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; - int bundle_part_0 = pack_vars->bundle_first_part[bid]; -// int bundle_first_task = pack_vars->bundle_first_task_list[bid]; -// fprintf(stderr, "bundle_part_0 %i bundle_first_task %i\n", bundle_part_0, bundle_first_task); - - /* Launch the kernel for ci using data for ci and cj */ - runner_dopair_branch_gradient_gpu_aos_f4(d_parts_send, d_parts_recv, - d_a, d_H, stream[bid], numBlocks_x, numBlocks_y, bundle_part_0, bundle_n_parts); + } + for (int bid = 0; bid < nBundles_temp; bid++) { + + int max_parts_i = 0; + int max_parts_j = 0; + int parts_in_bundle_ci = 0; + int parts_in_bundle_cj = 0; + // const int first_task = bid * pack_vars->bundle_size; + // int last_task = (bid + 1) * bundle_size; + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { + if (tid < tasks_packed) { + /*Get an estimate for the max number of parts per cell in each bundle. 
+ * Used for determining the number of GPU CUDA blocks*/ + int count_i = fparti_fpartj_lparti_lpartj[tid].z - + fparti_fpartj_lparti_lpartj[tid].x; + parts_in_bundle_ci += count_i; + max_parts_i = max(max_parts_i, count_i); + int count_j = fparti_fpartj_lparti_lpartj[tid].w - + fparti_fpartj_lparti_lpartj[tid].y; + parts_in_bundle_cj += count_j; + max_parts_j = max(max_parts_j, count_j); + + // last_task = tid; + } + } + const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; + const int bundle_n_parts = + pack_vars->bundle_last_part[bid] - first_part_tmp_i; + ////////////////////////////////// + // const int tasksperbundle = pack_vars->tasksperbundle; + /* LAUNCH THE GPU KERNELS for ci & cj */ + // Setup 2d grid of GPU thread blocks for ci (number of tasks is + // the y dimension and max_parts is the x dimension + int numBlocks_y = 0; // tasks_left; + int numBlocks_x = (bundle_n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + int bundle_part_0 = pack_vars->bundle_first_part[bid]; + // int bundle_first_task = pack_vars->bundle_first_task_list[bid]; + // fprintf(stderr, "bundle_part_0 %i bundle_first_task %i\n", + // bundle_part_0, bundle_first_task); + + /* Launch the kernel for ci using data for ci and cj */ + runner_dopair_branch_gradient_gpu_aos_f4( + d_parts_send, d_parts_recv, d_a, d_H, stream[bid], numBlocks_x, + numBlocks_y, bundle_part_0, bundle_n_parts); #ifdef CUDA_DEBUG - cudaError_t cu_error = cudaPeekAtLastError(); // Get error code - if (cu_error != cudaSuccess) { - fprintf(stderr, - "CUDA error with pair density kernel launch: %s cpuid id is: %i\n " - "nbx %i nby %i max_parts_i %i max_parts_j %i\n", - cudaGetErrorString(cu_error), r->cpuid, numBlocks_x, numBlocks_y, max_parts_i, max_parts_j); - exit(0); - } + cudaError_t cu_error = cudaPeekAtLastError(); // Get error code + if (cu_error != cudaSuccess) { + fprintf( + stderr, + "CUDA error with pair density kernel launch: %s cpuid id is: %i\n " + "nbx %i nby %i max_parts_i %i max_parts_j %i\n", + cudaGetErrorString(cu_error), r->cpuid, numBlocks_x, numBlocks_y, + max_parts_i, max_parts_j); + exit(0); + } #endif - } - - for (int bid = 0; bid < nBundles_temp; bid++) { - - int max_parts_i = 0; - int max_parts_j = 0; - int parts_in_bundle_ci = 0; - int parts_in_bundle_cj = 0; -// const int first_task = bid * pack_vars->bundle_size; -// int last_task = (bid + 1) * bundle_size; - for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; - tid++) { - if (tid < tasks_packed) { - /*Get an estimate for the max number of parts per cell in each bundle. 
- * Used for determining the number of GPU CUDA blocks*/ - int count_i = fparti_fpartj_lparti_lpartj[tid].z - - fparti_fpartj_lparti_lpartj[tid].x; - parts_in_bundle_ci += count_i; - max_parts_i = max(max_parts_i, count_i); - int count_j = fparti_fpartj_lparti_lpartj[tid].w - - fparti_fpartj_lparti_lpartj[tid].y; - parts_in_bundle_cj += count_j; - max_parts_j = max(max_parts_j, count_j); - -// last_task = tid; - } - } - const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; - const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp_i; -/////////////////////////////////////////////////////////////////// - // Copy results back to CPU BUFFERS - cudaMemcpyAsync(&parts_recv[first_part_tmp_i], &d_parts_recv[first_part_tmp_i], - bundle_n_parts * sizeof(struct part_aos_f4_g_recv), cudaMemcpyDeviceToHost, stream[bid]); - cudaEventRecord(pair_end[bid], stream[bid]); + } + + for (int bid = 0; bid < nBundles_temp; bid++) { + + int max_parts_i = 0; + int max_parts_j = 0; + int parts_in_bundle_ci = 0; + int parts_in_bundle_cj = 0; + // const int first_task = bid * pack_vars->bundle_size; + // int last_task = (bid + 1) * bundle_size; + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { + if (tid < tasks_packed) { + /*Get an estimate for the max number of parts per cell in each bundle. + * Used for determining the number of GPU CUDA blocks*/ + int count_i = fparti_fpartj_lparti_lpartj[tid].z - + fparti_fpartj_lparti_lpartj[tid].x; + parts_in_bundle_ci += count_i; + max_parts_i = max(max_parts_i, count_i); + int count_j = fparti_fpartj_lparti_lpartj[tid].w - + fparti_fpartj_lparti_lpartj[tid].y; + parts_in_bundle_cj += count_j; + max_parts_j = max(max_parts_j, count_j); + + // last_task = tid; + } + } + const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; + const int bundle_n_parts = + pack_vars->bundle_last_part[bid] - first_part_tmp_i; + /////////////////////////////////////////////////////////////////// + // Copy results back to CPU BUFFERS + cudaMemcpyAsync(&parts_recv[first_part_tmp_i], + &d_parts_recv[first_part_tmp_i], + bundle_n_parts * sizeof(struct part_aos_f4_g_recv), + cudaMemcpyDeviceToHost, stream[bid]); + cudaEventRecord(pair_end[bid], stream[bid]); #ifdef CUDA_DEBUG - cudaError_t cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // - // Get error code - if (cu_error != cudaSuccess) { - fprintf(stderr, - "CUDA error with self density D2H memcpy: %s cpuid id is: %i\n ", - cudaGetErrorString(cu_error), r->cpuid); - error("Something's up with your cuda code"); - } + cudaError_t cu_error = + cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with self density D2H memcpy: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + error("Something's up with your cuda code"); + } #endif - }/*End of looping over bundles to launch in streams*/ + } /*End of looping over bundles to launch in streams*/ + + /* Make sure all the kernels and copies back are finished */ + // cudaDeviceSynchronize(); + + /*Time end of GPU work*/ + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + /*Time unpacking*/ + // clock_gettime(CLOCK_REALTIME, &t0); + /* Now copy the data back from the CPU thread-local buffers to the cells */ + /* Pack length counter for use in unpacking */ + int pack_length_unpack = 0; + for (int bid = 0; bid < nBundles_temp; bid++) { + clock_gettime(CLOCK_REALTIME, 
&t0); + + // cudaStreamSynchronize(stream[bid]); + cudaEventSynchronize(pair_end[bid]); - /* Make sure all the kernels and copies back are finished */ -// cudaDeviceSynchronize(); + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - /*Time end of GPU work*/ - clock_gettime(CLOCK_REALTIME, &t1); - *gpu_time += (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; /*Time unpacking*/ -// clock_gettime(CLOCK_REALTIME, &t0); - /* Now copy the data back from the CPU thread-local buffers to the cells */ - /* Pack length counter for use in unpacking */ - int pack_length_unpack=0; - for (int bid = 0; bid < nBundles_temp; bid++){ - clock_gettime(CLOCK_REALTIME, &t0); - -// cudaStreamSynchronize(stream[bid]); - cudaEventSynchronize(pair_end[bid]); - - clock_gettime(CLOCK_REALTIME, &t1); - *gpu_time += (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - - /*Time unpacking*/ -// clock_gettime(CLOCK_REALTIME, &tp0); -// int bundle_first_task = pack_vars->bundle_first_task_list[bid]; - - for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { - - if(tid < tasks_packed){ - - /*grab cell and task pointers*/ - struct cell *cii = pack_vars->ci_list[tid]; - struct cell *cjj = pack_vars->cj_list[tid]; - struct task *tii = pack_vars->task_list[tid]; - - /*Let's lock ci*/ - while(cell_locktree(cii)) { - ; /* spin until we acquire the lock */ - } - /*Let's lock cj*/ - while(cell_locktree(cjj)) { - ; /* spin until we acquire the lock */ - } - /* Do the copy */ - /*Time unpacking*/ - clock_gettime(CLOCK_REALTIME, &tp0); - runner_do_ci_cj_gpu_unpack_neat_aos_f4_g(r, cii, cjj, parts_recv, 0, - &pack_length_unpack, tid, 2 * pack_vars->count_max_parts, e); - - /* Record things for debugging */ - cii->gpu_done_pair_g++; - cjj->gpu_done_pair_g++; - - tii->gpu_done = 1; - /*Time end of unpacking*/ - clock_gettime(CLOCK_REALTIME, &tp1); - *unpack_time += (tp1.tv_sec - tp0.tv_sec) + - (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; - /*schedule my dependencies (Only unpacks really)*/ - enqueue_dependencies(s, tii); - /*Signal sleeping runners*/ - signal_sleeping_runners(s, tii); - - /* Release the locks */ - cell_unlocktree(cii); - /* Release the locks */ - cell_unlocktree(cjj); - - } - } - /*Time end of unpacking*/ -// clock_gettime(CLOCK_REALTIME, &tp1); -// *unpack_time += (tp1.tv_sec - tp0.tv_sec) + -// (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; -// *packing_time += (tp1.tv_sec - tp0.tv_sec) + -// (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; -// *packing_time += (tp1.tv_sec - tp0.tv_sec) + -// (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; - } - /* Zero counters for the next pack operations */ - pack_vars->count_parts = 0; - pack_vars->tasks_packed = 0; - /*Time end of unpacking*/ -// clock_gettime(CLOCK_REALTIME, &t1); -// *packing_time += (t1.tv_sec - t0.tv_sec) + -// (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - } /*End of GPU work*/ - -void runner_dopair1_launch_f(struct runner *r, struct scheduler *s, struct pack_vars_pair *pack_vars, struct cell *ci, - struct task *t, struct part_aos_f *parts_aos, struct part_aos_f *d_parts_aos, cudaStream_t * stream, float d_a, float d_H, - struct engine *e, double *packing_time, double *gpu_time){ - - struct timespec t0, t1; // - clock_gettime(CLOCK_REALTIME, &t0); + // clock_gettime(CLOCK_REALTIME, &tp0); + // int bundle_first_task = pack_vars->bundle_first_task_list[bid]; - /* Identify the number of GPU bundles to run in ideal case*/ - int nBundles_temp = 
pack_vars->nBundles; - /*How many tasks have we packed?*/ - const int tasks_packed = pack_vars->tasks_packed; - - /*How many tasks should be in a bundle?*/ - const int bundle_size = pack_vars->bundle_size; - - /*tasks-packed needs decrementing before calculating packed_tmp as it was incremented in runner_dopair1_pack*/ - const int packed_tmp = 2 * (tasks_packed - 1); - - /* Special case for incomplete bundles (when having leftover tasks not enough to fill a bundle) */ - if (pack_vars->launch_leftovers) { - nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; - if(tasks_packed == 0) error("zero pair tasks packed but somehow got into GPU loop"); - pack_vars->bundle_first_part[nBundles_temp] = pack_vars->task_first_part[packed_tmp - 2]; - } - /* Identify the last particle for each bundle of tasks */ - for (int bid = 0; bid < nBundles_temp - 1; bid++) { - pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; - } - /* special treatment for the last bundle */ - if(nBundles_temp > 1)pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; - else pack_vars->bundle_last_part[0] = pack_vars->count_parts; - /*Copy arrays containing first and last part for each task to GPU*/ - cudaMemcpy(pack_vars->d_task_first_part, pack_vars->task_first_part, - 2 * tasks_packed * sizeof(int), cudaMemcpyHostToDevice); - cudaMemcpy(pack_vars->d_task_last_part, pack_vars->task_last_part, - 2 * tasks_packed * sizeof(int), cudaMemcpyHostToDevice); - - /*Copy cell shifts to device*/ - cudaMemcpy(pack_vars->d_shiftx, pack_vars->shiftx, - 2 * tasks_packed * sizeof(double), cudaMemcpyHostToDevice); - cudaMemcpy(pack_vars->d_shifty, pack_vars->shifty, - 2 * tasks_packed * sizeof(double), cudaMemcpyHostToDevice); - cudaMemcpy(pack_vars->d_shiftz, pack_vars->shiftz, - 2 * tasks_packed * sizeof(double), cudaMemcpyHostToDevice); - - /* Launch the copies for each bundle and run the GPU kernel */ - /*We don't go into this loop if tasks_left_self == 1 as - nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ -// int max_parts = 0; - for (int bid = 0; bid < nBundles_temp; bid++) { - - int max_parts_i = 0; - int max_parts_j = 0; - int parts_in_bundle_ci = 0; - int parts_in_bundle_cj = 0; -// const int first_task = bid * pack_vars->bundle_size; - for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; - tid++) { - if (tid < tasks_packed) { - /*Get an estimate for the max number of parts per cell in each bundle. 
- * Used for determining the number of GPU CUDA blocks*/ - const int tid_tmp = 2 * tid; - int count_i = pack_vars->task_last_part[tid_tmp] - - pack_vars->task_first_part[tid_tmp]; - parts_in_bundle_ci += count_i; - max_parts_i = max(max_parts_i, count_i); - int count_j = pack_vars->task_last_part[tid_tmp + 1] - - pack_vars->task_first_part[tid_tmp + 1]; - parts_in_bundle_cj += count_j; - max_parts_j = max(max_parts_j, count_j); - } - } - const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; - const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp_i; - cudaMemcpyAsync(&d_parts_aos[first_part_tmp_i], &parts_aos[first_part_tmp_i], - bundle_n_parts * sizeof(struct part_aos_f), cudaMemcpyHostToDevice, stream[bid]); + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { -#ifdef CUDA_DEBUG - cudaError_t cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // - // Get error code - if (cu_error != cudaSuccess) { - fprintf(stderr, - "CUDA error with pair density H2D async memcpy ci: %s cpuid id is: %i\n ", - cudaGetErrorString(cu_error), r->cpuid); - error("Something's up with your cuda code"); - } -#endif + if (tid < tasks_packed) { - const int tasksperbundle = pack_vars->tasksperbundle; - /* LAUNCH THE GPU KERNELS for ci & cj */ - int tid = 0; - int offset = bid * tasksperbundle; - int tasks_left = tasksperbundle; - if (bid == nBundles_temp - 1) { - tasks_left = - tasks_packed - (nBundles_temp - 1) * tasksperbundle; + /*grab cell and task pointers*/ + struct cell *cii = pack_vars->ci_list[tid]; + struct cell *cjj = pack_vars->cj_list[tid]; + struct task *tii = pack_vars->task_list[tid]; + + /*Let's lock ci*/ + while (cell_locktree(cii)) { + ; /* spin until we acquire the lock */ + } + /*Let's lock cj*/ + while (cell_locktree(cjj)) { + ; /* spin until we acquire the lock */ + } + /* Do the copy */ + /*Time unpacking*/ + clock_gettime(CLOCK_REALTIME, &tp0); + runner_do_ci_cj_gpu_unpack_neat_aos_f4_g( + r, cii, cjj, parts_recv, 0, &pack_length_unpack, tid, + 2 * pack_vars->count_max_parts, e); + + /* Record things for debugging */ + cii->gpu_done_pair_g++; + cjj->gpu_done_pair_g++; + + tii->gpu_done = 1; + /*Time end of unpacking*/ + clock_gettime(CLOCK_REALTIME, &tp1); + *unpack_time += (tp1.tv_sec - tp0.tv_sec) + + (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; + /*schedule my dependencies (Only unpacks really)*/ + enqueue_dependencies(s, tii); + /*Signal sleeping runners*/ + signal_sleeping_runners(s, tii); + + /* Release the locks */ + cell_unlocktree(cii); + /* Release the locks */ + cell_unlocktree(cjj); } + } + /*Time end of unpacking*/ + // clock_gettime(CLOCK_REALTIME, &tp1); + // *unpack_time += (tp1.tv_sec - tp0.tv_sec) + + // (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; + // *packing_time += (tp1.tv_sec - tp0.tv_sec) + + // (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; + // *packing_time += (tp1.tv_sec - tp0.tv_sec) + + // (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; + } + /* Zero counters for the next pack operations */ + pack_vars->count_parts = 0; + pack_vars->tasks_packed = 0; + /*Time end of unpacking*/ + // clock_gettime(CLOCK_REALTIME, &t1); + // *packing_time += (t1.tv_sec - t0.tv_sec) + + // (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +} /*End of GPU work*/ - // Setup 2d grid of GPU thread blocks for ci (number of tasks is - // the y dimension and max_parts is the x dimension - int numBlocks_y = tasks_left; -// int numBlocks_x = (max_parts_i + BLOCK_SIZE - 1) / BLOCK_SIZE; - int bundle_first_task = 
pack_vars->bundle_first_task_list[bid]; - const char *loop_type = "density"; - - /* Launch the kernel for ci using data for ci and cj */ - runner_dopairci_branch_density_gpu_aos_f(d_parts_aos, - pack_vars->d_task_first_part, pack_vars->d_task_last_part, - d_a, d_H, loop_type, stream[bid], bid, BLOCK_SIZE, - tasks_packed, tasksperbundle, max_parts_i, max_parts_j, numBlocks_y, tid, - offset, bundle_first_task, pack_vars->d_shiftx, pack_vars->d_shifty, pack_vars->d_shiftz); - - /* Launch the kernel for ci using data for ci and cj */ - runner_dopaircj_branch_density_gpu_aos_f(d_parts_aos, - pack_vars->d_task_first_part, pack_vars->d_task_last_part, - d_a, d_H, loop_type, stream[bid], bid, BLOCK_SIZE, - tasks_packed, tasksperbundle, max_parts_i, max_parts_j, numBlocks_y, tid, - offset, bundle_first_task, pack_vars->d_shiftx, pack_vars->d_shifty, pack_vars->d_shiftz); +void runner_dopair1_launch_f(struct runner *r, struct scheduler *s, + struct pack_vars_pair *pack_vars, struct cell *ci, + struct task *t, struct part_aos_f *parts_aos, + struct part_aos_f *d_parts_aos, + cudaStream_t *stream, float d_a, float d_H, + struct engine *e, double *packing_time, + double *gpu_time) { + + struct timespec t0, t1; // + clock_gettime(CLOCK_REALTIME, &t0); + + /* Identify the number of GPU bundles to run in ideal case*/ + int nBundles_temp = pack_vars->nBundles; + /*How many tasks have we packed?*/ + const int tasks_packed = pack_vars->tasks_packed; + + /*How many tasks should be in a bundle?*/ + const int bundle_size = pack_vars->bundle_size; + + /*tasks-packed needs decrementing before calculating packed_tmp as it was + * incremented in runner_dopair1_pack*/ + const int packed_tmp = 2 * (tasks_packed - 1); + + /* Special case for incomplete bundles (when having leftover tasks not enough + * to fill a bundle) */ + if (pack_vars->launch_leftovers) { + nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; + if (tasks_packed == 0) + error("zero pair tasks packed but somehow got into GPU loop"); + pack_vars->bundle_first_part[nBundles_temp] = + pack_vars->task_first_part[packed_tmp - 2]; + } + /* Identify the last particle for each bundle of tasks */ + for (int bid = 0; bid < nBundles_temp - 1; bid++) { + pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; + } + /* special treatment for the last bundle */ + if (nBundles_temp > 1) + pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; + else + pack_vars->bundle_last_part[0] = pack_vars->count_parts; + /*Copy arrays containing first and last part for each task to GPU*/ + cudaMemcpy(pack_vars->d_task_first_part, pack_vars->task_first_part, + 2 * tasks_packed * sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(pack_vars->d_task_last_part, pack_vars->task_last_part, + 2 * tasks_packed * sizeof(int), cudaMemcpyHostToDevice); + + /*Copy cell shifts to device*/ + cudaMemcpy(pack_vars->d_shiftx, pack_vars->shiftx, + 2 * tasks_packed * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(pack_vars->d_shifty, pack_vars->shifty, + 2 * tasks_packed * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(pack_vars->d_shiftz, pack_vars->shiftz, + 2 * tasks_packed * sizeof(double), cudaMemcpyHostToDevice); + + /* Launch the copies for each bundle and run the GPU kernel */ + /*We don't go into this loop if tasks_left_self == 1 as + nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ + // int max_parts = 0; + for (int bid = 0; bid < nBundles_temp; bid++) { + + int max_parts_i = 0; + int max_parts_j = 0; + int 
parts_in_bundle_ci = 0; + int parts_in_bundle_cj = 0; + // const int first_task = bid * pack_vars->bundle_size; + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { + if (tid < tasks_packed) { + /*Get an estimate for the max number of parts per cell in each bundle. + * Used for determining the number of GPU CUDA blocks*/ + const int tid_tmp = 2 * tid; + int count_i = pack_vars->task_last_part[tid_tmp] - + pack_vars->task_first_part[tid_tmp]; + parts_in_bundle_ci += count_i; + max_parts_i = max(max_parts_i, count_i); + int count_j = pack_vars->task_last_part[tid_tmp + 1] - + pack_vars->task_first_part[tid_tmp + 1]; + parts_in_bundle_cj += count_j; + max_parts_j = max(max_parts_j, count_j); + } + } + const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; + const int bundle_n_parts = + pack_vars->bundle_last_part[bid] - first_part_tmp_i; + cudaMemcpyAsync(&d_parts_aos[first_part_tmp_i], + &parts_aos[first_part_tmp_i], + bundle_n_parts * sizeof(struct part_aos_f), + cudaMemcpyHostToDevice, stream[bid]); #ifdef CUDA_DEBUG - cu_error = cudaPeekAtLastError(); // Get error code - if (cu_error != cudaSuccess) { - fprintf(stderr, - "CUDA error with self density kernel launch: %s cpuid id is: %i\n ", - cudaGetErrorString(cu_error), r->cpuid); - exit(0); - } + cudaError_t cu_error = + cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with pair density H2D async memcpy ci: %s cpuid id " + "is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + error("Something's up with your cuda code"); + } #endif - // Copy results back to CPU BUFFERS - cudaMemcpyAsync(&parts_aos[first_part_tmp_i], &d_parts_aos[first_part_tmp_i], - bundle_n_parts * sizeof(struct part_aos_f), cudaMemcpyDeviceToHost, stream[bid]); + const int tasksperbundle = pack_vars->tasksperbundle; + /* LAUNCH THE GPU KERNELS for ci & cj */ + int tid = 0; + int offset = bid * tasksperbundle; + int tasks_left = tasksperbundle; + if (bid == nBundles_temp - 1) { + tasks_left = tasks_packed - (nBundles_temp - 1) * tasksperbundle; + } + + // Setup 2d grid of GPU thread blocks for ci (number of tasks is + // the y dimension and max_parts is the x dimension + int numBlocks_y = tasks_left; + // int numBlocks_x = (max_parts_i + BLOCK_SIZE - 1) / BLOCK_SIZE; + int bundle_first_task = pack_vars->bundle_first_task_list[bid]; + const char *loop_type = "density"; + + /* Launch the kernel for ci using data for ci and cj */ + runner_dopairci_branch_density_gpu_aos_f( + d_parts_aos, pack_vars->d_task_first_part, pack_vars->d_task_last_part, + d_a, d_H, loop_type, stream[bid], bid, BLOCK_SIZE, tasks_packed, + tasksperbundle, max_parts_i, max_parts_j, numBlocks_y, tid, offset, + bundle_first_task, pack_vars->d_shiftx, pack_vars->d_shifty, + pack_vars->d_shiftz); + + /* Launch the kernel for ci using data for ci and cj */ + runner_dopaircj_branch_density_gpu_aos_f( + d_parts_aos, pack_vars->d_task_first_part, pack_vars->d_task_last_part, + d_a, d_H, loop_type, stream[bid], bid, BLOCK_SIZE, tasks_packed, + tasksperbundle, max_parts_i, max_parts_j, numBlocks_y, tid, offset, + bundle_first_task, pack_vars->d_shiftx, pack_vars->d_shifty, + pack_vars->d_shiftz); #ifdef CUDA_DEBUG - cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // - // Get error code - if (cu_error != cudaSuccess) { - fprintf(stderr, - "CUDA error with self density D2H memcpy: %s cpuid id is: %i\n ", - cudaGetErrorString(cu_error), r->cpuid); - error("Something's up with your cuda 
code"); - } + cu_error = cudaPeekAtLastError(); // Get error code + if (cu_error != cudaSuccess) { + fprintf( + stderr, + "CUDA error with self density kernel launch: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + exit(0); + } #endif - }/*End of looping over bundles to launch in streams*/ - /* Make sure all the kernels and copies back are finished */ - cudaDeviceSynchronize(); + // Copy results back to CPU BUFFERS + cudaMemcpyAsync(&parts_aos[first_part_tmp_i], + &d_parts_aos[first_part_tmp_i], + bundle_n_parts * sizeof(struct part_aos_f), + cudaMemcpyDeviceToHost, stream[bid]); - /*Time end of GPU work*/ - clock_gettime(CLOCK_REALTIME, &t1); - *gpu_time += (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - /*Time unpacking*/ - clock_gettime(CLOCK_REALTIME, &t0); - /* Now copy the data back from the CPU thread-local buffers to the cells */ - /* Pack length counter for use in unpacking */ - int pack_length_unpack=0; - for (int tid = 0; tid < tasks_packed; tid++) { - - /*grab cell and task pointers*/ - struct cell *cii = pack_vars->ci_list[tid]; - struct cell *cjj = pack_vars->cj_list[tid]; - struct task *tii = pack_vars->task_list[tid]; - - /*Let's lock ci*/ - while(cell_locktree(cii)) { - ; /* spin until we acquire the lock */ - } - /*Let's lock cj*/ - while(cell_locktree(cjj)) { - ; /* spin until we acquire the lock */ - } - /* Do the copy */ - runner_do_ci_cj_gpu_unpack_neat_aos_f(r, cii, cjj, parts_aos, 0, - &pack_length_unpack, tid, 2 * pack_vars->count_max_parts, e); - - /* Record things for debugging */ - cii->gpu_done_pair_f++; - cjj->gpu_done_pair_f++; - - tii->gpu_done = 1; - - /*schedule my dependencies (Only unpacks really)*/ - enqueue_dependencies(s, tii); - /*Signal sleeping runners*/ - signal_sleeping_runners(s, tii); - - /* Release the locks */ - cell_unlocktree(cii); - /* Release the locks */ - cell_unlocktree(cjj); - } - /* Zero counters for the next pack operations */ - pack_vars->count_parts = 0; - pack_vars->tasks_packed = 0; - /*Time end of unpacking*/ - clock_gettime(CLOCK_REALTIME, &t1); - *packing_time += (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +#ifdef CUDA_DEBUG + cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with self density D2H memcpy: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + error("Something's up with your cuda code"); + } +#endif + } /*End of looping over bundles to launch in streams*/ + + /* Make sure all the kernels and copies back are finished */ + cudaDeviceSynchronize(); + + /*Time end of GPU work*/ + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + /*Time unpacking*/ + clock_gettime(CLOCK_REALTIME, &t0); + /* Now copy the data back from the CPU thread-local buffers to the cells */ + /* Pack length counter for use in unpacking */ + int pack_length_unpack = 0; + for (int tid = 0; tid < tasks_packed; tid++) { + + /*grab cell and task pointers*/ + struct cell *cii = pack_vars->ci_list[tid]; + struct cell *cjj = pack_vars->cj_list[tid]; + struct task *tii = pack_vars->task_list[tid]; + + /*Let's lock ci*/ + while (cell_locktree(cii)) { + ; /* spin until we acquire the lock */ + } + /*Let's lock cj*/ + while (cell_locktree(cjj)) { + ; /* spin until we acquire the lock */ + } + /* Do the copy */ + runner_do_ci_cj_gpu_unpack_neat_aos_f(r, cii, cjj, parts_aos, 0, + &pack_length_unpack, tid, + 2 
* pack_vars->count_max_parts, e); + + /* Record things for debugging */ + cii->gpu_done_pair_f++; + cjj->gpu_done_pair_f++; + + tii->gpu_done = 1; + + /*schedule my dependencies (Only unpacks really)*/ + enqueue_dependencies(s, tii); + /*Signal sleeping runners*/ + signal_sleeping_runners(s, tii); + + /* Release the locks */ + cell_unlocktree(cii); + /* Release the locks */ + cell_unlocktree(cjj); + } + /* Zero counters for the next pack operations */ + pack_vars->count_parts = 0; + pack_vars->tasks_packed = 0; + /*Time end of unpacking*/ + clock_gettime(CLOCK_REALTIME, &t1); + *packing_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; } /*End of GPU work*/ -void runner_dopair1_launch_f4_f_one_memcpy(struct runner *r, struct scheduler *s, struct pack_vars_pair *pack_vars, - struct task *t, struct part_aos_f4_f_send *parts_send, struct part_aos_f4_f_recv *parts_recv, struct part_aos_f4_f_send *d_parts_send, - struct part_aos_f4_f_recv *d_parts_recv, cudaStream_t * stream, float d_a, float d_H, - struct engine *e, double *packing_time, double *gpu_time, double *unpack_time, int4 *fparti_fpartj_lparti_lpartj, - cudaEvent_t * pair_end){ - - struct timespec t0, t1, tp0, tp1; // - clock_gettime(CLOCK_REALTIME, &t0); - - /* Identify the number of GPU bundles to run in ideal case*/ - int nBundles_temp = pack_vars->nBundles; - /*How many tasks have we packed?*/ - const int tasks_packed = pack_vars->tasks_packed; - - /*How many tasks should be in a bundle?*/ - const int bundle_size = pack_vars->bundle_size; - - /*tasks-packed needs decrementing before calculating packed_tmp as it was incremented in runner_dopair1_pack*/ -// const int packed_tmp = 2 * (tasks_packed - 1); - - /* Special case for incomplete bundles (when having leftover tasks not enough to fill a bundle) */ - if (pack_vars->launch_leftovers) { - nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; - if(tasks_packed == 0) error("zero pair tasks packed but somehow got into GPU loop"); -// pack_vars->bundle_first_part[nBundles_temp] = pack_vars->task_first_part[packed_tmp - 2]; - pack_vars->bundle_first_part[nBundles_temp] = fparti_fpartj_lparti_lpartj[tasks_packed - 1].x; - } - /* Identify the last particle for each bundle of tasks */ - for (int bid = 0; bid < nBundles_temp - 1; bid++) { - pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; - } - /* special treatment for the last bundle */ - if(nBundles_temp > 1)pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; - else pack_vars->bundle_last_part[0] = pack_vars->count_parts; - - /* Launch the copies for each bundle and run the GPU kernel */ - /*We don't go into this loop if tasks_left_self == 1 as - nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ -// int max_parts = 0; - for (int bid = 0; bid < nBundles_temp; bid++) { - - int max_parts_i = 0; - int max_parts_j = 0; - int parts_in_bundle_ci = 0; - int parts_in_bundle_cj = 0; -// const int first_task = bid * pack_vars->bundle_size; -// int last_task = (bid + 1) * bundle_size; - for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; - tid++) { - if (tid < tasks_packed) { - /*Get an estimate for the max number of parts per cell in each bundle. 
- * Used for determining the number of GPU CUDA blocks*/ - int count_i = fparti_fpartj_lparti_lpartj[tid].z - - fparti_fpartj_lparti_lpartj[tid].x; - parts_in_bundle_ci += count_i; - max_parts_i = max(max_parts_i, count_i); - int count_j = fparti_fpartj_lparti_lpartj[tid].w - - fparti_fpartj_lparti_lpartj[tid].y; - parts_in_bundle_cj += count_j; - max_parts_j = max(max_parts_j, count_j); - -// last_task = tid; - } +void runner_dopair1_launch_f4_f_one_memcpy( + struct runner *r, struct scheduler *s, struct pack_vars_pair *pack_vars, + struct task *t, struct part_aos_f4_f_send *parts_send, + struct part_aos_f4_f_recv *parts_recv, + struct part_aos_f4_f_send *d_parts_send, + struct part_aos_f4_f_recv *d_parts_recv, cudaStream_t *stream, float d_a, + float d_H, struct engine *e, double *packing_time, double *gpu_time, + double *unpack_time, int4 *fparti_fpartj_lparti_lpartj, + cudaEvent_t *pair_end) { + + struct timespec t0, t1, tp0, tp1; // + clock_gettime(CLOCK_REALTIME, &t0); + + /* Identify the number of GPU bundles to run in ideal case*/ + int nBundles_temp = pack_vars->nBundles; + /*How many tasks have we packed?*/ + const int tasks_packed = pack_vars->tasks_packed; + + /*How many tasks should be in a bundle?*/ + const int bundle_size = pack_vars->bundle_size; + + /*tasks-packed needs decrementing before calculating packed_tmp as it was + * incremented in runner_dopair1_pack*/ + // const int packed_tmp = 2 * (tasks_packed - 1); + + /* Special case for incomplete bundles (when having leftover tasks not enough + * to fill a bundle) */ + if (pack_vars->launch_leftovers) { + nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; + if (tasks_packed == 0) + error("zero pair tasks packed but somehow got into GPU loop"); + // pack_vars->bundle_first_part[nBundles_temp] = + //pack_vars->task_first_part[packed_tmp - 2]; + pack_vars->bundle_first_part[nBundles_temp] = + fparti_fpartj_lparti_lpartj[tasks_packed - 1].x; + } + /* Identify the last particle for each bundle of tasks */ + for (int bid = 0; bid < nBundles_temp - 1; bid++) { + pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; + } + /* special treatment for the last bundle */ + if (nBundles_temp > 1) + pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; + else + pack_vars->bundle_last_part[0] = pack_vars->count_parts; + + /* Launch the copies for each bundle and run the GPU kernel */ + /*We don't go into this loop if tasks_left_self == 1 as + nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ + // int max_parts = 0; + for (int bid = 0; bid < nBundles_temp; bid++) { + + int max_parts_i = 0; + int max_parts_j = 0; + int parts_in_bundle_ci = 0; + int parts_in_bundle_cj = 0; + // const int first_task = bid * pack_vars->bundle_size; + // int last_task = (bid + 1) * bundle_size; + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { + if (tid < tasks_packed) { + /*Get an estimate for the max number of parts per cell in each bundle. 
+ * Used for determining the number of GPU CUDA blocks*/ + int count_i = fparti_fpartj_lparti_lpartj[tid].z - + fparti_fpartj_lparti_lpartj[tid].x; + parts_in_bundle_ci += count_i; + max_parts_i = max(max_parts_i, count_i); + int count_j = fparti_fpartj_lparti_lpartj[tid].w - + fparti_fpartj_lparti_lpartj[tid].y; + parts_in_bundle_cj += count_j; + max_parts_j = max(max_parts_j, count_j); + + // last_task = tid; } - const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; - const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp_i; + } + const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; + const int bundle_n_parts = + pack_vars->bundle_last_part[bid] - first_part_tmp_i; - cudaMemcpyAsync(&d_parts_send[first_part_tmp_i], &parts_send[first_part_tmp_i], - bundle_n_parts * sizeof(struct part_aos_f4_f_send), cudaMemcpyHostToDevice, stream[bid]); + cudaMemcpyAsync(&d_parts_send[first_part_tmp_i], + &parts_send[first_part_tmp_i], + bundle_n_parts * sizeof(struct part_aos_f4_f_send), + cudaMemcpyHostToDevice, stream[bid]); #ifdef CUDA_DEBUG - cudaError_t cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // - // Get error code - if (cu_error != cudaSuccess) { - fprintf(stderr, - "CUDA error with pair density H2D async memcpy ci: %s cpuid id is: %i\n ", - cudaGetErrorString(cu_error), r->cpuid); - error("Something's up with your cuda code"); - } + cudaError_t cu_error = + cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with pair density H2D async memcpy ci: %s cpuid id " + "is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + error("Something's up with your cuda code"); + } #endif -// const int tasksperbundle = pack_vars->tasksperbundle; - /* LAUNCH THE GPU KERNELS for ci & cj */ -// int tid = 0; -// int offset = bid * tasksperbundle; -// int tasks_left = tasksperbundle; -// if (bid == nBundles_temp - 1) { -// tasks_left = -// tasks_packed - (nBundles_temp - 1) * tasksperbundle; -// } - - // Setup 2d grid of GPU thread blocks for ci (number of tasks is - // the y dimension and max_parts is the x dimension - int numBlocks_y = 0;//tasks_left; - int numBlocks_x = (bundle_n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; - int bundle_part_0 = pack_vars->bundle_first_part[bid]; -// int bundle_first_task = pack_vars->bundle_first_task_list[bid]; -// fprintf(stderr, "bundle_part_0 %i bundle_first_task %i\n", bundle_part_0, bundle_first_task); - - /* Launch the kernel for ci using data for ci and cj */ - runner_dopair_branch_force_gpu_aos_f4(d_parts_send, d_parts_recv, - d_a, d_H, stream[bid], numBlocks_x, numBlocks_y, bundle_part_0, bundle_n_parts); + // const int tasksperbundle = pack_vars->tasksperbundle; + /* LAUNCH THE GPU KERNELS for ci & cj */ + // int tid = 0; + // int offset = bid * tasksperbundle; + // int tasks_left = tasksperbundle; + // if (bid == nBundles_temp - 1) { + // tasks_left = + // tasks_packed - (nBundles_temp - 1) * tasksperbundle; + // } + + // Setup 2d grid of GPU thread blocks for ci (number of tasks is + // the y dimension and max_parts is the x dimension + int numBlocks_y = 0; // tasks_left; + int numBlocks_x = (bundle_n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + int bundle_part_0 = pack_vars->bundle_first_part[bid]; + // int bundle_first_task = pack_vars->bundle_first_task_list[bid]; + // fprintf(stderr, "bundle_part_0 %i bundle_first_task %i\n", + // bundle_part_0, bundle_first_task); + + /* Launch the kernel for ci using data for ci and cj */ + 
runner_dopair_branch_force_gpu_aos_f4(d_parts_send, d_parts_recv, d_a, d_H, + stream[bid], numBlocks_x, numBlocks_y, + bundle_part_0, bundle_n_parts); #ifdef CUDA_DEBUG - cu_error = cudaPeekAtLastError(); // Get error code - if (cu_error != cudaSuccess) { - fprintf(stderr, - "CUDA error with pair density kernel launch: %s cpuid id is: %i\n " - "nbx %i nby %i max_parts_i %i max_parts_j %i\n", - cudaGetErrorString(cu_error), r->cpuid, numBlocks_x, numBlocks_y, max_parts_i, max_parts_j); - exit(0); - } + cu_error = cudaPeekAtLastError(); // Get error code + if (cu_error != cudaSuccess) { + fprintf( + stderr, + "CUDA error with pair density kernel launch: %s cpuid id is: %i\n " + "nbx %i nby %i max_parts_i %i max_parts_j %i\n", + cudaGetErrorString(cu_error), r->cpuid, numBlocks_x, numBlocks_y, + max_parts_i, max_parts_j); + exit(0); + } #endif - // Copy results back to CPU BUFFERS - cudaMemcpyAsync(&parts_recv[first_part_tmp_i], &d_parts_recv[first_part_tmp_i], - bundle_n_parts * sizeof(struct part_aos_f4_f_recv), cudaMemcpyDeviceToHost, stream[bid]); - cudaEventRecord(pair_end[bid], stream[bid]); + // Copy results back to CPU BUFFERS + cudaMemcpyAsync(&parts_recv[first_part_tmp_i], + &d_parts_recv[first_part_tmp_i], + bundle_n_parts * sizeof(struct part_aos_f4_f_recv), + cudaMemcpyDeviceToHost, stream[bid]); + cudaEventRecord(pair_end[bid], stream[bid]); #ifdef CUDA_DEBUG - cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // - // Get error code - if (cu_error != cudaSuccess) { - fprintf(stderr, - "CUDA error with self density D2H memcpy: %s cpuid id is: %i\n ", - cudaGetErrorString(cu_error), r->cpuid); - error("Something's up with your cuda code"); - } + cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with self density D2H memcpy: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + error("Something's up with your cuda code"); + } #endif - }/*End of looping over bundles to launch in streams*/ - - /* Make sure all the kernels and copies back are finished */ -// cudaDeviceSynchronize(); - - /*Time end of GPU work*/ - clock_gettime(CLOCK_REALTIME, &t1); - *gpu_time += (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - /* Now copy the data back from the CPU thread-local buffers to the cells */ - /* Pack length counter for use in unpacking */ - int pack_length_unpack=0; - for (int bid = 0; bid < nBundles_temp; bid++){ - /*Time unpacking*/ - clock_gettime(CLOCK_REALTIME, &t0); - -// cudaStreamSynchronize(stream[bid]); - cudaEventSynchronize(pair_end[bid]); - - clock_gettime(CLOCK_REALTIME, &t1); - *gpu_time += (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - - /*Time unpacking*/ -// clock_gettime(CLOCK_REALTIME, &tp0); -// int bundle_first_task = pack_vars->bundle_first_task_list[bid]; - - for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { - - if(tid < tasks_packed){ - clock_gettime(CLOCK_REALTIME, &tp0); - /*grab cell and task pointers*/ - struct cell *cii = pack_vars->ci_list[tid]; - struct cell *cjj = pack_vars->cj_list[tid]; - struct task *tii = pack_vars->task_list[tid]; - /*Let's lock ci*/ - while(cell_locktree(cii)) { - ; /* spin until we acquire the lock */ - } - /*Let's lock cj*/ - while(cell_locktree(cjj)) { - ; /* spin until we acquire the lock */ - } - /* Do the copy */ - runner_do_ci_cj_gpu_unpack_neat_aos_f4_f(r, cii, cjj, parts_recv, 0, - &pack_length_unpack, tid, 2 * pack_vars->count_max_parts, 
e); - - /* Record things for debugging */ - cii->gpu_done_pair_f++; - cjj->gpu_done_pair_f++; - -// /* Release the locks */ - cell_unlocktree(cii); -// /* Release the locks */ - cell_unlocktree(cjj); - - /*Time end of unpacking*/ - clock_gettime(CLOCK_REALTIME, &tp1); - *unpack_time += (tp1.tv_sec - tp0.tv_sec) + - (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; - - /*schedule my dependencies (Only unpacks really)*/ - enqueue_dependencies(s, tii); - /*Signal sleeping runners*/ - signal_sleeping_runners(s, tii); - - tii->gpu_done = 1; - - - } - } - } - /* Zero counters for the next pack operations */ - pack_vars->count_parts = 0; - pack_vars->tasks_packed = 0; -// /*Time end of unpacking*/ -// clock_gettime(CLOCK_REALTIME, &t1); -// *packing_time += (t1.tv_sec - t0.tv_sec) + -// (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - } /*End of GPU work*/ - -void runner_dopair1_launch_f4_f_mcpy_Ker_mcpy(struct runner *r, struct scheduler *s, struct pack_vars_pair *pack_vars, - struct task *t, struct part_aos_f4_f_send *parts_send, struct part_aos_f4_f_recv *parts_recv, struct part_aos_f4_f_send *d_parts_send, - struct part_aos_f4_f_recv *d_parts_recv, cudaStream_t * stream, float d_a, float d_H, - struct engine *e, double *packing_time, double *gpu_time, double *unpack_time, int4 *fparti_fpartj_lparti_lpartj, - cudaEvent_t * pair_end){ - - struct timespec t0, t1, tp0, tp1; // + } /*End of looping over bundles to launch in streams*/ + + /* Make sure all the kernels and copies back are finished */ + // cudaDeviceSynchronize(); + + /*Time end of GPU work*/ + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + /* Now copy the data back from the CPU thread-local buffers to the cells */ + /* Pack length counter for use in unpacking */ + int pack_length_unpack = 0; + for (int bid = 0; bid < nBundles_temp; bid++) { + /*Time unpacking*/ clock_gettime(CLOCK_REALTIME, &t0); - /* Identify the number of GPU bundles to run in ideal case*/ - int nBundles_temp = pack_vars->nBundles; - /*How many tasks have we packed?*/ - const int tasks_packed = pack_vars->tasks_packed; - - /*How many tasks should be in a bundle?*/ - const int bundle_size = pack_vars->bundle_size; - - /* Special case for incomplete bundles (when having leftover tasks not enough to fill a bundle) */ - if (pack_vars->launch_leftovers) { - nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; - if(tasks_packed == 0) error("zero pair tasks packed but somehow got into GPU loop"); -// pack_vars->bundle_first_part[nBundles_temp] = pack_vars->task_first_part[packed_tmp - 2]; - pack_vars->bundle_first_part[nBundles_temp] = fparti_fpartj_lparti_lpartj[tasks_packed - 1].x; - } - /* Identify the last particle for each bundle of tasks */ - for (int bid = 0; bid < nBundles_temp - 1; bid++) { - pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; - } - /* special treatment for the last bundle */ - if(nBundles_temp > 1)pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; - else pack_vars->bundle_last_part[0] = pack_vars->count_parts; - - /* Launch the copies for each bundle and run the GPU kernel */ - /*We don't go into this loop if tasks_left_self == 1 as - nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ - for (int bid = 0; bid < nBundles_temp; bid++) { - - int max_parts_i = 0; - int max_parts_j = 0; - int parts_in_bundle_ci = 0; - int parts_in_bundle_cj = 0; -// const int first_task = bid * pack_vars->bundle_size; -// int last_task = (bid + 1) * 
bundle_size; - for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; - tid++) { - if (tid < tasks_packed) { - /*Get an estimate for the max number of parts per cell in each bundle. - * Used for determining the number of GPU CUDA blocks*/ - int count_i = fparti_fpartj_lparti_lpartj[tid].z - - fparti_fpartj_lparti_lpartj[tid].x; - parts_in_bundle_ci += count_i; - max_parts_i = max(max_parts_i, count_i); - int count_j = fparti_fpartj_lparti_lpartj[tid].w - - fparti_fpartj_lparti_lpartj[tid].y; - parts_in_bundle_cj += count_j; - max_parts_j = max(max_parts_j, count_j); - -// last_task = tid; + // cudaStreamSynchronize(stream[bid]); + cudaEventSynchronize(pair_end[bid]); + + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + + /*Time unpacking*/ + // clock_gettime(CLOCK_REALTIME, &tp0); + // int bundle_first_task = pack_vars->bundle_first_task_list[bid]; + + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { + + if (tid < tasks_packed) { + clock_gettime(CLOCK_REALTIME, &tp0); + /*grab cell and task pointers*/ + struct cell *cii = pack_vars->ci_list[tid]; + struct cell *cjj = pack_vars->cj_list[tid]; + struct task *tii = pack_vars->task_list[tid]; + /*Let's lock ci*/ + while (cell_locktree(cii)) { + ; /* spin until we acquire the lock */ + } + /*Let's lock cj*/ + while (cell_locktree(cjj)) { + ; /* spin until we acquire the lock */ } + /* Do the copy */ + runner_do_ci_cj_gpu_unpack_neat_aos_f4_f( + r, cii, cjj, parts_recv, 0, &pack_length_unpack, tid, + 2 * pack_vars->count_max_parts, e); + + /* Record things for debugging */ + cii->gpu_done_pair_f++; + cjj->gpu_done_pair_f++; + + // /* Release the locks */ + cell_unlocktree(cii); + // /* Release the locks */ + cell_unlocktree(cjj); + + /*Time end of unpacking*/ + clock_gettime(CLOCK_REALTIME, &tp1); + *unpack_time += (tp1.tv_sec - tp0.tv_sec) + + (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; + + /*schedule my dependencies (Only unpacks really)*/ + enqueue_dependencies(s, tii); + /*Signal sleeping runners*/ + signal_sleeping_runners(s, tii); + + tii->gpu_done = 1; + } + } + } + /* Zero counters for the next pack operations */ + pack_vars->count_parts = 0; + pack_vars->tasks_packed = 0; + // /*Time end of unpacking*/ + // clock_gettime(CLOCK_REALTIME, &t1); + // *packing_time += (t1.tv_sec - t0.tv_sec) + + // (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +} /*End of GPU work*/ + +void runner_dopair1_launch_f4_f_mcpy_Ker_mcpy( + struct runner *r, struct scheduler *s, struct pack_vars_pair *pack_vars, + struct task *t, struct part_aos_f4_f_send *parts_send, + struct part_aos_f4_f_recv *parts_recv, + struct part_aos_f4_f_send *d_parts_send, + struct part_aos_f4_f_recv *d_parts_recv, cudaStream_t *stream, float d_a, + float d_H, struct engine *e, double *packing_time, double *gpu_time, + double *unpack_time, int4 *fparti_fpartj_lparti_lpartj, + cudaEvent_t *pair_end) { + + struct timespec t0, t1, tp0, tp1; // + clock_gettime(CLOCK_REALTIME, &t0); + + /* Identify the number of GPU bundles to run in ideal case*/ + int nBundles_temp = pack_vars->nBundles; + /*How many tasks have we packed?*/ + const int tasks_packed = pack_vars->tasks_packed; + + /*How many tasks should be in a bundle?*/ + const int bundle_size = pack_vars->bundle_size; + + /* Special case for incomplete bundles (when having leftover tasks not enough + * to fill a bundle) */ + if (pack_vars->launch_leftovers) { + nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; + if 
(tasks_packed == 0) + error("zero pair tasks packed but somehow got into GPU loop"); + // pack_vars->bundle_first_part[nBundles_temp] = + //pack_vars->task_first_part[packed_tmp - 2]; + pack_vars->bundle_first_part[nBundles_temp] = + fparti_fpartj_lparti_lpartj[tasks_packed - 1].x; + } + /* Identify the last particle for each bundle of tasks */ + for (int bid = 0; bid < nBundles_temp - 1; bid++) { + pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; + } + /* special treatment for the last bundle */ + if (nBundles_temp > 1) + pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; + else + pack_vars->bundle_last_part[0] = pack_vars->count_parts; + + /* Launch the copies for each bundle and run the GPU kernel */ + /*We don't go into this loop if tasks_left_self == 1 as + nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ + for (int bid = 0; bid < nBundles_temp; bid++) { + + int max_parts_i = 0; + int max_parts_j = 0; + int parts_in_bundle_ci = 0; + int parts_in_bundle_cj = 0; + // const int first_task = bid * pack_vars->bundle_size; + // int last_task = (bid + 1) * bundle_size; + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { + if (tid < tasks_packed) { + /*Get an estimate for the max number of parts per cell in each bundle. + * Used for determining the number of GPU CUDA blocks*/ + int count_i = fparti_fpartj_lparti_lpartj[tid].z - + fparti_fpartj_lparti_lpartj[tid].x; + parts_in_bundle_ci += count_i; + max_parts_i = max(max_parts_i, count_i); + int count_j = fparti_fpartj_lparti_lpartj[tid].w - + fparti_fpartj_lparti_lpartj[tid].y; + parts_in_bundle_cj += count_j; + max_parts_j = max(max_parts_j, count_j); + + // last_task = tid; } - const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; - const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp_i; + } + const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; + const int bundle_n_parts = + pack_vars->bundle_last_part[bid] - first_part_tmp_i; - cudaMemcpyAsync(&d_parts_send[first_part_tmp_i], &parts_send[first_part_tmp_i], - bundle_n_parts * sizeof(struct part_aos_f4_f_send), cudaMemcpyHostToDevice, stream[bid]); + cudaMemcpyAsync(&d_parts_send[first_part_tmp_i], + &parts_send[first_part_tmp_i], + bundle_n_parts * sizeof(struct part_aos_f4_f_send), + cudaMemcpyHostToDevice, stream[bid]); #ifdef CUDA_DEBUG - cudaError_t cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // - // Get error code - if (cu_error != cudaSuccess) { - fprintf(stderr, - "CUDA error with pair density H2D async memcpy ci: %s cpuid id is: %i\n ", - cudaGetErrorString(cu_error), r->cpuid); - error("Something's up with your cuda code"); - } + cudaError_t cu_error = + cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with pair density H2D async memcpy ci: %s cpuid id " + "is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + error("Something's up with your cuda code"); + } #endif - } - for (int bid = 0; bid < nBundles_temp; bid++) { - - int max_parts_i = 0; - int max_parts_j = 0; - int parts_in_bundle_ci = 0; - int parts_in_bundle_cj = 0; -// const int first_task = bid * pack_vars->bundle_size; -// int last_task = (bid + 1) * bundle_size; - for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; - tid++) { - if (tid < tasks_packed) { - /*Get an estimate for the max number of parts per cell in each bundle. 
- * Used for determining the number of GPU CUDA blocks*/ - int count_i = fparti_fpartj_lparti_lpartj[tid].z - - fparti_fpartj_lparti_lpartj[tid].x; - parts_in_bundle_ci += count_i; - max_parts_i = max(max_parts_i, count_i); - int count_j = fparti_fpartj_lparti_lpartj[tid].w - - fparti_fpartj_lparti_lpartj[tid].y; - parts_in_bundle_cj += count_j; - max_parts_j = max(max_parts_j, count_j); - -// last_task = tid; - } - } - const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; - const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp_i; -////////////////////////////////// - /* LAUNCH THE GPU KERNELS for ci & cj */ - // Setup 2d grid of GPU thread blocks for ci (number of tasks is - // the y dimension and max_parts is the x dimension - int numBlocks_y = 0;//tasks_left; - int numBlocks_x = (bundle_n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; - int bundle_part_0 = pack_vars->bundle_first_part[bid]; -// int bundle_first_task = pack_vars->bundle_first_task_list[bid]; -// fprintf(stderr, "bundle_part_0 %i bundle_first_task %i\n", bundle_part_0, bundle_first_task); - - /* Launch the kernel for ci using data for ci and cj */ - runner_dopair_branch_force_gpu_aos_f4(d_parts_send, d_parts_recv, - d_a, d_H, stream[bid], numBlocks_x, numBlocks_y, bundle_part_0, bundle_n_parts); + } + for (int bid = 0; bid < nBundles_temp; bid++) { + + int max_parts_i = 0; + int max_parts_j = 0; + int parts_in_bundle_ci = 0; + int parts_in_bundle_cj = 0; + // const int first_task = bid * pack_vars->bundle_size; + // int last_task = (bid + 1) * bundle_size; + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { + if (tid < tasks_packed) { + /*Get an estimate for the max number of parts per cell in each bundle. + * Used for determining the number of GPU CUDA blocks*/ + int count_i = fparti_fpartj_lparti_lpartj[tid].z - + fparti_fpartj_lparti_lpartj[tid].x; + parts_in_bundle_ci += count_i; + max_parts_i = max(max_parts_i, count_i); + int count_j = fparti_fpartj_lparti_lpartj[tid].w - + fparti_fpartj_lparti_lpartj[tid].y; + parts_in_bundle_cj += count_j; + max_parts_j = max(max_parts_j, count_j); + + // last_task = tid; + } + } + const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; + const int bundle_n_parts = + pack_vars->bundle_last_part[bid] - first_part_tmp_i; + ////////////////////////////////// + /* LAUNCH THE GPU KERNELS for ci & cj */ + // Setup 2d grid of GPU thread blocks for ci (number of tasks is + // the y dimension and max_parts is the x dimension + int numBlocks_y = 0; // tasks_left; + int numBlocks_x = (bundle_n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + int bundle_part_0 = pack_vars->bundle_first_part[bid]; + // int bundle_first_task = pack_vars->bundle_first_task_list[bid]; + // fprintf(stderr, "bundle_part_0 %i bundle_first_task %i\n", + // bundle_part_0, bundle_first_task); + + /* Launch the kernel for ci using data for ci and cj */ + runner_dopair_branch_force_gpu_aos_f4(d_parts_send, d_parts_recv, d_a, d_H, + stream[bid], numBlocks_x, numBlocks_y, + bundle_part_0, bundle_n_parts); #ifdef CUDA_DEBUG - cudaError_t cu_error = cudaPeekAtLastError(); // Get error code - if (cu_error != cudaSuccess) { - fprintf(stderr, - "CUDA error with pair density kernel launch: %s cpuid id is: %i\n " - "nbx %i nby %i max_parts_i %i max_parts_j %i\n", - cudaGetErrorString(cu_error), r->cpuid, numBlocks_x, numBlocks_y, max_parts_i, max_parts_j); - exit(0); - } + cudaError_t cu_error = cudaPeekAtLastError(); // Get error code + if (cu_error != cudaSuccess) { + fprintf( + 
stderr, + "CUDA error with pair density kernel launch: %s cpuid id is: %i\n " + "nbx %i nby %i max_parts_i %i max_parts_j %i\n", + cudaGetErrorString(cu_error), r->cpuid, numBlocks_x, numBlocks_y, + max_parts_i, max_parts_j); + exit(0); + } #endif - } - - for (int bid = 0; bid < nBundles_temp; bid++) { - - int max_parts_i = 0; - int max_parts_j = 0; - int parts_in_bundle_ci = 0; - int parts_in_bundle_cj = 0; - for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; - tid++) { - if (tid < tasks_packed) { - /*Get an estimate for the max number of parts per cell in each bundle. - * Used for determining the number of GPU CUDA blocks*/ - int count_i = fparti_fpartj_lparti_lpartj[tid].z - - fparti_fpartj_lparti_lpartj[tid].x; - parts_in_bundle_ci += count_i; - max_parts_i = max(max_parts_i, count_i); - int count_j = fparti_fpartj_lparti_lpartj[tid].w - - fparti_fpartj_lparti_lpartj[tid].y; - parts_in_bundle_cj += count_j; - max_parts_j = max(max_parts_j, count_j); - } - } - const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; - const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp_i; -/////////////////////////////////////////////////////////////////// - // Copy results back to CPU BUFFERS - cudaMemcpyAsync(&parts_recv[first_part_tmp_i], &d_parts_recv[first_part_tmp_i], - bundle_n_parts * sizeof(struct part_aos_f4_f_recv), cudaMemcpyDeviceToHost, stream[bid]); - cudaEventRecord(pair_end[bid], stream[bid]); + } + + for (int bid = 0; bid < nBundles_temp; bid++) { + + int max_parts_i = 0; + int max_parts_j = 0; + int parts_in_bundle_ci = 0; + int parts_in_bundle_cj = 0; + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { + if (tid < tasks_packed) { + /*Get an estimate for the max number of parts per cell in each bundle. 
+ * Used for determining the number of GPU CUDA blocks*/ + int count_i = fparti_fpartj_lparti_lpartj[tid].z - + fparti_fpartj_lparti_lpartj[tid].x; + parts_in_bundle_ci += count_i; + max_parts_i = max(max_parts_i, count_i); + int count_j = fparti_fpartj_lparti_lpartj[tid].w - + fparti_fpartj_lparti_lpartj[tid].y; + parts_in_bundle_cj += count_j; + max_parts_j = max(max_parts_j, count_j); + } + } + const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; + const int bundle_n_parts = + pack_vars->bundle_last_part[bid] - first_part_tmp_i; + /////////////////////////////////////////////////////////////////// + // Copy results back to CPU BUFFERS + cudaMemcpyAsync(&parts_recv[first_part_tmp_i], + &d_parts_recv[first_part_tmp_i], + bundle_n_parts * sizeof(struct part_aos_f4_f_recv), + cudaMemcpyDeviceToHost, stream[bid]); + cudaEventRecord(pair_end[bid], stream[bid]); #ifdef CUDA_DEBUG - cudaError_t cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // - // Get error code - if (cu_error != cudaSuccess) { - fprintf(stderr, - "CUDA error with self density D2H memcpy: %s cpuid id is: %i\n ", - cudaGetErrorString(cu_error), r->cpuid); - error("Something's up with your cuda code"); - } + cudaError_t cu_error = + cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code + if (cu_error != cudaSuccess) { + fprintf(stderr, + "CUDA error with self density D2H memcpy: %s cpuid id is: %i\n ", + cudaGetErrorString(cu_error), r->cpuid); + error("Something's up with your cuda code"); + } #endif - }/*End of looping over bundles to launch in streams*/ + } /*End of looping over bundles to launch in streams*/ + + /* Make sure all the kernels and copies back are finished */ + // cudaDeviceSynchronize(); + + /*Time end of GPU work*/ + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + /*Time unpacking*/ + // clock_gettime(CLOCK_REALTIME, &t0); + /* Now copy the data back from the CPU thread-local buffers to the cells */ + /* Pack length counter for use in unpacking */ + int pack_length_unpack = 0; + for (int bid = 0; bid < nBundles_temp; bid++) { + clock_gettime(CLOCK_REALTIME, &t0); - /* Make sure all the kernels and copies back are finished */ -// cudaDeviceSynchronize(); + // cudaStreamSynchronize(stream[bid]); + cudaEventSynchronize(pair_end[bid]); + + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - /*Time end of GPU work*/ - clock_gettime(CLOCK_REALTIME, &t1); - *gpu_time += (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; /*Time unpacking*/ -// clock_gettime(CLOCK_REALTIME, &t0); - /* Now copy the data back from the CPU thread-local buffers to the cells */ - /* Pack length counter for use in unpacking */ - int pack_length_unpack=0; - for (int bid = 0; bid < nBundles_temp; bid++){ - clock_gettime(CLOCK_REALTIME, &t0); - -// cudaStreamSynchronize(stream[bid]); - cudaEventSynchronize(pair_end[bid]); - - clock_gettime(CLOCK_REALTIME, &t1); - *gpu_time += (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - - /*Time unpacking*/ -// clock_gettime(CLOCK_REALTIME, &tp0); - - for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { - - if(tid < tasks_packed){ - - /*grab cell and task pointers*/ - struct cell *cii = pack_vars->ci_list[tid]; - struct cell *cjj = pack_vars->cj_list[tid]; - struct task *tii = pack_vars->task_list[tid]; - - /*Let's lock ci*/ - while(cell_locktree(cii)) { - ; /* spin 
until we acquire the lock */ - } - /*Let's lock cj*/ - while(cell_locktree(cjj)) { - ; /* spin until we acquire the lock */ - } - /* Do the copy */ - /*Time unpacking*/ - clock_gettime(CLOCK_REALTIME, &tp0); - runner_do_ci_cj_gpu_unpack_neat_aos_f4_f(r, cii, cjj, parts_recv, 0, - &pack_length_unpack, tid, 2 * pack_vars->count_max_parts, e); - - /* Record things for debugging */ - cii->gpu_done_pair_f++; - cjj->gpu_done_pair_f++; - - tii->gpu_done = 1; - /*Time end of unpacking*/ - clock_gettime(CLOCK_REALTIME, &tp1); - *unpack_time += (tp1.tv_sec - tp0.tv_sec) + - (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; - /*schedule my dependencies (Only unpacks really)*/ - enqueue_dependencies(s, tii); - /*Signal sleeping runners*/ - signal_sleeping_runners(s, tii); - - /* Release the locks */ - cell_unlocktree(cii); - /* Release the locks */ - cell_unlocktree(cjj); - - } - } - /*Time end of unpacking*/ -// clock_gettime(CLOCK_REALTIME, &tp1); -// *unpack_time += (tp1.tv_sec - tp0.tv_sec) + -// (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; -// *packing_time += (tp1.tv_sec - tp0.tv_sec) + -// (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; -// *packing_time += (tp1.tv_sec - tp0.tv_sec) + -// (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; - } - /* Zero counters for the next pack operations */ - pack_vars->count_parts = 0; - pack_vars->tasks_packed = 0; - /*Time end of unpacking*/ -// clock_gettime(CLOCK_REALTIME, &t1); -// *packing_time += (t1.tv_sec - t0.tv_sec) + -// (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - } /*End of GPU work*/ + // clock_gettime(CLOCK_REALTIME, &tp0); + + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { + if (tid < tasks_packed) { + + /*grab cell and task pointers*/ + struct cell *cii = pack_vars->ci_list[tid]; + struct cell *cjj = pack_vars->cj_list[tid]; + struct task *tii = pack_vars->task_list[tid]; + + /*Let's lock ci*/ + while (cell_locktree(cii)) { + ; /* spin until we acquire the lock */ + } + /*Let's lock cj*/ + while (cell_locktree(cjj)) { + ; /* spin until we acquire the lock */ + } + /* Do the copy */ + /*Time unpacking*/ + clock_gettime(CLOCK_REALTIME, &tp0); + runner_do_ci_cj_gpu_unpack_neat_aos_f4_f( + r, cii, cjj, parts_recv, 0, &pack_length_unpack, tid, + 2 * pack_vars->count_max_parts, e); + + /* Record things for debugging */ + cii->gpu_done_pair_f++; + cjj->gpu_done_pair_f++; + + tii->gpu_done = 1; + /*Time end of unpacking*/ + clock_gettime(CLOCK_REALTIME, &tp1); + *unpack_time += (tp1.tv_sec - tp0.tv_sec) + + (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; + /*schedule my dependencies (Only unpacks really)*/ + enqueue_dependencies(s, tii); + /*Signal sleeping runners*/ + signal_sleeping_runners(s, tii); + + /* Release the locks */ + cell_unlocktree(cii); + /* Release the locks */ + cell_unlocktree(cjj); + } + } + /*Time end of unpacking*/ + // clock_gettime(CLOCK_REALTIME, &tp1); + // *unpack_time += (tp1.tv_sec - tp0.tv_sec) + + // (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; + // *packing_time += (tp1.tv_sec - tp0.tv_sec) + + // (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; + // *packing_time += (tp1.tv_sec - tp0.tv_sec) + + // (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; + } + /* Zero counters for the next pack operations */ + pack_vars->count_parts = 0; + pack_vars->tasks_packed = 0; + /*Time end of unpacking*/ + // clock_gettime(CLOCK_REALTIME, &t1); + // *packing_time += (t1.tv_sec - t0.tv_sec) + + // (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +} /*End of GPU work*/ diff --git a/src/runner_doiact_functions_limiter.h 
b/src/runner_doiact_functions_limiter.h index 0d7e07de6a..44f6572b42 100644 --- a/src/runner_doiact_functions_limiter.h +++ b/src/runner_doiact_functions_limiter.h @@ -123,7 +123,7 @@ void DOPAIR1_NAIVE(struct runner *r, struct cell *restrict ci, } } /* loop over the parts in cj. */ - } /* loop over the parts in ci. */ + } /* loop over the parts in ci. */ TIMER_TOC(TIMER_DOPAIR); } @@ -216,7 +216,7 @@ void DOSELF1_NAIVE(struct runner *r, struct cell *restrict c) { IACT_NONSYM(r2, dx, hj, hi, pj, pi, a, H); } } /* loop over the parts in cj. */ - } /* loop over the parts in ci. */ + } /* loop over the parts in ci. */ TIMER_TOC(TIMER_DOSELF); } @@ -355,8 +355,8 @@ void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj, const int sid, IACT_NONSYM(r2, dx, hi, hj, pi, pj, a, H); } } /* loop over the parts in cj. */ - } /* loop over the parts in ci. */ - } /* Cell ci is active */ + } /* loop over the parts in ci. */ + } /* Cell ci is active */ if (cell_is_starting_hydro(cj, e)) { @@ -439,8 +439,8 @@ void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj, const int sid, IACT_NONSYM(r2, dx, hj, hi, pj, pi, a, H); } } /* loop over the parts in ci. */ - } /* loop over the parts in cj. */ - } /* Cell cj is active */ + } /* loop over the parts in cj. */ + } /* Cell cj is active */ TIMER_TOC(TIMER_DOPAIR); } diff --git a/src/runner_doiact_functions_stars.h b/src/runner_doiact_functions_stars.h index b91066f509..4cf6a7c9a9 100644 --- a/src/runner_doiact_functions_stars.h +++ b/src/runner_doiact_functions_stars.h @@ -150,7 +150,7 @@ void DOSELF1_STARS(struct runner *r, struct cell *c, int timer) { #endif } } /* loop over the parts in ci. */ - } /* loop over the sparts in ci. */ + } /* loop over the sparts in ci. */ TIMER_TOC(TIMER_DOSELF_STARS); } @@ -280,7 +280,7 @@ void DO_NONSYM_PAIR1_STARS_NAIVE(struct runner *r, struct cell *restrict ci, #endif } } /* loop over the parts in cj. */ - } /* loop over the parts in ci. */ + } /* loop over the parts in ci. */ } /** @@ -475,8 +475,8 @@ void DO_SYM_PAIR1_STARS(struct runner *r, struct cell *ci, struct cell *cj, #endif } } /* loop over the parts in cj. */ - } /* loop over the parts in ci. */ - } /* do_ci_stars */ + } /* loop over the parts in ci. */ + } /* do_ci_stars */ if (do_cj_stars) { /* Pick-out the sorted lists. */ @@ -629,8 +629,8 @@ void DO_SYM_PAIR1_STARS(struct runner *r, struct cell *ci, struct cell *cj, #endif } } /* loop over the parts in ci. */ - } /* loop over the parts in cj. */ - } /* Cell cj is active */ + } /* loop over the parts in cj. */ + } /* Cell cj is active */ TIMER_TOC(TIMER_DOPAIR_STARS); } @@ -755,7 +755,7 @@ void DOPAIR1_SUBSET_STARS(struct runner *r, struct cell *restrict ci, #endif } } /* loop over the parts in cj. */ - } /* loop over the sparts in ci. */ + } /* loop over the sparts in ci. */ } /* Sparts are on the right. */ @@ -818,7 +818,7 @@ void DOPAIR1_SUBSET_STARS(struct runner *r, struct cell *restrict ci, #endif } } /* loop over the parts in cj. */ - } /* loop over the sparts in ci. */ + } /* loop over the sparts in ci. */ } } @@ -916,7 +916,7 @@ void DOPAIR1_SUBSET_STARS_NAIVE(struct runner *r, struct cell *restrict ci, #endif } } /* loop over the parts in cj. */ - } /* loop over the parts in ci. */ + } /* loop over the parts in ci. */ } /** @@ -1003,7 +1003,7 @@ void DOSELF1_SUBSET_STARS(struct runner *r, struct cell *restrict ci, #endif } } /* loop over the parts in cj. */ - } /* loop over the parts in ci. */ + } /* loop over the parts in ci. 
*/ } /** diff --git a/src/runner_doiact_grav.c b/src/runner_doiact_grav.c index fb4fef6bd1..e2f50d7214 100644 --- a/src/runner_doiact_grav.c +++ b/src/runner_doiact_grav.c @@ -2509,7 +2509,7 @@ void runner_do_grav_long_range(struct runner *r, struct cell *ci, multi_i->pot.interacted = 1; } /* We are in charge of this pair */ - } /* Loop over top-level cells */ + } /* Loop over top-level cells */ if (timer) TIMER_TOC(timer_dograv_long_range); } diff --git a/src/runner_doiact_nosort.h b/src/runner_doiact_nosort.h index 51d2412d0f..4b500fe2e6 100644 --- a/src/runner_doiact_nosort.h +++ b/src/runner_doiact_nosort.h @@ -315,7 +315,7 @@ void DOPAIR_SUBSET_NOSORT(struct runner *r, struct cell *restrict ci, IACT_NONSYM(r2, dx, hi, pj->h, pi, pj); } } /* loop over the parts in cj. */ - } /* loop over the parts in ci. */ + } /* loop over the parts in ci. */ TIMER_TOC(timer_dopair_subset); } diff --git a/src/runner_gpu_pack_functions.c b/src/runner_gpu_pack_functions.c old mode 100755 new mode 100644 index 1394fb0cad..de0feff44d --- a/src/runner_gpu_pack_functions.c +++ b/src/runner_gpu_pack_functions.c @@ -1,8 +1,8 @@ -//#include "active.h" -//#include -//#include -//#include "cuda/cell_gpu.h" -//#include "runner_gpu_functions.cuh" +// #include "active.h" +// #include +// #include +// #include "cuda/cell_gpu.h" +// #include "runner_gpu_functions.cuh" /* This object's header. */ #include "runner.h" /* Local headers. */ @@ -13,19 +13,19 @@ #include "space_getsid.h" #include "timers.h" -//#ifdef WITHCUDA -//extern "C" { -//#endif +// #ifdef WITHCUDA +// extern "C" { +// #endif -void runner_doself1_gpu_pack_neat( - struct runner *r, struct cell *c, struct part_soa parts_soa_buffer, - int timer, int *pack_length, int tid, int count_max_parts_tmp) { +void runner_doself1_gpu_pack_neat(struct runner *r, struct cell *c, + struct part_soa parts_soa_buffer, int timer, + int *pack_length, int tid, + int count_max_parts_tmp) { TIMER_TIC; /* Anything to do here? */ - if (c->hydro.count == 0) - return; + if (c->hydro.count == 0) return; int count = c->hydro.count; int local_pack_position = (*pack_length); @@ -42,19 +42,18 @@ void runner_doself1_gpu_pack_neat( /* Increment pack length accordingly */ (*pack_length) += count; - if (timer) - TIMER_TOC(timer_doself_gpu_pack); + if (timer) TIMER_TOC(timer_doself_gpu_pack); } -void runner_doself1_gpu_pack_neat_aos( - struct runner *r, struct cell *c, struct part_aos *parts_aos_buffer, - int timer, int *pack_length, int tid, int count_max_parts_tmp) { +void runner_doself1_gpu_pack_neat_aos(struct runner *r, struct cell *c, + struct part_aos *parts_aos_buffer, + int timer, int *pack_length, int tid, + int count_max_parts_tmp) { TIMER_TIC; /* Anything to do here? */ - if (c->hydro.count == 0) - return; + if (c->hydro.count == 0) return; int count = c->hydro.count; int local_pack_position = (*pack_length); @@ -71,19 +70,18 @@ void runner_doself1_gpu_pack_neat_aos( /* Increment pack length accordingly */ (*pack_length) += count; - if (timer) - TIMER_TOC(timer_doself_gpu_pack); + if (timer) TIMER_TOC(timer_doself_gpu_pack); } void runner_doself1_gpu_pack_neat_aos_f4( - struct runner *r, struct cell * __restrict__ c, struct part_aos_f4_send * __restrict__ parts_aos_buffer, - int timer, int *pack_length, int tid, int count_max_parts_tmp) { + struct runner *r, struct cell *__restrict__ c, + struct part_aos_f4_send *__restrict__ parts_aos_buffer, int timer, + int *pack_length, int tid, int count_max_parts_tmp) { TIMER_TIC; /* Anything to do here? 
*/ - if (c->hydro.count == 0) - return; + if (c->hydro.count == 0) return; int count = c->hydro.count; int local_pack_position = (*pack_length); @@ -96,23 +94,23 @@ void runner_doself1_gpu_pack_neat_aos_f4( #endif int2 frst_lst_prts = {local_pack_position, local_pack_position + count}; /* Pack the particle data into CPU-side buffers*/ - pack_neat_aos_f4(c, parts_aos_buffer, tid, local_pack_position, count, frst_lst_prts); + pack_neat_aos_f4(c, parts_aos_buffer, tid, local_pack_position, count, + frst_lst_prts); /* Increment pack length accordingly */ (*pack_length) += count; - if (timer) - TIMER_TOC(timer_doself_gpu_pack); + if (timer) TIMER_TOC(timer_doself_gpu_pack); } -void runner_doself1_gpu_pack_neat_aos_g( - struct runner *r, struct cell *c, struct part_aos_g *parts_aos_buffer, - int timer, int *pack_length, int tid, int count_max_parts_tmp) { +void runner_doself1_gpu_pack_neat_aos_g(struct runner *r, struct cell *c, + struct part_aos_g *parts_aos_buffer, + int timer, int *pack_length, int tid, + int count_max_parts_tmp) { TIMER_TIC; /* Anything to do here? */ - if (c->hydro.count == 0) - return; + if (c->hydro.count == 0) return; int count = c->hydro.count; int local_pack_position = (*pack_length); @@ -129,19 +127,18 @@ void runner_doself1_gpu_pack_neat_aos_g( /* Increment pack length accordingly */ (*pack_length) += count; - if (timer) - TIMER_TOC(timer_doself_gpu_pack); + if (timer) TIMER_TOC(timer_doself_gpu_pack); } void runner_doself1_gpu_pack_neat_aos_f4_g( - struct runner *r, struct cell *c, struct part_aos_f4_g_send *parts_aos_buffer, - int timer, int *pack_length, int tid, int count_max_parts_tmp) { + struct runner *r, struct cell *c, + struct part_aos_f4_g_send *parts_aos_buffer, int timer, int *pack_length, + int tid, int count_max_parts_tmp) { TIMER_TIC; /* Anything to do here? */ - if (c->hydro.count == 0) - return; + if (c->hydro.count == 0) return; int count = c->hydro.count; int local_pack_position = (*pack_length); @@ -158,19 +155,18 @@ void runner_doself1_gpu_pack_neat_aos_f4_g( /* Increment pack length accordingly */ (*pack_length) += count; - if (timer) - TIMER_TOC(timer_doself_gpu_pack); + if (timer) TIMER_TOC(timer_doself_gpu_pack); } -void runner_doself1_gpu_pack_neat_aos_f( - struct runner *r, struct cell *c, struct part_aos_f *parts_aos_buffer, - int timer, int *pack_length, int tid, int count_max_parts_tmp) { +void runner_doself1_gpu_pack_neat_aos_f(struct runner *r, struct cell *c, + struct part_aos_f *parts_aos_buffer, + int timer, int *pack_length, int tid, + int count_max_parts_tmp) { TIMER_TIC; /* Anything to do here? */ - if (c->hydro.count == 0) - return; + if (c->hydro.count == 0) return; int count = c->hydro.count; int local_pack_position = (*pack_length); @@ -187,19 +183,18 @@ void runner_doself1_gpu_pack_neat_aos_f( /* Increment pack length accordingly */ (*pack_length) += count; - if (timer) - TIMER_TOC(timer_doself_gpu_pack); + if (timer) TIMER_TOC(timer_doself_gpu_pack); } void runner_doself1_gpu_pack_neat_aos_f4_f( - struct runner *r, struct cell *restrict c, struct part_aos_f4_f_send * restrict parts_aos_buffer, - int timer, int *pack_length, int tid, int count_max_parts_tmp) { + struct runner *r, struct cell *restrict c, + struct part_aos_f4_f_send *restrict parts_aos_buffer, int timer, + int *pack_length, int tid, int count_max_parts_tmp) { TIMER_TIC; /* Anything to do here? 
*/ - if (c->hydro.count == 0) - return; + if (c->hydro.count == 0) return; int count = c->hydro.count; int local_pack_position = (*pack_length); @@ -216,11 +211,11 @@ void runner_doself1_gpu_pack_neat_aos_f4_f( /* Increment pack length accordingly */ (*pack_length) += count; - if (timer) - TIMER_TOC(timer_doself_gpu_pack); + if (timer) TIMER_TOC(timer_doself_gpu_pack); } -void pack_neat(struct cell *c, struct part_soa parts_soa_buffer, int tid, int local_pack_position, int count) { +void pack_neat(struct cell *c, struct part_soa parts_soa_buffer, int tid, + int local_pack_position, int count) { const struct part *ptmps; ptmps = c->hydro.parts; @@ -240,20 +235,21 @@ void pack_neat(struct cell *c, struct part_soa parts_soa_buffer, int tid, int lo parts_soa_buffer.locz[id_in_pack] = c->loc[2]; parts_soa_buffer.mass[id_in_pack] = p.mass; parts_soa_buffer.h[id_in_pack] = p.h; -// parts_soa_buffer.time_bin[id_in_pack] = p.time_bin; + // parts_soa_buffer.time_bin[id_in_pack] = p.time_bin; /*Initialise sums to zero before CPU/GPU copy*/ - parts_soa_buffer.rho[id_in_pack] = 0.f;//p.rho; - parts_soa_buffer.rho_dh[id_in_pack] = 0.f;//p.density.rho_dh; - parts_soa_buffer.wcount[id_in_pack] = 0.f;//p.density.wcount; - parts_soa_buffer.wcount_dh[id_in_pack] = 0.f;//p.density.wcount_dh; - parts_soa_buffer.div_v[id_in_pack] = 0.f;//p.viscosity.div_v; - parts_soa_buffer.rot_ux[id_in_pack] = 0.f;//p.density.rot_v[0]; - parts_soa_buffer.rot_uy[id_in_pack] = 0.f;//p.density.rot_v[1]; - parts_soa_buffer.rot_uz[id_in_pack] = 0.f;//p.density.rot_v[2]; + parts_soa_buffer.rho[id_in_pack] = 0.f; // p.rho; + parts_soa_buffer.rho_dh[id_in_pack] = 0.f; // p.density.rho_dh; + parts_soa_buffer.wcount[id_in_pack] = 0.f; // p.density.wcount; + parts_soa_buffer.wcount_dh[id_in_pack] = 0.f; // p.density.wcount_dh; + parts_soa_buffer.div_v[id_in_pack] = 0.f; // p.viscosity.div_v; + parts_soa_buffer.rot_ux[id_in_pack] = 0.f; // p.density.rot_v[0]; + parts_soa_buffer.rot_uy[id_in_pack] = 0.f; // p.density.rot_v[1]; + parts_soa_buffer.rot_uz[id_in_pack] = 0.f; // p.density.rot_v[2]; } } -void pack_neat_aos(struct cell *c, struct part_aos *parts_aos_buffer, int tid, int local_pack_position, int count) { +void pack_neat_aos(struct cell *c, struct part_aos *parts_aos_buffer, int tid, + int local_pack_position, int count) { const struct part *ptmps; ptmps = c->hydro.parts; @@ -269,20 +265,22 @@ void pack_neat_aos(struct cell *c, struct part_aos *parts_aos_buffer, int tid, i parts_aos_buffer[id_in_pack].uz = p.v[2]; parts_aos_buffer[id_in_pack].mass = p.mass; parts_aos_buffer[id_in_pack].h = p.h; - parts_aos_buffer[id_in_pack].time_bin = 1000;//p.time_bin; + parts_aos_buffer[id_in_pack].time_bin = 1000; // p.time_bin; /*Initialise sums to zero before CPU/GPU copy*/ - parts_aos_buffer[id_in_pack].rho = 0.f;//p.rho; - parts_aos_buffer[id_in_pack].rho_dh = 0.f;//p.density.rho_dh; - parts_aos_buffer[id_in_pack].wcount = 0.f;//p.density.wcount; - parts_aos_buffer[id_in_pack].wcount_dh = 0.f;//p.density.wcount_dh; - parts_aos_buffer[id_in_pack].div_v = 0.f;//p.viscosity.div_v; - parts_aos_buffer[id_in_pack].rot_ux = 0.f;//p.density.rot_v[0]; - parts_aos_buffer[id_in_pack].rot_uy = 0.f;//p.density.rot_v[1]; - parts_aos_buffer[id_in_pack].rot_uz = 0.f;//p.density.rot_v[2]; + parts_aos_buffer[id_in_pack].rho = 0.f; // p.rho; + parts_aos_buffer[id_in_pack].rho_dh = 0.f; // p.density.rho_dh; + parts_aos_buffer[id_in_pack].wcount = 0.f; // p.density.wcount; + parts_aos_buffer[id_in_pack].wcount_dh = 0.f; // p.density.wcount_dh; + 
parts_aos_buffer[id_in_pack].div_v = 0.f; // p.viscosity.div_v; + parts_aos_buffer[id_in_pack].rot_ux = 0.f; // p.density.rot_v[0]; + parts_aos_buffer[id_in_pack].rot_uy = 0.f; // p.density.rot_v[1]; + parts_aos_buffer[id_in_pack].rot_uz = 0.f; // p.density.rot_v[2]; } } -void pack_neat_pair_aos(struct cell *c, struct part_aos *parts_aos_buffer, int tid, int local_pack_position, int count, float3 shift) { +void pack_neat_pair_aos(struct cell *c, struct part_aos *parts_aos_buffer, + int tid, int local_pack_position, int count, + float3 shift) { const struct part *ptmps; ptmps = c->hydro.parts; @@ -298,21 +296,25 @@ void pack_neat_pair_aos(struct cell *c, struct part_aos *parts_aos_buffer, int t parts_aos_buffer[id_in_pack].uz = p.v[2]; parts_aos_buffer[id_in_pack].mass = p.mass; parts_aos_buffer[id_in_pack].h = p.h; - parts_aos_buffer[id_in_pack].time_bin = 1000;//p.time_bin; + parts_aos_buffer[id_in_pack].time_bin = 1000; // p.time_bin; /*Initialise sums to zero before CPU/GPU copy*/ - parts_aos_buffer[id_in_pack].rho = 0.f;//p.rho; - parts_aos_buffer[id_in_pack].rho_dh = 0.f;//p.density.rho_dh; - parts_aos_buffer[id_in_pack].wcount = 0.f;//p.density.wcount; - parts_aos_buffer[id_in_pack].wcount_dh = 0.f;//p.density.wcount_dh; - parts_aos_buffer[id_in_pack].div_v = 0.f;//p.viscosity.div_v; - parts_aos_buffer[id_in_pack].rot_ux = 0.f;//p.density.rot_v[0]; - parts_aos_buffer[id_in_pack].rot_uy = 0.f;//p.density.rot_v[1]; - parts_aos_buffer[id_in_pack].rot_uz = 0.f;//p.density.rot_v[2]; + parts_aos_buffer[id_in_pack].rho = 0.f; // p.rho; + parts_aos_buffer[id_in_pack].rho_dh = 0.f; // p.density.rho_dh; + parts_aos_buffer[id_in_pack].wcount = 0.f; // p.density.wcount; + parts_aos_buffer[id_in_pack].wcount_dh = 0.f; // p.density.wcount_dh; + parts_aos_buffer[id_in_pack].div_v = 0.f; // p.viscosity.div_v; + parts_aos_buffer[id_in_pack].rot_ux = 0.f; // p.density.rot_v[0]; + parts_aos_buffer[id_in_pack].rot_uy = 0.f; // p.density.rot_v[1]; + parts_aos_buffer[id_in_pack].rot_uz = 0.f; // p.density.rot_v[2]; } } -extern inline void pack_neat_pair_aos_f4(struct cell * __restrict c, struct part_aos_f4_send * __restrict parts_aos_buffer, int tid, const int local_pack_position, const int count, const float3 shift, const int2 cstarts) { - /*Data to be copied to GPU*/ +extern inline void pack_neat_pair_aos_f4( + struct cell *__restrict c, + struct part_aos_f4_send *__restrict parts_aos_buffer, int tid, + const int local_pack_position, const int count, const float3 shift, + const int2 cstarts) { + /*Data to be copied to GPU*/ for (int i = 0; i < count; i++) { const int id_in_pack = i + local_pack_position; parts_aos_buffer[id_in_pack].x_p_h.x = c->hydro.parts[i].x[0] - shift.x; @@ -328,15 +330,18 @@ extern inline void pack_neat_pair_aos_f4(struct cell * __restrict c, struct part } } -void pack_neat_aos_f4(struct cell * __restrict__ c, struct part_aos_f4_send * __restrict__ parts_aos_buffer, int tid, int local_pack_position, int count, int2 frst_lst_prts) { +void pack_neat_aos_f4(struct cell *__restrict__ c, + struct part_aos_f4_send *__restrict__ parts_aos_buffer, + int tid, int local_pack_position, int count, + int2 frst_lst_prts) { struct part ptmps[count]; memcpy(ptmps, (c->hydro.parts), count * sizeof(struct part)); -// ptmps = c->hydro.parts; + // ptmps = c->hydro.parts; const float cellx = c->loc[0], celly = c->loc[1], cellz = c->loc[2]; for (int i = 0; i < count; i++) { const int id_in_pack = i + local_pack_position; -// const struct part p = ptmps[i]; + // const struct part p = ptmps[i]; /*Data to 
be copied to GPU*/ parts_aos_buffer[id_in_pack].x_p_h.x = ptmps[i].x[0] - cellx; parts_aos_buffer[id_in_pack].x_p_h.y = ptmps[i].x[1] - celly; @@ -346,14 +351,15 @@ void pack_neat_aos_f4(struct cell * __restrict__ c, struct part_aos_f4_send * __ parts_aos_buffer[id_in_pack].ux_m.y = ptmps[i].v[1]; parts_aos_buffer[id_in_pack].ux_m.z = ptmps[i].v[2]; parts_aos_buffer[id_in_pack].ux_m.w = ptmps[i].mass; -// /*Initialise sums to zero before CPU/GPU copy*/ -// const float4 zeroes = {0.0, 0.0, 0.0, 0.0}; -// parts_aos_buffer[id_in_pack].rho_dh_wcount = zeroes; -// parts_aos_buffer[id_in_pack].rot_ux_div_v = zeroes; + // /*Initialise sums to zero before CPU/GPU copy*/ + // const float4 zeroes = {0.0, 0.0, 0.0, 0.0}; + // parts_aos_buffer[id_in_pack].rho_dh_wcount = zeroes; + // parts_aos_buffer[id_in_pack].rot_ux_div_v = zeroes; } } -void pack_neat_aos_g(struct cell *c, struct part_aos_g *parts_aos_buffer, int tid, int local_pack_position, int count) { +void pack_neat_aos_g(struct cell *c, struct part_aos_g *parts_aos_buffer, + int tid, int local_pack_position, int count) { const struct part *ptmps; ptmps = c->hydro.parts; @@ -372,21 +378,25 @@ void pack_neat_aos_g(struct cell *c, struct part_aos_g *parts_aos_buffer, int ti parts_aos_buffer[id_in_pack].time_bin = 1000; parts_aos_buffer[id_in_pack].rho = p.rho; parts_aos_buffer[id_in_pack].visc_alpha = p.viscosity.alpha; - parts_aos_buffer[id_in_pack].alpha_visc_max_ngb = p.force.alpha_visc_max_ngb;//p.density.wcount_dh; - parts_aos_buffer[id_in_pack].v_sig = p.viscosity.v_sig;//p.viscosity.div_v; - parts_aos_buffer[id_in_pack].soundspeed = p.force.soundspeed;//p.density.rot_v[0]; - parts_aos_buffer[id_in_pack].u = p.u;//p.density.rot_v[0]; + parts_aos_buffer[id_in_pack].alpha_visc_max_ngb = + p.force.alpha_visc_max_ngb; // p.density.wcount_dh; + parts_aos_buffer[id_in_pack].v_sig = + p.viscosity.v_sig; // p.viscosity.div_v; + parts_aos_buffer[id_in_pack].soundspeed = + p.force.soundspeed; // p.density.rot_v[0]; + parts_aos_buffer[id_in_pack].u = p.u; // p.density.rot_v[0]; /*Initialise sums to zero before CPU/GPU copy*/ - parts_aos_buffer[id_in_pack].laplace_u = 0.f;//p.density.wcount; + parts_aos_buffer[id_in_pack].laplace_u = 0.f; // p.density.wcount; } } -void pack_neat_aos_f4_g(struct cell *c, struct part_aos_f4_g_send *parts_aos_buffer, int tid, int local_pack_position, int count) { +void pack_neat_aos_f4_g(struct cell *c, + struct part_aos_f4_g_send *parts_aos_buffer, int tid, + int local_pack_position, int count) { const struct part *ptmps; ptmps = c->hydro.parts; - const float cellx = c->loc[0], celly = c->loc[1], - cellz = c->loc[2]; + const float cellx = c->loc[0], celly = c->loc[1], cellz = c->loc[2]; for (int i = 0; i < count; i++) { int id_in_pack = i + local_pack_position; const struct part p = ptmps[i]; @@ -401,13 +411,18 @@ void pack_neat_aos_f4_g(struct cell *c, struct part_aos_f4_g_send *parts_aos_buf parts_aos_buffer[id_in_pack].ux_m.w = p.mass; parts_aos_buffer[id_in_pack].rho_avisc_u_c.x = p.rho; parts_aos_buffer[id_in_pack].rho_avisc_u_c.y = p.viscosity.alpha; - parts_aos_buffer[id_in_pack].rho_avisc_u_c.z = p.u;//p.density.rot_v[0]; - parts_aos_buffer[id_in_pack].rho_avisc_u_c.w = p.force.soundspeed;//p.density.rot_v[0]; + parts_aos_buffer[id_in_pack].rho_avisc_u_c.z = p.u; // p.density.rot_v[0]; + parts_aos_buffer[id_in_pack].rho_avisc_u_c.w = + p.force.soundspeed; // p.density.rot_v[0]; } } -extern inline void pack_neat_pair_aos_f4_g(struct cell * __restrict c, struct part_aos_f4_g_send * __restrict parts_aos_buffer, int 
tid, const int local_pack_position, const int count, const float3 shift, const int2 cstarts) { - /*Data to be copied to GPU*/ +extern inline void pack_neat_pair_aos_f4_g( + struct cell *__restrict c, + struct part_aos_f4_g_send *__restrict parts_aos_buffer, int tid, + const int local_pack_position, const int count, const float3 shift, + const int2 cstarts) { + /*Data to be copied to GPU*/ for (int i = 0; i < count; i++) { const int id_in_pack = i + local_pack_position; parts_aos_buffer[id_in_pack].x_h.x = c->hydro.parts[i].x[0] - shift.x; @@ -419,15 +434,19 @@ extern inline void pack_neat_pair_aos_f4_g(struct cell * __restrict c, struct pa parts_aos_buffer[id_in_pack].ux_m.z = c->hydro.parts[i].v[2]; parts_aos_buffer[id_in_pack].ux_m.w = c->hydro.parts[i].mass; parts_aos_buffer[id_in_pack].rho_avisc_u_c.x = c->hydro.parts[i].rho; - parts_aos_buffer[id_in_pack].rho_avisc_u_c.y = c->hydro.parts[i].viscosity.alpha; - parts_aos_buffer[id_in_pack].rho_avisc_u_c.z = c->hydro.parts[i].u;//p.density.rot_v[0]; - parts_aos_buffer[id_in_pack].rho_avisc_u_c.w = c->hydro.parts[i].force.soundspeed;//p.density.rot_v[0]; + parts_aos_buffer[id_in_pack].rho_avisc_u_c.y = + c->hydro.parts[i].viscosity.alpha; + parts_aos_buffer[id_in_pack].rho_avisc_u_c.z = + c->hydro.parts[i].u; // p.density.rot_v[0]; + parts_aos_buffer[id_in_pack].rho_avisc_u_c.w = + c->hydro.parts[i].force.soundspeed; // p.density.rot_v[0]; parts_aos_buffer[id_in_pack].cjs_cje.x = cstarts.x; parts_aos_buffer[id_in_pack].cjs_cje.y = cstarts.y; } } -void pack_neat_aos_f(struct cell *c, struct part_aos_f *parts_aos, int tid, int local_pack_position, int count) { +void pack_neat_aos_f(struct cell *c, struct part_aos_f *parts_aos, int tid, + int local_pack_position, int count) { const struct part *ptmps; ptmps = c->hydro.parts; @@ -463,10 +482,12 @@ void pack_neat_aos_f(struct cell *c, struct part_aos_f *parts_aos, int tid, int } } -void pack_neat_aos_f4_f(const struct cell *restrict c, struct part_aos_f4_f_send *restrict parts_aos, int tid, int local_pack_position, int count) { +void pack_neat_aos_f4_f(const struct cell *restrict c, + struct part_aos_f4_f_send *restrict parts_aos, int tid, + int local_pack_position, int count) { -// const struct part *restrict ptmps; -// ptmps = c->hydro.parts; + // const struct part *restrict ptmps; + // ptmps = c->hydro.parts; const int pp = local_pack_position; const float cellx = c->loc[0]; const float celly = c->loc[1]; @@ -485,10 +506,14 @@ void pack_neat_aos_f4_f(const struct cell *restrict c, struct part_aos_f4_f_send parts_aos[i + pp].ux_m.w = c->hydro.parts[i].mass; } for (int i = 0; i < count; i++) { - parts_aos[i + pp].f_bals_timebin_mintimebin_ngb.x = c->hydro.parts[i].force.f; - parts_aos[i + pp].f_bals_timebin_mintimebin_ngb.y = c->hydro.parts[i].force.balsara; - parts_aos[i + pp].f_bals_timebin_mintimebin_ngb.z = c->hydro.parts[i].time_bin; - parts_aos[i + pp].f_bals_timebin_mintimebin_ngb.w = c->hydro.parts[i].limiter_data.min_ngb_time_bin; + parts_aos[i + pp].f_bals_timebin_mintimebin_ngb.x = + c->hydro.parts[i].force.f; + parts_aos[i + pp].f_bals_timebin_mintimebin_ngb.y = + c->hydro.parts[i].force.balsara; + parts_aos[i + pp].f_bals_timebin_mintimebin_ngb.z = + c->hydro.parts[i].time_bin; + parts_aos[i + pp].f_bals_timebin_mintimebin_ngb.w = + c->hydro.parts[i].limiter_data.min_ngb_time_bin; } for (int i = 0; i < count; i++) { parts_aos[i + pp].rho_p_c_vsigi.x = c->hydro.parts[i].rho; @@ -498,58 +523,65 @@ void pack_neat_aos_f4_f(const struct cell *restrict c, struct part_aos_f4_f_send } 
for (int i = 0; i < count; i++) { parts_aos[i + pp].u_alphavisc_alphadiff.x = c->hydro.parts[i].u; - parts_aos[i + pp].u_alphavisc_alphadiff.y = c->hydro.parts[i].viscosity.alpha; - parts_aos[i + pp].u_alphavisc_alphadiff.z = c->hydro.parts[i].diffusion.alpha; + parts_aos[i + pp].u_alphavisc_alphadiff.y = + c->hydro.parts[i].viscosity.alpha; + parts_aos[i + pp].u_alphavisc_alphadiff.z = + c->hydro.parts[i].diffusion.alpha; } - } -extern inline void pack_neat_pair_aos_f4_f(struct cell * __restrict c, struct part_aos_f4_f_send * __restrict parts_aos, int tid, const int local_pack_position, const int count, const float3 shift, const int2 cstarts) { - // const struct part *restrict ptmps; - // ptmps = c->hydro.parts; - const int pp = local_pack_position; - /*Data to be copied to GPU local memory*/ - for (int i = 0; i < count; i++) { - const int id = i + pp; - parts_aos[id].x_h.x = c->hydro.parts[i].x[0] - shift.x; - parts_aos[id].x_h.y = c->hydro.parts[i].x[1] - shift.y; - parts_aos[id].x_h.z = c->hydro.parts[i].x[2] - shift.z; - parts_aos[id].x_h.w = c->hydro.parts[i].h; - parts_aos[id].ux_m.x = c->hydro.parts[i].v[0]; - parts_aos[id].ux_m.y = c->hydro.parts[i].v[1]; - parts_aos[id].ux_m.z = c->hydro.parts[i].v[2]; - parts_aos[id].ux_m.w = c->hydro.parts[i].mass; - parts_aos[id].f_bals_timebin_mintimebin_ngb.x = c->hydro.parts[i].force.f; - parts_aos[id].f_bals_timebin_mintimebin_ngb.y = c->hydro.parts[i].force.balsara; - parts_aos[id].f_bals_timebin_mintimebin_ngb.z = c->hydro.parts[i].time_bin; - parts_aos[id].f_bals_timebin_mintimebin_ngb.w = c->hydro.parts[i].limiter_data.min_ngb_time_bin; - parts_aos[id].rho_p_c_vsigi.x = c->hydro.parts[i].rho; - parts_aos[id].rho_p_c_vsigi.y = c->hydro.parts[i].force.pressure; - parts_aos[id].rho_p_c_vsigi.z = c->hydro.parts[i].force.soundspeed; - parts_aos[id].rho_p_c_vsigi.w = c->hydro.parts[i].viscosity.v_sig; - parts_aos[id].u_alphavisc_alphadiff.x = c->hydro.parts[i].u; - parts_aos[id].u_alphavisc_alphadiff.y = c->hydro.parts[i].viscosity.alpha; - parts_aos[id].u_alphavisc_alphadiff.z = c->hydro.parts[i].diffusion.alpha; - parts_aos[id].cjs_cje.x = cstarts.x; - parts_aos[id].cjs_cje.y = cstarts.y; - } +extern inline void pack_neat_pair_aos_f4_f( + struct cell *__restrict c, struct part_aos_f4_f_send *__restrict parts_aos, + int tid, const int local_pack_position, const int count, const float3 shift, + const int2 cstarts) { + // const struct part *restrict ptmps; + // ptmps = c->hydro.parts; + const int pp = local_pack_position; + /*Data to be copied to GPU local memory*/ + for (int i = 0; i < count; i++) { + const int id = i + pp; + parts_aos[id].x_h.x = c->hydro.parts[i].x[0] - shift.x; + parts_aos[id].x_h.y = c->hydro.parts[i].x[1] - shift.y; + parts_aos[id].x_h.z = c->hydro.parts[i].x[2] - shift.z; + parts_aos[id].x_h.w = c->hydro.parts[i].h; + parts_aos[id].ux_m.x = c->hydro.parts[i].v[0]; + parts_aos[id].ux_m.y = c->hydro.parts[i].v[1]; + parts_aos[id].ux_m.z = c->hydro.parts[i].v[2]; + parts_aos[id].ux_m.w = c->hydro.parts[i].mass; + parts_aos[id].f_bals_timebin_mintimebin_ngb.x = c->hydro.parts[i].force.f; + parts_aos[id].f_bals_timebin_mintimebin_ngb.y = + c->hydro.parts[i].force.balsara; + parts_aos[id].f_bals_timebin_mintimebin_ngb.z = c->hydro.parts[i].time_bin; + parts_aos[id].f_bals_timebin_mintimebin_ngb.w = + c->hydro.parts[i].limiter_data.min_ngb_time_bin; + parts_aos[id].rho_p_c_vsigi.x = c->hydro.parts[i].rho; + parts_aos[id].rho_p_c_vsigi.y = c->hydro.parts[i].force.pressure; + parts_aos[id].rho_p_c_vsigi.z = 
c->hydro.parts[i].force.soundspeed; + parts_aos[id].rho_p_c_vsigi.w = c->hydro.parts[i].viscosity.v_sig; + parts_aos[id].u_alphavisc_alphadiff.x = c->hydro.parts[i].u; + parts_aos[id].u_alphavisc_alphadiff.y = c->hydro.parts[i].viscosity.alpha; + parts_aos[id].u_alphavisc_alphadiff.z = c->hydro.parts[i].diffusion.alpha; + parts_aos[id].cjs_cje.x = cstarts.x; + parts_aos[id].cjs_cje.y = cstarts.y; + } } -void runner_doself1_gpu_unpack_neat(struct runner *r, struct cell *c, struct part_soa parts_soa_buffer, - int timer, int *pack_length, int tid, int count_max_parts_tmp, struct engine *e) { +void runner_doself1_gpu_unpack_neat(struct runner *r, struct cell *c, + struct part_soa parts_soa_buffer, int timer, + int *pack_length, int tid, + int count_max_parts_tmp, struct engine *e) { TIMER_TIC; /* Anything to do here? */ - if (c->hydro.count == 0) + if (c->hydro.count == 0) return; + if (!cell_is_active_hydro(c, e)) { + message("Inactive cell\n"); return; - if (!cell_is_active_hydro(c, e)){ - message("Inactive cell\n"); - return; } int count = c->hydro.count; int local_pack_position = (*pack_length); -# ifdef SWIFT_DEBUG_CHECKS +#ifdef SWIFT_DEBUG_CHECKS if (local_pack_position + count >= count_max_parts_tmp) { fprintf(stderr, "Exceeded count_max_parts_tmp. Make arrays bigger! pack_length is " @@ -565,21 +597,23 @@ void runner_doself1_gpu_unpack_neat(struct runner *r, struct cell *c, struct par (*pack_length) += count; } -void runner_doself1_gpu_unpack_neat_aos(struct runner *r, struct cell *c, struct part_aos *parts_aos_buffer, - int timer, int *pack_length, int tid, int count_max_parts_tmp, struct engine *e) { +void runner_doself1_gpu_unpack_neat_aos(struct runner *r, struct cell *c, + struct part_aos *parts_aos_buffer, + int timer, int *pack_length, int tid, + int count_max_parts_tmp, + struct engine *e) { TIMER_TIC; /* Anything to do here? */ - if (c->hydro.count == 0) + if (c->hydro.count == 0) return; + if (!cell_is_active_hydro(c, e)) { + message("Inactive cell\n"); return; - if (!cell_is_active_hydro(c, e)){ - message("Inactive cell\n"); - return; } int count = c->hydro.count; int local_pack_position = (*pack_length); -# ifdef SWIFT_DEBUG_CHECKS +#ifdef SWIFT_DEBUG_CHECKS if (local_pack_position + count >= count_max_parts_tmp) { fprintf(stderr, "Exceeded count_max_parts_tmp. Make arrays bigger! pack_length is " @@ -595,21 +629,22 @@ void runner_doself1_gpu_unpack_neat_aos(struct runner *r, struct cell *c, struct (*pack_length) += count; } -void runner_doself1_gpu_unpack_neat_aos_f4(struct runner *r, struct cell *c, struct part_aos_f4_recv *parts_aos_buffer, - int timer, int *pack_length, int tid, int count_max_parts_tmp, struct engine *e) { +void runner_doself1_gpu_unpack_neat_aos_f4( + struct runner *r, struct cell *c, struct part_aos_f4_recv *parts_aos_buffer, + int timer, int *pack_length, int tid, int count_max_parts_tmp, + struct engine *e) { TIMER_TIC; /* Anything to do here? */ - if (c->hydro.count == 0) + if (c->hydro.count == 0) return; + if (!cell_is_active_hydro(c, e)) { + message("Inactive cell\n"); return; - if (!cell_is_active_hydro(c, e)){ - message("Inactive cell\n"); - return; } int count = c->hydro.count; int local_pack_position = (*pack_length); -# ifdef SWIFT_DEBUG_CHECKS +#ifdef SWIFT_DEBUG_CHECKS if (local_pack_position + count >= count_max_parts_tmp) { fprintf(stderr, "Exceeded count_max_parts_tmp. Make arrays bigger! 
pack_length is " @@ -625,21 +660,23 @@ void runner_doself1_gpu_unpack_neat_aos_f4(struct runner *r, struct cell *c, str (*pack_length) += count; } -void runner_doself1_gpu_unpack_neat_aos_g(struct runner *r, struct cell *c, struct part_aos_g *parts_aos_buffer, - int timer, int *pack_length, int tid, int count_max_parts_tmp, struct engine *e) { +void runner_doself1_gpu_unpack_neat_aos_g(struct runner *r, struct cell *c, + struct part_aos_g *parts_aos_buffer, + int timer, int *pack_length, int tid, + int count_max_parts_tmp, + struct engine *e) { TIMER_TIC; /* Anything to do here? */ - if (c->hydro.count == 0) + if (c->hydro.count == 0) return; + if (!cell_is_active_hydro(c, e)) { + message("Inactive cell\n"); return; - if (!cell_is_active_hydro(c, e)){ - message("Inactive cell\n"); - return; } int count = c->hydro.count; int local_pack_position = (*pack_length); -# ifdef SWIFT_DEBUG_CHECKS +#ifdef SWIFT_DEBUG_CHECKS if (local_pack_position + count >= count_max_parts_tmp) { fprintf(stderr, "Exceeded count_max_parts_tmp. Make arrays bigger! pack_length is " @@ -655,22 +692,22 @@ void runner_doself1_gpu_unpack_neat_aos_g(struct runner *r, struct cell *c, stru (*pack_length) += count; } - -void runner_doself1_gpu_unpack_neat_aos_f4_g(struct runner *r, struct cell *c, struct part_aos_f4_g_recv *parts_aos_buffer, - int timer, int *pack_length, int tid, int count_max_parts_tmp, struct engine *e) { +void runner_doself1_gpu_unpack_neat_aos_f4_g( + struct runner *r, struct cell *c, + struct part_aos_f4_g_recv *parts_aos_buffer, int timer, int *pack_length, + int tid, int count_max_parts_tmp, struct engine *e) { TIMER_TIC; /* Anything to do here? */ - if (c->hydro.count == 0) + if (c->hydro.count == 0) return; + if (!cell_is_active_hydro(c, e)) { + message("Inactive cell\n"); return; - if (!cell_is_active_hydro(c, e)){ - message("Inactive cell\n"); - return; } int count = c->hydro.count; int local_pack_position = (*pack_length); -# ifdef SWIFT_DEBUG_CHECKS +#ifdef SWIFT_DEBUG_CHECKS if (local_pack_position + count >= count_max_parts_tmp) { fprintf(stderr, "Exceeded count_max_parts_tmp. Make arrays bigger! pack_length is " @@ -686,21 +723,23 @@ void runner_doself1_gpu_unpack_neat_aos_f4_g(struct runner *r, struct cell *c, s (*pack_length) += count; } -void runner_doself1_gpu_unpack_neat_aos_f(struct runner *r, struct cell *c, struct part_aos_f *parts_aos_buffer, - int timer, int *pack_length, int tid, int count_max_parts_tmp, struct engine *e) { +void runner_doself1_gpu_unpack_neat_aos_f(struct runner *r, struct cell *c, + struct part_aos_f *parts_aos_buffer, + int timer, int *pack_length, int tid, + int count_max_parts_tmp, + struct engine *e) { TIMER_TIC; /* Anything to do here? */ - if (c->hydro.count == 0) + if (c->hydro.count == 0) return; + if (!cell_is_active_hydro(c, e)) { + message("Inactive cell\n"); return; - if (!cell_is_active_hydro(c, e)){ - message("Inactive cell\n"); - return; } int count = c->hydro.count; int local_pack_position = (*pack_length); -# ifdef SWIFT_DEBUG_CHECKS +#ifdef SWIFT_DEBUG_CHECKS if (local_pack_position + count >= count_max_parts_tmp) { fprintf(stderr, "Exceeded count_max_parts_tmp. Make arrays bigger! 
pack_length is " @@ -716,21 +755,22 @@ void runner_doself1_gpu_unpack_neat_aos_f(struct runner *r, struct cell *c, stru (*pack_length) += count; } -void runner_doself1_gpu_unpack_neat_aos_f4_f(struct runner *r, struct cell *c, struct part_aos_f4_f_recv *parts_aos_buffer, - int timer, int *pack_length, int tid, int count_max_parts_tmp, struct engine *e) { +void runner_doself1_gpu_unpack_neat_aos_f4_f( + struct runner *r, struct cell *c, + struct part_aos_f4_f_recv *parts_aos_buffer, int timer, int *pack_length, + int tid, int count_max_parts_tmp, struct engine *e) { TIMER_TIC; /* Anything to do here? */ - if (c->hydro.count == 0) + if (c->hydro.count == 0) return; + if (!cell_is_active_hydro(c, e)) { + message("Inactive cell\n"); return; - if (!cell_is_active_hydro(c, e)){ - message("Inactive cell\n"); - return; } int count = c->hydro.count; int local_pack_position = (*pack_length); -# ifdef SWIFT_DEBUG_CHECKS +#ifdef SWIFT_DEBUG_CHECKS if (local_pack_position + count >= count_max_parts_tmp) { fprintf(stderr, "Exceeded count_max_parts_tmp. Make arrays bigger! pack_length is " @@ -746,349 +786,445 @@ void runner_doself1_gpu_unpack_neat_aos_f4_f(struct runner *r, struct cell *c, s (*pack_length) += count; } -void unpack_neat(struct cell *c, struct part_soa parts_soa_buffer, int tid, int local_pack_position, int count, struct engine *e){ - -// struct part *ptmps; -// ptmps=c->hydro.parts; - -// memcpy(&rho[0], &parts_soa_buffer.rho[local_pack_position], count * sizeof(float)); -// fprintf(stderr, "count %i\n", count); -// memcpy(rho, &parts_soa_buffer.rho[local_pack_position], count * sizeof(float)); -// memcpy(rho_dh, &parts_soa_buffer.rho_dh[local_pack_position], count * sizeof(float)); -// memcpy(wcount, &parts_soa_buffer.wcount[local_pack_position], count * sizeof(float)); -// memcpy(wcount_dh, &parts_soa_buffer.wcount_dh[local_pack_position], count * sizeof(float)); -// memcpy(div_v, &parts_soa_buffer.div_v[local_pack_position], count * sizeof(float)); -// memcpy(rot_ux, &parts_soa_buffer.rot_ux[local_pack_position], count * sizeof(float)); -// memcpy(rot_uy, &parts_soa_buffer.rot_uy[local_pack_position], count * sizeof(float)); -// memcpy(rot_uz, &parts_soa_buffer.rot_uz[local_pack_position], count * sizeof(float)); - float *rho = &parts_soa_buffer.rho[local_pack_position];// = calloc(count, sizeof(float));// - float *rho_dh = &parts_soa_buffer.rho_dh[local_pack_position];// = calloc(count, sizeof(float));// - float *wcount = &parts_soa_buffer.wcount[local_pack_position];// = calloc(count, sizeof(float));// - float *wcount_dh = &parts_soa_buffer.wcount_dh[local_pack_position];// = calloc(count, sizeof(float));// - float *div_v = &parts_soa_buffer.div_v[local_pack_position];// = calloc(count, sizeof(float));// - float *rot_ux = &parts_soa_buffer.rot_ux[local_pack_position];// = calloc(count, sizeof(float));// - float *rot_uy = &parts_soa_buffer.rot_uy[local_pack_position];// = calloc(count, sizeof(float));// - float *rot_uz = &parts_soa_buffer.rot_uz[local_pack_position];// = calloc(count, sizeof(float));// - -// fprintf(stderr, "rho %f rho %f\n", rho[1], parts_soa_buffer.rho[local_pack_position+1]); +void unpack_neat(struct cell *c, struct part_soa parts_soa_buffer, int tid, + int local_pack_position, int count, struct engine *e) { + + // struct part *ptmps; + // ptmps=c->hydro.parts; + + // memcpy(&rho[0], &parts_soa_buffer.rho[local_pack_position], count * + // sizeof(float)); + // fprintf(stderr, "count %i\n", count); + // memcpy(rho, &parts_soa_buffer.rho[local_pack_position], count 
* + // sizeof(float)); memcpy(rho_dh, + // &parts_soa_buffer.rho_dh[local_pack_position], count * sizeof(float)); + // memcpy(wcount, &parts_soa_buffer.wcount[local_pack_position], count * + // sizeof(float)); memcpy(wcount_dh, + // &parts_soa_buffer.wcount_dh[local_pack_position], count * sizeof(float)); + // memcpy(div_v, &parts_soa_buffer.div_v[local_pack_position], count * + // sizeof(float)); memcpy(rot_ux, + // &parts_soa_buffer.rot_ux[local_pack_position], count * sizeof(float)); + // memcpy(rot_uy, &parts_soa_buffer.rot_uy[local_pack_position], count * + // sizeof(float)); memcpy(rot_uz, + // &parts_soa_buffer.rot_uz[local_pack_position], count * sizeof(float)); + float *rho = + &parts_soa_buffer + .rho[local_pack_position]; // = calloc(count, sizeof(float));// + float *rho_dh = + &parts_soa_buffer + .rho_dh[local_pack_position]; // = calloc(count, sizeof(float));// + float *wcount = + &parts_soa_buffer + .wcount[local_pack_position]; // = calloc(count, sizeof(float));// + float *wcount_dh = + &parts_soa_buffer.wcount_dh[local_pack_position]; // = calloc(count, + // sizeof(float));// + float *div_v = + &parts_soa_buffer + .div_v[local_pack_position]; // = calloc(count, sizeof(float));// + float *rot_ux = + &parts_soa_buffer + .rot_ux[local_pack_position]; // = calloc(count, sizeof(float));// + float *rot_uy = + &parts_soa_buffer + .rot_uy[local_pack_position]; // = calloc(count, sizeof(float));// + float *rot_uz = + &parts_soa_buffer + .rot_uz[local_pack_position]; // = calloc(count, sizeof(float));// + + // fprintf(stderr, "rho %f rho %f\n", rho[1], + // parts_soa_buffer.rho[local_pack_position+1]); for (int i = 0; i < count; i++) { -// int id_in_pack = i + local_pack_position; -// struct part *part_cpu = &c->hydro.parts[i]; + // int id_in_pack = i + local_pack_position; + // struct part *part_cpu = &c->hydro.parts[i]; struct part *pi = &c->hydro.parts[i]; -// if (part_is_inhibited(pi, e)) { -// fprintf(stderr, "inhibited part\n"); -// continue; -// } -// const int pi_active = part_is_active(pi, e); -// if (pi_active) { - pi->rho += rho[i]; -// c->hydro.parts[i].rho += parts_soa_buffer.rho[id_in_pack]; - pi->density.rho_dh += rho_dh[i]; - pi->density.wcount += wcount[i]; - pi->density.wcount_dh += wcount_dh[i]; - pi->viscosity.div_v += div_v[i]; - pi->density.rot_v[0] += rot_ux[i]; - pi->density.rot_v[1] += rot_uy[i]; - pi->density.rot_v[2] += rot_uz[i]; - -// c->hydro.parts[i].rho += rho[i]; -// c->hydro.parts[i].density.rho_dh += rho_dh[i]; -// c->hydro.parts[i].density.wcount += wcount[i]; -// c->hydro.parts[i].density.wcount_dh += wcount_dh[i]; -// c->hydro.parts[i].viscosity.div_v += div_v[i]; -// c->hydro.parts[i].density.rot_v[0] += rot_ux[i]; -// c->hydro.parts[i].density.rot_v[1] += rot_uy[i]; -// c->hydro.parts[i].density.rot_v[2] += rot_uz[i]; - -// c->hydro.parts[i].rho += parts_tmp->rho[i]; -// c->hydro.parts[i].density.rho_dh += parts_tmp->rho_dh[i]; -// c->hydro.parts[i].density.wcount += parts_tmp->wcount[i]; -// c->hydro.parts[i].density.wcount_dh += parts_tmp->wcount_dh[i]; -// c->hydro.parts[i].viscosity.div_v += parts_tmp->div_v[i]; -// c->hydro.parts[i].density.rot_v[0] += parts_tmp->rot_ux[i]; -// c->hydro.parts[i].density.rot_v[1] += parts_tmp->rot_uy[i]; -// c->hydro.parts[i].density.rot_v[2] += parts_tmp->rot_uz[i]; - -// part_cpu[i].rho += parts_soa_buffer.rho[i]; -// part_cpu[i].density.rho_dh += parts_soa_buffer.rho_dh[i]; -// part_cpu[i].density.wcount += parts_soa_buffer.wcount[i]; -// part_cpu[i].density.wcount_dh += parts_soa_buffer.wcount_dh[i]; 
-// part_cpu[i].viscosity.div_v += parts_soa_buffer.div_v[i]; -// part_cpu[i].density.rot_v[0] += parts_soa_buffer.rot_ux[i]; -// part_cpu[i].density.rot_v[1] += parts_soa_buffer.rot_uy[i]; -// part_cpu[i].density.rot_v[2] += parts_soa_buffer.rot_uz[i]; -// } -// else fprintf(stderr,"a part is not active\n"); - } -// c->hydro.parts=ptmps; + // if (part_is_inhibited(pi, e)) { + // fprintf(stderr, "inhibited part\n"); + // continue; + // } + // const int pi_active = part_is_active(pi, e); + // if (pi_active) { + pi->rho += rho[i]; + // c->hydro.parts[i].rho += parts_soa_buffer.rho[id_in_pack]; + pi->density.rho_dh += rho_dh[i]; + pi->density.wcount += wcount[i]; + pi->density.wcount_dh += wcount_dh[i]; + pi->viscosity.div_v += div_v[i]; + pi->density.rot_v[0] += rot_ux[i]; + pi->density.rot_v[1] += rot_uy[i]; + pi->density.rot_v[2] += rot_uz[i]; + + // c->hydro.parts[i].rho += rho[i]; + // c->hydro.parts[i].density.rho_dh += rho_dh[i]; + // c->hydro.parts[i].density.wcount += wcount[i]; + // c->hydro.parts[i].density.wcount_dh += wcount_dh[i]; + // c->hydro.parts[i].viscosity.div_v += div_v[i]; + // c->hydro.parts[i].density.rot_v[0] += rot_ux[i]; + // c->hydro.parts[i].density.rot_v[1] += rot_uy[i]; + // c->hydro.parts[i].density.rot_v[2] += rot_uz[i]; + + // c->hydro.parts[i].rho += parts_tmp->rho[i]; + // c->hydro.parts[i].density.rho_dh += parts_tmp->rho_dh[i]; + // c->hydro.parts[i].density.wcount += parts_tmp->wcount[i]; + // c->hydro.parts[i].density.wcount_dh += parts_tmp->wcount_dh[i]; + // c->hydro.parts[i].viscosity.div_v += parts_tmp->div_v[i]; + // c->hydro.parts[i].density.rot_v[0] += parts_tmp->rot_ux[i]; + // c->hydro.parts[i].density.rot_v[1] += parts_tmp->rot_uy[i]; + // c->hydro.parts[i].density.rot_v[2] += parts_tmp->rot_uz[i]; + + // part_cpu[i].rho += parts_soa_buffer.rho[i]; + // part_cpu[i].density.rho_dh += parts_soa_buffer.rho_dh[i]; + // part_cpu[i].density.wcount += parts_soa_buffer.wcount[i]; + // part_cpu[i].density.wcount_dh += parts_soa_buffer.wcount_dh[i]; + // part_cpu[i].viscosity.div_v += parts_soa_buffer.div_v[i]; + // part_cpu[i].density.rot_v[0] += parts_soa_buffer.rot_ux[i]; + // part_cpu[i].density.rot_v[1] += parts_soa_buffer.rot_uy[i]; + // part_cpu[i].density.rot_v[2] += parts_soa_buffer.rot_uz[i]; + // } + // else fprintf(stderr,"a part is not active\n"); + } + // c->hydro.parts=ptmps; } -void unpack_neat_aos(struct cell *c, struct part_aos *parts_aos_buffer, int tid, int local_pack_position, int count, struct engine *e){ - - // float *rho = &parts_aos_buffer[local_pack_position].rho;// = calloc(count, sizeof(float));// - // float *rho_dh = &parts_aos_buffer[local_pack_position].rho_dh;// = calloc(count, sizeof(float));// - // float *wcount = &parts_aos_buffer[local_pack_position].wcount;// = calloc(count, sizeof(float));// - // float *wcount_dh = &parts_aos_buffer[local_pack_position].wcount_dh;// = calloc(count, sizeof(float));// - // float *div_v = &parts_aos_buffer[local_pack_position].div_v;// = calloc(count, sizeof(float));// - // float *rot_ux = &parts_aos_buffer[local_pack_position].rot_ux;// = calloc(count, sizeof(float));// - // float *rot_uy = &parts_aos_buffer[local_pack_position].rot_uy;// = calloc(count, sizeof(float));// - // float *rot_uz = &parts_aos_buffer[local_pack_position].rot_uz;// = calloc(count, sizeof(float));// - struct part_aos *parts_tmp = &parts_aos_buffer[local_pack_position]; - for (int i = 0; i < count; i++) { - - struct part_aos p_tmp = parts_tmp[i]; - struct part *p = &c->hydro.parts[i]; - p->rho += p_tmp.rho; - 
p->density.rho_dh += p_tmp.rho_dh; - p->density.wcount += p_tmp.wcount; - p->density.wcount_dh += p_tmp.wcount_dh; - p->viscosity.div_v += p_tmp.div_v; - p->density.rot_v[0] += p_tmp.rot_ux; - p->density.rot_v[1] += p_tmp.rot_uy; - p->density.rot_v[2] += p_tmp.rot_uz; - } +void unpack_neat_aos(struct cell *c, struct part_aos *parts_aos_buffer, int tid, + int local_pack_position, int count, struct engine *e) { + + // float *rho = &parts_aos_buffer[local_pack_position].rho;// = calloc(count, + // sizeof(float));// float *rho_dh = + // &parts_aos_buffer[local_pack_position].rho_dh;// = calloc(count, + // sizeof(float));// float *wcount = + // &parts_aos_buffer[local_pack_position].wcount;// = calloc(count, + // sizeof(float));// float *wcount_dh = + // &parts_aos_buffer[local_pack_position].wcount_dh;// = calloc(count, + // sizeof(float));// float *div_v = + // &parts_aos_buffer[local_pack_position].div_v;// = calloc(count, + // sizeof(float));// float *rot_ux = + // &parts_aos_buffer[local_pack_position].rot_ux;// = calloc(count, + // sizeof(float));// float *rot_uy = + // &parts_aos_buffer[local_pack_position].rot_uy;// = calloc(count, + // sizeof(float));// float *rot_uz = + // &parts_aos_buffer[local_pack_position].rot_uz;// = calloc(count, + // sizeof(float));// + struct part_aos *parts_tmp = &parts_aos_buffer[local_pack_position]; + for (int i = 0; i < count; i++) { + + struct part_aos p_tmp = parts_tmp[i]; + struct part *p = &c->hydro.parts[i]; + p->rho += p_tmp.rho; + p->density.rho_dh += p_tmp.rho_dh; + p->density.wcount += p_tmp.wcount; + p->density.wcount_dh += p_tmp.wcount_dh; + p->viscosity.div_v += p_tmp.div_v; + p->density.rot_v[0] += p_tmp.rot_ux; + p->density.rot_v[1] += p_tmp.rot_uy; + p->density.rot_v[2] += p_tmp.rot_uz; + } } #include -void unpack_neat_aos_f4(struct cell *c, struct part_aos_f4_recv *parts_aos_buffer, int tid, int local_pack_position, int count, struct engine *e){ - - // float *rho = &parts_aos_buffer[local_pack_position].rho;// = calloc(count, sizeof(float));// - // float *rho_dh = &parts_aos_buffer[local_pack_position].rho_dh;// = calloc(count, sizeof(float));// - // float *wcount = &parts_aos_buffer[local_pack_position].wcount;// = calloc(count, sizeof(float));// - // float *wcount_dh = &parts_aos_buffer[local_pack_position].wcount_dh;// = calloc(count, sizeof(float));// - // float *div_v = &parts_aos_buffer[local_pack_position].div_v;// = calloc(count, sizeof(float));// - // float *rot_ux = &parts_aos_buffer[local_pack_position].rot_ux;// = calloc(count, sizeof(float));// - // float *rot_uy = &parts_aos_buffer[local_pack_position].rot_uy;// = calloc(count, sizeof(float));// - // float *rot_uz = &parts_aos_buffer[local_pack_position].rot_uz;// = calloc(count, sizeof(float));// - struct part_aos_f4_recv *parts_tmp = &parts_aos_buffer[local_pack_position]; - for (int i = 0; i < count; i++) { - - struct part_aos_f4_recv p_tmp = parts_tmp[i]; - float4 rho_dh_wcount = p_tmp.rho_dh_wcount; - float4 rot_ux_div_v = p_tmp.rot_ux_div_v; - struct part *p = &c->hydro.parts[i]; - - p->rho += rho_dh_wcount.x; - p->density.rho_dh += rho_dh_wcount.y; - p->density.wcount += rho_dh_wcount.z; - p->density.wcount_dh += rho_dh_wcount.w; - p->density.rot_v[0] += rot_ux_div_v.x; - p->density.rot_v[1] += rot_ux_div_v.y; - p->density.rot_v[2] += rot_ux_div_v.z; - p->viscosity.div_v += rot_ux_div_v.w; -// fprintf(stderr, "rho %f div_v %f\n", p_tmp.rho_dh_wcount.x, p_tmp.rot_ux_div_v.w); - } -} +void unpack_neat_aos_f4(struct cell *c, + struct part_aos_f4_recv *parts_aos_buffer, 
int tid, + int local_pack_position, int count, struct engine *e) { + + // float *rho = &parts_aos_buffer[local_pack_position].rho;// = calloc(count, + // sizeof(float));// float *rho_dh = + // &parts_aos_buffer[local_pack_position].rho_dh;// = calloc(count, + // sizeof(float));// float *wcount = + // &parts_aos_buffer[local_pack_position].wcount;// = calloc(count, + // sizeof(float));// float *wcount_dh = + // &parts_aos_buffer[local_pack_position].wcount_dh;// = calloc(count, + // sizeof(float));// float *div_v = + // &parts_aos_buffer[local_pack_position].div_v;// = calloc(count, + // sizeof(float));// float *rot_ux = + // &parts_aos_buffer[local_pack_position].rot_ux;// = calloc(count, + // sizeof(float));// float *rot_uy = + // &parts_aos_buffer[local_pack_position].rot_uy;// = calloc(count, + // sizeof(float));// float *rot_uz = + // &parts_aos_buffer[local_pack_position].rot_uz;// = calloc(count, + // sizeof(float));// + struct part_aos_f4_recv *parts_tmp = &parts_aos_buffer[local_pack_position]; + for (int i = 0; i < count; i++) { -void unpack_neat_aos_g(struct cell *c, struct part_aos_g *parts_aos_buffer, int tid, int local_pack_position, int count, struct engine *e){ + struct part_aos_f4_recv p_tmp = parts_tmp[i]; + float4 rho_dh_wcount = p_tmp.rho_dh_wcount; + float4 rot_ux_div_v = p_tmp.rot_ux_div_v; + struct part *p = &c->hydro.parts[i]; + + p->rho += rho_dh_wcount.x; + p->density.rho_dh += rho_dh_wcount.y; + p->density.wcount += rho_dh_wcount.z; + p->density.wcount_dh += rho_dh_wcount.w; + p->density.rot_v[0] += rot_ux_div_v.x; + p->density.rot_v[1] += rot_ux_div_v.y; + p->density.rot_v[2] += rot_ux_div_v.z; + p->viscosity.div_v += rot_ux_div_v.w; + // fprintf(stderr, "rho %f div_v %f\n", p_tmp.rho_dh_wcount.x, + //p_tmp.rot_ux_div_v.w); + } +} - struct part_aos_g *parts_tmp = &parts_aos_buffer[local_pack_position]; - for (int i = 0; i < count; i++) { - struct part_aos_g p_tmp = parts_tmp[i]; - struct part *p = &c->hydro.parts[i]; - const float v_sig = p->viscosity.v_sig; - p->viscosity.v_sig = max(p_tmp.v_sig, v_sig); - p->diffusion.laplace_u += p_tmp.laplace_u; - const float max_ngb = p->force.alpha_visc_max_ngb; - p->force.alpha_visc_max_ngb = max(p_tmp.alpha_visc_max_ngb, max_ngb); - } +void unpack_neat_aos_g(struct cell *c, struct part_aos_g *parts_aos_buffer, + int tid, int local_pack_position, int count, + struct engine *e) { + struct part_aos_g *parts_tmp = &parts_aos_buffer[local_pack_position]; + for (int i = 0; i < count; i++) { + struct part_aos_g p_tmp = parts_tmp[i]; + struct part *p = &c->hydro.parts[i]; + const float v_sig = p->viscosity.v_sig; + p->viscosity.v_sig = max(p_tmp.v_sig, v_sig); + p->diffusion.laplace_u += p_tmp.laplace_u; + const float max_ngb = p->force.alpha_visc_max_ngb; + p->force.alpha_visc_max_ngb = max(p_tmp.alpha_visc_max_ngb, max_ngb); + } } -void unpack_neat_aos_f4_g(struct cell *c, struct part_aos_f4_g_recv *parts_aos_buffer, int tid, int local_pack_position, int count, struct engine *e){ - - struct part_aos_f4_g_recv *parts_tmp = &parts_aos_buffer[local_pack_position]; - for (int i = 0; i < count; i++) { - struct part_aos_f4_g_recv p_tmp = parts_tmp[i]; - struct part *p = &c->hydro.parts[i]; - const float v_sig = p->viscosity.v_sig; - p->viscosity.v_sig = fmaxf(p_tmp.vsig_lapu_aviscmax.x, v_sig); - p->diffusion.laplace_u += p_tmp.vsig_lapu_aviscmax.y; - const float max_ngb = p->force.alpha_visc_max_ngb; - p->force.alpha_visc_max_ngb = fmaxf(p_tmp.vsig_lapu_aviscmax.z, max_ngb); - } +void unpack_neat_aos_f4_g(struct cell *c, + struct 
part_aos_f4_g_recv *parts_aos_buffer, int tid, + int local_pack_position, int count, + struct engine *e) { + struct part_aos_f4_g_recv *parts_tmp = &parts_aos_buffer[local_pack_position]; + for (int i = 0; i < count; i++) { + struct part_aos_f4_g_recv p_tmp = parts_tmp[i]; + struct part *p = &c->hydro.parts[i]; + const float v_sig = p->viscosity.v_sig; + p->viscosity.v_sig = fmaxf(p_tmp.vsig_lapu_aviscmax.x, v_sig); + p->diffusion.laplace_u += p_tmp.vsig_lapu_aviscmax.y; + const float max_ngb = p->force.alpha_visc_max_ngb; + p->force.alpha_visc_max_ngb = fmaxf(p_tmp.vsig_lapu_aviscmax.z, max_ngb); + } } -void unpack_neat_aos_f(struct cell *c, struct part_aos_f *parts_aos_buffer, int tid, int local_pack_position, int count, struct engine *e){ - - struct part_aos_f *parts_tmp = &parts_aos_buffer[local_pack_position]; - for (int i = 0; i < count; i++) { - struct part_aos_f p_tmp = parts_tmp[i]; - struct part *p = &c->hydro.parts[i]; - p->a_hydro[0] += p_tmp.a_hydrox; - p->a_hydro[1] += p_tmp.a_hydroy; - p->a_hydro[2] += p_tmp.a_hydroz; - p->u_dt += p_tmp.u_dt; - p->force.h_dt += p_tmp.h_dt; -// p->limiter_data.min_ngb_time_bin = min(p_tmp.min_ngb_time_bin, p->limiter_data.min_ngb_time_bin); - p->limiter_data.min_ngb_time_bin = p_tmp.min_ngb_time_bin; - const float v_sig = p->viscosity.v_sig; - p->viscosity.v_sig = max(p_tmp.v_sig, v_sig); -// p->viscosity.v_sig = p_tmp.v_sig; - -// fprintf(stderr, "ax %f ay %f az %f\n", p_tmp.a_hydrox, p_tmp.a_hydroy, p_tmp.a_hydroz); - } +void unpack_neat_aos_f(struct cell *c, struct part_aos_f *parts_aos_buffer, + int tid, int local_pack_position, int count, + struct engine *e) { + struct part_aos_f *parts_tmp = &parts_aos_buffer[local_pack_position]; + for (int i = 0; i < count; i++) { + struct part_aos_f p_tmp = parts_tmp[i]; + struct part *p = &c->hydro.parts[i]; + p->a_hydro[0] += p_tmp.a_hydrox; + p->a_hydro[1] += p_tmp.a_hydroy; + p->a_hydro[2] += p_tmp.a_hydroz; + p->u_dt += p_tmp.u_dt; + p->force.h_dt += p_tmp.h_dt; + // p->limiter_data.min_ngb_time_bin = min(p_tmp.min_ngb_time_bin, + //p->limiter_data.min_ngb_time_bin); + p->limiter_data.min_ngb_time_bin = p_tmp.min_ngb_time_bin; + const float v_sig = p->viscosity.v_sig; + p->viscosity.v_sig = max(p_tmp.v_sig, v_sig); + // p->viscosity.v_sig = p_tmp.v_sig; + + // fprintf(stderr, "ax %f ay %f az %f\n", p_tmp.a_hydrox, + // p_tmp.a_hydroy, p_tmp.a_hydroz); + } } -void unpack_neat_aos_f4_f(struct cell *restrict c, struct part_aos_f4_f_recv *restrict parts_aos_buffer, int tid, int local_pack_position, int count, struct engine *e){ - -// struct part_aos_f4_f_recv *restrict parts_tmp = &parts_aos_buffer[local_pack_position]; - int pp = local_pack_position; - for (int i = 0; i < count; i++) { -// struct part_aos_f4_f_recv p_tmp = parts_tmp[i]; -// struct part *restrict p = &c->hydro.parts[i]; - c->hydro.parts[i].a_hydro[0] += parts_aos_buffer[i + pp].a_hydro.x; - c->hydro.parts[i].a_hydro[1] += parts_aos_buffer[i + pp].a_hydro.y; - c->hydro.parts[i].a_hydro[2] += parts_aos_buffer[i + pp].a_hydro.z; - } - for (int i = 0; i < count; i++) { - c->hydro.parts[i].viscosity.v_sig = fmaxf(parts_aos_buffer[i + pp].udt_hdt_vsig_mintimebin_ngb.z, c->hydro.parts[i].viscosity.v_sig); - c->hydro.parts[i].limiter_data.min_ngb_time_bin = (int)(parts_aos_buffer[i + pp].udt_hdt_vsig_mintimebin_ngb.w + 0.5f); - } - for (int i = 0; i < count; i++) { - c->hydro.parts[i].u_dt += parts_aos_buffer[i + pp].udt_hdt_vsig_mintimebin_ngb.x; - c->hydro.parts[i].force.h_dt += parts_aos_buffer[i + pp].udt_hdt_vsig_mintimebin_ngb.y; - } 
+void unpack_neat_aos_f4_f(struct cell *restrict c, + struct part_aos_f4_f_recv *restrict parts_aos_buffer, + int tid, int local_pack_position, int count, + struct engine *e) { + // struct part_aos_f4_f_recv *restrict parts_tmp = + //&parts_aos_buffer[local_pack_position]; + int pp = local_pack_position; + for (int i = 0; i < count; i++) { + // struct part_aos_f4_f_recv p_tmp = parts_tmp[i]; + // struct part *restrict p = &c->hydro.parts[i]; + c->hydro.parts[i].a_hydro[0] += parts_aos_buffer[i + pp].a_hydro.x; + c->hydro.parts[i].a_hydro[1] += parts_aos_buffer[i + pp].a_hydro.y; + c->hydro.parts[i].a_hydro[2] += parts_aos_buffer[i + pp].a_hydro.z; + } + for (int i = 0; i < count; i++) { + c->hydro.parts[i].viscosity.v_sig = + fmaxf(parts_aos_buffer[i + pp].udt_hdt_vsig_mintimebin_ngb.z, + c->hydro.parts[i].viscosity.v_sig); + c->hydro.parts[i].limiter_data.min_ngb_time_bin = + (int)(parts_aos_buffer[i + pp].udt_hdt_vsig_mintimebin_ngb.w + 0.5f); + } + for (int i = 0; i < count; i++) { + c->hydro.parts[i].u_dt += + parts_aos_buffer[i + pp].udt_hdt_vsig_mintimebin_ngb.x; + c->hydro.parts[i].force.h_dt += + parts_aos_buffer[i + pp].udt_hdt_vsig_mintimebin_ngb.y; + } } -void unpack_neat_pair(struct runner *r, struct cell *c, struct part_soa parts_soa_buffer, int tid, int local_pack_position, int count, struct engine *e){ +void unpack_neat_pair(struct runner *r, struct cell *c, + struct part_soa parts_soa_buffer, int tid, + int local_pack_position, int count, struct engine *e) { -// struct part *ptmps; -// ptmps=c->hydro.parts; + // struct part *ptmps; + // ptmps=c->hydro.parts; for (int i = 0; i < count; i++) { int id_in_pack = i + local_pack_position; -// struct part *pi = &c->hydro.parts[i]; -// if (part_is_inhibited(pi, e)) { -// fprintf(stderr, "inhibited part\n"); -// continue; -// } -// const int pi_active = part_is_active(pi, e); -// if (pi_active) { - c->hydro.parts[i].rho += parts_soa_buffer.rho[id_in_pack]; - c->hydro.parts[i].density.rho_dh += parts_soa_buffer.rho_dh[id_in_pack]; - c->hydro.parts[i].density.wcount += parts_soa_buffer.wcount[id_in_pack]; - c->hydro.parts[i].density.wcount_dh += parts_soa_buffer.wcount_dh[id_in_pack]; - c->hydro.parts[i].viscosity.div_v += parts_soa_buffer.div_v[id_in_pack]; - c->hydro.parts[i].density.rot_v[0] += parts_soa_buffer.rot_ux[id_in_pack]; - c->hydro.parts[i].density.rot_v[1] += parts_soa_buffer.rot_uy[id_in_pack]; - c->hydro.parts[i].density.rot_v[2] += parts_soa_buffer.rot_uz[id_in_pack]; -// if(r->cpuid == 0)fprintf(stderr, "i %i rho %lf\n", i, parts_soa_buffer.rho[id_in_pack]); -// } -// else fprintf(stderr,"a part is not active\n"); - } -// c->hydro.parts=ptmps; + // struct part *pi = &c->hydro.parts[i]; + // if (part_is_inhibited(pi, e)) { + // fprintf(stderr, "inhibited part\n"); + // continue; + // } + // const int pi_active = part_is_active(pi, e); + // if (pi_active) { + c->hydro.parts[i].rho += parts_soa_buffer.rho[id_in_pack]; + c->hydro.parts[i].density.rho_dh += parts_soa_buffer.rho_dh[id_in_pack]; + c->hydro.parts[i].density.wcount += parts_soa_buffer.wcount[id_in_pack]; + c->hydro.parts[i].density.wcount_dh += + parts_soa_buffer.wcount_dh[id_in_pack]; + c->hydro.parts[i].viscosity.div_v += parts_soa_buffer.div_v[id_in_pack]; + c->hydro.parts[i].density.rot_v[0] += parts_soa_buffer.rot_ux[id_in_pack]; + c->hydro.parts[i].density.rot_v[1] += parts_soa_buffer.rot_uy[id_in_pack]; + c->hydro.parts[i].density.rot_v[2] += parts_soa_buffer.rot_uz[id_in_pack]; + // if(r->cpuid == 0)fprintf(stderr, "i %i rho %lf\n", i, + // 
parts_soa_buffer.rho[id_in_pack]); + // } + // else fprintf(stderr,"a part is not active\n"); + } + // c->hydro.parts=ptmps; } -void unpack_neat_pair_aos(struct runner *r, struct cell *c, struct part_aos *parts_aos_buffer, int tid, int local_pack_position, int count, struct engine *e){ - -// float *rho = &parts_aos_buffer[local_pack_position].rho;// = calloc(count, sizeof(float));// -// float *rho_dh = &parts_aos_buffer[local_pack_position].rho_dh;// = calloc(count, sizeof(float));// -// float *wcount = &parts_aos_buffer[local_pack_position].wcount;// = calloc(count, sizeof(float));// -// float *wcount_dh = &parts_aos_buffer[local_pack_position].wcount_dh;// = calloc(count, sizeof(float));// -// float *div_v = &parts_aos_buffer[local_pack_position].div_v;// = calloc(count, sizeof(float));// -// float *rot_ux = &parts_aos_buffer[local_pack_position].rot_ux;// = calloc(count, sizeof(float));// -// float *rot_uy = &parts_aos_buffer[local_pack_position].rot_uy;// = calloc(count, sizeof(float));// -// float *rot_uz = &parts_aos_buffer[local_pack_position].rot_uz;// = calloc(count, sizeof(float));// +void unpack_neat_pair_aos(struct runner *r, struct cell *c, + struct part_aos *parts_aos_buffer, int tid, + int local_pack_position, int count, + struct engine *e) { + + // float *rho = &parts_aos_buffer[local_pack_position].rho;// = calloc(count, + // sizeof(float));// float *rho_dh = + // &parts_aos_buffer[local_pack_position].rho_dh;// = calloc(count, + // sizeof(float));// float *wcount = + // &parts_aos_buffer[local_pack_position].wcount;// = calloc(count, + // sizeof(float));// float *wcount_dh = + // &parts_aos_buffer[local_pack_position].wcount_dh;// = calloc(count, + // sizeof(float));// float *div_v = + // &parts_aos_buffer[local_pack_position].div_v;// = calloc(count, + // sizeof(float));// float *rot_ux = + // &parts_aos_buffer[local_pack_position].rot_ux;// = calloc(count, + // sizeof(float));// float *rot_uy = + // &parts_aos_buffer[local_pack_position].rot_uy;// = calloc(count, + // sizeof(float));// float *rot_uz = + // &parts_aos_buffer[local_pack_position].rot_uz;// = calloc(count, + // sizeof(float));// struct part_aos *parts_tmp = &parts_aos_buffer[local_pack_position]; -// struct part *ptmps; -// ptmps=c->hydro.parts; -// struct part *part_cpu = c->hydro.parts; + // struct part *ptmps; + // ptmps=c->hydro.parts; + // struct part *part_cpu = c->hydro.parts; for (int i = 0; i < count; i++) { -// int id_in_pack = i + local_pack_position; -// struct part_aos part_gpu = parts_aos_buffer[id_in_pack]; -// struct part *pi = &c->hydro.parts[i]; -// if (part_is_inhibited(pi, e)) { -// fprintf(stderr, "inhibited part\n"); -// continue; -// } -// const int pi_active = part_is_active(pi, e); -// if (pi_active) { -// if(parts_aos_buffer[id_in_pack].time_bin == 1000)(*count1000)++ ;//fprintf(stderr, "timebin %i\n", parts_aos_buffer[id_in_pack].time_bin); -// else if(parts_aos_buffer[id_in_pack].time_bin == 20)(*count20)++ ;//fprintf(stderr, "timebin %i\n", parts_aos_buffer[id_in_pack].time_bin); -// else fprintf(stderr, "not 20 or 1000\n"); -// - struct part_aos p_tmp = parts_tmp[i]; - struct part *p = &c->hydro.parts[i]; - p->rho += p_tmp.rho; - p->density.rho_dh += p_tmp.rho_dh; - p->density.wcount += p_tmp.wcount; - p->density.wcount_dh += p_tmp.wcount_dh; - p->viscosity.div_v += p_tmp.div_v; - p->density.rot_v[0] += p_tmp.rot_ux; - p->density.rot_v[1] += p_tmp.rot_uy; - p->density.rot_v[2] += p_tmp.rot_uz; - -// c->hydro.parts[i].rho += parts_aos_buffer[id_in_pack].rho; -// 
c->hydro.parts[i].density.rho_dh += parts_aos_buffer[id_in_pack].rho_dh; -// c->hydro.parts[i].density.wcount += parts_aos_buffer[id_in_pack].wcount; -// c->hydro.parts[i].density.wcount_dh += parts_aos_buffer[id_in_pack].wcount_dh; -// c->hydro.parts[i].viscosity.div_v += parts_aos_buffer[id_in_pack].div_v; -// c->hydro.parts[i].density.rot_v[0] += parts_aos_buffer[id_in_pack].rot_ux; -// c->hydro.parts[i].density.rot_v[1] += parts_aos_buffer[id_in_pack].rot_uy; -// c->hydro.parts[i].density.rot_v[2] += parts_aos_buffer[id_in_pack].rot_uz; - -// part_cpu[i].rho += part_gpu.rho; -// part_cpu[i].density.rho_dh += part_gpu.rho_dh; -// part_cpu[i].density.wcount += part_gpu.wcount; -// part_cpu[i].density.wcount_dh += part_gpu.wcount_dh; -// part_cpu[i].viscosity.div_v += part_gpu.div_v; -// part_cpu[i].density.rot_v[0] += part_gpu.rot_ux; -// part_cpu[i].density.rot_v[1] += part_gpu.rot_uy; -// part_cpu[i].density.rot_v[2] += part_gpu.rot_uz; -// if(r->cpuid == 0)fprintf(stderr, "i %i rho %lf\n", i, parts_soa_buffer.rho[id_in_pack]); -// } -// else fprintf(stderr,"a part is not active\n"); - } -// c->hydro.parts=ptmps; + // int id_in_pack = i + local_pack_position; + // struct part_aos part_gpu = parts_aos_buffer[id_in_pack]; + // struct part *pi = &c->hydro.parts[i]; + // if (part_is_inhibited(pi, e)) { + // fprintf(stderr, "inhibited part\n"); + // continue; + // } + // const int pi_active = part_is_active(pi, e); + // if (pi_active) { + // if(parts_aos_buffer[id_in_pack].time_bin == 1000)(*count1000)++ + // ;//fprintf(stderr, "timebin %i\n", + // parts_aos_buffer[id_in_pack].time_bin); else + // if(parts_aos_buffer[id_in_pack].time_bin == 20)(*count20)++ + // ;//fprintf(stderr, "timebin %i\n", + // parts_aos_buffer[id_in_pack].time_bin); else fprintf(stderr, "not 20 + // or 1000\n"); + // + struct part_aos p_tmp = parts_tmp[i]; + struct part *p = &c->hydro.parts[i]; + p->rho += p_tmp.rho; + p->density.rho_dh += p_tmp.rho_dh; + p->density.wcount += p_tmp.wcount; + p->density.wcount_dh += p_tmp.wcount_dh; + p->viscosity.div_v += p_tmp.div_v; + p->density.rot_v[0] += p_tmp.rot_ux; + p->density.rot_v[1] += p_tmp.rot_uy; + p->density.rot_v[2] += p_tmp.rot_uz; + + // c->hydro.parts[i].rho += parts_aos_buffer[id_in_pack].rho; + // c->hydro.parts[i].density.rho_dh += + // parts_aos_buffer[id_in_pack].rho_dh; + // c->hydro.parts[i].density.wcount += + // parts_aos_buffer[id_in_pack].wcount; + // c->hydro.parts[i].density.wcount_dh += + // parts_aos_buffer[id_in_pack].wcount_dh; + // c->hydro.parts[i].viscosity.div_v += + // parts_aos_buffer[id_in_pack].div_v; + // c->hydro.parts[i].density.rot_v[0] += + // parts_aos_buffer[id_in_pack].rot_ux; + // c->hydro.parts[i].density.rot_v[1] += + // parts_aos_buffer[id_in_pack].rot_uy; + // c->hydro.parts[i].density.rot_v[2] += + // parts_aos_buffer[id_in_pack].rot_uz; + + // part_cpu[i].rho += part_gpu.rho; + // part_cpu[i].density.rho_dh += part_gpu.rho_dh; + // part_cpu[i].density.wcount += part_gpu.wcount; + // part_cpu[i].density.wcount_dh += part_gpu.wcount_dh; + // part_cpu[i].viscosity.div_v += part_gpu.div_v; + // part_cpu[i].density.rot_v[0] += part_gpu.rot_ux; + // part_cpu[i].density.rot_v[1] += part_gpu.rot_uy; + // part_cpu[i].density.rot_v[2] += part_gpu.rot_uz; + // if(r->cpuid == 0)fprintf(stderr, "i %i rho %lf\n", i, + // parts_soa_buffer.rho[id_in_pack]); + // } + // else fprintf(stderr,"a part is not active\n"); + } + // c->hydro.parts=ptmps; } -void unpack_neat_pair_aos_f4(struct runner *r, struct cell * restrict c, struct 
part_aos_f4_recv * restrict parts_aos_buffer, int tid, int local_pack_position, - int count, struct engine *e){ +void unpack_neat_pair_aos_f4(struct runner *r, struct cell *restrict c, + struct part_aos_f4_recv *restrict parts_aos_buffer, + int tid, int local_pack_position, int count, + struct engine *e) { -// struct part_aos_f4_recv * restrict parts_tmp = &parts_aos_buffer[local_pack_position]; - if (cell_is_active_hydro(c, e)){ - int pp = local_pack_position; - for (int i = 0; i < count; i++) { - int j = i + pp; + // struct part_aos_f4_recv * restrict parts_tmp = + // &parts_aos_buffer[local_pack_position]; + if (cell_is_active_hydro(c, e)) { + int pp = local_pack_position; + for (int i = 0; i < count; i++) { + int j = i + pp; c->hydro.parts[i].rho += parts_aos_buffer[j].rho_dh_wcount.x; c->hydro.parts[i].density.rho_dh += parts_aos_buffer[j].rho_dh_wcount.y; c->hydro.parts[i].density.wcount += parts_aos_buffer[j].rho_dh_wcount.z; - c->hydro.parts[i].density.wcount_dh += parts_aos_buffer[j].rho_dh_wcount.w; + c->hydro.parts[i].density.wcount_dh += + parts_aos_buffer[j].rho_dh_wcount.w; c->hydro.parts[i].density.rot_v[0] += parts_aos_buffer[j].rot_ux_div_v.x; c->hydro.parts[i].density.rot_v[1] += parts_aos_buffer[j].rot_ux_div_v.y; c->hydro.parts[i].density.rot_v[2] += parts_aos_buffer[j].rot_ux_div_v.z; c->hydro.parts[i].viscosity.div_v += parts_aos_buffer[j].rot_ux_div_v.w; + } } - } - } -void unpack_neat_pair_aos_g(struct runner *r, struct cell *c, struct part_aos_g *parts_aos_buffer, int tid, int local_pack_position, int count, struct engine *e){ +void unpack_neat_pair_aos_g(struct runner *r, struct cell *c, + struct part_aos_g *parts_aos_buffer, int tid, + int local_pack_position, int count, + struct engine *e) { struct part_aos_g *parts_tmp = &parts_aos_buffer[local_pack_position]; for (int i = 0; i < count; i++) { - struct part_aos_g p_tmp = parts_tmp[i]; - struct part *p = &c->hydro.parts[i]; - p->viscosity.v_sig = p_tmp.v_sig; - p->diffusion.laplace_u += p_tmp.laplace_u; - p->force.alpha_visc_max_ngb = p_tmp.alpha_visc_max_ngb; + struct part_aos_g p_tmp = parts_tmp[i]; + struct part *p = &c->hydro.parts[i]; + p->viscosity.v_sig = p_tmp.v_sig; + p->diffusion.laplace_u += p_tmp.laplace_u; + p->force.alpha_visc_max_ngb = p_tmp.alpha_visc_max_ngb; } } -void unpack_neat_pair_aos_f4_g(struct runner *r, struct cell * restrict c, struct part_aos_f4_g_recv * restrict parts_aos_buffer, int tid, int local_pack_position, - int count, struct engine *e){ -// struct part_aos_f4_recv * restrict parts_tmp = &parts_aos_buffer[local_pack_position]; -// int pp = local_pack_position; -// for (int i = 0; i < count; i++) { -// int j = i + pp; -// c->hydro.parts[i].viscosity.v_sig = parts_aos_buffer[j].vsig_lapu_aviscmax.x; -// c->hydro.parts[i].diffusion.laplace_u += parts_aos_buffer[j].vsig_lapu_aviscmax.y; -// c->hydro.parts[i].force.alpha_visc_max_ngb = parts_aos_buffer[j].vsig_lapu_aviscmax.z; -// } - if (cell_is_active_hydro(c, e)){ +void unpack_neat_pair_aos_f4_g( + struct runner *r, struct cell *restrict c, + struct part_aos_f4_g_recv *restrict parts_aos_buffer, int tid, + int local_pack_position, int count, struct engine *e) { + // struct part_aos_f4_recv * restrict parts_tmp = + // &parts_aos_buffer[local_pack_position]; int pp = local_pack_position; for + // (int i = 0; i < count; i++) { + // int j = i + pp; + // c->hydro.parts[i].viscosity.v_sig = + //parts_aos_buffer[j].vsig_lapu_aviscmax.x; + // c->hydro.parts[i].diffusion.laplace_u += + //parts_aos_buffer[j].vsig_lapu_aviscmax.y; + // 
c->hydro.parts[i].force.alpha_visc_max_ngb = + //parts_aos_buffer[j].vsig_lapu_aviscmax.z; + // } + if (cell_is_active_hydro(c, e)) { - struct part_aos_f4_g_recv *parts_tmp = &parts_aos_buffer[local_pack_position]; - for (int i = 0; i < count; i++) { + struct part_aos_f4_g_recv *parts_tmp = + &parts_aos_buffer[local_pack_position]; + for (int i = 0; i < count; i++) { struct part_aos_f4_g_recv p_tmp = parts_tmp[i]; struct part *p = &c->hydro.parts[i]; const float v_sig = p->viscosity.v_sig; @@ -1096,64 +1232,78 @@ void unpack_neat_pair_aos_f4_g(struct runner *r, struct cell * restrict c, struc p->diffusion.laplace_u += p_tmp.vsig_lapu_aviscmax.y; const float max_ngb = p->force.alpha_visc_max_ngb; p->force.alpha_visc_max_ngb = fmaxf(p_tmp.vsig_lapu_aviscmax.z, max_ngb); - } + } } } - -void unpack_neat_pair_aos_f(struct runner *r, struct cell *c, struct part_aos_f *parts_aos_buffer, int tid, int local_pack_position, int count, struct engine *e){ +void unpack_neat_pair_aos_f(struct runner *r, struct cell *c, + struct part_aos_f *parts_aos_buffer, int tid, + int local_pack_position, int count, + struct engine *e) { struct part_aos_f *parts_tmp = &parts_aos_buffer[local_pack_position]; for (int i = 0; i < count; i++) { - struct part_aos_f p_tmp = parts_tmp[i]; - struct part *p = &c->hydro.parts[i]; - p->a_hydro[0] += p_tmp.a_hydrox; - p->a_hydro[1] += p_tmp.a_hydroy; - p->a_hydro[2] += p_tmp.a_hydroz; - p->u_dt += p_tmp.u_dt; - p->force.h_dt += p_tmp.h_dt; - const float v_sig = p->viscosity.v_sig; - p->viscosity.v_sig = max(p_tmp.v_sig, v_sig); - p->limiter_data.min_ngb_time_bin = p_tmp.min_ngb_time_bin; -// p->viscosity.v_sig = p_tmp.v_sig; + struct part_aos_f p_tmp = parts_tmp[i]; + struct part *p = &c->hydro.parts[i]; + p->a_hydro[0] += p_tmp.a_hydrox; + p->a_hydro[1] += p_tmp.a_hydroy; + p->a_hydro[2] += p_tmp.a_hydroz; + p->u_dt += p_tmp.u_dt; + p->force.h_dt += p_tmp.h_dt; + const float v_sig = p->viscosity.v_sig; + p->viscosity.v_sig = max(p_tmp.v_sig, v_sig); + p->limiter_data.min_ngb_time_bin = p_tmp.min_ngb_time_bin; + // p->viscosity.v_sig = p_tmp.v_sig; } } -void unpack_neat_pair_aos_f4_f(struct runner *r, struct cell * restrict c, struct part_aos_f4_f_recv * restrict parts_aos_buffer, int tid, int local_pack_position, - int count, struct engine *e){ -// struct part_aos_f4_f_recv *restrict parts_tmp = &parts_aos_buffer[local_pack_position]; - if (cell_is_active_hydro(c, e)){ +void unpack_neat_pair_aos_f4_f( + struct runner *r, struct cell *restrict c, + struct part_aos_f4_f_recv *restrict parts_aos_buffer, int tid, + int local_pack_position, int count, struct engine *e) { + // struct part_aos_f4_f_recv *restrict parts_tmp = + //&parts_aos_buffer[local_pack_position]; + if (cell_is_active_hydro(c, e)) { int pp = local_pack_position; - for (int i = 0; i < count; i++) { -// struct part_aos_f4_f_recv p_tmp = parts_tmp[i]; -// struct part *restrict p = &c->hydro.parts[i]; - int j = i + pp; + for (int i = 0; i < count; i++) { + // struct part_aos_f4_f_recv p_tmp = parts_tmp[i]; + // struct part *restrict p = &c->hydro.parts[i]; + int j = i + pp; c->hydro.parts[i].a_hydro[0] += parts_aos_buffer[j].a_hydro.x; - c->hydro.parts[i].a_hydro[1] += parts_aos_buffer[j].a_hydro.y; - c->hydro.parts[i].a_hydro[2] += parts_aos_buffer[j].a_hydro.z; - c->hydro.parts[i].viscosity.v_sig = fmaxf(parts_aos_buffer[j].udt_hdt_vsig_mintimebin_ngb.z, c->hydro.parts[i].viscosity.v_sig); - c->hydro.parts[i].limiter_data.min_ngb_time_bin = (int)(parts_aos_buffer[j].udt_hdt_vsig_mintimebin_ngb.w + 0.5f); - 
c->hydro.parts[i].u_dt += parts_aos_buffer[j].udt_hdt_vsig_mintimebin_ngb.x; - c->hydro.parts[i].force.h_dt += parts_aos_buffer[j].udt_hdt_vsig_mintimebin_ngb.y; - } - } + c->hydro.parts[i].a_hydro[1] += parts_aos_buffer[j].a_hydro.y; + c->hydro.parts[i].a_hydro[2] += parts_aos_buffer[j].a_hydro.z; + c->hydro.parts[i].viscosity.v_sig = + fmaxf(parts_aos_buffer[j].udt_hdt_vsig_mintimebin_ngb.z, + c->hydro.parts[i].viscosity.v_sig); + c->hydro.parts[i].limiter_data.min_ngb_time_bin = + (int)(parts_aos_buffer[j].udt_hdt_vsig_mintimebin_ngb.w + 0.5f); + c->hydro.parts[i].u_dt += + parts_aos_buffer[j].udt_hdt_vsig_mintimebin_ngb.x; + c->hydro.parts[i].force.h_dt += + parts_aos_buffer[j].udt_hdt_vsig_mintimebin_ngb.y; + } + } } -void runner_do_ci_cj_gpu_unpack_neat(struct runner *r, struct cell *ci, struct cell *cj, struct part_soa parts_soa_buffer, -int timer, int *pack_length, int tid, int count_max_parts_tmp, struct engine *e) { +void runner_do_ci_cj_gpu_unpack_neat(struct runner *r, struct cell *ci, + struct cell *cj, + struct part_soa parts_soa_buffer, + int timer, int *pack_length, int tid, + int count_max_parts_tmp, + struct engine *e) { TIMER_TIC; /* Anything to do here? */ -// if (c->hydro.count == 0) -// return; - if (!cell_is_active_hydro(ci, e)){ - message("Inactive cell\n"); - return; + // if (c->hydro.count == 0) + // return; + if (!cell_is_active_hydro(ci, e)) { + message("Inactive cell\n"); + return; } int count_ci = ci->hydro.count; int count_cj = cj->hydro.count; int local_pack_position = (*pack_length); -# ifdef SWIFT_DEBUG_CHECKS +#ifdef SWIFT_DEBUG_CHECKS if (local_pack_position + count_ci + count_cj >= count_max_parts_tmp) { fprintf(stderr, "Exceeded count_max_parts_tmp. Make arrays bigger! pack_length is " @@ -1164,35 +1314,42 @@ int timer, int *pack_length, int tid, int count_max_parts_tmp, struct engine *e) #endif /* Pack the particle data into CPU-side buffers*/ -// if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j %i\n", local_pack_position, count_ci, count_cj); - unpack_neat_pair(r, ci, parts_soa_buffer, tid, local_pack_position, count_ci, e); + // if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j + // %i\n", local_pack_position, count_ci, count_cj); + unpack_neat_pair(r, ci, parts_soa_buffer, tid, local_pack_position, count_ci, + e); local_pack_position += count_ci; /* Pack the particle data into CPU-side buffers*/ -// if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j %i\n", local_pack_position, count_ci, count_cj); - unpack_neat_pair(r, cj, parts_soa_buffer, tid, local_pack_position, count_cj, e); + // if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j + // %i\n", local_pack_position, count_ci, count_cj); + unpack_neat_pair(r, cj, parts_soa_buffer, tid, local_pack_position, count_cj, + e); /* Increment pack length accordingly */ (*pack_length) += count_ci + count_cj; - if (timer) - TIMER_TOC(timer_doself_gpu_pack); -// if(r->cpuid == 0)exit(0); + if (timer) TIMER_TOC(timer_doself_gpu_pack); + // if(r->cpuid == 0)exit(0); } -void runner_do_ci_cj_gpu_unpack_neat_aos(struct runner *r, struct cell *ci, struct cell *cj, struct part_aos *parts_aos_buffer, -int timer, int *pack_length, int tid, int count_max_parts_tmp, struct engine *e) { +void runner_do_ci_cj_gpu_unpack_neat_aos(struct runner *r, struct cell *ci, + struct cell *cj, + struct part_aos *parts_aos_buffer, + int timer, int *pack_length, int tid, + int count_max_parts_tmp, + struct engine *e) { /* Anything to do 
here? */ -// if (c->hydro.count == 0) -// return; - if (!cell_is_active_hydro(ci, e)){ - message("Inactive cell\n"); - return; + // if (c->hydro.count == 0) + // return; + if (!cell_is_active_hydro(ci, e)) { + message("Inactive cell\n"); + return; } int count_ci = ci->hydro.count; int count_cj = cj->hydro.count; int local_pack_position = (*pack_length); -# ifdef SWIFT_DEBUG_CHECKS +#ifdef SWIFT_DEBUG_CHECKS if (local_pack_position + count_ci + count_cj >= count_max_parts_tmp) { fprintf(stderr, "Exceeded count_max_parts_tmp. Make arrays bigger! pack_length is " @@ -1203,32 +1360,38 @@ int timer, int *pack_length, int tid, int count_max_parts_tmp, struct engine *e) #endif /* Pack the particle data into CPU-side buffers*/ -// if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j %i\n", local_pack_position, count_ci, count_cj); - unpack_neat_pair_aos(r, ci, parts_aos_buffer, tid, local_pack_position, count_ci, e); + // if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j + // %i\n", local_pack_position, count_ci, count_cj); + unpack_neat_pair_aos(r, ci, parts_aos_buffer, tid, local_pack_position, + count_ci, e); local_pack_position += count_ci; /* Pack the particle data into CPU-side buffers*/ -// if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j %i\n", local_pack_position, count_ci, count_cj); - unpack_neat_pair_aos(r, cj, parts_aos_buffer, tid, local_pack_position, count_cj, e); + // if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j + // %i\n", local_pack_position, count_ci, count_cj); + unpack_neat_pair_aos(r, cj, parts_aos_buffer, tid, local_pack_position, + count_cj, e); /* Increment pack length accordingly */ (*pack_length) += count_ci + count_cj; -// if(r->cpuid == 0)exit(0); + // if(r->cpuid == 0)exit(0); } -void runner_do_ci_cj_gpu_unpack_neat_aos_f4(struct runner *r, struct cell *ci, struct cell *cj, struct part_aos_f4_recv *parts_aos_buffer, -int timer, int *pack_length, int tid, int count_max_parts_tmp, struct engine *e) { +void runner_do_ci_cj_gpu_unpack_neat_aos_f4( + struct runner *r, struct cell *ci, struct cell *cj, + struct part_aos_f4_recv *parts_aos_buffer, int timer, int *pack_length, + int tid, int count_max_parts_tmp, struct engine *e) { /* Anything to do here? */ -// if (c->hydro.count == 0) -// return; - if (!cell_is_active_hydro(ci, e) && !cell_is_active_hydro(cj, e)){ - message("Inactive cell\n"); - return; + // if (c->hydro.count == 0) + // return; + if (!cell_is_active_hydro(ci, e) && !cell_is_active_hydro(cj, e)) { + message("Inactive cell\n"); + return; } int count_ci = ci->hydro.count; int count_cj = cj->hydro.count; int local_pack_position = (*pack_length); -# ifdef SWIFT_DEBUG_CHECKS +#ifdef SWIFT_DEBUG_CHECKS if (local_pack_position + count_ci + count_cj >= count_max_parts_tmp) { fprintf(stderr, "Exceeded count_max_parts_tmp. Make arrays bigger! 
pack_length is " @@ -1239,32 +1402,40 @@ int timer, int *pack_length, int tid, int count_max_parts_tmp, struct engine *e) #endif /* Pack the particle data into CPU-side buffers*/ -// if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j %i\n", local_pack_position, count_ci, count_cj); - unpack_neat_pair_aos_f4(r, ci, parts_aos_buffer, tid, local_pack_position, count_ci, e); + // if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j + // %i\n", local_pack_position, count_ci, count_cj); + unpack_neat_pair_aos_f4(r, ci, parts_aos_buffer, tid, local_pack_position, + count_ci, e); local_pack_position += count_ci; /* Pack the particle data into CPU-side buffers*/ -// if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j %i\n", local_pack_position, count_ci, count_cj); - unpack_neat_pair_aos_f4(r, cj, parts_aos_buffer, tid, local_pack_position, count_cj, e); + // if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j + // %i\n", local_pack_position, count_ci, count_cj); + unpack_neat_pair_aos_f4(r, cj, parts_aos_buffer, tid, local_pack_position, + count_cj, e); /* Increment pack length accordingly */ (*pack_length) += count_ci + count_cj; -// if(r->cpuid == 0)exit(0); + // if(r->cpuid == 0)exit(0); } -void runner_do_ci_cj_gpu_unpack_neat_aos_g(struct runner *r, struct cell *ci, struct cell *cj, struct part_aos_g *parts_aos_buffer, -int timer, int *pack_length, int tid, int count_max_parts_tmp, struct engine *e) { +void runner_do_ci_cj_gpu_unpack_neat_aos_g(struct runner *r, struct cell *ci, + struct cell *cj, + struct part_aos_g *parts_aos_buffer, + int timer, int *pack_length, int tid, + int count_max_parts_tmp, + struct engine *e) { /* Anything to do here? */ -// if (c->hydro.count == 0) -// return; - if (!cell_is_active_hydro(ci, e)){ - message("Inactive cell\n"); - return; + // if (c->hydro.count == 0) + // return; + if (!cell_is_active_hydro(ci, e)) { + message("Inactive cell\n"); + return; } int count_ci = ci->hydro.count; int count_cj = cj->hydro.count; int local_pack_position = (*pack_length); -# ifdef SWIFT_DEBUG_CHECKS +#ifdef SWIFT_DEBUG_CHECKS if (local_pack_position + count_ci + count_cj >= count_max_parts_tmp) { fprintf(stderr, "Exceeded count_max_parts_tmp. Make arrays bigger! 
pack_length is " @@ -1275,32 +1446,38 @@ int timer, int *pack_length, int tid, int count_max_parts_tmp, struct engine *e) #endif /* Pack the particle data into CPU-side buffers*/ -// if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j %i\n", local_pack_position, count_ci, count_cj); - unpack_neat_pair_aos_g(r, ci, parts_aos_buffer, tid, local_pack_position, count_ci, e); + // if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j + // %i\n", local_pack_position, count_ci, count_cj); + unpack_neat_pair_aos_g(r, ci, parts_aos_buffer, tid, local_pack_position, + count_ci, e); local_pack_position += count_ci; /* Pack the particle data into CPU-side buffers*/ -// if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j %i\n", local_pack_position, count_ci, count_cj); - unpack_neat_pair_aos_g(r, cj, parts_aos_buffer, tid, local_pack_position, count_cj, e); + // if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j + // %i\n", local_pack_position, count_ci, count_cj); + unpack_neat_pair_aos_g(r, cj, parts_aos_buffer, tid, local_pack_position, + count_cj, e); /* Increment pack length accordingly */ (*pack_length) += count_ci + count_cj; -// if(r->cpuid == 0)exit(0); + // if(r->cpuid == 0)exit(0); } -void runner_do_ci_cj_gpu_unpack_neat_aos_f4_g(struct runner *r, struct cell *ci, struct cell *cj, struct part_aos_f4_g_recv *parts_aos_buffer, -int timer, int *pack_length, int tid, int count_max_parts_tmp, struct engine *e) { +void runner_do_ci_cj_gpu_unpack_neat_aos_f4_g( + struct runner *r, struct cell *ci, struct cell *cj, + struct part_aos_f4_g_recv *parts_aos_buffer, int timer, int *pack_length, + int tid, int count_max_parts_tmp, struct engine *e) { /* Anything to do here? */ -// if (c->hydro.count == 0) -// return; - if (!cell_is_active_hydro(ci, e) && !cell_is_active_hydro(cj, e)){ - message("Inactive cell\n"); - return; + // if (c->hydro.count == 0) + // return; + if (!cell_is_active_hydro(ci, e) && !cell_is_active_hydro(cj, e)) { + message("Inactive cell\n"); + return; } int count_ci = ci->hydro.count; int count_cj = cj->hydro.count; int local_pack_position = (*pack_length); -# ifdef SWIFT_DEBUG_CHECKS +#ifdef SWIFT_DEBUG_CHECKS if (local_pack_position + count_ci + count_cj >= count_max_parts_tmp) { fprintf(stderr, "Exceeded count_max_parts_tmp. Make arrays bigger! 
pack_length is " @@ -1311,32 +1488,40 @@ int timer, int *pack_length, int tid, int count_max_parts_tmp, struct engine *e) #endif /* Pack the particle data into CPU-side buffers*/ -// if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j %i\n", local_pack_position, count_ci, count_cj); - unpack_neat_pair_aos_f4_g(r, ci, parts_aos_buffer, tid, local_pack_position, count_ci, e); + // if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j + // %i\n", local_pack_position, count_ci, count_cj); + unpack_neat_pair_aos_f4_g(r, ci, parts_aos_buffer, tid, local_pack_position, + count_ci, e); local_pack_position += count_ci; /* Pack the particle data into CPU-side buffers*/ -// if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j %i\n", local_pack_position, count_ci, count_cj); - unpack_neat_pair_aos_f4_g(r, cj, parts_aos_buffer, tid, local_pack_position, count_cj, e); + // if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j + // %i\n", local_pack_position, count_ci, count_cj); + unpack_neat_pair_aos_f4_g(r, cj, parts_aos_buffer, tid, local_pack_position, + count_cj, e); /* Increment pack length accordingly */ (*pack_length) += count_ci + count_cj; -// if(r->cpuid == 0)exit(0); + // if(r->cpuid == 0)exit(0); } -void runner_do_ci_cj_gpu_unpack_neat_aos_f(struct runner *r, struct cell *ci, struct cell *cj, struct part_aos_f *parts_aos_buffer, -int timer, int *pack_length, int tid, int count_max_parts_tmp, struct engine *e) { +void runner_do_ci_cj_gpu_unpack_neat_aos_f(struct runner *r, struct cell *ci, + struct cell *cj, + struct part_aos_f *parts_aos_buffer, + int timer, int *pack_length, int tid, + int count_max_parts_tmp, + struct engine *e) { /* Anything to do here? */ -// if (c->hydro.count == 0) -// return; - if (!cell_is_active_hydro(ci, e)){ - message("Inactive cell\n"); - return; + // if (c->hydro.count == 0) + // return; + if (!cell_is_active_hydro(ci, e)) { + message("Inactive cell\n"); + return; } int count_ci = ci->hydro.count; int count_cj = cj->hydro.count; int local_pack_position = (*pack_length); -# ifdef SWIFT_DEBUG_CHECKS +#ifdef SWIFT_DEBUG_CHECKS if (local_pack_position + count_ci + count_cj >= count_max_parts_tmp) { fprintf(stderr, "Exceeded count_max_parts_tmp. Make arrays bigger! 
pack_length is " @@ -1347,67 +1532,78 @@ int timer, int *pack_length, int tid, int count_max_parts_tmp, struct engine *e) #endif /* Pack the particle data into CPU-side buffers*/ -// if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j %i\n", local_pack_position, count_ci, count_cj); - unpack_neat_pair_aos_f(r, ci, parts_aos_buffer, tid, local_pack_position, count_ci, e); + // if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j + // %i\n", local_pack_position, count_ci, count_cj); + unpack_neat_pair_aos_f(r, ci, parts_aos_buffer, tid, local_pack_position, + count_ci, e); local_pack_position += count_ci; -// for (int i = 0; i < count_ci; i++){ -// struct part *p = &ci->hydro.parts[i]; -// fprintf(stderr, "ax %f, ay %f, az %f, u_dt %f, h_dt %f\n", p->a_hydro[0], p->a_hydro[1], p->a_hydro[2], p->u_dt, p->force.h_dt); -// } -// p->viscosity.v_sig = p_tmp.v_sig; + // for (int i = 0; i < count_ci; i++){ + // struct part *p = &ci->hydro.parts[i]; + // fprintf(stderr, "ax %f, ay %f, az %f, u_dt %f, h_dt %f\n", + // p->a_hydro[0], p->a_hydro[1], p->a_hydro[2], p->u_dt, p->force.h_dt); + // } + // p->viscosity.v_sig = p_tmp.v_sig; /* Pack the particle data into CPU-side buffers*/ -// if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j %i\n", local_pack_position, count_ci, count_cj); - unpack_neat_pair_aos_f(r, cj, parts_aos_buffer, tid, local_pack_position, count_cj, e); + // if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j + // %i\n", local_pack_position, count_ci, count_cj); + unpack_neat_pair_aos_f(r, cj, parts_aos_buffer, tid, local_pack_position, + count_cj, e); /* Increment pack length accordingly */ (*pack_length) += count_ci + count_cj; -// if(r->cpuid == 0)exit(0); + // if(r->cpuid == 0)exit(0); } -void runner_do_ci_cj_gpu_unpack_neat_aos_f4_f(struct runner *r, struct cell *ci, struct cell *cj, struct part_aos_f4_f_recv *parts_aos_buffer, -int timer, int *pack_length, int tid, int count_max_parts_tmp, struct engine *e) { - - /* Anything to do here? */ - // if (c->hydro.count == 0) - // return; - if (!cell_is_active_hydro(ci, e) && !cell_is_active_hydro(cj, e)){ - message("Inactive cell\n"); - return; - } - int count_ci = ci->hydro.count; - int count_cj = cj->hydro.count; - int local_pack_position = (*pack_length); - - # ifdef SWIFT_DEBUG_CHECKS - if (local_pack_position + count_ci + count_cj >= count_max_parts_tmp) { - fprintf(stderr, - "Exceeded count_max_parts_tmp. Make arrays bigger! 
pack_length is " - "%i pointer to pack_length is %i, local_pack_position is % i, " - "count is %i\n", - (*pack_length), pack_length, local_pack_position, count_ci, e); - } - #endif - - /* Pack the particle data into CPU-side buffers*/ - // if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j %i\n", local_pack_position, count_ci, count_cj); - unpack_neat_pair_aos_f4_f(r, ci, parts_aos_buffer, tid, local_pack_position, count_ci, e); - local_pack_position += count_ci; - /* Pack the particle data into CPU-side buffers*/ - // if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j %i\n", local_pack_position, count_ci, count_cj); - unpack_neat_pair_aos_f4_f(r, cj, parts_aos_buffer, tid, local_pack_position, count_cj, e); - /* Increment pack length accordingly */ - (*pack_length) += count_ci + count_cj; - // if(r->cpuid == 0)exit(0); +void runner_do_ci_cj_gpu_unpack_neat_aos_f4_f( + struct runner *r, struct cell *ci, struct cell *cj, + struct part_aos_f4_f_recv *parts_aos_buffer, int timer, int *pack_length, + int tid, int count_max_parts_tmp, struct engine *e) { + + /* Anything to do here? */ + // if (c->hydro.count == 0) + // return; + if (!cell_is_active_hydro(ci, e) && !cell_is_active_hydro(cj, e)) { + message("Inactive cell\n"); + return; + } + int count_ci = ci->hydro.count; + int count_cj = cj->hydro.count; + int local_pack_position = (*pack_length); + +#ifdef SWIFT_DEBUG_CHECKS + if (local_pack_position + count_ci + count_cj >= count_max_parts_tmp) { + fprintf(stderr, + "Exceeded count_max_parts_tmp. Make arrays bigger! pack_length is " + "%i pointer to pack_length is %i, local_pack_position is % i, " + "count is %i\n", + (*pack_length), pack_length, local_pack_position, count_ci, e); + } +#endif + + /* Pack the particle data into CPU-side buffers*/ + // if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j + // %i\n", local_pack_position, count_ci, count_cj); + unpack_neat_pair_aos_f4_f(r, ci, parts_aos_buffer, tid, local_pack_position, + count_ci, e); + local_pack_position += count_ci; + /* Pack the particle data into CPU-side buffers*/ + // if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j + // %i\n", local_pack_position, count_ci, count_cj); + unpack_neat_pair_aos_f4_f(r, cj, parts_aos_buffer, tid, local_pack_position, + count_cj, e); + /* Increment pack length accordingly */ + (*pack_length) += count_ci + count_cj; + // if(r->cpuid == 0)exit(0); } -void runner_dopair_gpu_pack_neat( - struct runner *r, struct cell *c, struct part_soa parts_soa_buffer, - int timer, int *pack_length, int tid, int count_max_parts_tmp) { +void runner_dopair_gpu_pack_neat(struct runner *r, struct cell *c, + struct part_soa parts_soa_buffer, int timer, + int *pack_length, int tid, + int count_max_parts_tmp) { TIMER_TIC; /* Anything to do here? 
*/ - if (c->hydro.count == 0) - return; + if (c->hydro.count == 0) return; int count = c->hydro.count; int local_pack_position = (*pack_length); @@ -1424,19 +1620,20 @@ void runner_dopair_gpu_pack_neat( /* Increment pack length accordingly */ (*pack_length) += count; - if (timer) - TIMER_TOC(timer_doself_gpu_pack); + if (timer) TIMER_TOC(timer_doself_gpu_pack); } -void runner_do_ci_cj_gpu_pack_neat( - struct runner *r, struct cell *ci, struct cell *cj, struct part_soa parts_soa_buffer, - int timer, int *pack_length, int tid, int count_max_parts_tmp, int count_ci, int count_cj) { +void runner_do_ci_cj_gpu_pack_neat(struct runner *r, struct cell *ci, + struct cell *cj, + struct part_soa parts_soa_buffer, int timer, + int *pack_length, int tid, + int count_max_parts_tmp, int count_ci, + int count_cj) { TIMER_TIC; /* Anything to do here? */ - if (ci->hydro.count == 0) - return; + if (ci->hydro.count == 0) return; int local_pack_position = (*pack_length); @@ -1455,103 +1652,118 @@ void runner_do_ci_cj_gpu_pack_neat( /* Increment pack length accordingly */ (*pack_length) += count_ci + count_cj; - if (timer) - TIMER_TOC(timer_doself_gpu_pack); + if (timer) TIMER_TOC(timer_doself_gpu_pack); } -void runner_do_ci_cj_gpu_pack_neat_aos( - struct runner *r, struct cell *ci, struct cell *cj, struct part_aos *parts_aos_buffer, - int timer, int *pack_length, int tid, int count_max_parts_tmp, int count_ci, int count_cj, float3 shift_tmp) { +void runner_do_ci_cj_gpu_pack_neat_aos(struct runner *r, struct cell *ci, + struct cell *cj, + struct part_aos *parts_aos_buffer, + int timer, int *pack_length, int tid, + int count_max_parts_tmp, int count_ci, + int count_cj, float3 shift_tmp) { TIMER_TIC; /* Anything to do here? */ - if (ci->hydro.count == 0) - return; + if (ci->hydro.count == 0) return; int local_pack_position = (*pack_length); #ifdef SWIFT_DEBUG_CHECKS if (local_pack_position + count_ci + count_cj >= 2 * count_max_parts_tmp) { - fprintf(stderr, "Exceeded count_max_parts_tmp. Make arrays bigger! Pack pos %i" - "ci %i cj %i count_max %i\n", local_pack_position, count_ci, count_cj, count_max_parts_tmp); + fprintf(stderr, + "Exceeded count_max_parts_tmp. Make arrays bigger! 
Pack pos %i" + "ci %i cj %i count_max %i\n", + local_pack_position, count_ci, count_cj, count_max_parts_tmp); error(); } #endif /* Pack the particle data into CPU-side buffers*/ - float3 shift_i = {shift_tmp.x + cj->loc[0], shift_tmp.y + cj->loc[1], shift_tmp.z + cj->loc[2]}; - pack_neat_pair_aos(ci, parts_aos_buffer, tid, local_pack_position, count_ci, shift_i); + float3 shift_i = {shift_tmp.x + cj->loc[0], shift_tmp.y + cj->loc[1], + shift_tmp.z + cj->loc[2]}; + pack_neat_pair_aos(ci, parts_aos_buffer, tid, local_pack_position, count_ci, + shift_i); local_pack_position += count_ci; /* Pack the particle data into CPU-side buffers*/ float3 shift_j = {cj->loc[0], cj->loc[1], cj->loc[2]}; - pack_neat_pair_aos(cj, parts_aos_buffer, tid, local_pack_position, count_cj, shift_j); + pack_neat_pair_aos(cj, parts_aos_buffer, tid, local_pack_position, count_cj, + shift_j); /* Increment pack length accordingly */ (*pack_length) += count_ci + count_cj; - if (timer) - TIMER_TOC(timer_doself_gpu_pack); + if (timer) TIMER_TOC(timer_doself_gpu_pack); } void runner_do_ci_cj_gpu_pack_neat_aos_f4( - struct runner *r, struct cell * restrict ci, struct cell * restrict cj, struct part_aos_f4_send * restrict parts_aos_buffer, - int timer, int *pack_length, int tid, int count_max_parts_tmp, const int count_ci, const int count_cj, float3 shift_tmp){ + struct runner *r, struct cell *restrict ci, struct cell *restrict cj, + struct part_aos_f4_send *restrict parts_aos_buffer, int timer, + int *pack_length, int tid, int count_max_parts_tmp, const int count_ci, + const int count_cj, float3 shift_tmp) { TIMER_TIC; /* Anything to do here? */ - if (ci->hydro.count == 0) - return; + if (ci->hydro.count == 0) return; int local_pack_position = (*pack_length); #ifdef SWIFT_DEBUG_CHECKS if (local_pack_position + count_ci + count_cj >= 2 * count_max_parts_tmp) { - fprintf(stderr, "Exceeded count_max_parts_tmp. Make arrays bigger! Pack pos %i" - "ci %i cj %i count_max %i\n", local_pack_position, count_ci, count_cj, count_max_parts_tmp); + fprintf(stderr, + "Exceeded count_max_parts_tmp. Make arrays bigger! 
Pack pos %i" + "ci %i cj %i count_max %i\n", + local_pack_position, count_ci, count_cj, count_max_parts_tmp); error(); } #endif /* Pack the particle data into CPU-side buffers*/ - const float3 shift_i = {shift_tmp.x + cj->loc[0], shift_tmp.y + cj->loc[1], shift_tmp.z + cj->loc[2]}; + const float3 shift_i = {shift_tmp.x + cj->loc[0], shift_tmp.y + cj->loc[1], + shift_tmp.z + cj->loc[2]}; const int lpp1 = local_pack_position; const int2 cis_cie = {local_pack_position, local_pack_position + count_ci}; - const int2 cjs_cje = {local_pack_position + count_ci, local_pack_position + count_ci + count_cj}; + const int2 cjs_cje = {local_pack_position + count_ci, + local_pack_position + count_ci + count_cj}; - pack_neat_pair_aos_f4(ci, parts_aos_buffer, tid, lpp1, count_ci, shift_i, cjs_cje); + pack_neat_pair_aos_f4(ci, parts_aos_buffer, tid, lpp1, count_ci, shift_i, + cjs_cje); local_pack_position += count_ci; /* Pack the particle data into CPU-side buffers*/ const float3 shift_j = {cj->loc[0], cj->loc[1], cj->loc[2]}; const int lpp2 = local_pack_position; - pack_neat_pair_aos_f4(cj, parts_aos_buffer, tid, lpp2, count_cj, shift_j, cis_cie); + pack_neat_pair_aos_f4(cj, parts_aos_buffer, tid, lpp2, count_cj, shift_j, + cis_cie); /* Increment pack length accordingly */ (*pack_length) += count_ci + count_cj; - if (timer) - TIMER_TOC(timer_doself_gpu_pack); + if (timer) TIMER_TOC(timer_doself_gpu_pack); } -void runner_do_ci_cj_gpu_pack_neat_aos_g( - struct runner *r, struct cell *ci, struct cell *cj, struct part_aos_g *parts_aos_buffer, - int timer, int *pack_length, int tid, int count_max_parts_tmp, int count_ci, int count_cj) { +void runner_do_ci_cj_gpu_pack_neat_aos_g(struct runner *r, struct cell *ci, + struct cell *cj, + struct part_aos_g *parts_aos_buffer, + int timer, int *pack_length, int tid, + int count_max_parts_tmp, int count_ci, + int count_cj) { TIMER_TIC; /* Anything to do here? */ - if (ci->hydro.count == 0) - return; + if (ci->hydro.count == 0) return; int local_pack_position = (*pack_length); #ifdef SWIFT_DEBUG_CHECKS if (local_pack_position + count_ci + count_cj >= 2 * count_max_parts_tmp) { - fprintf(stderr, "Exceeded count_max_parts_tmp. Make arrays bigger! Pack pos %i" - "ci %i cj %i count_max %i\n", local_pack_position, count_ci, count_cj, count_max_parts_tmp); + fprintf(stderr, + "Exceeded count_max_parts_tmp. Make arrays bigger! Pack pos %i" + "ci %i cj %i count_max %i\n", + local_pack_position, count_ci, count_cj, count_max_parts_tmp); error(); } #endif @@ -1564,70 +1776,78 @@ void runner_do_ci_cj_gpu_pack_neat_aos_g( /* Increment pack length accordingly */ (*pack_length) += count_ci + count_cj; - if (timer) - TIMER_TOC(timer_doself_gpu_pack); + if (timer) TIMER_TOC(timer_doself_gpu_pack); } - void runner_do_ci_cj_gpu_pack_neat_aos_f4_g( - struct runner *r, struct cell * restrict ci, struct cell * restrict cj, struct part_aos_f4_g_send * restrict parts_aos_buffer, - int timer, int *pack_length, int tid, int count_max_parts_tmp, const int count_ci, const int count_cj, float3 shift_tmp) { + struct runner *r, struct cell *restrict ci, struct cell *restrict cj, + struct part_aos_f4_g_send *restrict parts_aos_buffer, int timer, + int *pack_length, int tid, int count_max_parts_tmp, const int count_ci, + const int count_cj, float3 shift_tmp) { - TIMER_TIC; + TIMER_TIC; - /* Anything to do here? */ - if (ci->hydro.count == 0) - return; + /* Anything to do here? 
*/ + if (ci->hydro.count == 0) return; - int local_pack_position = (*pack_length); + int local_pack_position = (*pack_length); - #ifdef SWIFT_DEBUG_CHECKS - if (local_pack_position + count_ci + count_cj >= 2 * count_max_parts_tmp) { - fprintf(stderr, "Exceeded count_max_parts_tmp. Make arrays bigger! Pack pos %i" - "ci %i cj %i count_max %i\n", local_pack_position, count_ci, count_cj, count_max_parts_tmp); - error(); - } - #endif +#ifdef SWIFT_DEBUG_CHECKS + if (local_pack_position + count_ci + count_cj >= 2 * count_max_parts_tmp) { + fprintf(stderr, + "Exceeded count_max_parts_tmp. Make arrays bigger! Pack pos %i" + "ci %i cj %i count_max %i\n", + local_pack_position, count_ci, count_cj, count_max_parts_tmp); + error(); + } +#endif - /* Pack the particle data into CPU-side buffers*/ - const float3 shift_i = {shift_tmp.x + cj->loc[0], shift_tmp.y + cj->loc[1], shift_tmp.z + cj->loc[2]}; - const int lpp1 = local_pack_position; + /* Pack the particle data into CPU-side buffers*/ + const float3 shift_i = {shift_tmp.x + cj->loc[0], shift_tmp.y + cj->loc[1], + shift_tmp.z + cj->loc[2]}; + const int lpp1 = local_pack_position; - const int2 cis_cie = {local_pack_position, local_pack_position + count_ci}; + const int2 cis_cie = {local_pack_position, local_pack_position + count_ci}; - const int2 cjs_cje = {local_pack_position + count_ci, local_pack_position + count_ci + count_cj}; + const int2 cjs_cje = {local_pack_position + count_ci, + local_pack_position + count_ci + count_cj}; - pack_neat_pair_aos_f4_g(ci, parts_aos_buffer, tid, lpp1, count_ci, shift_i, cjs_cje); + pack_neat_pair_aos_f4_g(ci, parts_aos_buffer, tid, lpp1, count_ci, shift_i, + cjs_cje); - local_pack_position += count_ci; - /* Pack the particle data into CPU-side buffers*/ - const float3 shift_j = {cj->loc[0], cj->loc[1], cj->loc[2]}; - const int lpp2 = local_pack_position; + local_pack_position += count_ci; + /* Pack the particle data into CPU-side buffers*/ + const float3 shift_j = {cj->loc[0], cj->loc[1], cj->loc[2]}; + const int lpp2 = local_pack_position; - pack_neat_pair_aos_f4_g(cj, parts_aos_buffer, tid, lpp2, count_cj, shift_j, cis_cie); - /* Increment pack length accordingly */ - (*pack_length) += count_ci + count_cj; + pack_neat_pair_aos_f4_g(cj, parts_aos_buffer, tid, lpp2, count_cj, shift_j, + cis_cie); + /* Increment pack length accordingly */ + (*pack_length) += count_ci + count_cj; - if (timer) - TIMER_TOC(timer_doself_gpu_pack); + if (timer) TIMER_TOC(timer_doself_gpu_pack); } -void runner_do_ci_cj_gpu_pack_neat_aos_f( - struct runner *r, struct cell *ci, struct cell *cj, struct part_aos_f *parts_aos_buffer, - int timer, int *pack_length, int tid, int count_max_parts_tmp, int count_ci, int count_cj) { +void runner_do_ci_cj_gpu_pack_neat_aos_f(struct runner *r, struct cell *ci, + struct cell *cj, + struct part_aos_f *parts_aos_buffer, + int timer, int *pack_length, int tid, + int count_max_parts_tmp, int count_ci, + int count_cj) { TIMER_TIC; /* Anything to do here? */ - if (ci->hydro.count == 0) - return; + if (ci->hydro.count == 0) return; int local_pack_position = (*pack_length); #ifdef SWIFT_DEBUG_CHECKS if (local_pack_position + count_ci + count_cj >= 2 * count_max_parts_tmp) { - fprintf(stderr, "Exceeded count_max_parts_tmp. Make arrays bigger! Pack pos %i" - "ci %i cj %i count_max %i\n", local_pack_position, count_ci, count_cj, count_max_parts_tmp); + fprintf(stderr, + "Exceeded count_max_parts_tmp. Make arrays bigger! 
Pack pos %i" + "ci %i cj %i count_max %i\n", + local_pack_position, count_ci, count_cj, count_max_parts_tmp); error(); } #endif @@ -1640,51 +1860,56 @@ void runner_do_ci_cj_gpu_pack_neat_aos_f( /* Increment pack length accordingly */ (*pack_length) += count_ci + count_cj; - if (timer) - TIMER_TOC(timer_doself_gpu_pack); + if (timer) TIMER_TOC(timer_doself_gpu_pack); } void runner_do_ci_cj_gpu_pack_neat_aos_f4_f( - struct runner *r, struct cell * restrict ci, struct cell * restrict cj, struct part_aos_f4_f_send * restrict parts_aos_buffer, - int timer, int *pack_length, int tid, int count_max_parts_tmp, const int count_ci, const int count_cj, float3 shift_tmp) { + struct runner *r, struct cell *restrict ci, struct cell *restrict cj, + struct part_aos_f4_f_send *restrict parts_aos_buffer, int timer, + int *pack_length, int tid, int count_max_parts_tmp, const int count_ci, + const int count_cj, float3 shift_tmp) { - TIMER_TIC; + TIMER_TIC; - /* Anything to do here? */ - if (ci->hydro.count == 0) - return; + /* Anything to do here? */ + if (ci->hydro.count == 0) return; - int local_pack_position = (*pack_length); + int local_pack_position = (*pack_length); - #ifdef SWIFT_DEBUG_CHECKS - if (local_pack_position + count_ci + count_cj >= 2 * count_max_parts_tmp) { - fprintf(stderr, "Exceeded count_max_parts_tmp. Make arrays bigger! Pack pos %i" - "ci %i cj %i count_max %i\n", local_pack_position, count_ci, count_cj, count_max_parts_tmp); - error(); - } - #endif +#ifdef SWIFT_DEBUG_CHECKS + if (local_pack_position + count_ci + count_cj >= 2 * count_max_parts_tmp) { + fprintf(stderr, + "Exceeded count_max_parts_tmp. Make arrays bigger! Pack pos %i" + "ci %i cj %i count_max %i\n", + local_pack_position, count_ci, count_cj, count_max_parts_tmp); + error(); + } +#endif - /* Pack the particle data into CPU-side buffers*/ - const float3 shift_i = {shift_tmp.x + cj->loc[0], shift_tmp.y + cj->loc[1], shift_tmp.z + cj->loc[2]}; - const int lpp1 = local_pack_position; + /* Pack the particle data into CPU-side buffers*/ + const float3 shift_i = {shift_tmp.x + cj->loc[0], shift_tmp.y + cj->loc[1], + shift_tmp.z + cj->loc[2]}; + const int lpp1 = local_pack_position; - const int2 cis_cie = {local_pack_position, local_pack_position + count_ci}; + const int2 cis_cie = {local_pack_position, local_pack_position + count_ci}; - const int2 cjs_cje = {local_pack_position + count_ci, local_pack_position + count_ci + count_cj}; + const int2 cjs_cje = {local_pack_position + count_ci, + local_pack_position + count_ci + count_cj}; - pack_neat_pair_aos_f4_f(ci, parts_aos_buffer, tid, lpp1, count_ci, shift_i, cjs_cje); + pack_neat_pair_aos_f4_f(ci, parts_aos_buffer, tid, lpp1, count_ci, shift_i, + cjs_cje); - local_pack_position += count_ci; - /* Pack the particle data into CPU-side buffers*/ - const float3 shift_j = {cj->loc[0], cj->loc[1], cj->loc[2]}; - const int lpp2 = local_pack_position; + local_pack_position += count_ci; + /* Pack the particle data into CPU-side buffers*/ + const float3 shift_j = {cj->loc[0], cj->loc[1], cj->loc[2]}; + const int lpp2 = local_pack_position; - pack_neat_pair_aos_f4_f(cj, parts_aos_buffer, tid, lpp2, count_cj, shift_j, cis_cie); - /* Increment pack length accordingly */ - (*pack_length) += count_ci + count_cj; + pack_neat_pair_aos_f4_f(cj, parts_aos_buffer, tid, lpp2, count_cj, shift_j, + cis_cie); + /* Increment pack length accordingly */ + (*pack_length) += count_ci + count_cj; - if (timer) - TIMER_TOC(timer_doself_gpu_pack); + if (timer) TIMER_TOC(timer_doself_gpu_pack); } void 
runner_doself1_gpu_pack( @@ -1705,8 +1930,7 @@ void runner_doself1_gpu_pack( // fprintf(stderr,"Entered outer packing code!\n"); /* Anything to do here? */ - if (c->hydro.count == 0) - return; + if (c->hydro.count == 0) return; /* Recurse? */ // if (c->split) { //// fprintf(stderr,"Entered recursive packing code!\n"); @@ -1715,12 +1939,12 @@ void runner_doself1_gpu_pack( // runner_doself1_gpu_pack(r, c, timer, pack_length, // x_p, y_p, z_p, tid, tid_p, id, ux, uy, uz, a_hydrox, // a_hydroy, a_hydroz, mass, h, u, u_dt, rho, SPH_sum, locx, - // locy, locz, widthx, widthy, widthz, h_max, count_p, wcount, wcount_dh, - // rho_dh, rot_u, rot_v, rot_w, div_v, div_v_previous_step, alpha_visc, - // v_sig, laplace_u, alpha_diff, f, soundspeed, h_dt, balsara, pressure, - // alpha_visc_max_ngb, time_bin, wakeup, min_ngb_time_bin, - // to_be_synchronized, count_max_parts_tmp, fgpuin); - // fprintf(stderr,"working on a split cell\n"); + // locy, locz, widthx, widthy, widthz, h_max, count_p, wcount, + // wcount_dh, rho_dh, rot_u, rot_v, rot_w, div_v, + // div_v_previous_step, alpha_visc, v_sig, laplace_u, alpha_diff, f, + // soundspeed, h_dt, balsara, pressure, alpha_visc_max_ngb, time_bin, + // wakeup, min_ngb_time_bin, to_be_synchronized, count_max_parts_tmp, + // fgpuin); fprintf(stderr,"working on a split cell\n"); // } // } // } @@ -1743,8 +1967,7 @@ void runner_doself1_gpu_pack( // Increment pack length accordingly (*pack_length) += count; // } - if (timer) - TIMER_TOC(timer_doself_gpu_pack); + if (timer) TIMER_TOC(timer_doself_gpu_pack); } void pack(struct cell *c, double *x_p, double *y_p, double *z_p, int tid, @@ -1786,7 +2009,7 @@ void pack(struct cell *c, double *x_p, double *y_p, double *z_p, int tid, // u[id_in_pack]=ptmps[i].u; // u_dt[id_in_pack]=ptmps[i].u_dt; ////////////////////////////////////////////////////// - rho[id_in_pack] = 0.f;//ptmps[i].rho; + rho[id_in_pack] = 0.f; // ptmps[i].rho; ///////////////////////////////////////////////////// // div_v_previous_step[id_in_pack]=ptmps[i].viscosity.div_v_previous_step; // alpha_visc[id_in_pack]=ptmps[i].viscosity.alpha; @@ -1798,18 +2021,18 @@ void pack(struct cell *c, double *x_p, double *y_p, double *z_p, int tid, // h_dt[id_in_pack]=ptmps[i].force.h_dt; // balsara[id_in_pack]=ptmps[i].force.balsara; // pressure[id_in_pack]=ptmps[i].force.pressure; -// time_bin[id_in_pack] = ptmps[i].time_bin; + // time_bin[id_in_pack] = ptmps[i].time_bin; // wakeup[id_in_pack]=ptmps[i].limiter_data.wakeup; // min_ngb_time_bin[id_in_pack]=ptmps[i].limiter_data.min_ngb_time_bin; // to_be_synchronized[id_in_pack]=ptmps[i].limiter_data.to_be_synchronized; /////////////////////////////////////////////////////////////////// - wcount[id_in_pack] = 0.f;//ptmps[i].density.wcount; - wcount_dh[id_in_pack] = 0.f;//ptmps[i].density.wcount_dh; - rho_dh[id_in_pack] = 0.f;//ptmps[i].density.rho_dh; - div_v[id_in_pack] = 0.f;//ptmps[i].viscosity.div_v; - rot_u[id_in_pack] = 0.f;//ptmps[i].density.rot_v[0]; - rot_v[id_in_pack] = 0.f;//ptmps[i].density.rot_v[1]; - rot_w[id_in_pack] = 0.f;//ptmps[i].density.rot_v[2]; + wcount[id_in_pack] = 0.f; // ptmps[i].density.wcount; + wcount_dh[id_in_pack] = 0.f; // ptmps[i].density.wcount_dh; + rho_dh[id_in_pack] = 0.f; // ptmps[i].density.rho_dh; + div_v[id_in_pack] = 0.f; // ptmps[i].viscosity.div_v; + rot_u[id_in_pack] = 0.f; // ptmps[i].density.rot_v[0]; + rot_v[id_in_pack] = 0.f; // ptmps[i].density.rot_v[1]; + rot_w[id_in_pack] = 0.f; // ptmps[i].density.rot_v[2]; 
/////////////////////////////////////////////////////////////////// } } @@ -1830,10 +2053,8 @@ void runner_doself1_gpu_unpack( TIMER_TIC; // fprintf(stderr, "got into pack function\n"); /* Anything to do here? */ - if (c->hydro.count == 0) - return; - if (!cell_is_active_hydro(c, e)) - return; + if (c->hydro.count == 0) return; + if (!cell_is_active_hydro(c, e)) return; /* Anything to do here? */ /* Recurse? */ // if (c->split) { @@ -1843,12 +2064,12 @@ void runner_doself1_gpu_unpack( // runner_doself1_gpu_unpack(r, c, timer, pack_length, // x_p, y_p, z_p, tid, tid_p, id, ux, uy, uz, a_hydrox, // a_hydroy, a_hydroz, mass, h, u, u_dt, rho, SPH_sum, locx, - // locy, locz, widthx, widthy, widthz, h_max, count_p, wcount, wcount_dh, - // rho_dh, rot_u, rot_v, rot_w, div_v, div_v_previous_step, alpha_visc, - // v_sig, laplace_u, alpha_diff, f, soundspeed, h_dt, balsara, pressure, - // alpha_visc_max_ngb, time_bin, wakeup, min_ngb_time_bin, - // to_be_synchronized, count_max_parts_tmp, fgpuin); - // fprintf(stderr,"working on a split cell\n"); + // locy, locz, widthx, widthy, widthz, h_max, count_p, wcount, + // wcount_dh, rho_dh, rot_u, rot_v, rot_w, div_v, + // div_v_previous_step, alpha_visc, v_sig, laplace_u, alpha_diff, f, + // soundspeed, h_dt, balsara, pressure, alpha_visc_max_ngb, time_bin, + // wakeup, min_ngb_time_bin, to_be_synchronized, count_max_parts_tmp, + // fgpuin); fprintf(stderr,"working on a split cell\n"); // } // } // } else { @@ -1870,15 +2091,14 @@ void runner_doself1_gpu_unpack( alpha_diff, f, soundspeed, h_dt, balsara, pressure, alpha_visc_max_ngb, time_bin, wakeup, min_ngb_time_bin, to_be_synchronized, local_pack_position, count, e); -// for (int i = *pack_length; i < count+*pack_length; i++) { -// for (int i = 0; i < count; i++) { -// message("wcount is %f", c->hydro.parts[i].density.wcount); -// } + // for (int i = *pack_length; i < count+*pack_length; i++) { + // for (int i = 0; i < count; i++) { + // message("wcount is %f", c->hydro.parts[i].density.wcount); + // } // Increment pack length accordingly (*pack_length) += count; // } - if (timer) - TIMER_TOC(timer_doself_gpu_pack); + if (timer) TIMER_TOC(timer_doself_gpu_pack); } void unpack(struct cell *c, double *x_p, double *y_p, double *z_p, int tid, @@ -1906,7 +2126,8 @@ void unpack(struct cell *c, double *x_p, double *y_p, double *z_p, int tid, continue; } const int pi_active = part_is_active(pi, e); - if (!pi_active)fprintf(stderr, "Inactive part\n"); + if (!pi_active) + fprintf(stderr, "Inactive part\n"); else if (pi_active) { // c->hydro.parts[i].rho = rho[id_in_pack]; // c->hydro.parts[i].viscosity.div_v = div_v[id_in_pack]; @@ -1929,6 +2150,6 @@ void unpack(struct cell *c, double *x_p, double *y_p, double *z_p, int tid, } // c->hydro.parts=ptmps; } -//#ifdef WITHCUDA -//} -//#endif +// #ifdef WITHCUDA +// } +// #endif diff --git a/src/runner_gpu_pack_functions.h b/src/runner_gpu_pack_functions.h old mode 100755 new mode 100644 index 797e06519e..8730219711 --- a/src/runner_gpu_pack_functions.h +++ b/src/runner_gpu_pack_functions.h @@ -12,47 +12,73 @@ void runner_doself1_gpu_pack( float *h_dt, float *balsara, float *pressure, float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, char *to_be_synchronized, int count_max_parts_tmp); -void runner_doself1_gpu_pack_neat( - struct runner *r, struct cell *c, struct part_soa parts_soa, - int timer, int *pack_length, int tid, int count_max_parts_tmp); -void runner_doself1_gpu_pack_neat_aos( - struct runner *r, struct cell *c, 
struct part_aos *parts_aos, - int timer, int *pack_length, int tid, int count_max_parts_tmp); +void runner_doself1_gpu_pack_neat(struct runner *r, struct cell *c, + struct part_soa parts_soa, int timer, + int *pack_length, int tid, + int count_max_parts_tmp); +void runner_doself1_gpu_pack_neat_aos(struct runner *r, struct cell *c, + struct part_aos *parts_aos, int timer, + int *pack_length, int tid, + int count_max_parts_tmp); void runner_doself1_gpu_pack_neat_aos_f4( - struct runner *r, struct cell * __restrict__ c, struct part_aos_f4_send * __restrict__ parts_aos, - int timer, int *pack_length, int tid, int count_max_parts_tmp); -void runner_doself1_gpu_pack_neat_aos_g( - struct runner *r, struct cell *c, struct part_aos_g *parts_aos, - int timer, int *pack_length, int tid, int count_max_parts_tmp); -void runner_doself1_gpu_pack_neat_aos_f4_g( - struct runner *r, struct cell *c, struct part_aos_f4_g_send *parts_aos, - int timer, int *pack_length, int tid, int count_max_parts_tmp); -void runner_doself1_gpu_pack_neat_aos_f( - struct runner *r, struct cell *c, struct part_aos_f *parts_aos, - int timer, int *pack_length, int tid, int count_max_parts_tmp); + struct runner *r, struct cell *__restrict__ c, + struct part_aos_f4_send *__restrict__ parts_aos, int timer, + int *pack_length, int tid, int count_max_parts_tmp); +void runner_doself1_gpu_pack_neat_aos_g(struct runner *r, struct cell *c, + struct part_aos_g *parts_aos, int timer, + int *pack_length, int tid, + int count_max_parts_tmp); +void runner_doself1_gpu_pack_neat_aos_f4_g(struct runner *r, struct cell *c, + struct part_aos_f4_g_send *parts_aos, + int timer, int *pack_length, int tid, + int count_max_parts_tmp); +void runner_doself1_gpu_pack_neat_aos_f(struct runner *r, struct cell *c, + struct part_aos_f *parts_aos, int timer, + int *pack_length, int tid, + int count_max_parts_tmp); void runner_doself1_gpu_pack_neat_aos_f4_f( - struct runner *r, struct cell * restrict c, struct part_aos_f4_f_send * restrict parts_aos, - int timer, int *pack_length, int tid, int count_max_parts_tmp); -void runner_doself1_gpu_pack_forc_aos( - struct runner *r, struct cell *c, struct part_aos *parts_aos, - int timer, int *pack_length, int tid, int count_max_parts_tmp); -void runner_doself1_gpu_pack_grad_aos( - struct runner *r, struct cell *c, struct part_aos *parts_aos, - int timer, int *pack_length, int tid, int count_max_parts_tmp); -void runner_doself1_gpu_unpack_neat(struct runner *r, struct cell *c, struct part_soa parts_soa, -int timer, int *pack_length, int tid, int count_max_parts_tmp, struct engine *e); -void runner_doself1_gpu_unpack_neat_aos(struct runner *r, struct cell *c, struct part_aos *parts_aos_buffer, -int timer, int *pack_length, int tid, int count_max_parts_tmp, struct engine *e); -void runner_doself1_gpu_unpack_neat_aos_f4(struct runner *r, struct cell *c, struct part_aos_f4_recv *parts_aos_buffer, -int timer, int *pack_length, int tid, int count_max_parts_tmp, struct engine *e); -void runner_doself1_gpu_unpack_neat_aos_g(struct runner *r, struct cell *c, struct part_aos_g *parts_aos_buffer, -int timer, int *pack_length, int tid, int count_max_parts_tmp, struct engine *e); -void runner_doself1_gpu_unpack_neat_aos_f4_g(struct runner *r, struct cell *c, struct part_aos_f4_g_recv *parts_aos_buffer, -int timer, int *pack_length, int tid, int count_max_parts_tmp, struct engine *e); -void runner_doself1_gpu_unpack_neat_aos_f(struct runner *r, struct cell *c, struct part_aos_f *parts_aos_buffer, -int timer, int *pack_length, int tid, 
int count_max_parts_tmp, struct engine *e); -void runner_doself1_gpu_unpack_neat_aos_f4_f(struct runner *r, struct cell *restrict c, struct part_aos_f4_f_recv * restrict parts_aos_buffer, -int timer, int *pack_length, int tid, int count_max_parts_tmp, struct engine *e); + struct runner *r, struct cell *restrict c, + struct part_aos_f4_f_send *restrict parts_aos, int timer, int *pack_length, + int tid, int count_max_parts_tmp); +void runner_doself1_gpu_pack_forc_aos(struct runner *r, struct cell *c, + struct part_aos *parts_aos, int timer, + int *pack_length, int tid, + int count_max_parts_tmp); +void runner_doself1_gpu_pack_grad_aos(struct runner *r, struct cell *c, + struct part_aos *parts_aos, int timer, + int *pack_length, int tid, + int count_max_parts_tmp); +void runner_doself1_gpu_unpack_neat(struct runner *r, struct cell *c, + struct part_soa parts_soa, int timer, + int *pack_length, int tid, + int count_max_parts_tmp, struct engine *e); +void runner_doself1_gpu_unpack_neat_aos(struct runner *r, struct cell *c, + struct part_aos *parts_aos_buffer, + int timer, int *pack_length, int tid, + int count_max_parts_tmp, + struct engine *e); +void runner_doself1_gpu_unpack_neat_aos_f4( + struct runner *r, struct cell *c, struct part_aos_f4_recv *parts_aos_buffer, + int timer, int *pack_length, int tid, int count_max_parts_tmp, + struct engine *e); +void runner_doself1_gpu_unpack_neat_aos_g(struct runner *r, struct cell *c, + struct part_aos_g *parts_aos_buffer, + int timer, int *pack_length, int tid, + int count_max_parts_tmp, + struct engine *e); +void runner_doself1_gpu_unpack_neat_aos_f4_g( + struct runner *r, struct cell *c, + struct part_aos_f4_g_recv *parts_aos_buffer, int timer, int *pack_length, + int tid, int count_max_parts_tmp, struct engine *e); +void runner_doself1_gpu_unpack_neat_aos_f(struct runner *r, struct cell *c, + struct part_aos_f *parts_aos_buffer, + int timer, int *pack_length, int tid, + int count_max_parts_tmp, + struct engine *e); +void runner_doself1_gpu_unpack_neat_aos_f4_f( + struct runner *r, struct cell *restrict c, + struct part_aos_f4_f_recv *restrict parts_aos_buffer, int timer, + int *pack_length, int tid, int count_max_parts_tmp, struct engine *e); void pack(struct cell *c, double *x_p, double *y_p, double *z_p, int tid, int *tid_p, long long *id, float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, float *mass, @@ -66,20 +92,43 @@ void pack(struct cell *c, double *x_p, double *y_p, double *z_p, int tid, float *pressure, float *alpha_visc_max_ngb, timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, char *to_be_synchronized, int local_pack_position, int count); -void pack_neat(struct cell *c, struct part_soa parts_soa, int tid, int local_pack_position, int count); -void pack_neat_aos(struct cell *c, struct part_aos *parts_aos_buffer, int tid, int local_pack_position, int count); -void pack_neat_aos_g(struct cell *c, struct part_aos_g *parts_aos_buffer, int tid, int local_pack_position, int count); -void pack_neat_aos_f(struct cell *c, struct part_aos_f *parts_aos, int tid, int local_pack_position, int count); -void pack_neat_aos_f4(struct cell *c, struct part_aos_f4_send *parts_aos_buffer, int tid, int local_pack_position, int count, int2 frst_lst_prts); -void pack_neat_aos_f4_g(struct cell *c, struct part_aos_f4_g_send *parts_aos_buffer, int tid, int local_pack_position, int count); -void pack_neat_aos_f4_f(const struct cell *restrict c, struct part_aos_f4_f_send *restrict parts_aos, int tid, int 
local_pack_position, int count); -void unpack_neat(struct cell *c, struct part_soa parts_soa_buffer, int tid, int local_pack_position, int count, struct engine *e); -void unpack_neat_aos(struct cell *c, struct part_aos *parts_aos_buffer, int tid, int local_pack_position, int count, struct engine *e); -void unpack_neat_aos_f4(struct cell *c, struct part_aos_f4_recv *parts_aos_buffer, int tid, int local_pack_position, int count, struct engine *e); -void unpack_neat_aos_g(struct cell *c, struct part_aos_g *parts_aos_buffer, int tid, int local_pack_position, int count, struct engine *e); -void unpack_neat_aos_f4_g(struct cell *c, struct part_aos_f4_g_recv *parts_aos_buffer, int tid, int local_pack_position, int count, struct engine *e); -void unpack_neat_aos_f(struct cell *c, struct part_aos_f *parts_aos_buffer, int tid, int local_pack_position, int count, struct engine *e); -void unpack_neat_aos_f4_f(struct cell *restrict c, struct part_aos_f4_f_recv *restrict parts_aos_buffer, int tid, int local_pack_position, int count, struct engine *e); +void pack_neat(struct cell *c, struct part_soa parts_soa, int tid, + int local_pack_position, int count); +void pack_neat_aos(struct cell *c, struct part_aos *parts_aos_buffer, int tid, + int local_pack_position, int count); +void pack_neat_aos_g(struct cell *c, struct part_aos_g *parts_aos_buffer, + int tid, int local_pack_position, int count); +void pack_neat_aos_f(struct cell *c, struct part_aos_f *parts_aos, int tid, + int local_pack_position, int count); +void pack_neat_aos_f4(struct cell *c, struct part_aos_f4_send *parts_aos_buffer, + int tid, int local_pack_position, int count, + int2 frst_lst_prts); +void pack_neat_aos_f4_g(struct cell *c, + struct part_aos_f4_g_send *parts_aos_buffer, int tid, + int local_pack_position, int count); +void pack_neat_aos_f4_f(const struct cell *restrict c, + struct part_aos_f4_f_send *restrict parts_aos, int tid, + int local_pack_position, int count); +void unpack_neat(struct cell *c, struct part_soa parts_soa_buffer, int tid, + int local_pack_position, int count, struct engine *e); +void unpack_neat_aos(struct cell *c, struct part_aos *parts_aos_buffer, int tid, + int local_pack_position, int count, struct engine *e); +void unpack_neat_aos_f4(struct cell *c, + struct part_aos_f4_recv *parts_aos_buffer, int tid, + int local_pack_position, int count, struct engine *e); +void unpack_neat_aos_g(struct cell *c, struct part_aos_g *parts_aos_buffer, + int tid, int local_pack_position, int count, + struct engine *e); +void unpack_neat_aos_f4_g(struct cell *c, + struct part_aos_f4_g_recv *parts_aos_buffer, int tid, + int local_pack_position, int count, struct engine *e); +void unpack_neat_aos_f(struct cell *c, struct part_aos_f *parts_aos_buffer, + int tid, int local_pack_position, int count, + struct engine *e); +void unpack_neat_aos_f4_f(struct cell *restrict c, + struct part_aos_f4_f_recv *restrict parts_aos_buffer, + int tid, int local_pack_position, int count, + struct engine *e); void unpack(struct cell *c, double *x_p, double *y_p, double *z_p, int tid, int *tid_p, long long *id, float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, float *mass, @@ -108,53 +157,90 @@ void runner_doself1_gpu_unpack( timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, char *to_be_synchronized, int count_max_parts_tmp, struct engine *e); -void runner_do_ci_cj_gpu_pack_neat( - struct runner *r, struct cell *ci, struct cell *cj, struct part_soa parts_soa_buffer, - int timer, int 
*pack_length, int tid, int count_max_parts_tmp, int count_ci, int count_cj); +void runner_do_ci_cj_gpu_pack_neat(struct runner *r, struct cell *ci, + struct cell *cj, + struct part_soa parts_soa_buffer, int timer, + int *pack_length, int tid, + int count_max_parts_tmp, int count_ci, + int count_cj); -void runner_do_ci_cj_gpu_pack_neat_aos( - struct runner *r, struct cell *ci, struct cell *cj, struct part_aos *parts_aos_buffer, - int timer, int *pack_length, int tid, int count_max_parts_tmp, int count_ci, int count_cj, float3 shift_tmp); +void runner_do_ci_cj_gpu_pack_neat_aos(struct runner *r, struct cell *ci, + struct cell *cj, + struct part_aos *parts_aos_buffer, + int timer, int *pack_length, int tid, + int count_max_parts_tmp, int count_ci, + int count_cj, float3 shift_tmp); void runner_do_ci_cj_gpu_pack_neat_aos_f4( - struct runner *r, struct cell * restrict ci, struct cell * restrict cj, struct part_aos_f4_send * restrict parts_aos_buffer, - int timer, int *pack_length, int tid, int count_max_parts_tmp, const int count_ci, const int count_cj, float3 shift_tmp); - -void runner_do_ci_cj_gpu_pack_neat_aos_g( - struct runner *r, struct cell *ci, struct cell *cj, struct part_aos_g *parts_aos_buffer, - int timer, int *pack_length, int tid, int count_max_parts_tmp, int count_ci, int count_cj); - + struct runner *r, struct cell *restrict ci, struct cell *restrict cj, + struct part_aos_f4_send *restrict parts_aos_buffer, int timer, + int *pack_length, int tid, int count_max_parts_tmp, const int count_ci, + const int count_cj, float3 shift_tmp); + +void runner_do_ci_cj_gpu_pack_neat_aos_g(struct runner *r, struct cell *ci, + struct cell *cj, + struct part_aos_g *parts_aos_buffer, + int timer, int *pack_length, int tid, + int count_max_parts_tmp, int count_ci, + int count_cj); void runner_do_ci_cj_gpu_pack_neat_aos_f4_g( - struct runner *r, struct cell * restrict ci, struct cell * restrict cj, struct part_aos_f4_g_send * restrict parts_aos_buffer, - int timer, int *pack_length, int tid, int count_max_parts_tmp, const int count_ci, const int count_cj, float3 shift_tmp); - -void runner_do_ci_cj_gpu_pack_neat_aos_f( - struct runner *r, struct cell *ci, struct cell *cj, struct part_aos_f *parts_aos_buffer, - int timer, int *pack_length, int tid, int count_max_parts_tmp, int count_ci, int count_cj); + struct runner *r, struct cell *restrict ci, struct cell *restrict cj, + struct part_aos_f4_g_send *restrict parts_aos_buffer, int timer, + int *pack_length, int tid, int count_max_parts_tmp, const int count_ci, + const int count_cj, float3 shift_tmp); + +void runner_do_ci_cj_gpu_pack_neat_aos_f(struct runner *r, struct cell *ci, + struct cell *cj, + struct part_aos_f *parts_aos_buffer, + int timer, int *pack_length, int tid, + int count_max_parts_tmp, int count_ci, + int count_cj); void runner_do_ci_cj_gpu_pack_neat_aos_f4_f( - struct runner *r, struct cell * restrict ci, struct cell * restrict cj, struct part_aos_f4_f_send * restrict parts_aos_buffer, - int timer, int *pack_length, int tid, int count_max_parts_tmp, const int count_ci, const int count_cj, float3 shift_tmp); - -void runner_do_ci_cj_gpu_unpack_neat(struct runner *r, struct cell *ci, struct cell *cj, struct part_soa parts_soa_buffer, - int timer, int *pack_length, int tid, int count_max_parts_tmp, struct engine *e); - -void runner_do_ci_cj_gpu_unpack_neat_aos(struct runner *r, struct cell *ci, struct cell *cj, struct part_aos *parts_aos_buffer, - int timer, int *pack_length, int tid, int count_max_parts_tmp, struct engine *e); - -void 
runner_do_ci_cj_gpu_unpack_neat_aos_f4(struct runner *r, struct cell *ci, struct cell *cj, struct part_aos_f4_recv *parts_aos_buffer, - int timer, int *pack_length, int tid, int count_max_parts_tmp, struct engine *e); - -void runner_do_ci_cj_gpu_unpack_neat_aos_f4_g(struct runner *r, struct cell *ci, struct cell *cj, struct part_aos_f4_g_recv *parts_aos_buffer, - int timer, int *pack_length, int tid, int count_max_parts_tmp, struct engine *e); - -void runner_do_ci_cj_gpu_unpack_neat_aos_g(struct runner *r, struct cell *ci, struct cell *cj, struct part_aos_g *parts_aos_buffer, - int timer, int *pack_length, int tid, int count_max_parts_tmp, struct engine *e); - -void runner_do_ci_cj_gpu_unpack_neat_aos_f(struct runner *r, struct cell *ci, struct cell *cj, struct part_aos_f *parts_aos_buffer, - int timer, int *pack_length, int tid, int count_max_parts_tmp, struct engine *e); - -void runner_do_ci_cj_gpu_unpack_neat_aos_f4_f(struct runner *r, struct cell *ci, struct cell *cj, struct part_aos_f4_f_recv *parts_aos_buffer, - int timer, int *pack_length, int tid, int count_max_parts_tmp, struct engine *e); - + struct runner *r, struct cell *restrict ci, struct cell *restrict cj, + struct part_aos_f4_f_send *restrict parts_aos_buffer, int timer, + int *pack_length, int tid, int count_max_parts_tmp, const int count_ci, + const int count_cj, float3 shift_tmp); + +void runner_do_ci_cj_gpu_unpack_neat(struct runner *r, struct cell *ci, + struct cell *cj, + struct part_soa parts_soa_buffer, + int timer, int *pack_length, int tid, + int count_max_parts_tmp, struct engine *e); + +void runner_do_ci_cj_gpu_unpack_neat_aos(struct runner *r, struct cell *ci, + struct cell *cj, + struct part_aos *parts_aos_buffer, + int timer, int *pack_length, int tid, + int count_max_parts_tmp, + struct engine *e); + +void runner_do_ci_cj_gpu_unpack_neat_aos_f4( + struct runner *r, struct cell *ci, struct cell *cj, + struct part_aos_f4_recv *parts_aos_buffer, int timer, int *pack_length, + int tid, int count_max_parts_tmp, struct engine *e); + +void runner_do_ci_cj_gpu_unpack_neat_aos_f4_g( + struct runner *r, struct cell *ci, struct cell *cj, + struct part_aos_f4_g_recv *parts_aos_buffer, int timer, int *pack_length, + int tid, int count_max_parts_tmp, struct engine *e); + +void runner_do_ci_cj_gpu_unpack_neat_aos_g(struct runner *r, struct cell *ci, + struct cell *cj, + struct part_aos_g *parts_aos_buffer, + int timer, int *pack_length, int tid, + int count_max_parts_tmp, + struct engine *e); + +void runner_do_ci_cj_gpu_unpack_neat_aos_f(struct runner *r, struct cell *ci, + struct cell *cj, + struct part_aos_f *parts_aos_buffer, + int timer, int *pack_length, int tid, + int count_max_parts_tmp, + struct engine *e); + +void runner_do_ci_cj_gpu_unpack_neat_aos_f4_f( + struct runner *r, struct cell *ci, struct cell *cj, + struct part_aos_f4_f_recv *parts_aos_buffer, int timer, int *pack_length, + int tid, int count_max_parts_tmp, struct engine *e); diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu old mode 100755 new mode 100644 index 70577aaf05..2bf1352fe8 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -19,9 +19,9 @@ * ******************************************************************************/ /* Config parameters. 
*/ -#define GPUOFFLOAD 1 //off-load hydro to GPU -//#define DO_CORNERS 1 //do corner pair tasks on CPU -//#define DUMP_TIMINGS 1 +#define GPUOFFLOAD 1 // off-load hydro to GPU +// #define DO_CORNERS 1 //do corner pair tasks on CPU +// #define DUMP_TIMINGS 1 #include "../config.h" /* MPI headers. */ @@ -155,12 +155,13 @@ extern "C" { #endif #include "cuda/part_gpu.h" -#include "runner_gpu_pack_functions.h" -#include "runner_doiact_functions_hydro_gpu.h" -#include "files_for_new_functions/host_device_data_transfer.h" #include "files_for_new_functions/arrays_malloc.h" -//#include "./cuda/BLOCK_SIZE.h" +#include "files_for_new_functions/host_device_data_transfer.h" +#include "runner_doiact_functions_hydro_gpu.h" +#include "runner_gpu_pack_functions.h" +// #include "./cuda/BLOCK_SIZE.h" #include "cuda/GPU_runner_functions.h" + #include #include #include @@ -180,22 +181,23 @@ inline cudaError_t checkCuda(cudaError_t result) { return result; } -//inline void gpuErrchk(cudaError_t code) { -//#define __FILE__ __LINE__ -// inline void gpuAssert(cudaError_t code, const char *file, int line) { -// int abort = 0; -// if (code != cudaSuccess) { -// // fprintf( stderr, "cudaCheckError() failed at -// //%s:%i : %s\n", -// // file, line, cudaGetErrorString( code ) ); -// abort = 1; -// fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, -// line); -// if (abort) -// exit(code); -// } -// } -//} +// inline void gpuErrchk(cudaError_t code) { +// #define __FILE__ __LINE__ +// inline void gpuAssert(cudaError_t code, const char *file, int line) { +// int abort = 0; +// if (code != cudaSuccess) { +// // fprintf( stderr, "cudaCheckError() failed at +// //%s:%i : %s\n", +// // file, line, cudaGetErrorString( code ) ); +// abort = 1; +// fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), +// file, +// line); +// if (abort) +// exit(code); +// } +// } +// } void *runner_main2(void *data) { struct runner *r = (struct runner *)data; @@ -214,29 +216,30 @@ void *runner_main2(void *data) { struct pack_vars_pair *pack_vars_pair_grad; cudaMallocHost((void **)&pack_vars_self_dens, - sizeof(struct pack_vars_self *)); + sizeof(struct pack_vars_self *)); cudaMallocHost((void **)&pack_vars_self_forc, - sizeof(struct pack_vars_self *)); + sizeof(struct pack_vars_self *)); cudaMallocHost((void **)&pack_vars_self_grad, - sizeof(struct pack_vars_self *)); + sizeof(struct pack_vars_self *)); cudaMallocHost((void **)&pack_vars_pair_dens, - sizeof(struct pack_vars_pair *)); + sizeof(struct pack_vars_pair *)); cudaMallocHost((void **)&pack_vars_pair_forc, - sizeof(struct pack_vars_pair *)); + sizeof(struct pack_vars_pair *)); cudaMallocHost((void **)&pack_vars_pair_grad, - sizeof(struct pack_vars_pair *)); + sizeof(struct pack_vars_pair *)); - int devId = 0; // find and print gpu device name + int devId = 0; // find and print gpu device name struct cudaDeviceProp prop; int nDevices; int maxBlocksSM; int nSMs; cudaGetDeviceCount(&nDevices); cudaGetDeviceProperties(&prop, devId); - cudaDeviceGetAttribute(&maxBlocksSM, cudaDevAttrMaxBlocksPerMultiprocessor, devId); + cudaDeviceGetAttribute(&maxBlocksSM, cudaDevAttrMaxBlocksPerMultiprocessor, + devId); cudaDeviceGetAttribute(&nSMs, cudaDevAttrMultiProcessorCount, devId); - int nPartsPerCell = space->nr_parts/space->tot_cells; + int nPartsPerCell = space->nr_parts / space->tot_cells; int mpi_rank = 0; #ifdef WITH_MPI MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); @@ -244,23 +247,28 @@ void *runner_main2(void *data) { if (r->cpuid == 0 && mpi_rank == 0) { 
fprintf(stderr, "%i devices available device id is %i\n", nDevices, devId); fprintf(stderr, "Device : %s\n", prop.name); - fprintf(stderr, "nSMs %i max blocks per SM %i maxnBlocks per stream %i\n", nSMs, maxBlocksSM, nSMs * maxBlocksSM); - fprintf(stderr, "Target nBlocks per kernel is %i\n", N_TASKS_BUNDLE_SELF*nPartsPerCell/BLOCK_SIZE); - fprintf(stderr, "Target nBlocks per stream is %i\n", N_TASKS_PER_PACK_SELF*nPartsPerCell/BLOCK_SIZE); + fprintf(stderr, "nSMs %i max blocks per SM %i maxnBlocks per stream %i\n", + nSMs, maxBlocksSM, nSMs * maxBlocksSM); + fprintf(stderr, "Target nBlocks per kernel is %i\n", + N_TASKS_BUNDLE_SELF * nPartsPerCell / BLOCK_SIZE); + fprintf(stderr, "Target nBlocks per stream is %i\n", + N_TASKS_PER_PACK_SELF * nPartsPerCell / BLOCK_SIZE); } - if(nDevices == 1)cudaSetDevice(devId); + if (nDevices == 1) cudaSetDevice(devId); #ifndef WITH_MPI - else{ + else { cudaSetDevice(devId); } #endif #ifdef WITH_MPI - else{ - cudaSetDevice(mpi_rank * 2); - fprintf(stderr, "%i devices available device id is %i\n", nDevices, mpi_rank * 2); + else { + cudaSetDevice(mpi_rank * 2); + fprintf(stderr, "%i devices available device id is %i\n", nDevices, + mpi_rank * 2); } #endif - fprintf(stderr, "after dev select engine_rank %i rank %i\n", engine_rank, mpi_rank); + fprintf(stderr, "after dev select engine_rank %i rank %i\n", engine_rank, + mpi_rank); cudaError_t cu_error; // how many tasks do we want for each launch of GPU kernel @@ -284,8 +292,8 @@ void *runner_main2(void *data) { pack_vars_pair_forc->bundle_size = bundle_size_pair; pack_vars_self_grad->bundle_size = bundle_size; pack_vars_pair_grad->bundle_size = bundle_size_pair; -// fprintf(stderr, "size %i size %i\n", sizeof(*pack_vars_self_dens), sizeof(pack_vars_self)); -// const int bundle_size_pair = bundle_size/2; + // fprintf(stderr, "size %i size %i\n", sizeof(*pack_vars_self_dens), + // sizeof(pack_vars_self)); const int bundle_size_pair = bundle_size/2; // Keep track of first and last particles for each task (particle data is // arranged in long arrays containing particles from all the tasks we will // work with) @@ -310,83 +318,76 @@ void *runner_main2(void *data) { int *d_task_first_parts_pair_forc, *d_task_last_parts_pair_forc; int *d_task_first_parts_pair_grad, *d_task_last_parts_pair_grad; - cudaMallocManaged((void**)&task_first_part_self_dens_f4, - target_n_tasks * sizeof(int2), cudaMemAttachGlobal); - cudaMallocHost((void**)&task_first_part_f4, - target_n_tasks * sizeof(int2)); - cudaMalloc((void**)&d_task_first_part_f4, - target_n_tasks * sizeof(int2)); - cudaMallocHost((void**)&task_first_part_f4_f, - target_n_tasks * sizeof(int2)); - cudaMalloc((void**)&d_task_first_part_f4_f, - target_n_tasks * sizeof(int2)); - cudaMallocHost((void**)&task_first_part_f4_g, - target_n_tasks * sizeof(int2)); - cudaMalloc((void**)&d_task_first_part_f4_g, - target_n_tasks * sizeof(int2)); - - cudaMallocHost((void**)&fparti_fpartj_lparti_lpartj_dens, - target_n_tasks * sizeof(int4)); -// cudaMalloc((void**)&d_fparti_fpartj_lparti_lpartj_dens, -// target_n_tasks * sizeof(int4)); - - cudaMallocHost((void**)&fparti_fpartj_lparti_lpartj_forc, - target_n_tasks * sizeof(int4)); - cudaMalloc((void**)&d_fparti_fpartj_lparti_lpartj_forc, - target_n_tasks * sizeof(int4)); - - cudaMallocHost((void**)&fparti_fpartj_lparti_lpartj_grad, - target_n_tasks * sizeof(int4)); - cudaMalloc((void**)&d_fparti_fpartj_lparti_lpartj_grad, - target_n_tasks * sizeof(int4)); - - -// cudaMallocManaged((void**)&d_task_last_part_self_dens_f4, -// 
target_n_tasks * sizeof(int), cudaMemAttachGlobal); + cudaMallocManaged((void **)&task_first_part_self_dens_f4, + target_n_tasks * sizeof(int2), cudaMemAttachGlobal); + cudaMallocHost((void **)&task_first_part_f4, target_n_tasks * sizeof(int2)); + cudaMalloc((void **)&d_task_first_part_f4, target_n_tasks * sizeof(int2)); + cudaMallocHost((void **)&task_first_part_f4_f, target_n_tasks * sizeof(int2)); + cudaMalloc((void **)&d_task_first_part_f4_f, target_n_tasks * sizeof(int2)); + cudaMallocHost((void **)&task_first_part_f4_g, target_n_tasks * sizeof(int2)); + cudaMalloc((void **)&d_task_first_part_f4_g, target_n_tasks * sizeof(int2)); + + cudaMallocHost((void **)&fparti_fpartj_lparti_lpartj_dens, + target_n_tasks * sizeof(int4)); + // cudaMalloc((void**)&d_fparti_fpartj_lparti_lpartj_dens, + // target_n_tasks * sizeof(int4)); + + cudaMallocHost((void **)&fparti_fpartj_lparti_lpartj_forc, + target_n_tasks * sizeof(int4)); + cudaMalloc((void **)&d_fparti_fpartj_lparti_lpartj_forc, + target_n_tasks * sizeof(int4)); + + cudaMallocHost((void **)&fparti_fpartj_lparti_lpartj_grad, + target_n_tasks * sizeof(int4)); + cudaMalloc((void **)&d_fparti_fpartj_lparti_lpartj_grad, + target_n_tasks * sizeof(int4)); + + // cudaMallocManaged((void**)&d_task_last_part_self_dens_f4, + // target_n_tasks * sizeof(int), cudaMemAttachGlobal); // Arrays keeping track of the row numbers of the first and last particles // within each bundle. Required by the GPU code cudaMallocHost((void **)&pack_vars_self_dens->task_first_part, - target_n_tasks * sizeof(int)); + target_n_tasks * sizeof(int)); cudaMallocHost((void **)&pack_vars_self_dens->task_last_part, - target_n_tasks * sizeof(int)); + target_n_tasks * sizeof(int)); cudaMallocHost((void **)&pack_vars_pair_dens->task_first_part, - 2 * target_n_tasks * sizeof(int)); + 2 * target_n_tasks * sizeof(int)); cudaMallocHost((void **)&pack_vars_pair_dens->task_last_part, - 2 * target_n_tasks * sizeof(int)); + 2 * target_n_tasks * sizeof(int)); cudaMallocHost((void **)&pack_vars_self_forc->task_first_part, - target_n_tasks * sizeof(int)); + target_n_tasks * sizeof(int)); cudaMallocHost((void **)&pack_vars_self_forc->task_last_part, - target_n_tasks * sizeof(int)); + target_n_tasks * sizeof(int)); cudaMallocHost((void **)&pack_vars_pair_forc->task_first_part, - 2 * target_n_tasks * sizeof(int)); + 2 * target_n_tasks * sizeof(int)); cudaMallocHost((void **)&pack_vars_pair_forc->task_last_part, - 2 * target_n_tasks * sizeof(int)); + 2 * target_n_tasks * sizeof(int)); cudaMallocHost((void **)&pack_vars_self_grad->task_first_part, - target_n_tasks * sizeof(int)); + target_n_tasks * sizeof(int)); cudaMallocHost((void **)&pack_vars_self_grad->task_last_part, - target_n_tasks * sizeof(int)); + target_n_tasks * sizeof(int)); cudaMallocHost((void **)&pack_vars_pair_grad->task_first_part, - 2 * target_n_tasks * sizeof(int)); + 2 * target_n_tasks * sizeof(int)); cudaMallocHost((void **)&pack_vars_pair_grad->task_last_part, - 2 * target_n_tasks * sizeof(int)); + 2 * target_n_tasks * sizeof(int)); /* nBundles is the number of task bundles each thread has ==> Used to loop through bundles */ - int nBundles = (target_n_tasks + bundle_size - 1) / - bundle_size; - int nBundles_pair = (target_n_tasks_pair + bundle_size_pair - 1) / - bundle_size_pair; - - if(r->cpuid == 0){ - fprintf(stderr, "engine_rank %i cpuid %i nBundles/nStreams %i\n", engine_rank, r->cpuid, nBundles); - fprintf(stderr, "nBundles/nStreams Pair %i\n", nBundles_pair); + int nBundles = (target_n_tasks + bundle_size - 1) / 
bundle_size; + int nBundles_pair = + (target_n_tasks_pair + bundle_size_pair - 1) / bundle_size_pair; + + if (r->cpuid == 0) { + fprintf(stderr, "engine_rank %i cpuid %i nBundles/nStreams %i\n", + engine_rank, r->cpuid, nBundles); + fprintf(stderr, "nBundles/nStreams Pair %i\n", nBundles_pair); } pack_vars_self_dens->nBundles = nBundles; @@ -400,91 +401,105 @@ void *runner_main2(void *data) { // within this thread) cudaMallocHost((void **)&pack_vars_self_dens->bundle_first_part, - nBundles * sizeof(int)); + nBundles * sizeof(int)); cudaMallocHost((void **)&pack_vars_self_dens->bundle_last_part, - nBundles * sizeof(int)); + nBundles * sizeof(int)); cudaMallocHost((void **)&pack_vars_self_dens->bundle_first_task_list, - nBundles * sizeof(int)); + nBundles * sizeof(int)); cudaMallocHost((void **)&pack_vars_pair_dens->bundle_first_part, - 2 * nBundles * sizeof(int)); + 2 * nBundles * sizeof(int)); cudaMallocHost((void **)&pack_vars_pair_dens->bundle_last_part, - 2 * nBundles * sizeof(int)); + 2 * nBundles * sizeof(int)); cudaMallocHost((void **)&pack_vars_pair_dens->bundle_first_task_list, - 2 * nBundles * sizeof(int)); + 2 * nBundles * sizeof(int)); cudaMallocHost((void **)&pack_vars_self_forc->bundle_first_part, - nBundles * sizeof(int)); + nBundles * sizeof(int)); cudaMallocHost((void **)&pack_vars_self_forc->bundle_last_part, - nBundles * sizeof(int)); + nBundles * sizeof(int)); cudaMallocHost((void **)&pack_vars_self_forc->bundle_first_task_list, - nBundles * sizeof(int)); + nBundles * sizeof(int)); cudaMallocHost((void **)&pack_vars_pair_forc->bundle_first_part, - 2 * nBundles * sizeof(int)); + 2 * nBundles * sizeof(int)); cudaMallocHost((void **)&pack_vars_pair_forc->bundle_last_part, - 2 * nBundles * sizeof(int)); + 2 * nBundles * sizeof(int)); cudaMallocHost((void **)&pack_vars_pair_forc->bundle_first_task_list, - 2 * nBundles * sizeof(int)); + 2 * nBundles * sizeof(int)); cudaMallocHost((void **)&pack_vars_self_grad->bundle_first_part, - nBundles * sizeof(int)); + nBundles * sizeof(int)); cudaMallocHost((void **)&pack_vars_self_grad->bundle_last_part, - nBundles * sizeof(int)); + nBundles * sizeof(int)); cudaMallocHost((void **)&pack_vars_self_grad->bundle_first_task_list, - nBundles * sizeof(int)); + nBundles * sizeof(int)); cudaMallocHost((void **)&pack_vars_pair_grad->bundle_first_part, - 2 * nBundles * sizeof(int)); + 2 * nBundles * sizeof(int)); cudaMallocHost((void **)&pack_vars_pair_grad->bundle_last_part, - 2 * nBundles * sizeof(int)); + 2 * nBundles * sizeof(int)); cudaMallocHost((void **)&pack_vars_pair_grad->bundle_first_task_list, - 2 * nBundles * sizeof(int)); - -//These I need to keep///////////////// - cudaMalloc((void **)&d_task_first_part_self_dens, target_n_tasks * sizeof(int)); - cudaMalloc((void **)&d_task_first_part_self_forc, target_n_tasks * sizeof(int)); - cudaMalloc((void **)&d_task_first_part_self_grad, target_n_tasks * sizeof(int)); - - cudaMalloc((void **)&d_task_last_part_self_dens, target_n_tasks * sizeof(int)); - cudaMalloc((void **)&d_task_last_part_self_forc, target_n_tasks * sizeof(int)); - cudaMalloc((void **)&d_task_last_part_self_grad, target_n_tasks * sizeof(int)); - //These I need to keep///////////////// + 2 * nBundles * sizeof(int)); + + // These I need to keep///////////////// + cudaMalloc((void **)&d_task_first_part_self_dens, + target_n_tasks * sizeof(int)); + cudaMalloc((void **)&d_task_first_part_self_forc, + target_n_tasks * sizeof(int)); + cudaMalloc((void **)&d_task_first_part_self_grad, + target_n_tasks * sizeof(int)); + + 
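(Aside, not part of the patch: the nBundles value computed above is a plain ceiling division over the packed task list, and the task_first_part / task_last_part and bundle_first_part / bundle_last_part arrays record where each task's and each bundle's particles sit in the long packed particle array. The host-side sketch below illustrates that indexing only; task_count[] and the literal sizes are hypothetical stand-ins for the real pack_vars fields, and the inclusive/exclusive convention of the last-part index is an assumption.)

/* Illustrative sketch only, not part of the patch: ceiling-division bundling
 * of tasks and derivation of per-task / per-bundle first/last particle rows.
 * task_count[] is a hypothetical per-task particle count. */
#include <stdio.h>

int main(void) {
  const int target_n_tasks = 10;
  const int bundle_size = 4;
  /* Same ceiling division as nBundles in runner_main2(). */
  const int nBundles = (target_n_tasks + bundle_size - 1) / bundle_size;

  int task_count[10] = {32, 40, 28, 36, 30, 44, 26, 38, 34, 42};
  int task_first_part[10], task_last_part[10];
  int bundle_first_part[8], bundle_last_part[8];

  /* Running offset into the long packed particle array. */
  int offset = 0;
  for (int t = 0; t < target_n_tasks; t++) {
    task_first_part[t] = offset;
    offset += task_count[t];
    task_last_part[t] = offset; /* one past the last particle of task t */
  }

  for (int b = 0; b < nBundles; b++) {
    const int first_task = b * bundle_size;
    int last_task = first_task + bundle_size - 1;
    if (last_task > target_n_tasks - 1) last_task = target_n_tasks - 1;
    bundle_first_part[b] = task_first_part[first_task];
    bundle_last_part[b] = task_last_part[last_task];
    printf("bundle %d: tasks [%d,%d] parts [%d,%d)\n", b, first_task,
           last_task, bundle_first_part[b], bundle_last_part[b]);
  }
  return 0;
}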
cudaMalloc((void **)&d_task_last_part_self_dens, + target_n_tasks * sizeof(int)); + cudaMalloc((void **)&d_task_last_part_self_forc, + target_n_tasks * sizeof(int)); + cudaMalloc((void **)&d_task_last_part_self_grad, + target_n_tasks * sizeof(int)); + // These I need to keep///////////////// pack_vars_self_dens->d_task_first_part = d_task_first_part_self_dens; pack_vars_self_dens->d_task_last_part = d_task_last_part_self_dens; - //These I need to keep///////////////// + // These I need to keep///////////////// pack_vars_self_forc->d_task_first_part = d_task_first_part_self_forc; pack_vars_self_forc->d_task_last_part = d_task_last_part_self_forc; - //These I need to keep///////////////// + // These I need to keep///////////////// pack_vars_self_grad->d_task_first_part = d_task_first_part_self_grad; pack_vars_self_grad->d_task_last_part = d_task_last_part_self_grad; - //These I need to keep///////////////// - cudaMalloc((void **)&d_task_first_parts_pair_dens, 2 * target_n_tasks * sizeof(int)); - cudaMalloc((void **)&d_task_first_parts_pair_forc, 2 * target_n_tasks * sizeof(int)); - cudaMalloc((void **)&d_task_first_parts_pair_grad, 2 * target_n_tasks * sizeof(int)); - - cudaMalloc((void **)&d_task_last_parts_pair_dens, 2 * target_n_tasks * sizeof(int)); - cudaMalloc((void **)&d_task_last_parts_pair_forc, 2 * target_n_tasks * sizeof(int)); - cudaMalloc((void **)&d_task_last_parts_pair_grad, 2 * target_n_tasks * sizeof(int)); - //These I need to keep///////////////// + // These I need to keep///////////////// + cudaMalloc((void **)&d_task_first_parts_pair_dens, + 2 * target_n_tasks * sizeof(int)); + cudaMalloc((void **)&d_task_first_parts_pair_forc, + 2 * target_n_tasks * sizeof(int)); + cudaMalloc((void **)&d_task_first_parts_pair_grad, + 2 * target_n_tasks * sizeof(int)); + + cudaMalloc((void **)&d_task_last_parts_pair_dens, + 2 * target_n_tasks * sizeof(int)); + cudaMalloc((void **)&d_task_last_parts_pair_forc, + 2 * target_n_tasks * sizeof(int)); + cudaMalloc((void **)&d_task_last_parts_pair_grad, + 2 * target_n_tasks * sizeof(int)); + // These I need to keep///////////////// pack_vars_pair_dens->d_task_first_part = d_task_first_parts_pair_dens; pack_vars_pair_dens->d_task_last_part = d_task_last_parts_pair_dens; pack_vars_pair_forc->d_task_first_part = d_task_first_parts_pair_forc; pack_vars_pair_forc->d_task_last_part = d_task_last_parts_pair_forc; pack_vars_pair_grad->d_task_first_part = d_task_first_parts_pair_grad; pack_vars_pair_grad->d_task_last_part = d_task_last_parts_pair_grad; - //cell positions for self tasks REMEMBER to remove CPU copies as these are no longer necessary + // cell positions for self tasks REMEMBER to remove CPU copies as these are no + // longer necessary double *d_dens_cell_x, *d_dens_cell_y, *d_dens_cell_z; float3 *d_dens_f3_cell_x; double *d_grad_cell_x, *d_grad_cell_y, *d_grad_cell_z; double *d_forc_cell_x, *d_forc_cell_y, *d_forc_cell_z; - //Shifts for pair tasks REMEMBER to remove CPU copies as these are no longer necessary + // Shifts for pair tasks REMEMBER to remove CPU copies as these are no longer + // necessary double *d_dens_shift_x, *d_dens_shift_y, *d_dens_shift_z; double *d_grad_shift_x, *d_grad_shift_y, *d_grad_shift_z; double *d_forc_shift_x, *d_forc_shift_y, *d_forc_shift_z; - //These I need to keep///////////////// + // These I need to keep///////////////// cudaMalloc((void **)&d_dens_cell_x, target_n_tasks * sizeof(double)); cudaMalloc((void **)&d_dens_cell_y, target_n_tasks * sizeof(double)); cudaMalloc((void **)&d_dens_cell_z, 
target_n_tasks * sizeof(double)); @@ -510,51 +525,78 @@ void *runner_main2(void *data) { cudaMalloc((void **)&d_grad_shift_x, 2 * target_n_tasks * sizeof(double)); cudaMalloc((void **)&d_grad_shift_y, 2 * target_n_tasks * sizeof(double)); cudaMalloc((void **)&d_grad_shift_z, 2 * target_n_tasks * sizeof(double)); - //These I need to keep///////////////// + // These I need to keep///////////////// - cudaMallocHost((void **)&pack_vars_self_dens->cellx, target_n_tasks * sizeof(double)); // Pinned allocation on host - cudaMallocHost((void **)&pack_vars_self_dens->celly, target_n_tasks * sizeof(double)); // Pinned allocation on host - cudaMallocHost((void **)&pack_vars_self_dens->cellz, target_n_tasks * sizeof(double)); // Pinned allocation on host + cudaMallocHost((void **)&pack_vars_self_dens->cellx, + target_n_tasks * sizeof(double)); // Pinned allocation on host + cudaMallocHost((void **)&pack_vars_self_dens->celly, + target_n_tasks * sizeof(double)); // Pinned allocation on host + cudaMallocHost((void **)&pack_vars_self_dens->cellz, + target_n_tasks * sizeof(double)); // Pinned allocation on host pack_vars_self_dens->d_cellx = d_dens_cell_x; pack_vars_self_dens->d_celly = d_dens_cell_y; pack_vars_self_dens->d_cellz = d_dens_cell_z; - cudaMallocHost((void **)&pack_vars_pair_dens->shiftx, 2 * target_n_tasks * sizeof(double)); // Pinned allocation on host - cudaMallocHost((void **)&pack_vars_pair_dens->shifty, 2 * target_n_tasks * sizeof(double)); // Pinned allocation on host - cudaMallocHost((void **)&pack_vars_pair_dens->shiftz, 2 * target_n_tasks * sizeof(double)); // Pinned allocation on host + cudaMallocHost( + (void **)&pack_vars_pair_dens->shiftx, + 2 * target_n_tasks * sizeof(double)); // Pinned allocation on host + cudaMallocHost( + (void **)&pack_vars_pair_dens->shifty, + 2 * target_n_tasks * sizeof(double)); // Pinned allocation on host + cudaMallocHost( + (void **)&pack_vars_pair_dens->shiftz, + 2 * target_n_tasks * sizeof(double)); // Pinned allocation on host pack_vars_pair_dens->d_shiftx = d_dens_shift_x; pack_vars_pair_dens->d_shifty = d_dens_shift_y; pack_vars_pair_dens->d_shiftz = d_dens_shift_z; - cudaMallocHost((void **)&pack_vars_self_forc->cellx, target_n_tasks * sizeof(double)); // Pinned allocation on host - cudaMallocHost((void **)&pack_vars_self_forc->celly, target_n_tasks * sizeof(double)); // Pinned allocation on host - cudaMallocHost((void **)&pack_vars_self_forc->cellz, target_n_tasks * sizeof(double)); // Pinned allocation on host + cudaMallocHost((void **)&pack_vars_self_forc->cellx, + target_n_tasks * sizeof(double)); // Pinned allocation on host + cudaMallocHost((void **)&pack_vars_self_forc->celly, + target_n_tasks * sizeof(double)); // Pinned allocation on host + cudaMallocHost((void **)&pack_vars_self_forc->cellz, + target_n_tasks * sizeof(double)); // Pinned allocation on host pack_vars_self_forc->d_cellx = d_forc_cell_x; pack_vars_self_forc->d_celly = d_forc_cell_y; pack_vars_self_forc->d_cellz = d_forc_cell_z; - cudaMallocHost((void **)&pack_vars_pair_forc->shiftx, 2 * target_n_tasks * sizeof(double)); // Pinned allocation on host - cudaMallocHost((void **)&pack_vars_pair_forc->shifty, 2 * target_n_tasks * sizeof(double)); // Pinned allocation on host - cudaMallocHost((void **)&pack_vars_pair_forc->shiftz, 2 * target_n_tasks * sizeof(double)); // Pinned allocation on host + cudaMallocHost( + (void **)&pack_vars_pair_forc->shiftx, + 2 * target_n_tasks * sizeof(double)); // Pinned allocation on host + cudaMallocHost( + (void 
**)&pack_vars_pair_forc->shifty, + 2 * target_n_tasks * sizeof(double)); // Pinned allocation on host + cudaMallocHost( + (void **)&pack_vars_pair_forc->shiftz, + 2 * target_n_tasks * sizeof(double)); // Pinned allocation on host pack_vars_pair_forc->d_shiftx = d_forc_shift_x; pack_vars_pair_forc->d_shifty = d_forc_shift_y; pack_vars_pair_forc->d_shiftz = d_forc_shift_z; - cudaMallocHost((void **)&pack_vars_self_grad->cellx, target_n_tasks * sizeof(double)); // Pinned allocation on host - cudaMallocHost((void **)&pack_vars_self_grad->celly, target_n_tasks * sizeof(double)); // Pinned allocation on host - cudaMallocHost((void **)&pack_vars_self_grad->cellz, target_n_tasks * sizeof(double)); // Pinned allocation on host + cudaMallocHost((void **)&pack_vars_self_grad->cellx, + target_n_tasks * sizeof(double)); // Pinned allocation on host + cudaMallocHost((void **)&pack_vars_self_grad->celly, + target_n_tasks * sizeof(double)); // Pinned allocation on host + cudaMallocHost((void **)&pack_vars_self_grad->cellz, + target_n_tasks * sizeof(double)); // Pinned allocation on host pack_vars_self_grad->d_cellx = d_grad_cell_x; pack_vars_self_grad->d_celly = d_grad_cell_y; pack_vars_self_grad->d_cellz = d_grad_cell_z; - cudaMallocHost((void **)&pack_vars_pair_grad->shiftx, 2 * target_n_tasks * sizeof(double)); // Pinned allocation on host - cudaMallocHost((void **)&pack_vars_pair_grad->shifty, 2 * target_n_tasks * sizeof(double)); // Pinned allocation on host - cudaMallocHost((void **)&pack_vars_pair_grad->shiftz, 2 * target_n_tasks * sizeof(double)); // Pinned allocation on host + cudaMallocHost( + (void **)&pack_vars_pair_grad->shiftx, + 2 * target_n_tasks * sizeof(double)); // Pinned allocation on host + cudaMallocHost( + (void **)&pack_vars_pair_grad->shifty, + 2 * target_n_tasks * sizeof(double)); // Pinned allocation on host + cudaMallocHost( + (void **)&pack_vars_pair_grad->shiftz, + 2 * target_n_tasks * sizeof(double)); // Pinned allocation on host pack_vars_pair_grad->d_shiftx = d_grad_shift_x; pack_vars_pair_grad->d_shifty = d_grad_shift_y; @@ -564,31 +606,26 @@ void *runner_main2(void *data) { cudaStream_t stream_pairs[nBundles_pair]; cudaEvent_t self_end[nBundles]; - for (int i =0; i < nBundles; i++) - cudaEventCreate(&self_end[i]); + for (int i = 0; i < nBundles; i++) cudaEventCreate(&self_end[i]); cudaEvent_t self_end_g[nBundles]; - for (int i =0; i < nBundles; i++) - cudaEventCreate(&self_end_g[i]); + for (int i = 0; i < nBundles; i++) cudaEventCreate(&self_end_g[i]); cudaEvent_t self_end_f[nBundles]; - for (int i =0; i < nBundles; i++) - cudaEventCreate(&self_end_f[i]); + for (int i = 0; i < nBundles; i++) cudaEventCreate(&self_end_f[i]); cudaEvent_t pair_end[nBundles]; - for (int i =0; i < nBundles; i++) - cudaEventCreate(&pair_end[i]); + for (int i = 0; i < nBundles; i++) cudaEventCreate(&pair_end[i]); cudaEvent_t pair_end_g[nBundles]; - for (int i =0; i < nBundles; i++) - cudaEventCreate(&pair_end_g[i]); + for (int i = 0; i < nBundles; i++) cudaEventCreate(&pair_end_g[i]); cudaEvent_t pair_end_f[nBundles]; - for (int i =0; i < nBundles; i++) - cudaEventCreate(&pair_end_f[i]); + for (int i = 0; i < nBundles; i++) cudaEventCreate(&pair_end_f[i]); int tasksperbundle = (target_n_tasks + nBundles - 1) / nBundles; - int tasksperbundle_pair = (target_n_tasks_pair + nBundles_pair - 1) / nBundles_pair; + int tasksperbundle_pair = + (target_n_tasks_pair + nBundles_pair - 1) / nBundles_pair; pack_vars_self_dens->tasksperbundle = tasksperbundle; pack_vars_pair_dens->tasksperbundle = 
tasksperbundle_pair; @@ -597,7 +634,6 @@ void *runner_main2(void *data) { pack_vars_self_grad->tasksperbundle = tasksperbundle; pack_vars_pair_grad->tasksperbundle = tasksperbundle_pair; - for (int i = 0; i < nBundles; ++i) cudaStreamCreateWithFlags(&stream[i], cudaStreamNonBlocking); @@ -619,8 +655,7 @@ void *runner_main2(void *data) { error("MPI_Comm_size failed with error %i.", res); #endif int count_max_parts_tmp = - 2 * target_n_tasks * space->nr_parts * nr_nodes / - space->nr_cells; + 2 * target_n_tasks * space->nr_parts * nr_nodes / space->nr_cells; pack_vars_self_dens->count_max_parts = count_max_parts_tmp; pack_vars_pair_dens->count_max_parts = count_max_parts_tmp; pack_vars_self_forc->count_max_parts = count_max_parts_tmp; @@ -628,117 +663,153 @@ void *runner_main2(void *data) { pack_vars_self_grad->count_max_parts = count_max_parts_tmp; pack_vars_pair_grad->count_max_parts = count_max_parts_tmp; - struct part_aos *parts_aos_dens; - struct part_aos_f4 *parts_aos_dens_f4; - struct part_aos_f4_send *parts_aos_f4_send; - struct part_aos_f4_recv *parts_aos_f4_recv; + struct part_aos *parts_aos_dens; + struct part_aos_f4 *parts_aos_dens_f4; + struct part_aos_f4_send *parts_aos_f4_send; + struct part_aos_f4_recv *parts_aos_f4_recv; - struct part_aos_f *parts_aos_forc; - struct part_aos_f4_f *parts_aos_forc_f4; + struct part_aos_f *parts_aos_forc; + struct part_aos_f4_f *parts_aos_forc_f4; struct part_aos_f4_f_send *parts_aos_forc_f4_send; struct part_aos_f4_f_recv *parts_aos_forc_f4_recv; - struct part_aos_g *parts_aos_grad; - struct part_aos_f4_g *parts_aos_grad_f4; + struct part_aos_g *parts_aos_grad; + struct part_aos_f4_g *parts_aos_grad_f4; struct part_aos_f4_g_send *parts_aos_grad_f4_send; struct part_aos_f4_g_recv *parts_aos_grad_f4_recv; - struct part_aos *d_parts_aos_dens; - struct part_aos_f4 *d_parts_aos_dens_f4; - struct part_aos_f4_send *d_parts_aos_f4_send; - struct part_aos_f4_recv *d_parts_aos_f4_recv; + struct part_aos *d_parts_aos_dens; + struct part_aos_f4 *d_parts_aos_dens_f4; + struct part_aos_f4_send *d_parts_aos_f4_send; + struct part_aos_f4_recv *d_parts_aos_f4_recv; - struct part_aos_f *d_parts_aos_forc; - struct part_aos_f4_f *d_parts_aos_forc_f4; + struct part_aos_f *d_parts_aos_forc; + struct part_aos_f4_f *d_parts_aos_forc_f4; struct part_aos_f4_f_send *d_parts_aos_forc_f4_send; struct part_aos_f4_f_recv *d_parts_aos_forc_f4_recv; - struct part_aos_g *d_parts_aos_grad; - struct part_aos_f4_g *d_parts_aos_grad_f4; + struct part_aos_g *d_parts_aos_grad; + struct part_aos_f4_g *d_parts_aos_grad_f4; struct part_aos_f4_g_send *d_parts_aos_grad_f4_send; struct part_aos_f4_g_recv *d_parts_aos_grad_f4_recv; - struct part_aos *parts_aos_pair_dens; - struct part_aos_f4_send *parts_aos_pair_f4_send; - struct part_aos_f4_recv *parts_aos_pair_f4_recv; + struct part_aos *parts_aos_pair_dens; + struct part_aos_f4_send *parts_aos_pair_f4_send; + struct part_aos_f4_recv *parts_aos_pair_f4_recv; - struct part_aos *d_parts_aos_pair_dens; - struct part_aos_f4_send *d_parts_aos_pair_f4_send; - struct part_aos_f4_recv *d_parts_aos_pair_f4_recv; + struct part_aos *d_parts_aos_pair_dens; + struct part_aos_f4_send *d_parts_aos_pair_f4_send; + struct part_aos_f4_recv *d_parts_aos_pair_f4_recv; - struct part_aos_f *parts_aos_pair_forc; + struct part_aos_f *parts_aos_pair_forc; struct part_aos_f4_f_send *parts_aos_pair_f4_f_send; struct part_aos_f4_f_recv *parts_aos_pair_f4_f_recv; - struct part_aos_f *d_parts_aos_pair_forc; + struct part_aos_f *d_parts_aos_pair_forc; struct 
part_aos_f4_f_send *d_parts_aos_pair_f4_f_send; struct part_aos_f4_f_recv *d_parts_aos_pair_f4_f_recv; - struct part_aos_g *parts_aos_pair_grad; + struct part_aos_g *parts_aos_pair_grad; struct part_aos_f4_g_send *parts_aos_pair_f4_g_send; struct part_aos_f4_g_recv *parts_aos_pair_f4_g_recv; - struct part_aos_g *d_parts_aos_pair_grad; + struct part_aos_g *d_parts_aos_pair_grad; struct part_aos_f4_g_send *d_parts_aos_pair_f4_g_send; struct part_aos_f4_g_recv *d_parts_aos_pair_f4_g_recv; - - - -// cudaMalloc((void**)&d_parts_aos_dens, count_max_parts_tmp * sizeof(struct part_aos)); - cudaMalloc((void**)&d_parts_aos_f4_send, count_max_parts_tmp * sizeof(struct part_aos_f4_send)); - cudaMalloc((void**)&d_parts_aos_f4_recv, count_max_parts_tmp * sizeof(struct part_aos_f4_recv)); -// cudaMalloc((void**)&d_parts_aos_dens_f4, count_max_parts_tmp * sizeof(struct part_aos_f4)); -// cudaMalloc((void**)&d_parts_aos_forc, count_max_parts_tmp * sizeof(struct part_aos_f)); -// cudaMalloc((void**)&d_parts_aos_forc_f4, count_max_parts_tmp * sizeof(struct part_aos_f4_f)); - cudaMalloc((void**)&d_parts_aos_forc_f4_send, count_max_parts_tmp * sizeof(struct part_aos_f4_f_send)); - cudaMalloc((void**)&d_parts_aos_forc_f4_recv, count_max_parts_tmp * sizeof(struct part_aos_f4_f_recv)); -// cudaMalloc((void**)&d_parts_aos_grad, count_max_parts_tmp * sizeof(struct part_aos_g)); -// cudaMalloc((void**)&d_parts_aos_grad_f4, count_max_parts_tmp * sizeof(struct part_aos_f4_g)); - cudaMalloc((void**)&d_parts_aos_grad_f4_send, count_max_parts_tmp * sizeof(struct part_aos_f4_g_send)); - cudaMalloc((void**)&d_parts_aos_grad_f4_recv, count_max_parts_tmp * sizeof(struct part_aos_f4_g_recv)); - -// cudaMallocHost((void **)&parts_aos_dens, count_max_parts_tmp * sizeof(struct part_aos)); - cudaMallocHost((void **)&parts_aos_f4_send, count_max_parts_tmp * sizeof(struct part_aos_f4_send)); - cudaMallocHost((void **)&parts_aos_f4_recv, count_max_parts_tmp * sizeof(struct part_aos_f4_recv)); -// cudaMallocHost((void **)&parts_aos_dens_f4, count_max_parts_tmp * sizeof(struct part_aos_f4)); -// cudaMallocHost((void **)&parts_aos_forc, count_max_parts_tmp * sizeof(struct part_aos_f)); -// cudaMallocHost((void **)&parts_aos_forc_f4, count_max_parts_tmp * sizeof(struct part_aos_f4_f)); - cudaMallocHost((void **)&parts_aos_forc_f4_send, count_max_parts_tmp * sizeof(struct part_aos_f4_f_send)); - cudaMallocHost((void **)&parts_aos_forc_f4_recv, count_max_parts_tmp * sizeof(struct part_aos_f4_f_recv)); -// cudaMallocHost((void **)&parts_aos_grad, count_max_parts_tmp * sizeof(struct part_aos_g)); -// cudaMallocHost((void **)&parts_aos_grad_f4, count_max_parts_tmp * sizeof(struct part_aos_f4_g)); - cudaMallocHost((void **)&parts_aos_grad_f4_send, count_max_parts_tmp * sizeof(struct part_aos_f4_g_send)); - cudaMallocHost((void **)&parts_aos_grad_f4_recv, count_max_parts_tmp * sizeof(struct part_aos_f4_g_recv)); - -// cudaMalloc((void**)&d_parts_aos_pair_dens, 2 * count_max_parts_tmp * sizeof(struct part_aos)); - cudaMalloc((void**)&d_parts_aos_pair_f4_send, 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_send)); - cudaMalloc((void**)&d_parts_aos_pair_f4_recv, 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_recv)); - - cudaMalloc((void**)&d_parts_aos_pair_f4_f_send, 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_f_send)); - cudaMalloc((void**)&d_parts_aos_pair_f4_f_recv, 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_f_recv)); - - cudaMalloc((void**)&d_parts_aos_pair_f4_g_send, 2 * count_max_parts_tmp * sizeof(struct 
part_aos_f4_g_send)); - cudaMalloc((void**)&d_parts_aos_pair_f4_g_recv, 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_g_recv)); - - -///////////Probably not needed anymore//////////////////////////////////////////////////////////////// - cudaMalloc((void**)&d_parts_aos_pair_forc, 2 * count_max_parts_tmp * sizeof(struct part_aos_f)); - cudaMalloc((void**)&d_parts_aos_pair_grad, 2 * count_max_parts_tmp * sizeof(struct part_aos_g)); - ///////////Probably not needed anymore//////////////////////////////////////////////////////////////// - -// cudaMallocHost((void **)&parts_aos_pair_dens, 2 * count_max_parts_tmp * sizeof(struct part_aos)); - cudaMallocHost((void **)&parts_aos_pair_f4_send, 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_send)); - cudaMallocHost((void **)&parts_aos_pair_f4_recv, 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_recv)); - - cudaMallocHost((void **)&parts_aos_pair_f4_g_send, 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_g_send)); - cudaMallocHost((void **)&parts_aos_pair_f4_g_recv, 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_g_recv)); - - cudaMallocHost((void **)&parts_aos_pair_f4_f_send, 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_f_send)); - cudaMallocHost((void **)&parts_aos_pair_f4_f_recv, 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_f_recv)); - - cudaMallocHost((void **)&parts_aos_pair_forc, 2 * count_max_parts_tmp * sizeof(struct part_aos_f)); - cudaMallocHost((void **)&parts_aos_pair_grad, 2 * count_max_parts_tmp * sizeof(struct part_aos_g)); + // cudaMalloc((void**)&d_parts_aos_dens, count_max_parts_tmp * sizeof(struct + // part_aos)); + cudaMalloc((void **)&d_parts_aos_f4_send, + count_max_parts_tmp * sizeof(struct part_aos_f4_send)); + cudaMalloc((void **)&d_parts_aos_f4_recv, + count_max_parts_tmp * sizeof(struct part_aos_f4_recv)); + // cudaMalloc((void**)&d_parts_aos_dens_f4, count_max_parts_tmp * + // sizeof(struct part_aos_f4)); cudaMalloc((void**)&d_parts_aos_forc, + // count_max_parts_tmp * sizeof(struct part_aos_f)); + // cudaMalloc((void**)&d_parts_aos_forc_f4, count_max_parts_tmp * + // sizeof(struct part_aos_f4_f)); + cudaMalloc((void **)&d_parts_aos_forc_f4_send, + count_max_parts_tmp * sizeof(struct part_aos_f4_f_send)); + cudaMalloc((void **)&d_parts_aos_forc_f4_recv, + count_max_parts_tmp * sizeof(struct part_aos_f4_f_recv)); + // cudaMalloc((void**)&d_parts_aos_grad, count_max_parts_tmp * sizeof(struct + // part_aos_g)); cudaMalloc((void**)&d_parts_aos_grad_f4, count_max_parts_tmp + // * sizeof(struct part_aos_f4_g)); + cudaMalloc((void **)&d_parts_aos_grad_f4_send, + count_max_parts_tmp * sizeof(struct part_aos_f4_g_send)); + cudaMalloc((void **)&d_parts_aos_grad_f4_recv, + count_max_parts_tmp * sizeof(struct part_aos_f4_g_recv)); + + // cudaMallocHost((void **)&parts_aos_dens, count_max_parts_tmp * + // sizeof(struct part_aos)); + cudaMallocHost((void **)&parts_aos_f4_send, + count_max_parts_tmp * sizeof(struct part_aos_f4_send)); + cudaMallocHost((void **)&parts_aos_f4_recv, + count_max_parts_tmp * sizeof(struct part_aos_f4_recv)); + // cudaMallocHost((void **)&parts_aos_dens_f4, count_max_parts_tmp * + // sizeof(struct part_aos_f4)); cudaMallocHost((void **)&parts_aos_forc, + // count_max_parts_tmp * sizeof(struct part_aos_f)); cudaMallocHost((void + // **)&parts_aos_forc_f4, count_max_parts_tmp * sizeof(struct + // part_aos_f4_f)); + cudaMallocHost((void **)&parts_aos_forc_f4_send, + count_max_parts_tmp * sizeof(struct part_aos_f4_f_send)); + cudaMallocHost((void **)&parts_aos_forc_f4_recv, + 
count_max_parts_tmp * sizeof(struct part_aos_f4_f_recv)); + // cudaMallocHost((void **)&parts_aos_grad, count_max_parts_tmp * + // sizeof(struct part_aos_g)); cudaMallocHost((void **)&parts_aos_grad_f4, + // count_max_parts_tmp * sizeof(struct part_aos_f4_g)); + cudaMallocHost((void **)&parts_aos_grad_f4_send, + count_max_parts_tmp * sizeof(struct part_aos_f4_g_send)); + cudaMallocHost((void **)&parts_aos_grad_f4_recv, + count_max_parts_tmp * sizeof(struct part_aos_f4_g_recv)); + + // cudaMalloc((void**)&d_parts_aos_pair_dens, 2 * count_max_parts_tmp * + // sizeof(struct part_aos)); + cudaMalloc((void **)&d_parts_aos_pair_f4_send, + 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_send)); + cudaMalloc((void **)&d_parts_aos_pair_f4_recv, + 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_recv)); + + cudaMalloc((void **)&d_parts_aos_pair_f4_f_send, + 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_f_send)); + cudaMalloc((void **)&d_parts_aos_pair_f4_f_recv, + 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_f_recv)); + + cudaMalloc((void **)&d_parts_aos_pair_f4_g_send, + 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_g_send)); + cudaMalloc((void **)&d_parts_aos_pair_f4_g_recv, + 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_g_recv)); + + ///////////Probably not needed + ///anymore//////////////////////////////////////////////////////////////// + cudaMalloc((void **)&d_parts_aos_pair_forc, + 2 * count_max_parts_tmp * sizeof(struct part_aos_f)); + cudaMalloc((void **)&d_parts_aos_pair_grad, + 2 * count_max_parts_tmp * sizeof(struct part_aos_g)); + ///////////Probably not needed + ///anymore//////////////////////////////////////////////////////////////// + + // cudaMallocHost((void **)&parts_aos_pair_dens, 2 * count_max_parts_tmp * + // sizeof(struct part_aos)); + cudaMallocHost((void **)&parts_aos_pair_f4_send, + 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_send)); + cudaMallocHost((void **)&parts_aos_pair_f4_recv, + 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_recv)); + + cudaMallocHost((void **)&parts_aos_pair_f4_g_send, + 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_g_send)); + cudaMallocHost((void **)&parts_aos_pair_f4_g_recv, + 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_g_recv)); + + cudaMallocHost((void **)&parts_aos_pair_f4_f_send, + 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_f_send)); + cudaMallocHost((void **)&parts_aos_pair_f4_f_recv, + 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_f_recv)); + + cudaMallocHost((void **)&parts_aos_pair_forc, + 2 * count_max_parts_tmp * sizeof(struct part_aos_f)); + cudaMallocHost((void **)&parts_aos_pair_grad, + 2 * count_max_parts_tmp * sizeof(struct part_aos_g)); /*Declare some global variables*/ float d_a = e->cosmology->a; @@ -747,40 +818,40 @@ void *runner_main2(void *data) { // a list of the cells and tasks the GPU will work on pack_vars_self_dens->task_list = - (struct task **)calloc(target_n_tasks, sizeof(struct task *)); + (struct task **)calloc(target_n_tasks, sizeof(struct task *)); pack_vars_self_dens->cell_list = - (struct cell **)calloc(target_n_tasks, sizeof(struct cell *)); + (struct cell **)calloc(target_n_tasks, sizeof(struct cell *)); pack_vars_pair_dens->task_list = - (struct task **)calloc(target_n_tasks, sizeof(struct task *)); + (struct task **)calloc(target_n_tasks, sizeof(struct task *)); pack_vars_pair_dens->ci_list = - (struct cell **)calloc(target_n_tasks, sizeof(struct cell *)); + (struct cell **)calloc(target_n_tasks, sizeof(struct cell *)); 
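(Aside, not part of the patch: the paired cudaMallocHost()/cudaMalloc() calls above follow the standard pinned-staging pattern, so the packed *_send / *_recv particle arrays can be moved with cudaMemcpyAsync() and overlapped with kernel execution on the non-blocking streams created above. The sketch below shows that pattern in isolation; part_f4 and my_kernel are hypothetical stand-ins for the real part_aos_f4_send / part_aos_f4_recv types and GPU kernels.)

/* Illustrative sketch only, not part of the patch: pinned host buffer +
 * device buffer + asynchronous copies on a non-blocking stream. */
#include <cuda_runtime.h>
#include <stdio.h>

struct part_f4 { float4 x_h; }; /* stand-in for a packed send/recv particle */

__global__ void my_kernel(struct part_f4 *p, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) p[i].x_h.w *= 2.f; /* dummy work */
}

int main(void) {
  const int n = 1 << 16;
  struct part_f4 *h_parts, *d_parts;
  cudaStream_t stream;

  /* Page-locked host staging buffer and its device counterpart. */
  cudaMallocHost((void **)&h_parts, n * sizeof(struct part_f4));
  cudaMalloc((void **)&d_parts, n * sizeof(struct part_f4));
  cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);

  /* Pack on the host ... then stage, launch and retrieve asynchronously. */
  cudaMemcpyAsync(d_parts, h_parts, n * sizeof(struct part_f4),
                  cudaMemcpyHostToDevice, stream);
  my_kernel<<<(n + 255) / 256, 256, 0, stream>>>(d_parts, n);
  cudaMemcpyAsync(h_parts, d_parts, n * sizeof(struct part_f4),
                  cudaMemcpyDeviceToHost, stream);
  cudaStreamSynchronize(stream);

  printf("done\n");
  cudaFree(d_parts);
  cudaFreeHost(h_parts);
  cudaStreamDestroy(stream);
  return 0;
}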
pack_vars_pair_dens->cj_list = - (struct cell **)calloc(target_n_tasks, sizeof(struct cell *)); + (struct cell **)calloc(target_n_tasks, sizeof(struct cell *)); pack_vars_self_forc->task_list = - (struct task **)calloc(target_n_tasks, sizeof(struct task *)); + (struct task **)calloc(target_n_tasks, sizeof(struct task *)); pack_vars_self_forc->cell_list = - (struct cell **)calloc(target_n_tasks, sizeof(struct cell *)); + (struct cell **)calloc(target_n_tasks, sizeof(struct cell *)); pack_vars_pair_forc->task_list = - (struct task **)calloc(target_n_tasks, sizeof(struct task *)); + (struct task **)calloc(target_n_tasks, sizeof(struct task *)); pack_vars_pair_forc->ci_list = - (struct cell **)calloc(target_n_tasks, sizeof(struct cell *)); + (struct cell **)calloc(target_n_tasks, sizeof(struct cell *)); pack_vars_pair_forc->cj_list = - (struct cell **)calloc(target_n_tasks, sizeof(struct cell *)); + (struct cell **)calloc(target_n_tasks, sizeof(struct cell *)); pack_vars_self_grad->task_list = - (struct task **)calloc(target_n_tasks, sizeof(struct task *)); + (struct task **)calloc(target_n_tasks, sizeof(struct task *)); pack_vars_self_grad->cell_list = - (struct cell **)calloc(target_n_tasks, sizeof(struct cell *)); + (struct cell **)calloc(target_n_tasks, sizeof(struct cell *)); pack_vars_pair_grad->task_list = - (struct task **)calloc(target_n_tasks, sizeof(struct task *)); + (struct task **)calloc(target_n_tasks, sizeof(struct task *)); pack_vars_pair_grad->ci_list = - (struct cell **)calloc(target_n_tasks, sizeof(struct cell *)); + (struct cell **)calloc(target_n_tasks, sizeof(struct cell *)); pack_vars_pair_grad->cj_list = - (struct cell **)calloc(target_n_tasks, sizeof(struct cell *)); + (struct cell **)calloc(target_n_tasks, sizeof(struct cell *)); // number of density self tasks executed int tasks_done_cpu = 0; @@ -789,40 +860,42 @@ void *runner_main2(void *data) { /* Main loop. 
*/ while (1) { - /*Stuff for debugging*/ - int n_full_d_bundles = 0, n_full_g_bundles = 0, n_full_f_bundles = 0; - int n_full_p_d_bundles = 0, n_full_p_g_bundles = 0, n_full_p_f_bundles = 0; - int n_partial_d_bundles = 0, n_partial_g_bundles = 0, n_partial_f_bundles = 0; - int n_partial_p_d_bundles = 0, n_partial_p_g_bundles = 0, n_partial_p_f_bundles = 0; - int output = 0; - int packed_self = 0; - int packed_pair = 0; - int packed_self_f = 0; - int packed_pair_f = 0; - int packed_self_g = 0; - int packed_pair_g = 0; - int density = 0; - int density_sub = 0; - int unpacked = 0; - int unpacked_f = 0; - int unpacked_g = 0; - int unpacked_pair = 0; - int unpacked_pair_f = 0; - int unpacked_pair_g = 0; - int ghost_in = 0; - int cpu_self = 0; - int cpu_self_f = 0; - int cpu_self_g = 0; - int cpu_pair = 0; - int cpu_pair_f = 0; - int cpu_pair_g = 0; + /*Stuff for debugging*/ + int n_full_d_bundles = 0, n_full_g_bundles = 0, n_full_f_bundles = 0; + int n_full_p_d_bundles = 0, n_full_p_g_bundles = 0, n_full_p_f_bundles = 0; + int n_partial_d_bundles = 0, n_partial_g_bundles = 0, + n_partial_f_bundles = 0; + int n_partial_p_d_bundles = 0, n_partial_p_g_bundles = 0, + n_partial_p_f_bundles = 0; + int output = 0; + int packed_self = 0; + int packed_pair = 0; + int packed_self_f = 0; + int packed_pair_f = 0; + int packed_self_g = 0; + int packed_pair_g = 0; + int density = 0; + int density_sub = 0; + int unpacked = 0; + int unpacked_f = 0; + int unpacked_g = 0; + int unpacked_pair = 0; + int unpacked_pair_f = 0; + int unpacked_pair_g = 0; + int ghost_in = 0; + int cpu_self = 0; + int cpu_self_f = 0; + int cpu_self_g = 0; + int cpu_pair = 0; + int cpu_pair_f = 0; + int cpu_pair_g = 0; // Initialise timers to zero - double time_for_density_cpu = 0.0; - double time_for_density_cpu_pair = 0.0; - double time_for_cpu_g = 0.0; - double time_for_cpu_pair_g = 0.0; - double time_for_cpu_f = 0.0; - double time_for_cpu_pair_f = 0.0; + double time_for_density_cpu = 0.0; + double time_for_density_cpu_pair = 0.0; + double time_for_cpu_g = 0.0; + double time_for_cpu_pair_g = 0.0; + double time_for_cpu_f = 0.0; + double time_for_cpu_pair_f = 0.0; double time_for_density_cpu_sub = 0.0; double time_for_density_gpu = 0.0; double time_for_density_gpu_pair = 0.0; @@ -869,8 +942,7 @@ void *runner_main2(void *data) { double time_for_copy_to_struct = 0.0; double tot_time_for_hard_memcpys = 0.0; /* Can we go home yet? */ - if (e->step_props & engine_step_prop_done) - break; + if (e->step_props & engine_step_prop_done) break; /* Re-set the pointer to the previous task, as there is none. */ struct task *t = NULL; struct task *prev = NULL; @@ -879,9 +951,9 @@ void *runner_main2(void *data) { snprintf(buf5, sizeof(buf5), "t%dr%dstep%d", r->cpuid, engine_rank, step); FILE *fgpu_steps; -// if(step == 0 || step%10 == 0)fgpu_steps = fopen(buf5, "w"); + // if(step == 0 || step%10 == 0)fgpu_steps = fopen(buf5, "w"); fgpu_steps = fopen(buf5, "w"); -// if (step == 0) cudaProfilerStart(); + // if (step == 0) cudaProfilerStart(); step++; sched->nr_packs_self_dens_done = 0; @@ -902,8 +974,7 @@ void *runner_main2(void *data) { t = scheduler_gettask(sched, r->qid, prev); TIMER_TOC(timer_gettask); /* Did I get anything? */ - if (t == NULL) - break; + if (t == NULL) break; } /* Get the cells. */ @@ -935,114 +1006,125 @@ void *runner_main2(void *data) { const ticks task_beg = getticks(); /* Different types of tasks... 
*/ switch (t->type) { - case task_type_self: - if (t->subtype == task_subtype_gpu_unpack) { + case task_type_self: + if (t->subtype == task_subtype_gpu_unpack) { unpacked++; - } else if (t->subtype == task_subtype_gpu_unpack_g) { + } else if (t->subtype == task_subtype_gpu_unpack_g) { unpacked_g++; - } else if (t->subtype == task_subtype_gpu_unpack_f) { + } else if (t->subtype == task_subtype_gpu_unpack_f) { unpacked_f++; - } else if (t->subtype == task_subtype_density) { + } else if (t->subtype == task_subtype_density) { cpu_self++; #ifndef GPUOFFLOAD - struct timespec t0, t1, dt; - clock_gettime(CLOCK_REALTIME, &t0); - runner_doself1_branch_density(r, ci); - clock_gettime(CLOCK_REALTIME, &t1); - tasks_done_cpu++; - time_for_density_cpu += - (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / - 1000000000.0; - density++; + struct timespec t0, t1, dt; + clock_gettime(CLOCK_REALTIME, &t0); + runner_doself1_branch_density(r, ci); + clock_gettime(CLOCK_REALTIME, &t1); + tasks_done_cpu++; + time_for_density_cpu += (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + density++; #endif - /* GPU WORK */ - } else if (t->subtype == task_subtype_gpu_pack) { - packed_self++; + /* GPU WORK */ + } else if (t->subtype == task_subtype_gpu_pack) { + packed_self++; #ifdef GPUOFFLOAD -// struct timespec t0, t1; // -// clock_gettime(CLOCK_REALTIME, &t0); - packing_time += runner_doself1_pack_f4(r, sched, pack_vars_self_dens, ci, - t, parts_aos_f4_send, task_first_part_f4); -// clock_gettime(CLOCK_REALTIME, &t1); -// packing_time += (t1.tv_sec - t0.tv_sec) + -// (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; -// runner_doself1_pack(r, sched, pack_vars_self_dens, ci, -// t, parts_aos_dens, &packing_time); - /* No pack tasks left in queue, flag that we want to run */ - int launch_leftovers = pack_vars_self_dens->launch_leftovers; - /*Packed enough tasks let's go*/ - int launch = pack_vars_self_dens->launch; - /* Do we have enough stuff to run the GPU ? */ - if(launch)n_full_d_bundles++; - if(launch_leftovers)n_partial_d_bundles++; - if (launch || launch_leftovers) { - /*Launch GPU tasks*/ - runner_doself1_launch_f4(r, sched, pack_vars_self_dens, ci, t, parts_aos_f4_send, parts_aos_f4_recv, - d_parts_aos_f4_send, d_parts_aos_f4_recv, stream, d_a, d_H, e, &packing_time, &time_for_density_gpu, - &unpack_time_self, task_first_part_self_dens_f4, devId, - task_first_part_f4, d_task_first_part_f4, self_end); -// runner_doself1_launch(r, sched, pack_vars_self_dens, ci, t, parts_aos_dens, -// d_parts_aos_dens, stream, d_a, d_H, e, &packing_time, &time_for_density_gpu, -// &tot_time_for_hard_memcpys); - } /*End of GPU work Self*/ -#endif //GPUDENSSELF - } /* self / pack */ - else if (t->subtype == task_subtype_gpu_pack_g){ - packed_self_g++; + // struct timespec t0, t1; // + // clock_gettime(CLOCK_REALTIME, &t0); + packing_time += + runner_doself1_pack_f4(r, sched, pack_vars_self_dens, ci, t, + parts_aos_f4_send, task_first_part_f4); + // clock_gettime(CLOCK_REALTIME, &t1); + // packing_time += (t1.tv_sec - t0.tv_sec) + + // (t1.tv_nsec - t0.tv_nsec) / + // 1000000000.0; + // runner_doself1_pack(r, sched, pack_vars_self_dens, ci, + // t, parts_aos_dens, &packing_time); + /* No pack tasks left in queue, flag that we want to run */ + int launch_leftovers = pack_vars_self_dens->launch_leftovers; + /*Packed enough tasks let's go*/ + int launch = pack_vars_self_dens->launch; + /* Do we have enough stuff to run the GPU ? 
*/ + if (launch) n_full_d_bundles++; + if (launch_leftovers) n_partial_d_bundles++; + if (launch || launch_leftovers) { + /*Launch GPU tasks*/ + runner_doself1_launch_f4( + r, sched, pack_vars_self_dens, ci, t, parts_aos_f4_send, + parts_aos_f4_recv, d_parts_aos_f4_send, d_parts_aos_f4_recv, + stream, d_a, d_H, e, &packing_time, &time_for_density_gpu, + &unpack_time_self, task_first_part_self_dens_f4, devId, + task_first_part_f4, d_task_first_part_f4, self_end); + // runner_doself1_launch(r, sched, + //pack_vars_self_dens, ci, t, parts_aos_dens, d_parts_aos_dens, + //stream, d_a, d_H, e, &packing_time, &time_for_density_gpu, + // &tot_time_for_hard_memcpys); + } /*End of GPU work Self*/ +#endif // GPUDENSSELF + } /* self / pack */ + else if (t->subtype == task_subtype_gpu_pack_g) { + packed_self_g++; #ifdef GPUOFFLOAD -// runner_doself1_pack_g(r, sched, pack_vars_self_grad, ci, -// t, parts_aos_grad, &packing_time_g); - packing_time_g += runner_doself1_pack_f4_g(r, sched, pack_vars_self_grad, ci, - t, parts_aos_grad_f4_send, task_first_part_f4_g); - /* No pack tasks left in queue, flag that we want to run */ - int launch_leftovers = pack_vars_self_grad->launch_leftovers; - /*Packed enough tasks let's go*/ - int launch = pack_vars_self_grad->launch; - /* Do we have enough stuff to run the GPU ? */ + // runner_doself1_pack_g(r, sched, pack_vars_self_grad, ci, + // t, parts_aos_grad, &packing_time_g); + packing_time_g += runner_doself1_pack_f4_g( + r, sched, pack_vars_self_grad, ci, t, parts_aos_grad_f4_send, + task_first_part_f4_g); + /* No pack tasks left in queue, flag that we want to run */ + int launch_leftovers = pack_vars_self_grad->launch_leftovers; + /*Packed enough tasks let's go*/ + int launch = pack_vars_self_grad->launch; + /* Do we have enough stuff to run the GPU ? 
*/ if (launch || launch_leftovers) { - /*Launch GPU tasks*/ -// runner_doself1_launch_g(r, sched, pack_vars_self_grad, ci, t, parts_aos_grad, -// d_parts_aos_grad, stream, d_a, d_H, e, &packing_time_g, &time_for_gpu_g); - runner_doself1_launch_f4_g(r, sched, pack_vars_self_grad, ci, - t, parts_aos_grad_f4_send, parts_aos_grad_f4_recv, d_parts_aos_grad_f4_send, - d_parts_aos_grad_f4_recv, stream, d_a, d_H, - e, &packing_time_g, &time_for_gpu_g, task_first_part_f4_g, d_task_first_part_f4_g, - self_end_g, &unpack_time_self_g); + /*Launch GPU tasks*/ + // runner_doself1_launch_g(r, sched, + // pack_vars_self_grad, ci, t, parts_aos_grad, + // d_parts_aos_grad, stream, d_a, + // d_H, e, &packing_time_g, &time_for_gpu_g); + runner_doself1_launch_f4_g( + r, sched, pack_vars_self_grad, ci, t, parts_aos_grad_f4_send, + parts_aos_grad_f4_recv, d_parts_aos_grad_f4_send, + d_parts_aos_grad_f4_recv, stream, d_a, d_H, e, + &packing_time_g, &time_for_gpu_g, task_first_part_f4_g, + d_task_first_part_f4_g, self_end_g, &unpack_time_self_g); } /*End of GPU work Self*/ -#endif //GPUGRADSELF - } - else if (t->subtype == task_subtype_gpu_pack_f){ - packed_self_f++; +#endif // GPUGRADSELF + } else if (t->subtype == task_subtype_gpu_pack_f) { + packed_self_f++; #ifdef GPUOFFLOAD -// runner_doself1_pack_f(r, sched, pack_vars_self_forc, ci, -// t, parts_aos_forc, &packing_time_f); - packing_time_f += runner_doself1_pack_f4_f(r, sched, pack_vars_self_forc, ci, - t, parts_aos_forc_f4_send, task_first_part_f4_f); -// int count = ci->hydro.count; -// for(int i = 0; i < count; i++){ -// int pid = pack_vars_self_forc->count_parts - count + i; -// if(parts_aos_forc_f4_send[pid].ux_m.w < 1e-9)fprintf(stderr, "zero mass after packing %i %f\n", pid, parts_aos_forc_f4_send[pid].ux_m.w); -// } - /* No pack tasks left in queue, flag that we want to run */ - int launch_leftovers = pack_vars_self_forc->launch_leftovers; - /*Packed enough tasks let's go*/ - int launch = pack_vars_self_forc->launch; - /* Do we have enough stuff to run the GPU ? */ - if (launch || launch_leftovers) { - /*Launch GPU tasks*/ -// runner_doself1_launch_f(r, sched, pack_vars_self_forc, ci, t, parts_aos_forc, -// d_parts_aos_forc, stream, d_a, d_H, e, &packing_time_f, &time_for_gpu_f); - runner_doself1_launch_f4_f(r, sched, pack_vars_self_forc, ci, - t, parts_aos_forc_f4_send, parts_aos_forc_f4_recv, d_parts_aos_forc_f4_send, - d_parts_aos_forc_f4_recv, stream, d_a, d_H, - e, &packing_time_f, &time_for_gpu_f, task_first_part_f4_f, d_task_first_part_f4_f, - self_end_f, &unpack_time_self_f); - } /*End of GPU work Self*/ -#endif //GPUFORCSELF - } + // runner_doself1_pack_f(r, sched, pack_vars_self_forc, ci, + // t, parts_aos_forc, &packing_time_f); + packing_time_f += runner_doself1_pack_f4_f( + r, sched, pack_vars_self_forc, ci, t, parts_aos_forc_f4_send, + task_first_part_f4_f); + // int count = ci->hydro.count; + // for(int i = 0; i < count; i++){ + // int pid = pack_vars_self_forc->count_parts - count + + // i; if(parts_aos_forc_f4_send[pid].ux_m.w < + // 1e-9)fprintf(stderr, "zero mass after packing %i %f\n", + // pid, parts_aos_forc_f4_send[pid].ux_m.w); + // } + /* No pack tasks left in queue, flag that we want to run */ + int launch_leftovers = pack_vars_self_forc->launch_leftovers; + /*Packed enough tasks let's go*/ + int launch = pack_vars_self_forc->launch; + /* Do we have enough stuff to run the GPU ? 
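// A minimal sketch, with hypothetical names, of the bundling logic the
// gpu_pack branches implement: each pack task adds one cell (or cell pair) to
// a staging buffer, and the bundle is flushed to the GPU either when it
// reaches its target size ("launch") or when the queue holds no more pack
// tasks ("launch_leftovers"). flush_to_gpu() stands in for the
// runner_do*_launch_f4* helpers, which perform the asynchronous host-to-device
// copy, kernel launch and device-to-host copy on a CUDA stream.
#include <stddef.h>

/* Hypothetical, simplified mirror of the pack_vars_* bookkeeping. */
struct pack_batch {
  size_t count;          /* tasks packed so far */
  size_t target_n_tasks; /* bundle size that triggers a launch */
};

static void pack_and_maybe_launch(struct pack_batch *b,
                                  int packs_left_in_queue,
                                  void (*flush_to_gpu)(struct pack_batch *)) {
  b->count++;
  const int launch = (b->count >= b->target_n_tasks);
  const int launch_leftovers = (packs_left_in_queue == 0);
  if (launch || launch_leftovers) {
    flush_to_gpu(b); /* copy bundle to device, run kernel, copy results back */
    b->count = 0;    /* start a new bundle */
  }
}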
*/ + if (launch || launch_leftovers) { + /*Launch GPU tasks*/ + // runner_doself1_launch_f(r, sched, + // pack_vars_self_forc, ci, t, parts_aos_forc, d_parts_aos_forc, + // stream, d_a, d_H, e, &packing_time_f, &time_for_gpu_f); + runner_doself1_launch_f4_f( + r, sched, pack_vars_self_forc, ci, t, parts_aos_forc_f4_send, + parts_aos_forc_f4_recv, d_parts_aos_forc_f4_send, + d_parts_aos_forc_f4_recv, stream, d_a, d_H, e, + &packing_time_f, &time_for_gpu_f, task_first_part_f4_f, + d_task_first_part_f4_f, self_end_f, &unpack_time_self_f); + } /*End of GPU work Self*/ +#endif // GPUFORCSELF + } #ifdef EXTRA_HYDRO_LOOP - else if (t->subtype == task_subtype_gradient) { + else if (t->subtype == task_subtype_gradient) { cpu_self_g++; #ifndef GPUOFFLOAD struct timespec t0, t1, dt; @@ -1050,14 +1132,12 @@ void *runner_main2(void *data) { runner_doself1_branch_gradient(r, ci); clock_gettime(CLOCK_REALTIME, &t1); tasks_done_cpu++; - time_for_cpu_g += - (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / - 1000000000.0; -#endif //GPUGRADSELF - } + time_for_cpu_g += (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +#endif // GPUGRADSELF + } #endif - else if (t->subtype == task_subtype_force) { + else if (t->subtype == task_subtype_force) { cpu_self_f++; #ifndef GPUOFFLOAD struct timespec t0, t1; @@ -1065,684 +1145,697 @@ void *runner_main2(void *data) { runner_doself2_branch_force(r, ci); clock_gettime(CLOCK_REALTIME, &t1); tasks_done_cpu++; - time_for_cpu_f += - (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / - 1000000000.0; -#endif //GPUFORCSELF - } else if (t->subtype == task_subtype_limiter) - runner_doself1_branch_limiter(r, ci); - else if (t->subtype == task_subtype_grav) - runner_doself_recursive_grav(r, ci, 1); - else if (t->subtype == task_subtype_external_grav) - runner_do_grav_external(r, ci, 1); - else if (t->subtype == task_subtype_stars_density) - runner_doself_branch_stars_density(r, ci); + time_for_cpu_f += (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +#endif // GPUFORCSELF + } else if (t->subtype == task_subtype_limiter) + runner_doself1_branch_limiter(r, ci); + else if (t->subtype == task_subtype_grav) + runner_doself_recursive_grav(r, ci, 1); + else if (t->subtype == task_subtype_external_grav) + runner_do_grav_external(r, ci, 1); + else if (t->subtype == task_subtype_stars_density) + runner_doself_branch_stars_density(r, ci); #ifdef EXTRA_STAR_LOOPS - else if (t->subtype == task_subtype_stars_prep1) - runner_doself_branch_stars_prep1(r, ci); - else if (t->subtype == task_subtype_stars_prep2) - runner_doself_branch_stars_prep2(r, ci); + else if (t->subtype == task_subtype_stars_prep1) + runner_doself_branch_stars_prep1(r, ci); + else if (t->subtype == task_subtype_stars_prep2) + runner_doself_branch_stars_prep2(r, ci); #endif - else if (t->subtype == task_subtype_stars_feedback) - runner_doself_branch_stars_feedback(r, ci); - else if (t->subtype == task_subtype_bh_density) - runner_doself_branch_bh_density(r, ci); - else if (t->subtype == task_subtype_bh_swallow) - runner_doself_branch_bh_swallow(r, ci); - else if (t->subtype == task_subtype_do_gas_swallow) - runner_do_gas_swallow_self(r, ci, 1); - else if (t->subtype == task_subtype_do_bh_swallow) - runner_do_bh_swallow_self(r, ci, 1); - else if (t->subtype == task_subtype_bh_feedback) - runner_doself_branch_bh_feedback(r, ci); - else if (t->subtype == task_subtype_rt_gradient) - runner_doself1_branch_rt_gradient(r, ci); - else if (t->subtype == task_subtype_rt_transport) - 
runner_doself2_branch_rt_transport(r, ci); - else if (t->subtype == task_subtype_sink_swallow) - runner_doself_branch_sinks_swallow(r, ci); - else if (t->subtype == task_subtype_sink_do_gas_swallow) - runner_do_sinks_gas_swallow_self(r, ci, 1); - else if (t->subtype == task_subtype_sink_do_sink_swallow) - runner_do_sinks_sink_swallow_self(r, ci, 1); - else - error("Unknown/invalid task subtype (%s).", - subtaskID_names[t->subtype]); - break; - - case task_type_pair: - if (t->subtype == task_subtype_density) { - /* Abouzied: To be commented out when the GPU pairs have been coded up */ - cpu_pair++; + else if (t->subtype == task_subtype_stars_feedback) + runner_doself_branch_stars_feedback(r, ci); + else if (t->subtype == task_subtype_bh_density) + runner_doself_branch_bh_density(r, ci); + else if (t->subtype == task_subtype_bh_swallow) + runner_doself_branch_bh_swallow(r, ci); + else if (t->subtype == task_subtype_do_gas_swallow) + runner_do_gas_swallow_self(r, ci, 1); + else if (t->subtype == task_subtype_do_bh_swallow) + runner_do_bh_swallow_self(r, ci, 1); + else if (t->subtype == task_subtype_bh_feedback) + runner_doself_branch_bh_feedback(r, ci); + else if (t->subtype == task_subtype_rt_gradient) + runner_doself1_branch_rt_gradient(r, ci); + else if (t->subtype == task_subtype_rt_transport) + runner_doself2_branch_rt_transport(r, ci); + else if (t->subtype == task_subtype_sink_swallow) + runner_doself_branch_sinks_swallow(r, ci); + else if (t->subtype == task_subtype_sink_do_gas_swallow) + runner_do_sinks_gas_swallow_self(r, ci, 1); + else if (t->subtype == task_subtype_sink_do_sink_swallow) + runner_do_sinks_sink_swallow_self(r, ci, 1); + else + error("Unknown/invalid task subtype (%s).", + subtaskID_names[t->subtype]); + break; + + case task_type_pair: + if (t->subtype == task_subtype_density) { + /* Abouzied: To be commented out when the GPU pairs have been coded + * up */ + cpu_pair++; #ifndef GPUOFFLOAD - struct timespec t0, t1, dt; - clock_gettime(CLOCK_REALTIME, &t0); - runner_dopair1_branch_density(r, ci, cj); - clock_gettime(CLOCK_REALTIME, &t1); - tasks_done_cpu++; - time_for_density_cpu_pair += - (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / - 1000000000.0; + struct timespec t0, t1, dt; + clock_gettime(CLOCK_REALTIME, &t0); + runner_dopair1_branch_density(r, ci, cj); + clock_gettime(CLOCK_REALTIME, &t1); + tasks_done_cpu++; + time_for_density_cpu_pair += + (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; #endif - } - /* GPU WORK */ - else if (t->subtype == task_subtype_gpu_pack) { - packed_pair++; + } + /* GPU WORK */ + else if (t->subtype == task_subtype_gpu_pack) { + packed_pair++; #ifdef GPUOFFLOAD #ifdef DO_CORNERS - struct timespec t0, t1, dt; - clock_gettime(CLOCK_REALTIME, &t0); - double shift[3] = {0.0}; - t->corner_pair = 0; - int sid = space_getsid_filter(e->s, &ci, &cj, shift); - clock_gettime(CLOCK_REALTIME, &t1); - packing_time_pair += - (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / - 1000000000.0; - if((sid == 0 || sid == 2 || sid == 6 || sid == 8) && step > 1){ -// if((sid != 4 && sid != 10 && sid == 12) && step > 1){ - clock_gettime(CLOCK_REALTIME, &t0); - runner_dopair1_branch_density(r, ci, cj); - t->corner_pair = 1; - int qid = r->qid; - atomic_dec(&(sched->queues[qid].n_packs_pair_left)); - /* Tell the cells they have been packed */ - ci->pack_done++; - cj->pack_done++; - t->done = 1; - int launch = 0, launch_leftovers = 0; - if ((sched->queues[qid].n_packs_pair_left == 0)) - launch_leftovers = 1; - /* Tasks done. 
Release the lock ! */ - task_unlock(t); - /*schedule my dependencies (Only unpacks really)*/ - enqueue_dependencies(sched, t); - /*Signal sleeping runners*/ - signal_sleeping_runners(sched, t); - clock_gettime(CLOCK_REALTIME, &t1); - packing_time_pair += - (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / - 1000000000.0; - if (launch_leftovers) { - pack_vars_pair_dens->launch_leftovers = 1; - runner_dopair1_launch_f4_one_memcpy(r, sched, pack_vars_pair_dens, t, parts_aos_pair_f4_send, parts_aos_pair_f4_recv, - d_parts_aos_pair_f4_send, d_parts_aos_pair_f4_recv, stream_pairs, d_a, d_H, e, &packing_time_pair, &time_for_density_gpu_pair, - &unpacking_time_pair, fparti_fpartj_lparti_lpartj_dens, pair_end); - } - } - else{ -#endif //DO_CORNERS - packing_time_pair += runner_dopair1_pack_f4(r, sched, pack_vars_pair_dens, ci, - cj, t, parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens); - /* Packed enough tasks or no pack tasks left in queue, flag that we want to run */ - int launch = pack_vars_pair_dens->launch; - int launch_leftovers = pack_vars_pair_dens->launch_leftovers; - /* Do we have enough stuff to run the GPU ? */ - if(launch)n_full_p_d_bundles++; - if(launch_leftovers)n_partial_p_d_bundles++; - if(launch || launch_leftovers) { - /*Launch GPU tasks*/ -// runner_dopair1_launch(r, sched, pack_vars_pair_dens, ci, t, parts_aos_pair_dens, -// d_parts_aos_pair_dens, stream, d_a, d_H, e, &packing_time_pair, &time_for_density_gpu_pair); - runner_dopair1_launch_f4_one_memcpy(r, sched, pack_vars_pair_dens, t, parts_aos_pair_f4_send, parts_aos_pair_f4_recv, - d_parts_aos_pair_f4_send, d_parts_aos_pair_f4_recv, stream_pairs, d_a, d_H, e, &packing_time_pair, &time_for_density_gpu_pair, - &unpacking_time_pair, fparti_fpartj_lparti_lpartj_dens, pair_end); - } + struct timespec t0, t1, dt; + clock_gettime(CLOCK_REALTIME, &t0); + double shift[3] = {0.0}; + t->corner_pair = 0; + int sid = space_getsid_filter(e->s, &ci, &cj, shift); + clock_gettime(CLOCK_REALTIME, &t1); + packing_time_pair += (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + if ((sid == 0 || sid == 2 || sid == 6 || sid == 8) && step > 1) { + // if((sid != 4 && sid != 10 && sid == 12) && + //step > 1){ + clock_gettime(CLOCK_REALTIME, &t0); + runner_dopair1_branch_density(r, ci, cj); + t->corner_pair = 1; + int qid = r->qid; + atomic_dec(&(sched->queues[qid].n_packs_pair_left)); + /* Tell the cells they have been packed */ + ci->pack_done++; + cj->pack_done++; + t->done = 1; + int launch = 0, launch_leftovers = 0; + if ((sched->queues[qid].n_packs_pair_left == 0)) + launch_leftovers = 1; + /* Tasks done. Release the lock ! 
*/ + task_unlock(t); + /*schedule my dependencies (Only unpacks really)*/ + enqueue_dependencies(sched, t); + /*Signal sleeping runners*/ + signal_sleeping_runners(sched, t); + clock_gettime(CLOCK_REALTIME, &t1); + packing_time_pair += (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + if (launch_leftovers) { + pack_vars_pair_dens->launch_leftovers = 1; + runner_dopair1_launch_f4_one_memcpy( + r, sched, pack_vars_pair_dens, t, parts_aos_pair_f4_send, + parts_aos_pair_f4_recv, d_parts_aos_pair_f4_send, + d_parts_aos_pair_f4_recv, stream_pairs, d_a, d_H, e, + &packing_time_pair, &time_for_density_gpu_pair, + &unpacking_time_pair, fparti_fpartj_lparti_lpartj_dens, + pair_end); + } + } else { +#endif // DO_CORNERS + packing_time_pair += runner_dopair1_pack_f4( + r, sched, pack_vars_pair_dens, ci, cj, t, + parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens); + /* Packed enough tasks or no pack tasks left in queue, flag that + * we want to run */ + int launch = pack_vars_pair_dens->launch; + int launch_leftovers = pack_vars_pair_dens->launch_leftovers; + /* Do we have enough stuff to run the GPU ? */ + if (launch) n_full_p_d_bundles++; + if (launch_leftovers) n_partial_p_d_bundles++; + if (launch || launch_leftovers) { + /*Launch GPU tasks*/ + // runner_dopair1_launch(r, sched, + //pack_vars_pair_dens, ci, t, parts_aos_pair_dens, + // d_parts_aos_pair_dens, + //stream, d_a, d_H, e, &packing_time_pair, + //&time_for_density_gpu_pair); + runner_dopair1_launch_f4_one_memcpy( + r, sched, pack_vars_pair_dens, t, parts_aos_pair_f4_send, + parts_aos_pair_f4_recv, d_parts_aos_pair_f4_send, + d_parts_aos_pair_f4_recv, stream_pairs, d_a, d_H, e, + &packing_time_pair, &time_for_density_gpu_pair, + &unpacking_time_pair, fparti_fpartj_lparti_lpartj_dens, + pair_end); + } #ifdef DO_CORNERS - } /* End of GPU work Pairs */ -#endif //DO_CORNERS -#endif //GPUDENS - } /* pair / pack */ - else if (t->subtype == task_subtype_gpu_pack_g){ - packed_pair_g++; + } /* End of GPU work Pairs */ +#endif // DO_CORNERS +#endif // GPUDENS + } /* pair / pack */ + else if (t->subtype == task_subtype_gpu_pack_g) { + packed_pair_g++; #ifdef GPUOFFLOAD #ifdef DO_CORNERS - struct timespec t0, t1, dt; - clock_gettime(CLOCK_REALTIME, &t0); - double shift[3] = {0.0}; - t->corner_pair = 0; - int sid = space_getsid_filter(e->s, &ci, &cj, shift); - clock_gettime(CLOCK_REALTIME, &t1); - packing_time_pair += - (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / - 1000000000.0; - if((sid == 0 || sid == 2 || sid == 6 || sid == 8) && step > 1){ -// if((sid != 4 && sid != 10 && sid == 12) && step > 1){ - clock_gettime(CLOCK_REALTIME, &t0); - runner_dopair1_branch_gradient(r, ci, cj); - t->corner_pair = 1; - int qid = r->qid; - atomic_dec(&(sched->queues[qid].n_packs_pair_left_g)); - /* Tell the cells they have been packed */ - ci->pack_done++; - cj->pack_done++; - t->done = 1; - int launch = 0, launch_leftovers = 0; - if ((sched->queues[qid].n_packs_pair_left_g == 0)) - launch_leftovers = 1; - /* Tasks done. Release the lock ! 
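// An illustrative helper (hypothetical, not a name used in the patch) for the
// DO_CORNERS filter applied in these pair pack branches: pairs whose sort
// index sid marks a corner interaction are run immediately on the CPU and
// retired by hand (queue counter decremented, cells flagged as packed,
// dependencies enqueued, sleeping runners signalled), while all other pairs
// are packed for the GPU bundle.
static int sid_is_corner(const int sid) {
  return (sid == 0 || sid == 2 || sid == 6 || sid == 8);
}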
*/ - task_unlock(t); - /*schedule my dependencies (Only unpacks really)*/ - enqueue_dependencies(sched, t); - /*Signal sleeping runners*/ - signal_sleeping_runners(sched, t); - clock_gettime(CLOCK_REALTIME, &t1); - packing_time_pair_g += - (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / - 1000000000.0; - if (launch_leftovers) { - pack_vars_pair_grad->launch_leftovers = 1; - runner_dopair1_launch_f4_g_one_memcpy(r, sched, pack_vars_pair_grad, t, parts_aos_pair_f4_g_send, parts_aos_pair_f4_g_recv, - d_parts_aos_pair_f4_g_send, d_parts_aos_pair_f4_g_recv, stream_pairs, d_a, d_H, e, &packing_time_pair_g, &time_for_gpu_pair_g, - &unpacking_time_pair_g, fparti_fpartj_lparti_lpartj_grad, pair_end_g); - } - } - else{ -#endif //DO_CORNERS -// runner_dopair1_pack_g(r, sched, pack_vars_pair_grad, ci, -// cj, t, parts_aos_pair_grad, e, &packing_time_g); - packing_time_pair_g += runner_dopair1_pack_f4_g(r, sched, pack_vars_pair_grad, ci, - cj, t, parts_aos_pair_f4_g_send, e, fparti_fpartj_lparti_lpartj_grad); - /* No pack tasks left in queue, flag that we want to run */ - int launch_leftovers = pack_vars_pair_grad->launch_leftovers; - /*Packed enough tasks let's go*/ - int launch = pack_vars_pair_grad->launch; - /* Do we have enough stuff to run the GPU ? */ - if (launch || launch_leftovers) { - /*Launch GPU tasks*/ -// runner_dopair1_launch_g(r, sched, pack_vars_pair_grad, ci, t, parts_aos_pair_grad, -// d_parts_aos_pair_grad, stream, d_a, d_H, e, &packing_time_pair_g, &time_for_gpu_pair_g); - runner_dopair1_launch_f4_g_one_memcpy(r, sched, pack_vars_pair_grad, t, parts_aos_pair_f4_g_send, parts_aos_pair_f4_g_recv, - d_parts_aos_pair_f4_g_send, d_parts_aos_pair_f4_g_recv, stream_pairs, d_a, d_H, e, &packing_time_pair_g, &time_for_gpu_pair_g, - &unpacking_time_pair_g, fparti_fpartj_lparti_lpartj_grad, pair_end_g); - } + struct timespec t0, t1, dt; + clock_gettime(CLOCK_REALTIME, &t0); + double shift[3] = {0.0}; + t->corner_pair = 0; + int sid = space_getsid_filter(e->s, &ci, &cj, shift); + clock_gettime(CLOCK_REALTIME, &t1); + packing_time_pair += (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + if ((sid == 0 || sid == 2 || sid == 6 || sid == 8) && step > 1) { + // if((sid != 4 && sid != 10 && sid == 12) && step > 1){ + clock_gettime(CLOCK_REALTIME, &t0); + runner_dopair1_branch_gradient(r, ci, cj); + t->corner_pair = 1; + int qid = r->qid; + atomic_dec(&(sched->queues[qid].n_packs_pair_left_g)); + /* Tell the cells they have been packed */ + ci->pack_done++; + cj->pack_done++; + t->done = 1; + int launch = 0, launch_leftovers = 0; + if ((sched->queues[qid].n_packs_pair_left_g == 0)) + launch_leftovers = 1; + /* Tasks done. Release the lock ! 
*/ + task_unlock(t); + /*schedule my dependencies (Only unpacks really)*/ + enqueue_dependencies(sched, t); + /*Signal sleeping runners*/ + signal_sleeping_runners(sched, t); + clock_gettime(CLOCK_REALTIME, &t1); + packing_time_pair_g += (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + if (launch_leftovers) { + pack_vars_pair_grad->launch_leftovers = 1; + runner_dopair1_launch_f4_g_one_memcpy( + r, sched, pack_vars_pair_grad, t, parts_aos_pair_f4_g_send, + parts_aos_pair_f4_g_recv, d_parts_aos_pair_f4_g_send, + d_parts_aos_pair_f4_g_recv, stream_pairs, d_a, d_H, e, + &packing_time_pair_g, &time_for_gpu_pair_g, + &unpacking_time_pair_g, fparti_fpartj_lparti_lpartj_grad, + pair_end_g); + } + } else { +#endif // DO_CORNERS + // runner_dopair1_pack_g(r, sched, pack_vars_pair_grad, + // ci, + // cj, t, parts_aos_pair_grad, e, + // &packing_time_g); + packing_time_pair_g += + runner_dopair1_pack_f4_g(r, sched, pack_vars_pair_grad, ci, + cj, t, parts_aos_pair_f4_g_send, e, + fparti_fpartj_lparti_lpartj_grad); + /* No pack tasks left in queue, flag that we want to run */ + int launch_leftovers = pack_vars_pair_grad->launch_leftovers; + /*Packed enough tasks let's go*/ + int launch = pack_vars_pair_grad->launch; + /* Do we have enough stuff to run the GPU ? */ + if (launch || launch_leftovers) { + /*Launch GPU tasks*/ + // runner_dopair1_launch_g(r, sched, + //pack_vars_pair_grad, ci, t, parts_aos_pair_grad, + // d_parts_aos_pair_grad, + //stream, d_a, d_H, e, &packing_time_pair_g, + //&time_for_gpu_pair_g); + runner_dopair1_launch_f4_g_one_memcpy( + r, sched, pack_vars_pair_grad, t, parts_aos_pair_f4_g_send, + parts_aos_pair_f4_g_recv, d_parts_aos_pair_f4_g_send, + d_parts_aos_pair_f4_g_recv, stream_pairs, d_a, d_H, e, + &packing_time_pair_g, &time_for_gpu_pair_g, + &unpacking_time_pair_g, fparti_fpartj_lparti_lpartj_grad, + pair_end_g); + } #ifdef DO_CORNERS - }/* End of GPU work Pairs */ -#endif //DO_CORNERS -#endif //GPUGRADPAIR - } - else if (t->subtype == task_subtype_gpu_pack_f){ - packed_pair_f++; + } /* End of GPU work Pairs */ +#endif // DO_CORNERS +#endif // GPUGRADPAIR + } else if (t->subtype == task_subtype_gpu_pack_f) { + packed_pair_f++; #ifdef GPUOFFLOAD #ifdef DO_CORNERS - struct timespec t0, t1, dt; - clock_gettime(CLOCK_REALTIME, &t0); - double shift[3] = {0.0}; - t->corner_pair = 0; - int sid = space_getsid_filter(e->s, &ci, &cj, shift); - clock_gettime(CLOCK_REALTIME, &t1); - packing_time_pair += - (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / - 1000000000.0; - if((sid == 0 || sid == 2 || sid == 6 || sid == 8) && step > 1){ -// if((sid != 4 && sid != 10 && sid == 12) && step > 1){ - runner_dopair1_branch_force(r, ci, cj); - t->corner_pair = 1; - int qid = r->qid; - atomic_dec(&(sched->queues[qid].n_packs_pair_left_f)); - /* Tell the cells they have been packed */ - ci->pack_done++; - cj->pack_done++; - t->done = 1; - int launch = 0, launch_leftovers = 0; - if ((sched->queues[qid].n_packs_pair_left_f == 0)) - launch_leftovers = 1; - /* Tasks done. Release the lock ! 
*/ - task_unlock(t); - /*schedule my dependencies (Only unpacks really)*/ - enqueue_dependencies(sched, t); - /*Signal sleeping runners*/ - signal_sleeping_runners(sched, t); - clock_gettime(CLOCK_REALTIME, &t1); - packing_time_pair_f += - (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / - 1000000000.0; - if (launch_leftovers) { - pack_vars_pair_forc->launch_leftovers = 1; - runner_dopair1_launch_f4_f_one_memcpy(r, sched, pack_vars_pair_forc, t, parts_aos_pair_f4_f_send, parts_aos_pair_f4_f_recv, - d_parts_aos_pair_f4_f_send, d_parts_aos_pair_f4_f_recv, stream_pairs, d_a, d_H, e, &packing_time_pair_f, &time_for_gpu_pair_f, - &unpacking_time_pair_f, fparti_fpartj_lparti_lpartj_forc, pair_end_f); - } - } - else{ -#endif //DO_CORNERS -// runner_dopair1_pack_f(r, sched, pack_vars_pair_forc, ci, -// cj, t, parts_aos_pair_forc, e, &packing_time_f); - packing_time_pair_f += runner_dopair1_pack_f4_f(r, sched, pack_vars_pair_forc, ci, - cj, t, parts_aos_pair_f4_f_send, e, fparti_fpartj_lparti_lpartj_forc); - /* No pack tasks left in queue, flag that we want to run */ - int launch_leftovers = pack_vars_pair_forc->launch_leftovers; - /*Packed enough tasks let's go*/ - int launch = pack_vars_pair_forc->launch; - /* Do we have enough stuff to run the GPU ? */ - if (launch || launch_leftovers) { - /*Launch GPU tasks*/ -// runner_dopair1_launch_f(r, sched, pack_vars_pair_forc, ci, t, parts_aos_pair_forc, -// d_parts_aos_pair_forc, stream, d_a, d_H, e, &packing_time_pair_f, &time_for_gpu_pair_f); - runner_dopair1_launch_f4_f_one_memcpy(r, sched, pack_vars_pair_forc, t, parts_aos_pair_f4_f_send, parts_aos_pair_f4_f_recv, - d_parts_aos_pair_f4_f_send, d_parts_aos_pair_f4_f_recv, stream_pairs, d_a, d_H, e, &packing_time_pair_f, &time_for_gpu_pair_f, - &unpacking_time_pair_f, fparti_fpartj_lparti_lpartj_forc, pair_end_f); - } /* End of GPU work Pairs */ + struct timespec t0, t1, dt; + clock_gettime(CLOCK_REALTIME, &t0); + double shift[3] = {0.0}; + t->corner_pair = 0; + int sid = space_getsid_filter(e->s, &ci, &cj, shift); + clock_gettime(CLOCK_REALTIME, &t1); + packing_time_pair += (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + if ((sid == 0 || sid == 2 || sid == 6 || sid == 8) && step > 1) { + // if((sid != 4 && sid != 10 && sid == 12) && step > 1){ + runner_dopair1_branch_force(r, ci, cj); + t->corner_pair = 1; + int qid = r->qid; + atomic_dec(&(sched->queues[qid].n_packs_pair_left_f)); + /* Tell the cells they have been packed */ + ci->pack_done++; + cj->pack_done++; + t->done = 1; + int launch = 0, launch_leftovers = 0; + if ((sched->queues[qid].n_packs_pair_left_f == 0)) + launch_leftovers = 1; + /* Tasks done. Release the lock ! 
*/ + task_unlock(t); + /*schedule my dependencies (Only unpacks really)*/ + enqueue_dependencies(sched, t); + /*Signal sleeping runners*/ + signal_sleeping_runners(sched, t); + clock_gettime(CLOCK_REALTIME, &t1); + packing_time_pair_f += (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + if (launch_leftovers) { + pack_vars_pair_forc->launch_leftovers = 1; + runner_dopair1_launch_f4_f_one_memcpy( + r, sched, pack_vars_pair_forc, t, parts_aos_pair_f4_f_send, + parts_aos_pair_f4_f_recv, d_parts_aos_pair_f4_f_send, + d_parts_aos_pair_f4_f_recv, stream_pairs, d_a, d_H, e, + &packing_time_pair_f, &time_for_gpu_pair_f, + &unpacking_time_pair_f, fparti_fpartj_lparti_lpartj_forc, + pair_end_f); + } + } else { +#endif // DO_CORNERS + // runner_dopair1_pack_f(r, sched, pack_vars_pair_forc, + // ci, cj, t, parts_aos_pair_forc, e, &packing_time_f); + packing_time_pair_f += + runner_dopair1_pack_f4_f(r, sched, pack_vars_pair_forc, ci, + cj, t, parts_aos_pair_f4_f_send, e, + fparti_fpartj_lparti_lpartj_forc); + /* No pack tasks left in queue, flag that we want to run */ + int launch_leftovers = pack_vars_pair_forc->launch_leftovers; + /*Packed enough tasks let's go*/ + int launch = pack_vars_pair_forc->launch; + /* Do we have enough stuff to run the GPU ? */ + if (launch || launch_leftovers) { + /*Launch GPU tasks*/ + // runner_dopair1_launch_f(r, sched, + // pack_vars_pair_forc, ci, t, parts_aos_pair_forc, + // d_parts_aos_pair_forc, + // stream, d_a, d_H, e, &packing_time_pair_f, + // &time_for_gpu_pair_f); + runner_dopair1_launch_f4_f_one_memcpy( + r, sched, pack_vars_pair_forc, t, parts_aos_pair_f4_f_send, + parts_aos_pair_f4_f_recv, d_parts_aos_pair_f4_f_send, + d_parts_aos_pair_f4_f_recv, stream_pairs, d_a, d_H, e, + &packing_time_pair_f, &time_for_gpu_pair_f, + &unpacking_time_pair_f, fparti_fpartj_lparti_lpartj_forc, + pair_end_f); + } /* End of GPU work Pairs */ #ifdef DO_CORNERS - } -#endif //DO_CORNERS -#endif //GPUFORCPAIR - } - else if (t->subtype == task_subtype_gpu_unpack) { - unpacked_pair++; - } - else if (t->subtype == task_subtype_gpu_unpack_g) { - unpacked_pair_g++; - } - else if (t->subtype == task_subtype_gpu_unpack_f) { - unpacked_pair_f++; - } + } +#endif // DO_CORNERS +#endif // GPUFORCPAIR + } else if (t->subtype == task_subtype_gpu_unpack) { + unpacked_pair++; + } else if (t->subtype == task_subtype_gpu_unpack_g) { + unpacked_pair_g++; + } else if (t->subtype == task_subtype_gpu_unpack_f) { + unpacked_pair_f++; + } #ifdef EXTRA_HYDRO_LOOP - else if (t->subtype == task_subtype_gradient){ - int Do_nothing = 0; + else if (t->subtype == task_subtype_gradient) { + int Do_nothing = 0; #ifndef GPUOFFLOAD - struct timespec t0, t1, dt; - clock_gettime(CLOCK_REALTIME, &t0); - runner_dopair1_branch_gradient(r, ci, cj); - clock_gettime(CLOCK_REALTIME, &t1); - tasks_done_cpu++; - time_for_cpu_pair_g += - (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / - 1000000000.0; -#endif //GPUGRADPAIR - } -#endif //EXTRA_HYDRO_LOOP - else if (t->subtype == task_subtype_force){ - int Do_nothing = 0; + struct timespec t0, t1, dt; + clock_gettime(CLOCK_REALTIME, &t0); + runner_dopair1_branch_gradient(r, ci, cj); + clock_gettime(CLOCK_REALTIME, &t1); + tasks_done_cpu++; + time_for_cpu_pair_g += (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +#endif // GPUGRADPAIR + } +#endif // EXTRA_HYDRO_LOOP + else if (t->subtype == task_subtype_force) { + int Do_nothing = 0; #ifndef GPUOFFLOAD - struct timespec t0, t1, dt; - clock_gettime(CLOCK_REALTIME, &t0); - 
runner_dopair2_branch_force(r, ci, cj); - clock_gettime(CLOCK_REALTIME, &t1); - tasks_done_cpu++; - time_for_cpu_pair_f += - (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / - 1000000000.0; -#endif //GPUFORCPAIR - } - else if (t->subtype == task_subtype_limiter) - runner_dopair1_branch_limiter(r, ci, cj); - else if (t->subtype == task_subtype_grav) - runner_dopair_recursive_grav(r, ci, cj, 1); - else if (t->subtype == task_subtype_stars_density) - runner_dopair_branch_stars_density(r, ci, cj); + struct timespec t0, t1, dt; + clock_gettime(CLOCK_REALTIME, &t0); + runner_dopair2_branch_force(r, ci, cj); + clock_gettime(CLOCK_REALTIME, &t1); + tasks_done_cpu++; + time_for_cpu_pair_f += (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +#endif // GPUFORCPAIR + } else if (t->subtype == task_subtype_limiter) + runner_dopair1_branch_limiter(r, ci, cj); + else if (t->subtype == task_subtype_grav) + runner_dopair_recursive_grav(r, ci, cj, 1); + else if (t->subtype == task_subtype_stars_density) + runner_dopair_branch_stars_density(r, ci, cj); #ifdef EXTRA_STAR_LOOPS - else if (t->subtype == task_subtype_stars_prep1) - runner_dopair_branch_stars_prep1(r, ci, cj); - else if (t->subtype == task_subtype_stars_prep2) - runner_dopair_branch_stars_prep2(r, ci, cj); + else if (t->subtype == task_subtype_stars_prep1) + runner_dopair_branch_stars_prep1(r, ci, cj); + else if (t->subtype == task_subtype_stars_prep2) + runner_dopair_branch_stars_prep2(r, ci, cj); #endif - else if (t->subtype == task_subtype_stars_feedback) - runner_dopair_branch_stars_feedback(r, ci, cj); - else if (t->subtype == task_subtype_bh_density) - runner_dopair_branch_bh_density(r, ci, cj); - else if (t->subtype == task_subtype_bh_swallow) - runner_dopair_branch_bh_swallow(r, ci, cj); - else if (t->subtype == task_subtype_do_gas_swallow) - runner_do_gas_swallow_pair(r, ci, cj, 1); - else if (t->subtype == task_subtype_do_bh_swallow) - runner_do_bh_swallow_pair(r, ci, cj, 1); - else if (t->subtype == task_subtype_bh_feedback) - runner_dopair_branch_bh_feedback(r, ci, cj); - else if (t->subtype == task_subtype_rt_gradient) - runner_dopair1_branch_rt_gradient(r, ci, cj); - else if (t->subtype == task_subtype_rt_transport) - runner_dopair2_branch_rt_transport(r, ci, cj); - else if (t->subtype == task_subtype_sink_swallow) - runner_dopair_branch_sinks_swallow(r, ci, cj); - else if (t->subtype == task_subtype_sink_do_gas_swallow) - runner_do_sinks_gas_swallow_pair(r, ci, cj, 1); - else if (t->subtype == task_subtype_sink_do_sink_swallow) - runner_do_sinks_sink_swallow_pair(r, ci, cj, 1); - else - error("Unknown/invalid task subtype (%s/%s).", - taskID_names[t->type], subtaskID_names[t->subtype]); - break; - - case task_type_sub_self: - if (t->subtype == task_subtype_density) { - struct timespec t0, t1, dt; - const int count = ci->hydro.count; - density_sub++; - clock_gettime(CLOCK_REALTIME, &t0); - runner_dosub_self1_density(r, ci, 1); - clock_gettime(CLOCK_REALTIME, &t1); - tasks_done_cpu++; - time_for_density_cpu_sub += - (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / - 1000000000.0; - } + else if (t->subtype == task_subtype_stars_feedback) + runner_dopair_branch_stars_feedback(r, ci, cj); + else if (t->subtype == task_subtype_bh_density) + runner_dopair_branch_bh_density(r, ci, cj); + else if (t->subtype == task_subtype_bh_swallow) + runner_dopair_branch_bh_swallow(r, ci, cj); + else if (t->subtype == task_subtype_do_gas_swallow) + runner_do_gas_swallow_pair(r, ci, cj, 1); + else if (t->subtype == 
task_subtype_do_bh_swallow) + runner_do_bh_swallow_pair(r, ci, cj, 1); + else if (t->subtype == task_subtype_bh_feedback) + runner_dopair_branch_bh_feedback(r, ci, cj); + else if (t->subtype == task_subtype_rt_gradient) + runner_dopair1_branch_rt_gradient(r, ci, cj); + else if (t->subtype == task_subtype_rt_transport) + runner_dopair2_branch_rt_transport(r, ci, cj); + else if (t->subtype == task_subtype_sink_swallow) + runner_dopair_branch_sinks_swallow(r, ci, cj); + else if (t->subtype == task_subtype_sink_do_gas_swallow) + runner_do_sinks_gas_swallow_pair(r, ci, cj, 1); + else if (t->subtype == task_subtype_sink_do_sink_swallow) + runner_do_sinks_sink_swallow_pair(r, ci, cj, 1); + else + error("Unknown/invalid task subtype (%s/%s).", + taskID_names[t->type], subtaskID_names[t->subtype]); + break; + + case task_type_sub_self: + if (t->subtype == task_subtype_density) { + struct timespec t0, t1, dt; + const int count = ci->hydro.count; + density_sub++; + clock_gettime(CLOCK_REALTIME, &t0); + runner_dosub_self1_density(r, ci, 1); + clock_gettime(CLOCK_REALTIME, &t1); + tasks_done_cpu++; + time_for_density_cpu_sub += + (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + } #ifdef EXTRA_HYDRO_LOOP - else if (t->subtype == task_subtype_gradient){ - runner_dosub_self1_gradient(r, ci, 1); - fprintf(stderr, "split a g task\n"); - } + else if (t->subtype == task_subtype_gradient) { + runner_dosub_self1_gradient(r, ci, 1); + fprintf(stderr, "split a g task\n"); + } #endif - else if (t->subtype == task_subtype_force){ - runner_dosub_self2_force(r, ci, 1); - fprintf(stderr, "split a f task\n"); - } - else if (t->subtype == task_subtype_limiter) - runner_dosub_self1_limiter(r, ci, 1); - else if (t->subtype == task_subtype_stars_density) - runner_dosub_self_stars_density(r, ci, 1); + else if (t->subtype == task_subtype_force) { + runner_dosub_self2_force(r, ci, 1); + fprintf(stderr, "split a f task\n"); + } else if (t->subtype == task_subtype_limiter) + runner_dosub_self1_limiter(r, ci, 1); + else if (t->subtype == task_subtype_stars_density) + runner_dosub_self_stars_density(r, ci, 1); #ifdef EXTRA_STAR_LOOPS - else if (t->subtype == task_subtype_stars_prep1) - runner_dosub_self_stars_prep1(r, ci, 1); - else if (t->subtype == task_subtype_stars_prep2) - runner_dosub_self_stars_prep2(r, ci, 1); + else if (t->subtype == task_subtype_stars_prep1) + runner_dosub_self_stars_prep1(r, ci, 1); + else if (t->subtype == task_subtype_stars_prep2) + runner_dosub_self_stars_prep2(r, ci, 1); #endif - else if (t->subtype == task_subtype_stars_feedback) - runner_dosub_self_stars_feedback(r, ci, 1); - else if (t->subtype == task_subtype_bh_density) - runner_dosub_self_bh_density(r, ci, 1); - else if (t->subtype == task_subtype_bh_swallow) - runner_dosub_self_bh_swallow(r, ci, 1); - else if (t->subtype == task_subtype_do_gas_swallow) - runner_do_gas_swallow_self(r, ci, 1); - else if (t->subtype == task_subtype_do_bh_swallow) - runner_do_bh_swallow_self(r, ci, 1); - else if (t->subtype == task_subtype_bh_feedback) - runner_dosub_self_bh_feedback(r, ci, 1); - else if (t->subtype == task_subtype_rt_gradient) - runner_dosub_self1_rt_gradient(r, ci, 1); - else if (t->subtype == task_subtype_rt_transport) - runner_dosub_self2_rt_transport(r, ci, 1); - else if (t->subtype == task_subtype_sink_swallow) - runner_dosub_self_sinks_swallow(r, ci, 1); - else if (t->subtype == task_subtype_sink_do_gas_swallow) - runner_do_sinks_gas_swallow_self(r, ci, 1); - else if (t->subtype == task_subtype_sink_do_sink_swallow) 
- runner_do_sinks_sink_swallow_self(r, ci, 1); - else - error("Unknown/invalid task subtype (%s/%s).", - taskID_names[t->type], subtaskID_names[t->subtype]); - break; - - case task_type_sub_pair: - if (t->subtype == task_subtype_density){ - int nothing = 0; - fprintf(stderr,"Doing a pair sub task"); - runner_dosub_pair1_density(r, ci, cj, 1); - } + else if (t->subtype == task_subtype_stars_feedback) + runner_dosub_self_stars_feedback(r, ci, 1); + else if (t->subtype == task_subtype_bh_density) + runner_dosub_self_bh_density(r, ci, 1); + else if (t->subtype == task_subtype_bh_swallow) + runner_dosub_self_bh_swallow(r, ci, 1); + else if (t->subtype == task_subtype_do_gas_swallow) + runner_do_gas_swallow_self(r, ci, 1); + else if (t->subtype == task_subtype_do_bh_swallow) + runner_do_bh_swallow_self(r, ci, 1); + else if (t->subtype == task_subtype_bh_feedback) + runner_dosub_self_bh_feedback(r, ci, 1); + else if (t->subtype == task_subtype_rt_gradient) + runner_dosub_self1_rt_gradient(r, ci, 1); + else if (t->subtype == task_subtype_rt_transport) + runner_dosub_self2_rt_transport(r, ci, 1); + else if (t->subtype == task_subtype_sink_swallow) + runner_dosub_self_sinks_swallow(r, ci, 1); + else if (t->subtype == task_subtype_sink_do_gas_swallow) + runner_do_sinks_gas_swallow_self(r, ci, 1); + else if (t->subtype == task_subtype_sink_do_sink_swallow) + runner_do_sinks_sink_swallow_self(r, ci, 1); + else + error("Unknown/invalid task subtype (%s/%s).", + taskID_names[t->type], subtaskID_names[t->subtype]); + break; + + case task_type_sub_pair: + if (t->subtype == task_subtype_density) { + int nothing = 0; + fprintf(stderr, "Doing a pair sub task"); + runner_dosub_pair1_density(r, ci, cj, 1); + } #ifdef EXTRA_HYDRO_LOOP - else if (t->subtype == task_subtype_gradient){ - runner_dosub_pair1_gradient(r, ci, cj, 1); - fprintf(stderr, "split a g task\n"); - } + else if (t->subtype == task_subtype_gradient) { + runner_dosub_pair1_gradient(r, ci, cj, 1); + fprintf(stderr, "split a g task\n"); + } #endif - else if (t->subtype == task_subtype_force){ - runner_dosub_pair2_force(r, ci, cj, 1); - fprintf(stderr, "split a f task\n"); - } - else if (t->subtype == task_subtype_limiter) - runner_dosub_pair1_limiter(r, ci, cj, 1); - else if (t->subtype == task_subtype_stars_density) - runner_dosub_pair_stars_density(r, ci, cj, 1); + else if (t->subtype == task_subtype_force) { + runner_dosub_pair2_force(r, ci, cj, 1); + fprintf(stderr, "split a f task\n"); + } else if (t->subtype == task_subtype_limiter) + runner_dosub_pair1_limiter(r, ci, cj, 1); + else if (t->subtype == task_subtype_stars_density) + runner_dosub_pair_stars_density(r, ci, cj, 1); #ifdef EXTRA_STAR_LOOPS - else if (t->subtype == task_subtype_stars_prep1) - runner_dosub_pair_stars_prep1(r, ci, cj, 1); - else if (t->subtype == task_subtype_stars_prep2) - runner_dosub_pair_stars_prep2(r, ci, cj, 1); + else if (t->subtype == task_subtype_stars_prep1) + runner_dosub_pair_stars_prep1(r, ci, cj, 1); + else if (t->subtype == task_subtype_stars_prep2) + runner_dosub_pair_stars_prep2(r, ci, cj, 1); #endif - else if (t->subtype == task_subtype_stars_feedback) - runner_dosub_pair_stars_feedback(r, ci, cj, 1); - else if (t->subtype == task_subtype_bh_density) - runner_dosub_pair_bh_density(r, ci, cj, 1); - else if (t->subtype == task_subtype_bh_swallow) - runner_dosub_pair_bh_swallow(r, ci, cj, 1); - else if (t->subtype == task_subtype_do_gas_swallow) - runner_do_gas_swallow_pair(r, ci, cj, 1); - else if (t->subtype == task_subtype_do_bh_swallow) - 
runner_do_bh_swallow_pair(r, ci, cj, 1); - else if (t->subtype == task_subtype_bh_feedback) - runner_dosub_pair_bh_feedback(r, ci, cj, 1); - else if (t->subtype == task_subtype_rt_gradient) - runner_dosub_pair1_rt_gradient(r, ci, cj, 1); - else if (t->subtype == task_subtype_rt_transport) - runner_dosub_pair2_rt_transport(r, ci, cj, 1); - else if (t->subtype == task_subtype_sink_swallow) - runner_dosub_pair_sinks_swallow(r, ci, cj, 1); - else if (t->subtype == task_subtype_sink_do_gas_swallow) - runner_do_sinks_gas_swallow_pair(r, ci, cj, 1); - else if (t->subtype == task_subtype_sink_do_sink_swallow) - runner_do_sinks_sink_swallow_pair(r, ci, cj, 1); - else - error("Unknown/invalid task subtype (%s/%s).", - taskID_names[t->type], subtaskID_names[t->subtype]); - break; - - case task_type_sort: - /* Cleanup only if any of the indices went stale. */ - runner_do_hydro_sort( - r, ci, t->flags, - ci->hydro.dx_max_sort_old > space_maxreldx * ci->dmin, - cell_get_flag(ci, cell_flag_rt_requests_sort), 1); - /* Reset the sort flags as our work here is done. */ - t->flags = 0; - break; - case task_type_rt_sort: - /* Cleanup only if any of the indices went stale. - * NOTE: we check whether we reset the sort flags when the - * recv tasks are running. Cells without an RT recv task - * don't have rt_sort tasks. */ - runner_do_hydro_sort( - r, ci, t->flags, - ci->hydro.dx_max_sort_old > space_maxreldx * ci->dmin, 1, 1); - /* Reset the sort flags as our work here is done. */ - t->flags = 0; - break; - case task_type_stars_sort: - /* Cleanup only if any of the indices went stale. */ - runner_do_stars_sort( - r, ci, t->flags, - ci->stars.dx_max_sort_old > space_maxreldx * ci->dmin, 1); - /* Reset the sort flags as our work here is done. */ - t->flags = 0; - break; - case task_type_init_grav: - runner_do_init_grav(r, ci, 1); - break; - case task_type_ghost: - runner_do_ghost(r, ci, 1); - break; + else if (t->subtype == task_subtype_stars_feedback) + runner_dosub_pair_stars_feedback(r, ci, cj, 1); + else if (t->subtype == task_subtype_bh_density) + runner_dosub_pair_bh_density(r, ci, cj, 1); + else if (t->subtype == task_subtype_bh_swallow) + runner_dosub_pair_bh_swallow(r, ci, cj, 1); + else if (t->subtype == task_subtype_do_gas_swallow) + runner_do_gas_swallow_pair(r, ci, cj, 1); + else if (t->subtype == task_subtype_do_bh_swallow) + runner_do_bh_swallow_pair(r, ci, cj, 1); + else if (t->subtype == task_subtype_bh_feedback) + runner_dosub_pair_bh_feedback(r, ci, cj, 1); + else if (t->subtype == task_subtype_rt_gradient) + runner_dosub_pair1_rt_gradient(r, ci, cj, 1); + else if (t->subtype == task_subtype_rt_transport) + runner_dosub_pair2_rt_transport(r, ci, cj, 1); + else if (t->subtype == task_subtype_sink_swallow) + runner_dosub_pair_sinks_swallow(r, ci, cj, 1); + else if (t->subtype == task_subtype_sink_do_gas_swallow) + runner_do_sinks_gas_swallow_pair(r, ci, cj, 1); + else if (t->subtype == task_subtype_sink_do_sink_swallow) + runner_do_sinks_sink_swallow_pair(r, ci, cj, 1); + else + error("Unknown/invalid task subtype (%s/%s).", + taskID_names[t->type], subtaskID_names[t->subtype]); + break; + + case task_type_sort: + /* Cleanup only if any of the indices went stale. */ + runner_do_hydro_sort( + r, ci, t->flags, + ci->hydro.dx_max_sort_old > space_maxreldx * ci->dmin, + cell_get_flag(ci, cell_flag_rt_requests_sort), 1); + /* Reset the sort flags as our work here is done. */ + t->flags = 0; + break; + case task_type_rt_sort: + /* Cleanup only if any of the indices went stale. 
+ * NOTE: we check whether we reset the sort flags when the + * recv tasks are running. Cells without an RT recv task + * don't have rt_sort tasks. */ + runner_do_hydro_sort( + r, ci, t->flags, + ci->hydro.dx_max_sort_old > space_maxreldx * ci->dmin, 1, 1); + /* Reset the sort flags as our work here is done. */ + t->flags = 0; + break; + case task_type_stars_sort: + /* Cleanup only if any of the indices went stale. */ + runner_do_stars_sort( + r, ci, t->flags, + ci->stars.dx_max_sort_old > space_maxreldx * ci->dmin, 1); + /* Reset the sort flags as our work here is done. */ + t->flags = 0; + break; + case task_type_init_grav: + runner_do_init_grav(r, ci, 1); + break; + case task_type_ghost: + runner_do_ghost(r, ci, 1); + break; #ifdef EXTRA_HYDRO_LOOP - case task_type_extra_ghost: - runner_do_extra_ghost(r, ci, 1); - break; + case task_type_extra_ghost: + runner_do_extra_ghost(r, ci, 1); + break; #endif - case task_type_stars_ghost: - runner_do_stars_ghost(r, ci, 1); - break; - case task_type_bh_density_ghost: - runner_do_black_holes_density_ghost(r, ci, 1); - break; - case task_type_bh_swallow_ghost3: - runner_do_black_holes_swallow_ghost(r, ci, 1); - break; - case task_type_drift_part: - runner_do_drift_part(r, ci, 1); - break; - case task_type_drift_spart: - runner_do_drift_spart(r, ci, 1); - break; - case task_type_drift_sink: - runner_do_drift_sink(r, ci, 1); - break; - case task_type_drift_bpart: - runner_do_drift_bpart(r, ci, 1); - break; - case task_type_drift_gpart: - runner_do_drift_gpart(r, ci, 1); - break; - case task_type_kick1: - runner_do_kick1(r, ci, 1); - break; - case task_type_kick2: - runner_do_kick2(r, ci, 1); - break; - case task_type_end_hydro_force: - runner_do_end_hydro_force(r, ci, 1); - break; - case task_type_end_grav_force: - runner_do_end_grav_force(r, ci, 1); - break; - case task_type_csds: - runner_do_csds(r, ci, 1); - break; - case task_type_timestep: - runner_do_timestep(r, ci, 1); - break; - case task_type_timestep_limiter: - runner_do_limiter(r, ci, 0, 1); - break; - case task_type_timestep_sync: - runner_do_sync(r, ci, 0, 1); - break; - case task_type_collect: - runner_do_timestep_collect(r, ci, 1); - break; - case task_type_rt_collect_times: - runner_do_collect_rt_times(r, ci, 1); - break; + case task_type_stars_ghost: + runner_do_stars_ghost(r, ci, 1); + break; + case task_type_bh_density_ghost: + runner_do_black_holes_density_ghost(r, ci, 1); + break; + case task_type_bh_swallow_ghost3: + runner_do_black_holes_swallow_ghost(r, ci, 1); + break; + case task_type_drift_part: + runner_do_drift_part(r, ci, 1); + break; + case task_type_drift_spart: + runner_do_drift_spart(r, ci, 1); + break; + case task_type_drift_sink: + runner_do_drift_sink(r, ci, 1); + break; + case task_type_drift_bpart: + runner_do_drift_bpart(r, ci, 1); + break; + case task_type_drift_gpart: + runner_do_drift_gpart(r, ci, 1); + break; + case task_type_kick1: + runner_do_kick1(r, ci, 1); + break; + case task_type_kick2: + runner_do_kick2(r, ci, 1); + break; + case task_type_end_hydro_force: + runner_do_end_hydro_force(r, ci, 1); + break; + case task_type_end_grav_force: + runner_do_end_grav_force(r, ci, 1); + break; + case task_type_csds: + runner_do_csds(r, ci, 1); + break; + case task_type_timestep: + runner_do_timestep(r, ci, 1); + break; + case task_type_timestep_limiter: + runner_do_limiter(r, ci, 0, 1); + break; + case task_type_timestep_sync: + runner_do_sync(r, ci, 0, 1); + break; + case task_type_collect: + runner_do_timestep_collect(r, ci, 1); + break; + case 
task_type_rt_collect_times: + runner_do_collect_rt_times(r, ci, 1); + break; #ifdef WITH_MPI - case task_type_send: - if (t->subtype == task_subtype_tend) { - free(t->buff); - } else if (t->subtype == task_subtype_sf_counts) { - free(t->buff); - } else if (t->subtype == task_subtype_part_swallow) { - free(t->buff); - } else if (t->subtype == task_subtype_bpart_merger) { - free(t->buff); - } else if (t->subtype == task_subtype_limiter) { - free(t->buff); - } - break; - case task_type_recv: - if (t->subtype == task_subtype_tend) { - cell_unpack_end_step(ci, (struct pcell_step *)t->buff); - free(t->buff); - } else if (t->subtype == task_subtype_sf_counts) { - cell_unpack_sf_counts(ci, (struct pcell_sf *)t->buff); - cell_clear_stars_sort_flags(ci, /*clear_unused_flags=*/0); - free(t->buff); - } else if (t->subtype == task_subtype_xv) { - runner_do_recv_part(r, ci, 1, 1); - } else if (t->subtype == task_subtype_rho) { - runner_do_recv_part(r, ci, 0, 1); - } else if (t->subtype == task_subtype_gradient) { - runner_do_recv_part(r, ci, 0, 1); - } else if (t->subtype == task_subtype_rt_gradient) { - runner_do_recv_part(r, ci, 2, 1); - } else if (t->subtype == task_subtype_rt_transport) { - runner_do_recv_part(r, ci, -1, 1); - } else if (t->subtype == task_subtype_part_swallow) { - cell_unpack_part_swallow(ci, - (struct black_holes_part_data *)t->buff); - free(t->buff); - } else if (t->subtype == task_subtype_bpart_merger) { - cell_unpack_bpart_swallow(ci, - (struct black_holes_bpart_data *)t->buff); - free(t->buff); - } else if (t->subtype == task_subtype_limiter) { - /* Nothing to do here. Unpacking done in a separate task */ - } else if (t->subtype == task_subtype_gpart) { - runner_do_recv_gpart(r, ci, 1); - } else if (t->subtype == task_subtype_spart_density) { - runner_do_recv_spart(r, ci, 1, 1); - } else if (t->subtype == task_subtype_part_prep1) { - runner_do_recv_part(r, ci, 0, 1); - } else if (t->subtype == task_subtype_spart_prep2) { - runner_do_recv_spart(r, ci, 0, 1); - } else if (t->subtype == task_subtype_bpart_rho) { - runner_do_recv_bpart(r, ci, 1, 1); - } else if (t->subtype == task_subtype_bpart_feedback) { - runner_do_recv_bpart(r, ci, 0, 1); - } else { - error("Unknown/invalid task subtype (%d).", t->subtype); - } - break; - - case task_type_pack: - runner_do_pack_limiter(r, ci, &t->buff, 1); - task_get_unique_dependent(t)->buff = t->buff; - break; - case task_type_unpack: - runner_do_unpack_limiter(r, ci, t->buff, 1); - break; + case task_type_send: + if (t->subtype == task_subtype_tend) { + free(t->buff); + } else if (t->subtype == task_subtype_sf_counts) { + free(t->buff); + } else if (t->subtype == task_subtype_part_swallow) { + free(t->buff); + } else if (t->subtype == task_subtype_bpart_merger) { + free(t->buff); + } else if (t->subtype == task_subtype_limiter) { + free(t->buff); + } + break; + case task_type_recv: + if (t->subtype == task_subtype_tend) { + cell_unpack_end_step(ci, (struct pcell_step *)t->buff); + free(t->buff); + } else if (t->subtype == task_subtype_sf_counts) { + cell_unpack_sf_counts(ci, (struct pcell_sf *)t->buff); + cell_clear_stars_sort_flags(ci, /*clear_unused_flags=*/0); + free(t->buff); + } else if (t->subtype == task_subtype_xv) { + runner_do_recv_part(r, ci, 1, 1); + } else if (t->subtype == task_subtype_rho) { + runner_do_recv_part(r, ci, 0, 1); + } else if (t->subtype == task_subtype_gradient) { + runner_do_recv_part(r, ci, 0, 1); + } else if (t->subtype == task_subtype_rt_gradient) { + runner_do_recv_part(r, ci, 2, 1); + } else if (t->subtype 
== task_subtype_rt_transport) { + runner_do_recv_part(r, ci, -1, 1); + } else if (t->subtype == task_subtype_part_swallow) { + cell_unpack_part_swallow(ci, + (struct black_holes_part_data *)t->buff); + free(t->buff); + } else if (t->subtype == task_subtype_bpart_merger) { + cell_unpack_bpart_swallow(ci, + (struct black_holes_bpart_data *)t->buff); + free(t->buff); + } else if (t->subtype == task_subtype_limiter) { + /* Nothing to do here. Unpacking done in a separate task */ + } else if (t->subtype == task_subtype_gpart) { + runner_do_recv_gpart(r, ci, 1); + } else if (t->subtype == task_subtype_spart_density) { + runner_do_recv_spart(r, ci, 1, 1); + } else if (t->subtype == task_subtype_part_prep1) { + runner_do_recv_part(r, ci, 0, 1); + } else if (t->subtype == task_subtype_spart_prep2) { + runner_do_recv_spart(r, ci, 0, 1); + } else if (t->subtype == task_subtype_bpart_rho) { + runner_do_recv_bpart(r, ci, 1, 1); + } else if (t->subtype == task_subtype_bpart_feedback) { + runner_do_recv_bpart(r, ci, 0, 1); + } else { + error("Unknown/invalid task subtype (%d).", t->subtype); + } + break; + + case task_type_pack: + runner_do_pack_limiter(r, ci, &t->buff, 1); + task_get_unique_dependent(t)->buff = t->buff; + break; + case task_type_unpack: + runner_do_unpack_limiter(r, ci, t->buff, 1); + break; #endif - case task_type_grav_down: - runner_do_grav_down(r, t->ci, 1); - break; - case task_type_grav_long_range: - runner_do_grav_long_range(r, t->ci, 1); - break; - case task_type_grav_mm: - runner_dopair_grav_mm_progenies(r, t->flags, t->ci, t->cj); - break; - case task_type_cooling: - runner_do_cooling(r, t->ci, 1); - break; - case task_type_star_formation: - runner_do_star_formation(r, t->ci, 1); - break; - case task_type_star_formation_sink: - runner_do_star_formation_sink(r, t->ci, 1); - break; - case task_type_stars_resort: - runner_do_stars_resort(r, t->ci, 1); - break; - case task_type_sink_formation: - runner_do_sink_formation(r, t->ci); - break; - case task_type_fof_self: - runner_do_fof_search_self(r, t->ci, 1); - break; - case task_type_fof_pair: - runner_do_fof_search_pair(r, t->ci, t->cj, 1); - break; - case task_type_fof_attach_self: - runner_do_fof_attach_self(r, t->ci, 1); - break; - case task_type_fof_attach_pair: - runner_do_fof_attach_pair(r, t->ci, t->cj, 1); - break; - case task_type_neutrino_weight: - runner_do_neutrino_weighting(r, ci, 1); - break; - case task_type_rt_ghost1: - runner_do_rt_ghost1(r, t->ci, 1); - break; - case task_type_rt_ghost2: - runner_do_rt_ghost2(r, t->ci, 1); - break; - case task_type_rt_tchem: - runner_do_rt_tchem(r, t->ci, 1); - break; - case task_type_rt_advance_cell_time: - runner_do_rt_advance_cell_time(r, t->ci, 1); - break; - default: - error("Unknown/invalid task type (%d).", t->type); - } - r->active_time += (getticks() - task_beg); + case task_type_grav_down: + runner_do_grav_down(r, t->ci, 1); + break; + case task_type_grav_long_range: + runner_do_grav_long_range(r, t->ci, 1); + break; + case task_type_grav_mm: + runner_dopair_grav_mm_progenies(r, t->flags, t->ci, t->cj); + break; + case task_type_cooling: + runner_do_cooling(r, t->ci, 1); + break; + case task_type_star_formation: + runner_do_star_formation(r, t->ci, 1); + break; + case task_type_star_formation_sink: + runner_do_star_formation_sink(r, t->ci, 1); + break; + case task_type_stars_resort: + runner_do_stars_resort(r, t->ci, 1); + break; + case task_type_sink_formation: + runner_do_sink_formation(r, t->ci); + break; + case task_type_fof_self: + runner_do_fof_search_self(r, 
t->ci, 1); + break; + case task_type_fof_pair: + runner_do_fof_search_pair(r, t->ci, t->cj, 1); + break; + case task_type_fof_attach_self: + runner_do_fof_attach_self(r, t->ci, 1); + break; + case task_type_fof_attach_pair: + runner_do_fof_attach_pair(r, t->ci, t->cj, 1); + break; + case task_type_neutrino_weight: + runner_do_neutrino_weighting(r, ci, 1); + break; + case task_type_rt_ghost1: + runner_do_rt_ghost1(r, t->ci, 1); + break; + case task_type_rt_ghost2: + runner_do_rt_ghost2(r, t->ci, 1); + break; + case task_type_rt_tchem: + runner_do_rt_tchem(r, t->ci, 1); + break; + case task_type_rt_advance_cell_time: + runner_do_rt_advance_cell_time(r, t->ci, 1); + break; + default: + error("Unknown/invalid task type (%d).", t->type); + } + r->active_time += (getticks() - task_beg); /* Mark that we have run this task on these cells */ #ifdef SWIFT_DEBUG_CHECKS @@ -1758,91 +1851,115 @@ void *runner_main2(void *data) { r->t = NULL; #endif - /* We're done with this task, see if we get a next one. */ prev = t; #ifdef GPUOFFLOAD -// if (t->type == task_type_self && t->subtype == task_subtype_gpu_pack){ + // if (t->type == task_type_self && t->subtype == + // task_subtype_gpu_pack){ if (t->subtype == task_subtype_gpu_pack || - t->subtype == task_subtype_gpu_pack_g || - t->subtype == task_subtype_gpu_pack_f){ - /* Don't enqueue unpacks yet. Just signal the runners */ + t->subtype == task_subtype_gpu_pack_g || + t->subtype == task_subtype_gpu_pack_f) { + /* Don't enqueue unpacks yet. Just signal the runners */ t->skip = 1; t = NULL; -// if(t->gpu_done == 0)message("Missed packing a GPU tasks\n"); - } - else{ /* Mark task as done, as per usual */ + // if(t->gpu_done == 0)message("Missed packing a GPU tasks\n"); + } else { /* Mark task as done, as per usual */ t = scheduler_done(sched, t); } -#else //GPUOFFLOAD - t = scheduler_done(sched, t); -#endif //GPUOFFLOAD +#else // GPUOFFLOAD + t = scheduler_done(sched, t); +#endif // GPUOFFLOAD } /* main loop. 
*/ - // Stuff for writing debug data to file for validation -//// if (step % 10 == 0 || step == 1) { -// if(r->cpuid == 0 && engine_rank == 0)fprintf(fgpu_steps, "x, y, z, rho, rhodh, v_sig, lap_u, a_visc_max, ax, ay, az\n"); -// for (int tid = 0; tid < space->nr_local_cells; -// tid++) { /* This should indeed be tasks_done_gpu as they are the only -//// tasks which have been done*/ -// struct cell *ctemp = &(space->cells_top[tid]); -// for (int i = 0; i < ctemp->hydro.count; i++) { -// fprintf(fgpu_steps, "%f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f\n", -// ctemp->hydro.parts[i].x[0], ctemp->hydro.parts[i].x[1], -// ctemp->hydro.parts[i].x[2], ctemp->hydro.parts[i].rho, -// ctemp->hydro.parts[i].density.rho_dh, -// ctemp->hydro.parts[i].viscosity.v_sig, ctemp->hydro.parts[i].diffusion.laplace_u, -// ctemp->hydro.parts[i].force.alpha_visc_max_ngb, ctemp->hydro.parts[i].a_hydro[0], -// ctemp->hydro.parts[i].a_hydro[1], -// ctemp->hydro.parts[i].a_hydro[2]); -//// message("wcount %f density %f", ctemp->hydro.parts[i].density.wcount, ctemp->hydro.parts[i].rho); -//// message("wcount is %f\n", ctemp->hydro.parts[i].density.wcount); -// } -// } -//// } + // Stuff for writing debug data to file for validation + //// if (step % 10 == 0 || step == 1) { + // if(r->cpuid == 0 && engine_rank == 0)fprintf(fgpu_steps, "x, y, z, + // rho, rhodh, v_sig, lap_u, a_visc_max, ax, ay, az\n"); for (int tid = + // 0; tid < space->nr_local_cells; + // tid++) { /* This should indeed be tasks_done_gpu as they are + // the only + //// tasks which have been done*/ + // struct cell *ctemp = &(space->cells_top[tid]); + // for (int i = 0; i < ctemp->hydro.count; i++) { + // fprintf(fgpu_steps, "%f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, + // %f\n", + // ctemp->hydro.parts[i].x[0], ctemp->hydro.parts[i].x[1], + // ctemp->hydro.parts[i].x[2], ctemp->hydro.parts[i].rho, + // ctemp->hydro.parts[i].density.rho_dh, + // ctemp->hydro.parts[i].viscosity.v_sig, + // ctemp->hydro.parts[i].diffusion.laplace_u, + // ctemp->hydro.parts[i].force.alpha_visc_max_ngb, + // ctemp->hydro.parts[i].a_hydro[0], + // ctemp->hydro.parts[i].a_hydro[1], + // ctemp->hydro.parts[i].a_hydro[2]); + //// message("wcount %f density %f", + ///ctemp->hydro.parts[i].density.wcount, ctemp->hydro.parts[i].rho); / + ///message("wcount is %f\n", ctemp->hydro.parts[i].density.wcount); + // } + // } + //// } /*Output compute times to separate files. 
cat later into one file*/ // if (step % 11 == 0 || step == 1) { #ifdef DUMP_TIMINGS #ifdef GPUOFFLOAD -// char buffer[30]; -// snprintf(buffer, sizeof(buffer), "t%d_stepnfullbundles%d", r->cpuid, step); -// FILE *fullbundles = fopen(buffer, "w"); -// if(r->cpuid == 0)fprintf(fullbundles, "nfull, npartial, nfullpair, npartialpair\n"); -// else fprintf(fullbundles, "%i, %i, %i, %i\n", -// n_full_d_bundles, n_partial_d_bundles, n_full_p_d_bundles, n_partial_p_d_bundles); -// fflush(fullbundles); - -/////////////////////////////////////////////////////////////// -///to ooutput timings uncomment this -/////////////////////////////////////////////////////////////// - if(r->cpuid == 0 && engine_rank == 0)fprintf(fgpu_steps, "GPU_SD, P_SD, U_SD, GPU_PD, P_PD, U_PD, " - "GPU_SF, P_SF, U_SF, GPU_PF, P_PF, U_PF, GPU_SG, P_SG, U_SG, GPU_PG, P_PG, U_PG\n " - "%e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e\n", - time_for_density_gpu, packing_time, unpack_time_self, time_for_density_gpu_pair, packing_time_pair, unpacking_time_pair, - time_for_gpu_f, packing_time_f, unpack_time_self_f, time_for_gpu_pair_f, packing_time_pair_f, unpacking_time_pair_f, - time_for_gpu_g, packing_time_g, unpack_time_self_g, time_for_gpu_pair_g, packing_time_pair_g, unpacking_time_pair_f); - - else fprintf(fgpu_steps, "%e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e\n", - time_for_density_gpu, packing_time, unpack_time_self, time_for_density_gpu_pair, packing_time_pair, unpacking_time_pair, - time_for_gpu_f, packing_time_f, unpack_time_self_f, time_for_gpu_pair_f, packing_time_pair_f, unpacking_time_pair_f, - time_for_gpu_g, packing_time_g, unpack_time_self_g, time_for_gpu_pair_g, packing_time_pair_g, unpacking_time_pair_f); -////////////////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////// - -#else //GPUOFFLOAD - if(r->cpuid == 0 && engine_rank == 0)fprintf(fgpu_steps, "CPU TIME SELF, CPU TIME PAIR, " - "CPU TIME SELF F, CPU TIME PAIR F, CPU TIME SELF G, CPU TIME PAIR G\n " - "%e, %e, %e, %e, %e, %e\n", time_for_density_cpu, time_for_density_cpu_pair, - time_for_cpu_f, time_for_cpu_pair_f, time_for_cpu_g, time_for_cpu_pair_g); - - else fprintf(fgpu_steps,"%e, %e, %e, %e, %e, %e,\n", time_for_density_cpu, time_for_density_cpu_pair, - time_for_cpu_f, time_for_cpu_pair_f, time_for_cpu_g, time_for_cpu_pair_g); -#endif //GPUOFFLOAD -#endif //DUMPTIMINGS -// } - fflush(fgpu_steps); - fclose(fgpu_steps); + // char buffer[30]; + // snprintf(buffer, sizeof(buffer), "t%d_stepnfullbundles%d", + // r->cpuid, step); FILE *fullbundles = fopen(buffer, "w"); + // if(r->cpuid == 0)fprintf(fullbundles, "nfull, npartial, nfullpair, + // npartialpair\n"); else fprintf(fullbundles, "%i, %i, %i, %i\n", + // n_full_d_bundles, n_partial_d_bundles, + // n_full_p_d_bundles, n_partial_p_d_bundles); fflush(fullbundles); + + /////////////////////////////////////////////////////////////// + /// to ooutput timings uncomment this + /////////////////////////////////////////////////////////////// + if (r->cpuid == 0 && engine_rank == 0) + fprintf(fgpu_steps, + "GPU_SD, P_SD, U_SD, GPU_PD, P_PD, U_PD, " + "GPU_SF, P_SF, U_SF, GPU_PF, P_PF, U_PF, GPU_SG, P_SG, U_SG, " + "GPU_PG, P_PG, U_PG\n " + "%e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, " + "%e, %e\n", + time_for_density_gpu, packing_time, unpack_time_self, + time_for_density_gpu_pair, packing_time_pair, 
unpacking_time_pair, + time_for_gpu_f, packing_time_f, unpack_time_self_f, + time_for_gpu_pair_f, packing_time_pair_f, unpacking_time_pair_f, + time_for_gpu_g, packing_time_g, unpack_time_self_g, + time_for_gpu_pair_g, packing_time_pair_g, unpacking_time_pair_f); + + else + fprintf(fgpu_steps, + "%e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, " + "%e, %e\n", + time_for_density_gpu, packing_time, unpack_time_self, + time_for_density_gpu_pair, packing_time_pair, unpacking_time_pair, + time_for_gpu_f, packing_time_f, unpack_time_self_f, + time_for_gpu_pair_f, packing_time_pair_f, unpacking_time_pair_f, + time_for_gpu_g, packing_time_g, unpack_time_self_g, + time_for_gpu_pair_g, packing_time_pair_g, unpacking_time_pair_f); + ////////////////////////////////////////////////////////////// + /////////////////////////////////////////////////////////////// + /////////////////////////////////////////////////////////////// + +#else // GPUOFFLOAD + if (r->cpuid == 0 && engine_rank == 0) + fprintf(fgpu_steps, + "CPU TIME SELF, CPU TIME PAIR, " + "CPU TIME SELF F, CPU TIME PAIR F, CPU TIME SELF G, CPU TIME " + "PAIR G\n " + "%e, %e, %e, %e, %e, %e\n", + time_for_density_cpu, time_for_density_cpu_pair, time_for_cpu_f, + time_for_cpu_pair_f, time_for_cpu_g, time_for_cpu_pair_g); + + else + fprintf(fgpu_steps, "%e, %e, %e, %e, %e, %e,\n", time_for_density_cpu, + time_for_density_cpu_pair, time_for_cpu_f, time_for_cpu_pair_f, + time_for_cpu_g, time_for_cpu_pair_g); +#endif // GPUOFFLOAD +#endif // DUMPTIMINGS + // } + fflush(fgpu_steps); + fclose(fgpu_steps); time_for_density_cpu = 0.0; time_for_density_gpu = 0.0; time_for_density_cpu_pair = 0.0; @@ -1852,7 +1969,9 @@ void *runner_main2(void *data) { tasks_done_gpu = 0; tasks_done_cpu = 0; tasks_done_gpu_inc = 0; - if(ghost_in > 0)fprintf(stderr,"total tasks not done on GPU %i is %i\n", r->cpuid, ghost_in); + if (ghost_in > 0) + fprintf(stderr, "total tasks not done on GPU %i is %i\n", r->cpuid, + ghost_in); packed_self = 0; packed_pair = 0; packed_self_f = 0; @@ -1862,140 +1981,139 @@ void *runner_main2(void *data) { density = 0; density_sub = 0; unpacked = 0; -// if(step == 2)cudaProfilerStop(); -// if(step == 2)exit(0); -// size_t free_byte ; -// size_t total_byte ; -// cudaError_t cuda_status = cudaMemGetInfo( &free_byte, &total_byte ) ; -// double free = (double)free_byte; -// double available = (double)total_byte; -// double used = (available - free); -// fprintf(stderr, "Used %f GB GPU memory\n", used/1e9); + // if(step == 2)cudaProfilerStop(); + // if(step == 2)exit(0); + // size_t free_byte ; + // size_t total_byte ; + // cudaError_t cuda_status = cudaMemGetInfo( &free_byte, &total_byte ) ; + // double free = (double)free_byte; + // double available = (double)total_byte; + // double used = (available - free); + // fprintf(stderr, "Used %f GB GPU memory\n", used/1e9); /* Wait at the wait barrier. 
*/ -// swift_barrier_wait(&e->wait_barrier); - + // swift_barrier_wait(&e->wait_barrier); } // Free all data -// cudaFree(d_tid_p); -// cudaFree(d_id); -// cudaFree(d_x_p); -// cudaFree(d_y_p); -// cudaFree(d_z_p); -// cudaFree(d_ux); -// cudaFree(d_uy); -// cudaFree(d_uz); -// cudaFree(d_a_hydrox); -// cudaFree(d_a_hydroy); -// cudaFree(d_a_hydroz); -// cudaFree(d_mass); -// cudaFree(d_h); -// cudaFree(d_u); -// cudaFree(d_u_dt); -// cudaFree(d_rho); -// cudaFree(d_SPH_sum); -// cudaFree(d_locx); -// cudaFree(d_locy); -// cudaFree(d_locz); -// cudaFree(d_widthx); -// cudaFree(d_widthy); -// cudaFree(d_widthz); -// cudaFree(d_h_max); -// cudaFree(d_count_p); -// cudaFree(d_wcount); -// cudaFree(d_wcount_dh); -// cudaFree(d_rho_dh); -// cudaFree(d_rot_ux); -// cudaFree(d_rot_uy); -// cudaFree(d_rot_uz); -// cudaFree(d_div_v); -// cudaFree(d_div_v_previous_step); -// cudaFree(d_alpha_visc); -// cudaFree(d_v_sig); -// cudaFree(d_laplace_u); -// cudaFree(d_alpha_diff); -// cudaFree(d_f); -// cudaFree(d_soundspeed); -// cudaFree(d_h_dt); -// cudaFree(d_balsara); -// cudaFree(d_pressure); -// cudaFree(d_alpha_visc_max_ngb); -// cudaFree(d_time_bin); -// cudaFree(d_wakeup); -// cudaFree(d_min_ngb_time_bin); -// cudaFree(d_to_be_synchronized); -// cudaFree(tid_p); -// cudaFree(id); -// cudaFree(mass); -// cudaFree(h); -// cudaFree(u); -// cudaFree(u_dt); -// cudaFree(rho); -// cudaFree(SPH_sum); -// cudaFree(x_p); -// cudaFree(y_p); -// cudaFree(z_p); -// cudaFree(ux); -// cudaFree(uy); -// cudaFree(uz); -// cudaFree(a_hydrox); -// cudaFree(a_hydroy); -// cudaFree(a_hydroz); -// cudaFree(locx); -// cudaFree(locy); -// cudaFree(locz); -// cudaFree(widthx); -// cudaFree(widthy); -// cudaFree(widthz); -// cudaFree(h_max); -// cudaFree(count_p); -// cudaFree(wcount); -// cudaFree(wcount_dh); -// cudaFree(rho_dh); -// cudaFree(rot_ux); -// cudaFree(rot_uy); -// cudaFree(rot_uz); -// cudaFree(div_v); -// cudaFree(div_v_previous_step); -// cudaFree(alpha_visc); -// cudaFree(v_sig); -// cudaFree(laplace_u); -// cudaFree(alpha_diff); -// cudaFree(f); -// cudaFree(soundspeed); -// cudaFree(h_dt); -// cudaFree(balsara); -// cudaFree(pressure); -// cudaFree(alpha_visc_max_ngb); -// cudaFree(time_bin); -// cudaFree(wakeup); -// cudaFree(min_ngb_time_bin); -// cudaFree(to_be_synchronized); -// cudaFree(partid_p); -// cudaFree(d_task_first_part); -// cudaFree(d_task_last_part); -// cudaFree(task_first_part_self_dens); -// cudaFree(task_last_part_self_dens); -// cudaFree(task_first_part_pair_ci); -// cudaFree(task_last_part_pair_ci); -// cudaFree(task_first_part_pair_cj); -// cudaFree(task_last_part_pair_cj); -// cudaFree(d_bundle_first_part_self_dens); -// cudaFree(d_bundle_last_part_self_dens); -// cudaFree(bundle_first_part_self_dens); -// cudaFree(bundle_last_part_self_dens); -// cudaFree(bundle_first_part_pair_ci); -// cudaFree(bundle_last_part_pair_ci); -// cudaFree(bundle_first_part_pair_cj); -// cudaFree(bundle_last_part_pair_cj); -// free(ci_list_self_dens); -// free(ci_list_pair); -// free(cj_list_pair); + // cudaFree(d_tid_p); + // cudaFree(d_id); + // cudaFree(d_x_p); + // cudaFree(d_y_p); + // cudaFree(d_z_p); + // cudaFree(d_ux); + // cudaFree(d_uy); + // cudaFree(d_uz); + // cudaFree(d_a_hydrox); + // cudaFree(d_a_hydroy); + // cudaFree(d_a_hydroz); + // cudaFree(d_mass); + // cudaFree(d_h); + // cudaFree(d_u); + // cudaFree(d_u_dt); + // cudaFree(d_rho); + // cudaFree(d_SPH_sum); + // cudaFree(d_locx); + // cudaFree(d_locy); + // cudaFree(d_locz); + // cudaFree(d_widthx); + // cudaFree(d_widthy); 
+ // cudaFree(d_widthz); + // cudaFree(d_h_max); + // cudaFree(d_count_p); + // cudaFree(d_wcount); + // cudaFree(d_wcount_dh); + // cudaFree(d_rho_dh); + // cudaFree(d_rot_ux); + // cudaFree(d_rot_uy); + // cudaFree(d_rot_uz); + // cudaFree(d_div_v); + // cudaFree(d_div_v_previous_step); + // cudaFree(d_alpha_visc); + // cudaFree(d_v_sig); + // cudaFree(d_laplace_u); + // cudaFree(d_alpha_diff); + // cudaFree(d_f); + // cudaFree(d_soundspeed); + // cudaFree(d_h_dt); + // cudaFree(d_balsara); + // cudaFree(d_pressure); + // cudaFree(d_alpha_visc_max_ngb); + // cudaFree(d_time_bin); + // cudaFree(d_wakeup); + // cudaFree(d_min_ngb_time_bin); + // cudaFree(d_to_be_synchronized); + // cudaFree(tid_p); + // cudaFree(id); + // cudaFree(mass); + // cudaFree(h); + // cudaFree(u); + // cudaFree(u_dt); + // cudaFree(rho); + // cudaFree(SPH_sum); + // cudaFree(x_p); + // cudaFree(y_p); + // cudaFree(z_p); + // cudaFree(ux); + // cudaFree(uy); + // cudaFree(uz); + // cudaFree(a_hydrox); + // cudaFree(a_hydroy); + // cudaFree(a_hydroz); + // cudaFree(locx); + // cudaFree(locy); + // cudaFree(locz); + // cudaFree(widthx); + // cudaFree(widthy); + // cudaFree(widthz); + // cudaFree(h_max); + // cudaFree(count_p); + // cudaFree(wcount); + // cudaFree(wcount_dh); + // cudaFree(rho_dh); + // cudaFree(rot_ux); + // cudaFree(rot_uy); + // cudaFree(rot_uz); + // cudaFree(div_v); + // cudaFree(div_v_previous_step); + // cudaFree(alpha_visc); + // cudaFree(v_sig); + // cudaFree(laplace_u); + // cudaFree(alpha_diff); + // cudaFree(f); + // cudaFree(soundspeed); + // cudaFree(h_dt); + // cudaFree(balsara); + // cudaFree(pressure); + // cudaFree(alpha_visc_max_ngb); + // cudaFree(time_bin); + // cudaFree(wakeup); + // cudaFree(min_ngb_time_bin); + // cudaFree(to_be_synchronized); + // cudaFree(partid_p); + // cudaFree(d_task_first_part); + // cudaFree(d_task_last_part); + // cudaFree(task_first_part_self_dens); + // cudaFree(task_last_part_self_dens); + // cudaFree(task_first_part_pair_ci); + // cudaFree(task_last_part_pair_ci); + // cudaFree(task_first_part_pair_cj); + // cudaFree(task_last_part_pair_cj); + // cudaFree(d_bundle_first_part_self_dens); + // cudaFree(d_bundle_last_part_self_dens); + // cudaFree(bundle_first_part_self_dens); + // cudaFree(bundle_last_part_self_dens); + // cudaFree(bundle_first_part_pair_ci); + // cudaFree(bundle_last_part_pair_ci); + // cudaFree(bundle_first_part_pair_cj); + // cudaFree(bundle_last_part_pair_cj); + // free(ci_list_self_dens); + // free(ci_list_pair); + // free(cj_list_pair); /* Be kind, rewind. */ return NULL; } -#endif // WITH_CUDA +#endif // WITH_CUDA #include #include diff --git a/src/runner_others.c b/src/runner_others.c index cbace92a63..f25dcb0b44 100644 --- a/src/runner_others.c +++ b/src/runner_others.c @@ -296,7 +296,7 @@ void runner_do_star_formation_sink(struct runner *r, struct cell *c, sink_update_sink_properties_after_star_formation(s, e, sink_props, phys_const); } /* if sink_is_active */ - } /* Loop over the particles */ + } /* Loop over the particles */ } /* If we formed any stars, the star sorts are now invalid. We need to diff --git a/src/runner_sinks.c b/src/runner_sinks.c index eeece9401b..245417bfbc 100644 --- a/src/runner_sinks.c +++ b/src/runner_sinks.c @@ -109,8 +109,8 @@ void runner_doself_sinks_swallow(struct runner *r, struct cell *c, int timer) { e->gravity_properties, e->sink_properties); } } /* loop over the parts in ci. */ - } /* loop over the bparts in ci. */ - } /* Do we have gas particles in the cell? 
*/ + } /* loop over the bparts in ci. */ + } /* Do we have gas particles in the cell? */ /* When doing sink swallowing, we need a quick loop also over the sink * neighbours */ @@ -165,7 +165,7 @@ void runner_doself_sinks_swallow(struct runner *r, struct cell *c, int timer) { e->gravity_properties, e->sink_properties); } } /* loop over the sinks in ci. */ - } /* loop over the sinks in ci. */ + } /* loop over the sinks in ci. */ if (timer) TIMER_TOC(timer_doself_sink_swallow); } @@ -252,8 +252,8 @@ void runner_do_nonsym_pair_sinks_naive_swallow(struct runner *r, e->gravity_properties, e->sink_properties); } } /* loop over the parts in cj. */ - } /* loop over the sinks in ci. */ - } /* Do we have gas particles in the cell? */ + } /* loop over the sinks in ci. */ + } /* Do we have gas particles in the cell? */ /* When doing sink swallowing, we need a quick loop also over the sinks * neighbours */ @@ -308,7 +308,7 @@ void runner_do_nonsym_pair_sinks_naive_swallow(struct runner *r, e->gravity_properties, e->sink_properties); } } /* loop over the sinks in cj. */ - } /* loop over the sinks in ci. */ + } /* loop over the sinks in ci. */ } /** @@ -682,7 +682,7 @@ void runner_do_sinks_gas_swallow(struct runner *r, struct cell *c, int timer) { get_integer_time_begin(ti_current + 1, p->time_bin); ti_beg_max = max(ti_beg, ti_beg_max); } /* Loop over the parts */ - } /* Cell is not split */ + } /* Cell is not split */ /* Update ti_beg_max. See bug fix above. */ if (ti_beg_max != c->hydro.ti_beg_max) { @@ -875,8 +875,8 @@ void runner_do_sinks_sink_swallow(struct runner *r, struct cell *c, int timer) { } } /* Part was flagged for swallowing */ - } /* Loop over the parts */ - } /* Cell is not split */ + } /* Loop over the parts */ + } /* Cell is not split */ } /** diff --git a/src/scheduler.c b/src/scheduler.c index 509205597c..3dd29d7966 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -901,9 +901,9 @@ void scheduler_write_cell_dependencies(struct scheduler *s, int verbose, int local_count = 0; for (int i = 0; i < s->nr_tasks; i++) { const struct task *ta = &s->tasks[i]; -// if(ta->subtype == task_subtype_gpu_unpack -// || ta->subtype == task_subtype_gpu_unpack_f -// || ta->subtype == task_subtype_gpu_unpack_g)continue; + // if(ta->subtype == task_subtype_gpu_unpack + // || ta->subtype == task_subtype_gpu_unpack_f + // || ta->subtype == task_subtype_gpu_unpack_g)continue; /* Are we using this task? * For the 0-step, we wish to show all the tasks (even the inactives). */ if (step != 0 && ta->skip) continue; @@ -955,9 +955,10 @@ void scheduler_write_cell_dependencies(struct scheduler *s, int verbose, /* and their dependencies */ for (int j = 0; j < ta->nr_unlock_tasks; j++) { const struct task *tb = ta->unlock_tasks[j]; - if(tb->subtype == task_subtype_gpu_unpack - || tb->subtype == task_subtype_gpu_unpack_f - || tb->subtype == task_subtype_gpu_unpack_g)continue; + if (tb->subtype == task_subtype_gpu_unpack || + tb->subtype == task_subtype_gpu_unpack_f || + tb->subtype == task_subtype_gpu_unpack_g) + continue; /* Are we using this task? * For the 0-step, we wish to show all the tasks (even the inactive). */ if (step != 0 && tb->skip) continue; @@ -1386,7 +1387,7 @@ static void scheduler_splittask_hydro(struct task *t, struct scheduler *s) { } } } /* pair interaction? */ - } /* iterate over the current task. */ + } /* iterate over the current task. 
*/ } /** @@ -1461,9 +1462,9 @@ static void scheduler_splittask_gravity(struct task *t, struct scheduler *s) { s); } /* Self-gravity only */ - } /* Make tasks explicitly */ - } /* Cell is split */ - } /* Self interaction */ + } /* Make tasks explicitly */ + } /* Cell is split */ + } /* Self interaction */ /* Pair interaction? */ else if (t->type == task_type_pair) { @@ -1536,7 +1537,7 @@ static void scheduler_splittask_gravity(struct task *t, struct scheduler *s) { } /* Split the pair */ } } /* pair interaction? */ - } /* iterate over the current task. */ + } /* iterate over the current task. */ } /** @@ -1656,13 +1657,13 @@ void scheduler_splittasks_mapper(void *map_data, int num_elements, scheduler_splittask_gravity(t, s); } else if (t->subtype == task_subtype_grav) { scheduler_splittask_gravity(t, s); - // if task is gpu task do not split A. Nasar - }else if (t->subtype == task_subtype_gpu_pack || - t->subtype == task_subtype_gpu_unpack|| - t->subtype == task_subtype_gpu_pack_g || - t->subtype == task_subtype_gpu_unpack_g|| - t->subtype == task_subtype_gpu_pack_f || - t->subtype == task_subtype_gpu_unpack_f) { + // if task is gpu task do not split A. Nasar + } else if (t->subtype == task_subtype_gpu_pack || + t->subtype == task_subtype_gpu_unpack || + t->subtype == task_subtype_gpu_pack_g || + t->subtype == task_subtype_gpu_unpack_g || + t->subtype == task_subtype_gpu_pack_f || + t->subtype == task_subtype_gpu_unpack_f) { continue; /*Do nothing and grab next task to split*/ } else { #ifdef SWIFT_DEBUG_CHECKS @@ -1761,21 +1762,21 @@ struct task *scheduler_addtask(struct scheduler *s, enum task_types type, if (ci != NULL) cell_set_flag(ci, cell_flag_has_tasks); if (cj != NULL) cell_set_flag(cj, cell_flag_has_tasks); - //#ifdef WITH_CUDA A. Nasar + // #ifdef WITH_CUDA A. Nasar if (t->type == task_type_self && t->subtype == task_subtype_gpu_pack) { atomic_inc(&s->nr_self_pack_tasks); } if (t->type == task_type_pair && t->subtype == task_subtype_gpu_pack) { atomic_inc(&s->nr_pair_pack_tasks); } - //#ifdef WITH_CUDA + // #ifdef WITH_CUDA if (t->type == task_type_self && t->subtype == task_subtype_gpu_pack_g) { atomic_inc(&s->nr_self_pack_tasks_g); } if (t->type == task_type_pair && t->subtype == task_subtype_gpu_pack_g) { atomic_inc(&s->nr_pair_pack_tasks_g); } - //#ifdef WITH_CUDA + // #ifdef WITH_CUDA if (t->type == task_type_self && t->subtype == task_subtype_gpu_pack_f) { atomic_inc(&s->nr_self_pack_tasks_f); } @@ -1783,7 +1784,7 @@ struct task *scheduler_addtask(struct scheduler *s, enum task_types type, atomic_inc(&s->nr_pair_pack_tasks_f); } - //#endif + // #endif /* Add an index for it. 
*/ // lock_lock( &s->lock ); s->tasks_ind[atomic_inc(&s->nr_tasks)] = ind; @@ -1869,10 +1870,11 @@ void scheduler_set_unlocks(struct scheduler *s) { struct task *t = &s->tasks[k]; for (int i = 0; i < t->nr_unlock_tasks; i++) { for (int j = i + 1; j < t->nr_unlock_tasks; j++) { - /*Fix for the case when one unpack task works over the same cell connected to two pair pack tasks*/ + /*Fix for the case when one unpack task works over the same cell + * connected to two pair pack tasks*/ if (t->unlock_tasks[i]->subtype == task_subtype_gpu_unpack || - t->unlock_tasks[i]->subtype == task_subtype_gpu_unpack_g || - t->unlock_tasks[i]->subtype != task_subtype_gpu_unpack_f){ + t->unlock_tasks[i]->subtype == task_subtype_gpu_unpack_g || + t->unlock_tasks[i]->subtype != task_subtype_gpu_unpack_f) { continue; } if (t->unlock_tasks[i] == t->unlock_tasks[j]) @@ -1982,7 +1984,7 @@ void scheduler_reset(struct scheduler *s, int size) { /* Reset the counters. */ s->size = size; s->nr_tasks = 0; - s->nr_self_pack_tasks = 0; // A. Nasar + s->nr_self_pack_tasks = 0; // A. Nasar s->nr_pair_pack_tasks = 0; s->nr_self_pack_tasks_f = 0; s->nr_pair_pack_tasks_f = 0; @@ -2056,12 +2058,12 @@ void scheduler_reweight(struct scheduler *s, int verbose) { cost = 1.f * (wscale * gcount_i) * gcount_i; } else if (t->subtype == task_subtype_external_grav) cost = 1.f * wscale * gcount_i; - else if (t->subtype == task_subtype_gpu_pack) // A. Nasar - cost = 1.f * (wscale * count_i) * count_i;// * s->pack_size; + else if (t->subtype == task_subtype_gpu_pack) // A. Nasar + cost = 1.f * (wscale * count_i) * count_i; // * s->pack_size; else if (t->subtype == task_subtype_gpu_pack_f) - cost = 1.f * (wscale * count_i) * count_i;// * s->pack_size; + cost = 1.f * (wscale * count_i) * count_i; // * s->pack_size; else if (t->subtype == task_subtype_gpu_pack_g) - cost = 1.f * (wscale * count_i) * count_i;// * s->pack_size; + cost = 1.f * (wscale * count_i) * count_i; // * s->pack_size; else if (t->subtype == task_subtype_gpu_unpack) cost = 1.f * wscale * s->pack_size; else if (t->subtype == task_subtype_gpu_unpack_f) @@ -2106,7 +2108,7 @@ void scheduler_reweight(struct scheduler *s, int verbose) { cost = 3.f * (wscale * gcount_i) * gcount_j; else cost = 2.f * (wscale * gcount_i) * gcount_j; - // Abouzied: Think about good cost (for rainy days) A. Nasar + // Abouzied: Think about good cost (for rainy days) A. Nasar } else if (t->subtype == task_subtype_gpu_pack) { cost = 2.f * (wscale * count_i) * count_i; } else if (t->subtype == task_subtype_gpu_pack_f) { @@ -2450,23 +2452,24 @@ void scheduler_rewait_mapper(void *map_data, int num_elements, t->done = 0; t->gpu_done = 0; -// if (t->type == task_type_self){ // A. 
Nasar increment number of waiting tasks -// if(t->subtype == task_subtype_gpu_pack) -// atomic_inc(&s->queues[t->ci->hydro.super->owner].n_packs_self_left); -// if (t->subtype == task_subtype_gpu_pack_f) -// atomic_inc(&s->queues[t->ci->hydro.super->owner].n_packs_self_left_f); -// if (t->subtype == task_subtype_gpu_pack_g) -// atomic_inc(&s->queues[t->ci->hydro.super->owner].n_packs_self_left_g); -// } -// -// if (t->type == task_type_pair){ -// if(t->subtype == task_subtype_gpu_pack) -// atomic_inc(&s->queues[t->ci->hydro.super->owner].n_packs_pair_left); -// if (t->subtype == task_subtype_gpu_pack_f) -// atomic_inc(&s->queues[t->ci->hydro.super->owner].n_packs_pair_left_f); -// if (t->subtype == task_subtype_gpu_pack_g) -// atomic_inc(&s->queues[t->ci->hydro.super->owner].n_packs_pair_left_g); -// } + // if (t->type == task_type_self){ // A. Nasar increment number of + // waiting tasks + // if(t->subtype == task_subtype_gpu_pack) + // atomic_inc(&s->queues[t->ci->hydro.super->owner].n_packs_self_left); + // if (t->subtype == task_subtype_gpu_pack_f) + // atomic_inc(&s->queues[t->ci->hydro.super->owner].n_packs_self_left_f); + // if (t->subtype == task_subtype_gpu_pack_g) + // atomic_inc(&s->queues[t->ci->hydro.super->owner].n_packs_self_left_g); + // } + // + // if (t->type == task_type_pair){ + // if(t->subtype == task_subtype_gpu_pack) + // atomic_inc(&s->queues[t->ci->hydro.super->owner].n_packs_pair_left); + // if (t->subtype == task_subtype_gpu_pack_f) + // atomic_inc(&s->queues[t->ci->hydro.super->owner].n_packs_pair_left_f); + // if (t->subtype == task_subtype_gpu_pack_g) + // atomic_inc(&s->queues[t->ci->hydro.super->owner].n_packs_pair_left_g); + // } #ifdef SWIFT_DEBUG_CHECKS /* Check that we don't have more waits that what can be stored. */ @@ -2504,7 +2507,7 @@ void scheduler_enqueue_mapper(void *map_data, int num_elements, * @param s The #scheduler. */ void scheduler_start(struct scheduler *s) { - for (int i = 0; i < s->nr_queues; i++){ // A. Nasar + for (int i = 0; i < s->nr_queues; i++) { // A. Nasar s->queues[i].n_packs_self_left = 0; s->queues[i].n_packs_pair_left = 0; s->queues[i].n_packs_self_left_f = 0; @@ -2587,13 +2590,13 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) { t->subtype == task_subtype_external_grav) { qid = t->ci->grav.super->owner; owner = &t->ci->grav.super->owner; - } else if (t->subtype == task_subtype_gpu_pack) { // A. Nasar + } else if (t->subtype == task_subtype_gpu_pack) { // A. 
Nasar qid = t->ci->hydro.super->owner; owner = &t->ci->hydro.super->owner; - // fprintf(stderr,"nqueues %i waiting %i active_count %i\n", - // s->nr_queues, s->waiting, s->active_count); - // if(qid==-1)fprintf(stderr,"queue id is negative\n"); - // else fprintf(stderr,"queue id is %i\n", qid); + // fprintf(stderr,"nqueues %i waiting %i active_count %i\n", + // s->nr_queues, s->waiting, s->active_count); + // if(qid==-1)fprintf(stderr,"queue id is negative\n"); + // else fprintf(stderr,"queue id is %i\n", qid); } else if (t->subtype == task_subtype_gpu_pack_f) { qid = t->ci->hydro.super->owner; owner = &t->ci->hydro.super->owner; @@ -2633,12 +2636,11 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) { break; case task_type_pair: case task_type_sub_pair: - if(t->subtype == task_subtype_gpu_unpack || - t->subtype == task_subtype_gpu_unpack_f || - t->subtype == task_subtype_gpu_unpack_g){ - qid = -1; - } - else{ + if (t->subtype == task_subtype_gpu_unpack || + t->subtype == task_subtype_gpu_unpack_f || + t->subtype == task_subtype_gpu_unpack_g) { + qid = -1; + } else { qid = t->ci->super->owner; owner = &t->ci->super->owner; if ((qid < 0) || @@ -2859,32 +2861,34 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) { /* Increase the waiting counter. */ atomic_inc(&s->waiting); - //A. Nasar Do the same for the pack tasks - if (t->type == task_type_self){ - if(t->subtype == task_subtype_gpu_pack) + // A. Nasar Do the same for the pack tasks + if (t->type == task_type_self) { + if (t->subtype == task_subtype_gpu_pack) atomic_inc(&s->queues[qid].n_packs_self_left); if (t->subtype == task_subtype_gpu_pack_f) atomic_inc(&s->queues[qid].n_packs_self_left_f); if (t->subtype == task_subtype_gpu_pack_g) atomic_inc(&s->queues[qid].n_packs_self_left_g); } - if (t->type == task_type_pair){ // A. Nasar NEED to think about how to do this with MPI where ci may not be on this node/rank - if(t->subtype == task_subtype_gpu_pack){ - if(t->ci->nodeID == s->nodeID) - atomic_inc(&s->queues[qid].n_packs_pair_left); - else - atomic_inc(&s->queues[qid].n_packs_pair_left); + if (t->type == + task_type_pair) { // A. Nasar NEED to think about how to do this with + // MPI where ci may not be on this node/rank + if (t->subtype == task_subtype_gpu_pack) { + if (t->ci->nodeID == s->nodeID) + atomic_inc(&s->queues[qid].n_packs_pair_left); + else + atomic_inc(&s->queues[qid].n_packs_pair_left); } - if (t->subtype == task_subtype_gpu_pack_f){ - if(t->ci->nodeID == s->nodeID) + if (t->subtype == task_subtype_gpu_pack_f) { + if (t->ci->nodeID == s->nodeID) + atomic_inc(&s->queues[qid].n_packs_pair_left_f); + else atomic_inc(&s->queues[qid].n_packs_pair_left_f); - else - atomic_inc(&s->queues[qid].n_packs_pair_left_f); } - if (t->subtype == task_subtype_gpu_pack_g){ - if(t->ci->nodeID == s->nodeID) + if (t->subtype == task_subtype_gpu_pack_g) { + if (t->ci->nodeID == s->nodeID) atomic_inc(&s->queues[qid].n_packs_pair_left_g); - else + else atomic_inc(&s->queues[qid].n_packs_pair_left_g); } } @@ -2943,16 +2947,16 @@ struct task *scheduler_done(struct scheduler *s, struct task *t) { struct task *signal_sleeping_runners(struct scheduler *s, struct task *t) { /* Mark the task as skip. */ -// t->skip = 1; + // t->skip = 1; /* Task definitely done, signal any sleeping runners. 
*/ if (!t->implicit) { - t->toc = getticks(); - t->total_ticks += t->toc - t->tic; - pthread_mutex_lock(&s->sleep_mutex); - atomic_dec(&s->waiting); - pthread_cond_broadcast(&s->sleep_cond); - pthread_mutex_unlock(&s->sleep_mutex); + t->toc = getticks(); + t->total_ticks += t->toc - t->tic; + pthread_mutex_lock(&s->sleep_mutex); + atomic_dec(&s->waiting); + pthread_cond_broadcast(&s->sleep_cond); + pthread_mutex_unlock(&s->sleep_mutex); } return NULL; } @@ -2960,17 +2964,17 @@ struct task *signal_sleeping_runners(struct scheduler *s, struct task *t) { struct task *enqueue_dependencies(struct scheduler *s, struct task *t) { /* Loop through the dependencies and add them to a queue if - they are ready. */ + they are ready. */ for (int k = 0; k < t->nr_unlock_tasks; k++) { - struct task *t2 = t->unlock_tasks[k]; - if (t2->skip) continue; - - const int res = atomic_dec(&t2->wait); - if (res < 1) { - error("Negative wait!"); - } else if (res == 1) { - scheduler_enqueue(s, t2); - } + struct task *t2 = t->unlock_tasks[k]; + if (t2->skip) continue; + + const int res = atomic_dec(&t2->wait); + if (res < 1) { + error("Negative wait!"); + } else if (res == 1) { + scheduler_enqueue(s, t2); + } } return NULL; @@ -3098,7 +3102,7 @@ struct task *scheduler_gettask(struct scheduler *s, int qid, const struct task *prev) { struct task *res = NULL; const int nr_queues = s->nr_queues; -// unsigned int seed = qid; + // unsigned int seed = qid; /* Check qid. */ if (qid >= nr_queues || qid < 0) error("Bad queue ID."); @@ -3116,26 +3120,28 @@ struct task *scheduler_gettask(struct scheduler *s, int qid, if (res != NULL) break; } - /* If unsuccessful, try stealing from the other queues. A. Nasar commented out for GPU work*/ -// if (s->flags & scheduler_flag_steal) { -// int count = 0, qids[nr_queues]; -// for (int k = 0; k < nr_queues; k++) -// if (s->queues[k].count > 0 || s->queues[k].count_incoming > 0) { -// qids[count++] = k; -// } -// for (int k = 0; k < scheduler_maxsteal && count > 0; k++) { -// const int ind = rand_r(&seed) % count; -// TIMER_TIC -// res = queue_gettask(&s->queues[qids[ind]], prev, 0); -// TIMER_TOC(timer_qsteal); -// if (res != NULL) { -// break; -// } else { -// qids[ind] = qids[--count]; -// } -// } -// if (res != NULL) break; -// } + /* If unsuccessful, try stealing from the other queues. A. Nasar commented + * out for GPU work*/ + // if (s->flags & scheduler_flag_steal) { + // int count = 0, qids[nr_queues]; + // for (int k = 0; k < nr_queues; k++) + // if (s->queues[k].count > 0 || s->queues[k].count_incoming > 0) + // { + // qids[count++] = k; + // } + // for (int k = 0; k < scheduler_maxsteal && count > 0; k++) { + // const int ind = rand_r(&seed) % count; + // TIMER_TIC + // res = queue_gettask(&s->queues[qids[ind]], prev, 0); + // TIMER_TOC(timer_qsteal); + // if (res != NULL) { + // break; + // } else { + // qids[ind] = qids[--count]; + // } + // } + // if (res != NULL) break; + // } } /* If we failed, take a short nap. */ @@ -3282,7 +3288,7 @@ void scheduler_free_tasks(struct scheduler *s) { } s->size = 0; s->nr_tasks = 0; - //reset GPU task counters too + // reset GPU task counters too s->nr_self_pack_tasks = 0; s->nr_self_pack_tasks_f = 0; s->nr_self_pack_tasks_g = 0; diff --git a/src/scheduler.h b/src/scheduler.h index 16ef975be8..23363d9eb6 100644 --- a/src/scheduler.h +++ b/src/scheduler.h @@ -61,7 +61,7 @@ extern int activate_by_unskip; /* Data of a scheduler. */ struct scheduler { - int nr_packs_self_dens_done; //A. Nasar + int nr_packs_self_dens_done; // A. 
Nasar int nr_packs_pair_dens_done; int nr_packs_self_forc_done; int nr_packs_pair_forc_done; diff --git a/src/space_getsid.h b/src/space_getsid.h index 8b115a251d..f5e0101d30 100644 --- a/src/space_getsid.h +++ b/src/space_getsid.h @@ -78,22 +78,21 @@ space_getsid_and_swap_cells(const struct space *s, struct cell **ci, return sid; } -__attribute__((always_inline, nonnull)) INLINE static int // A. Nasar Same as usual but only used to pack GPU cells -space_getsid_GPU(const struct space *s, struct cell **ci, - struct cell **cj, double *shift_x, double *shift_y, - double *shift_z) { +__attribute__((always_inline, nonnull)) +INLINE static int // A. Nasar Same as usual but only used to pack GPU cells +space_getsid_GPU(const struct space *s, struct cell **ci, struct cell **cj, + double *shift_x, double *shift_y, double *shift_z) { /* Get the relative distance between the pairs, wrapping. */ const int periodic = s->periodic; double dx[3]; - for(int k = 0; k < 3; k++) - dx[k] = (*cj)->loc[k] - (*ci)->loc[k]; + for (int k = 0; k < 3; k++) dx[k] = (*cj)->loc[k] - (*ci)->loc[k]; if (periodic && dx[0] < -s->dim[0] / 2) *(shift_x) = s->dim[0]; else if (periodic && dx[0] > s->dim[0] / 2) *(shift_x) = -s->dim[0]; else - *(shift_x) = 0.0; + *(shift_x) = 0.0; dx[0] += *(shift_x); @@ -102,7 +101,7 @@ space_getsid_GPU(const struct space *s, struct cell **ci, else if (periodic && dx[1] > s->dim[1] / 2) *(shift_y) = -s->dim[1]; else - *(shift_y) = 0.0; + *(shift_y) = 0.0; dx[1] += *(shift_y); @@ -111,7 +110,7 @@ space_getsid_GPU(const struct space *s, struct cell **ci, else if (periodic && dx[2] > s->dim[2] / 2) *(shift_z) = -s->dim[2]; else - *(shift_z) = 0.0; + *(shift_z) = 0.0; dx[2] += *(shift_z); @@ -135,9 +134,9 @@ space_getsid_GPU(const struct space *s, struct cell **ci, return sid; } -__attribute__((always_inline, nonnull)) INLINE static int -space_getsid_filter(const struct space *s, struct cell **ci, struct cell **cj, - double shift[3]) { +__attribute__((always_inline, nonnull)) INLINE static int space_getsid_filter( + const struct space *s, struct cell **ci, struct cell **cj, + double shift[3]) { /* Get the relative distance between the pairs, wrapping. */ const int periodic = s->periodic; diff --git a/src/space_recycle.c b/src/space_recycle.c index 47ed2e43d7..0b915ac7a2 100644 --- a/src/space_recycle.c +++ b/src/space_recycle.c @@ -232,7 +232,7 @@ void space_rebuild_recycle_mapper(void *map_data, int num_elements, c->mpi.recv = NULL; c->mpi.send = NULL; #endif - c->hydro.density_pack = NULL; // A. Nasar + c->hydro.density_pack = NULL; // A. Nasar c->hydro.density_unpack = NULL; c->hydro.gradient_pack = NULL; c->hydro.gradient_unpack = NULL; diff --git a/src/space_regrid.c b/src/space_regrid.c index 487fe7c0e3..95fa4d9cd9 100644 --- a/src/space_regrid.c +++ b/src/space_regrid.c @@ -388,7 +388,7 @@ void space_regrid(struct space *s, int verbose) { // message( "rebuilding upper-level cells took %.3f %s." , // clocks_from_ticks(double)(getticks() - tic), clocks_getunit()); - } /* re-build upper-level cells? */ + } /* re-build upper-level cells? */ else { /* Otherwise, just clean up the cells. */ /* Free the old cells, if they were allocated. */ diff --git a/src/space_split.c b/src/space_split.c index 43e020d5e5..0b79d5b23f 100644 --- a/src/space_split.c +++ b/src/space_split.c @@ -439,7 +439,7 @@ void space_split_recursive(struct space *s, struct cell *c, gravity_multipole_compute_power(&c->grav.multipole->m_pole); } /* Deal with gravity */ - } /* Split or let it be? */ + } /* Split or let it be? 
*/ /* Otherwise, collect the data from the particles this cell. */ else { diff --git a/src/task.c b/src/task.c index 29093634c0..1f8f0712c6 100644 --- a/src/task.c +++ b/src/task.c @@ -164,12 +164,12 @@ const char *subtaskID_names[task_subtype_count] = { "sink_do_gas_swallow", "rt_gradient", "rt_transport", - "gpu_pack", // A. Nasar - "gpu_pack_g", - "gpu_pack_f", - "gpu_unpack", - "gpu_unpack_g", - "gpu_unpack_f", + "gpu_pack", // A. Nasar + "gpu_pack_g", + "gpu_pack_f", + "gpu_unpack", + "gpu_unpack_g", + "gpu_unpack_f", }; const char *task_category_names[task_category_count] = { @@ -193,22 +193,22 @@ MPI_Comm subtaskMPI_comms[task_subtype_count]; * @param ARRAY is the array of this specific type. * @param COUNT is the number of elements in the array. */ -#define TASK_CELL_OVERLAP(TYPE, ARRAY, COUNT) \ - __attribute__((always_inline)) INLINE static size_t \ - task_cell_overlap_##TYPE(const struct cell *restrict ci, \ - const struct cell *restrict cj) { \ - \ - if (ci == NULL || cj == NULL) return 0; \ - \ - if (ci->ARRAY <= cj->ARRAY && \ - ci->ARRAY + ci->COUNT >= cj->ARRAY + cj->COUNT) { \ - return cj->COUNT; \ - } else if (cj->ARRAY <= ci->ARRAY && \ - cj->ARRAY + cj->COUNT >= ci->ARRAY + ci->COUNT) { \ - return ci->COUNT; \ - } \ - \ - return 0; \ +#define TASK_CELL_OVERLAP(TYPE, ARRAY, COUNT) \ + __attribute__((always_inline)) \ + INLINE static size_t task_cell_overlap_##TYPE( \ + const struct cell *restrict ci, const struct cell *restrict cj) { \ + \ + if (ci == NULL || cj == NULL) return 0; \ + \ + if (ci->ARRAY <= cj->ARRAY && \ + ci->ARRAY + ci->COUNT >= cj->ARRAY + cj->COUNT) { \ + return cj->COUNT; \ + } else if (cj->ARRAY <= ci->ARRAY && \ + cj->ARRAY + cj->COUNT >= ci->ARRAY + ci->COUNT) { \ + return ci->COUNT; \ + } \ + \ + return 0; \ } TASK_CELL_OVERLAP(part, hydro.parts, hydro.count); @@ -605,20 +605,21 @@ void task_unlock(struct task *t) { cell_unlocktree(ci); #endif } else if (subtype == task_subtype_gpu_unpack) { -// for(int pp = 0; pp < 128 /*should be sched->pack_size*/; pp++){ -// cell_unlocktree(t->ci_unpack[pp]); -// } - /*Do nothing and be on your way*/ + // for(int pp = 0; pp < 128 /*should be sched->pack_size*/; + // pp++){ + // cell_unlocktree(t->ci_unpack[pp]); + // } + /*Do nothing and be on your way*/ } else if (subtype == task_subtype_gpu_unpack_f) { - /*Do nothing and be on your way*/ + /*Do nothing and be on your way*/ } else if (subtype == task_subtype_gpu_unpack_g) { - /*Do nothing and be on your way*/ + /*Do nothing and be on your way*/ } else if (subtype == task_subtype_gpu_pack) { - cell_unlocktree(ci); + cell_unlocktree(ci); } else if (subtype == task_subtype_gpu_pack_f) { - cell_unlocktree(ci); + cell_unlocktree(ci); } else if (subtype == task_subtype_gpu_pack_g) { - cell_unlocktree(ci); + cell_unlocktree(ci); } else { /* hydro */ cell_unlocktree(ci); } @@ -667,14 +668,14 @@ void task_unlock(struct task *t) { cell_unlocktree(cj); #endif } else if (subtype == task_subtype_gpu_pack) { - cell_unlocktree(ci); - cell_unlocktree(cj); + cell_unlocktree(ci); + cell_unlocktree(cj); } else if (subtype == task_subtype_gpu_pack_f) { - cell_unlocktree(ci); - cell_unlocktree(cj); + cell_unlocktree(ci); + cell_unlocktree(cj); } else if (subtype == task_subtype_gpu_pack_g) { - cell_unlocktree(ci); - cell_unlocktree(cj); + cell_unlocktree(ci); + cell_unlocktree(cj); } else if (subtype == task_subtype_gpu_unpack) { /* Nothing to do */ } else if (subtype == task_subtype_gpu_unpack_f) { @@ -886,33 +887,28 @@ int task_lock(struct task *t) { #endif } else if (subtype 
== task_subtype_gpu_pack) { /* Attempt to lock the cell */ - if (ci->hydro.hold) - return 0; - if (cell_locktree(ci) != 0) - return 0; + if (ci->hydro.hold) return 0; + if (cell_locktree(ci) != 0) return 0; } else if (subtype == task_subtype_gpu_pack_f) { /* Attempt to lock the cell */ - if (ci->hydro.hold) - return 0; - if (cell_locktree(ci) != 0) - return 0; + if (ci->hydro.hold) return 0; + if (cell_locktree(ci) != 0) return 0; } else if (subtype == task_subtype_gpu_pack_g) { /* Attempt to lock the cell */ - if (ci->hydro.hold) - return 0; - if (cell_locktree(ci) != 0) - return 0; + if (ci->hydro.hold) return 0; + if (cell_locktree(ci) != 0) return 0; } else if (subtype == task_subtype_gpu_unpack) { -// for(int pp = 0; pp < 128 /*should be sched->pack_size*/; pp++){ -// if (t->ci_unpack[pp]->gpu_done == 0){ -// message("trying to queue an unpack before all packs done on GPU"); -// return 0; -// } -//// if (t->ci_unpack[pp]->hydro.hold) -//// return 0; -//// if (cell_locktree(t->ci_unpack[pp]) != 0) -//// return 0; -// } + // for(int pp = 0; pp < 128 /*should be sched->pack_size*/; + // pp++){ + // if (t->ci_unpack[pp]->gpu_done == 0){ + // message("trying to queue an unpack before all packs + // done on GPU"); return 0; + // } + //// if (t->ci_unpack[pp]->hydro.hold) + //// return 0; + //// if (cell_locktree(t->ci_unpack[pp]) != 0) + //// return 0; + // } /* Nothing to do here */ return 1; } else if (subtype == task_subtype_gpu_unpack_f) { @@ -1039,30 +1035,24 @@ int task_lock(struct task *t) { #endif } else if (subtype == task_subtype_gpu_pack) { /* Lock the parts in both cells */ - if (ci->hydro.hold || cj->hydro.hold) - return 0; - if (cell_locktree(ci) != 0) - return 0; + if (ci->hydro.hold || cj->hydro.hold) return 0; + if (cell_locktree(ci) != 0) return 0; if (cell_locktree(cj) != 0) { cell_unlocktree(ci); return 0; } } else if (subtype == task_subtype_gpu_pack_f) { /* Lock the parts in both cells */ - if (ci->hydro.hold || cj->hydro.hold) - return 0; - if (cell_locktree(ci) != 0) - return 0; + if (ci->hydro.hold || cj->hydro.hold) return 0; + if (cell_locktree(ci) != 0) return 0; if (cell_locktree(cj) != 0) { cell_unlocktree(ci); return 0; } } else if (subtype == task_subtype_gpu_pack_g) { /* Lock the parts in both cells */ - if (ci->hydro.hold || cj->hydro.hold) - return 0; - if (cell_locktree(ci) != 0) - return 0; + if (ci->hydro.hold || cj->hydro.hold) return 0; + if (cell_locktree(ci) != 0) return 0; if (cell_locktree(cj) != 0) { cell_unlocktree(ci); return 0; @@ -1880,7 +1870,7 @@ enum task_categories task_get_category(const struct task *t) { case task_subtype_density: case task_subtype_gradient: case task_subtype_force: - case task_subtype_gpu_pack: // A. Nasar + case task_subtype_gpu_pack: // A. Nasar case task_subtype_gpu_unpack: case task_subtype_gpu_pack_f: case task_subtype_gpu_unpack_f: diff --git a/src/task.h b/src/task.h index 68c495cf17..af5332641d 100644 --- a/src/task.h +++ b/src/task.h @@ -160,7 +160,7 @@ enum task_subtypes { task_subtype_sink_do_gas_swallow, task_subtype_rt_gradient, task_subtype_rt_transport, - task_subtype_gpu_pack, // A. Nasar + task_subtype_gpu_pack, // A. Nasar task_subtype_gpu_pack_g, task_subtype_gpu_pack_f, task_subtype_gpu_unpack, @@ -241,14 +241,14 @@ struct task { /*! Pointers to the cells this task acts upon */ struct cell *ci, *cj; - int done; // A. Nasar + int done; // A. Nasar int gpu_done; int corner_pair; /*! 
Pointers to the cells this task acts upon */ - struct cell **ci_unpack;//, **cj; + struct cell **ci_unpack; //, **cj; /*! List of tasks unlocked by this one */ struct task **unlock_tasks; From 780553d7ef48d117b346de222a37cf67407e3a0d Mon Sep 17 00:00:00 2001 From: Matthieu Schaller Date: Tue, 5 Nov 2024 10:48:14 +0100 Subject: [PATCH 041/217] Added new ifdef-controls to offload only the density/gradient/force hydro tasks to the GPU --- src/runner_main_clean.cu | 97 +++++++++++++++++++++++++--------------- 1 file changed, 60 insertions(+), 37 deletions(-) diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 2bf1352fe8..d4edbe7047 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -19,7 +19,11 @@ * ******************************************************************************/ /* Config parameters. */ -#define GPUOFFLOAD 1 // off-load hydro to GPU +#define GPUOFFLOAD_DENSITY 1 // off-load hydro density to GPU +#define GPUOFFLOAD_GRADIENT 1 // off-load hydro density to GPU +#define GPUOFFLOAD_FORCE 1 // off-load hydro density to GPU + + // #define DO_CORNERS 1 //do corner pair tasks on CPU // #define DUMP_TIMINGS 1 #include "../config.h" @@ -1015,7 +1019,7 @@ void *runner_main2(void *data) { unpacked_f++; } else if (t->subtype == task_subtype_density) { cpu_self++; -#ifndef GPUOFFLOAD +#ifndef GPUOFFLOAD_DENSITY struct timespec t0, t1, dt; clock_gettime(CLOCK_REALTIME, &t0); runner_doself1_branch_density(r, ci); @@ -1028,7 +1032,7 @@ void *runner_main2(void *data) { /* GPU WORK */ } else if (t->subtype == task_subtype_gpu_pack) { packed_self++; -#ifdef GPUOFFLOAD +#ifdef GPUOFFLOAD_DENSITY // struct timespec t0, t1; // // clock_gettime(CLOCK_REALTIME, &t0); packing_time += @@ -1060,11 +1064,11 @@ void *runner_main2(void *data) { //stream, d_a, d_H, e, &packing_time, &time_for_density_gpu, // &tot_time_for_hard_memcpys); } /*End of GPU work Self*/ -#endif // GPUDENSSELF +#endif } /* self / pack */ else if (t->subtype == task_subtype_gpu_pack_g) { packed_self_g++; -#ifdef GPUOFFLOAD +#ifdef GPUOFFLOAD_GRADIENT // runner_doself1_pack_g(r, sched, pack_vars_self_grad, ci, // t, parts_aos_grad, &packing_time_g); packing_time_g += runner_doself1_pack_f4_g( @@ -1091,7 +1095,7 @@ void *runner_main2(void *data) { #endif // GPUGRADSELF } else if (t->subtype == task_subtype_gpu_pack_f) { packed_self_f++; -#ifdef GPUOFFLOAD +#ifdef GPUOFFLOAD_FORCE // runner_doself1_pack_f(r, sched, pack_vars_self_forc, ci, // t, parts_aos_forc, &packing_time_f); packing_time_f += runner_doself1_pack_f4_f( @@ -1121,12 +1125,12 @@ void *runner_main2(void *data) { &packing_time_f, &time_for_gpu_f, task_first_part_f4_f, d_task_first_part_f4_f, self_end_f, &unpack_time_self_f); } /*End of GPU work Self*/ -#endif // GPUFORCSELF +#endif } #ifdef EXTRA_HYDRO_LOOP else if (t->subtype == task_subtype_gradient) { cpu_self_g++; -#ifndef GPUOFFLOAD +#ifndef GPUOFFLOAD_GRADIENT struct timespec t0, t1, dt; clock_gettime(CLOCK_REALTIME, &t0); runner_doself1_branch_gradient(r, ci); @@ -1134,12 +1138,12 @@ void *runner_main2(void *data) { tasks_done_cpu++; time_for_cpu_g += (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; -#endif // GPUGRADSELF +#endif } #endif else if (t->subtype == task_subtype_force) { cpu_self_f++; -#ifndef GPUOFFLOAD +#ifndef GPUOFFLOAD_FORCE struct timespec t0, t1; clock_gettime(CLOCK_REALTIME, &t0); runner_doself2_branch_force(r, ci); @@ -1147,7 +1151,7 @@ void *runner_main2(void *data) { tasks_done_cpu++; time_for_cpu_f += (t1.tv_sec - t0.tv_sec) + 
(t1.tv_nsec - t0.tv_nsec) / 1000000000.0; -#endif // GPUFORCSELF +#endif } else if (t->subtype == task_subtype_limiter) runner_doself1_branch_limiter(r, ci); else if (t->subtype == task_subtype_grav) @@ -1194,7 +1198,7 @@ void *runner_main2(void *data) { /* Abouzied: To be commented out when the GPU pairs have been coded * up */ cpu_pair++; -#ifndef GPUOFFLOAD +#ifndef GPUOFFLOAD_DENSITY struct timespec t0, t1, dt; clock_gettime(CLOCK_REALTIME, &t0); runner_dopair1_branch_density(r, ci, cj); @@ -1208,7 +1212,7 @@ void *runner_main2(void *data) { /* GPU WORK */ else if (t->subtype == task_subtype_gpu_pack) { packed_pair++; -#ifdef GPUOFFLOAD +#ifdef GPUOFFLOAD_DENSITY #ifdef DO_CORNERS struct timespec t0, t1, dt; clock_gettime(CLOCK_REALTIME, &t0); @@ -1282,11 +1286,11 @@ void *runner_main2(void *data) { #ifdef DO_CORNERS } /* End of GPU work Pairs */ #endif // DO_CORNERS -#endif // GPUDENS +#endif // GPUOFFLOAD_DENSITY } /* pair / pack */ else if (t->subtype == task_subtype_gpu_pack_g) { packed_pair_g++; -#ifdef GPUOFFLOAD +#ifdef GPUOFFLOAD_GRADIENT #ifdef DO_CORNERS struct timespec t0, t1, dt; clock_gettime(CLOCK_REALTIME, &t0); @@ -1362,10 +1366,10 @@ void *runner_main2(void *data) { #ifdef DO_CORNERS } /* End of GPU work Pairs */ #endif // DO_CORNERS -#endif // GPUGRADPAIR +#endif // GPUOFFLOAD_GRADIENT } else if (t->subtype == task_subtype_gpu_pack_f) { packed_pair_f++; -#ifdef GPUOFFLOAD +#ifdef GPUOFFLOAD_FORCE #ifdef DO_CORNERS struct timespec t0, t1, dt; clock_gettime(CLOCK_REALTIME, &t0); @@ -1438,7 +1442,7 @@ void *runner_main2(void *data) { #ifdef DO_CORNERS } #endif // DO_CORNERS -#endif // GPUFORCPAIR +#endif // GPUOFFLOAD_FORCE } else if (t->subtype == task_subtype_gpu_unpack) { unpacked_pair++; } else if (t->subtype == task_subtype_gpu_unpack_g) { @@ -1449,7 +1453,7 @@ void *runner_main2(void *data) { #ifdef EXTRA_HYDRO_LOOP else if (t->subtype == task_subtype_gradient) { int Do_nothing = 0; -#ifndef GPUOFFLOAD +#ifndef GPUOFFLOAD_GRADIENT struct timespec t0, t1, dt; clock_gettime(CLOCK_REALTIME, &t0); runner_dopair1_branch_gradient(r, ci, cj); @@ -1457,12 +1461,12 @@ void *runner_main2(void *data) { tasks_done_cpu++; time_for_cpu_pair_g += (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; -#endif // GPUGRADPAIR +#endif } #endif // EXTRA_HYDRO_LOOP else if (t->subtype == task_subtype_force) { int Do_nothing = 0; -#ifndef GPUOFFLOAD +#ifndef GPUOFFLOAD_FORCE struct timespec t0, t1, dt; clock_gettime(CLOCK_REALTIME, &t0); runner_dopair2_branch_force(r, ci, cj); @@ -1470,7 +1474,7 @@ void *runner_main2(void *data) { tasks_done_cpu++; time_for_cpu_pair_f += (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; -#endif // GPUFORCPAIR +#endif // GPUOFFLOAD_FORCE } else if (t->subtype == task_subtype_limiter) runner_dopair1_branch_limiter(r, ci, cj); else if (t->subtype == task_subtype_grav) @@ -1853,23 +1857,42 @@ void *runner_main2(void *data) { /* We're done with this task, see if we get a next one. */ prev = t; -#ifdef GPUOFFLOAD - // if (t->type == task_type_self && t->subtype == - // task_subtype_gpu_pack){ - if (t->subtype == task_subtype_gpu_pack || - t->subtype == task_subtype_gpu_pack_g || - t->subtype == task_subtype_gpu_pack_f) { + + if (t->subtype == task_subtype_gpu_pack) { +#ifdef GPUOFFLOAD_DENSITY /* Don't enqueue unpacks yet. 
Just signal the runners */ t->skip = 1; t = NULL; - // if(t->gpu_done == 0)message("Missed packing a GPU tasks\n"); - } else { /* Mark task as done, as per usual */ - t = scheduler_done(sched, t); +#else + t = scheduler_done(sched, t); +#endif } -#else // GPUOFFLOAD - t = scheduler_done(sched, t); -#endif // GPUOFFLOAD + if (t->subtype == task_subtype_gpu_pack_g) { +#ifdef GPUOFFLOAD_GRADIENT + /* Don't enqueue unpacks yet. Just signal the runners */ + t->skip = 1; + t = NULL; +#else + t = scheduler_done(sched, t); +#endif + } + + if (t->subtype == task_subtype_gpu_pack_f) { +#ifdef GPUOFFLOAD_FORCE + /* Don't enqueue unpacks yet. Just signal the runners */ + t->skip = 1; + t = NULL; +#else + t = scheduler_done(sched, t); + } +#endif + + if (t->subtype != task_subtype_gpu_pack && + t->subtype != task_subtype_gpu_pack_g && + t->subtype != task_subtype_gpu_pack_f) { + t = scheduler_done(sched, t); + } } /* main loop. */ // Stuff for writing debug data to file for validation //// if (step % 10 == 0 || step == 1) { @@ -1901,7 +1924,7 @@ void *runner_main2(void *data) { /*Output compute times to separate files. cat later into one file*/ // if (step % 11 == 0 || step == 1) { #ifdef DUMP_TIMINGS -#ifdef GPUOFFLOAD +#if defined(GPUOFFLOAD_DENSITY) || defined(GPUOFFLOAD_GRADIENT) || defined(GPUOFFLOAD_FORCE) // char buffer[30]; // snprintf(buffer, sizeof(buffer), "t%d_stepnfullbundles%d", // r->cpuid, step); FILE *fullbundles = fopen(buffer, "w"); @@ -1941,7 +1964,7 @@ void *runner_main2(void *data) { /////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////// -#else // GPUOFFLOAD +#else // No GPU offload if (r->cpuid == 0 && engine_rank == 0) fprintf(fgpu_steps, "CPU TIME SELF, CPU TIME PAIR, " @@ -1955,7 +1978,7 @@ void *runner_main2(void *data) { fprintf(fgpu_steps, "%e, %e, %e, %e, %e, %e,\n", time_for_density_cpu, time_for_density_cpu_pair, time_for_cpu_f, time_for_cpu_pair_f, time_for_cpu_g, time_for_cpu_pair_g); -#endif // GPUOFFLOAD +#endif #endif // DUMPTIMINGS // } fflush(fgpu_steps); From 3b58f55b8ebd08e9563309c16fb323623c361d04 Mon Sep 17 00:00:00 2001 From: Matthieu Schaller Date: Tue, 5 Nov 2024 10:49:12 +0100 Subject: [PATCH 042/217] Added new ifdef-controls to offload only the density/gradient/force hydro tasks to the GPU --- src/cuda/GPU_runner_functions.cu | 36 +- .../host_device_data_transfer.cu | 2 +- src/runner_main_clean.cu | 560 +++++++++--------- 3 files changed, 301 insertions(+), 297 deletions(-) diff --git a/src/cuda/GPU_runner_functions.cu b/src/cuda/GPU_runner_functions.cu index 082becaaa9..6197c0231b 100644 --- a/src/cuda/GPU_runner_functions.cu +++ b/src/cuda/GPU_runner_functions.cu @@ -80,7 +80,7 @@ __global__ void tester(struct part_soa parts_soa, int *d_task_first_part, } // if(parts_soa.tid_p[pid] == 1 && pid < last_part_in_task_blocks) // printf("tid %i last_part_in_blocks %i\n", parts_soa.tid_p[pid], - //last_part_in_task_blocks); + // last_part_in_task_blocks); } #ifdef WITH_CUDA } @@ -859,7 +859,7 @@ __global__ void DOSELF_GPU_AOS_F4_G( } if (pid < last_part_in_task_blocks) { // printf("v %f lap %f maxvisc %f\n", vsig_lapu_aviscmax_empty_i.x, - //vsig_lapu_aviscmax_empty_i.y, vsig_lapu_aviscmax_empty_i.z); + // vsig_lapu_aviscmax_empty_i.y, vsig_lapu_aviscmax_empty_i.z); parts_recv[pid].vsig_lapu_aviscmax = vsig_lapu_aviscmax_i; } } @@ -1739,7 +1739,7 @@ __device__ void DOPAIR2NONSYMGPU(struct part_soa parts_soa, int pid, // printf("first_part_in_task_blocks_cj %i last_part_in_task_blocks_cj 
%i // last_part_in_task_blocks_ci %i\n", // first_part_in_task_blocks_cj, last_part_in_task_blocks_cj, - //last_part_in_task_blocks_ci); + // last_part_in_task_blocks_ci); if (pid < ci_end) { hi = parts_soa.h[pid], hig2 = hi * hi * kernel_gamma2; mi = parts_soa.mass[pid]; @@ -1794,7 +1794,7 @@ __device__ void DOPAIR2NONSYMGPU(struct part_soa parts_soa, int pid, const float xij = pix - pjx, yij = piy - pjy, zij = piz - pjz; // const float xij = (pix - pjx) * flip_order, yij = (piy - - //pjy) * flip_order, zij = (piz - pjz) * flip_order; + // pjy) * flip_order, zij = (piz - pjz) * flip_order; const float r2 = xij * xij + yij * yij + zij * zij; if (r2 < hig2) { /* Recover some data */ @@ -1883,7 +1883,7 @@ __device__ void DOPAIR2NONSYMGPUAOS(struct part_aos *parts_aos, int pid, // printf("first_part_in_task_blocks_cj %i last_part_in_task_blocks_cj %i // last_part_in_task_blocks_ci %i\n", // first_part_in_task_blocks_cj, last_part_in_task_blocks_cj, - //last_part_in_task_blocks_ci); + // last_part_in_task_blocks_ci); if (pid < ci_end) { hi = parts_aos[pid].h, hig2 = hi * hi * kernel_gamma2; mi = parts_aos[pid].mass; @@ -1937,7 +1937,7 @@ __device__ void DOPAIR2NONSYMGPUAOS(struct part_aos *parts_aos, int pid, const float xij = pix - pjx, yij = piy - pjy, zij = piz - pjz; // const float xij = (pix - pjx) * flip_order, yij = (piy - - //pjy) * flip_order, zij = (piz - pjz) * flip_order; + // pjy) * flip_order, zij = (piz - pjz) * flip_order; const float r2 = xij * xij + yij * yij + zij * zij; if (r2 < hig2) { /* Recover some data */ @@ -1973,7 +1973,7 @@ __device__ void DOPAIR2NONSYMGPUAOS(struct part_aos *parts_aos, int pid, rot_uyi += faci * curlvry; rot_uzi += faci * curlvrz; // if(timebin[j_block] != 1000 && timebin[j_block] != - //20)printf("incorrect timebin %i\n", timebin[j_block]); + // 20)printf("incorrect timebin %i\n", timebin[j_block]); } } /*if (jj < cj_end && pid < ci_end && pid >= ci_start)*/ } /*End of looping through particles in shared memory---Shared arrays @@ -2019,7 +2019,7 @@ __device__ void DOPAIR2NONSYMGPUAOSF4( // printf("first_part_in_task_blocks_cj %i last_part_in_task_blocks_cj %i // last_part_in_task_blocks_ci %i\n", // first_part_in_task_blocks_cj, last_part_in_task_blocks_cj, - //last_part_in_task_blocks_ci); + // last_part_in_task_blocks_ci); // if (pid < ci_end) { hi = x_pi.w, hig2 = hi * hi * kernel_gamma2; // } @@ -2121,7 +2121,7 @@ __device__ void DOPAIR2NAIVEGPUAOSF4( // printf("first_part_in_task_blocks_cj %i last_part_in_task_blocks_cj %i // last_part_in_task_blocks_ci %i\n", // first_part_in_task_blocks_cj, last_part_in_task_blocks_cj, - //last_part_in_task_blocks_ci); + // last_part_in_task_blocks_ci); // if (pid < ci_end) { hi = x_pi.w, hig2 = hi * hi * kernel_gamma2; // } @@ -2171,7 +2171,7 @@ __device__ void DOPAIR2NAIVEGPUAOSF4( res_rot.w -= faci * dvdr; } } /*Loop through parts in cell j one BLOCK_SIZE at a time*/ - // if (pid >= ci_start && pid < ci_end) { + // if (pid >= ci_start && pid < ci_end) { parts_recv[pid].rho_dh_wcount = res_rho; parts_recv[pid].rot_ux_div_v = res_rot; // } @@ -2361,7 +2361,7 @@ __device__ void DOPAIR2NAIVEGPUAOSF4G( // printf("first_part_in_task_blocks_cj %i last_part_in_task_blocks_cj %i // last_part_in_task_blocks_ci %i\n", // first_part_in_task_blocks_cj, last_part_in_task_blocks_cj, - //last_part_in_task_blocks_ci); + // last_part_in_task_blocks_ci); // if (pid < ci_end) { hi = x_h_i.w, hig2 = hi * hi * kernel_gamma2; // } @@ -2419,7 +2419,7 @@ __device__ void DOPAIR2NAIVEGPUAOSF4G( vsig_lapu_aviscmax_i.z = 
fmaxf(vsig_lapu_aviscmax_i.z, alpha_j); } } /*Loop through parts in cell j one BLOCK_SIZE at a time*/ - // if (pid >= ci_start && pid < ci_end) { + // if (pid >= ci_start && pid < ci_end) { parts_recv[pid].vsig_lapu_aviscmax = vsig_lapu_aviscmax_i; // } } @@ -2840,7 +2840,7 @@ __device__ void DOPAIR2NAIVEGPUAOSF4F( // printf("Got in\n"); } } /*Loop through parts in cell j one BLOCK_SIZE at a time*/ - // if (pid >= ci_start && pid < ci_end) { + // if (pid >= ci_start && pid < ci_end) { udt_hdt_vsig_mintbinngb.w = f_b_t_mintbinngb_i.w; parts_recv[pid].udt_hdt_vsig_mintimebin_ngb = udt_hdt_vsig_mintbinngb; parts_recv[pid].a_hydro = ahydro; @@ -2887,7 +2887,7 @@ __device__ void DOPAIR2GPU(struct part_soa parts_soa, int pid, // printf("first_part_in_task_blocks_cj %i last_part_in_task_blocks_cj %i // last_part_in_task_blocks_ci %i\n", // first_part_in_task_blocks_cj, last_part_in_task_blocks_cj, - //last_part_in_task_blocks_ci); + // last_part_in_task_blocks_ci); if (pid < ci_end) { cellx = parts_soa.locx[pid]; @@ -3033,7 +3033,7 @@ __device__ void DOPAIR2GPU(struct part_soa parts_soa, int pid, // atomicAdd(&wcount_tmp[j_block], wj); atomicAdd(&parts_soa.wcount[j], wj); // atomicAdd(&wcount_dh_tmp[j_block], -(hydro_dimension * - //wj + uj * wj_dx)); + // wj + uj * wj_dx)); atomicAdd(&parts_soa.wcount_dh[j], -(hydro_dimension * wj + uj * wj_dx)); @@ -3050,8 +3050,8 @@ __device__ void DOPAIR2GPU(struct part_soa parts_soa, int pid, atomicAdd(&parts_soa.rot_uy[j], facj * curlvry); atomicAdd(&parts_soa.rot_uz[j], facj * curlvrz); // printf("rho %f rho_dh %f wcount %f wcount_dh %f div_v - //%f rotux %f rotuy %f rotuz %f\n" ,rhoi, rho_dhi, wcounti, - //wcount_dhi, div_vi, rot_uxi, rot_uyi, rot_uzi); + //%f rotux %f rotuy %f rotuz %f\n" ,rhoi, + //rho_dhi, wcounti, wcount_dhi, div_vi, rot_uxi, rot_uyi, rot_uzi); } /*if r2= ci_start)*/ } /*End of looping through particles in shared memory---Shared arrays @@ -3677,7 +3677,7 @@ void runner_dopair1_branch_density_gpu( // fprintf(stderr,"max_parts %i, max_partsi %i, max_partsj %i\n, " // "numBlocks_x %i, numBlocks_y %i, BLOCK_SIZE %i\n", max_parts, // max_parts_i, max_parts_j, numBlocks_x, numBlocks_y, - //BLOCK_SIZE); + // BLOCK_SIZE); /*Do ci & cj*/ // fprintf(stderr, "BLOCK_SIZE %i max parts %i num idle threads %i\n", diff --git a/src/files_for_new_functions/host_device_data_transfer.cu b/src/files_for_new_functions/host_device_data_transfer.cu index 63748e7e18..ede719529b 100644 --- a/src/files_for_new_functions/host_device_data_transfer.cu +++ b/src/files_for_new_functions/host_device_data_transfer.cu @@ -287,7 +287,7 @@ void host2device_async_density_pair( // cudaMemcpyAsync(&tid_p[first_part_tmp], // &(parts_soa_buffer.tid_p[first_part_tmp]), // bundle_n_parts * sizeof(int), - //cudaMemcpyHostToDevice, stream); + // cudaMemcpyHostToDevice, stream); // cudaMemcpyAsync(&locx[first_part_tmp], // &(parts_soa_buffer.locx[first_part_tmp]), // bundle_n_parts * sizeof(float), diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index d4edbe7047..6683e9cf5c 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -19,10 +19,9 @@ * ******************************************************************************/ /* Config parameters. 
*/ -#define GPUOFFLOAD_DENSITY 1 // off-load hydro density to GPU +#define GPUOFFLOAD_DENSITY 1 // off-load hydro density to GPU #define GPUOFFLOAD_GRADIENT 1 // off-load hydro density to GPU -#define GPUOFFLOAD_FORCE 1 // off-load hydro density to GPU - +#define GPUOFFLOAD_FORCE 1 // off-load hydro density to GPU // #define DO_CORNERS 1 //do corner pair tasks on CPU // #define DUMP_TIMINGS 1 @@ -785,13 +784,13 @@ void *runner_main2(void *data) { 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_g_recv)); ///////////Probably not needed - ///anymore//////////////////////////////////////////////////////////////// + /// anymore//////////////////////////////////////////////////////////////// cudaMalloc((void **)&d_parts_aos_pair_forc, 2 * count_max_parts_tmp * sizeof(struct part_aos_f)); cudaMalloc((void **)&d_parts_aos_pair_grad, 2 * count_max_parts_tmp * sizeof(struct part_aos_g)); ///////////Probably not needed - ///anymore//////////////////////////////////////////////////////////////// + /// anymore//////////////////////////////////////////////////////////////// // cudaMallocHost((void **)&parts_aos_pair_dens, 2 * count_max_parts_tmp * // sizeof(struct part_aos)); @@ -1060,12 +1059,12 @@ void *runner_main2(void *data) { &unpack_time_self, task_first_part_self_dens_f4, devId, task_first_part_f4, d_task_first_part_f4, self_end); // runner_doself1_launch(r, sched, - //pack_vars_self_dens, ci, t, parts_aos_dens, d_parts_aos_dens, - //stream, d_a, d_H, e, &packing_time, &time_for_density_gpu, - // &tot_time_for_hard_memcpys); + // pack_vars_self_dens, ci, t, parts_aos_dens, + // d_parts_aos_dens, stream, d_a, d_H, e, &packing_time, + // &time_for_density_gpu, &tot_time_for_hard_memcpys); } /*End of GPU work Self*/ -#endif - } /* self / pack */ +#endif + } /* self / pack */ else if (t->subtype == task_subtype_gpu_pack_g) { packed_self_g++; #ifdef GPUOFFLOAD_GRADIENT @@ -1116,8 +1115,9 @@ void *runner_main2(void *data) { if (launch || launch_leftovers) { /*Launch GPU tasks*/ // runner_doself1_launch_f(r, sched, - // pack_vars_self_forc, ci, t, parts_aos_forc, d_parts_aos_forc, - // stream, d_a, d_H, e, &packing_time_f, &time_for_gpu_f); + // pack_vars_self_forc, ci, t, parts_aos_forc, + // d_parts_aos_forc, stream, d_a, d_H, e, &packing_time_f, + // &time_for_gpu_f); runner_doself1_launch_f4_f( r, sched, pack_vars_self_forc, ci, t, parts_aos_forc_f4_send, parts_aos_forc_f4_recv, d_parts_aos_forc_f4_send, @@ -1125,7 +1125,7 @@ void *runner_main2(void *data) { &packing_time_f, &time_for_gpu_f, task_first_part_f4_f, d_task_first_part_f4_f, self_end_f, &unpack_time_self_f); } /*End of GPU work Self*/ -#endif +#endif } #ifdef EXTRA_HYDRO_LOOP else if (t->subtype == task_subtype_gradient) { @@ -1138,7 +1138,7 @@ void *runner_main2(void *data) { tasks_done_cpu++; time_for_cpu_g += (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; -#endif +#endif } #endif else if (t->subtype == task_subtype_force) { @@ -1151,7 +1151,7 @@ void *runner_main2(void *data) { tasks_done_cpu++; time_for_cpu_f += (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; -#endif +#endif } else if (t->subtype == task_subtype_limiter) runner_doself1_branch_limiter(r, ci); else if (t->subtype == task_subtype_grav) @@ -1224,7 +1224,7 @@ void *runner_main2(void *data) { (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; if ((sid == 0 || sid == 2 || sid == 6 || sid == 8) && step > 1) { // if((sid != 4 && sid != 10 && sid == 12) && - //step > 1){ + // step > 1){ clock_gettime(CLOCK_REALTIME, &t0); 
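The CPU fallbacks in these hunks all time themselves with a pair of clock_gettime(CLOCK_REALTIME, ...) samples and the expression (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0. A small self-contained version of that conversion; the helper name is chosen here for illustration and does not exist in the patch.

    #include <stdio.h>
    #include <time.h>

    /* Elapsed wall-clock time in seconds between two timespec samples,
     * matching the accumulation used for the time_for_* counters. */
    static double elapsed_seconds(const struct timespec *t0,
                                  const struct timespec *t1) {
      return (double)(t1->tv_sec - t0->tv_sec) +
             (double)(t1->tv_nsec - t0->tv_nsec) / 1e9;
    }

    int main(void) {
      struct timespec t0, t1;
      clock_gettime(CLOCK_REALTIME, &t0);
      /* ... the work being timed ... */
      clock_gettime(CLOCK_REALTIME, &t1);
      printf("took %e s\n", elapsed_seconds(&t0, &t1));
      return 0;
    }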
runner_dopair1_branch_density(r, ci, cj); t->corner_pair = 1; @@ -1271,9 +1271,9 @@ void *runner_main2(void *data) { if (launch || launch_leftovers) { /*Launch GPU tasks*/ // runner_dopair1_launch(r, sched, - //pack_vars_pair_dens, ci, t, parts_aos_pair_dens, + // pack_vars_pair_dens, ci, t, parts_aos_pair_dens, // d_parts_aos_pair_dens, - //stream, d_a, d_H, e, &packing_time_pair, + // stream, d_a, d_H, e, &packing_time_pair, //&time_for_density_gpu_pair); runner_dopair1_launch_f4_one_memcpy( r, sched, pack_vars_pair_dens, t, parts_aos_pair_f4_send, @@ -1335,10 +1335,10 @@ void *runner_main2(void *data) { } } else { #endif // DO_CORNERS - // runner_dopair1_pack_g(r, sched, pack_vars_pair_grad, - // ci, - // cj, t, parts_aos_pair_grad, e, - // &packing_time_g); + // runner_dopair1_pack_g(r, sched, pack_vars_pair_grad, + // ci, + // cj, t, parts_aos_pair_grad, e, + // &packing_time_g); packing_time_pair_g += runner_dopair1_pack_f4_g(r, sched, pack_vars_pair_grad, ci, cj, t, parts_aos_pair_f4_g_send, e, @@ -1351,9 +1351,9 @@ void *runner_main2(void *data) { if (launch || launch_leftovers) { /*Launch GPU tasks*/ // runner_dopair1_launch_g(r, sched, - //pack_vars_pair_grad, ci, t, parts_aos_pair_grad, + // pack_vars_pair_grad, ci, t, parts_aos_pair_grad, // d_parts_aos_pair_grad, - //stream, d_a, d_H, e, &packing_time_pair_g, + // stream, d_a, d_H, e, &packing_time_pair_g, //&time_for_gpu_pair_g); runner_dopair1_launch_f4_g_one_memcpy( r, sched, pack_vars_pair_grad, t, parts_aos_pair_f4_g_send, @@ -1413,8 +1413,9 @@ void *runner_main2(void *data) { } } else { #endif // DO_CORNERS - // runner_dopair1_pack_f(r, sched, pack_vars_pair_forc, - // ci, cj, t, parts_aos_pair_forc, e, &packing_time_f); + // runner_dopair1_pack_f(r, sched, pack_vars_pair_forc, + // ci, cj, t, parts_aos_pair_forc, e, + // &packing_time_f); packing_time_pair_f += runner_dopair1_pack_f4_f(r, sched, pack_vars_pair_forc, ci, cj, t, parts_aos_pair_f4_f_send, e, @@ -1461,7 +1462,7 @@ void *runner_main2(void *data) { tasks_done_cpu++; time_for_cpu_pair_g += (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; -#endif +#endif } #endif // EXTRA_HYDRO_LOOP else if (t->subtype == task_subtype_force) { @@ -1864,7 +1865,7 @@ void *runner_main2(void *data) { t->skip = 1; t = NULL; #else - t = scheduler_done(sched, t); + t = scheduler_done(sched, t); #endif } @@ -1874,7 +1875,7 @@ void *runner_main2(void *data) { t->skip = 1; t = NULL; #else - t = scheduler_done(sched, t); + t = scheduler_done(sched, t); #endif } @@ -1884,257 +1885,260 @@ void *runner_main2(void *data) { t->skip = 1; t = NULL; #else - t = scheduler_done(sched, t); + t = scheduler_done(sched, t); } #endif - if (t->subtype != task_subtype_gpu_pack && - t->subtype != task_subtype_gpu_pack_g && - t->subtype != task_subtype_gpu_pack_f) { - t = scheduler_done(sched, t); - } - } /* main loop. 
*/ - // Stuff for writing debug data to file for validation - //// if (step % 10 == 0 || step == 1) { - // if(r->cpuid == 0 && engine_rank == 0)fprintf(fgpu_steps, "x, y, z, - // rho, rhodh, v_sig, lap_u, a_visc_max, ax, ay, az\n"); for (int tid = - // 0; tid < space->nr_local_cells; - // tid++) { /* This should indeed be tasks_done_gpu as they are - // the only - //// tasks which have been done*/ - // struct cell *ctemp = &(space->cells_top[tid]); - // for (int i = 0; i < ctemp->hydro.count; i++) { - // fprintf(fgpu_steps, "%f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, - // %f\n", - // ctemp->hydro.parts[i].x[0], ctemp->hydro.parts[i].x[1], - // ctemp->hydro.parts[i].x[2], ctemp->hydro.parts[i].rho, - // ctemp->hydro.parts[i].density.rho_dh, - // ctemp->hydro.parts[i].viscosity.v_sig, - // ctemp->hydro.parts[i].diffusion.laplace_u, - // ctemp->hydro.parts[i].force.alpha_visc_max_ngb, - // ctemp->hydro.parts[i].a_hydro[0], - // ctemp->hydro.parts[i].a_hydro[1], - // ctemp->hydro.parts[i].a_hydro[2]); - //// message("wcount %f density %f", - ///ctemp->hydro.parts[i].density.wcount, ctemp->hydro.parts[i].rho); / - ///message("wcount is %f\n", ctemp->hydro.parts[i].density.wcount); - // } - // } - //// } - /*Output compute times to separate files. cat later into one file*/ + if (t->subtype != task_subtype_gpu_pack && + t->subtype != task_subtype_gpu_pack_g && + t->subtype != task_subtype_gpu_pack_f) { + t = scheduler_done(sched, t); + } + } /* main loop. */ + // Stuff for writing debug data to file for validation + //// if (step % 10 == 0 || step == 1) { + // if(r->cpuid == 0 && engine_rank == 0)fprintf(fgpu_steps, "x, y, z, + // rho, rhodh, v_sig, lap_u, a_visc_max, ax, ay, az\n"); for (int tid + // = 0; tid < space->nr_local_cells; + // tid++) { /* This should indeed be tasks_done_gpu as they are + // the only + //// tasks which have been done*/ + // struct cell *ctemp = &(space->cells_top[tid]); + // for (int i = 0; i < ctemp->hydro.count; i++) { + // fprintf(fgpu_steps, "%f, %f, %f, %f, %f, %f, %f, %f, %f, %f, + // %f, %f\n", + // ctemp->hydro.parts[i].x[0], + // ctemp->hydro.parts[i].x[1], + // ctemp->hydro.parts[i].x[2], ctemp->hydro.parts[i].rho, + // ctemp->hydro.parts[i].density.rho_dh, + // ctemp->hydro.parts[i].viscosity.v_sig, + // ctemp->hydro.parts[i].diffusion.laplace_u, + // ctemp->hydro.parts[i].force.alpha_visc_max_ngb, + // ctemp->hydro.parts[i].a_hydro[0], + // ctemp->hydro.parts[i].a_hydro[1], + // ctemp->hydro.parts[i].a_hydro[2]); + //// message("wcount %f density %f", + /// ctemp->hydro.parts[i].density.wcount, ctemp->hydro.parts[i].rho); / + /// message("wcount is %f\n", ctemp->hydro.parts[i].density.wcount); + // } + // } + //// } + /*Output compute times to separate files. 
cat later into one file*/ // if (step % 11 == 0 || step == 1) { #ifdef DUMP_TIMINGS -#if defined(GPUOFFLOAD_DENSITY) || defined(GPUOFFLOAD_GRADIENT) || defined(GPUOFFLOAD_FORCE) - // char buffer[30]; - // snprintf(buffer, sizeof(buffer), "t%d_stepnfullbundles%d", - // r->cpuid, step); FILE *fullbundles = fopen(buffer, "w"); - // if(r->cpuid == 0)fprintf(fullbundles, "nfull, npartial, nfullpair, - // npartialpair\n"); else fprintf(fullbundles, "%i, %i, %i, %i\n", - // n_full_d_bundles, n_partial_d_bundles, - // n_full_p_d_bundles, n_partial_p_d_bundles); fflush(fullbundles); - - /////////////////////////////////////////////////////////////// - /// to ooutput timings uncomment this - /////////////////////////////////////////////////////////////// - if (r->cpuid == 0 && engine_rank == 0) - fprintf(fgpu_steps, - "GPU_SD, P_SD, U_SD, GPU_PD, P_PD, U_PD, " - "GPU_SF, P_SF, U_SF, GPU_PF, P_PF, U_PF, GPU_SG, P_SG, U_SG, " - "GPU_PG, P_PG, U_PG\n " - "%e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, " - "%e, %e\n", - time_for_density_gpu, packing_time, unpack_time_self, - time_for_density_gpu_pair, packing_time_pair, unpacking_time_pair, - time_for_gpu_f, packing_time_f, unpack_time_self_f, - time_for_gpu_pair_f, packing_time_pair_f, unpacking_time_pair_f, - time_for_gpu_g, packing_time_g, unpack_time_self_g, - time_for_gpu_pair_g, packing_time_pair_g, unpacking_time_pair_f); - - else - fprintf(fgpu_steps, - "%e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, " - "%e, %e\n", - time_for_density_gpu, packing_time, unpack_time_self, - time_for_density_gpu_pair, packing_time_pair, unpacking_time_pair, - time_for_gpu_f, packing_time_f, unpack_time_self_f, - time_for_gpu_pair_f, packing_time_pair_f, unpacking_time_pair_f, - time_for_gpu_g, packing_time_g, unpack_time_self_g, - time_for_gpu_pair_g, packing_time_pair_g, unpacking_time_pair_f); - ////////////////////////////////////////////////////////////// +#if defined(GPUOFFLOAD_DENSITY) || defined(GPUOFFLOAD_GRADIENT) || \ + defined(GPUOFFLOAD_FORCE) + // char buffer[30]; + // snprintf(buffer, sizeof(buffer), "t%d_stepnfullbundles%d", + // r->cpuid, step); FILE *fullbundles = fopen(buffer, "w"); + // if(r->cpuid == 0)fprintf(fullbundles, "nfull, npartial, + // nfullpair, npartialpair\n"); else fprintf(fullbundles, "%i, %i, + // %i, %i\n", n_full_d_bundles, n_partial_d_bundles, + // n_full_p_d_bundles, n_partial_p_d_bundles); fflush(fullbundles); + /////////////////////////////////////////////////////////////// + /// to ooutput timings uncomment this /////////////////////////////////////////////////////////////// - -#else // No GPU offload - if (r->cpuid == 0 && engine_rank == 0) - fprintf(fgpu_steps, - "CPU TIME SELF, CPU TIME PAIR, " - "CPU TIME SELF F, CPU TIME PAIR F, CPU TIME SELF G, CPU TIME " - "PAIR G\n " - "%e, %e, %e, %e, %e, %e\n", - time_for_density_cpu, time_for_density_cpu_pair, time_for_cpu_f, - time_for_cpu_pair_f, time_for_cpu_g, time_for_cpu_pair_g); - - else - fprintf(fgpu_steps, "%e, %e, %e, %e, %e, %e,\n", time_for_density_cpu, - time_for_density_cpu_pair, time_for_cpu_f, time_for_cpu_pair_f, - time_for_cpu_g, time_for_cpu_pair_g); -#endif + if (r->cpuid == 0 && engine_rank == 0) + fprintf( + fgpu_steps, + "GPU_SD, P_SD, U_SD, GPU_PD, P_PD, U_PD, " + "GPU_SF, P_SF, U_SF, GPU_PF, P_PF, U_PF, GPU_SG, P_SG, U_SG, " + "GPU_PG, P_PG, U_PG\n " + "%e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, " + "%e, %e\n", + time_for_density_gpu, packing_time, unpack_time_self, + time_for_density_gpu_pair, 
packing_time_pair, unpacking_time_pair, + time_for_gpu_f, packing_time_f, unpack_time_self_f, + time_for_gpu_pair_f, packing_time_pair_f, unpacking_time_pair_f, + time_for_gpu_g, packing_time_g, unpack_time_self_g, + time_for_gpu_pair_g, packing_time_pair_g, unpacking_time_pair_f); + + else + fprintf( + fgpu_steps, + "%e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, " + "%e, %e\n", + time_for_density_gpu, packing_time, unpack_time_self, + time_for_density_gpu_pair, packing_time_pair, unpacking_time_pair, + time_for_gpu_f, packing_time_f, unpack_time_self_f, + time_for_gpu_pair_f, packing_time_pair_f, unpacking_time_pair_f, + time_for_gpu_g, packing_time_g, unpack_time_self_g, + time_for_gpu_pair_g, packing_time_pair_g, unpacking_time_pair_f); + ////////////////////////////////////////////////////////////// + /////////////////////////////////////////////////////////////// + /////////////////////////////////////////////////////////////// + +#else // No GPU offload + if (r->cpuid == 0 && engine_rank == 0) + fprintf(fgpu_steps, + "CPU TIME SELF, CPU TIME PAIR, " + "CPU TIME SELF F, CPU TIME PAIR F, CPU TIME SELF G, CPU TIME " + "PAIR G\n " + "%e, %e, %e, %e, %e, %e\n", + time_for_density_cpu, time_for_density_cpu_pair, time_for_cpu_f, + time_for_cpu_pair_f, time_for_cpu_g, time_for_cpu_pair_g); + + else + fprintf(fgpu_steps, "%e, %e, %e, %e, %e, %e,\n", time_for_density_cpu, + time_for_density_cpu_pair, time_for_cpu_f, time_for_cpu_pair_f, + time_for_cpu_g, time_for_cpu_pair_g); +#endif #endif // DUMPTIMINGS - // } - fflush(fgpu_steps); - fclose(fgpu_steps); - time_for_density_cpu = 0.0; - time_for_density_gpu = 0.0; - time_for_density_cpu_pair = 0.0; - time_for_density_gpu_pair = 0.0; - time_for_density_cpu_sub = 0.0; - tot_time_for_hard_memcpys = 0.0; - tasks_done_gpu = 0; - tasks_done_cpu = 0; - tasks_done_gpu_inc = 0; - if (ghost_in > 0) - fprintf(stderr, "total tasks not done on GPU %i is %i\n", r->cpuid, - ghost_in); - packed_self = 0; - packed_pair = 0; - packed_self_f = 0; - packed_pair_f = 0; - packed_self_g = 0; - packed_pair_g = 0; - density = 0; - density_sub = 0; - unpacked = 0; - // if(step == 2)cudaProfilerStop(); - // if(step == 2)exit(0); - // size_t free_byte ; - // size_t total_byte ; - // cudaError_t cuda_status = cudaMemGetInfo( &free_byte, &total_byte ) ; - // double free = (double)free_byte; - // double available = (double)total_byte; - // double used = (available - free); - // fprintf(stderr, "Used %f GB GPU memory\n", used/1e9); - /* Wait at the wait barrier. 
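The commented-out block just above shows how the runner can report GPU memory use with cudaMemGetInfo(). A stand-alone version of that check, with error handling added here purely for illustration:

    #include <stdio.h>
    #include <cuda_runtime.h>

    int main(void) {
      size_t free_byte = 0, total_byte = 0;
      const cudaError_t err = cudaMemGetInfo(&free_byte, &total_byte);
      if (err != cudaSuccess) {
        fprintf(stderr, "cudaMemGetInfo failed: %s\n", cudaGetErrorString(err));
        return 1;
      }
      const double used = (double)(total_byte - free_byte);
      fprintf(stderr, "Used %f GB GPU memory\n", used / 1e9);
      return 0;
    }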
*/ - // swift_barrier_wait(&e->wait_barrier); + // } + fflush(fgpu_steps); + fclose(fgpu_steps); + time_for_density_cpu = 0.0; + time_for_density_gpu = 0.0; + time_for_density_cpu_pair = 0.0; + time_for_density_gpu_pair = 0.0; + time_for_density_cpu_sub = 0.0; + tot_time_for_hard_memcpys = 0.0; + tasks_done_gpu = 0; + tasks_done_cpu = 0; + tasks_done_gpu_inc = 0; + if (ghost_in > 0) + fprintf(stderr, "total tasks not done on GPU %i is %i\n", r->cpuid, + ghost_in); + packed_self = 0; + packed_pair = 0; + packed_self_f = 0; + packed_pair_f = 0; + packed_self_g = 0; + packed_pair_g = 0; + density = 0; + density_sub = 0; + unpacked = 0; + // if(step == 2)cudaProfilerStop(); + // if(step == 2)exit(0); + // size_t free_byte ; + // size_t total_byte ; + // cudaError_t cuda_status = cudaMemGetInfo( &free_byte, + //&total_byte ) ; double free = (double)free_byte; double available = + //(double)total_byte; double used = (available - free); fprintf(stderr, + //"Used %f GB GPU memory\n", used/1e9); + /* Wait at the wait barrier. */ + // swift_barrier_wait(&e->wait_barrier); + } + // Free all data + // cudaFree(d_tid_p); + // cudaFree(d_id); + // cudaFree(d_x_p); + // cudaFree(d_y_p); + // cudaFree(d_z_p); + // cudaFree(d_ux); + // cudaFree(d_uy); + // cudaFree(d_uz); + // cudaFree(d_a_hydrox); + // cudaFree(d_a_hydroy); + // cudaFree(d_a_hydroz); + // cudaFree(d_mass); + // cudaFree(d_h); + // cudaFree(d_u); + // cudaFree(d_u_dt); + // cudaFree(d_rho); + // cudaFree(d_SPH_sum); + // cudaFree(d_locx); + // cudaFree(d_locy); + // cudaFree(d_locz); + // cudaFree(d_widthx); + // cudaFree(d_widthy); + // cudaFree(d_widthz); + // cudaFree(d_h_max); + // cudaFree(d_count_p); + // cudaFree(d_wcount); + // cudaFree(d_wcount_dh); + // cudaFree(d_rho_dh); + // cudaFree(d_rot_ux); + // cudaFree(d_rot_uy); + // cudaFree(d_rot_uz); + // cudaFree(d_div_v); + // cudaFree(d_div_v_previous_step); + // cudaFree(d_alpha_visc); + // cudaFree(d_v_sig); + // cudaFree(d_laplace_u); + // cudaFree(d_alpha_diff); + // cudaFree(d_f); + // cudaFree(d_soundspeed); + // cudaFree(d_h_dt); + // cudaFree(d_balsara); + // cudaFree(d_pressure); + // cudaFree(d_alpha_visc_max_ngb); + // cudaFree(d_time_bin); + // cudaFree(d_wakeup); + // cudaFree(d_min_ngb_time_bin); + // cudaFree(d_to_be_synchronized); + // cudaFree(tid_p); + // cudaFree(id); + // cudaFree(mass); + // cudaFree(h); + // cudaFree(u); + // cudaFree(u_dt); + // cudaFree(rho); + // cudaFree(SPH_sum); + // cudaFree(x_p); + // cudaFree(y_p); + // cudaFree(z_p); + // cudaFree(ux); + // cudaFree(uy); + // cudaFree(uz); + // cudaFree(a_hydrox); + // cudaFree(a_hydroy); + // cudaFree(a_hydroz); + // cudaFree(locx); + // cudaFree(locy); + // cudaFree(locz); + // cudaFree(widthx); + // cudaFree(widthy); + // cudaFree(widthz); + // cudaFree(h_max); + // cudaFree(count_p); + // cudaFree(wcount); + // cudaFree(wcount_dh); + // cudaFree(rho_dh); + // cudaFree(rot_ux); + // cudaFree(rot_uy); + // cudaFree(rot_uz); + // cudaFree(div_v); + // cudaFree(div_v_previous_step); + // cudaFree(alpha_visc); + // cudaFree(v_sig); + // cudaFree(laplace_u); + // cudaFree(alpha_diff); + // cudaFree(f); + // cudaFree(soundspeed); + // cudaFree(h_dt); + // cudaFree(balsara); + // cudaFree(pressure); + // cudaFree(alpha_visc_max_ngb); + // cudaFree(time_bin); + // cudaFree(wakeup); + // cudaFree(min_ngb_time_bin); + // cudaFree(to_be_synchronized); + // cudaFree(partid_p); + // cudaFree(d_task_first_part); + // cudaFree(d_task_last_part); + // cudaFree(task_first_part_self_dens); + // 
cudaFree(task_last_part_self_dens); + // cudaFree(task_first_part_pair_ci); + // cudaFree(task_last_part_pair_ci); + // cudaFree(task_first_part_pair_cj); + // cudaFree(task_last_part_pair_cj); + // cudaFree(d_bundle_first_part_self_dens); + // cudaFree(d_bundle_last_part_self_dens); + // cudaFree(bundle_first_part_self_dens); + // cudaFree(bundle_last_part_self_dens); + // cudaFree(bundle_first_part_pair_ci); + // cudaFree(bundle_last_part_pair_ci); + // cudaFree(bundle_first_part_pair_cj); + // cudaFree(bundle_last_part_pair_cj); + // free(ci_list_self_dens); + // free(ci_list_pair); + // free(cj_list_pair); + + /* Be kind, rewind. */ + return NULL; } - // Free all data - // cudaFree(d_tid_p); - // cudaFree(d_id); - // cudaFree(d_x_p); - // cudaFree(d_y_p); - // cudaFree(d_z_p); - // cudaFree(d_ux); - // cudaFree(d_uy); - // cudaFree(d_uz); - // cudaFree(d_a_hydrox); - // cudaFree(d_a_hydroy); - // cudaFree(d_a_hydroz); - // cudaFree(d_mass); - // cudaFree(d_h); - // cudaFree(d_u); - // cudaFree(d_u_dt); - // cudaFree(d_rho); - // cudaFree(d_SPH_sum); - // cudaFree(d_locx); - // cudaFree(d_locy); - // cudaFree(d_locz); - // cudaFree(d_widthx); - // cudaFree(d_widthy); - // cudaFree(d_widthz); - // cudaFree(d_h_max); - // cudaFree(d_count_p); - // cudaFree(d_wcount); - // cudaFree(d_wcount_dh); - // cudaFree(d_rho_dh); - // cudaFree(d_rot_ux); - // cudaFree(d_rot_uy); - // cudaFree(d_rot_uz); - // cudaFree(d_div_v); - // cudaFree(d_div_v_previous_step); - // cudaFree(d_alpha_visc); - // cudaFree(d_v_sig); - // cudaFree(d_laplace_u); - // cudaFree(d_alpha_diff); - // cudaFree(d_f); - // cudaFree(d_soundspeed); - // cudaFree(d_h_dt); - // cudaFree(d_balsara); - // cudaFree(d_pressure); - // cudaFree(d_alpha_visc_max_ngb); - // cudaFree(d_time_bin); - // cudaFree(d_wakeup); - // cudaFree(d_min_ngb_time_bin); - // cudaFree(d_to_be_synchronized); - // cudaFree(tid_p); - // cudaFree(id); - // cudaFree(mass); - // cudaFree(h); - // cudaFree(u); - // cudaFree(u_dt); - // cudaFree(rho); - // cudaFree(SPH_sum); - // cudaFree(x_p); - // cudaFree(y_p); - // cudaFree(z_p); - // cudaFree(ux); - // cudaFree(uy); - // cudaFree(uz); - // cudaFree(a_hydrox); - // cudaFree(a_hydroy); - // cudaFree(a_hydroz); - // cudaFree(locx); - // cudaFree(locy); - // cudaFree(locz); - // cudaFree(widthx); - // cudaFree(widthy); - // cudaFree(widthz); - // cudaFree(h_max); - // cudaFree(count_p); - // cudaFree(wcount); - // cudaFree(wcount_dh); - // cudaFree(rho_dh); - // cudaFree(rot_ux); - // cudaFree(rot_uy); - // cudaFree(rot_uz); - // cudaFree(div_v); - // cudaFree(div_v_previous_step); - // cudaFree(alpha_visc); - // cudaFree(v_sig); - // cudaFree(laplace_u); - // cudaFree(alpha_diff); - // cudaFree(f); - // cudaFree(soundspeed); - // cudaFree(h_dt); - // cudaFree(balsara); - // cudaFree(pressure); - // cudaFree(alpha_visc_max_ngb); - // cudaFree(time_bin); - // cudaFree(wakeup); - // cudaFree(min_ngb_time_bin); - // cudaFree(to_be_synchronized); - // cudaFree(partid_p); - // cudaFree(d_task_first_part); - // cudaFree(d_task_last_part); - // cudaFree(task_first_part_self_dens); - // cudaFree(task_last_part_self_dens); - // cudaFree(task_first_part_pair_ci); - // cudaFree(task_last_part_pair_ci); - // cudaFree(task_first_part_pair_cj); - // cudaFree(task_last_part_pair_cj); - // cudaFree(d_bundle_first_part_self_dens); - // cudaFree(d_bundle_last_part_self_dens); - // cudaFree(bundle_first_part_self_dens); - // cudaFree(bundle_last_part_self_dens); - // cudaFree(bundle_first_part_pair_ci); - // 
cudaFree(bundle_last_part_pair_ci); - // cudaFree(bundle_first_part_pair_cj); - // cudaFree(bundle_last_part_pair_cj); - // free(ci_list_self_dens); - // free(ci_list_pair); - // free(cj_list_pair); - - /* Be kind, rewind. */ - return NULL; -} #endif // WITH_CUDA @@ -2142,10 +2146,10 @@ void *runner_main2(void *data) { #include #include -// uint64_t time_used ( ) { -// struct rusage ru; -// struct timeval t; -// getrusage(RUSAGE_THREAD,&ru); -// t = ru.ru_utime; -// return (uint64_t) t.tv_sec*1000 + t.tv_usec/1000; -// } + // uint64_t time_used ( ) { + // struct rusage ru; + // struct timeval t; + // getrusage(RUSAGE_THREAD,&ru); + // t = ru.ru_utime; + // return (uint64_t) t.tv_sec*1000 + t.tv_usec/1000; + // } From 81c2283e0f8c0b560ba202193510e1784102e94b Mon Sep 17 00:00:00 2001 From: Matthieu Schaller Date: Tue, 5 Nov 2024 11:11:30 +0100 Subject: [PATCH 043/217] Fix another bracketing issue --- src/runner_main_clean.cu | 443 +++++++++++++++++++-------------------- 1 file changed, 221 insertions(+), 222 deletions(-) diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 6683e9cf5c..792eb4d511 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -1061,7 +1061,8 @@ void *runner_main2(void *data) { // runner_doself1_launch(r, sched, // pack_vars_self_dens, ci, t, parts_aos_dens, // d_parts_aos_dens, stream, d_a, d_H, e, &packing_time, - // &time_for_density_gpu, &tot_time_for_hard_memcpys); + // &time_for_density_gpu, + // &tot_time_for_hard_memcpys); } /*End of GPU work Self*/ #endif } /* self / pack */ @@ -1858,7 +1859,7 @@ void *runner_main2(void *data) { /* We're done with this task, see if we get a next one. */ prev = t; - + if (t->subtype == task_subtype_gpu_pack) { #ifdef GPUOFFLOAD_DENSITY /* Don't enqueue unpacks yet. Just signal the runners */ @@ -1868,7 +1869,7 @@ void *runner_main2(void *data) { t = scheduler_done(sched, t); #endif } - + if (t->subtype == task_subtype_gpu_pack_g) { #ifdef GPUOFFLOAD_GRADIENT /* Don't enqueue unpacks yet. Just signal the runners */ @@ -1889,14 +1890,14 @@ void *runner_main2(void *data) { } #endif - if (t->subtype != task_subtype_gpu_pack && - t->subtype != task_subtype_gpu_pack_g && - t->subtype != task_subtype_gpu_pack_f) { - t = scheduler_done(sched, t); - } - } /* main loop. */ - // Stuff for writing debug data to file for validation - //// if (step % 10 == 0 || step == 1) { + if (t->subtype != task_subtype_gpu_pack && + t->subtype != task_subtype_gpu_pack_g && + t->subtype != task_subtype_gpu_pack_f) { + t = scheduler_done(sched, t); + } + } /* main loop. 
*/ + // Stuff for writing debug data to file for validation + //// if (step % 10 == 0 || step == 1) { // if(r->cpuid == 0 && engine_rank == 0)fprintf(fgpu_steps, "x, y, z, // rho, rhodh, v_sig, lap_u, a_visc_max, ax, ay, az\n"); for (int tid // = 0; tid < space->nr_local_cells; @@ -1928,217 +1929,215 @@ void *runner_main2(void *data) { #ifdef DUMP_TIMINGS #if defined(GPUOFFLOAD_DENSITY) || defined(GPUOFFLOAD_GRADIENT) || \ defined(GPUOFFLOAD_FORCE) - // char buffer[30]; - // snprintf(buffer, sizeof(buffer), "t%d_stepnfullbundles%d", - // r->cpuid, step); FILE *fullbundles = fopen(buffer, "w"); - // if(r->cpuid == 0)fprintf(fullbundles, "nfull, npartial, - // nfullpair, npartialpair\n"); else fprintf(fullbundles, "%i, %i, - // %i, %i\n", n_full_d_bundles, n_partial_d_bundles, - // n_full_p_d_bundles, n_partial_p_d_bundles); fflush(fullbundles); - + // char buffer[30]; + // snprintf(buffer, sizeof(buffer), "t%d_stepnfullbundles%d", + // r->cpuid, step); FILE *fullbundles = fopen(buffer, "w"); + // if(r->cpuid == 0)fprintf(fullbundles, "nfull, npartial, + // nfullpair, npartialpair\n"); else fprintf(fullbundles, "%i, %i, + // %i, %i\n", n_full_d_bundles, n_partial_d_bundles, + // n_full_p_d_bundles, n_partial_p_d_bundles); fflush(fullbundles); + + /////////////////////////////////////////////////////////////// + /// to ooutput timings uncomment this + /////////////////////////////////////////////////////////////// + if (r->cpuid == 0 && engine_rank == 0) + fprintf(fgpu_steps, + "GPU_SD, P_SD, U_SD, GPU_PD, P_PD, U_PD, " + "GPU_SF, P_SF, U_SF, GPU_PF, P_PF, U_PF, GPU_SG, P_SG, U_SG, " + "GPU_PG, P_PG, U_PG\n " + "%e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, " + "%e, %e\n", + time_for_density_gpu, packing_time, unpack_time_self, + time_for_density_gpu_pair, packing_time_pair, unpacking_time_pair, + time_for_gpu_f, packing_time_f, unpack_time_self_f, + time_for_gpu_pair_f, packing_time_pair_f, unpacking_time_pair_f, + time_for_gpu_g, packing_time_g, unpack_time_self_g, + time_for_gpu_pair_g, packing_time_pair_g, unpacking_time_pair_f); + + else + fprintf(fgpu_steps, + "%e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, " + "%e, %e\n", + time_for_density_gpu, packing_time, unpack_time_self, + time_for_density_gpu_pair, packing_time_pair, unpacking_time_pair, + time_for_gpu_f, packing_time_f, unpack_time_self_f, + time_for_gpu_pair_f, packing_time_pair_f, unpacking_time_pair_f, + time_for_gpu_g, packing_time_g, unpack_time_self_g, + time_for_gpu_pair_g, packing_time_pair_g, unpacking_time_pair_f); + ////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////// - /// to ooutput timings uncomment this /////////////////////////////////////////////////////////////// - if (r->cpuid == 0 && engine_rank == 0) - fprintf( - fgpu_steps, - "GPU_SD, P_SD, U_SD, GPU_PD, P_PD, U_PD, " - "GPU_SF, P_SF, U_SF, GPU_PF, P_PF, U_PF, GPU_SG, P_SG, U_SG, " - "GPU_PG, P_PG, U_PG\n " - "%e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, " - "%e, %e\n", - time_for_density_gpu, packing_time, unpack_time_self, - time_for_density_gpu_pair, packing_time_pair, unpacking_time_pair, - time_for_gpu_f, packing_time_f, unpack_time_self_f, - time_for_gpu_pair_f, packing_time_pair_f, unpacking_time_pair_f, - time_for_gpu_g, packing_time_g, unpack_time_self_g, - time_for_gpu_pair_g, packing_time_pair_g, unpacking_time_pair_f); - - else - fprintf( - fgpu_steps, - "%e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, %e, 
" - "%e, %e\n", - time_for_density_gpu, packing_time, unpack_time_self, - time_for_density_gpu_pair, packing_time_pair, unpacking_time_pair, - time_for_gpu_f, packing_time_f, unpack_time_self_f, - time_for_gpu_pair_f, packing_time_pair_f, unpacking_time_pair_f, - time_for_gpu_g, packing_time_g, unpack_time_self_g, - time_for_gpu_pair_g, packing_time_pair_g, unpacking_time_pair_f); - ////////////////////////////////////////////////////////////// - /////////////////////////////////////////////////////////////// - /////////////////////////////////////////////////////////////// #else // No GPU offload - if (r->cpuid == 0 && engine_rank == 0) - fprintf(fgpu_steps, - "CPU TIME SELF, CPU TIME PAIR, " - "CPU TIME SELF F, CPU TIME PAIR F, CPU TIME SELF G, CPU TIME " - "PAIR G\n " - "%e, %e, %e, %e, %e, %e\n", - time_for_density_cpu, time_for_density_cpu_pair, time_for_cpu_f, - time_for_cpu_pair_f, time_for_cpu_g, time_for_cpu_pair_g); - - else - fprintf(fgpu_steps, "%e, %e, %e, %e, %e, %e,\n", time_for_density_cpu, - time_for_density_cpu_pair, time_for_cpu_f, time_for_cpu_pair_f, - time_for_cpu_g, time_for_cpu_pair_g); + if (r->cpuid == 0 && engine_rank == 0) + fprintf(fgpu_steps, + "CPU TIME SELF, CPU TIME PAIR, " + "CPU TIME SELF F, CPU TIME PAIR F, CPU TIME SELF G, CPU TIME " + "PAIR G\n " + "%e, %e, %e, %e, %e, %e\n", + time_for_density_cpu, time_for_density_cpu_pair, time_for_cpu_f, + time_for_cpu_pair_f, time_for_cpu_g, time_for_cpu_pair_g); + + else + fprintf(fgpu_steps, "%e, %e, %e, %e, %e, %e,\n", time_for_density_cpu, + time_for_density_cpu_pair, time_for_cpu_f, time_for_cpu_pair_f, + time_for_cpu_g, time_for_cpu_pair_g); #endif #endif // DUMPTIMINGS - // } - fflush(fgpu_steps); - fclose(fgpu_steps); - time_for_density_cpu = 0.0; - time_for_density_gpu = 0.0; - time_for_density_cpu_pair = 0.0; - time_for_density_gpu_pair = 0.0; - time_for_density_cpu_sub = 0.0; - tot_time_for_hard_memcpys = 0.0; - tasks_done_gpu = 0; - tasks_done_cpu = 0; - tasks_done_gpu_inc = 0; - if (ghost_in > 0) - fprintf(stderr, "total tasks not done on GPU %i is %i\n", r->cpuid, - ghost_in); - packed_self = 0; - packed_pair = 0; - packed_self_f = 0; - packed_pair_f = 0; - packed_self_g = 0; - packed_pair_g = 0; - density = 0; - density_sub = 0; - unpacked = 0; - // if(step == 2)cudaProfilerStop(); - // if(step == 2)exit(0); - // size_t free_byte ; - // size_t total_byte ; - // cudaError_t cuda_status = cudaMemGetInfo( &free_byte, - //&total_byte ) ; double free = (double)free_byte; double available = - //(double)total_byte; double used = (available - free); fprintf(stderr, - //"Used %f GB GPU memory\n", used/1e9); - /* Wait at the wait barrier. 
*/ - // swift_barrier_wait(&e->wait_barrier); - } - // Free all data - // cudaFree(d_tid_p); - // cudaFree(d_id); - // cudaFree(d_x_p); - // cudaFree(d_y_p); - // cudaFree(d_z_p); - // cudaFree(d_ux); - // cudaFree(d_uy); - // cudaFree(d_uz); - // cudaFree(d_a_hydrox); - // cudaFree(d_a_hydroy); - // cudaFree(d_a_hydroz); - // cudaFree(d_mass); - // cudaFree(d_h); - // cudaFree(d_u); - // cudaFree(d_u_dt); - // cudaFree(d_rho); - // cudaFree(d_SPH_sum); - // cudaFree(d_locx); - // cudaFree(d_locy); - // cudaFree(d_locz); - // cudaFree(d_widthx); - // cudaFree(d_widthy); - // cudaFree(d_widthz); - // cudaFree(d_h_max); - // cudaFree(d_count_p); - // cudaFree(d_wcount); - // cudaFree(d_wcount_dh); - // cudaFree(d_rho_dh); - // cudaFree(d_rot_ux); - // cudaFree(d_rot_uy); - // cudaFree(d_rot_uz); - // cudaFree(d_div_v); - // cudaFree(d_div_v_previous_step); - // cudaFree(d_alpha_visc); - // cudaFree(d_v_sig); - // cudaFree(d_laplace_u); - // cudaFree(d_alpha_diff); - // cudaFree(d_f); - // cudaFree(d_soundspeed); - // cudaFree(d_h_dt); - // cudaFree(d_balsara); - // cudaFree(d_pressure); - // cudaFree(d_alpha_visc_max_ngb); - // cudaFree(d_time_bin); - // cudaFree(d_wakeup); - // cudaFree(d_min_ngb_time_bin); - // cudaFree(d_to_be_synchronized); - // cudaFree(tid_p); - // cudaFree(id); - // cudaFree(mass); - // cudaFree(h); - // cudaFree(u); - // cudaFree(u_dt); - // cudaFree(rho); - // cudaFree(SPH_sum); - // cudaFree(x_p); - // cudaFree(y_p); - // cudaFree(z_p); - // cudaFree(ux); - // cudaFree(uy); - // cudaFree(uz); - // cudaFree(a_hydrox); - // cudaFree(a_hydroy); - // cudaFree(a_hydroz); - // cudaFree(locx); - // cudaFree(locy); - // cudaFree(locz); - // cudaFree(widthx); - // cudaFree(widthy); - // cudaFree(widthz); - // cudaFree(h_max); - // cudaFree(count_p); - // cudaFree(wcount); - // cudaFree(wcount_dh); - // cudaFree(rho_dh); - // cudaFree(rot_ux); - // cudaFree(rot_uy); - // cudaFree(rot_uz); - // cudaFree(div_v); - // cudaFree(div_v_previous_step); - // cudaFree(alpha_visc); - // cudaFree(v_sig); - // cudaFree(laplace_u); - // cudaFree(alpha_diff); - // cudaFree(f); - // cudaFree(soundspeed); - // cudaFree(h_dt); - // cudaFree(balsara); - // cudaFree(pressure); - // cudaFree(alpha_visc_max_ngb); - // cudaFree(time_bin); - // cudaFree(wakeup); - // cudaFree(min_ngb_time_bin); - // cudaFree(to_be_synchronized); - // cudaFree(partid_p); - // cudaFree(d_task_first_part); - // cudaFree(d_task_last_part); - // cudaFree(task_first_part_self_dens); - // cudaFree(task_last_part_self_dens); - // cudaFree(task_first_part_pair_ci); - // cudaFree(task_last_part_pair_ci); - // cudaFree(task_first_part_pair_cj); - // cudaFree(task_last_part_pair_cj); - // cudaFree(d_bundle_first_part_self_dens); - // cudaFree(d_bundle_last_part_self_dens); - // cudaFree(bundle_first_part_self_dens); - // cudaFree(bundle_last_part_self_dens); - // cudaFree(bundle_first_part_pair_ci); - // cudaFree(bundle_last_part_pair_ci); - // cudaFree(bundle_first_part_pair_cj); - // cudaFree(bundle_last_part_pair_cj); - // free(ci_list_self_dens); - // free(ci_list_pair); - // free(cj_list_pair); - - /* Be kind, rewind. 
*/ - return NULL; + // } + fflush(fgpu_steps); + fclose(fgpu_steps); + time_for_density_cpu = 0.0; + time_for_density_gpu = 0.0; + time_for_density_cpu_pair = 0.0; + time_for_density_gpu_pair = 0.0; + time_for_density_cpu_sub = 0.0; + tot_time_for_hard_memcpys = 0.0; + tasks_done_gpu = 0; + tasks_done_cpu = 0; + tasks_done_gpu_inc = 0; + if (ghost_in > 0) + fprintf(stderr, "total tasks not done on GPU %i is %i\n", r->cpuid, + ghost_in); + packed_self = 0; + packed_pair = 0; + packed_self_f = 0; + packed_pair_f = 0; + packed_self_g = 0; + packed_pair_g = 0; + density = 0; + density_sub = 0; + unpacked = 0; + // if(step == 2)cudaProfilerStop(); + // if(step == 2)exit(0); + // size_t free_byte ; + // size_t total_byte ; + // cudaError_t cuda_status = cudaMemGetInfo( &free_byte, + //&total_byte ) ; double free = (double)free_byte; double + //available = (double)total_byte; double used = (available - free); + //fprintf(stderr, "Used %f GB GPU memory\n", used/1e9); + /* Wait at the wait barrier. */ + // swift_barrier_wait(&e->wait_barrier); } + // Free all data + // cudaFree(d_tid_p); + // cudaFree(d_id); + // cudaFree(d_x_p); + // cudaFree(d_y_p); + // cudaFree(d_z_p); + // cudaFree(d_ux); + // cudaFree(d_uy); + // cudaFree(d_uz); + // cudaFree(d_a_hydrox); + // cudaFree(d_a_hydroy); + // cudaFree(d_a_hydroz); + // cudaFree(d_mass); + // cudaFree(d_h); + // cudaFree(d_u); + // cudaFree(d_u_dt); + // cudaFree(d_rho); + // cudaFree(d_SPH_sum); + // cudaFree(d_locx); + // cudaFree(d_locy); + // cudaFree(d_locz); + // cudaFree(d_widthx); + // cudaFree(d_widthy); + // cudaFree(d_widthz); + // cudaFree(d_h_max); + // cudaFree(d_count_p); + // cudaFree(d_wcount); + // cudaFree(d_wcount_dh); + // cudaFree(d_rho_dh); + // cudaFree(d_rot_ux); + // cudaFree(d_rot_uy); + // cudaFree(d_rot_uz); + // cudaFree(d_div_v); + // cudaFree(d_div_v_previous_step); + // cudaFree(d_alpha_visc); + // cudaFree(d_v_sig); + // cudaFree(d_laplace_u); + // cudaFree(d_alpha_diff); + // cudaFree(d_f); + // cudaFree(d_soundspeed); + // cudaFree(d_h_dt); + // cudaFree(d_balsara); + // cudaFree(d_pressure); + // cudaFree(d_alpha_visc_max_ngb); + // cudaFree(d_time_bin); + // cudaFree(d_wakeup); + // cudaFree(d_min_ngb_time_bin); + // cudaFree(d_to_be_synchronized); + // cudaFree(tid_p); + // cudaFree(id); + // cudaFree(mass); + // cudaFree(h); + // cudaFree(u); + // cudaFree(u_dt); + // cudaFree(rho); + // cudaFree(SPH_sum); + // cudaFree(x_p); + // cudaFree(y_p); + // cudaFree(z_p); + // cudaFree(ux); + // cudaFree(uy); + // cudaFree(uz); + // cudaFree(a_hydrox); + // cudaFree(a_hydroy); + // cudaFree(a_hydroz); + // cudaFree(locx); + // cudaFree(locy); + // cudaFree(locz); + // cudaFree(widthx); + // cudaFree(widthy); + // cudaFree(widthz); + // cudaFree(h_max); + // cudaFree(count_p); + // cudaFree(wcount); + // cudaFree(wcount_dh); + // cudaFree(rho_dh); + // cudaFree(rot_ux); + // cudaFree(rot_uy); + // cudaFree(rot_uz); + // cudaFree(div_v); + // cudaFree(div_v_previous_step); + // cudaFree(alpha_visc); + // cudaFree(v_sig); + // cudaFree(laplace_u); + // cudaFree(alpha_diff); + // cudaFree(f); + // cudaFree(soundspeed); + // cudaFree(h_dt); + // cudaFree(balsara); + // cudaFree(pressure); + // cudaFree(alpha_visc_max_ngb); + // cudaFree(time_bin); + // cudaFree(wakeup); + // cudaFree(min_ngb_time_bin); + // cudaFree(to_be_synchronized); + // cudaFree(partid_p); + // cudaFree(d_task_first_part); + // cudaFree(d_task_last_part); + // cudaFree(task_first_part_self_dens); + // cudaFree(task_last_part_self_dens); + // 
cudaFree(task_first_part_pair_ci); + // cudaFree(task_last_part_pair_ci); + // cudaFree(task_first_part_pair_cj); + // cudaFree(task_last_part_pair_cj); + // cudaFree(d_bundle_first_part_self_dens); + // cudaFree(d_bundle_last_part_self_dens); + // cudaFree(bundle_first_part_self_dens); + // cudaFree(bundle_last_part_self_dens); + // cudaFree(bundle_first_part_pair_ci); + // cudaFree(bundle_last_part_pair_ci); + // cudaFree(bundle_first_part_pair_cj); + // cudaFree(bundle_last_part_pair_cj); + // free(ci_list_self_dens); + // free(ci_list_pair); + // free(cj_list_pair); + + /* Be kind, rewind. */ + return NULL; +} #endif // WITH_CUDA @@ -2146,10 +2145,10 @@ void *runner_main2(void *data) { #include #include - // uint64_t time_used ( ) { - // struct rusage ru; - // struct timeval t; - // getrusage(RUSAGE_THREAD,&ru); - // t = ru.ru_utime; - // return (uint64_t) t.tv_sec*1000 + t.tv_usec/1000; - // } +// uint64_t time_used ( ) { +// struct rusage ru; +// struct timeval t; +// getrusage(RUSAGE_THREAD,&ru); +// t = ru.ru_utime; +// return (uint64_t) t.tv_sec*1000 + t.tv_usec/1000; +// } From b21d9121939f0d06da3eec4136dffdee4865fbbd Mon Sep 17 00:00:00 2001 From: Matthieu Schaller Date: Tue, 5 Nov 2024 11:13:57 +0100 Subject: [PATCH 044/217] Fix another bracketing issue --- src/runner_main_clean.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 792eb4d511..113653803e 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -1887,8 +1887,8 @@ void *runner_main2(void *data) { t = NULL; #else t = scheduler_done(sched, t); - } #endif + } if (t->subtype != task_subtype_gpu_pack && t->subtype != task_subtype_gpu_pack_g && From b4f42032a9e69f7cdf79e265affd2735170725ca Mon Sep 17 00:00:00 2001 From: Matthieu Schaller Date: Tue, 5 Nov 2024 11:16:36 +0100 Subject: [PATCH 045/217] Fix logic mistake --- src/runner_main_clean.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 113653803e..092934737b 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -1870,7 +1870,7 @@ void *runner_main2(void *data) { #endif } - if (t->subtype == task_subtype_gpu_pack_g) { + else if (t->subtype == task_subtype_gpu_pack_g) { #ifdef GPUOFFLOAD_GRADIENT /* Don't enqueue unpacks yet. Just signal the runners */ t->skip = 1; @@ -1880,7 +1880,7 @@ void *runner_main2(void *data) { #endif } - if (t->subtype == task_subtype_gpu_pack_f) { + else if (t->subtype == task_subtype_gpu_pack_f) { #ifdef GPUOFFLOAD_FORCE /* Don't enqueue unpacks yet. 
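The "Fix logic mistake" patch around this hunk turns the separate if blocks into a single else-if chain, so every completed task takes exactly one path: a pack task whose loop is offloaded is only flagged (t->skip = 1; t = NULL;) and its unpacks are enqueued later by the launch code, while anything else still goes through scheduler_done(). A reduced model of that dispatch follows; the enum, struct and stub are assumptions standing in for the real task and scheduler types.

    #include <stdio.h>

    #define GPUOFFLOAD_DENSITY 1 /* mirrors the switch at the top of the file */

    enum subtype { pack_density, pack_gradient, pack_force, other_task };
    struct task { enum subtype subtype; int skip; };

    /* Stand-in for scheduler_done(): returns the next task to run, if any. */
    static struct task *scheduler_done_stub(struct task *t) { (void)t; return NULL; }

    /* Completion handling as in runner_main2(): exactly one branch fires. */
    static struct task *finish_task(struct task *t) {
      if (t->subtype == pack_density) {
    #ifdef GPUOFFLOAD_DENSITY
        t->skip = 1; /* don't enqueue the unpacks yet, just signal the runners */
        return NULL;
    #else
        return scheduler_done_stub(t); /* density was done on the CPU */
    #endif
      } else if (t->subtype == pack_gradient || t->subtype == pack_force) {
        /* same pattern, guarded by GPUOFFLOAD_GRADIENT / GPUOFFLOAD_FORCE */
        return scheduler_done_stub(t);
      } else {
        return scheduler_done_stub(t); /* every non-pack task */
      }
    }

    int main(void) {
      struct task t = {pack_density, 0};
      struct task *next = finish_task(&t);
      printf("skip=%d next=%p\n", t.skip, (void *)next);
      return 0;
    }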
Just signal the runners */ t->skip = 1; @@ -1890,7 +1890,7 @@ void *runner_main2(void *data) { #endif } - if (t->subtype != task_subtype_gpu_pack && + else if (t->subtype != task_subtype_gpu_pack && t->subtype != task_subtype_gpu_pack_g && t->subtype != task_subtype_gpu_pack_f) { t = scheduler_done(sched, t); From 4a9051677c43e101d70aba64b1fbd69aaee1f935 Mon Sep 17 00:00:00 2001 From: Matthieu Schaller Date: Tue, 5 Nov 2024 13:28:29 +0100 Subject: [PATCH 046/217] First attempt at MPI dependencies --- src/engine_maketasks.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/src/engine_maketasks.c b/src/engine_maketasks.c index 58dd8bc453..64342c0138 100644 --- a/src/engine_maketasks.c +++ b/src/engine_maketasks.c @@ -721,8 +721,15 @@ void engine_addtasks_recv_hydro( scheduler_addunlock(s, t_xv, l->t); scheduler_addunlock(s, l->t, t_rho); } - scheduler_addunlock(s, c->hydro.super->hydro.d_unpack, t_rho); + for (struct link *l = c->hydro.density_unpack; l != NULL; l = l->next) { + scheduler_addunlock(s, l->t, t_rho); + } + //scheduler_addunlock(s, c->hydro.super->hydro.d_unpack, t_rho); #endif + + + + #ifdef EXTRA_HYDRO_LOOP for (struct link *l = c->hydro.gradient; l != NULL; l = l->next) { scheduler_addunlock(s, t_rho, l->t); @@ -735,15 +742,15 @@ void engine_addtasks_recv_hydro( #ifdef WITH_CUDA for (struct link *l = c->hydro.gradient_pack; l != NULL; l = l->next) { scheduler_addunlock(s, t_rho, l->t); - // scheduler_addunlock(s, l->t, t_gradient); + scheduler_addunlock(s, l->t, t_gradient); } - scheduler_addunlock(s, c->hydro.super->hydro.g_unpack, t_gradient); + //scheduler_addunlock(s, c->hydro.super->hydro.g_unpack, t_gradient); for (struct link *l = c->hydro.force_pack; l != NULL; l = l->next) { scheduler_addunlock(s, t_gradient, l->t); - // scheduler_addunlock(s, l->t, t_ti); + scheduler_addunlock(s, l->t, t_ti); } - scheduler_addunlock(s, c->hydro.super->hydro.f_unpack, tend); + //scheduler_addunlock(s, c->hydro.super->hydro.f_unpack, tend); #endif /*WITH_CUDA*/ #else /*EXTRA_HYDRO_LOOP*/ for (struct link *l = c->hydro.force; l != NULL; l = l->next) { @@ -759,6 +766,8 @@ void engine_addtasks_recv_hydro( #endif /*WITH_CUDA*/ #endif /*EXTRA_HYDRO_LOOP*/ + + if (with_limiter) { for (struct link *l = c->hydro.limiter; l != NULL; l = l->next) { scheduler_addunlock(s, t_unpack_limiter, l->t); From d21236400e3c79064d95bb34c90cdfadbb994a5f Mon Sep 17 00:00:00 2001 From: Abouzied Date: Tue, 5 Nov 2024 12:31:19 +0000 Subject: [PATCH 047/217] Changed fprints to message in engine_maktasks.c --- src/engine_maketasks.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/engine_maketasks.c b/src/engine_maketasks.c index 64342c0138..f1f2af277b 100644 --- a/src/engine_maketasks.c +++ b/src/engine_maketasks.c @@ -716,7 +716,7 @@ void engine_addtasks_recv_hydro( scheduler_addunlock(s, t_xv, l->t); scheduler_addunlock(s, l->t, t_rho); } -#ifdef WITH_CUDA /* A. Nasar POSSIBLE BUG HERE (More like PROBABLE)*/ +#ifdef WITH_CUDA /* A. 
Nasar POSSIBLE BUG HERE (More like PROBABLE) NOT REQUIRED Ghost in for cell j is*/ for (struct link *l = c->hydro.density_pack; l != NULL; l = l->next) { scheduler_addunlock(s, t_xv, l->t); scheduler_addunlock(s, l->t, t_rho); @@ -4899,9 +4899,9 @@ void engine_maketasks(struct engine *e) { /* pack -> unpack -> ghost_in */ if (t->ci->hydro.ghost_in == NULL) - fprintf(stderr, "Ghost in for cell i is NULL\n"); + message("Ghost in for cell i is NULL\n"); if (t->cj->hydro.ghost_in == NULL) - fprintf(stderr, "Ghost in for cell j is NULL\n"); + message("Ghost in for cell j is NULL\n"); scheduler_addunlock(sched, t, last_created_pair_unpack); if (t->ci->nodeID == e->nodeID) From 7c3728d13647047ec4020b814fef508a6372a133 Mon Sep 17 00:00:00 2001 From: Matthieu Schaller Date: Tue, 5 Nov 2024 13:54:55 +0100 Subject: [PATCH 048/217] Fix comm name typo --- src/engine_maketasks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/engine_maketasks.c b/src/engine_maketasks.c index f1f2af277b..954b422109 100644 --- a/src/engine_maketasks.c +++ b/src/engine_maketasks.c @@ -748,7 +748,7 @@ void engine_addtasks_recv_hydro( for (struct link *l = c->hydro.force_pack; l != NULL; l = l->next) { scheduler_addunlock(s, t_gradient, l->t); - scheduler_addunlock(s, l->t, t_ti); + scheduler_addunlock(s, l->t, tend); } //scheduler_addunlock(s, c->hydro.super->hydro.f_unpack, tend); #endif /*WITH_CUDA*/ From af8bc7bb2ca8bb8039087db1fa8ee84eb30481ee Mon Sep 17 00:00:00 2001 From: Matthieu Schaller Date: Tue, 5 Nov 2024 13:58:45 +0100 Subject: [PATCH 049/217] Only scream if a local cell has no ghost_in --- src/engine_maketasks.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/engine_maketasks.c b/src/engine_maketasks.c index 954b422109..5d7e995931 100644 --- a/src/engine_maketasks.c +++ b/src/engine_maketasks.c @@ -4898,9 +4898,9 @@ void engine_maketasks(struct engine *e) { } /* pack -> unpack -> ghost_in */ - if (t->ci->hydro.ghost_in == NULL) + if (t->ci->hydro.ghost_in == NULL && t->ci->nodeID != e->nodeID) message("Ghost in for cell i is NULL\n"); - if (t->cj->hydro.ghost_in == NULL) + if (t->cj->hydro.ghost_in == NULL && t->cj->nodeID != e->nodeID) message("Ghost in for cell j is NULL\n"); scheduler_addunlock(sched, t, last_created_pair_unpack); From 732370d63840683e78f7281f937a07c4c2d5b84a Mon Sep 17 00:00:00 2001 From: Matthieu Schaller Date: Tue, 5 Nov 2024 14:00:05 +0100 Subject: [PATCH 050/217] Add unpack ---> recv dependencies --- src/engine_maketasks.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/engine_maketasks.c b/src/engine_maketasks.c index 5d7e995931..d0a07eeba0 100644 --- a/src/engine_maketasks.c +++ b/src/engine_maketasks.c @@ -724,7 +724,8 @@ void engine_addtasks_recv_hydro( for (struct link *l = c->hydro.density_unpack; l != NULL; l = l->next) { scheduler_addunlock(s, l->t, t_rho); } - //scheduler_addunlock(s, c->hydro.super->hydro.d_unpack, t_rho); + + #endif @@ -744,13 +745,20 @@ void engine_addtasks_recv_hydro( scheduler_addunlock(s, t_rho, l->t); scheduler_addunlock(s, l->t, t_gradient); } - //scheduler_addunlock(s, c->hydro.super->hydro.g_unpack, t_gradient); + for (struct link *l = c->hydro.gradient_unpack; l != NULL; l = l->next) { + scheduler_addunlock(s, l->t, t_gradient); + } + for (struct link *l = c->hydro.force_pack; l != NULL; l = l->next) { scheduler_addunlock(s, t_gradient, l->t); scheduler_addunlock(s, l->t, tend); } - //scheduler_addunlock(s, c->hydro.super->hydro.f_unpack, tend); + for (struct link 
*l = c->hydro.force_unpack; l != NULL; l = l->next) { + scheduler_addunlock(s, l->t, tend); + } + + #endif /*WITH_CUDA*/ #else /*EXTRA_HYDRO_LOOP*/ for (struct link *l = c->hydro.force; l != NULL; l = l->next) { From c43b7b5225e4f03ad1993a39dd4f5790afe68bca Mon Sep 17 00:00:00 2001 From: Matthieu Schaller Date: Tue, 5 Nov 2024 14:44:28 +0100 Subject: [PATCH 051/217] Fix error message in the task construction --- src/engine_maketasks.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/engine_maketasks.c b/src/engine_maketasks.c index d0a07eeba0..577827a266 100644 --- a/src/engine_maketasks.c +++ b/src/engine_maketasks.c @@ -4906,9 +4906,9 @@ void engine_maketasks(struct engine *e) { } /* pack -> unpack -> ghost_in */ - if (t->ci->hydro.ghost_in == NULL && t->ci->nodeID != e->nodeID) + if (t->ci->hydro.ghost_in == NULL && t->ci->nodeID == e->nodeID) message("Ghost in for cell i is NULL\n"); - if (t->cj->hydro.ghost_in == NULL && t->cj->nodeID != e->nodeID) + if (t->cj->hydro.ghost_in == NULL && t->cj->nodeID == e->nodeID) message("Ghost in for cell j is NULL\n"); scheduler_addunlock(sched, t, last_created_pair_unpack); From e64e17a72e3233ed3b84c407cfca7c5db0e5202f Mon Sep 17 00:00:00 2001 From: Matthieu Schaller Date: Tue, 5 Nov 2024 14:44:57 +0100 Subject: [PATCH 052/217] Only create the timing files if we are dumping timings. Assign the devices with the mpi ranks. --- src/runner_main_clean.cu | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 092934737b..2e0cfde511 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -265,9 +265,9 @@ void *runner_main2(void *data) { #endif #ifdef WITH_MPI else { - cudaSetDevice(mpi_rank * 2); + cudaSetDevice(mpi_rank); fprintf(stderr, "%i devices available device id is %i\n", nDevices, - mpi_rank * 2); + mpi_rank); } #endif fprintf(stderr, "after dev select engine_rank %i rank %i\n", engine_rank, @@ -952,10 +952,11 @@ void *runner_main2(void *data) { /*Some bits for output in case of debug*/ char buf5[20]; snprintf(buf5, sizeof(buf5), "t%dr%dstep%d", r->cpuid, engine_rank, step); - +#ifdef DUMP_TIMINGS FILE *fgpu_steps; // if(step == 0 || step%10 == 0)fgpu_steps = fopen(buf5, "w"); fgpu_steps = fopen(buf5, "w"); +#endif // if (step == 0) cudaProfilerStart(); step++; @@ -1983,10 +1984,10 @@ void *runner_main2(void *data) { time_for_density_cpu_pair, time_for_cpu_f, time_for_cpu_pair_f, time_for_cpu_g, time_for_cpu_pair_g); #endif -#endif // DUMPTIMINGS // } fflush(fgpu_steps); fclose(fgpu_steps); +#endif // DUMPTIMINGS time_for_density_cpu = 0.0; time_for_density_gpu = 0.0; time_for_density_cpu_pair = 0.0; From 9981039b92da73810d01bf64be8ec42a31695103 Mon Sep 17 00:00:00 2001 From: Matthieu Schaller Date: Tue, 5 Nov 2024 15:02:47 +0100 Subject: [PATCH 053/217] Put the gpu pack/unpack tasks in the correct category --- src/runner_main_clean.cu | 6 ++++++ src/task.c | 8 +++++--- src/task.h | 1 + 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 2e0cfde511..8e112bf6c0 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -1865,6 +1865,8 @@ void *runner_main2(void *data) { #ifdef GPUOFFLOAD_DENSITY /* Don't enqueue unpacks yet. 
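The "Assign the devices with the mpi ranks" change above now selects the CUDA device directly from the MPI rank. A self-contained sketch of that selection; the modulo over cudaGetDeviceCount() is an extra guard added here for illustration (useful when a node hosts more ranks than GPUs) and is not part of the patch.

    #include <mpi.h>
    #include <stdio.h>
    #include <cuda_runtime.h>

    int main(int argc, char *argv[]) {
      MPI_Init(&argc, &argv);

      int rank = 0, n_devices = 0;
      MPI_Comm_rank(MPI_COMM_WORLD, &rank);
      cudaGetDeviceCount(&n_devices);

      /* One device per rank; wrap around if ranks outnumber local GPUs. */
      const int dev = (n_devices > 0) ? rank % n_devices : 0;
      cudaSetDevice(dev);
      fprintf(stderr, "rank %i uses device %i of %i\n", rank, dev, n_devices);

      MPI_Finalize();
      return 0;
    }

In a multi-node run the rank fed into the modulo would normally be the node-local rank (obtained, for instance, from MPI_Comm_split_type with MPI_COMM_TYPE_SHARED); the patch itself assumes one rank per visible GPU.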
Just signal the runners */ t->skip = 1; + t->toc = getticks(); + t->total_ticks += t->toc - t->tic; t = NULL; #else t = scheduler_done(sched, t); @@ -1875,6 +1877,8 @@ void *runner_main2(void *data) { #ifdef GPUOFFLOAD_GRADIENT /* Don't enqueue unpacks yet. Just signal the runners */ t->skip = 1; + t->toc = getticks(); + t->total_ticks += t->toc - t->tic; t = NULL; #else t = scheduler_done(sched, t); @@ -1885,6 +1889,8 @@ void *runner_main2(void *data) { #ifdef GPUOFFLOAD_FORCE /* Don't enqueue unpacks yet. Just signal the runners */ t->skip = 1; + t->toc = getticks(); + t->total_ticks += t->toc - t->tic; t = NULL; #else t = scheduler_done(sched, t); diff --git a/src/task.c b/src/task.c index 1f8f0712c6..4716b35c9c 100644 --- a/src/task.c +++ b/src/task.c @@ -177,7 +177,7 @@ const char *task_category_names[task_category_count] = { "hydro", "gravity", "feedback", "black holes", "cooling", "star formation", "limiter", "sync", "time integration", - "mpi", "pack", "fof", + "mpi", "pack", "gpu_pack", "fof", "others", "neutrino", "sink", "RT", "CSDS"}; @@ -1870,14 +1870,16 @@ enum task_categories task_get_category(const struct task *t) { case task_subtype_density: case task_subtype_gradient: case task_subtype_force: + return task_category_hydro; + case task_subtype_gpu_pack: // A. Nasar case task_subtype_gpu_unpack: case task_subtype_gpu_pack_f: case task_subtype_gpu_unpack_f: case task_subtype_gpu_pack_g: case task_subtype_gpu_unpack_g: - return task_category_hydro; - + return task_category_gpu_pack; + case task_subtype_limiter: return task_category_limiter; diff --git a/src/task.h b/src/task.h index af5332641d..1115c6d0a7 100644 --- a/src/task.h +++ b/src/task.h @@ -202,6 +202,7 @@ enum task_categories { task_category_time_integration, task_category_mpi, task_category_pack, + task_category_gpu_pack, task_category_fof, task_category_others, task_category_neutrino, From dd041feef5c2013c38bec754940e9c3f15a04467 Mon Sep 17 00:00:00 2001 From: Matthieu Schaller Date: Tue, 5 Nov 2024 15:20:08 +0100 Subject: [PATCH 054/217] Do not update the total task time in signal_sleeping_runners --- src/scheduler.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/scheduler.c b/src/scheduler.c index 3dd29d7966..90d0274850 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -2951,8 +2951,6 @@ struct task *signal_sleeping_runners(struct scheduler *s, struct task *t) { /* Task definitely done, signal any sleeping runners. 
*/ if (!t->implicit) { - t->toc = getticks(); - t->total_ticks += t->toc - t->tic; pthread_mutex_lock(&s->sleep_mutex); atomic_dec(&s->waiting); pthread_cond_broadcast(&s->sleep_cond); From e506c7932757bb39b8ae9e1e138296b26937b472 Mon Sep 17 00:00:00 2001 From: Matthieu Schaller Date: Wed, 6 Nov 2024 10:23:17 +0100 Subject: [PATCH 055/217] Time the pack and unpack of density pair separately --- src/runner_doiact_functions_hydro_gpu.h | 23 ++++++++++++++++++++++- src/runner_main_clean.cu | 9 ++++++++- src/scheduler.c | 10 ++++++++++ src/task.c | 3 ++- src/task.h | 5 +++++ 5 files changed, 47 insertions(+), 3 deletions(-) diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index ca49e2aa45..0b9c908229 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -2680,6 +2680,7 @@ void runner_dopair1_launch_f4( *packing_time += (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; } /*End of GPU work*/ + void runner_dopair1_launch_f4_one_memcpy( struct runner *r, struct scheduler *s, struct pack_vars_pair *pack_vars, struct task *t, struct part_aos_f4_send *parts_send, @@ -2815,9 +2816,14 @@ void runner_dopair1_launch_f4_one_memcpy( clock_gettime(CLOCK_REALTIME, &t1); *gpu_time += (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + + /* Now copy the data back from the CPU thread-local buffers to the cells */ /* Pack length counter for use in unpacking */ + int pack_length_unpack = 0; + ticks total_cpu_unpack_ticks = 0; + for (int bid = 0; bid < nBundles_temp; bid++) { /*Time unpacking*/ clock_gettime(CLOCK_REALTIME, &t0); @@ -2829,9 +2835,11 @@ void runner_dopair1_launch_f4_one_memcpy( *gpu_time += (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + //////////// + /*Time unpacking*/ // clock_gettime(CLOCK_REALTIME, &tp0); - + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { if (tid < tasks_packed) { @@ -2850,11 +2858,18 @@ void runner_dopair1_launch_f4_one_memcpy( while (cell_locktree(cjj)) { ; /* spin until we acquire the lock */ } + + const ticks tic = getticks(); + /* Do the copy */ runner_do_ci_cj_gpu_unpack_neat_aos_f4( r, cii, cjj, parts_recv, 0, &pack_length_unpack, tid, 2 * pack_vars->count_max_parts, e); + const ticks toc = getticks(); + + total_cpu_unpack_ticks += toc - tic; + /* Record things for debugging */ cii->gpu_done_pair++; cjj->gpu_done_pair++; @@ -2885,6 +2900,12 @@ void runner_dopair1_launch_f4_one_memcpy( // clock_gettime(CLOCK_REALTIME, &t1); // *packing_time += (t1.tv_sec - t0.tv_sec) + // (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + + + /* Write the timers back to the task */ + t->total_cpu_unpack_ticks += total_cpu_unpack_ticks; + + } /*End of GPU work*/ void runner_dopair1_launch_f4_mcpy_Ker_mcpy( diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 8e112bf6c0..659c47c537 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -1260,17 +1260,24 @@ void *runner_main2(void *data) { } } else { #endif // DO_CORNERS - packing_time_pair += runner_dopair1_pack_f4( + + ticks tic_cpu_pack = getticks(); + + packing_time_pair += runner_dopair1_pack_f4( r, sched, pack_vars_pair_dens, ci, cj, t, parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens); + + /* Packed enough tasks or no pack tasks left in queue, flag that * we want to run */ int launch = pack_vars_pair_dens->launch; int launch_leftovers = pack_vars_pair_dens->launch_leftovers; + /* Do we have enough stuff to run the GPU ? 
*/ if (launch) n_full_p_d_bundles++; if (launch_leftovers) n_partial_p_d_bundles++; if (launch || launch_leftovers) { + /*Launch GPU tasks*/ // runner_dopair1_launch(r, sched, // pack_vars_pair_dens, ci, t, parts_aos_pair_dens, diff --git a/src/scheduler.c b/src/scheduler.c index 90d0274850..445cc97f19 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -1754,6 +1754,8 @@ struct task *scheduler_addtask(struct scheduler *s, enum task_types type, t->tic = 0; t->toc = 0; t->total_ticks = 0; + t->total_cpu_pack_ticks = 0; + t->total_cpu_unpack_ticks = 0; #ifdef SWIFT_DEBUG_CHECKS t->activated_by_unskip = 0; t->activated_by_marktask = 0; @@ -3410,6 +3412,14 @@ void scheduler_report_task_times_mapper(void *map_data, int num_elements, const float total_time = clocks_from_ticks(t->total_ticks); const enum task_categories cat = task_get_category(t); time_local[cat] += total_time; + + if(t->type == task_type_pair && t->subtype == task_subtype_pack) { + time_local[task_category_gpu_pack] += clock_from_ticks(t->total_cpu_pack_ticks); + time_local[task_category_gpu_unpack] += clock_from_ticks(t->total_cpu_unpack_ticks); + + time_local[task_category_gpu] -= clock_from_ticks(t->total_cpu_pack_ticks); + time_local[task_category_gpu] -= clock_from_ticks(t->total_cpu_unpack_ticks); + } } /* Update the global counters */ diff --git a/src/task.c b/src/task.c index 4716b35c9c..dad7bbe1be 100644 --- a/src/task.c +++ b/src/task.c @@ -177,7 +177,8 @@ const char *task_category_names[task_category_count] = { "hydro", "gravity", "feedback", "black holes", "cooling", "star formation", "limiter", "sync", "time integration", - "mpi", "pack", "gpu_pack", "fof", + "mpi", "pack", "gpu", + "gpu_pack", "gpu_unpack", "fof", "others", "neutrino", "sink", "RT", "CSDS"}; diff --git a/src/task.h b/src/task.h index 1115c6d0a7..9452f754a0 100644 --- a/src/task.h +++ b/src/task.h @@ -202,7 +202,9 @@ enum task_categories { task_category_time_integration, task_category_mpi, task_category_pack, + task_category_gpu, task_category_gpu_pack, + task_category_gpu_unpack, task_category_fof, task_category_others, task_category_neutrino, @@ -302,6 +304,9 @@ struct task { /*! 
Start and end time of this task */ ticks tic, toc; + ticks total_cpu_pack_ticks; + ticks total_cpu_unpack_ticks; + /* Total time spent running this task */ ticks total_ticks; From 3ccbaf0ce414332cddb1409ba566bbc7e32dba85 Mon Sep 17 00:00:00 2001 From: Matthieu Schaller Date: Wed, 6 Nov 2024 10:28:35 +0100 Subject: [PATCH 056/217] Fix typos --- src/scheduler.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/scheduler.c b/src/scheduler.c index 445cc97f19..ae2f941db3 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -3413,12 +3413,12 @@ void scheduler_report_task_times_mapper(void *map_data, int num_elements, const enum task_categories cat = task_get_category(t); time_local[cat] += total_time; - if(t->type == task_type_pair && t->subtype == task_subtype_pack) { - time_local[task_category_gpu_pack] += clock_from_ticks(t->total_cpu_pack_ticks); - time_local[task_category_gpu_unpack] += clock_from_ticks(t->total_cpu_unpack_ticks); + if(t->type == task_type_pair && t->subtype == task_subtype_gpu_pack) { + time_local[task_category_gpu_pack] += clocks_from_ticks(t->total_cpu_pack_ticks); + time_local[task_category_gpu_unpack] += clocks_from_ticks(t->total_cpu_unpack_ticks); - time_local[task_category_gpu] -= clock_from_ticks(t->total_cpu_pack_ticks); - time_local[task_category_gpu] -= clock_from_ticks(t->total_cpu_unpack_ticks); + time_local[task_category_gpu] -= clocks_from_ticks(t->total_cpu_pack_ticks); + time_local[task_category_gpu] -= clocks_from_ticks(t->total_cpu_unpack_ticks); } } From 707ad45508c742b34446debc6667be21b54c5b23 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Wed, 6 Nov 2024 10:04:13 +0000 Subject: [PATCH 057/217] Added timers for packing all GPU task types --- src/runner_main_clean.cu | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 659c47c537..d9d61aa4fb 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -1035,9 +1035,14 @@ void *runner_main2(void *data) { #ifdef GPUOFFLOAD_DENSITY // struct timespec t0, t1; // // clock_gettime(CLOCK_REALTIME, &t0); + ticks tic_cpu_pack = getticks(); + packing_time += runner_doself1_pack_f4(r, sched, pack_vars_self_dens, ci, t, parts_aos_f4_send, task_first_part_f4); + + t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; + // clock_gettime(CLOCK_REALTIME, &t1); // packing_time += (t1.tv_sec - t0.tv_sec) + // (t1.tv_nsec - t0.tv_nsec) / @@ -1072,9 +1077,14 @@ void *runner_main2(void *data) { #ifdef GPUOFFLOAD_GRADIENT // runner_doself1_pack_g(r, sched, pack_vars_self_grad, ci, // t, parts_aos_grad, &packing_time_g); + ticks tic_cpu_pack = getticks(); + packing_time_g += runner_doself1_pack_f4_g( r, sched, pack_vars_self_grad, ci, t, parts_aos_grad_f4_send, task_first_part_f4_g); + + t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; + /* No pack tasks left in queue, flag that we want to run */ int launch_leftovers = pack_vars_self_grad->launch_leftovers; /*Packed enough tasks let's go*/ @@ -1099,9 +1109,14 @@ void *runner_main2(void *data) { #ifdef GPUOFFLOAD_FORCE // runner_doself1_pack_f(r, sched, pack_vars_self_forc, ci, // t, parts_aos_forc, &packing_time_f); + ticks tic_cpu_pack = getticks(); + packing_time_f += runner_doself1_pack_f4_f( r, sched, pack_vars_self_forc, ci, t, parts_aos_forc_f4_send, task_first_part_f4_f); + + t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; + // int count = ci->hydro.count; // for(int i = 0; i < count; i++){ // int pid = 
pack_vars_self_forc->count_parts - count + @@ -1261,13 +1276,13 @@ void *runner_main2(void *data) { } else { #endif // DO_CORNERS - ticks tic_cpu_pack = getticks(); + ticks tic_cpu_pack = getticks(); - packing_time_pair += runner_dopair1_pack_f4( + packing_time_pair += runner_dopair1_pack_f4( r, sched, pack_vars_pair_dens, ci, cj, t, parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens); - + t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; /* Packed enough tasks or no pack tasks left in queue, flag that * we want to run */ int launch = pack_vars_pair_dens->launch; @@ -1348,10 +1363,15 @@ void *runner_main2(void *data) { // ci, // cj, t, parts_aos_pair_grad, e, // &packing_time_g); + ticks tic_cpu_pack = getticks(); + packing_time_pair_g += runner_dopair1_pack_f4_g(r, sched, pack_vars_pair_grad, ci, cj, t, parts_aos_pair_f4_g_send, e, fparti_fpartj_lparti_lpartj_grad); + + t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; + /* No pack tasks left in queue, flag that we want to run */ int launch_leftovers = pack_vars_pair_grad->launch_leftovers; /*Packed enough tasks let's go*/ @@ -1425,10 +1445,15 @@ void *runner_main2(void *data) { // runner_dopair1_pack_f(r, sched, pack_vars_pair_forc, // ci, cj, t, parts_aos_pair_forc, e, // &packing_time_f); + ticks tic_cpu_pack = getticks(); + packing_time_pair_f += runner_dopair1_pack_f4_f(r, sched, pack_vars_pair_forc, ci, cj, t, parts_aos_pair_f4_f_send, e, fparti_fpartj_lparti_lpartj_forc); + + t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; + /* No pack tasks left in queue, flag that we want to run */ int launch_leftovers = pack_vars_pair_forc->launch_leftovers; /*Packed enough tasks let's go*/ From 425b0744471e0a87e5ae10bc3030fa2ba54f6350 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Wed, 6 Nov 2024 10:21:47 +0000 Subject: [PATCH 058/217] Added timers for unpacking all GPU task types --- src/runner_doiact_functions_hydro_gpu.h | 46 +++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index 0b9c908229..a58358f939 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -1407,6 +1407,7 @@ void runner_doself1_launch_f4( /* Now copy the data back from the CPU thread-local buffers to the cells */ /* Pack length counter for use in unpacking */ int pack_length_unpack = 0; + ticks total_cpu_unpack_ticks = 0.; for (int bid = 0; bid < nBundles_temp; bid++) { clock_gettime(CLOCK_REALTIME, &t0); @@ -1440,11 +1441,14 @@ void runner_doself1_launch_f4( // *hmemcpy_time += (t1hmemcpy.tv_sec - //t0hmemcpy.tv_sec) + (t1hmemcpy.tv_nsec - t0hmemcpy.tv_nsec) / //1000000000.0; + const ticks tic = getticks(); /* Do the copy */ runner_doself1_gpu_unpack_neat_aos_f4(r, cii, parts_recv, 0, &pack_length_unpack, tid, pack_vars->count_max_parts, e); + const ticks toc = getticks(); + total_cpu_unpack_ticks += toc - tic; /* Record things for debugging */ cii->gpu_done++; /*Time end of unpacking*/ @@ -1474,6 +1478,8 @@ void runner_doself1_launch_f4( pack_vars->count_parts = 0; pack_vars->tasks_packed = 0; + t->total_cpu_unpack_ticks += total_cpu_unpack_ticks; + } /*End of GPU work Self*/ void runner_doself1_launch_g(struct runner *r, struct scheduler *s, @@ -1802,6 +1808,7 @@ void runner_doself1_launch_f4_g( /* Now copy the data back from the CPU thread-local buffers to the cells */ /* Pack length counter for use in unpacking */ int pack_length_unpack = 0; + ticks total_cpu_unpack_ticks = 0.; for (int bid = 0; bid 
< nBundles_temp; bid++) { clock_gettime(CLOCK_REALTIME, &t0); @@ -1831,10 +1838,15 @@ void runner_doself1_launch_f4_g( } /*Time unpacking*/ clock_gettime(CLOCK_REALTIME, &tp0); + const ticks tic = getticks(); + /* Do the copy */ runner_doself1_gpu_unpack_neat_aos_f4_g(r, cii, parts_recv, 0, &pack_length_unpack, tid, pack_vars->count_max_parts, e); + const ticks toc = getticks(); + + total_cpu_unpack_ticks += toc - tic; /*Time end of unpacking*/ clock_gettime(CLOCK_REALTIME, &tp1); *unpack_time += (tp1.tv_sec - tp0.tv_sec) + @@ -1865,6 +1877,8 @@ void runner_doself1_launch_f4_g( pack_vars->count_parts = 0; pack_vars->tasks_packed = 0; + t->total_cpu_unpack_ticks += total_cpu_unpack_ticks; + } /*End of GPU work Self Gradient*/ void runner_doself1_launch_f(struct runner *r, struct scheduler *s, @@ -2200,6 +2214,7 @@ void runner_doself1_launch_f4_f( /* Now copy the data back from the CPU thread-local buffers to the cells */ /* Pack length counter for use in unpacking */ int pack_length_unpack = 0; + ticks total_cpu_unpack_ticks = 0.; for (int bid = 0; bid < nBundles_temp; bid++) { clock_gettime(CLOCK_REALTIME, &t0); @@ -2227,11 +2242,15 @@ void runner_doself1_launch_f4_f( ; /* spin until we acquire the lock */ } clock_gettime(CLOCK_REALTIME, &tp0); + const ticks tic = getticks(); + /* Do the copy */ runner_doself1_gpu_unpack_neat_aos_f4_f(r, cii, parts_recv, 0, &pack_length_unpack, tid, pack_vars->count_max_parts, e); + const ticks toc = getticks(); + total_cpu_unpack_ticks += toc - tic; /* Record things for debugging */ cii->gpu_done_f++; clock_gettime(CLOCK_REALTIME, &tp1); @@ -2258,6 +2277,8 @@ void runner_doself1_launch_f4_f( /* Zero counters for the next pack operations */ pack_vars->count_parts = 0; pack_vars->tasks_packed = 0; + + t->total_cpu_unpack_ticks += total_cpu_unpack_ticks; } /*End of GPU work Self Gradient*/ void runner_dopair1_launch(struct runner *r, struct scheduler *s, @@ -3551,6 +3572,9 @@ void runner_dopair1_launch_f4_g_one_memcpy( /* Now copy the data back from the CPU thread-local buffers to the cells */ /* Pack length counter for use in unpacking */ int pack_length_unpack = 0; + + ticks total_cpu_unpack_ticks = 0.; + for (int bid = 0; bid < nBundles_temp; bid++) { /*Time unpacking*/ clock_gettime(CLOCK_REALTIME, &t0); @@ -3582,11 +3606,18 @@ void runner_dopair1_launch_f4_g_one_memcpy( while (cell_locktree(cjj)) { ; /* spin until we acquire the lock */ } + + const ticks tic = getticks(); + /* Do the copy */ runner_do_ci_cj_gpu_unpack_neat_aos_f4_g( r, cii, cjj, parts_recv, 0, &pack_length_unpack, tid, 2 * pack_vars->count_max_parts, e); + const ticks toc = getticks(); + + total_cpu_unpack_ticks += toc - tic; + /* Record things for debugging */ cii->gpu_done_pair_g++; cjj->gpu_done_pair_g++; @@ -3613,6 +3644,9 @@ void runner_dopair1_launch_f4_g_one_memcpy( /* Zero counters for the next pack operations */ pack_vars->count_parts = 0; pack_vars->tasks_packed = 0; + + /* Write the timers back to the task */ + t->total_cpu_unpack_ticks += total_cpu_unpack_ticks; // /*Time end of unpacking*/ // clock_gettime(CLOCK_REALTIME, &t1); // *packing_time += (t1.tv_sec - t0.tv_sec) + @@ -4291,6 +4325,7 @@ void runner_dopair1_launch_f4_f_one_memcpy( /* Now copy the data back from the CPU thread-local buffers to the cells */ /* Pack length counter for use in unpacking */ int pack_length_unpack = 0; + ticks total_cpu_unpack_ticks = 0.; for (int bid = 0; bid < nBundles_temp; bid++) { /*Time unpacking*/ clock_gettime(CLOCK_REALTIME, &t0); @@ -4322,11 +4357,19 @@ void 
runner_dopair1_launch_f4_f_one_memcpy( while (cell_locktree(cjj)) { ; /* spin until we acquire the lock */ } + + const ticks tic = getticks(); + /* Do the copy */ runner_do_ci_cj_gpu_unpack_neat_aos_f4_f( r, cii, cjj, parts_recv, 0, &pack_length_unpack, tid, 2 * pack_vars->count_max_parts, e); + + const ticks toc = getticks(); + + total_cpu_unpack_ticks += toc - tic; + /* Record things for debugging */ cii->gpu_done_pair_f++; cjj->gpu_done_pair_f++; @@ -4353,6 +4396,9 @@ void runner_dopair1_launch_f4_f_one_memcpy( /* Zero counters for the next pack operations */ pack_vars->count_parts = 0; pack_vars->tasks_packed = 0; + + /* Write the timers back to the task */ + t->total_cpu_unpack_ticks += total_cpu_unpack_ticks; // /*Time end of unpacking*/ // clock_gettime(CLOCK_REALTIME, &t1); // *packing_time += (t1.tv_sec - t0.tv_sec) + From 4634905f8cbd3e8153328b3ccbd48dd3cc6accdc Mon Sep 17 00:00:00 2001 From: Abouzied Date: Wed, 6 Nov 2024 11:17:02 +0000 Subject: [PATCH 059/217] modified scheduler_report_task_times_mapper() to account for different pack and unpack tasks d/f/g. There is a bug in runner_gpu_pack_functions.c when enabling timers --- src/scheduler.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/scheduler.c b/src/scheduler.c index ae2f941db3..954d25afca 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -3413,12 +3413,13 @@ void scheduler_report_task_times_mapper(void *map_data, int num_elements, const enum task_categories cat = task_get_category(t); time_local[cat] += total_time; - if(t->type == task_type_pair && t->subtype == task_subtype_gpu_pack) { + if(t->subtype == task_subtype_gpu_pack || t->subtype == task_subtype_gpu_pack_f || t->subtype == task_subtype_gpu_pack_g) { time_local[task_category_gpu_pack] += clocks_from_ticks(t->total_cpu_pack_ticks); - time_local[task_category_gpu_unpack] += clocks_from_ticks(t->total_cpu_unpack_ticks); - time_local[task_category_gpu] -= clocks_from_ticks(t->total_cpu_pack_ticks); + } + if(t->subtype == task_subtype_gpu_unpack || t->subtype == task_subtype_gpu_unpack_f || t->subtype == task_subtype_gpu_unpack_g) { time_local[task_category_gpu] -= clocks_from_ticks(t->total_cpu_unpack_ticks); + time_local[task_category_gpu_unpack] += clocks_from_ticks(t->total_cpu_unpack_ticks); } } From 7b8baa6d454cc71b23309dc1f52c039f28a930f3 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Wed, 6 Nov 2024 11:21:12 +0000 Subject: [PATCH 060/217] Reverted back to one if statement only for pack tasks as unpack tasks do nothing --- src/scheduler.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/scheduler.c b/src/scheduler.c index 954d25afca..711aebf554 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -3416,8 +3416,6 @@ void scheduler_report_task_times_mapper(void *map_data, int num_elements, if(t->subtype == task_subtype_gpu_pack || t->subtype == task_subtype_gpu_pack_f || t->subtype == task_subtype_gpu_pack_g) { time_local[task_category_gpu_pack] += clocks_from_ticks(t->total_cpu_pack_ticks); time_local[task_category_gpu] -= clocks_from_ticks(t->total_cpu_pack_ticks); - } - if(t->subtype == task_subtype_gpu_unpack || t->subtype == task_subtype_gpu_unpack_f || t->subtype == task_subtype_gpu_unpack_g) { time_local[task_category_gpu] -= clocks_from_ticks(t->total_cpu_unpack_ticks); time_local[task_category_gpu_unpack] += clocks_from_ticks(t->total_cpu_unpack_ticks); } From 23cf18188cb33b46307cf1aedc0b3be28916bf6f Mon Sep 17 00:00:00 2001 From: Abouzied Date: Wed, 6 Nov 2024 11:30:41 +0000 Subject: [PATCH 061/217] made pack 
task types set to task_category_gpu --- src/task.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/task.c b/src/task.c index dad7bbe1be..b56e3bdbf9 100644 --- a/src/task.c +++ b/src/task.c @@ -1879,7 +1879,7 @@ enum task_categories task_get_category(const struct task *t) { case task_subtype_gpu_unpack_f: case task_subtype_gpu_pack_g: case task_subtype_gpu_unpack_g: - return task_category_gpu_pack; + return task_category_gpu; case task_subtype_limiter: return task_category_limiter; From d7f980f390ea420e68db0be59bf3717d5550d3f9 Mon Sep 17 00:00:00 2001 From: Matthieu Schaller Date: Wed, 6 Nov 2024 16:39:01 +0100 Subject: [PATCH 062/217] First attempt at re-instating task stealing --- src/scheduler.c | 83 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 62 insertions(+), 21 deletions(-) diff --git a/src/scheduler.c b/src/scheduler.c index 711aebf554..62abe1e3a1 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -3119,29 +3119,70 @@ struct task *scheduler_gettask(struct scheduler *s, int qid, TIMER_TOC(timer_qget); if (res != NULL) break; } - + /* If unsuccessful, try stealing from the other queues. A. Nasar commented * out for GPU work*/ - // if (s->flags & scheduler_flag_steal) { - // int count = 0, qids[nr_queues]; - // for (int k = 0; k < nr_queues; k++) - // if (s->queues[k].count > 0 || s->queues[k].count_incoming > 0) - // { - // qids[count++] = k; - // } - // for (int k = 0; k < scheduler_maxsteal && count > 0; k++) { - // const int ind = rand_r(&seed) % count; - // TIMER_TIC - // res = queue_gettask(&s->queues[qids[ind]], prev, 0); - // TIMER_TOC(timer_qsteal); - // if (res != NULL) { - // break; - // } else { - // qids[ind] = qids[--count]; - // } - // } - // if (res != NULL) break; - // } + if (s->flags & scheduler_flag_steal) { + + + int count = 0, qids[nr_queues]; + + + /* Make list of queues that have 1 or more tasks in them */ + for (int k = 0; k < nr_queues; k++) { + if (s->queues[k].count > 0 || s->queues[k].count_incoming > 0) { + qids[count++] = k; + } + } + + for (int k = 0; k < scheduler_maxsteal && count > 0; k++) { + + /* Pick a queue at random among the non-empty ones */ + const int ind = rand_r(&seed) % count; + + /* Try to get a task from that random queue */ + TIMER_TIC; + res = queue_gettask(&s->queues[qids[ind]], prev, 0); + TIMER_TOC(timer_qsteal); + + /* Lucky? 
*/ + if (res != NULL) { + + if (res->type == task_type_self && res->subtype == task_subtype_gpu_pack) { + atomic_inc(&s->queues[qid].n_packs_self_left); + atomic_dec(&s->queues[qids[ind]].n_packs_self_left); + } + if (res->type == task_type_self && res->subtype == task_subtype_gpu_pack_g) { + atomic_inc(&s->queues[qid].n_packs_self_left_g); + atomic_dec(&s->queues[qids[ind]].n_packs_self_left_g); + } + if (res->type == task_type_self && res->subtype == task_subtype_gpu_pack_f) { + atomic_inc(&s->queues[qid].n_packs_self_left_f); + atomic_dec(&s->queues[qids[ind]].n_packs_self_left_f); + } + if (res->type == task_type_pair && res->subtype == task_subtype_gpu_pack) { + atomic_inc(&s->queues[qid].n_packs_pair_left); + atomic_dec(&s->queues[qids[ind]].n_packs_pair_left); + } + if (res->type == task_type_pair && res->subtype == task_subtype_gpu_pack_g) { + atomic_inc(&s->queues[qid].n_packs_pair_left_g); + atomic_dec(&s->queues[qids[ind]].n_packs_pair_left_g); + } + if (res->type == task_type_pair && res->subtype == task_subtype_gpu_pack_f) { + atomic_inc(&s->queues[qid].n_packs_pair_left_f); + atomic_dec(&s->queues[qids[ind]].n_packs_pair_left_f); + } + + /* Run with the task */ + break; + } else { + + /* Reduce the size of the list of non-empty queues */ + qids[ind] = qids[--count]; + } + } + if (res != NULL) break; + } } /* If we failed, take a short nap. */ From 5b67cd46de0040135328b93e03def591b6287130 Mon Sep 17 00:00:00 2001 From: Matthieu Schaller Date: Wed, 6 Nov 2024 17:29:32 +0100 Subject: [PATCH 063/217] Put varaible back in the code --- src/scheduler.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scheduler.c b/src/scheduler.c index 62abe1e3a1..3df05ff095 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -3102,7 +3102,7 @@ struct task *scheduler_gettask(struct scheduler *s, int qid, const struct task *prev) { struct task *res = NULL; const int nr_queues = s->nr_queues; - // unsigned int seed = qid; + unsigned int seed = qid; /* Check qid. 
*/ if (qid >= nr_queues || qid < 0) error("Bad queue ID."); From 51c788a37945000e7b1ecbd24215aca7736883ed Mon Sep 17 00:00:00 2001 From: Abouzied Date: Thu, 7 Nov 2024 10:57:32 +0000 Subject: [PATCH 064/217] put in fix for swift_task_debug in runner_main_clean.cu --- src/runner_main_clean.cu | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index d9d61aa4fb..f15777ea0c 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -994,7 +994,9 @@ void *runner_main2(void *data) { struct cell *ci_temp = ci; struct cell *cj_temp = cj; double shift[3]; - t->sid = space_getsid_and_swap_cells(e->s, &ci_temp, &cj_temp, shift); + if(t->subtype != task_subtype_gpu_unpack && t->subtype != task_subtype_gpu_unpack_g + && t->subtype != task_subtype_gpu_unpack_f && ) + t->sid = space_getsid_and_swap_cells(e->s, &ci_temp, &cj_temp, shift); } else { t->sid = -1; } From ddc843bac4886c2d1c903f512bbcddc9f12bfb1b Mon Sep 17 00:00:00 2001 From: Matthieu Schaller Date: Fri, 8 Nov 2024 09:46:30 +0100 Subject: [PATCH 065/217] Fix code to allow running with SWIFT_DEBUG_TASKS --- src/runner_main_clean.cu | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index f15777ea0c..568567f163 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -994,9 +994,10 @@ void *runner_main2(void *data) { struct cell *ci_temp = ci; struct cell *cj_temp = cj; double shift[3]; - if(t->subtype != task_subtype_gpu_unpack && t->subtype != task_subtype_gpu_unpack_g - && t->subtype != task_subtype_gpu_unpack_f && ) - t->sid = space_getsid_and_swap_cells(e->s, &ci_temp, &cj_temp, shift); + if(t->subtype != task_subtype_gpu_unpack && + t->subtype != task_subtype_gpu_unpack_g && + t->subtype != task_subtype_gpu_unpack_f) + t->sid = space_getsid_and_swap_cells(e->s, &ci_temp, &cj_temp, shift); } else { t->sid = -1; } From 7eab151f11be0d5c6fe6cf4df1a36860ec4e0d78 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Fri, 8 Nov 2024 10:55:12 +0000 Subject: [PATCH 066/217] Commented out task splitting for hydro and added fix for cell-less tasks in split_tasks for gravity --- src/engine_maketasks.c | 2 +- src/scheduler.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/engine_maketasks.c b/src/engine_maketasks.c index 577827a266..0ff182bb8d 100644 --- a/src/engine_maketasks.c +++ b/src/engine_maketasks.c @@ -2276,7 +2276,7 @@ void engine_link_gravity_tasks(struct engine *e) { /* Get a pointer to the task. 
*/ struct task *t = &sched->tasks[k]; - if (t->type == task_type_none) continue; + if (t->type == task_type_none || t->ci == NULL) continue; /* Get the cells we act on */ struct cell *ci = t->ci; diff --git a/src/scheduler.c b/src/scheduler.c index 3df05ff095..309594077d 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -1652,7 +1652,7 @@ void scheduler_splittasks_mapper(void *map_data, int num_elements, /* Invoke the correct splitting strategy */ if (t->subtype == task_subtype_density) { - scheduler_splittask_hydro(t, s); +// scheduler_splittask_hydro(t, s); } else if (t->subtype == task_subtype_external_grav) { scheduler_splittask_gravity(t, s); } else if (t->subtype == task_subtype_grav) { From 55c50b283e6d60b25eb41a7451765b8bda524c0f Mon Sep 17 00:00:00 2001 From: Matthieu Schaller Date: Fri, 8 Nov 2024 11:45:04 +0100 Subject: [PATCH 067/217] Call signal_sleeping_runners() in runner_main and not inside the pack task --- src/runner_doiact_functions_hydro_gpu.h | 38 ++++++++++++------------- src/runner_main_clean.cu | 3 ++ 2 files changed, 22 insertions(+), 19 deletions(-) diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index a58358f939..cd402edd8f 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -215,7 +215,7 @@ double runner_doself1_pack_f4(struct runner *r, struct scheduler *s, task_unlock(t); t->gpu_done = 1; // cell_unlocktree(ci); - // signal_sleeping_runners(s, t); + // // MATTHIEU signal_sleeping_runners(s, t); return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; } @@ -336,7 +336,7 @@ double runner_doself1_pack_f4_g(struct runner *r, struct scheduler *s, /* Release the lock on the cell */ // task_unlock(t); cell_unlocktree(ci); - // signal_sleeping_runners(s, t); + // // MATTHIEU signal_sleeping_runners(s, t); return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; } @@ -457,7 +457,7 @@ double runner_doself1_pack_f4_f(struct runner *r, struct scheduler *s, /* Release the lock on the cell */ // task_unlock(t); cell_unlocktree(ci); - // signal_sleeping_runners(s, t); + // // MATTHIEU signal_sleeping_runners(s, t); return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; } @@ -1225,7 +1225,7 @@ void runner_doself1_launch(struct runner *r, struct scheduler *s, /*schedule my dependencies (Only unpacks really)*/ enqueue_dependencies(s, tii); /*Signal sleeping runners*/ - signal_sleeping_runners(s, tii); + // MATTHIEU signal_sleeping_runners(s, tii); tii->gpu_done = 1; } @@ -1462,7 +1462,7 @@ void runner_doself1_launch_f4( /*schedule my dependencies (Only unpacks really)*/ enqueue_dependencies(s, tii); /*Signal sleeping runners*/ - signal_sleeping_runners(s, tii); + // MATTHIEU signal_sleeping_runners(s, tii); tii->gpu_done = 1; } @@ -1653,7 +1653,7 @@ void runner_doself1_launch_g(struct runner *r, struct scheduler *s, /*schedule my dependencies (Only unpacks really)*/ enqueue_dependencies(s, tii); /*Signal sleeping runners*/ - signal_sleeping_runners(s, tii); + // MATTHIEU signal_sleeping_runners(s, tii); tii->gpu_done = 1; } @@ -1861,7 +1861,7 @@ void runner_doself1_launch_f4_g( /*schedule my dependencies (Only unpacks really)*/ enqueue_dependencies(s, tii); /*Signal sleeping runners*/ - signal_sleeping_runners(s, tii); + // MATTHIEU signal_sleeping_runners(s, tii); tii->gpu_done = 1; } @@ -2051,7 +2051,7 @@ void runner_doself1_launch_f(struct runner *r, struct scheduler *s, /*schedule my dependencies (Only unpacks really)*/ 
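/* Context for the change below (a sketch added for readability, not part of the
 * original diff): the wake-up that these hunks comment out of the GPU launch
 * functions is the body of signal_sleeping_runners() shown in the scheduler.c
 * hunks of this series, essentially
 *
 *   if (!t->implicit) {
 *     pthread_mutex_lock(&s->sleep_mutex);
 *     atomic_dec(&s->waiting);
 *     pthread_cond_broadcast(&s->sleep_cond);
 *     pthread_mutex_unlock(&s->sleep_mutex);
 *   }
 *
 * The matching runner_main_clean.cu hunks of this patch issue that signal from
 * runner_main2() instead, and a later patch in the series batches it into a
 * single atomic_sub(&s->waiting, tasks_packed) per pack. */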
enqueue_dependencies(s, tii); /*Signal sleeping runners*/ - signal_sleeping_runners(s, tii); + // MATTHIEU signal_sleeping_runners(s, tii); tii->gpu_done = 1; } @@ -2263,7 +2263,7 @@ void runner_doself1_launch_f4_f( /*schedule my dependencies (Only unpacks really)*/ enqueue_dependencies(s, tii); /*Signal sleeping runners*/ - signal_sleeping_runners(s, tii); + // MATTHIEU signal_sleeping_runners(s, tii); tii->gpu_done = 1; } @@ -2478,7 +2478,7 @@ void runner_dopair1_launch(struct runner *r, struct scheduler *s, /*schedule my dependencies (Only unpacks really)*/ enqueue_dependencies(s, tii); /*Signal sleeping runners*/ - signal_sleeping_runners(s, tii); + // MATTHIEU signal_sleeping_runners(s, tii); /* Release the locks */ cell_unlocktree(cii); @@ -2686,7 +2686,7 @@ void runner_dopair1_launch_f4( /*schedule my dependencies (Only unpacks really)*/ enqueue_dependencies(s, tii); /*Signal sleeping runners*/ - signal_sleeping_runners(s, tii); + // MATTHIEU signal_sleeping_runners(s, tii); /* Release the locks */ cell_unlocktree(cii); @@ -2908,7 +2908,7 @@ void runner_dopair1_launch_f4_one_memcpy( /*schedule my dependencies (Only unpacks really)*/ enqueue_dependencies(s, tii); /*Signal sleeping runners*/ - signal_sleeping_runners(s, tii); + // MATTHIEU signal_sleeping_runners(s, tii); tii->gpu_done = 1; } @@ -3180,7 +3180,7 @@ void runner_dopair1_launch_f4_mcpy_Ker_mcpy( /*schedule my dependencies (Only unpacks really)*/ enqueue_dependencies(s, tii); /*Signal sleeping runners*/ - signal_sleeping_runners(s, tii); + // MATTHIEU signal_sleeping_runners(s, tii); /* Release the locks */ cell_unlocktree(cii); @@ -3407,7 +3407,7 @@ void runner_dopair1_launch_g(struct runner *r, struct scheduler *s, /*schedule my dependencies (Only unpacks really)*/ enqueue_dependencies(s, tii); /*Signal sleeping runners*/ - signal_sleeping_runners(s, tii); + // MATTHIEU signal_sleeping_runners(s, tii); /* Release the locks */ cell_unlocktree(cii); @@ -3635,7 +3635,7 @@ void runner_dopair1_launch_f4_g_one_memcpy( /*schedule my dependencies (Only unpacks really)*/ enqueue_dependencies(s, tii); /*Signal sleeping runners*/ - signal_sleeping_runners(s, tii); + // MATTHIEU signal_sleeping_runners(s, tii); tii->gpu_done = 1; } @@ -3917,7 +3917,7 @@ void runner_dopair1_launch_f4_g_mcpy_Ker_mcpy( /*schedule my dependencies (Only unpacks really)*/ enqueue_dependencies(s, tii); /*Signal sleeping runners*/ - signal_sleeping_runners(s, tii); + // MATTHIEU signal_sleeping_runners(s, tii); /* Release the locks */ cell_unlocktree(cii); @@ -4147,7 +4147,7 @@ void runner_dopair1_launch_f(struct runner *r, struct scheduler *s, /*schedule my dependencies (Only unpacks really)*/ enqueue_dependencies(s, tii); /*Signal sleeping runners*/ - signal_sleeping_runners(s, tii); + // MATTHIEU signal_sleeping_runners(s, tii); /* Release the locks */ cell_unlocktree(cii); @@ -4387,7 +4387,7 @@ void runner_dopair1_launch_f4_f_one_memcpy( /*schedule my dependencies (Only unpacks really)*/ enqueue_dependencies(s, tii); /*Signal sleeping runners*/ - signal_sleeping_runners(s, tii); + // MATTHIEU signal_sleeping_runners(s, tii); tii->gpu_done = 1; } @@ -4658,7 +4658,7 @@ void runner_dopair1_launch_f4_f_mcpy_Ker_mcpy( /*schedule my dependencies (Only unpacks really)*/ enqueue_dependencies(s, tii); /*Signal sleeping runners*/ - signal_sleeping_runners(s, tii); + // MATTHIEU signal_sleeping_runners(s, tii); /* Release the locks */ cell_unlocktree(cii); diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 568567f163..8a516ce441 100644 --- 
a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -1902,6 +1902,7 @@ void *runner_main2(void *data) { t->skip = 1; t->toc = getticks(); t->total_ticks += t->toc - t->tic; + signal_sleeping_runners(s, t); t = NULL; #else t = scheduler_done(sched, t); @@ -1914,6 +1915,7 @@ void *runner_main2(void *data) { t->skip = 1; t->toc = getticks(); t->total_ticks += t->toc - t->tic; + signal_sleeping_runners(s, t); t = NULL; #else t = scheduler_done(sched, t); @@ -1926,6 +1928,7 @@ void *runner_main2(void *data) { t->skip = 1; t->toc = getticks(); t->total_ticks += t->toc - t->tic; + signal_sleeping_runners(s, t); t = NULL; #else t = scheduler_done(sched, t); From 04471487eb9b9d31bdccb4c738ce25e63cfe101e Mon Sep 17 00:00:00 2001 From: Matthieu Schaller Date: Fri, 8 Nov 2024 11:46:54 +0100 Subject: [PATCH 068/217] Fix typo --- src/runner_main_clean.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 8a516ce441..62213e80ff 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -1902,7 +1902,7 @@ void *runner_main2(void *data) { t->skip = 1; t->toc = getticks(); t->total_ticks += t->toc - t->tic; - signal_sleeping_runners(s, t); + signal_sleeping_runners(sched, t); t = NULL; #else t = scheduler_done(sched, t); @@ -1915,7 +1915,7 @@ void *runner_main2(void *data) { t->skip = 1; t->toc = getticks(); t->total_ticks += t->toc - t->tic; - signal_sleeping_runners(s, t); + signal_sleeping_runners(sched, t); t = NULL; #else t = scheduler_done(sched, t); @@ -1928,7 +1928,7 @@ void *runner_main2(void *data) { t->skip = 1; t->toc = getticks(); t->total_ticks += t->toc - t->tic; - signal_sleeping_runners(s, t); + signal_sleeping_runners(sched, t); t = NULL; #else t = scheduler_done(sched, t); From 037dbd5520d9e1d70d5ac11d960b32ea945d8a08 Mon Sep 17 00:00:00 2001 From: Matthieu Schaller Date: Fri, 8 Nov 2024 11:50:25 +0100 Subject: [PATCH 069/217] Applied formatting script --- src/engine_maketasks.c | 11 +--- src/scheduler.c | 140 ++++++++++++++++++++++------------------- src/task.c | 16 ++--- src/task.h | 2 +- 4 files changed, 86 insertions(+), 83 deletions(-) diff --git a/src/engine_maketasks.c b/src/engine_maketasks.c index 0ff182bb8d..2e87b35645 100644 --- a/src/engine_maketasks.c +++ b/src/engine_maketasks.c @@ -716,7 +716,8 @@ void engine_addtasks_recv_hydro( scheduler_addunlock(s, t_xv, l->t); scheduler_addunlock(s, l->t, t_rho); } -#ifdef WITH_CUDA /* A. Nasar POSSIBLE BUG HERE (More like PROBABLE) NOT REQUIRED Ghost in for cell j is*/ +#ifdef WITH_CUDA /* A. 
Nasar POSSIBLE BUG HERE (More like PROBABLE) NOT \ + REQUIRED Ghost in for cell j is*/ for (struct link *l = c->hydro.density_pack; l != NULL; l = l->next) { scheduler_addunlock(s, t_xv, l->t); scheduler_addunlock(s, l->t, t_rho); @@ -725,12 +726,8 @@ void engine_addtasks_recv_hydro( scheduler_addunlock(s, l->t, t_rho); } - #endif - - - #ifdef EXTRA_HYDRO_LOOP for (struct link *l = c->hydro.gradient; l != NULL; l = l->next) { scheduler_addunlock(s, t_rho, l->t); @@ -749,7 +746,6 @@ void engine_addtasks_recv_hydro( scheduler_addunlock(s, l->t, t_gradient); } - for (struct link *l = c->hydro.force_pack; l != NULL; l = l->next) { scheduler_addunlock(s, t_gradient, l->t); scheduler_addunlock(s, l->t, tend); @@ -758,7 +754,6 @@ void engine_addtasks_recv_hydro( scheduler_addunlock(s, l->t, tend); } - #endif /*WITH_CUDA*/ #else /*EXTRA_HYDRO_LOOP*/ for (struct link *l = c->hydro.force; l != NULL; l = l->next) { @@ -774,8 +769,6 @@ void engine_addtasks_recv_hydro( #endif /*WITH_CUDA*/ #endif /*EXTRA_HYDRO_LOOP*/ - - if (with_limiter) { for (struct link *l = c->hydro.limiter; l != NULL; l = l->next) { scheduler_addunlock(s, t_unpack_limiter, l->t); diff --git a/src/scheduler.c b/src/scheduler.c index 309594077d..c7459c3699 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -1652,7 +1652,7 @@ void scheduler_splittasks_mapper(void *map_data, int num_elements, /* Invoke the correct splitting strategy */ if (t->subtype == task_subtype_density) { -// scheduler_splittask_hydro(t, s); + // scheduler_splittask_hydro(t, s); } else if (t->subtype == task_subtype_external_grav) { scheduler_splittask_gravity(t, s); } else if (t->subtype == task_subtype_grav) { @@ -3119,69 +3119,73 @@ struct task *scheduler_gettask(struct scheduler *s, int qid, TIMER_TOC(timer_qget); if (res != NULL) break; } - + /* If unsuccessful, try stealing from the other queues. A. Nasar commented * out for GPU work*/ if (s->flags & scheduler_flag_steal) { - - int count = 0, qids[nr_queues]; - - - /* Make list of queues that have 1 or more tasks in them */ - for (int k = 0; k < nr_queues; k++) { - if (s->queues[k].count > 0 || s->queues[k].count_incoming > 0) { - qids[count++] = k; - } - } - - for (int k = 0; k < scheduler_maxsteal && count > 0; k++) { - - /* Pick a queue at random among the non-empty ones */ - const int ind = rand_r(&seed) % count; - - /* Try to get a task from that random queue */ - TIMER_TIC; - res = queue_gettask(&s->queues[qids[ind]], prev, 0); - TIMER_TOC(timer_qsteal); - - /* Lucky? 
*/ - if (res != NULL) { - - if (res->type == task_type_self && res->subtype == task_subtype_gpu_pack) { - atomic_inc(&s->queues[qid].n_packs_self_left); - atomic_dec(&s->queues[qids[ind]].n_packs_self_left); - } - if (res->type == task_type_self && res->subtype == task_subtype_gpu_pack_g) { - atomic_inc(&s->queues[qid].n_packs_self_left_g); - atomic_dec(&s->queues[qids[ind]].n_packs_self_left_g); - } - if (res->type == task_type_self && res->subtype == task_subtype_gpu_pack_f) { - atomic_inc(&s->queues[qid].n_packs_self_left_f); - atomic_dec(&s->queues[qids[ind]].n_packs_self_left_f); - } - if (res->type == task_type_pair && res->subtype == task_subtype_gpu_pack) { - atomic_inc(&s->queues[qid].n_packs_pair_left); - atomic_dec(&s->queues[qids[ind]].n_packs_pair_left); - } - if (res->type == task_type_pair && res->subtype == task_subtype_gpu_pack_g) { - atomic_inc(&s->queues[qid].n_packs_pair_left_g); - atomic_dec(&s->queues[qids[ind]].n_packs_pair_left_g); - } - if (res->type == task_type_pair && res->subtype == task_subtype_gpu_pack_f) { - atomic_inc(&s->queues[qid].n_packs_pair_left_f); - atomic_dec(&s->queues[qids[ind]].n_packs_pair_left_f); - } - - /* Run with the task */ - break; - } else { - - /* Reduce the size of the list of non-empty queues */ - qids[ind] = qids[--count]; - } - } - if (res != NULL) break; + int count = 0, qids[nr_queues]; + + /* Make list of queues that have 1 or more tasks in them */ + for (int k = 0; k < nr_queues; k++) { + if (s->queues[k].count > 0 || s->queues[k].count_incoming > 0) { + qids[count++] = k; + } + } + + for (int k = 0; k < scheduler_maxsteal && count > 0; k++) { + + /* Pick a queue at random among the non-empty ones */ + const int ind = rand_r(&seed) % count; + + /* Try to get a task from that random queue */ + TIMER_TIC; + res = queue_gettask(&s->queues[qids[ind]], prev, 0); + TIMER_TOC(timer_qsteal); + + /* Lucky? 
*/ + if (res != NULL) { + + if (res->type == task_type_self && + res->subtype == task_subtype_gpu_pack) { + atomic_inc(&s->queues[qid].n_packs_self_left); + atomic_dec(&s->queues[qids[ind]].n_packs_self_left); + } + if (res->type == task_type_self && + res->subtype == task_subtype_gpu_pack_g) { + atomic_inc(&s->queues[qid].n_packs_self_left_g); + atomic_dec(&s->queues[qids[ind]].n_packs_self_left_g); + } + if (res->type == task_type_self && + res->subtype == task_subtype_gpu_pack_f) { + atomic_inc(&s->queues[qid].n_packs_self_left_f); + atomic_dec(&s->queues[qids[ind]].n_packs_self_left_f); + } + if (res->type == task_type_pair && + res->subtype == task_subtype_gpu_pack) { + atomic_inc(&s->queues[qid].n_packs_pair_left); + atomic_dec(&s->queues[qids[ind]].n_packs_pair_left); + } + if (res->type == task_type_pair && + res->subtype == task_subtype_gpu_pack_g) { + atomic_inc(&s->queues[qid].n_packs_pair_left_g); + atomic_dec(&s->queues[qids[ind]].n_packs_pair_left_g); + } + if (res->type == task_type_pair && + res->subtype == task_subtype_gpu_pack_f) { + atomic_inc(&s->queues[qid].n_packs_pair_left_f); + atomic_dec(&s->queues[qids[ind]].n_packs_pair_left_f); + } + + /* Run with the task */ + break; + } else { + + /* Reduce the size of the list of non-empty queues */ + qids[ind] = qids[--count]; + } + } + if (res != NULL) break; } } @@ -3454,11 +3458,17 @@ void scheduler_report_task_times_mapper(void *map_data, int num_elements, const enum task_categories cat = task_get_category(t); time_local[cat] += total_time; - if(t->subtype == task_subtype_gpu_pack || t->subtype == task_subtype_gpu_pack_f || t->subtype == task_subtype_gpu_pack_g) { - time_local[task_category_gpu_pack] += clocks_from_ticks(t->total_cpu_pack_ticks); - time_local[task_category_gpu] -= clocks_from_ticks(t->total_cpu_pack_ticks); - time_local[task_category_gpu] -= clocks_from_ticks(t->total_cpu_unpack_ticks); - time_local[task_category_gpu_unpack] += clocks_from_ticks(t->total_cpu_unpack_ticks); + if (t->subtype == task_subtype_gpu_pack || + t->subtype == task_subtype_gpu_pack_f || + t->subtype == task_subtype_gpu_pack_g) { + time_local[task_category_gpu_pack] += + clocks_from_ticks(t->total_cpu_pack_ticks); + time_local[task_category_gpu] -= + clocks_from_ticks(t->total_cpu_pack_ticks); + time_local[task_category_gpu] -= + clocks_from_ticks(t->total_cpu_unpack_ticks); + time_local[task_category_gpu_unpack] += + clocks_from_ticks(t->total_cpu_unpack_ticks); } } diff --git a/src/task.c b/src/task.c index b56e3bdbf9..52b1e3fb43 100644 --- a/src/task.c +++ b/src/task.c @@ -173,13 +173,13 @@ const char *subtaskID_names[task_subtype_count] = { }; const char *task_category_names[task_category_count] = { - "drift", "sorts", "resort", - "hydro", "gravity", "feedback", - "black holes", "cooling", "star formation", - "limiter", "sync", "time integration", - "mpi", "pack", "gpu", - "gpu_pack", "gpu_unpack", "fof", - "others", "neutrino", "sink", + "drift", "sorts", "resort", + "hydro", "gravity", "feedback", + "black holes", "cooling", "star formation", + "limiter", "sync", "time integration", + "mpi", "pack", "gpu", + "gpu_pack", "gpu_unpack", "fof", + "others", "neutrino", "sink", "RT", "CSDS"}; #ifdef WITH_MPI @@ -1880,7 +1880,7 @@ enum task_categories task_get_category(const struct task *t) { case task_subtype_gpu_pack_g: case task_subtype_gpu_unpack_g: return task_category_gpu; - + case task_subtype_limiter: return task_category_limiter; diff --git a/src/task.h b/src/task.h index 9452f754a0..dfce456a4b 100644 --- a/src/task.h +++ 
b/src/task.h @@ -306,7 +306,7 @@ struct task { ticks total_cpu_pack_ticks; ticks total_cpu_unpack_ticks; - + /* Total time spent running this task */ ticks total_ticks; From 3800f5336fda3c30d45ae072ab752df1bcb0956e Mon Sep 17 00:00:00 2001 From: Abouzied Date: Sat, 9 Nov 2024 11:32:15 +0000 Subject: [PATCH 070/217] Changed the fix for duplicate unlocks so that we skip checking for duplicate unpack tasks unlocking other tasks e.g. unpack->ghost (not the other way around) --- src/scheduler.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/scheduler.c b/src/scheduler.c index c7459c3699..ebf1535e17 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -1874,9 +1874,9 @@ void scheduler_set_unlocks(struct scheduler *s) { for (int j = i + 1; j < t->nr_unlock_tasks; j++) { /*Fix for the case when one unpack task works over the same cell * connected to two pair pack tasks*/ - if (t->unlock_tasks[i]->subtype == task_subtype_gpu_unpack || - t->unlock_tasks[i]->subtype == task_subtype_gpu_unpack_g || - t->unlock_tasks[i]->subtype != task_subtype_gpu_unpack_f) { + if (t->subtype == task_subtype_gpu_unpack || + t->subtype == task_subtype_gpu_unpack_g || + t->subtype == task_subtype_gpu_unpack_f) { continue; } if (t->unlock_tasks[i] == t->unlock_tasks[j]) From 40b67eef980c873dd0adb0e193cc9bc6a2485b7a Mon Sep 17 00:00:00 2001 From: Abouzied Date: Sat, 9 Nov 2024 12:04:31 +0000 Subject: [PATCH 071/217] Modified signal_sleeping_runners to only signal once per pack. Also moved enqueue_deps to be executed as each pack task is done. Seems to work fine. Need to test if dead time is reduced by this --- src/runner_doiact_functions_hydro_gpu.h | 12 ++++++------ src/runner_main_clean.cu | 15 ++++++++++++--- src/scheduler.c | 4 ++-- src/scheduler.h | 2 +- 4 files changed, 21 insertions(+), 12 deletions(-) diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index cd402edd8f..eedaa1a574 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -1460,7 +1460,7 @@ void runner_doself1_launch_f4( cell_unlocktree(cii); /*schedule my dependencies (Only unpacks really)*/ - enqueue_dependencies(s, tii); +// enqueue_dependencies(s, tii); /*Signal sleeping runners*/ // MATTHIEU signal_sleeping_runners(s, tii); @@ -1859,7 +1859,7 @@ void runner_doself1_launch_f4_g( cell_unlocktree(cii); /*schedule my dependencies (Only unpacks really)*/ - enqueue_dependencies(s, tii); +// enqueue_dependencies(s, tii); /*Signal sleeping runners*/ // MATTHIEU signal_sleeping_runners(s, tii); @@ -2261,7 +2261,7 @@ void runner_doself1_launch_f4_f( cell_unlocktree(cii); /*schedule my dependencies (Only unpacks really)*/ - enqueue_dependencies(s, tii); +// enqueue_dependencies(s, tii); /*Signal sleeping runners*/ // MATTHIEU signal_sleeping_runners(s, tii); @@ -2906,7 +2906,7 @@ void runner_dopair1_launch_f4_one_memcpy( (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; /*schedule my dependencies (Only unpacks really)*/ - enqueue_dependencies(s, tii); +// enqueue_dependencies(s, tii); /*Signal sleeping runners*/ // MATTHIEU signal_sleeping_runners(s, tii); @@ -3633,7 +3633,7 @@ void runner_dopair1_launch_f4_g_one_memcpy( (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; /*schedule my dependencies (Only unpacks really)*/ - enqueue_dependencies(s, tii); +// enqueue_dependencies(s, tii); /*Signal sleeping runners*/ // MATTHIEU signal_sleeping_runners(s, tii); @@ -4385,7 +4385,7 @@ void runner_dopair1_launch_f4_f_one_memcpy( (tp1.tv_nsec - tp0.tv_nsec) / 
1000000000.0; /*schedule my dependencies (Only unpacks really)*/ - enqueue_dependencies(s, tii); +// enqueue_dependencies(s, tii); /*Signal sleeping runners*/ // MATTHIEU signal_sleeping_runners(s, tii); diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 62213e80ff..3b1e1a624a 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -1061,6 +1061,7 @@ void *runner_main2(void *data) { if (launch_leftovers) n_partial_d_bundles++; if (launch || launch_leftovers) { /*Launch GPU tasks*/ + signal_sleeping_runners(sched, t, pack_vars_self_dens->tasks_packed); runner_doself1_launch_f4( r, sched, pack_vars_self_dens, ci, t, parts_aos_f4_send, parts_aos_f4_recv, d_parts_aos_f4_send, d_parts_aos_f4_recv, @@ -1099,6 +1100,7 @@ void *runner_main2(void *data) { // pack_vars_self_grad, ci, t, parts_aos_grad, // d_parts_aos_grad, stream, d_a, // d_H, e, &packing_time_g, &time_for_gpu_g); + signal_sleeping_runners(sched, t, pack_vars_self_grad->tasks_packed); runner_doself1_launch_f4_g( r, sched, pack_vars_self_grad, ci, t, parts_aos_grad_f4_send, parts_aos_grad_f4_recv, d_parts_aos_grad_f4_send, @@ -1138,6 +1140,7 @@ void *runner_main2(void *data) { // pack_vars_self_forc, ci, t, parts_aos_forc, // d_parts_aos_forc, stream, d_a, d_H, e, &packing_time_f, // &time_for_gpu_f); + signal_sleeping_runners(sched, t, pack_vars_self_forc->tasks_packed); runner_doself1_launch_f4_f( r, sched, pack_vars_self_forc, ci, t, parts_aos_forc_f4_send, parts_aos_forc_f4_recv, d_parts_aos_forc_f4_send, @@ -1302,6 +1305,7 @@ void *runner_main2(void *data) { // d_parts_aos_pair_dens, // stream, d_a, d_H, e, &packing_time_pair, //&time_for_density_gpu_pair); + signal_sleeping_runners(sched, t, pack_vars_pair_dens->tasks_packed); runner_dopair1_launch_f4_one_memcpy( r, sched, pack_vars_pair_dens, t, parts_aos_pair_f4_send, parts_aos_pair_f4_recv, d_parts_aos_pair_f4_send, @@ -1387,6 +1391,7 @@ void *runner_main2(void *data) { // d_parts_aos_pair_grad, // stream, d_a, d_H, e, &packing_time_pair_g, //&time_for_gpu_pair_g); + signal_sleeping_runners(sched, t, pack_vars_pair_grad->tasks_packed); runner_dopair1_launch_f4_g_one_memcpy( r, sched, pack_vars_pair_grad, t, parts_aos_pair_f4_g_send, parts_aos_pair_f4_g_recv, d_parts_aos_pair_f4_g_send, @@ -1469,6 +1474,7 @@ void *runner_main2(void *data) { // d_parts_aos_pair_forc, // stream, d_a, d_H, e, &packing_time_pair_f, // &time_for_gpu_pair_f); + signal_sleeping_runners(sched, t, pack_vars_pair_forc->tasks_packed); runner_dopair1_launch_f4_f_one_memcpy( r, sched, pack_vars_pair_forc, t, parts_aos_pair_f4_f_send, parts_aos_pair_f4_f_recv, d_parts_aos_pair_f4_f_send, @@ -1902,7 +1908,8 @@ void *runner_main2(void *data) { t->skip = 1; t->toc = getticks(); t->total_ticks += t->toc - t->tic; - signal_sleeping_runners(sched, t); +// signal_sleeping_runners(sched, t); + enqueue_dependencies(sched, t); t = NULL; #else t = scheduler_done(sched, t); @@ -1915,7 +1922,8 @@ void *runner_main2(void *data) { t->skip = 1; t->toc = getticks(); t->total_ticks += t->toc - t->tic; - signal_sleeping_runners(sched, t); +// signal_sleeping_runners(sched, t); + enqueue_dependencies(sched, t); t = NULL; #else t = scheduler_done(sched, t); @@ -1928,7 +1936,8 @@ void *runner_main2(void *data) { t->skip = 1; t->toc = getticks(); t->total_ticks += t->toc - t->tic; - signal_sleeping_runners(sched, t); +// signal_sleeping_runners(sched, t); + enqueue_dependencies(sched, t); t = NULL; #else t = scheduler_done(sched, t); diff --git a/src/scheduler.c b/src/scheduler.c index 
ebf1535e17..87fe4b3c16 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -2947,14 +2947,14 @@ struct task *scheduler_done(struct scheduler *s, struct task *t) { return NULL; } -struct task *signal_sleeping_runners(struct scheduler *s, struct task *t) { +struct task *signal_sleeping_runners(struct scheduler *s, struct task *t, int tasks_packed) { /* Mark the task as skip. */ // t->skip = 1; /* Task definitely done, signal any sleeping runners. */ if (!t->implicit) { pthread_mutex_lock(&s->sleep_mutex); - atomic_dec(&s->waiting); + atomic_sub(&s->waiting, tasks_packed); pthread_cond_broadcast(&s->sleep_cond); pthread_mutex_unlock(&s->sleep_mutex); } diff --git a/src/scheduler.h b/src/scheduler.h index 23363d9eb6..9cdc65ccfa 100644 --- a/src/scheduler.h +++ b/src/scheduler.h @@ -346,6 +346,6 @@ void scheduler_dump_queues(struct engine *e); void scheduler_report_task_times(const struct scheduler *s, const int nr_threads); struct task *enqueue_dependencies(struct scheduler *s, struct task *t); -struct task *signal_sleeping_runners(struct scheduler *s, struct task *t); +struct task *signal_sleeping_runners(struct scheduler *s, struct task *t, int tasks_packed); #endif /* SWIFT_SCHEDULER_H */ From 339539dfd04d7b3e30276ab97fad94d8a1ba0f3f Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Mon, 11 Nov 2024 12:08:28 +0000 Subject: [PATCH 072/217] Setup and tested for optimal pack_size. GPU code is about 20% fatser than previously with excessive calls to signal_sleeping_runners but dead time is roughly the same --- examples/HydroTests/GreshoVortex_3D/gresho.yml | 3 ++- src/cuda/BLOCK_SIZE.h | 8 ++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/examples/HydroTests/GreshoVortex_3D/gresho.yml b/examples/HydroTests/GreshoVortex_3D/gresho.yml index ff5ede11dd..c4b7423669 100644 --- a/examples/HydroTests/GreshoVortex_3D/gresho.yml +++ b/examples/HydroTests/GreshoVortex_3D/gresho.yml @@ -7,7 +7,7 @@ InternalUnitSystem: UnitTemp_in_cgs: 1 # Kelvin Scheduler: - max_top_level_cells: 16 + max_top_level_cells: 64 cell_split_size: 1000 deadlock_waiting_time_s: 10. 
@@ -38,3 +38,4 @@ SPH: InitialConditions: file_name: ./greshoVortex.hdf5 # The file to read periodic: 1 + replicate: 4 diff --git a/src/cuda/BLOCK_SIZE.h b/src/cuda/BLOCK_SIZE.h index ac07782b72..fb11bd5ad6 100644 --- a/src/cuda/BLOCK_SIZE.h +++ b/src/cuda/BLOCK_SIZE.h @@ -2,11 +2,11 @@ #define BLOCK_SIZE_H #define BLOCK_SIZE 64 -#define N_TASKS_PER_PACK_SELF 64 -#define N_TASKS_BUNDLE_SELF 8 +#define N_TASKS_PER_PACK_SELF 4096 +#define N_TASKS_BUNDLE_SELF 1024 #define BLOCK_SIZE_PAIR 64 -#define N_TASKS_PER_PACK_PAIR 32 -#define N_TASKS_BUNDLE_PAIR 4 +#define N_TASKS_PER_PACK_PAIR 2048 +#define N_TASKS_BUNDLE_PAIR 512 #endif // BLOCK_SIZE_H From 642b4de0fde2523f09e1ec857403da4c3eea5750 Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Mon, 11 Nov 2024 12:21:01 +0000 Subject: [PATCH 073/217] Changed part_gpu.h to compile on Bede --- src/cuda/part_gpu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cuda/part_gpu.h b/src/cuda/part_gpu.h index 92d12f45bd..ffb0e288d7 100644 --- a/src/cuda/part_gpu.h +++ b/src/cuda/part_gpu.h @@ -9,7 +9,7 @@ typedef int8_t timebin_t; extern "C" { #endif -#include "/usr/local/cuda-12.2/targets/x86_64-linux/include/vector_types.h" +#include typedef struct part_soa { /*Task ID*/ From 9664fa880373d807138d3d41b5a06fe3f963c053 Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Mon, 11 Nov 2024 15:47:00 +0000 Subject: [PATCH 074/217] Made duplicate CPU tasks implicit and switched back to enqueueing dependencies ONLY after they have been actually done on the GPU --- src/engine_maketasks.c | 12 ++++++++++++ src/runner_doiact_functions_hydro_gpu.h | 12 ++++++------ src/runner_main_clean.cu | 6 +++--- 3 files changed, 21 insertions(+), 9 deletions(-) diff --git a/src/engine_maketasks.c b/src/engine_maketasks.c index 2e87b35645..0e6a13fb1d 100644 --- a/src/engine_maketasks.c +++ b/src/engine_maketasks.c @@ -5286,4 +5286,16 @@ void engine_maketasks(struct engine *e) { if (e->verbose) message("took %.3f %s (including reweight).", clocks_from_ticks(getticks() - tic), clocks_getunit()); + + /* Loop over all the CPU hydro tasks to make implicit*/ + for (int i = 0; i < sched->nr_tasks; i++) { + + struct task *t = &sched->tasks[i]; + if (t->subtype == task_subtype_density || + t->subtype == task_subtype_gradient || + t->subtype == task_subtype_force){ + t->implicit = 1; + } + } + } diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index eedaa1a574..cd402edd8f 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -1460,7 +1460,7 @@ void runner_doself1_launch_f4( cell_unlocktree(cii); /*schedule my dependencies (Only unpacks really)*/ -// enqueue_dependencies(s, tii); + enqueue_dependencies(s, tii); /*Signal sleeping runners*/ // MATTHIEU signal_sleeping_runners(s, tii); @@ -1859,7 +1859,7 @@ void runner_doself1_launch_f4_g( cell_unlocktree(cii); /*schedule my dependencies (Only unpacks really)*/ -// enqueue_dependencies(s, tii); + enqueue_dependencies(s, tii); /*Signal sleeping runners*/ // MATTHIEU signal_sleeping_runners(s, tii); @@ -2261,7 +2261,7 @@ void runner_doself1_launch_f4_f( cell_unlocktree(cii); /*schedule my dependencies (Only unpacks really)*/ -// enqueue_dependencies(s, tii); + enqueue_dependencies(s, tii); /*Signal sleeping runners*/ // MATTHIEU signal_sleeping_runners(s, tii); @@ -2906,7 +2906,7 @@ void runner_dopair1_launch_f4_one_memcpy( (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; /*schedule my dependencies (Only unpacks really)*/ -// 
enqueue_dependencies(s, tii); + enqueue_dependencies(s, tii); /*Signal sleeping runners*/ // MATTHIEU signal_sleeping_runners(s, tii); @@ -3633,7 +3633,7 @@ void runner_dopair1_launch_f4_g_one_memcpy( (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; /*schedule my dependencies (Only unpacks really)*/ -// enqueue_dependencies(s, tii); + enqueue_dependencies(s, tii); /*Signal sleeping runners*/ // MATTHIEU signal_sleeping_runners(s, tii); @@ -4385,7 +4385,7 @@ void runner_dopair1_launch_f4_f_one_memcpy( (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; /*schedule my dependencies (Only unpacks really)*/ -// enqueue_dependencies(s, tii); + enqueue_dependencies(s, tii); /*Signal sleeping runners*/ // MATTHIEU signal_sleeping_runners(s, tii); diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 3b1e1a624a..7eb2138a57 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -1909,7 +1909,7 @@ void *runner_main2(void *data) { t->toc = getticks(); t->total_ticks += t->toc - t->tic; // signal_sleeping_runners(sched, t); - enqueue_dependencies(sched, t); +// enqueue_dependencies(sched, t); t = NULL; #else t = scheduler_done(sched, t); @@ -1923,7 +1923,7 @@ void *runner_main2(void *data) { t->toc = getticks(); t->total_ticks += t->toc - t->tic; // signal_sleeping_runners(sched, t); - enqueue_dependencies(sched, t); +// enqueue_dependencies(sched, t); t = NULL; #else t = scheduler_done(sched, t); @@ -1937,7 +1937,7 @@ void *runner_main2(void *data) { t->toc = getticks(); t->total_ticks += t->toc - t->tic; // signal_sleeping_runners(sched, t); - enqueue_dependencies(sched, t); +// enqueue_dependencies(sched, t); t = NULL; #else t = scheduler_done(sched, t); From 03d15b182854527a781dc9afaca99135d24fd578 Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Tue, 12 Nov 2024 11:16:54 +0000 Subject: [PATCH 075/217] Commented out debug code in cell_unskip.c, come back and put code in SWIFT_DEBUG_CHECKS ifdefs. Also removed un-necessary check in runner_doiact_functions_hydro_gpu.h --- .../HydroTests/GreshoVortex_3D/gresho.yml | 3 +- src/cell_unskip.c | 64 +++++++++---------- src/cuda/BLOCK_SIZE.h | 8 +-- src/cuda/part_gpu.h | 2 +- src/runner_doiact_functions_hydro_gpu.h | 1 - 5 files changed, 38 insertions(+), 40 deletions(-) diff --git a/examples/HydroTests/GreshoVortex_3D/gresho.yml b/examples/HydroTests/GreshoVortex_3D/gresho.yml index c4b7423669..ff5ede11dd 100644 --- a/examples/HydroTests/GreshoVortex_3D/gresho.yml +++ b/examples/HydroTests/GreshoVortex_3D/gresho.yml @@ -7,7 +7,7 @@ InternalUnitSystem: UnitTemp_in_cgs: 1 # Kelvin Scheduler: - max_top_level_cells: 64 + max_top_level_cells: 16 cell_split_size: 1000 deadlock_waiting_time_s: 10. @@ -38,4 +38,3 @@ SPH: InitialConditions: file_name: ./greshoVortex.hdf5 # The file to read periodic: 1 - replicate: 4 diff --git a/src/cell_unskip.c b/src/cell_unskip.c index 965128a537..24f9691b0e 100644 --- a/src/cell_unskip.c +++ b/src/cell_unskip.c @@ -1909,21 +1909,21 @@ int cell_unskip_hydro_tasks(struct cell *c, struct scheduler *s) { l = l->next) { /* A. 
Nasar */ scheduler_activate(s, l->t); // message("activating pair pack\n"); - if (l->t->ci != NULL) { - l->t->ci->pack_done = 0; - l->t->ci->gpu_done = 0; - l->t->ci->unpack_done = 0; - } - if (l->t->cj != NULL) { - l->t->cj->pack_done = 0; - l->t->cj->gpu_done = 0; - l->t->cj->unpack_done = 0; - } +// if (l->t->ci != NULL) { +// l->t->ci->pack_done = 0; +// l->t->ci->gpu_done = 0; +// l->t->ci->unpack_done = 0; +// } +// if (l->t->cj != NULL) { +// l->t->cj->pack_done = 0; +// l->t->cj->gpu_done = 0; +// l->t->cj->unpack_done = 0; +// } } for (struct link *l = c->hydro.density_unpack; l != NULL; l = l->next) { scheduler_activate(s, l->t); // message("activating pair UN-pack\n"); - l->t->gpu_done = 0; +// l->t->gpu_done = 0; } for (struct link *l = c->hydro.gradient; l != NULL; l = l->next) { scheduler_activate(s, l->t); @@ -1937,16 +1937,16 @@ int cell_unskip_hydro_tasks(struct cell *c, struct scheduler *s) { for (struct link *l = c->hydro.force_pack; l != NULL; l = l->next) { scheduler_activate(s, l->t); // message("activating pair pack force\n"); - if (l->t->ci != NULL) { - l->t->ci->pack_done_f = 0; - l->t->ci->gpu_done_f = 0; - l->t->ci->unpack_done_f = 0; - } - if (l->t->cj != NULL) { - l->t->cj->pack_done_f = 0; - l->t->cj->gpu_done_f = 0; - l->t->cj->unpack_done_f = 0; - } +// if (l->t->ci != NULL) { +// l->t->ci->pack_done_f = 0; +// l->t->ci->gpu_done_f = 0; +// l->t->ci->unpack_done_f = 0; +// } +// if (l->t->cj != NULL) { +// l->t->cj->pack_done_f = 0; +// l->t->cj->gpu_done_f = 0; +// l->t->cj->unpack_done_f = 0; +// } } for (struct link *l = c->hydro.force_unpack; l != NULL; l = l->next) { scheduler_activate(s, l->t); @@ -1958,21 +1958,21 @@ int cell_unskip_hydro_tasks(struct cell *c, struct scheduler *s) { for (struct link *l = c->hydro.gradient_pack; l != NULL; l = l->next) { scheduler_activate(s, l->t); // message("activating pair pack gradient\n"); - if (l->t->ci != NULL) { - l->t->ci->pack_done_g = 0; - l->t->ci->gpu_done_g = 0; - l->t->ci->unpack_done_g = 0; - } - if (l->t->cj != NULL) { - l->t->cj->pack_done_g = 0; - l->t->cj->gpu_done_g = 0; - l->t->cj->unpack_done_g = 0; - } +// if (l->t->ci != NULL) { +// l->t->ci->pack_done_g = 0; +// l->t->ci->gpu_done_g = 0; +// l->t->ci->unpack_done_g = 0; +// } +// if (l->t->cj != NULL) { +// l->t->cj->pack_done_g = 0; +// l->t->cj->gpu_done_g = 0; +// l->t->cj->unpack_done_g = 0; +// } } for (struct link *l = c->hydro.gradient_unpack; l != NULL; l = l->next) { scheduler_activate(s, l->t); // message("activating pair UN-pack gradient\n"); - l->t->gpu_done = 0; +// l->t->gpu_done = 0; } #endif diff --git a/src/cuda/BLOCK_SIZE.h b/src/cuda/BLOCK_SIZE.h index fb11bd5ad6..e27fc3c66c 100644 --- a/src/cuda/BLOCK_SIZE.h +++ b/src/cuda/BLOCK_SIZE.h @@ -2,11 +2,11 @@ #define BLOCK_SIZE_H #define BLOCK_SIZE 64 -#define N_TASKS_PER_PACK_SELF 4096 -#define N_TASKS_BUNDLE_SELF 1024 +#define N_TASKS_PER_PACK_SELF 64 +#define N_TASKS_BUNDLE_SELF 16 #define BLOCK_SIZE_PAIR 64 -#define N_TASKS_PER_PACK_PAIR 2048 -#define N_TASKS_BUNDLE_PAIR 512 +#define N_TASKS_PER_PACK_PAIR 32 +#define N_TASKS_BUNDLE_PAIR 8 #endif // BLOCK_SIZE_H diff --git a/src/cuda/part_gpu.h b/src/cuda/part_gpu.h index ffb0e288d7..a9153fecd5 100644 --- a/src/cuda/part_gpu.h +++ b/src/cuda/part_gpu.h @@ -9,7 +9,7 @@ typedef int8_t timebin_t; extern "C" { #endif -#include +#include "/usr/local/cuda-12.6/targets/x86_64-linux/include/vector_types.h" typedef struct part_soa { /*Task ID*/ diff --git a/src/runner_doiact_functions_hydro_gpu.h 
b/src/runner_doiact_functions_hydro_gpu.h index cd402edd8f..868249076d 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -2871,7 +2871,6 @@ void runner_dopair1_launch_f4_one_memcpy( struct task *tii = pack_vars->task_list[tid]; /*Let's lock ci*/ - if (tii->corner_pair == 1) fprintf(stderr, "Corner task\n"); while (cell_locktree(cii)) { ; /* spin until we acquire the lock */ } From 66883bbdae0fd3af8f4aacae1f7056ac3becf21c Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Tue, 12 Nov 2024 11:43:12 +0000 Subject: [PATCH 076/217] Modified cell_unskip.c so GPU debug bits only active when SWIFT_DEBUG_CHECKS is on --- src/cell_unskip.c | 83 +++++++++++++++++++++++++---------------------- 1 file changed, 44 insertions(+), 39 deletions(-) diff --git a/src/cell_unskip.c b/src/cell_unskip.c index 24f9691b0e..d0daee37ff 100644 --- a/src/cell_unskip.c +++ b/src/cell_unskip.c @@ -1908,22 +1908,24 @@ int cell_unskip_hydro_tasks(struct cell *c, struct scheduler *s) { for (struct link *l = c->hydro.density_pack; l != NULL; l = l->next) { /* A. Nasar */ scheduler_activate(s, l->t); - // message("activating pair pack\n"); -// if (l->t->ci != NULL) { -// l->t->ci->pack_done = 0; -// l->t->ci->gpu_done = 0; -// l->t->ci->unpack_done = 0; -// } -// if (l->t->cj != NULL) { -// l->t->cj->pack_done = 0; -// l->t->cj->gpu_done = 0; -// l->t->cj->unpack_done = 0; -// } +#ifdef SWIFT_DEBUG_CHECKS + if (l->t->ci != NULL) { + l->t->ci->pack_done = 0; + l->t->ci->gpu_done = 0; + l->t->ci->unpack_done = 0; + } + if (l->t->cj != NULL) { + l->t->cj->pack_done = 0; + l->t->cj->gpu_done = 0; + l->t->cj->unpack_done = 0; + } +#endif } for (struct link *l = c->hydro.density_unpack; l != NULL; l = l->next) { scheduler_activate(s, l->t); - // message("activating pair UN-pack\n"); -// l->t->gpu_done = 0; +#ifdef SWIFT_DEBUG_CHECKS + l->t->gpu_done = 0; +#endif } for (struct link *l = c->hydro.gradient; l != NULL; l = l->next) { scheduler_activate(s, l->t); @@ -1936,43 +1938,47 @@ int cell_unskip_hydro_tasks(struct cell *c, struct scheduler *s) { // A. 
Nasar activate force and gradient packing tasks for (struct link *l = c->hydro.force_pack; l != NULL; l = l->next) { scheduler_activate(s, l->t); - // message("activating pair pack force\n"); -// if (l->t->ci != NULL) { -// l->t->ci->pack_done_f = 0; -// l->t->ci->gpu_done_f = 0; -// l->t->ci->unpack_done_f = 0; -// } -// if (l->t->cj != NULL) { -// l->t->cj->pack_done_f = 0; -// l->t->cj->gpu_done_f = 0; -// l->t->cj->unpack_done_f = 0; -// } +#ifdef SWIFT_DEBUG_CHECKS + if (l->t->ci != NULL) { + l->t->ci->pack_done_f = 0; + l->t->ci->gpu_done_f = 0; + l->t->ci->unpack_done_f = 0; + } + if (l->t->cj != NULL) { + l->t->cj->pack_done_f = 0; + l->t->cj->gpu_done_f = 0; + l->t->cj->unpack_done_f = 0; + } +#endif } for (struct link *l = c->hydro.force_unpack; l != NULL; l = l->next) { scheduler_activate(s, l->t); - // message("activating pair UN-pack force\n"); +#ifdef SWIFT_DEBUG_CHECKS l->t->gpu_done = 0; +#endif } #ifdef EXTRA_HYDRO_LOOP for (struct link *l = c->hydro.gradient_pack; l != NULL; l = l->next) { scheduler_activate(s, l->t); - // message("activating pair pack gradient\n"); -// if (l->t->ci != NULL) { -// l->t->ci->pack_done_g = 0; -// l->t->ci->gpu_done_g = 0; -// l->t->ci->unpack_done_g = 0; -// } -// if (l->t->cj != NULL) { -// l->t->cj->pack_done_g = 0; -// l->t->cj->gpu_done_g = 0; -// l->t->cj->unpack_done_g = 0; -// } +#ifdef SWIFT_DEBUG_CHECKS + if (l->t->ci != NULL) { + l->t->ci->pack_done_g = 0; + l->t->ci->gpu_done_g = 0; + l->t->ci->unpack_done_g = 0; + } + if (l->t->cj != NULL) { + l->t->cj->pack_done_g = 0; + l->t->cj->gpu_done_g = 0; + l->t->cj->unpack_done_g = 0; + } +#endif } for (struct link *l = c->hydro.gradient_unpack; l != NULL; l = l->next) { scheduler_activate(s, l->t); - // message("activating pair UN-pack gradient\n"); -// l->t->gpu_done = 0; +#ifdef SWIFT_DEBUG_CHECKS + l->t->gpu_done = 0; +#endif } #endif @@ -2000,7 +2006,6 @@ int cell_unskip_hydro_tasks(struct cell *c, struct scheduler *s) { * so, we have to do this now, from the active remote cell). */ else if (c->nodeID != nodeID && c_active) { #if defined(MPI_SYMMETRIC_FORCE_INTERACTION) && defined(WITH_MPI) - // A. 
Nasar POSSIBLE BUG HERE MISSING ACTIVATION OF PACK TASKS for (struct link *l = c->hydro.force; l != NULL; l = l->next) { struct task *t = l->t; if (t->type != task_type_pair && t->type != task_type_sub_pair) continue; From fb531daebd0b349dceffb9b8480fd160c27b6480 Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Tue, 12 Nov 2024 16:57:16 +0000 Subject: [PATCH 077/217] Added some files for HIP compilations --- src/hip/BLOCK_SIZE.h | 2 +- src/hip/Data_and_GPU_prep_functions.cu | 229 +++++++++++++++++++ src/hip/HIP_runner_functions.h | 2 +- src/hip/HIP_runner_functions.hip | 229 +++++++++++++++++++ src/hip/am--include-marker | 1 + src/hip/cell_gpu.h | 292 +++++++++++++++++++++++++ src/hip/cuda_headers.h | 63 ++++++ src/hip/device_functions.h | 18 +- src/hip/dummy.c | 2 + src/hip/dummy.cpp | 2 + src/hip/part_gpu.h | 137 ++++++++++++ src/hip/print_something.cu | 37 ++++ src/hip/tasks_gpu.h | 74 +++++++ src/hip/tester.cu | 20 ++ 14 files changed, 1097 insertions(+), 11 deletions(-) create mode 100755 src/hip/Data_and_GPU_prep_functions.cu create mode 100755 src/hip/HIP_runner_functions.hip create mode 100644 src/hip/am--include-marker create mode 100755 src/hip/cell_gpu.h create mode 100755 src/hip/cuda_headers.h create mode 100755 src/hip/dummy.c create mode 100755 src/hip/dummy.cpp create mode 100755 src/hip/part_gpu.h create mode 100755 src/hip/print_something.cu create mode 100755 src/hip/tasks_gpu.h create mode 100755 src/hip/tester.cu diff --git a/src/hip/BLOCK_SIZE.h b/src/hip/BLOCK_SIZE.h index d36e10b99b..b476b4d766 100644 --- a/src/hip/BLOCK_SIZE.h +++ b/src/hip/BLOCK_SIZE.h @@ -7,4 +7,4 @@ #ifdef WITH_CUDA //} #endif -#endif // BLOCK_SIZE_H +#endif // BLOCK_SIZE_H diff --git a/src/hip/Data_and_GPU_prep_functions.cu b/src/hip/Data_and_GPU_prep_functions.cu new file mode 100755 index 0000000000..c96734e8b3 --- /dev/null +++ b/src/hip/Data_and_GPU_prep_functions.cu @@ -0,0 +1,229 @@ +/* + * Data_and_GPU_prep_functions.cu + * + * Created on: 17 Apr 2022 + * Author: abouzied + */ + +/*ifdef WITH_CUDA prevents name mangling. 
C code sees exact names + of functions rather than mangled template names produced by C++*/ +//#ifdef WITH_CUDA +// extern "C"{ +//#endif + +//#include "cuda/cuda_headers.h" +//#include "device_functions.h" +//#include "cuda/cell_gpu.h" +#include +#include +//#include "../config.h" + +void populate_parts_list(struct cell *ci, struct part_gpu *parts) { + //////////////////////////////////////////// + ///*****Copy variables for cell i (self interaction)*****/ + int count = ci->hydro.count; + + // fprintf(stderr,"Tester 111\n"); + for (int p = 0; p < count; p++) { + + parts[p].id = ci->hydro.parts[p].id; + + // fprintf(stderr,"Tester 222\n"); + parts[p].count = count; + parts[p].h_max = ci->hydro.h_max; + + for (int d = 0; d < 3; d++) { + parts[p].x[d] = ci->hydro.parts[p].x[d]; + parts[p].v[d] = ci->hydro.parts[p].v[d]; + parts[p].a_hydro[d] = ci->hydro.parts[p].a_hydro[d]; + parts[p].loc[d] = ci->loc[d]; + } + parts[p].mass = ci->hydro.parts[p].mass; + parts[p].h = ci->hydro.parts[p].h; + parts[p].u = ci->hydro.parts[p].u; + parts[p].u_dt = ci->hydro.parts[p].u_dt; + parts[p].rho = ci->hydro.parts[p].rho; + parts[p].div_v = ci->hydro.parts[p].viscosity.div_v; + parts[p].div_v_previous_step = + ci->hydro.parts[p].viscosity.div_v_previous_step; + parts[p].alpha_visc = ci->hydro.parts[p].viscosity.alpha; + parts[p].v_sig = ci->hydro.parts[p].viscosity.v_sig; + parts[p].laplace_u = ci->hydro.parts[p].diffusion.laplace_u; + parts[p].alpha_diff = ci->hydro.parts[p].diffusion.alpha; + parts[p].f = ci->hydro.parts[p].force.f; + parts[p].soundspeed = ci->hydro.parts[p].force.soundspeed; + parts[p].h_dt = ci->hydro.parts[p].force.h_dt; + parts[p].balsara = ci->hydro.parts[p].force.balsara; + parts[p].pressure = ci->hydro.parts[p].force.pressure; + parts[p].time_bin = ci->hydro.parts[p].time_bin; + parts[p].wakeup = ci->hydro.parts[p].limiter_data.wakeup; + parts[p].min_ngb_time_bin = + ci->hydro.parts[p].limiter_data.min_ngb_time_bin; + parts[p].to_be_synchronized = + ci->hydro.parts[p].limiter_data.to_be_synchronized; + parts[p].wcount = ci->hydro.parts[p].density.wcount; + parts[p].wcount_dh = ci->hydro.parts[p].density.wcount_dh; + parts[p].rho_dh = ci->hydro.parts[p].density.rho_dh; + parts[p].div_v = ci->hydro.parts[p].viscosity.div_v; + parts[p].rot_v[0] = ci->hydro.parts[p].density.rot_v[0]; + parts[p].rot_v[1] = ci->hydro.parts[p].density.rot_v[1]; + parts[p].rot_v[2] = ci->hydro.parts[p].density.rot_v[2]; + parts[p].SPH_sum = 0.f; + } +} + +void populate_parts_list_soa( + int count_all_parts, struct cell *ci, int first_part_tmp, int count, + int tid, int *tid_p, long long *id, double *x_p, double *y_p, double *z_p, + float *ux, float *uy, float *uz, float *a_hydrox, float *a_hydroy, + float *a_hydroz, float *mass, float *h, float *u, float *u_dt, float *rho, + float *SPH_sum, float *locx, float *locy, float *locz, float *widthx, + float *widthy, float *widthz, float *h_max, int *count_p, float *wcount, + float *wcount_dh, float *rho_dh, float *rot_u, float *rot_v, float *rot_w, + float *div_v, float *div_v_previous_step, float *alpha_visc, float *v_sig, + float *laplace_u, float *alpha_diff, float *f, float *soundspeed, + float *h_dt, float *balsara, float *pressure, float *alpha_visc_max_ngb, + timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, + char *to_be_synchronized) { + //////////////////////////////////////////// + struct part *ptmps; + ptmps = ci->hydro.parts; + // fprintf(stderr,"Tester 111\n"); +#pragma unroll + for (int p = 0; p < count; p++) { + int p_gid = p + 
first_part_tmp; + // if(p_gid>=count_all_parts){ + // fprintf(stderr,"p>all parts"); + // exit(0); + // } + id[p_gid] = ptmps[p].id; + count_p[p_gid] = count; + tid_p[p_gid] = tid; + h_max[p_gid] = ci->hydro.h_max; + x_p[p_gid] = ptmps[p].x[0]; + y_p[p_gid] = ptmps[p].x[1]; + z_p[p_gid] = ptmps[p].x[2]; + ux[p_gid] = ptmps[p].v[0]; + uy[p_gid] = ptmps[p].v[1]; + uz[p_gid] = ptmps[p].v[2]; + a_hydrox[p_gid] = ptmps[p].a_hydro[0]; + a_hydroy[p_gid] = ptmps[p].a_hydro[1]; + a_hydroz[p_gid] = ptmps[p].a_hydro[2]; + locx[p_gid] = ci->loc[0]; + locy[p_gid] = ci->loc[1]; + locz[p_gid] = ci->loc[2]; + + mass[p_gid] = ptmps[p].mass; + h[p_gid] = ptmps[p].h; + u[p_gid] = ptmps[p].u; + u_dt[p_gid] = ptmps[p].u_dt; + rho[p_gid] = ptmps[p].rho; + div_v[p_gid] = ptmps[p].viscosity.div_v; + div_v_previous_step[p_gid] = ptmps[p].viscosity.div_v_previous_step; + alpha_visc[p_gid] = ptmps[p].viscosity.alpha; + v_sig[p_gid] = ptmps[p].viscosity.v_sig; + laplace_u[p_gid] = ptmps[p].diffusion.laplace_u; + alpha_diff[p_gid] = ptmps[p].diffusion.alpha; + f[p_gid] = ptmps[p].force.f; + soundspeed[p_gid] = ptmps[p].force.soundspeed; + h_dt[p_gid] = ptmps[p].force.h_dt; + balsara[p_gid] = ptmps[p].force.balsara; + pressure[p_gid] = ptmps[p].force.pressure; + time_bin[p_gid] = ptmps[p].time_bin; + wakeup[p_gid] = ptmps[p].limiter_data.wakeup; + min_ngb_time_bin[p_gid] = ptmps[p].limiter_data.min_ngb_time_bin; + to_be_synchronized[p_gid] = ptmps[p].limiter_data.to_be_synchronized; + wcount[p_gid] = ptmps[p].density.wcount; + wcount_dh[p_gid] = ptmps[p].density.wcount_dh; + rho_dh[p_gid] = ptmps[p].density.rho_dh; + div_v[p_gid] = ptmps[p].viscosity.div_v; + rot_u[p_gid] = ptmps[p].density.rot_v[0]; + rot_v[p_gid] = ptmps[p].density.rot_v[1]; + rot_w[p_gid] = ptmps[p].density.rot_v[2]; + SPH_sum[p_gid] = 0.f; + // fprintf(stderr,"tid is %i\n",tid_p[p]); + // fprintf(stderr,"Tester 222, count=%i, p=%i\n", count, + //id[p_gid]); + } +} + +void pack_data_soa(int count_all_parts, struct cell *ci, int first_part_tmp, + int count, int tid, int *tid_p, long long *id, double *x_p, + double *y_p, double *z_p, float *ux, float *uy, float *uz, + float *a_hydrox, float *a_hydroy, float *a_hydroz, + float *mass, float *h, float *u, float *u_dt, float *rho, + float *SPH_sum, float *locx, float *locy, float *locz, + float *widthx, float *widthy, float *widthz, float *h_max, + int *count_p, float *wcount, float *wcount_dh, float *rho_dh, + float *rot_u, float *rot_v, float *rot_w, float *div_v, + float *div_v_previous_step, float *alpha_visc, float *v_sig, + float *laplace_u, float *alpha_diff, float *f, + float *soundspeed, float *h_dt, float *balsara, + float *pressure, float *alpha_visc_max_ngb, + timebin_t *time_bin, timebin_t *wakeup, + timebin_t *min_ngb_time_bin, char *to_be_synchronized) { + //////////////////////////////////////////// + struct part *ptmps; + ptmps = ci->hydro.parts; + // fprintf(stderr,"Tester 111\n"); +#pragma unroll + for (int p = 0; p < count; p++) { + int p_gid = p + first_part_tmp; + // if(p_gid>=count_all_parts){ + // fprintf(stderr,"p>all parts"); + // exit(0); + // } + id[p_gid] = ptmps[p].id; + count_p[p_gid] = count; + tid_p[p_gid] = tid; + h_max[p_gid] = ci->hydro.h_max; + x_p[p_gid] = ptmps[p].x[0]; + y_p[p_gid] = ptmps[p].x[1]; + z_p[p_gid] = ptmps[p].x[2]; + ux[p_gid] = ptmps[p].v[0]; + uy[p_gid] = ptmps[p].v[1]; + uz[p_gid] = ptmps[p].v[2]; + a_hydrox[p_gid] = ptmps[p].a_hydro[0]; + a_hydroy[p_gid] = ptmps[p].a_hydro[1]; + a_hydroz[p_gid] = ptmps[p].a_hydro[2]; + locx[p_gid] = ci->loc[0]; + 
locy[p_gid] = ci->loc[1]; + locz[p_gid] = ci->loc[2]; + + mass[p_gid] = ptmps[p].mass; + h[p_gid] = ptmps[p].h; + u[p_gid] = ptmps[p].u; + u_dt[p_gid] = ptmps[p].u_dt; + rho[p_gid] = ptmps[p].rho; + div_v[p_gid] = ptmps[p].viscosity.div_v; + div_v_previous_step[p_gid] = ptmps[p].viscosity.div_v_previous_step; + alpha_visc[p_gid] = ptmps[p].viscosity.alpha; + v_sig[p_gid] = ptmps[p].viscosity.v_sig; + laplace_u[p_gid] = ptmps[p].diffusion.laplace_u; + alpha_diff[p_gid] = ptmps[p].diffusion.alpha; + f[p_gid] = ptmps[p].force.f; + soundspeed[p_gid] = ptmps[p].force.soundspeed; + h_dt[p_gid] = ptmps[p].force.h_dt; + balsara[p_gid] = ptmps[p].force.balsara; + pressure[p_gid] = ptmps[p].force.pressure; + time_bin[p_gid] = ptmps[p].time_bin; + wakeup[p_gid] = ptmps[p].limiter_data.wakeup; + min_ngb_time_bin[p_gid] = ptmps[p].limiter_data.min_ngb_time_bin; + to_be_synchronized[p_gid] = ptmps[p].limiter_data.to_be_synchronized; + wcount[p_gid] = ptmps[p].density.wcount; + wcount_dh[p_gid] = ptmps[p].density.wcount_dh; + rho_dh[p_gid] = ptmps[p].density.rho_dh; + div_v[p_gid] = ptmps[p].viscosity.div_v; + rot_u[p_gid] = ptmps[p].density.rot_v[0]; + rot_v[p_gid] = ptmps[p].density.rot_v[1]; + rot_w[p_gid] = ptmps[p].density.rot_v[2]; + SPH_sum[p_gid] = 0.f; + // fprintf(stderr,"tid is %i\n",tid_p[p]); + // fprintf(stderr,"Tester 222, count=%i, p=%i\n", count, + //id[p_gid]); + } +} + +//#ifdef WITH_CUDA +// } +//#endif diff --git a/src/hip/HIP_runner_functions.h b/src/hip/HIP_runner_functions.h index 43a52f96ed..b85772f6b0 100644 --- a/src/hip/HIP_runner_functions.h +++ b/src/hip/HIP_runner_functions.h @@ -19,4 +19,4 @@ void launch_density_kernel(struct part_soa parts_soa, int *d_task_first_part, } #endif -#endif // CUDA_HEADER_H +#endif // CUDA_HEADER_H diff --git a/src/hip/HIP_runner_functions.hip b/src/hip/HIP_runner_functions.hip new file mode 100755 index 0000000000..634c67a9ad --- /dev/null +++ b/src/hip/HIP_runner_functions.hip @@ -0,0 +1,229 @@ +#include "hip/hip_runtime.h" +/******************************************************************************* + * This file contains functions used to setup and execute GPU tasks from within + *runner_main.c. Consider this a translator allowing .cu based functions to be + *called from within runner_main.c + ******************************************************************************/ + +/* Hacky method to make c++ compilers not die. */ +#ifdef WITH_HIP +#ifndef static +#define static +#endif +#ifndef restrict +#define restrict __restrict__ +#endif +#endif + +/* Required header files */ +#include +/*ifdef WITH_HIP prevents name mangling. 
C code sees exact names + of functions rather than mangled template names produced by C++*/ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "../../config.h" +#include "BLOCK_SIZE.h" +#include "HIP_runner_functions.h" +#include "hip/device_functions.h" +#include "part_gpu.h" + +void Initialise_GPU() { + int devId = 0; + // find and print device name + hipDeviceProp_t prop; + hipGetDeviceProperties(&prop, devId); + printf("Device : %s\n", prop.name); + hipSetDevice(devId); + // cuda +} +#ifdef __cplusplus +} +#endif + +__global__ void runner_do_self_density_GPU( + struct part_soa parts_soa, int *d_task_first_part, int *d_task_last_part, + int *d_bundle_first_part, int *d_bundle_last_part, float d_a, float d_H, + int bid, int tid, int count_tasks, int tasksperbundle, int nBlocks_per_task, + int bundle_first_task, int max_parts, int time_bin_inhibited) { + extern __shared__ float vars[]; + __shared__ int first_part_tid_0, last_part_tid_0; + const int threadid = blockDim.x * blockIdx.x + threadIdx.x; + const int task_id = bundle_first_task + blockIdx.y; + + // printf("task_id is %i, count_tasks is %i\n", task_id, count_tasks); + __shared__ int first_part_in_task_blocks, last_part_in_task_blocks; + first_part_in_task_blocks = d_task_first_part[task_id], + last_part_in_task_blocks = d_task_last_part[task_id]; + __syncthreads(); + const int b_first_part = d_bundle_first_part[bid]; + const int pid = threadid + first_part_in_task_blocks; + const int b_last_part = d_bundle_last_part[bid]; + + int ttid = 0; + int first_part = 0; + int count = 0; + int last_part = 0; + float cellx = 0.0, celly = 0.0, cellz = 0.0; + float hi = 0.0, hig2 = hi * hi * kernel_gamma2; + float mi = 0.0; + float uxi = 0.0; + float uyi = 0.0; + float uzi = 0.0; + float pix = 0.0; + float piy = 0.0; + float piz = 0.0; + float rhoi = 0.0; + float rho_dhi = 0.0; + float wcounti = 0.0; + float wcount_dhi = 0.0; + float div_vi = 0.0; + float rot_uxi = 0.0; + float rot_uyi = 0.0; + float rot_uzi = 0.0; + // if(pid (0.01f/128.f)*(0.01f/128.f)) { + const float r = sqrt(r2); + /* Recover some data */ + const float mj = mass_tmp[j_block]; + /* Get the kernel for hi. 
*/ + if(hi<1.f/128.f)printf("h < dx\n"); + const float h_inv = 1.f / hi; + const float ui = r * h_inv; + float wi, wi_dx; + + d_kernel_deval(ui, &wi, &wi_dx); + + rhoi += mj * wi; + rho_dhi -= mj * (hydro_dimension * wi + ui * wi_dx); + + wcounti += wi; + wcount_dhi -= (hydro_dimension * wi + ui * wi_dx); + + const float r_inv = 1.f / r; + const float faci = mj * wi_dx * r_inv; + + /* Compute dv dot r */ + float dvx = uxi - ux_tmp[j_block], dvy = uyi - uy_tmp[j_block], + dvz = uzi - uz_tmp[j_block]; + const float dvdr = dvx * xij + dvy * yij + dvz * zij; + + div_vi -= faci * dvdr; + + /* Compute dv cross r */ + float curlvrx = dvy * zij - dvz * yij; + float curlvry = dvz * xij - dvx * zij; + float curlvrz = dvx * yij - dvy * xij; + + rot_uxi += faci * curlvrx; + rot_uyi += faci * curlvry; + rot_uzi += faci * curlvrz; + } + } + } + __syncthreads(); + } + if (pid < last_part_in_task_blocks) { + float wi, wi_dx; + d_kernel_deval(0.f, &wi, &wi_dx); +// printf("mass i %e, self rho %e sum rho %e\n", mi, mi*wi, rhoi); + parts_soa.rho[pid] = rhoi, parts_soa.rho_dh[pid] = rho_dhi; + parts_soa.wcount[pid] = wcounti, parts_soa.wcount_dh[pid] = wcount_dhi; + parts_soa.div_v[pid] = div_vi; + parts_soa.rot_ux[pid] = rot_uxi, parts_soa.rot_uy[pid] = rot_uyi, + parts_soa.rot_uz[pid] = rot_uzi; + } +} +#ifdef __cplusplus +extern "C" { +#endif +void launch_density_kernel(struct part_soa parts_soa, int *d_task_first_part, + int *d_task_last_part, int *d_bundle_first_part, + int *d_bundle_last_part, float d_a, float d_H, + const char *loop_type, hipStream_t stream, int bid, + int block_size, int count_tasks, int tasksperbundle, + int numBlocks_x, int numBlocks_y, int tid, + int offset, int bundle_first_task, int max_parts, + int time_bin_inhibited) { + + dim3 gridShape = dim3(numBlocks_x, numBlocks_y); + int nBlocks_per_task = numBlocks_x; + runner_do_self_density_GPU<<>>( + parts_soa, d_task_first_part, d_task_last_part, d_bundle_first_part, + d_bundle_last_part, d_a, d_H, bid, tid, count_tasks, tasksperbundle, + nBlocks_per_task, bundle_first_task, max_parts, time_bin_inhibited); +} +#ifdef __cplusplus +} +#endif diff --git a/src/hip/am--include-marker b/src/hip/am--include-marker new file mode 100644 index 0000000000..9ce06a81ea --- /dev/null +++ b/src/hip/am--include-marker @@ -0,0 +1 @@ +# dummy diff --git a/src/hip/cell_gpu.h b/src/hip/cell_gpu.h new file mode 100755 index 0000000000..0265592a97 --- /dev/null +++ b/src/hip/cell_gpu.h @@ -0,0 +1,292 @@ +#ifndef CELL_GPU_H +#define CELL_GPU_H +/* Config parameters. */ +#include "../config.h" +typedef int8_t timebin_t; +struct xpart_gpu { + /*! Offset between current position and position at last tree rebuild. */ + float x_diff[3]; + /*! Offset between the current position and position at the last sort. */ + float x_diff_sort[3]; + /*! Velocity at the last full step. */ + float v_full[3]; + /*! Internal energy at the last full step. */ + float u_full; +}; +struct part_gpu { + /*Task ID*/ + int tid; + /*! Particle unique ID. */ + long long id; + /*! Pointer to corresponding gravity part. */ + // struct gpu_gpart* gpart; + /*! Particle position. */ + float x[3]; + /*! Particle predicted velocity. */ + float v[3]; + /*! Particle acceleration. */ + float a_hydro[3]; + /*! Particle mass. */ + float mass; + /*! Particle smoothing length. */ + float h; + /*! Particle internal energy. */ + float u; + /*! Time derivative of the internal energy. */ + float u_dt; + /*! Particle density. */ + float rho; + /*! Kernel summation (For testing/debugging). 
*/ + float SPH_sum; + + /* Cell information */ + /*! The cell location on the grid (corner nearest to the origin). */ + float loc[3]; + /*! The cell dimensions. */ + float width[3]; + float h_max; + int count; + /* Density information */ + + /*! Neighbour number count. */ + float wcount; + + /*! Derivative of the neighbour number with respect to h. */ + float wcount_dh; + + /*! Derivative of density with respect to h */ + float rho_dh; + + /*! Particle velocity curl. */ + float rot_v[3]; + + /* viscosity information */ + + /*! Particle velocity divergence */ + float div_v; + + /*! Particle velocity divergence from previous step */ + float div_v_previous_step; + + /*! Artificial viscosity parameter */ + float alpha_visc; + + /*! Signal velocity */ + float v_sig; + + /* thermal diffusion information */ + + /*! del^2 u, a smoothed quantity */ + float laplace_u; + + /*! Thermal diffusion coefficient */ + float alpha_diff; + + /* force information */ + + /*! "Grad h" term -- only partial in P-U */ + float f; + + /*! Particle soundspeed. */ + float soundspeed; + + /*! Time derivative of smoothing length */ + float h_dt; + + /*! Balsara switch */ + float balsara; + + /*! Particle pressure. */ + float pressure; + /*! Maximal alpha (viscosity) over neighbours */ + float alpha_visc_max_ngb; + + /* timestep stuff */ + + /*! Time-step length */ + timebin_t time_bin; + + /*all part of struct timestep_limiter_data, we had to destruct it + as GPUs don't like pointer chasing especially when memcpying*/ + /* Need waking-up ? */ + timebin_t wakeup; + + /*! Minimal time-bin across all neighbours */ + timebin_t min_ngb_time_bin; + + /* Do we want this particle to be synched back on the time-line? */ + char to_be_synchronized; +}; + +typedef struct part_soa { + /*Task ID*/ + int *tid_p; + /*bundle ID*/ + int *bid_p; + /*! Particle unique ID. */ + long long *id; + /*! Pointer to corresponding gravity part. */ + // struct gpu_gpart* gpart; + /*! Particle position. */ + double *x_p; + double *y_p; + double *z_p; + /*! Particle predicted velocity. */ + float *ux; + float *uy; + float *uz; + /*! Particle acceleration. */ + float *a_hydrox; + float *a_hydroy; + float *a_hydroz; + /*! Particle mass. */ + float *mass; + /*! Particle smoothing length. */ + float *h; + /*! Particle internal energy. */ + float *u; + /*! Time derivative of the internal energy. */ + float *u_dt; + /*! Particle density. */ + float *rho; + /*! Kernel summation (For testing/debugging). */ + float *SPH_sum; + + /* Cell information */ + /*! The cell location on the grid (corner nearest to the origin). */ + float *locx; + float *locy; + float *locz; + /*! The cell dimensions. */ + float *widthx; + float *widthy; + float *widthz; + float *h_max; + int *count_p; + int *count_test; + /* Density information */ + + /*! Neighbour number count. */ + float *wcount; + + /*! Derivative of the neighbour number with respect to h. */ + float *wcount_dh; + + /*! Derivative of density with respect to h */ + float *rho_dh; + + /*! Particle velocity curl. */ + float *rot_ux; + float *rot_uy; + float *rot_uz; + + /* viscosity information */ + + /*! Particle velocity divergence */ + float *div_v; + + /*! Particle velocity divergence from previous step */ + float *div_v_previous_step; + + /*! Artificial viscosity parameter */ + float *alpha_visc; + + /*! Signal velocity */ + float *v_sig; + + /* thermal diffusion information */ + + /*! del^2 u, a smoothed quantity */ + float *laplace_u; + + /*! 
Thermal diffusion coefficient */ + float *alpha_diff; + + /* force information */ + + /*! "Grad h" term -- only partial in P-U */ + float *f; + + /*! Particle soundspeed. */ + float *soundspeed; + + /*! Time derivative of smoothing length */ + float *h_dt; + + /*! Balsara switch */ + float *balsara; + + /*! Particle pressure. */ + float *pressure; + /*! Maximal alpha (viscosity) over neighbours */ + float *alpha_visc_max_ngb; + + /* timestep stuff */ + + /*! Time-step length */ + timebin_t *time_bin; + + /*all part of struct timestep_limiter_data, we had to destruct it + as GPUs don't like pointer chasing especially when memcpying*/ + /* Need waking-up ? */ + timebin_t *wakeup; + + /*! Minimal time-bin across all neighbours */ + timebin_t *min_ngb_time_bin; + + /* Do we want this particle to be synched back on the time-line? */ + char *to_be_synchronized; + +} part_soa; + +struct task_cell { + struct part_gpu *parts; +}; +// struct parts_gpu_SoA{ +// struct task_cell *tasks; +// }; + +struct cell_hydro_gpu { + // struct part_gpu *parts; + // struct xpart_gpu *xparts; + float h_max; + int count; +}; +struct cell_gpu { + /*! The cell location on the grid (corner nearest to the origin). */ + float loc[3]; + /*! The cell dimensions. */ + float width[3]; + /*Details of contents (particles) and properties*/ + struct cell_hydro_gpu hydro; +}; +struct cell_gpu_flat { + /*! The cell location on the grid (corner nearest to the origin). */ + float loc[3]; + /*! The cell dimensions. */ + float width[3]; + float h_max; + int count; +}; + +struct cells_gpu_flat { + float *locx; + float *locy; + float *locz; + /*! The cell dimensions. */ + float *widthx; + float *widthy; + float *widthz; + /*! The cell location on the grid (corner nearest to the origin). */ + /* float *loc[3];*/ + /*! The cell dimensions. 
*/ + /* float *width[3];*/ + float *h_max; + int *count; +}; + +struct cells_gpu_flat_test { + float *locx; +}; + +#endif // CELL_GPU_H diff --git a/src/hip/cuda_headers.h b/src/hip/cuda_headers.h new file mode 100755 index 0000000000..40782e0056 --- /dev/null +++ b/src/hip/cuda_headers.h @@ -0,0 +1,63 @@ +#ifndef CUDA_HEADERS_H +#define CUDA_HEADERS_H +#define n_streams 1024 + +#ifdef WITH_CUDA +extern "C" { +#endif + +void GPU_runner_doself1_branch_gradient(struct cell_gpu *ci_gpu, + struct part_gpu *parts_gpu); +void cuda_tester(struct cell **ci_list_mgd, int numBlocksTest, + int block_size_test, int count_tasks); +void launch_cuda_kernel(struct cell_gpu *ci_gpu, struct part_gpu *parts, + int numBlocks, float d_a, float d_H, + const char *loop_type); +void launch_cuda_kernel_streams(struct part_gpu *d_parts, int numBlocks, + float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int tid, int count, + int max_count, float cellx, float celly, + float cellz, int first_part, int last_part); +void launch_cuda_kernel_bundles(struct cell_gpu *d_all_cells, + struct part_gpu **d_all_parts, int numBlocks, + float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, + int count_tasks, int tasksperbundle, + int numBlocks_x, int numBlocks_y, int tid, + int offset); +void launch_cuda_kernel_bundles_revised( + struct part_gpu *d_all_parts, int *d_task_first_part, int *d_task_last_part, + int *d_bundle_first_part, int *d_bundle_last_part, int numBlocks, float d_a, + float d_H, const char *loop_type, cudaStream_t stream, int bid, + int block_size, int count_tasks, int tasksperbundle, int numBlocks_x, + int numBlocks_y, int tid, int offset); +void launch_cuda_kernel_bundles_revised_soa( + struct part_soa parts_gpu_soa, int *d_task_first_part, + int *d_task_last_part, int *d_bundle_first_part, int *d_bundle_last_part, + int numBlocks, float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, int count_tasks, + int tasksperbundle, int numBlocks_x, int numBlocks_y, int tid, int offset, + int bundle_first_task, int max_parts); +void launch_cuda_print_streams(int numBlocks, cudaStream_t stream, int tid); +void launch_cuda_kernel_tester(struct cell_gpu *d_ci_gpu, + struct part_gpu **d_parts, int numBlocks, + float d_a, float d_H, const char *loop_type, + cudaStream_t stream, int bid, int block_size, + int count_tasks, int tasksperbundle, + int numBlocks_x, int numBlocks_y, int tid); +void launch_cuda_kernel_bundles_test(struct cell_gpu *d_all_cells, + struct part_gpu **d_all_parts, + int numBlocks, float d_a, float d_H, + int count_tasks); +void mgd_mem_cuda_kernel_bundles(struct part_gpu **parts_gpu_list, + int numBlocks, float d_a, float d_H, + const char *loop_type, cudaStream_t stream, + int bid, int block_size, int count_tasks, + int tasksperbundle, int numBlocks_x, + int numBlocks_y, int tid, int offset); + +#ifdef WITH_CUDA +} +#endif + +#endif // CUDA_HEADER_H diff --git a/src/hip/device_functions.h b/src/hip/device_functions.h index 237c87dec1..2cba0e9829 100644 --- a/src/hip/device_functions.h +++ b/src/hip/device_functions.h @@ -3,11 +3,11 @@ #include "../../config.h" /* Local headers. */ -// #include "../dimension.h" -// #include "../error.h" -// #include "../inline.h" -// #include "../minmax.h" -// #include "../vector.h" +//#include "../dimension.h" +//#include "../error.h" +//#include "../inline.h" +//#include "../minmax.h" +//#include "../vector.h" // Is this even necessary? 
Probably not as our code will operate differently #define num_cuda_threads 128 @@ -22,11 +22,11 @@ #define kernel_ivals 2 #define kernel_degree 3 /*!< Degree of the polynomial */ #define kernel_gamma_dim ((float)(kernel_gamma * kernel_gamma * kernel_gamma)) -#define kernel_gamma_dim_plus_one \ +#define kernel_gamma_dim_plus_one \ ((float)(kernel_gamma * kernel_gamma * kernel_gamma * kernel_gamma)) -#define kernel_gamma_inv_dim \ +#define kernel_gamma_inv_dim \ ((float)(1. / (kernel_gamma * kernel_gamma * kernel_gamma))) -#define kernel_gamma_inv_dim_plus_one \ +#define kernel_gamma_inv_dim_plus_one \ ((float)(1. / (kernel_gamma * kernel_gamma * kernel_gamma * kernel_gamma))) #define kernel_ivals_f ((float)kernel_ivals) /*!< Number of branches */ #define kernel_constant ((float)(16. * M_1_PI)) @@ -146,4 +146,4 @@ __device__ void d_kernel_deval(float u, float *restrict W, } #endif -#endif // DEVICE_FUNCTIONS_H +#endif // DEVICE_FUNCTIONS_H diff --git a/src/hip/dummy.c b/src/hip/dummy.c new file mode 100755 index 0000000000..66ab4665f9 --- /dev/null +++ b/src/hip/dummy.c @@ -0,0 +1,2 @@ +#include +void swiftcudadummy() {} diff --git a/src/hip/dummy.cpp b/src/hip/dummy.cpp new file mode 100755 index 0000000000..66ab4665f9 --- /dev/null +++ b/src/hip/dummy.cpp @@ -0,0 +1,2 @@ +#include +void swiftcudadummy() {} diff --git a/src/hip/part_gpu.h b/src/hip/part_gpu.h new file mode 100755 index 0000000000..a19257abc4 --- /dev/null +++ b/src/hip/part_gpu.h @@ -0,0 +1,137 @@ +#ifndef PART_GPU_H +#define PART_GPU_H +/* Config parameters. */ +#include "../../config.h" +typedef int8_t timebin_t; + +#ifdef __cplusplus +extern "C" { +#endif + +// extern "C" { + +typedef struct part_soa { + /*Task ID*/ + int *tid_p; + /*bundle ID*/ + int *bid_p; + /*! Particle unique ID. */ + long long *id; + /*! Pointer to corresponding gravity part. */ + // struct gpu_gpart* gpart; + /*! Particle position. */ + double *x_p; + double *y_p; + double *z_p; + /*! Particle predicted velocity. */ + float *ux; + float *uy; + float *uz; + /*! Particle acceleration. */ + float *a_hydrox; + float *a_hydroy; + float *a_hydroz; + /*! Particle mass. */ + float *mass; + /*! Particle smoothing length. */ + float *h; + /*! Particle internal energy. */ + float *u; + /*! Time derivative of the internal energy. */ + float *u_dt; + /*! Particle density. */ + float *rho; + /*! Kernel summation (For testing/debugging). */ + float *SPH_sum; + + /* Cell information */ + /*! The cell location on the grid (corner nearest to the origin). */ + float *locx; + float *locy; + float *locz; + /*! The cell dimensions. */ + float *widthx; + float *widthy; + float *widthz; + float *h_max; + int *count_p; + int *count_test; + /* Density information */ + + /*! Neighbour number count. */ + float *wcount; + + /*! Derivative of the neighbour number with respect to h. */ + float *wcount_dh; + + /*! Derivative of density with respect to h */ + float *rho_dh; + + /*! Particle velocity curl. */ + float *rot_ux; + float *rot_uy; + float *rot_uz; + + /* viscosity information */ + + /*! Particle velocity divergence */ + float *div_v; + + /*! Particle velocity divergence from previous step */ + float *div_v_previous_step; + + /*! Artificial viscosity parameter */ + float *alpha_visc; + + /*! Signal velocity */ + float *v_sig; + + /* thermal diffusion information */ + + /*! del^2 u, a smoothed quantity */ + float *laplace_u; + + /*! Thermal diffusion coefficient */ + float *alpha_diff; + + /* force information */ + + /*! 
"Grad h" term -- only partial in P-U */ + float *f; + + /*! Particle soundspeed. */ + float *soundspeed; + + /*! Time derivative of smoothing length */ + float *h_dt; + + /*! Balsara switch */ + float *balsara; + + /*! Particle pressure. */ + float *pressure; + /*! Maximal alpha (viscosity) over neighbours */ + float *alpha_visc_max_ngb; + + /* timestep stuff */ + + /*! Time-step length */ + timebin_t *time_bin; + + /*all part of struct timestep_limiter_data, we had to destruct it + as GPUs don't like pointer chasing especially when memcpying*/ + /* Need waking-up ? */ + timebin_t *wakeup; + + /*! Minimal time-bin across all neighbours */ + timebin_t *min_ngb_time_bin; + + /* Do we want this particle to be synched back on the time-line? */ + char *to_be_synchronized; +}; + +#ifdef __cplusplus +}; +#endif + +#endif // PART_GPU_H diff --git a/src/hip/print_something.cu b/src/hip/print_something.cu new file mode 100755 index 0000000000..b69ad05dd4 --- /dev/null +++ b/src/hip/print_something.cu @@ -0,0 +1,37 @@ +#ifdef WITH_CUDA +#ifndef static +#define static +#endif +#ifndef restrict +#define restrict __restrict__ +#endif +#endif + +#include + +#ifdef __cplusplus +extern "C" { +#endif +#include "cuda_headers.h" +#ifdef __cplusplus +} +#endif + +extern "C" { +void print_something_cu() { printf("In Here\n"); } +} + +#ifdef __cplusplus +extern "C" { +#endif +void Initialise_GPU() { + int devId = 0; + // find and print device name + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, devId); + printf("Device : %s\n", prop.name); + cudaSetDevice(devId); +} +#ifdef __cplusplus +} +#endif diff --git a/src/hip/tasks_gpu.h b/src/hip/tasks_gpu.h new file mode 100755 index 0000000000..a3912aee2c --- /dev/null +++ b/src/hip/tasks_gpu.h @@ -0,0 +1,74 @@ +/* Config parameters. */ +#include "../config.h" + +struct tasks_self_gpu { + struct task_gpu *tgpu; +}; + +/** + * @brief A task to be run by the #scheduler. + */ +struct task_gpu { + + /*! Pointers to the cells this task acts upon */ + struct cell *ci, *cj; + + /*! List of tasks unlocked by this one */ + struct task_gpu **unlock_tasks; + + /*! Flags used to carry additional information (e.g. sort directions) */ + long long flags; + +#ifdef WITH_MPI + + /*! Buffer for this task's communications */ + void *buff; + + /*! MPI request corresponding to this task */ + MPI_Request req; + +#endif + + /*! Rank of a task in the order */ + int rank; + + /*! Weight of the task */ + float weight; + + /*! Number of tasks unlocked by this one */ + int nr_unlock_tasks; + + /*! Number of unsatisfied dependencies */ + int wait; + + /*! Type of the task */ + enum task_types type; + + /*! Sub-type of the task (for the tasks that have one */ + enum task_subtypes subtype; + + /*! Should the scheduler skip this task ? */ + char skip; + + /*! Is this task implicit (i.e. does not do anything) ? */ + char implicit; + +#ifdef SWIFT_DEBUG_TASKS + /*! ID of the queue or runner owning this task */ + short int rid; + + /*! Information about the direction of the pair task */ + short int sid; +#endif + + /*! Start and end time of this task */ + ticks tic, toc; + + /* Total time spent running this task */ + ticks total_ticks; + +#ifdef SWIFT_DEBUG_CHECKS + /* When was this task last run? 
*/ + integertime_t ti_run; +#endif /* SWIFT_DEBUG_CHECKS */ +}; diff --git a/src/hip/tester.cu b/src/hip/tester.cu new file mode 100755 index 0000000000..5e70230211 --- /dev/null +++ b/src/hip/tester.cu @@ -0,0 +1,20 @@ +#include "tester.h" +#include +#include +#ifdef __cplusplus +extern "C" { +#endif +void testing_linkage(int a, float *b, float c) { + std::vector b_value_list; + b_value_list.reserve(a); + for (int i = 0; i < a; i++) { + (*b) = (*b) + c; + b_value_list.push_back((*b)); + std::cout << "Vector value is " << b_value_list[i] << " b value is " << (*b) + << std::endl; + } + std::cout << "Final value of b is " << (*b) << std::endl; +} +#ifdef __cplusplus +} +#endif From 69d7d477ca6afab9ad9071855bc6e7a0336c1554 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Tue, 19 Nov 2024 14:45:47 +0000 Subject: [PATCH 078/217] Quick stab at splitting GPU tasks. May have to revert as done in haste --- .../HydroTests/GreshoVortex_3D/gresho.yml | 3 +- src/cuda/part_gpu.h | 2 +- src/engine_maketasks.c | 37 +-- src/engine_marktasks.c | 14 +- src/runner_main_clean.cu | 227 +++++++++++++++++- src/scheduler.c | 48 +++- 6 files changed, 300 insertions(+), 31 deletions(-) diff --git a/examples/HydroTests/GreshoVortex_3D/gresho.yml b/examples/HydroTests/GreshoVortex_3D/gresho.yml index ff5ede11dd..5f8b0d9f95 100644 --- a/examples/HydroTests/GreshoVortex_3D/gresho.yml +++ b/examples/HydroTests/GreshoVortex_3D/gresho.yml @@ -8,7 +8,7 @@ InternalUnitSystem: Scheduler: max_top_level_cells: 16 - cell_split_size: 1000 + cell_split_size: 64 deadlock_waiting_time_s: 10. # Parameters governing the time integration @@ -38,3 +38,4 @@ SPH: InitialConditions: file_name: ./greshoVortex.hdf5 # The file to read periodic: 1 + replicate: 2 diff --git a/src/cuda/part_gpu.h b/src/cuda/part_gpu.h index a9153fecd5..92d12f45bd 100644 --- a/src/cuda/part_gpu.h +++ b/src/cuda/part_gpu.h @@ -9,7 +9,7 @@ typedef int8_t timebin_t; extern "C" { #endif -#include "/usr/local/cuda-12.6/targets/x86_64-linux/include/vector_types.h" +#include "/usr/local/cuda-12.2/targets/x86_64-linux/include/vector_types.h" typedef struct part_soa { /*Task ID*/ diff --git a/src/engine_maketasks.c b/src/engine_maketasks.c index 0e6a13fb1d..a58c1cc2cf 100644 --- a/src/engine_maketasks.c +++ b/src/engine_maketasks.c @@ -2207,11 +2207,14 @@ void engine_count_and_link_tasks_mapper(void *map_data, int num_elements, if (t_subtype == task_subtype_density) { engine_addlink(e, &ci->hydro.density, t); } else if (t_subtype == task_subtype_gpu_pack) { // A. 
Nasar - error("Abouzied: you need to code this up!"); + engine_addlink(e, &ci->hydro.density_pack, t); +// error("Abouzied: you need to code this up!"); } else if (t_subtype == task_subtype_gpu_pack_f) { - error("Abouzied: you need to code this up!"); + engine_addlink(e, &ci->hydro.force_pack, t); +// error("Abouzied: you need to code this up!"); } else if (t_subtype == task_subtype_gpu_pack_g) { - error("Abouzied: you need to code this up!"); + engine_addlink(e, &ci->hydro.gradient_pack, t); +// error("Abouzied: you need to code this up!"); } else if (t_subtype == task_subtype_grav) { engine_addlink(e, &ci->grav.grav, t); } else if (t_subtype == task_subtype_external_grav) { @@ -2227,11 +2230,17 @@ void engine_count_and_link_tasks_mapper(void *map_data, int num_elements, engine_addlink(e, &ci->hydro.density, t); engine_addlink(e, &cj->hydro.density, t); } else if (t_subtype == task_subtype_gpu_pack) { - error("Abouzied: you need to code this up!"); + engine_addlink(e, &ci->hydro.density_pack, t); + engine_addlink(e, &cj->hydro.density_pack, t); +// error("Abouzied: you need to code this up!"); } else if (t_subtype == task_subtype_gpu_pack_f) { - error("Abouzied: you need to code this up!"); + engine_addlink(e, &ci->hydro.force_pack, t); + engine_addlink(e, &cj->hydro.force_pack, t); +// error("Abouzied: you need to code this up!"); } else if (t_subtype == task_subtype_gpu_pack_g) { - error("Abouzied: you need to code this up!"); + engine_addlink(e, &ci->hydro.gradient_pack, t); + engine_addlink(e, &cj->hydro.gradient_pack, t); +// error("Abouzied: you need to code this up!"); } else if (t_subtype == task_subtype_grav) { engine_addlink(e, &ci->grav.grav, t); engine_addlink(e, &cj->grav.grav, t); @@ -4872,7 +4881,7 @@ void engine_maketasks(struct engine *e) { struct task *t = &sched->tasks[i]; if (t->subtype != task_subtype_gpu_pack) continue; - if (t->type == task_type_self) { + if (t->type == task_type_self || t->type == task_type_sub_self) { if (count_current_self % pack_size == 0) { last_created_self_unpack = scheduler_addtask( @@ -4892,16 +4901,16 @@ void engine_maketasks(struct engine *e) { ++count_current_self; } - else if (t->type == task_type_pair) { + else if (t->type == task_type_pair || t->type == task_type_sub_pair) { if (count_current_pair % pack_size == 0) { last_created_pair_unpack = scheduler_addtask( sched, task_type_pair, task_subtype_gpu_unpack, 0, 0, NULL, NULL); } /* pack -> unpack -> ghost_in */ - if (t->ci->hydro.ghost_in == NULL && t->ci->nodeID == e->nodeID) + if (t->ci->hydro.super->hydro.ghost_in == NULL && t->ci->nodeID == e->nodeID) message("Ghost in for cell i is NULL\n"); - if (t->cj->hydro.ghost_in == NULL && t->cj->nodeID == e->nodeID) + if (t->cj->hydro.super->hydro.ghost_in == NULL && t->cj->nodeID == e->nodeID) message("Ghost in for cell j is NULL\n"); scheduler_addunlock(sched, t, last_created_pair_unpack); @@ -4977,7 +4986,7 @@ void engine_maketasks(struct engine *e) { struct task *t = &sched->tasks[i]; if (t->subtype != task_subtype_gpu_pack_g) continue; - if (t->type == task_type_self) { + if (t->type == task_type_self || t->type == task_type_sub_self) { if (count_current_self % pack_size == 0) { last_created_self_unpack = scheduler_addtask( @@ -4997,7 +5006,7 @@ void engine_maketasks(struct engine *e) { ++count_current_self; } - else if (t->type == task_type_pair) { + else if (t->type == task_type_pair || t->type == task_type_sub_pair) { if (count_current_pair % pack_size == 0) { last_created_pair_unpack = scheduler_addtask( sched, task_type_pair, 
task_subtype_gpu_unpack_g, 0, 0, NULL, NULL); @@ -5067,7 +5076,7 @@ void engine_maketasks(struct engine *e) { struct task *t = &sched->tasks[i]; if (t->subtype != task_subtype_gpu_pack_f) continue; - if (t->type == task_type_self) { + if (t->type == task_type_self || t->type == task_type_sub_self) { if (count_current_self % pack_size == 0) { last_created_self_unpack = scheduler_addtask( @@ -5086,7 +5095,7 @@ void engine_maketasks(struct engine *e) { ++count_current_self; } - else if (t->type == task_type_pair) { + else if (t->type == task_type_pair || t->type == task_type_sub_pair) { if (count_current_pair % pack_size == 0) { last_created_pair_unpack = scheduler_addtask( sched, task_type_pair, task_subtype_gpu_unpack_f, 0, 0, NULL, NULL); diff --git a/src/engine_marktasks.c b/src/engine_marktasks.c index 26bdc1333f..00222ffc46 100644 --- a/src/engine_marktasks.c +++ b/src/engine_marktasks.c @@ -186,12 +186,21 @@ void engine_marktasks_mapper(void *map_data, int num_elements, } } + /* Store current values of dx_max and h_max. A. Nasar: Unsure if we actually need this*/ + else if (t_type == task_type_sub_self && + t_subtype == task_subtype_gpu_pack) { + if (ci_active_hydro) { + scheduler_activate(s, t); + } + } + else if (t_type == task_type_self && t_subtype == task_subtype_force) { if (ci_active_hydro) scheduler_activate(s, t); } else if (t_type == task_type_sub_self && - t_subtype == task_subtype_force) { + (t_subtype == task_subtype_force || + t_subtype == task_subtype_gpu_pack_f)) { if (ci_active_hydro) scheduler_activate(s, t); } @@ -210,7 +219,8 @@ void engine_marktasks_mapper(void *map_data, int num_elements, } else if (t_type == task_type_sub_self && - t_subtype == task_subtype_gradient) { + t_subtype == task_subtype_gradient || + t_subtype == task_subtype_gpu_pack_g) { if (ci_active_hydro) scheduler_activate(s, t); } diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 7eb2138a57..7e16db394b 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -1570,17 +1570,133 @@ void *runner_main2(void *data) { time_for_density_cpu_sub += (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - } + /* GPU WORK */ + } else if (t->subtype == task_subtype_gpu_pack) { + packed_self++; +#ifdef GPUOFFLOAD_DENSITY + // struct timespec t0, t1; // + // clock_gettime(CLOCK_REALTIME, &t0); + ticks tic_cpu_pack = getticks(); + + packing_time += + runner_doself1_pack_f4(r, sched, pack_vars_self_dens, ci, t, + parts_aos_f4_send, task_first_part_f4); + + t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; + + // clock_gettime(CLOCK_REALTIME, &t1); + // packing_time += (t1.tv_sec - t0.tv_sec) + + // (t1.tv_nsec - t0.tv_nsec) / + // 1000000000.0; + // runner_doself1_pack(r, sched, pack_vars_self_dens, ci, + // t, parts_aos_dens, &packing_time); + /* No pack tasks left in queue, flag that we want to run */ + int launch_leftovers = pack_vars_self_dens->launch_leftovers; + /*Packed enough tasks let's go*/ + int launch = pack_vars_self_dens->launch; + /* Do we have enough stuff to run the GPU ? 
*/ + if (launch) n_full_d_bundles++; + if (launch_leftovers) n_partial_d_bundles++; + if (launch || launch_leftovers) { + /*Launch GPU tasks*/ + signal_sleeping_runners(sched, t, pack_vars_self_dens->tasks_packed); + runner_doself1_launch_f4( + r, sched, pack_vars_self_dens, ci, t, parts_aos_f4_send, + parts_aos_f4_recv, d_parts_aos_f4_send, d_parts_aos_f4_recv, + stream, d_a, d_H, e, &packing_time, &time_for_density_gpu, + &unpack_time_self, task_first_part_self_dens_f4, devId, + task_first_part_f4, d_task_first_part_f4, self_end); + // runner_doself1_launch(r, sched, + // pack_vars_self_dens, ci, t, parts_aos_dens, + // d_parts_aos_dens, stream, d_a, d_H, e, &packing_time, + // &time_for_density_gpu, + // &tot_time_for_hard_memcpys); + } /*End of GPU work Self*/ +#endif //GPUOFFLOAD_DENSITY + } /* self / pack */ #ifdef EXTRA_HYDRO_LOOP else if (t->subtype == task_subtype_gradient) { runner_dosub_self1_gradient(r, ci, 1); fprintf(stderr, "split a g task\n"); } + else if (t->subtype == task_subtype_gpu_pack_g) { +#ifdef GPUOFFLOAD_GRADIENT + // runner_doself1_pack_g(r, sched, pack_vars_self_grad, ci, + // t, parts_aos_grad, &packing_time_g); + ticks tic_cpu_pack = getticks(); + + packing_time_g += runner_doself1_pack_f4_g( + r, sched, pack_vars_self_grad, ci, t, parts_aos_grad_f4_send, + task_first_part_f4_g); + + t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; + + /* No pack tasks left in queue, flag that we want to run */ + int launch_leftovers = pack_vars_self_grad->launch_leftovers; + /*Packed enough tasks let's go*/ + int launch = pack_vars_self_grad->launch; + /* Do we have enough stuff to run the GPU ? */ + if (launch || launch_leftovers) { + /*Launch GPU tasks*/ + // runner_doself1_launch_g(r, sched, + // pack_vars_self_grad, ci, t, parts_aos_grad, + // d_parts_aos_grad, stream, d_a, + // d_H, e, &packing_time_g, &time_for_gpu_g); + signal_sleeping_runners(sched, t, pack_vars_self_grad->tasks_packed); + runner_doself1_launch_f4_g( + r, sched, pack_vars_self_grad, ci, t, parts_aos_grad_f4_send, + parts_aos_grad_f4_recv, d_parts_aos_grad_f4_send, + d_parts_aos_grad_f4_recv, stream, d_a, d_H, e, + &packing_time_g, &time_for_gpu_g, task_first_part_f4_g, + d_task_first_part_f4_g, self_end_g, &unpack_time_self_g); + } /*End of GPU work Self*/ +#endif //GPUOFFLOAD_GRADIENT + } #endif else if (t->subtype == task_subtype_force) { runner_dosub_self2_force(r, ci, 1); fprintf(stderr, "split a f task\n"); - } else if (t->subtype == task_subtype_limiter) + } else if (t->subtype == task_subtype_gpu_pack_f) { +#ifdef GPUOFFLOAD_FORCE + // runner_doself1_pack_f(r, sched, pack_vars_self_forc, ci, + // t, parts_aos_forc, &packing_time_f); + ticks tic_cpu_pack = getticks(); + + packing_time_f += runner_doself1_pack_f4_f( + r, sched, pack_vars_self_forc, ci, t, parts_aos_forc_f4_send, + task_first_part_f4_f); + + t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; + + // int count = ci->hydro.count; + // for(int i = 0; i < count; i++){ + // int pid = pack_vars_self_forc->count_parts - count + + // i; if(parts_aos_forc_f4_send[pid].ux_m.w < + // 1e-9)fprintf(stderr, "zero mass after packing %i %f\n", + // pid, parts_aos_forc_f4_send[pid].ux_m.w); + // } + /* No pack tasks left in queue, flag that we want to run */ + int launch_leftovers = pack_vars_self_forc->launch_leftovers; + /*Packed enough tasks let's go*/ + int launch = pack_vars_self_forc->launch; + /* Do we have enough stuff to run the GPU ? 
*/ + if (launch || launch_leftovers) { + /*Launch GPU tasks*/ + // runner_doself1_launch_f(r, sched, + // pack_vars_self_forc, ci, t, parts_aos_forc, + // d_parts_aos_forc, stream, d_a, d_H, e, &packing_time_f, + // &time_for_gpu_f); + signal_sleeping_runners(sched, t, pack_vars_self_forc->tasks_packed); + runner_doself1_launch_f4_f( + r, sched, pack_vars_self_forc, ci, t, parts_aos_forc_f4_send, + parts_aos_forc_f4_recv, d_parts_aos_forc_f4_send, + d_parts_aos_forc_f4_recv, stream, d_a, d_H, e, + &packing_time_f, &time_for_gpu_f, task_first_part_f4_f, + d_task_first_part_f4_f, self_end_f, &unpack_time_self_f); + } /*End of GPU work Self*/ +#endif //GPUOFFLOAD_FORCE + } + else if (t->subtype == task_subtype_limiter) runner_dosub_self1_limiter(r, ci, 1); else if (t->subtype == task_subtype_stars_density) runner_dosub_self_stars_density(r, ci, 1); @@ -1623,15 +1739,122 @@ void *runner_main2(void *data) { fprintf(stderr, "Doing a pair sub task"); runner_dosub_pair1_density(r, ci, cj, 1); } + if (t->subtype == task_subtype_gpu_pack) { +#ifdef GPUOFFLOAD_DENSITY + ticks tic_cpu_pack = getticks(); + + packing_time_pair += runner_dopair1_pack_f4( + r, sched, pack_vars_pair_dens, ci, cj, t, + parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens); + + t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; + /* Packed enough tasks or no pack tasks left in queue, flag that + * we want to run */ + int launch = pack_vars_pair_dens->launch; + int launch_leftovers = pack_vars_pair_dens->launch_leftovers; + + /* Do we have enough stuff to run the GPU ? */ + if (launch) n_full_p_d_bundles++; + if (launch_leftovers) n_partial_p_d_bundles++; + if (launch || launch_leftovers) { + + /*Launch GPU tasks*/ + // runner_dopair1_launch(r, sched, + // pack_vars_pair_dens, ci, t, parts_aos_pair_dens, + // d_parts_aos_pair_dens, + // stream, d_a, d_H, e, &packing_time_pair, + //&time_for_density_gpu_pair); + signal_sleeping_runners(sched, t, pack_vars_pair_dens->tasks_packed); + runner_dopair1_launch_f4_one_memcpy( + r, sched, pack_vars_pair_dens, t, parts_aos_pair_f4_send, + parts_aos_pair_f4_recv, d_parts_aos_pair_f4_send, + d_parts_aos_pair_f4_recv, stream_pairs, d_a, d_H, e, + &packing_time_pair, &time_for_density_gpu_pair, + &unpacking_time_pair, fparti_fpartj_lparti_lpartj_dens, + pair_end); + } +#endif + } #ifdef EXTRA_HYDRO_LOOP else if (t->subtype == task_subtype_gradient) { runner_dosub_pair1_gradient(r, ci, cj, 1); fprintf(stderr, "split a g task\n"); + } else if (t->subtype == task_subtype_gpu_pack_g) { +#ifdef GPUOFFLOAD_GRADIENT + // runner_doself1_pack_g(r, sched, pack_vars_self_grad, ci, + // t, parts_aos_grad, &packing_time_g); + ticks tic_cpu_pack = getticks(); + + packing_time_g += runner_doself1_pack_f4_g( + r, sched, pack_vars_self_grad, ci, t, parts_aos_grad_f4_send, + task_first_part_f4_g); + + t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; + + /* No pack tasks left in queue, flag that we want to run */ + int launch_leftovers = pack_vars_self_grad->launch_leftovers; + /*Packed enough tasks let's go*/ + int launch = pack_vars_self_grad->launch; + /* Do we have enough stuff to run the GPU ? 
*/ + if (launch || launch_leftovers) { + /*Launch GPU tasks*/ + // runner_doself1_launch_g(r, sched, + // pack_vars_self_grad, ci, t, parts_aos_grad, + // d_parts_aos_grad, stream, d_a, + // d_H, e, &packing_time_g, &time_for_gpu_g); + signal_sleeping_runners(sched, t, pack_vars_self_grad->tasks_packed); + runner_doself1_launch_f4_g( + r, sched, pack_vars_self_grad, ci, t, parts_aos_grad_f4_send, + parts_aos_grad_f4_recv, d_parts_aos_grad_f4_send, + d_parts_aos_grad_f4_recv, stream, d_a, d_H, e, + &packing_time_g, &time_for_gpu_g, task_first_part_f4_g, + d_task_first_part_f4_g, self_end_g, &unpack_time_self_g); + } /*End of GPU work Self*/ +#endif } #endif else if (t->subtype == task_subtype_force) { runner_dosub_pair2_force(r, ci, cj, 1); fprintf(stderr, "split a f task\n"); + } else if (t->subtype == task_subtype_gpu_pack_f) { +#ifdef GPUOFFLOAD_FORCE + // runner_doself1_pack_f(r, sched, pack_vars_self_forc, ci, + // t, parts_aos_forc, &packing_time_f); + ticks tic_cpu_pack = getticks(); + + packing_time_f += runner_doself1_pack_f4_f( + r, sched, pack_vars_self_forc, ci, t, parts_aos_forc_f4_send, + task_first_part_f4_f); + + t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; + + // int count = ci->hydro.count; + // for(int i = 0; i < count; i++){ + // int pid = pack_vars_self_forc->count_parts - count + + // i; if(parts_aos_forc_f4_send[pid].ux_m.w < + // 1e-9)fprintf(stderr, "zero mass after packing %i %f\n", + // pid, parts_aos_forc_f4_send[pid].ux_m.w); + // } + /* No pack tasks left in queue, flag that we want to run */ + int launch_leftovers = pack_vars_self_forc->launch_leftovers; + /*Packed enough tasks let's go*/ + int launch = pack_vars_self_forc->launch; + /* Do we have enough stuff to run the GPU ? */ + if (launch || launch_leftovers) { + /*Launch GPU tasks*/ + // runner_doself1_launch_f(r, sched, + // pack_vars_self_forc, ci, t, parts_aos_forc, + // d_parts_aos_forc, stream, d_a, d_H, e, &packing_time_f, + // &time_for_gpu_f); + signal_sleeping_runners(sched, t, pack_vars_self_forc->tasks_packed); + runner_doself1_launch_f4_f( + r, sched, pack_vars_self_forc, ci, t, parts_aos_forc_f4_send, + parts_aos_forc_f4_recv, d_parts_aos_forc_f4_send, + d_parts_aos_forc_f4_recv, stream, d_a, d_H, e, + &packing_time_f, &time_for_gpu_f, task_first_part_f4_f, + d_task_first_part_f4_f, self_end_f, &unpack_time_self_f); + } /*End of GPU work Self*/ +#endif } else if (t->subtype == task_subtype_limiter) runner_dosub_pair1_limiter(r, ci, cj, 1); else if (t->subtype == task_subtype_stars_density) diff --git a/src/scheduler.c b/src/scheduler.c index 87fe4b3c16..f0d59975dd 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -1652,20 +1652,23 @@ void scheduler_splittasks_mapper(void *map_data, int num_elements, /* Invoke the correct splitting strategy */ if (t->subtype == task_subtype_density) { - // scheduler_splittask_hydro(t, s); + scheduler_splittask_hydro(t, s); } else if (t->subtype == task_subtype_external_grav) { scheduler_splittask_gravity(t, s); } else if (t->subtype == task_subtype_grav) { scheduler_splittask_gravity(t, s); // if task is gpu task do not split A. 
Nasar } else if (t->subtype == task_subtype_gpu_pack || - t->subtype == task_subtype_gpu_unpack || t->subtype == task_subtype_gpu_pack_g || - t->subtype == task_subtype_gpu_unpack_g || - t->subtype == task_subtype_gpu_pack_f || - t->subtype == task_subtype_gpu_unpack_f) { - continue; /*Do nothing and grab next task to split*/ - } else { + t->subtype == task_subtype_gpu_pack_f) { + scheduler_splittask_hydro(t, s); +// continue; /*Do nothing and grab next task to split*/ + } else if (t->subtype == task_subtype_gpu_unpack || + t->subtype == task_subtype_gpu_unpack_g || + t->subtype == task_subtype_gpu_unpack_f){ + continue; + } + else { #ifdef SWIFT_DEBUG_CHECKS error("Unexpected task sub-type %s/%s", taskID_names[t->type], subtaskID_names[t->subtype]); @@ -2254,7 +2257,18 @@ void scheduler_reweight(struct scheduler *s, int verbose) { } else if (t->subtype == task_subtype_do_bh_swallow) { cost = 1.f * wscale * (bcount_i + bcount_j); - + } else if (t->subtype == task_subtype_gpu_pack) { + cost = 2.f * (wscale * count_i) * count_i; + } else if (t->subtype == task_subtype_gpu_pack_f) { + cost = 2.f * (wscale * count_i) * count_i; + } else if (t->subtype == task_subtype_gpu_pack_g) { + cost = 2.f * (wscale * count_i) * count_i; + } else if (t->subtype == task_subtype_gpu_unpack) { + cost = 1.f * wscale; + } else if (t->subtype == task_subtype_gpu_unpack_f) { + cost = 1.f * wscale; + } else if (t->subtype == task_subtype_gpu_unpack_g) { + cost = 1.f * wscale; } else if (t->subtype == task_subtype_density || t->subtype == task_subtype_gradient || t->subtype == task_subtype_force || @@ -2293,7 +2307,19 @@ void scheduler_reweight(struct scheduler *s, int verbose) { cost = 1.f * wscale * count_i; } else if (t->subtype == task_subtype_do_bh_swallow) { cost = 1.f * wscale * bcount_i; - } else if (t->subtype == task_subtype_density || + } else if (t->subtype == task_subtype_gpu_pack) // A. Nasar + cost = 1.f * (wscale * count_i) * count_i; // * s->pack_size; + else if (t->subtype == task_subtype_gpu_pack_f) + cost = 1.f * (wscale * count_i) * count_i; // * s->pack_size; + else if (t->subtype == task_subtype_gpu_pack_g) + cost = 1.f * (wscale * count_i) * count_i; // * s->pack_size; + else if (t->subtype == task_subtype_gpu_unpack) + cost = 1.f * wscale * s->pack_size; + else if (t->subtype == task_subtype_gpu_unpack_f) + cost = 1.f * wscale * s->pack_size; + else if (t->subtype == task_subtype_gpu_unpack_g) + cost = 1.f * wscale * s->pack_size; + else if (t->subtype == task_subtype_density || t->subtype == task_subtype_gradient || t->subtype == task_subtype_force || t->subtype == task_subtype_limiter) { @@ -2864,7 +2890,7 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) { /* Increase the waiting counter. */ atomic_inc(&s->waiting); // A. Nasar Do the same for the pack tasks - if (t->type == task_type_self) { + if (t->type == task_type_self || t->type == task_type_sub_self) { if (t->subtype == task_subtype_gpu_pack) atomic_inc(&s->queues[qid].n_packs_self_left); if (t->subtype == task_subtype_gpu_pack_f) @@ -2873,7 +2899,7 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) { atomic_inc(&s->queues[qid].n_packs_self_left_g); } if (t->type == - task_type_pair) { // A. Nasar NEED to think about how to do this with + task_type_pair || t->type == task_type_sub_pair) { // A. 
Nasar NEED to think about how to do this with // MPI where ci may not be on this node/rank if (t->subtype == task_subtype_gpu_pack) { if (t->ci->nodeID == s->nodeID) From 72fa9d34179441a2f14972bb8c29d8816da5c20b Mon Sep 17 00:00:00 2001 From: Abouzied Date: Tue, 19 Nov 2024 16:26:23 +0000 Subject: [PATCH 079/217] Fixed a few bugs with GPU task splitting. Code hangs after a few steps when using low thread count and almost immediately if using many threads --- .../HydroTests/GreshoVortex_3D/gresho.yml | 3 +- src/engine_maketasks.c | 59 ++++++++++++++++++- src/scheduler.c | 12 ++-- 3 files changed, 64 insertions(+), 10 deletions(-) diff --git a/examples/HydroTests/GreshoVortex_3D/gresho.yml b/examples/HydroTests/GreshoVortex_3D/gresho.yml index 5f8b0d9f95..53807913b1 100644 --- a/examples/HydroTests/GreshoVortex_3D/gresho.yml +++ b/examples/HydroTests/GreshoVortex_3D/gresho.yml @@ -7,7 +7,7 @@ InternalUnitSystem: UnitTemp_in_cgs: 1 # Kelvin Scheduler: - max_top_level_cells: 16 + max_top_level_cells: 8 cell_split_size: 64 deadlock_waiting_time_s: 10. @@ -38,4 +38,3 @@ SPH: InitialConditions: file_name: ./greshoVortex.hdf5 # The file to read periodic: 1 - replicate: 2 diff --git a/src/engine_maketasks.c b/src/engine_maketasks.c index a58c1cc2cf..4ab0e1317f 100644 --- a/src/engine_maketasks.c +++ b/src/engine_maketasks.c @@ -3424,7 +3424,11 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, } } } - + /*Make packing depend on sorts and drift A. Nasar */ + else if (t_type == task_type_sub_self && t_subtype == task_subtype_gpu_pack) { + scheduler_addunlock(sched, ci->hydro.super->hydro.drift, t); + scheduler_addunlock(sched, ci->hydro.super->hydro.sorts, t); + } /* Otherwise, sub-self interaction? */ else if (t_type == task_type_sub_self && t_subtype == task_subtype_density) { @@ -3438,6 +3442,8 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, /* Start by constructing the task for the second hydro loop */ t_force = scheduler_addtask(sched, task_type_sub_self, task_subtype_force, flags, 0, ci, NULL); + t_force_gpu = scheduler_addtask(sched, task_type_sub_self, task_subtype_gpu_pack_f, + flags, 0, ci, NULL); /* and the task for the time-step limiter */ if (with_timestep_limiter) { @@ -3510,6 +3516,8 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, /* Add the link between the new loop and the cell */ engine_addlink(e, &ci->hydro.force, t_force); + engine_addlink(e, &ci->hydro.force_pack, t_force_gpu); + if (with_timestep_limiter) { engine_addlink(e, &ci->hydro.limiter, t_limiter); } @@ -3543,21 +3551,30 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, /* Start by constructing the task for the second and third hydro loop */ t_gradient = scheduler_addtask(sched, task_type_sub_self, task_subtype_gradient, flags, 0, ci, NULL); + t_gradient_gpu = scheduler_addtask(sched, task_type_sub_self, + task_subtype_gpu_pack_g, flags, 0, ci, NULL); /* Add the link between the new loop and the cell */ engine_addlink(e, &ci->hydro.gradient, t_gradient); + engine_addlink(e, &ci->hydro.gradient_pack, t_gradient_gpu); /* Now, build all the dependencies for the hydro for the cells */ /* that are local and are not descendant of the same super_hydro-cells */ engine_make_hydro_loops_dependencies(sched, t, t_gradient, t_force, t_limiter, ci, with_cooling, with_timestep_limiter); + scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, + t_gradient_gpu); + scheduler_addunlock(sched, 
ci->hydro.super->hydro.extra_ghost, + t_force_gpu); #else /* Now, build all the dependencies for the hydro for the cells */ /* that are local and are not descendant of the same super_hydro-cells */ engine_make_hydro_loops_dependencies(sched, t, t_force, t_limiter, ci, with_cooling, with_timestep_limiter); + scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, + t_force_gpu); #endif /* Create the task dependencies */ @@ -3695,6 +3712,22 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, } /* Otherwise, sub-pair interaction? */ + else if (t_type == task_type_sub_pair && + t_subtype == task_subtype_gpu_pack) { + /* Make all density pack tasks depend on the drift */ + if (ci->nodeID == nodeID) { + scheduler_addunlock(sched, ci->hydro.super->hydro.drift, t); + } + if ((cj->nodeID == nodeID) && (ci->hydro.super != cj->hydro.super)) { + scheduler_addunlock(sched, cj->hydro.super->hydro.drift, t); + } + + /* Make all density tasks depend on the sorts */ + scheduler_addunlock(sched, ci->hydro.super->hydro.sorts, t); + if (ci->hydro.super != cj->hydro.super) { + scheduler_addunlock(sched, cj->hydro.super->hydro.sorts, t); + } + } else if (t_type == task_type_sub_pair && t_subtype == task_subtype_density) { @@ -3718,6 +3751,8 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, /* New task for the force */ t_force = scheduler_addtask(sched, task_type_sub_pair, task_subtype_force, flags, 0, ci, cj); + t_force_gpu = scheduler_addtask(sched, task_type_sub_pair, task_subtype_gpu_pack_f, + flags, 0, ci, cj); #ifdef MPI_SYMMETRIC_FORCE_INTERACTION /* The order of operations for an inactive local cell interacting @@ -3727,8 +3762,10 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, /* Make all force tasks depend on the sorts */ scheduler_addunlock(sched, ci->hydro.super->hydro.sorts, t_force); + scheduler_addunlock(sched, ci->hydro.super->hydro.sorts, t_force_gpu); if (ci->hydro.super != cj->hydro.super) { - scheduler_addunlock(sched, cj->hydro.super->hydro.sorts, t_force); + scheduler_addunlock(sched, cj->hydro.super->hydro.sorts, t_force); + scheduler_addunlock(sched, cj->hydro.super->hydro.sorts, t_force_gpu); } #endif @@ -3826,6 +3863,8 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, engine_addlink(e, &ci->hydro.force, t_force); engine_addlink(e, &cj->hydro.force, t_force); + engine_addlink(e, &ci->hydro.force, t_force_gpu); + engine_addlink(e, &cj->hydro.force, t_force_gpu); if (with_timestep_limiter) { engine_addlink(e, &ci->hydro.limiter, t_limiter); engine_addlink(e, &cj->hydro.limiter, t_limiter); @@ -3879,10 +3918,14 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, /* Start by constructing the task for the second and third hydro loop */ t_gradient = scheduler_addtask(sched, task_type_sub_pair, task_subtype_gradient, flags, 0, ci, cj); + t_gradient_gpu = scheduler_addtask(sched, task_type_sub_pair, + task_subtype_gpu_pack_g, flags, 0, ci, cj); /* Add the link between the new loop and both cells */ engine_addlink(e, &ci->hydro.gradient, t_gradient); engine_addlink(e, &cj->hydro.gradient, t_gradient); + engine_addlink(e, &ci->hydro.gradient, t_gradient_gpu); + engine_addlink(e, &cj->hydro.gradient, t_gradient_gpu); /* Now, build all the dependencies for the hydro for the cells */ /* that are local and are not descendant of the same super_hydro-cells */ @@ -3890,11 +3933,19 @@ void engine_make_extra_hydroloop_tasks_mapper(void 
*map_data, int num_elements, engine_make_hydro_loops_dependencies(sched, t, t_gradient, t_force, t_limiter, ci, with_cooling, with_timestep_limiter); + scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, + t_gradient_gpu); + scheduler_addunlock(sched, ci->hydro.super->hydro.extra_ghost, + t_force_gpu); } if ((cj->nodeID == nodeID) && (ci->hydro.super != cj->hydro.super)) { engine_make_hydro_loops_dependencies(sched, t, t_gradient, t_force, t_limiter, cj, with_cooling, with_timestep_limiter); + scheduler_addunlock(sched, cj->hydro.super->hydro.ghost_out, + t_gradient_gpu); + scheduler_addunlock(sched, cj->hydro.super->hydro.extra_ghost, + t_force_gpu); } #else @@ -3904,11 +3955,15 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, engine_make_hydro_loops_dependencies(sched, t, t_force, t_limiter, ci, with_cooling, with_timestep_limiter); + scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, + t_force_gpu); } if ((cj->nodeID == nodeID) && (ci->hydro.super != cj->hydro.super)) { engine_make_hydro_loops_dependencies(sched, t, t_force, t_limiter, cj, with_cooling, with_timestep_limiter); + scheduler_addunlock(sched, cj->hydro.super->hydro.ghost_out, + t_force_gpu); } #endif diff --git a/src/scheduler.c b/src/scheduler.c index f0d59975dd..b4040604bb 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -1768,24 +1768,24 @@ struct task *scheduler_addtask(struct scheduler *s, enum task_types type, if (cj != NULL) cell_set_flag(cj, cell_flag_has_tasks); // #ifdef WITH_CUDA A. Nasar - if (t->type == task_type_self && t->subtype == task_subtype_gpu_pack) { + if ((t->type == task_type_self || t->type == task_type_sub_self) && t->subtype == task_subtype_gpu_pack) { atomic_inc(&s->nr_self_pack_tasks); } - if (t->type == task_type_pair && t->subtype == task_subtype_gpu_pack) { + if ((t->type == task_type_pair || t->type == task_type_sub_pair) && t->subtype == task_subtype_gpu_pack) { atomic_inc(&s->nr_pair_pack_tasks); } // #ifdef WITH_CUDA - if (t->type == task_type_self && t->subtype == task_subtype_gpu_pack_g) { + if ((t->type == task_type_self || t->type == task_type_sub_self) && t->subtype == task_subtype_gpu_pack_g) { atomic_inc(&s->nr_self_pack_tasks_g); } - if (t->type == task_type_pair && t->subtype == task_subtype_gpu_pack_g) { + if ((t->type == task_type_pair || t->type == task_type_sub_pair) && t->subtype == task_subtype_gpu_pack_g) { atomic_inc(&s->nr_pair_pack_tasks_g); } // #ifdef WITH_CUDA - if (t->type == task_type_self && t->subtype == task_subtype_gpu_pack_f) { + if ((t->type == task_type_self || t->type == task_type_sub_self) && t->subtype == task_subtype_gpu_pack_f) { atomic_inc(&s->nr_self_pack_tasks_f); } - if (t->type == task_type_pair && t->subtype == task_subtype_gpu_pack_f) { + if ((t->type == task_type_pair || t->type == task_type_sub_pair) && t->subtype == task_subtype_gpu_pack_f) { atomic_inc(&s->nr_pair_pack_tasks_f); } From 83972f3dcade8eb20d5277a889979c4cc87492e3 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Tue, 19 Nov 2024 16:33:51 +0000 Subject: [PATCH 080/217] Fixed a bug in if statement in runner_main2(). Added some code to make GPU tasks implicit in engine_maketasks(). CPU code seems to run fine without hanging. 
Need to figure out what issue is with the GPU task splitting --- src/engine_maketasks.c | 14 +++++++++++--- src/runner_main_clean.cu | 10 +++++----- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/src/engine_maketasks.c b/src/engine_maketasks.c index 4ab0e1317f..9980738c51 100644 --- a/src/engine_maketasks.c +++ b/src/engine_maketasks.c @@ -5355,9 +5355,17 @@ void engine_maketasks(struct engine *e) { for (int i = 0; i < sched->nr_tasks; i++) { struct task *t = &sched->tasks[i]; - if (t->subtype == task_subtype_density || - t->subtype == task_subtype_gradient || - t->subtype == task_subtype_force){ +// if (t->subtype == task_subtype_density || +// t->subtype == task_subtype_gradient || +// t->subtype == task_subtype_force){ +// t->implicit = 1; +// } + if (t->subtype == task_subtype_gpu_pack || + t->subtype == task_subtype_gpu_pack_g || + t->subtype == task_subtype_gpu_pack_f || + t->subtype == task_subtype_gpu_unpack || + t->subtype == task_subtype_gpu_unpack_g || + t->subtype == task_subtype_gpu_unpack_f){ t->implicit = 1; } } diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 7e16db394b..01922f2c09 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -19,9 +19,9 @@ * ******************************************************************************/ /* Config parameters. */ -#define GPUOFFLOAD_DENSITY 1 // off-load hydro density to GPU -#define GPUOFFLOAD_GRADIENT 1 // off-load hydro density to GPU -#define GPUOFFLOAD_FORCE 1 // off-load hydro density to GPU +//#define GPUOFFLOAD_DENSITY 1 // off-load hydro density to GPU +//#define GPUOFFLOAD_GRADIENT 1 // off-load hydro density to GPU +//#define GPUOFFLOAD_FORCE 1 // off-load hydro density to GPU // #define DO_CORNERS 1 //do corner pair tasks on CPU // #define DUMP_TIMINGS 1 @@ -1736,10 +1736,10 @@ void *runner_main2(void *data) { case task_type_sub_pair: if (t->subtype == task_subtype_density) { int nothing = 0; - fprintf(stderr, "Doing a pair sub task"); + message("Doing a pair sub task"); runner_dosub_pair1_density(r, ci, cj, 1); } - if (t->subtype == task_subtype_gpu_pack) { + else if (t->subtype == task_subtype_gpu_pack) { #ifdef GPUOFFLOAD_DENSITY ticks tic_cpu_pack = getticks(); From d250edfcc2b63290cc647932047788c72097117f Mon Sep 17 00:00:00 2001 From: Abouzied Date: Tue, 19 Nov 2024 18:29:24 +0000 Subject: [PATCH 081/217] Found another bug. 
I was not accounting for split tasks when allowing task stealing --- .../HydroTests/GreshoVortex_3D/gresho.yml | 1 + src/engine_maketasks.c | 22 ++-- src/runner_doiact_functions_hydro_gpu.h | 13 ++- src/runner_main_clean.cu | 107 +++++++++--------- src/scheduler.c | 18 ++- 5 files changed, 82 insertions(+), 79 deletions(-) diff --git a/examples/HydroTests/GreshoVortex_3D/gresho.yml b/examples/HydroTests/GreshoVortex_3D/gresho.yml index 53807913b1..9f3882f573 100644 --- a/examples/HydroTests/GreshoVortex_3D/gresho.yml +++ b/examples/HydroTests/GreshoVortex_3D/gresho.yml @@ -38,3 +38,4 @@ SPH: InitialConditions: file_name: ./greshoVortex.hdf5 # The file to read periodic: 1 + replicate: 2 diff --git a/src/engine_maketasks.c b/src/engine_maketasks.c index 9980738c51..ff2ecb72d3 100644 --- a/src/engine_maketasks.c +++ b/src/engine_maketasks.c @@ -5355,19 +5355,19 @@ void engine_maketasks(struct engine *e) { for (int i = 0; i < sched->nr_tasks; i++) { struct task *t = &sched->tasks[i]; -// if (t->subtype == task_subtype_density || -// t->subtype == task_subtype_gradient || -// t->subtype == task_subtype_force){ -// t->implicit = 1; -// } - if (t->subtype == task_subtype_gpu_pack || - t->subtype == task_subtype_gpu_pack_g || - t->subtype == task_subtype_gpu_pack_f || - t->subtype == task_subtype_gpu_unpack || - t->subtype == task_subtype_gpu_unpack_g || - t->subtype == task_subtype_gpu_unpack_f){ + if (t->subtype == task_subtype_density || + t->subtype == task_subtype_gradient || + t->subtype == task_subtype_force){ t->implicit = 1; } +// if (t->subtype == task_subtype_gpu_pack || +// t->subtype == task_subtype_gpu_pack_g || +// t->subtype == task_subtype_gpu_pack_f || +// t->subtype == task_subtype_gpu_unpack || +// t->subtype == task_subtype_gpu_unpack_g || +// t->subtype == task_subtype_gpu_unpack_f){ +// t->implicit = 1; +// } } } diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index 868249076d..d7acca7f8d 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -334,8 +334,8 @@ double runner_doself1_pack_f4_g(struct runner *r, struct scheduler *s, * launch_leftovers statement)*/ clock_gettime(CLOCK_REALTIME, &t1); /* Release the lock on the cell */ - // task_unlock(t); - cell_unlocktree(ci); + task_unlock(t); +// cell_unlocktree(ci); // // MATTHIEU signal_sleeping_runners(s, t); return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; } @@ -2894,10 +2894,11 @@ void runner_dopair1_launch_f4_one_memcpy( cii->gpu_done_pair++; cjj->gpu_done_pair++; - // /* Release the locks */ - cell_unlocktree(cii); - // /* Release the locks */ - cell_unlocktree(cjj); +// // /* Release the locks */ +// cell_unlocktree(cii); +// // /* Release the locks */ +// cell_unlocktree(cjj); + task_unlock(tii); /*Time end of unpacking*/ clock_gettime(CLOCK_REALTIME, &tp1); diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 01922f2c09..3d2b34a35a 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -19,9 +19,9 @@ * ******************************************************************************/ /* Config parameters. 
*/ -//#define GPUOFFLOAD_DENSITY 1 // off-load hydro density to GPU -//#define GPUOFFLOAD_GRADIENT 1 // off-load hydro density to GPU -//#define GPUOFFLOAD_FORCE 1 // off-load hydro density to GPU +#define GPUOFFLOAD_DENSITY 1 // off-load hydro density to GPU +#define GPUOFFLOAD_GRADIENT 1 // off-load hydro density to GPU +#define GPUOFFLOAD_FORCE 1 // off-load hydro density to GPU // #define DO_CORNERS 1 //do corner pair tasks on CPU // #define DUMP_TIMINGS 1 @@ -1617,7 +1617,7 @@ void *runner_main2(void *data) { #ifdef EXTRA_HYDRO_LOOP else if (t->subtype == task_subtype_gradient) { runner_dosub_self1_gradient(r, ci, 1); - fprintf(stderr, "split a g task\n"); +// fprintf(stderr, "split a g task\n"); } else if (t->subtype == task_subtype_gpu_pack_g) { #ifdef GPUOFFLOAD_GRADIENT @@ -1655,7 +1655,7 @@ void *runner_main2(void *data) { #endif else if (t->subtype == task_subtype_force) { runner_dosub_self2_force(r, ci, 1); - fprintf(stderr, "split a f task\n"); +// fprintf(stderr, "split a f task\n"); } else if (t->subtype == task_subtype_gpu_pack_f) { #ifdef GPUOFFLOAD_FORCE // runner_doself1_pack_f(r, sched, pack_vars_self_forc, ci, @@ -1736,7 +1736,7 @@ void *runner_main2(void *data) { case task_type_sub_pair: if (t->subtype == task_subtype_density) { int nothing = 0; - message("Doing a pair sub task"); +// message("Doing a pair sub task"); runner_dosub_pair1_density(r, ci, cj, 1); } else if (t->subtype == task_subtype_gpu_pack) { @@ -1778,82 +1778,77 @@ void *runner_main2(void *data) { #ifdef EXTRA_HYDRO_LOOP else if (t->subtype == task_subtype_gradient) { runner_dosub_pair1_gradient(r, ci, cj, 1); - fprintf(stderr, "split a g task\n"); +// fprintf(stderr, "split a g task\n"); } else if (t->subtype == task_subtype_gpu_pack_g) { #ifdef GPUOFFLOAD_GRADIENT - // runner_doself1_pack_g(r, sched, pack_vars_self_grad, ci, - // t, parts_aos_grad, &packing_time_g); - ticks tic_cpu_pack = getticks(); + ticks tic_cpu_pack = getticks(); - packing_time_g += runner_doself1_pack_f4_g( - r, sched, pack_vars_self_grad, ci, t, parts_aos_grad_f4_send, - task_first_part_f4_g); + packing_time_pair_g += + runner_dopair1_pack_f4_g(r, sched, pack_vars_pair_grad, ci, + cj, t, parts_aos_pair_f4_g_send, e, + fparti_fpartj_lparti_lpartj_grad); - t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; + t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; /* No pack tasks left in queue, flag that we want to run */ - int launch_leftovers = pack_vars_self_grad->launch_leftovers; + int launch_leftovers = pack_vars_pair_grad->launch_leftovers; /*Packed enough tasks let's go*/ - int launch = pack_vars_self_grad->launch; + int launch = pack_vars_pair_grad->launch; /* Do we have enough stuff to run the GPU ? 
*/ if (launch || launch_leftovers) { /*Launch GPU tasks*/ - // runner_doself1_launch_g(r, sched, - // pack_vars_self_grad, ci, t, parts_aos_grad, - // d_parts_aos_grad, stream, d_a, - // d_H, e, &packing_time_g, &time_for_gpu_g); - signal_sleeping_runners(sched, t, pack_vars_self_grad->tasks_packed); - runner_doself1_launch_f4_g( - r, sched, pack_vars_self_grad, ci, t, parts_aos_grad_f4_send, - parts_aos_grad_f4_recv, d_parts_aos_grad_f4_send, - d_parts_aos_grad_f4_recv, stream, d_a, d_H, e, - &packing_time_g, &time_for_gpu_g, task_first_part_f4_g, - d_task_first_part_f4_g, self_end_g, &unpack_time_self_g); - } /*End of GPU work Self*/ + // runner_dopair1_launch_g(r, sched, + // pack_vars_pair_grad, ci, t, parts_aos_pair_grad, + // d_parts_aos_pair_grad, + // stream, d_a, d_H, e, &packing_time_pair_g, + //&time_for_gpu_pair_g); + signal_sleeping_runners(sched, t, pack_vars_pair_grad->tasks_packed); + runner_dopair1_launch_f4_g_one_memcpy( + r, sched, pack_vars_pair_grad, t, parts_aos_pair_f4_g_send, + parts_aos_pair_f4_g_recv, d_parts_aos_pair_f4_g_send, + d_parts_aos_pair_f4_g_recv, stream_pairs, d_a, d_H, e, + &packing_time_pair_g, &time_for_gpu_pair_g, + &unpacking_time_pair_g, fparti_fpartj_lparti_lpartj_grad, + pair_end_g); + } #endif } #endif else if (t->subtype == task_subtype_force) { runner_dosub_pair2_force(r, ci, cj, 1); - fprintf(stderr, "split a f task\n"); +// fprintf(stderr, "split a f task\n"); } else if (t->subtype == task_subtype_gpu_pack_f) { #ifdef GPUOFFLOAD_FORCE - // runner_doself1_pack_f(r, sched, pack_vars_self_forc, ci, - // t, parts_aos_forc, &packing_time_f); ticks tic_cpu_pack = getticks(); - packing_time_f += runner_doself1_pack_f4_f( - r, sched, pack_vars_self_forc, ci, t, parts_aos_forc_f4_send, - task_first_part_f4_f); + packing_time_pair_f += + runner_dopair1_pack_f4_f(r, sched, pack_vars_pair_forc, ci, + cj, t, parts_aos_pair_f4_f_send, e, + fparti_fpartj_lparti_lpartj_forc); - t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; + t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; - // int count = ci->hydro.count; - // for(int i = 0; i < count; i++){ - // int pid = pack_vars_self_forc->count_parts - count + - // i; if(parts_aos_forc_f4_send[pid].ux_m.w < - // 1e-9)fprintf(stderr, "zero mass after packing %i %f\n", - // pid, parts_aos_forc_f4_send[pid].ux_m.w); - // } /* No pack tasks left in queue, flag that we want to run */ - int launch_leftovers = pack_vars_self_forc->launch_leftovers; + int launch_leftovers = pack_vars_pair_forc->launch_leftovers; /*Packed enough tasks let's go*/ - int launch = pack_vars_self_forc->launch; + int launch = pack_vars_pair_forc->launch; /* Do we have enough stuff to run the GPU ? 
*/ if (launch || launch_leftovers) { /*Launch GPU tasks*/ - // runner_doself1_launch_f(r, sched, - // pack_vars_self_forc, ci, t, parts_aos_forc, - // d_parts_aos_forc, stream, d_a, d_H, e, &packing_time_f, - // &time_for_gpu_f); - signal_sleeping_runners(sched, t, pack_vars_self_forc->tasks_packed); - runner_doself1_launch_f4_f( - r, sched, pack_vars_self_forc, ci, t, parts_aos_forc_f4_send, - parts_aos_forc_f4_recv, d_parts_aos_forc_f4_send, - d_parts_aos_forc_f4_recv, stream, d_a, d_H, e, - &packing_time_f, &time_for_gpu_f, task_first_part_f4_f, - d_task_first_part_f4_f, self_end_f, &unpack_time_self_f); - } /*End of GPU work Self*/ + // runner_dopair1_launch_f(r, sched, + // pack_vars_pair_forc, ci, t, parts_aos_pair_forc, + // d_parts_aos_pair_forc, + // stream, d_a, d_H, e, &packing_time_pair_f, + // &time_for_gpu_pair_f); + signal_sleeping_runners(sched, t, pack_vars_pair_forc->tasks_packed); + runner_dopair1_launch_f4_f_one_memcpy( + r, sched, pack_vars_pair_forc, t, parts_aos_pair_f4_f_send, + parts_aos_pair_f4_f_recv, d_parts_aos_pair_f4_f_send, + d_parts_aos_pair_f4_f_recv, stream_pairs, d_a, d_H, e, + &packing_time_pair_f, &time_for_gpu_pair_f, + &unpacking_time_pair_f, fparti_fpartj_lparti_lpartj_forc, + pair_end_f); + } /* End of GPU work Pairs */ #endif } else if (t->subtype == task_subtype_limiter) runner_dosub_pair1_limiter(r, ci, cj, 1); diff --git a/src/scheduler.c b/src/scheduler.c index b4040604bb..4c8e52a4c3 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -3172,32 +3172,38 @@ struct task *scheduler_gettask(struct scheduler *s, int qid, /* Lucky? */ if (res != NULL) { - if (res->type == task_type_self && + if ((res->type == task_type_self || + res->type == task_type_sub_self)&& res->subtype == task_subtype_gpu_pack) { atomic_inc(&s->queues[qid].n_packs_self_left); atomic_dec(&s->queues[qids[ind]].n_packs_self_left); } - if (res->type == task_type_self && + if ((res->type == task_type_self || + res->type == task_type_sub_self)&& res->subtype == task_subtype_gpu_pack_g) { atomic_inc(&s->queues[qid].n_packs_self_left_g); atomic_dec(&s->queues[qids[ind]].n_packs_self_left_g); } - if (res->type == task_type_self && + if ((res->type == task_type_self || + res->type == task_type_sub_self)&& res->subtype == task_subtype_gpu_pack_f) { atomic_inc(&s->queues[qid].n_packs_self_left_f); atomic_dec(&s->queues[qids[ind]].n_packs_self_left_f); } - if (res->type == task_type_pair && + if ((res->type == task_type_pair || + res->type == task_type_sub_pair)&& res->subtype == task_subtype_gpu_pack) { atomic_inc(&s->queues[qid].n_packs_pair_left); atomic_dec(&s->queues[qids[ind]].n_packs_pair_left); } - if (res->type == task_type_pair && + if ((res->type == task_type_pair || + res->type == task_type_sub_pair)&& res->subtype == task_subtype_gpu_pack_g) { atomic_inc(&s->queues[qid].n_packs_pair_left_g); atomic_dec(&s->queues[qids[ind]].n_packs_pair_left_g); } - if (res->type == task_type_pair && + if ((res->type == task_type_pair || + res->type == task_type_sub_pair)&& res->subtype == task_subtype_gpu_pack_f) { atomic_inc(&s->queues[qid].n_packs_pair_left_f); atomic_dec(&s->queues[qids[ind]].n_packs_pair_left_f); From cb7abcfaa0e4f9a828e00665000b3f222a662fa7 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Wed, 20 Nov 2024 11:47:30 +0000 Subject: [PATCH 082/217] Converted sub_selfs and sub_pairs to selfs and pairs in maketasks. 
Code still hangs when using more than 4 threads I think issue is in engine_make_extra_hydroloop_tasks_mapper() --- src/cuda/BLOCK_SIZE.h | 4 ++-- src/engine_maketasks.c | 26 ++++++++++++++++++++++++-- src/runner_main_clean.cu | 2 ++ 3 files changed, 28 insertions(+), 4 deletions(-) diff --git a/src/cuda/BLOCK_SIZE.h b/src/cuda/BLOCK_SIZE.h index e27fc3c66c..15259a7883 100644 --- a/src/cuda/BLOCK_SIZE.h +++ b/src/cuda/BLOCK_SIZE.h @@ -2,11 +2,11 @@ #define BLOCK_SIZE_H #define BLOCK_SIZE 64 -#define N_TASKS_PER_PACK_SELF 64 +#define N_TASKS_PER_PACK_SELF 128 #define N_TASKS_BUNDLE_SELF 16 #define BLOCK_SIZE_PAIR 64 -#define N_TASKS_PER_PACK_PAIR 32 +#define N_TASKS_PER_PACK_PAIR 64 #define N_TASKS_BUNDLE_PAIR 8 #endif // BLOCK_SIZE_H diff --git a/src/engine_maketasks.c b/src/engine_maketasks.c index ff2ecb72d3..bf735e978c 100644 --- a/src/engine_maketasks.c +++ b/src/engine_maketasks.c @@ -4857,6 +4857,15 @@ void engine_maketasks(struct engine *e) { /* Split the tasks. */ scheduler_splittasks(sched, /*fof_tasks=*/0, e->verbose); + for (int i = 0; i < sched->nr_tasks; i++) { + struct task * t = &sched->tasks[i]; + if(t->type == task_type_sub_self && t->subtype == task_subtype_gpu_pack){ + t->type = task_type_self; + } + if(t->type == task_type_sub_pair && t->subtype == task_subtype_gpu_pack){ + t->type = task_type_pair; + } + } if (e->verbose) message("Splitting tasks took %.3f %s.", clocks_from_ticks(getticks() - tic2), clocks_getunit()); @@ -4921,7 +4930,6 @@ void engine_maketasks(struct engine *e) { /* Now, create unpack tasks based on the existing packs and create * the dependencies pack->unpack->ghost_in A. Nasar */ - const int pack_size = sched->pack_size; int count_current_self = 0; @@ -5028,7 +5036,21 @@ void engine_maketasks(struct engine *e) { * sched->tasks, sched->nr_tasks, sizeof(struct task), * threadpool_auto_chunk_size, e); */ } - + for (int i = 0; i < sched->nr_tasks; i++) { + struct task * t = &sched->tasks[i]; + if(t->type == task_type_sub_self && t->subtype == task_subtype_gpu_pack_g){ + t->type = task_type_self; + } + if(t->type == task_type_sub_pair && t->subtype == task_subtype_gpu_pack_g){ + t->type = task_type_pair; + } + if(t->type == task_type_sub_self && t->subtype == task_subtype_gpu_pack_f){ + t->type = task_type_self; + } + if(t->type == task_type_sub_pair && t->subtype == task_subtype_gpu_pack_f){ + t->type = task_type_pair; + } + } /*Now create unpacks for all gpu_pack_g (gradient) tasks A. 
Nasar */ count_current_self = 0; count_current_pair = 0; diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 3d2b34a35a..1972197873 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -1577,6 +1577,7 @@ void *runner_main2(void *data) { // struct timespec t0, t1; // // clock_gettime(CLOCK_REALTIME, &t0); ticks tic_cpu_pack = getticks(); + message("Did a sub_self density"); packing_time += runner_doself1_pack_f4(r, sched, pack_vars_self_dens, ci, t, @@ -1743,6 +1744,7 @@ void *runner_main2(void *data) { #ifdef GPUOFFLOAD_DENSITY ticks tic_cpu_pack = getticks(); + message("Did a sub_pair density"); packing_time_pair += runner_dopair1_pack_f4( r, sched, pack_vars_pair_dens, ci, cj, t, parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens); From 2ee5742c7a47b013d5ca080e364d006ccb4c8a6e Mon Sep 17 00:00:00 2001 From: Abouzied Date: Wed, 20 Nov 2024 13:10:47 +0000 Subject: [PATCH 083/217] Removed sub tasks from runner_main and moved making density sub tasks into self and pairs to after creation of sub tasks for gradient and force --- .../HydroTests/GreshoVortex_3D/gresho.yml | 2 +- src/engine_maketasks.c | 25 +- src/runner_main_clean.cu | 226 +----------------- 3 files changed, 22 insertions(+), 231 deletions(-) diff --git a/examples/HydroTests/GreshoVortex_3D/gresho.yml b/examples/HydroTests/GreshoVortex_3D/gresho.yml index 9f3882f573..5f8b0d9f95 100644 --- a/examples/HydroTests/GreshoVortex_3D/gresho.yml +++ b/examples/HydroTests/GreshoVortex_3D/gresho.yml @@ -7,7 +7,7 @@ InternalUnitSystem: UnitTemp_in_cgs: 1 # Kelvin Scheduler: - max_top_level_cells: 8 + max_top_level_cells: 16 cell_split_size: 64 deadlock_waiting_time_s: 10. diff --git a/src/engine_maketasks.c b/src/engine_maketasks.c index bf735e978c..f485df86c7 100644 --- a/src/engine_maketasks.c +++ b/src/engine_maketasks.c @@ -4857,15 +4857,6 @@ void engine_maketasks(struct engine *e) { /* Split the tasks. 
*/ scheduler_splittasks(sched, /*fof_tasks=*/0, e->verbose); - for (int i = 0; i < sched->nr_tasks; i++) { - struct task * t = &sched->tasks[i]; - if(t->type == task_type_sub_self && t->subtype == task_subtype_gpu_pack){ - t->type = task_type_self; - } - if(t->type == task_type_sub_pair && t->subtype == task_subtype_gpu_pack){ - t->type = task_type_pair; - } - } if (e->verbose) message("Splitting tasks took %.3f %s.", clocks_from_ticks(getticks() - tic2), clocks_getunit()); @@ -5036,6 +5027,15 @@ void engine_maketasks(struct engine *e) { * sched->tasks, sched->nr_tasks, sizeof(struct task), * threadpool_auto_chunk_size, e); */ } + for (int i = 0; i < sched->nr_tasks; i++) { + struct task * t = &sched->tasks[i]; + if(t->type == task_type_sub_self && t->subtype == task_subtype_gpu_pack){ + t->type = task_type_self; + } + if(t->type == task_type_sub_pair && t->subtype == task_subtype_gpu_pack){ + t->type = task_type_pair; + } + } for (int i = 0; i < sched->nr_tasks; i++) { struct task * t = &sched->tasks[i]; if(t->type == task_type_sub_self && t->subtype == task_subtype_gpu_pack_g){ @@ -5389,6 +5389,13 @@ void engine_maketasks(struct engine *e) { // t->subtype == task_subtype_gpu_unpack_g || // t->subtype == task_subtype_gpu_unpack_f){ // t->implicit = 1; +// } +// if ((t->subtype == task_subtype_gpu_pack || +// t->subtype == task_subtype_gpu_pack_g || +// t->subtype == task_subtype_gpu_pack_f) && +// (t->type == task_type_sub_pair || +// t->type == task_type_sub_self)){ +// error("STill have subs"); // } } diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 1972197873..0944bb45a0 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -658,7 +658,8 @@ void *runner_main2(void *data) { error("MPI_Comm_size failed with error %i.", res); #endif int count_max_parts_tmp = - 2 * target_n_tasks * space->nr_parts * nr_nodes / space->nr_cells; + 100 * target_n_tasks * space->nr_parts * nr_nodes / (32*32*32);//space->nr_cells; + message("max_parts %i\n", count_max_parts_tmp); pack_vars_self_dens->count_max_parts = count_max_parts_tmp; pack_vars_pair_dens->count_max_parts = count_max_parts_tmp; pack_vars_self_forc->count_max_parts = count_max_parts_tmp; @@ -1571,131 +1572,16 @@ void *runner_main2(void *data) { (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; /* GPU WORK */ - } else if (t->subtype == task_subtype_gpu_pack) { - packed_self++; -#ifdef GPUOFFLOAD_DENSITY - // struct timespec t0, t1; // - // clock_gettime(CLOCK_REALTIME, &t0); - ticks tic_cpu_pack = getticks(); - message("Did a sub_self density"); - - packing_time += - runner_doself1_pack_f4(r, sched, pack_vars_self_dens, ci, t, - parts_aos_f4_send, task_first_part_f4); - - t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; - - // clock_gettime(CLOCK_REALTIME, &t1); - // packing_time += (t1.tv_sec - t0.tv_sec) + - // (t1.tv_nsec - t0.tv_nsec) / - // 1000000000.0; - // runner_doself1_pack(r, sched, pack_vars_self_dens, ci, - // t, parts_aos_dens, &packing_time); - /* No pack tasks left in queue, flag that we want to run */ - int launch_leftovers = pack_vars_self_dens->launch_leftovers; - /*Packed enough tasks let's go*/ - int launch = pack_vars_self_dens->launch; - /* Do we have enough stuff to run the GPU ? 
*/ - if (launch) n_full_d_bundles++; - if (launch_leftovers) n_partial_d_bundles++; - if (launch || launch_leftovers) { - /*Launch GPU tasks*/ - signal_sleeping_runners(sched, t, pack_vars_self_dens->tasks_packed); - runner_doself1_launch_f4( - r, sched, pack_vars_self_dens, ci, t, parts_aos_f4_send, - parts_aos_f4_recv, d_parts_aos_f4_send, d_parts_aos_f4_recv, - stream, d_a, d_H, e, &packing_time, &time_for_density_gpu, - &unpack_time_self, task_first_part_self_dens_f4, devId, - task_first_part_f4, d_task_first_part_f4, self_end); - // runner_doself1_launch(r, sched, - // pack_vars_self_dens, ci, t, parts_aos_dens, - // d_parts_aos_dens, stream, d_a, d_H, e, &packing_time, - // &time_for_density_gpu, - // &tot_time_for_hard_memcpys); - } /*End of GPU work Self*/ -#endif //GPUOFFLOAD_DENSITY - } /* self / pack */ + } #ifdef EXTRA_HYDRO_LOOP else if (t->subtype == task_subtype_gradient) { runner_dosub_self1_gradient(r, ci, 1); // fprintf(stderr, "split a g task\n"); } - else if (t->subtype == task_subtype_gpu_pack_g) { -#ifdef GPUOFFLOAD_GRADIENT - // runner_doself1_pack_g(r, sched, pack_vars_self_grad, ci, - // t, parts_aos_grad, &packing_time_g); - ticks tic_cpu_pack = getticks(); - - packing_time_g += runner_doself1_pack_f4_g( - r, sched, pack_vars_self_grad, ci, t, parts_aos_grad_f4_send, - task_first_part_f4_g); - - t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; - - /* No pack tasks left in queue, flag that we want to run */ - int launch_leftovers = pack_vars_self_grad->launch_leftovers; - /*Packed enough tasks let's go*/ - int launch = pack_vars_self_grad->launch; - /* Do we have enough stuff to run the GPU ? */ - if (launch || launch_leftovers) { - /*Launch GPU tasks*/ - // runner_doself1_launch_g(r, sched, - // pack_vars_self_grad, ci, t, parts_aos_grad, - // d_parts_aos_grad, stream, d_a, - // d_H, e, &packing_time_g, &time_for_gpu_g); - signal_sleeping_runners(sched, t, pack_vars_self_grad->tasks_packed); - runner_doself1_launch_f4_g( - r, sched, pack_vars_self_grad, ci, t, parts_aos_grad_f4_send, - parts_aos_grad_f4_recv, d_parts_aos_grad_f4_send, - d_parts_aos_grad_f4_recv, stream, d_a, d_H, e, - &packing_time_g, &time_for_gpu_g, task_first_part_f4_g, - d_task_first_part_f4_g, self_end_g, &unpack_time_self_g); - } /*End of GPU work Self*/ -#endif //GPUOFFLOAD_GRADIENT - } #endif else if (t->subtype == task_subtype_force) { runner_dosub_self2_force(r, ci, 1); // fprintf(stderr, "split a f task\n"); - } else if (t->subtype == task_subtype_gpu_pack_f) { -#ifdef GPUOFFLOAD_FORCE - // runner_doself1_pack_f(r, sched, pack_vars_self_forc, ci, - // t, parts_aos_forc, &packing_time_f); - ticks tic_cpu_pack = getticks(); - - packing_time_f += runner_doself1_pack_f4_f( - r, sched, pack_vars_self_forc, ci, t, parts_aos_forc_f4_send, - task_first_part_f4_f); - - t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; - - // int count = ci->hydro.count; - // for(int i = 0; i < count; i++){ - // int pid = pack_vars_self_forc->count_parts - count + - // i; if(parts_aos_forc_f4_send[pid].ux_m.w < - // 1e-9)fprintf(stderr, "zero mass after packing %i %f\n", - // pid, parts_aos_forc_f4_send[pid].ux_m.w); - // } - /* No pack tasks left in queue, flag that we want to run */ - int launch_leftovers = pack_vars_self_forc->launch_leftovers; - /*Packed enough tasks let's go*/ - int launch = pack_vars_self_forc->launch; - /* Do we have enough stuff to run the GPU ? 
*/ - if (launch || launch_leftovers) { - /*Launch GPU tasks*/ - // runner_doself1_launch_f(r, sched, - // pack_vars_self_forc, ci, t, parts_aos_forc, - // d_parts_aos_forc, stream, d_a, d_H, e, &packing_time_f, - // &time_for_gpu_f); - signal_sleeping_runners(sched, t, pack_vars_self_forc->tasks_packed); - runner_doself1_launch_f4_f( - r, sched, pack_vars_self_forc, ci, t, parts_aos_forc_f4_send, - parts_aos_forc_f4_recv, d_parts_aos_forc_f4_send, - d_parts_aos_forc_f4_recv, stream, d_a, d_H, e, - &packing_time_f, &time_for_gpu_f, task_first_part_f4_f, - d_task_first_part_f4_f, self_end_f, &unpack_time_self_f); - } /*End of GPU work Self*/ -#endif //GPUOFFLOAD_FORCE } else if (t->subtype == task_subtype_limiter) runner_dosub_self1_limiter(r, ci, 1); @@ -1740,119 +1626,17 @@ void *runner_main2(void *data) { // message("Doing a pair sub task"); runner_dosub_pair1_density(r, ci, cj, 1); } - else if (t->subtype == task_subtype_gpu_pack) { -#ifdef GPUOFFLOAD_DENSITY - ticks tic_cpu_pack = getticks(); - - message("Did a sub_pair density"); - packing_time_pair += runner_dopair1_pack_f4( - r, sched, pack_vars_pair_dens, ci, cj, t, - parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens); - - t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; - /* Packed enough tasks or no pack tasks left in queue, flag that - * we want to run */ - int launch = pack_vars_pair_dens->launch; - int launch_leftovers = pack_vars_pair_dens->launch_leftovers; - - /* Do we have enough stuff to run the GPU ? */ - if (launch) n_full_p_d_bundles++; - if (launch_leftovers) n_partial_p_d_bundles++; - if (launch || launch_leftovers) { - - /*Launch GPU tasks*/ - // runner_dopair1_launch(r, sched, - // pack_vars_pair_dens, ci, t, parts_aos_pair_dens, - // d_parts_aos_pair_dens, - // stream, d_a, d_H, e, &packing_time_pair, - //&time_for_density_gpu_pair); - signal_sleeping_runners(sched, t, pack_vars_pair_dens->tasks_packed); - runner_dopair1_launch_f4_one_memcpy( - r, sched, pack_vars_pair_dens, t, parts_aos_pair_f4_send, - parts_aos_pair_f4_recv, d_parts_aos_pair_f4_send, - d_parts_aos_pair_f4_recv, stream_pairs, d_a, d_H, e, - &packing_time_pair, &time_for_density_gpu_pair, - &unpacking_time_pair, fparti_fpartj_lparti_lpartj_dens, - pair_end); - } -#endif - } #ifdef EXTRA_HYDRO_LOOP else if (t->subtype == task_subtype_gradient) { runner_dosub_pair1_gradient(r, ci, cj, 1); // fprintf(stderr, "split a g task\n"); - } else if (t->subtype == task_subtype_gpu_pack_g) { -#ifdef GPUOFFLOAD_GRADIENT - ticks tic_cpu_pack = getticks(); - - packing_time_pair_g += - runner_dopair1_pack_f4_g(r, sched, pack_vars_pair_grad, ci, - cj, t, parts_aos_pair_f4_g_send, e, - fparti_fpartj_lparti_lpartj_grad); - - t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; - - /* No pack tasks left in queue, flag that we want to run */ - int launch_leftovers = pack_vars_pair_grad->launch_leftovers; - /*Packed enough tasks let's go*/ - int launch = pack_vars_pair_grad->launch; - /* Do we have enough stuff to run the GPU ? 
*/ - if (launch || launch_leftovers) { - /*Launch GPU tasks*/ - // runner_dopair1_launch_g(r, sched, - // pack_vars_pair_grad, ci, t, parts_aos_pair_grad, - // d_parts_aos_pair_grad, - // stream, d_a, d_H, e, &packing_time_pair_g, - //&time_for_gpu_pair_g); - signal_sleeping_runners(sched, t, pack_vars_pair_grad->tasks_packed); - runner_dopair1_launch_f4_g_one_memcpy( - r, sched, pack_vars_pair_grad, t, parts_aos_pair_f4_g_send, - parts_aos_pair_f4_g_recv, d_parts_aos_pair_f4_g_send, - d_parts_aos_pair_f4_g_recv, stream_pairs, d_a, d_H, e, - &packing_time_pair_g, &time_for_gpu_pair_g, - &unpacking_time_pair_g, fparti_fpartj_lparti_lpartj_grad, - pair_end_g); - } -#endif } #endif else if (t->subtype == task_subtype_force) { runner_dosub_pair2_force(r, ci, cj, 1); // fprintf(stderr, "split a f task\n"); - } else if (t->subtype == task_subtype_gpu_pack_f) { -#ifdef GPUOFFLOAD_FORCE - ticks tic_cpu_pack = getticks(); - - packing_time_pair_f += - runner_dopair1_pack_f4_f(r, sched, pack_vars_pair_forc, ci, - cj, t, parts_aos_pair_f4_f_send, e, - fparti_fpartj_lparti_lpartj_forc); - - t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; - - /* No pack tasks left in queue, flag that we want to run */ - int launch_leftovers = pack_vars_pair_forc->launch_leftovers; - /*Packed enough tasks let's go*/ - int launch = pack_vars_pair_forc->launch; - /* Do we have enough stuff to run the GPU ? */ - if (launch || launch_leftovers) { - /*Launch GPU tasks*/ - // runner_dopair1_launch_f(r, sched, - // pack_vars_pair_forc, ci, t, parts_aos_pair_forc, - // d_parts_aos_pair_forc, - // stream, d_a, d_H, e, &packing_time_pair_f, - // &time_for_gpu_pair_f); - signal_sleeping_runners(sched, t, pack_vars_pair_forc->tasks_packed); - runner_dopair1_launch_f4_f_one_memcpy( - r, sched, pack_vars_pair_forc, t, parts_aos_pair_f4_f_send, - parts_aos_pair_f4_f_recv, d_parts_aos_pair_f4_f_send, - d_parts_aos_pair_f4_f_recv, stream_pairs, d_a, d_H, e, - &packing_time_pair_f, &time_for_gpu_pair_f, - &unpacking_time_pair_f, fparti_fpartj_lparti_lpartj_forc, - pair_end_f); - } /* End of GPU work Pairs */ -#endif - } else if (t->subtype == task_subtype_limiter) + } + else if (t->subtype == task_subtype_limiter) runner_dosub_pair1_limiter(r, ci, cj, 1); else if (t->subtype == task_subtype_stars_density) runner_dosub_pair_stars_density(r, ci, cj, 1); From 4d726d15fd3541d831834c88b3ad925e592e0b5f Mon Sep 17 00:00:00 2001 From: Abouzied Date: Wed, 20 Nov 2024 15:56:15 +0000 Subject: [PATCH 084/217] commented out sub tasks in runner_main. Edited how tasks are created --- .../HydroTests/GreshoVortex_3D/gresho.yml | 4 +- src/engine_maketasks.c | 82 +++--- src/runner_main_clean.cu | 238 +++++++++++++++++- 3 files changed, 280 insertions(+), 44 deletions(-) diff --git a/examples/HydroTests/GreshoVortex_3D/gresho.yml b/examples/HydroTests/GreshoVortex_3D/gresho.yml index 5f8b0d9f95..5ba23a260b 100644 --- a/examples/HydroTests/GreshoVortex_3D/gresho.yml +++ b/examples/HydroTests/GreshoVortex_3D/gresho.yml @@ -7,8 +7,8 @@ InternalUnitSystem: UnitTemp_in_cgs: 1 # Kelvin Scheduler: - max_top_level_cells: 16 - cell_split_size: 64 + max_top_level_cells: 15 + cell_split_size: 100 deadlock_waiting_time_s: 10. # Parameters governing the time integration diff --git a/src/engine_maketasks.c b/src/engine_maketasks.c index f485df86c7..a383a3fe76 100644 --- a/src/engine_maketasks.c +++ b/src/engine_maketasks.c @@ -4821,7 +4821,7 @@ void engine_maketasks(struct engine *e) { ticks tic2 = getticks(); /*Initialise GPU task size in prep. for creation A. 
Nasar */ - sched->target_gpu_tasks = s->nr_cells; // OK AS LONG AS NOT SPLITTING + sched->target_gpu_tasks = 32*32*32;//s->nr_cells; // OK AS LONG AS NOT SPLITTING const int target_gpu_tasks = sched->target_gpu_tasks; /* Construct the first hydro loop over neighbours */ @@ -4919,6 +4919,47 @@ void engine_maketasks(struct engine *e) { tic2 = getticks(); + + /* Run through the tasks and make force tasks for each density task. + Each force task depends on the cell ghosts and unlocks the kick task + of its super-cell. */ + if (e->policy & engine_policy_hydro) { + + /* Note that this does not scale well at all so we do not use the + * threadpool version here until the reason for this is found. + * We call the mapper function directly as if there was only 1 thread + * in the pool. */ + engine_make_extra_hydroloop_tasks_mapper(sched->tasks, sched->nr_tasks, e); + /* threadpool_map(&e->threadpool, engine_make_extra_hydroloop_tasks_mapper, + * sched->tasks, sched->nr_tasks, sizeof(struct task), + * threadpool_auto_chunk_size, e); */ + } + +// for (int i = 0; i < sched->nr_tasks; i++) { +// struct task * t = &sched->tasks[i]; +// if(t->type == task_type_sub_self && t->subtype == task_subtype_gpu_pack){ +// t->type = task_type_self; +// } +// if(t->type == task_type_sub_pair && t->subtype == task_subtype_gpu_pack){ +// t->type = task_type_pair; +// } +// } +// for (int i = 0; i < sched->nr_tasks; i++) { +// struct task * t = &sched->tasks[i]; +// if(t->type == task_type_sub_self && t->subtype == task_subtype_gpu_pack_g){ +// t->type = task_type_self; +// } +// if(t->type == task_type_sub_pair && t->subtype == task_subtype_gpu_pack_g){ +// t->type = task_type_pair; +// } +// if(t->type == task_type_sub_self && t->subtype == task_subtype_gpu_pack_f){ +// t->type = task_type_self; +// } +// if(t->type == task_type_sub_pair && t->subtype == task_subtype_gpu_pack_f){ +// t->type = task_type_pair; +// } +// } + /* Now, create unpack tasks based on the existing packs and create * the dependencies pack->unpack->ghost_in A. Nasar */ const int pack_size = sched->pack_size; @@ -5013,44 +5054,7 @@ void engine_maketasks(struct engine *e) { // scheduler_addunlock(sched, l->t, t); // } // } - /* Run through the tasks and make force tasks for each density task. - Each force task depends on the cell ghosts and unlocks the kick task - of its super-cell. */ - if (e->policy & engine_policy_hydro) { - /* Note that this does not scale well at all so we do not use the - * threadpool version here until the reason for this is found. - * We call the mapper function directly as if there was only 1 thread - * in the pool. 
*/ - engine_make_extra_hydroloop_tasks_mapper(sched->tasks, sched->nr_tasks, e); - /* threadpool_map(&e->threadpool, engine_make_extra_hydroloop_tasks_mapper, - * sched->tasks, sched->nr_tasks, sizeof(struct task), - * threadpool_auto_chunk_size, e); */ - } - for (int i = 0; i < sched->nr_tasks; i++) { - struct task * t = &sched->tasks[i]; - if(t->type == task_type_sub_self && t->subtype == task_subtype_gpu_pack){ - t->type = task_type_self; - } - if(t->type == task_type_sub_pair && t->subtype == task_subtype_gpu_pack){ - t->type = task_type_pair; - } - } - for (int i = 0; i < sched->nr_tasks; i++) { - struct task * t = &sched->tasks[i]; - if(t->type == task_type_sub_self && t->subtype == task_subtype_gpu_pack_g){ - t->type = task_type_self; - } - if(t->type == task_type_sub_pair && t->subtype == task_subtype_gpu_pack_g){ - t->type = task_type_pair; - } - if(t->type == task_type_sub_self && t->subtype == task_subtype_gpu_pack_f){ - t->type = task_type_self; - } - if(t->type == task_type_sub_pair && t->subtype == task_subtype_gpu_pack_f){ - t->type = task_type_pair; - } - } /*Now create unpacks for all gpu_pack_g (gradient) tasks A. Nasar */ count_current_self = 0; count_current_pair = 0; @@ -5395,7 +5399,7 @@ void engine_maketasks(struct engine *e) { // t->subtype == task_subtype_gpu_pack_f) && // (t->type == task_type_sub_pair || // t->type == task_type_sub_self)){ -// error("STill have subs"); +//// error("STill have subs"); // } } diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 0944bb45a0..f4fa8592ff 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -1572,16 +1572,131 @@ void *runner_main2(void *data) { (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; /* GPU WORK */ - } + } else if (t->subtype == task_subtype_gpu_pack) { + packed_self++; +#ifdef GPUOFFLOAD_DENSITY +// // struct timespec t0, t1; // +// // clock_gettime(CLOCK_REALTIME, &t0); +// ticks tic_cpu_pack = getticks(); +// message("Did a sub_self density"); +// +// packing_time += +// runner_doself1_pack_f4(r, sched, pack_vars_self_dens, ci, t, +// parts_aos_f4_send, task_first_part_f4); +// +// t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; +// +// // clock_gettime(CLOCK_REALTIME, &t1); +// // packing_time += (t1.tv_sec - t0.tv_sec) + +// // (t1.tv_nsec - t0.tv_nsec) / +// // 1000000000.0; +// // runner_doself1_pack(r, sched, pack_vars_self_dens, ci, +// // t, parts_aos_dens, &packing_time); +// /* No pack tasks left in queue, flag that we want to run */ +// int launch_leftovers = pack_vars_self_dens->launch_leftovers; +// /*Packed enough tasks let's go*/ +// int launch = pack_vars_self_dens->launch; +// /* Do we have enough stuff to run the GPU ? 
*/ +// if (launch) n_full_d_bundles++; +// if (launch_leftovers) n_partial_d_bundles++; +// if (launch || launch_leftovers) { +// /*Launch GPU tasks*/ +// signal_sleeping_runners(sched, t, pack_vars_self_dens->tasks_packed); +// runner_doself1_launch_f4( +// r, sched, pack_vars_self_dens, ci, t, parts_aos_f4_send, +// parts_aos_f4_recv, d_parts_aos_f4_send, d_parts_aos_f4_recv, +// stream, d_a, d_H, e, &packing_time, &time_for_density_gpu, +// &unpack_time_self, task_first_part_self_dens_f4, devId, +// task_first_part_f4, d_task_first_part_f4, self_end); +// // runner_doself1_launch(r, sched, +// // pack_vars_self_dens, ci, t, parts_aos_dens, +// // d_parts_aos_dens, stream, d_a, d_H, e, &packing_time, +// // &time_for_density_gpu, +// // &tot_time_for_hard_memcpys); +// } /*End of GPU work Self*/ +#endif //GPUOFFLOAD_DENSITY + } /* self / pack */ #ifdef EXTRA_HYDRO_LOOP else if (t->subtype == task_subtype_gradient) { runner_dosub_self1_gradient(r, ci, 1); // fprintf(stderr, "split a g task\n"); } + else if (t->subtype == task_subtype_gpu_pack_g) { +#ifdef GPUOFFLOAD_GRADIENT +// // runner_doself1_pack_g(r, sched, pack_vars_self_grad, ci, +// // t, parts_aos_grad, &packing_time_g); +// ticks tic_cpu_pack = getticks(); +// +// packing_time_g += runner_doself1_pack_f4_g( +// r, sched, pack_vars_self_grad, ci, t, parts_aos_grad_f4_send, +// task_first_part_f4_g); +// +// t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; +// +// /* No pack tasks left in queue, flag that we want to run */ +// int launch_leftovers = pack_vars_self_grad->launch_leftovers; +// /*Packed enough tasks let's go*/ +// int launch = pack_vars_self_grad->launch; +// /* Do we have enough stuff to run the GPU ? */ +// if (launch || launch_leftovers) { +// /*Launch GPU tasks*/ +// // runner_doself1_launch_g(r, sched, +// // pack_vars_self_grad, ci, t, parts_aos_grad, +// // d_parts_aos_grad, stream, d_a, +// // d_H, e, &packing_time_g, &time_for_gpu_g); +// signal_sleeping_runners(sched, t, pack_vars_self_grad->tasks_packed); +// runner_doself1_launch_f4_g( +// r, sched, pack_vars_self_grad, ci, t, parts_aos_grad_f4_send, +// parts_aos_grad_f4_recv, d_parts_aos_grad_f4_send, +// d_parts_aos_grad_f4_recv, stream, d_a, d_H, e, +// &packing_time_g, &time_for_gpu_g, task_first_part_f4_g, +// d_task_first_part_f4_g, self_end_g, &unpack_time_self_g); +// } /*End of GPU work Self*/ +#endif //GPUOFFLOAD_GRADIENT + } #endif else if (t->subtype == task_subtype_force) { runner_dosub_self2_force(r, ci, 1); // fprintf(stderr, "split a f task\n"); + } else if (t->subtype == task_subtype_gpu_pack_f) { +#ifdef GPUOFFLOAD_FORCE +// // runner_doself1_pack_f(r, sched, pack_vars_self_forc, ci, +// // t, parts_aos_forc, &packing_time_f); +// ticks tic_cpu_pack = getticks(); +// +// packing_time_f += runner_doself1_pack_f4_f( +// r, sched, pack_vars_self_forc, ci, t, parts_aos_forc_f4_send, +// task_first_part_f4_f); +// +// t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; +// +// // int count = ci->hydro.count; +// // for(int i = 0; i < count; i++){ +// // int pid = pack_vars_self_forc->count_parts - count + +// // i; if(parts_aos_forc_f4_send[pid].ux_m.w < +// // 1e-9)fprintf(stderr, "zero mass after packing %i %f\n", +// // pid, parts_aos_forc_f4_send[pid].ux_m.w); +// // } +// /* No pack tasks left in queue, flag that we want to run */ +// int launch_leftovers = pack_vars_self_forc->launch_leftovers; +// /*Packed enough tasks let's go*/ +// int launch = pack_vars_self_forc->launch; +// /* Do we have enough stuff to run the GPU ? 
*/ +// if (launch || launch_leftovers) { +// /*Launch GPU tasks*/ +// // runner_doself1_launch_f(r, sched, +// // pack_vars_self_forc, ci, t, parts_aos_forc, +// // d_parts_aos_forc, stream, d_a, d_H, e, &packing_time_f, +// // &time_for_gpu_f); +// signal_sleeping_runners(sched, t, pack_vars_self_forc->tasks_packed); +// runner_doself1_launch_f4_f( +// r, sched, pack_vars_self_forc, ci, t, parts_aos_forc_f4_send, +// parts_aos_forc_f4_recv, d_parts_aos_forc_f4_send, +// d_parts_aos_forc_f4_recv, stream, d_a, d_H, e, +// &packing_time_f, &time_for_gpu_f, task_first_part_f4_f, +// d_task_first_part_f4_f, self_end_f, &unpack_time_self_f); +// } /*End of GPU work Self*/ +#endif //GPUOFFLOAD_FORCE } else if (t->subtype == task_subtype_limiter) runner_dosub_self1_limiter(r, ci, 1); @@ -1626,17 +1741,119 @@ void *runner_main2(void *data) { // message("Doing a pair sub task"); runner_dosub_pair1_density(r, ci, cj, 1); } + else if (t->subtype == task_subtype_gpu_pack) { +#ifdef GPUOFFLOAD_DENSITY +// ticks tic_cpu_pack = getticks(); +// +// message("Did a sub_pair density"); +// packing_time_pair += runner_dopair1_pack_f4( +// r, sched, pack_vars_pair_dens, ci, cj, t, +// parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens); +// +// t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; +// /* Packed enough tasks or no pack tasks left in queue, flag that +// * we want to run */ +// int launch = pack_vars_pair_dens->launch; +// int launch_leftovers = pack_vars_pair_dens->launch_leftovers; +// +// /* Do we have enough stuff to run the GPU ? */ +// if (launch) n_full_p_d_bundles++; +// if (launch_leftovers) n_partial_p_d_bundles++; +// if (launch || launch_leftovers) { +// +// /*Launch GPU tasks*/ +// // runner_dopair1_launch(r, sched, +// // pack_vars_pair_dens, ci, t, parts_aos_pair_dens, +// // d_parts_aos_pair_dens, +// // stream, d_a, d_H, e, &packing_time_pair, +// //&time_for_density_gpu_pair); +// signal_sleeping_runners(sched, t, pack_vars_pair_dens->tasks_packed); +// runner_dopair1_launch_f4_one_memcpy( +// r, sched, pack_vars_pair_dens, t, parts_aos_pair_f4_send, +// parts_aos_pair_f4_recv, d_parts_aos_pair_f4_send, +// d_parts_aos_pair_f4_recv, stream_pairs, d_a, d_H, e, +// &packing_time_pair, &time_for_density_gpu_pair, +// &unpacking_time_pair, fparti_fpartj_lparti_lpartj_dens, +// pair_end); +// } +#endif + } #ifdef EXTRA_HYDRO_LOOP else if (t->subtype == task_subtype_gradient) { runner_dosub_pair1_gradient(r, ci, cj, 1); // fprintf(stderr, "split a g task\n"); + } else if (t->subtype == task_subtype_gpu_pack_g) { +#ifdef GPUOFFLOAD_GRADIENT +// ticks tic_cpu_pack = getticks(); +// +// packing_time_pair_g += +// runner_dopair1_pack_f4_g(r, sched, pack_vars_pair_grad, ci, +// cj, t, parts_aos_pair_f4_g_send, e, +// fparti_fpartj_lparti_lpartj_grad); +// +// t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; +// +// /* No pack tasks left in queue, flag that we want to run */ +// int launch_leftovers = pack_vars_pair_grad->launch_leftovers; +// /*Packed enough tasks let's go*/ +// int launch = pack_vars_pair_grad->launch; +// /* Do we have enough stuff to run the GPU ? 
*/ +// if (launch || launch_leftovers) { +// /*Launch GPU tasks*/ +// // runner_dopair1_launch_g(r, sched, +// // pack_vars_pair_grad, ci, t, parts_aos_pair_grad, +// // d_parts_aos_pair_grad, +// // stream, d_a, d_H, e, &packing_time_pair_g, +// //&time_for_gpu_pair_g); +// signal_sleeping_runners(sched, t, pack_vars_pair_grad->tasks_packed); +// runner_dopair1_launch_f4_g_one_memcpy( +// r, sched, pack_vars_pair_grad, t, parts_aos_pair_f4_g_send, +// parts_aos_pair_f4_g_recv, d_parts_aos_pair_f4_g_send, +// d_parts_aos_pair_f4_g_recv, stream_pairs, d_a, d_H, e, +// &packing_time_pair_g, &time_for_gpu_pair_g, +// &unpacking_time_pair_g, fparti_fpartj_lparti_lpartj_grad, +// pair_end_g); +// } +#endif } #endif else if (t->subtype == task_subtype_force) { runner_dosub_pair2_force(r, ci, cj, 1); // fprintf(stderr, "split a f task\n"); - } - else if (t->subtype == task_subtype_limiter) + } else if (t->subtype == task_subtype_gpu_pack_f) { +#ifdef GPUOFFLOAD_FORCE +// ticks tic_cpu_pack = getticks(); +// +// packing_time_pair_f += +// runner_dopair1_pack_f4_f(r, sched, pack_vars_pair_forc, ci, +// cj, t, parts_aos_pair_f4_f_send, e, +// fparti_fpartj_lparti_lpartj_forc); +// +// t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; +// +// /* No pack tasks left in queue, flag that we want to run */ +// int launch_leftovers = pack_vars_pair_forc->launch_leftovers; +// /*Packed enough tasks let's go*/ +// int launch = pack_vars_pair_forc->launch; +// /* Do we have enough stuff to run the GPU ? */ +// if (launch || launch_leftovers) { +// /*Launch GPU tasks*/ +// // runner_dopair1_launch_f(r, sched, +// // pack_vars_pair_forc, ci, t, parts_aos_pair_forc, +// // d_parts_aos_pair_forc, +// // stream, d_a, d_H, e, &packing_time_pair_f, +// // &time_for_gpu_pair_f); +// signal_sleeping_runners(sched, t, pack_vars_pair_forc->tasks_packed); +// runner_dopair1_launch_f4_f_one_memcpy( +// r, sched, pack_vars_pair_forc, t, parts_aos_pair_f4_f_send, +// parts_aos_pair_f4_f_recv, d_parts_aos_pair_f4_f_send, +// d_parts_aos_pair_f4_f_recv, stream_pairs, d_a, d_H, e, +// &packing_time_pair_f, &time_for_gpu_pair_f, +// &unpacking_time_pair_f, fparti_fpartj_lparti_lpartj_forc, +// pair_end_f); +// } /* End of GPU work Pairs */ +#endif + } else if (t->subtype == task_subtype_limiter) runner_dosub_pair1_limiter(r, ci, cj, 1); else if (t->subtype == task_subtype_stars_density) runner_dosub_pair_stars_density(r, ci, cj, 1); @@ -1908,6 +2125,10 @@ void *runner_main2(void *data) { if (t->subtype == task_subtype_gpu_pack) { #ifdef GPUOFFLOAD_DENSITY + if (t->type == task_type_sub_self || t->type == task_type_sub_pair){ + t = scheduler_done(sched, t); + } + else{ /* Don't enqueue unpacks yet. Just signal the runners */ t->skip = 1; t->toc = getticks(); @@ -1915,6 +2136,7 @@ void *runner_main2(void *data) { // signal_sleeping_runners(sched, t); // enqueue_dependencies(sched, t); t = NULL; + } #else t = scheduler_done(sched, t); #endif @@ -1922,6 +2144,10 @@ void *runner_main2(void *data) { else if (t->subtype == task_subtype_gpu_pack_g) { #ifdef GPUOFFLOAD_GRADIENT + if (t->type == task_type_sub_self || t->type == task_type_sub_pair){ + t = scheduler_done(sched, t); + } + else{ /* Don't enqueue unpacks yet. 
Just signal the runners */ t->skip = 1; t->toc = getticks(); @@ -1929,6 +2155,7 @@ void *runner_main2(void *data) { // signal_sleeping_runners(sched, t); // enqueue_dependencies(sched, t); t = NULL; + } #else t = scheduler_done(sched, t); #endif @@ -1936,13 +2163,18 @@ void *runner_main2(void *data) { else if (t->subtype == task_subtype_gpu_pack_f) { #ifdef GPUOFFLOAD_FORCE + if (t->type == task_type_sub_self || t->type == task_type_sub_pair){ + t = scheduler_done(sched, t); + } /* Don't enqueue unpacks yet. Just signal the runners */ + else{ t->skip = 1; t->toc = getticks(); t->total_ticks += t->toc - t->tic; // signal_sleeping_runners(sched, t); // enqueue_dependencies(sched, t); t = NULL; + } #else t = scheduler_done(sched, t); #endif From dcbad172c4afe3c33dc7040b96b33d4206104d2b Mon Sep 17 00:00:00 2001 From: Abouzied Date: Wed, 20 Nov 2024 16:40:28 +0000 Subject: [PATCH 085/217] Made sub tasks implicit and fixed bug with atomic_dec in runner_main --- src/engine_maketasks.c | 15 ++++++++------- src/runner_main_clean.cu | 21 +++++++++++++++++++++ src/scheduler.c | 9 --------- 3 files changed, 29 insertions(+), 16 deletions(-) diff --git a/src/engine_maketasks.c b/src/engine_maketasks.c index a383a3fe76..5be385ce76 100644 --- a/src/engine_maketasks.c +++ b/src/engine_maketasks.c @@ -5394,13 +5394,14 @@ void engine_maketasks(struct engine *e) { // t->subtype == task_subtype_gpu_unpack_f){ // t->implicit = 1; // } -// if ((t->subtype == task_subtype_gpu_pack || -// t->subtype == task_subtype_gpu_pack_g || -// t->subtype == task_subtype_gpu_pack_f) && -// (t->type == task_type_sub_pair || -// t->type == task_type_sub_self)){ -//// error("STill have subs"); -// } + if ((t->subtype == task_subtype_gpu_pack || + t->subtype == task_subtype_gpu_pack_g || + t->subtype == task_subtype_gpu_pack_f) && + (t->type == task_type_sub_pair || + t->type == task_type_sub_self)){ + t->implicit = 1; +// error("STill have subs"); + } } } diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index f4fa8592ff..dd031bdaf7 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -2126,6 +2126,13 @@ void *runner_main2(void *data) { if (t->subtype == task_subtype_gpu_pack) { #ifdef GPUOFFLOAD_DENSITY if (t->type == task_type_sub_self || t->type == task_type_sub_pair){ + int qid = r->qid; + if (t->type == task_type_sub_pair){ + atomic_dec(&(sched->queues[qid].n_packs_pair_left)); + } + if (t->type == task_type_sub_self){ + atomic_dec(&(sched->queues[qid].n_packs_self_left)); + } t = scheduler_done(sched, t); } else{ @@ -2145,6 +2152,13 @@ void *runner_main2(void *data) { else if (t->subtype == task_subtype_gpu_pack_g) { #ifdef GPUOFFLOAD_GRADIENT if (t->type == task_type_sub_self || t->type == task_type_sub_pair){ + int qid = r->qid; + if (t->type == task_type_sub_pair){ + atomic_dec(&(sched->queues[qid].n_packs_pair_left_g)); + } + if (t->type == task_type_sub_self){ + atomic_dec(&(sched->queues[qid].n_packs_self_left_g)); + } t = scheduler_done(sched, t); } else{ @@ -2164,6 +2178,13 @@ void *runner_main2(void *data) { else if (t->subtype == task_subtype_gpu_pack_f) { #ifdef GPUOFFLOAD_FORCE if (t->type == task_type_sub_self || t->type == task_type_sub_pair){ + int qid = r->qid; + if (t->type == task_type_sub_pair){ + atomic_dec(&(sched->queues[qid].n_packs_pair_left_f)); + } + if (t->type == task_type_sub_self){ + atomic_dec(&(sched->queues[qid].n_packs_self_left_f)); + } t = scheduler_done(sched, t); } /* Don't enqueue unpacks yet. 
Just signal the runners */ diff --git a/src/scheduler.c b/src/scheduler.c index 4c8e52a4c3..ae4572177d 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -2902,21 +2902,12 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) { task_type_pair || t->type == task_type_sub_pair) { // A. Nasar NEED to think about how to do this with // MPI where ci may not be on this node/rank if (t->subtype == task_subtype_gpu_pack) { - if (t->ci->nodeID == s->nodeID) - atomic_inc(&s->queues[qid].n_packs_pair_left); - else atomic_inc(&s->queues[qid].n_packs_pair_left); } if (t->subtype == task_subtype_gpu_pack_f) { - if (t->ci->nodeID == s->nodeID) - atomic_inc(&s->queues[qid].n_packs_pair_left_f); - else atomic_inc(&s->queues[qid].n_packs_pair_left_f); } if (t->subtype == task_subtype_gpu_pack_g) { - if (t->ci->nodeID == s->nodeID) - atomic_inc(&s->queues[qid].n_packs_pair_left_g); - else atomic_inc(&s->queues[qid].n_packs_pair_left_g); } } From 2beb67e3e56367d509a04089593c7a30b83a9273 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Wed, 20 Nov 2024 18:25:15 +0000 Subject: [PATCH 086/217] reverted to converting subs --- .../HydroTests/GreshoVortex_3D/getGlass.sh | 2 +- .../HydroTests/GreshoVortex_3D/gresho.yml | 11 ++- examples/HydroTests/GreshoVortex_3D/makeIC.py | 2 +- src/engine_maketasks.c | 68 ++++++++++--------- 4 files changed, 41 insertions(+), 42 deletions(-) diff --git a/examples/HydroTests/GreshoVortex_3D/getGlass.sh b/examples/HydroTests/GreshoVortex_3D/getGlass.sh index d5c5f590ac..068986fc10 100755 --- a/examples/HydroTests/GreshoVortex_3D/getGlass.sh +++ b/examples/HydroTests/GreshoVortex_3D/getGlass.sh @@ -1,2 +1,2 @@ #!/bin/bash -wget http://virgodb.cosma.dur.ac.uk/swift-webstorage/ICs/glassCube_64.hdf5 +wget http://virgodb.cosma.dur.ac.uk/swift-webstorage/ICs/glassCube_128.hdf5 diff --git a/examples/HydroTests/GreshoVortex_3D/gresho.yml b/examples/HydroTests/GreshoVortex_3D/gresho.yml index 5ba23a260b..810f7afb1d 100644 --- a/examples/HydroTests/GreshoVortex_3D/gresho.yml +++ b/examples/HydroTests/GreshoVortex_3D/gresho.yml @@ -8,21 +8,19 @@ InternalUnitSystem: Scheduler: max_top_level_cells: 15 - cell_split_size: 100 - deadlock_waiting_time_s: 10. # Parameters governing the time integration TimeIntegration: time_begin: 0. # The starting time of the simulation (in internal units). - time_end: 1.0 # The end time of the simulation (in internal units). + time_end: 1. # The end time of the simulation (in internal units). dt_min: 1e-6 # The minimal time-step size of the simulation (in internal units). - dt_max: 1e-4 # The maximal time-step size of the simulation (in internal units). + dt_max: 1e-2 # The maximal time-step size of the simulation (in internal units). # Parameters governing the snapshots Snapshots: basename: gresho # Common part of the name of output files time_first: 0. # Time of the first output (in internal units) - delta_time: 1e-3 # Time difference between consecutive outputs (in internal units) + delta_time: 1e-1 # Time difference between consecutive outputs (in internal units) compression: 1 # Parameters governing the conserved quantities statistics @@ -31,11 +29,10 @@ Statistics: # Parameters for the hydrodynamics scheme SPH: - resolution_eta: 2.01 # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel). + resolution_eta: 1.2348 # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel). 
CFL_condition: 0.1 # Courant-Friedrich-Levy condition for time integration. # Parameters related to the initial conditions InitialConditions: file_name: ./greshoVortex.hdf5 # The file to read periodic: 1 - replicate: 2 diff --git a/examples/HydroTests/GreshoVortex_3D/makeIC.py b/examples/HydroTests/GreshoVortex_3D/makeIC.py index c611132715..19b38352eb 100644 --- a/examples/HydroTests/GreshoVortex_3D/makeIC.py +++ b/examples/HydroTests/GreshoVortex_3D/makeIC.py @@ -28,7 +28,7 @@ rho0 = 1 # Gas density P0 = 0.0 # Constant additional pressure (should have no impact on the dynamics) fileOutputName = "greshoVortex.hdf5" -fileGlass = "glassCube_64.hdf5" +fileGlass = "glassCube_128.hdf5" # --------------------------------------------------- # Get position and smoothing lengths from the glass diff --git a/src/engine_maketasks.c b/src/engine_maketasks.c index 5be385ce76..788b9a9a9d 100644 --- a/src/engine_maketasks.c +++ b/src/engine_maketasks.c @@ -3427,7 +3427,7 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, /*Make packing depend on sorts and drift A. Nasar */ else if (t_type == task_type_sub_self && t_subtype == task_subtype_gpu_pack) { scheduler_addunlock(sched, ci->hydro.super->hydro.drift, t); - scheduler_addunlock(sched, ci->hydro.super->hydro.sorts, t); +// scheduler_addunlock(sched, ci->hydro.super->hydro.sorts, t); } /* Otherwise, sub-self interaction? */ else if (t_type == task_type_sub_self && @@ -4855,7 +4855,9 @@ void engine_maketasks(struct engine *e) { tic2 = getticks(); /* Split the tasks. */ +// message("before split"); scheduler_splittasks(sched, /*fof_tasks=*/0, e->verbose); +// message("after split"); if (e->verbose) message("Splitting tasks took %.3f %s.", @@ -4935,30 +4937,30 @@ void engine_maketasks(struct engine *e) { * threadpool_auto_chunk_size, e); */ } -// for (int i = 0; i < sched->nr_tasks; i++) { -// struct task * t = &sched->tasks[i]; -// if(t->type == task_type_sub_self && t->subtype == task_subtype_gpu_pack){ -// t->type = task_type_self; -// } -// if(t->type == task_type_sub_pair && t->subtype == task_subtype_gpu_pack){ -// t->type = task_type_pair; -// } -// } -// for (int i = 0; i < sched->nr_tasks; i++) { -// struct task * t = &sched->tasks[i]; -// if(t->type == task_type_sub_self && t->subtype == task_subtype_gpu_pack_g){ -// t->type = task_type_self; -// } -// if(t->type == task_type_sub_pair && t->subtype == task_subtype_gpu_pack_g){ -// t->type = task_type_pair; -// } -// if(t->type == task_type_sub_self && t->subtype == task_subtype_gpu_pack_f){ -// t->type = task_type_self; -// } -// if(t->type == task_type_sub_pair && t->subtype == task_subtype_gpu_pack_f){ -// t->type = task_type_pair; -// } -// } + for (int i = 0; i < sched->nr_tasks; i++) { + struct task * t = &sched->tasks[i]; + if(t->type == task_type_sub_self && t->subtype == task_subtype_gpu_pack){ + t->type = task_type_self; + } + if(t->type == task_type_sub_pair && t->subtype == task_subtype_gpu_pack){ + t->type = task_type_pair; + } + } + for (int i = 0; i < sched->nr_tasks; i++) { + struct task * t = &sched->tasks[i]; + if(t->type == task_type_sub_self && t->subtype == task_subtype_gpu_pack_g){ + t->type = task_type_self; + } + if(t->type == task_type_sub_pair && t->subtype == task_subtype_gpu_pack_g){ + t->type = task_type_pair; + } + if(t->type == task_type_sub_self && t->subtype == task_subtype_gpu_pack_f){ + t->type = task_type_self; + } + if(t->type == task_type_sub_pair && t->subtype == task_subtype_gpu_pack_f){ + t->type = task_type_pair; + } + } /* 
Now, create unpack tasks based on the existing packs and create * the dependencies pack->unpack->ghost_in A. Nasar */ @@ -5394,14 +5396,14 @@ void engine_maketasks(struct engine *e) { // t->subtype == task_subtype_gpu_unpack_f){ // t->implicit = 1; // } - if ((t->subtype == task_subtype_gpu_pack || - t->subtype == task_subtype_gpu_pack_g || - t->subtype == task_subtype_gpu_pack_f) && - (t->type == task_type_sub_pair || - t->type == task_type_sub_self)){ - t->implicit = 1; -// error("STill have subs"); - } +// if ((t->subtype == task_subtype_gpu_pack || +// t->subtype == task_subtype_gpu_pack_g || +// t->subtype == task_subtype_gpu_pack_f) && +// (t->type == task_type_sub_pair || +// t->type == task_type_sub_self)){ +// t->implicit = 1; +//// error("STill have subs"); +// } } } From 2a0d137c7954df8fdf42d60af8365fe1135eb770 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Wed, 20 Nov 2024 19:30:21 +0000 Subject: [PATCH 087/217] Too tired to carry on. There might be something to playing with space_subsize_pair/self_hydro_default --- examples/HydroTests/GreshoVortex_3D/gresho.yml | 2 +- src/runner_main_clean.cu | 2 +- src/space.h | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/HydroTests/GreshoVortex_3D/gresho.yml b/examples/HydroTests/GreshoVortex_3D/gresho.yml index 810f7afb1d..57135a4df1 100644 --- a/examples/HydroTests/GreshoVortex_3D/gresho.yml +++ b/examples/HydroTests/GreshoVortex_3D/gresho.yml @@ -8,7 +8,7 @@ InternalUnitSystem: Scheduler: max_top_level_cells: 15 - + tasks_per_cell: 200 # Parameters governing the time integration TimeIntegration: time_begin: 0. # The starting time of the simulation (in internal units). diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index dd031bdaf7..49ffa7033d 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -658,7 +658,7 @@ void *runner_main2(void *data) { error("MPI_Comm_size failed with error %i.", res); #endif int count_max_parts_tmp = - 100 * target_n_tasks * space->nr_parts * nr_nodes / (32*32*32);//space->nr_cells; + 100 * target_n_tasks * space->nr_parts * nr_nodes / (64*64*64);//space->nr_cells; message("max_parts %i\n", count_max_parts_tmp); pack_vars_self_dens->count_max_parts = count_max_parts_tmp; pack_vars_pair_dens->count_max_parts = count_max_parts_tmp; diff --git a/src/space.h b/src/space.h index 4e0e849d64..8e3dcabf15 100644 --- a/src/space.h +++ b/src/space.h @@ -55,8 +55,8 @@ struct hydro_props; #define space_extra_bparts_default 0 #define space_extra_sinks_default 0 #define space_expected_max_nr_strays_default 100 -#define space_subsize_pair_hydro_default 256000000 -#define space_subsize_self_hydro_default 32000 +#define space_subsize_pair_hydro_default 512 +#define space_subsize_self_hydro_default 256 #define space_subsize_pair_stars_default 256000000 #define space_subsize_self_stars_default 32000 #define space_subsize_pair_grav_default 256000000 From 4bfa575c04bd28343af9a025a30eb993f3334ff4 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Thu, 21 Nov 2024 10:06:18 +0000 Subject: [PATCH 088/217] A few edits to test on Bede GHs --- .../HydroTests/GreshoVortex_3D/gresho.yml | 6 +- examples/HydroTests/GreshoVortex_3D/makeIC.py | 2 +- src/engine_maketasks.c | 48 +- src/runner_main_clean.cu | 470 +++++++++--------- src/space.h | 4 +- 5 files changed, 265 insertions(+), 265 deletions(-) diff --git a/examples/HydroTests/GreshoVortex_3D/gresho.yml b/examples/HydroTests/GreshoVortex_3D/gresho.yml index 57135a4df1..36d2f23ea8 100644 --- 
a/examples/HydroTests/GreshoVortex_3D/gresho.yml +++ b/examples/HydroTests/GreshoVortex_3D/gresho.yml @@ -7,20 +7,20 @@ InternalUnitSystem: UnitTemp_in_cgs: 1 # Kelvin Scheduler: - max_top_level_cells: 15 + max_top_level_cells: 8 tasks_per_cell: 200 # Parameters governing the time integration TimeIntegration: time_begin: 0. # The starting time of the simulation (in internal units). time_end: 1. # The end time of the simulation (in internal units). dt_min: 1e-6 # The minimal time-step size of the simulation (in internal units). - dt_max: 1e-2 # The maximal time-step size of the simulation (in internal units). + dt_max: 1e-4 # The maximal time-step size of the simulation (in internal units). # Parameters governing the snapshots Snapshots: basename: gresho # Common part of the name of output files time_first: 0. # Time of the first output (in internal units) - delta_time: 1e-1 # Time difference between consecutive outputs (in internal units) + delta_time: 1e-3 # Time difference between consecutive outputs (in internal units) compression: 1 # Parameters governing the conserved quantities statistics diff --git a/examples/HydroTests/GreshoVortex_3D/makeIC.py b/examples/HydroTests/GreshoVortex_3D/makeIC.py index 19b38352eb..c611132715 100644 --- a/examples/HydroTests/GreshoVortex_3D/makeIC.py +++ b/examples/HydroTests/GreshoVortex_3D/makeIC.py @@ -28,7 +28,7 @@ rho0 = 1 # Gas density P0 = 0.0 # Constant additional pressure (should have no impact on the dynamics) fileOutputName = "greshoVortex.hdf5" -fileGlass = "glassCube_128.hdf5" +fileGlass = "glassCube_64.hdf5" # --------------------------------------------------- # Get position and smoothing lengths from the glass diff --git a/src/engine_maketasks.c b/src/engine_maketasks.c index 788b9a9a9d..0893510f1e 100644 --- a/src/engine_maketasks.c +++ b/src/engine_maketasks.c @@ -4937,30 +4937,30 @@ void engine_maketasks(struct engine *e) { * threadpool_auto_chunk_size, e); */ } - for (int i = 0; i < sched->nr_tasks; i++) { - struct task * t = &sched->tasks[i]; - if(t->type == task_type_sub_self && t->subtype == task_subtype_gpu_pack){ - t->type = task_type_self; - } - if(t->type == task_type_sub_pair && t->subtype == task_subtype_gpu_pack){ - t->type = task_type_pair; - } - } - for (int i = 0; i < sched->nr_tasks; i++) { - struct task * t = &sched->tasks[i]; - if(t->type == task_type_sub_self && t->subtype == task_subtype_gpu_pack_g){ - t->type = task_type_self; - } - if(t->type == task_type_sub_pair && t->subtype == task_subtype_gpu_pack_g){ - t->type = task_type_pair; - } - if(t->type == task_type_sub_self && t->subtype == task_subtype_gpu_pack_f){ - t->type = task_type_self; - } - if(t->type == task_type_sub_pair && t->subtype == task_subtype_gpu_pack_f){ - t->type = task_type_pair; - } - } +// for (int i = 0; i < sched->nr_tasks; i++) { +// struct task * t = &sched->tasks[i]; +// if(t->type == task_type_sub_self && t->subtype == task_subtype_gpu_pack){ +// t->type = task_type_self; +// } +// if(t->type == task_type_sub_pair && t->subtype == task_subtype_gpu_pack){ +// t->type = task_type_pair; +// } +// } +// for (int i = 0; i < sched->nr_tasks; i++) { +// struct task * t = &sched->tasks[i]; +// if(t->type == task_type_sub_self && t->subtype == task_subtype_gpu_pack_g){ +// t->type = task_type_self; +// } +// if(t->type == task_type_sub_pair && t->subtype == task_subtype_gpu_pack_g){ +// t->type = task_type_pair; +// } +// if(t->type == task_type_sub_self && t->subtype == task_subtype_gpu_pack_f){ +// t->type = task_type_self; +// } +// if(t->type 
== task_type_sub_pair && t->subtype == task_subtype_gpu_pack_f){ +// t->type = task_type_pair; +// } +// } /* Now, create unpack tasks based on the existing packs and create * the dependencies pack->unpack->ghost_in A. Nasar */ diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 49ffa7033d..68e17612e9 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -658,7 +658,7 @@ void *runner_main2(void *data) { error("MPI_Comm_size failed with error %i.", res); #endif int count_max_parts_tmp = - 100 * target_n_tasks * space->nr_parts * nr_nodes / (64*64*64);//space->nr_cells; + 2 * target_n_tasks * space->nr_parts * nr_nodes / space->tot_cells; message("max_parts %i\n", count_max_parts_tmp); pack_vars_self_dens->count_max_parts = count_max_parts_tmp; pack_vars_pair_dens->count_max_parts = count_max_parts_tmp; @@ -1575,45 +1575,45 @@ void *runner_main2(void *data) { } else if (t->subtype == task_subtype_gpu_pack) { packed_self++; #ifdef GPUOFFLOAD_DENSITY -// // struct timespec t0, t1; // -// // clock_gettime(CLOCK_REALTIME, &t0); -// ticks tic_cpu_pack = getticks(); -// message("Did a sub_self density"); -// -// packing_time += -// runner_doself1_pack_f4(r, sched, pack_vars_self_dens, ci, t, -// parts_aos_f4_send, task_first_part_f4); -// -// t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; -// -// // clock_gettime(CLOCK_REALTIME, &t1); -// // packing_time += (t1.tv_sec - t0.tv_sec) + -// // (t1.tv_nsec - t0.tv_nsec) / -// // 1000000000.0; -// // runner_doself1_pack(r, sched, pack_vars_self_dens, ci, -// // t, parts_aos_dens, &packing_time); -// /* No pack tasks left in queue, flag that we want to run */ -// int launch_leftovers = pack_vars_self_dens->launch_leftovers; -// /*Packed enough tasks let's go*/ -// int launch = pack_vars_self_dens->launch; -// /* Do we have enough stuff to run the GPU ? */ -// if (launch) n_full_d_bundles++; -// if (launch_leftovers) n_partial_d_bundles++; -// if (launch || launch_leftovers) { -// /*Launch GPU tasks*/ -// signal_sleeping_runners(sched, t, pack_vars_self_dens->tasks_packed); -// runner_doself1_launch_f4( -// r, sched, pack_vars_self_dens, ci, t, parts_aos_f4_send, -// parts_aos_f4_recv, d_parts_aos_f4_send, d_parts_aos_f4_recv, -// stream, d_a, d_H, e, &packing_time, &time_for_density_gpu, -// &unpack_time_self, task_first_part_self_dens_f4, devId, -// task_first_part_f4, d_task_first_part_f4, self_end); -// // runner_doself1_launch(r, sched, -// // pack_vars_self_dens, ci, t, parts_aos_dens, -// // d_parts_aos_dens, stream, d_a, d_H, e, &packing_time, -// // &time_for_density_gpu, -// // &tot_time_for_hard_memcpys); -// } /*End of GPU work Self*/ + // struct timespec t0, t1; // + // clock_gettime(CLOCK_REALTIME, &t0); + ticks tic_cpu_pack = getticks(); + message("Did a sub_self density"); + + packing_time += + runner_doself1_pack_f4(r, sched, pack_vars_self_dens, ci, t, + parts_aos_f4_send, task_first_part_f4); + + t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; + + // clock_gettime(CLOCK_REALTIME, &t1); + // packing_time += (t1.tv_sec - t0.tv_sec) + + // (t1.tv_nsec - t0.tv_nsec) / + // 1000000000.0; + // runner_doself1_pack(r, sched, pack_vars_self_dens, ci, + // t, parts_aos_dens, &packing_time); + /* No pack tasks left in queue, flag that we want to run */ + int launch_leftovers = pack_vars_self_dens->launch_leftovers; + /*Packed enough tasks let's go*/ + int launch = pack_vars_self_dens->launch; + /* Do we have enough stuff to run the GPU ? 
*/ + if (launch) n_full_d_bundles++; + if (launch_leftovers) n_partial_d_bundles++; + if (launch || launch_leftovers) { + /*Launch GPU tasks*/ + signal_sleeping_runners(sched, t, pack_vars_self_dens->tasks_packed); + runner_doself1_launch_f4( + r, sched, pack_vars_self_dens, ci, t, parts_aos_f4_send, + parts_aos_f4_recv, d_parts_aos_f4_send, d_parts_aos_f4_recv, + stream, d_a, d_H, e, &packing_time, &time_for_density_gpu, + &unpack_time_self, task_first_part_self_dens_f4, devId, + task_first_part_f4, d_task_first_part_f4, self_end); + // runner_doself1_launch(r, sched, + // pack_vars_self_dens, ci, t, parts_aos_dens, + // d_parts_aos_dens, stream, d_a, d_H, e, &packing_time, + // &time_for_density_gpu, + // &tot_time_for_hard_memcpys); + } /*End of GPU work Self*/ #endif //GPUOFFLOAD_DENSITY } /* self / pack */ #ifdef EXTRA_HYDRO_LOOP @@ -1623,35 +1623,35 @@ void *runner_main2(void *data) { } else if (t->subtype == task_subtype_gpu_pack_g) { #ifdef GPUOFFLOAD_GRADIENT -// // runner_doself1_pack_g(r, sched, pack_vars_self_grad, ci, -// // t, parts_aos_grad, &packing_time_g); -// ticks tic_cpu_pack = getticks(); -// -// packing_time_g += runner_doself1_pack_f4_g( -// r, sched, pack_vars_self_grad, ci, t, parts_aos_grad_f4_send, -// task_first_part_f4_g); -// -// t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; -// -// /* No pack tasks left in queue, flag that we want to run */ -// int launch_leftovers = pack_vars_self_grad->launch_leftovers; -// /*Packed enough tasks let's go*/ -// int launch = pack_vars_self_grad->launch; -// /* Do we have enough stuff to run the GPU ? */ -// if (launch || launch_leftovers) { -// /*Launch GPU tasks*/ -// // runner_doself1_launch_g(r, sched, -// // pack_vars_self_grad, ci, t, parts_aos_grad, -// // d_parts_aos_grad, stream, d_a, -// // d_H, e, &packing_time_g, &time_for_gpu_g); -// signal_sleeping_runners(sched, t, pack_vars_self_grad->tasks_packed); -// runner_doself1_launch_f4_g( -// r, sched, pack_vars_self_grad, ci, t, parts_aos_grad_f4_send, -// parts_aos_grad_f4_recv, d_parts_aos_grad_f4_send, -// d_parts_aos_grad_f4_recv, stream, d_a, d_H, e, -// &packing_time_g, &time_for_gpu_g, task_first_part_f4_g, -// d_task_first_part_f4_g, self_end_g, &unpack_time_self_g); -// } /*End of GPU work Self*/ + // runner_doself1_pack_g(r, sched, pack_vars_self_grad, ci, + // t, parts_aos_grad, &packing_time_g); + ticks tic_cpu_pack = getticks(); + + packing_time_g += runner_doself1_pack_f4_g( + r, sched, pack_vars_self_grad, ci, t, parts_aos_grad_f4_send, + task_first_part_f4_g); + + t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; + + /* No pack tasks left in queue, flag that we want to run */ + int launch_leftovers = pack_vars_self_grad->launch_leftovers; + /*Packed enough tasks let's go*/ + int launch = pack_vars_self_grad->launch; + /* Do we have enough stuff to run the GPU ? 
*/ + if (launch || launch_leftovers) { + /*Launch GPU tasks*/ + // runner_doself1_launch_g(r, sched, + // pack_vars_self_grad, ci, t, parts_aos_grad, + // d_parts_aos_grad, stream, d_a, + // d_H, e, &packing_time_g, &time_for_gpu_g); + signal_sleeping_runners(sched, t, pack_vars_self_grad->tasks_packed); + runner_doself1_launch_f4_g( + r, sched, pack_vars_self_grad, ci, t, parts_aos_grad_f4_send, + parts_aos_grad_f4_recv, d_parts_aos_grad_f4_send, + d_parts_aos_grad_f4_recv, stream, d_a, d_H, e, + &packing_time_g, &time_for_gpu_g, task_first_part_f4_g, + d_task_first_part_f4_g, self_end_g, &unpack_time_self_g); + } /*End of GPU work Self*/ #endif //GPUOFFLOAD_GRADIENT } #endif @@ -1660,42 +1660,42 @@ void *runner_main2(void *data) { // fprintf(stderr, "split a f task\n"); } else if (t->subtype == task_subtype_gpu_pack_f) { #ifdef GPUOFFLOAD_FORCE -// // runner_doself1_pack_f(r, sched, pack_vars_self_forc, ci, -// // t, parts_aos_forc, &packing_time_f); -// ticks tic_cpu_pack = getticks(); -// -// packing_time_f += runner_doself1_pack_f4_f( -// r, sched, pack_vars_self_forc, ci, t, parts_aos_forc_f4_send, -// task_first_part_f4_f); -// -// t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; -// -// // int count = ci->hydro.count; -// // for(int i = 0; i < count; i++){ -// // int pid = pack_vars_self_forc->count_parts - count + -// // i; if(parts_aos_forc_f4_send[pid].ux_m.w < -// // 1e-9)fprintf(stderr, "zero mass after packing %i %f\n", -// // pid, parts_aos_forc_f4_send[pid].ux_m.w); -// // } -// /* No pack tasks left in queue, flag that we want to run */ -// int launch_leftovers = pack_vars_self_forc->launch_leftovers; -// /*Packed enough tasks let's go*/ -// int launch = pack_vars_self_forc->launch; -// /* Do we have enough stuff to run the GPU ? */ -// if (launch || launch_leftovers) { -// /*Launch GPU tasks*/ -// // runner_doself1_launch_f(r, sched, -// // pack_vars_self_forc, ci, t, parts_aos_forc, -// // d_parts_aos_forc, stream, d_a, d_H, e, &packing_time_f, -// // &time_for_gpu_f); -// signal_sleeping_runners(sched, t, pack_vars_self_forc->tasks_packed); -// runner_doself1_launch_f4_f( -// r, sched, pack_vars_self_forc, ci, t, parts_aos_forc_f4_send, -// parts_aos_forc_f4_recv, d_parts_aos_forc_f4_send, -// d_parts_aos_forc_f4_recv, stream, d_a, d_H, e, -// &packing_time_f, &time_for_gpu_f, task_first_part_f4_f, -// d_task_first_part_f4_f, self_end_f, &unpack_time_self_f); -// } /*End of GPU work Self*/ + // runner_doself1_pack_f(r, sched, pack_vars_self_forc, ci, + // t, parts_aos_forc, &packing_time_f); + ticks tic_cpu_pack = getticks(); + + packing_time_f += runner_doself1_pack_f4_f( + r, sched, pack_vars_self_forc, ci, t, parts_aos_forc_f4_send, + task_first_part_f4_f); + + t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; + + // int count = ci->hydro.count; + // for(int i = 0; i < count; i++){ + // int pid = pack_vars_self_forc->count_parts - count + + // i; if(parts_aos_forc_f4_send[pid].ux_m.w < + // 1e-9)fprintf(stderr, "zero mass after packing %i %f\n", + // pid, parts_aos_forc_f4_send[pid].ux_m.w); + // } + /* No pack tasks left in queue, flag that we want to run */ + int launch_leftovers = pack_vars_self_forc->launch_leftovers; + /*Packed enough tasks let's go*/ + int launch = pack_vars_self_forc->launch; + /* Do we have enough stuff to run the GPU ? 
*/ + if (launch || launch_leftovers) { + /*Launch GPU tasks*/ + // runner_doself1_launch_f(r, sched, + // pack_vars_self_forc, ci, t, parts_aos_forc, + // d_parts_aos_forc, stream, d_a, d_H, e, &packing_time_f, + // &time_for_gpu_f); + signal_sleeping_runners(sched, t, pack_vars_self_forc->tasks_packed); + runner_doself1_launch_f4_f( + r, sched, pack_vars_self_forc, ci, t, parts_aos_forc_f4_send, + parts_aos_forc_f4_recv, d_parts_aos_forc_f4_send, + d_parts_aos_forc_f4_recv, stream, d_a, d_H, e, + &packing_time_f, &time_for_gpu_f, task_first_part_f4_f, + d_task_first_part_f4_f, self_end_f, &unpack_time_self_f); + } /*End of GPU work Self*/ #endif //GPUOFFLOAD_FORCE } else if (t->subtype == task_subtype_limiter) @@ -1743,39 +1743,39 @@ void *runner_main2(void *data) { } else if (t->subtype == task_subtype_gpu_pack) { #ifdef GPUOFFLOAD_DENSITY -// ticks tic_cpu_pack = getticks(); -// -// message("Did a sub_pair density"); -// packing_time_pair += runner_dopair1_pack_f4( -// r, sched, pack_vars_pair_dens, ci, cj, t, -// parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens); -// -// t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; -// /* Packed enough tasks or no pack tasks left in queue, flag that -// * we want to run */ -// int launch = pack_vars_pair_dens->launch; -// int launch_leftovers = pack_vars_pair_dens->launch_leftovers; -// -// /* Do we have enough stuff to run the GPU ? */ -// if (launch) n_full_p_d_bundles++; -// if (launch_leftovers) n_partial_p_d_bundles++; -// if (launch || launch_leftovers) { -// -// /*Launch GPU tasks*/ -// // runner_dopair1_launch(r, sched, -// // pack_vars_pair_dens, ci, t, parts_aos_pair_dens, -// // d_parts_aos_pair_dens, -// // stream, d_a, d_H, e, &packing_time_pair, -// //&time_for_density_gpu_pair); -// signal_sleeping_runners(sched, t, pack_vars_pair_dens->tasks_packed); -// runner_dopair1_launch_f4_one_memcpy( -// r, sched, pack_vars_pair_dens, t, parts_aos_pair_f4_send, -// parts_aos_pair_f4_recv, d_parts_aos_pair_f4_send, -// d_parts_aos_pair_f4_recv, stream_pairs, d_a, d_H, e, -// &packing_time_pair, &time_for_density_gpu_pair, -// &unpacking_time_pair, fparti_fpartj_lparti_lpartj_dens, -// pair_end); -// } + ticks tic_cpu_pack = getticks(); + + message("Did a sub_pair density"); + packing_time_pair += runner_dopair1_pack_f4( + r, sched, pack_vars_pair_dens, ci, cj, t, + parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens); + + t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; + /* Packed enough tasks or no pack tasks left in queue, flag that + * we want to run */ + int launch = pack_vars_pair_dens->launch; + int launch_leftovers = pack_vars_pair_dens->launch_leftovers; + + /* Do we have enough stuff to run the GPU ? 
*/ + if (launch) n_full_p_d_bundles++; + if (launch_leftovers) n_partial_p_d_bundles++; + if (launch || launch_leftovers) { + + /*Launch GPU tasks*/ + // runner_dopair1_launch(r, sched, + // pack_vars_pair_dens, ci, t, parts_aos_pair_dens, + // d_parts_aos_pair_dens, + // stream, d_a, d_H, e, &packing_time_pair, + //&time_for_density_gpu_pair); + signal_sleeping_runners(sched, t, pack_vars_pair_dens->tasks_packed); + runner_dopair1_launch_f4_one_memcpy( + r, sched, pack_vars_pair_dens, t, parts_aos_pair_f4_send, + parts_aos_pair_f4_recv, d_parts_aos_pair_f4_send, + d_parts_aos_pair_f4_recv, stream_pairs, d_a, d_H, e, + &packing_time_pair, &time_for_density_gpu_pair, + &unpacking_time_pair, fparti_fpartj_lparti_lpartj_dens, + pair_end); + } #endif } #ifdef EXTRA_HYDRO_LOOP @@ -1784,36 +1784,36 @@ void *runner_main2(void *data) { // fprintf(stderr, "split a g task\n"); } else if (t->subtype == task_subtype_gpu_pack_g) { #ifdef GPUOFFLOAD_GRADIENT -// ticks tic_cpu_pack = getticks(); -// -// packing_time_pair_g += -// runner_dopair1_pack_f4_g(r, sched, pack_vars_pair_grad, ci, -// cj, t, parts_aos_pair_f4_g_send, e, -// fparti_fpartj_lparti_lpartj_grad); -// -// t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; -// -// /* No pack tasks left in queue, flag that we want to run */ -// int launch_leftovers = pack_vars_pair_grad->launch_leftovers; -// /*Packed enough tasks let's go*/ -// int launch = pack_vars_pair_grad->launch; -// /* Do we have enough stuff to run the GPU ? */ -// if (launch || launch_leftovers) { -// /*Launch GPU tasks*/ -// // runner_dopair1_launch_g(r, sched, -// // pack_vars_pair_grad, ci, t, parts_aos_pair_grad, -// // d_parts_aos_pair_grad, -// // stream, d_a, d_H, e, &packing_time_pair_g, -// //&time_for_gpu_pair_g); -// signal_sleeping_runners(sched, t, pack_vars_pair_grad->tasks_packed); -// runner_dopair1_launch_f4_g_one_memcpy( -// r, sched, pack_vars_pair_grad, t, parts_aos_pair_f4_g_send, -// parts_aos_pair_f4_g_recv, d_parts_aos_pair_f4_g_send, -// d_parts_aos_pair_f4_g_recv, stream_pairs, d_a, d_H, e, -// &packing_time_pair_g, &time_for_gpu_pair_g, -// &unpacking_time_pair_g, fparti_fpartj_lparti_lpartj_grad, -// pair_end_g); -// } + ticks tic_cpu_pack = getticks(); + + packing_time_pair_g += + runner_dopair1_pack_f4_g(r, sched, pack_vars_pair_grad, ci, + cj, t, parts_aos_pair_f4_g_send, e, + fparti_fpartj_lparti_lpartj_grad); + + t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; + + /* No pack tasks left in queue, flag that we want to run */ + int launch_leftovers = pack_vars_pair_grad->launch_leftovers; + /*Packed enough tasks let's go*/ + int launch = pack_vars_pair_grad->launch; + /* Do we have enough stuff to run the GPU ? 
*/ + if (launch || launch_leftovers) { + /*Launch GPU tasks*/ + // runner_dopair1_launch_g(r, sched, + // pack_vars_pair_grad, ci, t, parts_aos_pair_grad, + // d_parts_aos_pair_grad, + // stream, d_a, d_H, e, &packing_time_pair_g, + //&time_for_gpu_pair_g); + signal_sleeping_runners(sched, t, pack_vars_pair_grad->tasks_packed); + runner_dopair1_launch_f4_g_one_memcpy( + r, sched, pack_vars_pair_grad, t, parts_aos_pair_f4_g_send, + parts_aos_pair_f4_g_recv, d_parts_aos_pair_f4_g_send, + d_parts_aos_pair_f4_g_recv, stream_pairs, d_a, d_H, e, + &packing_time_pair_g, &time_for_gpu_pair_g, + &unpacking_time_pair_g, fparti_fpartj_lparti_lpartj_grad, + pair_end_g); + } #endif } #endif @@ -1822,36 +1822,36 @@ void *runner_main2(void *data) { // fprintf(stderr, "split a f task\n"); } else if (t->subtype == task_subtype_gpu_pack_f) { #ifdef GPUOFFLOAD_FORCE -// ticks tic_cpu_pack = getticks(); -// -// packing_time_pair_f += -// runner_dopair1_pack_f4_f(r, sched, pack_vars_pair_forc, ci, -// cj, t, parts_aos_pair_f4_f_send, e, -// fparti_fpartj_lparti_lpartj_forc); -// -// t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; -// -// /* No pack tasks left in queue, flag that we want to run */ -// int launch_leftovers = pack_vars_pair_forc->launch_leftovers; -// /*Packed enough tasks let's go*/ -// int launch = pack_vars_pair_forc->launch; -// /* Do we have enough stuff to run the GPU ? */ -// if (launch || launch_leftovers) { -// /*Launch GPU tasks*/ -// // runner_dopair1_launch_f(r, sched, -// // pack_vars_pair_forc, ci, t, parts_aos_pair_forc, -// // d_parts_aos_pair_forc, -// // stream, d_a, d_H, e, &packing_time_pair_f, -// // &time_for_gpu_pair_f); -// signal_sleeping_runners(sched, t, pack_vars_pair_forc->tasks_packed); -// runner_dopair1_launch_f4_f_one_memcpy( -// r, sched, pack_vars_pair_forc, t, parts_aos_pair_f4_f_send, -// parts_aos_pair_f4_f_recv, d_parts_aos_pair_f4_f_send, -// d_parts_aos_pair_f4_f_recv, stream_pairs, d_a, d_H, e, -// &packing_time_pair_f, &time_for_gpu_pair_f, -// &unpacking_time_pair_f, fparti_fpartj_lparti_lpartj_forc, -// pair_end_f); -// } /* End of GPU work Pairs */ + ticks tic_cpu_pack = getticks(); + + packing_time_pair_f += + runner_dopair1_pack_f4_f(r, sched, pack_vars_pair_forc, ci, + cj, t, parts_aos_pair_f4_f_send, e, + fparti_fpartj_lparti_lpartj_forc); + + t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; + + /* No pack tasks left in queue, flag that we want to run */ + int launch_leftovers = pack_vars_pair_forc->launch_leftovers; + /*Packed enough tasks let's go*/ + int launch = pack_vars_pair_forc->launch; + /* Do we have enough stuff to run the GPU ? 
*/ + if (launch || launch_leftovers) { + /*Launch GPU tasks*/ + // runner_dopair1_launch_f(r, sched, + // pack_vars_pair_forc, ci, t, parts_aos_pair_forc, + // d_parts_aos_pair_forc, + // stream, d_a, d_H, e, &packing_time_pair_f, + // &time_for_gpu_pair_f); + signal_sleeping_runners(sched, t, pack_vars_pair_forc->tasks_packed); + runner_dopair1_launch_f4_f_one_memcpy( + r, sched, pack_vars_pair_forc, t, parts_aos_pair_f4_f_send, + parts_aos_pair_f4_f_recv, d_parts_aos_pair_f4_f_send, + d_parts_aos_pair_f4_f_recv, stream_pairs, d_a, d_H, e, + &packing_time_pair_f, &time_for_gpu_pair_f, + &unpacking_time_pair_f, fparti_fpartj_lparti_lpartj_forc, + pair_end_f); + } /* End of GPU work Pairs */ #endif } else if (t->subtype == task_subtype_limiter) runner_dosub_pair1_limiter(r, ci, cj, 1); @@ -2125,17 +2125,17 @@ void *runner_main2(void *data) { if (t->subtype == task_subtype_gpu_pack) { #ifdef GPUOFFLOAD_DENSITY - if (t->type == task_type_sub_self || t->type == task_type_sub_pair){ - int qid = r->qid; - if (t->type == task_type_sub_pair){ - atomic_dec(&(sched->queues[qid].n_packs_pair_left)); - } - if (t->type == task_type_sub_self){ - atomic_dec(&(sched->queues[qid].n_packs_self_left)); - } - t = scheduler_done(sched, t); - } - else{ +// if (t->type == task_type_sub_self || t->type == task_type_sub_pair){ +// int qid = r->qid; +// if (t->type == task_type_sub_pair){ +// atomic_dec(&(sched->queues[qid].n_packs_pair_left)); +// } +// if (t->type == task_type_sub_self){ +// atomic_dec(&(sched->queues[qid].n_packs_self_left)); +// } +// t = scheduler_done(sched, t); +// } +// else{ /* Don't enqueue unpacks yet. Just signal the runners */ t->skip = 1; t->toc = getticks(); @@ -2143,7 +2143,7 @@ void *runner_main2(void *data) { // signal_sleeping_runners(sched, t); // enqueue_dependencies(sched, t); t = NULL; - } +// } #else t = scheduler_done(sched, t); #endif @@ -2151,17 +2151,17 @@ void *runner_main2(void *data) { else if (t->subtype == task_subtype_gpu_pack_g) { #ifdef GPUOFFLOAD_GRADIENT - if (t->type == task_type_sub_self || t->type == task_type_sub_pair){ - int qid = r->qid; - if (t->type == task_type_sub_pair){ - atomic_dec(&(sched->queues[qid].n_packs_pair_left_g)); - } - if (t->type == task_type_sub_self){ - atomic_dec(&(sched->queues[qid].n_packs_self_left_g)); - } - t = scheduler_done(sched, t); - } - else{ +// if (t->type == task_type_sub_self || t->type == task_type_sub_pair){ +// int qid = r->qid; +// if (t->type == task_type_sub_pair){ +// atomic_dec(&(sched->queues[qid].n_packs_pair_left_g)); +// } +// if (t->type == task_type_sub_self){ +// atomic_dec(&(sched->queues[qid].n_packs_self_left_g)); +// } +// t = scheduler_done(sched, t); +// } +// else{ /* Don't enqueue unpacks yet. Just signal the runners */ t->skip = 1; t->toc = getticks(); @@ -2169,7 +2169,7 @@ void *runner_main2(void *data) { // signal_sleeping_runners(sched, t); // enqueue_dependencies(sched, t); t = NULL; - } +// } #else t = scheduler_done(sched, t); #endif @@ -2177,25 +2177,25 @@ void *runner_main2(void *data) { else if (t->subtype == task_subtype_gpu_pack_f) { #ifdef GPUOFFLOAD_FORCE - if (t->type == task_type_sub_self || t->type == task_type_sub_pair){ - int qid = r->qid; - if (t->type == task_type_sub_pair){ - atomic_dec(&(sched->queues[qid].n_packs_pair_left_f)); - } - if (t->type == task_type_sub_self){ - atomic_dec(&(sched->queues[qid].n_packs_self_left_f)); - } - t = scheduler_done(sched, t); - } - /* Don't enqueue unpacks yet. 
Just signal the runners */ - else{ +// if (t->type == task_type_sub_self || t->type == task_type_sub_pair){ +// int qid = r->qid; +// if (t->type == task_type_sub_pair){ +// atomic_dec(&(sched->queues[qid].n_packs_pair_left_f)); +// } +// if (t->type == task_type_sub_self){ +// atomic_dec(&(sched->queues[qid].n_packs_self_left_f)); +// } +// t = scheduler_done(sched, t); +// } +// /* Don't enqueue unpacks yet. Just signal the runners */ +// else{ t->skip = 1; t->toc = getticks(); t->total_ticks += t->toc - t->tic; // signal_sleeping_runners(sched, t); // enqueue_dependencies(sched, t); t = NULL; - } +// } #else t = scheduler_done(sched, t); #endif diff --git a/src/space.h b/src/space.h index 8e3dcabf15..7156d00d12 100644 --- a/src/space.h +++ b/src/space.h @@ -55,8 +55,8 @@ struct hydro_props; #define space_extra_bparts_default 0 #define space_extra_sinks_default 0 #define space_expected_max_nr_strays_default 100 -#define space_subsize_pair_hydro_default 512 -#define space_subsize_self_hydro_default 256 +#define space_subsize_pair_hydro_default 256 +#define space_subsize_self_hydro_default 128 #define space_subsize_pair_stars_default 256000000 #define space_subsize_self_stars_default 32000 #define space_subsize_pair_grav_default 256000000 From 50c6a746496170fd9d0feac758df4b64d20fb24b Mon Sep 17 00:00:00 2001 From: Abouzied Date: Thu, 21 Nov 2024 13:00:18 +0000 Subject: [PATCH 089/217] Changed a few parameters --- src/engine_maketasks.c | 48 ++++++++++++++++++++-------------------- src/runner_main_clean.cu | 4 ++-- src/space.h | 4 ++-- 3 files changed, 28 insertions(+), 28 deletions(-) diff --git a/src/engine_maketasks.c b/src/engine_maketasks.c index 0893510f1e..788b9a9a9d 100644 --- a/src/engine_maketasks.c +++ b/src/engine_maketasks.c @@ -4937,30 +4937,30 @@ void engine_maketasks(struct engine *e) { * threadpool_auto_chunk_size, e); */ } -// for (int i = 0; i < sched->nr_tasks; i++) { -// struct task * t = &sched->tasks[i]; -// if(t->type == task_type_sub_self && t->subtype == task_subtype_gpu_pack){ -// t->type = task_type_self; -// } -// if(t->type == task_type_sub_pair && t->subtype == task_subtype_gpu_pack){ -// t->type = task_type_pair; -// } -// } -// for (int i = 0; i < sched->nr_tasks; i++) { -// struct task * t = &sched->tasks[i]; -// if(t->type == task_type_sub_self && t->subtype == task_subtype_gpu_pack_g){ -// t->type = task_type_self; -// } -// if(t->type == task_type_sub_pair && t->subtype == task_subtype_gpu_pack_g){ -// t->type = task_type_pair; -// } -// if(t->type == task_type_sub_self && t->subtype == task_subtype_gpu_pack_f){ -// t->type = task_type_self; -// } -// if(t->type == task_type_sub_pair && t->subtype == task_subtype_gpu_pack_f){ -// t->type = task_type_pair; -// } -// } + for (int i = 0; i < sched->nr_tasks; i++) { + struct task * t = &sched->tasks[i]; + if(t->type == task_type_sub_self && t->subtype == task_subtype_gpu_pack){ + t->type = task_type_self; + } + if(t->type == task_type_sub_pair && t->subtype == task_subtype_gpu_pack){ + t->type = task_type_pair; + } + } + for (int i = 0; i < sched->nr_tasks; i++) { + struct task * t = &sched->tasks[i]; + if(t->type == task_type_sub_self && t->subtype == task_subtype_gpu_pack_g){ + t->type = task_type_self; + } + if(t->type == task_type_sub_pair && t->subtype == task_subtype_gpu_pack_g){ + t->type = task_type_pair; + } + if(t->type == task_type_sub_self && t->subtype == task_subtype_gpu_pack_f){ + t->type = task_type_self; + } + if(t->type == task_type_sub_pair && t->subtype == task_subtype_gpu_pack_f){ + 
t->type = task_type_pair; + } + } /* Now, create unpack tasks based on the existing packs and create * the dependencies pack->unpack->ghost_in A. Nasar */ diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 68e17612e9..44b66570b0 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -1578,7 +1578,7 @@ void *runner_main2(void *data) { // struct timespec t0, t1; // // clock_gettime(CLOCK_REALTIME, &t0); ticks tic_cpu_pack = getticks(); - message("Did a sub_self density"); +// message("Did a sub_self density"); packing_time += runner_doself1_pack_f4(r, sched, pack_vars_self_dens, ci, t, parts_aos_f4_send, task_first_part_f4); @@ -1745,7 +1745,7 @@ void *runner_main2(void *data) { #ifdef GPUOFFLOAD_DENSITY ticks tic_cpu_pack = getticks(); - message("Did a sub_pair density"); +// message("Did a sub_pair density"); packing_time_pair += runner_dopair1_pack_f4( r, sched, pack_vars_pair_dens, ci, cj, t, parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens); diff --git a/src/space.h b/src/space.h index 7156d00d12..f74ec421a1 100644 --- a/src/space.h +++ b/src/space.h @@ -55,8 +55,8 @@ struct hydro_props; #define space_extra_bparts_default 0 #define space_extra_sinks_default 0 #define space_expected_max_nr_strays_default 100 -#define space_subsize_pair_hydro_default 256 -#define space_subsize_self_hydro_default 128 +#define space_subsize_pair_hydro_default 256000000 +#define space_subsize_self_hydro_default 64 #define space_subsize_pair_stars_default 256000000 #define space_subsize_self_stars_default 32000 #define space_subsize_pair_grav_default 256000000 From d8086952f6b9cb96ea2c0446ef153db2dbf1735c Mon Sep 17 00:00:00 2001 From: Abouzied Date: Thu, 21 Nov 2024 14:41:08 +0000 Subject: [PATCH 090/217] Removed redundant commented-out code from engine_maketasks() --- src/engine_maketasks.c | 70 ++---------------------------------------- 1 file changed, 2 insertions(+), 68 deletions(-) diff --git a/src/engine_maketasks.c b/src/engine_maketasks.c index 788b9a9a9d..30f826129f 100644 --- a/src/engine_maketasks.c +++ b/src/engine_maketasks.c @@ -5004,12 +5004,6 @@ void engine_maketasks(struct engine *e) { sched, task_type_pair, task_subtype_gpu_unpack, 0, 0, NULL, NULL); } - /* pack -> unpack -> ghost_in */ - if (t->ci->hydro.super->hydro.ghost_in == NULL && t->ci->nodeID == e->nodeID) - message("Ghost in for cell i is NULL\n"); - if (t->cj->hydro.super->hydro.ghost_in == NULL && t->cj->nodeID == e->nodeID) - message("Ghost in for cell j is NULL\n"); - scheduler_addunlock(sched, t, last_created_pair_unpack); if (t->ci->nodeID == e->nodeID) scheduler_addunlock(sched, last_created_pair_unpack, @@ -5022,14 +5016,6 @@ void engine_maketasks(struct engine *e) { engine_addlink(e, &t->ci->hydro.density_unpack, last_created_pair_unpack); engine_addlink(e, &t->cj->hydro.density_unpack, last_created_pair_unpack); - /*Useless as this ends up only setting one pair unpack as the unpack task - * for this cell whilst the cell interacts with many other cells and can - * be linked to another unpack task. 
Rely on links instead*/ - // t->ci->hydro.d_unpack = last_created_pair_unpack; - // t->cj->hydro.d_unpack = last_created_pair_unpack; - - // t->ci->hydro.super->hydro.d_unpack = last_created_self_unpack; - ++count_current_pair; } else { /* Abouzied: I need to implement the sub-self and sub-pair version */ @@ -5042,20 +5028,6 @@ void engine_maketasks(struct engine *e) { if (count_current_pair != sched->nr_pair_pack_tasks) error("We did not find the correct number of pair pack tasks!!"); #endif - /* Loop over all the currently existing ghost_in tasks to add unpack - * dependency*/ - // for (int i = 0; i < sched->nr_tasks; i++) { - // struct task *t = &sched->tasks[i]; - // if (t->type != task_type_ghost_in) - // continue; - //// if(t->ci->hydro.super == t->ci && t->ci->nodeID == e->nodeID) - //// scheduler_addunlock(sched, t->ci->hydro.d_unpack, t); - // for (struct link *l = t->ci->hydro.density_unpack; l != NULL; l = - // l->next) { - //// if(l->t->type == task_type_pair) - // scheduler_addunlock(sched, l->t, t); - // } - // } /*Now create unpacks for all gpu_pack_g (gradient) tasks A. Nasar */ count_current_self = 0; @@ -5109,11 +5081,6 @@ void engine_maketasks(struct engine *e) { last_created_pair_unpack); engine_addlink(e, &t->cj->hydro.gradient_unpack, last_created_pair_unpack); - /*Useless as this ends up only setting one pair unpack as the unpack task - * for this cell whilst the cell interacts with many other cells and can - * be linked to another unpack task. Rely on links instead*/ - // t->ci->hydro.g_unpack = last_created_pair_unpack; - // t->cj->hydro.g_unpack = last_created_pair_unpack; ++count_current_pair; } else { @@ -5133,20 +5100,7 @@ void engine_maketasks(struct engine *e) { "what it shoudl be %i", count_current_pair, sched->nr_pair_pack_tasks_g); #endif - /* Loop over all the currently existing extra_ghost tasks to add unpack - * dependency*/ - // for (int i = 0; i < sched->nr_tasks; i++) { - // struct task *t = &sched->tasks[i]; - // if (t->type != task_type_extra_ghost) - // continue; - //// if(t->ci->nodeID == e->nodeID) - //// scheduler_addunlock(sched, t->ci->hydro.g_unpack, t); - // for (struct link *l = t->ci->hydro.gradient_unpack; l != NULL; l = - // l->next) { - //// if(l->t->type == task_type_pair) - // scheduler_addunlock(sched, l->t, t); - // } - // } + /*Now create unpacks for all gpu_pack_f (force) tasks*/ count_current_self = 0; count_current_pair = 0; @@ -5172,8 +5126,6 @@ void engine_maketasks(struct engine *e) { t->ci->hydro.super->hydro.end_force); /*Creating links between a each cell and its unpack task*/ engine_addlink(e, &t->ci->hydro.force_unpack, last_created_self_unpack); - // - // t->ci->hydro.f_unpack = last_created_self_unpack; ++count_current_self; } @@ -5196,11 +5148,6 @@ void engine_maketasks(struct engine *e) { engine_addlink(e, &t->ci->hydro.force_unpack, last_created_pair_unpack); engine_addlink(e, &t->cj->hydro.force_unpack, last_created_pair_unpack); - /*Useless as this ends up only setting one pair unpack as the unpack task - * for this cell whilst the cell interacts with many other cells and can - * be linked to another unpack task. 
Rely on links instead*/ - // t->ci->hydro.f_unpack = last_created_pair_unpack; - // t->cj->hydro.f_unpack = last_created_pair_unpack; ++count_current_pair; } else { @@ -5214,20 +5161,7 @@ void engine_maketasks(struct engine *e) { if (count_current_pair != sched->nr_pair_pack_tasks_f) error("We did not find the correct number of F pair pack tasks!!"); #endif - /* Loop over all the currently existing end_force tasks to add unpack - * dependency*/ - // for (int i = 0; i < sched->nr_tasks; i++) { - // struct task *t = &sched->tasks[i]; - // if (t->type != task_type_end_hydro_force) - // continue; - //// if(t->ci->nodeID == e->nodeID) - //// scheduler_addunlock(sched, t->ci->hydro.f_unpack, t); - // for (struct link *l = t->ci->hydro.force_unpack; l != NULL; l = l->next) - // { - //// if(l->t->type == task_type_pair) - // scheduler_addunlock(sched, l->t, t); - // } - // } + if (e->verbose) message("Making extra hydroloop tasks took %.3f %s.", clocks_from_ticks(getticks() - tic2), clocks_getunit()); From 18513c93398e5ec773173940e286498013fc8bce Mon Sep 17 00:00:00 2001 From: Abouzied Date: Thu, 21 Nov 2024 16:42:18 +0000 Subject: [PATCH 091/217] before adding time --- src/runner_main_clean.cu | 305 ++++----------------------------------- 1 file changed, 28 insertions(+), 277 deletions(-) diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 44b66570b0..867ba0aa87 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -970,14 +970,19 @@ void *runner_main2(void *data) { /* Loop while there are tasks... */ tasks_done_gpu_inc = 0; + ticks hang_time = getticks(); while (1) { - + ticks tic_get_task = getticks(); /* If there's no old task, try to get a new one. */ if (t == NULL) { /* Get the task. */ TIMER_TIC t = scheduler_gettask(sched, r->qid, prev); TIMER_TOC(timer_gettask); + hang_time += getticks() - tic_get_task; + if (hang_time > 1000000000000000000){ + message("I'm stuck runner %i", r->cpuid); + } /* Did I get anything? 
*/ if (t == NULL) break; } @@ -1062,7 +1067,8 @@ void *runner_main2(void *data) { if (launch_leftovers) n_partial_d_bundles++; if (launch || launch_leftovers) { /*Launch GPU tasks*/ - signal_sleeping_runners(sched, t, pack_vars_self_dens->tasks_packed); + int t_packed = pack_vars_self_dens->tasks_packed; + signal_sleeping_runners(sched, t, t_packed); runner_doself1_launch_f4( r, sched, pack_vars_self_dens, ci, t, parts_aos_f4_send, parts_aos_f4_recv, d_parts_aos_f4_send, d_parts_aos_f4_recv, @@ -1101,7 +1107,8 @@ void *runner_main2(void *data) { // pack_vars_self_grad, ci, t, parts_aos_grad, // d_parts_aos_grad, stream, d_a, // d_H, e, &packing_time_g, &time_for_gpu_g); - signal_sleeping_runners(sched, t, pack_vars_self_grad->tasks_packed); + int t_packed = pack_vars_self_grad->tasks_packed; + signal_sleeping_runners(sched, t, t_packed); runner_doself1_launch_f4_g( r, sched, pack_vars_self_grad, ci, t, parts_aos_grad_f4_send, parts_aos_grad_f4_recv, d_parts_aos_grad_f4_send, @@ -1141,7 +1148,8 @@ void *runner_main2(void *data) { // pack_vars_self_forc, ci, t, parts_aos_forc, // d_parts_aos_forc, stream, d_a, d_H, e, &packing_time_f, // &time_for_gpu_f); - signal_sleeping_runners(sched, t, pack_vars_self_forc->tasks_packed); + int t_packed = pack_vars_self_forc->tasks_packed; + signal_sleeping_runners(sched, t, t_packed); runner_doself1_launch_f4_f( r, sched, pack_vars_self_forc, ci, t, parts_aos_forc_f4_send, parts_aos_forc_f4_recv, d_parts_aos_forc_f4_send, @@ -1306,7 +1314,8 @@ void *runner_main2(void *data) { // d_parts_aos_pair_dens, // stream, d_a, d_H, e, &packing_time_pair, //&time_for_density_gpu_pair); - signal_sleeping_runners(sched, t, pack_vars_pair_dens->tasks_packed); + int t_packed = pack_vars_pair_dens->tasks_packed; + signal_sleeping_runners(sched, t, t_packed); runner_dopair1_launch_f4_one_memcpy( r, sched, pack_vars_pair_dens, t, parts_aos_pair_f4_send, parts_aos_pair_f4_recv, d_parts_aos_pair_f4_send, @@ -1392,7 +1401,8 @@ void *runner_main2(void *data) { // d_parts_aos_pair_grad, // stream, d_a, d_H, e, &packing_time_pair_g, //&time_for_gpu_pair_g); - signal_sleeping_runners(sched, t, pack_vars_pair_grad->tasks_packed); + int t_packed = pack_vars_pair_grad->tasks_packed; + signal_sleeping_runners(sched, t, t_packed); runner_dopair1_launch_f4_g_one_memcpy( r, sched, pack_vars_pair_grad, t, parts_aos_pair_f4_g_send, parts_aos_pair_f4_g_recv, d_parts_aos_pair_f4_g_send, @@ -1475,7 +1485,8 @@ void *runner_main2(void *data) { // d_parts_aos_pair_forc, // stream, d_a, d_H, e, &packing_time_pair_f, // &time_for_gpu_pair_f); - signal_sleeping_runners(sched, t, pack_vars_pair_forc->tasks_packed); + int t_packed = pack_vars_pair_forc->tasks_packed; + signal_sleeping_runners(sched, t, t_packed); runner_dopair1_launch_f4_f_one_memcpy( r, sched, pack_vars_pair_forc, t, parts_aos_pair_f4_f_send, parts_aos_pair_f4_f_recv, d_parts_aos_pair_f4_f_send, @@ -1571,132 +1582,16 @@ void *runner_main2(void *data) { time_for_density_cpu_sub += (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - /* GPU WORK */ - } else if (t->subtype == task_subtype_gpu_pack) { - packed_self++; -#ifdef GPUOFFLOAD_DENSITY - // struct timespec t0, t1; // - // clock_gettime(CLOCK_REALTIME, &t0); - ticks tic_cpu_pack = getticks(); -// message("Did a sub_self density"); - - packing_time += - runner_doself1_pack_f4(r, sched, pack_vars_self_dens, ci, t, - parts_aos_f4_send, task_first_part_f4); - - t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; - - // clock_gettime(CLOCK_REALTIME, &t1); - // 
packing_time += (t1.tv_sec - t0.tv_sec) + - // (t1.tv_nsec - t0.tv_nsec) / - // 1000000000.0; - // runner_doself1_pack(r, sched, pack_vars_self_dens, ci, - // t, parts_aos_dens, &packing_time); - /* No pack tasks left in queue, flag that we want to run */ - int launch_leftovers = pack_vars_self_dens->launch_leftovers; - /*Packed enough tasks let's go*/ - int launch = pack_vars_self_dens->launch; - /* Do we have enough stuff to run the GPU ? */ - if (launch) n_full_d_bundles++; - if (launch_leftovers) n_partial_d_bundles++; - if (launch || launch_leftovers) { - /*Launch GPU tasks*/ - signal_sleeping_runners(sched, t, pack_vars_self_dens->tasks_packed); - runner_doself1_launch_f4( - r, sched, pack_vars_self_dens, ci, t, parts_aos_f4_send, - parts_aos_f4_recv, d_parts_aos_f4_send, d_parts_aos_f4_recv, - stream, d_a, d_H, e, &packing_time, &time_for_density_gpu, - &unpack_time_self, task_first_part_self_dens_f4, devId, - task_first_part_f4, d_task_first_part_f4, self_end); - // runner_doself1_launch(r, sched, - // pack_vars_self_dens, ci, t, parts_aos_dens, - // d_parts_aos_dens, stream, d_a, d_H, e, &packing_time, - // &time_for_density_gpu, - // &tot_time_for_hard_memcpys); - } /*End of GPU work Self*/ -#endif //GPUOFFLOAD_DENSITY - } /* self / pack */ + } #ifdef EXTRA_HYDRO_LOOP else if (t->subtype == task_subtype_gradient) { runner_dosub_self1_gradient(r, ci, 1); // fprintf(stderr, "split a g task\n"); } - else if (t->subtype == task_subtype_gpu_pack_g) { -#ifdef GPUOFFLOAD_GRADIENT - // runner_doself1_pack_g(r, sched, pack_vars_self_grad, ci, - // t, parts_aos_grad, &packing_time_g); - ticks tic_cpu_pack = getticks(); - - packing_time_g += runner_doself1_pack_f4_g( - r, sched, pack_vars_self_grad, ci, t, parts_aos_grad_f4_send, - task_first_part_f4_g); - - t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; - - /* No pack tasks left in queue, flag that we want to run */ - int launch_leftovers = pack_vars_self_grad->launch_leftovers; - /*Packed enough tasks let's go*/ - int launch = pack_vars_self_grad->launch; - /* Do we have enough stuff to run the GPU ? 
*/ - if (launch || launch_leftovers) { - /*Launch GPU tasks*/ - // runner_doself1_launch_g(r, sched, - // pack_vars_self_grad, ci, t, parts_aos_grad, - // d_parts_aos_grad, stream, d_a, - // d_H, e, &packing_time_g, &time_for_gpu_g); - signal_sleeping_runners(sched, t, pack_vars_self_grad->tasks_packed); - runner_doself1_launch_f4_g( - r, sched, pack_vars_self_grad, ci, t, parts_aos_grad_f4_send, - parts_aos_grad_f4_recv, d_parts_aos_grad_f4_send, - d_parts_aos_grad_f4_recv, stream, d_a, d_H, e, - &packing_time_g, &time_for_gpu_g, task_first_part_f4_g, - d_task_first_part_f4_g, self_end_g, &unpack_time_self_g); - } /*End of GPU work Self*/ -#endif //GPUOFFLOAD_GRADIENT - } #endif else if (t->subtype == task_subtype_force) { runner_dosub_self2_force(r, ci, 1); // fprintf(stderr, "split a f task\n"); - } else if (t->subtype == task_subtype_gpu_pack_f) { -#ifdef GPUOFFLOAD_FORCE - // runner_doself1_pack_f(r, sched, pack_vars_self_forc, ci, - // t, parts_aos_forc, &packing_time_f); - ticks tic_cpu_pack = getticks(); - - packing_time_f += runner_doself1_pack_f4_f( - r, sched, pack_vars_self_forc, ci, t, parts_aos_forc_f4_send, - task_first_part_f4_f); - - t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; - - // int count = ci->hydro.count; - // for(int i = 0; i < count; i++){ - // int pid = pack_vars_self_forc->count_parts - count + - // i; if(parts_aos_forc_f4_send[pid].ux_m.w < - // 1e-9)fprintf(stderr, "zero mass after packing %i %f\n", - // pid, parts_aos_forc_f4_send[pid].ux_m.w); - // } - /* No pack tasks left in queue, flag that we want to run */ - int launch_leftovers = pack_vars_self_forc->launch_leftovers; - /*Packed enough tasks let's go*/ - int launch = pack_vars_self_forc->launch; - /* Do we have enough stuff to run the GPU ? */ - if (launch || launch_leftovers) { - /*Launch GPU tasks*/ - // runner_doself1_launch_f(r, sched, - // pack_vars_self_forc, ci, t, parts_aos_forc, - // d_parts_aos_forc, stream, d_a, d_H, e, &packing_time_f, - // &time_for_gpu_f); - signal_sleeping_runners(sched, t, pack_vars_self_forc->tasks_packed); - runner_doself1_launch_f4_f( - r, sched, pack_vars_self_forc, ci, t, parts_aos_forc_f4_send, - parts_aos_forc_f4_recv, d_parts_aos_forc_f4_send, - d_parts_aos_forc_f4_recv, stream, d_a, d_H, e, - &packing_time_f, &time_for_gpu_f, task_first_part_f4_f, - d_task_first_part_f4_f, self_end_f, &unpack_time_self_f); - } /*End of GPU work Self*/ -#endif //GPUOFFLOAD_FORCE } else if (t->subtype == task_subtype_limiter) runner_dosub_self1_limiter(r, ci, 1); @@ -1741,119 +1636,17 @@ void *runner_main2(void *data) { // message("Doing a pair sub task"); runner_dosub_pair1_density(r, ci, cj, 1); } - else if (t->subtype == task_subtype_gpu_pack) { -#ifdef GPUOFFLOAD_DENSITY - ticks tic_cpu_pack = getticks(); - -// message("Did a sub_pair density"); - packing_time_pair += runner_dopair1_pack_f4( - r, sched, pack_vars_pair_dens, ci, cj, t, - parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens); - - t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; - /* Packed enough tasks or no pack tasks left in queue, flag that - * we want to run */ - int launch = pack_vars_pair_dens->launch; - int launch_leftovers = pack_vars_pair_dens->launch_leftovers; - - /* Do we have enough stuff to run the GPU ? 
*/ - if (launch) n_full_p_d_bundles++; - if (launch_leftovers) n_partial_p_d_bundles++; - if (launch || launch_leftovers) { - - /*Launch GPU tasks*/ - // runner_dopair1_launch(r, sched, - // pack_vars_pair_dens, ci, t, parts_aos_pair_dens, - // d_parts_aos_pair_dens, - // stream, d_a, d_H, e, &packing_time_pair, - //&time_for_density_gpu_pair); - signal_sleeping_runners(sched, t, pack_vars_pair_dens->tasks_packed); - runner_dopair1_launch_f4_one_memcpy( - r, sched, pack_vars_pair_dens, t, parts_aos_pair_f4_send, - parts_aos_pair_f4_recv, d_parts_aos_pair_f4_send, - d_parts_aos_pair_f4_recv, stream_pairs, d_a, d_H, e, - &packing_time_pair, &time_for_density_gpu_pair, - &unpacking_time_pair, fparti_fpartj_lparti_lpartj_dens, - pair_end); - } -#endif - } #ifdef EXTRA_HYDRO_LOOP else if (t->subtype == task_subtype_gradient) { runner_dosub_pair1_gradient(r, ci, cj, 1); // fprintf(stderr, "split a g task\n"); - } else if (t->subtype == task_subtype_gpu_pack_g) { -#ifdef GPUOFFLOAD_GRADIENT - ticks tic_cpu_pack = getticks(); - - packing_time_pair_g += - runner_dopair1_pack_f4_g(r, sched, pack_vars_pair_grad, ci, - cj, t, parts_aos_pair_f4_g_send, e, - fparti_fpartj_lparti_lpartj_grad); - - t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; - - /* No pack tasks left in queue, flag that we want to run */ - int launch_leftovers = pack_vars_pair_grad->launch_leftovers; - /*Packed enough tasks let's go*/ - int launch = pack_vars_pair_grad->launch; - /* Do we have enough stuff to run the GPU ? */ - if (launch || launch_leftovers) { - /*Launch GPU tasks*/ - // runner_dopair1_launch_g(r, sched, - // pack_vars_pair_grad, ci, t, parts_aos_pair_grad, - // d_parts_aos_pair_grad, - // stream, d_a, d_H, e, &packing_time_pair_g, - //&time_for_gpu_pair_g); - signal_sleeping_runners(sched, t, pack_vars_pair_grad->tasks_packed); - runner_dopair1_launch_f4_g_one_memcpy( - r, sched, pack_vars_pair_grad, t, parts_aos_pair_f4_g_send, - parts_aos_pair_f4_g_recv, d_parts_aos_pair_f4_g_send, - d_parts_aos_pair_f4_g_recv, stream_pairs, d_a, d_H, e, - &packing_time_pair_g, &time_for_gpu_pair_g, - &unpacking_time_pair_g, fparti_fpartj_lparti_lpartj_grad, - pair_end_g); - } -#endif } #endif else if (t->subtype == task_subtype_force) { runner_dosub_pair2_force(r, ci, cj, 1); // fprintf(stderr, "split a f task\n"); - } else if (t->subtype == task_subtype_gpu_pack_f) { -#ifdef GPUOFFLOAD_FORCE - ticks tic_cpu_pack = getticks(); - - packing_time_pair_f += - runner_dopair1_pack_f4_f(r, sched, pack_vars_pair_forc, ci, - cj, t, parts_aos_pair_f4_f_send, e, - fparti_fpartj_lparti_lpartj_forc); - - t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; - - /* No pack tasks left in queue, flag that we want to run */ - int launch_leftovers = pack_vars_pair_forc->launch_leftovers; - /*Packed enough tasks let's go*/ - int launch = pack_vars_pair_forc->launch; - /* Do we have enough stuff to run the GPU ? 
*/ - if (launch || launch_leftovers) { - /*Launch GPU tasks*/ - // runner_dopair1_launch_f(r, sched, - // pack_vars_pair_forc, ci, t, parts_aos_pair_forc, - // d_parts_aos_pair_forc, - // stream, d_a, d_H, e, &packing_time_pair_f, - // &time_for_gpu_pair_f); - signal_sleeping_runners(sched, t, pack_vars_pair_forc->tasks_packed); - runner_dopair1_launch_f4_f_one_memcpy( - r, sched, pack_vars_pair_forc, t, parts_aos_pair_f4_f_send, - parts_aos_pair_f4_f_recv, d_parts_aos_pair_f4_f_send, - d_parts_aos_pair_f4_f_recv, stream_pairs, d_a, d_H, e, - &packing_time_pair_f, &time_for_gpu_pair_f, - &unpacking_time_pair_f, fparti_fpartj_lparti_lpartj_forc, - pair_end_f); - } /* End of GPU work Pairs */ -#endif - } else if (t->subtype == task_subtype_limiter) + } + else if (t->subtype == task_subtype_limiter) runner_dosub_pair1_limiter(r, ci, cj, 1); else if (t->subtype == task_subtype_stars_density) runner_dosub_pair_stars_density(r, ci, cj, 1); @@ -2125,25 +1918,11 @@ void *runner_main2(void *data) { if (t->subtype == task_subtype_gpu_pack) { #ifdef GPUOFFLOAD_DENSITY -// if (t->type == task_type_sub_self || t->type == task_type_sub_pair){ -// int qid = r->qid; -// if (t->type == task_type_sub_pair){ -// atomic_dec(&(sched->queues[qid].n_packs_pair_left)); -// } -// if (t->type == task_type_sub_self){ -// atomic_dec(&(sched->queues[qid].n_packs_self_left)); -// } -// t = scheduler_done(sched, t); -// } -// else{ /* Don't enqueue unpacks yet. Just signal the runners */ t->skip = 1; - t->toc = getticks(); - t->total_ticks += t->toc - t->tic; -// signal_sleeping_runners(sched, t); -// enqueue_dependencies(sched, t); + t->toc = getticks(); + t->total_ticks += t->toc - t->tic; t = NULL; -// } #else t = scheduler_done(sched, t); #endif @@ -2151,25 +1930,11 @@ void *runner_main2(void *data) { else if (t->subtype == task_subtype_gpu_pack_g) { #ifdef GPUOFFLOAD_GRADIENT -// if (t->type == task_type_sub_self || t->type == task_type_sub_pair){ -// int qid = r->qid; -// if (t->type == task_type_sub_pair){ -// atomic_dec(&(sched->queues[qid].n_packs_pair_left_g)); -// } -// if (t->type == task_type_sub_self){ -// atomic_dec(&(sched->queues[qid].n_packs_self_left_g)); -// } -// t = scheduler_done(sched, t); -// } -// else{ /* Don't enqueue unpacks yet. Just signal the runners */ t->skip = 1; - t->toc = getticks(); - t->total_ticks += t->toc - t->tic; -// signal_sleeping_runners(sched, t); -// enqueue_dependencies(sched, t); + t->toc = getticks(); + t->total_ticks += t->toc - t->tic; t = NULL; -// } #else t = scheduler_done(sched, t); #endif @@ -2177,25 +1942,11 @@ void *runner_main2(void *data) { else if (t->subtype == task_subtype_gpu_pack_f) { #ifdef GPUOFFLOAD_FORCE -// if (t->type == task_type_sub_self || t->type == task_type_sub_pair){ -// int qid = r->qid; -// if (t->type == task_type_sub_pair){ -// atomic_dec(&(sched->queues[qid].n_packs_pair_left_f)); -// } -// if (t->type == task_type_sub_self){ -// atomic_dec(&(sched->queues[qid].n_packs_self_left_f)); -// } -// t = scheduler_done(sched, t); -// } -// /* Don't enqueue unpacks yet. Just signal the runners */ -// else{ + /* Don't enqueue unpacks yet. 
Just signal the runners */ t->skip = 1; - t->toc = getticks(); - t->total_ticks += t->toc - t->tic; -// signal_sleeping_runners(sched, t); -// enqueue_dependencies(sched, t); + t->toc = getticks(); + t->total_ticks += t->toc - t->tic; t = NULL; -// } #else t = scheduler_done(sched, t); #endif From 306d32e3d0702a410a79075b83303c46eae952c3 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Thu, 21 Nov 2024 16:57:04 +0000 Subject: [PATCH 092/217] Chnaged task_unlocks to cell_unlocktree. Modified timer function for hanging in runner_main --- .../HydroTests/GreshoVortex_3D/gresho.yml | 2 +- src/runner_doiact_functions_hydro_gpu.h | 23 ++++++++++--------- src/runner_main_clean.cu | 2 +- src/space.h | 2 +- 4 files changed, 15 insertions(+), 14 deletions(-) diff --git a/examples/HydroTests/GreshoVortex_3D/gresho.yml b/examples/HydroTests/GreshoVortex_3D/gresho.yml index 36d2f23ea8..d4ce6525fc 100644 --- a/examples/HydroTests/GreshoVortex_3D/gresho.yml +++ b/examples/HydroTests/GreshoVortex_3D/gresho.yml @@ -29,7 +29,7 @@ Statistics: # Parameters for the hydrodynamics scheme SPH: - resolution_eta: 1.2348 # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel). + resolution_eta: 1.99 # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel). CFL_condition: 0.1 # Courant-Friedrich-Levy condition for time integration. # Parameters related to the initial conditions diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index d7acca7f8d..f9d295bacc 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -212,7 +212,7 @@ double runner_doself1_pack_f4(struct runner *r, struct scheduler *s, * launch_leftovers statement)*/ clock_gettime(CLOCK_REALTIME, &t1); /* Release the lock on the cell */ - task_unlock(t); + cell_unlocktree(ci); t->gpu_done = 1; // cell_unlocktree(ci); // // MATTHIEU signal_sleeping_runners(s, t); @@ -334,8 +334,7 @@ double runner_doself1_pack_f4_g(struct runner *r, struct scheduler *s, * launch_leftovers statement)*/ clock_gettime(CLOCK_REALTIME, &t1); /* Release the lock on the cell */ - task_unlock(t); -// cell_unlocktree(ci); + cell_unlocktree(ci); // // MATTHIEU signal_sleeping_runners(s, t); return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; } @@ -637,7 +636,8 @@ double runner_dopair1_pack_f4(struct runner *r, struct scheduler *s, atomic_dec(&(s->queues[qid].n_packs_pair_left)); t->done = 1; /* Copies done. Release the lock ! */ - task_unlock(t); + cell_unlocktree(ci); + cell_unlocktree(cj); pack_vars->tasks_packed++; pack_vars->launch = 0; pack_vars->launch_leftovers = 0; @@ -838,7 +838,8 @@ double runner_dopair1_pack_f4_g(struct runner *r, struct scheduler *s, atomic_dec(&(s->queues[qid].n_packs_pair_left_g)); t->done = 1; /* Copies done. Release the lock ! */ - task_unlock(t); + cell_unlocktree(ci); + cell_unlocktree(cj); pack_vars->tasks_packed++; pack_vars->launch = 0; pack_vars->launch_leftovers = 0; @@ -1041,7 +1042,8 @@ double runner_dopair1_pack_f4_f(struct runner *r, struct scheduler *s, atomic_dec(&(s->queues[qid].n_packs_pair_left_f)); t->done = 1; /* Copies done. Release the lock ! 
*/ - task_unlock(t); + cell_unlocktree(ci); + cell_unlocktree(cj); pack_vars->tasks_packed++; pack_vars->launch = 0; pack_vars->launch_leftovers = 0; @@ -2894,11 +2896,10 @@ void runner_dopair1_launch_f4_one_memcpy( cii->gpu_done_pair++; cjj->gpu_done_pair++; -// // /* Release the locks */ -// cell_unlocktree(cii); -// // /* Release the locks */ -// cell_unlocktree(cjj); - task_unlock(tii); + // /* Release the locks */ + cell_unlocktree(cii); + // /* Release the locks */ + cell_unlocktree(cjj); /*Time end of unpacking*/ clock_gettime(CLOCK_REALTIME, &tp1); diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 867ba0aa87..7d70fdfa60 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -980,7 +980,7 @@ void *runner_main2(void *data) { t = scheduler_gettask(sched, r->qid, prev); TIMER_TOC(timer_gettask); hang_time += getticks() - tic_get_task; - if (hang_time > 1000000000000000000){ + if (hang_time > 1000000000000000){ message("I'm stuck runner %i", r->cpuid); } /* Did I get anything? */ diff --git a/src/space.h b/src/space.h index f74ec421a1..de67162e73 100644 --- a/src/space.h +++ b/src/space.h @@ -55,7 +55,7 @@ struct hydro_props; #define space_extra_bparts_default 0 #define space_extra_sinks_default 0 #define space_expected_max_nr_strays_default 100 -#define space_subsize_pair_hydro_default 256000000 +#define space_subsize_pair_hydro_default 4096 #define space_subsize_self_hydro_default 64 #define space_subsize_pair_stars_default 256000000 #define space_subsize_self_stars_default 32000 From 3e0c7b85aa18ae6fcc56ef5b961c1708a71d5dce Mon Sep 17 00:00:00 2001 From: Abouzied Date: Thu, 21 Nov 2024 17:25:15 +0000 Subject: [PATCH 093/217] Removed unnecessary counter from engine_maketasks --- src/engine_maketasks.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/engine_maketasks.c b/src/engine_maketasks.c index 30f826129f..98100152d6 100644 --- a/src/engine_maketasks.c +++ b/src/engine_maketasks.c @@ -4820,9 +4820,6 @@ void engine_maketasks(struct engine *e) { scheduler_reset(sched, engine_estimate_nr_tasks(e)); ticks tic2 = getticks(); - /*Initialise GPU task size in prep. for creation A. Nasar */ - sched->target_gpu_tasks = 32*32*32;//s->nr_cells; // OK AS LONG AS NOT SPLITTING - const int target_gpu_tasks = sched->target_gpu_tasks; /* Construct the first hydro loop over neighbours */ if (e->policy & engine_policy_hydro) From fe5d9459660719bdf8b548dc6d906cae312e62cb Mon Sep 17 00:00:00 2001 From: Abouzied Date: Thu, 21 Nov 2024 17:43:17 +0000 Subject: [PATCH 094/217] Changed how s->nr_self_pack_tasks is incremented in scheduler.c --- src/scheduler.c | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/src/scheduler.c b/src/scheduler.c index ae4572177d..9456e23ac5 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -1768,27 +1768,24 @@ struct task *scheduler_addtask(struct scheduler *s, enum task_types type, if (cj != NULL) cell_set_flag(cj, cell_flag_has_tasks); // #ifdef WITH_CUDA A.
Nasar - if ((t->type == task_type_self || t->type == task_type_sub_self) && t->subtype == task_subtype_gpu_pack) { - atomic_inc(&s->nr_self_pack_tasks); + if(t->subtype == task_subtype_gpu_pack){ + if(t->type == task_type_self || t->type == task_type_sub_self) + atomic_inc(&s->nr_self_pack_tasks); + if(t->type == task_type_pair || t->type == task_type_sub_pair) + atomic_inc(&s->nr_pair_pack_tasks); } - if ((t->type == task_type_pair || t->type == task_type_sub_pair) && t->subtype == task_subtype_gpu_pack) { - atomic_inc(&s->nr_pair_pack_tasks); + if(t->subtype == task_subtype_gpu_pack_f){ + if(t->type == task_type_self || t->type == task_type_sub_self) + atomic_inc(&s->nr_self_pack_tasks_f); + if(t->type == task_type_pair || t->type == task_type_sub_pair) + atomic_inc(&s->nr_pair_pack_tasks_f); } - // #ifdef WITH_CUDA - if ((t->type == task_type_self || t->type == task_type_sub_self) && t->subtype == task_subtype_gpu_pack_g) { - atomic_inc(&s->nr_self_pack_tasks_g); + if(t->subtype == task_subtype_gpu_pack_g){ + if(t->type == task_type_self || t->type == task_type_sub_self) + atomic_inc(&s->nr_self_pack_tasks_g); + if(t->type == task_type_pair || t->type == task_type_sub_pair) + atomic_inc(&s->nr_pair_pack_tasks_g); } - if ((t->type == task_type_pair || t->type == task_type_sub_pair) && t->subtype == task_subtype_gpu_pack_g) { - atomic_inc(&s->nr_pair_pack_tasks_g); - } - // #ifdef WITH_CUDA - if ((t->type == task_type_self || t->type == task_type_sub_self) && t->subtype == task_subtype_gpu_pack_f) { - atomic_inc(&s->nr_self_pack_tasks_f); - } - if ((t->type == task_type_pair || t->type == task_type_sub_pair) && t->subtype == task_subtype_gpu_pack_f) { - atomic_inc(&s->nr_pair_pack_tasks_f); - } - // #endif /* Add an index for it. */ // lock_lock( &s->lock ); From 744c4690aacf1c0f196ead9257ef16d5ea342046 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Thu, 21 Nov 2024 18:01:48 +0000 Subject: [PATCH 095/217] Commented out timing code in runner_main. The code ran for a full simulation so edit from prev commit seems to have done the trick. --- src/runner_main_clean.cu | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 7d70fdfa60..8d1c83b10b 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -972,17 +972,17 @@ void *runner_main2(void *data) { tasks_done_gpu_inc = 0; ticks hang_time = getticks(); while (1) { - ticks tic_get_task = getticks(); +// ticks tic_get_task = getticks(); /* If there's no old task, try to get a new one. */ if (t == NULL) { /* Get the task. */ TIMER_TIC t = scheduler_gettask(sched, r->qid, prev); TIMER_TOC(timer_gettask); - hang_time += getticks() - tic_get_task; - if (hang_time > 1000000000000000){ - message("I'm stuck runner %i", r->cpuid); - } +// hang_time += getticks() - tic_get_task; +// if (hang_time > 1000000000000000){ +// message("I'm stuck runner %i", r->cpuid); +// } /* Did I get anything? */ if (t == NULL) break; } From 5b54b0f22275a3ce168ded398c1d990dc185bc79 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Sat, 14 Dec 2024 13:30:47 +0000 Subject: [PATCH 096/217] Made changes to task dump for debugging to account for (skip) cell-less unpack tasks in some loop. Allowed stealing (broken and code hangs). 
Changed if statements in scheduler steal function --- .../HydroTests/GreshoVortex_3D/gresho.yml | 3 +- src/engine_config.c | 10 +++---- src/scheduler.c | 30 +++++++++---------- src/task.c | 9 ++++-- 4 files changed, 29 insertions(+), 23 deletions(-) diff --git a/examples/HydroTests/GreshoVortex_3D/gresho.yml b/examples/HydroTests/GreshoVortex_3D/gresho.yml index d4ce6525fc..ea0d137805 100644 --- a/examples/HydroTests/GreshoVortex_3D/gresho.yml +++ b/examples/HydroTests/GreshoVortex_3D/gresho.yml @@ -7,8 +7,9 @@ InternalUnitSystem: UnitTemp_in_cgs: 1 # Kelvin Scheduler: - max_top_level_cells: 8 + max_top_level_cells: 16 tasks_per_cell: 200 + deadlock_waiting_time_s: 10 # Parameters governing the time integration TimeIntegration: time_begin: 0. # The starting time of the simulation (in internal units). diff --git a/src/engine_config.c b/src/engine_config.c index 28cbb3f671..ff3ff5ec9f 100644 --- a/src/engine_config.c +++ b/src/engine_config.c @@ -923,12 +923,12 @@ void engine_config(int restart, int fof, struct engine *e, parser_get_opt_param_float(params, "Scheduler:links_per_tasks", 25.); /* Init the scheduler. Allow stealing*/ - // scheduler_init(&e->sched, e->s, maxtasks, nr_queues, - // (e->policy & scheduler_flag_steal), e->nodeID, - // &e->threadpool); + scheduler_init(&e->sched, e->s, maxtasks, nr_queues, + (e->policy & scheduler_flag_steal), e->nodeID, + &e->threadpool); /* Init the scheduler. NO stealing A. Nasar */ - scheduler_init(&e->sched, e->s, maxtasks, nr_queues, 0, e->nodeID, - &e->threadpool); +// scheduler_init(&e->sched, e->s, maxtasks, nr_queues, 0, e->nodeID, +// &e->threadpool); /* Maximum size of MPI task messages, in KB, that should not be buffered, * that is sent using MPI_Issend, not MPI_Isend. 4Mb by default. Can be diff --git a/src/scheduler.c b/src/scheduler.c index 9456e23ac5..d2668e005c 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -3134,8 +3134,8 @@ struct task *scheduler_gettask(struct scheduler *s, int qid, if (res != NULL) break; } - /* If unsuccessful, try stealing from the other queues. A. Nasar commented - * out for GPU work*/ + /* If unsuccessful, try stealing from the other queues. A. Nasar + * flag set to zero for GPU work*/ if (s->flags & scheduler_flag_steal) { int count = 0, qids[nr_queues]; @@ -3159,39 +3159,39 @@ struct task *scheduler_gettask(struct scheduler *s, int qid, /* Lucky?
*/ if (res != NULL) { - - if ((res->type == task_type_self || - res->type == task_type_sub_self)&& +// if (res != NULL && res->subtype != task_subtype_gpu_pack +// && res->subtype != task_subtype_gpu_pack_g +// && res->subtype != task_subtype_gpu_pack_f +// && res->subtype != task_subtype_gpu_unpack +// && res->subtype != task_subtype_gpu_unpack_g +// && res->subtype != task_subtype_gpu_unpack_f) { + + if ((res->type == task_type_self)&& res->subtype == task_subtype_gpu_pack) { atomic_inc(&s->queues[qid].n_packs_self_left); atomic_dec(&s->queues[qids[ind]].n_packs_self_left); } - if ((res->type == task_type_self || - res->type == task_type_sub_self)&& + if ((res->type == task_type_self)&& res->subtype == task_subtype_gpu_pack_g) { atomic_inc(&s->queues[qid].n_packs_self_left_g); atomic_dec(&s->queues[qids[ind]].n_packs_self_left_g); } - if ((res->type == task_type_self || - res->type == task_type_sub_self)&& + if ((res->type == task_type_self)&& res->subtype == task_subtype_gpu_pack_f) { atomic_inc(&s->queues[qid].n_packs_self_left_f); atomic_dec(&s->queues[qids[ind]].n_packs_self_left_f); } - if ((res->type == task_type_pair || - res->type == task_type_sub_pair)&& + if ((res->type == task_type_pair)&& res->subtype == task_subtype_gpu_pack) { atomic_inc(&s->queues[qid].n_packs_pair_left); atomic_dec(&s->queues[qids[ind]].n_packs_pair_left); } - if ((res->type == task_type_pair || - res->type == task_type_sub_pair)&& + if ((res->type == task_type_pair)&& res->subtype == task_subtype_gpu_pack_g) { atomic_inc(&s->queues[qid].n_packs_pair_left_g); atomic_dec(&s->queues[qids[ind]].n_packs_pair_left_g); } - if ((res->type == task_type_pair || - res->type == task_type_sub_pair)&& + if ((res->type == task_type_pair)&& res->subtype == task_subtype_gpu_pack_f) { atomic_inc(&s->queues[qid].n_packs_pair_left_f); atomic_dec(&s->queues[qids[ind]].n_packs_pair_left_f); diff --git a/src/task.c b/src/task.c index 52b1e3fb43..b45bc4fdf8 100644 --- a/src/task.c +++ b/src/task.c @@ -1745,8 +1745,13 @@ void task_dump_active(struct engine *e) { /* Get destination rank of MPI requests. */ int paired = (t->cj != NULL); - int otherrank = t->ci->nodeID; - if (paired) otherrank = t->cj->nodeID; + int otherrank = 0; + if(t->subtype!= task_subtype_gpu_unpack && t->subtype!= task_subtype_gpu_unpack_f + && t->subtype!= task_subtype_gpu_unpack_g) + otherrank = t->ci->nodeID; + if (paired && t->subtype!= task_subtype_gpu_unpack && t->subtype!= task_subtype_gpu_unpack_f + && t->subtype!= task_subtype_gpu_unpack_g) + otherrank = t->cj->nodeID;; fprintf(file_thread, "%i %i %s %s %i %i %lli %lli %i %i %i %i %lli\n", engine_rank, otherrank, taskID_names[t->type], From 7a168d57bddff7b9d99c523d8eafd9c6d6840a0d Mon Sep 17 00:00:00 2001 From: Abouzied Date: Sat, 14 Dec 2024 13:59:23 +0000 Subject: [PATCH 097/217] Added comments to remin myself to threadpool some bits in GPU task mgmt in engine_maketasks.c --- src/engine_maketasks.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/engine_maketasks.c b/src/engine_maketasks.c index 98100152d6..920d9ba3be 100644 --- a/src/engine_maketasks.c +++ b/src/engine_maketasks.c @@ -4933,7 +4933,7 @@ void engine_maketasks(struct engine *e) { * sched->tasks, sched->nr_tasks, sizeof(struct task), * threadpool_auto_chunk_size, e); */ } - + /*These loops should really be threadmapped A. 
Nasar*/ for (int i = 0; i < sched->nr_tasks; i++) { struct task * t = &sched->tasks[i]; if(t->type == task_type_sub_self && t->subtype == task_subtype_gpu_pack){ @@ -4969,7 +4969,8 @@ void engine_maketasks(struct engine *e) { struct task *last_created_self_unpack = NULL; struct task *last_created_pair_unpack = NULL; - /* Loop over all the currently existing pack tasks */ + /* Loop over all the currently existing pack tasks + * These loops should be thread-mapped too but will be a bit more tricky: A. Nasar*/ for (int i = 0; i < sched->nr_tasks; i++) { struct task *t = &sched->tasks[i]; From f7bf486dbf0311331117e8dd88e0d9b061d62d85 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Sun, 15 Dec 2024 16:28:53 +0000 Subject: [PATCH 098/217] Added code to pass eta_neighbours (h/dx) to GPU code for calculating ideal N parts per task --- src/runner_main_clean.cu | 20 ++++++++++++++++++++ src/space.h | 3 +++ swift.c | 4 ++-- 3 files changed, 25 insertions(+), 2 deletions(-) diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 8d1c83b10b..b730c17a2a 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -657,6 +657,26 @@ void *runner_main2(void *data) { if ((res = MPI_Comm_size(MPI_COMM_WORLD, &nr_nodes)) != MPI_SUCCESS) error("MPI_Comm_size failed with error %i.", res); #endif + int parts_per_top_level_cell = space->nr_local_cells_with_particles / + space->nr_parts; /*A. Nasar: What I think is a good approximation for + average N particles in each top level cell*/ + int buff = 30; /*A. Nasar: Increase parts per recursed task-level cell by buffer to + ensure we allocate enough memory*/ +// int np_per_cell = ceil(2.0 * e->eta_neighbours); /*A. Nasar: 2h/dx roughly*/ + +// char *param_filename = &e->param_filename; +// struct swift_params *params = +// (struct swift_params *)malloc(sizeof(struct swift_params)); +// if (params == NULL) error("Error allocating memory for the parameter file."); +// message("Reading runtime parameters from file '%s'", param_filename); +// parser_read_file(param_filename, params); + + float eta_neighbours = e->s->eta_neighbours; + int np_per_cell = ceil(2.0 * eta_neighbours); + np_per_cell *= np_per_cell * np_per_cell; + fprintf(stderr, "np_per_cell target is %i, eta_neighbours %f\n", np_per_cell, eta_neighbours); + exit(0); + int count_max_parts_tmp = 2 * target_n_tasks * space->nr_parts * nr_nodes / space->tot_cells; message("max_parts %i\n", count_max_parts_tmp); diff --git a/src/space.h b/src/space.h index de67162e73..92952a822b 100644 --- a/src/space.h +++ b/src/space.h @@ -94,6 +94,9 @@ extern double engine_foreign_alloc_margin; */ struct space { + /*Used to define GPU task memory allocation*/ + float eta_neighbours; + /*! Spatial extent. */ double dim[3]; diff --git a/swift.c b/swift.c index b63941cd63..7a9277ae5c 100644 --- a/swift.c +++ b/swift.c @@ -1108,7 +1108,7 @@ int main(int argc, char *argv[]) { hydro_props_init(&hydro_properties, &prog_const, &us, params); else bzero(&hydro_properties, sizeof(struct hydro_props)); - + float eta_neighbours = hydro_properties.eta_neighbours; /* Initialise the equation of state */ if (with_hydro) eos_init(&eos, &prog_const, &us, params); @@ -1388,7 +1388,7 @@ int main(int argc, char *argv[]) { with_self_gravity, with_star_formation, with_sinks, with_DM_particles, with_DM_background_particles, with_neutrinos, talking, dry_run, nr_nodes); - + s.eta_neighbours = eta_neighbours; /* Initialise the line of sight properties. 
*/ if (with_line_of_sight) los_init(s.dim, &los_properties, params); From ff7087c368e3ebc9dae37bb7147f2fffa3c3a151 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Mon, 16 Dec 2024 15:12:56 +0000 Subject: [PATCH 099/217] Removed conversion of sub-pairs to pairs, etc, in engine_maketasks for GPU tasks. Added some debug checks here and there. For some reason I get cells with way more particles than anticipated even though split size parameters in space.h set to very small values --- .../HydroTests/GreshoVortex_3D/gresho.yml | 4 +- src/engine_config.c | 10 ++-- src/engine_maketasks.c | 58 ++++++++++--------- src/runner_doiact_functions_hydro_gpu.h | 4 +- src/runner_gpu_pack_functions.c | 3 +- src/runner_main_clean.cu | 40 ++++++++----- src/scheduler.c | 2 - src/space.h | 4 +- 8 files changed, 70 insertions(+), 55 deletions(-) diff --git a/examples/HydroTests/GreshoVortex_3D/gresho.yml b/examples/HydroTests/GreshoVortex_3D/gresho.yml index ea0d137805..21cf7ebae1 100644 --- a/examples/HydroTests/GreshoVortex_3D/gresho.yml +++ b/examples/HydroTests/GreshoVortex_3D/gresho.yml @@ -7,9 +7,10 @@ InternalUnitSystem: UnitTemp_in_cgs: 1 # Kelvin Scheduler: - max_top_level_cells: 16 + max_top_level_cells: 8 tasks_per_cell: 200 deadlock_waiting_time_s: 10 + cell_split_size: 100 # Parameters governing the time integration TimeIntegration: time_begin: 0. # The starting time of the simulation (in internal units). @@ -37,3 +38,4 @@ SPH: InitialConditions: file_name: ./greshoVortex.hdf5 # The file to read periodic: 1 + replicate: 2 diff --git a/src/engine_config.c b/src/engine_config.c index ff3ff5ec9f..70d3cfd1d4 100644 --- a/src/engine_config.c +++ b/src/engine_config.c @@ -923,12 +923,12 @@ void engine_config(int restart, int fof, struct engine *e, parser_get_opt_param_float(params, "Scheduler:links_per_tasks", 25.); /* Init the scheduler. Allow stealing*/ - scheduler_init(&e->sched, e->s, maxtasks, nr_queues, - (e->policy & scheduler_flag_steal), e->nodeID, - &e->threadpool); +// scheduler_init(&e->sched, e->s, maxtasks, nr_queues, +// (e->policy & scheduler_flag_steal), e->nodeID, +// &e->threadpool); /* Init the scheduler. NO stealing A. Nasar */ -// scheduler_init(&e->sched, e->s, maxtasks, nr_queues, 0, e->nodeID, -// &e->threadpool); + scheduler_init(&e->sched, e->s, maxtasks, nr_queues, 0, e->nodeID, + &e->threadpool); /* Maximum size of MPI task messages, in KB, that should not be buffered, * that is sent using MPI_Issend, not MPI_Isend. 4Mb by default. Can be diff --git a/src/engine_maketasks.c b/src/engine_maketasks.c index 920d9ba3be..c2fdd05f48 100644 --- a/src/engine_maketasks.c +++ b/src/engine_maketasks.c @@ -4852,9 +4852,7 @@ void engine_maketasks(struct engine *e) { tic2 = getticks(); /* Split the tasks. */ -// message("before split"); scheduler_splittasks(sched, /*fof_tasks=*/0, e->verbose); -// message("after split"); if (e->verbose) message("Splitting tasks took %.3f %s.", @@ -4933,31 +4931,37 @@ void engine_maketasks(struct engine *e) { * sched->tasks, sched->nr_tasks, sizeof(struct task), * threadpool_auto_chunk_size, e); */ } - /*These loops should really be threadmapped A. 
Nasar*/ - for (int i = 0; i < sched->nr_tasks; i++) { - struct task * t = &sched->tasks[i]; - if(t->type == task_type_sub_self && t->subtype == task_subtype_gpu_pack){ - t->type = task_type_self; - } - if(t->type == task_type_sub_pair && t->subtype == task_subtype_gpu_pack){ - t->type = task_type_pair; - } - } - for (int i = 0; i < sched->nr_tasks; i++) { - struct task * t = &sched->tasks[i]; - if(t->type == task_type_sub_self && t->subtype == task_subtype_gpu_pack_g){ - t->type = task_type_self; - } - if(t->type == task_type_sub_pair && t->subtype == task_subtype_gpu_pack_g){ - t->type = task_type_pair; - } - if(t->type == task_type_sub_self && t->subtype == task_subtype_gpu_pack_f){ - t->type = task_type_self; - } - if(t->type == task_type_sub_pair && t->subtype == task_subtype_gpu_pack_f){ - t->type = task_type_pair; - } - } +// int unsplit = 0, split = 0; +// /*These loops should really be threadmapped A. Nasar*/ +// for (int i = 0; i < sched->nr_tasks; i++) { +// struct task * t = &sched->tasks[i]; +// if(t->type == task_type_sub_self && t->subtype == task_subtype_gpu_pack){ +// t->type = task_type_self; +// split++; +// message("sub_self"); +// } +// if(t->type == task_type_sub_pair && t->subtype == task_subtype_gpu_pack){ +// t->type = task_type_pair; +// message("sub_pair"); +// split++; +// } +// if(t->type == task_type_sub_self && t->subtype == task_subtype_gpu_pack_g){ +// t->type = task_type_self; +// message("sub_self"); +// } +// if(t->type == task_type_sub_pair && t->subtype == task_subtype_gpu_pack_g){ +// t->type = task_type_pair; +// message("sub_pair"); +// } +// if(t->type == task_type_sub_self && t->subtype == task_subtype_gpu_pack_f){ +// t->type = task_type_self; +// message("sub_self"); +// } +// if(t->type == task_type_sub_pair && t->subtype == task_subtype_gpu_pack_f){ +// t->type = task_type_pair; +// message("sub_pair"); +// } +// } /* Now, create unpack tasks based on the existing packs and create * the dependencies pack->unpack->ghost_in A. Nasar */ diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index f9d295bacc..e6f4f8f47a 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -2766,6 +2766,8 @@ void runner_dopair1_launch_f4_one_memcpy( fparti_fpartj_lparti_lpartj_dens[tid].y; parts_in_bundle_cj += count_j; max_parts_j = max(max_parts_j, count_j); +// if(count_i > 100 || count_j > 100) +// error("Sending data for excessive n parts %i %i", count_i, count_j); } } const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; @@ -2786,7 +2788,7 @@ void runner_dopair1_launch_f4_one_memcpy( "CUDA error with pair density H2D async memcpy ci: %s cpuid id " "is: %i\n ", cudaGetErrorString(cu_error), r->cpuid); - error("Something's up with your cuda code"); + error("Something's up with your cuda code first_part %i bundle size %i", first_part_tmp_i, bundle_n_parts); } #endif /* LAUNCH THE GPU KERNELS for ci & cj */ diff --git a/src/runner_gpu_pack_functions.c b/src/runner_gpu_pack_functions.c index de0feff44d..617b9b8e3b 100644 --- a/src/runner_gpu_pack_functions.c +++ b/src/runner_gpu_pack_functions.c @@ -88,7 +88,8 @@ void runner_doself1_gpu_pack_neat_aos_f4( #ifdef SWIFT_DEBUG_CHECKS if (local_pack_position + count >= count_max_parts_tmp) { - fprintf(stderr, "Exceeded count_max_parts_tmp. Make arrays bigger!\n"); + fprintf(stderr, "Exceeded count_max_parts_tmp. Make arrays bigger! 
count_max %i count %i\n" + , count_max_parts_tmp, local_pack_position + count); error("0"); } #endif diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index b730c17a2a..c0cb3f44d5 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -660,26 +660,19 @@ void *runner_main2(void *data) { int parts_per_top_level_cell = space->nr_local_cells_with_particles / space->nr_parts; /*A. Nasar: What I think is a good approximation for average N particles in each top level cell*/ - int buff = 30; /*A. Nasar: Increase parts per recursed task-level cell by buffer to - ensure we allocate enough memory*/ -// int np_per_cell = ceil(2.0 * e->eta_neighbours); /*A. Nasar: 2h/dx roughly*/ - -// char *param_filename = &e->param_filename; -// struct swift_params *params = -// (struct swift_params *)malloc(sizeof(struct swift_params)); -// if (params == NULL) error("Error allocating memory for the parameter file."); -// message("Reading runtime parameters from file '%s'", param_filename); -// parser_read_file(param_filename, params); - float eta_neighbours = e->s->eta_neighbours; int np_per_cell = ceil(2.0 * eta_neighbours); np_per_cell *= np_per_cell * np_per_cell; - fprintf(stderr, "np_per_cell target is %i, eta_neighbours %f\n", np_per_cell, eta_neighbours); - exit(0); + int buff = ceil(0.5 * np_per_cell); /*A. Nasar: Increase parts per recursed task-level cell by buffer to + ensure we allocate enough memory*/ + int tot_self_tasks = space->nr_parts / np_per_cell; + +// int count_max_parts_tmp = +// 2 * target_n_tasks * space->nr_parts * nr_nodes / space->tot_cells; int count_max_parts_tmp = - 2 * target_n_tasks * space->nr_parts * nr_nodes / space->tot_cells; - message("max_parts %i\n", count_max_parts_tmp); + target_n_tasks * (np_per_cell + buff); + message("max_parts %i, n_tasks_GPU %i\n", count_max_parts_tmp, target_n_tasks); pack_vars_self_dens->count_max_parts = count_max_parts_tmp; pack_vars_pair_dens->count_max_parts = count_max_parts_tmp; pack_vars_self_forc->count_max_parts = count_max_parts_tmp; @@ -987,6 +980,9 @@ void *runner_main2(void *data) { sched->nr_packs_pair_forc_done = 0; sched->nr_packs_self_grad_done = 0; sched->nr_packs_pair_grad_done = 0; + int g100 = 0; + int l100 = 0; + int maxcount = 0; /* Loop while there are tasks... */ tasks_done_gpu_inc = 0; @@ -1312,7 +1308,15 @@ void *runner_main2(void *data) { #endif // DO_CORNERS ticks tic_cpu_pack = getticks(); - + if (ci == ci->hydro.super || cj == cj->hydro.super){ + g100++; +// message("GPU working on top level cell"); + } +// if(ci->hydro.count > 100){ +// g100++; +// maxcount = max(ci->hydro.count, maxcount); +// } +// else l100++; packing_time_pair += runner_dopair1_pack_f4( r, sched, pack_vars_pair_dens, ci, cj, t, parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens); @@ -1918,6 +1922,8 @@ void *runner_main2(void *data) { error("Unknown/invalid task type (%d).", t->type); } r->active_time += (getticks() - task_beg); +// if(g100 > 0) +// message("less than 100 %i more than 100 %i max count %i", l100, g100, maxcount); /* Mark that we have run this task on these cells */ #ifdef SWIFT_DEBUG_CHECKS @@ -1978,6 +1984,8 @@ void *runner_main2(void *data) { t = scheduler_done(sched, t); } } /* main loop. 
*/ + + message("Worked on %i supers of split cells", g100); // Stuff for writing debug data to file for validation //// if (step % 10 == 0 || step == 1) { // if(r->cpuid == 0 && engine_rank == 0)fprintf(fgpu_steps, "x, y, z, diff --git a/src/scheduler.c b/src/scheduler.c index d2668e005c..d7084102ba 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -1368,8 +1368,6 @@ static void scheduler_splittask_hydro(struct task *t, struct scheduler *s) { /* Otherwise, break it up if it is too large? */ } else if (scheduler_doforcesplit && ci->split && cj->split && (ci->hydro.count > space_maxsize / cj->hydro.count)) { - // message( "force splitting pair with %i and %i parts." , - // ci->hydro.count , cj->hydro.count ); /* Replace the current task. */ t->type = task_type_none; diff --git a/src/space.h b/src/space.h index 92952a822b..9ca4783ad5 100644 --- a/src/space.h +++ b/src/space.h @@ -55,8 +55,8 @@ struct hydro_props; #define space_extra_bparts_default 0 #define space_extra_sinks_default 0 #define space_expected_max_nr_strays_default 100 -#define space_subsize_pair_hydro_default 4096 -#define space_subsize_self_hydro_default 64 +#define space_subsize_pair_hydro_default 100 +#define space_subsize_self_hydro_default 100 #define space_subsize_pair_stars_default 256000000 #define space_subsize_self_stars_default 32000 #define space_subsize_pair_grav_default 256000000 From ca9deaf24b021b17783f38a88b43e35fcba5ef14 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Tue, 17 Dec 2024 13:49:28 +0000 Subject: [PATCH 100/217] Added more debug checks to figure out why we got cells with 8x expected particles. Also added a modified scheduler_splittasks() function to split GPU tasks but still no joy. Could it be that some cells are not given progeny with size 2h^3? --- .../HydroTests/GreshoVortex_3D/gresho.yml | 2 +- src/Makefile.am | 2 +- src/engine_maketasks.c | 22 +- src/runner_main_clean.cu | 29 ++- src/scheduler.c | 198 +++++++++++++++++- src/space.h | 2 +- 6 files changed, 237 insertions(+), 18 deletions(-) diff --git a/examples/HydroTests/GreshoVortex_3D/gresho.yml b/examples/HydroTests/GreshoVortex_3D/gresho.yml index 21cf7ebae1..2a8aa1e70b 100644 --- a/examples/HydroTests/GreshoVortex_3D/gresho.yml +++ b/examples/HydroTests/GreshoVortex_3D/gresho.yml @@ -10,7 +10,7 @@ Scheduler: max_top_level_cells: 8 tasks_per_cell: 200 deadlock_waiting_time_s: 10 - cell_split_size: 100 + cell_split_size: 80 # Parameters governing the time integration TimeIntegration: time_begin: 0. # The starting time of the simulation (in internal units). diff --git a/src/Makefile.am b/src/Makefile.am index bfb38a5929..99092acde4 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -16,7 +16,7 @@ # along with this program. If not, see . 
# Add the non-standard paths to the included library headers -AM_CFLAGS = $(HDF5_CPPFLAGS) $(GSL_INCS) $(FFTW_INCS) $(NUMA_INCS) $(GRACKLE_INCS) $(SUNDIALS_INCS) $(CHEALPIX_CFLAGS) +AM_CFLAGS = $(HDF5_CPPFLAGS) $(GSL_INCS) $(FFTW_INCS) $(NUMA_INCS) $(GRACKLE_INCS) $(SUNDIALS_INCS) $(CHEALPIX_CFLAGS) -O0 # Add HIP Path AM_CFLAGS += -D__HIP_PLATFORM_AMD__ diff --git a/src/engine_maketasks.c b/src/engine_maketasks.c index c2fdd05f48..cb30e14dd0 100644 --- a/src/engine_maketasks.c +++ b/src/engine_maketasks.c @@ -4937,29 +4937,27 @@ void engine_maketasks(struct engine *e) { // struct task * t = &sched->tasks[i]; // if(t->type == task_type_sub_self && t->subtype == task_subtype_gpu_pack){ // t->type = task_type_self; -// split++; -// message("sub_self"); +// fprintf(stderr, "sub_self"); // } // if(t->type == task_type_sub_pair && t->subtype == task_subtype_gpu_pack){ // t->type = task_type_pair; -// message("sub_pair"); -// split++; +// fprintf(stderr, "sub_pair"); // } // if(t->type == task_type_sub_self && t->subtype == task_subtype_gpu_pack_g){ // t->type = task_type_self; -// message("sub_self"); +// fprintf(stderr, "sub_self"); // } // if(t->type == task_type_sub_pair && t->subtype == task_subtype_gpu_pack_g){ // t->type = task_type_pair; -// message("sub_pair"); +// fprintf(stderr, "sub_pair"); // } // if(t->type == task_type_sub_self && t->subtype == task_subtype_gpu_pack_f){ // t->type = task_type_self; -// message("sub_self"); +// fprintf(stderr, "sub_self"); // } // if(t->type == task_type_sub_pair && t->subtype == task_subtype_gpu_pack_f){ // t->type = task_type_pair; -// message("sub_pair"); +// fprintf(stderr, "sub_pair"); // } // } @@ -5164,6 +5162,14 @@ void engine_maketasks(struct engine *e) { error("We did not find the correct number of F pair pack tasks!!"); #endif + for (int i = 0; i < sched->nr_tasks; i++) { + struct task *t = &sched->tasks[i]; + if(t->ci != NULL){ + if(t->ci->hydro.count > 80 && t->subtype == task_subtype_gpu_pack) + error("Count is %i task subtype (%s)", + t->ci->hydro.count, subtaskID_names[t->subtype]); + } + } if (e->verbose) message("Making extra hydroloop tasks took %.3f %s.", clocks_from_ticks(getticks() - tic2), clocks_getunit()); diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index c0cb3f44d5..7a292113fc 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -1058,6 +1058,13 @@ void *runner_main2(void *data) { } else if (t->subtype == task_subtype_gpu_pack) { packed_self++; #ifdef GPUOFFLOAD_DENSITY + if(ci->hydro.count > 2 * np_per_cell){ + g100++; + maxcount = max(ci->hydro.count, maxcount); + error("SELF expecting %i got %i parts" + "Cell ID %i Cell split %d", + np_per_cell, ci->hydro.count, ci->cellID, ci->split); + } // struct timespec t0, t1; // // clock_gettime(CLOCK_REALTIME, &t0); ticks tic_cpu_pack = getticks(); @@ -1308,13 +1315,23 @@ void *runner_main2(void *data) { #endif // DO_CORNERS ticks tic_cpu_pack = getticks(); - if (ci == ci->hydro.super || cj == cj->hydro.super){ - g100++; -// message("GPU working on top level cell"); - } -// if(ci->hydro.count > 100){ +// if (ci == ci->hydro.super || cj == cj->hydro.super){ +// g100++; +//// message("GPU working on top level cell"); +// } +// if(ci->hydro.count > 2 * np_per_cell){ +// g100++; +// maxcount = max(ci->hydro.count, maxcount); +// error("expecting %i got %i parts" +// "Cell ID %i Cell split %d", +// np_per_cell, ci->hydro.count, ci->cellID, ci->split); +// } +// if(ci->hydro.count < 2 * np_per_cell){ // g100++; // maxcount = max(ci->hydro.count, 
maxcount); +// error("expecting %i got %i parts" +// "Cell ID %i Cell split %d", +// np_per_cell, ci->hydro.count, ci->cellID, ci->split); // } // else l100++; packing_time_pair += runner_dopair1_pack_f4( @@ -1985,7 +2002,7 @@ void *runner_main2(void *data) { } } /* main loop. */ - message("Worked on %i supers of split cells", g100); +// message("Worked on %i supers w more than 100 parts", g100); // Stuff for writing debug data to file for validation //// if (step % 10 == 0 || step == 1) { // if(r->cpuid == 0 && engine_rank == 0)fprintf(fgpu_steps, "x, y, z, diff --git a/src/scheduler.c b/src/scheduler.c index d7084102ba..fe44db8404 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -1388,6 +1388,199 @@ static void scheduler_splittask_hydro(struct task *t, struct scheduler *s) { } /* iterate over the current task. */ } +/** + * @brief Split a hydrodynamic task if too large. + * + * @param t The #task + * @param s The #scheduler we are working in. + */ +static void scheduler_splittask_hydro_GPU(struct task *t, struct scheduler *s) { + /* Are we considering both stars and hydro when splitting? */ + /* Note this is not very clean as the scheduler should not really + access the engine... */ + + /* Iterate on this task until we're done with it. */ + int redo = 1; + while (redo) { + /* Reset the redo flag. */ + redo = 0; + + /* Is this a non-empty self-task? */ + const int is_self = + (t->type == task_type_self) && (t->ci != NULL) && + (t->ci->hydro.count > 0); + + /* Is this a non-empty pair-task? */ + const int is_pair = (t->type == task_type_pair) && (t->ci != NULL) && + (t->cj != NULL) && + (t->ci->hydro.count > 0) && + (t->cj->hydro.count > 0); + + /* Empty task? */ + if (!is_self && !is_pair) { + t->type = task_type_none; + t->subtype = task_subtype_none; + t->ci = NULL; + t->cj = NULL; + t->skip = 1; + break; + } + + /* Self-interaction? */ + if (t->type == task_type_self) { + /* Get a handle on the cell involved. */ + struct cell *ci = t->ci; + + /* Foreign task? */ + if (ci->nodeID != s->nodeID) { + t->skip = 1; + break; + } + + /* Is this cell even split and the task does not violate h ? */ + if (cell_can_split_self_hydro_task(ci)) { + + /* Take a step back (we're going to recycle the current task)... */ + redo = 1; + + /* Add the self tasks. */ + int first_child = 0; + while (ci->progeny[first_child] == NULL) first_child++; + + t->ci = ci->progeny[first_child]; + cell_set_flag(t->ci, cell_flag_has_tasks); + + for (int k = first_child + 1; k < 8; k++) { + /* Do we have a non-empty progenitor? */ + if (ci->progeny[k] != NULL && + ci->progeny[k]->hydro.count) { + scheduler_splittask_hydro_GPU( + scheduler_addtask(s, task_type_self, t->subtype, 0, 0, + ci->progeny[k], NULL), + s); + } + } + + /* Make a task for each pair of progeny */ + for (int j = 0; j < 8; j++) { + /* Do we have a non-empty progenitor? */ + if (ci->progeny[j] != NULL && + (ci->progeny[j]->hydro.count)) { + for (int k = j + 1; k < 8; k++) { + /* Do we have a second non-empty progenitor? */ + if (ci->progeny[k] != NULL && + (ci->progeny[k]->hydro.count)) { + scheduler_splittask_hydro_GPU( + scheduler_addtask(s, task_type_pair, t->subtype, + sub_sid_flag[j][k], 0, ci->progeny[j], + ci->progeny[k]), + s); + } + } + } + } + + } /* Cell is split */ + + } /* Self interaction */ + + /* Pair interaction? */ + else if (t->type == task_type_pair) { + /* Get a handle on the cells involved. */ + struct cell *ci = t->ci; + struct cell *cj = t->cj; + + /* Foreign task? 
*/ + if (ci->nodeID != s->nodeID && cj->nodeID != s->nodeID) { + t->skip = 1; + break; + } + + /* Get the sort ID, use space_getsid_and_swap_cells and not t->flags + to make sure we get ci and cj swapped if needed. */ + double shift[3]; + const int sid = space_getsid_and_swap_cells(s->space, &ci, &cj, shift); + +#ifdef SWIFT_DEBUG_CHECKS + if (sid != t->flags) + error("Got pair task with incorrect flags: sid=%d flags=%lld", sid, + t->flags); +#endif + + /* Should this task be split-up? */ + if (cell_can_split_pair_hydro_task(ci) && + cell_can_split_pair_hydro_task(cj)) { + + const int h_count_i = ci->hydro.count; + const int h_count_j = cj->hydro.count; + +// const int s_count_i = ci->stars.count; +// const int s_count_j = cj->stars.count; +// +// int do_sub_hydro = 1; +// if (h_count_i > 0 && h_count_j > 0) { +// +// /* Note: Use division to avoid integer overflow. */ +// do_sub_hydro = +// h_count_i * sid_scale[sid] < space_subsize_pair_hydro / h_count_j; +// } + + /* Replace by a single sub-task? */ +// if (scheduler_dosub && +// (do_sub_hydro) && +// !sort_is_corner(sid)) { +// +// /* Make this task a sub task. */ +// t->type = task_type_sub_pair; +// +// /* Otherwise, split it. */ +// } else { + /* Take a step back (we're going to recycle the current task)... */ + redo = 1; + + /* Loop over the sub-cell pairs for the current sid and add new tasks + * for them. */ + struct cell_split_pair *csp = &cell_split_pairs[sid]; + + t->ci = ci->progeny[csp->pairs[0].pid]; + t->cj = cj->progeny[csp->pairs[0].pjd]; +// if (t->ci != NULL) cell_set_flag(t->ci, cell_flag_has_tasks); +// if (t->cj != NULL) cell_set_flag(t->cj, cell_flag_has_tasks); +// +// t->flags = csp->pairs[0].sid; +// for (int k = 1; k < csp->count; k++) { +// scheduler_splittask_hydro_GPU( +// scheduler_addtask(s, task_type_pair, t->subtype, +// csp->pairs[k].sid, 0, +// ci->progeny[csp->pairs[k].pid], +// cj->progeny[csp->pairs[k].pjd]), +// s); +// } +// } + + /* Otherwise, break it up if it is too large? */ +// } else if (scheduler_doforcesplit && ci->split && cj->split && +// (ci->hydro.count > space_maxsize / cj->hydro.count)) { + + /* Replace the current task. */ + t->type = task_type_none; + + for (int j = 0; j < 8; j++) + if (ci->progeny[j] != NULL && ci->progeny[j]->hydro.count) + for (int k = 0; k < 8; k++) + if (cj->progeny[k] != NULL && cj->progeny[k]->hydro.count) { + struct task *tl = + scheduler_addtask(s, task_type_pair, t->subtype, 0, 0, + ci->progeny[j], cj->progeny[k]); + scheduler_splittask_hydro_GPU(tl, s); + tl->flags = space_getsid_and_swap_cells(s->space, &t->ci, + &t->cj, shift); + } + } + } /* pair interaction? */ + } /* iterate over the current task. */ +} + /** * @brief Split a gravity task if too large. * @@ -1660,10 +1853,13 @@ void scheduler_splittasks_mapper(void *map_data, int num_elements, t->subtype == task_subtype_gpu_pack_g || t->subtype == task_subtype_gpu_pack_f) { scheduler_splittask_hydro(t, s); -// continue; /*Do nothing and grab next task to split*/ } else if (t->subtype == task_subtype_gpu_unpack || t->subtype == task_subtype_gpu_unpack_g || t->subtype == task_subtype_gpu_unpack_f){ + /*Do nothing and grab next task to split. + *These tasks are cell-less so cannot split. 
+ *Will remove this if statement if set on splitting + *b4 creating unpack tasks*/ continue; } else { diff --git a/src/space.h b/src/space.h index 9ca4783ad5..d930f78c98 100644 --- a/src/space.h +++ b/src/space.h @@ -55,7 +55,7 @@ struct hydro_props; #define space_extra_bparts_default 0 #define space_extra_sinks_default 0 #define space_expected_max_nr_strays_default 100 -#define space_subsize_pair_hydro_default 100 +#define space_subsize_pair_hydro_default 200 #define space_subsize_self_hydro_default 100 #define space_subsize_pair_stars_default 256000000 #define space_subsize_self_stars_default 32000 From 408b0e4b629ac8c005f4375d98e7074fa5fde8bc Mon Sep 17 00:00:00 2001 From: Abouzied Date: Tue, 17 Dec 2024 13:52:08 +0000 Subject: [PATCH 101/217] Hard-coded space_splitsize_default to 100 to no avail. Something fishy is going on... --- src/space.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/space.h b/src/space.h index d930f78c98..094cef0b5d 100644 --- a/src/space.h +++ b/src/space.h @@ -46,7 +46,7 @@ struct hydro_props; /* Some constants. */ #define space_cellallocchunk 1000 -#define space_splitsize_default 400 +#define space_splitsize_default 100 #define space_maxsize_default 8000000 #define space_grid_split_threshold_default 400 #define space_extra_parts_default 0 From fa1e7c45aad3ba344a8f0b48c05a05462d8b2ed2 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Tue, 17 Dec 2024 13:55:08 +0000 Subject: [PATCH 102/217] Hard-coded space_grid_split_threshold_default to 100 and space_subsize_pair_hydro_default to no avail. Something fishy is going on... --- src/space.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/space.h b/src/space.h index 094cef0b5d..fc6071afcc 100644 --- a/src/space.h +++ b/src/space.h @@ -48,14 +48,14 @@ struct hydro_props; #define space_cellallocchunk 1000 #define space_splitsize_default 100 #define space_maxsize_default 8000000 -#define space_grid_split_threshold_default 400 +#define space_grid_split_threshold_default 100 #define space_extra_parts_default 0 #define space_extra_gparts_default 0 #define space_extra_sparts_default 100 #define space_extra_bparts_default 0 #define space_extra_sinks_default 0 #define space_expected_max_nr_strays_default 100 -#define space_subsize_pair_hydro_default 200 +#define space_subsize_pair_hydro_default 100 #define space_subsize_self_hydro_default 100 #define space_subsize_pair_stars_default 256000000 #define space_subsize_self_stars_default 32000 From 26585eca588e7630f929675f5608457237b473cf Mon Sep 17 00:00:00 2001 From: Abouzied Date: Wed, 18 Dec 2024 10:29:53 +0000 Subject: [PATCH 103/217] I was barking up the wrong tree. The issue was that some particles have a slightly larger smoothing length and so should not be in the ideally sized box.
Setting count_max_parts_tmp to 2x bigger solves the problem with GPU memory and over-running the buffer arrays --- examples/HydroTests/GreshoVortex_3D/gresho.yml | 4 ++-- src/engine_maketasks.c | 4 +++- src/runner_main_clean.cu | 16 ++++++++-------- src/scheduler.c | 8 +++++++- 4 files changed, 20 insertions(+), 12 deletions(-) diff --git a/examples/HydroTests/GreshoVortex_3D/gresho.yml b/examples/HydroTests/GreshoVortex_3D/gresho.yml index 2a8aa1e70b..1c28f75b51 100644 --- a/examples/HydroTests/GreshoVortex_3D/gresho.yml +++ b/examples/HydroTests/GreshoVortex_3D/gresho.yml @@ -8,7 +8,7 @@ InternalUnitSystem: Scheduler: max_top_level_cells: 8 - tasks_per_cell: 200 + tasks_per_cell: 1000 deadlock_waiting_time_s: 10 cell_split_size: 80 # Parameters governing the time integration @@ -31,7 +31,7 @@ Statistics: # Parameters for the hydrodynamics scheme SPH: - resolution_eta: 1.99 # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel). + resolution_eta: 1.2348 # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel). CFL_condition: 0.1 # Courant-Friedrich-Levy condition for time integration. # Parameters related to the initial conditions diff --git a/src/engine_maketasks.c b/src/engine_maketasks.c index cb30e14dd0..c02b1fc4db 100644 --- a/src/engine_maketasks.c +++ b/src/engine_maketasks.c @@ -5165,7 +5165,9 @@ void engine_maketasks(struct engine *e) { for (int i = 0; i < sched->nr_tasks; i++) { struct task *t = &sched->tasks[i]; if(t->ci != NULL){ - if(t->ci->hydro.count > 80 && t->subtype == task_subtype_gpu_pack) +// if(t->type == task_type_pair && ((t->ci->split && !t->cj->split) || (!t->ci->split && t->cj->split))) +// error("one is split the other isn't"); + if(t->ci->hydro.count > 80 && t->type == task_type_self) error("Count is %i task subtype (%s)", t->ci->hydro.count, subtaskID_names[t->subtype]); } diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 7a292113fc..bd5465b76e 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -671,7 +671,7 @@ void *runner_main2(void *data) { // int count_max_parts_tmp = // 2 * target_n_tasks * space->nr_parts * nr_nodes / space->tot_cells; int count_max_parts_tmp = - target_n_tasks * (np_per_cell + buff); + 2 * target_n_tasks * (np_per_cell + buff); message("max_parts %i, n_tasks_GPU %i\n", count_max_parts_tmp, target_n_tasks); pack_vars_self_dens->count_max_parts = count_max_parts_tmp; pack_vars_pair_dens->count_max_parts = count_max_parts_tmp; @@ -1058,13 +1058,13 @@ void *runner_main2(void *data) { } else if (t->subtype == task_subtype_gpu_pack) { packed_self++; #ifdef GPUOFFLOAD_DENSITY - if(ci->hydro.count > 2 * np_per_cell){ - g100++; - maxcount = max(ci->hydro.count, maxcount); - error("SELF expecting %i got %i parts" - "Cell ID %i Cell split %d", - np_per_cell, ci->hydro.count, ci->cellID, ci->split); - } +// if(ci->hydro.count > 2 * np_per_cell){ +// g100++; +// maxcount = max(ci->hydro.count, maxcount); +// error("SELF expecting %i got %i parts" +// "Cell ID %i Cell split %d", +// np_per_cell, ci->hydro.count, ci->cellID, ci->split); +// } // struct timespec t0, t1; // // clock_gettime(CLOCK_REALTIME, &t0); ticks tic_cpu_pack = getticks(); diff --git a/src/scheduler.c b/src/scheduler.c index fe44db8404..65c36820d7 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -1369,7 +1369,8 @@ static void scheduler_splittask_hydro(struct task *t, struct scheduler *s) { } else if 
(scheduler_doforcesplit && ci->split && cj->split && (ci->hydro.count > space_maxsize / cj->hydro.count)) { - /* Replace the current task. */ + /* Replace the current task.A. Nasar: Code does NOT go in here even + * with doforcesplit defined as 1 in scheduler.h */ t->type = task_type_none; for (int j = 0; j < 8; j++) @@ -1507,6 +1508,11 @@ static void scheduler_splittask_hydro_GPU(struct task *t, struct scheduler *s) { t->flags); #endif + if((cell_can_split_pair_hydro_task(ci) && + !cell_can_split_pair_hydro_task(cj)) + || !cell_can_split_pair_hydro_task(ci) && + cell_can_split_pair_hydro_task(cj)) + error("for some reason cell i can be split and cell j not"); /* Should this task be split-up? */ if (cell_can_split_pair_hydro_task(ci) && cell_can_split_pair_hydro_task(cj)) { From 4063e37ed01e2120c03b10a220ee81bb7b3c61cf Mon Sep 17 00:00:00 2001 From: Abouzied Date: Wed, 18 Dec 2024 10:37:50 +0000 Subject: [PATCH 104/217] Added some comments --- src/runner_main_clean.cu | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index bd5465b76e..85495bded7 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -663,15 +663,18 @@ void *runner_main2(void *data) { float eta_neighbours = e->s->eta_neighbours; int np_per_cell = ceil(2.0 * eta_neighbours); np_per_cell *= np_per_cell * np_per_cell; - int buff = ceil(0.5 * np_per_cell); /*A. Nasar: Increase parts per recursed task-level cell by buffer to - ensure we allocate enough memory*/ + /*A. Nasar: Increase parts per recursed task-level cell by buffer to + ensure we allocate enough memory*/ + int buff = ceil(0.5 * np_per_cell); int tot_self_tasks = space->nr_parts / np_per_cell; -// int count_max_parts_tmp = -// 2 * target_n_tasks * space->nr_parts * nr_nodes / space->tot_cells; + /*A. Nasar: Multiplication by 2 is also to ensure we do not over-run + * the allocated memory on buffers and GPU. This can happen if calculated h is + * larger than cell width and splitting makes bigger than target cells*/ int count_max_parts_tmp = 2 * target_n_tasks * (np_per_cell + buff); + message("max_parts %i, n_tasks_GPU %i\n", count_max_parts_tmp, target_n_tasks); pack_vars_self_dens->count_max_parts = count_max_parts_tmp; pack_vars_pair_dens->count_max_parts = count_max_parts_tmp; From fd849fcb6b8ab49e4dd9aad76471402e5ad6e5b4 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Wed, 18 Dec 2024 10:50:35 +0000 Subject: [PATCH 105/217] Okay. Now back to stealing. Code hangs when stealing enabled --- examples/HydroTests/GreshoVortex_3D/gresho.yml | 2 +- src/engine_config.c | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/HydroTests/GreshoVortex_3D/gresho.yml b/examples/HydroTests/GreshoVortex_3D/gresho.yml index 1c28f75b51..8cf4bf1316 100644 --- a/examples/HydroTests/GreshoVortex_3D/gresho.yml +++ b/examples/HydroTests/GreshoVortex_3D/gresho.yml @@ -38,4 +38,4 @@ SPH: InitialConditions: file_name: ./greshoVortex.hdf5 # The file to read periodic: 1 - replicate: 2 +# replicate: 2 diff --git a/src/engine_config.c b/src/engine_config.c index 70d3cfd1d4..ff3ff5ec9f 100644 --- a/src/engine_config.c +++ b/src/engine_config.c @@ -923,12 +923,12 @@ void engine_config(int restart, int fof, struct engine *e, parser_get_opt_param_float(params, "Scheduler:links_per_tasks", 25.); /* Init the scheduler. 
Allow stealing*/ -// scheduler_init(&e->sched, e->s, maxtasks, nr_queues, -// (e->policy & scheduler_flag_steal), e->nodeID, -// &e->threadpool); + scheduler_init(&e->sched, e->s, maxtasks, nr_queues, + (e->policy & scheduler_flag_steal), e->nodeID, + &e->threadpool); /* Init the scheduler. NO stealing A. Nasar */ - scheduler_init(&e->sched, e->s, maxtasks, nr_queues, 0, e->nodeID, - &e->threadpool); +// scheduler_init(&e->sched, e->s, maxtasks, nr_queues, 0, e->nodeID, +// &e->threadpool); /* Maximum size of MPI task messages, in KB, that should not be buffered, * that is sent using MPI_Issend, not MPI_Isend. 4Mb by default. Can be From fd9cfa0eca6b71de241888f473c3495ffb3ace9e Mon Sep 17 00:00:00 2001 From: Abouzied Date: Wed, 18 Dec 2024 16:57:13 +0000 Subject: [PATCH 106/217] Testing to see what happens when only self-dens tasks are done on GPU. Code works fine and does not hang with stealing enabled. Issue is with pair tasks... --- src/engine_maketasks.c | 12 ++- src/runner_main_clean.cu | 215 +++++++++++++++++++-------------------- src/scheduler.c | 17 ++-- 3 files changed, 125 insertions(+), 119 deletions(-) diff --git a/src/engine_maketasks.c b/src/engine_maketasks.c index c02b1fc4db..3e80e14b8b 100644 --- a/src/engine_maketasks.c +++ b/src/engine_maketasks.c @@ -5327,9 +5327,9 @@ void engine_maketasks(struct engine *e) { for (int i = 0; i < sched->nr_tasks; i++) { struct task *t = &sched->tasks[i]; - if (t->subtype == task_subtype_density || - t->subtype == task_subtype_gradient || - t->subtype == task_subtype_force){ + if (t->subtype == task_subtype_density){// || +// t->subtype == task_subtype_gradient || +// t->subtype == task_subtype_force){ t->implicit = 1; } // if (t->subtype == task_subtype_gpu_pack || @@ -5340,6 +5340,12 @@ void engine_maketasks(struct engine *e) { // t->subtype == task_subtype_gpu_unpack_f){ // t->implicit = 1; // } + if (t->subtype == task_subtype_gpu_pack_g || + t->subtype == task_subtype_gpu_pack_f || + t->subtype == task_subtype_gpu_unpack_g || + t->subtype == task_subtype_gpu_unpack_f){ + t->implicit = 1; + } // if ((t->subtype == task_subtype_gpu_pack || // t->subtype == task_subtype_gpu_pack_g || // t->subtype == task_subtype_gpu_pack_f) && diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 85495bded7..0e996a0c41 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -20,8 +20,8 @@ ******************************************************************************/ /* Config parameters. */ #define GPUOFFLOAD_DENSITY 1 // off-load hydro density to GPU -#define GPUOFFLOAD_GRADIENT 1 // off-load hydro density to GPU -#define GPUOFFLOAD_FORCE 1 // off-load hydro density to GPU +//#define GPUOFFLOAD_GRADIENT 1 // off-load hydro density to GPU +//#define GPUOFFLOAD_FORCE 1 // off-load hydro density to GPU // #define DO_CORNERS 1 //do corner pair tasks on CPU // #define DUMP_TIMINGS 1 @@ -998,10 +998,6 @@ void *runner_main2(void *data) { TIMER_TIC t = scheduler_gettask(sched, r->qid, prev); TIMER_TOC(timer_gettask); -// hang_time += getticks() - tic_get_task; -// if (hang_time > 1000000000000000){ -// message("I'm stuck runner %i", r->cpuid); -// } /* Did I get anything? 
*/ if (t == NULL) break; } @@ -1256,7 +1252,7 @@ void *runner_main2(void *data) { /* Abouzied: To be commented out when the GPU pairs have been coded * up */ cpu_pair++; -#ifndef GPUOFFLOAD_DENSITY +//#ifndef GPUOFFLOAD_DENSITY struct timespec t0, t1, dt; clock_gettime(CLOCK_REALTIME, &t0); runner_dopair1_branch_density(r, ci, cj); @@ -1265,112 +1261,112 @@ void *runner_main2(void *data) { time_for_density_cpu_pair += (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; -#endif +//#endif } /* GPU WORK */ else if (t->subtype == task_subtype_gpu_pack) { packed_pair++; #ifdef GPUOFFLOAD_DENSITY -#ifdef DO_CORNERS - struct timespec t0, t1, dt; - clock_gettime(CLOCK_REALTIME, &t0); - double shift[3] = {0.0}; - t->corner_pair = 0; - int sid = space_getsid_filter(e->s, &ci, &cj, shift); - clock_gettime(CLOCK_REALTIME, &t1); - packing_time_pair += (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - if ((sid == 0 || sid == 2 || sid == 6 || sid == 8) && step > 1) { - // if((sid != 4 && sid != 10 && sid == 12) && - // step > 1){ - clock_gettime(CLOCK_REALTIME, &t0); - runner_dopair1_branch_density(r, ci, cj); - t->corner_pair = 1; - int qid = r->qid; - atomic_dec(&(sched->queues[qid].n_packs_pair_left)); - /* Tell the cells they have been packed */ - ci->pack_done++; - cj->pack_done++; - t->done = 1; - int launch = 0, launch_leftovers = 0; - if ((sched->queues[qid].n_packs_pair_left == 0)) - launch_leftovers = 1; - /* Tasks done. Release the lock ! */ - task_unlock(t); - /*schedule my dependencies (Only unpacks really)*/ - enqueue_dependencies(sched, t); - /*Signal sleeping runners*/ - signal_sleeping_runners(sched, t); - clock_gettime(CLOCK_REALTIME, &t1); - packing_time_pair += (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - if (launch_leftovers) { - pack_vars_pair_dens->launch_leftovers = 1; - runner_dopair1_launch_f4_one_memcpy( - r, sched, pack_vars_pair_dens, t, parts_aos_pair_f4_send, - parts_aos_pair_f4_recv, d_parts_aos_pair_f4_send, - d_parts_aos_pair_f4_recv, stream_pairs, d_a, d_H, e, - &packing_time_pair, &time_for_density_gpu_pair, - &unpacking_time_pair, fparti_fpartj_lparti_lpartj_dens, - pair_end); - } - } else { -#endif // DO_CORNERS - - ticks tic_cpu_pack = getticks(); -// if (ci == ci->hydro.super || cj == cj->hydro.super){ -// g100++; -//// message("GPU working on top level cell"); -// } -// if(ci->hydro.count > 2 * np_per_cell){ -// g100++; -// maxcount = max(ci->hydro.count, maxcount); -// error("expecting %i got %i parts" -// "Cell ID %i Cell split %d", -// np_per_cell, ci->hydro.count, ci->cellID, ci->split); +//#ifdef DO_CORNERS +// struct timespec t0, t1, dt; +// clock_gettime(CLOCK_REALTIME, &t0); +// double shift[3] = {0.0}; +// t->corner_pair = 0; +// int sid = space_getsid_filter(e->s, &ci, &cj, shift); +// clock_gettime(CLOCK_REALTIME, &t1); +// packing_time_pair += (t1.tv_sec - t0.tv_sec) + +// (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +// if ((sid == 0 || sid == 2 || sid == 6 || sid == 8) && step > 1) { +// // if((sid != 4 && sid != 10 && sid == 12) && +// // step > 1){ +// clock_gettime(CLOCK_REALTIME, &t0); +// runner_dopair1_branch_density(r, ci, cj); +// t->corner_pair = 1; +// int qid = r->qid; +// atomic_dec(&(sched->queues[qid].n_packs_pair_left)); +// /* Tell the cells they have been packed */ +// ci->pack_done++; +// cj->pack_done++; +// t->done = 1; +// int launch = 0, launch_leftovers = 0; +// if ((sched->queues[qid].n_packs_pair_left == 0)) +// launch_leftovers = 1; +// /* Tasks done. 
Release the lock ! */ +// task_unlock(t); +// /*schedule my dependencies (Only unpacks really)*/ +// enqueue_dependencies(sched, t); +// /*Signal sleeping runners*/ +// signal_sleeping_runners(sched, t); +// clock_gettime(CLOCK_REALTIME, &t1); +// packing_time_pair += (t1.tv_sec - t0.tv_sec) + +// (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; +// if (launch_leftovers) { +// pack_vars_pair_dens->launch_leftovers = 1; +// runner_dopair1_launch_f4_one_memcpy( +// r, sched, pack_vars_pair_dens, t, parts_aos_pair_f4_send, +// parts_aos_pair_f4_recv, d_parts_aos_pair_f4_send, +// d_parts_aos_pair_f4_recv, stream_pairs, d_a, d_H, e, +// &packing_time_pair, &time_for_density_gpu_pair, +// &unpacking_time_pair, fparti_fpartj_lparti_lpartj_dens, +// pair_end); // } -// if(ci->hydro.count < 2 * np_per_cell){ -// g100++; -// maxcount = max(ci->hydro.count, maxcount); -// error("expecting %i got %i parts" -// "Cell ID %i Cell split %d", -// np_per_cell, ci->hydro.count, ci->cellID, ci->split); +// } else { +//#endif // DO_CORNERS +// +// ticks tic_cpu_pack = getticks(); +//// if (ci == ci->hydro.super || cj == cj->hydro.super){ +//// g100++; +////// message("GPU working on top level cell"); +//// } +//// if(ci->hydro.count > 2 * np_per_cell){ +//// g100++; +//// maxcount = max(ci->hydro.count, maxcount); +//// error("expecting %i got %i parts" +//// "Cell ID %i Cell split %d", +//// np_per_cell, ci->hydro.count, ci->cellID, ci->split); +//// } +//// if(ci->hydro.count < 2 * np_per_cell){ +//// g100++; +//// maxcount = max(ci->hydro.count, maxcount); +//// error("expecting %i got %i parts" +//// "Cell ID %i Cell split %d", +//// np_per_cell, ci->hydro.count, ci->cellID, ci->split); +//// } +//// else l100++; +// packing_time_pair += runner_dopair1_pack_f4( +// r, sched, pack_vars_pair_dens, ci, cj, t, +// parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens); +// +// t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; +// /* Packed enough tasks or no pack tasks left in queue, flag that +// * we want to run */ +// int launch = pack_vars_pair_dens->launch; +// int launch_leftovers = pack_vars_pair_dens->launch_leftovers; +// +// /* Do we have enough stuff to run the GPU ? */ +// if (launch) n_full_p_d_bundles++; +// if (launch_leftovers) n_partial_p_d_bundles++; +// if (launch || launch_leftovers) { +// +// /*Launch GPU tasks*/ +// // runner_dopair1_launch(r, sched, +// // pack_vars_pair_dens, ci, t, parts_aos_pair_dens, +// // d_parts_aos_pair_dens, +// // stream, d_a, d_H, e, &packing_time_pair, +// //&time_for_density_gpu_pair); +// int t_packed = pack_vars_pair_dens->tasks_packed; +// signal_sleeping_runners(sched, t, t_packed); +// runner_dopair1_launch_f4_one_memcpy( +// r, sched, pack_vars_pair_dens, t, parts_aos_pair_f4_send, +// parts_aos_pair_f4_recv, d_parts_aos_pair_f4_send, +// d_parts_aos_pair_f4_recv, stream_pairs, d_a, d_H, e, +// &packing_time_pair, &time_for_density_gpu_pair, +// &unpacking_time_pair, fparti_fpartj_lparti_lpartj_dens, +// pair_end); // } -// else l100++; - packing_time_pair += runner_dopair1_pack_f4( - r, sched, pack_vars_pair_dens, ci, cj, t, - parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens); - - t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; - /* Packed enough tasks or no pack tasks left in queue, flag that - * we want to run */ - int launch = pack_vars_pair_dens->launch; - int launch_leftovers = pack_vars_pair_dens->launch_leftovers; - - /* Do we have enough stuff to run the GPU ? 
*/ - if (launch) n_full_p_d_bundles++; - if (launch_leftovers) n_partial_p_d_bundles++; - if (launch || launch_leftovers) { - - /*Launch GPU tasks*/ - // runner_dopair1_launch(r, sched, - // pack_vars_pair_dens, ci, t, parts_aos_pair_dens, - // d_parts_aos_pair_dens, - // stream, d_a, d_H, e, &packing_time_pair, - //&time_for_density_gpu_pair); - int t_packed = pack_vars_pair_dens->tasks_packed; - signal_sleeping_runners(sched, t, t_packed); - runner_dopair1_launch_f4_one_memcpy( - r, sched, pack_vars_pair_dens, t, parts_aos_pair_f4_send, - parts_aos_pair_f4_recv, d_parts_aos_pair_f4_send, - d_parts_aos_pair_f4_recv, stream_pairs, d_a, d_H, e, - &packing_time_pair, &time_for_density_gpu_pair, - &unpacking_time_pair, fparti_fpartj_lparti_lpartj_dens, - pair_end); - } -#ifdef DO_CORNERS - } /* End of GPU work Pairs */ -#endif // DO_CORNERS +//#ifdef DO_CORNERS +// } /* End of GPU work Pairs */ +//#endif // DO_CORNERS #endif // GPUOFFLOAD_DENSITY } /* pair / pack */ else if (t->subtype == task_subtype_gpu_pack_g) { @@ -1962,7 +1958,7 @@ void *runner_main2(void *data) { /* We're done with this task, see if we get a next one. */ prev = t; - if (t->subtype == task_subtype_gpu_pack) { + if (t->subtype == task_subtype_gpu_pack && t->type == task_type_self) { #ifdef GPUOFFLOAD_DENSITY /* Don't enqueue unpacks yet. Just signal the runners */ t->skip = 1; @@ -1973,7 +1969,10 @@ void *runner_main2(void *data) { t = scheduler_done(sched, t); #endif } - + + else if (t->subtype == task_subtype_gpu_pack && t->type == task_type_pair) { + t = scheduler_done(sched, t); + } else if (t->subtype == task_subtype_gpu_pack_g) { #ifdef GPUOFFLOAD_GRADIENT /* Don't enqueue unpacks yet. Just signal the runners */ diff --git a/src/scheduler.c b/src/scheduler.c index 65c36820d7..132b42f87b 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -3338,7 +3338,7 @@ struct task *scheduler_gettask(struct scheduler *s, int qid, * falg set to zero for GPU work*/ if (s->flags & scheduler_flag_steal) { - int count = 0, qids[nr_queues]; + int count = 0, qids[nr_queues], act_qids[nr_queues]; /* Make list of queues that have 1 or more tasks in them */ for (int k = 0; k < nr_queues; k++) { @@ -3354,7 +3354,8 @@ struct task *scheduler_gettask(struct scheduler *s, int qid, /* Try to get a task from that random queue */ TIMER_TIC; - res = queue_gettask(&s->queues[qids[ind]], prev, 0); + int qstl = qids[ind]; + res = queue_gettask(&s->queues[qstl], prev, 0); TIMER_TOC(timer_qsteal); /* Lucky? 
*/ @@ -3369,32 +3370,32 @@ struct task *scheduler_gettask(struct scheduler *s, int qid, if ((res->type == task_type_self)&& res->subtype == task_subtype_gpu_pack) { atomic_inc(&s->queues[qid].n_packs_self_left); - atomic_dec(&s->queues[qids[ind]].n_packs_self_left); + atomic_dec(&s->queues[qstl].n_packs_self_left); } if ((res->type == task_type_self)&& res->subtype == task_subtype_gpu_pack_g) { atomic_inc(&s->queues[qid].n_packs_self_left_g); - atomic_dec(&s->queues[qids[ind]].n_packs_self_left_g); + atomic_dec(&s->queues[qstl].n_packs_self_left_g); } if ((res->type == task_type_self)&& res->subtype == task_subtype_gpu_pack_f) { atomic_inc(&s->queues[qid].n_packs_self_left_f); - atomic_dec(&s->queues[qids[ind]].n_packs_self_left_f); + atomic_dec(&s->queues[qstl].n_packs_self_left_f); } if ((res->type == task_type_pair)&& res->subtype == task_subtype_gpu_pack) { atomic_inc(&s->queues[qid].n_packs_pair_left); - atomic_dec(&s->queues[qids[ind]].n_packs_pair_left); + atomic_dec(&s->queues[qstl].n_packs_pair_left); } if ((res->type == task_type_pair)&& res->subtype == task_subtype_gpu_pack_g) { atomic_inc(&s->queues[qid].n_packs_pair_left_g); - atomic_dec(&s->queues[qids[ind]].n_packs_pair_left_g); + atomic_dec(&s->queues[qstl].n_packs_pair_left_g); } if ((res->type == task_type_pair)&& res->subtype == task_subtype_gpu_pack_f) { atomic_inc(&s->queues[qid].n_packs_pair_left_f); - atomic_dec(&s->queues[qids[ind]].n_packs_pair_left_f); + atomic_dec(&s->queues[qstl].n_packs_pair_left_f); } /* Run with the task */ From e9cb4dd221086f1d0a6c25aa5bd0b4dd3f3a64f0 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Thu, 19 Dec 2024 11:58:38 +0000 Subject: [PATCH 107/217] Reverted back to offloading only density tasks (self and pair) --- src/runner_main_clean.cu | 207 +++++++++++++++++++-------------------- 1 file changed, 102 insertions(+), 105 deletions(-) diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 0e996a0c41..00ca42d63b 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -1252,7 +1252,7 @@ void *runner_main2(void *data) { /* Abouzied: To be commented out when the GPU pairs have been coded * up */ cpu_pair++; -//#ifndef GPUOFFLOAD_DENSITY +#ifndef GPUOFFLOAD_DENSITY struct timespec t0, t1, dt; clock_gettime(CLOCK_REALTIME, &t0); runner_dopair1_branch_density(r, ci, cj); @@ -1261,112 +1261,112 @@ void *runner_main2(void *data) { time_for_density_cpu_pair += (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; -//#endif +#endif } /* GPU WORK */ else if (t->subtype == task_subtype_gpu_pack) { packed_pair++; #ifdef GPUOFFLOAD_DENSITY -//#ifdef DO_CORNERS -// struct timespec t0, t1, dt; -// clock_gettime(CLOCK_REALTIME, &t0); -// double shift[3] = {0.0}; -// t->corner_pair = 0; -// int sid = space_getsid_filter(e->s, &ci, &cj, shift); -// clock_gettime(CLOCK_REALTIME, &t1); -// packing_time_pair += (t1.tv_sec - t0.tv_sec) + -// (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; -// if ((sid == 0 || sid == 2 || sid == 6 || sid == 8) && step > 1) { -// // if((sid != 4 && sid != 10 && sid == 12) && -// // step > 1){ -// clock_gettime(CLOCK_REALTIME, &t0); -// runner_dopair1_branch_density(r, ci, cj); -// t->corner_pair = 1; -// int qid = r->qid; -// atomic_dec(&(sched->queues[qid].n_packs_pair_left)); -// /* Tell the cells they have been packed */ -// ci->pack_done++; -// cj->pack_done++; -// t->done = 1; -// int launch = 0, launch_leftovers = 0; -// if ((sched->queues[qid].n_packs_pair_left == 0)) -// launch_leftovers = 1; -// /* Tasks done. Release the lock ! 
*/ -// task_unlock(t); -// /*schedule my dependencies (Only unpacks really)*/ -// enqueue_dependencies(sched, t); -// /*Signal sleeping runners*/ -// signal_sleeping_runners(sched, t); -// clock_gettime(CLOCK_REALTIME, &t1); -// packing_time_pair += (t1.tv_sec - t0.tv_sec) + -// (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; -// if (launch_leftovers) { -// pack_vars_pair_dens->launch_leftovers = 1; -// runner_dopair1_launch_f4_one_memcpy( -// r, sched, pack_vars_pair_dens, t, parts_aos_pair_f4_send, -// parts_aos_pair_f4_recv, d_parts_aos_pair_f4_send, -// d_parts_aos_pair_f4_recv, stream_pairs, d_a, d_H, e, -// &packing_time_pair, &time_for_density_gpu_pair, -// &unpacking_time_pair, fparti_fpartj_lparti_lpartj_dens, -// pair_end); +#ifdef DO_CORNERS + struct timespec t0, t1, dt; + clock_gettime(CLOCK_REALTIME, &t0); + double shift[3] = {0.0}; + t->corner_pair = 0; + int sid = space_getsid_filter(e->s, &ci, &cj, shift); + clock_gettime(CLOCK_REALTIME, &t1); + packing_time_pair += (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + if ((sid == 0 || sid == 2 || sid == 6 || sid == 8) && step > 1) { + // if((sid != 4 && sid != 10 && sid == 12) && + // step > 1){ + clock_gettime(CLOCK_REALTIME, &t0); + runner_dopair1_branch_density(r, ci, cj); + t->corner_pair = 1; + int qid = r->qid; + atomic_dec(&(sched->queues[qid].n_packs_pair_left)); + /* Tell the cells they have been packed */ + ci->pack_done++; + cj->pack_done++; + t->done = 1; + int launch = 0, launch_leftovers = 0; + if ((sched->queues[qid].n_packs_pair_left == 0)) + launch_leftovers = 1; + /* Tasks done. Release the lock ! */ + task_unlock(t); + /*schedule my dependencies (Only unpacks really)*/ + enqueue_dependencies(sched, t); + /*Signal sleeping runners*/ + signal_sleeping_runners(sched, t); + clock_gettime(CLOCK_REALTIME, &t1); + packing_time_pair += (t1.tv_sec - t0.tv_sec) + + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + if (launch_leftovers) { + pack_vars_pair_dens->launch_leftovers = 1; + runner_dopair1_launch_f4_one_memcpy( + r, sched, pack_vars_pair_dens, t, parts_aos_pair_f4_send, + parts_aos_pair_f4_recv, d_parts_aos_pair_f4_send, + d_parts_aos_pair_f4_recv, stream_pairs, d_a, d_H, e, + &packing_time_pair, &time_for_density_gpu_pair, + &unpacking_time_pair, fparti_fpartj_lparti_lpartj_dens, + pair_end); + } + } else { +#endif // DO_CORNERS + + ticks tic_cpu_pack = getticks(); +// if (ci == ci->hydro.super || cj == cj->hydro.super){ +// g100++; +//// message("GPU working on top level cell"); +// } +// if(ci->hydro.count > 2 * np_per_cell){ +// g100++; +// maxcount = max(ci->hydro.count, maxcount); +// error("expecting %i got %i parts" +// "Cell ID %i Cell split %d", +// np_per_cell, ci->hydro.count, ci->cellID, ci->split); // } -// } else { -//#endif // DO_CORNERS -// -// ticks tic_cpu_pack = getticks(); -//// if (ci == ci->hydro.super || cj == cj->hydro.super){ -//// g100++; -////// message("GPU working on top level cell"); -//// } -//// if(ci->hydro.count > 2 * np_per_cell){ -//// g100++; -//// maxcount = max(ci->hydro.count, maxcount); -//// error("expecting %i got %i parts" -//// "Cell ID %i Cell split %d", -//// np_per_cell, ci->hydro.count, ci->cellID, ci->split); -//// } -//// if(ci->hydro.count < 2 * np_per_cell){ -//// g100++; -//// maxcount = max(ci->hydro.count, maxcount); -//// error("expecting %i got %i parts" -//// "Cell ID %i Cell split %d", -//// np_per_cell, ci->hydro.count, ci->cellID, ci->split); -//// } -//// else l100++; -// packing_time_pair += runner_dopair1_pack_f4( -// r, sched, 
pack_vars_pair_dens, ci, cj, t, -// parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens); -// -// t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; -// /* Packed enough tasks or no pack tasks left in queue, flag that -// * we want to run */ -// int launch = pack_vars_pair_dens->launch; -// int launch_leftovers = pack_vars_pair_dens->launch_leftovers; -// -// /* Do we have enough stuff to run the GPU ? */ -// if (launch) n_full_p_d_bundles++; -// if (launch_leftovers) n_partial_p_d_bundles++; -// if (launch || launch_leftovers) { -// -// /*Launch GPU tasks*/ -// // runner_dopair1_launch(r, sched, -// // pack_vars_pair_dens, ci, t, parts_aos_pair_dens, -// // d_parts_aos_pair_dens, -// // stream, d_a, d_H, e, &packing_time_pair, -// //&time_for_density_gpu_pair); -// int t_packed = pack_vars_pair_dens->tasks_packed; -// signal_sleeping_runners(sched, t, t_packed); -// runner_dopair1_launch_f4_one_memcpy( -// r, sched, pack_vars_pair_dens, t, parts_aos_pair_f4_send, -// parts_aos_pair_f4_recv, d_parts_aos_pair_f4_send, -// d_parts_aos_pair_f4_recv, stream_pairs, d_a, d_H, e, -// &packing_time_pair, &time_for_density_gpu_pair, -// &unpacking_time_pair, fparti_fpartj_lparti_lpartj_dens, -// pair_end); +// if(ci->hydro.count < 2 * np_per_cell){ +// g100++; +// maxcount = max(ci->hydro.count, maxcount); +// error("expecting %i got %i parts" +// "Cell ID %i Cell split %d", +// np_per_cell, ci->hydro.count, ci->cellID, ci->split); // } -//#ifdef DO_CORNERS -// } /* End of GPU work Pairs */ -//#endif // DO_CORNERS +// else l100++; + packing_time_pair += runner_dopair1_pack_f4( + r, sched, pack_vars_pair_dens, ci, cj, t, + parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens); + + t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; + /* Packed enough tasks or no pack tasks left in queue, flag that + * we want to run */ + int launch = pack_vars_pair_dens->launch; + int launch_leftovers = pack_vars_pair_dens->launch_leftovers; + + /* Do we have enough stuff to run the GPU ? */ + if (launch) n_full_p_d_bundles++; + if (launch_leftovers) n_partial_p_d_bundles++; + if (launch || launch_leftovers) { + + /*Launch GPU tasks*/ + // runner_dopair1_launch(r, sched, + // pack_vars_pair_dens, ci, t, parts_aos_pair_dens, + // d_parts_aos_pair_dens, + // stream, d_a, d_H, e, &packing_time_pair, + //&time_for_density_gpu_pair); + int t_packed = pack_vars_pair_dens->tasks_packed; + signal_sleeping_runners(sched, t, t_packed); + runner_dopair1_launch_f4_one_memcpy( + r, sched, pack_vars_pair_dens, t, parts_aos_pair_f4_send, + parts_aos_pair_f4_recv, d_parts_aos_pair_f4_send, + d_parts_aos_pair_f4_recv, stream_pairs, d_a, d_H, e, + &packing_time_pair, &time_for_density_gpu_pair, + &unpacking_time_pair, fparti_fpartj_lparti_lpartj_dens, + pair_end); + } +#ifdef DO_CORNERS + } /* End of GPU work Pairs */ +#endif // DO_CORNERS #endif // GPUOFFLOAD_DENSITY } /* pair / pack */ else if (t->subtype == task_subtype_gpu_pack_g) { @@ -1957,8 +1957,8 @@ void *runner_main2(void *data) { /* We're done with this task, see if we get a next one. */ prev = t; - - if (t->subtype == task_subtype_gpu_pack && t->type == task_type_self) { + + if (t->subtype == task_subtype_gpu_pack) { #ifdef GPUOFFLOAD_DENSITY /* Don't enqueue unpacks yet. 
Just signal the runners */ t->skip = 1; @@ -1970,9 +1970,6 @@ void *runner_main2(void *data) { #endif } - else if (t->subtype == task_subtype_gpu_pack && t->type == task_type_pair) { - t = scheduler_done(sched, t); - } else if (t->subtype == task_subtype_gpu_pack_g) { #ifdef GPUOFFLOAD_GRADIENT /* Don't enqueue unpacks yet. Just signal the runners */ From e94fe6e7548081492d6e31889e9da50b8e9946ec Mon Sep 17 00:00:00 2001 From: Abouzied Date: Thu, 19 Dec 2024 12:26:25 +0000 Subject: [PATCH 108/217] Weirdly code does not hang when using bigger test case and only density pairs and selfs, in comparison to same setup but with smaller test case --- examples/HydroTests/GreshoVortex_3D/gresho.yml | 4 ++-- src/engine_maketasks.c | 6 ++++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/examples/HydroTests/GreshoVortex_3D/gresho.yml b/examples/HydroTests/GreshoVortex_3D/gresho.yml index 8cf4bf1316..49c2fe7ccb 100644 --- a/examples/HydroTests/GreshoVortex_3D/gresho.yml +++ b/examples/HydroTests/GreshoVortex_3D/gresho.yml @@ -8,7 +8,7 @@ InternalUnitSystem: Scheduler: max_top_level_cells: 8 - tasks_per_cell: 1000 + tasks_per_cell: 200 deadlock_waiting_time_s: 10 cell_split_size: 80 # Parameters governing the time integration @@ -38,4 +38,4 @@ SPH: InitialConditions: file_name: ./greshoVortex.hdf5 # The file to read periodic: 1 -# replicate: 2 + replicate: 2 diff --git a/src/engine_maketasks.c b/src/engine_maketasks.c index 3e80e14b8b..dae3e4dc5f 100644 --- a/src/engine_maketasks.c +++ b/src/engine_maketasks.c @@ -5323,7 +5323,7 @@ void engine_maketasks(struct engine *e) { message("took %.3f %s (including reweight).", clocks_from_ticks(getticks() - tic), clocks_getunit()); - /* Loop over all the CPU hydro tasks to make implicit*/ + /* Loop over all the CPU hydro tasks to make implicit (needs threadmapping)*/ for (int i = 0; i < sched->nr_tasks; i++) { struct task *t = &sched->tasks[i]; @@ -5343,7 +5343,9 @@ void engine_maketasks(struct engine *e) { if (t->subtype == task_subtype_gpu_pack_g || t->subtype == task_subtype_gpu_pack_f || t->subtype == task_subtype_gpu_unpack_g || - t->subtype == task_subtype_gpu_unpack_f){ + t->subtype == task_subtype_gpu_unpack_f){// || +// (t->type == task_type_pair && +// t->subtype == task_subtype_gpu_pack)){ t->implicit = 1; } // if ((t->subtype == task_subtype_gpu_pack || From 0e3196dbcee7bffb4b516a0016a23c7ebca6f77f Mon Sep 17 00:00:00 2001 From: Abouzied Date: Thu, 19 Dec 2024 12:44:32 +0000 Subject: [PATCH 109/217] Reverted to doing all tasks on GPU. 
Need to test on GHopper --- src/engine_maketasks.c | 22 +++++++++++----------- src/runner_main_clean.cu | 4 ++-- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/engine_maketasks.c b/src/engine_maketasks.c index dae3e4dc5f..8648d75dd5 100644 --- a/src/engine_maketasks.c +++ b/src/engine_maketasks.c @@ -5327,9 +5327,9 @@ void engine_maketasks(struct engine *e) { for (int i = 0; i < sched->nr_tasks; i++) { struct task *t = &sched->tasks[i]; - if (t->subtype == task_subtype_density){// || -// t->subtype == task_subtype_gradient || -// t->subtype == task_subtype_force){ + if (t->subtype == task_subtype_density || + t->subtype == task_subtype_gradient || + t->subtype == task_subtype_force){ t->implicit = 1; } // if (t->subtype == task_subtype_gpu_pack || @@ -5340,14 +5340,14 @@ void engine_maketasks(struct engine *e) { // t->subtype == task_subtype_gpu_unpack_f){ // t->implicit = 1; // } - if (t->subtype == task_subtype_gpu_pack_g || - t->subtype == task_subtype_gpu_pack_f || - t->subtype == task_subtype_gpu_unpack_g || - t->subtype == task_subtype_gpu_unpack_f){// || -// (t->type == task_type_pair && -// t->subtype == task_subtype_gpu_pack)){ - t->implicit = 1; - } +// if (t->subtype == task_subtype_gpu_pack_g || +// t->subtype == task_subtype_gpu_pack_f || +// t->subtype == task_subtype_gpu_unpack_g || +// t->subtype == task_subtype_gpu_unpack_f){// || +//// (t->type == task_type_pair && +//// t->subtype == task_subtype_gpu_pack)){ +// t->implicit = 1; +// } // if ((t->subtype == task_subtype_gpu_pack || // t->subtype == task_subtype_gpu_pack_g || // t->subtype == task_subtype_gpu_pack_f) && diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 00ca42d63b..36ac81dbf1 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -20,8 +20,8 @@ ******************************************************************************/ /* Config parameters. */ #define GPUOFFLOAD_DENSITY 1 // off-load hydro density to GPU -//#define GPUOFFLOAD_GRADIENT 1 // off-load hydro density to GPU -//#define GPUOFFLOAD_FORCE 1 // off-load hydro density to GPU +#define GPUOFFLOAD_GRADIENT 1 // off-load hydro density to GPU +#define GPUOFFLOAD_FORCE 1 // off-load hydro density to GPU // #define DO_CORNERS 1 //do corner pair tasks on CPU // #define DUMP_TIMINGS 1 From 3dfb81c63e26f936b1de77b60ce74e626ac2993f Mon Sep 17 00:00:00 2001 From: Abouzied Date: Thu, 19 Dec 2024 13:56:11 +0000 Subject: [PATCH 110/217] Deleted obsolete debug code --- src/scheduler.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/scheduler.c b/src/scheduler.c index 132b42f87b..4de81f5f78 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -2818,10 +2818,6 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) { } else if (t->subtype == task_subtype_gpu_pack) { // A. 
Nasar qid = t->ci->hydro.super->owner; owner = &t->ci->hydro.super->owner; - // fprintf(stderr,"nqueues %i waiting %i active_count %i\n", - // s->nr_queues, s->waiting, s->active_count); - // if(qid==-1)fprintf(stderr,"queue id is negative\n"); - // else fprintf(stderr,"queue id is %i\n", qid); } else if (t->subtype == task_subtype_gpu_pack_f) { qid = t->ci->hydro.super->owner; owner = &t->ci->hydro.super->owner; From 57cb5d9b5195a136fcee25120ef7111c8596077f Mon Sep 17 00:00:00 2001 From: Abouzied Date: Thu, 19 Dec 2024 13:56:30 +0000 Subject: [PATCH 111/217] Deleted obsolete debug code --- src/scheduler.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/scheduler.c b/src/scheduler.c index 4de81f5f78..2abc8fb499 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -2825,7 +2825,6 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) { qid = t->ci->hydro.super->owner; owner = &t->ci->hydro.super->owner; } else if (t->subtype == task_subtype_gpu_unpack) { - //// qid = t->ci->owner; qid = -1; } else if (t->subtype == task_subtype_gpu_unpack_f) { qid = -1; From 02e18e47711951af50e2252eede6e9e1c789401b Mon Sep 17 00:00:00 2001 From: Abouzied Date: Thu, 19 Dec 2024 14:33:57 +0000 Subject: [PATCH 112/217] Put in a weird fix in scheduler_gettask but I think it is wrong and should be replaced with a continue; if the task in unpack --- src/scheduler.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/src/scheduler.c b/src/scheduler.c index 2abc8fb499..aff717e2f5 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -3081,7 +3081,9 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) { /* Increase the waiting counter. */ atomic_inc(&s->waiting); - // A. Nasar Do the same for the pack tasks + /* Insert the task into that queue. */ + queue_insert(&s->queues[qid], t); + /* A. Nasar: Increment counters required for the pack tasks */ if (t->type == task_type_self || t->type == task_type_sub_self) { if (t->subtype == task_subtype_gpu_pack) atomic_inc(&s->queues[qid].n_packs_self_left); @@ -3090,9 +3092,9 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) { if (t->subtype == task_subtype_gpu_pack_g) atomic_inc(&s->queues[qid].n_packs_self_left_g); } - if (t->type == - task_type_pair || t->type == task_type_sub_pair) { // A. Nasar NEED to think about how to do this with - // MPI where ci may not be on this node/rank + /* A. Nasar NEED to think about how to do this with + MPI where ci may not be on this node/rank */ + if (t->type == task_type_pair || t->type == task_type_sub_pair) { if (t->subtype == task_subtype_gpu_pack) { atomic_inc(&s->queues[qid].n_packs_pair_left); } @@ -3103,8 +3105,6 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) { atomic_inc(&s->queues[qid].n_packs_pair_left_g); } } - /* Insert the task into that queue. */ - queue_insert(&s->queues[qid], t); } } @@ -3352,7 +3352,14 @@ struct task *scheduler_gettask(struct scheduler *s, int qid, int qstl = qids[ind]; res = queue_gettask(&s->queues[qstl], prev, 0); TIMER_TOC(timer_qsteal); - + if (res != NULL && (res->subtype == task_subtype_gpu_unpack || + res->subtype == task_subtype_gpu_unpack_f || + res->subtype == task_subtype_gpu_unpack_g)) + /* Reduce the size of the list of non-empty queues */ + qids[ind] = qids[--count]; + //A. Nasar: This should probably be a +// "continue;" + //Instead of reducing size of the list /* Lucky? 
*/ if (res != NULL) { // if (res != NULL && res->subtype != task_subtype_gpu_pack @@ -3396,7 +3403,6 @@ struct task *scheduler_gettask(struct scheduler *s, int qid, /* Run with the task */ break; } else { - /* Reduce the size of the list of non-empty queues */ qids[ind] = qids[--count]; } From 153558ea22c7b0830c5bbbb73bfde1371ccf408f Mon Sep 17 00:00:00 2001 From: Abouzied Date: Thu, 19 Dec 2024 15:17:42 +0000 Subject: [PATCH 113/217] Deleted commented out obsolete code --- src/scheduler.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/src/scheduler.c b/src/scheduler.c index aff717e2f5..4ec5fbe66f 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -3352,22 +3352,18 @@ struct task *scheduler_gettask(struct scheduler *s, int qid, int qstl = qids[ind]; res = queue_gettask(&s->queues[qstl], prev, 0); TIMER_TOC(timer_qsteal); - if (res != NULL && (res->subtype == task_subtype_gpu_unpack || - res->subtype == task_subtype_gpu_unpack_f || - res->subtype == task_subtype_gpu_unpack_g)) + if (res != NULL && (res->type == task_type_pair) && + (res->subtype == task_subtype_gpu_unpack || + res->subtype == task_subtype_gpu_unpack_f || + res->subtype == task_subtype_gpu_unpack_g)){ /* Reduce the size of the list of non-empty queues */ qids[ind] = qids[--count]; //A. Nasar: This should probably be a -// "continue;" + continue; + } //Instead of reducing size of the list /* Lucky? */ if (res != NULL) { -// if (res != NULL && res->subtype != task_subtype_gpu_pack -// && res->subtype != task_subtype_gpu_pack_g -// && res->subtype != task_subtype_gpu_pack_f -// && res->subtype != task_subtype_gpu_unpack -// && res->subtype != task_subtype_gpu_unpack_g -// && res->subtype != task_subtype_gpu_unpack_f) { if ((res->type == task_type_self)&& res->subtype == task_subtype_gpu_pack) { From f71201ef2923146fd05db2fff6cb4357f12641d6 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Thu, 19 Dec 2024 18:00:24 +0000 Subject: [PATCH 114/217] Changed condition to launch_leftovers to if leftover tasks <=1 instead of ==0 for when stealing is active --- .../HydroTests/GreshoVortex_3D/gresho.yml | 2 +- src/engine_maketasks.c | 22 +++++------ src/runner_doiact_functions_hydro_gpu.h | 2 +- src/runner_main_clean.cu | 4 +- src/scheduler.c | 38 +++++++++++++------ 5 files changed, 41 insertions(+), 27 deletions(-) diff --git a/examples/HydroTests/GreshoVortex_3D/gresho.yml b/examples/HydroTests/GreshoVortex_3D/gresho.yml index 49c2fe7ccb..76eb3249a2 100644 --- a/examples/HydroTests/GreshoVortex_3D/gresho.yml +++ b/examples/HydroTests/GreshoVortex_3D/gresho.yml @@ -38,4 +38,4 @@ SPH: InitialConditions: file_name: ./greshoVortex.hdf5 # The file to read periodic: 1 - replicate: 2 +# replicate: 2 diff --git a/src/engine_maketasks.c b/src/engine_maketasks.c index 8648d75dd5..dae3e4dc5f 100644 --- a/src/engine_maketasks.c +++ b/src/engine_maketasks.c @@ -5327,9 +5327,9 @@ void engine_maketasks(struct engine *e) { for (int i = 0; i < sched->nr_tasks; i++) { struct task *t = &sched->tasks[i]; - if (t->subtype == task_subtype_density || - t->subtype == task_subtype_gradient || - t->subtype == task_subtype_force){ + if (t->subtype == task_subtype_density){// || +// t->subtype == task_subtype_gradient || +// t->subtype == task_subtype_force){ t->implicit = 1; } // if (t->subtype == task_subtype_gpu_pack || @@ -5340,14 +5340,14 @@ void engine_maketasks(struct engine *e) { // t->subtype == task_subtype_gpu_unpack_f){ // t->implicit = 1; // } -// if (t->subtype == task_subtype_gpu_pack_g || -// t->subtype == 
task_subtype_gpu_pack_f || -// t->subtype == task_subtype_gpu_unpack_g || -// t->subtype == task_subtype_gpu_unpack_f){// || -//// (t->type == task_type_pair && -//// t->subtype == task_subtype_gpu_pack)){ -// t->implicit = 1; -// } + if (t->subtype == task_subtype_gpu_pack_g || + t->subtype == task_subtype_gpu_pack_f || + t->subtype == task_subtype_gpu_unpack_g || + t->subtype == task_subtype_gpu_unpack_f){// || +// (t->type == task_type_pair && +// t->subtype == task_subtype_gpu_pack)){ + t->implicit = 1; + } // if ((t->subtype == task_subtype_gpu_pack || // t->subtype == task_subtype_gpu_pack_g || // t->subtype == task_subtype_gpu_pack_f) && diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index e6f4f8f47a..5ff3d9c6c1 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -641,7 +641,7 @@ double runner_dopair1_pack_f4(struct runner *r, struct scheduler *s, pack_vars->tasks_packed++; pack_vars->launch = 0; pack_vars->launch_leftovers = 0; - if ((s->queues[qid].n_packs_pair_left == 0)) pack_vars->launch_leftovers = 1; + if ((s->queues[qid].n_packs_pair_left <= 1)) pack_vars->launch_leftovers = 1; if (pack_vars->tasks_packed == pack_vars->target_n_tasks) pack_vars->launch = 1; /*Add time to packing_time. Timer for end of GPU work after the if(launch || diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 36ac81dbf1..00ca42d63b 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -20,8 +20,8 @@ ******************************************************************************/ /* Config parameters. */ #define GPUOFFLOAD_DENSITY 1 // off-load hydro density to GPU -#define GPUOFFLOAD_GRADIENT 1 // off-load hydro density to GPU -#define GPUOFFLOAD_FORCE 1 // off-load hydro density to GPU +//#define GPUOFFLOAD_GRADIENT 1 // off-load hydro density to GPU +//#define GPUOFFLOAD_FORCE 1 // off-load hydro density to GPU // #define DO_CORNERS 1 //do corner pair tasks on CPU // #define DUMP_TIMINGS 1 diff --git a/src/scheduler.c b/src/scheduler.c index 4ec5fbe66f..49a62098e6 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -3352,19 +3352,34 @@ struct task *scheduler_gettask(struct scheduler *s, int qid, int qstl = qids[ind]; res = queue_gettask(&s->queues[qstl], prev, 0); TIMER_TOC(timer_qsteal); - if (res != NULL && (res->type == task_type_pair) && - (res->subtype == task_subtype_gpu_unpack || - res->subtype == task_subtype_gpu_unpack_f || - res->subtype == task_subtype_gpu_unpack_g)){ - /* Reduce the size of the list of non-empty queues */ - qids[ind] = qids[--count]; - //A. Nasar: This should probably be a - continue; - } - //Instead of reducing size of the list + +// if (res != NULL && (res->type == task_type_pair) && +// (res->subtype == task_subtype_gpu_unpack || +// res->subtype == task_subtype_gpu_unpack_f || +// res->subtype == task_subtype_gpu_unpack_g)){ +// continue; +// } + +// if (res != NULL && res->type == task_type_pair && +// res->subtype == task_subtype_gpu_pack && +// s->queues[qstl].n_packs_pair_left < 2){ //Condition we want to avoid is =< 1 +// continue; +// } +// +// if (res != NULL && res->type == task_type_pair && +// res->subtype == task_subtype_gpu_pack_g && +// s->queues[qstl].n_packs_pair_left_g < 2){ +// continue; +// } +// +// if (res != NULL && res->type == task_type_pair && +// res->subtype == task_subtype_gpu_pack_f && +// s->queues[qstl].n_packs_pair_left_f < 2){ +// continue; +// } + /* Lucky? 
*/ if (res != NULL) { - if ((res->type == task_type_self)&& res->subtype == task_subtype_gpu_pack) { atomic_inc(&s->queues[qid].n_packs_self_left); @@ -3395,7 +3410,6 @@ struct task *scheduler_gettask(struct scheduler *s, int qid, atomic_inc(&s->queues[qid].n_packs_pair_left_f); atomic_dec(&s->queues[qstl].n_packs_pair_left_f); } - /* Run with the task */ break; } else { From 707928b60540bb736e93520a81f23107854275df Mon Sep 17 00:00:00 2001 From: Abouzied Date: Thu, 19 Dec 2024 18:02:47 +0000 Subject: [PATCH 115/217] Changed condition to launch_leftovers to if leftover tasks <=1 instead of ==0 for when stealing is active for selfs too. Code still hangs but after 60-70 steps. Not perfect but good enough for the first paper --- src/runner_doiact_functions_hydro_gpu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index 5ff3d9c6c1..be60d1b906 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -205,7 +205,7 @@ double runner_doself1_pack_f4(struct runner *r, struct scheduler *s, pack_vars->tasks_packed++; pack_vars->launch = 0; pack_vars->launch_leftovers = 0; - if ((s->queues[qid].n_packs_self_left == 0)) pack_vars->launch_leftovers = 1; + if ((s->queues[qid].n_packs_self_left < 2)) pack_vars->launch_leftovers = 1; if (pack_vars->tasks_packed == pack_vars->target_n_tasks) pack_vars->launch = 1; /*Add time to packing_time. Timer for end of GPU work after the if(launch || From c5716b6ed6ffff60db2e13b4090bd75334cfcedc Mon Sep 17 00:00:00 2001 From: Abouzied Date: Thu, 19 Dec 2024 18:20:02 +0000 Subject: [PATCH 116/217] Extended <2 condition to other GPU task subtypes but the code now hangs. Need to double check if I put a bug in somewhere or if it is just not good enough to let me run a few steps for the paper --- src/engine_maketasks.c | 22 +++++++++++----------- src/runner_doiact_functions_hydro_gpu.h | 10 +++++----- src/runner_main_clean.cu | 4 ++-- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/src/engine_maketasks.c b/src/engine_maketasks.c index dae3e4dc5f..8648d75dd5 100644 --- a/src/engine_maketasks.c +++ b/src/engine_maketasks.c @@ -5327,9 +5327,9 @@ void engine_maketasks(struct engine *e) { for (int i = 0; i < sched->nr_tasks; i++) { struct task *t = &sched->tasks[i]; - if (t->subtype == task_subtype_density){// || -// t->subtype == task_subtype_gradient || -// t->subtype == task_subtype_force){ + if (t->subtype == task_subtype_density || + t->subtype == task_subtype_gradient || + t->subtype == task_subtype_force){ t->implicit = 1; } // if (t->subtype == task_subtype_gpu_pack || @@ -5340,14 +5340,14 @@ void engine_maketasks(struct engine *e) { // t->subtype == task_subtype_gpu_unpack_f){ // t->implicit = 1; // } - if (t->subtype == task_subtype_gpu_pack_g || - t->subtype == task_subtype_gpu_pack_f || - t->subtype == task_subtype_gpu_unpack_g || - t->subtype == task_subtype_gpu_unpack_f){// || -// (t->type == task_type_pair && -// t->subtype == task_subtype_gpu_pack)){ - t->implicit = 1; - } +// if (t->subtype == task_subtype_gpu_pack_g || +// t->subtype == task_subtype_gpu_pack_f || +// t->subtype == task_subtype_gpu_unpack_g || +// t->subtype == task_subtype_gpu_unpack_f){// || +//// (t->type == task_type_pair && +//// t->subtype == task_subtype_gpu_pack)){ +// t->implicit = 1; +// } // if ((t->subtype == task_subtype_gpu_pack || // t->subtype == task_subtype_gpu_pack_g || // t->subtype == task_subtype_gpu_pack_f) && 
diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index be60d1b906..e7563a813c 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -326,7 +326,7 @@ double runner_doself1_pack_f4_g(struct runner *r, struct scheduler *s, pack_vars->tasks_packed++; pack_vars->launch = 0; pack_vars->launch_leftovers = 0; - if ((s->queues[qid].n_packs_self_left_g == 0)) + if ((s->queues[qid].n_packs_self_left_g < 2)) pack_vars->launch_leftovers = 1; if (pack_vars->tasks_packed == pack_vars->target_n_tasks) pack_vars->launch = 1; @@ -446,7 +446,7 @@ double runner_doself1_pack_f4_f(struct runner *r, struct scheduler *s, pack_vars->tasks_packed++; pack_vars->launch = 0; pack_vars->launch_leftovers = 0; - if ((s->queues[qid].n_packs_self_left_f == 0)) + if ((s->queues[qid].n_packs_self_left_f < 2)) pack_vars->launch_leftovers = 1; if (pack_vars->tasks_packed == pack_vars->target_n_tasks) pack_vars->launch = 1; @@ -641,7 +641,7 @@ double runner_dopair1_pack_f4(struct runner *r, struct scheduler *s, pack_vars->tasks_packed++; pack_vars->launch = 0; pack_vars->launch_leftovers = 0; - if ((s->queues[qid].n_packs_pair_left <= 1)) pack_vars->launch_leftovers = 1; + if ((s->queues[qid].n_packs_pair_left < 2)) pack_vars->launch_leftovers = 1; if (pack_vars->tasks_packed == pack_vars->target_n_tasks) pack_vars->launch = 1; /*Add time to packing_time. Timer for end of GPU work after the if(launch || @@ -843,7 +843,7 @@ double runner_dopair1_pack_f4_g(struct runner *r, struct scheduler *s, pack_vars->tasks_packed++; pack_vars->launch = 0; pack_vars->launch_leftovers = 0; - if ((s->queues[qid].n_packs_pair_left_g == 0)) + if ((s->queues[qid].n_packs_pair_left_g < 2)) pack_vars->launch_leftovers = 1; if (pack_vars->tasks_packed == pack_vars->target_n_tasks) pack_vars->launch = 1; @@ -1047,7 +1047,7 @@ double runner_dopair1_pack_f4_f(struct runner *r, struct scheduler *s, pack_vars->tasks_packed++; pack_vars->launch = 0; pack_vars->launch_leftovers = 0; - if ((s->queues[qid].n_packs_pair_left_f == 0)) + if ((s->queues[qid].n_packs_pair_left_f < 2)) pack_vars->launch_leftovers = 1; if (pack_vars->tasks_packed == pack_vars->target_n_tasks) pack_vars->launch = 1; diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 00ca42d63b..7f84c69e74 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -20,8 +20,8 @@ ******************************************************************************/ /* Config parameters. */ #define GPUOFFLOAD_DENSITY 1 // off-load hydro density to GPU -//#define GPUOFFLOAD_GRADIENT 1 // off-load hydro density to GPU -//#define GPUOFFLOAD_FORCE 1 // off-load hydro density to GPU +#define GPUOFFLOAD_GRADIENT 1 // off-load hydro gradient to GPU +#define GPUOFFLOAD_FORCE 1 // off-load hydro force to GPU // #define DO_CORNERS 1 //do corner pair tasks on CPU // #define DUMP_TIMINGS 1 From b78ba84698a752570169484d19afa0611389d22a Mon Sep 17 00:00:00 2001 From: Abouzied Date: Mon, 23 Dec 2024 12:35:57 +0000 Subject: [PATCH 117/217] Minor changes for testing. 
Commented out debug code for checking if tasks not split to target level for GPU --- .../HydroTests/GreshoVortex_3D/gresho.yml | 4 ++-- src/engine_maketasks.c | 22 +++++++++---------- src/engine_unskip.c | 3 --- src/runner_main_clean.cu | 2 +- 4 files changed, 14 insertions(+), 17 deletions(-) diff --git a/examples/HydroTests/GreshoVortex_3D/gresho.yml b/examples/HydroTests/GreshoVortex_3D/gresho.yml index 76eb3249a2..2a8aa1e70b 100644 --- a/examples/HydroTests/GreshoVortex_3D/gresho.yml +++ b/examples/HydroTests/GreshoVortex_3D/gresho.yml @@ -31,11 +31,11 @@ Statistics: # Parameters for the hydrodynamics scheme SPH: - resolution_eta: 1.2348 # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel). + resolution_eta: 1.99 # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel). CFL_condition: 0.1 # Courant-Friedrich-Levy condition for time integration. # Parameters related to the initial conditions InitialConditions: file_name: ./greshoVortex.hdf5 # The file to read periodic: 1 -# replicate: 2 + replicate: 2 diff --git a/src/engine_maketasks.c b/src/engine_maketasks.c index 8648d75dd5..0631796228 100644 --- a/src/engine_maketasks.c +++ b/src/engine_maketasks.c @@ -5161,17 +5161,17 @@ void engine_maketasks(struct engine *e) { if (count_current_pair != sched->nr_pair_pack_tasks_f) error("We did not find the correct number of F pair pack tasks!!"); #endif - - for (int i = 0; i < sched->nr_tasks; i++) { - struct task *t = &sched->tasks[i]; - if(t->ci != NULL){ -// if(t->type == task_type_pair && ((t->ci->split && !t->cj->split) || (!t->ci->split && t->cj->split))) -// error("one is split the other isn't"); - if(t->ci->hydro.count > 80 && t->type == task_type_self) - error("Count is %i task subtype (%s)", - t->ci->hydro.count, subtaskID_names[t->subtype]); - } - } +/*Debug code to check if some tasks are not split to desired level in tree for GPU*/ +// for (int i = 0; i < sched->nr_tasks; i++) { +// struct task *t = &sched->tasks[i]; +// if(t->ci != NULL){ +//// if(t->type == task_type_pair && ((t->ci->split && !t->cj->split) || (!t->ci->split && t->cj->split))) +//// error("one is split the other isn't"); +// if(t->ci->hydro.count > 80 && t->type == task_type_self) +// error("Count is %i task subtype (%s)", +// t->ci->hydro.count, subtaskID_names[t->subtype]); +// } +// } if (e->verbose) message("Making extra hydroloop tasks took %.3f %s.", clocks_from_ticks(getticks() - tic2), clocks_getunit()); diff --git a/src/engine_unskip.c b/src/engine_unskip.c index 0d0ac13f74..43af8b5aed 100644 --- a/src/engine_unskip.c +++ b/src/engine_unskip.c @@ -79,9 +79,6 @@ struct unskip_data { */ static void engine_do_unskip_hydro(struct cell *c, struct engine *e) { - // scheduler_activate(&e->sched, c->hydro.super->hydro.gpu_pack); A. Nasar - // scheduler_activate(&e->sched, c->hydro.super->hydro.gpu_unpack); - /* Early abort (are we below the level where tasks are)? */ if (!cell_get_flag(c, cell_flag_has_tasks)) return; diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 7f84c69e74..e9399c40bb 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -673,7 +673,7 @@ void *runner_main2(void *data) { * the allocated memory on buffers and GPU. 
This can happen if calculated h is * larger than cell width and splitting makes bigger than target cells*/ int count_max_parts_tmp = - 2 * target_n_tasks * (np_per_cell + buff); + 8 * target_n_tasks * (np_per_cell + buff); message("max_parts %i, n_tasks_GPU %i\n", count_max_parts_tmp, target_n_tasks); pack_vars_self_dens->count_max_parts = count_max_parts_tmp; From b533d1670717193d5e36ce26413b55293e568aff Mon Sep 17 00:00:00 2001 From: Abouzied Date: Mon, 23 Dec 2024 16:02:06 +0000 Subject: [PATCH 118/217] Reverted condition for launch leftovers to n_packs_left < 1 instead of < 2. Removed some commented out code in runner_main. Edited scheduler_gettask so stealing code is a bit cleaner when stealing GPU tasks --- .../HydroTests/GreshoVortex_3D/gresho.yml | 2 +- src/runner_doiact_functions_hydro_gpu.h | 12 +-- src/runner_main_clean.cu | 85 +++---------------- src/scheduler.c | 43 ++++++---- 4 files changed, 44 insertions(+), 98 deletions(-) diff --git a/examples/HydroTests/GreshoVortex_3D/gresho.yml b/examples/HydroTests/GreshoVortex_3D/gresho.yml index 2a8aa1e70b..3976888b0a 100644 --- a/examples/HydroTests/GreshoVortex_3D/gresho.yml +++ b/examples/HydroTests/GreshoVortex_3D/gresho.yml @@ -38,4 +38,4 @@ SPH: InitialConditions: file_name: ./greshoVortex.hdf5 # The file to read periodic: 1 - replicate: 2 +# replicate: 2 diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index e7563a813c..79594c23d9 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -205,7 +205,7 @@ double runner_doself1_pack_f4(struct runner *r, struct scheduler *s, pack_vars->tasks_packed++; pack_vars->launch = 0; pack_vars->launch_leftovers = 0; - if ((s->queues[qid].n_packs_self_left < 2)) pack_vars->launch_leftovers = 1; + if ((s->queues[qid].n_packs_self_left < 1)) pack_vars->launch_leftovers = 1; if (pack_vars->tasks_packed == pack_vars->target_n_tasks) pack_vars->launch = 1; /*Add time to packing_time. Timer for end of GPU work after the if(launch || @@ -326,7 +326,7 @@ double runner_doself1_pack_f4_g(struct runner *r, struct scheduler *s, pack_vars->tasks_packed++; pack_vars->launch = 0; pack_vars->launch_leftovers = 0; - if ((s->queues[qid].n_packs_self_left_g < 2)) + if ((s->queues[qid].n_packs_self_left_g < 1)) pack_vars->launch_leftovers = 1; if (pack_vars->tasks_packed == pack_vars->target_n_tasks) pack_vars->launch = 1; @@ -446,7 +446,7 @@ double runner_doself1_pack_f4_f(struct runner *r, struct scheduler *s, pack_vars->tasks_packed++; pack_vars->launch = 0; pack_vars->launch_leftovers = 0; - if ((s->queues[qid].n_packs_self_left_f < 2)) + if ((s->queues[qid].n_packs_self_left_f < 1)) pack_vars->launch_leftovers = 1; if (pack_vars->tasks_packed == pack_vars->target_n_tasks) pack_vars->launch = 1; @@ -641,7 +641,7 @@ double runner_dopair1_pack_f4(struct runner *r, struct scheduler *s, pack_vars->tasks_packed++; pack_vars->launch = 0; pack_vars->launch_leftovers = 0; - if ((s->queues[qid].n_packs_pair_left < 2)) pack_vars->launch_leftovers = 1; + if ((s->queues[qid].n_packs_pair_left < 1)) pack_vars->launch_leftovers = 1; if (pack_vars->tasks_packed == pack_vars->target_n_tasks) pack_vars->launch = 1; /*Add time to packing_time. 
Timer for end of GPU work after the if(launch || @@ -843,7 +843,7 @@ double runner_dopair1_pack_f4_g(struct runner *r, struct scheduler *s, pack_vars->tasks_packed++; pack_vars->launch = 0; pack_vars->launch_leftovers = 0; - if ((s->queues[qid].n_packs_pair_left_g < 2)) + if ((s->queues[qid].n_packs_pair_left_g < 1)) pack_vars->launch_leftovers = 1; if (pack_vars->tasks_packed == pack_vars->target_n_tasks) pack_vars->launch = 1; @@ -1047,7 +1047,7 @@ double runner_dopair1_pack_f4_f(struct runner *r, struct scheduler *s, pack_vars->tasks_packed++; pack_vars->launch = 0; pack_vars->launch_leftovers = 0; - if ((s->queues[qid].n_packs_pair_left_f < 2)) + if ((s->queues[qid].n_packs_pair_left_f < 1)) pack_vars->launch_leftovers = 1; if (pack_vars->tasks_packed == pack_vars->target_n_tasks) pack_vars->launch = 1; diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index e9399c40bb..7a594a1039 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -1057,15 +1057,6 @@ void *runner_main2(void *data) { } else if (t->subtype == task_subtype_gpu_pack) { packed_self++; #ifdef GPUOFFLOAD_DENSITY -// if(ci->hydro.count > 2 * np_per_cell){ -// g100++; -// maxcount = max(ci->hydro.count, maxcount); -// error("SELF expecting %i got %i parts" -// "Cell ID %i Cell split %d", -// np_per_cell, ci->hydro.count, ci->cellID, ci->split); -// } - // struct timespec t0, t1; // - // clock_gettime(CLOCK_REALTIME, &t0); ticks tic_cpu_pack = getticks(); packing_time += @@ -1074,14 +1065,9 @@ void *runner_main2(void *data) { t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; - // clock_gettime(CLOCK_REALTIME, &t1); - // packing_time += (t1.tv_sec - t0.tv_sec) + - // (t1.tv_nsec - t0.tv_nsec) / - // 1000000000.0; - // runner_doself1_pack(r, sched, pack_vars_self_dens, ci, - // t, parts_aos_dens, &packing_time); /* No pack tasks left in queue, flag that we want to run */ int launch_leftovers = pack_vars_self_dens->launch_leftovers; + if(sched->queues[r->qid].n_packs_self_left < 1) launch_leftovers = 1; /*Packed enough tasks let's go*/ int launch = pack_vars_self_dens->launch; /* Do we have enough stuff to run the GPU ? */ @@ -1097,19 +1083,13 @@ void *runner_main2(void *data) { stream, d_a, d_H, e, &packing_time, &time_for_density_gpu, &unpack_time_self, task_first_part_self_dens_f4, devId, task_first_part_f4, d_task_first_part_f4, self_end); - // runner_doself1_launch(r, sched, - // pack_vars_self_dens, ci, t, parts_aos_dens, - // d_parts_aos_dens, stream, d_a, d_H, e, &packing_time, - // &time_for_density_gpu, - // &tot_time_for_hard_memcpys); } /*End of GPU work Self*/ #endif } /* self / pack */ else if (t->subtype == task_subtype_gpu_pack_g) { packed_self_g++; #ifdef GPUOFFLOAD_GRADIENT - // runner_doself1_pack_g(r, sched, pack_vars_self_grad, ci, - // t, parts_aos_grad, &packing_time_g); + ticks tic_cpu_pack = getticks(); packing_time_g += runner_doself1_pack_f4_g( @@ -1120,15 +1100,12 @@ void *runner_main2(void *data) { /* No pack tasks left in queue, flag that we want to run */ int launch_leftovers = pack_vars_self_grad->launch_leftovers; + if(sched->queues[r->qid].n_packs_self_left_g < 1) launch_leftovers = 1; /*Packed enough tasks let's go*/ int launch = pack_vars_self_grad->launch; /* Do we have enough stuff to run the GPU ? 
*/ if (launch || launch_leftovers) { /*Launch GPU tasks*/ - // runner_doself1_launch_g(r, sched, - // pack_vars_self_grad, ci, t, parts_aos_grad, - // d_parts_aos_grad, stream, d_a, - // d_H, e, &packing_time_g, &time_for_gpu_g); int t_packed = pack_vars_self_grad->tasks_packed; signal_sleeping_runners(sched, t, t_packed); runner_doself1_launch_f4_g( @@ -1142,8 +1119,6 @@ void *runner_main2(void *data) { } else if (t->subtype == task_subtype_gpu_pack_f) { packed_self_f++; #ifdef GPUOFFLOAD_FORCE - // runner_doself1_pack_f(r, sched, pack_vars_self_forc, ci, - // t, parts_aos_forc, &packing_time_f); ticks tic_cpu_pack = getticks(); packing_time_f += runner_doself1_pack_f4_f( @@ -1152,24 +1127,14 @@ void *runner_main2(void *data) { t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; - // int count = ci->hydro.count; - // for(int i = 0; i < count; i++){ - // int pid = pack_vars_self_forc->count_parts - count + - // i; if(parts_aos_forc_f4_send[pid].ux_m.w < - // 1e-9)fprintf(stderr, "zero mass after packing %i %f\n", - // pid, parts_aos_forc_f4_send[pid].ux_m.w); - // } /* No pack tasks left in queue, flag that we want to run */ int launch_leftovers = pack_vars_self_forc->launch_leftovers; + if(sched->queues[r->qid].n_packs_self_left_f < 1) launch_leftovers = 1; /*Packed enough tasks let's go*/ int launch = pack_vars_self_forc->launch; /* Do we have enough stuff to run the GPU ? */ if (launch || launch_leftovers) { /*Launch GPU tasks*/ - // runner_doself1_launch_f(r, sched, - // pack_vars_self_forc, ci, t, parts_aos_forc, - // d_parts_aos_forc, stream, d_a, d_H, e, &packing_time_f, - // &time_for_gpu_f); int t_packed = pack_vars_self_forc->tasks_packed; signal_sleeping_runners(sched, t, t_packed); runner_doself1_launch_f4_f( @@ -1314,25 +1279,7 @@ void *runner_main2(void *data) { #endif // DO_CORNERS ticks tic_cpu_pack = getticks(); -// if (ci == ci->hydro.super || cj == cj->hydro.super){ -// g100++; -//// message("GPU working on top level cell"); -// } -// if(ci->hydro.count > 2 * np_per_cell){ -// g100++; -// maxcount = max(ci->hydro.count, maxcount); -// error("expecting %i got %i parts" -// "Cell ID %i Cell split %d", -// np_per_cell, ci->hydro.count, ci->cellID, ci->split); -// } -// if(ci->hydro.count < 2 * np_per_cell){ -// g100++; -// maxcount = max(ci->hydro.count, maxcount); -// error("expecting %i got %i parts" -// "Cell ID %i Cell split %d", -// np_per_cell, ci->hydro.count, ci->cellID, ci->split); -// } -// else l100++; + packing_time_pair += runner_dopair1_pack_f4( r, sched, pack_vars_pair_dens, ci, cj, t, parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens); @@ -1342,18 +1289,13 @@ void *runner_main2(void *data) { * we want to run */ int launch = pack_vars_pair_dens->launch; int launch_leftovers = pack_vars_pair_dens->launch_leftovers; + if(sched->queues[r->qid].n_packs_pair_left < 1) launch_leftovers = 1; /* Do we have enough stuff to run the GPU ? 
*/ if (launch) n_full_p_d_bundles++; if (launch_leftovers) n_partial_p_d_bundles++; if (launch || launch_leftovers) { - /*Launch GPU tasks*/ - // runner_dopair1_launch(r, sched, - // pack_vars_pair_dens, ci, t, parts_aos_pair_dens, - // d_parts_aos_pair_dens, - // stream, d_a, d_H, e, &packing_time_pair, - //&time_for_density_gpu_pair); int t_packed = pack_vars_pair_dens->tasks_packed; signal_sleeping_runners(sched, t, t_packed); runner_dopair1_launch_f4_one_memcpy( @@ -1382,7 +1324,6 @@ void *runner_main2(void *data) { packing_time_pair += (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; if ((sid == 0 || sid == 2 || sid == 6 || sid == 8) && step > 1) { - // if((sid != 4 && sid != 10 && sid == 12) && step > 1){ clock_gettime(CLOCK_REALTIME, &t0); runner_dopair1_branch_gradient(r, ci, cj); t->corner_pair = 1; @@ -1416,10 +1357,6 @@ void *runner_main2(void *data) { } } else { #endif // DO_CORNERS - // runner_dopair1_pack_g(r, sched, pack_vars_pair_grad, - // ci, - // cj, t, parts_aos_pair_grad, e, - // &packing_time_g); ticks tic_cpu_pack = getticks(); packing_time_pair_g += @@ -1431,16 +1368,13 @@ void *runner_main2(void *data) { /* No pack tasks left in queue, flag that we want to run */ int launch_leftovers = pack_vars_pair_grad->launch_leftovers; - /*Packed enough tasks let's go*/ + if(sched->queues[r->qid].n_packs_pair_left_g < 1) launch_leftovers = 1; + /*Packed enough tasks, let's go*/ int launch = pack_vars_pair_grad->launch; + /* Do we have enough stuff to run the GPU ? */ if (launch || launch_leftovers) { /*Launch GPU tasks*/ - // runner_dopair1_launch_g(r, sched, - // pack_vars_pair_grad, ci, t, parts_aos_pair_grad, - // d_parts_aos_pair_grad, - // stream, d_a, d_H, e, &packing_time_pair_g, - //&time_for_gpu_pair_g); int t_packed = pack_vars_pair_grad->tasks_packed; signal_sleeping_runners(sched, t, t_packed); runner_dopair1_launch_f4_g_one_memcpy( @@ -1515,6 +1449,7 @@ void *runner_main2(void *data) { /* No pack tasks left in queue, flag that we want to run */ int launch_leftovers = pack_vars_pair_forc->launch_leftovers; + if(sched->queues[r->qid].n_packs_pair_left_g < 1) launch_leftovers = 1; /*Packed enough tasks let's go*/ int launch = pack_vars_pair_forc->launch; /* Do we have enough stuff to run the GPU ? */ diff --git a/src/scheduler.c b/src/scheduler.c index 49a62098e6..88e8f4d323 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -2733,6 +2733,10 @@ void scheduler_enqueue_mapper(void *map_data, int num_elements, */ void scheduler_start(struct scheduler *s) { for (int i = 0; i < s->nr_queues; i++) { // A. Nasar +// if(s->queues[i].n_packs_self_left_f > 0 && s->e->time > 0.0){ +// message("time %f", s->e->time); +// error("We did not complete all density pack tasks. 
n left %i", s->queues[i].n_packs_self_left_f); +// } s->queues[i].n_packs_self_left = 0; s->queues[i].n_packs_pair_left = 0; s->queues[i].n_packs_self_left_f = 0; @@ -3346,11 +3350,12 @@ struct task *scheduler_gettask(struct scheduler *s, int qid, /* Pick a queue at random among the non-empty ones */ const int ind = rand_r(&seed) % count; - + /*Get a pointer to the queue we're stealing from*/ + int qstl = qids[ind]; + struct queue * q_stl = &s->queues[qstl]; /* Try to get a task from that random queue */ TIMER_TIC; - int qstl = qids[ind]; - res = queue_gettask(&s->queues[qstl], prev, 0); + res = queue_gettask(q_stl, prev, 0); TIMER_TOC(timer_qsteal); // if (res != NULL && (res->type == task_type_pair) && @@ -3379,36 +3384,42 @@ struct task *scheduler_gettask(struct scheduler *s, int qid, // } /* Lucky? */ - if (res != NULL) { + if (res != NULL){ + /*Get a pointer to our queue for re-use*/ + struct queue * q = &s->queues[qid]; +// && res->subtype != task_subtype_gpu_pack +// && res->subtype != task_subtype_gpu_pack_f +// && res->subtype != task_subtype_gpu_pack_g) { + /*Move counter from the robbed to the robber*/ if ((res->type == task_type_self)&& res->subtype == task_subtype_gpu_pack) { - atomic_inc(&s->queues[qid].n_packs_self_left); - atomic_dec(&s->queues[qstl].n_packs_self_left); + atomic_inc(&q->n_packs_self_left); + atomic_dec(&q_stl->n_packs_self_left); } if ((res->type == task_type_self)&& res->subtype == task_subtype_gpu_pack_g) { - atomic_inc(&s->queues[qid].n_packs_self_left_g); - atomic_dec(&s->queues[qstl].n_packs_self_left_g); + atomic_inc(&q->n_packs_self_left_g); + atomic_dec(&q_stl->n_packs_self_left_g); } if ((res->type == task_type_self)&& res->subtype == task_subtype_gpu_pack_f) { - atomic_inc(&s->queues[qid].n_packs_self_left_f); - atomic_dec(&s->queues[qstl].n_packs_self_left_f); + atomic_inc(&q->n_packs_self_left_f); + atomic_dec(&q_stl->n_packs_self_left_f); } if ((res->type == task_type_pair)&& res->subtype == task_subtype_gpu_pack) { - atomic_inc(&s->queues[qid].n_packs_pair_left); - atomic_dec(&s->queues[qstl].n_packs_pair_left); + atomic_inc(&q->n_packs_pair_left); + atomic_dec(&q_stl->n_packs_pair_left); } if ((res->type == task_type_pair)&& res->subtype == task_subtype_gpu_pack_g) { - atomic_inc(&s->queues[qid].n_packs_pair_left_g); - atomic_dec(&s->queues[qstl].n_packs_pair_left_g); + atomic_inc(&q->n_packs_pair_left_g); + atomic_dec(&q_stl->n_packs_pair_left_g); } if ((res->type == task_type_pair)&& res->subtype == task_subtype_gpu_pack_f) { - atomic_inc(&s->queues[qid].n_packs_pair_left_f); - atomic_dec(&s->queues[qstl].n_packs_pair_left_f); + atomic_inc(&q->n_packs_pair_left_f); + atomic_dec(&q_stl->n_packs_pair_left_f); } /* Run with the task */ break; From ceb4d6f4b7dc36e9c389a39e93b65eb22954ab6f Mon Sep 17 00:00:00 2001 From: Abouzied Date: Mon, 30 Dec 2024 14:45:02 +0000 Subject: [PATCH 119/217] Allowed stealing engine_config. Changed debug if statement to error rather than exit(0). Removed if statment querying for n tasks left in scheduler from within runner_main as it was causing kernel launches with invalid configuration (0 blocks). Reverted space.h to default parameters. 
Added parameters to control subcell size for splitting in gresho.yml (cleaner way of doing it) --- examples/HydroTests/GreshoVortex_3D/gresho.yml | 6 ++++-- src/engine_config.c | 10 +++++----- src/runner_doiact_functions_hydro_gpu.h | 4 +--- src/runner_main_clean.cu | 16 ++++++++-------- src/space.h | 4 ++-- 5 files changed, 20 insertions(+), 20 deletions(-) diff --git a/examples/HydroTests/GreshoVortex_3D/gresho.yml b/examples/HydroTests/GreshoVortex_3D/gresho.yml index 3976888b0a..03536fed53 100644 --- a/examples/HydroTests/GreshoVortex_3D/gresho.yml +++ b/examples/HydroTests/GreshoVortex_3D/gresho.yml @@ -7,10 +7,12 @@ InternalUnitSystem: UnitTemp_in_cgs: 1 # Kelvin Scheduler: - max_top_level_cells: 8 + max_top_level_cells: 16 tasks_per_cell: 200 deadlock_waiting_time_s: 10 cell_split_size: 80 + cell_sub_size_pair_hydro: 60 # (Optional) Maximal number of hydro-hydro interactions per sub-pair hydro/star task (this is the default value). + cell_sub_size_self_hydro: 60 # (Optional) Maximal number of hydro-hydro interactions per sub-self hydro/star task. Set to how many cells are targeted for GPU tasks # Parameters governing the time integration TimeIntegration: time_begin: 0. # The starting time of the simulation (in internal units). @@ -31,7 +33,7 @@ Statistics: # Parameters for the hydrodynamics scheme SPH: - resolution_eta: 1.99 # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel). + resolution_eta: 1.2348 # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel). CFL_condition: 0.1 # Courant-Friedrich-Levy condition for time integration. # Parameters related to the initial conditions diff --git a/src/engine_config.c b/src/engine_config.c index ff3ff5ec9f..70d3cfd1d4 100644 --- a/src/engine_config.c +++ b/src/engine_config.c @@ -923,12 +923,12 @@ void engine_config(int restart, int fof, struct engine *e, parser_get_opt_param_float(params, "Scheduler:links_per_tasks", 25.); /* Init the scheduler. Allow stealing*/ - scheduler_init(&e->sched, e->s, maxtasks, nr_queues, - (e->policy & scheduler_flag_steal), e->nodeID, - &e->threadpool); +// scheduler_init(&e->sched, e->s, maxtasks, nr_queues, +// (e->policy & scheduler_flag_steal), e->nodeID, +// &e->threadpool); /* Init the scheduler. NO stealing A. Nasar */ -// scheduler_init(&e->sched, e->s, maxtasks, nr_queues, 0, e->nodeID, -// &e->threadpool); + scheduler_init(&e->sched, e->s, maxtasks, nr_queues, 0, e->nodeID, + &e->threadpool); /* Maximum size of MPI task messages, in KB, that should not be buffered, * that is sent using MPI_Issend, not MPI_Isend. 4Mb by default. 
Can be diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index 79594c23d9..26948183b1 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -3066,13 +3066,11 @@ void runner_dopair1_launch_f4_mcpy_Ker_mcpy( #ifdef CUDA_DEBUG cudaError_t cu_error = cudaPeekAtLastError(); // Get error code if (cu_error != cudaSuccess) { - fprintf( - stderr, + error( "CUDA error with pair density kernel launch: %s cpuid id is: %i\n " "nbx %i nby %i max_parts_i %i max_parts_j %i\n", cudaGetErrorString(cu_error), r->cpuid, numBlocks_x, numBlocks_y, max_parts_i, max_parts_j); - exit(0); } #endif } diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 7a594a1039..33aea761dd 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -673,9 +673,9 @@ void *runner_main2(void *data) { * the allocated memory on buffers and GPU. This can happen if calculated h is * larger than cell width and splitting makes bigger than target cells*/ int count_max_parts_tmp = - 8 * target_n_tasks * (np_per_cell + buff); + 2 * target_n_tasks * (np_per_cell + buff); - message("max_parts %i, n_tasks_GPU %i\n", count_max_parts_tmp, target_n_tasks); + message("np per cell %i, max_parts %i, n_tasks_GPU %i\n", np_per_cell, count_max_parts_tmp, target_n_tasks); pack_vars_self_dens->count_max_parts = count_max_parts_tmp; pack_vars_pair_dens->count_max_parts = count_max_parts_tmp; pack_vars_self_forc->count_max_parts = count_max_parts_tmp; @@ -1067,7 +1067,7 @@ void *runner_main2(void *data) { /* No pack tasks left in queue, flag that we want to run */ int launch_leftovers = pack_vars_self_dens->launch_leftovers; - if(sched->queues[r->qid].n_packs_self_left < 1) launch_leftovers = 1; +// if(sched->queues[r->qid].n_packs_self_left < 1) launch_leftovers = 1; /*Packed enough tasks let's go*/ int launch = pack_vars_self_dens->launch; /* Do we have enough stuff to run the GPU ? */ @@ -1100,7 +1100,7 @@ void *runner_main2(void *data) { /* No pack tasks left in queue, flag that we want to run */ int launch_leftovers = pack_vars_self_grad->launch_leftovers; - if(sched->queues[r->qid].n_packs_self_left_g < 1) launch_leftovers = 1; +// if(sched->queues[r->qid].n_packs_self_left_g < 1) launch_leftovers = 1; /*Packed enough tasks let's go*/ int launch = pack_vars_self_grad->launch; /* Do we have enough stuff to run the GPU ? */ @@ -1129,7 +1129,7 @@ void *runner_main2(void *data) { /* No pack tasks left in queue, flag that we want to run */ int launch_leftovers = pack_vars_self_forc->launch_leftovers; - if(sched->queues[r->qid].n_packs_self_left_f < 1) launch_leftovers = 1; +// if(sched->queues[r->qid].n_packs_self_left_f < 1) launch_leftovers = 1; /*Packed enough tasks let's go*/ int launch = pack_vars_self_forc->launch; /* Do we have enough stuff to run the GPU ? */ @@ -1289,7 +1289,7 @@ void *runner_main2(void *data) { * we want to run */ int launch = pack_vars_pair_dens->launch; int launch_leftovers = pack_vars_pair_dens->launch_leftovers; - if(sched->queues[r->qid].n_packs_pair_left < 1) launch_leftovers = 1; +// if(sched->queues[r->qid].n_packs_pair_left < 1) launch_leftovers = 1; /* Do we have enough stuff to run the GPU ? 
*/ if (launch) n_full_p_d_bundles++; @@ -1368,7 +1368,7 @@ void *runner_main2(void *data) { /* No pack tasks left in queue, flag that we want to run */ int launch_leftovers = pack_vars_pair_grad->launch_leftovers; - if(sched->queues[r->qid].n_packs_pair_left_g < 1) launch_leftovers = 1; +// if(sched->queues[r->qid].n_packs_pair_left_g < 1) launch_leftovers = 1; /*Packed enough tasks, let's go*/ int launch = pack_vars_pair_grad->launch; @@ -1449,7 +1449,7 @@ void *runner_main2(void *data) { /* No pack tasks left in queue, flag that we want to run */ int launch_leftovers = pack_vars_pair_forc->launch_leftovers; - if(sched->queues[r->qid].n_packs_pair_left_g < 1) launch_leftovers = 1; +// if(sched->queues[r->qid].n_packs_pair_left_g < 1) launch_leftovers = 1; /*Packed enough tasks let's go*/ int launch = pack_vars_pair_forc->launch; /* Do we have enough stuff to run the GPU ? */ diff --git a/src/space.h b/src/space.h index fc6071afcc..870ec7aae1 100644 --- a/src/space.h +++ b/src/space.h @@ -55,8 +55,8 @@ struct hydro_props; #define space_extra_bparts_default 0 #define space_extra_sinks_default 0 #define space_expected_max_nr_strays_default 100 -#define space_subsize_pair_hydro_default 100 -#define space_subsize_self_hydro_default 100 +#define space_subsize_pair_hydro_default 256000000 +#define space_subsize_self_hydro_default 32000 #define space_subsize_pair_stars_default 256000000 #define space_subsize_self_stars_default 32000 #define space_subsize_pair_grav_default 256000000 From 4929cf80f59b1dc4f13ab3d226e546712c9d6bab Mon Sep 17 00:00:00 2001 From: Abouzied Date: Mon, 30 Dec 2024 15:54:41 +0000 Subject: [PATCH 120/217] Added simple debug code to scream when we have way more particles than anticipated. The code works fine for h=1.9dx and lower but we end up with 8x particles per cell if we set h=1.99dx due to some cells being un-splittable due to h condition --- examples/HydroTests/GreshoVortex_3D/gresho.yml | 10 +++++----- src/runner_main_clean.cu | 12 +++++++++++- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/examples/HydroTests/GreshoVortex_3D/gresho.yml b/examples/HydroTests/GreshoVortex_3D/gresho.yml index 03536fed53..b96f552291 100644 --- a/examples/HydroTests/GreshoVortex_3D/gresho.yml +++ b/examples/HydroTests/GreshoVortex_3D/gresho.yml @@ -7,12 +7,12 @@ InternalUnitSystem: UnitTemp_in_cgs: 1 # Kelvin Scheduler: - max_top_level_cells: 16 + max_top_level_cells: 8 tasks_per_cell: 200 deadlock_waiting_time_s: 10 cell_split_size: 80 - cell_sub_size_pair_hydro: 60 # (Optional) Maximal number of hydro-hydro interactions per sub-pair hydro/star task (this is the default value). - cell_sub_size_self_hydro: 60 # (Optional) Maximal number of hydro-hydro interactions per sub-self hydro/star task. Set to how many cells are targeted for GPU tasks + cell_sub_size_pair_hydro: 50 # (Optional) Maximal number of hydro-hydro interactions per sub-pair hydro/star task (this is the default value). + cell_sub_size_self_hydro: 50 # (Optional) Maximal number of hydro-hydro interactions per sub-self hydro/star task. Set to how many cells are targeted for GPU tasks # Parameters governing the time integration TimeIntegration: time_begin: 0. # The starting time of the simulation (in internal units). @@ -33,11 +33,11 @@ Statistics: # Parameters for the hydrodynamics scheme SPH: - resolution_eta: 1.2348 # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel). 
+ resolution_eta: 1.9 # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel). CFL_condition: 0.1 # Courant-Friedrich-Levy condition for time integration. # Parameters related to the initial conditions InitialConditions: file_name: ./greshoVortex.hdf5 # The file to read periodic: 1 -# replicate: 2 + replicate: 2 diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 33aea761dd..ea55f1eb16 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -983,6 +983,8 @@ void *runner_main2(void *data) { sched->nr_packs_pair_forc_done = 0; sched->nr_packs_self_grad_done = 0; sched->nr_packs_pair_grad_done = 0; + int n_cells = 0; + int n_w_prts_gtr_target = 0; int g100 = 0; int l100 = 0; int maxcount = 0; @@ -1067,7 +1069,12 @@ void *runner_main2(void *data) { /* No pack tasks left in queue, flag that we want to run */ int launch_leftovers = pack_vars_self_dens->launch_leftovers; -// if(sched->queues[r->qid].n_packs_self_left < 1) launch_leftovers = 1; + n_cells++; + if(ci->hydro.count > 1.5 * np_per_cell){ + n_w_prts_gtr_target++; + message("count %i target %i", ci->hydro.count, np_per_cell); + } +// error("There's %i parts in a cell when it should be %i max", ci->hydro.count, np_per_cell); /*Packed enough tasks let's go*/ int launch = pack_vars_self_dens->launch; /* Do we have enough stuff to run the GPU ? */ @@ -1936,6 +1943,9 @@ void *runner_main2(void *data) { } } /* main loop. */ + message("cpu %i packed %i cells with %i containing more parts than target", + r->cpuid, n_cells, n_w_prts_gtr_target); + // message("Worked on %i supers w more than 100 parts", g100); // Stuff for writing debug data to file for validation //// if (step % 10 == 0 || step == 1) { From 392e2ce6fe901bdf3ef262fd56dbb3090438da6b Mon Sep 17 00:00:00 2001 From: Abouzied Date: Mon, 30 Dec 2024 17:04:51 +0000 Subject: [PATCH 121/217] Added code to monitor how many GPU tasks a thread steals. Code still hangs with s->waiting greater than zero and the dumper thread shows that some unpacks are waiting (for packs) and some end_force tasks are waiting (for unpack_f tasks) --- src/engine_config.c | 10 +++++----- src/queue.h | 8 ++++++++ src/scheduler.c | 22 +++++++++++++++++++++- 3 files changed, 34 insertions(+), 6 deletions(-) diff --git a/src/engine_config.c b/src/engine_config.c index 70d3cfd1d4..ff3ff5ec9f 100644 --- a/src/engine_config.c +++ b/src/engine_config.c @@ -923,12 +923,12 @@ void engine_config(int restart, int fof, struct engine *e, parser_get_opt_param_float(params, "Scheduler:links_per_tasks", 25.); /* Init the scheduler. Allow stealing*/ -// scheduler_init(&e->sched, e->s, maxtasks, nr_queues, -// (e->policy & scheduler_flag_steal), e->nodeID, -// &e->threadpool); + scheduler_init(&e->sched, e->s, maxtasks, nr_queues, + (e->policy & scheduler_flag_steal), e->nodeID, + &e->threadpool); /* Init the scheduler. NO stealing A. Nasar */ - scheduler_init(&e->sched, e->s, maxtasks, nr_queues, 0, e->nodeID, - &e->threadpool); +// scheduler_init(&e->sched, e->s, maxtasks, nr_queues, 0, e->nodeID, +// &e->threadpool); /* Maximum size of MPI task messages, in KB, that should not be buffered, * that is sent using MPI_Issend, not MPI_Isend. 4Mb by default. 
Can be diff --git a/src/queue.h b/src/queue.h index 9ba441d55e..f04015b00a 100644 --- a/src/queue.h +++ b/src/queue.h @@ -84,6 +84,14 @@ struct queue { int n_packs_pair_left_f; int n_packs_pair_left_g; + int n_packs_self_stolen; /*Number of density pack tasks left in queue*/ + int n_packs_self_stolen_f; /*Number of force pack tasks left in queue*/ + int n_packs_self_stolen_g; /*Number of gradient pack tasks left in queue*/ + + int n_packs_pair_stolen; + int n_packs_pair_stolen_f; + int n_packs_pair_stolen_g; + } __attribute__((aligned(queue_struct_align))); /* Function prototypes. */ diff --git a/src/scheduler.c b/src/scheduler.c index 88e8f4d323..5382279d70 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -2743,6 +2743,14 @@ void scheduler_start(struct scheduler *s) { s->queues[i].n_packs_pair_left_f = 0; s->queues[i].n_packs_self_left_g = 0; s->queues[i].n_packs_pair_left_g = 0; + + s->queues[i].n_packs_self_stolen = 0; + s->queues[i].n_packs_pair_stolen = 0; + s->queues[i].n_packs_self_stolen_f = 0; + s->queues[i].n_packs_pair_stolen_f = 0; + s->queues[i].n_packs_self_stolen_g = 0; + s->queues[i].n_packs_pair_stolen_g = 0; + } /* Re-wait the tasks. */ if (s->active_count > 1000) { @@ -3384,7 +3392,10 @@ struct task *scheduler_gettask(struct scheduler *s, int qid, // } /* Lucky? */ - if (res != NULL){ +// if (res != NULL && res->subtype != task_subtype_gpu_unpack +// && res->subtype != task_subtype_gpu_unpack_f +// && res->subtype != task_subtype_gpu_unpack_g){ + if (res != NULL){ /*Get a pointer to our queue for re-use*/ struct queue * q = &s->queues[qid]; // && res->subtype != task_subtype_gpu_pack @@ -3394,31 +3405,37 @@ struct task *scheduler_gettask(struct scheduler *s, int qid, if ((res->type == task_type_self)&& res->subtype == task_subtype_gpu_pack) { atomic_inc(&q->n_packs_self_left); + atomic_inc(&q->n_packs_self_stolen); atomic_dec(&q_stl->n_packs_self_left); } if ((res->type == task_type_self)&& res->subtype == task_subtype_gpu_pack_g) { atomic_inc(&q->n_packs_self_left_g); + atomic_inc(&q->n_packs_self_stolen_g); atomic_dec(&q_stl->n_packs_self_left_g); } if ((res->type == task_type_self)&& res->subtype == task_subtype_gpu_pack_f) { atomic_inc(&q->n_packs_self_left_f); + atomic_inc(&q->n_packs_self_stolen_f); atomic_dec(&q_stl->n_packs_self_left_f); } if ((res->type == task_type_pair)&& res->subtype == task_subtype_gpu_pack) { atomic_inc(&q->n_packs_pair_left); + atomic_inc(&q->n_packs_pair_stolen); atomic_dec(&q_stl->n_packs_pair_left); } if ((res->type == task_type_pair)&& res->subtype == task_subtype_gpu_pack_g) { atomic_inc(&q->n_packs_pair_left_g); + atomic_inc(&q->n_packs_pair_stolen_g); atomic_dec(&q_stl->n_packs_pair_left_g); } if ((res->type == task_type_pair)&& res->subtype == task_subtype_gpu_pack_f) { atomic_inc(&q->n_packs_pair_left_f); + atomic_inc(&q->n_packs_pair_stolen_f); atomic_dec(&q_stl->n_packs_pair_left_f); } /* Run with the task */ @@ -3442,6 +3459,9 @@ struct task *scheduler_gettask(struct scheduler *s, int qid, pthread_mutex_lock(&s->sleep_mutex); res = queue_gettask(&s->queues[qid], prev, 1); if (res == NULL && s->waiting > 0) { + struct queue qq = s->queues[qid]; + message("s->waiting %i self_stolen %i, self_left %i, pair_stolen %i, pair_left %i", s->waiting, + qq.n_packs_self_stolen, qq.n_packs_self_left, qq.n_packs_pair_stolen, qq.n_packs_pair_left); pthread_cond_wait(&s->sleep_cond, &s->sleep_mutex); } pthread_mutex_unlock(&s->sleep_mutex); From 29e074bfbaea5299e4bce2eac2243b8049af18f2 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Mon, 6 
Jan 2025 15:22:34 +0000 Subject: [PATCH 122/217] Very weird behaviour found. Added some debug code set to crash the run if launch_leftovers = 1 whilst the number of tasks left is greater than or equal to 1 and the code sets launch_leftovers to 1 even when the counter shows that there is still 1 pack task of that type left --- src/runner_main_clean.cu | 21 +++++++++++++++++++-- src/scheduler.c | 4 ++-- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index ea55f1eb16..746379a900 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -1297,7 +1297,11 @@ void *runner_main2(void *data) { int launch = pack_vars_pair_dens->launch; int launch_leftovers = pack_vars_pair_dens->launch_leftovers; // if(sched->queues[r->qid].n_packs_pair_left < 1) launch_leftovers = 1; - + if(launch_leftovers && + (sched->queues[r->qid].n_packs_pair_left < 0 + || sched->queues[r->qid].n_packs_pair_left >= 1)) + error("Somethig's wrong. n_packs_pair_left = %i when it should be zero", + sched->queues[r->qid].n_packs_pair_left); /* Do we have enough stuff to run the GPU ? */ if (launch) n_full_p_d_bundles++; if (launch_leftovers) n_partial_p_d_bundles++; @@ -1313,6 +1317,7 @@ void *runner_main2(void *data) { &unpacking_time_pair, fparti_fpartj_lparti_lpartj_dens, pair_end); } + pack_vars_pair_dens->launch_leftovers = 0; #ifdef DO_CORNERS } /* End of GPU work Pairs */ #endif // DO_CORNERS @@ -1378,7 +1383,11 @@ void *runner_main2(void *data) { // if(sched->queues[r->qid].n_packs_pair_left_g < 1) launch_leftovers = 1; /*Packed enough tasks, let's go*/ int launch = pack_vars_pair_grad->launch; - + if(launch_leftovers && + (sched->queues[r->qid].n_packs_pair_left_g < 0 + || sched->queues[r->qid].n_packs_pair_left_g >= 1)) + error("Somethig's wrong. n_packs_pair_left_g = %i when it should be zero", + sched->queues[r->qid].n_packs_pair_left_g); /* Do we have enough stuff to run the GPU ?
*/ @@ -1476,6 +1491,8 @@ void *runner_main2(void *data) { &packing_time_pair_f, &time_for_gpu_pair_f, &unpacking_time_pair_f, fparti_fpartj_lparti_lpartj_forc, pair_end_f); + + pack_vars_pair_forc->launch_leftovers = 0; } /* End of GPU work Pairs */ #ifdef DO_CORNERS } diff --git a/src/scheduler.c b/src/scheduler.c index 5382279d70..52c42d9013 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -3460,8 +3460,8 @@ struct task *scheduler_gettask(struct scheduler *s, int qid, res = queue_gettask(&s->queues[qid], prev, 1); if (res == NULL && s->waiting > 0) { struct queue qq = s->queues[qid]; - message("s->waiting %i self_stolen %i, self_left %i, pair_stolen %i, pair_left %i", s->waiting, - qq.n_packs_self_stolen, qq.n_packs_self_left, qq.n_packs_pair_stolen, qq.n_packs_pair_left); +// message("s->waiting %i self_stolen %i, self_left %i, pair_stolen %i, pair_left %i", s->waiting, +// qq.n_packs_self_stolen_f, qq.n_packs_self_left_f, qq.n_packs_pair_stolen_f, qq.n_packs_pair_left_f); pthread_cond_wait(&s->sleep_cond, &s->sleep_mutex); } pthread_mutex_unlock(&s->sleep_mutex); From 949fd47e77d9a30fce8aef459320b3bf44a60c17 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Tue, 7 Jan 2025 11:58:28 +0000 Subject: [PATCH 123/217] Moving tasks_left counter incrementation from scheduler.c to queue_insert() causes counters to go negative. Code hangs when changing condition to launch_leftovers to if (tasks_left == 0) --- src/queue.c | 23 +++++++++++++++++++++- src/queue.h | 12 ++++++------ src/runner_doiact_functions_hydro_gpu.h | 12 ++++++------ src/runner_main_clean.cu | 6 +++--- src/scheduler.c | 26 ++----------------------- 5 files changed, 39 insertions(+), 40 deletions(-) diff --git a/src/queue.c b/src/queue.c index 30601667cd..d488e74a07 100644 --- a/src/queue.c +++ b/src/queue.c @@ -171,7 +171,28 @@ void queue_insert(struct queue *q, struct task *t) { /* Clean up the incoming DEQ. */ queue_get_incoming(q); - + /* A. Nasar: Increment counters required for the pack tasks */ + if (t->type == task_type_self || t->type == task_type_sub_self) { + if (t->subtype == task_subtype_gpu_pack) + atomic_inc(&q->n_packs_self_left); + if (t->subtype == task_subtype_gpu_pack_f) + atomic_inc(&q->n_packs_self_left_f); + if (t->subtype == task_subtype_gpu_pack_g) + atomic_inc(&q->n_packs_self_left_g); + } + /* A. Nasar NEED to think about how to do this with + MPI where ci may not be on this node/rank */ + if (t->type == task_type_pair || t->type == task_type_sub_pair) { + if (t->subtype == task_subtype_gpu_pack) { + atomic_inc(&q->n_packs_pair_left); + } + if (t->subtype == task_subtype_gpu_pack_f) { + atomic_inc(&q->n_packs_pair_left_f); + } + if (t->subtype == task_subtype_gpu_pack_g) { + atomic_inc(&q->n_packs_pair_left_g); + } + } /* Release the queue lock. */ if (lock_unlock(&q->lock) != 0) { error("Unlocking the qlock failed.\n"); diff --git a/src/queue.h b/src/queue.h index f04015b00a..61420d7fe7 100644 --- a/src/queue.h +++ b/src/queue.h @@ -76,13 +76,13 @@ struct queue { volatile unsigned int first_incoming, last_incoming, count_incoming; /*Number of pack tasks left in queue A. 
Nasar */ - int n_packs_self_left; /*Number of density pack tasks left in queue*/ - int n_packs_self_left_f; /*Number of force pack tasks left in queue*/ - int n_packs_self_left_g; /*Number of gradient pack tasks left in queue*/ + volatile int n_packs_self_left; /*Number of density pack tasks left in queue*/ + volatile int n_packs_self_left_f; /*Number of force pack tasks left in queue*/ + volatile int n_packs_self_left_g; /*Number of gradient pack tasks left in queue*/ - int n_packs_pair_left; - int n_packs_pair_left_f; - int n_packs_pair_left_g; + volatile int n_packs_pair_left; + volatile int n_packs_pair_left_f; + volatile int n_packs_pair_left_g; int n_packs_self_stolen; /*Number of density pack tasks left in queue*/ int n_packs_self_stolen_f; /*Number of force pack tasks left in queue*/ diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index 26948183b1..1484518bdb 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -205,7 +205,7 @@ double runner_doself1_pack_f4(struct runner *r, struct scheduler *s, pack_vars->tasks_packed++; pack_vars->launch = 0; pack_vars->launch_leftovers = 0; - if ((s->queues[qid].n_packs_self_left < 1)) pack_vars->launch_leftovers = 1; + if ((s->queues[qid].n_packs_self_left == 0)) pack_vars->launch_leftovers = 1; if (pack_vars->tasks_packed == pack_vars->target_n_tasks) pack_vars->launch = 1; /*Add time to packing_time. Timer for end of GPU work after the if(launch || @@ -326,7 +326,7 @@ double runner_doself1_pack_f4_g(struct runner *r, struct scheduler *s, pack_vars->tasks_packed++; pack_vars->launch = 0; pack_vars->launch_leftovers = 0; - if ((s->queues[qid].n_packs_self_left_g < 1)) + if ((s->queues[qid].n_packs_self_left_g == 0)) pack_vars->launch_leftovers = 1; if (pack_vars->tasks_packed == pack_vars->target_n_tasks) pack_vars->launch = 1; @@ -446,7 +446,7 @@ double runner_doself1_pack_f4_f(struct runner *r, struct scheduler *s, pack_vars->tasks_packed++; pack_vars->launch = 0; pack_vars->launch_leftovers = 0; - if ((s->queues[qid].n_packs_self_left_f < 1)) + if ((s->queues[qid].n_packs_self_left_f == 0)) pack_vars->launch_leftovers = 1; if (pack_vars->tasks_packed == pack_vars->target_n_tasks) pack_vars->launch = 1; @@ -641,7 +641,7 @@ double runner_dopair1_pack_f4(struct runner *r, struct scheduler *s, pack_vars->tasks_packed++; pack_vars->launch = 0; pack_vars->launch_leftovers = 0; - if ((s->queues[qid].n_packs_pair_left < 1)) pack_vars->launch_leftovers = 1; + if ((s->queues[qid].n_packs_pair_left == 0)) pack_vars->launch_leftovers = 1; if (pack_vars->tasks_packed == pack_vars->target_n_tasks) pack_vars->launch = 1; /*Add time to packing_time. 
Timer for end of GPU work after the if(launch || @@ -843,7 +843,7 @@ double runner_dopair1_pack_f4_g(struct runner *r, struct scheduler *s, pack_vars->tasks_packed++; pack_vars->launch = 0; pack_vars->launch_leftovers = 0; - if ((s->queues[qid].n_packs_pair_left_g < 1)) + if (s->queues[qid].n_packs_pair_left_g == 0) pack_vars->launch_leftovers = 1; if (pack_vars->tasks_packed == pack_vars->target_n_tasks) pack_vars->launch = 1; @@ -1047,7 +1047,7 @@ double runner_dopair1_pack_f4_f(struct runner *r, struct scheduler *s, pack_vars->tasks_packed++; pack_vars->launch = 0; pack_vars->launch_leftovers = 0; - if ((s->queues[qid].n_packs_pair_left_f < 1)) + if (s->queues[qid].n_packs_pair_left_f == 0) pack_vars->launch_leftovers = 1; if (pack_vars->tasks_packed == pack_vars->target_n_tasks) pack_vars->launch = 1; diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 746379a900..cbba08d33c 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -675,7 +675,7 @@ void *runner_main2(void *data) { int count_max_parts_tmp = 2 * target_n_tasks * (np_per_cell + buff); - message("np per cell %i, max_parts %i, n_tasks_GPU %i\n", np_per_cell, count_max_parts_tmp, target_n_tasks); +// message("np per cell %i, max_parts %i, n_tasks_GPU %i\n", np_per_cell, count_max_parts_tmp, target_n_tasks); pack_vars_self_dens->count_max_parts = count_max_parts_tmp; pack_vars_pair_dens->count_max_parts = count_max_parts_tmp; pack_vars_self_forc->count_max_parts = count_max_parts_tmp; @@ -1300,7 +1300,7 @@ void *runner_main2(void *data) { if(launch_leftovers && (sched->queues[r->qid].n_packs_pair_left < 0 || sched->queues[r->qid].n_packs_pair_left >= 1)) - error("Somethig's wrong. n_packs_pair_left = %i when it should be zero", + error("Something's wrong. n_packs_pair_left = %i when it should be zero", sched->queues[r->qid].n_packs_pair_left); /* Do we have enough stuff to run the GPU ? */ if (launch) n_full_p_d_bundles++; @@ -1386,7 +1386,7 @@ void *runner_main2(void *data) { if(launch_leftovers && (sched->queues[r->qid].n_packs_pair_left_g < 0 || sched->queues[r->qid].n_packs_pair_left_g >= 1)) - error("Somethig's wrong. n_packs_pair_left_g = %i when it should be zero", + error("Something's wrong. n_packs_pair_left_g = %i when it should be zero", sched->queues[r->qid].n_packs_pair_left_g); /* Do we have enough stuff to run the GPU ? */ if (launch || launch_leftovers) { diff --git a/src/scheduler.c b/src/scheduler.c index 52c42d9013..066a406273 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -3095,28 +3095,6 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) { atomic_inc(&s->waiting); /* Insert the task into that queue. */ queue_insert(&s->queues[qid], t); - /* A. Nasar: Increment counters required for the pack tasks */ - if (t->type == task_type_self || t->type == task_type_sub_self) { - if (t->subtype == task_subtype_gpu_pack) - atomic_inc(&s->queues[qid].n_packs_self_left); - if (t->subtype == task_subtype_gpu_pack_f) - atomic_inc(&s->queues[qid].n_packs_self_left_f); - if (t->subtype == task_subtype_gpu_pack_g) - atomic_inc(&s->queues[qid].n_packs_self_left_g); - } - /* A. 
Nasar NEED to think about how to do this with - MPI where ci may not be on this node/rank */ - if (t->type == task_type_pair || t->type == task_type_sub_pair) { - if (t->subtype == task_subtype_gpu_pack) { - atomic_inc(&s->queues[qid].n_packs_pair_left); - } - if (t->subtype == task_subtype_gpu_pack_f) { - atomic_inc(&s->queues[qid].n_packs_pair_left_f); - } - if (t->subtype == task_subtype_gpu_pack_g) { - atomic_inc(&s->queues[qid].n_packs_pair_left_g); - } - } } } @@ -3359,8 +3337,8 @@ struct task *scheduler_gettask(struct scheduler *s, int qid, /* Pick a queue at random among the non-empty ones */ const int ind = rand_r(&seed) % count; /*Get a pointer to the queue we're stealing from*/ - int qstl = qids[ind]; - struct queue * q_stl = &s->queues[qstl]; + int qstl_id = qids[ind]; + struct queue * q_stl = &s->queues[qstl_id]; /* Try to get a task from that random queue */ TIMER_TIC; res = queue_gettask(q_stl, prev, 0); From 5abe90bd967c82543263e50b1b55464acfbd7de0 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Tue, 7 Jan 2025 13:47:53 +0000 Subject: [PATCH 124/217] Changed if statements to atomic_CAS. Code hangs but the CAS does not seem to be working as intended. Probably incorrect usage of the CAS --- src/queue.c | 46 ++++++++++----------- src/runner_doiact_functions_hydro_gpu.h | 54 +++++++++++++++++-------- 2 files changed, 61 insertions(+), 39 deletions(-) diff --git a/src/queue.c b/src/queue.c index d488e74a07..0a3014d2d4 100644 --- a/src/queue.c +++ b/src/queue.c @@ -171,35 +171,35 @@ void queue_insert(struct queue *q, struct task *t) { /* Clean up the incoming DEQ. */ queue_get_incoming(q); - /* A. Nasar: Increment counters required for the pack tasks */ - if (t->type == task_type_self || t->type == task_type_sub_self) { - if (t->subtype == task_subtype_gpu_pack) - atomic_inc(&q->n_packs_self_left); - if (t->subtype == task_subtype_gpu_pack_f) - atomic_inc(&q->n_packs_self_left_f); - if (t->subtype == task_subtype_gpu_pack_g) - atomic_inc(&q->n_packs_self_left_g); - } - /* A. Nasar NEED to think about how to do this with - MPI where ci may not be on this node/rank */ - if (t->type == task_type_pair || t->type == task_type_sub_pair) { - if (t->subtype == task_subtype_gpu_pack) { - atomic_inc(&q->n_packs_pair_left); - } - if (t->subtype == task_subtype_gpu_pack_f) { - atomic_inc(&q->n_packs_pair_left_f); - } - if (t->subtype == task_subtype_gpu_pack_g) { - atomic_inc(&q->n_packs_pair_left_g); - } - } + /* Release the queue lock. */ if (lock_unlock(&q->lock) != 0) { error("Unlocking the qlock failed.\n"); } } } - + /* A. Nasar: Increment counters required for the pack tasks */ + if (t->type == task_type_self || t->type == task_type_sub_self) { + if (t->subtype == task_subtype_gpu_pack) + atomic_inc(&q->n_packs_self_left); + if (t->subtype == task_subtype_gpu_pack_f) + atomic_inc(&q->n_packs_self_left_f); + if (t->subtype == task_subtype_gpu_pack_g) + atomic_inc(&q->n_packs_self_left_g); + } + /* A. Nasar NEED to think about how to do this with + MPI where ci may not be on this node/rank */ + if (t->type == task_type_pair || t->type == task_type_sub_pair) { + if (t->subtype == task_subtype_gpu_pack) { + atomic_inc(&q->n_packs_pair_left); + } + if (t->subtype == task_subtype_gpu_pack_f) { + atomic_inc(&q->n_packs_pair_left_f); + } + if (t->subtype == task_subtype_gpu_pack_g) { + atomic_inc(&q->n_packs_pair_left_g); + } + } /* Increase the incoming count. 
*/ atomic_inc(&q->count_incoming); } diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index 1484518bdb..5598238822 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -205,7 +205,11 @@ double runner_doself1_pack_f4(struct runner *r, struct scheduler *s, pack_vars->tasks_packed++; pack_vars->launch = 0; pack_vars->launch_leftovers = 0; - if ((s->queues[qid].n_packs_self_left == 0)) pack_vars->launch_leftovers = 1; + + atomic_cas(&pack_vars->launch_leftovers, &(s->queues[qid].n_packs_self_left), 1); + + if(pack_vars->launch_leftovers == 1)message("Launching leftovers self"); +// if ((s->queues[qid].n_packs_self_left < 1)) pack_vars->launch_leftovers = 1; if (pack_vars->tasks_packed == pack_vars->target_n_tasks) pack_vars->launch = 1; /*Add time to packing_time. Timer for end of GPU work after the if(launch || @@ -326,8 +330,11 @@ double runner_doself1_pack_f4_g(struct runner *r, struct scheduler *s, pack_vars->tasks_packed++; pack_vars->launch = 0; pack_vars->launch_leftovers = 0; - if ((s->queues[qid].n_packs_self_left_g == 0)) - pack_vars->launch_leftovers = 1; + + atomic_cas(&pack_vars->launch_leftovers, &(s->queues[qid].n_packs_self_left_g), 1); + if(pack_vars->launch_leftovers == 1)message("Launching leftovers self g"); +// if ((s->queues[qid].n_packs_self_left_g < 1)) +// pack_vars->launch_leftovers = 1; if (pack_vars->tasks_packed == pack_vars->target_n_tasks) pack_vars->launch = 1; /*Add time to packing_time. Timer for end of GPU work after the if(launch || @@ -446,8 +453,12 @@ double runner_doself1_pack_f4_f(struct runner *r, struct scheduler *s, pack_vars->tasks_packed++; pack_vars->launch = 0; pack_vars->launch_leftovers = 0; - if ((s->queues[qid].n_packs_self_left_f == 0)) - pack_vars->launch_leftovers = 1; + + atomic_cas(&pack_vars->launch_leftovers, &(s->queues[qid].n_packs_self_left_f), 1); + + if(pack_vars->launch_leftovers == 1)message("Launching leftovers self f"); +// if ((s->queues[qid].n_packs_self_left_f < 1)) +// pack_vars->launch_leftovers = 1; if (pack_vars->tasks_packed == pack_vars->target_n_tasks) pack_vars->launch = 1; /*Add time to packing_time. Timer for end of GPU work after the if(launch || @@ -632,8 +643,6 @@ double runner_dopair1_pack_f4(struct runner *r, struct scheduler *s, } /* Record that we have now done a packing (self) */ - int qid = r->qid; - atomic_dec(&(s->queues[qid].n_packs_pair_left)); t->done = 1; /* Copies done. Release the lock ! */ cell_unlocktree(ci); @@ -641,7 +650,13 @@ double runner_dopair1_pack_f4(struct runner *r, struct scheduler *s, pack_vars->tasks_packed++; pack_vars->launch = 0; pack_vars->launch_leftovers = 0; - if ((s->queues[qid].n_packs_pair_left == 0)) pack_vars->launch_leftovers = 1; + /* Record that we have now done a packing (self) */ + int qid = r->qid; + atomic_dec(&(s->queues[qid].n_packs_pair_left)); +// if ((s->queues[qid].n_packs_pair_left < 1)) pack_vars->launch_leftovers = 1; + atomic_cas(&pack_vars->launch_leftovers, &(s->queues[qid].n_packs_pair_left), 1); + + if(pack_vars->launch_leftovers == 1)message("Launching leftovers pair"); if (pack_vars->tasks_packed == pack_vars->target_n_tasks) pack_vars->launch = 1; /*Add time to packing_time. 
Timer for end of GPU work after the if(launch || @@ -834,8 +849,6 @@ double runner_dopair1_pack_f4_g(struct runner *r, struct scheduler *s, } /* Record that we have now done a packing (self) */ - int qid = r->qid; - atomic_dec(&(s->queues[qid].n_packs_pair_left_g)); t->done = 1; /* Copies done. Release the lock ! */ cell_unlocktree(ci); @@ -843,8 +856,13 @@ double runner_dopair1_pack_f4_g(struct runner *r, struct scheduler *s, pack_vars->tasks_packed++; pack_vars->launch = 0; pack_vars->launch_leftovers = 0; - if (s->queues[qid].n_packs_pair_left_g == 0) - pack_vars->launch_leftovers = 1; + /* Record that we have now done a packing (self) */ + int qid = r->qid; + atomic_dec(&(s->queues[qid].n_packs_pair_left_g)); + atomic_cas(&pack_vars->launch_leftovers, &(s->queues[qid].n_packs_pair_left_g), 1); + if(pack_vars->launch_leftovers == 1)message("Launching leftovers pair g"); +// if ((s->queues[qid].n_packs_pair_left_g < 1)) +// pack_vars->launch_leftovers = 1; if (pack_vars->tasks_packed == pack_vars->target_n_tasks) pack_vars->launch = 1; /*Add time to packing_time. Timer for end of GPU work after the if(launch || @@ -1038,8 +1056,6 @@ double runner_dopair1_pack_f4_f(struct runner *r, struct scheduler *s, } /* Record that we have now done a packing (self) */ - int qid = r->qid; - atomic_dec(&(s->queues[qid].n_packs_pair_left_f)); t->done = 1; /* Copies done. Release the lock ! */ cell_unlocktree(ci); @@ -1047,8 +1063,14 @@ double runner_dopair1_pack_f4_f(struct runner *r, struct scheduler *s, pack_vars->tasks_packed++; pack_vars->launch = 0; pack_vars->launch_leftovers = 0; - if (s->queues[qid].n_packs_pair_left_f == 0) - pack_vars->launch_leftovers = 1; + /* Record that we have now done a packing (self) */ + int qid = r->qid; + atomic_dec(&(s->queues[qid].n_packs_pair_left_f)); + atomic_cas(&pack_vars->launch_leftovers, &(s->queues[qid].n_packs_pair_left_f), 1); + + if(pack_vars->launch_leftovers == 1)message("Launching leftovers pair f"); +// if ((s->queues[qid].n_packs_pair_left_f < 1)) +// pack_vars->launch_leftovers = 1; if (pack_vars->tasks_packed == pack_vars->target_n_tasks) pack_vars->launch = 1; /*Add time to packing_time. Timer for end of GPU work after the if(launch || From 5082ca9d51ba63e73430901628760d2a2ace1ee8 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Tue, 7 Jan 2025 13:52:30 +0000 Subject: [PATCH 125/217] Changed comp val to int in CAS instead of int *. Removed debug code. 
CAS seems to work --- src/runner_doiact_functions_hydro_gpu.h | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index 5598238822..95b02c2049 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -206,9 +206,8 @@ double runner_doself1_pack_f4(struct runner *r, struct scheduler *s, pack_vars->launch = 0; pack_vars->launch_leftovers = 0; - atomic_cas(&pack_vars->launch_leftovers, &(s->queues[qid].n_packs_self_left), 1); + atomic_cas(&pack_vars->launch_leftovers, (s->queues[qid].n_packs_self_left), 1); - if(pack_vars->launch_leftovers == 1)message("Launching leftovers self"); // if ((s->queues[qid].n_packs_self_left < 1)) pack_vars->launch_leftovers = 1; if (pack_vars->tasks_packed == pack_vars->target_n_tasks) pack_vars->launch = 1; @@ -331,8 +330,7 @@ double runner_doself1_pack_f4_g(struct runner *r, struct scheduler *s, pack_vars->launch = 0; pack_vars->launch_leftovers = 0; - atomic_cas(&pack_vars->launch_leftovers, &(s->queues[qid].n_packs_self_left_g), 1); - if(pack_vars->launch_leftovers == 1)message("Launching leftovers self g"); + atomic_cas(&pack_vars->launch_leftovers, (s->queues[qid].n_packs_self_left_g), 1); // if ((s->queues[qid].n_packs_self_left_g < 1)) // pack_vars->launch_leftovers = 1; if (pack_vars->tasks_packed == pack_vars->target_n_tasks) @@ -454,9 +452,7 @@ double runner_doself1_pack_f4_f(struct runner *r, struct scheduler *s, pack_vars->launch = 0; pack_vars->launch_leftovers = 0; - atomic_cas(&pack_vars->launch_leftovers, &(s->queues[qid].n_packs_self_left_f), 1); - - if(pack_vars->launch_leftovers == 1)message("Launching leftovers self f"); + atomic_cas(&pack_vars->launch_leftovers, (s->queues[qid].n_packs_self_left_f), 1); // if ((s->queues[qid].n_packs_self_left_f < 1)) // pack_vars->launch_leftovers = 1; if (pack_vars->tasks_packed == pack_vars->target_n_tasks) @@ -654,9 +650,8 @@ double runner_dopair1_pack_f4(struct runner *r, struct scheduler *s, int qid = r->qid; atomic_dec(&(s->queues[qid].n_packs_pair_left)); // if ((s->queues[qid].n_packs_pair_left < 1)) pack_vars->launch_leftovers = 1; - atomic_cas(&pack_vars->launch_leftovers, &(s->queues[qid].n_packs_pair_left), 1); + atomic_cas(&pack_vars->launch_leftovers, (s->queues[qid].n_packs_pair_left), 1); - if(pack_vars->launch_leftovers == 1)message("Launching leftovers pair"); if (pack_vars->tasks_packed == pack_vars->target_n_tasks) pack_vars->launch = 1; /*Add time to packing_time. 
Timer for end of GPU work after the if(launch || @@ -859,8 +854,7 @@ double runner_dopair1_pack_f4_g(struct runner *r, struct scheduler *s, /* Record that we have now done a packing (self) */ int qid = r->qid; atomic_dec(&(s->queues[qid].n_packs_pair_left_g)); - atomic_cas(&pack_vars->launch_leftovers, &(s->queues[qid].n_packs_pair_left_g), 1); - if(pack_vars->launch_leftovers == 1)message("Launching leftovers pair g"); + atomic_cas(&pack_vars->launch_leftovers, (s->queues[qid].n_packs_pair_left_g), 1); // if ((s->queues[qid].n_packs_pair_left_g < 1)) // pack_vars->launch_leftovers = 1; if (pack_vars->tasks_packed == pack_vars->target_n_tasks) @@ -1066,9 +1060,8 @@ double runner_dopair1_pack_f4_f(struct runner *r, struct scheduler *s, /* Record that we have now done a packing (self) */ int qid = r->qid; atomic_dec(&(s->queues[qid].n_packs_pair_left_f)); - atomic_cas(&pack_vars->launch_leftovers, &(s->queues[qid].n_packs_pair_left_f), 1); + atomic_cas(&pack_vars->launch_leftovers, (s->queues[qid].n_packs_pair_left_f), 1); - if(pack_vars->launch_leftovers == 1)message("Launching leftovers pair f"); // if ((s->queues[qid].n_packs_pair_left_f < 1)) // pack_vars->launch_leftovers = 1; if (pack_vars->tasks_packed == pack_vars->target_n_tasks) From 38953c40af0cfeea229151bc45c68563de448f4d Mon Sep 17 00:00:00 2001 From: Abouzied Date: Tue, 7 Jan 2025 13:54:40 +0000 Subject: [PATCH 126/217] Removed debug code which throws error if launching leftovers but still have GPU tasks in the queue --- src/runner_main_clean.cu | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index cbba08d33c..500ce645c7 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -1297,11 +1297,11 @@ void *runner_main2(void *data) { int launch = pack_vars_pair_dens->launch; int launch_leftovers = pack_vars_pair_dens->launch_leftovers; // if(sched->queues[r->qid].n_packs_pair_left < 1) launch_leftovers = 1; - if(launch_leftovers && - (sched->queues[r->qid].n_packs_pair_left < 0 - || sched->queues[r->qid].n_packs_pair_left >= 1)) - error("Something's wrong. n_packs_pair_left = %i when it should be zero", - sched->queues[r->qid].n_packs_pair_left); +// if(launch_leftovers && +// (sched->queues[r->qid].n_packs_pair_left < 0 +// || sched->queues[r->qid].n_packs_pair_left >= 1)) +// error("Something's wrong. n_packs_pair_left = %i when it should be zero", +// sched->queues[r->qid].n_packs_pair_left); /* Do we have enough stuff to run the GPU ? */ if (launch) n_full_p_d_bundles++; if (launch_leftovers) n_partial_p_d_bundles++; @@ -1383,11 +1383,11 @@ void *runner_main2(void *data) { // if(sched->queues[r->qid].n_packs_pair_left_g < 1) launch_leftovers = 1; /*Packed enough tasks, let's go*/ int launch = pack_vars_pair_grad->launch; - if(launch_leftovers && - (sched->queues[r->qid].n_packs_pair_left_g < 0 - || sched->queues[r->qid].n_packs_pair_left_g >= 1)) - error("Something's wrong. n_packs_pair_left_g = %i when it should be zero", - sched->queues[r->qid].n_packs_pair_left_g); +// if(launch_leftovers && +// (sched->queues[r->qid].n_packs_pair_left_g < 0 +// || sched->queues[r->qid].n_packs_pair_left_g >= 1)) +// error("Something's wrong. n_packs_pair_left_g = %i when it should be zero", +// sched->queues[r->qid].n_packs_pair_left_g); /* Do we have enough stuff to run the GPU ? 
*/ if (launch || launch_leftovers) { /*Launch GPU tasks*/ @@ -1466,12 +1466,12 @@ void *runner_main2(void *data) { /* No pack tasks left in queue, flag that we want to run */ int launch_leftovers = pack_vars_pair_forc->launch_leftovers; -// if(sched->queues[r->qid].n_packs_pair_left_g < 1) launch_leftovers = 1; - if(launch_leftovers && - (sched->queues[r->qid].n_packs_pair_left_f < 0 - || sched->queues[r->qid].n_packs_pair_left_f >= 1)) - error("Somethig's wrong. n_packs_pair_left_f = %i when it should be zero", - sched->queues[r->qid].n_packs_pair_left_f); +//// if(sched->queues[r->qid].n_packs_pair_left_g < 1) launch_leftovers = 1; +// if(launch_leftovers && +// (sched->queues[r->qid].n_packs_pair_left_f < 0 +// || sched->queues[r->qid].n_packs_pair_left_f >= 1)) +// error("Somethig's wrong. n_packs_pair_left_f = %i when it should be zero", +// sched->queues[r->qid].n_packs_pair_left_f); /*Packed enough tasks let's go*/ int launch = pack_vars_pair_forc->launch; /* Do we have enough stuff to run the GPU ? */ From c7e6a6c1e09368c7cb1e02bfeb575129b9ea4efc Mon Sep 17 00:00:00 2001 From: Abouzied Date: Tue, 7 Jan 2025 16:54:44 +0000 Subject: [PATCH 127/217] Combined atomic dec and CAS calls into one operation. Code seems to run for longer before hanging. Need to investigate why! --- src/queue.c | 33 ++++++++++++++++++++----- src/runner_doiact_functions_hydro_gpu.h | 25 ++++++++++--------- 2 files changed, 40 insertions(+), 18 deletions(-) diff --git a/src/queue.c b/src/queue.c index 0a3014d2d4..0a330ce0e1 100644 --- a/src/queue.c +++ b/src/queue.c @@ -180,24 +180,45 @@ void queue_insert(struct queue *q, struct task *t) { } /* A. Nasar: Increment counters required for the pack tasks */ if (t->type == task_type_self || t->type == task_type_sub_self) { - if (t->subtype == task_subtype_gpu_pack) + if (t->subtype == task_subtype_gpu_pack){ +// pthread_mutex_lock(&q->sleep_mutex); atomic_inc(&q->n_packs_self_left); - if (t->subtype == task_subtype_gpu_pack_f) +// pthread_cond_broadcast(&q->sleep_cond); +// pthread_mutex_unlock(&q->sleep_mutex); + } + if (t->subtype == task_subtype_gpu_pack_f){ +// pthread_mutex_lock(&q->sleep_mutex); atomic_inc(&q->n_packs_self_left_f); - if (t->subtype == task_subtype_gpu_pack_g) +// pthread_cond_broadcast(&q->sleep_cond); +// pthread_mutex_unlock(&q->sleep_mutex); + } + if (t->subtype == task_subtype_gpu_pack_g){ +// pthread_mutex_lock(&q->sleep_mutex); atomic_inc(&q->n_packs_self_left_g); +// pthread_cond_broadcast(&q->sleep_cond); +// pthread_mutex_unlock(&q->sleep_mutex); + } } /* A. Nasar NEED to think about how to do this with MPI where ci may not be on this node/rank */ if (t->type == task_type_pair || t->type == task_type_sub_pair) { if (t->subtype == task_subtype_gpu_pack) { - atomic_inc(&q->n_packs_pair_left); +// pthread_mutex_lock(&q->sleep_mutex); + atomic_inc(&q->n_packs_pair_left); +// pthread_cond_broadcast(&q->sleep_cond); +// pthread_mutex_unlock(&q->sleep_mutex); } if (t->subtype == task_subtype_gpu_pack_f) { - atomic_inc(&q->n_packs_pair_left_f); +// pthread_mutex_lock(&q->sleep_mutex); + atomic_inc(&q->n_packs_pair_left_f); +// pthread_cond_broadcast(&q->sleep_cond); +// pthread_mutex_unlock(&q->sleep_mutex); } if (t->subtype == task_subtype_gpu_pack_g) { - atomic_inc(&q->n_packs_pair_left_g); +// pthread_mutex_lock(&q->sleep_mutex); + atomic_inc(&q->n_packs_pair_left_g); +// pthread_cond_broadcast(&q->sleep_cond); +// pthread_mutex_unlock(&q->sleep_mutex); } } /* Increase the incoming count. 
*/ diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index 95b02c2049..7023ab49cc 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -1,4 +1,5 @@ #include "scheduler.h" +#include struct pack_vars_self { /*List of tasks and respective cells to be packed*/ struct task **task_list; @@ -200,13 +201,13 @@ double runner_doself1_pack_f4(struct runner *r, struct scheduler *s, ci->pack_done++; /* Record that we have now done a packing (self) */ int qid = r->qid; - atomic_dec(&(s->queues[qid].n_packs_self_left)); +// atomic_dec(&(s->queues[qid].n_packs_self_left)); t->done = 1; pack_vars->tasks_packed++; pack_vars->launch = 0; pack_vars->launch_leftovers = 0; - atomic_cas(&pack_vars->launch_leftovers, (s->queues[qid].n_packs_self_left), 1); + atomic_cas(&pack_vars->launch_leftovers, __sync_sub_and_fetch(&(s->queues[qid].n_packs_self_left), 1), 1); // if ((s->queues[qid].n_packs_self_left < 1)) pack_vars->launch_leftovers = 1; if (pack_vars->tasks_packed == pack_vars->target_n_tasks) @@ -324,13 +325,13 @@ double runner_doself1_pack_f4_g(struct runner *r, struct scheduler *s, ci->pack_done_g++; /* Record that we have now done a packing (self) */ int qid = r->qid; - atomic_dec(&(s->queues[qid].n_packs_self_left_g)); +// atomic_dec(&(s->queues[qid].n_packs_self_left_g)); t->done = 1; pack_vars->tasks_packed++; pack_vars->launch = 0; pack_vars->launch_leftovers = 0; - atomic_cas(&pack_vars->launch_leftovers, (s->queues[qid].n_packs_self_left_g), 1); + atomic_cas(&pack_vars->launch_leftovers, __sync_sub_and_fetch(&(s->queues[qid].n_packs_self_left_g), 1), 1); // if ((s->queues[qid].n_packs_self_left_g < 1)) // pack_vars->launch_leftovers = 1; if (pack_vars->tasks_packed == pack_vars->target_n_tasks) @@ -446,13 +447,13 @@ double runner_doself1_pack_f4_f(struct runner *r, struct scheduler *s, ci->pack_done_f++; /* Record that we have now done a packing (self) */ int qid = r->qid; - atomic_dec(&(s->queues[qid].n_packs_self_left_f)); +// atomic_dec(&(s->queues[qid].n_packs_self_left_f)); t->done = 1; pack_vars->tasks_packed++; pack_vars->launch = 0; pack_vars->launch_leftovers = 0; - atomic_cas(&pack_vars->launch_leftovers, (s->queues[qid].n_packs_self_left_f), 1); + atomic_cas(&pack_vars->launch_leftovers, __sync_sub_and_fetch(&(s->queues[qid].n_packs_self_left_f), 1), 1); // if ((s->queues[qid].n_packs_self_left_f < 1)) // pack_vars->launch_leftovers = 1; if (pack_vars->tasks_packed == pack_vars->target_n_tasks) @@ -648,9 +649,9 @@ double runner_dopair1_pack_f4(struct runner *r, struct scheduler *s, pack_vars->launch_leftovers = 0; /* Record that we have now done a packing (self) */ int qid = r->qid; - atomic_dec(&(s->queues[qid].n_packs_pair_left)); +// atomic_dec(&(s->queues[qid].n_packs_pair_left)); // if ((s->queues[qid].n_packs_pair_left < 1)) pack_vars->launch_leftovers = 1; - atomic_cas(&pack_vars->launch_leftovers, (s->queues[qid].n_packs_pair_left), 1); + atomic_cas(&pack_vars->launch_leftovers, __sync_sub_and_fetch(&(s->queues[qid].n_packs_pair_left), 1), 1); if (pack_vars->tasks_packed == pack_vars->target_n_tasks) pack_vars->launch = 1; @@ -853,8 +854,8 @@ double runner_dopair1_pack_f4_g(struct runner *r, struct scheduler *s, pack_vars->launch_leftovers = 0; /* Record that we have now done a packing (self) */ int qid = r->qid; - atomic_dec(&(s->queues[qid].n_packs_pair_left_g)); - atomic_cas(&pack_vars->launch_leftovers, (s->queues[qid].n_packs_pair_left_g), 1); +// 
atomic_dec(&(s->queues[qid].n_packs_pair_left_g)); + atomic_cas(&pack_vars->launch_leftovers, __sync_sub_and_fetch(&(s->queues[qid].n_packs_pair_left_g), 1), 1); // if ((s->queues[qid].n_packs_pair_left_g < 1)) // pack_vars->launch_leftovers = 1; if (pack_vars->tasks_packed == pack_vars->target_n_tasks) @@ -1059,8 +1060,8 @@ double runner_dopair1_pack_f4_f(struct runner *r, struct scheduler *s, pack_vars->launch_leftovers = 0; /* Record that we have now done a packing (self) */ int qid = r->qid; - atomic_dec(&(s->queues[qid].n_packs_pair_left_f)); - atomic_cas(&pack_vars->launch_leftovers, (s->queues[qid].n_packs_pair_left_f), 1); +// atomic_dec(&(s->queues[qid].n_packs_pair_left_f)); + atomic_cas(&pack_vars->launch_leftovers, __sync_sub_and_fetch(&(s->queues[qid].n_packs_pair_left_f), 1), 1); // if ((s->queues[qid].n_packs_pair_left_f < 1)) // pack_vars->launch_leftovers = 1; From f3e313846c31602b0413d337846a0a7f7f0deac5 Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Wed, 8 Jan 2025 13:00:46 +0000 Subject: [PATCH 128/217] Removed un-necesseary mods from swift.c --- examples/HydroTests/GreshoVortex_3D/gresho.yml | 2 +- src/cuda/part_gpu.h | 2 +- src/engine_config.c | 10 +++++----- swift.c | 4 ++-- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/examples/HydroTests/GreshoVortex_3D/gresho.yml b/examples/HydroTests/GreshoVortex_3D/gresho.yml index b96f552291..bb9c4a943f 100644 --- a/examples/HydroTests/GreshoVortex_3D/gresho.yml +++ b/examples/HydroTests/GreshoVortex_3D/gresho.yml @@ -40,4 +40,4 @@ SPH: InitialConditions: file_name: ./greshoVortex.hdf5 # The file to read periodic: 1 - replicate: 2 +# replicate: 2 diff --git a/src/cuda/part_gpu.h b/src/cuda/part_gpu.h index 92d12f45bd..a9153fecd5 100644 --- a/src/cuda/part_gpu.h +++ b/src/cuda/part_gpu.h @@ -9,7 +9,7 @@ typedef int8_t timebin_t; extern "C" { #endif -#include "/usr/local/cuda-12.2/targets/x86_64-linux/include/vector_types.h" +#include "/usr/local/cuda-12.6/targets/x86_64-linux/include/vector_types.h" typedef struct part_soa { /*Task ID*/ diff --git a/src/engine_config.c b/src/engine_config.c index ff3ff5ec9f..70d3cfd1d4 100644 --- a/src/engine_config.c +++ b/src/engine_config.c @@ -923,12 +923,12 @@ void engine_config(int restart, int fof, struct engine *e, parser_get_opt_param_float(params, "Scheduler:links_per_tasks", 25.); /* Init the scheduler. Allow stealing*/ - scheduler_init(&e->sched, e->s, maxtasks, nr_queues, - (e->policy & scheduler_flag_steal), e->nodeID, - &e->threadpool); +// scheduler_init(&e->sched, e->s, maxtasks, nr_queues, +// (e->policy & scheduler_flag_steal), e->nodeID, +// &e->threadpool); /* Init the scheduler. NO stealing A. Nasar */ -// scheduler_init(&e->sched, e->s, maxtasks, nr_queues, 0, e->nodeID, -// &e->threadpool); + scheduler_init(&e->sched, e->s, maxtasks, nr_queues, 0, e->nodeID, + &e->threadpool); /* Maximum size of MPI task messages, in KB, that should not be buffered, * that is sent using MPI_Issend, not MPI_Isend. 4Mb by default. 
Can be diff --git a/swift.c b/swift.c index 7a9277ae5c..b63941cd63 100644 --- a/swift.c +++ b/swift.c @@ -1108,7 +1108,7 @@ int main(int argc, char *argv[]) { hydro_props_init(&hydro_properties, &prog_const, &us, params); else bzero(&hydro_properties, sizeof(struct hydro_props)); - float eta_neighbours = hydro_properties.eta_neighbours; + /* Initialise the equation of state */ if (with_hydro) eos_init(&eos, &prog_const, &us, params); @@ -1388,7 +1388,7 @@ int main(int argc, char *argv[]) { with_self_gravity, with_star_formation, with_sinks, with_DM_particles, with_DM_background_particles, with_neutrinos, talking, dry_run, nr_nodes); - s.eta_neighbours = eta_neighbours; + /* Initialise the line of sight properties. */ if (with_line_of_sight) los_init(s.dim, &los_properties, params); From a58d9681662c2a6bd5b9fb7c7ac2c5618f45d8bf Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Wed, 8 Jan 2025 13:03:09 +0000 Subject: [PATCH 129/217] Added comment to explain changes in task.c --- src/task.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/task.c b/src/task.c index b45bc4fdf8..4142a76674 100644 --- a/src/task.c +++ b/src/task.c @@ -1746,12 +1746,13 @@ void task_dump_active(struct engine *e) { /* Get destination rank of MPI requests. */ int paired = (t->cj != NULL); int otherrank = 0; + //A. N.: Mods requied to stop code crashing when debugging GPU tasks if(t->subtype!= task_subtype_gpu_unpack && t->subtype!= task_subtype_gpu_unpack_f && t->subtype!= task_subtype_gpu_unpack_g) otherrank = t->ci->nodeID; if (paired && t->subtype!= task_subtype_gpu_unpack && t->subtype!= task_subtype_gpu_unpack_f && t->subtype!= task_subtype_gpu_unpack_g) - otherrank = t->cj->nodeID;; + otherrank = t->cj->nodeID; fprintf(file_thread, "%i %i %s %s %i %i %lli %lli %i %i %i %i %lli\n", engine_rank, otherrank, taskID_names[t->type], From 4513dca5a6c5086ea04356b59e38d8a5b5ccf8b1 Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Wed, 8 Jan 2025 13:04:26 +0000 Subject: [PATCH 130/217] Removed un-necessary code from space.h --- src/space.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/space.h b/src/space.h index 870ec7aae1..825dc49d60 100644 --- a/src/space.h +++ b/src/space.h @@ -94,9 +94,6 @@ extern double engine_foreign_alloc_margin; */ struct space { - /*Used to define GPU task memory allocation*/ - float eta_neighbours; - /*! Spatial extent. */ double dim[3]; From cf16ae4ae4afc6a3eaf920d6b50f19f7c61ce991 Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Wed, 8 Jan 2025 13:57:48 +0000 Subject: [PATCH 131/217] Reverted swift.c and space.h as eta_neighbours was indeed required for GPU memory allocation. Removed packs_stolen counter update from scheduler_gettask(). Cleaned up scheduler.c --- src/scheduler.c | 272 ++++-------------------------------------------- src/space.h | 3 + swift.c | 4 +- 3 files changed, 26 insertions(+), 253 deletions(-) diff --git a/src/scheduler.c b/src/scheduler.c index 066a406273..f5b2dd972e 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -1369,205 +1369,6 @@ static void scheduler_splittask_hydro(struct task *t, struct scheduler *s) { } else if (scheduler_doforcesplit && ci->split && cj->split && (ci->hydro.count > space_maxsize / cj->hydro.count)) { - /* Replace the current task.A. 
Nasar: Code does NOT go in here even - * with doforcesplit defined as 1 in scheduler.h */ - t->type = task_type_none; - - for (int j = 0; j < 8; j++) - if (ci->progeny[j] != NULL && ci->progeny[j]->hydro.count) - for (int k = 0; k < 8; k++) - if (cj->progeny[k] != NULL && cj->progeny[k]->hydro.count) { - struct task *tl = - scheduler_addtask(s, task_type_pair, t->subtype, 0, 0, - ci->progeny[j], cj->progeny[k]); - scheduler_splittask_hydro(tl, s); - tl->flags = space_getsid_and_swap_cells(s->space, &t->ci, - &t->cj, shift); - } - } - } /* pair interaction? */ - } /* iterate over the current task. */ -} - -/** - * @brief Split a hydrodynamic task if too large. - * - * @param t The #task - * @param s The #scheduler we are working in. - */ -static void scheduler_splittask_hydro_GPU(struct task *t, struct scheduler *s) { - /* Are we considering both stars and hydro when splitting? */ - /* Note this is not very clean as the scheduler should not really - access the engine... */ - - /* Iterate on this task until we're done with it. */ - int redo = 1; - while (redo) { - /* Reset the redo flag. */ - redo = 0; - - /* Is this a non-empty self-task? */ - const int is_self = - (t->type == task_type_self) && (t->ci != NULL) && - (t->ci->hydro.count > 0); - - /* Is this a non-empty pair-task? */ - const int is_pair = (t->type == task_type_pair) && (t->ci != NULL) && - (t->cj != NULL) && - (t->ci->hydro.count > 0) && - (t->cj->hydro.count > 0); - - /* Empty task? */ - if (!is_self && !is_pair) { - t->type = task_type_none; - t->subtype = task_subtype_none; - t->ci = NULL; - t->cj = NULL; - t->skip = 1; - break; - } - - /* Self-interaction? */ - if (t->type == task_type_self) { - /* Get a handle on the cell involved. */ - struct cell *ci = t->ci; - - /* Foreign task? */ - if (ci->nodeID != s->nodeID) { - t->skip = 1; - break; - } - - /* Is this cell even split and the task does not violate h ? */ - if (cell_can_split_self_hydro_task(ci)) { - - /* Take a step back (we're going to recycle the current task)... */ - redo = 1; - - /* Add the self tasks. */ - int first_child = 0; - while (ci->progeny[first_child] == NULL) first_child++; - - t->ci = ci->progeny[first_child]; - cell_set_flag(t->ci, cell_flag_has_tasks); - - for (int k = first_child + 1; k < 8; k++) { - /* Do we have a non-empty progenitor? */ - if (ci->progeny[k] != NULL && - ci->progeny[k]->hydro.count) { - scheduler_splittask_hydro_GPU( - scheduler_addtask(s, task_type_self, t->subtype, 0, 0, - ci->progeny[k], NULL), - s); - } - } - - /* Make a task for each pair of progeny */ - for (int j = 0; j < 8; j++) { - /* Do we have a non-empty progenitor? */ - if (ci->progeny[j] != NULL && - (ci->progeny[j]->hydro.count)) { - for (int k = j + 1; k < 8; k++) { - /* Do we have a second non-empty progenitor? */ - if (ci->progeny[k] != NULL && - (ci->progeny[k]->hydro.count)) { - scheduler_splittask_hydro_GPU( - scheduler_addtask(s, task_type_pair, t->subtype, - sub_sid_flag[j][k], 0, ci->progeny[j], - ci->progeny[k]), - s); - } - } - } - } - - } /* Cell is split */ - - } /* Self interaction */ - - /* Pair interaction? */ - else if (t->type == task_type_pair) { - /* Get a handle on the cells involved. */ - struct cell *ci = t->ci; - struct cell *cj = t->cj; - - /* Foreign task? */ - if (ci->nodeID != s->nodeID && cj->nodeID != s->nodeID) { - t->skip = 1; - break; - } - - /* Get the sort ID, use space_getsid_and_swap_cells and not t->flags - to make sure we get ci and cj swapped if needed. 
*/ - double shift[3]; - const int sid = space_getsid_and_swap_cells(s->space, &ci, &cj, shift); - -#ifdef SWIFT_DEBUG_CHECKS - if (sid != t->flags) - error("Got pair task with incorrect flags: sid=%d flags=%lld", sid, - t->flags); -#endif - - if((cell_can_split_pair_hydro_task(ci) && - !cell_can_split_pair_hydro_task(cj)) - || !cell_can_split_pair_hydro_task(ci) && - cell_can_split_pair_hydro_task(cj)) - error("for some reason cell i can be split and cell j not"); - /* Should this task be split-up? */ - if (cell_can_split_pair_hydro_task(ci) && - cell_can_split_pair_hydro_task(cj)) { - - const int h_count_i = ci->hydro.count; - const int h_count_j = cj->hydro.count; - -// const int s_count_i = ci->stars.count; -// const int s_count_j = cj->stars.count; -// -// int do_sub_hydro = 1; -// if (h_count_i > 0 && h_count_j > 0) { -// -// /* Note: Use division to avoid integer overflow. */ -// do_sub_hydro = -// h_count_i * sid_scale[sid] < space_subsize_pair_hydro / h_count_j; -// } - - /* Replace by a single sub-task? */ -// if (scheduler_dosub && -// (do_sub_hydro) && -// !sort_is_corner(sid)) { -// -// /* Make this task a sub task. */ -// t->type = task_type_sub_pair; -// -// /* Otherwise, split it. */ -// } else { - /* Take a step back (we're going to recycle the current task)... */ - redo = 1; - - /* Loop over the sub-cell pairs for the current sid and add new tasks - * for them. */ - struct cell_split_pair *csp = &cell_split_pairs[sid]; - - t->ci = ci->progeny[csp->pairs[0].pid]; - t->cj = cj->progeny[csp->pairs[0].pjd]; -// if (t->ci != NULL) cell_set_flag(t->ci, cell_flag_has_tasks); -// if (t->cj != NULL) cell_set_flag(t->cj, cell_flag_has_tasks); -// -// t->flags = csp->pairs[0].sid; -// for (int k = 1; k < csp->count; k++) { -// scheduler_splittask_hydro_GPU( -// scheduler_addtask(s, task_type_pair, t->subtype, -// csp->pairs[k].sid, 0, -// ci->progeny[csp->pairs[k].pid], -// cj->progeny[csp->pairs[k].pjd]), -// s); -// } -// } - - /* Otherwise, break it up if it is too large? */ -// } else if (scheduler_doforcesplit && ci->split && cj->split && -// (ci->hydro.count > space_maxsize / cj->hydro.count)) { - /* Replace the current task. */ t->type = task_type_none; @@ -1578,7 +1379,7 @@ static void scheduler_splittask_hydro_GPU(struct task *t, struct scheduler *s) { struct task *tl = scheduler_addtask(s, task_type_pair, t->subtype, 0, 0, ci->progeny[j], cj->progeny[k]); - scheduler_splittask_hydro_GPU(tl, s); + scheduler_splittask_hydro(tl, s); tl->flags = space_getsid_and_swap_cells(s->space, &t->ci, &t->cj, shift); } @@ -2733,24 +2534,18 @@ void scheduler_enqueue_mapper(void *map_data, int num_elements, */ void scheduler_start(struct scheduler *s) { for (int i = 0; i < s->nr_queues; i++) { // A. Nasar -// if(s->queues[i].n_packs_self_left_f > 0 && s->e->time > 0.0){ -// message("time %f", s->e->time); -// error("We did not complete all density pack tasks. n left %i", s->queues[i].n_packs_self_left_f); -// } s->queues[i].n_packs_self_left = 0; s->queues[i].n_packs_pair_left = 0; s->queues[i].n_packs_self_left_f = 0; s->queues[i].n_packs_pair_left_f = 0; s->queues[i].n_packs_self_left_g = 0; s->queues[i].n_packs_pair_left_g = 0; - s->queues[i].n_packs_self_stolen = 0; s->queues[i].n_packs_pair_stolen = 0; s->queues[i].n_packs_self_stolen_f = 0; s->queues[i].n_packs_pair_stolen_f = 0; s->queues[i].n_packs_self_stolen_g = 0; s->queues[i].n_packs_pair_stolen_g = 0; - } /* Re-wait the tasks. 
*/ if (s->active_count > 1000) { @@ -3306,6 +3101,8 @@ struct task *scheduler_gettask(struct scheduler *s, int qid, /* Check qid. */ if (qid >= nr_queues || qid < 0) error("Bad queue ID."); + /*Get a pointer to our queue for re-use*/ + struct queue * q = &s->queues[qid]; /* Loop as long as there are tasks... */ while (s->waiting > 0 && res == NULL) { /* Try more than once before sleeping. */ @@ -3319,11 +3116,10 @@ struct task *scheduler_gettask(struct scheduler *s, int qid, if (res != NULL) break; } - /* If unsuccessful, try stealing from the other queues. A. Nasar - * falg set to zero for GPU work*/ + /* If unsuccessful, try stealing from the other queues. */ if (s->flags & scheduler_flag_steal) { - int count = 0, qids[nr_queues], act_qids[nr_queues]; + int count = 0, qids[nr_queues]; /* Make list of queues that have 1 or more tasks in them */ for (int k = 0; k < nr_queues; k++) { @@ -3336,6 +3132,7 @@ struct task *scheduler_gettask(struct scheduler *s, int qid, /* Pick a queue at random among the non-empty ones */ const int ind = rand_r(&seed) % count; + /*Get a pointer to the queue we're stealing from*/ int qstl_id = qids[ind]; struct queue * q_stl = &s->queues[qstl_id]; @@ -3344,74 +3141,47 @@ struct task *scheduler_gettask(struct scheduler *s, int qid, res = queue_gettask(q_stl, prev, 0); TIMER_TOC(timer_qsteal); -// if (res != NULL && (res->type == task_type_pair) && -// (res->subtype == task_subtype_gpu_unpack || -// res->subtype == task_subtype_gpu_unpack_f || -// res->subtype == task_subtype_gpu_unpack_g)){ -// continue; -// } - -// if (res != NULL && res->type == task_type_pair && -// res->subtype == task_subtype_gpu_pack && -// s->queues[qstl].n_packs_pair_left < 2){ //Condition we want to avoid is =< 1 -// continue; -// } -// -// if (res != NULL && res->type == task_type_pair && -// res->subtype == task_subtype_gpu_pack_g && -// s->queues[qstl].n_packs_pair_left_g < 2){ -// continue; -// } -// -// if (res != NULL && res->type == task_type_pair && -// res->subtype == task_subtype_gpu_pack_f && -// s->queues[qstl].n_packs_pair_left_f < 2){ -// continue; -// } - /* Lucky? 
*/ // if (res != NULL && res->subtype != task_subtype_gpu_unpack // && res->subtype != task_subtype_gpu_unpack_f // && res->subtype != task_subtype_gpu_unpack_g){ if (res != NULL){ - /*Get a pointer to our queue for re-use*/ - struct queue * q = &s->queues[qid]; -// && res->subtype != task_subtype_gpu_pack -// && res->subtype != task_subtype_gpu_pack_f -// && res->subtype != task_subtype_gpu_pack_g) { + /*A.Nasar: Get task type*/ + enum task_types type = res->type; + enum task_subtypes subtype = res->subtype; /*Move counter from the robbed to the robber*/ - if ((res->type == task_type_self)&& - res->subtype == task_subtype_gpu_pack) { + if ((type == task_type_self || type == task_type_sub_self)&& + subtype == task_subtype_gpu_pack) { atomic_inc(&q->n_packs_self_left); atomic_inc(&q->n_packs_self_stolen); atomic_dec(&q_stl->n_packs_self_left); } - if ((res->type == task_type_self)&& - res->subtype == task_subtype_gpu_pack_g) { + if ((type == task_type_self || type == task_type_sub_self)&& + subtype == task_subtype_gpu_pack_g) { atomic_inc(&q->n_packs_self_left_g); atomic_inc(&q->n_packs_self_stolen_g); atomic_dec(&q_stl->n_packs_self_left_g); } - if ((res->type == task_type_self)&& - res->subtype == task_subtype_gpu_pack_f) { + if ((type == task_type_self || type == task_type_sub_self)&& + subtype == task_subtype_gpu_pack_f) { atomic_inc(&q->n_packs_self_left_f); atomic_inc(&q->n_packs_self_stolen_f); atomic_dec(&q_stl->n_packs_self_left_f); } - if ((res->type == task_type_pair)&& - res->subtype == task_subtype_gpu_pack) { + if ((type == task_type_pair || type == task_type_sub_pair)&& + subtype == task_subtype_gpu_pack) { atomic_inc(&q->n_packs_pair_left); atomic_inc(&q->n_packs_pair_stolen); atomic_dec(&q_stl->n_packs_pair_left); } - if ((res->type == task_type_pair)&& - res->subtype == task_subtype_gpu_pack_g) { + if ((type == task_type_pair || type == task_type_sub_pair)&& + subtype == task_subtype_gpu_pack_g) { atomic_inc(&q->n_packs_pair_left_g); atomic_inc(&q->n_packs_pair_stolen_g); atomic_dec(&q_stl->n_packs_pair_left_g); } - if ((res->type == task_type_pair)&& - res->subtype == task_subtype_gpu_pack_f) { + if ((type == task_type_pair || type == task_type_sub_pair)&& + subtype == task_subtype_gpu_pack_f) { atomic_inc(&q->n_packs_pair_left_f); atomic_inc(&q->n_packs_pair_stolen_f); atomic_dec(&q_stl->n_packs_pair_left_f); diff --git a/src/space.h b/src/space.h index 825dc49d60..870ec7aae1 100644 --- a/src/space.h +++ b/src/space.h @@ -94,6 +94,9 @@ extern double engine_foreign_alloc_margin; */ struct space { + /*Used to define GPU task memory allocation*/ + float eta_neighbours; + /*! Spatial extent. */ double dim[3]; diff --git a/swift.c b/swift.c index b63941cd63..7a9277ae5c 100644 --- a/swift.c +++ b/swift.c @@ -1108,7 +1108,7 @@ int main(int argc, char *argv[]) { hydro_props_init(&hydro_properties, &prog_const, &us, params); else bzero(&hydro_properties, sizeof(struct hydro_props)); - + float eta_neighbours = hydro_properties.eta_neighbours; /* Initialise the equation of state */ if (with_hydro) eos_init(&eos, &prog_const, &us, params); @@ -1388,7 +1388,7 @@ int main(int argc, char *argv[]) { with_self_gravity, with_star_formation, with_sinks, with_DM_particles, with_DM_background_particles, with_neutrinos, talking, dry_run, nr_nodes); - + s.eta_neighbours = eta_neighbours; /* Initialise the line of sight properties. 
*/ if (with_line_of_sight) los_init(s.dim, &los_properties, params); From 7871c95493a3c7adfb42f55c5c3bee90ee8862fb Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Wed, 8 Jan 2025 13:59:10 +0000 Subject: [PATCH 132/217] Reverted swift.c and space.h as eta_neighbours was indeed required for GPU memory allocation. Removed packs_stolen counter update from scheduler_gettask(). Cleaned up scheduler.c --- src/scheduler.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/src/scheduler.c b/src/scheduler.c index f5b2dd972e..8aa1558e74 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -3142,9 +3142,6 @@ struct task *scheduler_gettask(struct scheduler *s, int qid, TIMER_TOC(timer_qsteal); /* Lucky? */ -// if (res != NULL && res->subtype != task_subtype_gpu_unpack -// && res->subtype != task_subtype_gpu_unpack_f -// && res->subtype != task_subtype_gpu_unpack_g){ if (res != NULL){ /*A.Nasar: Get task type*/ enum task_types type = res->type; @@ -3153,37 +3150,31 @@ struct task *scheduler_gettask(struct scheduler *s, int qid, if ((type == task_type_self || type == task_type_sub_self)&& subtype == task_subtype_gpu_pack) { atomic_inc(&q->n_packs_self_left); - atomic_inc(&q->n_packs_self_stolen); atomic_dec(&q_stl->n_packs_self_left); } if ((type == task_type_self || type == task_type_sub_self)&& subtype == task_subtype_gpu_pack_g) { atomic_inc(&q->n_packs_self_left_g); - atomic_inc(&q->n_packs_self_stolen_g); atomic_dec(&q_stl->n_packs_self_left_g); } if ((type == task_type_self || type == task_type_sub_self)&& subtype == task_subtype_gpu_pack_f) { atomic_inc(&q->n_packs_self_left_f); - atomic_inc(&q->n_packs_self_stolen_f); atomic_dec(&q_stl->n_packs_self_left_f); } if ((type == task_type_pair || type == task_type_sub_pair)&& subtype == task_subtype_gpu_pack) { atomic_inc(&q->n_packs_pair_left); - atomic_inc(&q->n_packs_pair_stolen); atomic_dec(&q_stl->n_packs_pair_left); } if ((type == task_type_pair || type == task_type_sub_pair)&& subtype == task_subtype_gpu_pack_g) { atomic_inc(&q->n_packs_pair_left_g); - atomic_inc(&q->n_packs_pair_stolen_g); atomic_dec(&q_stl->n_packs_pair_left_g); } if ((type == task_type_pair || type == task_type_sub_pair)&& subtype == task_subtype_gpu_pack_f) { atomic_inc(&q->n_packs_pair_left_f); - atomic_inc(&q->n_packs_pair_stolen_f); atomic_dec(&q_stl->n_packs_pair_left_f); } /* Run with the task */ From 3bf586e8cb474bd9049eee51cc009bd80a002362 Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Wed, 8 Jan 2025 14:31:34 +0000 Subject: [PATCH 133/217] Cleaned runner_main a bit whilst checking for source of new code hang issue --- src/runner_main_clean.cu | 26 +------------------------- 1 file changed, 1 insertion(+), 25 deletions(-) diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 500ce645c7..868394f7f4 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -1107,7 +1107,6 @@ void *runner_main2(void *data) { /* No pack tasks left in queue, flag that we want to run */ int launch_leftovers = pack_vars_self_grad->launch_leftovers; -// if(sched->queues[r->qid].n_packs_self_left_g < 1) launch_leftovers = 1; /*Packed enough tasks let's go*/ int launch = pack_vars_self_grad->launch; /* Do we have enough stuff to run the GPU ? 
*/ @@ -1136,7 +1135,6 @@ void *runner_main2(void *data) { /* No pack tasks left in queue, flag that we want to run */ int launch_leftovers = pack_vars_self_forc->launch_leftovers; -// if(sched->queues[r->qid].n_packs_self_left_f < 1) launch_leftovers = 1; /*Packed enough tasks let's go*/ int launch = pack_vars_self_forc->launch; /* Do we have enough stuff to run the GPU ? */ @@ -1296,12 +1294,7 @@ void *runner_main2(void *data) { * we want to run */ int launch = pack_vars_pair_dens->launch; int launch_leftovers = pack_vars_pair_dens->launch_leftovers; -// if(sched->queues[r->qid].n_packs_pair_left < 1) launch_leftovers = 1; -// if(launch_leftovers && -// (sched->queues[r->qid].n_packs_pair_left < 0 -// || sched->queues[r->qid].n_packs_pair_left >= 1)) -// error("Something's wrong. n_packs_pair_left = %i when it should be zero", -// sched->queues[r->qid].n_packs_pair_left); + /* Do we have enough stuff to run the GPU ? */ if (launch) n_full_p_d_bundles++; if (launch_leftovers) n_partial_p_d_bundles++; @@ -1380,14 +1373,8 @@ void *runner_main2(void *data) { /* No pack tasks left in queue, flag that we want to run */ int launch_leftovers = pack_vars_pair_grad->launch_leftovers; -// if(sched->queues[r->qid].n_packs_pair_left_g < 1) launch_leftovers = 1; /*Packed enough tasks, let's go*/ int launch = pack_vars_pair_grad->launch; -// if(launch_leftovers && -// (sched->queues[r->qid].n_packs_pair_left_g < 0 -// || sched->queues[r->qid].n_packs_pair_left_g >= 1)) -// error("Something's wrong. n_packs_pair_left_g = %i when it should be zero", -// sched->queues[r->qid].n_packs_pair_left_g); /* Do we have enough stuff to run the GPU ? */ if (launch || launch_leftovers) { /*Launch GPU tasks*/ @@ -1466,22 +1453,11 @@ void *runner_main2(void *data) { /* No pack tasks left in queue, flag that we want to run */ int launch_leftovers = pack_vars_pair_forc->launch_leftovers; -//// if(sched->queues[r->qid].n_packs_pair_left_g < 1) launch_leftovers = 1; -// if(launch_leftovers && -// (sched->queues[r->qid].n_packs_pair_left_f < 0 -// || sched->queues[r->qid].n_packs_pair_left_f >= 1)) -// error("Somethig's wrong. n_packs_pair_left_f = %i when it should be zero", -// sched->queues[r->qid].n_packs_pair_left_f); /*Packed enough tasks let's go*/ int launch = pack_vars_pair_forc->launch; /* Do we have enough stuff to run the GPU ? */ if (launch || launch_leftovers) { /*Launch GPU tasks*/ - // runner_dopair1_launch_f(r, sched, - // pack_vars_pair_forc, ci, t, parts_aos_pair_forc, - // d_parts_aos_pair_forc, - // stream, d_a, d_H, e, &packing_time_pair_f, - // &time_for_gpu_pair_f); int t_packed = pack_vars_pair_forc->tasks_packed; signal_sleeping_runners(sched, t, t_packed); runner_dopair1_launch_f4_f_one_memcpy( From 091819df407f1c783ba4cf467d36c1e2db60122e Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Wed, 8 Jan 2025 14:38:58 +0000 Subject: [PATCH 134/217] Cleaned up un-necessary code from queue.c --- src/queue.c | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/src/queue.c b/src/queue.c index 0a330ce0e1..92296a9639 100644 --- a/src/queue.c +++ b/src/queue.c @@ -181,44 +181,26 @@ void queue_insert(struct queue *q, struct task *t) { /* A. 
Nasar: Increment counters required for the pack tasks */ if (t->type == task_type_self || t->type == task_type_sub_self) { if (t->subtype == task_subtype_gpu_pack){ -// pthread_mutex_lock(&q->sleep_mutex); atomic_inc(&q->n_packs_self_left); -// pthread_cond_broadcast(&q->sleep_cond); -// pthread_mutex_unlock(&q->sleep_mutex); } if (t->subtype == task_subtype_gpu_pack_f){ -// pthread_mutex_lock(&q->sleep_mutex); atomic_inc(&q->n_packs_self_left_f); -// pthread_cond_broadcast(&q->sleep_cond); -// pthread_mutex_unlock(&q->sleep_mutex); } if (t->subtype == task_subtype_gpu_pack_g){ -// pthread_mutex_lock(&q->sleep_mutex); atomic_inc(&q->n_packs_self_left_g); -// pthread_cond_broadcast(&q->sleep_cond); -// pthread_mutex_unlock(&q->sleep_mutex); } } /* A. Nasar NEED to think about how to do this with MPI where ci may not be on this node/rank */ if (t->type == task_type_pair || t->type == task_type_sub_pair) { if (t->subtype == task_subtype_gpu_pack) { -// pthread_mutex_lock(&q->sleep_mutex); atomic_inc(&q->n_packs_pair_left); -// pthread_cond_broadcast(&q->sleep_cond); -// pthread_mutex_unlock(&q->sleep_mutex); } if (t->subtype == task_subtype_gpu_pack_f) { -// pthread_mutex_lock(&q->sleep_mutex); atomic_inc(&q->n_packs_pair_left_f); -// pthread_cond_broadcast(&q->sleep_cond); -// pthread_mutex_unlock(&q->sleep_mutex); } if (t->subtype == task_subtype_gpu_pack_g) { -// pthread_mutex_lock(&q->sleep_mutex); atomic_inc(&q->n_packs_pair_left_g); -// pthread_cond_broadcast(&q->sleep_cond); -// pthread_mutex_unlock(&q->sleep_mutex); } } /* Increase the incoming count. */ From ea17e54b04d56381766262fedc9ef11efc7aff01 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Thu, 9 Jan 2025 16:23:41 +0000 Subject: [PATCH 135/217] Reverted back to signalling runners individually for each task which fixed code hanging without task stealing. Tried out a few ideas to try and prevent code from hanging with stealing ON to no avail --- src/cuda/part_gpu.h | 2 +- src/engine_config.c | 10 ++++----- src/engine_maketasks.c | 7 +++--- src/runner_doiact_functions_hydro_gpu.h | 30 ++++++++++++++++++++----- src/runner_main_clean.cu | 12 +++++----- src/scheduler.c | 12 ++++++---- 6 files changed, 48 insertions(+), 25 deletions(-) diff --git a/src/cuda/part_gpu.h b/src/cuda/part_gpu.h index a9153fecd5..92d12f45bd 100644 --- a/src/cuda/part_gpu.h +++ b/src/cuda/part_gpu.h @@ -9,7 +9,7 @@ typedef int8_t timebin_t; extern "C" { #endif -#include "/usr/local/cuda-12.6/targets/x86_64-linux/include/vector_types.h" +#include "/usr/local/cuda-12.2/targets/x86_64-linux/include/vector_types.h" typedef struct part_soa { /*Task ID*/ diff --git a/src/engine_config.c b/src/engine_config.c index 70d3cfd1d4..ff3ff5ec9f 100644 --- a/src/engine_config.c +++ b/src/engine_config.c @@ -923,12 +923,12 @@ void engine_config(int restart, int fof, struct engine *e, parser_get_opt_param_float(params, "Scheduler:links_per_tasks", 25.); /* Init the scheduler. Allow stealing*/ -// scheduler_init(&e->sched, e->s, maxtasks, nr_queues, -// (e->policy & scheduler_flag_steal), e->nodeID, -// &e->threadpool); + scheduler_init(&e->sched, e->s, maxtasks, nr_queues, + (e->policy & scheduler_flag_steal), e->nodeID, + &e->threadpool); /* Init the scheduler. NO stealing A. 
Nasar */ - scheduler_init(&e->sched, e->s, maxtasks, nr_queues, 0, e->nodeID, - &e->threadpool); +// scheduler_init(&e->sched, e->s, maxtasks, nr_queues, 0, e->nodeID, +// &e->threadpool); /* Maximum size of MPI task messages, in KB, that should not be buffered, * that is sent using MPI_Issend, not MPI_Isend. 4Mb by default. Can be diff --git a/src/engine_maketasks.c b/src/engine_maketasks.c index 0631796228..d0e17ced5b 100644 --- a/src/engine_maketasks.c +++ b/src/engine_maketasks.c @@ -4964,6 +4964,7 @@ void engine_maketasks(struct engine *e) { /* Now, create unpack tasks based on the existing packs and create * the dependencies pack->unpack->ghost_in A. Nasar */ const int pack_size = sched->pack_size; + const int pack_size_pair = sched->pack_size_pair; int count_current_self = 0; int count_current_pair = 0; @@ -4999,7 +5000,7 @@ void engine_maketasks(struct engine *e) { } else if (t->type == task_type_pair || t->type == task_type_sub_pair) { - if (count_current_pair % pack_size == 0) { + if (count_current_pair % pack_size_pair == 0) { last_created_pair_unpack = scheduler_addtask( sched, task_type_pair, task_subtype_gpu_unpack, 0, 0, NULL, NULL); } @@ -5062,7 +5063,7 @@ void engine_maketasks(struct engine *e) { } else if (t->type == task_type_pair || t->type == task_type_sub_pair) { - if (count_current_pair % pack_size == 0) { + if (count_current_pair % pack_size_pair == 0) { last_created_pair_unpack = scheduler_addtask( sched, task_type_pair, task_subtype_gpu_unpack_g, 0, 0, NULL, NULL); } @@ -5131,7 +5132,7 @@ void engine_maketasks(struct engine *e) { } else if (t->type == task_type_pair || t->type == task_type_sub_pair) { - if (count_current_pair % pack_size == 0) { + if (count_current_pair % pack_size_pair == 0) { last_created_pair_unpack = scheduler_addtask( sched, task_type_pair, task_subtype_gpu_unpack_f, 0, 0, NULL, NULL); } diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index 7023ab49cc..fa20c75c42 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -1473,7 +1473,10 @@ void runner_doself1_launch_f4( clock_gettime(CLOCK_REALTIME, &tp1); *unpack_time += (tp1.tv_sec - tp0.tv_sec) + (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; - + pthread_mutex_lock(&s->sleep_mutex); + atomic_dec(&s->waiting); + pthread_cond_broadcast(&s->sleep_cond); + pthread_mutex_unlock(&s->sleep_mutex); /* Release the lock */ cell_unlocktree(cii); @@ -1872,7 +1875,10 @@ void runner_doself1_launch_f4_g( /* Record things for debugging */ cii->gpu_done_g++; - + pthread_mutex_lock(&s->sleep_mutex); + atomic_dec(&s->waiting); + pthread_cond_broadcast(&s->sleep_cond); + pthread_mutex_unlock(&s->sleep_mutex); /* Release the lock */ cell_unlocktree(cii); @@ -2274,7 +2280,10 @@ void runner_doself1_launch_f4_f( clock_gettime(CLOCK_REALTIME, &tp1); *unpack_time += (tp1.tv_sec - tp0.tv_sec) + (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; - + pthread_mutex_lock(&s->sleep_mutex); + atomic_dec(&s->waiting); + pthread_cond_broadcast(&s->sleep_cond); + pthread_mutex_unlock(&s->sleep_mutex); /* Release the lock */ cell_unlocktree(cii); @@ -2913,7 +2922,10 @@ void runner_dopair1_launch_f4_one_memcpy( /* Record things for debugging */ cii->gpu_done_pair++; cjj->gpu_done_pair++; - + pthread_mutex_lock(&s->sleep_mutex); + atomic_dec(&s->waiting); + pthread_cond_broadcast(&s->sleep_cond); + pthread_mutex_unlock(&s->sleep_mutex); // /* Release the locks */ cell_unlocktree(cii); // /* Release the locks */ @@ -3638,7 +3650,10 @@ void 
runner_dopair1_launch_f4_g_one_memcpy( /* Record things for debugging */ cii->gpu_done_pair_g++; cjj->gpu_done_pair_g++; - + pthread_mutex_lock(&s->sleep_mutex); + atomic_dec(&s->waiting); + pthread_cond_broadcast(&s->sleep_cond); + pthread_mutex_unlock(&s->sleep_mutex); /* Release the locks */ cell_unlocktree(cii); /* Release the locks */ @@ -4390,7 +4405,10 @@ void runner_dopair1_launch_f4_f_one_memcpy( /* Record things for debugging */ cii->gpu_done_pair_f++; cjj->gpu_done_pair_f++; - + pthread_mutex_lock(&s->sleep_mutex); + atomic_dec(&s->waiting); + pthread_cond_broadcast(&s->sleep_cond); + pthread_mutex_unlock(&s->sleep_mutex); // /* Release the locks */ cell_unlocktree(cii); // /* Release the locks */ diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 868394f7f4..8e8cd01daa 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -1083,7 +1083,7 @@ void *runner_main2(void *data) { if (launch || launch_leftovers) { /*Launch GPU tasks*/ int t_packed = pack_vars_self_dens->tasks_packed; - signal_sleeping_runners(sched, t, t_packed); +// signal_sleeping_runners(sched, t, t_packed); runner_doself1_launch_f4( r, sched, pack_vars_self_dens, ci, t, parts_aos_f4_send, parts_aos_f4_recv, d_parts_aos_f4_send, d_parts_aos_f4_recv, @@ -1113,7 +1113,7 @@ void *runner_main2(void *data) { if (launch || launch_leftovers) { /*Launch GPU tasks*/ int t_packed = pack_vars_self_grad->tasks_packed; - signal_sleeping_runners(sched, t, t_packed); +// signal_sleeping_runners(sched, t, t_packed); runner_doself1_launch_f4_g( r, sched, pack_vars_self_grad, ci, t, parts_aos_grad_f4_send, parts_aos_grad_f4_recv, d_parts_aos_grad_f4_send, @@ -1141,7 +1141,7 @@ void *runner_main2(void *data) { if (launch || launch_leftovers) { /*Launch GPU tasks*/ int t_packed = pack_vars_self_forc->tasks_packed; - signal_sleeping_runners(sched, t, t_packed); +// signal_sleeping_runners(sched, t, t_packed); runner_doself1_launch_f4_f( r, sched, pack_vars_self_forc, ci, t, parts_aos_forc_f4_send, parts_aos_forc_f4_recv, d_parts_aos_forc_f4_send, @@ -1301,7 +1301,7 @@ void *runner_main2(void *data) { if (launch || launch_leftovers) { /*Launch GPU tasks*/ int t_packed = pack_vars_pair_dens->tasks_packed; - signal_sleeping_runners(sched, t, t_packed); +// signal_sleeping_runners(sched, t, t_packed); runner_dopair1_launch_f4_one_memcpy( r, sched, pack_vars_pair_dens, t, parts_aos_pair_f4_send, parts_aos_pair_f4_recv, d_parts_aos_pair_f4_send, @@ -1379,7 +1379,7 @@ void *runner_main2(void *data) { if (launch || launch_leftovers) { /*Launch GPU tasks*/ int t_packed = pack_vars_pair_grad->tasks_packed; - signal_sleeping_runners(sched, t, t_packed); +// signal_sleeping_runners(sched, t, t_packed); runner_dopair1_launch_f4_g_one_memcpy( r, sched, pack_vars_pair_grad, t, parts_aos_pair_f4_g_send, parts_aos_pair_f4_g_recv, d_parts_aos_pair_f4_g_send, @@ -1459,7 +1459,7 @@ void *runner_main2(void *data) { if (launch || launch_leftovers) { /*Launch GPU tasks*/ int t_packed = pack_vars_pair_forc->tasks_packed; - signal_sleeping_runners(sched, t, t_packed); +// signal_sleeping_runners(sched, t, t_packed); runner_dopair1_launch_f4_f_one_memcpy( r, sched, pack_vars_pair_forc, t, parts_aos_pair_f4_f_send, parts_aos_pair_f4_f_recv, d_parts_aos_pair_f4_f_send, diff --git a/src/scheduler.c b/src/scheduler.c index 8aa1558e74..50bffcd1bb 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -3123,6 +3123,7 @@ struct task *scheduler_gettask(struct scheduler *s, int qid, /* Make list of queues that have 1 or more tasks in 
them */ for (int k = 0; k < nr_queues; k++) { + if(k == qid) continue; if (s->queues[k].count > 0 || s->queues[k].count_incoming > 0) { qids[count++] = k; } @@ -3132,15 +3133,18 @@ struct task *scheduler_gettask(struct scheduler *s, int qid, /* Pick a queue at random among the non-empty ones */ const int ind = rand_r(&seed) % count; - /*Get a pointer to the queue we're stealing from*/ int qstl_id = qids[ind]; + if(qid == qstl_id){ + /* Reduce the size of the list of non-empty queues */ + qids[ind] = qids[--count]; + continue; + } struct queue * q_stl = &s->queues[qstl_id]; /* Try to get a task from that random queue */ TIMER_TIC; res = queue_gettask(q_stl, prev, 0); TIMER_TOC(timer_qsteal); - /* Lucky? */ if (res != NULL){ /*A.Nasar: Get task type*/ @@ -3199,8 +3203,8 @@ struct task *scheduler_gettask(struct scheduler *s, int qid, res = queue_gettask(&s->queues[qid], prev, 1); if (res == NULL && s->waiting > 0) { struct queue qq = s->queues[qid]; -// message("s->waiting %i self_stolen %i, self_left %i, pair_stolen %i, pair_left %i", s->waiting, -// qq.n_packs_self_stolen_f, qq.n_packs_self_left_f, qq.n_packs_pair_stolen_f, qq.n_packs_pair_left_f); + message("s->waiting %i self_stolen %i, self_left %i, pair_stolen %i, pair_left %i", s->waiting, + qq.n_packs_self_stolen_f, qq.n_packs_self_left_f, qq.n_packs_pair_stolen_f, qq.n_packs_pair_left_f); pthread_cond_wait(&s->sleep_cond, &s->sleep_mutex); } pthread_mutex_unlock(&s->sleep_mutex); From 4ab67c270e587713c1cf64c35edd0566fc1b1631 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Thu, 9 Jan 2025 17:29:41 +0000 Subject: [PATCH 136/217] Added additional switch in runner_main to double check whether to launch_leftovers. Seems to run for slightly longer but still hangs. Need to test on GH to see if I can get away with this code for writing the paper! --- src/runner_main_clean.cu | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 8e8cd01daa..4543e37db0 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -1077,6 +1077,11 @@ void *runner_main2(void *data) { // error("There's %i parts in a cell when it should be %i max", ci->hydro.count, np_per_cell); /*Packed enough tasks let's go*/ int launch = pack_vars_self_dens->launch; + + if ((sched->queues[r->qid].n_packs_self_left < 1)){ + launch_leftovers = 1; + pack_vars_self_dens->launch_leftovers = 1; + } /* Do we have enough stuff to run the GPU ? */ if (launch) n_full_d_bundles++; if (launch_leftovers) n_partial_d_bundles++; @@ -1109,6 +1114,11 @@ void *runner_main2(void *data) { int launch_leftovers = pack_vars_self_grad->launch_leftovers; /*Packed enough tasks let's go*/ int launch = pack_vars_self_grad->launch; + + if ((sched->queues[r->qid].n_packs_self_left_g < 1)){ + launch_leftovers = 1; + pack_vars_self_grad->launch_leftovers = 1; + } /* Do we have enough stuff to run the GPU ? */ if (launch || launch_leftovers) { /*Launch GPU tasks*/ @@ -1137,6 +1147,11 @@ void *runner_main2(void *data) { int launch_leftovers = pack_vars_self_forc->launch_leftovers; /*Packed enough tasks let's go*/ int launch = pack_vars_self_forc->launch; + + if ((sched->queues[r->qid].n_packs_self_left_f < 1)){ + launch_leftovers = 1; + pack_vars_self_forc->launch_leftovers = 1; + } /* Do we have enough stuff to run the GPU ? */ if (launch || launch_leftovers) { /*Launch GPU tasks*/ @@ -1298,6 +1313,11 @@ void *runner_main2(void *data) { /* Do we have enough stuff to run the GPU ? 
*/ if (launch) n_full_p_d_bundles++; if (launch_leftovers) n_partial_p_d_bundles++; + + if ((sched->queues[r->qid].n_packs_pair_left < 1)){ + launch_leftovers = 1; + pack_vars_pair_dens->launch_leftovers = 1; + } if (launch || launch_leftovers) { /*Launch GPU tasks*/ int t_packed = pack_vars_pair_dens->tasks_packed; @@ -1375,6 +1395,11 @@ void *runner_main2(void *data) { int launch_leftovers = pack_vars_pair_grad->launch_leftovers; /*Packed enough tasks, let's go*/ int launch = pack_vars_pair_grad->launch; + + if ((sched->queues[r->qid].n_packs_pair_left_g < 1)){ + launch_leftovers = 1; + pack_vars_pair_grad->launch_leftovers = 1; + } /* Do we have enough stuff to run the GPU ? */ if (launch || launch_leftovers) { /*Launch GPU tasks*/ @@ -1455,6 +1480,10 @@ void *runner_main2(void *data) { int launch_leftovers = pack_vars_pair_forc->launch_leftovers; /*Packed enough tasks let's go*/ int launch = pack_vars_pair_forc->launch; + if ((sched->queues[r->qid].n_packs_pair_left_f < 1)){ + launch_leftovers = 1; + pack_vars_pair_forc->launch_leftovers = 1; + } /* Do we have enough stuff to run the GPU ? */ if (launch || launch_leftovers) { /*Launch GPU tasks*/ From f8463e8119231ed72ccb2814cd4842819f3c7e61 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Fri, 10 Jan 2025 16:11:41 +0000 Subject: [PATCH 137/217] Code hanging. Unsure why. Made significant changes. Changed counters so that they are an array attached to scheduler struct where the array indices represent the scheduler's queues. The code hangs at step 0 suggesting there is a mistake in the new implementation. Double check & REVISE --- src/queue.c | 25 --------- src/runner_doiact_functions_hydro_gpu.h | 75 ++++++++++++++++++------- src/runner_main_clean.cu | 53 ++++++++--------- src/scheduler.c | 59 +++++++++++++++++++ src/scheduler.h | 7 +++ 5 files changed, 148 insertions(+), 71 deletions(-) diff --git a/src/queue.c b/src/queue.c index 92296a9639..790b6b1335 100644 --- a/src/queue.c +++ b/src/queue.c @@ -178,31 +178,6 @@ void queue_insert(struct queue *q, struct task *t) { } } } - /* A. Nasar: Increment counters required for the pack tasks */ - if (t->type == task_type_self || t->type == task_type_sub_self) { - if (t->subtype == task_subtype_gpu_pack){ - atomic_inc(&q->n_packs_self_left); - } - if (t->subtype == task_subtype_gpu_pack_f){ - atomic_inc(&q->n_packs_self_left_f); - } - if (t->subtype == task_subtype_gpu_pack_g){ - atomic_inc(&q->n_packs_self_left_g); - } - } - /* A. Nasar NEED to think about how to do this with - MPI where ci may not be on this node/rank */ - if (t->type == task_type_pair || t->type == task_type_sub_pair) { - if (t->subtype == task_subtype_gpu_pack) { - atomic_inc(&q->n_packs_pair_left); - } - if (t->subtype == task_subtype_gpu_pack_f) { - atomic_inc(&q->n_packs_pair_left_f); - } - if (t->subtype == task_subtype_gpu_pack_g) { - atomic_inc(&q->n_packs_pair_left_g); - } - } /* Increase the incoming count. 
*/ atomic_inc(&q->count_incoming); } diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index fa20c75c42..4c79b9d52a 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -167,6 +167,13 @@ double runner_doself1_pack_f4(struct runner *r, struct scheduler *s, * tp0 and tp1 only time packing and unpacking*/ struct timespec t0, t1; // clock_gettime(CLOCK_REALTIME, &t0); + /* Record that we have now done a packing (self) */ + int qid = r->qid; +// atomic_dec(&(s->queues[qid].n_packs_pair_left_f)); + pthread_mutex_lock(&s->sleep_mutex); + atomic_dec(&(s->s_d_left[qid])); + pthread_cond_broadcast(&s->sleep_cond); + pthread_mutex_unlock(&s->sleep_mutex); int tasks_packed = pack_vars->tasks_packed; // pack_vars->cellx[tasks_packed] = ci->loc[0]; @@ -200,16 +207,16 @@ double runner_doself1_pack_f4(struct runner *r, struct scheduler *s, /* Tell the cell it has been packed */ ci->pack_done++; /* Record that we have now done a packing (self) */ - int qid = r->qid; +// int qid = r->qid; // atomic_dec(&(s->queues[qid].n_packs_self_left)); t->done = 1; pack_vars->tasks_packed++; pack_vars->launch = 0; pack_vars->launch_leftovers = 0; - atomic_cas(&pack_vars->launch_leftovers, __sync_sub_and_fetch(&(s->queues[qid].n_packs_self_left), 1), 1); +// atomic_cas(&pack_vars->launch_leftovers, __sync_sub_and_fetch(&(s->queues[qid].n_packs_self_left), 1), 1); -// if ((s->queues[qid].n_packs_self_left < 1)) pack_vars->launch_leftovers = 1; + if ((s->s_d_left[qid] < 1)) pack_vars->launch_leftovers = 1; if (pack_vars->tasks_packed == pack_vars->target_n_tasks) pack_vars->launch = 1; /*Add time to packing_time. Timer for end of GPU work after the if(launch || @@ -293,7 +300,13 @@ double runner_doself1_pack_f4_g(struct runner *r, struct scheduler *s, * tp0 and tp1 only time packing and unpacking*/ struct timespec t0, t1; // clock_gettime(CLOCK_REALTIME, &t0); - + /* Record that we have now done a packing (self) */ + int qid = r->qid; +// atomic_dec(&(s->queues[qid].n_packs_pair_left_f)); + pthread_mutex_lock(&s->sleep_mutex); + atomic_dec(&(s->s_g_left[qid])); + pthread_cond_broadcast(&s->sleep_cond); + pthread_mutex_unlock(&s->sleep_mutex); int tasks_packed = pack_vars->tasks_packed; // pack_vars->cellx[tasks_packed] = ci->loc[0]; // pack_vars->celly[tasks_packed] = ci->loc[1]; @@ -324,7 +337,7 @@ double runner_doself1_pack_f4_g(struct runner *r, struct scheduler *s, /* Tell the cell it has been packed */ ci->pack_done_g++; /* Record that we have now done a packing (self) */ - int qid = r->qid; +// int qid = r->qid; // atomic_dec(&(s->queues[qid].n_packs_self_left_g)); t->done = 1; pack_vars->tasks_packed++; @@ -415,7 +428,13 @@ double runner_doself1_pack_f4_f(struct runner *r, struct scheduler *s, * tp0 and tp1 only time packing and unpacking*/ struct timespec t0, t1; // clock_gettime(CLOCK_REALTIME, &t0); - + /* Record that we have now done a packing (self) */ + int qid = r->qid; +// atomic_dec(&(s->queues[qid].n_packs_pair_left_f)); + pthread_mutex_lock(&s->sleep_mutex); + atomic_dec(&(s->s_f_left[qid])); + pthread_cond_broadcast(&s->sleep_cond); + pthread_mutex_unlock(&s->sleep_mutex); int tasks_packed = pack_vars->tasks_packed; // pack_vars->cellx[tasks_packed] = ci->loc[0]; // pack_vars->celly[tasks_packed] = ci->loc[1]; @@ -446,7 +465,7 @@ double runner_doself1_pack_f4_f(struct runner *r, struct scheduler *s, /* Tell the cell it has been packed */ ci->pack_done_f++; /* Record that we have now done a packing (self) */ - 
int qid = r->qid; +// int qid = r->qid; // atomic_dec(&(s->queues[qid].n_packs_self_left_f)); t->done = 1; pack_vars->tasks_packed++; @@ -573,6 +592,11 @@ double runner_dopair1_pack_f4(struct runner *r, struct scheduler *s, struct timespec t0, t1; // clock_gettime(CLOCK_REALTIME, &t0); int tasks_packed = pack_vars->tasks_packed; + int qid = r->qid; + pthread_mutex_lock(&s->sleep_mutex); + atomic_dec(&(s->p_d_left[qid])); + pthread_cond_broadcast(&s->sleep_cond); + pthread_mutex_unlock(&s->sleep_mutex); double x_tmp = 0.0, y_tmp = 0.0, z_tmp = 0.0; /*Get the shifts in case of periodics*/ @@ -648,10 +672,10 @@ double runner_dopair1_pack_f4(struct runner *r, struct scheduler *s, pack_vars->launch = 0; pack_vars->launch_leftovers = 0; /* Record that we have now done a packing (self) */ - int qid = r->qid; +// int qid = r->qid; // atomic_dec(&(s->queues[qid].n_packs_pair_left)); -// if ((s->queues[qid].n_packs_pair_left < 1)) pack_vars->launch_leftovers = 1; - atomic_cas(&pack_vars->launch_leftovers, __sync_sub_and_fetch(&(s->queues[qid].n_packs_pair_left), 1), 1); + if ((s->p_d_left < 1)) pack_vars->launch_leftovers = 1; +// atomic_cas(&pack_vars->launch_leftovers, __sync_sub_and_fetch(&(s->queues[qid].n_packs_pair_left), 1), 1); if (pack_vars->tasks_packed == pack_vars->target_n_tasks) pack_vars->launch = 1; @@ -779,6 +803,12 @@ double runner_dopair1_pack_f4_g(struct runner *r, struct scheduler *s, clock_gettime(CLOCK_REALTIME, &t0); int tasks_packed = pack_vars->tasks_packed; + int qid = r->qid; + pthread_mutex_lock(&s->sleep_mutex); + atomic_dec(&(s->p_g_left[qid])); + pthread_cond_broadcast(&s->sleep_cond); + pthread_mutex_unlock(&s->sleep_mutex); + double x_tmp = 0.0, y_tmp = 0.0, z_tmp = 0.0; /*Get the shifts in case of periodics*/ space_getsid_GPU(e->s, &ci, &cj, &x_tmp, &y_tmp, &z_tmp); @@ -853,11 +883,11 @@ double runner_dopair1_pack_f4_g(struct runner *r, struct scheduler *s, pack_vars->launch = 0; pack_vars->launch_leftovers = 0; /* Record that we have now done a packing (self) */ - int qid = r->qid; +// int qid = r->qid; // atomic_dec(&(s->queues[qid].n_packs_pair_left_g)); - atomic_cas(&pack_vars->launch_leftovers, __sync_sub_and_fetch(&(s->queues[qid].n_packs_pair_left_g), 1), 1); -// if ((s->queues[qid].n_packs_pair_left_g < 1)) -// pack_vars->launch_leftovers = 1; +// atomic_cas(&pack_vars->launch_leftovers, __sync_sub_and_fetch(&(s->queues[qid].n_packs_pair_left_g), 1), 1); + if ((s->p_g_left[qid] < 1)) + pack_vars->launch_leftovers = 1; if (pack_vars->tasks_packed == pack_vars->target_n_tasks) pack_vars->launch = 1; /*Add time to packing_time. 
Timer for end of GPU work after the if(launch || @@ -985,6 +1015,14 @@ double runner_dopair1_pack_f4_f(struct runner *r, struct scheduler *s, clock_gettime(CLOCK_REALTIME, &t0); int tasks_packed = pack_vars->tasks_packed; + /* Record that we have now done a packing (self) */ + int qid = r->qid; +// atomic_dec(&(s->queues[qid].n_packs_pair_left_f)); + pthread_mutex_lock(&s->sleep_mutex); + atomic_dec(&(s->p_f_left[qid])); + pthread_cond_broadcast(&s->sleep_cond); + pthread_mutex_unlock(&s->sleep_mutex); + double x_tmp = 0.0, y_tmp = 0.0, z_tmp = 0.0; /*Get the shifts in case of periodics*/ space_getsid_GPU(e->s, &ci, &cj, &x_tmp, &y_tmp, &z_tmp); @@ -1058,13 +1096,10 @@ double runner_dopair1_pack_f4_f(struct runner *r, struct scheduler *s, pack_vars->tasks_packed++; pack_vars->launch = 0; pack_vars->launch_leftovers = 0; - /* Record that we have now done a packing (self) */ - int qid = r->qid; -// atomic_dec(&(s->queues[qid].n_packs_pair_left_f)); - atomic_cas(&pack_vars->launch_leftovers, __sync_sub_and_fetch(&(s->queues[qid].n_packs_pair_left_f), 1), 1); +// atomic_cas(&pack_vars->launch_leftovers, __sync_sub_and_fetch(&(s->queues[qid].n_packs_pair_left_f), 1), 1); -// if ((s->queues[qid].n_packs_pair_left_f < 1)) -// pack_vars->launch_leftovers = 1; + if ((s->p_f_left[qid] < 1)) + pack_vars->launch_leftovers = 1; if (pack_vars->tasks_packed == pack_vars->target_n_tasks) pack_vars->launch = 1; /*Add time to packing_time. Timer for end of GPU work after the if(launch || diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 4543e37db0..4a520db3c6 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -994,16 +994,17 @@ void *runner_main2(void *data) { ticks hang_time = getticks(); while (1) { // ticks tic_get_task = getticks(); + //A. Nasar: Get qid for re-use later + int qid = r->qid; /* If there's no old task, try to get a new one. */ if (t == NULL) { /* Get the task. */ TIMER_TIC - t = scheduler_gettask(sched, r->qid, prev); + t = scheduler_gettask(sched, qid, prev); TIMER_TOC(timer_gettask); /* Did I get anything? */ if (t == NULL) break; } - /* Get the cells. */ struct cell *ci = t->ci; struct cell *cj = t->cj; @@ -1078,10 +1079,10 @@ void *runner_main2(void *data) { /*Packed enough tasks let's go*/ int launch = pack_vars_self_dens->launch; - if ((sched->queues[r->qid].n_packs_self_left < 1)){ - launch_leftovers = 1; - pack_vars_self_dens->launch_leftovers = 1; - } +// if ((sched->s_d_left[qid] < 1)){ +// launch_leftovers = 1; +// pack_vars_self_dens->launch_leftovers = 1; +// } /* Do we have enough stuff to run the GPU ? */ if (launch) n_full_d_bundles++; if (launch_leftovers) n_partial_d_bundles++; @@ -1115,10 +1116,10 @@ void *runner_main2(void *data) { /*Packed enough tasks let's go*/ int launch = pack_vars_self_grad->launch; - if ((sched->queues[r->qid].n_packs_self_left_g < 1)){ - launch_leftovers = 1; - pack_vars_self_grad->launch_leftovers = 1; - } +// if ((sched->s_g_left[qid] < 1)){ +// launch_leftovers = 1; +// pack_vars_self_grad->launch_leftovers = 1; +// } /* Do we have enough stuff to run the GPU ? 
*/ if (launch || launch_leftovers) { /*Launch GPU tasks*/ @@ -1148,10 +1149,10 @@ void *runner_main2(void *data) { /*Packed enough tasks let's go*/ int launch = pack_vars_self_forc->launch; - if ((sched->queues[r->qid].n_packs_self_left_f < 1)){ - launch_leftovers = 1; - pack_vars_self_forc->launch_leftovers = 1; - } +// if ((sched->s_f_left[qid] < 1)){ +// launch_leftovers = 1; +// pack_vars_self_forc->launch_leftovers = 1; +// } /* Do we have enough stuff to run the GPU ? */ if (launch || launch_leftovers) { /*Launch GPU tasks*/ @@ -1314,10 +1315,10 @@ void *runner_main2(void *data) { if (launch) n_full_p_d_bundles++; if (launch_leftovers) n_partial_p_d_bundles++; - if ((sched->queues[r->qid].n_packs_pair_left < 1)){ - launch_leftovers = 1; - pack_vars_pair_dens->launch_leftovers = 1; - } +// if ((sched->p_d_left[qid] < 1)){ +// launch_leftovers = 1; +// pack_vars_pair_dens->launch_leftovers = 1; +// } if (launch || launch_leftovers) { /*Launch GPU tasks*/ int t_packed = pack_vars_pair_dens->tasks_packed; @@ -1396,10 +1397,10 @@ void *runner_main2(void *data) { /*Packed enough tasks, let's go*/ int launch = pack_vars_pair_grad->launch; - if ((sched->queues[r->qid].n_packs_pair_left_g < 1)){ - launch_leftovers = 1; - pack_vars_pair_grad->launch_leftovers = 1; - } +// if ((sched->p_g_left[qid] < 1)){ +// launch_leftovers = 1; +// pack_vars_pair_grad->launch_leftovers = 1; +// } /* Do we have enough stuff to run the GPU ? */ if (launch || launch_leftovers) { /*Launch GPU tasks*/ @@ -1480,10 +1481,10 @@ void *runner_main2(void *data) { int launch_leftovers = pack_vars_pair_forc->launch_leftovers; /*Packed enough tasks let's go*/ int launch = pack_vars_pair_forc->launch; - if ((sched->queues[r->qid].n_packs_pair_left_f < 1)){ - launch_leftovers = 1; - pack_vars_pair_forc->launch_leftovers = 1; - } +// if ((sched->p_f_left[qid] < 1)){ +// launch_leftovers = 1; +// pack_vars_pair_forc->launch_leftovers = 1; +// } /* Do we have enough stuff to run the GPU ? */ if (launch || launch_leftovers) { /*Launch GPU tasks*/ diff --git a/src/scheduler.c b/src/scheduler.c index 50bffcd1bb..42bc40794f 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -2546,6 +2546,12 @@ void scheduler_start(struct scheduler *s) { s->queues[i].n_packs_pair_stolen_f = 0; s->queues[i].n_packs_self_stolen_g = 0; s->queues[i].n_packs_pair_stolen_g = 0; + s->s_d_left[i] = 0; + s->s_g_left[i] = 0; + s->s_f_left[i] = 0; + s->p_d_left[i] = 0; + s->p_g_left[i] = 0; + s->p_f_left[i] = 0; } /* Re-wait the tasks. */ if (s->active_count > 1000) { @@ -2890,6 +2896,37 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) { atomic_inc(&s->waiting); /* Insert the task into that queue. */ queue_insert(&s->queues[qid], t); + /* A. Nasar: Increment counters required for the pack tasks */ + if (t->type == task_type_self || t->type == task_type_sub_self) { + if (t->subtype == task_subtype_gpu_pack){ + atomic_inc(&s->queues[qid].n_packs_self_left); + atomic_inc(&s->s_d_left[qid]); + } + if (t->subtype == task_subtype_gpu_pack_f){ + atomic_inc(&s->queues[qid].n_packs_self_left_f); + atomic_inc(&s->s_f_left[qid]); + } + if (t->subtype == task_subtype_gpu_pack_g){ + atomic_inc(&s->queues[qid].n_packs_self_left_g); + atomic_inc(&s->s_g_left[qid]); + } + } + /* A. 
Nasar NEED to think about how to do this with + MPI where ci may not be on this node/rank */ + if (t->type == task_type_pair || t->type == task_type_sub_pair) { + if (t->subtype == task_subtype_gpu_pack) { + atomic_inc(&s->queues[qid].n_packs_pair_left); + atomic_inc(&s->p_d_left[qid]); + } + if (t->subtype == task_subtype_gpu_pack_f) { + atomic_inc(&s->queues[qid].n_packs_pair_left_f); + atomic_inc(&s->p_f_left[qid]); + } + if (t->subtype == task_subtype_gpu_pack_g) { + atomic_inc(&s->queues[qid].n_packs_pair_left_g); + atomic_inc(&s->p_g_left[qid]); + } + } } } @@ -3155,31 +3192,43 @@ struct task *scheduler_gettask(struct scheduler *s, int qid, subtype == task_subtype_gpu_pack) { atomic_inc(&q->n_packs_self_left); atomic_dec(&q_stl->n_packs_self_left); + atomic_inc(&s->s_d_left[qid]); + atomic_dec(&s->s_d_left[qstl_id]); } if ((type == task_type_self || type == task_type_sub_self)&& subtype == task_subtype_gpu_pack_g) { atomic_inc(&q->n_packs_self_left_g); atomic_dec(&q_stl->n_packs_self_left_g); + atomic_inc(&s->s_g_left[qid]); + atomic_dec(&s->s_g_left[qstl_id]); } if ((type == task_type_self || type == task_type_sub_self)&& subtype == task_subtype_gpu_pack_f) { atomic_inc(&q->n_packs_self_left_f); atomic_dec(&q_stl->n_packs_self_left_f); + atomic_inc(&s->s_f_left[qid]); + atomic_dec(&s->s_f_left[qstl_id]); } if ((type == task_type_pair || type == task_type_sub_pair)&& subtype == task_subtype_gpu_pack) { atomic_inc(&q->n_packs_pair_left); atomic_dec(&q_stl->n_packs_pair_left); + atomic_inc(&s->p_d_left[qid]); + atomic_dec(&s->p_d_left[qstl_id]); } if ((type == task_type_pair || type == task_type_sub_pair)&& subtype == task_subtype_gpu_pack_g) { atomic_inc(&q->n_packs_pair_left_g); atomic_dec(&q_stl->n_packs_pair_left_g); + atomic_inc(&s->p_g_left[qid]); + atomic_dec(&s->p_g_left[qstl_id]); } if ((type == task_type_pair || type == task_type_sub_pair)&& subtype == task_subtype_gpu_pack_f) { atomic_inc(&q->n_packs_pair_left_f); atomic_dec(&q_stl->n_packs_pair_left_f); + atomic_inc(&s->p_f_left[qid]); + atomic_dec(&s->p_f_left[qstl_id]); } /* Run with the task */ break; @@ -3251,6 +3300,16 @@ void scheduler_init(struct scheduler *s, struct space *space, int nr_tasks, /* Initialize each queue. */ for (int k = 0; k < nr_queues; k++) queue_init(&s->queues[k], NULL); + /* Initialize each queue. */ + for (int k = 0; k < nr_queues; k++){ + s->s_d_left = (volatile int *) malloc(sizeof(volatile int) * nr_queues); + s->s_g_left = (volatile int *) malloc(sizeof(volatile int) * nr_queues); + s->s_f_left = (volatile int *) malloc(sizeof(volatile int) * nr_queues); + s->p_d_left = (volatile int *) malloc(sizeof(volatile int) * nr_queues); + s->p_g_left = (volatile int *) malloc(sizeof(volatile int) * nr_queues); + s->p_f_left = (volatile int *) malloc(sizeof(volatile int) * nr_queues); + } + /* Init the sleep mutex and cond. */ if (pthread_cond_init(&s->sleep_cond, NULL) != 0 || pthread_mutex_init(&s->sleep_mutex, NULL) != 0) diff --git a/src/scheduler.h b/src/scheduler.h index 9cdc65ccfa..5551b167b8 100644 --- a/src/scheduler.h +++ b/src/scheduler.h @@ -67,6 +67,13 @@ struct scheduler { int nr_packs_pair_forc_done; int nr_packs_self_grad_done; int nr_packs_pair_grad_done; + + volatile int *s_d_left; + volatile int *s_g_left; + volatile int *s_f_left; + volatile int *p_d_left; + volatile int *p_g_left; + volatile int *p_f_left; /* Actual number of GPU tasks. 
*/ int nr_gpu_tasks; /* Number of tasks we want*/ From 05739035149860e0786ff6130e6fb8fcc2e85785 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Mon, 27 Jan 2025 10:19:10 +0000 Subject: [PATCH 138/217] Reverted back to decrementing counters attached to queues. Code still hangs but at least runs for a few steps --- .../HydroTests/GreshoVortex_3D/gresho.yml | 2 +- src/runner_doiact_functions_hydro_gpu.h | 66 +++++++++---------- src/scheduler.c | 4 +- 3 files changed, 36 insertions(+), 36 deletions(-) diff --git a/examples/HydroTests/GreshoVortex_3D/gresho.yml b/examples/HydroTests/GreshoVortex_3D/gresho.yml index bb9c4a943f..b96f552291 100644 --- a/examples/HydroTests/GreshoVortex_3D/gresho.yml +++ b/examples/HydroTests/GreshoVortex_3D/gresho.yml @@ -40,4 +40,4 @@ SPH: InitialConditions: file_name: ./greshoVortex.hdf5 # The file to read periodic: 1 -# replicate: 2 + replicate: 2 diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index 4c79b9d52a..c0b7542fee 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -170,10 +170,10 @@ double runner_doself1_pack_f4(struct runner *r, struct scheduler *s, /* Record that we have now done a packing (self) */ int qid = r->qid; // atomic_dec(&(s->queues[qid].n_packs_pair_left_f)); - pthread_mutex_lock(&s->sleep_mutex); - atomic_dec(&(s->s_d_left[qid])); - pthread_cond_broadcast(&s->sleep_cond); - pthread_mutex_unlock(&s->sleep_mutex); +// pthread_mutex_lock(&s->sleep_mutex); +// atomic_dec(&(s->s_d_left[qid])); +// pthread_cond_broadcast(&s->sleep_cond); +// pthread_mutex_unlock(&s->sleep_mutex); int tasks_packed = pack_vars->tasks_packed; // pack_vars->cellx[tasks_packed] = ci->loc[0]; @@ -214,9 +214,9 @@ double runner_doself1_pack_f4(struct runner *r, struct scheduler *s, pack_vars->launch = 0; pack_vars->launch_leftovers = 0; -// atomic_cas(&pack_vars->launch_leftovers, __sync_sub_and_fetch(&(s->queues[qid].n_packs_self_left), 1), 1); + atomic_cas(&pack_vars->launch_leftovers, __sync_sub_and_fetch(&(s->queues[qid].n_packs_self_left), 1), 1); - if ((s->s_d_left[qid] < 1)) pack_vars->launch_leftovers = 1; +// if ((s->s_d_left[qid] < 1)) pack_vars->launch_leftovers = 1; if (pack_vars->tasks_packed == pack_vars->target_n_tasks) pack_vars->launch = 1; /*Add time to packing_time. 
Timer for end of GPU work after the if(launch || @@ -303,10 +303,10 @@ double runner_doself1_pack_f4_g(struct runner *r, struct scheduler *s, /* Record that we have now done a packing (self) */ int qid = r->qid; // atomic_dec(&(s->queues[qid].n_packs_pair_left_f)); - pthread_mutex_lock(&s->sleep_mutex); - atomic_dec(&(s->s_g_left[qid])); - pthread_cond_broadcast(&s->sleep_cond); - pthread_mutex_unlock(&s->sleep_mutex); +// pthread_mutex_lock(&s->sleep_mutex); +// atomic_dec(&(s->s_g_left[qid])); +// pthread_cond_broadcast(&s->sleep_cond); +// pthread_mutex_unlock(&s->sleep_mutex); int tasks_packed = pack_vars->tasks_packed; // pack_vars->cellx[tasks_packed] = ci->loc[0]; // pack_vars->celly[tasks_packed] = ci->loc[1]; @@ -431,10 +431,10 @@ double runner_doself1_pack_f4_f(struct runner *r, struct scheduler *s, /* Record that we have now done a packing (self) */ int qid = r->qid; // atomic_dec(&(s->queues[qid].n_packs_pair_left_f)); - pthread_mutex_lock(&s->sleep_mutex); - atomic_dec(&(s->s_f_left[qid])); - pthread_cond_broadcast(&s->sleep_cond); - pthread_mutex_unlock(&s->sleep_mutex); +// pthread_mutex_lock(&s->sleep_mutex); +// atomic_dec(&(s->s_f_left[qid])); +// pthread_cond_broadcast(&s->sleep_cond); +// pthread_mutex_unlock(&s->sleep_mutex); int tasks_packed = pack_vars->tasks_packed; // pack_vars->cellx[tasks_packed] = ci->loc[0]; // pack_vars->celly[tasks_packed] = ci->loc[1]; @@ -593,10 +593,10 @@ double runner_dopair1_pack_f4(struct runner *r, struct scheduler *s, clock_gettime(CLOCK_REALTIME, &t0); int tasks_packed = pack_vars->tasks_packed; int qid = r->qid; - pthread_mutex_lock(&s->sleep_mutex); - atomic_dec(&(s->p_d_left[qid])); - pthread_cond_broadcast(&s->sleep_cond); - pthread_mutex_unlock(&s->sleep_mutex); +// pthread_mutex_lock(&s->sleep_mutex); +// atomic_dec(&(s->p_d_left[qid])); +// pthread_cond_broadcast(&s->sleep_cond); +// pthread_mutex_unlock(&s->sleep_mutex); double x_tmp = 0.0, y_tmp = 0.0, z_tmp = 0.0; /*Get the shifts in case of periodics*/ @@ -674,8 +674,8 @@ double runner_dopair1_pack_f4(struct runner *r, struct scheduler *s, /* Record that we have now done a packing (self) */ // int qid = r->qid; // atomic_dec(&(s->queues[qid].n_packs_pair_left)); - if ((s->p_d_left < 1)) pack_vars->launch_leftovers = 1; -// atomic_cas(&pack_vars->launch_leftovers, __sync_sub_and_fetch(&(s->queues[qid].n_packs_pair_left), 1), 1); +// if ((s->p_d_left < 1)) pack_vars->launch_leftovers = 1; + atomic_cas(&pack_vars->launch_leftovers, __sync_sub_and_fetch(&(s->queues[qid].n_packs_pair_left), 1), 1); if (pack_vars->tasks_packed == pack_vars->target_n_tasks) pack_vars->launch = 1; @@ -804,10 +804,10 @@ double runner_dopair1_pack_f4_g(struct runner *r, struct scheduler *s, int tasks_packed = pack_vars->tasks_packed; int qid = r->qid; - pthread_mutex_lock(&s->sleep_mutex); - atomic_dec(&(s->p_g_left[qid])); - pthread_cond_broadcast(&s->sleep_cond); - pthread_mutex_unlock(&s->sleep_mutex); +// pthread_mutex_lock(&s->sleep_mutex); +// atomic_dec(&(s->p_g_left[qid])); +// pthread_cond_broadcast(&s->sleep_cond); +// pthread_mutex_unlock(&s->sleep_mutex); double x_tmp = 0.0, y_tmp = 0.0, z_tmp = 0.0; /*Get the shifts in case of periodics*/ @@ -885,9 +885,9 @@ double runner_dopair1_pack_f4_g(struct runner *r, struct scheduler *s, /* Record that we have now done a packing (self) */ // int qid = r->qid; // atomic_dec(&(s->queues[qid].n_packs_pair_left_g)); -// atomic_cas(&pack_vars->launch_leftovers, __sync_sub_and_fetch(&(s->queues[qid].n_packs_pair_left_g), 1), 1); - if ((s->p_g_left[qid] 
< 1)) - pack_vars->launch_leftovers = 1; + atomic_cas(&pack_vars->launch_leftovers, __sync_sub_and_fetch(&(s->queues[qid].n_packs_pair_left_g), 1), 1); +// if ((s->p_g_left[qid] < 1)) +// pack_vars->launch_leftovers = 1; if (pack_vars->tasks_packed == pack_vars->target_n_tasks) pack_vars->launch = 1; /*Add time to packing_time. Timer for end of GPU work after the if(launch || @@ -1018,10 +1018,10 @@ double runner_dopair1_pack_f4_f(struct runner *r, struct scheduler *s, /* Record that we have now done a packing (self) */ int qid = r->qid; // atomic_dec(&(s->queues[qid].n_packs_pair_left_f)); - pthread_mutex_lock(&s->sleep_mutex); +// pthread_mutex_lock(&s->sleep_mutex); atomic_dec(&(s->p_f_left[qid])); - pthread_cond_broadcast(&s->sleep_cond); - pthread_mutex_unlock(&s->sleep_mutex); +// pthread_cond_broadcast(&s->sleep_cond); +// pthread_mutex_unlock(&s->sleep_mutex); double x_tmp = 0.0, y_tmp = 0.0, z_tmp = 0.0; /*Get the shifts in case of periodics*/ @@ -1096,10 +1096,10 @@ double runner_dopair1_pack_f4_f(struct runner *r, struct scheduler *s, pack_vars->tasks_packed++; pack_vars->launch = 0; pack_vars->launch_leftovers = 0; -// atomic_cas(&pack_vars->launch_leftovers, __sync_sub_and_fetch(&(s->queues[qid].n_packs_pair_left_f), 1), 1); + atomic_cas(&pack_vars->launch_leftovers, __sync_sub_and_fetch(&(s->queues[qid].n_packs_pair_left_f), 1), 1); - if ((s->p_f_left[qid] < 1)) - pack_vars->launch_leftovers = 1; +// if ((s->p_f_left[qid] < 1)) +// pack_vars->launch_leftovers = 1; if (pack_vars->tasks_packed == pack_vars->target_n_tasks) pack_vars->launch = 1; /*Add time to packing_time. Timer for end of GPU work after the if(launch || diff --git a/src/scheduler.c b/src/scheduler.c index 42bc40794f..21644a5deb 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -3252,8 +3252,8 @@ struct task *scheduler_gettask(struct scheduler *s, int qid, res = queue_gettask(&s->queues[qid], prev, 1); if (res == NULL && s->waiting > 0) { struct queue qq = s->queues[qid]; - message("s->waiting %i self_stolen %i, self_left %i, pair_stolen %i, pair_left %i", s->waiting, - qq.n_packs_self_stolen_f, qq.n_packs_self_left_f, qq.n_packs_pair_stolen_f, qq.n_packs_pair_left_f); +// message("s->waiting %i self_stolen %i, self_left %i, pair_stolen %i, pair_left %i", s->waiting, +// qq.n_packs_self_stolen_f, qq.n_packs_self_left_f, qq.n_packs_pair_stolen_f, qq.n_packs_pair_left_f); pthread_cond_wait(&s->sleep_cond, &s->sleep_mutex); } pthread_mutex_unlock(&s->sleep_mutex); From 8dc1b50ee42079d48d3e3b3894129e44cc89a74d Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Mon, 27 Jan 2025 13:46:22 +0000 Subject: [PATCH 139/217] Make queue debug variables volatile. More uniform naming convention. 
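In sketch form (C; an editorial illustration only, not an extra hunk — lock_lock/lock_unlock, the n_packs_*_left counters, qid and pack_vars->launch_leftovers are the identifiers used in the hunks below), the pattern this and the follow-up patches converge on is to do the decrement and the "any packs left?" test inside one critical section on the queue lock:

lock_lock(&s->queues[qid].lock);
s->queues[qid].n_packs_self_left_d--;        /* one density pack task consumed from this queue */
if (s->queues[qid].n_packs_self_left_d < 1)  /* nothing left to pack in this queue? */
  pack_vars->launch_leftovers = 1;           /* flush the partially filled bundle to the GPU */
lock_unlock(&s->queues[qid].lock);
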
Replace atomic operations on queue counters by queue locking operations --- src/cuda/part_gpu.h | 4 +- src/queue.h | 16 ++--- src/runner_doiact_functions_hydro_gpu.h | 80 ++++++++++++++++++---- src/scheduler.c | 90 ++++++++++++++++++------- src/scheduler.h | 2 +- 5 files changed, 146 insertions(+), 46 deletions(-) diff --git a/src/cuda/part_gpu.h b/src/cuda/part_gpu.h index 92d12f45bd..37b019d421 100644 --- a/src/cuda/part_gpu.h +++ b/src/cuda/part_gpu.h @@ -9,7 +9,9 @@ typedef int8_t timebin_t; extern "C" { #endif -#include "/usr/local/cuda-12.2/targets/x86_64-linux/include/vector_types.h" +//#include + +#include typedef struct part_soa { /*Task ID*/ diff --git a/src/queue.h b/src/queue.h index 61420d7fe7..4ace573b5b 100644 --- a/src/queue.h +++ b/src/queue.h @@ -76,21 +76,21 @@ struct queue { volatile unsigned int first_incoming, last_incoming, count_incoming; /*Number of pack tasks left in queue A. Nasar */ - volatile int n_packs_self_left; /*Number of density pack tasks left in queue*/ + volatile int n_packs_self_left_d; /*Number of density pack tasks left in queue*/ volatile int n_packs_self_left_f; /*Number of force pack tasks left in queue*/ volatile int n_packs_self_left_g; /*Number of gradient pack tasks left in queue*/ - volatile int n_packs_pair_left; + volatile int n_packs_pair_left_d; volatile int n_packs_pair_left_f; volatile int n_packs_pair_left_g; - int n_packs_self_stolen; /*Number of density pack tasks left in queue*/ - int n_packs_self_stolen_f; /*Number of force pack tasks left in queue*/ - int n_packs_self_stolen_g; /*Number of gradient pack tasks left in queue*/ + volatile int n_packs_self_stolen_d; /*Number of density pack tasks left in queue*/ + volatile int n_packs_self_stolen_f; /*Number of force pack tasks left in queue*/ + volatile int n_packs_self_stolen_g; /*Number of gradient pack tasks left in queue*/ - int n_packs_pair_stolen; - int n_packs_pair_stolen_f; - int n_packs_pair_stolen_g; + volatile int n_packs_pair_stolen_d; + volatile int n_packs_pair_stolen_f; + volatile int n_packs_pair_stolen_g; } __attribute__((aligned(queue_struct_align))); diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index c0b7542fee..5b3002c133 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -140,14 +140,14 @@ void runner_doself1_pack(struct runner *r, struct scheduler *s, ci->pack_done++; /* Record that we have now done a packing (self) */ int qid = r->qid; - atomic_dec(&(s->queues[qid].n_packs_self_left)); + atomic_dec(&(s->queues[qid].n_packs_self_left_d)); t->done = 1; /* Release the lock on the cell */ task_unlock(t); pack_vars->tasks_packed++; pack_vars->launch = 0; pack_vars->launch_leftovers = 0; - if ((s->queues[qid].n_packs_self_left == 0)) pack_vars->launch_leftovers = 1; + if ((s->queues[qid].n_packs_self_left_d == 0)) pack_vars->launch_leftovers = 1; if (pack_vars->tasks_packed == pack_vars->target_n_tasks) pack_vars->launch = 1; /*Add time to packing_time. 
Timer for end of GPU work after the if(launch || @@ -207,14 +207,20 @@ double runner_doself1_pack_f4(struct runner *r, struct scheduler *s, /* Tell the cell it has been packed */ ci->pack_done++; /* Record that we have now done a packing (self) */ -// int qid = r->qid; -// atomic_dec(&(s->queues[qid].n_packs_self_left)); t->done = 1; pack_vars->tasks_packed++; pack_vars->launch = 0; pack_vars->launch_leftovers = 0; - atomic_cas(&pack_vars->launch_leftovers, __sync_sub_and_fetch(&(s->queues[qid].n_packs_self_left), 1), 1); + lock_lock(&s->queues[qid].lock); + + atomic_dec(&s->queues[qid].n_packs_self_left_d); + + if (s->queues[qid].n_packs_self_left_d < 1) + pack_vars->launch_leftovers = 1; + + lock_unlock(&s->queues[qid].lock); + // if ((s->s_d_left[qid] < 1)) pack_vars->launch_leftovers = 1; if (pack_vars->tasks_packed == pack_vars->target_n_tasks) @@ -344,7 +350,18 @@ double runner_doself1_pack_f4_g(struct runner *r, struct scheduler *s, pack_vars->launch = 0; pack_vars->launch_leftovers = 0; - atomic_cas(&pack_vars->launch_leftovers, __sync_sub_and_fetch(&(s->queues[qid].n_packs_self_left_g), 1), 1); + + lock_lock(&s->queues[qid].lock); + + atomic_dec(&s->queues[qid].n_packs_self_left_g); + + if (s->queues[qid].n_packs_self_left_g < 1) + pack_vars->launch_leftovers = 1; + + lock_unlock(&s->queues[qid].lock); + + + // if ((s->queues[qid].n_packs_self_left_g < 1)) // pack_vars->launch_leftovers = 1; if (pack_vars->tasks_packed == pack_vars->target_n_tasks) @@ -472,8 +489,18 @@ double runner_doself1_pack_f4_f(struct runner *r, struct scheduler *s, pack_vars->launch = 0; pack_vars->launch_leftovers = 0; - atomic_cas(&pack_vars->launch_leftovers, __sync_sub_and_fetch(&(s->queues[qid].n_packs_self_left_f), 1), 1); -// if ((s->queues[qid].n_packs_self_left_f < 1)) + + lock_lock(&s->queues[qid].lock); + + atomic_dec(&s->queues[qid].n_packs_self_left_f); + + if (s->queues[qid].n_packs_self_left_f < 1) + pack_vars->launch_leftovers = 1; + + lock_unlock(&s->queues[qid].lock); + + + // if ((s->queues[qid].n_packs_self_left_f < 1)) // pack_vars->launch_leftovers = 1; if (pack_vars->tasks_packed == pack_vars->target_n_tasks) pack_vars->launch = 1; @@ -559,13 +586,13 @@ void runner_dopair1_pack(struct runner *r, struct scheduler *s, /* Record that we have now done a packing (self) */ int qid = r->qid; - atomic_dec(&(s->queues[qid].n_packs_pair_left)); + atomic_dec(&(s->queues[qid].n_packs_pair_left_d)); t->done = 1; pack_vars->tasks_packed++; pack_vars->launch = 0; pack_vars->launch_leftovers = 0; - if ((s->queues[qid].n_packs_pair_left == 0)) pack_vars->launch_leftovers = 1; + if ((s->queues[qid].n_packs_pair_left_d == 0)) pack_vars->launch_leftovers = 1; if (pack_vars->tasks_packed == pack_vars->target_n_tasks) pack_vars->launch = 1; /*Add time to packing_time. Timer for end of GPU work after the if(launch || @@ -675,8 +702,16 @@ double runner_dopair1_pack_f4(struct runner *r, struct scheduler *s, // int qid = r->qid; // atomic_dec(&(s->queues[qid].n_packs_pair_left)); // if ((s->p_d_left < 1)) pack_vars->launch_leftovers = 1; - atomic_cas(&pack_vars->launch_leftovers, __sync_sub_and_fetch(&(s->queues[qid].n_packs_pair_left), 1), 1); + lock_lock(&s->queues[qid].lock); + + atomic_dec(&s->queues[qid].n_packs_pair_left_d); + + if (s->queues[qid].n_packs_pair_left_d < 1) + pack_vars->launch_leftovers = 1; + + lock_unlock(&s->queues[qid].lock); + if (pack_vars->tasks_packed == pack_vars->target_n_tasks) pack_vars->launch = 1; /*Add time to packing_time. 
Timer for end of GPU work after the if(launch || @@ -885,7 +920,17 @@ double runner_dopair1_pack_f4_g(struct runner *r, struct scheduler *s, /* Record that we have now done a packing (self) */ // int qid = r->qid; // atomic_dec(&(s->queues[qid].n_packs_pair_left_g)); - atomic_cas(&pack_vars->launch_leftovers, __sync_sub_and_fetch(&(s->queues[qid].n_packs_pair_left_g), 1), 1); + + lock_lock(&s->queues[qid].lock); + + atomic_dec(&s->queues[qid].n_packs_pair_left_g); + + if (s->queues[qid].n_packs_pair_left_g < 1) + pack_vars->launch_leftovers = 1; + + lock_unlock(&s->queues[qid].lock); + + // if ((s->p_g_left[qid] < 1)) // pack_vars->launch_leftovers = 1; if (pack_vars->tasks_packed == pack_vars->target_n_tasks) @@ -1096,8 +1141,17 @@ double runner_dopair1_pack_f4_f(struct runner *r, struct scheduler *s, pack_vars->tasks_packed++; pack_vars->launch = 0; pack_vars->launch_leftovers = 0; - atomic_cas(&pack_vars->launch_leftovers, __sync_sub_and_fetch(&(s->queues[qid].n_packs_pair_left_f), 1), 1); + lock_lock(&s->queues[qid].lock); + + atomic_dec(&s->queues[qid].n_packs_pair_left_f); + + if (s->queues[qid].n_packs_pair_left_f < 1) + pack_vars->launch_leftovers = 1; + + lock_unlock(&s->queues[qid].lock); + + // if ((s->p_f_left[qid] < 1)) // pack_vars->launch_leftovers = 1; if (pack_vars->tasks_packed == pack_vars->target_n_tasks) diff --git a/src/scheduler.c b/src/scheduler.c index 21644a5deb..5624624016 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -1771,9 +1771,9 @@ struct task *scheduler_addtask(struct scheduler *s, enum task_types type, // #ifdef WITH_CUDA A. Nasar if(t->subtype == task_subtype_gpu_pack){ if(t->type == task_type_self || t->type == task_type_sub_self) - atomic_inc(&s->nr_self_pack_tasks); + atomic_inc(&s->nr_self_pack_tasks_d); if(t->type == task_type_pair || t->type == task_type_sub_pair) - atomic_inc(&s->nr_pair_pack_tasks); + atomic_inc(&s->nr_pair_pack_tasks_d); } if(t->subtype == task_subtype_gpu_pack_f){ if(t->type == task_type_self || t->type == task_type_sub_self) @@ -1987,8 +1987,8 @@ void scheduler_reset(struct scheduler *s, int size) { /* Reset the counters. */ s->size = size; s->nr_tasks = 0; - s->nr_self_pack_tasks = 0; // A. Nasar - s->nr_pair_pack_tasks = 0; + s->nr_self_pack_tasks_d = 0; // A. Nasar + s->nr_pair_pack_tasks_d = 0; s->nr_self_pack_tasks_f = 0; s->nr_pair_pack_tasks_f = 0; s->nr_self_pack_tasks_g = 0; @@ -2534,14 +2534,14 @@ void scheduler_enqueue_mapper(void *map_data, int num_elements, */ void scheduler_start(struct scheduler *s) { for (int i = 0; i < s->nr_queues; i++) { // A. Nasar - s->queues[i].n_packs_self_left = 0; - s->queues[i].n_packs_pair_left = 0; + s->queues[i].n_packs_self_left_d = 0; + s->queues[i].n_packs_pair_left_d = 0; s->queues[i].n_packs_self_left_f = 0; s->queues[i].n_packs_pair_left_f = 0; s->queues[i].n_packs_self_left_g = 0; s->queues[i].n_packs_pair_left_g = 0; - s->queues[i].n_packs_self_stolen = 0; - s->queues[i].n_packs_pair_stolen = 0; + s->queues[i].n_packs_self_stolen_d = 0; + s->queues[i].n_packs_pair_stolen_d = 0; s->queues[i].n_packs_self_stolen_f = 0; s->queues[i].n_packs_pair_stolen_f = 0; s->queues[i].n_packs_self_stolen_g = 0; @@ -2899,15 +2899,21 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) { /* A. 
Nasar: Increment counters required for the pack tasks */ if (t->type == task_type_self || t->type == task_type_sub_self) { if (t->subtype == task_subtype_gpu_pack){ - atomic_inc(&s->queues[qid].n_packs_self_left); + lock_lock(&s->queues[qid].lock); + atomic_inc(&s->queues[qid].n_packs_self_left_d); + lock_unlock(&s->queues[qid].lock); atomic_inc(&s->s_d_left[qid]); } if (t->subtype == task_subtype_gpu_pack_f){ + lock_lock(&s->queues[qid].lock); atomic_inc(&s->queues[qid].n_packs_self_left_f); + lock_unlock(&s->queues[qid].lock); atomic_inc(&s->s_f_left[qid]); } if (t->subtype == task_subtype_gpu_pack_g){ + lock_lock(&s->queues[qid].lock); atomic_inc(&s->queues[qid].n_packs_self_left_g); + lock_unlock(&s->queues[qid].lock); atomic_inc(&s->s_g_left[qid]); } } @@ -2915,15 +2921,21 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) { MPI where ci may not be on this node/rank */ if (t->type == task_type_pair || t->type == task_type_sub_pair) { if (t->subtype == task_subtype_gpu_pack) { - atomic_inc(&s->queues[qid].n_packs_pair_left); + lock_lock(&s->queues[qid].lock); + atomic_inc(&s->queues[qid].n_packs_pair_left_d); + lock_unlock(&s->queues[qid].lock); atomic_inc(&s->p_d_left[qid]); } if (t->subtype == task_subtype_gpu_pack_f) { + lock_lock(&s->queues[qid].lock); atomic_inc(&s->queues[qid].n_packs_pair_left_f); + lock_unlock(&s->queues[qid].lock); atomic_inc(&s->p_f_left[qid]); } if (t->subtype == task_subtype_gpu_pack_g) { + lock_lock(&s->queues[qid].lock); atomic_inc(&s->queues[qid].n_packs_pair_left_g); + lock_unlock(&s->queues[qid].lock); atomic_inc(&s->p_g_left[qid]); } } @@ -3172,26 +3184,54 @@ struct task *scheduler_gettask(struct scheduler *s, int qid, const int ind = rand_r(&seed) % count; /*Get a pointer to the queue we're stealing from*/ int qstl_id = qids[ind]; + + /* If we got the queue we already have, abort */ if(qid == qstl_id){ /* Reduce the size of the list of non-empty queues */ qids[ind] = qids[--count]; continue; } + + /* The queue we are stealing from */ struct queue * q_stl = &s->queues[qstl_id]; - /* Try to get a task from that random queue */ + + /* Can we lock our own queue? */ + if (lock_trylock(&q->lock) != 0) { + + /* No --> continue and try a different queue */ + continue; + + } else { + + /* Yes --> Try locking the que we steal from */ + if (lock_trylock(&q_stl->lock) != 0) { + + /* Failed? --> Unlock the 1st queue and + try again */ + lock_unlock(&q->lock); + continue; + } + } + + /* We now have locked q and q_stl */ + + /* Try to get a task from that random queue */ TIMER_TIC; res = queue_gettask(q_stl, prev, 0); TIMER_TOC(timer_qsteal); - /* Lucky? */ - if (res != NULL){ - /*A.Nasar: Get task type*/ - enum task_types type = res->type; - enum task_subtypes subtype = res->subtype; - /*Move counter from the robbed to the robber*/ + + /* Lucky? i.e. did we actually get a task? 
*/ + if (res != NULL){ + + /*A.Nasar: Get task type*/ + enum task_types type = res->type; + enum task_subtypes subtype = res->subtype; + + /*Move counter from the robbed to the robber*/ if ((type == task_type_self || type == task_type_sub_self)&& subtype == task_subtype_gpu_pack) { - atomic_inc(&q->n_packs_self_left); - atomic_dec(&q_stl->n_packs_self_left); + atomic_inc(&q->n_packs_self_left_d); + atomic_dec(&q_stl->n_packs_self_left_d); atomic_inc(&s->s_d_left[qid]); atomic_dec(&s->s_d_left[qstl_id]); } @@ -3211,8 +3251,8 @@ struct task *scheduler_gettask(struct scheduler *s, int qid, } if ((type == task_type_pair || type == task_type_sub_pair)&& subtype == task_subtype_gpu_pack) { - atomic_inc(&q->n_packs_pair_left); - atomic_dec(&q_stl->n_packs_pair_left); + atomic_inc(&q->n_packs_pair_left_d); + atomic_dec(&q_stl->n_packs_pair_left_d); atomic_inc(&s->p_d_left[qid]); atomic_dec(&s->p_d_left[qstl_id]); } @@ -3233,9 +3273,13 @@ struct task *scheduler_gettask(struct scheduler *s, int qid, /* Run with the task */ break; } else { + /* Reduce the size of the list of non-empty queues */ qids[ind] = qids[--count]; } + + lock_unlock(&q->lock); + lock_unlock(&q_stl->lock); } if (res != NULL) break; } @@ -3399,10 +3443,10 @@ void scheduler_free_tasks(struct scheduler *s) { s->size = 0; s->nr_tasks = 0; // reset GPU task counters too - s->nr_self_pack_tasks = 0; + s->nr_self_pack_tasks_d = 0; s->nr_self_pack_tasks_f = 0; s->nr_self_pack_tasks_g = 0; - s->nr_pair_pack_tasks = 0; + s->nr_pair_pack_tasks_d = 0; s->nr_pair_pack_tasks_f = 0; s->nr_pair_pack_tasks_g = 0; } diff --git a/src/scheduler.h b/src/scheduler.h index 5551b167b8..57e6857b7c 100644 --- a/src/scheduler.h +++ b/src/scheduler.h @@ -79,7 +79,7 @@ struct scheduler { /* Number of tasks we want*/ int target_gpu_tasks; /* Actual number of density pack tasks. */ - int nr_self_pack_tasks, nr_pair_pack_tasks; + int nr_self_pack_tasks_d, nr_pair_pack_tasks_d; /* Actual number of force pack tasks. */ int nr_self_pack_tasks_f, nr_pair_pack_tasks_f; /* Actual number of gradient pack tasks. */ From 73fd85fdb3d4e30343527db9859f9867c5027ace Mon Sep 17 00:00:00 2001 From: Abouzied Date: Mon, 27 Jan 2025 14:16:50 +0000 Subject: [PATCH 140/217] Changed dec and inc ops when stealing to be non-atomic. 
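Since the previous patch already takes both queue locks around a steal, the counter hand-off no longer needs atomics. A minimal sketch of the intended transfer (C; identifiers are those from the diff, and the direction — count moves from the robbed queue to the robber — follows the earlier atomic inc/dec pair rather than being a new change):

if ((type == task_type_self || type == task_type_sub_self) &&
    subtype == task_subtype_gpu_pack) {
  q->n_packs_self_left_d++;     /* the stealing queue gains the pack task */
  q_stl->n_packs_self_left_d--; /* the robbed queue loses it */
}
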
Also replaced counters ending with sched->nr_pair_pack_tasks to sched->nr_pair_pack_tasks_d in engine_maketasks.c --- src/engine_maketasks.c | 4 ++-- src/scheduler.c | 24 ++++++++++++------------ 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/engine_maketasks.c b/src/engine_maketasks.c index d0e17ced5b..a8fcf8a917 100644 --- a/src/engine_maketasks.c +++ b/src/engine_maketasks.c @@ -5024,9 +5024,9 @@ void engine_maketasks(struct engine *e) { } } #ifdef SWIFT_DEBUG_CHECKS - if (count_current_self != sched->nr_self_pack_tasks) + if (count_current_self != sched->nr_self_pack_tasks_d) error("We did not find the correct number of self pack tasks!!"); - if (count_current_pair != sched->nr_pair_pack_tasks) + if (count_current_pair != sched->nr_pair_pack_tasks_d) error("We did not find the correct number of pair pack tasks!!"); #endif diff --git a/src/scheduler.c b/src/scheduler.c index 5624624016..9814318e34 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -3230,43 +3230,43 @@ struct task *scheduler_gettask(struct scheduler *s, int qid, /*Move counter from the robbed to the robber*/ if ((type == task_type_self || type == task_type_sub_self)&& subtype == task_subtype_gpu_pack) { - atomic_inc(&q->n_packs_self_left_d); - atomic_dec(&q_stl->n_packs_self_left_d); + q->n_packs_self_left_d--; + q_stl->n_packs_self_left_d--; atomic_inc(&s->s_d_left[qid]); atomic_dec(&s->s_d_left[qstl_id]); } if ((type == task_type_self || type == task_type_sub_self)&& subtype == task_subtype_gpu_pack_g) { - atomic_inc(&q->n_packs_self_left_g); - atomic_dec(&q_stl->n_packs_self_left_g); + q->n_packs_self_left_g--; + q_stl->n_packs_self_left_g--; atomic_inc(&s->s_g_left[qid]); atomic_dec(&s->s_g_left[qstl_id]); } if ((type == task_type_self || type == task_type_sub_self)&& subtype == task_subtype_gpu_pack_f) { - atomic_inc(&q->n_packs_self_left_f); - atomic_dec(&q_stl->n_packs_self_left_f); + q->n_packs_self_left_f--; + q_stl->n_packs_self_left_f--; atomic_inc(&s->s_f_left[qid]); atomic_dec(&s->s_f_left[qstl_id]); } if ((type == task_type_pair || type == task_type_sub_pair)&& subtype == task_subtype_gpu_pack) { - atomic_inc(&q->n_packs_pair_left_d); - atomic_dec(&q_stl->n_packs_pair_left_d); + q->n_packs_pair_left_d--; + q_stl->n_packs_pair_left_d--; atomic_inc(&s->p_d_left[qid]); atomic_dec(&s->p_d_left[qstl_id]); } if ((type == task_type_pair || type == task_type_sub_pair)&& subtype == task_subtype_gpu_pack_g) { - atomic_inc(&q->n_packs_pair_left_g); - atomic_dec(&q_stl->n_packs_pair_left_g); + q->n_packs_pair_left_g--; + q_stl->n_packs_pair_left_g--; atomic_inc(&s->p_g_left[qid]); atomic_dec(&s->p_g_left[qstl_id]); } if ((type == task_type_pair || type == task_type_sub_pair)&& subtype == task_subtype_gpu_pack_f) { - atomic_inc(&q->n_packs_pair_left_f); - atomic_dec(&q_stl->n_packs_pair_left_f); + q->n_packs_pair_left_f--; + q_stl->n_packs_pair_left_f--; atomic_inc(&s->p_f_left[qid]); atomic_dec(&s->p_f_left[qstl_id]); } From 3e08973df37fdfd18945d289cabf035696480a73 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Mon, 27 Jan 2025 14:48:32 +0000 Subject: [PATCH 141/217] Repeated changing atomic_incs to regular increment within queue lock regions in scheduler enqueue function --- src/scheduler.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/scheduler.c b/src/scheduler.c index 9814318e34..e08463e110 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -2900,19 +2900,19 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) { if (t->type == task_type_self 
|| t->type == task_type_sub_self) { if (t->subtype == task_subtype_gpu_pack){ lock_lock(&s->queues[qid].lock); - atomic_inc(&s->queues[qid].n_packs_self_left_d); + s->queues[qid].n_packs_self_left_d++; lock_unlock(&s->queues[qid].lock); atomic_inc(&s->s_d_left[qid]); } if (t->subtype == task_subtype_gpu_pack_f){ lock_lock(&s->queues[qid].lock); - atomic_inc(&s->queues[qid].n_packs_self_left_f); + s->queues[qid].n_packs_self_left_f++; lock_unlock(&s->queues[qid].lock); atomic_inc(&s->s_f_left[qid]); } if (t->subtype == task_subtype_gpu_pack_g){ lock_lock(&s->queues[qid].lock); - atomic_inc(&s->queues[qid].n_packs_self_left_g); + s->queues[qid].n_packs_self_left_g++; lock_unlock(&s->queues[qid].lock); atomic_inc(&s->s_g_left[qid]); } @@ -2922,19 +2922,19 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) { if (t->type == task_type_pair || t->type == task_type_sub_pair) { if (t->subtype == task_subtype_gpu_pack) { lock_lock(&s->queues[qid].lock); - atomic_inc(&s->queues[qid].n_packs_pair_left_d); + s->queues[qid].n_packs_pair_left_d++; lock_unlock(&s->queues[qid].lock); atomic_inc(&s->p_d_left[qid]); } if (t->subtype == task_subtype_gpu_pack_f) { lock_lock(&s->queues[qid].lock); - atomic_inc(&s->queues[qid].n_packs_pair_left_f); + s->queues[qid].n_packs_pair_left_f++; lock_unlock(&s->queues[qid].lock); atomic_inc(&s->p_f_left[qid]); } if (t->subtype == task_subtype_gpu_pack_g) { lock_lock(&s->queues[qid].lock); - atomic_inc(&s->queues[qid].n_packs_pair_left_g); + s->queues[qid].n_packs_pair_left_g++; lock_unlock(&s->queues[qid].lock); atomic_inc(&s->p_g_left[qid]); } From 3a71e3fe41158ceef3ace406009113f98109319c Mon Sep 17 00:00:00 2001 From: Abouzied Date: Mon, 27 Jan 2025 15:01:04 +0000 Subject: [PATCH 142/217] Repeated the same for atomic_decs in runner_doiact_functions_hydro_gpu --- src/runner_doiact_functions_hydro_gpu.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index 5b3002c133..d6db85e681 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -214,7 +214,7 @@ double runner_doself1_pack_f4(struct runner *r, struct scheduler *s, lock_lock(&s->queues[qid].lock); - atomic_dec(&s->queues[qid].n_packs_self_left_d); + s->queues[qid].n_packs_self_left_d--; if (s->queues[qid].n_packs_self_left_d < 1) pack_vars->launch_leftovers = 1; @@ -353,7 +353,7 @@ double runner_doself1_pack_f4_g(struct runner *r, struct scheduler *s, lock_lock(&s->queues[qid].lock); - atomic_dec(&s->queues[qid].n_packs_self_left_g); + s->queues[qid].n_packs_self_left_g--; if (s->queues[qid].n_packs_self_left_g < 1) pack_vars->launch_leftovers = 1; @@ -492,7 +492,7 @@ double runner_doself1_pack_f4_f(struct runner *r, struct scheduler *s, lock_lock(&s->queues[qid].lock); - atomic_dec(&s->queues[qid].n_packs_self_left_f); + s->queues[qid].n_packs_self_left_f--; if (s->queues[qid].n_packs_self_left_f < 1) pack_vars->launch_leftovers = 1; @@ -705,7 +705,7 @@ double runner_dopair1_pack_f4(struct runner *r, struct scheduler *s, lock_lock(&s->queues[qid].lock); - atomic_dec(&s->queues[qid].n_packs_pair_left_d); + s->queues[qid].n_packs_pair_left_d--; if (s->queues[qid].n_packs_pair_left_d < 1) pack_vars->launch_leftovers = 1; @@ -923,7 +923,7 @@ double runner_dopair1_pack_f4_g(struct runner *r, struct scheduler *s, lock_lock(&s->queues[qid].lock); - atomic_dec(&s->queues[qid].n_packs_pair_left_g); + s->queues[qid].n_packs_pair_left_g--; if 
(s->queues[qid].n_packs_pair_left_g < 1) pack_vars->launch_leftovers = 1; @@ -1144,7 +1144,7 @@ double runner_dopair1_pack_f4_f(struct runner *r, struct scheduler *s, lock_lock(&s->queues[qid].lock); - atomic_dec(&s->queues[qid].n_packs_pair_left_f); + s->queues[qid].n_packs_pair_left_f--; if (s->queues[qid].n_packs_pair_left_f < 1) pack_vars->launch_leftovers = 1; From b4c57bfd0b9d6386a4f4bb97848184434fcd0d7b Mon Sep 17 00:00:00 2001 From: Matthieu Schaller Date: Mon, 27 Jan 2025 22:49:17 +0100 Subject: [PATCH 143/217] Applied code formatting script --- src/cell_grid.c | 2 +- src/cuda/GPU_runner_functions.cu | 24 +- src/cuda/part_gpu.h | 2 +- src/engine_config.c | 9 +- src/engine_maketasks.c | 234 +++++++++-------- src/engine_marktasks.c | 9 +- src/fof.c | 4 +- src/hip/BLOCK_SIZE.h | 2 +- src/hip/Data_and_GPU_prep_functions.cu | 20 +- src/hip/HIP_runner_functions.h | 2 +- src/hip/cell_gpu.h | 2 +- src/hip/cuda_headers.h | 2 +- src/hip/device_functions.h | 18 +- src/hip/part_gpu.h | 2 +- src/hip/tester.cu | 1 + src/lightcone/lightcone_crossing.h | 2 +- src/lightcone/lightcone_replications.c | 4 +- src/lightcone/lightcone_shell.c | 2 +- src/power_spectrum.c | 4 +- src/queue.h | 15 +- src/runner_black_holes.c | 12 +- src/runner_doiact_functions_black_holes.h | 16 +- src/runner_doiact_functions_hydro.h | 32 +-- src/runner_doiact_functions_hydro_gpu.h | 298 +++++++++++----------- src/runner_doiact_functions_limiter.h | 12 +- src/runner_doiact_functions_stars.h | 20 +- src/runner_doiact_grav.c | 2 +- src/runner_doiact_nosort.h | 2 +- src/runner_gpu_pack_functions.c | 26 +- src/runner_main_clean.cu | 235 ++++++++--------- src/runner_others.c | 2 +- src/runner_sinks.c | 18 +- src/scheduler.c | 248 +++++++++--------- src/scheduler.h | 3 +- src/space_regrid.c | 2 +- src/space_split.c | 2 +- src/task.c | 48 ++-- 37 files changed, 679 insertions(+), 659 deletions(-) mode change 100755 => 100644 src/hip/Data_and_GPU_prep_functions.cu mode change 100755 => 100644 src/hip/cell_gpu.h mode change 100755 => 100644 src/hip/cuda_headers.h mode change 100755 => 100644 src/hip/part_gpu.h mode change 100755 => 100644 src/hip/tester.cu diff --git a/src/cell_grid.c b/src/cell_grid.c index 3b3d9fa130..313d3843a3 100644 --- a/src/cell_grid.c +++ b/src/cell_grid.c @@ -353,7 +353,7 @@ void cell_set_grid_completeness_mapper(void *map_data, int num_elements, } } } /* Now loop over all the neighbours of this cell */ - } /* Loop through the elements, which are just byte offsets from NULL. */ + } /* Loop through the elements, which are just byte offsets from NULL. 
*/ } /** diff --git a/src/cuda/GPU_runner_functions.cu b/src/cuda/GPU_runner_functions.cu index 6197c0231b..d3c08c10ae 100644 --- a/src/cuda/GPU_runner_functions.cu +++ b/src/cuda/GPU_runner_functions.cu @@ -1831,8 +1831,8 @@ __device__ void DOPAIR2NONSYMGPU(struct part_soa parts_soa, int pid, rot_uzi += faci * curlvrz; } } /*if (jj < cj_end && pid < ci_end && pid >= ci_start)*/ - } /*End of looping through particles in shared memory---Shared arrays - zero'ed for next step in outer loop*/ + } /*End of looping through particles in shared memory---Shared arrays + zero'ed for next step in outer loop*/ __syncthreads(); } /*Loop through parts in cell j one BLOCK_SIZE at a time*/ if (pid >= ci_start && pid < ci_end) { @@ -1976,8 +1976,8 @@ __device__ void DOPAIR2NONSYMGPUAOS(struct part_aos *parts_aos, int pid, // 20)printf("incorrect timebin %i\n", timebin[j_block]); } } /*if (jj < cj_end && pid < ci_end && pid >= ci_start)*/ - } /*End of looping through particles in shared memory---Shared arrays - zero'ed for next step in outer loop*/ + } /*End of looping through particles in shared memory---Shared arrays + zero'ed for next step in outer loop*/ __syncthreads(); } /*Loop through parts in cell j one BLOCK_SIZE at a time*/ if (pid >= ci_start && pid < ci_end) { @@ -2083,8 +2083,8 @@ __device__ void DOPAIR2NONSYMGPUAOSF4( res_rot.w -= faci * dvdr; } } /*if (jj < cj_end && pid < ci_end && pid >= ci_start)*/ - } /*End of looping through particles in shared memory---Shared arrays - zero'ed for next step in outer loop*/ + } /*End of looping through particles in shared memory---Shared arrays + zero'ed for next step in outer loop*/ __syncthreads(); } /*Loop through parts in cell j one BLOCK_SIZE at a time*/ if (pid >= ci_start && pid < ci_end) { @@ -2320,8 +2320,8 @@ __device__ void DOPAIR2NONSYMGPUAOSG(struct part_aos_g *parts_aos, int pid, alpha_visc_max_ngb = max(alpha_visc_max_ngb, alpha_j); } } /*if (jj < cj_end && pid < ci_end && pid >= ci_start)*/ - } /*End of looping through particles in shared memory---Shared arrays - zero'ed for next step in outer loop*/ + } /*End of looping through particles in shared memory---Shared arrays + zero'ed for next step in outer loop*/ __syncthreads(); } /*Loop through parts in cell j one BLOCK_SIZE at a time*/ if (pid >= ci_start && pid < ci_end) { @@ -3051,11 +3051,11 @@ __device__ void DOPAIR2GPU(struct part_soa parts_soa, int pid, atomicAdd(&parts_soa.rot_uz[j], facj * curlvrz); // printf("rho %f rho_dh %f wcount %f wcount_dh %f div_v //%f rotux %f rotuy %f rotuz %f\n" ,rhoi, - //rho_dhi, wcounti, wcount_dhi, div_vi, rot_uxi, rot_uyi, rot_uzi); + // rho_dhi, wcounti, wcount_dhi, div_vi, rot_uxi, rot_uyi, rot_uzi); } /*if r2= ci_start)*/ - } /*End of looping through particles in shared memory---Shared arrays - zero'ed for next step in outer loop*/ + } /*if (jj < cj_end && pid < ci_end && pid >= ci_start)*/ + } /*End of looping through particles in shared memory---Shared arrays + zero'ed for next step in outer loop*/ __syncthreads(); // if(j < cj_end){ // atomicAdd(&parts_soa.rho[j], rho_tmp[threadIdx.x]); diff --git a/src/cuda/part_gpu.h b/src/cuda/part_gpu.h index 37b019d421..f07b5bfda6 100644 --- a/src/cuda/part_gpu.h +++ b/src/cuda/part_gpu.h @@ -9,7 +9,7 @@ typedef int8_t timebin_t; extern "C" { #endif -//#include +// #include #include diff --git a/src/engine_config.c b/src/engine_config.c index ff3ff5ec9f..4c0c4420c4 100644 --- a/src/engine_config.c +++ b/src/engine_config.c @@ -923,12 +923,11 @@ void engine_config(int restart, int fof, struct engine *e, 
parser_get_opt_param_float(params, "Scheduler:links_per_tasks", 25.); /* Init the scheduler. Allow stealing*/ - scheduler_init(&e->sched, e->s, maxtasks, nr_queues, - (e->policy & scheduler_flag_steal), e->nodeID, - &e->threadpool); + scheduler_init(&e->sched, e->s, maxtasks, nr_queues, + (e->policy & scheduler_flag_steal), e->nodeID, &e->threadpool); /* Init the scheduler. NO stealing A. Nasar */ -// scheduler_init(&e->sched, e->s, maxtasks, nr_queues, 0, e->nodeID, -// &e->threadpool); + // scheduler_init(&e->sched, e->s, maxtasks, nr_queues, 0, e->nodeID, + // &e->threadpool); /* Maximum size of MPI task messages, in KB, that should not be buffered, * that is sent using MPI_Issend, not MPI_Isend. 4Mb by default. Can be diff --git a/src/engine_maketasks.c b/src/engine_maketasks.c index a8fcf8a917..85e5df2493 100644 --- a/src/engine_maketasks.c +++ b/src/engine_maketasks.c @@ -2208,13 +2208,13 @@ void engine_count_and_link_tasks_mapper(void *map_data, int num_elements, engine_addlink(e, &ci->hydro.density, t); } else if (t_subtype == task_subtype_gpu_pack) { // A. Nasar engine_addlink(e, &ci->hydro.density_pack, t); -// error("Abouzied: you need to code this up!"); + // error("Abouzied: you need to code this up!"); } else if (t_subtype == task_subtype_gpu_pack_f) { engine_addlink(e, &ci->hydro.force_pack, t); -// error("Abouzied: you need to code this up!"); + // error("Abouzied: you need to code this up!"); } else if (t_subtype == task_subtype_gpu_pack_g) { engine_addlink(e, &ci->hydro.gradient_pack, t); -// error("Abouzied: you need to code this up!"); + // error("Abouzied: you need to code this up!"); } else if (t_subtype == task_subtype_grav) { engine_addlink(e, &ci->grav.grav, t); } else if (t_subtype == task_subtype_external_grav) { @@ -2232,15 +2232,15 @@ void engine_count_and_link_tasks_mapper(void *map_data, int num_elements, } else if (t_subtype == task_subtype_gpu_pack) { engine_addlink(e, &ci->hydro.density_pack, t); engine_addlink(e, &cj->hydro.density_pack, t); -// error("Abouzied: you need to code this up!"); + // error("Abouzied: you need to code this up!"); } else if (t_subtype == task_subtype_gpu_pack_f) { engine_addlink(e, &ci->hydro.force_pack, t); engine_addlink(e, &cj->hydro.force_pack, t); -// error("Abouzied: you need to code this up!"); + // error("Abouzied: you need to code this up!"); } else if (t_subtype == task_subtype_gpu_pack_g) { engine_addlink(e, &ci->hydro.gradient_pack, t); engine_addlink(e, &cj->hydro.gradient_pack, t); -// error("Abouzied: you need to code this up!"); + // error("Abouzied: you need to code this up!"); } else if (t_subtype == task_subtype_grav) { engine_addlink(e, &ci->grav.grav, t); engine_addlink(e, &cj->grav.grav, t); @@ -3425,9 +3425,10 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, } } /*Make packing depend on sorts and drift A. Nasar */ - else if (t_type == task_type_sub_self && t_subtype == task_subtype_gpu_pack) { + else if (t_type == task_type_sub_self && + t_subtype == task_subtype_gpu_pack) { scheduler_addunlock(sched, ci->hydro.super->hydro.drift, t); -// scheduler_addunlock(sched, ci->hydro.super->hydro.sorts, t); + // scheduler_addunlock(sched, ci->hydro.super->hydro.sorts, t); } /* Otherwise, sub-self interaction? 
*/ else if (t_type == task_type_sub_self && @@ -3442,8 +3443,9 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, /* Start by constructing the task for the second hydro loop */ t_force = scheduler_addtask(sched, task_type_sub_self, task_subtype_force, flags, 0, ci, NULL); - t_force_gpu = scheduler_addtask(sched, task_type_sub_self, task_subtype_gpu_pack_f, - flags, 0, ci, NULL); + t_force_gpu = + scheduler_addtask(sched, task_type_sub_self, task_subtype_gpu_pack_f, + flags, 0, ci, NULL); /* and the task for the time-step limiter */ if (with_timestep_limiter) { @@ -3551,8 +3553,9 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, /* Start by constructing the task for the second and third hydro loop */ t_gradient = scheduler_addtask(sched, task_type_sub_self, task_subtype_gradient, flags, 0, ci, NULL); - t_gradient_gpu = scheduler_addtask(sched, task_type_sub_self, - task_subtype_gpu_pack_g, flags, 0, ci, NULL); + t_gradient_gpu = + scheduler_addtask(sched, task_type_sub_self, task_subtype_gpu_pack_g, + flags, 0, ci, NULL); /* Add the link between the new loop and the cell */ engine_addlink(e, &ci->hydro.gradient, t_gradient); @@ -3564,17 +3567,16 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, t_limiter, ci, with_cooling, with_timestep_limiter); scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, - t_gradient_gpu); + t_gradient_gpu); scheduler_addunlock(sched, ci->hydro.super->hydro.extra_ghost, - t_force_gpu); + t_force_gpu); #else /* Now, build all the dependencies for the hydro for the cells */ /* that are local and are not descendant of the same super_hydro-cells */ engine_make_hydro_loops_dependencies(sched, t, t_force, t_limiter, ci, with_cooling, with_timestep_limiter); - scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, - t_force_gpu); + scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, t_force_gpu); #endif /* Create the task dependencies */ @@ -3714,22 +3716,21 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, /* Otherwise, sub-pair interaction? 
*/ else if (t_type == task_type_sub_pair && t_subtype == task_subtype_gpu_pack) { - /* Make all density pack tasks depend on the drift */ - if (ci->nodeID == nodeID) { - scheduler_addunlock(sched, ci->hydro.super->hydro.drift, t); - } - if ((cj->nodeID == nodeID) && (ci->hydro.super != cj->hydro.super)) { - scheduler_addunlock(sched, cj->hydro.super->hydro.drift, t); - } + /* Make all density pack tasks depend on the drift */ + if (ci->nodeID == nodeID) { + scheduler_addunlock(sched, ci->hydro.super->hydro.drift, t); + } + if ((cj->nodeID == nodeID) && (ci->hydro.super != cj->hydro.super)) { + scheduler_addunlock(sched, cj->hydro.super->hydro.drift, t); + } - /* Make all density tasks depend on the sorts */ - scheduler_addunlock(sched, ci->hydro.super->hydro.sorts, t); - if (ci->hydro.super != cj->hydro.super) { - scheduler_addunlock(sched, cj->hydro.super->hydro.sorts, t); - } - } - else if (t_type == task_type_sub_pair && - t_subtype == task_subtype_density) { + /* Make all density tasks depend on the sorts */ + scheduler_addunlock(sched, ci->hydro.super->hydro.sorts, t); + if (ci->hydro.super != cj->hydro.super) { + scheduler_addunlock(sched, cj->hydro.super->hydro.sorts, t); + } + } else if (t_type == task_type_sub_pair && + t_subtype == task_subtype_density) { const int bcount_i = ci->black_holes.count; const int bcount_j = cj->black_holes.count; @@ -3751,8 +3752,8 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, /* New task for the force */ t_force = scheduler_addtask(sched, task_type_sub_pair, task_subtype_force, flags, 0, ci, cj); - t_force_gpu = scheduler_addtask(sched, task_type_sub_pair, task_subtype_gpu_pack_f, - flags, 0, ci, cj); + t_force_gpu = scheduler_addtask( + sched, task_type_sub_pair, task_subtype_gpu_pack_f, flags, 0, ci, cj); #ifdef MPI_SYMMETRIC_FORCE_INTERACTION /* The order of operations for an inactive local cell interacting @@ -3764,8 +3765,8 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, scheduler_addunlock(sched, ci->hydro.super->hydro.sorts, t_force); scheduler_addunlock(sched, ci->hydro.super->hydro.sorts, t_force_gpu); if (ci->hydro.super != cj->hydro.super) { - scheduler_addunlock(sched, cj->hydro.super->hydro.sorts, t_force); - scheduler_addunlock(sched, cj->hydro.super->hydro.sorts, t_force_gpu); + scheduler_addunlock(sched, cj->hydro.super->hydro.sorts, t_force); + scheduler_addunlock(sched, cj->hydro.super->hydro.sorts, t_force_gpu); } #endif @@ -3918,8 +3919,8 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, /* Start by constructing the task for the second and third hydro loop */ t_gradient = scheduler_addtask(sched, task_type_sub_pair, task_subtype_gradient, flags, 0, ci, cj); - t_gradient_gpu = scheduler_addtask(sched, task_type_sub_pair, - task_subtype_gpu_pack_g, flags, 0, ci, cj); + t_gradient_gpu = scheduler_addtask( + sched, task_type_sub_pair, task_subtype_gpu_pack_g, flags, 0, ci, cj); /* Add the link between the new loop and both cells */ engine_addlink(e, &ci->hydro.gradient, t_gradient); @@ -3934,18 +3935,18 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, t_limiter, ci, with_cooling, with_timestep_limiter); scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, - t_gradient_gpu); + t_gradient_gpu); scheduler_addunlock(sched, ci->hydro.super->hydro.extra_ghost, - t_force_gpu); + t_force_gpu); } if ((cj->nodeID == nodeID) && (ci->hydro.super != cj->hydro.super)) { 
engine_make_hydro_loops_dependencies(sched, t, t_gradient, t_force, t_limiter, cj, with_cooling, with_timestep_limiter); scheduler_addunlock(sched, cj->hydro.super->hydro.ghost_out, - t_gradient_gpu); + t_gradient_gpu); scheduler_addunlock(sched, cj->hydro.super->hydro.extra_ghost, - t_force_gpu); + t_force_gpu); } #else @@ -3956,14 +3957,14 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, with_cooling, with_timestep_limiter); scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, - t_force_gpu); + t_force_gpu); } if ((cj->nodeID == nodeID) && (ci->hydro.super != cj->hydro.super)) { engine_make_hydro_loops_dependencies(sched, t, t_force, t_limiter, cj, with_cooling, with_timestep_limiter); scheduler_addunlock(sched, cj->hydro.super->hydro.ghost_out, - t_force_gpu); + t_force_gpu); } #endif @@ -4916,7 +4917,6 @@ void engine_maketasks(struct engine *e) { tic2 = getticks(); - /* Run through the tasks and make force tasks for each density task. Each force task depends on the cell ghosts and unlocks the kick task of its super-cell. */ @@ -4931,35 +4931,41 @@ void engine_maketasks(struct engine *e) { * sched->tasks, sched->nr_tasks, sizeof(struct task), * threadpool_auto_chunk_size, e); */ } -// int unsplit = 0, split = 0; -// /*These loops should really be threadmapped A. Nasar*/ -// for (int i = 0; i < sched->nr_tasks; i++) { -// struct task * t = &sched->tasks[i]; -// if(t->type == task_type_sub_self && t->subtype == task_subtype_gpu_pack){ -// t->type = task_type_self; -// fprintf(stderr, "sub_self"); -// } -// if(t->type == task_type_sub_pair && t->subtype == task_subtype_gpu_pack){ -// t->type = task_type_pair; -// fprintf(stderr, "sub_pair"); -// } -// if(t->type == task_type_sub_self && t->subtype == task_subtype_gpu_pack_g){ -// t->type = task_type_self; -// fprintf(stderr, "sub_self"); -// } -// if(t->type == task_type_sub_pair && t->subtype == task_subtype_gpu_pack_g){ -// t->type = task_type_pair; -// fprintf(stderr, "sub_pair"); -// } -// if(t->type == task_type_sub_self && t->subtype == task_subtype_gpu_pack_f){ -// t->type = task_type_self; -// fprintf(stderr, "sub_self"); -// } -// if(t->type == task_type_sub_pair && t->subtype == task_subtype_gpu_pack_f){ -// t->type = task_type_pair; -// fprintf(stderr, "sub_pair"); -// } -// } + // int unsplit = 0, split = 0; + // /*These loops should really be threadmapped A. 
Nasar*/ + // for (int i = 0; i < sched->nr_tasks; i++) { + // struct task * t = &sched->tasks[i]; + // if(t->type == task_type_sub_self && t->subtype == + //task_subtype_gpu_pack){ + // t->type = task_type_self; + // fprintf(stderr, "sub_self"); + // } + // if(t->type == task_type_sub_pair && t->subtype == + // task_subtype_gpu_pack){ + // t->type = task_type_pair; + // fprintf(stderr, "sub_pair"); + // } + // if(t->type == task_type_sub_self && t->subtype == + //task_subtype_gpu_pack_g){ + // t->type = task_type_self; + // fprintf(stderr, "sub_self"); + // } + // if(t->type == task_type_sub_pair && t->subtype == + // task_subtype_gpu_pack_g){ + // t->type = task_type_pair; + // fprintf(stderr, "sub_pair"); + // } + // if(t->type == task_type_sub_self && t->subtype == + //task_subtype_gpu_pack_f){ + // t->type = task_type_self; + // fprintf(stderr, "sub_self"); + // } + // if(t->type == task_type_sub_pair && t->subtype == + // task_subtype_gpu_pack_f){ + // t->type = task_type_pair; + // fprintf(stderr, "sub_pair"); + // } + // } /* Now, create unpack tasks based on the existing packs and create * the dependencies pack->unpack->ghost_in A. Nasar */ @@ -4973,7 +4979,8 @@ void engine_maketasks(struct engine *e) { struct task *last_created_pair_unpack = NULL; /* Loop over all the currently existing pack tasks - * These loops should be thread-mapped too but will be a bit more tricky: A. Nasar*/ + * These loops should be thread-mapped too but will be a bit more tricky: A. + * Nasar*/ for (int i = 0; i < sched->nr_tasks; i++) { struct task *t = &sched->tasks[i]; @@ -5162,17 +5169,19 @@ void engine_maketasks(struct engine *e) { if (count_current_pair != sched->nr_pair_pack_tasks_f) error("We did not find the correct number of F pair pack tasks!!"); #endif -/*Debug code to check if some tasks are not split to desired level in tree for GPU*/ -// for (int i = 0; i < sched->nr_tasks; i++) { -// struct task *t = &sched->tasks[i]; -// if(t->ci != NULL){ -//// if(t->type == task_type_pair && ((t->ci->split && !t->cj->split) || (!t->ci->split && t->cj->split))) -//// error("one is split the other isn't"); -// if(t->ci->hydro.count > 80 && t->type == task_type_self) -// error("Count is %i task subtype (%s)", -// t->ci->hydro.count, subtaskID_names[t->subtype]); -// } -// } + /*Debug code to check if some tasks are not split to desired level in tree for + * GPU*/ + // for (int i = 0; i < sched->nr_tasks; i++) { + // struct task *t = &sched->tasks[i]; + // if(t->ci != NULL){ + //// if(t->type == task_type_pair && ((t->ci->split && !t->cj->split) || + ///(!t->ci->split && t->cj->split))) / error("one is split the other + ///isn't"); + // if(t->ci->hydro.count > 80 && t->type == task_type_self) + // error("Count is %i task subtype (%s)", + // t->ci->hydro.count, subtaskID_names[t->subtype]); + // } + // } if (e->verbose) message("Making extra hydroloop tasks took %.3f %s.", clocks_from_ticks(getticks() - tic2), clocks_getunit()); @@ -5329,34 +5338,33 @@ void engine_maketasks(struct engine *e) { struct task *t = &sched->tasks[i]; if (t->subtype == task_subtype_density || - t->subtype == task_subtype_gradient || - t->subtype == task_subtype_force){ - t->implicit = 1; + t->subtype == task_subtype_gradient || + t->subtype == task_subtype_force) { + t->implicit = 1; } -// if (t->subtype == task_subtype_gpu_pack || -// t->subtype == task_subtype_gpu_pack_g || -// t->subtype == task_subtype_gpu_pack_f || -// t->subtype == task_subtype_gpu_unpack || -// t->subtype == task_subtype_gpu_unpack_g || -// t->subtype == 
task_subtype_gpu_unpack_f){ -// t->implicit = 1; -// } -// if (t->subtype == task_subtype_gpu_pack_g || -// t->subtype == task_subtype_gpu_pack_f || -// t->subtype == task_subtype_gpu_unpack_g || -// t->subtype == task_subtype_gpu_unpack_f){// || -//// (t->type == task_type_pair && -//// t->subtype == task_subtype_gpu_pack)){ -// t->implicit = 1; -// } -// if ((t->subtype == task_subtype_gpu_pack || -// t->subtype == task_subtype_gpu_pack_g || -// t->subtype == task_subtype_gpu_pack_f) && -// (t->type == task_type_sub_pair || -// t->type == task_type_sub_self)){ -// t->implicit = 1; -//// error("STill have subs"); -// } + // if (t->subtype == task_subtype_gpu_pack || + // t->subtype == task_subtype_gpu_pack_g || + // t->subtype == task_subtype_gpu_pack_f || + // t->subtype == task_subtype_gpu_unpack || + // t->subtype == task_subtype_gpu_unpack_g || + // t->subtype == task_subtype_gpu_unpack_f){ + // t->implicit = 1; + // } + // if (t->subtype == task_subtype_gpu_pack_g || + // t->subtype == task_subtype_gpu_pack_f || + // t->subtype == task_subtype_gpu_unpack_g || + // t->subtype == task_subtype_gpu_unpack_f){// || + //// (t->type == task_type_pair && + //// t->subtype == task_subtype_gpu_pack)){ + // t->implicit = 1; + // } + // if ((t->subtype == task_subtype_gpu_pack || + // t->subtype == task_subtype_gpu_pack_g || + // t->subtype == task_subtype_gpu_pack_f) && + // (t->type == task_type_sub_pair || + // t->type == task_type_sub_self)){ + // t->implicit = 1; + //// error("STill have subs"); + // } } - } diff --git a/src/engine_marktasks.c b/src/engine_marktasks.c index 00222ffc46..15be210a22 100644 --- a/src/engine_marktasks.c +++ b/src/engine_marktasks.c @@ -186,7 +186,8 @@ void engine_marktasks_mapper(void *map_data, int num_elements, } } - /* Store current values of dx_max and h_max. A. Nasar: Unsure if we actually need this*/ + /* Store current values of dx_max and h_max. A. 
Nasar: Unsure if we + actually need this*/ else if (t_type == task_type_sub_self && t_subtype == task_subtype_gpu_pack) { if (ci_active_hydro) { @@ -200,7 +201,7 @@ void engine_marktasks_mapper(void *map_data, int num_elements, else if (t_type == task_type_sub_self && (t_subtype == task_subtype_force || - t_subtype == task_subtype_gpu_pack_f)) { + t_subtype == task_subtype_gpu_pack_f)) { if (ci_active_hydro) scheduler_activate(s, t); } @@ -219,8 +220,8 @@ void engine_marktasks_mapper(void *map_data, int num_elements, } else if (t_type == task_type_sub_self && - t_subtype == task_subtype_gradient || - t_subtype == task_subtype_gpu_pack_g) { + t_subtype == task_subtype_gradient || + t_subtype == task_subtype_gpu_pack_g) { if (ci_active_hydro) scheduler_activate(s, t); } diff --git a/src/fof.c b/src/fof.c index e9f69932cf..6594b2b60e 100644 --- a/src/fof.c +++ b/src/fof.c @@ -2350,8 +2350,8 @@ void fof_calc_group_mass(struct fof_props *props, const struct space *s, } } /* Foreign root */ - } /* Particle is in a group */ - } /* Loop over particles */ + } /* Particle is in a group */ + } /* Loop over particles */ size_t nsend = map.size; struct fof_mass_send_hashmap hashmap_mass_send = {NULL, 0}; diff --git a/src/hip/BLOCK_SIZE.h b/src/hip/BLOCK_SIZE.h index b476b4d766..d36e10b99b 100644 --- a/src/hip/BLOCK_SIZE.h +++ b/src/hip/BLOCK_SIZE.h @@ -7,4 +7,4 @@ #ifdef WITH_CUDA //} #endif -#endif // BLOCK_SIZE_H +#endif // BLOCK_SIZE_H diff --git a/src/hip/Data_and_GPU_prep_functions.cu b/src/hip/Data_and_GPU_prep_functions.cu old mode 100755 new mode 100644 index c96734e8b3..57cbe0ad7c --- a/src/hip/Data_and_GPU_prep_functions.cu +++ b/src/hip/Data_and_GPU_prep_functions.cu @@ -7,16 +7,16 @@ /*ifdef WITH_CUDA prevents name mangling. C code sees exact names of functions rather than mangled template names produced by C++*/ -//#ifdef WITH_CUDA +// #ifdef WITH_CUDA // extern "C"{ -//#endif +// #endif -//#include "cuda/cuda_headers.h" -//#include "device_functions.h" -//#include "cuda/cell_gpu.h" +// #include "cuda/cuda_headers.h" +// #include "device_functions.h" +// #include "cuda/cell_gpu.h" #include #include -//#include "../config.h" +// #include "../config.h" void populate_parts_list(struct cell *ci, struct part_gpu *parts) { //////////////////////////////////////////// @@ -143,7 +143,7 @@ void populate_parts_list_soa( SPH_sum[p_gid] = 0.f; // fprintf(stderr,"tid is %i\n",tid_p[p]); // fprintf(stderr,"Tester 222, count=%i, p=%i\n", count, - //id[p_gid]); + // id[p_gid]); } } @@ -220,10 +220,10 @@ void pack_data_soa(int count_all_parts, struct cell *ci, int first_part_tmp, SPH_sum[p_gid] = 0.f; // fprintf(stderr,"tid is %i\n",tid_p[p]); // fprintf(stderr,"Tester 222, count=%i, p=%i\n", count, - //id[p_gid]); + // id[p_gid]); } } -//#ifdef WITH_CUDA +// #ifdef WITH_CUDA // } -//#endif +// #endif diff --git a/src/hip/HIP_runner_functions.h b/src/hip/HIP_runner_functions.h index b85772f6b0..43a52f96ed 100644 --- a/src/hip/HIP_runner_functions.h +++ b/src/hip/HIP_runner_functions.h @@ -19,4 +19,4 @@ void launch_density_kernel(struct part_soa parts_soa, int *d_task_first_part, } #endif -#endif // CUDA_HEADER_H +#endif // CUDA_HEADER_H diff --git a/src/hip/cell_gpu.h b/src/hip/cell_gpu.h old mode 100755 new mode 100644 index 0265592a97..dc8d9306f2 --- a/src/hip/cell_gpu.h +++ b/src/hip/cell_gpu.h @@ -289,4 +289,4 @@ struct cells_gpu_flat_test { float *locx; }; -#endif // CELL_GPU_H +#endif // CELL_GPU_H diff --git a/src/hip/cuda_headers.h b/src/hip/cuda_headers.h old mode 100755 new mode 100644 index 
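/* A minimal, self-contained sketch of the linkage guard referred to in the
 * comments above: headers shared between plain-C translation units and the
 * nvcc/hipcc-compiled .cu files wrap their declarations in extern "C" so the
 * C side sees unmangled symbol names. The header guard and function name
 * below are hypothetical placeholders, not identifiers added by this patch. */
#ifndef EXAMPLE_GPU_API_H
#define EXAMPLE_GPU_API_H

#ifdef __cplusplus
extern "C" {
#endif

/* Implemented in a .cu file, callable from C without name mangling. */
void example_launch_density_kernel(float *d_buffer, int count);

#ifdef __cplusplus
}
#endif

#endif /* EXAMPLE_GPU_API_H */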
40782e0056..2df61a53b5 --- a/src/hip/cuda_headers.h +++ b/src/hip/cuda_headers.h @@ -60,4 +60,4 @@ void mgd_mem_cuda_kernel_bundles(struct part_gpu **parts_gpu_list, } #endif -#endif // CUDA_HEADER_H +#endif // CUDA_HEADER_H diff --git a/src/hip/device_functions.h b/src/hip/device_functions.h index 2cba0e9829..237c87dec1 100644 --- a/src/hip/device_functions.h +++ b/src/hip/device_functions.h @@ -3,11 +3,11 @@ #include "../../config.h" /* Local headers. */ -//#include "../dimension.h" -//#include "../error.h" -//#include "../inline.h" -//#include "../minmax.h" -//#include "../vector.h" +// #include "../dimension.h" +// #include "../error.h" +// #include "../inline.h" +// #include "../minmax.h" +// #include "../vector.h" // Is this even necessary? Probably not as our code will operate differently #define num_cuda_threads 128 @@ -22,11 +22,11 @@ #define kernel_ivals 2 #define kernel_degree 3 /*!< Degree of the polynomial */ #define kernel_gamma_dim ((float)(kernel_gamma * kernel_gamma * kernel_gamma)) -#define kernel_gamma_dim_plus_one \ +#define kernel_gamma_dim_plus_one \ ((float)(kernel_gamma * kernel_gamma * kernel_gamma * kernel_gamma)) -#define kernel_gamma_inv_dim \ +#define kernel_gamma_inv_dim \ ((float)(1. / (kernel_gamma * kernel_gamma * kernel_gamma))) -#define kernel_gamma_inv_dim_plus_one \ +#define kernel_gamma_inv_dim_plus_one \ ((float)(1. / (kernel_gamma * kernel_gamma * kernel_gamma * kernel_gamma))) #define kernel_ivals_f ((float)kernel_ivals) /*!< Number of branches */ #define kernel_constant ((float)(16. * M_1_PI)) @@ -146,4 +146,4 @@ __device__ void d_kernel_deval(float u, float *restrict W, } #endif -#endif // DEVICE_FUNCTIONS_H +#endif // DEVICE_FUNCTIONS_H diff --git a/src/hip/part_gpu.h b/src/hip/part_gpu.h old mode 100755 new mode 100644 index a19257abc4..5d7e32c611 --- a/src/hip/part_gpu.h +++ b/src/hip/part_gpu.h @@ -134,4 +134,4 @@ typedef struct part_soa { }; #endif -#endif // PART_GPU_H +#endif // PART_GPU_H diff --git a/src/hip/tester.cu b/src/hip/tester.cu old mode 100755 new mode 100644 index 5e70230211..3ffaf9e10c --- a/src/hip/tester.cu +++ b/src/hip/tester.cu @@ -1,4 +1,5 @@ #include "tester.h" + #include #include #ifdef __cplusplus diff --git a/src/lightcone/lightcone_crossing.h b/src/lightcone/lightcone_crossing.h index 78777f1ff0..226fbfa3b8 100644 --- a/src/lightcone/lightcone_crossing.h +++ b/src/lightcone/lightcone_crossing.h @@ -246,7 +246,7 @@ lightcone_check_particle_crosses( lightcone_buffer_map_update(props, e, gp, a_cross, x_cross); } /* Next periodic replication*/ - } /* Next lightcone */ + } /* Next lightcone */ } #endif /* SWIFT_LIGHTCONE_CROSSING_H */ diff --git a/src/lightcone/lightcone_replications.c b/src/lightcone/lightcone_replications.c index f65044814c..22ec701d92 100644 --- a/src/lightcone/lightcone_replications.c +++ b/src/lightcone/lightcone_replications.c @@ -128,8 +128,8 @@ void replication_list_init(struct replication_list *replication_list, replication_list->nrep += 1; } } /* Next replication in z */ - } /* Next replication in y */ - } /* Next replication in x */ + } /* Next replication in y */ + } /* Next replication in x */ /* Allocate storage after first pass */ if (ipass == 0) { diff --git a/src/lightcone/lightcone_shell.c b/src/lightcone/lightcone_shell.c index e3ac45885f..51d07bd8b7 100644 --- a/src/lightcone/lightcone_shell.c +++ b/src/lightcone/lightcone_shell.c @@ -762,7 +762,7 @@ void healpix_smoothing_mapper(void *map_data, int num_elements, } /* Next smoothed map */ } } /* Next pixel in this range */ - } /* Next 
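/* A rough sketch of how the per-queue "packs left" counters declared in the
 * queue.h hunk above appear to be used: each completed pack task decrements
 * the counter for its loop type under the queue lock, and when the counter
 * reaches zero the runner flags that the partially filled bundle should be
 * flushed to the GPU. The struct, lock and function names here are
 * placeholders for illustration, not the patch's own API. */
#include <pthread.h>

struct example_queue {
  pthread_mutex_t lock;
  volatile int n_packs_self_left_d; /* density self pack tasks still queued */
};

/* Returns 1 if the caller should launch whatever it has packed so far. */
static int example_note_pack_done(struct example_queue *q) {
  int launch_leftovers = 0;
  pthread_mutex_lock(&q->lock);
  q->n_packs_self_left_d--;
  if (q->n_packs_self_left_d < 1) launch_leftovers = 1;
  pthread_mutex_unlock(&q->lock);
  return launch_leftovers;
}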
range of pixels */ + } /* Next range of pixels */ /* Free array of pixel ranges */ free(range); diff --git a/src/power_spectrum.c b/src/power_spectrum.c index 2f252fca48..f6693c7c5a 100644 --- a/src/power_spectrum.c +++ b/src/power_spectrum.c @@ -804,8 +804,8 @@ void pow_from_grid_mapper(void* map_data, const int num, void* extra) { (powgridft[index][0] * powgridft2[index][0] + powgridft[index][1] * powgridft2[index][1])); } /* Loop over z */ - } /* Loop over y */ - } /* Loop over z */ + } /* Loop over y */ + } /* Loop over z */ } /** diff --git a/src/queue.h b/src/queue.h index 4ace573b5b..b90ca90b46 100644 --- a/src/queue.h +++ b/src/queue.h @@ -76,17 +76,22 @@ struct queue { volatile unsigned int first_incoming, last_incoming, count_incoming; /*Number of pack tasks left in queue A. Nasar */ - volatile int n_packs_self_left_d; /*Number of density pack tasks left in queue*/ + volatile int + n_packs_self_left_d; /*Number of density pack tasks left in queue*/ volatile int n_packs_self_left_f; /*Number of force pack tasks left in queue*/ - volatile int n_packs_self_left_g; /*Number of gradient pack tasks left in queue*/ + volatile int + n_packs_self_left_g; /*Number of gradient pack tasks left in queue*/ volatile int n_packs_pair_left_d; volatile int n_packs_pair_left_f; volatile int n_packs_pair_left_g; - volatile int n_packs_self_stolen_d; /*Number of density pack tasks left in queue*/ - volatile int n_packs_self_stolen_f; /*Number of force pack tasks left in queue*/ - volatile int n_packs_self_stolen_g; /*Number of gradient pack tasks left in queue*/ + volatile int + n_packs_self_stolen_d; /*Number of density pack tasks left in queue*/ + volatile int + n_packs_self_stolen_f; /*Number of force pack tasks left in queue*/ + volatile int + n_packs_self_stolen_g; /*Number of gradient pack tasks left in queue*/ volatile int n_packs_pair_stolen_d; volatile int n_packs_pair_stolen_f; diff --git a/src/runner_black_holes.c b/src/runner_black_holes.c index aebef16591..ca5dc32461 100644 --- a/src/runner_black_holes.c +++ b/src/runner_black_holes.c @@ -211,7 +211,7 @@ void runner_do_gas_swallow(struct runner *r, struct cell *c, int timer) { break; } } /* Loop over foreign BHs */ - } /* Is the cell local? */ + } /* Is the cell local? */ #endif /* If we have a local particle, we must have found the BH in one @@ -221,8 +221,8 @@ void runner_do_gas_swallow(struct runner *r, struct cell *c, int timer) { p->id, swallow_id); } } /* Part was flagged for swallowing */ - } /* Loop over the parts */ - } /* Cell is not split */ + } /* Loop over the parts */ + } /* Cell is not split */ } /** @@ -449,7 +449,7 @@ void runner_do_bh_swallow(struct runner *r, struct cell *c, int timer) { break; } } /* Loop over foreign BHs */ - } /* Is the cell local? */ + } /* Is the cell local? */ #endif /* If we have a local particle, we must have found the BH in one @@ -460,8 +460,8 @@ void runner_do_bh_swallow(struct runner *r, struct cell *c, int timer) { } } /* Part was flagged for swallowing */ - } /* Loop over the parts */ - } /* Cell is not split */ + } /* Loop over the parts */ + } /* Cell is not split */ } /** diff --git a/src/runner_doiact_functions_black_holes.h b/src/runner_doiact_functions_black_holes.h index 09ef41e852..cbfa9c78ed 100644 --- a/src/runner_doiact_functions_black_holes.h +++ b/src/runner_doiact_functions_black_holes.h @@ -116,8 +116,8 @@ void DOSELF1_BH(struct runner *r, struct cell *c, int timer) { } } } /* loop over the parts in ci. */ - } /* loop over the bparts in ci. 
*/ - } /* Do we have gas particles in the cell? */ + } /* loop over the bparts in ci. */ + } /* Do we have gas particles in the cell? */ /* When doing BH swallowing, we need a quick loop also over the BH * neighbours */ @@ -177,7 +177,7 @@ void DOSELF1_BH(struct runner *r, struct cell *c, int timer) { } } } /* loop over the bparts in ci. */ - } /* loop over the bparts in ci. */ + } /* loop over the bparts in ci. */ #endif /* (FUNCTION_TASK_LOOP == TASK_LOOP_SWALLOW) */ @@ -286,8 +286,8 @@ void DO_NONSYM_PAIR1_BH_NAIVE(struct runner *r, struct cell *restrict ci, } } } /* loop over the parts in cj. */ - } /* loop over the bparts in ci. */ - } /* Do we have gas particles in the cell? */ + } /* loop over the bparts in ci. */ + } /* Do we have gas particles in the cell? */ /* When doing BH swallowing, we need a quick loop also over the BH * neighbours */ @@ -347,7 +347,7 @@ void DO_NONSYM_PAIR1_BH_NAIVE(struct runner *r, struct cell *restrict ci, } } } /* loop over the bparts in cj. */ - } /* loop over the bparts in ci. */ + } /* loop over the bparts in ci. */ #endif /* (FUNCTION_TASK_LOOP == TASK_LOOP_SWALLOW) */ } @@ -469,7 +469,7 @@ void DOPAIR1_SUBSET_BH_NAIVE(struct runner *r, struct cell *restrict ci, } } } /* loop over the parts in cj. */ - } /* loop over the parts in ci. */ + } /* loop over the parts in ci. */ } /** @@ -557,7 +557,7 @@ void DOSELF1_SUBSET_BH(struct runner *r, struct cell *restrict ci, } } } /* loop over the parts in cj. */ - } /* loop over the parts in ci. */ + } /* loop over the parts in ci. */ } /** diff --git a/src/runner_doiact_functions_hydro.h b/src/runner_doiact_functions_hydro.h index eff6702b82..1bcf1af207 100644 --- a/src/runner_doiact_functions_hydro.h +++ b/src/runner_doiact_functions_hydro.h @@ -156,7 +156,7 @@ void DOPAIR1_NAIVE(struct runner *r, struct cell *restrict ci, #endif } } /* loop over the parts in cj. */ - } /* loop over the parts in ci. */ + } /* loop over the parts in ci. */ TIMER_TOC(TIMER_DOPAIR); } @@ -310,7 +310,7 @@ void DOPAIR2_NAIVE(struct runner *r, struct cell *restrict ci, } } } /* loop over the parts in cj. */ - } /* loop over the parts in ci. */ + } /* loop over the parts in ci. */ TIMER_TOC(TIMER_DOPAIR); } @@ -451,7 +451,7 @@ void DOSELF1_NAIVE(struct runner *r, struct cell *restrict c) { #endif } } /* loop over the parts in cj. */ - } /* loop over the parts in ci. */ + } /* loop over the parts in ci. */ TIMER_TOC(TIMER_DOSELF); } @@ -592,7 +592,7 @@ void DOSELF2_NAIVE(struct runner *r, struct cell *restrict c) { #endif } } /* loop over the parts in cj. */ - } /* loop over the parts in ci. */ + } /* loop over the parts in ci. */ TIMER_TOC(TIMER_DOSELF); } @@ -694,7 +694,7 @@ void DOPAIR_SUBSET_NAIVE(struct runner *r, struct cell *restrict ci, #endif } } /* loop over the parts in cj. */ - } /* loop over the parts in ci. */ + } /* loop over the parts in ci. */ TIMER_TOC(timer_dopair_subset_naive); } @@ -803,7 +803,7 @@ void DOPAIR_SUBSET(struct runner *r, struct cell *restrict ci, #endif } } /* loop over the parts in cj. */ - } /* loop over the parts in ci. */ + } /* loop over the parts in ci. */ } /* Parts are on the right. */ @@ -869,7 +869,7 @@ void DOPAIR_SUBSET(struct runner *r, struct cell *restrict ci, #endif } } /* loop over the parts in cj. */ - } /* loop over the parts in ci. */ + } /* loop over the parts in ci. */ } TIMER_TOC(timer_dopair_subset); @@ -1036,7 +1036,7 @@ void DOSELF_SUBSET(struct runner *r, struct cell *restrict ci, #endif } } /* loop over the parts in cj. */ - } /* loop over the parts in ci. 
*/ + } /* loop over the parts in ci. */ TIMER_TOC(timer_doself_subset); } @@ -1219,8 +1219,8 @@ void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj, const int sid, #endif } } /* loop over the parts in cj. */ - } /* loop over the parts in ci. */ - } /* Cell ci is active */ + } /* loop over the parts in ci. */ + } /* Cell ci is active */ if (CELL_IS_ACTIVE(cj, e)) { @@ -1319,8 +1319,8 @@ void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj, const int sid, #endif } } /* loop over the parts in ci. */ - } /* loop over the parts in cj. */ - } /* Cell cj is active */ + } /* loop over the parts in cj. */ + } /* Cell cj is active */ TIMER_TOC(TIMER_DOPAIR); } @@ -1741,8 +1741,8 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj, const int sid, } } } /* loop over the parts in cj. */ - } /* Is pi active? */ - } /* Loop over all ci */ + } /* Is pi active? */ + } /* Loop over all ci */ /* Loop over *all* the parts in cj starting from the centre until we are out of range of anything in ci (using the maximal hj). */ @@ -1958,8 +1958,8 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj, const int sid, } } } /* loop over the parts in ci. */ - } /* Is pj active? */ - } /* Loop over all cj */ + } /* Is pj active? */ + } /* Loop over all cj */ /* Clean-up if necessary */ // MATTHIEU: temporary disable this optimization if (CELL_IS_ACTIVE(ci, e)) // && !cell_is_all_active_hydro(ci, e)) diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index d6db85e681..07682a2eaf 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -1,4 +1,5 @@ #include "scheduler.h" + #include struct pack_vars_self { /*List of tasks and respective cells to be packed*/ @@ -147,7 +148,8 @@ void runner_doself1_pack(struct runner *r, struct scheduler *s, pack_vars->tasks_packed++; pack_vars->launch = 0; pack_vars->launch_leftovers = 0; - if ((s->queues[qid].n_packs_self_left_d == 0)) pack_vars->launch_leftovers = 1; + if ((s->queues[qid].n_packs_self_left_d == 0)) + pack_vars->launch_leftovers = 1; if (pack_vars->tasks_packed == pack_vars->target_n_tasks) pack_vars->launch = 1; /*Add time to packing_time. 
Timer for end of GPU work after the if(launch || @@ -169,11 +171,11 @@ double runner_doself1_pack_f4(struct runner *r, struct scheduler *s, clock_gettime(CLOCK_REALTIME, &t0); /* Record that we have now done a packing (self) */ int qid = r->qid; -// atomic_dec(&(s->queues[qid].n_packs_pair_left_f)); -// pthread_mutex_lock(&s->sleep_mutex); -// atomic_dec(&(s->s_d_left[qid])); -// pthread_cond_broadcast(&s->sleep_cond); -// pthread_mutex_unlock(&s->sleep_mutex); + // atomic_dec(&(s->queues[qid].n_packs_pair_left_f)); + // pthread_mutex_lock(&s->sleep_mutex); + // atomic_dec(&(s->s_d_left[qid])); + // pthread_cond_broadcast(&s->sleep_cond); + // pthread_mutex_unlock(&s->sleep_mutex); int tasks_packed = pack_vars->tasks_packed; // pack_vars->cellx[tasks_packed] = ci->loc[0]; @@ -216,13 +218,11 @@ double runner_doself1_pack_f4(struct runner *r, struct scheduler *s, s->queues[qid].n_packs_self_left_d--; - if (s->queues[qid].n_packs_self_left_d < 1) - pack_vars->launch_leftovers = 1; - + if (s->queues[qid].n_packs_self_left_d < 1) pack_vars->launch_leftovers = 1; + lock_unlock(&s->queues[qid].lock); - -// if ((s->s_d_left[qid] < 1)) pack_vars->launch_leftovers = 1; + // if ((s->s_d_left[qid] < 1)) pack_vars->launch_leftovers = 1; if (pack_vars->tasks_packed == pack_vars->target_n_tasks) pack_vars->launch = 1; /*Add time to packing_time. Timer for end of GPU work after the if(launch || @@ -308,11 +308,11 @@ double runner_doself1_pack_f4_g(struct runner *r, struct scheduler *s, clock_gettime(CLOCK_REALTIME, &t0); /* Record that we have now done a packing (self) */ int qid = r->qid; -// atomic_dec(&(s->queues[qid].n_packs_pair_left_f)); -// pthread_mutex_lock(&s->sleep_mutex); -// atomic_dec(&(s->s_g_left[qid])); -// pthread_cond_broadcast(&s->sleep_cond); -// pthread_mutex_unlock(&s->sleep_mutex); + // atomic_dec(&(s->queues[qid].n_packs_pair_left_f)); + // pthread_mutex_lock(&s->sleep_mutex); + // atomic_dec(&(s->s_g_left[qid])); + // pthread_cond_broadcast(&s->sleep_cond); + // pthread_mutex_unlock(&s->sleep_mutex); int tasks_packed = pack_vars->tasks_packed; // pack_vars->cellx[tasks_packed] = ci->loc[0]; // pack_vars->celly[tasks_packed] = ci->loc[1]; @@ -343,27 +343,23 @@ double runner_doself1_pack_f4_g(struct runner *r, struct scheduler *s, /* Tell the cell it has been packed */ ci->pack_done_g++; /* Record that we have now done a packing (self) */ -// int qid = r->qid; -// atomic_dec(&(s->queues[qid].n_packs_self_left_g)); + // int qid = r->qid; + // atomic_dec(&(s->queues[qid].n_packs_self_left_g)); t->done = 1; pack_vars->tasks_packed++; pack_vars->launch = 0; pack_vars->launch_leftovers = 0; - lock_lock(&s->queues[qid].lock); s->queues[qid].n_packs_self_left_g--; - if (s->queues[qid].n_packs_self_left_g < 1) - pack_vars->launch_leftovers = 1; - - lock_unlock(&s->queues[qid].lock); + if (s->queues[qid].n_packs_self_left_g < 1) pack_vars->launch_leftovers = 1; + lock_unlock(&s->queues[qid].lock); - -// if ((s->queues[qid].n_packs_self_left_g < 1)) -// pack_vars->launch_leftovers = 1; + // if ((s->queues[qid].n_packs_self_left_g < 1)) + // pack_vars->launch_leftovers = 1; if (pack_vars->tasks_packed == pack_vars->target_n_tasks) pack_vars->launch = 1; /*Add time to packing_time. 
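/* The packing/GPU timers in these functions all follow the same pattern:
 * take two CLOCK_REALTIME stamps and accumulate the difference in seconds.
 * A small self-contained version of that pattern (helper name hypothetical): */
#include <time.h>

static double example_elapsed_seconds(const struct timespec *t0,
                                      const struct timespec *t1) {
  return (double)(t1->tv_sec - t0->tv_sec) +
         (double)(t1->tv_nsec - t0->tv_nsec) / 1e9;
}

/* Usage, mirroring the accumulation done around the pack calls:
 *   struct timespec t0, t1;
 *   clock_gettime(CLOCK_REALTIME, &t0);
 *   ... do the packing ...
 *   clock_gettime(CLOCK_REALTIME, &t1);
 *   packing_time += example_elapsed_seconds(&t0, &t1);
 */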
Timer for end of GPU work after the if(launch || @@ -447,11 +443,11 @@ double runner_doself1_pack_f4_f(struct runner *r, struct scheduler *s, clock_gettime(CLOCK_REALTIME, &t0); /* Record that we have now done a packing (self) */ int qid = r->qid; -// atomic_dec(&(s->queues[qid].n_packs_pair_left_f)); -// pthread_mutex_lock(&s->sleep_mutex); -// atomic_dec(&(s->s_f_left[qid])); -// pthread_cond_broadcast(&s->sleep_cond); -// pthread_mutex_unlock(&s->sleep_mutex); + // atomic_dec(&(s->queues[qid].n_packs_pair_left_f)); + // pthread_mutex_lock(&s->sleep_mutex); + // atomic_dec(&(s->s_f_left[qid])); + // pthread_cond_broadcast(&s->sleep_cond); + // pthread_mutex_unlock(&s->sleep_mutex); int tasks_packed = pack_vars->tasks_packed; // pack_vars->cellx[tasks_packed] = ci->loc[0]; // pack_vars->celly[tasks_packed] = ci->loc[1]; @@ -482,26 +478,23 @@ double runner_doself1_pack_f4_f(struct runner *r, struct scheduler *s, /* Tell the cell it has been packed */ ci->pack_done_f++; /* Record that we have now done a packing (self) */ -// int qid = r->qid; -// atomic_dec(&(s->queues[qid].n_packs_self_left_f)); + // int qid = r->qid; + // atomic_dec(&(s->queues[qid].n_packs_self_left_f)); t->done = 1; pack_vars->tasks_packed++; pack_vars->launch = 0; pack_vars->launch_leftovers = 0; - lock_lock(&s->queues[qid].lock); s->queues[qid].n_packs_self_left_f--; - if (s->queues[qid].n_packs_self_left_f < 1) - pack_vars->launch_leftovers = 1; - - lock_unlock(&s->queues[qid].lock); + if (s->queues[qid].n_packs_self_left_f < 1) pack_vars->launch_leftovers = 1; + lock_unlock(&s->queues[qid].lock); // if ((s->queues[qid].n_packs_self_left_f < 1)) -// pack_vars->launch_leftovers = 1; + // pack_vars->launch_leftovers = 1; if (pack_vars->tasks_packed == pack_vars->target_n_tasks) pack_vars->launch = 1; /*Add time to packing_time. Timer for end of GPU work after the if(launch || @@ -561,8 +554,9 @@ void runner_dopair1_pack(struct runner *r, struct scheduler *s, r, ci, cj, parts_aos, 0 /*timer. 0 no timing, 1 for timing*/, count_parts, tid, pack_vars->count_max_parts, count_ci, count_cj, shift_tmp); // runner_doself1_gpu_pack_neat_aos(r, ci, parts_aos, 0/*timer. 0 no - //timing, 1 for timing*/, count_parts, tasks_packed, - //pack_vars->count_max_parts); //This may cause an issue. Be sure to test that + // timing, 1 for timing*/, count_parts, tasks_packed, + // pack_vars->count_max_parts); //This may cause an issue. Be sure to test + // that // pack_vars->count_parts is actually increment here /* Find last parts in task for ci and cj. Packed_tmp is index for cell i. * packed_tmp+1 is index for cell j */ @@ -592,7 +586,8 @@ void runner_dopair1_pack(struct runner *r, struct scheduler *s, pack_vars->tasks_packed++; pack_vars->launch = 0; pack_vars->launch_leftovers = 0; - if ((s->queues[qid].n_packs_pair_left_d == 0)) pack_vars->launch_leftovers = 1; + if ((s->queues[qid].n_packs_pair_left_d == 0)) + pack_vars->launch_leftovers = 1; if (pack_vars->tasks_packed == pack_vars->target_n_tasks) pack_vars->launch = 1; /*Add time to packing_time. 
Timer for end of GPU work after the if(launch || @@ -620,10 +615,10 @@ double runner_dopair1_pack_f4(struct runner *r, struct scheduler *s, clock_gettime(CLOCK_REALTIME, &t0); int tasks_packed = pack_vars->tasks_packed; int qid = r->qid; -// pthread_mutex_lock(&s->sleep_mutex); -// atomic_dec(&(s->p_d_left[qid])); -// pthread_cond_broadcast(&s->sleep_cond); -// pthread_mutex_unlock(&s->sleep_mutex); + // pthread_mutex_lock(&s->sleep_mutex); + // atomic_dec(&(s->p_d_left[qid])); + // pthread_cond_broadcast(&s->sleep_cond); + // pthread_mutex_unlock(&s->sleep_mutex); double x_tmp = 0.0, y_tmp = 0.0, z_tmp = 0.0; /*Get the shifts in case of periodics*/ @@ -662,8 +657,9 @@ double runner_dopair1_pack_f4(struct runner *r, struct scheduler *s, count_parts, tid, pack_vars->count_max_parts, count_ci, count_cj, shift_tmp); // runner_doself1_gpu_pack_neat_aos(r, ci, parts_aos, 0/*timer. 0 no - //timing, 1 for timing*/, count_parts, tasks_packed, - //pack_vars->count_max_parts); //This may cause an issue. Be sure to test that + // timing, 1 for timing*/, count_parts, tasks_packed, + // pack_vars->count_max_parts); //This may cause an issue. Be sure to test + // that // pack_vars->count_parts is actually increment here /* Find last parts in task for ci and cj. Packed_tmp is index for cell i. * packed_tmp+1 is index for cell j */ @@ -699,19 +695,18 @@ double runner_dopair1_pack_f4(struct runner *r, struct scheduler *s, pack_vars->launch = 0; pack_vars->launch_leftovers = 0; /* Record that we have now done a packing (self) */ -// int qid = r->qid; -// atomic_dec(&(s->queues[qid].n_packs_pair_left)); -// if ((s->p_d_left < 1)) pack_vars->launch_leftovers = 1; + // int qid = r->qid; + // atomic_dec(&(s->queues[qid].n_packs_pair_left)); + // if ((s->p_d_left < 1)) pack_vars->launch_leftovers = 1; lock_lock(&s->queues[qid].lock); s->queues[qid].n_packs_pair_left_d--; - if (s->queues[qid].n_packs_pair_left_d < 1) - pack_vars->launch_leftovers = 1; - + if (s->queues[qid].n_packs_pair_left_d < 1) pack_vars->launch_leftovers = 1; + lock_unlock(&s->queues[qid].lock); - + if (pack_vars->tasks_packed == pack_vars->target_n_tasks) pack_vars->launch = 1; /*Add time to packing_time. Timer for end of GPU work after the if(launch || @@ -839,10 +834,10 @@ double runner_dopair1_pack_f4_g(struct runner *r, struct scheduler *s, int tasks_packed = pack_vars->tasks_packed; int qid = r->qid; -// pthread_mutex_lock(&s->sleep_mutex); -// atomic_dec(&(s->p_g_left[qid])); -// pthread_cond_broadcast(&s->sleep_cond); -// pthread_mutex_unlock(&s->sleep_mutex); + // pthread_mutex_lock(&s->sleep_mutex); + // atomic_dec(&(s->p_g_left[qid])); + // pthread_cond_broadcast(&s->sleep_cond); + // pthread_mutex_unlock(&s->sleep_mutex); double x_tmp = 0.0, y_tmp = 0.0, z_tmp = 0.0; /*Get the shifts in case of periodics*/ @@ -881,8 +876,9 @@ double runner_dopair1_pack_f4_g(struct runner *r, struct scheduler *s, count_parts, tid, pack_vars->count_max_parts, count_ci, count_cj, shift_tmp); // runner_doself1_gpu_pack_neat_aos(r, ci, parts_aos, 0/*timer. 0 no - //timing, 1 for timing*/, count_parts, tasks_packed, - //pack_vars->count_max_parts); //This may cause an issue. Be sure to test that + // timing, 1 for timing*/, count_parts, tasks_packed, + // pack_vars->count_max_parts); //This may cause an issue. Be sure to test + // that // pack_vars->count_parts is actually increment here /* Find last parts in task for ci and cj. Packed_tmp is index for cell i. 
* packed_tmp+1 is index for cell j */ @@ -918,21 +914,19 @@ double runner_dopair1_pack_f4_g(struct runner *r, struct scheduler *s, pack_vars->launch = 0; pack_vars->launch_leftovers = 0; /* Record that we have now done a packing (self) */ -// int qid = r->qid; -// atomic_dec(&(s->queues[qid].n_packs_pair_left_g)); + // int qid = r->qid; + // atomic_dec(&(s->queues[qid].n_packs_pair_left_g)); lock_lock(&s->queues[qid].lock); s->queues[qid].n_packs_pair_left_g--; - if (s->queues[qid].n_packs_pair_left_g < 1) - pack_vars->launch_leftovers = 1; - + if (s->queues[qid].n_packs_pair_left_g < 1) pack_vars->launch_leftovers = 1; + lock_unlock(&s->queues[qid].lock); - -// if ((s->p_g_left[qid] < 1)) -// pack_vars->launch_leftovers = 1; + // if ((s->p_g_left[qid] < 1)) + // pack_vars->launch_leftovers = 1; if (pack_vars->tasks_packed == pack_vars->target_n_tasks) pack_vars->launch = 1; /*Add time to packing_time. Timer for end of GPU work after the if(launch || @@ -1062,11 +1056,11 @@ double runner_dopair1_pack_f4_f(struct runner *r, struct scheduler *s, /* Record that we have now done a packing (self) */ int qid = r->qid; -// atomic_dec(&(s->queues[qid].n_packs_pair_left_f)); -// pthread_mutex_lock(&s->sleep_mutex); + // atomic_dec(&(s->queues[qid].n_packs_pair_left_f)); + // pthread_mutex_lock(&s->sleep_mutex); atomic_dec(&(s->p_f_left[qid])); -// pthread_cond_broadcast(&s->sleep_cond); -// pthread_mutex_unlock(&s->sleep_mutex); + // pthread_cond_broadcast(&s->sleep_cond); + // pthread_mutex_unlock(&s->sleep_mutex); double x_tmp = 0.0, y_tmp = 0.0, z_tmp = 0.0; /*Get the shifts in case of periodics*/ @@ -1105,8 +1099,9 @@ double runner_dopair1_pack_f4_f(struct runner *r, struct scheduler *s, count_parts, tid, pack_vars->count_max_parts, count_ci, count_cj, shift_tmp); // runner_doself1_gpu_pack_neat_aos(r, ci, parts_aos, 0/*timer. 0 no - //timing, 1 for timing*/, count_parts, tasks_packed, - //pack_vars->count_max_parts); //This may cause an issue. Be sure to test that + // timing, 1 for timing*/, count_parts, tasks_packed, + // pack_vars->count_max_parts); //This may cause an issue. Be sure to test + // that // pack_vars->count_parts is actually increment here /* Find last parts in task for ci and cj. Packed_tmp is index for cell i. * packed_tmp+1 is index for cell j */ @@ -1146,14 +1141,12 @@ double runner_dopair1_pack_f4_f(struct runner *r, struct scheduler *s, s->queues[qid].n_packs_pair_left_f--; - if (s->queues[qid].n_packs_pair_left_f < 1) - pack_vars->launch_leftovers = 1; - + if (s->queues[qid].n_packs_pair_left_f < 1) pack_vars->launch_leftovers = 1; + lock_unlock(&s->queues[qid].lock); - -// if ((s->p_f_left[qid] < 1)) -// pack_vars->launch_leftovers = 1; + // if ((s->p_f_left[qid] < 1)) + // pack_vars->launch_leftovers = 1; if (pack_vars->tasks_packed == pack_vars->target_n_tasks) pack_vars->launch = 1; /*Add time to packing_time. 
Timer for end of GPU work after the if(launch || @@ -1248,9 +1241,9 @@ void runner_doself1_launch(struct runner *r, struct scheduler *s, // cudaError_t cu_error = cudaPeekAtLastError(); // cudaGetLastError(); //// // // - //Get error code if (cu_error != cudaSuccess) { fprintf( stderr, "CUDA error - //in density self host 2 device memcpy: %s cpuid id is: %i\n ", - // cudaGetErrorString(cu_error), r->cpuid); + // Get error code if (cu_error != cudaSuccess) { fprintf( + // stderr, "CUDA error in density self host 2 device + // memcpy: %s cpuid id is: %i\n ", cudaGetErrorString(cu_error), r->cpuid); // exit(0); // } // #endif @@ -1278,7 +1271,8 @@ void runner_doself1_launch(struct runner *r, struct scheduler *s, // if (cu_error != cudaSuccess) { // fprintf(stderr, // "CUDA error with self density kernel launch: %s - //cpuid id is: %i\n ", cudaGetErrorString(cu_error), r->cpuid); exit(0); + // cpuid id is: %i\n ", + // cudaGetErrorString(cu_error), r->cpuid); exit(0); // } // #endif cudaMemcpyAsync(&parts_aos[first_part_tmp], &d_parts_aos[first_part_tmp], @@ -1288,10 +1282,10 @@ void runner_doself1_launch(struct runner *r, struct scheduler *s, // #ifdef CUDA_DEBUG // cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // // // - //Get error code if (cu_error != cudaSuccess) { fprintf(stderr, "CUDA error - //with self density D2H memcpy: %s cpuid id is: %i\n ", - // cudaGetErrorString(cu_error), r->cpuid); - // error("Something's up with your cuda code"); + // Get error code if (cu_error != cudaSuccess) { + // fprintf(stderr, "CUDA error with self density + // D2H memcpy: %s cpuid id is: %i\n ", cudaGetErrorString(cu_error), + //r->cpuid); error("Something's up with your cuda code"); // } // #endif } /*End of looping over bundles to launch in streams*/ @@ -1374,7 +1368,7 @@ void runner_doself1_launch_f4( if (tasks_packed == 0) error("zero tasks packed but somehow got into GPU loop"); // pack_vars->bundle_first_part[nBundles_temp] = - //pack_vars->task_first_part[tasks_packed - 1]; + // pack_vars->task_first_part[tasks_packed - 1]; pack_vars->bundle_first_part[nBundles_temp] = task_first_part_f4[tasks_packed - 1].x; } @@ -1439,15 +1433,16 @@ void runner_doself1_launch_f4( (last_task + 1 - first_task) * sizeof(int2), cudaMemcpyHostToDevice, stream[bid]); // cudaError_t cu_error = cudaPeekAtLastError(); // cudaGetLastError(); - //// if (cu_error != cudaSuccess) { fprintf( stderr, "CUDA error in density - //self host 2 device memcpy: %s cpuid id is: %i\n ", + //// if (cu_error != cudaSuccess) { fprintf( + ///stderr, "CUDA error in density + // self host 2 device memcpy: %s cpuid id is: %i\n ", // cudaGetErrorString(cu_error), r->cpuid); // exit(0); // } - // clock_gettime(CLOCK_REALTIME, &t1hmemcpy); - // *hmemcpy_time += (t1hmemcpy.tv_sec - t0hmemcpy.tv_sec) + - // (t1hmemcpy.tv_nsec - t0hmemcpy.tv_nsec) / - // 1000000000.0; + // clock_gettime(CLOCK_REALTIME, &t1hmemcpy); + // *hmemcpy_time += (t1hmemcpy.tv_sec - t0hmemcpy.tv_sec) + + // (t1hmemcpy.tv_nsec - t0hmemcpy.tv_nsec) / + // 1000000000.0; cudaMemcpyAsync(&d_parts_send[first_part_tmp], &parts_send[first_part_tmp], bundle_n_parts * sizeof(struct part_aos_f4_send), cudaMemcpyHostToDevice, stream[bid]); @@ -1456,9 +1451,9 @@ void runner_doself1_launch_f4( // cudaError_t cu_error = cudaPeekAtLastError(); // cudaGetLastError(); //// // // - //Get error code if (cu_error != cudaSuccess) { fprintf( stderr, "CUDA error - //in density self host 2 device memcpy: %s cpuid id is: %i\n ", - // cudaGetErrorString(cu_error), r->cpuid); + // Get 
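/* The launch functions above repeat one idiom: for each bundle of packed
 * tasks, copy that bundle's particle slice to the device asynchronously,
 * launch the kernel on the same stream, then start the copy back, so that
 * transfers and compute for different bundles can overlap. A stripped-down
 * sketch of that idiom follows; the particle struct, kernel and function are
 * placeholders, not the patch's own code, and the host buffer is assumed to
 * be pinned (e.g. allocated with cudaMallocHost) so the async copies overlap. */
#include <cuda_runtime.h>

struct example_part { float x, y, z, rho; };

__global__ void example_density_kernel(struct example_part *p, int n) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) p[i].rho += 1.0f; /* stand-in for the real interaction loop */
}

void example_launch_bundles(struct example_part *h, struct example_part *d,
                            const int *first, const int *count, int n_bundles,
                            cudaStream_t *stream) {
  for (int b = 0; b < n_bundles; b++) {
    const int f = first[b], n = count[b];
    /* H2D copy of this bundle's slice, on this bundle's stream. */
    cudaMemcpyAsync(&d[f], &h[f], n * sizeof(struct example_part),
                    cudaMemcpyHostToDevice, stream[b]);
    const int threads = 128, blocks = (n + threads - 1) / threads;
    example_density_kernel<<<blocks, threads, 0, stream[b]>>>(&d[f], n);
    /* D2H copy back, queued behind the kernel on the same stream. */
    cudaMemcpyAsync(&h[f], &d[f], n * sizeof(struct example_part),
                    cudaMemcpyDeviceToHost, stream[b]);
  }
  for (int b = 0; b < n_bundles; b++) cudaStreamSynchronize(stream[b]);
}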
error code if (cu_error != cudaSuccess) { fprintf( + // stderr, "CUDA error in density self host 2 device + // memcpy: %s cpuid id is: %i\n ", cudaGetErrorString(cu_error), r->cpuid); // exit(0); // } // #endif @@ -1475,8 +1470,9 @@ void runner_doself1_launch_f4( // const char *loop_type = "density"; // struct first_part first_parts; // for(int i = 0; i < numBlocks_y; i++) first_parts.list[i] = - //pack_vars->task_first_part[i]; fprintf(stderr, "Launching kernel with %i - //tasks leftovers %i\n", tasks_packed, pack_vars->launch_leftovers); + // pack_vars->task_first_part[i]; fprintf(stderr, "Launching kernel with + // %i tasks leftovers %i\n", tasks_packed, + // pack_vars->launch_leftovers); // Launch the kernel launch_density_aos_f4(d_parts_send, d_parts_recv, d_a, d_H, stream[bid], numBlocks_x, numBlocks_y, bundle_first_task, @@ -1486,7 +1482,8 @@ void runner_doself1_launch_f4( // if (cu_error != cudaSuccess) { // fprintf(stderr, // "CUDA error with self density kernel launch: %s - //cpuid id is: %i\n ", cudaGetErrorString(cu_error), r->cpuid); exit(0); + // cpuid id is: %i\n ", + // cudaGetErrorString(cu_error), r->cpuid); exit(0); // } // #endif cudaMemcpyAsync(&parts_recv[first_part_tmp], &d_parts_recv[first_part_tmp], @@ -1496,10 +1493,10 @@ void runner_doself1_launch_f4( // #ifdef CUDA_DEBUG // cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // // // - //Get error code if (cu_error != cudaSuccess) { fprintf(stderr, "CUDA error - //with self density D2H memcpy: %s cpuid id is: %i\n ", - // cudaGetErrorString(cu_error), r->cpuid); - // error("Something's up with your cuda code"); + // Get error code if (cu_error != cudaSuccess) { + // fprintf(stderr, "CUDA error with self density + // D2H memcpy: %s cpuid id is: %i\n ", cudaGetErrorString(cu_error), + //r->cpuid); error("Something's up with your cuda code"); // } // #endif } /*End of looping over bundles to launch in streams*/ @@ -1546,16 +1543,16 @@ void runner_doself1_launch_f4( } // clock_gettime(CLOCK_REALTIME, &t1hmemcpy); // *hmemcpy_time += (t1hmemcpy.tv_sec - - //t0hmemcpy.tv_sec) + (t1hmemcpy.tv_nsec - t0hmemcpy.tv_nsec) / - //1000000000.0; + // t0hmemcpy.tv_sec) + (t1hmemcpy.tv_nsec - + // t0hmemcpy.tv_nsec) / 1000000000.0; const ticks tic = getticks(); /* Do the copy */ runner_doself1_gpu_unpack_neat_aos_f4(r, cii, parts_recv, 0, &pack_length_unpack, tid, pack_vars->count_max_parts, e); - const ticks toc = getticks(); + const ticks toc = getticks(); - total_cpu_unpack_ticks += toc - tic; + total_cpu_unpack_ticks += toc - tic; /* Record things for debugging */ cii->gpu_done++; /*Time end of unpacking*/ @@ -1803,7 +1800,7 @@ void runner_doself1_launch_f4_g( if (pack_vars->launch_leftovers) { nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; // if(tasks_packed == 0) error("zero tasks packed but somehow got into - //GPU loop"); + // GPU loop"); pack_vars->bundle_first_part[nBundles_temp] = task_first_part_f4[tasks_packed - 1].x; } @@ -1851,7 +1848,7 @@ void runner_doself1_launch_f4_g( bundle_n_parts * sizeof(struct part_aos_f4_g_send), cudaMemcpyHostToDevice, stream[bid]); // fprintf(stderr, "bid %i first_part %i nparts %i\n", bid, - //first_part_tmp, bundle_n_parts); + // first_part_tmp, bundle_n_parts); #ifdef CUDA_DEBUG cudaError_t cu_error = @@ -1906,7 +1903,7 @@ void runner_doself1_launch_f4_g( } #endif } /*End of looping over bundles to launch in streams*/ - // exit(0); + // exit(0); /* Make sure all the kernels and copies back are finished */ // cudaDeviceSynchronize(); @@ -1948,15 +1945,15 @@ void 
runner_doself1_launch_f4_g( } /*Time unpacking*/ clock_gettime(CLOCK_REALTIME, &tp0); - const ticks tic = getticks(); + const ticks tic = getticks(); /* Do the copy */ runner_doself1_gpu_unpack_neat_aos_f4_g(r, cii, parts_recv, 0, &pack_length_unpack, tid, pack_vars->count_max_parts, e); - const ticks toc = getticks(); + const ticks toc = getticks(); - total_cpu_unpack_ticks += toc - tic; + total_cpu_unpack_ticks += toc - tic; /*Time end of unpacking*/ clock_gettime(CLOCK_REALTIME, &tp1); *unpack_time += (tp1.tv_sec - tp0.tv_sec) + @@ -2355,15 +2352,15 @@ void runner_doself1_launch_f4_f( ; /* spin until we acquire the lock */ } clock_gettime(CLOCK_REALTIME, &tp0); - const ticks tic = getticks(); + const ticks tic = getticks(); /* Do the copy */ runner_doself1_gpu_unpack_neat_aos_f4_f(r, cii, parts_recv, 0, &pack_length_unpack, tid, pack_vars->count_max_parts, e); - const ticks toc = getticks(); + const ticks toc = getticks(); - total_cpu_unpack_ticks += toc - tic; + total_cpu_unpack_ticks += toc - tic; /* Record things for debugging */ cii->gpu_done_f++; clock_gettime(CLOCK_REALTIME, &tp1); @@ -2532,7 +2529,8 @@ void runner_dopair1_launch(struct runner *r, struct scheduler *s, // if (cu_error != cudaSuccess) { // fprintf(stderr, // "CUDA error with self density kernel launch: %s - //cpuid id is: %i\n ", cudaGetErrorString(cu_error), r->cpuid); exit(0); + // cpuid id is: %i\n ", + // cudaGetErrorString(cu_error), r->cpuid); exit(0); // } // #endif @@ -2545,10 +2543,10 @@ void runner_dopair1_launch(struct runner *r, struct scheduler *s, // #ifdef CUDA_DEBUG // cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // // // - //Get error code if (cu_error != cudaSuccess) { fprintf(stderr, "CUDA error - //with self density D2H memcpy: %s cpuid id is: %i\n ", - // cudaGetErrorString(cu_error), r->cpuid); - // error("Something's up with your cuda code"); + // Get error code if (cu_error != cudaSuccess) { + // fprintf(stderr, "CUDA error with self density + // D2H memcpy: %s cpuid id is: %i\n ", cudaGetErrorString(cu_error), + //r->cpuid); error("Something's up with your cuda code"); // } // #endif } /*End of looping over bundles to launch in streams*/ @@ -2637,7 +2635,7 @@ void runner_dopair1_launch_f4( if (tasks_packed == 0) error("zero pair tasks packed but somehow got into GPU loop"); // pack_vars->bundle_first_part[nBundles_temp] = - //pack_vars->task_first_part[packed_tmp - 2]; + // pack_vars->task_first_part[packed_tmp - 2]; pack_vars->bundle_first_part[nBundles_temp] = fparti_fpartj_lparti_lpartj_dens[tasks_packed - 1].x; } @@ -2845,7 +2843,7 @@ void runner_dopair1_launch_f4_one_memcpy( if (tasks_packed == 0) error("zero pair tasks packed but somehow got into GPU loop"); // pack_vars->bundle_first_part[nBundles_temp] = - //pack_vars->task_first_part[packed_tmp - 2]; + // pack_vars->task_first_part[packed_tmp - 2]; pack_vars->bundle_first_part[nBundles_temp] = fparti_fpartj_lparti_lpartj_dens[tasks_packed - 1].x; } @@ -2880,8 +2878,9 @@ void runner_dopair1_launch_f4_one_memcpy( fparti_fpartj_lparti_lpartj_dens[tid].y; parts_in_bundle_cj += count_j; max_parts_j = max(max_parts_j, count_j); -// if(count_i > 100 || count_j > 100) -// error("Sending data for excessive n parts %i %i", count_i, count_j); + // if(count_i > 100 || count_j > 100) + // error("Sending data for excessive n parts %i %i", + // count_i, count_j); } } const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; @@ -2902,7 +2901,8 @@ void runner_dopair1_launch_f4_one_memcpy( "CUDA error with pair density H2D async 
memcpy ci: %s cpuid id " "is: %i\n ", cudaGetErrorString(cu_error), r->cpuid); - error("Something's up with your cuda code first_part %i bundle size %i", first_part_tmp_i, bundle_n_parts); + error("Something's up with your cuda code first_part %i bundle size %i", + first_part_tmp_i, bundle_n_parts); } #endif /* LAUNCH THE GPU KERNELS for ci & cj */ @@ -2956,13 +2956,12 @@ void runner_dopair1_launch_f4_one_memcpy( *gpu_time += (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - /* Now copy the data back from the CPU thread-local buffers to the cells */ /* Pack length counter for use in unpacking */ int pack_length_unpack = 0; ticks total_cpu_unpack_ticks = 0; - + for (int bid = 0; bid < nBundles_temp; bid++) { /*Time unpacking*/ clock_gettime(CLOCK_REALTIME, &t0); @@ -2975,10 +2974,10 @@ void runner_dopair1_launch_f4_one_memcpy( (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; //////////// - + /*Time unpacking*/ // clock_gettime(CLOCK_REALTIME, &tp0); - + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { if (tid < tasks_packed) { @@ -2997,17 +2996,17 @@ void runner_dopair1_launch_f4_one_memcpy( ; /* spin until we acquire the lock */ } - const ticks tic = getticks(); - + const ticks tic = getticks(); + /* Do the copy */ runner_do_ci_cj_gpu_unpack_neat_aos_f4( r, cii, cjj, parts_recv, 0, &pack_length_unpack, tid, 2 * pack_vars->count_max_parts, e); - const ticks toc = getticks(); + const ticks toc = getticks(); + + total_cpu_unpack_ticks += toc - tic; - total_cpu_unpack_ticks += toc - tic; - /* Record things for debugging */ cii->gpu_done_pair++; cjj->gpu_done_pair++; @@ -3042,11 +3041,9 @@ void runner_dopair1_launch_f4_one_memcpy( // *packing_time += (t1.tv_sec - t0.tv_sec) + // (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - /* Write the timers back to the task */ t->total_cpu_unpack_ticks += total_cpu_unpack_ticks; - } /*End of GPU work*/ void runner_dopair1_launch_f4_mcpy_Ker_mcpy( @@ -3080,7 +3077,7 @@ void runner_dopair1_launch_f4_mcpy_Ker_mcpy( if (tasks_packed == 0) error("zero pair tasks packed but somehow got into GPU loop"); // pack_vars->bundle_first_part[nBundles_temp] = - //pack_vars->task_first_part[packed_tmp - 2]; + // pack_vars->task_first_part[packed_tmp - 2]; pack_vars->bundle_first_part[nBundles_temp] = fparti_fpartj_lparti_lpartj_dens[tasks_packed - 1].x; } @@ -3573,7 +3570,7 @@ void runner_dopair1_launch_f4_g_one_memcpy( if (tasks_packed == 0) error("zero pair tasks packed but somehow got into GPU loop"); // pack_vars->bundle_first_part[nBundles_temp] = - //pack_vars->task_first_part[packed_tmp - 2]; + // pack_vars->task_first_part[packed_tmp - 2]; pack_vars->bundle_first_part[nBundles_temp] = fparti_fpartj_lparti_lpartj[tasks_packed - 1].x; } @@ -3725,16 +3722,16 @@ void runner_dopair1_launch_f4_g_one_memcpy( ; /* spin until we acquire the lock */ } - const ticks tic = getticks(); + const ticks tic = getticks(); /* Do the copy */ runner_do_ci_cj_gpu_unpack_neat_aos_f4_g( r, cii, cjj, parts_recv, 0, &pack_length_unpack, tid, 2 * pack_vars->count_max_parts, e); - const ticks toc = getticks(); + const ticks toc = getticks(); - total_cpu_unpack_ticks += toc - tic; + total_cpu_unpack_ticks += toc - tic; /* Record things for debugging */ cii->gpu_done_pair_g++; @@ -3806,7 +3803,7 @@ void runner_dopair1_launch_f4_g_mcpy_Ker_mcpy( if (tasks_packed == 0) error("zero pair tasks packed but somehow got into GPU loop"); // pack_vars->bundle_first_part[nBundles_temp] = - //pack_vars->task_first_part[packed_tmp - 2]; + // 
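/* The CUDA_DEBUG blocks in these launch routines all repeat the same check:
 * peek at the last error, translate it with cudaGetErrorString() and abort.
 * One way to keep that readable is a small checking macro; this is an
 * illustrative sketch, not something the patch itself defines: */
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

#define EXAMPLE_CUDA_CHECK(call)                                       \
  do {                                                                 \
    cudaError_t err_ = (call);                                         \
    if (err_ != cudaSuccess) {                                         \
      fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \
              cudaGetErrorString(err_));                               \
      exit(1);                                                         \
    }                                                                  \
  } while (0)

/* Usage:
 *   EXAMPLE_CUDA_CHECK(cudaMemcpyAsync(dst, src, bytes,
 *                                      cudaMemcpyHostToDevice, stream));
 *   EXAMPLE_CUDA_CHECK(cudaPeekAtLastError());  // after a kernel launch
 */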
pack_vars->task_first_part[packed_tmp - 2]; pack_vars->bundle_first_part[nBundles_temp] = fparti_fpartj_lparti_lpartj[tasks_packed - 1].x; } @@ -4316,7 +4313,7 @@ void runner_dopair1_launch_f4_f_one_memcpy( if (tasks_packed == 0) error("zero pair tasks packed but somehow got into GPU loop"); // pack_vars->bundle_first_part[nBundles_temp] = - //pack_vars->task_first_part[packed_tmp - 2]; + // pack_vars->task_first_part[packed_tmp - 2]; pack_vars->bundle_first_part[nBundles_temp] = fparti_fpartj_lparti_lpartj[tasks_packed - 1].x; } @@ -4486,10 +4483,9 @@ void runner_dopair1_launch_f4_f_one_memcpy( r, cii, cjj, parts_recv, 0, &pack_length_unpack, tid, 2 * pack_vars->count_max_parts, e); + const ticks toc = getticks(); - const ticks toc = getticks(); - - total_cpu_unpack_ticks += toc - tic; + total_cpu_unpack_ticks += toc - tic; /* Record things for debugging */ cii->gpu_done_pair_f++; @@ -4557,7 +4553,7 @@ void runner_dopair1_launch_f4_f_mcpy_Ker_mcpy( if (tasks_packed == 0) error("zero pair tasks packed but somehow got into GPU loop"); // pack_vars->bundle_first_part[nBundles_temp] = - //pack_vars->task_first_part[packed_tmp - 2]; + // pack_vars->task_first_part[packed_tmp - 2]; pack_vars->bundle_first_part[nBundles_temp] = fparti_fpartj_lparti_lpartj[tasks_packed - 1].x; } diff --git a/src/runner_doiact_functions_limiter.h b/src/runner_doiact_functions_limiter.h index 44f6572b42..0d7e07de6a 100644 --- a/src/runner_doiact_functions_limiter.h +++ b/src/runner_doiact_functions_limiter.h @@ -123,7 +123,7 @@ void DOPAIR1_NAIVE(struct runner *r, struct cell *restrict ci, } } /* loop over the parts in cj. */ - } /* loop over the parts in ci. */ + } /* loop over the parts in ci. */ TIMER_TOC(TIMER_DOPAIR); } @@ -216,7 +216,7 @@ void DOSELF1_NAIVE(struct runner *r, struct cell *restrict c) { IACT_NONSYM(r2, dx, hj, hi, pj, pi, a, H); } } /* loop over the parts in cj. */ - } /* loop over the parts in ci. */ + } /* loop over the parts in ci. */ TIMER_TOC(TIMER_DOSELF); } @@ -355,8 +355,8 @@ void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj, const int sid, IACT_NONSYM(r2, dx, hi, hj, pi, pj, a, H); } } /* loop over the parts in cj. */ - } /* loop over the parts in ci. */ - } /* Cell ci is active */ + } /* loop over the parts in ci. */ + } /* Cell ci is active */ if (cell_is_starting_hydro(cj, e)) { @@ -439,8 +439,8 @@ void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj, const int sid, IACT_NONSYM(r2, dx, hj, hi, pj, pi, a, H); } } /* loop over the parts in ci. */ - } /* loop over the parts in cj. */ - } /* Cell cj is active */ + } /* loop over the parts in cj. */ + } /* Cell cj is active */ TIMER_TOC(TIMER_DOPAIR); } diff --git a/src/runner_doiact_functions_stars.h b/src/runner_doiact_functions_stars.h index 4cf6a7c9a9..b91066f509 100644 --- a/src/runner_doiact_functions_stars.h +++ b/src/runner_doiact_functions_stars.h @@ -150,7 +150,7 @@ void DOSELF1_STARS(struct runner *r, struct cell *c, int timer) { #endif } } /* loop over the parts in ci. */ - } /* loop over the sparts in ci. */ + } /* loop over the sparts in ci. */ TIMER_TOC(TIMER_DOSELF_STARS); } @@ -280,7 +280,7 @@ void DO_NONSYM_PAIR1_STARS_NAIVE(struct runner *r, struct cell *restrict ci, #endif } } /* loop over the parts in cj. */ - } /* loop over the parts in ci. */ + } /* loop over the parts in ci. */ } /** @@ -475,8 +475,8 @@ void DO_SYM_PAIR1_STARS(struct runner *r, struct cell *ci, struct cell *cj, #endif } } /* loop over the parts in cj. */ - } /* loop over the parts in ci. 
*/ - } /* do_ci_stars */ + } /* loop over the parts in ci. */ + } /* do_ci_stars */ if (do_cj_stars) { /* Pick-out the sorted lists. */ @@ -629,8 +629,8 @@ void DO_SYM_PAIR1_STARS(struct runner *r, struct cell *ci, struct cell *cj, #endif } } /* loop over the parts in ci. */ - } /* loop over the parts in cj. */ - } /* Cell cj is active */ + } /* loop over the parts in cj. */ + } /* Cell cj is active */ TIMER_TOC(TIMER_DOPAIR_STARS); } @@ -755,7 +755,7 @@ void DOPAIR1_SUBSET_STARS(struct runner *r, struct cell *restrict ci, #endif } } /* loop over the parts in cj. */ - } /* loop over the sparts in ci. */ + } /* loop over the sparts in ci. */ } /* Sparts are on the right. */ @@ -818,7 +818,7 @@ void DOPAIR1_SUBSET_STARS(struct runner *r, struct cell *restrict ci, #endif } } /* loop over the parts in cj. */ - } /* loop over the sparts in ci. */ + } /* loop over the sparts in ci. */ } } @@ -916,7 +916,7 @@ void DOPAIR1_SUBSET_STARS_NAIVE(struct runner *r, struct cell *restrict ci, #endif } } /* loop over the parts in cj. */ - } /* loop over the parts in ci. */ + } /* loop over the parts in ci. */ } /** @@ -1003,7 +1003,7 @@ void DOSELF1_SUBSET_STARS(struct runner *r, struct cell *restrict ci, #endif } } /* loop over the parts in cj. */ - } /* loop over the parts in ci. */ + } /* loop over the parts in ci. */ } /** diff --git a/src/runner_doiact_grav.c b/src/runner_doiact_grav.c index e2f50d7214..fb4fef6bd1 100644 --- a/src/runner_doiact_grav.c +++ b/src/runner_doiact_grav.c @@ -2509,7 +2509,7 @@ void runner_do_grav_long_range(struct runner *r, struct cell *ci, multi_i->pot.interacted = 1; } /* We are in charge of this pair */ - } /* Loop over top-level cells */ + } /* Loop over top-level cells */ if (timer) TIMER_TOC(timer_dograv_long_range); } diff --git a/src/runner_doiact_nosort.h b/src/runner_doiact_nosort.h index 4b500fe2e6..51d2412d0f 100644 --- a/src/runner_doiact_nosort.h +++ b/src/runner_doiact_nosort.h @@ -315,7 +315,7 @@ void DOPAIR_SUBSET_NOSORT(struct runner *r, struct cell *restrict ci, IACT_NONSYM(r2, dx, hi, pj->h, pi, pj); } } /* loop over the parts in cj. */ - } /* loop over the parts in ci. */ + } /* loop over the parts in ci. */ TIMER_TOC(timer_dopair_subset); } diff --git a/src/runner_gpu_pack_functions.c b/src/runner_gpu_pack_functions.c index 617b9b8e3b..c51d503352 100644 --- a/src/runner_gpu_pack_functions.c +++ b/src/runner_gpu_pack_functions.c @@ -88,8 +88,10 @@ void runner_doself1_gpu_pack_neat_aos_f4( #ifdef SWIFT_DEBUG_CHECKS if (local_pack_position + count >= count_max_parts_tmp) { - fprintf(stderr, "Exceeded count_max_parts_tmp. Make arrays bigger! count_max %i count %i\n" - , count_max_parts_tmp, local_pack_position + count); + fprintf(stderr, + "Exceeded count_max_parts_tmp. Make arrays bigger! 
count_max %i " + "count %i\n", + count_max_parts_tmp, local_pack_position + count); error("0"); } #endif @@ -958,7 +960,7 @@ void unpack_neat_aos_f4(struct cell *c, p->density.rot_v[2] += rot_ux_div_v.z; p->viscosity.div_v += rot_ux_div_v.w; // fprintf(stderr, "rho %f div_v %f\n", p_tmp.rho_dh_wcount.x, - //p_tmp.rot_ux_div_v.w); + // p_tmp.rot_ux_div_v.w); } } @@ -1009,7 +1011,7 @@ void unpack_neat_aos_f(struct cell *c, struct part_aos_f *parts_aos_buffer, p->u_dt += p_tmp.u_dt; p->force.h_dt += p_tmp.h_dt; // p->limiter_data.min_ngb_time_bin = min(p_tmp.min_ngb_time_bin, - //p->limiter_data.min_ngb_time_bin); + // p->limiter_data.min_ngb_time_bin); p->limiter_data.min_ngb_time_bin = p_tmp.min_ngb_time_bin; const float v_sig = p->viscosity.v_sig; p->viscosity.v_sig = max(p_tmp.v_sig, v_sig); @@ -1215,11 +1217,11 @@ void unpack_neat_pair_aos_f4_g( // (int i = 0; i < count; i++) { // int j = i + pp; // c->hydro.parts[i].viscosity.v_sig = - //parts_aos_buffer[j].vsig_lapu_aviscmax.x; + // parts_aos_buffer[j].vsig_lapu_aviscmax.x; // c->hydro.parts[i].diffusion.laplace_u += - //parts_aos_buffer[j].vsig_lapu_aviscmax.y; + // parts_aos_buffer[j].vsig_lapu_aviscmax.y; // c->hydro.parts[i].force.alpha_visc_max_ngb = - //parts_aos_buffer[j].vsig_lapu_aviscmax.z; + // parts_aos_buffer[j].vsig_lapu_aviscmax.z; // } if (cell_is_active_hydro(c, e)) { @@ -1944,8 +1946,9 @@ void runner_doself1_gpu_pack( // wcount_dh, rho_dh, rot_u, rot_v, rot_w, div_v, // div_v_previous_step, alpha_visc, v_sig, laplace_u, alpha_diff, f, // soundspeed, h_dt, balsara, pressure, alpha_visc_max_ngb, time_bin, - // wakeup, min_ngb_time_bin, to_be_synchronized, count_max_parts_tmp, - // fgpuin); fprintf(stderr,"working on a split cell\n"); + // wakeup, min_ngb_time_bin, to_be_synchronized, + // count_max_parts_tmp, fgpuin); fprintf(stderr,"working on a split + // cell\n"); // } // } // } @@ -2069,8 +2072,9 @@ void runner_doself1_gpu_unpack( // wcount_dh, rho_dh, rot_u, rot_v, rot_w, div_v, // div_v_previous_step, alpha_visc, v_sig, laplace_u, alpha_diff, f, // soundspeed, h_dt, balsara, pressure, alpha_visc_max_ngb, time_bin, - // wakeup, min_ngb_time_bin, to_be_synchronized, count_max_parts_tmp, - // fgpuin); fprintf(stderr,"working on a split cell\n"); + // wakeup, min_ngb_time_bin, to_be_synchronized, + // count_max_parts_tmp, fgpuin); fprintf(stderr,"working on a split + // cell\n"); // } // } // } else { diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 4a520db3c6..62907f1d17 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -657,9 +657,10 @@ void *runner_main2(void *data) { if ((res = MPI_Comm_size(MPI_COMM_WORLD, &nr_nodes)) != MPI_SUCCESS) error("MPI_Comm_size failed with error %i.", res); #endif - int parts_per_top_level_cell = space->nr_local_cells_with_particles / - space->nr_parts; /*A. Nasar: What I think is a good approximation for - average N particles in each top level cell*/ + int parts_per_top_level_cell = + space->nr_local_cells_with_particles / + space->nr_parts; /*A. Nasar: What I think is a good approximation for + average N particles in each top level cell*/ float eta_neighbours = e->s->eta_neighbours; int np_per_cell = ceil(2.0 * eta_neighbours); np_per_cell *= np_per_cell * np_per_cell; @@ -670,12 +671,12 @@ void *runner_main2(void *data) { int tot_self_tasks = space->nr_parts / np_per_cell; /*A. Nasar: Multiplication by 2 is also to ensure we do not over-run - * the allocated memory on buffers and GPU. 
This can happen if calculated h is - * larger than cell width and splitting makes bigger than target cells*/ - int count_max_parts_tmp = - 2 * target_n_tasks * (np_per_cell + buff); + * the allocated memory on buffers and GPU. This can happen if calculated h + * is larger than cell width and splitting makes bigger than target cells*/ + int count_max_parts_tmp = 2 * target_n_tasks * (np_per_cell + buff); -// message("np per cell %i, max_parts %i, n_tasks_GPU %i\n", np_per_cell, count_max_parts_tmp, target_n_tasks); + // message("np per cell %i, max_parts %i, n_tasks_GPU %i\n", np_per_cell, + // count_max_parts_tmp, target_n_tasks); pack_vars_self_dens->count_max_parts = count_max_parts_tmp; pack_vars_pair_dens->count_max_parts = count_max_parts_tmp; pack_vars_self_forc->count_max_parts = count_max_parts_tmp; @@ -993,8 +994,8 @@ void *runner_main2(void *data) { tasks_done_gpu_inc = 0; ticks hang_time = getticks(); while (1) { -// ticks tic_get_task = getticks(); - //A. Nasar: Get qid for re-use later + // ticks tic_get_task = getticks(); + // A. Nasar: Get qid for re-use later int qid = r->qid; /* If there's no old task, try to get a new one. */ if (t == NULL) { @@ -1018,10 +1019,10 @@ void *runner_main2(void *data) { struct cell *ci_temp = ci; struct cell *cj_temp = cj; double shift[3]; - if(t->subtype != task_subtype_gpu_unpack && - t->subtype != task_subtype_gpu_unpack_g && - t->subtype != task_subtype_gpu_unpack_f) - t->sid = space_getsid_and_swap_cells(e->s, &ci_temp, &cj_temp, shift); + if (t->subtype != task_subtype_gpu_unpack && + t->subtype != task_subtype_gpu_unpack_g && + t->subtype != task_subtype_gpu_unpack_f) + t->sid = space_getsid_and_swap_cells(e->s, &ci_temp, &cj_temp, shift); } else { t->sid = -1; } @@ -1066,30 +1067,31 @@ void *runner_main2(void *data) { runner_doself1_pack_f4(r, sched, pack_vars_self_dens, ci, t, parts_aos_f4_send, task_first_part_f4); - t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; + t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; /* No pack tasks left in queue, flag that we want to run */ int launch_leftovers = pack_vars_self_dens->launch_leftovers; n_cells++; - if(ci->hydro.count > 1.5 * np_per_cell){ - n_w_prts_gtr_target++; - message("count %i target %i", ci->hydro.count, np_per_cell); + if (ci->hydro.count > 1.5 * np_per_cell) { + n_w_prts_gtr_target++; + message("count %i target %i", ci->hydro.count, np_per_cell); } -// error("There's %i parts in a cell when it should be %i max", ci->hydro.count, np_per_cell); + // error("There's %i parts in a cell when it should + // be %i max", ci->hydro.count, np_per_cell); /*Packed enough tasks let's go*/ int launch = pack_vars_self_dens->launch; -// if ((sched->s_d_left[qid] < 1)){ -// launch_leftovers = 1; -// pack_vars_self_dens->launch_leftovers = 1; -// } + // if ((sched->s_d_left[qid] < 1)){ + // launch_leftovers = 1; + // pack_vars_self_dens->launch_leftovers = 1; + // } /* Do we have enough stuff to run the GPU ? 
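/* The sizing logic above fixes the host/device buffer length from the target
 * resolution: with resolution eta_neighbours = eta, a cell at the target size
 * holds roughly ceil(2*eta)^3 particles, and the buffers are sized as
 *
 *   np_per_cell     = ceil(2*eta)^3
 *   count_max_parts = 2 * target_n_tasks * (np_per_cell + buff)
 *
 * As a worked example with illustrative numbers (not the patch's defaults):
 * for the commonly used eta ~ 1.2348, ceil(2*eta) = 3 so np_per_cell = 27;
 * with a hypothetical target_n_tasks = 64 and buff = 5 this gives
 * count_max_parts = 2 * 64 * 32 = 4096 particle slots. The leading factor of
 * 2 is the safety margin discussed in the comment above, for cells that end
 * up larger than the target size after splitting. */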
*/ if (launch) n_full_d_bundles++; if (launch_leftovers) n_partial_d_bundles++; if (launch || launch_leftovers) { /*Launch GPU tasks*/ int t_packed = pack_vars_self_dens->tasks_packed; -// signal_sleeping_runners(sched, t, t_packed); + // signal_sleeping_runners(sched, t, t_packed); runner_doself1_launch_f4( r, sched, pack_vars_self_dens, ci, t, parts_aos_f4_send, parts_aos_f4_recv, d_parts_aos_f4_send, d_parts_aos_f4_recv, @@ -1109,22 +1111,22 @@ void *runner_main2(void *data) { r, sched, pack_vars_self_grad, ci, t, parts_aos_grad_f4_send, task_first_part_f4_g); - t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; + t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; /* No pack tasks left in queue, flag that we want to run */ int launch_leftovers = pack_vars_self_grad->launch_leftovers; /*Packed enough tasks let's go*/ int launch = pack_vars_self_grad->launch; -// if ((sched->s_g_left[qid] < 1)){ -// launch_leftovers = 1; -// pack_vars_self_grad->launch_leftovers = 1; -// } + // if ((sched->s_g_left[qid] < 1)){ + // launch_leftovers = 1; + // pack_vars_self_grad->launch_leftovers = 1; + // } /* Do we have enough stuff to run the GPU ? */ if (launch || launch_leftovers) { /*Launch GPU tasks*/ int t_packed = pack_vars_self_grad->tasks_packed; -// signal_sleeping_runners(sched, t, t_packed); + // signal_sleeping_runners(sched, t, t_packed); runner_doself1_launch_f4_g( r, sched, pack_vars_self_grad, ci, t, parts_aos_grad_f4_send, parts_aos_grad_f4_recv, d_parts_aos_grad_f4_send, @@ -1132,7 +1134,7 @@ void *runner_main2(void *data) { &packing_time_g, &time_for_gpu_g, task_first_part_f4_g, d_task_first_part_f4_g, self_end_g, &unpack_time_self_g); } /*End of GPU work Self*/ -#endif // GPUGRADSELF +#endif // GPUGRADSELF } else if (t->subtype == task_subtype_gpu_pack_f) { packed_self_f++; #ifdef GPUOFFLOAD_FORCE @@ -1142,22 +1144,22 @@ void *runner_main2(void *data) { r, sched, pack_vars_self_forc, ci, t, parts_aos_forc_f4_send, task_first_part_f4_f); - t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; + t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; /* No pack tasks left in queue, flag that we want to run */ int launch_leftovers = pack_vars_self_forc->launch_leftovers; /*Packed enough tasks let's go*/ int launch = pack_vars_self_forc->launch; -// if ((sched->s_f_left[qid] < 1)){ -// launch_leftovers = 1; -// pack_vars_self_forc->launch_leftovers = 1; -// } + // if ((sched->s_f_left[qid] < 1)){ + // launch_leftovers = 1; + // pack_vars_self_forc->launch_leftovers = 1; + // } /* Do we have enough stuff to run the GPU ? 
*/ if (launch || launch_leftovers) { /*Launch GPU tasks*/ int t_packed = pack_vars_self_forc->tasks_packed; -// signal_sleeping_runners(sched, t, t_packed); + // signal_sleeping_runners(sched, t, t_packed); runner_doself1_launch_f4_f( r, sched, pack_vars_self_forc, ci, t, parts_aos_forc_f4_send, parts_aos_forc_f4_recv, d_parts_aos_forc_f4_send, @@ -1299,13 +1301,13 @@ void *runner_main2(void *data) { } else { #endif // DO_CORNERS - ticks tic_cpu_pack = getticks(); + ticks tic_cpu_pack = getticks(); - packing_time_pair += runner_dopair1_pack_f4( + packing_time_pair += runner_dopair1_pack_f4( r, sched, pack_vars_pair_dens, ci, cj, t, parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens); - t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; + t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; /* Packed enough tasks or no pack tasks left in queue, flag that * we want to run */ int launch = pack_vars_pair_dens->launch; @@ -1315,14 +1317,14 @@ void *runner_main2(void *data) { if (launch) n_full_p_d_bundles++; if (launch_leftovers) n_partial_p_d_bundles++; -// if ((sched->p_d_left[qid] < 1)){ -// launch_leftovers = 1; -// pack_vars_pair_dens->launch_leftovers = 1; -// } + // if ((sched->p_d_left[qid] < 1)){ + // launch_leftovers = 1; + // pack_vars_pair_dens->launch_leftovers = 1; + // } if (launch || launch_leftovers) { /*Launch GPU tasks*/ - int t_packed = pack_vars_pair_dens->tasks_packed; -// signal_sleeping_runners(sched, t, t_packed); + int t_packed = pack_vars_pair_dens->tasks_packed; + // signal_sleeping_runners(sched, t, t_packed); runner_dopair1_launch_f4_one_memcpy( r, sched, pack_vars_pair_dens, t, parts_aos_pair_f4_send, parts_aos_pair_f4_recv, d_parts_aos_pair_f4_send, @@ -1334,9 +1336,9 @@ void *runner_main2(void *data) { pack_vars_pair_dens->launch_leftovers = 0; #ifdef DO_CORNERS } /* End of GPU work Pairs */ -#endif // DO_CORNERS -#endif // GPUOFFLOAD_DENSITY - } /* pair / pack */ +#endif // DO_CORNERS +#endif // GPUOFFLOAD_DENSITY + } /* pair / pack */ else if (t->subtype == task_subtype_gpu_pack_g) { packed_pair_g++; #ifdef GPUOFFLOAD_GRADIENT @@ -1383,29 +1385,29 @@ void *runner_main2(void *data) { } } else { #endif // DO_CORNERS - ticks tic_cpu_pack = getticks(); + ticks tic_cpu_pack = getticks(); packing_time_pair_g += runner_dopair1_pack_f4_g(r, sched, pack_vars_pair_grad, ci, cj, t, parts_aos_pair_f4_g_send, e, fparti_fpartj_lparti_lpartj_grad); - t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; + t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; /* No pack tasks left in queue, flag that we want to run */ int launch_leftovers = pack_vars_pair_grad->launch_leftovers; /*Packed enough tasks, let's go*/ int launch = pack_vars_pair_grad->launch; -// if ((sched->p_g_left[qid] < 1)){ -// launch_leftovers = 1; -// pack_vars_pair_grad->launch_leftovers = 1; -// } + // if ((sched->p_g_left[qid] < 1)){ + // launch_leftovers = 1; + // pack_vars_pair_grad->launch_leftovers = 1; + // } /* Do we have enough stuff to run the GPU ? 
*/ if (launch || launch_leftovers) { /*Launch GPU tasks*/ - int t_packed = pack_vars_pair_grad->tasks_packed; -// signal_sleeping_runners(sched, t, t_packed); + int t_packed = pack_vars_pair_grad->tasks_packed; + // signal_sleeping_runners(sched, t, t_packed); runner_dopair1_launch_f4_g_one_memcpy( r, sched, pack_vars_pair_grad, t, parts_aos_pair_f4_g_send, parts_aos_pair_f4_g_recv, d_parts_aos_pair_f4_g_send, @@ -1417,8 +1419,8 @@ void *runner_main2(void *data) { pack_vars_pair_grad->launch_leftovers = 0; #ifdef DO_CORNERS } /* End of GPU work Pairs */ -#endif // DO_CORNERS -#endif // GPUOFFLOAD_GRADIENT +#endif // DO_CORNERS +#endif // GPUOFFLOAD_GRADIENT } else if (t->subtype == task_subtype_gpu_pack_f) { packed_pair_f++; #ifdef GPUOFFLOAD_FORCE @@ -1481,15 +1483,15 @@ void *runner_main2(void *data) { int launch_leftovers = pack_vars_pair_forc->launch_leftovers; /*Packed enough tasks let's go*/ int launch = pack_vars_pair_forc->launch; -// if ((sched->p_f_left[qid] < 1)){ -// launch_leftovers = 1; -// pack_vars_pair_forc->launch_leftovers = 1; -// } + // if ((sched->p_f_left[qid] < 1)){ + // launch_leftovers = 1; + // pack_vars_pair_forc->launch_leftovers = 1; + // } /* Do we have enough stuff to run the GPU ? */ if (launch || launch_leftovers) { /*Launch GPU tasks*/ - int t_packed = pack_vars_pair_forc->tasks_packed; -// signal_sleeping_runners(sched, t, t_packed); + int t_packed = pack_vars_pair_forc->tasks_packed; + // signal_sleeping_runners(sched, t, t_packed); runner_dopair1_launch_f4_f_one_memcpy( r, sched, pack_vars_pair_forc, t, parts_aos_pair_f4_f_send, parts_aos_pair_f4_f_recv, d_parts_aos_pair_f4_f_send, @@ -1591,14 +1593,13 @@ void *runner_main2(void *data) { #ifdef EXTRA_HYDRO_LOOP else if (t->subtype == task_subtype_gradient) { runner_dosub_self1_gradient(r, ci, 1); -// fprintf(stderr, "split a g task\n"); + // fprintf(stderr, "split a g task\n"); } #endif else if (t->subtype == task_subtype_force) { runner_dosub_self2_force(r, ci, 1); -// fprintf(stderr, "split a f task\n"); - } - else if (t->subtype == task_subtype_limiter) + // fprintf(stderr, "split a f task\n"); + } else if (t->subtype == task_subtype_limiter) runner_dosub_self1_limiter(r, ci, 1); else if (t->subtype == task_subtype_stars_density) runner_dosub_self_stars_density(r, ci, 1); @@ -1638,20 +1639,19 @@ void *runner_main2(void *data) { case task_type_sub_pair: if (t->subtype == task_subtype_density) { int nothing = 0; -// message("Doing a pair sub task"); + // message("Doing a pair sub task"); runner_dosub_pair1_density(r, ci, cj, 1); } #ifdef EXTRA_HYDRO_LOOP else if (t->subtype == task_subtype_gradient) { runner_dosub_pair1_gradient(r, ci, cj, 1); -// fprintf(stderr, "split a g task\n"); + // fprintf(stderr, "split a g task\n"); } #endif else if (t->subtype == task_subtype_force) { runner_dosub_pair2_force(r, ci, cj, 1); -// fprintf(stderr, "split a f task\n"); - } - else if (t->subtype == task_subtype_limiter) + // fprintf(stderr, "split a f task\n"); + } else if (t->subtype == task_subtype_limiter) runner_dosub_pair1_limiter(r, ci, cj, 1); else if (t->subtype == task_subtype_stars_density) runner_dosub_pair_stars_density(r, ci, cj, 1); @@ -1904,7 +1904,8 @@ void *runner_main2(void *data) { } r->active_time += (getticks() - task_beg); // if(g100 > 0) -// message("less than 100 %i more than 100 %i max count %i", l100, g100, maxcount); +// message("less than 100 %i more than 100 %i max count %i", l100, g100, +// maxcount); /* Mark that we have run this task on these cells */ #ifdef SWIFT_DEBUG_CHECKS @@ 
-1927,8 +1928,8 @@ void *runner_main2(void *data) { #ifdef GPUOFFLOAD_DENSITY /* Don't enqueue unpacks yet. Just signal the runners */ t->skip = 1; - t->toc = getticks(); - t->total_ticks += t->toc - t->tic; + t->toc = getticks(); + t->total_ticks += t->toc - t->tic; t = NULL; #else t = scheduler_done(sched, t); @@ -1939,8 +1940,8 @@ void *runner_main2(void *data) { #ifdef GPUOFFLOAD_GRADIENT /* Don't enqueue unpacks yet. Just signal the runners */ t->skip = 1; - t->toc = getticks(); - t->total_ticks += t->toc - t->tic; + t->toc = getticks(); + t->total_ticks += t->toc - t->tic; t = NULL; #else t = scheduler_done(sched, t); @@ -1951,8 +1952,8 @@ void *runner_main2(void *data) { #ifdef GPUOFFLOAD_FORCE /* Don't enqueue unpacks yet. Just signal the runners */ t->skip = 1; - t->toc = getticks(); - t->total_ticks += t->toc - t->tic; + t->toc = getticks(); + t->total_ticks += t->toc - t->tic; t = NULL; #else t = scheduler_done(sched, t); @@ -1960,45 +1961,45 @@ void *runner_main2(void *data) { } else if (t->subtype != task_subtype_gpu_pack && - t->subtype != task_subtype_gpu_pack_g && - t->subtype != task_subtype_gpu_pack_f) { + t->subtype != task_subtype_gpu_pack_g && + t->subtype != task_subtype_gpu_pack_f) { t = scheduler_done(sched, t); } } /* main loop. */ message("cpu %i packed %i cells with %i containing more parts than target", - r->cpuid, n_cells, n_w_prts_gtr_target); - -// message("Worked on %i supers w more than 100 parts", g100); - // Stuff for writing debug data to file for validation - //// if (step % 10 == 0 || step == 1) { - // if(r->cpuid == 0 && engine_rank == 0)fprintf(fgpu_steps, "x, y, z, - // rho, rhodh, v_sig, lap_u, a_visc_max, ax, ay, az\n"); for (int tid - // = 0; tid < space->nr_local_cells; - // tid++) { /* This should indeed be tasks_done_gpu as they are - // the only - //// tasks which have been done*/ - // struct cell *ctemp = &(space->cells_top[tid]); - // for (int i = 0; i < ctemp->hydro.count; i++) { - // fprintf(fgpu_steps, "%f, %f, %f, %f, %f, %f, %f, %f, %f, %f, - // %f, %f\n", - // ctemp->hydro.parts[i].x[0], - // ctemp->hydro.parts[i].x[1], - // ctemp->hydro.parts[i].x[2], ctemp->hydro.parts[i].rho, - // ctemp->hydro.parts[i].density.rho_dh, - // ctemp->hydro.parts[i].viscosity.v_sig, - // ctemp->hydro.parts[i].diffusion.laplace_u, - // ctemp->hydro.parts[i].force.alpha_visc_max_ngb, - // ctemp->hydro.parts[i].a_hydro[0], - // ctemp->hydro.parts[i].a_hydro[1], - // ctemp->hydro.parts[i].a_hydro[2]); - //// message("wcount %f density %f", - /// ctemp->hydro.parts[i].density.wcount, ctemp->hydro.parts[i].rho); / - /// message("wcount is %f\n", ctemp->hydro.parts[i].density.wcount); - // } - // } - //// } - /*Output compute times to separate files. 
cat later into one file*/ + r->cpuid, n_cells, n_w_prts_gtr_target); + + // message("Worked on %i supers w more than 100 parts", g100); + // Stuff for writing debug data to file for validation + //// if (step % 10 == 0 || step == 1) { + // if(r->cpuid == 0 && engine_rank == 0)fprintf(fgpu_steps, "x, y, z, + // rho, rhodh, v_sig, lap_u, a_visc_max, ax, ay, az\n"); for (int tid + // = 0; tid < space->nr_local_cells; + // tid++) { /* This should indeed be tasks_done_gpu as they are + // the only + //// tasks which have been done*/ + // struct cell *ctemp = &(space->cells_top[tid]); + // for (int i = 0; i < ctemp->hydro.count; i++) { + // fprintf(fgpu_steps, "%f, %f, %f, %f, %f, %f, %f, %f, %f, %f, + // %f, %f\n", + // ctemp->hydro.parts[i].x[0], + // ctemp->hydro.parts[i].x[1], + // ctemp->hydro.parts[i].x[2], ctemp->hydro.parts[i].rho, + // ctemp->hydro.parts[i].density.rho_dh, + // ctemp->hydro.parts[i].viscosity.v_sig, + // ctemp->hydro.parts[i].diffusion.laplace_u, + // ctemp->hydro.parts[i].force.alpha_visc_max_ngb, + // ctemp->hydro.parts[i].a_hydro[0], + // ctemp->hydro.parts[i].a_hydro[1], + // ctemp->hydro.parts[i].a_hydro[2]); + //// message("wcount %f density %f", + /// ctemp->hydro.parts[i].density.wcount, ctemp->hydro.parts[i].rho); / + /// message("wcount is %f\n", ctemp->hydro.parts[i].density.wcount); + // } + // } + //// } + /*Output compute times to separate files. cat later into one file*/ // if (step % 11 == 0 || step == 1) { #ifdef DUMP_TIMINGS #if defined(GPUOFFLOAD_DENSITY) || defined(GPUOFFLOAD_GRADIENT) || \ @@ -2088,8 +2089,8 @@ void *runner_main2(void *data) { // size_t total_byte ; // cudaError_t cuda_status = cudaMemGetInfo( &free_byte, //&total_byte ) ; double free = (double)free_byte; double - //available = (double)total_byte; double used = (available - free); - //fprintf(stderr, "Used %f GB GPU memory\n", used/1e9); + // available = (double)total_byte; double used = (available - free); + // fprintf(stderr, "Used %f GB GPU memory\n", used/1e9); /* Wait at the wait barrier. */ // swift_barrier_wait(&e->wait_barrier); } diff --git a/src/runner_others.c b/src/runner_others.c index f25dcb0b44..cbace92a63 100644 --- a/src/runner_others.c +++ b/src/runner_others.c @@ -296,7 +296,7 @@ void runner_do_star_formation_sink(struct runner *r, struct cell *c, sink_update_sink_properties_after_star_formation(s, e, sink_props, phys_const); } /* if sink_is_active */ - } /* Loop over the particles */ + } /* Loop over the particles */ } /* If we formed any stars, the star sorts are now invalid. We need to diff --git a/src/runner_sinks.c b/src/runner_sinks.c index 245417bfbc..eeece9401b 100644 --- a/src/runner_sinks.c +++ b/src/runner_sinks.c @@ -109,8 +109,8 @@ void runner_doself_sinks_swallow(struct runner *r, struct cell *c, int timer) { e->gravity_properties, e->sink_properties); } } /* loop over the parts in ci. */ - } /* loop over the bparts in ci. */ - } /* Do we have gas particles in the cell? */ + } /* loop over the bparts in ci. */ + } /* Do we have gas particles in the cell? */ /* When doing sink swallowing, we need a quick loop also over the sink * neighbours */ @@ -165,7 +165,7 @@ void runner_doself_sinks_swallow(struct runner *r, struct cell *c, int timer) { e->gravity_properties, e->sink_properties); } } /* loop over the sinks in ci. */ - } /* loop over the sinks in ci. */ + } /* loop over the sinks in ci. 
*/ if (timer) TIMER_TOC(timer_doself_sink_swallow); } @@ -252,8 +252,8 @@ void runner_do_nonsym_pair_sinks_naive_swallow(struct runner *r, e->gravity_properties, e->sink_properties); } } /* loop over the parts in cj. */ - } /* loop over the sinks in ci. */ - } /* Do we have gas particles in the cell? */ + } /* loop over the sinks in ci. */ + } /* Do we have gas particles in the cell? */ /* When doing sink swallowing, we need a quick loop also over the sinks * neighbours */ @@ -308,7 +308,7 @@ void runner_do_nonsym_pair_sinks_naive_swallow(struct runner *r, e->gravity_properties, e->sink_properties); } } /* loop over the sinks in cj. */ - } /* loop over the sinks in ci. */ + } /* loop over the sinks in ci. */ } /** @@ -682,7 +682,7 @@ void runner_do_sinks_gas_swallow(struct runner *r, struct cell *c, int timer) { get_integer_time_begin(ti_current + 1, p->time_bin); ti_beg_max = max(ti_beg, ti_beg_max); } /* Loop over the parts */ - } /* Cell is not split */ + } /* Cell is not split */ /* Update ti_beg_max. See bug fix above. */ if (ti_beg_max != c->hydro.ti_beg_max) { @@ -875,8 +875,8 @@ void runner_do_sinks_sink_swallow(struct runner *r, struct cell *c, int timer) { } } /* Part was flagged for swallowing */ - } /* Loop over the parts */ - } /* Cell is not split */ + } /* Loop over the parts */ + } /* Cell is not split */ } /** diff --git a/src/scheduler.c b/src/scheduler.c index e08463e110..e96e10dd63 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -1385,7 +1385,7 @@ static void scheduler_splittask_hydro(struct task *t, struct scheduler *s) { } } } /* pair interaction? */ - } /* iterate over the current task. */ + } /* iterate over the current task. */ } /** @@ -1460,9 +1460,9 @@ static void scheduler_splittask_gravity(struct task *t, struct scheduler *s) { s); } /* Self-gravity only */ - } /* Make tasks explicitly */ - } /* Cell is split */ - } /* Self interaction */ + } /* Make tasks explicitly */ + } /* Cell is split */ + } /* Self interaction */ /* Pair interaction? */ else if (t->type == task_type_pair) { @@ -1535,7 +1535,7 @@ static void scheduler_splittask_gravity(struct task *t, struct scheduler *s) { } /* Split the pair */ } } /* pair interaction? */ - } /* iterate over the current task. */ + } /* iterate over the current task. */ } /** @@ -1650,7 +1650,7 @@ void scheduler_splittasks_mapper(void *map_data, int num_elements, /* Invoke the correct splitting strategy */ if (t->subtype == task_subtype_density) { - scheduler_splittask_hydro(t, s); + scheduler_splittask_hydro(t, s); } else if (t->subtype == task_subtype_external_grav) { scheduler_splittask_gravity(t, s); } else if (t->subtype == task_subtype_grav) { @@ -1659,17 +1659,16 @@ void scheduler_splittasks_mapper(void *map_data, int num_elements, } else if (t->subtype == task_subtype_gpu_pack || t->subtype == task_subtype_gpu_pack_g || t->subtype == task_subtype_gpu_pack_f) { - scheduler_splittask_hydro(t, s); + scheduler_splittask_hydro(t, s); } else if (t->subtype == task_subtype_gpu_unpack || - t->subtype == task_subtype_gpu_unpack_g || - t->subtype == task_subtype_gpu_unpack_f){ - /*Do nothing and grab next task to split. - *These tasks are cell-less so cannot split. - *Will remove this if statement if set on splitting - *b4 creating unpack tasks*/ - continue; - } - else { + t->subtype == task_subtype_gpu_unpack_g || + t->subtype == task_subtype_gpu_unpack_f) { + /*Do nothing and grab next task to split. + *These tasks are cell-less so cannot split. 
+ *Will remove this if statement if set on splitting + *b4 creating unpack tasks*/ + continue; + } else { #ifdef SWIFT_DEBUG_CHECKS error("Unexpected task sub-type %s/%s", taskID_names[t->type], subtaskID_names[t->subtype]); @@ -1769,23 +1768,23 @@ struct task *scheduler_addtask(struct scheduler *s, enum task_types type, if (cj != NULL) cell_set_flag(cj, cell_flag_has_tasks); // #ifdef WITH_CUDA A. Nasar - if(t->subtype == task_subtype_gpu_pack){ - if(t->type == task_type_self || t->type == task_type_sub_self) - atomic_inc(&s->nr_self_pack_tasks_d); - if(t->type == task_type_pair || t->type == task_type_sub_pair) - atomic_inc(&s->nr_pair_pack_tasks_d); + if (t->subtype == task_subtype_gpu_pack) { + if (t->type == task_type_self || t->type == task_type_sub_self) + atomic_inc(&s->nr_self_pack_tasks_d); + if (t->type == task_type_pair || t->type == task_type_sub_pair) + atomic_inc(&s->nr_pair_pack_tasks_d); } - if(t->subtype == task_subtype_gpu_pack_f){ - if(t->type == task_type_self || t->type == task_type_sub_self) - atomic_inc(&s->nr_self_pack_tasks_f); - if(t->type == task_type_pair || t->type == task_type_sub_pair) - atomic_inc(&s->nr_pair_pack_tasks_f); + if (t->subtype == task_subtype_gpu_pack_f) { + if (t->type == task_type_self || t->type == task_type_sub_self) + atomic_inc(&s->nr_self_pack_tasks_f); + if (t->type == task_type_pair || t->type == task_type_sub_pair) + atomic_inc(&s->nr_pair_pack_tasks_f); } - if(t->subtype == task_subtype_gpu_pack_g){ - if(t->type == task_type_self || t->type == task_type_sub_self) - atomic_inc(&s->nr_self_pack_tasks_g); - if(t->type == task_type_pair || t->type == task_type_sub_pair) - atomic_inc(&s->nr_pair_pack_tasks_g); + if (t->subtype == task_subtype_gpu_pack_g) { + if (t->type == task_type_self || t->type == task_type_sub_self) + atomic_inc(&s->nr_self_pack_tasks_g); + if (t->type == task_type_pair || t->type == task_type_sub_pair) + atomic_inc(&s->nr_pair_pack_tasks_g); } // #endif /* Add an index for it. */ @@ -2256,17 +2255,17 @@ void scheduler_reweight(struct scheduler *s, int verbose) { } else if (t->subtype == task_subtype_do_bh_swallow) { cost = 1.f * wscale * (bcount_i + bcount_j); } else if (t->subtype == task_subtype_gpu_pack) { - cost = 2.f * (wscale * count_i) * count_i; - } else if (t->subtype == task_subtype_gpu_pack_f) { - cost = 2.f * (wscale * count_i) * count_i; - } else if (t->subtype == task_subtype_gpu_pack_g) { - cost = 2.f * (wscale * count_i) * count_i; - } else if (t->subtype == task_subtype_gpu_unpack) { - cost = 1.f * wscale; - } else if (t->subtype == task_subtype_gpu_unpack_f) { - cost = 1.f * wscale; - } else if (t->subtype == task_subtype_gpu_unpack_g) { - cost = 1.f * wscale; + cost = 2.f * (wscale * count_i) * count_i; + } else if (t->subtype == task_subtype_gpu_pack_f) { + cost = 2.f * (wscale * count_i) * count_i; + } else if (t->subtype == task_subtype_gpu_pack_g) { + cost = 2.f * (wscale * count_i) * count_i; + } else if (t->subtype == task_subtype_gpu_unpack) { + cost = 1.f * wscale; + } else if (t->subtype == task_subtype_gpu_unpack_f) { + cost = 1.f * wscale; + } else if (t->subtype == task_subtype_gpu_unpack_g) { + cost = 1.f * wscale; } else if (t->subtype == task_subtype_density || t->subtype == task_subtype_gradient || t->subtype == task_subtype_force || @@ -2306,21 +2305,21 @@ void scheduler_reweight(struct scheduler *s, int verbose) { } else if (t->subtype == task_subtype_do_bh_swallow) { cost = 1.f * wscale * bcount_i; } else if (t->subtype == task_subtype_gpu_pack) // A. 
Nasar - cost = 1.f * (wscale * count_i) * count_i; // * s->pack_size; - else if (t->subtype == task_subtype_gpu_pack_f) - cost = 1.f * (wscale * count_i) * count_i; // * s->pack_size; - else if (t->subtype == task_subtype_gpu_pack_g) - cost = 1.f * (wscale * count_i) * count_i; // * s->pack_size; - else if (t->subtype == task_subtype_gpu_unpack) - cost = 1.f * wscale * s->pack_size; - else if (t->subtype == task_subtype_gpu_unpack_f) - cost = 1.f * wscale * s->pack_size; - else if (t->subtype == task_subtype_gpu_unpack_g) - cost = 1.f * wscale * s->pack_size; - else if (t->subtype == task_subtype_density || - t->subtype == task_subtype_gradient || - t->subtype == task_subtype_force || - t->subtype == task_subtype_limiter) { + cost = 1.f * (wscale * count_i) * count_i; // * s->pack_size; + else if (t->subtype == task_subtype_gpu_pack_f) + cost = 1.f * (wscale * count_i) * count_i; // * s->pack_size; + else if (t->subtype == task_subtype_gpu_pack_g) + cost = 1.f * (wscale * count_i) * count_i; // * s->pack_size; + else if (t->subtype == task_subtype_gpu_unpack) + cost = 1.f * wscale * s->pack_size; + else if (t->subtype == task_subtype_gpu_unpack_f) + cost = 1.f * wscale * s->pack_size; + else if (t->subtype == task_subtype_gpu_unpack_g) + cost = 1.f * wscale * s->pack_size; + else if (t->subtype == task_subtype_density || + t->subtype == task_subtype_gradient || + t->subtype == task_subtype_force || + t->subtype == task_subtype_limiter) { cost = 1.f * (wscale * count_i) * count_i; } else if (t->subtype == task_subtype_rt_gradient) { cost = 1.f * wscale * scount_i * count_i; @@ -2898,22 +2897,22 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) { queue_insert(&s->queues[qid], t); /* A. Nasar: Increment counters required for the pack tasks */ if (t->type == task_type_self || t->type == task_type_sub_self) { - if (t->subtype == task_subtype_gpu_pack){ - lock_lock(&s->queues[qid].lock); + if (t->subtype == task_subtype_gpu_pack) { + lock_lock(&s->queues[qid].lock); s->queues[qid].n_packs_self_left_d++; - lock_unlock(&s->queues[qid].lock); + lock_unlock(&s->queues[qid].lock); atomic_inc(&s->s_d_left[qid]); } - if (t->subtype == task_subtype_gpu_pack_f){ - lock_lock(&s->queues[qid].lock); + if (t->subtype == task_subtype_gpu_pack_f) { + lock_lock(&s->queues[qid].lock); s->queues[qid].n_packs_self_left_f++; - lock_unlock(&s->queues[qid].lock); + lock_unlock(&s->queues[qid].lock); atomic_inc(&s->s_f_left[qid]); } - if (t->subtype == task_subtype_gpu_pack_g){ - lock_lock(&s->queues[qid].lock); + if (t->subtype == task_subtype_gpu_pack_g) { + lock_lock(&s->queues[qid].lock); s->queues[qid].n_packs_self_left_g++; - lock_unlock(&s->queues[qid].lock); + lock_unlock(&s->queues[qid].lock); atomic_inc(&s->s_g_left[qid]); } } @@ -2921,21 +2920,21 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) { MPI where ci may not be on this node/rank */ if (t->type == task_type_pair || t->type == task_type_sub_pair) { if (t->subtype == task_subtype_gpu_pack) { - lock_lock(&s->queues[qid].lock); + lock_lock(&s->queues[qid].lock); s->queues[qid].n_packs_pair_left_d++; - lock_unlock(&s->queues[qid].lock); + lock_unlock(&s->queues[qid].lock); atomic_inc(&s->p_d_left[qid]); } if (t->subtype == task_subtype_gpu_pack_f) { - lock_lock(&s->queues[qid].lock); + lock_lock(&s->queues[qid].lock); s->queues[qid].n_packs_pair_left_f++; - lock_unlock(&s->queues[qid].lock); + lock_unlock(&s->queues[qid].lock); atomic_inc(&s->p_f_left[qid]); } if (t->subtype == task_subtype_gpu_pack_g) { - 
lock_lock(&s->queues[qid].lock); + lock_lock(&s->queues[qid].lock); s->queues[qid].n_packs_pair_left_g++; - lock_unlock(&s->queues[qid].lock); + lock_unlock(&s->queues[qid].lock); atomic_inc(&s->p_g_left[qid]); } } @@ -2990,7 +2989,8 @@ struct task *scheduler_done(struct scheduler *s, struct task *t) { return NULL; } -struct task *signal_sleeping_runners(struct scheduler *s, struct task *t, int tasks_packed) { +struct task *signal_sleeping_runners(struct scheduler *s, struct task *t, + int tasks_packed) { /* Mark the task as skip. */ // t->skip = 1; @@ -3151,7 +3151,7 @@ struct task *scheduler_gettask(struct scheduler *s, int qid, if (qid >= nr_queues || qid < 0) error("Bad queue ID."); /*Get a pointer to our queue for re-use*/ - struct queue * q = &s->queues[qid]; + struct queue *q = &s->queues[qid]; /* Loop as long as there are tasks... */ while (s->waiting > 0 && res == NULL) { /* Try more than once before sleeping. */ @@ -3172,7 +3172,7 @@ struct task *scheduler_gettask(struct scheduler *s, int qid, /* Make list of queues that have 1 or more tasks in them */ for (int k = 0; k < nr_queues; k++) { - if(k == qid) continue; + if (k == qid) continue; if (s->queues[k].count > 0 || s->queues[k].count_incoming > 0) { qids[count++] = k; } @@ -3185,85 +3185,85 @@ struct task *scheduler_gettask(struct scheduler *s, int qid, /*Get a pointer to the queue we're stealing from*/ int qstl_id = qids[ind]; - /* If we got the queue we already have, abort */ - if(qid == qstl_id){ - /* Reduce the size of the list of non-empty queues */ - qids[ind] = qids[--count]; - continue; + /* If we got the queue we already have, abort */ + if (qid == qstl_id) { + /* Reduce the size of the list of non-empty queues */ + qids[ind] = qids[--count]; + continue; } - /* The queue we are stealing from */ - struct queue * q_stl = &s->queues[qstl_id]; + /* The queue we are stealing from */ + struct queue *q_stl = &s->queues[qstl_id]; - /* Can we lock our own queue? */ - if (lock_trylock(&q->lock) != 0) { + /* Can we lock our own queue? */ + if (lock_trylock(&q->lock) != 0) { - /* No --> continue and try a different queue */ - continue; - - } else { + /* No --> continue and try a different queue */ + continue; - /* Yes --> Try locking the que we steal from */ - if (lock_trylock(&q_stl->lock) != 0) { + } else { - /* Failed? --> Unlock the 1st queue and - try again */ - lock_unlock(&q->lock); - continue; - } - } + /* Yes --> Try locking the que we steal from */ + if (lock_trylock(&q_stl->lock) != 0) { - /* We now have locked q and q_stl */ + /* Failed? --> Unlock the 1st queue and + try again */ + lock_unlock(&q->lock); + continue; + } + } + + /* We now have locked q and q_stl */ - /* Try to get a task from that random queue */ + /* Try to get a task from that random queue */ TIMER_TIC; res = queue_gettask(q_stl, prev, 0); TIMER_TOC(timer_qsteal); - + /* Lucky? i.e. did we actually get a task? 
*/ - if (res != NULL){ + if (res != NULL) { - /*A.Nasar: Get task type*/ - enum task_types type = res->type; - enum task_subtypes subtype = res->subtype; + /*A.Nasar: Get task type*/ + enum task_types type = res->type; + enum task_subtypes subtype = res->subtype; - /*Move counter from the robbed to the robber*/ - if ((type == task_type_self || type == task_type_sub_self)&& + /*Move counter from the robbed to the robber*/ + if ((type == task_type_self || type == task_type_sub_self) && subtype == task_subtype_gpu_pack) { q->n_packs_self_left_d--; q_stl->n_packs_self_left_d--; atomic_inc(&s->s_d_left[qid]); atomic_dec(&s->s_d_left[qstl_id]); } - if ((type == task_type_self || type == task_type_sub_self)&& + if ((type == task_type_self || type == task_type_sub_self) && subtype == task_subtype_gpu_pack_g) { q->n_packs_self_left_g--; q_stl->n_packs_self_left_g--; atomic_inc(&s->s_g_left[qid]); atomic_dec(&s->s_g_left[qstl_id]); } - if ((type == task_type_self || type == task_type_sub_self)&& + if ((type == task_type_self || type == task_type_sub_self) && subtype == task_subtype_gpu_pack_f) { q->n_packs_self_left_f--; q_stl->n_packs_self_left_f--; atomic_inc(&s->s_f_left[qid]); atomic_dec(&s->s_f_left[qstl_id]); } - if ((type == task_type_pair || type == task_type_sub_pair)&& + if ((type == task_type_pair || type == task_type_sub_pair) && subtype == task_subtype_gpu_pack) { q->n_packs_pair_left_d--; q_stl->n_packs_pair_left_d--; atomic_inc(&s->p_d_left[qid]); atomic_dec(&s->p_d_left[qstl_id]); } - if ((type == task_type_pair || type == task_type_sub_pair)&& + if ((type == task_type_pair || type == task_type_sub_pair) && subtype == task_subtype_gpu_pack_g) { q->n_packs_pair_left_g--; q_stl->n_packs_pair_left_g--; atomic_inc(&s->p_g_left[qid]); atomic_dec(&s->p_g_left[qstl_id]); } - if ((type == task_type_pair || type == task_type_sub_pair)&& + if ((type == task_type_pair || type == task_type_sub_pair) && subtype == task_subtype_gpu_pack_f) { q->n_packs_pair_left_f--; q_stl->n_packs_pair_left_f--; @@ -3273,13 +3273,13 @@ struct task *scheduler_gettask(struct scheduler *s, int qid, /* Run with the task */ break; } else { - + /* Reduce the size of the list of non-empty queues */ qids[ind] = qids[--count]; } - lock_unlock(&q->lock); - lock_unlock(&q_stl->lock); + lock_unlock(&q->lock); + lock_unlock(&q_stl->lock); } if (res != NULL) break; } @@ -3295,9 +3295,11 @@ struct task *scheduler_gettask(struct scheduler *s, int qid, pthread_mutex_lock(&s->sleep_mutex); res = queue_gettask(&s->queues[qid], prev, 1); if (res == NULL && s->waiting > 0) { - struct queue qq = s->queues[qid]; -// message("s->waiting %i self_stolen %i, self_left %i, pair_stolen %i, pair_left %i", s->waiting, -// qq.n_packs_self_stolen_f, qq.n_packs_self_left_f, qq.n_packs_pair_stolen_f, qq.n_packs_pair_left_f); + struct queue qq = s->queues[qid]; + // message("s->waiting %i self_stolen %i, self_left %i, pair_stolen + // %i, pair_left %i", s->waiting, qq.n_packs_self_stolen_f, + // qq.n_packs_self_left_f, qq.n_packs_pair_stolen_f, + // qq.n_packs_pair_left_f); pthread_cond_wait(&s->sleep_cond, &s->sleep_mutex); } pthread_mutex_unlock(&s->sleep_mutex); @@ -3345,13 +3347,13 @@ void scheduler_init(struct scheduler *s, struct space *space, int nr_tasks, for (int k = 0; k < nr_queues; k++) queue_init(&s->queues[k], NULL); /* Initialize each queue. 
*/ - for (int k = 0; k < nr_queues; k++){ - s->s_d_left = (volatile int *) malloc(sizeof(volatile int) * nr_queues); - s->s_g_left = (volatile int *) malloc(sizeof(volatile int) * nr_queues); - s->s_f_left = (volatile int *) malloc(sizeof(volatile int) * nr_queues); - s->p_d_left = (volatile int *) malloc(sizeof(volatile int) * nr_queues); - s->p_g_left = (volatile int *) malloc(sizeof(volatile int) * nr_queues); - s->p_f_left = (volatile int *) malloc(sizeof(volatile int) * nr_queues); + for (int k = 0; k < nr_queues; k++) { + s->s_d_left = (volatile int *)malloc(sizeof(volatile int) * nr_queues); + s->s_g_left = (volatile int *)malloc(sizeof(volatile int) * nr_queues); + s->s_f_left = (volatile int *)malloc(sizeof(volatile int) * nr_queues); + s->p_d_left = (volatile int *)malloc(sizeof(volatile int) * nr_queues); + s->p_g_left = (volatile int *)malloc(sizeof(volatile int) * nr_queues); + s->p_f_left = (volatile int *)malloc(sizeof(volatile int) * nr_queues); } /* Init the sleep mutex and cond. */ diff --git a/src/scheduler.h b/src/scheduler.h index 57e6857b7c..b7f8b9f2ad 100644 --- a/src/scheduler.h +++ b/src/scheduler.h @@ -353,6 +353,7 @@ void scheduler_dump_queues(struct engine *e); void scheduler_report_task_times(const struct scheduler *s, const int nr_threads); struct task *enqueue_dependencies(struct scheduler *s, struct task *t); -struct task *signal_sleeping_runners(struct scheduler *s, struct task *t, int tasks_packed); +struct task *signal_sleeping_runners(struct scheduler *s, struct task *t, + int tasks_packed); #endif /* SWIFT_SCHEDULER_H */ diff --git a/src/space_regrid.c b/src/space_regrid.c index 95fa4d9cd9..487fe7c0e3 100644 --- a/src/space_regrid.c +++ b/src/space_regrid.c @@ -388,7 +388,7 @@ void space_regrid(struct space *s, int verbose) { // message( "rebuilding upper-level cells took %.3f %s." , // clocks_from_ticks(double)(getticks() - tic), clocks_getunit()); - } /* re-build upper-level cells? */ + } /* re-build upper-level cells? */ else { /* Otherwise, just clean up the cells. */ /* Free the old cells, if they were allocated. */ diff --git a/src/space_split.c b/src/space_split.c index 0b79d5b23f..43e020d5e5 100644 --- a/src/space_split.c +++ b/src/space_split.c @@ -439,7 +439,7 @@ void space_split_recursive(struct space *s, struct cell *c, gravity_multipole_compute_power(&c->grav.multipole->m_pole); } /* Deal with gravity */ - } /* Split or let it be? */ + } /* Split or let it be? */ /* Otherwise, collect the data from the particles this cell. */ else { diff --git a/src/task.c b/src/task.c index 4142a76674..d9d02529cb 100644 --- a/src/task.c +++ b/src/task.c @@ -194,22 +194,22 @@ MPI_Comm subtaskMPI_comms[task_subtype_count]; * @param ARRAY is the array of this specific type. * @param COUNT is the number of elements in the array. 
*/ -#define TASK_CELL_OVERLAP(TYPE, ARRAY, COUNT) \ - __attribute__((always_inline)) \ - INLINE static size_t task_cell_overlap_##TYPE( \ - const struct cell *restrict ci, const struct cell *restrict cj) { \ - \ - if (ci == NULL || cj == NULL) return 0; \ - \ - if (ci->ARRAY <= cj->ARRAY && \ - ci->ARRAY + ci->COUNT >= cj->ARRAY + cj->COUNT) { \ - return cj->COUNT; \ - } else if (cj->ARRAY <= ci->ARRAY && \ - cj->ARRAY + cj->COUNT >= ci->ARRAY + ci->COUNT) { \ - return ci->COUNT; \ - } \ - \ - return 0; \ +#define TASK_CELL_OVERLAP(TYPE, ARRAY, COUNT) \ + __attribute__((always_inline)) INLINE static size_t \ + task_cell_overlap_##TYPE(const struct cell *restrict ci, \ + const struct cell *restrict cj) { \ + \ + if (ci == NULL || cj == NULL) return 0; \ + \ + if (ci->ARRAY <= cj->ARRAY && \ + ci->ARRAY + ci->COUNT >= cj->ARRAY + cj->COUNT) { \ + return cj->COUNT; \ + } else if (cj->ARRAY <= ci->ARRAY && \ + cj->ARRAY + cj->COUNT >= ci->ARRAY + ci->COUNT) { \ + return ci->COUNT; \ + } \ + \ + return 0; \ } TASK_CELL_OVERLAP(part, hydro.parts, hydro.count); @@ -1746,13 +1746,15 @@ void task_dump_active(struct engine *e) { /* Get destination rank of MPI requests. */ int paired = (t->cj != NULL); int otherrank = 0; - //A. N.: Mods requied to stop code crashing when debugging GPU tasks - if(t->subtype!= task_subtype_gpu_unpack && t->subtype!= task_subtype_gpu_unpack_f - && t->subtype!= task_subtype_gpu_unpack_g) - otherrank = t->ci->nodeID; - if (paired && t->subtype!= task_subtype_gpu_unpack && t->subtype!= task_subtype_gpu_unpack_f - && t->subtype!= task_subtype_gpu_unpack_g) - otherrank = t->cj->nodeID; + // A. N.: Mods requied to stop code crashing when debugging GPU tasks + if (t->subtype != task_subtype_gpu_unpack && + t->subtype != task_subtype_gpu_unpack_f && + t->subtype != task_subtype_gpu_unpack_g) + otherrank = t->ci->nodeID; + if (paired && t->subtype != task_subtype_gpu_unpack && + t->subtype != task_subtype_gpu_unpack_f && + t->subtype != task_subtype_gpu_unpack_g) + otherrank = t->cj->nodeID; fprintf(file_thread, "%i %i %s %s %i %i %lli %lli %i %i %i %i %lli\n", engine_rank, otherrank, taskID_names[t->type], From 532e06f1fae7bd2536d5578219352fd588c9eda2 Mon Sep 17 00:00:00 2001 From: Matthieu Schaller Date: Mon, 27 Jan 2025 23:01:13 +0100 Subject: [PATCH 144/217] Rename the density pack and unpack tasks such that they have a _d suffix to match the gradient and force ones --- src/engine.c | 8 ++-- src/engine_maketasks.c | 46 ++++++++++---------- src/engine_marktasks.c | 17 ++++---- src/runner_doiact_functions_hydro_gpu.h | 23 ++++++---- src/scheduler.c | 58 ++++++++++++------------- src/task.c | 28 ++++++------ src/task.h | 4 +- 7 files changed, 95 insertions(+), 89 deletions(-) diff --git a/src/engine.c b/src/engine.c index 606e246a09..023885cb0c 100644 --- a/src/engine.c +++ b/src/engine.c @@ -2218,10 +2218,10 @@ void engine_init_particles(struct engine *e, int flag_entropy_ICs, //// if(tmp_t->subtype == task_subtype_force){ //// if(tmp_t->skip == 1)error("inactive force task"); //// } - // if(tmp_t->subtype == task_subtype_gpu_pack){ + // if(tmp_t->subtype == task_subtype_gpu_pack_d){ // if(tmp_t->skip == 1)error("inactive pack task"); // } - // if(tmp_t->subtype == task_subtype_gpu_unpack){ + // if(tmp_t->subtype == task_subtype_gpu_unpack_d){ // if(tmp_t->skip == 1)error("inactive unpack task"); // } // } @@ -2320,10 +2320,10 @@ void engine_init_particles(struct engine *e, int flag_entropy_ICs, // if(tmp_t->subtype == task_subtype_force){ // if(tmp_t->skip == 
1)error("inactive force task"); // } - // if(tmp_t->subtype == task_subtype_gpu_pack){ + // if(tmp_t->subtype == task_subtype_gpu_pack_d){ // if(tmp_t->skip == 1)error("inactive pack task"); // } - // if(tmp_t->subtype == task_subtype_gpu_unpack){ + // if(tmp_t->subtype == task_subtype_gpu_unpack_d){ // if(tmp_t->skip == 1)error("inactive unpack task"); // } // } diff --git a/src/engine_maketasks.c b/src/engine_maketasks.c index 85e5df2493..90bf7b3dcd 100644 --- a/src/engine_maketasks.c +++ b/src/engine_maketasks.c @@ -2161,7 +2161,7 @@ void engine_count_and_link_tasks_mapper(void *map_data, int num_elements, if (t_subtype == task_subtype_density) { engine_addlink(e, &ci->hydro.density, t); - } else if (t_subtype == task_subtype_gpu_pack) { // A. Nasar + } else if (t_subtype == task_subtype_gpu_pack_d) { // A. Nasar engine_addlink(e, &ci->hydro.density_pack, t); // } else if (t_subtype == task_subtype_gpu_pack_f) { // engine_addlink(e, &ci->hydro.force_pack, t); @@ -2181,7 +2181,7 @@ void engine_count_and_link_tasks_mapper(void *map_data, int num_elements, if (t_subtype == task_subtype_density) { engine_addlink(e, &ci->hydro.density, t); engine_addlink(e, &cj->hydro.density, t); - } else if (t_subtype == task_subtype_gpu_pack) { // A. Nasar + } else if (t_subtype == task_subtype_gpu_pack_d) { // A. Nasar engine_addlink(e, &ci->hydro.density_pack, t); engine_addlink(e, &cj->hydro.density_pack, t); // } else if (t_subtype == task_subtype_gpu_pack_f) { @@ -2206,7 +2206,7 @@ void engine_count_and_link_tasks_mapper(void *map_data, int num_elements, if (t_subtype == task_subtype_density) { engine_addlink(e, &ci->hydro.density, t); - } else if (t_subtype == task_subtype_gpu_pack) { // A. Nasar + } else if (t_subtype == task_subtype_gpu_pack_d) { // A. Nasar engine_addlink(e, &ci->hydro.density_pack, t); // error("Abouzied: you need to code this up!"); } else if (t_subtype == task_subtype_gpu_pack_f) { @@ -2229,7 +2229,7 @@ void engine_count_and_link_tasks_mapper(void *map_data, int num_elements, if (t_subtype == task_subtype_density) { engine_addlink(e, &ci->hydro.density, t); engine_addlink(e, &cj->hydro.density, t); - } else if (t_subtype == task_subtype_gpu_pack) { + } else if (t_subtype == task_subtype_gpu_pack_d) { engine_addlink(e, &ci->hydro.density_pack, t); engine_addlink(e, &cj->hydro.density_pack, t); // error("Abouzied: you need to code this up!"); @@ -2550,7 +2550,7 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, } /*Make packing depend on sorts and drift A. Nasar */ - else if (t_type == task_type_self && t_subtype == task_subtype_gpu_pack) { + else if (t_type == task_type_self && t_subtype == task_subtype_gpu_pack_d) { scheduler_addunlock(sched, ci->hydro.super->hydro.drift, t); } @@ -2834,7 +2834,7 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, } /*Make packing depend on sorts and drift A. Nasar */ - else if (t_type == task_type_pair && t_subtype == task_subtype_gpu_pack) { + else if (t_type == task_type_pair && t_subtype == task_subtype_gpu_pack_d) { /* Make all density tasks depend on the drift */ if (ci->nodeID == nodeID) { scheduler_addunlock(sched, ci->hydro.super->hydro.drift, t); @@ -3426,7 +3426,7 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, } /*Make packing depend on sorts and drift A. 
Nasar */ else if (t_type == task_type_sub_self && - t_subtype == task_subtype_gpu_pack) { + t_subtype == task_subtype_gpu_pack_d) { scheduler_addunlock(sched, ci->hydro.super->hydro.drift, t); // scheduler_addunlock(sched, ci->hydro.super->hydro.sorts, t); } @@ -3715,7 +3715,7 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, /* Otherwise, sub-pair interaction? */ else if (t_type == task_type_sub_pair && - t_subtype == task_subtype_gpu_pack) { + t_subtype == task_subtype_gpu_pack_d) { /* Make all density pack tasks depend on the drift */ if (ci->nodeID == nodeID) { scheduler_addunlock(sched, ci->hydro.super->hydro.drift, t); @@ -4358,8 +4358,8 @@ void engine_make_hydroloop_tasks_mapper(void *map_data, int num_elements, scheduler_addtask(sched, task_type_self, task_subtype_density, 0, 0, ci, NULL); // A. Nasar also add a pack task for GPU - scheduler_addtask(sched, task_type_self, task_subtype_gpu_pack, 0, 0, ci, - NULL); + scheduler_addtask(sched, task_type_self, task_subtype_gpu_pack_d, 0, 0, + ci, NULL); } /* Now loop over all the neighbours of this cell */ @@ -4393,7 +4393,7 @@ void engine_make_hydroloop_tasks_mapper(void *map_data, int num_elements, const int sid = sortlistID[(kk + 1) + 3 * ((jj + 1) + 3 * (ii + 1))]; scheduler_addtask(sched, task_type_pair, task_subtype_density, sid, 0, ci, cj); - scheduler_addtask(sched, task_type_pair, task_subtype_gpu_pack, sid, + scheduler_addtask(sched, task_type_pair, task_subtype_gpu_pack_d, sid, 0, ci, cj); // A. Nasar #ifdef SWIFT_DEBUG_CHECKS @@ -4936,17 +4936,17 @@ void engine_maketasks(struct engine *e) { // for (int i = 0; i < sched->nr_tasks; i++) { // struct task * t = &sched->tasks[i]; // if(t->type == task_type_sub_self && t->subtype == - //task_subtype_gpu_pack){ + // task_subtype_gpu_pack_d){ // t->type = task_type_self; // fprintf(stderr, "sub_self"); // } // if(t->type == task_type_sub_pair && t->subtype == - // task_subtype_gpu_pack){ + // task_subtype_gpu_pack_d){ // t->type = task_type_pair; // fprintf(stderr, "sub_pair"); // } // if(t->type == task_type_sub_self && t->subtype == - //task_subtype_gpu_pack_g){ + // task_subtype_gpu_pack_g){ // t->type = task_type_self; // fprintf(stderr, "sub_self"); // } @@ -4956,7 +4956,7 @@ void engine_maketasks(struct engine *e) { // fprintf(stderr, "sub_pair"); // } // if(t->type == task_type_sub_self && t->subtype == - //task_subtype_gpu_pack_f){ + // task_subtype_gpu_pack_f){ // t->type = task_type_self; // fprintf(stderr, "sub_self"); // } @@ -4984,13 +4984,13 @@ void engine_maketasks(struct engine *e) { for (int i = 0; i < sched->nr_tasks; i++) { struct task *t = &sched->tasks[i]; - if (t->subtype != task_subtype_gpu_pack) continue; + if (t->subtype != task_subtype_gpu_pack_d) continue; if (t->type == task_type_self || t->type == task_type_sub_self) { if (count_current_self % pack_size == 0) { last_created_self_unpack = scheduler_addtask( - sched, task_type_self, task_subtype_gpu_unpack, 0, 0, NULL, NULL); + sched, task_type_self, task_subtype_gpu_unpack_d, 0, 0, NULL, NULL); last_created_self_unpack->gpu_done = 0; } @@ -5009,7 +5009,7 @@ void engine_maketasks(struct engine *e) { else if (t->type == task_type_pair || t->type == task_type_sub_pair) { if (count_current_pair % pack_size_pair == 0) { last_created_pair_unpack = scheduler_addtask( - sched, task_type_pair, task_subtype_gpu_unpack, 0, 0, NULL, NULL); + sched, task_type_pair, task_subtype_gpu_unpack_d, 0, 0, NULL, NULL); } scheduler_addunlock(sched, t, last_created_pair_unpack); @@ -5176,7 
+5176,7 @@ void engine_maketasks(struct engine *e) { // if(t->ci != NULL){ //// if(t->type == task_type_pair && ((t->ci->split && !t->cj->split) || ///(!t->ci->split && t->cj->split))) / error("one is split the other - ///isn't"); + /// isn't"); // if(t->ci->hydro.count > 80 && t->type == task_type_self) // error("Count is %i task subtype (%s)", // t->ci->hydro.count, subtaskID_names[t->subtype]); @@ -5342,10 +5342,10 @@ void engine_maketasks(struct engine *e) { t->subtype == task_subtype_force) { t->implicit = 1; } - // if (t->subtype == task_subtype_gpu_pack || + // if (t->subtype == task_subtype_gpu_pack_d || // t->subtype == task_subtype_gpu_pack_g || // t->subtype == task_subtype_gpu_pack_f || - // t->subtype == task_subtype_gpu_unpack || + // t->subtype == task_subtype_gpu_unpack_d || // t->subtype == task_subtype_gpu_unpack_g || // t->subtype == task_subtype_gpu_unpack_f){ // t->implicit = 1; @@ -5355,10 +5355,10 @@ void engine_maketasks(struct engine *e) { // t->subtype == task_subtype_gpu_unpack_g || // t->subtype == task_subtype_gpu_unpack_f){// || //// (t->type == task_type_pair && - //// t->subtype == task_subtype_gpu_pack)){ + //// t->subtype == task_subtype_gpu_pack_d)){ // t->implicit = 1; // } - // if ((t->subtype == task_subtype_gpu_pack || + // if ((t->subtype == task_subtype_gpu_pack_d || // t->subtype == task_subtype_gpu_pack_g || // t->subtype == task_subtype_gpu_pack_f) && // (t->type == task_type_sub_pair || diff --git a/src/engine_marktasks.c b/src/engine_marktasks.c index 15be210a22..4504a07f95 100644 --- a/src/engine_marktasks.c +++ b/src/engine_marktasks.c @@ -89,7 +89,7 @@ void engine_marktasks_mapper(void *map_data, int num_elements, // Activate GPU unpack tasks (cell-less dummy tasks so need activating // separately) if (t_type == task_type_self && - (t_subtype == task_subtype_gpu_unpack || + (t_subtype == task_subtype_gpu_unpack_d || t_subtype == task_subtype_gpu_unpack_g || t_subtype == task_subtype_gpu_unpack_f)) { // A. Nasar scheduler_activate(s, t); @@ -97,7 +97,7 @@ void engine_marktasks_mapper(void *map_data, int num_elements, } if (t_type == task_type_pair && - (t_subtype == task_subtype_gpu_unpack || + (t_subtype == task_subtype_gpu_unpack_d || t_subtype == task_subtype_gpu_unpack_g || t_subtype == task_subtype_gpu_unpack_f)) { // A. Nasar scheduler_activate(s, t); @@ -115,9 +115,9 @@ void engine_marktasks_mapper(void *map_data, int num_elements, #ifndef WITH_CUDA // A. Nasar if (ci->nodeID != nodeID) error("Non-local self task found"); #else - if (ci->nodeID != nodeID && t_subtype != task_subtype_gpu_unpack && - t_subtype != task_subtype_gpu_unpack_f && - t_subtype != task_subtype_gpu_unpack_g) { + if ((ci->nodeID != nodeID) && (t_subtype != task_subtype_gpu_unpack_d) && + (t_subtype != task_subtype_gpu_unpack_f) && + (t_subtype != task_subtype_gpu_unpack_g)) { fprintf(stderr, "task is %i\n", subtaskID_names[t->subtype]); error("Non-local self task found. Task is subtaskID_names[%s]", subtaskID_names[t->subtype]); @@ -145,7 +145,8 @@ void engine_marktasks_mapper(void *map_data, int num_elements, } /* Activate packing for GPU A. Nasar */ - else if (t_type == task_type_self && t_subtype == task_subtype_gpu_pack) { + else if (t_type == task_type_self && + t_subtype == task_subtype_gpu_pack_d) { if (ci_active_hydro) { scheduler_activate(s, t); ci->pack_done = 0; @@ -189,7 +190,7 @@ void engine_marktasks_mapper(void *map_data, int num_elements, /* Store current values of dx_max and h_max. A. 
Nasar: Unsure if we actually need this*/ else if (t_type == task_type_sub_self && - t_subtype == task_subtype_gpu_pack) { + t_subtype == task_subtype_gpu_pack_d) { if (ci_active_hydro) { scheduler_activate(s, t); } @@ -482,7 +483,7 @@ void engine_marktasks_mapper(void *map_data, int num_elements, const int cj_active_rt = cell_is_rt_active(cj, e); /* Activate packing for GPU A. Nasar */ - if (t_subtype == task_subtype_gpu_pack && + if (t_subtype == task_subtype_gpu_pack_d && ((ci_active_hydro && ci_nodeID == nodeID) || (cj_active_hydro && cj_nodeID == nodeID))) { scheduler_activate(s, t); diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index 07682a2eaf..4aa0820074 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -1243,7 +1243,8 @@ void runner_doself1_launch(struct runner *r, struct scheduler *s, // // // Get error code if (cu_error != cudaSuccess) { fprintf( // stderr, "CUDA error in density self host 2 device - // memcpy: %s cpuid id is: %i\n ", cudaGetErrorString(cu_error), r->cpuid); + // memcpy: %s cpuid id is: %i\n ", + // cudaGetErrorString(cu_error), r->cpuid); // exit(0); // } // #endif @@ -1284,8 +1285,9 @@ void runner_doself1_launch(struct runner *r, struct scheduler *s, // // // Get error code if (cu_error != cudaSuccess) { // fprintf(stderr, "CUDA error with self density - // D2H memcpy: %s cpuid id is: %i\n ", cudaGetErrorString(cu_error), - //r->cpuid); error("Something's up with your cuda code"); + // D2H memcpy: %s cpuid id is: %i\n ", + // cudaGetErrorString(cu_error), + // r->cpuid); error("Something's up with your cuda code"); // } // #endif } /*End of looping over bundles to launch in streams*/ @@ -1434,7 +1436,7 @@ void runner_doself1_launch_f4( cudaMemcpyHostToDevice, stream[bid]); // cudaError_t cu_error = cudaPeekAtLastError(); // cudaGetLastError(); //// if (cu_error != cudaSuccess) { fprintf( - ///stderr, "CUDA error in density + /// stderr, "CUDA error in density // self host 2 device memcpy: %s cpuid id is: %i\n ", // cudaGetErrorString(cu_error), r->cpuid); // exit(0); @@ -1453,7 +1455,8 @@ void runner_doself1_launch_f4( // // // Get error code if (cu_error != cudaSuccess) { fprintf( // stderr, "CUDA error in density self host 2 device - // memcpy: %s cpuid id is: %i\n ", cudaGetErrorString(cu_error), r->cpuid); + // memcpy: %s cpuid id is: %i\n ", + // cudaGetErrorString(cu_error), r->cpuid); // exit(0); // } // #endif @@ -1495,8 +1498,9 @@ void runner_doself1_launch_f4( // // // Get error code if (cu_error != cudaSuccess) { // fprintf(stderr, "CUDA error with self density - // D2H memcpy: %s cpuid id is: %i\n ", cudaGetErrorString(cu_error), - //r->cpuid); error("Something's up with your cuda code"); + // D2H memcpy: %s cpuid id is: %i\n ", + // cudaGetErrorString(cu_error), + // r->cpuid); error("Something's up with your cuda code"); // } // #endif } /*End of looping over bundles to launch in streams*/ @@ -2545,8 +2549,9 @@ void runner_dopair1_launch(struct runner *r, struct scheduler *s, // // // Get error code if (cu_error != cudaSuccess) { // fprintf(stderr, "CUDA error with self density - // D2H memcpy: %s cpuid id is: %i\n ", cudaGetErrorString(cu_error), - //r->cpuid); error("Something's up with your cuda code"); + // D2H memcpy: %s cpuid id is: %i\n ", + // cudaGetErrorString(cu_error), + // r->cpuid); error("Something's up with your cuda code"); // } // #endif } /*End of looping over bundles to launch in streams*/ diff --git a/src/scheduler.c 
b/src/scheduler.c index e96e10dd63..a10e1f905e 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -901,7 +901,7 @@ void scheduler_write_cell_dependencies(struct scheduler *s, int verbose, int local_count = 0; for (int i = 0; i < s->nr_tasks; i++) { const struct task *ta = &s->tasks[i]; - // if(ta->subtype == task_subtype_gpu_unpack + // if(ta->subtype == task_subtype_gpu_unpack_d // || ta->subtype == task_subtype_gpu_unpack_f // || ta->subtype == task_subtype_gpu_unpack_g)continue; /* Are we using this task? @@ -955,7 +955,7 @@ void scheduler_write_cell_dependencies(struct scheduler *s, int verbose, /* and their dependencies */ for (int j = 0; j < ta->nr_unlock_tasks; j++) { const struct task *tb = ta->unlock_tasks[j]; - if (tb->subtype == task_subtype_gpu_unpack || + if (tb->subtype == task_subtype_gpu_unpack_d || tb->subtype == task_subtype_gpu_unpack_f || tb->subtype == task_subtype_gpu_unpack_g) continue; @@ -1656,11 +1656,11 @@ void scheduler_splittasks_mapper(void *map_data, int num_elements, } else if (t->subtype == task_subtype_grav) { scheduler_splittask_gravity(t, s); // if task is gpu task do not split A. Nasar - } else if (t->subtype == task_subtype_gpu_pack || + } else if (t->subtype == task_subtype_gpu_pack_d || t->subtype == task_subtype_gpu_pack_g || t->subtype == task_subtype_gpu_pack_f) { scheduler_splittask_hydro(t, s); - } else if (t->subtype == task_subtype_gpu_unpack || + } else if (t->subtype == task_subtype_gpu_unpack_d || t->subtype == task_subtype_gpu_unpack_g || t->subtype == task_subtype_gpu_unpack_f) { /*Do nothing and grab next task to split. @@ -1768,7 +1768,7 @@ struct task *scheduler_addtask(struct scheduler *s, enum task_types type, if (cj != NULL) cell_set_flag(cj, cell_flag_has_tasks); // #ifdef WITH_CUDA A. Nasar - if (t->subtype == task_subtype_gpu_pack) { + if (t->subtype == task_subtype_gpu_pack_d) { if (t->type == task_type_self || t->type == task_type_sub_self) atomic_inc(&s->nr_self_pack_tasks_d); if (t->type == task_type_pair || t->type == task_type_sub_pair) @@ -1874,7 +1874,7 @@ void scheduler_set_unlocks(struct scheduler *s) { for (int j = i + 1; j < t->nr_unlock_tasks; j++) { /*Fix for the case when one unpack task works over the same cell * connected to two pair pack tasks*/ - if (t->subtype == task_subtype_gpu_unpack || + if (t->subtype == task_subtype_gpu_unpack_d || t->subtype == task_subtype_gpu_unpack_g || t->subtype == task_subtype_gpu_unpack_f) { continue; @@ -2060,13 +2060,13 @@ void scheduler_reweight(struct scheduler *s, int verbose) { cost = 1.f * (wscale * gcount_i) * gcount_i; } else if (t->subtype == task_subtype_external_grav) cost = 1.f * wscale * gcount_i; - else if (t->subtype == task_subtype_gpu_pack) // A. Nasar - cost = 1.f * (wscale * count_i) * count_i; // * s->pack_size; + else if (t->subtype == task_subtype_gpu_pack_d) // A. Nasar + cost = 1.f * (wscale * count_i) * count_i; // * s->pack_size; else if (t->subtype == task_subtype_gpu_pack_f) cost = 1.f * (wscale * count_i) * count_i; // * s->pack_size; else if (t->subtype == task_subtype_gpu_pack_g) cost = 1.f * (wscale * count_i) * count_i; // * s->pack_size; - else if (t->subtype == task_subtype_gpu_unpack) + else if (t->subtype == task_subtype_gpu_unpack_d) cost = 1.f * wscale * s->pack_size; else if (t->subtype == task_subtype_gpu_unpack_f) cost = 1.f * wscale * s->pack_size; @@ -2111,13 +2111,13 @@ void scheduler_reweight(struct scheduler *s, int verbose) { else cost = 2.f * (wscale * gcount_i) * gcount_j; // Abouzied: Think about good cost (for rainy days) A. 
Nasar - } else if (t->subtype == task_subtype_gpu_pack) { + } else if (t->subtype == task_subtype_gpu_pack_d) { cost = 2.f * (wscale * count_i) * count_i; } else if (t->subtype == task_subtype_gpu_pack_f) { cost = 2.f * (wscale * count_i) * count_i; } else if (t->subtype == task_subtype_gpu_pack_g) { cost = 2.f * (wscale * count_i) * count_i; - } else if (t->subtype == task_subtype_gpu_unpack) { + } else if (t->subtype == task_subtype_gpu_unpack_d) { cost = 1.f * wscale; } else if (t->subtype == task_subtype_gpu_unpack_f) { cost = 1.f * wscale; @@ -2254,13 +2254,13 @@ void scheduler_reweight(struct scheduler *s, int verbose) { } else if (t->subtype == task_subtype_do_bh_swallow) { cost = 1.f * wscale * (bcount_i + bcount_j); - } else if (t->subtype == task_subtype_gpu_pack) { + } else if (t->subtype == task_subtype_gpu_pack_d) { cost = 2.f * (wscale * count_i) * count_i; } else if (t->subtype == task_subtype_gpu_pack_f) { cost = 2.f * (wscale * count_i) * count_i; } else if (t->subtype == task_subtype_gpu_pack_g) { cost = 2.f * (wscale * count_i) * count_i; - } else if (t->subtype == task_subtype_gpu_unpack) { + } else if (t->subtype == task_subtype_gpu_unpack_d) { cost = 1.f * wscale; } else if (t->subtype == task_subtype_gpu_unpack_f) { cost = 1.f * wscale; @@ -2304,13 +2304,13 @@ void scheduler_reweight(struct scheduler *s, int verbose) { cost = 1.f * wscale * count_i; } else if (t->subtype == task_subtype_do_bh_swallow) { cost = 1.f * wscale * bcount_i; - } else if (t->subtype == task_subtype_gpu_pack) // A. Nasar - cost = 1.f * (wscale * count_i) * count_i; // * s->pack_size; + } else if (t->subtype == task_subtype_gpu_pack_d) // A. Nasar + cost = 1.f * (wscale * count_i) * count_i; // * s->pack_size; else if (t->subtype == task_subtype_gpu_pack_f) cost = 1.f * (wscale * count_i) * count_i; // * s->pack_size; else if (t->subtype == task_subtype_gpu_pack_g) cost = 1.f * (wscale * count_i) * count_i; // * s->pack_size; - else if (t->subtype == task_subtype_gpu_unpack) + else if (t->subtype == task_subtype_gpu_unpack_d) cost = 1.f * wscale * s->pack_size; else if (t->subtype == task_subtype_gpu_unpack_f) cost = 1.f * wscale * s->pack_size; @@ -2479,7 +2479,7 @@ void scheduler_rewait_mapper(void *map_data, int num_elements, // if (t->type == task_type_self){ // A. Nasar increment number of // waiting tasks - // if(t->subtype == task_subtype_gpu_pack) + // if(t->subtype == task_subtype_gpu_pack_d) // atomic_inc(&s->queues[t->ci->hydro.super->owner].n_packs_self_left); // if (t->subtype == task_subtype_gpu_pack_f) // atomic_inc(&s->queues[t->ci->hydro.super->owner].n_packs_self_left_f); @@ -2488,7 +2488,7 @@ void scheduler_rewait_mapper(void *map_data, int num_elements, // } // // if (t->type == task_type_pair){ - // if(t->subtype == task_subtype_gpu_pack) + // if(t->subtype == task_subtype_gpu_pack_d) // atomic_inc(&s->queues[t->ci->hydro.super->owner].n_packs_pair_left); // if (t->subtype == task_subtype_gpu_pack_f) // atomic_inc(&s->queues[t->ci->hydro.super->owner].n_packs_pair_left_f); @@ -2627,7 +2627,7 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) { t->subtype == task_subtype_external_grav) { qid = t->ci->grav.super->owner; owner = &t->ci->grav.super->owner; - } else if (t->subtype == task_subtype_gpu_pack) { // A. Nasar + } else if (t->subtype == task_subtype_gpu_pack_d) { // A. 
Nasar qid = t->ci->hydro.super->owner; owner = &t->ci->hydro.super->owner; } else if (t->subtype == task_subtype_gpu_pack_f) { @@ -2636,7 +2636,7 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) { } else if (t->subtype == task_subtype_gpu_pack_g) { qid = t->ci->hydro.super->owner; owner = &t->ci->hydro.super->owner; - } else if (t->subtype == task_subtype_gpu_unpack) { + } else if (t->subtype == task_subtype_gpu_unpack_d) { qid = -1; } else if (t->subtype == task_subtype_gpu_unpack_f) { qid = -1; @@ -2668,7 +2668,7 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) { break; case task_type_pair: case task_type_sub_pair: - if (t->subtype == task_subtype_gpu_unpack || + if (t->subtype == task_subtype_gpu_unpack_d || t->subtype == task_subtype_gpu_unpack_f || t->subtype == task_subtype_gpu_unpack_g) { qid = -1; @@ -2897,7 +2897,7 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) { queue_insert(&s->queues[qid], t); /* A. Nasar: Increment counters required for the pack tasks */ if (t->type == task_type_self || t->type == task_type_sub_self) { - if (t->subtype == task_subtype_gpu_pack) { + if (t->subtype == task_subtype_gpu_pack_d) { lock_lock(&s->queues[qid].lock); s->queues[qid].n_packs_self_left_d++; lock_unlock(&s->queues[qid].lock); @@ -2919,7 +2919,7 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) { /* A. Nasar NEED to think about how to do this with MPI where ci may not be on this node/rank */ if (t->type == task_type_pair || t->type == task_type_sub_pair) { - if (t->subtype == task_subtype_gpu_pack) { + if (t->subtype == task_subtype_gpu_pack_d) { lock_lock(&s->queues[qid].lock); s->queues[qid].n_packs_pair_left_d++; lock_unlock(&s->queues[qid].lock); @@ -3229,7 +3229,7 @@ struct task *scheduler_gettask(struct scheduler *s, int qid, /*Move counter from the robbed to the robber*/ if ((type == task_type_self || type == task_type_sub_self) && - subtype == task_subtype_gpu_pack) { + subtype == task_subtype_gpu_pack_d) { q->n_packs_self_left_d--; q_stl->n_packs_self_left_d--; atomic_inc(&s->s_d_left[qid]); @@ -3250,7 +3250,7 @@ struct task *scheduler_gettask(struct scheduler *s, int qid, atomic_dec(&s->s_f_left[qstl_id]); } if ((type == task_type_pair || type == task_type_sub_pair) && - subtype == task_subtype_gpu_pack) { + subtype == task_subtype_gpu_pack_d) { q->n_packs_pair_left_d--; q_stl->n_packs_pair_left_d--; atomic_inc(&s->p_d_left[qid]); @@ -3297,9 +3297,9 @@ struct task *scheduler_gettask(struct scheduler *s, int qid, if (res == NULL && s->waiting > 0) { struct queue qq = s->queues[qid]; // message("s->waiting %i self_stolen %i, self_left %i, pair_stolen - // %i, pair_left %i", s->waiting, qq.n_packs_self_stolen_f, - // qq.n_packs_self_left_f, qq.n_packs_pair_stolen_f, - // qq.n_packs_pair_left_f); + // %i, pair_left %i", s->waiting, + // qq.n_packs_self_stolen_f, qq.n_packs_self_left_f, + // qq.n_packs_pair_stolen_f, qq.n_packs_pair_left_f); pthread_cond_wait(&s->sleep_cond, &s->sleep_mutex); } pthread_mutex_unlock(&s->sleep_mutex); @@ -3569,7 +3569,7 @@ void scheduler_report_task_times_mapper(void *map_data, int num_elements, const enum task_categories cat = task_get_category(t); time_local[cat] += total_time; - if (t->subtype == task_subtype_gpu_pack || + if (t->subtype == task_subtype_gpu_pack_d || t->subtype == task_subtype_gpu_pack_f || t->subtype == task_subtype_gpu_pack_g) { time_local[task_category_gpu_pack] += diff --git a/src/task.c b/src/task.c index d9d02529cb..cbe9547e9d 100644 --- a/src/task.c +++ b/src/task.c @@ 
-605,7 +605,7 @@ void task_unlock(struct task *t) { #ifdef SWIFT_TASKS_WITHOUT_ATOMICS cell_unlocktree(ci); #endif - } else if (subtype == task_subtype_gpu_unpack) { + } else if (subtype == task_subtype_gpu_unpack_d) { // for(int pp = 0; pp < 128 /*should be sched->pack_size*/; // pp++){ // cell_unlocktree(t->ci_unpack[pp]); @@ -615,7 +615,7 @@ void task_unlock(struct task *t) { /*Do nothing and be on your way*/ } else if (subtype == task_subtype_gpu_unpack_g) { /*Do nothing and be on your way*/ - } else if (subtype == task_subtype_gpu_pack) { + } else if (subtype == task_subtype_gpu_pack_d) { cell_unlocktree(ci); } else if (subtype == task_subtype_gpu_pack_f) { cell_unlocktree(ci); @@ -668,7 +668,7 @@ void task_unlock(struct task *t) { cell_unlocktree(ci); cell_unlocktree(cj); #endif - } else if (subtype == task_subtype_gpu_pack) { + } else if (subtype == task_subtype_gpu_pack_d) { cell_unlocktree(ci); cell_unlocktree(cj); } else if (subtype == task_subtype_gpu_pack_f) { @@ -677,7 +677,7 @@ void task_unlock(struct task *t) { } else if (subtype == task_subtype_gpu_pack_g) { cell_unlocktree(ci); cell_unlocktree(cj); - } else if (subtype == task_subtype_gpu_unpack) { + } else if (subtype == task_subtype_gpu_unpack_d) { /* Nothing to do */ } else if (subtype == task_subtype_gpu_unpack_f) { /* Nothing to do */ @@ -886,7 +886,7 @@ int task_lock(struct task *t) { if (ci->hydro.hold) return 0; if (cell_locktree(ci) != 0) return 0; #endif - } else if (subtype == task_subtype_gpu_pack) { + } else if (subtype == task_subtype_gpu_pack_d) { /* Attempt to lock the cell */ if (ci->hydro.hold) return 0; if (cell_locktree(ci) != 0) return 0; @@ -898,7 +898,7 @@ int task_lock(struct task *t) { /* Attempt to lock the cell */ if (ci->hydro.hold) return 0; if (cell_locktree(ci) != 0) return 0; - } else if (subtype == task_subtype_gpu_unpack) { + } else if (subtype == task_subtype_gpu_unpack_d) { // for(int pp = 0; pp < 128 /*should be sched->pack_size*/; // pp++){ // if (t->ci_unpack[pp]->gpu_done == 0){ @@ -1034,7 +1034,7 @@ int task_lock(struct task *t) { return 0; } #endif - } else if (subtype == task_subtype_gpu_pack) { + } else if (subtype == task_subtype_gpu_pack_d) { /* Lock the parts in both cells */ if (ci->hydro.hold || cj->hydro.hold) return 0; if (cell_locktree(ci) != 0) return 0; @@ -1058,7 +1058,7 @@ int task_lock(struct task *t) { cell_unlocktree(ci); return 0; } - } else if (subtype == task_subtype_gpu_unpack) { + } else if (subtype == task_subtype_gpu_unpack_d) { /* Nothing to do here. */ return 1; } else if (subtype == task_subtype_gpu_unpack_f) { @@ -1231,8 +1231,8 @@ void task_get_group_name(int type, int subtype, char *cluster) { switch (subtype) { /* A. Nasar */ - case task_subtype_gpu_pack: - case task_subtype_gpu_unpack: + case task_subtype_gpu_pack_d: + case task_subtype_gpu_unpack_d: strcpy(cluster, "Density"); break; case task_subtype_gpu_pack_f: @@ -1747,11 +1747,11 @@ void task_dump_active(struct engine *e) { int paired = (t->cj != NULL); int otherrank = 0; // A. 
N.: Mods requied to stop code crashing when debugging GPU tasks - if (t->subtype != task_subtype_gpu_unpack && + if (t->subtype != task_subtype_gpu_unpack_d && t->subtype != task_subtype_gpu_unpack_f && t->subtype != task_subtype_gpu_unpack_g) otherrank = t->ci->nodeID; - if (paired && t->subtype != task_subtype_gpu_unpack && + if (paired && t->subtype != task_subtype_gpu_unpack_d && t->subtype != task_subtype_gpu_unpack_f && t->subtype != task_subtype_gpu_unpack_g) otherrank = t->cj->nodeID; @@ -1881,8 +1881,8 @@ enum task_categories task_get_category(const struct task *t) { case task_subtype_force: return task_category_hydro; - case task_subtype_gpu_pack: // A. Nasar - case task_subtype_gpu_unpack: + case task_subtype_gpu_pack_d: // A. Nasar + case task_subtype_gpu_unpack_d: case task_subtype_gpu_pack_f: case task_subtype_gpu_unpack_f: case task_subtype_gpu_pack_g: diff --git a/src/task.h b/src/task.h index dfce456a4b..c6991751b5 100644 --- a/src/task.h +++ b/src/task.h @@ -160,10 +160,10 @@ enum task_subtypes { task_subtype_sink_do_gas_swallow, task_subtype_rt_gradient, task_subtype_rt_transport, - task_subtype_gpu_pack, // A. Nasar + task_subtype_gpu_pack_d, // A. Nasar task_subtype_gpu_pack_g, task_subtype_gpu_pack_f, - task_subtype_gpu_unpack, + task_subtype_gpu_unpack_d, task_subtype_gpu_unpack_g, task_subtype_gpu_unpack_f, task_subtype_count From 214e1728017a34319acf5ec728d2e9b4474ed684 Mon Sep 17 00:00:00 2001 From: Matthieu Schaller Date: Mon, 27 Jan 2025 23:06:58 +0100 Subject: [PATCH 145/217] Fix compiler warnings --- src/engine_maketasks.c | 2 +- src/engine_marktasks.c | 4 ++-- src/scheduler.c | 36 ++++++++++++++++++++++-------------- 3 files changed, 25 insertions(+), 17 deletions(-) diff --git a/src/engine_maketasks.c b/src/engine_maketasks.c index 90bf7b3dcd..cd0404ded3 100644 --- a/src/engine_maketasks.c +++ b/src/engine_maketasks.c @@ -4353,7 +4353,7 @@ void engine_make_hydroloop_tasks_mapper(void *map_data, int num_elements, continue; /* If the cell is local build a self-interaction */ - struct task *t_pack_self; // A. Nasar + // struct task *t_pack_self; // A. 
Nasar if (ci->nodeID == nodeID) { scheduler_addtask(sched, task_type_self, task_subtype_density, 0, 0, ci, NULL); diff --git a/src/engine_marktasks.c b/src/engine_marktasks.c index 4504a07f95..89f5e41b74 100644 --- a/src/engine_marktasks.c +++ b/src/engine_marktasks.c @@ -221,8 +221,8 @@ void engine_marktasks_mapper(void *map_data, int num_elements, } else if (t_type == task_type_sub_self && - t_subtype == task_subtype_gradient || - t_subtype == task_subtype_gpu_pack_g) { + (t_subtype == task_subtype_gradient || + t_subtype == task_subtype_gpu_pack_g)) { if (ci_active_hydro) scheduler_activate(s, t); } diff --git a/src/scheduler.c b/src/scheduler.c index a10e1f905e..d8e85e5d05 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -2900,19 +2900,22 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) { if (t->subtype == task_subtype_gpu_pack_d) { lock_lock(&s->queues[qid].lock); s->queues[qid].n_packs_self_left_d++; - lock_unlock(&s->queues[qid].lock); + if (lock_unlock(&s->queues[qid].lock) != 0) + error("Error unlocking queue"); atomic_inc(&s->s_d_left[qid]); } if (t->subtype == task_subtype_gpu_pack_f) { lock_lock(&s->queues[qid].lock); s->queues[qid].n_packs_self_left_f++; - lock_unlock(&s->queues[qid].lock); + if (lock_unlock(&s->queues[qid].lock) != 0) + error("Error unlocking queue"); atomic_inc(&s->s_f_left[qid]); } if (t->subtype == task_subtype_gpu_pack_g) { lock_lock(&s->queues[qid].lock); s->queues[qid].n_packs_self_left_g++; - lock_unlock(&s->queues[qid].lock); + if (lock_unlock(&s->queues[qid].lock) != 0) + error("Error unlocking queue"); atomic_inc(&s->s_g_left[qid]); } } @@ -2922,19 +2925,22 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) { if (t->subtype == task_subtype_gpu_pack_d) { lock_lock(&s->queues[qid].lock); s->queues[qid].n_packs_pair_left_d++; - lock_unlock(&s->queues[qid].lock); + if (lock_unlock(&s->queues[qid].lock) != 0) + error("Error unlocking queue"); atomic_inc(&s->p_d_left[qid]); } if (t->subtype == task_subtype_gpu_pack_f) { lock_lock(&s->queues[qid].lock); s->queues[qid].n_packs_pair_left_f++; - lock_unlock(&s->queues[qid].lock); + if (lock_unlock(&s->queues[qid].lock) != 0) + error("Error unlocking queue"); atomic_inc(&s->p_f_left[qid]); } if (t->subtype == task_subtype_gpu_pack_g) { lock_lock(&s->queues[qid].lock); s->queues[qid].n_packs_pair_left_g++; - lock_unlock(&s->queues[qid].lock); + if (lock_unlock(&s->queues[qid].lock) != 0) + error("Error unlocking queue"); atomic_inc(&s->p_g_left[qid]); } } @@ -3208,7 +3214,8 @@ struct task *scheduler_gettask(struct scheduler *s, int qid, /* Failed? 
--> Unlock the 1st queue and try again */ - lock_unlock(&q->lock); + if (lock_unlock(&q->lock) != 0) + error("Unlocking our queue failed"); continue; } } @@ -3278,8 +3285,9 @@ struct task *scheduler_gettask(struct scheduler *s, int qid, qids[ind] = qids[--count]; } - lock_unlock(&q->lock); - lock_unlock(&q_stl->lock); + if (lock_unlock(&q->lock) != 0) error("Unlocking our queue failed"); + if (lock_unlock(&q_stl->lock) != 0) + error("Unlocking the stealing queue failed"); } if (res != NULL) break; } @@ -3295,11 +3303,11 @@ struct task *scheduler_gettask(struct scheduler *s, int qid, pthread_mutex_lock(&s->sleep_mutex); res = queue_gettask(&s->queues[qid], prev, 1); if (res == NULL && s->waiting > 0) { - struct queue qq = s->queues[qid]; - // message("s->waiting %i self_stolen %i, self_left %i, pair_stolen - // %i, pair_left %i", s->waiting, - // qq.n_packs_self_stolen_f, qq.n_packs_self_left_f, - // qq.n_packs_pair_stolen_f, qq.n_packs_pair_left_f); + // struct queue qq = s->queues[qid]; + // message("s->waiting %i self_stolen %i, self_left %i, pair_stolen + // %i, pair_left %i", s->waiting, + // qq.n_packs_self_stolen_f, qq.n_packs_self_left_f, + // qq.n_packs_pair_stolen_f, qq.n_packs_pair_left_f); pthread_cond_wait(&s->sleep_cond, &s->sleep_mutex); } pthread_mutex_unlock(&s->sleep_mutex); From 2a4ec9399beb60432bbefa8cde58c2ffbb9219df Mon Sep 17 00:00:00 2001 From: Abouzied Date: Tue, 28 Jan 2025 10:07:05 +0000 Subject: [PATCH 146/217] Fix missing changes to task names in runner_main() --- src/runner_main_clean.cu | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 62907f1d17..50be174a90 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -1019,7 +1019,7 @@ void *runner_main2(void *data) { struct cell *ci_temp = ci; struct cell *cj_temp = cj; double shift[3]; - if (t->subtype != task_subtype_gpu_unpack && + if (t->subtype != task_subtype_gpu_unpack_d && t->subtype != task_subtype_gpu_unpack_g && t->subtype != task_subtype_gpu_unpack_f) t->sid = space_getsid_and_swap_cells(e->s, &ci_temp, &cj_temp, shift); @@ -1039,7 +1039,7 @@ void *runner_main2(void *data) { /* Different types of tasks... */ switch (t->type) { case task_type_self: - if (t->subtype == task_subtype_gpu_unpack) { + if (t->subtype == task_subtype_gpu_unpack_d) { unpacked++; } else if (t->subtype == task_subtype_gpu_unpack_g) { unpacked_g++; @@ -1058,7 +1058,7 @@ void *runner_main2(void *data) { density++; #endif /* GPU WORK */ - } else if (t->subtype == task_subtype_gpu_pack) { + } else if (t->subtype == task_subtype_gpu_pack_d) { packed_self++; #ifdef GPUOFFLOAD_DENSITY ticks tic_cpu_pack = getticks(); @@ -1252,7 +1252,7 @@ void *runner_main2(void *data) { #endif } /* GPU WORK */ - else if (t->subtype == task_subtype_gpu_pack) { + else if (t->subtype == task_subtype_gpu_pack_d) { packed_pair++; #ifdef GPUOFFLOAD_DENSITY #ifdef DO_CORNERS @@ -1506,7 +1506,7 @@ void *runner_main2(void *data) { } #endif // DO_CORNERS #endif // GPUOFFLOAD_FORCE - } else if (t->subtype == task_subtype_gpu_unpack) { + } else if (t->subtype == task_subtype_gpu_unpack_d) { unpacked_pair++; } else if (t->subtype == task_subtype_gpu_unpack_g) { unpacked_pair_g++; @@ -1924,7 +1924,7 @@ void *runner_main2(void *data) { /* We're done with this task, see if we get a next one. */ prev = t; - if (t->subtype == task_subtype_gpu_pack) { + if (t->subtype == task_subtype_gpu_pack_d) { #ifdef GPUOFFLOAD_DENSITY /* Don't enqueue unpacks yet. 
Just signal the runners */ t->skip = 1; @@ -1960,7 +1960,7 @@ void *runner_main2(void *data) { #endif } - else if (t->subtype != task_subtype_gpu_pack && + else if (t->subtype != task_subtype_gpu_pack_d && t->subtype != task_subtype_gpu_pack_g && t->subtype != task_subtype_gpu_pack_f) { t = scheduler_done(sched, t); From 38ab772b42b59aa583e957f39e552370d79d117f Mon Sep 17 00:00:00 2001 From: Abouzied Date: Tue, 28 Jan 2025 10:07:48 +0000 Subject: [PATCH 147/217] Make git ignore the new executables --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 46ef541ee9..d7cdef73c9 100644 --- a/.gitignore +++ b/.gitignore @@ -17,6 +17,8 @@ swift swift_mpi fof fof_mpi +swift_cuda +swift_cudampi src/version_string.h swift*.tar.gz From 88c4f2db3fda5ffddec50b5695254129bda9878d Mon Sep 17 00:00:00 2001 From: Abouzied Date: Tue, 28 Jan 2025 12:06:05 +0000 Subject: [PATCH 148/217] Correct executable name --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index d7cdef73c9..0e3cb19964 100644 --- a/.gitignore +++ b/.gitignore @@ -18,7 +18,7 @@ swift_mpi fof fof_mpi swift_cuda -swift_cudampi +swift_mpicuda src/version_string.h swift*.tar.gz From 3a279b357e9e2e52fd7958a47413307cc738051f Mon Sep 17 00:00:00 2001 From: Abouzied Date: Fri, 31 Jan 2025 10:08:18 +0000 Subject: [PATCH 149/217] Corrected one of the silly exit(0)s in src/runner_doiact_functions_hydro_gpu.h --- src/runner_doiact_functions_hydro_gpu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index 4aa0820074..34f6fa9034 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -2930,7 +2930,7 @@ void runner_dopair1_launch_f4_one_memcpy( "nbx %i nby %i max_parts_i %i max_parts_j %i\n", cudaGetErrorString(cu_error), r->cpuid, numBlocks_x, numBlocks_y, max_parts_i, max_parts_j); - exit(0); + error("Something's up with kernel launch."); } #endif From 2f608601e3697cefee9845efa57770f78405b704 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Fri, 31 Jan 2025 10:30:48 +0000 Subject: [PATCH 150/217] Added if statement to stop code launching kernels for cells with no particles --- src/runner_doiact_functions_hydro_gpu.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index 34f6fa9034..855aaa9592 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -2917,7 +2917,7 @@ void runner_dopair1_launch_f4_one_memcpy( int numBlocks_x = (bundle_n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; int bundle_part_0 = pack_vars->bundle_first_part[bid]; /* Launch the kernel for ci using data for ci and cj */ - runner_dopair_branch_density_gpu_aos_f4( + if(numBlocks_x > 0)runner_dopair_branch_density_gpu_aos_f4( d_parts_send, d_parts_recv, d_a, d_H, stream[bid], numBlocks_x, numBlocks_y, bundle_part_0, bundle_n_parts); @@ -3646,7 +3646,7 @@ void runner_dopair1_launch_f4_g_one_memcpy( // bundle_part_0, bundle_first_task); /* Launch the kernel for ci using data for ci and cj */ - runner_dopair_branch_gradient_gpu_aos_f4( + if(numBlocks_x > 0)runner_dopair_branch_gradient_gpu_aos_f4( d_parts_send, d_parts_recv, d_a, d_H, stream[bid], numBlocks_x, numBlocks_y, bundle_part_0, bundle_n_parts); @@ -4402,7 +4402,7 @@ void runner_dopair1_launch_f4_f_one_memcpy( // bundle_part_0, bundle_first_task); /* Launch 
the kernel for ci using data for ci and cj */ - runner_dopair_branch_force_gpu_aos_f4(d_parts_send, d_parts_recv, d_a, d_H, + if(numBlocks_x > 0)runner_dopair_branch_force_gpu_aos_f4(d_parts_send, d_parts_recv, d_a, d_H, stream[bid], numBlocks_x, numBlocks_y, bundle_part_0, bundle_n_parts); From 9cb3d3576d4bf4da072ccb6cb64ffe948279e05e Mon Sep 17 00:00:00 2001 From: Abouzied Date: Fri, 31 Jan 2025 10:34:16 +0000 Subject: [PATCH 151/217] Added the same if statement to self tasks --- src/runner_doiact_functions_hydro_gpu.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index 855aaa9592..44b5d27a7c 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -1477,7 +1477,7 @@ void runner_doself1_launch_f4( // %i tasks leftovers %i\n", tasks_packed, // pack_vars->launch_leftovers); // Launch the kernel - launch_density_aos_f4(d_parts_send, d_parts_recv, d_a, d_H, stream[bid], + if(numBlocks_x > 0)launch_density_aos_f4(d_parts_send, d_parts_recv, d_a, d_H, stream[bid], numBlocks_x, numBlocks_y, bundle_first_task, d_task_first_part_f4); // #ifdef CUDA_DEBUG @@ -1878,7 +1878,7 @@ void runner_doself1_launch_f4_g( int bundle_first_task = pack_vars->bundle_first_task_list[bid]; // const char *loop_type = "density"; // Launch the kernel - launch_gradient_aos_f4(d_parts_send, d_parts_recv, d_a, d_H, stream[bid], + if(numBlocks_x > 0)launch_gradient_aos_f4(d_parts_send, d_parts_recv, d_a, d_H, stream[bid], numBlocks_x, numBlocks_y, bundle_first_task, d_task_first_part_f4); #ifdef CUDA_DEBUG @@ -2288,7 +2288,7 @@ void runner_doself1_launch_f4_f( int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; int bundle_first_task = pack_vars->bundle_first_task_list[bid]; // Launch the kernel - launch_force_aos_f4(d_parts_send, d_parts_recv, d_a, d_H, stream[bid], + if(numBlocks_x > 0)launch_force_aos_f4(d_parts_send, d_parts_recv, d_a, d_H, stream[bid], numBlocks_x, numBlocks_y, bundle_first_task, d_task_first_part_f4_f); #ifdef CUDA_DEBUG From cf3b019ecbee631cc3116b0477fd33b910617f9d Mon Sep 17 00:00:00 2001 From: Abouzied Date: Fri, 31 Jan 2025 10:44:31 +0000 Subject: [PATCH 152/217] Extended (if(numBlocks_x > 0)) to all cuda calls as code was still crashing due to memcpys --- src/runner_doiact_functions_hydro_gpu.h | 137 ++++++++++++------------ 1 file changed, 69 insertions(+), 68 deletions(-) diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index 44b5d27a7c..48b3cd3dfd 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -1426,11 +1426,21 @@ void runner_doself1_launch_f4( const int first_part_tmp = pack_vars->bundle_first_part[bid]; const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp; + const int tasksperbundle = pack_vars->tasksperbundle; + int tasks_left = tasksperbundle; + if (bid == nBundles_temp - 1) { + tasks_left = tasks_packed - (nBundles_temp - 1) * tasksperbundle; + } + // Will launch a 2d grid of GPU thread blocks (number of tasks is + // the y dimension and max_parts is the x dimension + int numBlocks_y = tasks_left; + int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + int bundle_first_task = pack_vars->bundle_first_task_list[bid]; // clock_gettime(CLOCK_REALTIME, &t0hmemcpy); // cudaMemPrefetchAsync(&d_task_first_part_self_dens_f4[first_task], // (last_task - first_task) * sizeof(int2), // devId, stream[bid]); - 
cudaMemcpyAsync(&d_task_first_part_f4[first_task], + if(numBlocks_x > 0)cudaMemcpyAsync(&d_task_first_part_f4[first_task], &task_first_part_f4[first_task], (last_task + 1 - first_task) * sizeof(int2), cudaMemcpyHostToDevice, stream[bid]); @@ -1445,7 +1455,7 @@ void runner_doself1_launch_f4( // *hmemcpy_time += (t1hmemcpy.tv_sec - t0hmemcpy.tv_sec) + // (t1hmemcpy.tv_nsec - t0hmemcpy.tv_nsec) / // 1000000000.0; - cudaMemcpyAsync(&d_parts_send[first_part_tmp], &parts_send[first_part_tmp], + if(numBlocks_x > 0)cudaMemcpyAsync(&d_parts_send[first_part_tmp], &parts_send[first_part_tmp], bundle_n_parts * sizeof(struct part_aos_f4_send), cudaMemcpyHostToDevice, stream[bid]); @@ -1460,16 +1470,7 @@ void runner_doself1_launch_f4( // exit(0); // } // #endif - const int tasksperbundle = pack_vars->tasksperbundle; - int tasks_left = tasksperbundle; - if (bid == nBundles_temp - 1) { - tasks_left = tasks_packed - (nBundles_temp - 1) * tasksperbundle; - } - // Will launch a 2d grid of GPU thread blocks (number of tasks is - // the y dimension and max_parts is the x dimension - int numBlocks_y = tasks_left; - int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; - int bundle_first_task = pack_vars->bundle_first_task_list[bid]; + // const char *loop_type = "density"; // struct first_part first_parts; // for(int i = 0; i < numBlocks_y; i++) first_parts.list[i] = @@ -1489,10 +1490,10 @@ void runner_doself1_launch_f4( // cudaGetErrorString(cu_error), r->cpuid); exit(0); // } // #endif - cudaMemcpyAsync(&parts_recv[first_part_tmp], &d_parts_recv[first_part_tmp], + if(numBlocks_x > 0)cudaMemcpyAsync(&parts_recv[first_part_tmp], &d_parts_recv[first_part_tmp], bundle_n_parts * sizeof(struct part_aos_f4_recv), cudaMemcpyDeviceToHost, stream[bid]); - cudaEventRecord(self_end[bid], stream[bid]); + if(numBlocks_x > 0)cudaEventRecord(self_end[bid], stream[bid]); // #ifdef CUDA_DEBUG // cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // // // @@ -1842,13 +1843,23 @@ void runner_doself1_launch_f4_g( const int first_part_tmp = pack_vars->bundle_first_part[bid]; const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp; - - cudaMemcpyAsync(&d_task_first_part_f4[first_task], + const int tasksperbundle = pack_vars->tasksperbundle; + int tasks_left = tasksperbundle; + if (bid == nBundles_temp - 1) { + tasks_left = tasks_packed - (nBundles_temp - 1) * tasksperbundle; + } + // Will launch a 2d grid of GPU thread blocks (number of tasks is + // the y dimension and max_parts is the x dimension + int numBlocks_y = tasks_left; + int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + int bundle_first_task = pack_vars->bundle_first_task_list[bid]; + // const char *loop_type = "density"; + if(numBlocks_x > 0)cudaMemcpyAsync(&d_task_first_part_f4[first_task], &task_first_part_f4[first_task], (last_task + 1 - first_task) * sizeof(int2), cudaMemcpyHostToDevice, stream[bid]); - cudaMemcpyAsync(&d_parts_send[first_part_tmp], &parts_send[first_part_tmp], + if(numBlocks_x > 0)cudaMemcpyAsync(&d_parts_send[first_part_tmp], &parts_send[first_part_tmp], bundle_n_parts * sizeof(struct part_aos_f4_g_send), cudaMemcpyHostToDevice, stream[bid]); // fprintf(stderr, "bid %i first_part %i nparts %i\n", bid, @@ -1866,17 +1877,6 @@ void runner_doself1_launch_f4_g( exit(0); } #endif - const int tasksperbundle = pack_vars->tasksperbundle; - int tasks_left = tasksperbundle; - if (bid == nBundles_temp - 1) { - tasks_left = tasks_packed - (nBundles_temp - 1) * tasksperbundle; - } - // Will launch a 2d grid of GPU 
thread blocks (number of tasks is - // the y dimension and max_parts is the x dimension - int numBlocks_y = tasks_left; - int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; - int bundle_first_task = pack_vars->bundle_first_task_list[bid]; - // const char *loop_type = "density"; // Launch the kernel if(numBlocks_x > 0)launch_gradient_aos_f4(d_parts_send, d_parts_recv, d_a, d_H, stream[bid], numBlocks_x, numBlocks_y, bundle_first_task, @@ -1891,10 +1891,10 @@ void runner_doself1_launch_f4_g( exit(0); } #endif - cudaMemcpyAsync(&parts_recv[first_part_tmp], &d_parts_recv[first_part_tmp], + if(numBlocks_x > 0)cudaMemcpyAsync(&parts_recv[first_part_tmp], &d_parts_recv[first_part_tmp], bundle_n_parts * sizeof(struct part_aos_f4_g_recv), cudaMemcpyDeviceToHost, stream[bid]); - cudaEventRecord(self_end[bid], stream[bid]); + if(numBlocks_x > 0)cudaEventRecord(self_end[bid], stream[bid]); #ifdef CUDA_DEBUG cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // @@ -2256,12 +2256,22 @@ void runner_doself1_launch_f4_f( const int first_part_tmp = pack_vars->bundle_first_part[bid]; const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp; - cudaMemcpyAsync(&d_task_first_part_f4_f[first_task], + const int tasksperbundle = pack_vars->tasksperbundle; + int tasks_left = tasksperbundle; + if (bid == nBundles_temp - 1) { + tasks_left = tasks_packed - (nBundles_temp - 1) * tasksperbundle; + } + // Will launch a 2d grid of GPU thread blocks (number of tasks is + // the y dimension and max_parts is the x dimension + int numBlocks_y = tasks_left; + int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + int bundle_first_task = pack_vars->bundle_first_task_list[bid]; + if(numBlocks_x > 0)cudaMemcpyAsync(&d_task_first_part_f4_f[first_task], &task_first_part_f4_f[first_task], (last_task + 1 - first_task) * sizeof(int2), cudaMemcpyHostToDevice, stream[bid]); - cudaMemcpyAsync(&d_parts_send[first_part_tmp], &parts_send[first_part_tmp], + if(numBlocks_x > 0)cudaMemcpyAsync(&d_parts_send[first_part_tmp], &parts_send[first_part_tmp], bundle_n_parts * sizeof(struct part_aos_f4_f_send), cudaMemcpyHostToDevice, stream[bid]); @@ -2277,16 +2287,6 @@ void runner_doself1_launch_f4_f( exit(0); } #endif - const int tasksperbundle = pack_vars->tasksperbundle; - int tasks_left = tasksperbundle; - if (bid == nBundles_temp - 1) { - tasks_left = tasks_packed - (nBundles_temp - 1) * tasksperbundle; - } - // Will launch a 2d grid of GPU thread blocks (number of tasks is - // the y dimension and max_parts is the x dimension - int numBlocks_y = tasks_left; - int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; - int bundle_first_task = pack_vars->bundle_first_task_list[bid]; // Launch the kernel if(numBlocks_x > 0)launch_force_aos_f4(d_parts_send, d_parts_recv, d_a, d_H, stream[bid], numBlocks_x, numBlocks_y, bundle_first_task, @@ -2300,10 +2300,10 @@ void runner_doself1_launch_f4_f( exit(0); } #endif - cudaMemcpyAsync(&parts_recv[first_part_tmp], &d_parts_recv[first_part_tmp], + if(numBlocks_x > 0)cudaMemcpyAsync(&parts_recv[first_part_tmp], &d_parts_recv[first_part_tmp], bundle_n_parts * sizeof(struct part_aos_f4_f_recv), cudaMemcpyDeviceToHost, stream[bid]); - cudaEventRecord(self_end[bid], stream[bid]); + if(numBlocks_x > 0)cudaEventRecord(self_end[bid], stream[bid]); #ifdef CUDA_DEBUG cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // @@ -2891,8 +2891,13 @@ void runner_dopair1_launch_f4_one_memcpy( const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; const int 
bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp_i; + // Setup 2d grid of GPU thread blocks for ci (number of tasks is + // the y dimension and max_parts is the x dimension + int numBlocks_y = 0; // tasks_left; + int numBlocks_x = (bundle_n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + int bundle_part_0 = pack_vars->bundle_first_part[bid]; - cudaMemcpyAsync(&d_parts_send[first_part_tmp_i], + if(numBlocks_x > 0)cudaMemcpyAsync(&d_parts_send[first_part_tmp_i], &parts_send[first_part_tmp_i], bundle_n_parts * sizeof(struct part_aos_f4_send), cudaMemcpyHostToDevice, stream[bid]); @@ -2911,11 +2916,6 @@ void runner_dopair1_launch_f4_one_memcpy( } #endif /* LAUNCH THE GPU KERNELS for ci & cj */ - // Setup 2d grid of GPU thread blocks for ci (number of tasks is - // the y dimension and max_parts is the x dimension - int numBlocks_y = 0; // tasks_left; - int numBlocks_x = (bundle_n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; - int bundle_part_0 = pack_vars->bundle_first_part[bid]; /* Launch the kernel for ci using data for ci and cj */ if(numBlocks_x > 0)runner_dopair_branch_density_gpu_aos_f4( d_parts_send, d_parts_recv, d_a, d_H, stream[bid], numBlocks_x, @@ -2935,11 +2935,11 @@ void runner_dopair1_launch_f4_one_memcpy( #endif // Copy results back to CPU BUFFERS - cudaMemcpyAsync(&parts_recv[first_part_tmp_i], + if(numBlocks_x > 0)cudaMemcpyAsync(&parts_recv[first_part_tmp_i], &d_parts_recv[first_part_tmp_i], bundle_n_parts * sizeof(struct part_aos_f4_recv), cudaMemcpyDeviceToHost, stream[bid]); - cudaEventRecord(pair_end[bid], stream[bid]); + if(numBlocks_x > 0)cudaEventRecord(pair_end[bid], stream[bid]); #ifdef CUDA_DEBUG cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // @@ -3616,8 +3616,13 @@ void runner_dopair1_launch_f4_g_one_memcpy( const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp_i; + // Setup 2d grid of GPU thread blocks for ci (number of tasks is + // the y dimension and max_parts is the x dimension + int numBlocks_y = 0; // tasks_left; + int numBlocks_x = (bundle_n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + int bundle_part_0 = pack_vars->bundle_first_part[bid]; - cudaMemcpyAsync(&d_parts_send[first_part_tmp_i], + if(numBlocks_x > 0)cudaMemcpyAsync(&d_parts_send[first_part_tmp_i], &parts_send[first_part_tmp_i], bundle_n_parts * sizeof(struct part_aos_f4_g_send), cudaMemcpyHostToDevice, stream[bid]); @@ -3637,11 +3642,6 @@ void runner_dopair1_launch_f4_g_one_memcpy( // const int tasksperbundle = pack_vars->tasksperbundle; /* LAUNCH THE GPU KERNELS for ci & cj */ - // Setup 2d grid of GPU thread blocks for ci (number of tasks is - // the y dimension and max_parts is the x dimension - int numBlocks_y = 0; // tasks_left; - int numBlocks_x = (bundle_n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; - int bundle_part_0 = pack_vars->bundle_first_part[bid]; // fprintf(stderr, "bundle_part_0 %i bundle_first_task %i\n", // bundle_part_0, bundle_first_task); @@ -3664,11 +3664,11 @@ void runner_dopair1_launch_f4_g_one_memcpy( #endif // Copy results back to CPU BUFFERS - cudaMemcpyAsync(&parts_recv[first_part_tmp_i], + if(numBlocks_x > 0)cudaMemcpyAsync(&parts_recv[first_part_tmp_i], &d_parts_recv[first_part_tmp_i], bundle_n_parts * sizeof(struct part_aos_f4_g_recv), cudaMemcpyDeviceToHost, stream[bid]); - cudaEventRecord(pair_end[bid], stream[bid]); + if(numBlocks_x > 0)cudaEventRecord(pair_end[bid], stream[bid]); #ifdef CUDA_DEBUG cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // @@ -4363,8 
+4363,13 @@ void runner_dopair1_launch_f4_f_one_memcpy( const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp_i; + // Setup 2d grid of GPU thread blocks for ci (number of tasks is + // the y dimension and max_parts is the x dimension + int numBlocks_y = 0; // tasks_left; + int numBlocks_x = (bundle_n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + int bundle_part_0 = pack_vars->bundle_first_part[bid]; - cudaMemcpyAsync(&d_parts_send[first_part_tmp_i], + if(numBlocks_x > 0)cudaMemcpyAsync(&d_parts_send[first_part_tmp_i], &parts_send[first_part_tmp_i], bundle_n_parts * sizeof(struct part_aos_f4_f_send), cudaMemcpyHostToDevice, stream[bid]); @@ -4392,11 +4397,7 @@ void runner_dopair1_launch_f4_f_one_memcpy( // tasks_packed - (nBundles_temp - 1) * tasksperbundle; // } - // Setup 2d grid of GPU thread blocks for ci (number of tasks is - // the y dimension and max_parts is the x dimension - int numBlocks_y = 0; // tasks_left; - int numBlocks_x = (bundle_n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; - int bundle_part_0 = pack_vars->bundle_first_part[bid]; + // int bundle_first_task = pack_vars->bundle_first_task_list[bid]; // fprintf(stderr, "bundle_part_0 %i bundle_first_task %i\n", // bundle_part_0, bundle_first_task); @@ -4420,11 +4421,11 @@ void runner_dopair1_launch_f4_f_one_memcpy( #endif // Copy results back to CPU BUFFERS - cudaMemcpyAsync(&parts_recv[first_part_tmp_i], + if(numBlocks_x > 0)cudaMemcpyAsync(&parts_recv[first_part_tmp_i], &d_parts_recv[first_part_tmp_i], bundle_n_parts * sizeof(struct part_aos_f4_f_recv), cudaMemcpyDeviceToHost, stream[bid]); - cudaEventRecord(pair_end[bid], stream[bid]); + if(numBlocks_x > 0)cudaEventRecord(pair_end[bid], stream[bid]); #ifdef CUDA_DEBUG cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // From 7ba6d3d5b5f889c1b06afcf77e3914d517f5b989 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Fri, 31 Jan 2025 11:18:18 +0000 Subject: [PATCH 153/217] Added if statements in runner_main2() so that if cell hydro count is zero just call scheduler_done() --- src/runner_main_clean.cu | 35 ++++++++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 50be174a90..bc12157f56 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -1063,6 +1063,10 @@ void *runner_main2(void *data) { #ifdef GPUOFFLOAD_DENSITY ticks tic_cpu_pack = getticks(); + if(ci->hydro.count == 0){ + t = scheduler_done(sched, t); + } + else{ packing_time += runner_doself1_pack_f4(r, sched, pack_vars_self_dens, ci, t, parts_aos_f4_send, task_first_part_f4); @@ -1099,6 +1103,7 @@ void *runner_main2(void *data) { &unpack_time_self, task_first_part_self_dens_f4, devId, task_first_part_f4, d_task_first_part_f4, self_end); } /*End of GPU work Self*/ + } #endif } /* self / pack */ else if (t->subtype == task_subtype_gpu_pack_g) { @@ -1106,7 +1111,10 @@ void *runner_main2(void *data) { #ifdef GPUOFFLOAD_GRADIENT ticks tic_cpu_pack = getticks(); - + if(ci->hydro.count == 0){ + t = scheduler_done(sched, t); + } + else{ packing_time_g += runner_doself1_pack_f4_g( r, sched, pack_vars_self_grad, ci, t, parts_aos_grad_f4_send, task_first_part_f4_g); @@ -1134,12 +1142,16 @@ void *runner_main2(void *data) { &packing_time_g, &time_for_gpu_g, task_first_part_f4_g, d_task_first_part_f4_g, self_end_g, &unpack_time_self_g); } /*End of GPU work Self*/ + } #endif // GPUGRADSELF } else if (t->subtype == task_subtype_gpu_pack_f) { 
packed_self_f++; #ifdef GPUOFFLOAD_FORCE ticks tic_cpu_pack = getticks(); - + if(ci->hydro.count == 0){ + t = scheduler_done(sched, t); + } + else{ packing_time_f += runner_doself1_pack_f4_f( r, sched, pack_vars_self_forc, ci, t, parts_aos_forc_f4_send, task_first_part_f4_f); @@ -1167,6 +1179,7 @@ void *runner_main2(void *data) { &packing_time_f, &time_for_gpu_f, task_first_part_f4_f, d_task_first_part_f4_f, self_end_f, &unpack_time_self_f); } /*End of GPU work Self*/ + } #endif } #ifdef EXTRA_HYDRO_LOOP @@ -1302,7 +1315,10 @@ void *runner_main2(void *data) { #endif // DO_CORNERS ticks tic_cpu_pack = getticks(); - + if(ci->hydro.count == 0){ + t = scheduler_done(sched, t); + } + else{ packing_time_pair += runner_dopair1_pack_f4( r, sched, pack_vars_pair_dens, ci, cj, t, parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens); @@ -1334,6 +1350,7 @@ void *runner_main2(void *data) { pair_end); } pack_vars_pair_dens->launch_leftovers = 0; + } #ifdef DO_CORNERS } /* End of GPU work Pairs */ #endif // DO_CORNERS @@ -1386,7 +1403,10 @@ void *runner_main2(void *data) { } else { #endif // DO_CORNERS ticks tic_cpu_pack = getticks(); - + if(ci->hydro.count == 0){ + t = scheduler_done(sched, t); + } + else{ packing_time_pair_g += runner_dopair1_pack_f4_g(r, sched, pack_vars_pair_grad, ci, cj, t, parts_aos_pair_f4_g_send, e, @@ -1417,6 +1437,7 @@ void *runner_main2(void *data) { pair_end_g); } pack_vars_pair_grad->launch_leftovers = 0; + } #ifdef DO_CORNERS } /* End of GPU work Pairs */ #endif // DO_CORNERS @@ -1471,7 +1492,10 @@ void *runner_main2(void *data) { // ci, cj, t, parts_aos_pair_forc, e, // &packing_time_f); ticks tic_cpu_pack = getticks(); - + if(ci->hydro.count == 0){ + t = scheduler_done(sched, t); + } + else{ packing_time_pair_f += runner_dopair1_pack_f4_f(r, sched, pack_vars_pair_forc, ci, cj, t, parts_aos_pair_f4_f_send, e, @@ -1502,6 +1526,7 @@ void *runner_main2(void *data) { pack_vars_pair_forc->launch_leftovers = 0; } /* End of GPU work Pairs */ + } #ifdef DO_CORNERS } #endif // DO_CORNERS From 7afbe5671c0bc7b5b1fe0ff87532a4f1fb6a74fd Mon Sep 17 00:00:00 2001 From: Abouzied Date: Fri, 31 Jan 2025 12:07:19 +0000 Subject: [PATCH 154/217] Fixed bug in implementation of pair skip condition when count == 0. 
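The crashes chased across patches 150-154 all trace back to issuing CUDA work for cells or bundles that hold no particles: a zero-sized grid launch, a zero-byte cudaMemcpyAsync, or an event recorded for work that never ran. The sketch below only illustrates the if (numBlocks_x > 0) guard those patches introduce; the types and names here (part_send, part_recv, density_kernel, launch_bundle) are simplified stand-ins, not SWIFT's real AoS buffers or launch wrappers.

/* Minimal sketch (hypothetical types and kernel): skip every CUDA call
 * for an empty bundle instead of issuing zero-sized launches and copies. */
#include <cuda_runtime.h>

#define BLOCK_SIZE 128

struct part_send { float4 x_h; };     /* stand-in for the send buffer */
struct part_recv { float4 rho_dh; };  /* stand-in for the recv buffer */

__global__ void density_kernel(const struct part_send *in,
                               struct part_recv *out, int n) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i].rho_dh = make_float4(in[i].x_h.w, 0.f, 0.f, 0.f);
}

void launch_bundle(const struct part_send *h_send, struct part_send *d_send,
                   struct part_recv *h_recv, struct part_recv *d_recv,
                   int first_part, int n_parts, cudaStream_t stream,
                   cudaEvent_t done) {
  const int numBlocks_x = (n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE;

  /* Empty bundle -> numBlocks_x == 0: do nothing at all, mirroring the
   * if (numBlocks_x > 0) guard added in front of each CUDA call. */
  if (numBlocks_x == 0) return;

  cudaMemcpyAsync(&d_send[first_part], &h_send[first_part],
                  n_parts * sizeof(struct part_send),
                  cudaMemcpyHostToDevice, stream);
  density_kernel<<<numBlocks_x, BLOCK_SIZE, 0, stream>>>(
      &d_send[first_part], &d_recv[first_part], n_parts);
  cudaMemcpyAsync(&h_recv[first_part], &d_recv[first_part],
                  n_parts * sizeof(struct part_recv),
                  cudaMemcpyDeviceToHost, stream);
  cudaEventRecord(done, stream);
}

Guarding the kernel launch alone was not enough (patch 152 notes the remaining crashes came from the memcpys), so the condition has to cover the copies and the event record as well.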
Fixed counter decrementation in scheduler_enqueue for the case when cell->hydro.count == 0 --- src/runner_main_clean.cu | 6 +++--- src/scheduler.c | 37 ++++++++++++++++++++++++++++++------- 2 files changed, 33 insertions(+), 10 deletions(-) diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index bc12157f56..50d282a041 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -1315,7 +1315,7 @@ void *runner_main2(void *data) { #endif // DO_CORNERS ticks tic_cpu_pack = getticks(); - if(ci->hydro.count == 0){ + if(ci->hydro.count == 0 || cj->hydro.count == 0){ t = scheduler_done(sched, t); } else{ @@ -1403,7 +1403,7 @@ void *runner_main2(void *data) { } else { #endif // DO_CORNERS ticks tic_cpu_pack = getticks(); - if(ci->hydro.count == 0){ + if(ci->hydro.count == 0 || cj->hydro.count == 0){ t = scheduler_done(sched, t); } else{ @@ -1492,7 +1492,7 @@ void *runner_main2(void *data) { // ci, cj, t, parts_aos_pair_forc, e, // &packing_time_f); ticks tic_cpu_pack = getticks(); - if(ci->hydro.count == 0){ + if(ci->hydro.count == 0 || cj->hydro.count == 0){ t = scheduler_done(sched, t); } else{ diff --git a/src/scheduler.c b/src/scheduler.c index d8e85e5d05..1f52c147e1 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -2890,28 +2890,51 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) { /* Save qid as owner for next time a task accesses this cell. */ if (owner != NULL) *owner = qid; - +// if (t->type == task_type_self || t->type == task_type_sub_self) { +// if (t->subtype == task_subtype_gpu_pack_d && t->ci->hydro.count > 0) { +// return; +// } +// if (t->subtype == task_subtype_gpu_pack_f && t->ci->hydro.count > 0) { +// return; +// } +// if (t->subtype == task_subtype_gpu_pack_g && t->ci->hydro.count > 0) { +// return; +// } +// } +// /* A. Nasar NEED to think about how to do this with +// MPI where ci may not be on this node/rank */ +// if (t->type == task_type_pair || t->type == task_type_sub_pair) { +// if (t->subtype == task_subtype_gpu_pack_d && t->ci->hydro.count > 0 && t->cj->hydro.count > 0) { +// return; +// } +// if (t->subtype == task_subtype_gpu_pack_f && t->ci->hydro.count > 0 && t->cj->hydro.count > 0) { +// return; +// } +// if (t->subtype == task_subtype_gpu_pack_g && t->ci->hydro.count > 0 && t->cj->hydro.count > 0) { +// return; +// } +// } /* Increase the waiting counter. */ atomic_inc(&s->waiting); /* Insert the task into that queue. */ queue_insert(&s->queues[qid], t); /* A. Nasar: Increment counters required for the pack tasks */ if (t->type == task_type_self || t->type == task_type_sub_self) { - if (t->subtype == task_subtype_gpu_pack_d) { + if (t->subtype == task_subtype_gpu_pack_d && t->ci->hydro.count > 0) { lock_lock(&s->queues[qid].lock); s->queues[qid].n_packs_self_left_d++; if (lock_unlock(&s->queues[qid].lock) != 0) error("Error unlocking queue"); atomic_inc(&s->s_d_left[qid]); } - if (t->subtype == task_subtype_gpu_pack_f) { + if (t->subtype == task_subtype_gpu_pack_f && t->ci->hydro.count > 0) { lock_lock(&s->queues[qid].lock); s->queues[qid].n_packs_self_left_f++; if (lock_unlock(&s->queues[qid].lock) != 0) error("Error unlocking queue"); atomic_inc(&s->s_f_left[qid]); } - if (t->subtype == task_subtype_gpu_pack_g) { + if (t->subtype == task_subtype_gpu_pack_g && t->ci->hydro.count > 0) { lock_lock(&s->queues[qid].lock); s->queues[qid].n_packs_self_left_g++; if (lock_unlock(&s->queues[qid].lock) != 0) @@ -2922,21 +2945,21 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) { /* A. 
Nasar NEED to think about how to do this with MPI where ci may not be on this node/rank */ if (t->type == task_type_pair || t->type == task_type_sub_pair) { - if (t->subtype == task_subtype_gpu_pack_d) { + if (t->subtype == task_subtype_gpu_pack_d && t->ci->hydro.count > 0 && t->cj->hydro.count > 0) { lock_lock(&s->queues[qid].lock); s->queues[qid].n_packs_pair_left_d++; if (lock_unlock(&s->queues[qid].lock) != 0) error("Error unlocking queue"); atomic_inc(&s->p_d_left[qid]); } - if (t->subtype == task_subtype_gpu_pack_f) { + if (t->subtype == task_subtype_gpu_pack_f && t->ci->hydro.count > 0 && t->cj->hydro.count > 0) { lock_lock(&s->queues[qid].lock); s->queues[qid].n_packs_pair_left_f++; if (lock_unlock(&s->queues[qid].lock) != 0) error("Error unlocking queue"); atomic_inc(&s->p_f_left[qid]); } - if (t->subtype == task_subtype_gpu_pack_g) { + if (t->subtype == task_subtype_gpu_pack_g && t->ci->hydro.count > 0 && t->cj->hydro.count > 0) { lock_lock(&s->queues[qid].lock); s->queues[qid].n_packs_pair_left_g++; if (lock_unlock(&s->queues[qid].lock) != 0) From fa013d2adf7ba81e8823ac7b008c7f250917c848 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Fri, 31 Jan 2025 13:24:08 +0000 Subject: [PATCH 155/217] Reverted runner_main2 not to skip tasks with zero particles as they are now not activated in cell_unskip --- src/cell_unskip.c | 2 +- src/runner_main_clean.cu | 35 +++++------------------------------ 2 files changed, 6 insertions(+), 31 deletions(-) diff --git a/src/cell_unskip.c b/src/cell_unskip.c index d0daee37ff..88889c28d3 100644 --- a/src/cell_unskip.c +++ b/src/cell_unskip.c @@ -1904,7 +1904,7 @@ int cell_unskip_hydro_tasks(struct cell *c, struct scheduler *s) { } /* Unskip all the other task types. */ int c_active = cell_is_active_hydro(c, e); - if (c->nodeID == nodeID && c_active) { + if (c->nodeID == nodeID && c_active && c->hydro.count > 0) { for (struct link *l = c->hydro.density_pack; l != NULL; l = l->next) { /* A. 
Nasar */ scheduler_activate(s, l->t); diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 50d282a041..50be174a90 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -1063,10 +1063,6 @@ void *runner_main2(void *data) { #ifdef GPUOFFLOAD_DENSITY ticks tic_cpu_pack = getticks(); - if(ci->hydro.count == 0){ - t = scheduler_done(sched, t); - } - else{ packing_time += runner_doself1_pack_f4(r, sched, pack_vars_self_dens, ci, t, parts_aos_f4_send, task_first_part_f4); @@ -1103,7 +1099,6 @@ void *runner_main2(void *data) { &unpack_time_self, task_first_part_self_dens_f4, devId, task_first_part_f4, d_task_first_part_f4, self_end); } /*End of GPU work Self*/ - } #endif } /* self / pack */ else if (t->subtype == task_subtype_gpu_pack_g) { @@ -1111,10 +1106,7 @@ void *runner_main2(void *data) { #ifdef GPUOFFLOAD_GRADIENT ticks tic_cpu_pack = getticks(); - if(ci->hydro.count == 0){ - t = scheduler_done(sched, t); - } - else{ + packing_time_g += runner_doself1_pack_f4_g( r, sched, pack_vars_self_grad, ci, t, parts_aos_grad_f4_send, task_first_part_f4_g); @@ -1142,16 +1134,12 @@ void *runner_main2(void *data) { &packing_time_g, &time_for_gpu_g, task_first_part_f4_g, d_task_first_part_f4_g, self_end_g, &unpack_time_self_g); } /*End of GPU work Self*/ - } #endif // GPUGRADSELF } else if (t->subtype == task_subtype_gpu_pack_f) { packed_self_f++; #ifdef GPUOFFLOAD_FORCE ticks tic_cpu_pack = getticks(); - if(ci->hydro.count == 0){ - t = scheduler_done(sched, t); - } - else{ + packing_time_f += runner_doself1_pack_f4_f( r, sched, pack_vars_self_forc, ci, t, parts_aos_forc_f4_send, task_first_part_f4_f); @@ -1179,7 +1167,6 @@ void *runner_main2(void *data) { &packing_time_f, &time_for_gpu_f, task_first_part_f4_f, d_task_first_part_f4_f, self_end_f, &unpack_time_self_f); } /*End of GPU work Self*/ - } #endif } #ifdef EXTRA_HYDRO_LOOP @@ -1315,10 +1302,7 @@ void *runner_main2(void *data) { #endif // DO_CORNERS ticks tic_cpu_pack = getticks(); - if(ci->hydro.count == 0 || cj->hydro.count == 0){ - t = scheduler_done(sched, t); - } - else{ + packing_time_pair += runner_dopair1_pack_f4( r, sched, pack_vars_pair_dens, ci, cj, t, parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens); @@ -1350,7 +1334,6 @@ void *runner_main2(void *data) { pair_end); } pack_vars_pair_dens->launch_leftovers = 0; - } #ifdef DO_CORNERS } /* End of GPU work Pairs */ #endif // DO_CORNERS @@ -1403,10 +1386,7 @@ void *runner_main2(void *data) { } else { #endif // DO_CORNERS ticks tic_cpu_pack = getticks(); - if(ci->hydro.count == 0 || cj->hydro.count == 0){ - t = scheduler_done(sched, t); - } - else{ + packing_time_pair_g += runner_dopair1_pack_f4_g(r, sched, pack_vars_pair_grad, ci, cj, t, parts_aos_pair_f4_g_send, e, @@ -1437,7 +1417,6 @@ void *runner_main2(void *data) { pair_end_g); } pack_vars_pair_grad->launch_leftovers = 0; - } #ifdef DO_CORNERS } /* End of GPU work Pairs */ #endif // DO_CORNERS @@ -1492,10 +1471,7 @@ void *runner_main2(void *data) { // ci, cj, t, parts_aos_pair_forc, e, // &packing_time_f); ticks tic_cpu_pack = getticks(); - if(ci->hydro.count == 0 || cj->hydro.count == 0){ - t = scheduler_done(sched, t); - } - else{ + packing_time_pair_f += runner_dopair1_pack_f4_f(r, sched, pack_vars_pair_forc, ci, cj, t, parts_aos_pair_f4_f_send, e, @@ -1526,7 +1502,6 @@ void *runner_main2(void *data) { pack_vars_pair_forc->launch_leftovers = 0; } /* End of GPU work Pairs */ - } #ifdef DO_CORNERS } #endif // DO_CORNERS From a662bb0c556948e991965bc5df84c4e4e6637a44 Mon Sep 17 00:00:00 2001 
From: Abouzied Date: Fri, 31 Jan 2025 13:26:50 +0000 Subject: [PATCH 156/217] Converting sub tasks into selfs and pairs in engine_maketasks --- src/engine_maketasks.c | 64 +++++++++++++++++++----------------------- 1 file changed, 29 insertions(+), 35 deletions(-) diff --git a/src/engine_maketasks.c b/src/engine_maketasks.c index cd0404ded3..b774d29912 100644 --- a/src/engine_maketasks.c +++ b/src/engine_maketasks.c @@ -4931,41 +4931,35 @@ void engine_maketasks(struct engine *e) { * sched->tasks, sched->nr_tasks, sizeof(struct task), * threadpool_auto_chunk_size, e); */ } - // int unsplit = 0, split = 0; - // /*These loops should really be threadmapped A. Nasar*/ - // for (int i = 0; i < sched->nr_tasks; i++) { - // struct task * t = &sched->tasks[i]; - // if(t->type == task_type_sub_self && t->subtype == - // task_subtype_gpu_pack_d){ - // t->type = task_type_self; - // fprintf(stderr, "sub_self"); - // } - // if(t->type == task_type_sub_pair && t->subtype == - // task_subtype_gpu_pack_d){ - // t->type = task_type_pair; - // fprintf(stderr, "sub_pair"); - // } - // if(t->type == task_type_sub_self && t->subtype == - // task_subtype_gpu_pack_g){ - // t->type = task_type_self; - // fprintf(stderr, "sub_self"); - // } - // if(t->type == task_type_sub_pair && t->subtype == - // task_subtype_gpu_pack_g){ - // t->type = task_type_pair; - // fprintf(stderr, "sub_pair"); - // } - // if(t->type == task_type_sub_self && t->subtype == - // task_subtype_gpu_pack_f){ - // t->type = task_type_self; - // fprintf(stderr, "sub_self"); - // } - // if(t->type == task_type_sub_pair && t->subtype == - // task_subtype_gpu_pack_f){ - // t->type = task_type_pair; - // fprintf(stderr, "sub_pair"); - // } - // } + int unsplit = 0, split = 0; + /*These loops should really be threadmapped A. Nasar*/ + for (int i = 0; i < sched->nr_tasks; i++) { + struct task * t = &sched->tasks[i]; + if(t->type == task_type_sub_self && t->subtype == + task_subtype_gpu_pack_d){ + t->type = task_type_self; + } + if(t->type == task_type_sub_pair && t->subtype == + task_subtype_gpu_pack_d){ + t->type = task_type_pair; + } + if(t->type == task_type_sub_self && t->subtype == + task_subtype_gpu_pack_g){ + t->type = task_type_self; + } + if(t->type == task_type_sub_pair && t->subtype == + task_subtype_gpu_pack_g){ + t->type = task_type_pair; + } + if(t->type == task_type_sub_self && t->subtype == + task_subtype_gpu_pack_f){ + t->type = task_type_self; + } + if(t->type == task_type_sub_pair && t->subtype == + task_subtype_gpu_pack_f){ + t->type = task_type_pair; + } + } /* Now, create unpack tasks based on the existing packs and create * the dependencies pack->unpack->ghost_in A. 
Nasar */ From 6ba1c4efbae70b0f1f5c2e9e69368e2aadc426da Mon Sep 17 00:00:00 2001 From: Abouzied Date: Fri, 31 Jan 2025 13:30:46 +0000 Subject: [PATCH 157/217] Removed blocking if statements from launch and packing code in runner_doiact_functions_hydro_gpu.h --- src/runner_doiact_functions_hydro_gpu.h | 149 ++++++++++++------------ 1 file changed, 74 insertions(+), 75 deletions(-) diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index 48b3cd3dfd..34f6fa9034 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -1426,21 +1426,11 @@ void runner_doself1_launch_f4( const int first_part_tmp = pack_vars->bundle_first_part[bid]; const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp; - const int tasksperbundle = pack_vars->tasksperbundle; - int tasks_left = tasksperbundle; - if (bid == nBundles_temp - 1) { - tasks_left = tasks_packed - (nBundles_temp - 1) * tasksperbundle; - } - // Will launch a 2d grid of GPU thread blocks (number of tasks is - // the y dimension and max_parts is the x dimension - int numBlocks_y = tasks_left; - int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; - int bundle_first_task = pack_vars->bundle_first_task_list[bid]; // clock_gettime(CLOCK_REALTIME, &t0hmemcpy); // cudaMemPrefetchAsync(&d_task_first_part_self_dens_f4[first_task], // (last_task - first_task) * sizeof(int2), // devId, stream[bid]); - if(numBlocks_x > 0)cudaMemcpyAsync(&d_task_first_part_f4[first_task], + cudaMemcpyAsync(&d_task_first_part_f4[first_task], &task_first_part_f4[first_task], (last_task + 1 - first_task) * sizeof(int2), cudaMemcpyHostToDevice, stream[bid]); @@ -1455,7 +1445,7 @@ void runner_doself1_launch_f4( // *hmemcpy_time += (t1hmemcpy.tv_sec - t0hmemcpy.tv_sec) + // (t1hmemcpy.tv_nsec - t0hmemcpy.tv_nsec) / // 1000000000.0; - if(numBlocks_x > 0)cudaMemcpyAsync(&d_parts_send[first_part_tmp], &parts_send[first_part_tmp], + cudaMemcpyAsync(&d_parts_send[first_part_tmp], &parts_send[first_part_tmp], bundle_n_parts * sizeof(struct part_aos_f4_send), cudaMemcpyHostToDevice, stream[bid]); @@ -1470,7 +1460,16 @@ void runner_doself1_launch_f4( // exit(0); // } // #endif - + const int tasksperbundle = pack_vars->tasksperbundle; + int tasks_left = tasksperbundle; + if (bid == nBundles_temp - 1) { + tasks_left = tasks_packed - (nBundles_temp - 1) * tasksperbundle; + } + // Will launch a 2d grid of GPU thread blocks (number of tasks is + // the y dimension and max_parts is the x dimension + int numBlocks_y = tasks_left; + int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + int bundle_first_task = pack_vars->bundle_first_task_list[bid]; // const char *loop_type = "density"; // struct first_part first_parts; // for(int i = 0; i < numBlocks_y; i++) first_parts.list[i] = @@ -1478,7 +1477,7 @@ void runner_doself1_launch_f4( // %i tasks leftovers %i\n", tasks_packed, // pack_vars->launch_leftovers); // Launch the kernel - if(numBlocks_x > 0)launch_density_aos_f4(d_parts_send, d_parts_recv, d_a, d_H, stream[bid], + launch_density_aos_f4(d_parts_send, d_parts_recv, d_a, d_H, stream[bid], numBlocks_x, numBlocks_y, bundle_first_task, d_task_first_part_f4); // #ifdef CUDA_DEBUG @@ -1490,10 +1489,10 @@ void runner_doself1_launch_f4( // cudaGetErrorString(cu_error), r->cpuid); exit(0); // } // #endif - if(numBlocks_x > 0)cudaMemcpyAsync(&parts_recv[first_part_tmp], &d_parts_recv[first_part_tmp], + cudaMemcpyAsync(&parts_recv[first_part_tmp], &d_parts_recv[first_part_tmp], 
bundle_n_parts * sizeof(struct part_aos_f4_recv), cudaMemcpyDeviceToHost, stream[bid]); - if(numBlocks_x > 0)cudaEventRecord(self_end[bid], stream[bid]); + cudaEventRecord(self_end[bid], stream[bid]); // #ifdef CUDA_DEBUG // cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // // // @@ -1843,23 +1842,13 @@ void runner_doself1_launch_f4_g( const int first_part_tmp = pack_vars->bundle_first_part[bid]; const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp; - const int tasksperbundle = pack_vars->tasksperbundle; - int tasks_left = tasksperbundle; - if (bid == nBundles_temp - 1) { - tasks_left = tasks_packed - (nBundles_temp - 1) * tasksperbundle; - } - // Will launch a 2d grid of GPU thread blocks (number of tasks is - // the y dimension and max_parts is the x dimension - int numBlocks_y = tasks_left; - int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; - int bundle_first_task = pack_vars->bundle_first_task_list[bid]; - // const char *loop_type = "density"; - if(numBlocks_x > 0)cudaMemcpyAsync(&d_task_first_part_f4[first_task], + + cudaMemcpyAsync(&d_task_first_part_f4[first_task], &task_first_part_f4[first_task], (last_task + 1 - first_task) * sizeof(int2), cudaMemcpyHostToDevice, stream[bid]); - if(numBlocks_x > 0)cudaMemcpyAsync(&d_parts_send[first_part_tmp], &parts_send[first_part_tmp], + cudaMemcpyAsync(&d_parts_send[first_part_tmp], &parts_send[first_part_tmp], bundle_n_parts * sizeof(struct part_aos_f4_g_send), cudaMemcpyHostToDevice, stream[bid]); // fprintf(stderr, "bid %i first_part %i nparts %i\n", bid, @@ -1877,8 +1866,19 @@ void runner_doself1_launch_f4_g( exit(0); } #endif + const int tasksperbundle = pack_vars->tasksperbundle; + int tasks_left = tasksperbundle; + if (bid == nBundles_temp - 1) { + tasks_left = tasks_packed - (nBundles_temp - 1) * tasksperbundle; + } + // Will launch a 2d grid of GPU thread blocks (number of tasks is + // the y dimension and max_parts is the x dimension + int numBlocks_y = tasks_left; + int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + int bundle_first_task = pack_vars->bundle_first_task_list[bid]; + // const char *loop_type = "density"; // Launch the kernel - if(numBlocks_x > 0)launch_gradient_aos_f4(d_parts_send, d_parts_recv, d_a, d_H, stream[bid], + launch_gradient_aos_f4(d_parts_send, d_parts_recv, d_a, d_H, stream[bid], numBlocks_x, numBlocks_y, bundle_first_task, d_task_first_part_f4); #ifdef CUDA_DEBUG @@ -1891,10 +1891,10 @@ void runner_doself1_launch_f4_g( exit(0); } #endif - if(numBlocks_x > 0)cudaMemcpyAsync(&parts_recv[first_part_tmp], &d_parts_recv[first_part_tmp], + cudaMemcpyAsync(&parts_recv[first_part_tmp], &d_parts_recv[first_part_tmp], bundle_n_parts * sizeof(struct part_aos_f4_g_recv), cudaMemcpyDeviceToHost, stream[bid]); - if(numBlocks_x > 0)cudaEventRecord(self_end[bid], stream[bid]); + cudaEventRecord(self_end[bid], stream[bid]); #ifdef CUDA_DEBUG cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // @@ -2256,22 +2256,12 @@ void runner_doself1_launch_f4_f( const int first_part_tmp = pack_vars->bundle_first_part[bid]; const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp; - const int tasksperbundle = pack_vars->tasksperbundle; - int tasks_left = tasksperbundle; - if (bid == nBundles_temp - 1) { - tasks_left = tasks_packed - (nBundles_temp - 1) * tasksperbundle; - } - // Will launch a 2d grid of GPU thread blocks (number of tasks is - // the y dimension and max_parts is the x dimension - int numBlocks_y = tasks_left; - int numBlocks_x = 
(max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; - int bundle_first_task = pack_vars->bundle_first_task_list[bid]; - if(numBlocks_x > 0)cudaMemcpyAsync(&d_task_first_part_f4_f[first_task], + cudaMemcpyAsync(&d_task_first_part_f4_f[first_task], &task_first_part_f4_f[first_task], (last_task + 1 - first_task) * sizeof(int2), cudaMemcpyHostToDevice, stream[bid]); - if(numBlocks_x > 0)cudaMemcpyAsync(&d_parts_send[first_part_tmp], &parts_send[first_part_tmp], + cudaMemcpyAsync(&d_parts_send[first_part_tmp], &parts_send[first_part_tmp], bundle_n_parts * sizeof(struct part_aos_f4_f_send), cudaMemcpyHostToDevice, stream[bid]); @@ -2287,8 +2277,18 @@ void runner_doself1_launch_f4_f( exit(0); } #endif + const int tasksperbundle = pack_vars->tasksperbundle; + int tasks_left = tasksperbundle; + if (bid == nBundles_temp - 1) { + tasks_left = tasks_packed - (nBundles_temp - 1) * tasksperbundle; + } + // Will launch a 2d grid of GPU thread blocks (number of tasks is + // the y dimension and max_parts is the x dimension + int numBlocks_y = tasks_left; + int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + int bundle_first_task = pack_vars->bundle_first_task_list[bid]; // Launch the kernel - if(numBlocks_x > 0)launch_force_aos_f4(d_parts_send, d_parts_recv, d_a, d_H, stream[bid], + launch_force_aos_f4(d_parts_send, d_parts_recv, d_a, d_H, stream[bid], numBlocks_x, numBlocks_y, bundle_first_task, d_task_first_part_f4_f); #ifdef CUDA_DEBUG @@ -2300,10 +2300,10 @@ void runner_doself1_launch_f4_f( exit(0); } #endif - if(numBlocks_x > 0)cudaMemcpyAsync(&parts_recv[first_part_tmp], &d_parts_recv[first_part_tmp], + cudaMemcpyAsync(&parts_recv[first_part_tmp], &d_parts_recv[first_part_tmp], bundle_n_parts * sizeof(struct part_aos_f4_f_recv), cudaMemcpyDeviceToHost, stream[bid]); - if(numBlocks_x > 0)cudaEventRecord(self_end[bid], stream[bid]); + cudaEventRecord(self_end[bid], stream[bid]); #ifdef CUDA_DEBUG cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // @@ -2891,13 +2891,8 @@ void runner_dopair1_launch_f4_one_memcpy( const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp_i; - // Setup 2d grid of GPU thread blocks for ci (number of tasks is - // the y dimension and max_parts is the x dimension - int numBlocks_y = 0; // tasks_left; - int numBlocks_x = (bundle_n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; - int bundle_part_0 = pack_vars->bundle_first_part[bid]; - if(numBlocks_x > 0)cudaMemcpyAsync(&d_parts_send[first_part_tmp_i], + cudaMemcpyAsync(&d_parts_send[first_part_tmp_i], &parts_send[first_part_tmp_i], bundle_n_parts * sizeof(struct part_aos_f4_send), cudaMemcpyHostToDevice, stream[bid]); @@ -2916,8 +2911,13 @@ void runner_dopair1_launch_f4_one_memcpy( } #endif /* LAUNCH THE GPU KERNELS for ci & cj */ + // Setup 2d grid of GPU thread blocks for ci (number of tasks is + // the y dimension and max_parts is the x dimension + int numBlocks_y = 0; // tasks_left; + int numBlocks_x = (bundle_n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + int bundle_part_0 = pack_vars->bundle_first_part[bid]; /* Launch the kernel for ci using data for ci and cj */ - if(numBlocks_x > 0)runner_dopair_branch_density_gpu_aos_f4( + runner_dopair_branch_density_gpu_aos_f4( d_parts_send, d_parts_recv, d_a, d_H, stream[bid], numBlocks_x, numBlocks_y, bundle_part_0, bundle_n_parts); @@ -2935,11 +2935,11 @@ void runner_dopair1_launch_f4_one_memcpy( #endif // Copy results back to CPU BUFFERS - if(numBlocks_x > 
0)cudaMemcpyAsync(&parts_recv[first_part_tmp_i], + cudaMemcpyAsync(&parts_recv[first_part_tmp_i], &d_parts_recv[first_part_tmp_i], bundle_n_parts * sizeof(struct part_aos_f4_recv), cudaMemcpyDeviceToHost, stream[bid]); - if(numBlocks_x > 0)cudaEventRecord(pair_end[bid], stream[bid]); + cudaEventRecord(pair_end[bid], stream[bid]); #ifdef CUDA_DEBUG cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // @@ -3616,13 +3616,8 @@ void runner_dopair1_launch_f4_g_one_memcpy( const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp_i; - // Setup 2d grid of GPU thread blocks for ci (number of tasks is - // the y dimension and max_parts is the x dimension - int numBlocks_y = 0; // tasks_left; - int numBlocks_x = (bundle_n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; - int bundle_part_0 = pack_vars->bundle_first_part[bid]; - if(numBlocks_x > 0)cudaMemcpyAsync(&d_parts_send[first_part_tmp_i], + cudaMemcpyAsync(&d_parts_send[first_part_tmp_i], &parts_send[first_part_tmp_i], bundle_n_parts * sizeof(struct part_aos_f4_g_send), cudaMemcpyHostToDevice, stream[bid]); @@ -3642,11 +3637,16 @@ void runner_dopair1_launch_f4_g_one_memcpy( // const int tasksperbundle = pack_vars->tasksperbundle; /* LAUNCH THE GPU KERNELS for ci & cj */ + // Setup 2d grid of GPU thread blocks for ci (number of tasks is + // the y dimension and max_parts is the x dimension + int numBlocks_y = 0; // tasks_left; + int numBlocks_x = (bundle_n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + int bundle_part_0 = pack_vars->bundle_first_part[bid]; // fprintf(stderr, "bundle_part_0 %i bundle_first_task %i\n", // bundle_part_0, bundle_first_task); /* Launch the kernel for ci using data for ci and cj */ - if(numBlocks_x > 0)runner_dopair_branch_gradient_gpu_aos_f4( + runner_dopair_branch_gradient_gpu_aos_f4( d_parts_send, d_parts_recv, d_a, d_H, stream[bid], numBlocks_x, numBlocks_y, bundle_part_0, bundle_n_parts); @@ -3664,11 +3664,11 @@ void runner_dopair1_launch_f4_g_one_memcpy( #endif // Copy results back to CPU BUFFERS - if(numBlocks_x > 0)cudaMemcpyAsync(&parts_recv[first_part_tmp_i], + cudaMemcpyAsync(&parts_recv[first_part_tmp_i], &d_parts_recv[first_part_tmp_i], bundle_n_parts * sizeof(struct part_aos_f4_g_recv), cudaMemcpyDeviceToHost, stream[bid]); - if(numBlocks_x > 0)cudaEventRecord(pair_end[bid], stream[bid]); + cudaEventRecord(pair_end[bid], stream[bid]); #ifdef CUDA_DEBUG cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // @@ -4363,13 +4363,8 @@ void runner_dopair1_launch_f4_f_one_memcpy( const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp_i; - // Setup 2d grid of GPU thread blocks for ci (number of tasks is - // the y dimension and max_parts is the x dimension - int numBlocks_y = 0; // tasks_left; - int numBlocks_x = (bundle_n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; - int bundle_part_0 = pack_vars->bundle_first_part[bid]; - if(numBlocks_x > 0)cudaMemcpyAsync(&d_parts_send[first_part_tmp_i], + cudaMemcpyAsync(&d_parts_send[first_part_tmp_i], &parts_send[first_part_tmp_i], bundle_n_parts * sizeof(struct part_aos_f4_f_send), cudaMemcpyHostToDevice, stream[bid]); @@ -4397,13 +4392,17 @@ void runner_dopair1_launch_f4_f_one_memcpy( // tasks_packed - (nBundles_temp - 1) * tasksperbundle; // } - + // Setup 2d grid of GPU thread blocks for ci (number of tasks is + // the y dimension and max_parts is the x dimension + int numBlocks_y = 0; // tasks_left; + int 
numBlocks_x = (bundle_n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + int bundle_part_0 = pack_vars->bundle_first_part[bid]; // int bundle_first_task = pack_vars->bundle_first_task_list[bid]; // fprintf(stderr, "bundle_part_0 %i bundle_first_task %i\n", // bundle_part_0, bundle_first_task); /* Launch the kernel for ci using data for ci and cj */ - if(numBlocks_x > 0)runner_dopair_branch_force_gpu_aos_f4(d_parts_send, d_parts_recv, d_a, d_H, + runner_dopair_branch_force_gpu_aos_f4(d_parts_send, d_parts_recv, d_a, d_H, stream[bid], numBlocks_x, numBlocks_y, bundle_part_0, bundle_n_parts); @@ -4421,11 +4420,11 @@ void runner_dopair1_launch_f4_f_one_memcpy( #endif // Copy results back to CPU BUFFERS - if(numBlocks_x > 0)cudaMemcpyAsync(&parts_recv[first_part_tmp_i], + cudaMemcpyAsync(&parts_recv[first_part_tmp_i], &d_parts_recv[first_part_tmp_i], bundle_n_parts * sizeof(struct part_aos_f4_f_recv), cudaMemcpyDeviceToHost, stream[bid]); - if(numBlocks_x > 0)cudaEventRecord(pair_end[bid], stream[bid]); + cudaEventRecord(pair_end[bid], stream[bid]); #ifdef CUDA_DEBUG cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // From 6be7e2e46391e1af47fcdc90e0392edb37035ed6 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Fri, 31 Jan 2025 13:48:23 +0000 Subject: [PATCH 158/217] Skipping task activation for GPU tasks without particles --- src/cell_unskip.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/cell_unskip.c b/src/cell_unskip.c index 88889c28d3..53d29b6a18 100644 --- a/src/cell_unskip.c +++ b/src/cell_unskip.c @@ -1907,7 +1907,10 @@ int cell_unskip_hydro_tasks(struct cell *c, struct scheduler *s) { if (c->nodeID == nodeID && c_active && c->hydro.count > 0) { for (struct link *l = c->hydro.density_pack; l != NULL; l = l->next) { /* A. Nasar */ - scheduler_activate(s, l->t); + if(l->t->type == task_type_self && l->t->ci->hydro.count > 0) + scheduler_activate(s, l->t); + else if(l->t->type == task_type_pair && l->t->ci->hydro.count > 0 && l->t->cj->hydro.count > 0) + scheduler_activate(s, l->t); #ifdef SWIFT_DEBUG_CHECKS if (l->t->ci != NULL) { l->t->ci->pack_done = 0; @@ -1937,7 +1940,10 @@ int cell_unskip_hydro_tasks(struct cell *c, struct scheduler *s) { scheduler_activate(s, l->t); // A. Nasar activate force and gradient packing tasks for (struct link *l = c->hydro.force_pack; l != NULL; l = l->next) { - scheduler_activate(s, l->t); + if(l->t->type == task_type_self && l->t->ci->hydro.count > 0) + scheduler_activate(s, l->t); + else if(l->t->type == task_type_pair && l->t->ci->hydro.count > 0 && l->t->cj->hydro.count > 0) + scheduler_activate(s, l->t); #ifdef SWIFT_DEBUG_CHECKS if (l->t->ci != NULL) { l->t->ci->pack_done_f = 0; @@ -1960,7 +1966,10 @@ int cell_unskip_hydro_tasks(struct cell *c, struct scheduler *s) { #ifdef EXTRA_HYDRO_LOOP for (struct link *l = c->hydro.gradient_pack; l != NULL; l = l->next) { - scheduler_activate(s, l->t); + if(l->t->type == task_type_self && l->t->ci->hydro.count > 0) + scheduler_activate(s, l->t); + else if(l->t->type == task_type_pair && l->t->ci->hydro.count > 0 && l->t->cj->hydro.count > 0) + scheduler_activate(s, l->t); #ifdef SWIFT_DEBUG_CHECKS if (l->t->ci != NULL) { l->t->ci->pack_done_g = 0; From 53fc4797fd156a7036a0b488cde426f94170a17e Mon Sep 17 00:00:00 2001 From: Abouzied Date: Fri, 31 Jan 2025 13:55:16 +0000 Subject: [PATCH 159/217] Modified .yml files for gresho and islated galaxy. In isolated_galaxy.yml set the max time step to 1e-6 to prevent code crashing when there are only a few particle updates. 
Need to modify GPU code to work for this case. Also set count_max_parts_tmp in runner_main2 to 10 times target to prevent running out of bounds for unsplit cells --- examples/HydroTests/GreshoVortex_3D/gresho.yml | 8 ++++---- .../IsolatedGalaxy_feedback/isolated_galaxy.yml | 9 ++++++++- examples/IsolatedGalaxy/IsolatedGalaxy_feedback/run.sh | 2 +- src/runner_main_clean.cu | 2 +- 4 files changed, 14 insertions(+), 7 deletions(-) diff --git a/examples/HydroTests/GreshoVortex_3D/gresho.yml b/examples/HydroTests/GreshoVortex_3D/gresho.yml index b96f552291..463e616637 100644 --- a/examples/HydroTests/GreshoVortex_3D/gresho.yml +++ b/examples/HydroTests/GreshoVortex_3D/gresho.yml @@ -7,10 +7,10 @@ InternalUnitSystem: UnitTemp_in_cgs: 1 # Kelvin Scheduler: - max_top_level_cells: 8 + max_top_level_cells: 16 tasks_per_cell: 200 deadlock_waiting_time_s: 10 - cell_split_size: 80 + cell_split_size: 200 cell_sub_size_pair_hydro: 50 # (Optional) Maximal number of hydro-hydro interactions per sub-pair hydro/star task (this is the default value). cell_sub_size_self_hydro: 50 # (Optional) Maximal number of hydro-hydro interactions per sub-self hydro/star task. Set to how many cells are targeted for GPU tasks # Parameters governing the time integration @@ -25,7 +25,7 @@ Snapshots: basename: gresho # Common part of the name of output files time_first: 0. # Time of the first output (in internal units) delta_time: 1e-3 # Time difference between consecutive outputs (in internal units) - compression: 1 + # compression: 1 # Parameters governing the conserved quantities statistics Statistics: @@ -38,6 +38,6 @@ SPH: # Parameters related to the initial conditions InitialConditions: - file_name: ./greshoVortex.hdf5 # The file to read + file_name: greshoVortex_128.hdf5 periodic: 1 replicate: 2 diff --git a/examples/IsolatedGalaxy/IsolatedGalaxy_feedback/isolated_galaxy.yml b/examples/IsolatedGalaxy/IsolatedGalaxy_feedback/isolated_galaxy.yml index 8717af63bd..bcabd810dd 100644 --- a/examples/IsolatedGalaxy/IsolatedGalaxy_feedback/isolated_galaxy.yml +++ b/examples/IsolatedGalaxy/IsolatedGalaxy_feedback/isolated_galaxy.yml @@ -10,6 +10,13 @@ InternalUnitSystem: UnitCurrent_in_cgs: 1 # Amperes UnitTemp_in_cgs: 1 # Kelvin + + + + + + + # Parameters for the self-gravity scheme Gravity: eta: 0.025 # Constant dimensionless multiplier for time integration. @@ -24,7 +31,7 @@ TimeIntegration: time_begin: 0. # The starting time of the simulation (in internal units). time_end: 0.1 # The end time of the simulation (in internal units). dt_min: 1e-9 # The minimal time-step size of the simulation (in internal units). - dt_max: 1e-2 # The maximal time-step size of the simulation (in internal units). + dt_max: 1e-6 # The maximal time-step size of the simulation (in internal units). 
# Parameters governing the snapshots Snapshots: diff --git a/examples/IsolatedGalaxy/IsolatedGalaxy_feedback/run.sh b/examples/IsolatedGalaxy/IsolatedGalaxy_feedback/run.sh index 6931897b2c..6a2fa4d897 100755 --- a/examples/IsolatedGalaxy/IsolatedGalaxy_feedback/run.sh +++ b/examples/IsolatedGalaxy/IsolatedGalaxy_feedback/run.sh @@ -30,7 +30,7 @@ then ./getEaglePhotometryTable.sh fi -../../../swift --threads=16 --feedback --external-gravity --self-gravity --stars --star-formation --cooling --hydro --limiter --sync isolated_galaxy.yml 2>&1 | tee output.log +../../../swift_mpicuda --threads=16 --feedback --external-gravity --self-gravity --stars --star-formation --cooling --hydro --limiter --sync isolated_galaxy.yml 2>&1 | tee output.log # Kennicutt-Schmidt law plot python3 plotSolution.py 100 diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 50be174a90..e965b9afd7 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -673,7 +673,7 @@ void *runner_main2(void *data) { /*A. Nasar: Multiplication by 2 is also to ensure we do not over-run * the allocated memory on buffers and GPU. This can happen if calculated h * is larger than cell width and splitting makes bigger than target cells*/ - int count_max_parts_tmp = 2 * target_n_tasks * (np_per_cell + buff); + int count_max_parts_tmp = 20 * target_n_tasks * (np_per_cell + buff); // message("np per cell %i, max_parts %i, n_tasks_GPU %i\n", np_per_cell, // count_max_parts_tmp, target_n_tasks); From d7806bdae505aaf0066b3afe66caa88d1f10f84b Mon Sep 17 00:00:00 2001 From: Abouzied Date: Fri, 31 Jan 2025 14:30:30 +0000 Subject: [PATCH 160/217] Added code to prevent update of inactive particles during unpacking --- src/runner_gpu_pack_functions.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/runner_gpu_pack_functions.c b/src/runner_gpu_pack_functions.c index c51d503352..c69e4202a3 100644 --- a/src/runner_gpu_pack_functions.c +++ b/src/runner_gpu_pack_functions.c @@ -12,6 +12,7 @@ #include "scheduler.h" #include "space_getsid.h" #include "timers.h" +#include "runner_doiact_hydro.h" // #ifdef WITHCUDA // extern "C" { @@ -950,7 +951,7 @@ void unpack_neat_aos_f4(struct cell *c, float4 rho_dh_wcount = p_tmp.rho_dh_wcount; float4 rot_ux_div_v = p_tmp.rot_ux_div_v; struct part *p = &c->hydro.parts[i]; - + if(!PART_IS_ACTIVE(p, e))continue; p->rho += rho_dh_wcount.x; p->density.rho_dh += rho_dh_wcount.y; p->density.wcount += rho_dh_wcount.z; @@ -989,6 +990,7 @@ void unpack_neat_aos_f4_g(struct cell *c, for (int i = 0; i < count; i++) { struct part_aos_f4_g_recv p_tmp = parts_tmp[i]; struct part *p = &c->hydro.parts[i]; + if(!PART_IS_ACTIVE(p, e))continue; const float v_sig = p->viscosity.v_sig; p->viscosity.v_sig = fmaxf(p_tmp.vsig_lapu_aviscmax.x, v_sig); p->diffusion.laplace_u += p_tmp.vsig_lapu_aviscmax.y; @@ -1033,11 +1035,13 @@ void unpack_neat_aos_f4_f(struct cell *restrict c, for (int i = 0; i < count; i++) { // struct part_aos_f4_f_recv p_tmp = parts_tmp[i]; // struct part *restrict p = &c->hydro.parts[i]; + if(!PART_IS_ACTIVE(&c->hydro.parts[i], e))continue; c->hydro.parts[i].a_hydro[0] += parts_aos_buffer[i + pp].a_hydro.x; c->hydro.parts[i].a_hydro[1] += parts_aos_buffer[i + pp].a_hydro.y; c->hydro.parts[i].a_hydro[2] += parts_aos_buffer[i + pp].a_hydro.z; } for (int i = 0; i < count; i++) { + if(!PART_IS_ACTIVE(&c->hydro.parts[i], e))continue; c->hydro.parts[i].viscosity.v_sig = fmaxf(parts_aos_buffer[i + pp].udt_hdt_vsig_mintimebin_ngb.z, c->hydro.parts[i].viscosity.v_sig); @@ 
-1045,6 +1049,7 @@ void unpack_neat_aos_f4_f(struct cell *restrict c, (int)(parts_aos_buffer[i + pp].udt_hdt_vsig_mintimebin_ngb.w + 0.5f); } for (int i = 0; i < count; i++) { + if(!PART_IS_ACTIVE(&c->hydro.parts[i], e))continue; c->hydro.parts[i].u_dt += parts_aos_buffer[i + pp].udt_hdt_vsig_mintimebin_ngb.x; c->hydro.parts[i].force.h_dt += From 037c469c5cf8e6a14cba2091712520cbcc4f72a1 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Fri, 31 Jan 2025 15:20:02 +0000 Subject: [PATCH 161/217] Stopped the formation of star particles. Changed cell_unskip so that only gpu tasks are not activated if they have zero particles --- src/cell_unskip.c | 2 +- src/runner_others.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cell_unskip.c b/src/cell_unskip.c index 53d29b6a18..a9572ea3bc 100644 --- a/src/cell_unskip.c +++ b/src/cell_unskip.c @@ -1904,7 +1904,7 @@ int cell_unskip_hydro_tasks(struct cell *c, struct scheduler *s) { } /* Unskip all the other task types. */ int c_active = cell_is_active_hydro(c, e); - if (c->nodeID == nodeID && c_active && c->hydro.count > 0) { + if (c->nodeID == nodeID && c_active) { for (struct link *l = c->hydro.density_pack; l != NULL; l = l->next) { /* A. Nasar */ if(l->t->type == task_type_self && l->t->ci->hydro.count > 0) diff --git a/src/runner_others.c b/src/runner_others.c index cbace92a63..914b1f47a3 100644 --- a/src/runner_others.c +++ b/src/runner_others.c @@ -381,7 +381,7 @@ void runner_do_star_formation(struct runner *r, struct cell *c, int timer) { /* Loop over the gas particles in this cell. */ for (int k = 0; k < count; k++) { - + continue; //A. Nasar: Commented out to try without inhibited particles /* Get a handle on the part. */ struct part *restrict p = &parts[k]; struct xpart *restrict xp = &xparts[k]; From 243030b9614aa7c49b0dd04d76ef9d2816973f63 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Wed, 5 Feb 2025 10:56:46 +0000 Subject: [PATCH 162/217] Minor change to gresho.yml --- examples/HydroTests/GreshoVortex_3D/gresho.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/HydroTests/GreshoVortex_3D/gresho.yml b/examples/HydroTests/GreshoVortex_3D/gresho.yml index 463e616637..03f3ca3222 100644 --- a/examples/HydroTests/GreshoVortex_3D/gresho.yml +++ b/examples/HydroTests/GreshoVortex_3D/gresho.yml @@ -38,6 +38,6 @@ SPH: # Parameters related to the initial conditions InitialConditions: - file_name: greshoVortex_128.hdf5 + file_name: greshoVortex.hdf5 periodic: 1 - replicate: 2 + replicate: 4 From 00a53d6efeef6a66e0b57b493f9aaef0d1147118 Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Wed, 5 Feb 2025 11:00:28 +0000 Subject: [PATCH 163/217] Removed excessive allocation of GPU and Buffer arrays. Back to 2x instead of 20x --- src/runner_main_clean.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index e965b9afd7..50be174a90 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -673,7 +673,7 @@ void *runner_main2(void *data) { /*A. Nasar: Multiplication by 2 is also to ensure we do not over-run * the allocated memory on buffers and GPU. 
This can happen if calculated h * is larger than cell width and splitting makes bigger than target cells*/ - int count_max_parts_tmp = 20 * target_n_tasks * (np_per_cell + buff); + int count_max_parts_tmp = 2 * target_n_tasks * (np_per_cell + buff); // message("np per cell %i, max_parts %i, n_tasks_GPU %i\n", np_per_cell, // count_max_parts_tmp, target_n_tasks); From f083d6bc6f4815587c085ee76858aaf02a9a33da Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Wed, 5 Feb 2025 13:01:14 +0000 Subject: [PATCH 164/217] Reverted space_splitsize_default to 400 --- src/space.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/space.h b/src/space.h index 870ec7aae1..a5358c913c 100644 --- a/src/space.h +++ b/src/space.h @@ -46,7 +46,7 @@ struct hydro_props; /* Some constants. */ #define space_cellallocchunk 1000 -#define space_splitsize_default 100 +#define space_splitsize_default 400 #define space_maxsize_default 8000000 #define space_grid_split_threshold_default 100 #define space_extra_parts_default 0 From 1871a08e37a9e2b2f2d3cf0f7e30bfa82d29b8e2 Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Wed, 5 Feb 2025 13:23:24 +0000 Subject: [PATCH 165/217] Small changes: Increased count_max_parts_tmp to 8x for now. Edited debug message --- src/runner_main_clean.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 50be174a90..c1e0bc1626 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -673,7 +673,7 @@ void *runner_main2(void *data) { /*A. Nasar: Multiplication by 2 is also to ensure we do not over-run * the allocated memory on buffers and GPU. This can happen if calculated h * is larger than cell width and splitting makes bigger than target cells*/ - int count_max_parts_tmp = 2 * target_n_tasks * (np_per_cell + buff); + int count_max_parts_tmp = 8 * target_n_tasks * (np_per_cell + buff); // message("np per cell %i, max_parts %i, n_tasks_GPU %i\n", np_per_cell, // count_max_parts_tmp, target_n_tasks); @@ -1967,8 +1967,8 @@ void *runner_main2(void *data) { } } /* main loop. */ - message("cpu %i packed %i cells with %i containing more parts than target", - r->cpuid, n_cells, n_w_prts_gtr_target); + message("cpu %i packed %i cells with %i containing more parts than target of %i", + r->cpuid, n_cells, n_w_prts_gtr_target, np_per_cell); // message("Worked on %i supers w more than 100 parts", g100); // Stuff for writing debug data to file for validation From 8ab4d45c74a5685f2cc62cd41e47124c7df8ee14 Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Wed, 5 Feb 2025 15:42:46 +0000 Subject: [PATCH 166/217] Nearly done with fixing splitting. Apparently I only fixed it for density tasks in previous commit. I am now extending the revised splitting strategy to gradient and force subtypes --- src/engine_maketasks.c | 131 ++++++++++++++++++++++++++--------------- 1 file changed, 83 insertions(+), 48 deletions(-) diff --git a/src/engine_maketasks.c b/src/engine_maketasks.c index b774d29912..ebb48f8150 100644 --- a/src/engine_maketasks.c +++ b/src/engine_maketasks.c @@ -2847,6 +2847,46 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, if (ci->hydro.super != cj->hydro.super) { scheduler_addunlock(sched, cj->hydro.super->hydro.sorts, t); } +#ifdef EXTRA_HYDRO_LOOP + /* Start by constructing the task for the second and third GPU hydro loop + * A. 
Nasar */ + t_gradient_gpu = scheduler_addtask(sched, task_type_pair, + task_subtype_gpu_pack_g, 0, 0, ci, cj); + // /* Add the link between the new loop and both cells */ + engine_addlink(e, &ci->hydro.gradient_pack, t_gradient_gpu); + engine_addlink(e, &cj->hydro.gradient_pack, t_gradient_gpu); + + /* Now, build all the dependencies for the hydro for the cells */ + /* that are local and are not descendant of the same super_hydro-cells */ + if (ci->nodeID == nodeID) { + /*Same for GPU tasks*/ + scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, + t_gradient_gpu); + scheduler_addunlock(sched, ci->hydro.super->hydro.extra_ghost, + t_force_gpu); + } + if ((cj->nodeID == nodeID) && (ci->hydro.super != cj->hydro.super)) { + /*Same for GPU tasks*/ + scheduler_addunlock(sched, cj->hydro.super->hydro.ghost_out, + t_gradient_gpu); + scheduler_addunlock(sched, cj->hydro.super->hydro.extra_ghost, + t_force_gpu); + } +#else + /* Now, build all the dependencies for the hydro for the cells */ + /* that are local and are not descendant of the same super_hydro-cells */ + if (ci->nodeID == nodeID) { + // GPU tasks A. Nasar + scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, + t_force_gpu); + } + if ((cj->nodeID == nodeID) && (ci->hydro.super != cj->hydro.super)) { + // GPU tasks A. Nasar + scheduler_addunlock(sched, cj->hydro.super->hydro.ghost_out, + t_force_gpu); + } +#endif + } /* Otherwise, pair interaction? */ @@ -3031,17 +3071,10 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, /* Start by constructing the task for the second and third hydro loop */ t_gradient = scheduler_addtask(sched, task_type_pair, task_subtype_gradient, flags, 0, ci, cj); - /* Start by constructing the task for the second and third GPU hydro loop - * A. Nasar */ - t_gradient_gpu = scheduler_addtask(sched, task_type_pair, - task_subtype_gpu_pack_g, 0, 0, ci, cj); /* Add the link between the new loop and both cells */ engine_addlink(e, &ci->hydro.gradient, t_gradient); engine_addlink(e, &cj->hydro.gradient, t_gradient); - // /* Add the link between the new loop and both cells */ - engine_addlink(e, &ci->hydro.gradient_pack, t_gradient_gpu); - engine_addlink(e, &cj->hydro.gradient_pack, t_gradient_gpu); /* Now, build all the dependencies for the hydro for the cells */ /* that are local and are not descendant of the same super_hydro-cells */ @@ -3049,21 +3082,11 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, engine_make_hydro_loops_dependencies(sched, t, t_gradient, t_force, t_limiter, ci, with_cooling, with_timestep_limiter); - /*Same for GPU tasks*/ - scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, - t_gradient_gpu); - scheduler_addunlock(sched, ci->hydro.super->hydro.extra_ghost, - t_force_gpu); } if ((cj->nodeID == nodeID) && (ci->hydro.super != cj->hydro.super)) { engine_make_hydro_loops_dependencies(sched, t, t_gradient, t_force, t_limiter, cj, with_cooling, with_timestep_limiter); - /*Same for GPU tasks*/ - scheduler_addunlock(sched, cj->hydro.super->hydro.ghost_out, - t_gradient_gpu); - scheduler_addunlock(sched, cj->hydro.super->hydro.extra_ghost, - t_force_gpu); } #else @@ -3073,17 +3096,11 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, engine_make_hydro_loops_dependencies(sched, t, t_force, t_limiter, ci, with_cooling, with_timestep_limiter); - // GPU tasks A. 
Nasar - scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, - t_force_gpu); } if ((cj->nodeID == nodeID) && (ci->hydro.super != cj->hydro.super)) { engine_make_hydro_loops_dependencies(sched, t, t_force, t_limiter, cj, with_cooling, with_timestep_limiter); - // GPU tasks A. Nasar - scheduler_addunlock(sched, cj->hydro.super->hydro.ghost_out, - t_force_gpu); } #endif @@ -3723,12 +3740,54 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, if ((cj->nodeID == nodeID) && (ci->hydro.super != cj->hydro.super)) { scheduler_addunlock(sched, cj->hydro.super->hydro.drift, t); } - /* Make all density tasks depend on the sorts */ scheduler_addunlock(sched, ci->hydro.super->hydro.sorts, t); if (ci->hydro.super != cj->hydro.super) { scheduler_addunlock(sched, cj->hydro.super->hydro.sorts, t); } + t_force_gpu = scheduler_addtask( + sched, task_type_sub_pair, task_subtype_gpu_pack_f, flags, 0, ci, cj); +#ifdef MPI_SYMMETRIC_FORCE_INTERACTION + /* Make all force tasks depend on the sorts */ + scheduler_addunlock(sched, ci->hydro.super->hydro.sorts, t_force_gpu); + if (ci->hydro.super != cj->hydro.super) { + scheduler_addunlock(sched, cj->hydro.super->hydro.sorts, t_force_gpu); + } +#endif + engine_addlink(e, &ci->hydro.force_pack, t_force_gpu); + engine_addlink(e, &cj->hydro.force_pack, t_force_gpu); +#ifdef EXTRA_HYDRO_LOOP + t_gradient_gpu = scheduler_addtask( + sched, task_type_sub_pair, task_subtype_gpu_pack_g, flags, 0, ci, cj); + engine_addlink(e, &ci->hydro.gradient_pack, t_gradient_gpu); + engine_addlink(e, &cj->hydro.gradient_pack, t_gradient_gpu); + /* Now, build all the dependencies for the hydro for the cells */ + /* that are local and are not descendant of the same super_hydro-cells */ + if (ci->nodeID == nodeID) { + scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, + t_gradient_gpu); + scheduler_addunlock(sched, ci->hydro.super->hydro.extra_ghost, + t_force_gpu); + } + if ((cj->nodeID == nodeID) && (ci->hydro.super != cj->hydro.super)) { + scheduler_addunlock(sched, cj->hydro.super->hydro.ghost_out, + t_gradient_gpu); + scheduler_addunlock(sched, cj->hydro.super->hydro.extra_ghost, + t_force_gpu); + } +#else + /* Now, build all the dependencies for the hydro for the cells */ + /* that are local and are not descendant of the same super_hydro-cells */ + if (ci->nodeID == nodeID) { + scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, + t_force_gpu); + } + if ((cj->nodeID == nodeID) && (ci->hydro.super != cj->hydro.super)) { + scheduler_addunlock(sched, cj->hydro.super->hydro.ghost_out, + t_force_gpu); + } +#endif + } else if (t_type == task_type_sub_pair && t_subtype == task_subtype_density) { @@ -3752,8 +3811,6 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, /* New task for the force */ t_force = scheduler_addtask(sched, task_type_sub_pair, task_subtype_force, flags, 0, ci, cj); - t_force_gpu = scheduler_addtask( - sched, task_type_sub_pair, task_subtype_gpu_pack_f, flags, 0, ci, cj); #ifdef MPI_SYMMETRIC_FORCE_INTERACTION /* The order of operations for an inactive local cell interacting @@ -3763,10 +3820,8 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, /* Make all force tasks depend on the sorts */ scheduler_addunlock(sched, ci->hydro.super->hydro.sorts, t_force); - scheduler_addunlock(sched, ci->hydro.super->hydro.sorts, t_force_gpu); if (ci->hydro.super != cj->hydro.super) { scheduler_addunlock(sched, cj->hydro.super->hydro.sorts, t_force); - 
scheduler_addunlock(sched, cj->hydro.super->hydro.sorts, t_force_gpu); } #endif @@ -3864,8 +3919,6 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, engine_addlink(e, &ci->hydro.force, t_force); engine_addlink(e, &cj->hydro.force, t_force); - engine_addlink(e, &ci->hydro.force, t_force_gpu); - engine_addlink(e, &cj->hydro.force, t_force_gpu); if (with_timestep_limiter) { engine_addlink(e, &ci->hydro.limiter, t_limiter); engine_addlink(e, &cj->hydro.limiter, t_limiter); @@ -3919,34 +3972,20 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, /* Start by constructing the task for the second and third hydro loop */ t_gradient = scheduler_addtask(sched, task_type_sub_pair, task_subtype_gradient, flags, 0, ci, cj); - t_gradient_gpu = scheduler_addtask( - sched, task_type_sub_pair, task_subtype_gpu_pack_g, flags, 0, ci, cj); - /* Add the link between the new loop and both cells */ engine_addlink(e, &ci->hydro.gradient, t_gradient); engine_addlink(e, &cj->hydro.gradient, t_gradient); - engine_addlink(e, &ci->hydro.gradient, t_gradient_gpu); - engine_addlink(e, &cj->hydro.gradient, t_gradient_gpu); - /* Now, build all the dependencies for the hydro for the cells */ /* that are local and are not descendant of the same super_hydro-cells */ if (ci->nodeID == nodeID) { engine_make_hydro_loops_dependencies(sched, t, t_gradient, t_force, t_limiter, ci, with_cooling, with_timestep_limiter); - scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, - t_gradient_gpu); - scheduler_addunlock(sched, ci->hydro.super->hydro.extra_ghost, - t_force_gpu); } if ((cj->nodeID == nodeID) && (ci->hydro.super != cj->hydro.super)) { engine_make_hydro_loops_dependencies(sched, t, t_gradient, t_force, t_limiter, cj, with_cooling, with_timestep_limiter); - scheduler_addunlock(sched, cj->hydro.super->hydro.ghost_out, - t_gradient_gpu); - scheduler_addunlock(sched, cj->hydro.super->hydro.extra_ghost, - t_force_gpu); } #else @@ -3956,15 +3995,11 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, engine_make_hydro_loops_dependencies(sched, t, t_force, t_limiter, ci, with_cooling, with_timestep_limiter); - scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, - t_force_gpu); } if ((cj->nodeID == nodeID) && (ci->hydro.super != cj->hydro.super)) { engine_make_hydro_loops_dependencies(sched, t, t_force, t_limiter, cj, with_cooling, with_timestep_limiter); - scheduler_addunlock(sched, cj->hydro.super->hydro.ghost_out, - t_force_gpu); } #endif From 0901a733eec4e8dff588ffc3a61d7488f8054ff6 Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Wed, 5 Feb 2025 16:35:22 +0000 Subject: [PATCH 167/217] Implemented splitting correctly. Code gives correct results and dependency graphs are looking good. 
Split domain with 8^3 cells into 16^3 cells with no issues --- .../HydroTests/GreshoVortex_3D/gresho.yml | 3 +- src/engine_maketasks.c | 114 +++++---- src/scheduler.c | 233 +++++++++++++++++- 3 files changed, 303 insertions(+), 47 deletions(-) diff --git a/examples/HydroTests/GreshoVortex_3D/gresho.yml b/examples/HydroTests/GreshoVortex_3D/gresho.yml index 03f3ca3222..1ec79dff52 100644 --- a/examples/HydroTests/GreshoVortex_3D/gresho.yml +++ b/examples/HydroTests/GreshoVortex_3D/gresho.yml @@ -7,7 +7,7 @@ InternalUnitSystem: UnitTemp_in_cgs: 1 # Kelvin Scheduler: - max_top_level_cells: 16 + max_top_level_cells: 8 tasks_per_cell: 200 deadlock_waiting_time_s: 10 cell_split_size: 200 @@ -40,4 +40,3 @@ SPH: InitialConditions: file_name: greshoVortex.hdf5 periodic: 1 - replicate: 4 diff --git a/src/engine_maketasks.c b/src/engine_maketasks.c index ebb48f8150..a0ff23b2be 100644 --- a/src/engine_maketasks.c +++ b/src/engine_maketasks.c @@ -2552,6 +2552,28 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, /*Make packing depend on sorts and drift A. Nasar */ else if (t_type == task_type_self && t_subtype == task_subtype_gpu_pack_d) { scheduler_addunlock(sched, ci->hydro.super->hydro.drift, t); + /* Task for the second GPU hydro loop A. Nasar */ + t_force_gpu = scheduler_addtask(sched, task_type_self, + task_subtype_gpu_pack_f, 0, 0, ci, NULL); + /* Link the tasks to the cells. Do the same for GPU tasks A. Nasar */ + engine_addlink(e, &ci->hydro.force_pack, t_force_gpu); +#ifdef EXTRA_HYDRO_LOOP + /* Same work for the additional GPU hydro loop A. Nasar */ + t_gradient_gpu = scheduler_addtask( + sched, task_type_self, task_subtype_gpu_pack_g, 0, 0, ci, NULL); + /* Add the link between the new loops and the cell. Same for GPU task A. + * Nasar */ + engine_addlink(e, &ci->hydro.gradient_pack, t_gradient_gpu); + // A. Nasar add unlocks for pack tasks here. Unpacks depend on packs and + // will be used to create downstream deps later + scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, + t_gradient_gpu); + scheduler_addunlock(sched, ci->hydro.super->hydro.extra_ghost, + t_force_gpu); +#else + /* Now, build all the dependencies for the hydro */ + scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, t_force_gpu); +#endif } /* Sort tasks depend on the drift of the cell (stars version). */ @@ -2570,9 +2592,6 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, /* Task for the second hydro loop, */ t_force = scheduler_addtask(sched, task_type_self, task_subtype_force, flags, 0, ci, NULL); - /* Task for the second GPU hydro loop A. Nasar */ - t_force_gpu = scheduler_addtask(sched, task_type_self, - task_subtype_gpu_pack_f, 0, 0, ci, NULL); /* the task for the time-step limiter */ if (with_timestep_limiter) { @@ -2638,9 +2657,8 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, flags, 0, ci, NULL); } - /* Link the tasks to the cells. Do the same for GPU tasks A. Nasar */ + /* Link the tasks to the cells */ engine_addlink(e, &ci->hydro.force, t_force); - engine_addlink(e, &ci->hydro.force_pack, t_force_gpu); if (with_timestep_limiter) { engine_addlink(e, &ci->hydro.limiter, t_limiter); @@ -2675,32 +2693,18 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, /* Same work for the additional hydro loop */ t_gradient = scheduler_addtask(sched, task_type_self, task_subtype_gradient, flags, 0, ci, NULL); - /* Same work for the additional GPU hydro loop A. 
Nasar */ - t_gradient_gpu = scheduler_addtask( - sched, task_type_self, task_subtype_gpu_pack_g, 0, 0, ci, NULL); - /* Add the link between the new loops and the cell. Same for GPU task A. * Nasar */ engine_addlink(e, &ci->hydro.gradient, t_gradient); - engine_addlink(e, &ci->hydro.gradient_pack, t_gradient_gpu); - /* Now, build all the dependencies for the hydro */ engine_make_hydro_loops_dependencies(sched, t, t_gradient, t_force, t_limiter, ci, with_cooling, with_timestep_limiter); - - // A. Nasar add unlocks for pack tasks here. Unpacks depend on packs and - // will be used to create downstream deps later - scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, - t_gradient_gpu); - scheduler_addunlock(sched, ci->hydro.super->hydro.extra_ghost, - t_force_gpu); #else /* Now, build all the dependencies for the hydro */ engine_make_hydro_loops_dependencies(sched, t, t_force, t_limiter, ci, with_cooling, with_timestep_limiter); - scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, t_force_gpu); #endif /* Create the task dependencies */ @@ -2847,6 +2851,24 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, if (ci->hydro.super != cj->hydro.super) { scheduler_addunlock(sched, cj->hydro.super->hydro.sorts, t); } + /* New task for the force A. Nasar */ + t_force_gpu = scheduler_addtask(sched, task_type_pair, + task_subtype_gpu_pack_f, 0, 0, ci, cj); +#ifdef MPI_SYMMETRIC_FORCE_INTERACTION + /* The order of operations for an inactive local cell interacting + * with an active foreign cell is not guaranteed because the density + * (and gradient) iact loops don't exist in that case. So we need + * an explicit dependency here to have sorted cells. */ + + /* Make GPU force tasks depend on the sorts A. Nasar */ + scheduler_addunlock(sched, ci->hydro.super->hydro.sorts, t_force_gpu); + if (ci->hydro.super != cj->hydro.super) { + scheduler_addunlock(sched, cj->hydro.super->hydro.sorts, t_force_gpu); + } +#endif + /* Do teh same for GPU tasks A. Nasar*/ + engine_addlink(e, &ci->hydro.force_pack, t_force_gpu); + engine_addlink(e, &cj->hydro.force_pack, t_force_gpu); #ifdef EXTRA_HYDRO_LOOP /* Start by constructing the task for the second and third GPU hydro loop * A. Nasar */ @@ -2912,9 +2934,6 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, /* New task for the force */ t_force = scheduler_addtask(sched, task_type_pair, task_subtype_force, flags, 0, ci, cj); - /* New task for the force A. Nasar */ - t_force_gpu = scheduler_addtask(sched, task_type_pair, - task_subtype_gpu_pack_f, 0, 0, ci, cj); #ifdef MPI_SYMMETRIC_FORCE_INTERACTION /* The order of operations for an inactive local cell interacting @@ -2924,11 +2943,8 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, /* Make all force tasks depend on the sorts */ scheduler_addunlock(sched, ci->hydro.super->hydro.sorts, t_force); - /* Make GPU force tasks depend on the sorts A. Nasar */ - scheduler_addunlock(sched, ci->hydro.super->hydro.sorts, t_force_gpu); if (ci->hydro.super != cj->hydro.super) { scheduler_addunlock(sched, cj->hydro.super->hydro.sorts, t_force); - scheduler_addunlock(sched, cj->hydro.super->hydro.sorts, t_force_gpu); } #endif @@ -3017,9 +3033,7 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, engine_addlink(e, &ci->hydro.force, t_force); engine_addlink(e, &cj->hydro.force, t_force); - /* Do teh same for GPU tasks A. 
Nasar*/ - engine_addlink(e, &ci->hydro.force_pack, t_force_gpu); - engine_addlink(e, &cj->hydro.force_pack, t_force_gpu); + if (with_timestep_limiter) { engine_addlink(e, &ci->hydro.limiter, t_limiter); engine_addlink(e, &cj->hydro.limiter, t_limiter); @@ -3444,8 +3458,35 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, /*Make packing depend on sorts and drift A. Nasar */ else if (t_type == task_type_sub_self && t_subtype == task_subtype_gpu_pack_d) { + scheduler_addunlock(sched, ci->hydro.super->hydro.drift, t); - // scheduler_addunlock(sched, ci->hydro.super->hydro.sorts, t); + scheduler_addunlock(sched, ci->hydro.super->hydro.sorts, t); + /* Start by constructing the task for the second hydro loop */ + t_force_gpu = + scheduler_addtask(sched, task_type_sub_self, task_subtype_gpu_pack_f, + flags, 0, ci, NULL); + /* Add the link between the new loop and the cell */ + engine_addlink(e, &ci->hydro.force_pack, t_force_gpu); +#ifdef EXTRA_HYDRO_LOOP + + /* Start by constructing the task for the second and third hydro loop */ + t_gradient_gpu = + scheduler_addtask(sched, task_type_sub_self, task_subtype_gpu_pack_g, + flags, 0, ci, NULL); + /* Add the link between the new loop and the cell */ + engine_addlink(e, &ci->hydro.gradient_pack, t_gradient_gpu); + /* Now, build all the dependencies for the hydro for the cells */ + /* that are local and are not descendant of the same super_hydro-cells */ + scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, + t_gradient_gpu); + scheduler_addunlock(sched, ci->hydro.super->hydro.extra_ghost, + t_force_gpu); +#else + + /* Now, build all the dependencies for the hydro for the cells */ + /* that are local and are not descendant of the same super_hydro-cells */ + scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, t_force_gpu); +#endif } /* Otherwise, sub-self interaction? 
*/ else if (t_type == task_type_sub_self && @@ -3460,9 +3501,6 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, /* Start by constructing the task for the second hydro loop */ t_force = scheduler_addtask(sched, task_type_sub_self, task_subtype_force, flags, 0, ci, NULL); - t_force_gpu = - scheduler_addtask(sched, task_type_sub_self, task_subtype_gpu_pack_f, - flags, 0, ci, NULL); /* and the task for the time-step limiter */ if (with_timestep_limiter) { @@ -3535,7 +3573,6 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, /* Add the link between the new loop and the cell */ engine_addlink(e, &ci->hydro.force, t_force); - engine_addlink(e, &ci->hydro.force_pack, t_force_gpu); if (with_timestep_limiter) { engine_addlink(e, &ci->hydro.limiter, t_limiter); @@ -3570,30 +3607,19 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, /* Start by constructing the task for the second and third hydro loop */ t_gradient = scheduler_addtask(sched, task_type_sub_self, task_subtype_gradient, flags, 0, ci, NULL); - t_gradient_gpu = - scheduler_addtask(sched, task_type_sub_self, task_subtype_gpu_pack_g, - flags, 0, ci, NULL); - /* Add the link between the new loop and the cell */ engine_addlink(e, &ci->hydro.gradient, t_gradient); - engine_addlink(e, &ci->hydro.gradient_pack, t_gradient_gpu); - /* Now, build all the dependencies for the hydro for the cells */ /* that are local and are not descendant of the same super_hydro-cells */ engine_make_hydro_loops_dependencies(sched, t, t_gradient, t_force, t_limiter, ci, with_cooling, with_timestep_limiter); - scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, - t_gradient_gpu); - scheduler_addunlock(sched, ci->hydro.super->hydro.extra_ghost, - t_force_gpu); #else /* Now, build all the dependencies for the hydro for the cells */ /* that are local and are not descendant of the same super_hydro-cells */ engine_make_hydro_loops_dependencies(sched, t, t_force, t_limiter, ci, with_cooling, with_timestep_limiter); - scheduler_addunlock(sched, ci->hydro.super->hydro.ghost_out, t_force_gpu); #endif /* Create the task dependencies */ diff --git a/src/scheduler.c b/src/scheduler.c index 1f52c147e1..1ed27abc95 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -1173,6 +1173,237 @@ static void scheduler_splittask_hydro(struct task *t, struct scheduler *s) { const int with_black_holes = (s->space->e->policy & engine_policy_black_holes); + /* Iterate on this task until we're done with it. */ + int redo = 1; + while (redo) { + /* Reset the redo flag. */ + redo = 0; + + /* Is this a non-empty self-task? */ + const int is_self = + (t->type == task_type_self) && (t->ci != NULL) && + ((t->ci->hydro.count > 0) || (with_stars && t->ci->stars.count > 0) || + (with_sinks && t->ci->sinks.count > 0) || + (with_black_holes && t->ci->black_holes.count > 0)); + + /* Is this a non-empty pair-task? */ + const int is_pair = (t->type == task_type_pair) && (t->ci != NULL) && + (t->cj != NULL) && + ((t->ci->hydro.count > 0) || + (with_feedback && t->ci->stars.count > 0) || + (with_sinks && t->ci->sinks.count > 0) || + (with_black_holes && t->ci->black_holes.count > 0)) && + ((t->cj->hydro.count > 0) || + (with_feedback && t->cj->stars.count > 0) || + (with_sinks && t->cj->sinks.count > 0) || + (with_black_holes && t->cj->black_holes.count > 0)); + + /* Empty task? 
*/ + if (!is_self && !is_pair) { + t->type = task_type_none; + t->subtype = task_subtype_none; + t->ci = NULL; + t->cj = NULL; + t->skip = 1; + break; + } + + /* Self-interaction? */ + if (t->type == task_type_self) { + /* Get a handle on the cell involved. */ + struct cell *ci = t->ci; + + /* Foreign task? */ + if (ci->nodeID != s->nodeID) { + t->skip = 1; + break; + } + + /* Is this cell even split and the task does not violate h ? */ + if (cell_can_split_self_hydro_task(ci)) { + /* Make a sub? */ + if (scheduler_dosub && (ci->hydro.count < space_subsize_self_hydro_default) && + (ci->stars.count < space_subsize_self_stars)) { + /* convert to a self-subtask. */ + t->type = task_type_sub_self; + + /* Otherwise, make tasks explicitly. */ + } else { + /* Take a step back (we're going to recycle the current task)... */ + redo = 1; + + /* Add the self tasks. */ + int first_child = 0; + while (ci->progeny[first_child] == NULL) first_child++; + + t->ci = ci->progeny[first_child]; + cell_set_flag(t->ci, cell_flag_has_tasks); + + for (int k = first_child + 1; k < 8; k++) { + /* Do we have a non-empty progenitor? */ + if (ci->progeny[k] != NULL && + (ci->progeny[k]->hydro.count || + (with_stars && ci->progeny[k]->stars.count))) { + scheduler_splittask_hydro( + scheduler_addtask(s, task_type_self, t->subtype, 0, 0, + ci->progeny[k], NULL), + s); + } + } + + /* Make a task for each pair of progeny */ + for (int j = 0; j < 8; j++) { + /* Do we have a non-empty progenitor? */ + if (ci->progeny[j] != NULL && + (ci->progeny[j]->hydro.count || + (with_feedback && ci->progeny[j]->stars.count))) { + for (int k = j + 1; k < 8; k++) { + /* Do we have a second non-empty progenitor? */ + if (ci->progeny[k] != NULL && + (ci->progeny[k]->hydro.count || + (with_feedback && ci->progeny[k]->stars.count))) { + scheduler_splittask_hydro( + scheduler_addtask(s, task_type_pair, t->subtype, + sub_sid_flag[j][k], 0, ci->progeny[j], + ci->progeny[k]), + s); + } + } + } + } + } + + } /* Cell is split */ + + } /* Self interaction */ + + /* Pair interaction? */ + else if (t->type == task_type_pair) { + /* Get a handle on the cells involved. */ + struct cell *ci = t->ci; + struct cell *cj = t->cj; + + /* Foreign task? */ + if (ci->nodeID != s->nodeID && cj->nodeID != s->nodeID) { + t->skip = 1; + break; + } + + /* Get the sort ID, use space_getsid_and_swap_cells and not t->flags + to make sure we get ci and cj swapped if needed. */ + double shift[3]; + const int sid = space_getsid_and_swap_cells(s->space, &ci, &cj, shift); + +#ifdef SWIFT_DEBUG_CHECKS + if (sid != t->flags) + error("Got pair task with incorrect flags: sid=%d flags=%lld", sid, + t->flags); +#endif + + /* Should this task be split-up? */ + if (cell_can_split_pair_hydro_task(ci) && + cell_can_split_pair_hydro_task(cj)) { + + const int h_count_i = ci->hydro.count; + const int h_count_j = cj->hydro.count; + + const int s_count_i = ci->stars.count; + const int s_count_j = cj->stars.count; + + int do_sub_hydro = 1; + int do_sub_stars_i = 1; + int do_sub_stars_j = 1; + if (h_count_i > 0 && h_count_j > 0) { + + /* Note: Use division to avoid integer overflow. */ + do_sub_hydro = + h_count_i * sid_scale[sid] < space_subsize_pair_hydro_default / h_count_j; + } + if (s_count_i > 0 && h_count_j > 0) { + + /* Note: Use division to avoid integer overflow. */ + do_sub_stars_i = + s_count_i * sid_scale[sid] < space_subsize_pair_stars / h_count_j; + } + if (s_count_j > 0 && h_count_i > 0) { + + /* Note: Use division to avoid integer overflow. 
*/ + do_sub_stars_j = + s_count_j * sid_scale[sid] < space_subsize_pair_stars / h_count_i; + } + + /* Replace by a single sub-task? */ + if (scheduler_dosub && + (do_sub_hydro && do_sub_stars_i && do_sub_stars_j) && + !sort_is_corner(sid)) { + + /* Make this task a sub task. */ + t->type = task_type_sub_pair; + + /* Otherwise, split it. */ + } else { + /* Take a step back (we're going to recycle the current task)... */ + redo = 1; + + /* Loop over the sub-cell pairs for the current sid and add new tasks + * for them. */ + struct cell_split_pair *csp = &cell_split_pairs[sid]; + + t->ci = ci->progeny[csp->pairs[0].pid]; + t->cj = cj->progeny[csp->pairs[0].pjd]; + if (t->ci != NULL) cell_set_flag(t->ci, cell_flag_has_tasks); + if (t->cj != NULL) cell_set_flag(t->cj, cell_flag_has_tasks); + + t->flags = csp->pairs[0].sid; + for (int k = 1; k < csp->count; k++) { + scheduler_splittask_hydro( + scheduler_addtask(s, task_type_pair, t->subtype, + csp->pairs[k].sid, 0, + ci->progeny[csp->pairs[k].pid], + cj->progeny[csp->pairs[k].pjd]), + s); + } + } + + /* Otherwise, break it up if it is too large? */ + } else if (scheduler_doforcesplit && ci->split && cj->split && + (ci->hydro.count > space_maxsize / cj->hydro.count)) { + + /* Replace the current task. */ + t->type = task_type_none; + + for (int j = 0; j < 8; j++) + if (ci->progeny[j] != NULL && ci->progeny[j]->hydro.count) + for (int k = 0; k < 8; k++) + if (cj->progeny[k] != NULL && cj->progeny[k]->hydro.count) { + struct task *tl = + scheduler_addtask(s, task_type_pair, t->subtype, 0, 0, + ci->progeny[j], cj->progeny[k]); + scheduler_splittask_hydro(tl, s); + tl->flags = space_getsid_and_swap_cells(s->space, &t->ci, + &t->cj, shift); + } + } + } /* pair interaction? */ + } /* iterate over the current task. */ +} + +/** + * @brief Split a hydrodynamic task if too large. + * + * @param t The #task + * @param s The #scheduler we are working in. + */ +static void scheduler_splittask_hydro_GPU(struct task *t, struct scheduler *s) { + /* Are we considering both stars and hydro when splitting? */ + /* Note this is not very clean as the scheduler should not really + access the engine... */ + const int with_feedback = (s->space->e->policy & engine_policy_feedback); + const int with_stars = (s->space->e->policy & engine_policy_stars); + const int with_sinks = (s->space->e->policy & engine_policy_sinks); + const int with_black_holes = + (s->space->e->policy & engine_policy_black_holes); + /* Iterate on this task until we're done with it. */ int redo = 1; while (redo) { @@ -1659,7 +1890,7 @@ void scheduler_splittasks_mapper(void *map_data, int num_elements, } else if (t->subtype == task_subtype_gpu_pack_d || t->subtype == task_subtype_gpu_pack_g || t->subtype == task_subtype_gpu_pack_f) { - scheduler_splittask_hydro(t, s); + scheduler_splittask_hydro_GPU(t, s); } else if (t->subtype == task_subtype_gpu_unpack_d || t->subtype == task_subtype_gpu_unpack_g || t->subtype == task_subtype_gpu_unpack_f) { From 9b214a187d0d6ad38846197abc6b619661f61964 Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Wed, 5 Feb 2025 16:52:14 +0000 Subject: [PATCH 168/217] Tested and code works better for case with ~512 parts cell in comparison to with ideal size of ~64 parts per cell and 2 million parts. 
Will now test on G-H Isamabrd-AI --- examples/HydroTests/GreshoVortex_3D/gresho.yml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/examples/HydroTests/GreshoVortex_3D/gresho.yml b/examples/HydroTests/GreshoVortex_3D/gresho.yml index 1ec79dff52..1ea28f26b8 100644 --- a/examples/HydroTests/GreshoVortex_3D/gresho.yml +++ b/examples/HydroTests/GreshoVortex_3D/gresho.yml @@ -7,12 +7,12 @@ InternalUnitSystem: UnitTemp_in_cgs: 1 # Kelvin Scheduler: - max_top_level_cells: 8 + max_top_level_cells: 16 tasks_per_cell: 200 deadlock_waiting_time_s: 10 - cell_split_size: 200 - cell_sub_size_pair_hydro: 50 # (Optional) Maximal number of hydro-hydro interactions per sub-pair hydro/star task (this is the default value). - cell_sub_size_self_hydro: 50 # (Optional) Maximal number of hydro-hydro interactions per sub-self hydro/star task. Set to how many cells are targeted for GPU tasks + cell_split_size: 100 + cell_sub_size_pair_hydro: 40 # (Optional) Maximal number of hydro-hydro interactions per sub-pair hydro/star task (this is the default value). + cell_sub_size_self_hydro: 40 # (Optional) Maximal number of hydro-hydro interactions per sub-self hydro/star task. Set to how many cells are targeted for GPU tasks # Parameters governing the time integration TimeIntegration: time_begin: 0. # The starting time of the simulation (in internal units). @@ -40,3 +40,4 @@ SPH: InitialConditions: file_name: greshoVortex.hdf5 periodic: 1 + replicate: 2 From ecb94891731564c92d29f208bc03e377d705b55c Mon Sep 17 00:00:00 2001 From: Abouzied Date: Thu, 6 Feb 2025 17:30:27 +0000 Subject: [PATCH 169/217] Added counters to double check how many cells have target part number and how many have more --- src/runner_main_clean.cu | 60 ++++++++++++++++++++++++++++++++++------ 1 file changed, 51 insertions(+), 9 deletions(-) diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index c1e0bc1626..6041d32ed4 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -984,8 +984,18 @@ void *runner_main2(void *data) { sched->nr_packs_pair_forc_done = 0; sched->nr_packs_self_grad_done = 0; sched->nr_packs_pair_grad_done = 0; - int n_cells = 0; - int n_w_prts_gtr_target = 0; + int n_cells_d = 0; + int n_cells_g = 0; + int n_cells_f = 0; + int n_cells_p_d = 0; + int n_cells_p_g = 0; + int n_cells_p_f = 0; + int n_w_prts_gtr_target_d = 0; + int n_w_prts_gtr_target_g = 0; + int n_w_prts_gtr_target_f = 0; + int n_w_prts_gtr_target_p_d = 0; + int n_w_prts_gtr_target_p_g = 0; + int n_w_prts_gtr_target_p_f = 0; int g100 = 0; int l100 = 0; int maxcount = 0; @@ -1071,10 +1081,10 @@ void *runner_main2(void *data) { /* No pack tasks left in queue, flag that we want to run */ int launch_leftovers = pack_vars_self_dens->launch_leftovers; - n_cells++; + n_cells_d++; if (ci->hydro.count > 1.5 * np_per_cell) { - n_w_prts_gtr_target++; - message("count %i target %i", ci->hydro.count, np_per_cell); + n_w_prts_gtr_target_d++; +// message("count %i target %i", ci->hydro.count, np_per_cell); } // error("There's %i parts in a cell when it should // be %i max", ci->hydro.count, np_per_cell); @@ -1107,6 +1117,11 @@ void *runner_main2(void *data) { ticks tic_cpu_pack = getticks(); + n_cells_g++; + if (ci->hydro.count > 1.5 * np_per_cell) { + n_w_prts_gtr_target_g++; +// message("count %i target %i", ci->hydro.count, np_per_cell); + } packing_time_g += runner_doself1_pack_f4_g( r, sched, pack_vars_self_grad, ci, t, parts_aos_grad_f4_send, task_first_part_f4_g); @@ -1140,6 +1155,11 @@ void *runner_main2(void *data) 
{ #ifdef GPUOFFLOAD_FORCE ticks tic_cpu_pack = getticks(); + n_cells_f++; + if (ci->hydro.count > 1.5 * np_per_cell) { + n_w_prts_gtr_target_f++; +// message("count %i target %i", ci->hydro.count, np_per_cell); + } packing_time_f += runner_doself1_pack_f4_f( r, sched, pack_vars_self_forc, ci, t, parts_aos_forc_f4_send, task_first_part_f4_f); @@ -1302,7 +1322,11 @@ void *runner_main2(void *data) { #endif // DO_CORNERS ticks tic_cpu_pack = getticks(); - + n_cells_p_d++; + if (ci->hydro.count > 1.5 * np_per_cell) { + n_w_prts_gtr_target_p_d++; + // message("count %i target %i", ci->hydro.count, np_per_cell); + } packing_time_pair += runner_dopair1_pack_f4( r, sched, pack_vars_pair_dens, ci, cj, t, parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens); @@ -1386,7 +1410,11 @@ void *runner_main2(void *data) { } else { #endif // DO_CORNERS ticks tic_cpu_pack = getticks(); - + n_cells_p_g++; + if (ci->hydro.count > 1.5 * np_per_cell) { + n_w_prts_gtr_target_p_g++; + // message("count %i target %i", ci->hydro.count, np_per_cell); + } packing_time_pair_g += runner_dopair1_pack_f4_g(r, sched, pack_vars_pair_grad, ci, cj, t, parts_aos_pair_f4_g_send, e, @@ -1476,7 +1504,11 @@ void *runner_main2(void *data) { runner_dopair1_pack_f4_f(r, sched, pack_vars_pair_forc, ci, cj, t, parts_aos_pair_f4_f_send, e, fparti_fpartj_lparti_lpartj_forc); - + n_cells_p_f++; + if (ci->hydro.count > 1.5 * np_per_cell) { + n_w_prts_gtr_target_p_f++; + // message("count %i target %i", ci->hydro.count, np_per_cell); + } t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; /* No pack tasks left in queue, flag that we want to run */ @@ -1968,7 +2000,17 @@ void *runner_main2(void *data) { } /* main loop. */ message("cpu %i packed %i cells with %i containing more parts than target of %i", - r->cpuid, n_cells, n_w_prts_gtr_target, np_per_cell); + r->cpuid, n_cells_d, n_w_prts_gtr_target_d, np_per_cell); + message("cpu %i packed %i cells_G with %i containing more parts than target of %i", + r->cpuid, n_cells_g, n_w_prts_gtr_target_g, np_per_cell); + message("cpu %i packed %i cells_F with %i containing more parts than target of %i", + r->cpuid, n_cells_f, n_w_prts_gtr_target_f, np_per_cell); + message("cpu %i packed %i pairs_D with %i containing more parts than target of %i", + r->cpuid, n_cells_p_d, n_w_prts_gtr_target_p_d, np_per_cell); + message("cpu %i packed %i pairs_G with %i containing more parts than target of %i", + r->cpuid, n_cells_p_g, n_w_prts_gtr_target_p_g, np_per_cell); + message("cpu %i packed %i pairs_F with %i containing more parts than target of %i", + r->cpuid, n_cells_p_f, n_w_prts_gtr_target_p_f, np_per_cell); // message("Worked on %i supers w more than 100 parts", g100); // Stuff for writing debug data to file for validation From 81361f4ac57590f0e39516e974d69f3436eab3d7 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Thu, 6 Feb 2025 17:59:34 +0000 Subject: [PATCH 170/217] Modified the debugging code in prev. 
commit to track max number of particles in a task --- src/runner_main_clean.cu | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 6041d32ed4..1817936a98 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -1082,6 +1082,7 @@ void *runner_main2(void *data) { /* No pack tasks left in queue, flag that we want to run */ int launch_leftovers = pack_vars_self_dens->launch_leftovers; n_cells_d++; + maxcount = max(maxcount, ci->hydro.count); if (ci->hydro.count > 1.5 * np_per_cell) { n_w_prts_gtr_target_d++; // message("count %i target %i", ci->hydro.count, np_per_cell); @@ -1118,6 +1119,7 @@ void *runner_main2(void *data) { ticks tic_cpu_pack = getticks(); n_cells_g++; + maxcount = max(maxcount, ci->hydro.count); if (ci->hydro.count > 1.5 * np_per_cell) { n_w_prts_gtr_target_g++; // message("count %i target %i", ci->hydro.count, np_per_cell); @@ -1156,6 +1158,7 @@ void *runner_main2(void *data) { ticks tic_cpu_pack = getticks(); n_cells_f++; + maxcount = max(maxcount, ci->hydro.count); if (ci->hydro.count > 1.5 * np_per_cell) { n_w_prts_gtr_target_f++; // message("count %i target %i", ci->hydro.count, np_per_cell); @@ -1323,6 +1326,7 @@ void *runner_main2(void *data) { ticks tic_cpu_pack = getticks(); n_cells_p_d++; + maxcount = max(maxcount, ci->hydro.count); if (ci->hydro.count > 1.5 * np_per_cell) { n_w_prts_gtr_target_p_d++; // message("count %i target %i", ci->hydro.count, np_per_cell); @@ -1411,6 +1415,7 @@ void *runner_main2(void *data) { #endif // DO_CORNERS ticks tic_cpu_pack = getticks(); n_cells_p_g++; + maxcount = max(maxcount, ci->hydro.count); if (ci->hydro.count > 1.5 * np_per_cell) { n_w_prts_gtr_target_p_g++; // message("count %i target %i", ci->hydro.count, np_per_cell); @@ -1505,6 +1510,7 @@ void *runner_main2(void *data) { cj, t, parts_aos_pair_f4_f_send, e, fparti_fpartj_lparti_lpartj_forc); n_cells_p_f++; + maxcount = max(maxcount, ci->hydro.count); if (ci->hydro.count > 1.5 * np_per_cell) { n_w_prts_gtr_target_p_f++; // message("count %i target %i", ci->hydro.count, np_per_cell); @@ -1999,18 +2005,18 @@ void *runner_main2(void *data) { } } /* main loop. 
*/ - message("cpu %i packed %i cells with %i containing more parts than target of %i", - r->cpuid, n_cells_d, n_w_prts_gtr_target_d, np_per_cell); - message("cpu %i packed %i cells_G with %i containing more parts than target of %i", - r->cpuid, n_cells_g, n_w_prts_gtr_target_g, np_per_cell); - message("cpu %i packed %i cells_F with %i containing more parts than target of %i", - r->cpuid, n_cells_f, n_w_prts_gtr_target_f, np_per_cell); - message("cpu %i packed %i pairs_D with %i containing more parts than target of %i", - r->cpuid, n_cells_p_d, n_w_prts_gtr_target_p_d, np_per_cell); - message("cpu %i packed %i pairs_G with %i containing more parts than target of %i", - r->cpuid, n_cells_p_g, n_w_prts_gtr_target_p_g, np_per_cell); - message("cpu %i packed %i pairs_F with %i containing more parts than target of %i", - r->cpuid, n_cells_p_f, n_w_prts_gtr_target_p_f, np_per_cell); + message("cpu %i packed %i cells with %i containing more parts than target of %i max_count %i", + r->cpuid, n_cells_d, n_w_prts_gtr_target_d, np_per_cell, maxcount); + message("cpu %i packed %i cells_G with %i containing more parts than target of %i max_count %i", + r->cpuid, n_cells_g, n_w_prts_gtr_target_g, np_per_cell, maxcount); + message("cpu %i packed %i cells_F with %i containing more parts than target of %i max_count %i", + r->cpuid, n_cells_f, n_w_prts_gtr_target_f, np_per_cell, maxcount); + message("cpu %i packed %i pairs_D with %i containing more parts than target of %i max_count %i", + r->cpuid, n_cells_p_d, n_w_prts_gtr_target_p_d, np_per_cell, maxcount); + message("cpu %i packed %i pairs_G with %i containing more parts than target of %i max_count %i", + r->cpuid, n_cells_p_g, n_w_prts_gtr_target_p_g, np_per_cell, maxcount); + message("cpu %i packed %i pairs_F with %i containing more parts than target of %i max_count %i", + r->cpuid, n_cells_p_f, n_w_prts_gtr_target_p_f, np_per_cell, maxcount); // message("Worked on %i supers w more than 100 parts", g100); // Stuff for writing debug data to file for validation From f4c98984c7ee8f1d2903f5e1ae02e87295ab2ce9 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Mon, 24 Feb 2025 13:15:24 +0000 Subject: [PATCH 171/217] Played around with weights in scheduler_reweight(), not much use. Removed un-necessary atomics from scheduler_gettask() --- src/scheduler.c | 95 +++++++++++++++++++++++++++++-------------------- 1 file changed, 56 insertions(+), 39 deletions(-) diff --git a/src/scheduler.c b/src/scheduler.c index 1ed27abc95..69203e37b6 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -2292,17 +2292,23 @@ void scheduler_reweight(struct scheduler *s, int verbose) { } else if (t->subtype == task_subtype_external_grav) cost = 1.f * wscale * gcount_i; else if (t->subtype == task_subtype_gpu_pack_d) // A. 
Nasar - cost = 1.f * (wscale * count_i) * count_i; // * s->pack_size; + cost = 1.f * (wscale * count_i * count_i); // * s->pack_size; else if (t->subtype == task_subtype_gpu_pack_f) - cost = 1.f * (wscale * count_i) * count_i; // * s->pack_size; + cost = 1.f * (wscale * count_i * count_i); // * s->pack_size; else if (t->subtype == task_subtype_gpu_pack_g) - cost = 1.f * (wscale * count_i) * count_i; // * s->pack_size; + cost = 1.f * (wscale * count_i * count_i); // * s->pack_size; else if (t->subtype == task_subtype_gpu_unpack_d) - cost = 1.f * wscale * s->pack_size; + //cost = wscale * s->pack_size; + cost = (wscale * count_i) * count_i * s->pack_size; + // cost = 1.f * wscale * s->pack_size; else if (t->subtype == task_subtype_gpu_unpack_f) - cost = 1.f * wscale * s->pack_size; + cost = (wscale * count_i) * count_i * s->pack_size; +// cost = wscale * s->pack_size; +// cost = 1.f * wscale * s->pack_size; else if (t->subtype == task_subtype_gpu_unpack_g) - cost = 1.f * wscale * s->pack_size; + cost = (wscale * count_i) * count_i * s->pack_size; +// cost = wscale * s->pack_size; +// cost = 1.f * wscale * s->pack_size; else if (t->subtype == task_subtype_stars_density || t->subtype == task_subtype_stars_prep1 || t->subtype == task_subtype_stars_prep2 || @@ -2343,17 +2349,34 @@ void scheduler_reweight(struct scheduler *s, int verbose) { cost = 2.f * (wscale * gcount_i) * gcount_j; // Abouzied: Think about good cost (for rainy days) A. Nasar } else if (t->subtype == task_subtype_gpu_pack_d) { - cost = 2.f * (wscale * count_i) * count_i; - } else if (t->subtype == task_subtype_gpu_pack_f) { - cost = 2.f * (wscale * count_i) * count_i; + // cost = 2.f * (wscale * count_i) * count_i; + if (t->ci->nodeID != nodeID || t->cj->nodeID != nodeID) + cost = 3.f * (wscale * count_i * count_i); + else + cost = 2.f * (wscale * count_i) * count_j * sid_scale[t->flags]; + } else if (t->subtype == task_subtype_gpu_pack_f) { +// cost = 2.f * (wscale * count_i) * count_i; + if (t->ci->nodeID != nodeID || t->cj->nodeID != nodeID) + cost = 3.f * (wscale * count_i * count_i) * sid_scale[t->flags]; + else + cost = 2.f * (wscale * count_i) * count_j * sid_scale[t->flags]; + } else if (t->subtype == task_subtype_gpu_pack_g) { - cost = 2.f * (wscale * count_i) * count_i; + if (t->ci->nodeID != nodeID || t->cj->nodeID != nodeID) + cost = 3.f * (wscale * count_i * count_i) * sid_scale[t->flags]; + else + cost = 2.f * (wscale * count_i) * count_j * sid_scale[t->flags]; + +// cost = 2.f * (wscale * count_i) * count_i; } else if (t->subtype == task_subtype_gpu_unpack_d) { - cost = 1.f * wscale; + cost = (wscale * count_i) * count_i * s->pack_size; + //cost = 1.f * wscale; } else if (t->subtype == task_subtype_gpu_unpack_f) { - cost = 1.f * wscale; + cost = (wscale * count_i) * count_i * s->pack_size; + //cost = 1.f * wscale; } else if (t->subtype == task_subtype_gpu_unpack_g) { - cost = 1.f * wscale; + cost = (wscale * count_i) * count_i * s->pack_size; + //cost = 1.f * wscale; } else if (t->subtype == task_subtype_stars_density || t->subtype == task_subtype_stars_prep1 || t->subtype == task_subtype_stars_prep2 || @@ -2492,11 +2515,14 @@ void scheduler_reweight(struct scheduler *s, int verbose) { } else if (t->subtype == task_subtype_gpu_pack_g) { cost = 2.f * (wscale * count_i) * count_i; } else if (t->subtype == task_subtype_gpu_unpack_d) { - cost = 1.f * wscale; + cost = (wscale * count_i) * count_i * s->pack_size; + //cost = 1.f * wscale; } else if (t->subtype == task_subtype_gpu_unpack_f) { - cost = 1.f * wscale; + 
cost = (wscale * count_i) * count_i * s->pack_size; + //cost = 1.f * wscale; } else if (t->subtype == task_subtype_gpu_unpack_g) { - cost = 1.f * wscale; + cost = (wscale * count_i) * count_i * s->pack_size; + //cost = 1.f * wscale; } else if (t->subtype == task_subtype_density || t->subtype == task_subtype_gradient || t->subtype == task_subtype_force || @@ -2542,11 +2568,14 @@ void scheduler_reweight(struct scheduler *s, int verbose) { else if (t->subtype == task_subtype_gpu_pack_g) cost = 1.f * (wscale * count_i) * count_i; // * s->pack_size; else if (t->subtype == task_subtype_gpu_unpack_d) - cost = 1.f * wscale * s->pack_size; + cost = (wscale * count_i) * count_i * s->pack_size; + //cost = 1.f * wscale * s->pack_size; else if (t->subtype == task_subtype_gpu_unpack_f) - cost = 1.f * wscale * s->pack_size; + cost = (wscale * count_i) * count_i * s->pack_size; + //cost = 1.f * wscale * s->pack_size; else if (t->subtype == task_subtype_gpu_unpack_g) - cost = 1.f * wscale * s->pack_size; + cost = (wscale * count_i) * count_i * s->pack_size; + //cost = 1.f * wscale * s->pack_size; else if (t->subtype == task_subtype_density || t->subtype == task_subtype_gradient || t->subtype == task_subtype_force || @@ -2562,10 +2591,10 @@ void scheduler_reweight(struct scheduler *s, int verbose) { } break; case task_type_ghost: - if (t->ci == t->ci->hydro.super) cost = wscale * count_i; + if (t->ci == t->ci->hydro.super) cost = wscale * count_i * count_i; break; case task_type_extra_ghost: - if (t->ci == t->ci->hydro.super) cost = wscale * count_i; + if (t->ci == t->ci->hydro.super) cost = wscale * count_i * count_i; break; case task_type_stars_ghost: if (t->ci == t->ci->hydro.super) cost = wscale * scount_i; @@ -2577,7 +2606,7 @@ void scheduler_reweight(struct scheduler *s, int verbose) { if (t->ci == t->ci->hydro.super) cost = wscale * bcount_i; break; case task_type_drift_part: - cost = wscale * count_i; + cost = wscale * count_i * count_i; break; case task_type_drift_gpart: cost = wscale * gcount_i; @@ -2604,7 +2633,7 @@ void scheduler_reweight(struct scheduler *s, int verbose) { cost = wscale * (gcount_i + gcount_j); break; case task_type_end_hydro_force: - cost = wscale * count_i; + cost = wscale * count_i * count_i; break; case task_type_end_grav_force: cost = wscale * gcount_i; @@ -2640,15 +2669,15 @@ void scheduler_reweight(struct scheduler *s, int verbose) { break; case task_type_kick1: cost = - wscale * (count_i + gcount_i + scount_i + sink_count_i + bcount_i); + wscale * (count_i + gcount_i + scount_i + sink_count_i + bcount_i) * count_i; break; case task_type_kick2: cost = - wscale * (count_i + gcount_i + scount_i + sink_count_i + bcount_i); + wscale * (count_i + gcount_i + scount_i + sink_count_i + bcount_i) * count_i; break; case task_type_timestep: cost = - wscale * (count_i + gcount_i + scount_i + sink_count_i + bcount_i); + wscale * (count_i + gcount_i + scount_i + sink_count_i + bcount_i) * count_i; break; case task_type_timestep_limiter: cost = wscale * count_i; @@ -3415,7 +3444,7 @@ struct task *scheduler_gettask(struct scheduler *s, int qid, /* Loop as long as there are tasks... */ while (s->waiting > 0 && res == NULL) { /* Try more than once before sleeping. */ - for (int tries = 0; res == NULL && s->waiting && tries < scheduler_maxtries; + for (int tries = 0; res == NULL && s->waiting && tries < scheduler_maxtries * 100; tries++) { /* Try to get a task from the suggested queue. 
*/ if (s->queues[qid].count > 0 || s->queues[qid].count_incoming > 0) { @@ -3493,43 +3522,31 @@ struct task *scheduler_gettask(struct scheduler *s, int qid, subtype == task_subtype_gpu_pack_d) { q->n_packs_self_left_d--; q_stl->n_packs_self_left_d--; - atomic_inc(&s->s_d_left[qid]); - atomic_dec(&s->s_d_left[qstl_id]); } if ((type == task_type_self || type == task_type_sub_self) && subtype == task_subtype_gpu_pack_g) { q->n_packs_self_left_g--; q_stl->n_packs_self_left_g--; - atomic_inc(&s->s_g_left[qid]); - atomic_dec(&s->s_g_left[qstl_id]); } if ((type == task_type_self || type == task_type_sub_self) && subtype == task_subtype_gpu_pack_f) { q->n_packs_self_left_f--; q_stl->n_packs_self_left_f--; - atomic_inc(&s->s_f_left[qid]); - atomic_dec(&s->s_f_left[qstl_id]); } if ((type == task_type_pair || type == task_type_sub_pair) && subtype == task_subtype_gpu_pack_d) { q->n_packs_pair_left_d--; q_stl->n_packs_pair_left_d--; - atomic_inc(&s->p_d_left[qid]); - atomic_dec(&s->p_d_left[qstl_id]); } if ((type == task_type_pair || type == task_type_sub_pair) && subtype == task_subtype_gpu_pack_g) { q->n_packs_pair_left_g--; q_stl->n_packs_pair_left_g--; - atomic_inc(&s->p_g_left[qid]); - atomic_dec(&s->p_g_left[qstl_id]); } if ((type == task_type_pair || type == task_type_sub_pair) && subtype == task_subtype_gpu_pack_f) { q->n_packs_pair_left_f--; q_stl->n_packs_pair_left_f--; - atomic_inc(&s->p_f_left[qid]); - atomic_dec(&s->p_f_left[qstl_id]); } /* Run with the task */ break; From 387d768f3c74152f6fd77a8a2b9bfe2d54c7ba65 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Mon, 24 Feb 2025 13:17:04 +0000 Subject: [PATCH 172/217] Changed offload parameters to match the new bigger cell size(s) such that same number of particles is offloaded as with the case of ideal sized cells --- src/cuda/BLOCK_SIZE.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/cuda/BLOCK_SIZE.h b/src/cuda/BLOCK_SIZE.h index 15259a7883..351033adac 100644 --- a/src/cuda/BLOCK_SIZE.h +++ b/src/cuda/BLOCK_SIZE.h @@ -2,11 +2,11 @@ #define BLOCK_SIZE_H #define BLOCK_SIZE 64 -#define N_TASKS_PER_PACK_SELF 128 -#define N_TASKS_BUNDLE_SELF 16 +#define N_TASKS_PER_PACK_SELF 256 +#define N_TASKS_BUNDLE_SELF 64 #define BLOCK_SIZE_PAIR 64 -#define N_TASKS_PER_PACK_PAIR 64 -#define N_TASKS_BUNDLE_PAIR 8 +#define N_TASKS_PER_PACK_PAIR 128 +#define N_TASKS_BUNDLE_PAIR 32 #endif // BLOCK_SIZE_H From 3993ab43901e7c6848fc1de0757b06df337cfea1 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Mon, 24 Feb 2025 13:18:08 +0000 Subject: [PATCH 173/217] Small changes to split size parameters and the max number of top leve cells --- examples/HydroTests/GreshoVortex_3D/gresho.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/HydroTests/GreshoVortex_3D/gresho.yml b/examples/HydroTests/GreshoVortex_3D/gresho.yml index 1ea28f26b8..507f8a5d66 100644 --- a/examples/HydroTests/GreshoVortex_3D/gresho.yml +++ b/examples/HydroTests/GreshoVortex_3D/gresho.yml @@ -7,12 +7,12 @@ InternalUnitSystem: UnitTemp_in_cgs: 1 # Kelvin Scheduler: - max_top_level_cells: 16 + max_top_level_cells: 64 tasks_per_cell: 200 deadlock_waiting_time_s: 10 - cell_split_size: 100 - cell_sub_size_pair_hydro: 40 # (Optional) Maximal number of hydro-hydro interactions per sub-pair hydro/star task (this is the default value). - cell_sub_size_self_hydro: 40 # (Optional) Maximal number of hydro-hydro interactions per sub-self hydro/star task. 
Set to how many cells are targeted for GPU tasks + cell_split_size: 50 + cell_sub_size_pair_hydro: 50 # (Optional) Maximal number of hydro-hydro interactions per sub-pair hydro/star task (this is the default value). + cell_sub_size_self_hydro: 50 # (Optional) Maximal number of hydro-hydro interactions per sub-self hydro/star task. Set to how many cells are targeted for GPU tasks # Parameters governing the time integration TimeIntegration: time_begin: 0. # The starting time of the simulation (in internal units). @@ -40,4 +40,4 @@ SPH: InitialConditions: file_name: greshoVortex.hdf5 periodic: 1 - replicate: 2 + replicate: 4 From a86387cbfa0b10632cd2fcff03ae4428534c8458 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Mon, 24 Feb 2025 13:19:00 +0000 Subject: [PATCH 174/217] CODE USED FOR PAPER I. Added a different .yml file for offloading larger than ideal cell sizes --- .../GreshoVortex_3D/gresho_split_size_500.yml | 42 +++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 examples/HydroTests/GreshoVortex_3D/gresho_split_size_500.yml diff --git a/examples/HydroTests/GreshoVortex_3D/gresho_split_size_500.yml b/examples/HydroTests/GreshoVortex_3D/gresho_split_size_500.yml new file mode 100644 index 0000000000..3105787d75 --- /dev/null +++ b/examples/HydroTests/GreshoVortex_3D/gresho_split_size_500.yml @@ -0,0 +1,42 @@ +# Define the system of units to use internally. +InternalUnitSystem: + UnitMass_in_cgs: 1 # Grams + UnitLength_in_cgs: 1 # Centimeters + UnitVelocity_in_cgs: 1 # Centimeters per second + UnitCurrent_in_cgs: 1 # Amperes + UnitTemp_in_cgs: 1 # Kelvin + +Scheduler: + max_top_level_cells: 16 + tasks_per_cell: 200 + cell_split_size: 700 + cell_sub_size_pair_hydro: 49000 # (Optional) Maximal number of hydro-hydro interactions per sub-pair hydro/star task (this is the default value). + cell_sub_size_self_hydro: 700 # (Optional) Maximal number of hydro-hydro interactions per sub-self hydro/star task. Set to how many cells are targeted for GPU tasks +# Parameters governing the time integration +TimeIntegration: + time_begin: 0. # The starting time of the simulation (in internal units). + time_end: 1. # The end time of the simulation (in internal units). + dt_min: 1e-6 # The minimal time-step size of the simulation (in internal units). + dt_max: 1e-4 # The maximal time-step size of the simulation (in internal units). + +# Parameters governing the snapshots +Snapshots: + basename: gresho # Common part of the name of output files + time_first: 0. # Time of the first output (in internal units) + delta_time: 1e-3 # Time difference between consecutive outputs (in internal units) + # compression: 1 + +# Parameters governing the conserved quantities statistics +Statistics: + delta_time: 1e-2 # Time between statistics output + +# Parameters for the hydrodynamics scheme +SPH: + resolution_eta: 1.9 # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel). + CFL_condition: 0.1 # Courant-Friedrich-Levy condition for time integration. 
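Offloading larger-than-ideal cells multiplies the number of particles per pack task, so the pinned host and device buffers sized from cell_split_size can outgrow what the card actually has; the next patch adds a cudaMemGetInfo() query for exactly this reason. A minimal, self-contained sketch of such a check (the helper name and the 10% safety margin are illustrative, not part of the patches):

#include <stdio.h>
#include <cuda_runtime.h>

/* Return 1 if a planned allocation of n_parts entries of bytes_per_part
 * bytes fits in the currently free device memory, with a 10% margin. */
static int gpu_allocation_fits(size_t n_parts, size_t bytes_per_part) {
  size_t free_mem = 0, total_mem = 0;
  if (cudaMemGetInfo(&free_mem, &total_mem) != cudaSuccess) return 0;
  const size_t needed = n_parts * bytes_per_part;
  fprintf(stderr, "free mem %zu, total mem %zu, needed %zu\n", free_mem,
          total_mem, needed);
  return needed < (size_t)(0.9 * (double)free_mem);
}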
+ +# Parameters related to the initial conditions +InitialConditions: + file_name: greshoVortex.hdf5 + periodic: 1 + replicate: 8 From d36e513c37a495679183ddc78ffb8cd3a4b78bf6 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Wed, 26 Feb 2025 11:00:36 +0000 Subject: [PATCH 175/217] Added code to get available GPU memory --- src/runner_main_clean.cu | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 1817936a98..c2964f5c4d 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -274,6 +274,10 @@ void *runner_main2(void *data) { mpi_rank); cudaError_t cu_error; + size_t free_mem, total_mem; + cudaMemGetInfo(&free_mem, &total_mem); + + fprintf(stderr, "free mem %lu, total mem %lu\n", free_mem, total_mem); // how many tasks do we want for each launch of GPU kernel // fprintf(stderr,"pack_size is %i\n", sched->pack_size); const int target_n_tasks = sched->pack_size; From 851792f5a65086d6cc7054171c13e8a84d135dd7 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Thu, 27 Feb 2025 12:13:05 +0000 Subject: [PATCH 176/217] Implemented recursion before packing. Still not packing recursed cells though --- .../HydroTests/GreshoVortex_3D/gresho.yml | 6 +-- src/runner_doiact_functions_hydro_gpu.h | 54 ++++++++++++++++++- src/runner_main_clean.cu | 46 +++++++++++----- 3 files changed, 90 insertions(+), 16 deletions(-) diff --git a/examples/HydroTests/GreshoVortex_3D/gresho.yml b/examples/HydroTests/GreshoVortex_3D/gresho.yml index 507f8a5d66..afc7cdc38d 100644 --- a/examples/HydroTests/GreshoVortex_3D/gresho.yml +++ b/examples/HydroTests/GreshoVortex_3D/gresho.yml @@ -7,9 +7,9 @@ InternalUnitSystem: UnitTemp_in_cgs: 1 # Kelvin Scheduler: - max_top_level_cells: 64 + max_top_level_cells: 16 tasks_per_cell: 200 - deadlock_waiting_time_s: 10 + # deadlock_waiting_time_s: 10 cell_split_size: 50 cell_sub_size_pair_hydro: 50 # (Optional) Maximal number of hydro-hydro interactions per sub-pair hydro/star task (this is the default value). cell_sub_size_self_hydro: 50 # (Optional) Maximal number of hydro-hydro interactions per sub-self hydro/star task. Set to how many cells are targeted for GPU tasks @@ -40,4 +40,4 @@ SPH: InitialConditions: file_name: greshoVortex.hdf5 periodic: 1 - replicate: 4 +# replicate: 4 diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index 34f6fa9034..89326062da 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -1,5 +1,6 @@ #include "scheduler.h" - +#include "runner_doiact_hydro.h" +#include "active.h" #include struct pack_vars_self { /*List of tasks and respective cells to be packed*/ @@ -601,6 +602,57 @@ void runner_dopair1_pack(struct runner *r, struct scheduler *s, cell_unlocktree(cj); } +void runner_recurse_gpu(struct runner *r, struct scheduler *s, + struct pack_vars_pair *restrict pack_vars, + struct cell *ci, struct cell *cj, struct task *t, + struct part_aos_f4_send *parts_send, + struct engine *e, + int4 *fparti_fpartj_lparti_lpartj, int *n_leafs_found, + struct cell ** cells_left, struct cell ** cells_right, int depth) { + /* Should we even bother? A. Nasar: For GPU code we need to be clever about this */ + if (!CELL_IS_ACTIVE(ci, e) && !CELL_IS_ACTIVE(cj, e)) return; + if (ci->hydro.count == 0 || cj->hydro.count == 0) return; + + /* Get the type of pair and flip ci/cj if needed. */ + double shift[3]; + const int sid = space_getsid_and_swap_cells(s, &ci, &cj, shift); + + /* Recurse? 
*/ + if (cell_can_recurse_in_pair_hydro_task(ci) && + cell_can_recurse_in_pair_hydro_task(cj)) { + struct cell_split_pair *csp = &cell_split_pairs[sid]; + for (int k = 0; k < csp->count; k++) { + const int pid = csp->pairs[k].pid; + const int pjd = csp->pairs[k].pjd; + /*Do we want to do anything before we recurse?*/ + + /*We probably want to record */ + if (ci->progeny[pid] != NULL && cj->progeny[pjd] != NULL){ + runner_recurse_gpu(r, s, pack_vars, ci->progeny[pid], cj->progeny[pjd], t, parts_send, e, fparti_fpartj_lparti_lpartj, + n_leafs_found, cells_left, cells_right, depth + 1); +// message("recursing to depth %i", depth + 1); + } + } + } + else if (CELL_IS_ACTIVE(ci, e) || CELL_IS_ACTIVE(cj, e)) { +// else { //A .Nasar: WE DEFO HAVE A LEAF + /* if both cells inactive: skip; later: skip only asymmetric iact */ + if(!CELL_IS_ACTIVE(ci, e) && !CELL_IS_ACTIVE(cj, e)) return; + /* if any cell empty: skip */ + if(ci->hydro.count == 0 || cj->hydro.count == 0) return; + /* if cells too far apart (check rshift and compare with hmax), skip */ + + /*for all leafs to be sent add to cell list */ + cells_left[*n_leafs_found] = ci; + cells_right[*n_leafs_found] = cj; +// message("incrementing"); + *n_leafs_found = *n_leafs_found + 1; + if(*n_leafs_found >= 1024) + error("Created %i more than expected leaf cells. depth %i", *n_leafs_found, depth); + } + +} + double runner_dopair1_pack_f4(struct runner *r, struct scheduler *s, struct pack_vars_pair *restrict pack_vars, struct cell *ci, struct cell *cj, struct task *t, diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index c2964f5c4d..32470aa920 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -914,6 +914,7 @@ void *runner_main2(void *data) { int cpu_pair = 0; int cpu_pair_f = 0; int cpu_pair_g = 0; + int n_leafs_total = 0; // Initialise timers to zero double time_for_density_cpu = 0.0; double time_for_density_cpu_pair = 0.0; @@ -1335,9 +1336,29 @@ void *runner_main2(void *data) { n_w_prts_gtr_target_p_d++; // message("count %i target %i", ci->hydro.count, np_per_cell); } + + + /*Call recursion here. This will be a function in runner_doiact_functions_hydro_gpu.h. + * We are recursing separately to find out how much work we have before offloading*/ + //We need to allocate a list to put cell pointers into. We need to allocate a list of cell pair interaction. + int n_expected_cells = 1024; + int n_leafs_found = 0; + int depth = 0; +// struct cell ** cells_left = (struct cell **)calloc(n_expected_cells, sizeof(struct cell *)); +// struct cell ** cells_right = (struct cell **)calloc(n_expected_cells, sizeof(struct cell *)); + struct cell * cells_left[n_expected_cells]; + struct cell * cells_right[n_expected_cells]; + runner_recurse_gpu(r, sched, pack_vars_pair_dens, ci, cj, t, + parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens, &n_leafs_found, cells_left, cells_right, depth); +// for(int i = 0; i < n_leafs_found; i++) +// message("number of leafs found %i", n_leafs_found); + n_leafs_total += n_leafs_found; + /*Loop through n_daughters such that the pack_vars_pair_dens counters are updated*/ + /*for (cid = 0; cid = n_daughters; cid++){*/ packing_time_pair += runner_dopair1_pack_f4( r, sched, pack_vars_pair_dens, ci, cj, t, parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens); + /*}*/ t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; /* Packed enough tasks or no pack tasks left in queue, flag that @@ -2009,18 +2030,19 @@ void *runner_main2(void *data) { } } /* main loop. 
*/ - message("cpu %i packed %i cells with %i containing more parts than target of %i max_count %i", - r->cpuid, n_cells_d, n_w_prts_gtr_target_d, np_per_cell, maxcount); - message("cpu %i packed %i cells_G with %i containing more parts than target of %i max_count %i", - r->cpuid, n_cells_g, n_w_prts_gtr_target_g, np_per_cell, maxcount); - message("cpu %i packed %i cells_F with %i containing more parts than target of %i max_count %i", - r->cpuid, n_cells_f, n_w_prts_gtr_target_f, np_per_cell, maxcount); - message("cpu %i packed %i pairs_D with %i containing more parts than target of %i max_count %i", - r->cpuid, n_cells_p_d, n_w_prts_gtr_target_p_d, np_per_cell, maxcount); - message("cpu %i packed %i pairs_G with %i containing more parts than target of %i max_count %i", - r->cpuid, n_cells_p_g, n_w_prts_gtr_target_p_g, np_per_cell, maxcount); - message("cpu %i packed %i pairs_F with %i containing more parts than target of %i max_count %i", - r->cpuid, n_cells_p_f, n_w_prts_gtr_target_p_f, np_per_cell, maxcount); + message("n_leafs found %i", n_leafs_total); +// message("cpu %i packed %i cells with %i containing more parts than target of %i max_count %i", +// r->cpuid, n_cells_d, n_w_prts_gtr_target_d, np_per_cell, maxcount); +// message("cpu %i packed %i cells_G with %i containing more parts than target of %i max_count %i", +// r->cpuid, n_cells_g, n_w_prts_gtr_target_g, np_per_cell, maxcount); +// message("cpu %i packed %i cells_F with %i containing more parts than target of %i max_count %i", +// r->cpuid, n_cells_f, n_w_prts_gtr_target_f, np_per_cell, maxcount); +// message("cpu %i packed %i pairs_D with %i containing more parts than target of %i max_count %i", +// r->cpuid, n_cells_p_d, n_w_prts_gtr_target_p_d, np_per_cell, maxcount); +// message("cpu %i packed %i pairs_G with %i containing more parts than target of %i max_count %i", +// r->cpuid, n_cells_p_g, n_w_prts_gtr_target_p_g, np_per_cell, maxcount); +// message("cpu %i packed %i pairs_F with %i containing more parts than target of %i max_count %i", +// r->cpuid, n_cells_p_f, n_w_prts_gtr_target_p_f, np_per_cell, maxcount); // message("Worked on %i supers w more than 100 parts", g100); // Stuff for writing debug data to file for validation From ee6e9df7014edd47238f4de4aae323321eb65506 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Thu, 27 Feb 2025 14:18:31 +0000 Subject: [PATCH 177/217] Implemented recursion working with packing and giving correct results --- src/runner_main_clean.cu | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 32470aa920..36e9d07c10 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -677,7 +677,7 @@ void *runner_main2(void *data) { /*A. Nasar: Multiplication by 2 is also to ensure we do not over-run * the allocated memory on buffers and GPU. 
This can happen if calculated h * is larger than cell width and splitting makes bigger than target cells*/ - int count_max_parts_tmp = 8 * target_n_tasks * (np_per_cell + buff); + int count_max_parts_tmp = 64 * 8 * target_n_tasks * (np_per_cell + buff); // message("np per cell %i, max_parts %i, n_tasks_GPU %i\n", np_per_cell, // count_max_parts_tmp, target_n_tasks); @@ -1025,6 +1025,10 @@ void *runner_main2(void *data) { struct cell *ci = t->ci; struct cell *cj = t->cj; + if (ci == NULL && (t->subtype != task_subtype_gpu_unpack_d + && t->subtype != task_subtype_gpu_unpack_g + && t->subtype != task_subtype_gpu_unpack_f)) error("This cannot be"); + #ifdef SWIFT_DEBUG_TASKS /* Mark the thread we run on */ t->rid = r->cpuid; @@ -1350,15 +1354,17 @@ void *runner_main2(void *data) { struct cell * cells_right[n_expected_cells]; runner_recurse_gpu(r, sched, pack_vars_pair_dens, ci, cj, t, parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens, &n_leafs_found, cells_left, cells_right, depth); -// for(int i = 0; i < n_leafs_found; i++) -// message("number of leafs found %i", n_leafs_found); n_leafs_total += n_leafs_found; /*Loop through n_daughters such that the pack_vars_pair_dens counters are updated*/ - /*for (cid = 0; cid = n_daughters; cid++){*/ - packing_time_pair += runner_dopair1_pack_f4( - r, sched, pack_vars_pair_dens, ci, cj, t, + for (int cid = 0; cid < n_leafs_found; cid++){ + packing_time_pair += runner_dopair1_pack_f4( + r, sched, pack_vars_pair_dens, cells_left[cid], cells_right[cid], t, parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens); - /*}*/ +// message("Packing task %i in recursed tasks\n", cid); + } + /* Copies done. Release the lock ! */ + cell_unlocktree(ci); + cell_unlocktree(cj); t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; /* Packed enough tasks or no pack tasks left in queue, flag that From 3768569f7f85da4644afa26662021d78768855a6 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Thu, 27 Feb 2025 14:18:54 +0000 Subject: [PATCH 178/217] Cleaned code up. removed a few redundant lines --- src/runner_doiact_functions_hydro_gpu.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index 89326062da..476a1e0213 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -636,12 +636,8 @@ void runner_recurse_gpu(struct runner *r, struct scheduler *s, } else if (CELL_IS_ACTIVE(ci, e) || CELL_IS_ACTIVE(cj, e)) { // else { //A .Nasar: WE DEFO HAVE A LEAF - /* if both cells inactive: skip; later: skip only asymmetric iact */ - if(!CELL_IS_ACTIVE(ci, e) && !CELL_IS_ACTIVE(cj, e)) return; /* if any cell empty: skip */ if(ci->hydro.count == 0 || cj->hydro.count == 0) return; - /* if cells too far apart (check rshift and compare with hmax), skip */ - /*for all leafs to be sent add to cell list */ cells_left[*n_leafs_found] = ci; cells_right[*n_leafs_found] = cj; @@ -673,6 +669,13 @@ double runner_dopair1_pack_f4(struct runner *r, struct scheduler *s, // pthread_mutex_unlock(&s->sleep_mutex); double x_tmp = 0.0, y_tmp = 0.0, z_tmp = 0.0; + struct cell *citmp, *cjtmp; + citmp=ci; + cjtmp=cj; + /* Get the type of pair and flip ci/cj if needed. 
*/ + double shift[3]; + const int sid = space_getsid_and_swap_cells(s, &citmp, &cjtmp, shift); + if(citmp != ci) error("I'm flipped"); /*Get the shifts in case of periodics*/ space_getsid_GPU(e->s, &ci, &cj, &x_tmp, &y_tmp, &z_tmp); @@ -740,9 +743,6 @@ double runner_dopair1_pack_f4(struct runner *r, struct scheduler *s, /* Record that we have now done a packing (self) */ t->done = 1; - /* Copies done. Release the lock ! */ - cell_unlocktree(ci); - cell_unlocktree(cj); pack_vars->tasks_packed++; pack_vars->launch = 0; pack_vars->launch_leftovers = 0; From 00d1e1538a2f4e24e1535b99cffbc3c1d0d71940 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Fri, 28 Feb 2025 10:30:12 +0000 Subject: [PATCH 179/217] Changed initial condition setup so that we do not try to use more memory than the GPU has --- examples/HydroTests/GreshoVortex_3D/gresho.yml | 10 +++++----- examples/HydroTests/GreshoVortex_3D/makeIC.py | 2 +- src/cuda/BLOCK_SIZE.h | 8 ++++---- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/examples/HydroTests/GreshoVortex_3D/gresho.yml b/examples/HydroTests/GreshoVortex_3D/gresho.yml index afc7cdc38d..0a46c24072 100644 --- a/examples/HydroTests/GreshoVortex_3D/gresho.yml +++ b/examples/HydroTests/GreshoVortex_3D/gresho.yml @@ -7,12 +7,12 @@ InternalUnitSystem: UnitTemp_in_cgs: 1 # Kelvin Scheduler: - max_top_level_cells: 16 + max_top_level_cells: 32 tasks_per_cell: 200 # deadlock_waiting_time_s: 10 - cell_split_size: 50 - cell_sub_size_pair_hydro: 50 # (Optional) Maximal number of hydro-hydro interactions per sub-pair hydro/star task (this is the default value). - cell_sub_size_self_hydro: 50 # (Optional) Maximal number of hydro-hydro interactions per sub-self hydro/star task. Set to how many cells are targeted for GPU tasks + # cell_split_size: 100 + # cell_sub_size_pair_hydro: 10000 # (Optional) Maximal number of hydro-hydro interactions per sub-pair hydro/star task (this is the default value). + # cell_sub_size_self_hydro: 100 # (Optional) Maximal number of hydro-hydro interactions per sub-self hydro/star task. Set to how many cells are targeted for GPU tasks # Parameters governing the time integration TimeIntegration: time_begin: 0. # The starting time of the simulation (in internal units). 
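These yml changes go hand in hand with the buffer sizing introduced a couple of patches earlier, where count_max_parts_tmp = 64 * 8 * target_n_tasks * (np_per_cell + buff) sets the length of every send/recv particle array. A worked estimate with purely illustrative numbers (the per-entry size and parameter values below are assumptions, not measured):

/* With target_n_tasks = 8, np_per_cell = 100 and buff = 50:
 *   count_max_parts_tmp = 64 * 8 * 8 * (100 + 50) = 614400 entries.
 * Every send/recv array of that length is allocated twice (cudaMallocHost
 * on the host, cudaMalloc on the device) and the pair arrays use twice the
 * count, so the combined footprint grows quickly with the pack size and the
 * split size, which is what the initial-condition changes here work around. */
static size_t estimate_buffer_bytes(size_t count_max_parts,
                                    size_t bytes_per_entry, size_t n_arrays) {
  return count_max_parts * bytes_per_entry * n_arrays;
}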
@@ -40,4 +40,4 @@ SPH: InitialConditions: file_name: greshoVortex.hdf5 periodic: 1 -# replicate: 4 + # replicate: 2 diff --git a/examples/HydroTests/GreshoVortex_3D/makeIC.py b/examples/HydroTests/GreshoVortex_3D/makeIC.py index c611132715..19b38352eb 100644 --- a/examples/HydroTests/GreshoVortex_3D/makeIC.py +++ b/examples/HydroTests/GreshoVortex_3D/makeIC.py @@ -28,7 +28,7 @@ rho0 = 1 # Gas density P0 = 0.0 # Constant additional pressure (should have no impact on the dynamics) fileOutputName = "greshoVortex.hdf5" -fileGlass = "glassCube_64.hdf5" +fileGlass = "glassCube_128.hdf5" # --------------------------------------------------- # Get position and smoothing lengths from the glass diff --git a/src/cuda/BLOCK_SIZE.h b/src/cuda/BLOCK_SIZE.h index 351033adac..2d5dda1af2 100644 --- a/src/cuda/BLOCK_SIZE.h +++ b/src/cuda/BLOCK_SIZE.h @@ -2,11 +2,11 @@ #define BLOCK_SIZE_H #define BLOCK_SIZE 64 -#define N_TASKS_PER_PACK_SELF 256 -#define N_TASKS_BUNDLE_SELF 64 +#define N_TASKS_PER_PACK_SELF 8 +#define N_TASKS_BUNDLE_SELF 2 #define BLOCK_SIZE_PAIR 64 -#define N_TASKS_PER_PACK_PAIR 128 -#define N_TASKS_BUNDLE_PAIR 32 +#define N_TASKS_PER_PACK_PAIR 4 +#define N_TASKS_BUNDLE_PAIR 1 #endif // BLOCK_SIZE_H From 11d626ca23bacda078b14b18e48c7ef44d0ad755 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Fri, 28 Feb 2025 15:29:14 +0000 Subject: [PATCH 180/217] Began changing offload algorithm to be able to handle deep recursions where the number of leaf tasks will exceed target_n_tasks. Code is not working as we need to figure out when to lock/unlock and also correctly monitor how we are locking/unlocking on a top task level (not leaf level) --- .../HydroTests/GreshoVortex_3D/gresho.yml | 2 +- src/runner_doiact_functions_hydro_gpu.h | 55 ++++++---- src/runner_main_clean.cu | 101 ++++++++++++------ 3 files changed, 102 insertions(+), 56 deletions(-) diff --git a/examples/HydroTests/GreshoVortex_3D/gresho.yml b/examples/HydroTests/GreshoVortex_3D/gresho.yml index 0a46c24072..ccdb08c3e0 100644 --- a/examples/HydroTests/GreshoVortex_3D/gresho.yml +++ b/examples/HydroTests/GreshoVortex_3D/gresho.yml @@ -7,7 +7,7 @@ InternalUnitSystem: UnitTemp_in_cgs: 1 # Kelvin Scheduler: - max_top_level_cells: 32 + max_top_level_cells: 16 tasks_per_cell: 200 # deadlock_waiting_time_s: 10 # cell_split_size: 100 diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index 476a1e0213..a158b95cec 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -5,6 +5,7 @@ struct pack_vars_self { /*List of tasks and respective cells to be packed*/ struct task **task_list; + struct task **top_task_list; struct cell **cell_list; /*List of cell positions*/ double *cellx; @@ -19,6 +20,7 @@ struct pack_vars_self { int count_parts; /**/ int tasks_packed; + int top_tasks_packed; int *task_first_part; int *task_last_part; int *d_task_first_part; @@ -38,6 +40,7 @@ struct pack_vars_self { struct pack_vars_pair { /*List of tasks and respective cells to be packed*/ struct task **task_list; + struct task **top_task_list; struct cell **ci_list; struct cell **cj_list; /*List of cell shifts*/ @@ -53,6 +56,7 @@ struct pack_vars_pair { int count_parts; /**/ int tasks_packed; + int top_tasks_packed; int *task_first_part; int *task_last_part; int *d_task_first_part; @@ -66,6 +70,7 @@ struct pack_vars_pair { int target_n_tasks; int nBundles; int tasksperbundle; + int task_locked; } pack_vars_pair; @@ -751,6 +756,7 @@ double runner_dopair1_pack_f4(struct runner *r, struct 
scheduler *s, // atomic_dec(&(s->queues[qid].n_packs_pair_left)); // if ((s->p_d_left < 1)) pack_vars->launch_leftovers = 1; + //A. Nasar: Need to come back to this at some point! lock_lock(&s->queues[qid].lock); s->queues[qid].n_packs_pair_left_d--; @@ -759,8 +765,9 @@ double runner_dopair1_pack_f4(struct runner *r, struct scheduler *s, lock_unlock(&s->queues[qid].lock); - if (pack_vars->tasks_packed == pack_vars->target_n_tasks) + if (pack_vars->tasks_packed == pack_vars->target_n_tasks){ pack_vars->launch = 1; + } /*Add time to packing_time. Timer for end of GPU work after the if(launch || * launch_leftovers statement)*/ clock_gettime(CLOCK_REALTIME, &t1); @@ -3044,14 +3051,17 @@ void runner_dopair1_launch_f4_one_memcpy( struct cell *cjj = pack_vars->cj_list[tid]; struct task *tii = pack_vars->task_list[tid]; - /*Let's lock ci*/ - while (cell_locktree(cii)) { - ; /* spin until we acquire the lock */ - } - /*Let's lock cj*/ - while (cell_locktree(cjj)) { - ; /* spin until we acquire the lock */ - } +// if(!pack_vars->task_locked){ +// /*Let's lock ci*/ +// while (cell_locktree(cii)) { +// ; /* spin until we acquire the lock */ +// } +// /*Let's lock cj*/ +// while (cell_locktree(cjj)) { +// ; /* spin until we acquire the lock */ +// } +// pack_vars->task_locked = 1; +// } const ticks tic = getticks(); @@ -3067,22 +3077,19 @@ void runner_dopair1_launch_f4_one_memcpy( /* Record things for debugging */ cii->gpu_done_pair++; cjj->gpu_done_pair++; - pthread_mutex_lock(&s->sleep_mutex); - atomic_dec(&s->waiting); - pthread_cond_broadcast(&s->sleep_cond); - pthread_mutex_unlock(&s->sleep_mutex); - // /* Release the locks */ - cell_unlocktree(cii); - // /* Release the locks */ - cell_unlocktree(cjj); + + if(pack_vars->task_locked){ + /* Release the locks */ + cell_unlocktree(cii); + /* Release the locks */ + cell_unlocktree(cjj); + pack_vars->task_locked = 0; + } /*Time end of unpacking*/ clock_gettime(CLOCK_REALTIME, &tp1); *unpack_time += (tp1.tv_sec - tp0.tv_sec) + (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; - - /*schedule my dependencies (Only unpacks really)*/ - enqueue_dependencies(s, tii); /*Signal sleeping runners*/ // MATTHIEU signal_sleeping_runners(s, tii); @@ -3090,9 +3097,17 @@ void runner_dopair1_launch_f4_one_memcpy( } } } + + + pthread_mutex_lock(&s->sleep_mutex); + atomic_sub(&s->waiting, pack_vars->top_tasks_packed); + pthread_cond_broadcast(&s->sleep_cond); + pthread_mutex_unlock(&s->sleep_mutex); + /* Zero counters for the next pack operations */ pack_vars->count_parts = 0; pack_vars->tasks_packed = 0; + // /*Time end of unpacking*/ // clock_gettime(CLOCK_REALTIME, &t1); // *packing_time += (t1.tv_sec - t0.tv_sec) + diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 36e9d07c10..8cc5cb9218 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -849,6 +849,8 @@ void *runner_main2(void *data) { pack_vars_pair_dens->task_list = (struct task **)calloc(target_n_tasks, sizeof(struct task *)); + pack_vars_pair_dens->top_task_list = + (struct task **)calloc(target_n_tasks, sizeof(struct task *)); pack_vars_pair_dens->ci_list = (struct cell **)calloc(target_n_tasks, sizeof(struct cell *)); pack_vars_pair_dens->cj_list = @@ -941,6 +943,8 @@ void *runner_main2(void *data) { pack_vars_pair_dens->tasks_packed = 0; pack_vars_self_dens->count_parts = 0; pack_vars_pair_dens->count_parts = 0; + pack_vars_pair_dens->task_locked = 0; + pack_vars_pair_dens->top_tasks_packed = 0; // Initialise packing counters pack_vars_self_forc->tasks_packed = 0; 
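/* The top_task_list / top_tasks_packed fields initialised here let the
 * runner release dependencies once per top-level pair task after a GPU
 * launch, rather than once per recursed leaf pair. A minimal sketch of that
 * release step, with stand-in types (not SWIFT's scheduler/task structs): */
struct toy_task { int deps_enqueued; };
struct toy_sched { volatile int waiting; };

static void release_top_tasks(struct toy_sched *s, struct toy_task **top_list,
                              int n_top) {
  for (int i = 0; i < n_top; i++)
    top_list[i]->deps_enqueued = 1; /* enqueue_dependencies() per top task */
  s->waiting -= n_top;              /* single decrement for the whole pack */
}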
pack_vars_pair_forc->tasks_packed = 0; @@ -1004,7 +1008,6 @@ void *runner_main2(void *data) { int g100 = 0; int l100 = 0; int maxcount = 0; - /* Loop while there are tasks... */ tasks_done_gpu_inc = 0; ticks hang_time = getticks(); @@ -1341,58 +1344,86 @@ void *runner_main2(void *data) { // message("count %i target %i", ci->hydro.count, np_per_cell); } - /*Call recursion here. This will be a function in runner_doiact_functions_hydro_gpu.h. * We are recursing separately to find out how much work we have before offloading*/ - //We need to allocate a list to put cell pointers into. We need to allocate a list of cell pair interaction. - int n_expected_cells = 1024; + //We need to allocate a list to put cell pointers into for each new task + int n_expected_tasks = 1024; int n_leafs_found = 0; int depth = 0; -// struct cell ** cells_left = (struct cell **)calloc(n_expected_cells, sizeof(struct cell *)); -// struct cell ** cells_right = (struct cell **)calloc(n_expected_cells, sizeof(struct cell *)); - struct cell * cells_left[n_expected_cells]; - struct cell * cells_right[n_expected_cells]; +// struct cell ** cells_left = (struct cell **)calloc(n_expected_tasks, sizeof(struct cell *)); +// struct cell ** cells_right = (struct cell **)calloc(n_expected_tasks, sizeof(struct cell *)); + struct cell * cells_left[n_expected_tasks]; + struct cell * cells_right[n_expected_tasks]; runner_recurse_gpu(r, sched, pack_vars_pair_dens, ci, cj, t, parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens, &n_leafs_found, cells_left, cells_right, depth); n_leafs_total += n_leafs_found; - /*Loop through n_daughters such that the pack_vars_pair_dens counters are updated*/ - for (int cid = 0; cid < n_leafs_found; cid++){ - packing_time_pair += runner_dopair1_pack_f4( + + int cstart = 0, cend = n_leafs_found; + + int cid = 0; + pack_vars_pair_dens->task_locked = 1; + int top_tasks_packed = pack_vars_pair_dens->top_tasks_packed; + pack_vars_pair_dens->top_tasks_packed++; + pack_vars_pair_dens->top_task_list[top_tasks_packed] = t; + int t_s, t_e; + t_s = 0; + while(cid < n_leafs_found){ + ////////////////////////////////////////////////////////////////////////////////// + /*Loop through n_daughters such that the pack_vars_pair_dens counters are updated*/ + for (cid = cstart; pack_vars_pair_dens->tasks_packed < pack_vars_pair_dens->target_n_tasks + && cid < n_leafs_found; cid++){ + + packing_time_pair += runner_dopair1_pack_f4( r, sched, pack_vars_pair_dens, cells_left[cid], cells_right[cid], t, parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens); +// if (pack_vars_pair_dens->unfinished) +// break; // message("Packing task %i in recursed tasks\n", cid); - } - /* Copies done. Release the lock ! */ - cell_unlocktree(ci); - cell_unlocktree(cj); - - t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; - /* Packed enough tasks or no pack tasks left in queue, flag that - * we want to run */ - int launch = pack_vars_pair_dens->launch; - int launch_leftovers = pack_vars_pair_dens->launch_leftovers; - - /* Do we have enough stuff to run the GPU ? */ - if (launch) n_full_p_d_bundles++; - if (launch_leftovers) n_partial_p_d_bundles++; - - // if ((sched->p_d_left[qid] < 1)){ - // launch_leftovers = 1; - // pack_vars_pair_dens->launch_leftovers = 1; - // } - if (launch || launch_leftovers) { - /*Launch GPU tasks*/ - int t_packed = pack_vars_pair_dens->tasks_packed; - // signal_sleeping_runners(sched, t, t_packed); - runner_dopair1_launch_f4_one_memcpy( + } + /* Copies done. Release the lock ! 
*/ + pack_vars_pair_dens->task_locked = 0; +// if(cid == n_leafs_found){ +// cell_unlocktree(ci); +// cell_unlocktree(cj); +// pack_vars_pair_dens->task_locked = 0; +// } + cstart = cid + 1; + t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; + /* Packed enough tasks or no pack tasks left in queue, flag that + * we want to run */ + int launch = pack_vars_pair_dens->launch; + int launch_leftovers = pack_vars_pair_dens->launch_leftovers; + + /* Do we have enough stuff to run the GPU ? */ + if (launch) n_full_p_d_bundles++; + if (launch_leftovers) n_partial_p_d_bundles++; + + if (launch || launch_leftovers) { + /*Launch GPU tasks*/ + int t_packed = pack_vars_pair_dens->tasks_packed; + // signal_sleeping_runners(sched, t, t_packed); + runner_dopair1_launch_f4_one_memcpy( r, sched, pack_vars_pair_dens, t, parts_aos_pair_f4_send, parts_aos_pair_f4_recv, d_parts_aos_pair_f4_send, d_parts_aos_pair_f4_recv, stream_pairs, d_a, d_H, e, &packing_time_pair, &time_for_density_gpu_pair, &unpacking_time_pair, fparti_fpartj_lparti_lpartj_dens, pair_end); + for (int tid = 0; tid < pack_vars_pair_dens->top_tasks_packed -1; tid++){ + /*schedule my dependencies (Only unpacks really)*/ + struct task *tii = pack_vars_pair_dens->top_task_list[tid]; + enqueue_dependencies(sched, tii); + } + pack_vars_pair_dens->top_tasks_packed = 1; + pack_vars_pair_dens->top_task_list[0] = t; + } + /////////////////////////////////////////////////////////////////////// } + cell_unlocktree(ci); + cell_unlocktree(cj); + pack_vars_pair_dens->task_locked = 0; pack_vars_pair_dens->launch_leftovers = 0; + #ifdef DO_CORNERS } /* End of GPU work Pairs */ #endif // DO_CORNERS From 7afbac12e263897622a163209c1e165ffc7dea02 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Mon, 3 Mar 2025 13:00:35 +0000 Subject: [PATCH 181/217] Removed incusion of un-necessary files at start of runner_main_clean.cu --- src/runner_main_clean.cu | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 8cc5cb9218..f439e16d87 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -158,16 +158,19 @@ extern "C" { #endif #include "cuda/part_gpu.h" -#include "files_for_new_functions/arrays_malloc.h" -#include "files_for_new_functions/host_device_data_transfer.h" +//#include "files_for_new_functions/arrays_malloc.h" +//#include "files_for_new_functions/host_device_data_transfer.h" +#include +#include +#include #include "runner_doiact_functions_hydro_gpu.h" #include "runner_gpu_pack_functions.h" // #include "./cuda/BLOCK_SIZE.h" #include "cuda/GPU_runner_functions.h" -#include -#include -#include +//#include +//#include +//#include #ifdef __cplusplus } From 5f623f4e2df92cdfa59ed99f5cfd65a37a999f13 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Tue, 4 Mar 2025 13:46:59 +0000 Subject: [PATCH 182/217] Removed un-necessary includes --- src/runner_main_clean.cu | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index f439e16d87..3df1e3041f 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -158,20 +158,13 @@ extern "C" { #endif #include "cuda/part_gpu.h" -//#include "files_for_new_functions/arrays_malloc.h" -//#include "files_for_new_functions/host_device_data_transfer.h" #include #include #include #include "runner_doiact_functions_hydro_gpu.h" #include "runner_gpu_pack_functions.h" -// #include "./cuda/BLOCK_SIZE.h" #include "cuda/GPU_runner_functions.h" -//#include -//#include -//#include - #ifdef 
__cplusplus } #endif From db4b681d693849355328c627e63fdc7cefd9b82f Mon Sep 17 00:00:00 2001 From: Abouzied Date: Tue, 4 Mar 2025 14:07:35 +0000 Subject: [PATCH 183/217] Removed commented out code which is no longer used. Left some bits of code at the end for usein de-bugging/timing if required --- src/runner_main_clean.cu | 109 +++------------------------------------ 1 file changed, 6 insertions(+), 103 deletions(-) diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 3df1e3041f..3100145413 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -180,24 +180,6 @@ inline cudaError_t checkCuda(cudaError_t result) { return result; } -// inline void gpuErrchk(cudaError_t code) { -// #define __FILE__ __LINE__ -// inline void gpuAssert(cudaError_t code, const char *file, int line) { -// int abort = 0; -// if (code != cudaSuccess) { -// // fprintf( stderr, "cudaCheckError() failed at -// //%s:%i : %s\n", -// // file, line, cudaGetErrorString( code ) ); -// abort = 1; -// fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), -// file, -// line); -// if (abort) -// exit(code); -// } -// } -// } - void *runner_main2(void *data) { struct runner *r = (struct runner *)data; struct engine *e = r->e; @@ -275,7 +257,6 @@ void *runner_main2(void *data) { fprintf(stderr, "free mem %lu, total mem %lu\n", free_mem, total_mem); // how many tasks do we want for each launch of GPU kernel - // fprintf(stderr,"pack_size is %i\n", sched->pack_size); const int target_n_tasks = sched->pack_size; const int target_n_tasks_pair = sched->pack_size_pair; pack_vars_self_dens->target_n_tasks = target_n_tasks; @@ -295,8 +276,6 @@ void *runner_main2(void *data) { pack_vars_pair_forc->bundle_size = bundle_size_pair; pack_vars_self_grad->bundle_size = bundle_size; pack_vars_pair_grad->bundle_size = bundle_size_pair; - // fprintf(stderr, "size %i size %i\n", sizeof(*pack_vars_self_dens), - // sizeof(pack_vars_self)); const int bundle_size_pair = bundle_size/2; // Keep track of first and last particles for each task (particle data is // arranged in long arrays containing particles from all the tasks we will // work with) @@ -332,8 +311,6 @@ void *runner_main2(void *data) { cudaMallocHost((void **)&fparti_fpartj_lparti_lpartj_dens, target_n_tasks * sizeof(int4)); - // cudaMalloc((void**)&d_fparti_fpartj_lparti_lpartj_dens, - // target_n_tasks * sizeof(int4)); cudaMallocHost((void **)&fparti_fpartj_lparti_lpartj_forc, target_n_tasks * sizeof(int4)); @@ -345,9 +322,6 @@ void *runner_main2(void *data) { cudaMalloc((void **)&d_fparti_fpartj_lparti_lpartj_grad, target_n_tasks * sizeof(int4)); - // cudaMallocManaged((void**)&d_task_last_part_self_dens_f4, - // target_n_tasks * sizeof(int), cudaMemAttachGlobal); - // Arrays keeping track of the row numbers of the first and last particles // within each bundle. 
Required by the GPU code @@ -738,54 +712,36 @@ void *runner_main2(void *data) { struct part_aos_f4_g_send *d_parts_aos_pair_f4_g_send; struct part_aos_f4_g_recv *d_parts_aos_pair_f4_g_recv; - // cudaMalloc((void**)&d_parts_aos_dens, count_max_parts_tmp * sizeof(struct - // part_aos)); cudaMalloc((void **)&d_parts_aos_f4_send, count_max_parts_tmp * sizeof(struct part_aos_f4_send)); cudaMalloc((void **)&d_parts_aos_f4_recv, count_max_parts_tmp * sizeof(struct part_aos_f4_recv)); - // cudaMalloc((void**)&d_parts_aos_dens_f4, count_max_parts_tmp * - // sizeof(struct part_aos_f4)); cudaMalloc((void**)&d_parts_aos_forc, - // count_max_parts_tmp * sizeof(struct part_aos_f)); - // cudaMalloc((void**)&d_parts_aos_forc_f4, count_max_parts_tmp * - // sizeof(struct part_aos_f4_f)); + cudaMalloc((void **)&d_parts_aos_forc_f4_send, count_max_parts_tmp * sizeof(struct part_aos_f4_f_send)); cudaMalloc((void **)&d_parts_aos_forc_f4_recv, count_max_parts_tmp * sizeof(struct part_aos_f4_f_recv)); - // cudaMalloc((void**)&d_parts_aos_grad, count_max_parts_tmp * sizeof(struct - // part_aos_g)); cudaMalloc((void**)&d_parts_aos_grad_f4, count_max_parts_tmp - // * sizeof(struct part_aos_f4_g)); + cudaMalloc((void **)&d_parts_aos_grad_f4_send, count_max_parts_tmp * sizeof(struct part_aos_f4_g_send)); cudaMalloc((void **)&d_parts_aos_grad_f4_recv, count_max_parts_tmp * sizeof(struct part_aos_f4_g_recv)); - // cudaMallocHost((void **)&parts_aos_dens, count_max_parts_tmp * - // sizeof(struct part_aos)); cudaMallocHost((void **)&parts_aos_f4_send, count_max_parts_tmp * sizeof(struct part_aos_f4_send)); cudaMallocHost((void **)&parts_aos_f4_recv, count_max_parts_tmp * sizeof(struct part_aos_f4_recv)); - // cudaMallocHost((void **)&parts_aos_dens_f4, count_max_parts_tmp * - // sizeof(struct part_aos_f4)); cudaMallocHost((void **)&parts_aos_forc, - // count_max_parts_tmp * sizeof(struct part_aos_f)); cudaMallocHost((void - // **)&parts_aos_forc_f4, count_max_parts_tmp * sizeof(struct - // part_aos_f4_f)); + cudaMallocHost((void **)&parts_aos_forc_f4_send, count_max_parts_tmp * sizeof(struct part_aos_f4_f_send)); cudaMallocHost((void **)&parts_aos_forc_f4_recv, count_max_parts_tmp * sizeof(struct part_aos_f4_f_recv)); - // cudaMallocHost((void **)&parts_aos_grad, count_max_parts_tmp * - // sizeof(struct part_aos_g)); cudaMallocHost((void **)&parts_aos_grad_f4, - // count_max_parts_tmp * sizeof(struct part_aos_f4_g)); + cudaMallocHost((void **)&parts_aos_grad_f4_send, count_max_parts_tmp * sizeof(struct part_aos_f4_g_send)); cudaMallocHost((void **)&parts_aos_grad_f4_recv, count_max_parts_tmp * sizeof(struct part_aos_f4_g_recv)); - // cudaMalloc((void**)&d_parts_aos_pair_dens, 2 * count_max_parts_tmp * - // sizeof(struct part_aos)); cudaMalloc((void **)&d_parts_aos_pair_f4_send, 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_send)); cudaMalloc((void **)&d_parts_aos_pair_f4_recv, @@ -810,8 +766,6 @@ void *runner_main2(void *data) { ///////////Probably not needed /// anymore//////////////////////////////////////////////////////////////// - // cudaMallocHost((void **)&parts_aos_pair_dens, 2 * count_max_parts_tmp * - // sizeof(struct part_aos)); cudaMallocHost((void **)&parts_aos_pair_f4_send, 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_send)); cudaMallocHost((void **)&parts_aos_pair_f4_recv, @@ -977,7 +931,6 @@ void *runner_main2(void *data) { snprintf(buf5, sizeof(buf5), "t%dr%dstep%d", r->cpuid, engine_rank, step); #ifdef DUMP_TIMINGS FILE *fgpu_steps; - // if(step == 0 || step%10 == 0)fgpu_steps = fopen(buf5, 
"w"); fgpu_steps = fopen(buf5, "w"); #endif // if (step == 0) cudaProfilerStart(); @@ -1008,7 +961,6 @@ void *runner_main2(void *data) { tasks_done_gpu_inc = 0; ticks hang_time = getticks(); while (1) { - // ticks tic_get_task = getticks(); // A. Nasar: Get qid for re-use later int qid = r->qid; /* If there's no old task, try to get a new one. */ @@ -1093,24 +1045,15 @@ void *runner_main2(void *data) { maxcount = max(maxcount, ci->hydro.count); if (ci->hydro.count > 1.5 * np_per_cell) { n_w_prts_gtr_target_d++; -// message("count %i target %i", ci->hydro.count, np_per_cell); } - // error("There's %i parts in a cell when it should - // be %i max", ci->hydro.count, np_per_cell); - /*Packed enough tasks let's go*/ + /*Packed enough tasks. Let's go*/ int launch = pack_vars_self_dens->launch; - - // if ((sched->s_d_left[qid] < 1)){ - // launch_leftovers = 1; - // pack_vars_self_dens->launch_leftovers = 1; - // } /* Do we have enough stuff to run the GPU ? */ if (launch) n_full_d_bundles++; if (launch_leftovers) n_partial_d_bundles++; if (launch || launch_leftovers) { /*Launch GPU tasks*/ int t_packed = pack_vars_self_dens->tasks_packed; - // signal_sleeping_runners(sched, t, t_packed); runner_doself1_launch_f4( r, sched, pack_vars_self_dens, ci, t, parts_aos_f4_send, parts_aos_f4_recv, d_parts_aos_f4_send, d_parts_aos_f4_recv, @@ -1130,7 +1073,6 @@ void *runner_main2(void *data) { maxcount = max(maxcount, ci->hydro.count); if (ci->hydro.count > 1.5 * np_per_cell) { n_w_prts_gtr_target_g++; -// message("count %i target %i", ci->hydro.count, np_per_cell); } packing_time_g += runner_doself1_pack_f4_g( r, sched, pack_vars_self_grad, ci, t, parts_aos_grad_f4_send, @@ -1143,10 +1085,6 @@ void *runner_main2(void *data) { /*Packed enough tasks let's go*/ int launch = pack_vars_self_grad->launch; - // if ((sched->s_g_left[qid] < 1)){ - // launch_leftovers = 1; - // pack_vars_self_grad->launch_leftovers = 1; - // } /* Do we have enough stuff to run the GPU ? */ if (launch || launch_leftovers) { /*Launch GPU tasks*/ @@ -1169,7 +1107,6 @@ void *runner_main2(void *data) { maxcount = max(maxcount, ci->hydro.count); if (ci->hydro.count > 1.5 * np_per_cell) { n_w_prts_gtr_target_f++; -// message("count %i target %i", ci->hydro.count, np_per_cell); } packing_time_f += runner_doself1_pack_f4_f( r, sched, pack_vars_self_forc, ci, t, parts_aos_forc_f4_send, @@ -1182,10 +1119,6 @@ void *runner_main2(void *data) { /*Packed enough tasks let's go*/ int launch = pack_vars_self_forc->launch; - // if ((sched->s_f_left[qid] < 1)){ - // launch_leftovers = 1; - // pack_vars_self_forc->launch_leftovers = 1; - // } /* Do we have enough stuff to run the GPU ? */ if (launch || launch_leftovers) { /*Launch GPU tasks*/ @@ -1296,8 +1229,6 @@ void *runner_main2(void *data) { packing_time_pair += (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; if ((sid == 0 || sid == 2 || sid == 6 || sid == 8) && step > 1) { - // if((sid != 4 && sid != 10 && sid == 12) && - // step > 1){ clock_gettime(CLOCK_REALTIME, &t0); runner_dopair1_branch_density(r, ci, cj); t->corner_pair = 1; @@ -1337,7 +1268,6 @@ void *runner_main2(void *data) { maxcount = max(maxcount, ci->hydro.count); if (ci->hydro.count > 1.5 * np_per_cell) { n_w_prts_gtr_target_p_d++; - // message("count %i target %i", ci->hydro.count, np_per_cell); } /*Call recursion here. This will be a function in runner_doiact_functions_hydro_gpu.h. 
@@ -1346,8 +1276,6 @@ void *runner_main2(void *data) { int n_expected_tasks = 1024; int n_leafs_found = 0; int depth = 0; -// struct cell ** cells_left = (struct cell **)calloc(n_expected_tasks, sizeof(struct cell *)); -// struct cell ** cells_right = (struct cell **)calloc(n_expected_tasks, sizeof(struct cell *)); struct cell * cells_left[n_expected_tasks]; struct cell * cells_right[n_expected_tasks]; runner_recurse_gpu(r, sched, pack_vars_pair_dens, ci, cj, t, @@ -1490,10 +1418,6 @@ void *runner_main2(void *data) { /*Packed enough tasks, let's go*/ int launch = pack_vars_pair_grad->launch; - // if ((sched->p_g_left[qid] < 1)){ - // launch_leftovers = 1; - // pack_vars_pair_grad->launch_leftovers = 1; - // } /* Do we have enough stuff to run the GPU ? */ if (launch || launch_leftovers) { /*Launch GPU tasks*/ @@ -1558,9 +1482,7 @@ void *runner_main2(void *data) { } } else { #endif // DO_CORNERS - // runner_dopair1_pack_f(r, sched, pack_vars_pair_forc, - // ci, cj, t, parts_aos_pair_forc, e, - // &packing_time_f); + ticks tic_cpu_pack = getticks(); packing_time_pair_f += @@ -1689,12 +1611,10 @@ void *runner_main2(void *data) { #ifdef EXTRA_HYDRO_LOOP else if (t->subtype == task_subtype_gradient) { runner_dosub_self1_gradient(r, ci, 1); - // fprintf(stderr, "split a g task\n"); } #endif else if (t->subtype == task_subtype_force) { runner_dosub_self2_force(r, ci, 1); - // fprintf(stderr, "split a f task\n"); } else if (t->subtype == task_subtype_limiter) runner_dosub_self1_limiter(r, ci, 1); else if (t->subtype == task_subtype_stars_density) @@ -1735,18 +1655,15 @@ void *runner_main2(void *data) { case task_type_sub_pair: if (t->subtype == task_subtype_density) { int nothing = 0; - // message("Doing a pair sub task"); runner_dosub_pair1_density(r, ci, cj, 1); } #ifdef EXTRA_HYDRO_LOOP else if (t->subtype == task_subtype_gradient) { runner_dosub_pair1_gradient(r, ci, cj, 1); - // fprintf(stderr, "split a g task\n"); } #endif else if (t->subtype == task_subtype_force) { runner_dosub_pair2_force(r, ci, cj, 1); - // fprintf(stderr, "split a f task\n"); } else if (t->subtype == task_subtype_limiter) runner_dosub_pair1_limiter(r, ci, cj, 1); else if (t->subtype == task_subtype_stars_density) @@ -1999,9 +1916,6 @@ void *runner_main2(void *data) { error("Unknown/invalid task type (%d).", t->type); } r->active_time += (getticks() - task_beg); -// if(g100 > 0) -// message("less than 100 %i more than 100 %i max count %i", l100, g100, -// maxcount); /* Mark that we have run this task on these cells */ #ifdef SWIFT_DEBUG_CHECKS @@ -2323,14 +2237,3 @@ void *runner_main2(void *data) { #endif // WITH_CUDA -#include -#include -#include - -// uint64_t time_used ( ) { -// struct rusage ru; -// struct timeval t; -// getrusage(RUSAGE_THREAD,&ru); -// t = ru.ru_utime; -// return (uint64_t) t.tv_sec*1000 + t.tv_usec/1000; -// } From f79b477c34afc551ca65f1bdd86ceacea107fbd6 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Tue, 4 Mar 2025 14:15:53 +0000 Subject: [PATCH 184/217] Removed DO_CORNERS ifdefs as will not be required when code recurses --- src/runner_main_clean.cu | 146 +-------------------------------------- 1 file changed, 3 insertions(+), 143 deletions(-) diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 3100145413..0ec0add743 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -1219,49 +1219,6 @@ void *runner_main2(void *data) { else if (t->subtype == task_subtype_gpu_pack_d) { packed_pair++; #ifdef GPUOFFLOAD_DENSITY -#ifdef DO_CORNERS - struct timespec t0, t1, dt; 
- clock_gettime(CLOCK_REALTIME, &t0); - double shift[3] = {0.0}; - t->corner_pair = 0; - int sid = space_getsid_filter(e->s, &ci, &cj, shift); - clock_gettime(CLOCK_REALTIME, &t1); - packing_time_pair += (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - if ((sid == 0 || sid == 2 || sid == 6 || sid == 8) && step > 1) { - clock_gettime(CLOCK_REALTIME, &t0); - runner_dopair1_branch_density(r, ci, cj); - t->corner_pair = 1; - int qid = r->qid; - atomic_dec(&(sched->queues[qid].n_packs_pair_left)); - /* Tell the cells they have been packed */ - ci->pack_done++; - cj->pack_done++; - t->done = 1; - int launch = 0, launch_leftovers = 0; - if ((sched->queues[qid].n_packs_pair_left == 0)) - launch_leftovers = 1; - /* Tasks done. Release the lock ! */ - task_unlock(t); - /*schedule my dependencies (Only unpacks really)*/ - enqueue_dependencies(sched, t); - /*Signal sleeping runners*/ - signal_sleeping_runners(sched, t); - clock_gettime(CLOCK_REALTIME, &t1); - packing_time_pair += (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - if (launch_leftovers) { - pack_vars_pair_dens->launch_leftovers = 1; - runner_dopair1_launch_f4_one_memcpy( - r, sched, pack_vars_pair_dens, t, parts_aos_pair_f4_send, - parts_aos_pair_f4_recv, d_parts_aos_pair_f4_send, - d_parts_aos_pair_f4_recv, stream_pairs, d_a, d_H, e, - &packing_time_pair, &time_for_density_gpu_pair, - &unpacking_time_pair, fparti_fpartj_lparti_lpartj_dens, - pair_end); - } - } else { -#endif // DO_CORNERS ticks tic_cpu_pack = getticks(); n_cells_p_d++; @@ -1269,7 +1226,7 @@ void *runner_main2(void *data) { if (ci->hydro.count > 1.5 * np_per_cell) { n_w_prts_gtr_target_p_d++; } - + /////////////////////W.I.P!!!//////////////////////////////////////////////////////// /*Call recursion here. This will be a function in runner_doiact_functions_hydro_gpu.h. * We are recursing separately to find out how much work we have before offloading*/ //We need to allocate a list to put cell pointers into for each new task @@ -1348,57 +1305,13 @@ void *runner_main2(void *data) { pack_vars_pair_dens->task_locked = 0; pack_vars_pair_dens->launch_leftovers = 0; -#ifdef DO_CORNERS - } /* End of GPU work Pairs */ -#endif // DO_CORNERS + /////////////////////W.I.P!!!//////////////////////////////////////////////////////// + #endif // GPUOFFLOAD_DENSITY } /* pair / pack */ else if (t->subtype == task_subtype_gpu_pack_g) { packed_pair_g++; #ifdef GPUOFFLOAD_GRADIENT -#ifdef DO_CORNERS - struct timespec t0, t1, dt; - clock_gettime(CLOCK_REALTIME, &t0); - double shift[3] = {0.0}; - t->corner_pair = 0; - int sid = space_getsid_filter(e->s, &ci, &cj, shift); - clock_gettime(CLOCK_REALTIME, &t1); - packing_time_pair += (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - if ((sid == 0 || sid == 2 || sid == 6 || sid == 8) && step > 1) { - clock_gettime(CLOCK_REALTIME, &t0); - runner_dopair1_branch_gradient(r, ci, cj); - t->corner_pair = 1; - int qid = r->qid; - atomic_dec(&(sched->queues[qid].n_packs_pair_left_g)); - /* Tell the cells they have been packed */ - ci->pack_done++; - cj->pack_done++; - t->done = 1; - int launch = 0, launch_leftovers = 0; - if ((sched->queues[qid].n_packs_pair_left_g == 0)) - launch_leftovers = 1; - /* Tasks done. Release the lock ! 
*/ - task_unlock(t); - /*schedule my dependencies (Only unpacks really)*/ - enqueue_dependencies(sched, t); - /*Signal sleeping runners*/ - signal_sleeping_runners(sched, t); - clock_gettime(CLOCK_REALTIME, &t1); - packing_time_pair_g += (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - if (launch_leftovers) { - pack_vars_pair_grad->launch_leftovers = 1; - runner_dopair1_launch_f4_g_one_memcpy( - r, sched, pack_vars_pair_grad, t, parts_aos_pair_f4_g_send, - parts_aos_pair_f4_g_recv, d_parts_aos_pair_f4_g_send, - d_parts_aos_pair_f4_g_recv, stream_pairs, d_a, d_H, e, - &packing_time_pair_g, &time_for_gpu_pair_g, - &unpacking_time_pair_g, fparti_fpartj_lparti_lpartj_grad, - pair_end_g); - } - } else { -#endif // DO_CORNERS ticks tic_cpu_pack = getticks(); n_cells_p_g++; maxcount = max(maxcount, ci->hydro.count); @@ -1410,14 +1323,11 @@ void *runner_main2(void *data) { runner_dopair1_pack_f4_g(r, sched, pack_vars_pair_grad, ci, cj, t, parts_aos_pair_f4_g_send, e, fparti_fpartj_lparti_lpartj_grad); - t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; - /* No pack tasks left in queue, flag that we want to run */ int launch_leftovers = pack_vars_pair_grad->launch_leftovers; /*Packed enough tasks, let's go*/ int launch = pack_vars_pair_grad->launch; - /* Do we have enough stuff to run the GPU ? */ if (launch || launch_leftovers) { /*Launch GPU tasks*/ @@ -1432,57 +1342,10 @@ void *runner_main2(void *data) { pair_end_g); } pack_vars_pair_grad->launch_leftovers = 0; -#ifdef DO_CORNERS - } /* End of GPU work Pairs */ -#endif // DO_CORNERS #endif // GPUOFFLOAD_GRADIENT } else if (t->subtype == task_subtype_gpu_pack_f) { packed_pair_f++; #ifdef GPUOFFLOAD_FORCE -#ifdef DO_CORNERS - struct timespec t0, t1, dt; - clock_gettime(CLOCK_REALTIME, &t0); - double shift[3] = {0.0}; - t->corner_pair = 0; - int sid = space_getsid_filter(e->s, &ci, &cj, shift); - clock_gettime(CLOCK_REALTIME, &t1); - packing_time_pair += (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - if ((sid == 0 || sid == 2 || sid == 6 || sid == 8) && step > 1) { - // if((sid != 4 && sid != 10 && sid == 12) && step > 1){ - runner_dopair1_branch_force(r, ci, cj); - t->corner_pair = 1; - int qid = r->qid; - atomic_dec(&(sched->queues[qid].n_packs_pair_left_f)); - /* Tell the cells they have been packed */ - ci->pack_done++; - cj->pack_done++; - t->done = 1; - int launch = 0, launch_leftovers = 0; - if ((sched->queues[qid].n_packs_pair_left_f == 0)) - launch_leftovers = 1; - /* Tasks done. Release the lock ! 
*/ - task_unlock(t); - /*schedule my dependencies (Only unpacks really)*/ - enqueue_dependencies(sched, t); - /*Signal sleeping runners*/ - signal_sleeping_runners(sched, t); - clock_gettime(CLOCK_REALTIME, &t1); - packing_time_pair_f += (t1.tv_sec - t0.tv_sec) + - (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - if (launch_leftovers) { - pack_vars_pair_forc->launch_leftovers = 1; - runner_dopair1_launch_f4_f_one_memcpy( - r, sched, pack_vars_pair_forc, t, parts_aos_pair_f4_f_send, - parts_aos_pair_f4_f_recv, d_parts_aos_pair_f4_f_send, - d_parts_aos_pair_f4_f_recv, stream_pairs, d_a, d_H, e, - &packing_time_pair_f, &time_for_gpu_pair_f, - &unpacking_time_pair_f, fparti_fpartj_lparti_lpartj_forc, - pair_end_f); - } - } else { -#endif // DO_CORNERS - ticks tic_cpu_pack = getticks(); packing_time_pair_f += @@ -1520,9 +1383,6 @@ void *runner_main2(void *data) { pack_vars_pair_forc->launch_leftovers = 0; } /* End of GPU work Pairs */ -#ifdef DO_CORNERS - } -#endif // DO_CORNERS #endif // GPUOFFLOAD_FORCE } else if (t->subtype == task_subtype_gpu_unpack_d) { unpacked_pair++; From 0a108fab219772d55a37e51ff8530adb83221641 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Tue, 4 Mar 2025 14:29:33 +0000 Subject: [PATCH 185/217] Cleaned up runner_main_clean.cu a bit more: Removed un-necessary debug code and some more commented out code --- src/runner_main_clean.cu | 56 +++------------------------------------- 1 file changed, 4 insertions(+), 52 deletions(-) diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 0ec0add743..02ee5e2c15 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -1032,25 +1032,16 @@ void *runner_main2(void *data) { packed_self++; #ifdef GPUOFFLOAD_DENSITY ticks tic_cpu_pack = getticks(); - packing_time += runner_doself1_pack_f4(r, sched, pack_vars_self_dens, ci, t, parts_aos_f4_send, task_first_part_f4); - + //Record times for task analysis t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; - /* No pack tasks left in queue, flag that we want to run */ int launch_leftovers = pack_vars_self_dens->launch_leftovers; - n_cells_d++; - maxcount = max(maxcount, ci->hydro.count); - if (ci->hydro.count > 1.5 * np_per_cell) { - n_w_prts_gtr_target_d++; - } /*Packed enough tasks. Let's go*/ int launch = pack_vars_self_dens->launch; /* Do we have enough stuff to run the GPU ? */ - if (launch) n_full_d_bundles++; - if (launch_leftovers) n_partial_d_bundles++; if (launch || launch_leftovers) { /*Launch GPU tasks*/ int t_packed = pack_vars_self_dens->tasks_packed; @@ -1066,30 +1057,20 @@ void *runner_main2(void *data) { else if (t->subtype == task_subtype_gpu_pack_g) { packed_self_g++; #ifdef GPUOFFLOAD_GRADIENT - ticks tic_cpu_pack = getticks(); - - n_cells_g++; - maxcount = max(maxcount, ci->hydro.count); - if (ci->hydro.count > 1.5 * np_per_cell) { - n_w_prts_gtr_target_g++; - } packing_time_g += runner_doself1_pack_f4_g( r, sched, pack_vars_self_grad, ci, t, parts_aos_grad_f4_send, task_first_part_f4_g); - + //Record times for task analysis t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; - /* No pack tasks left in queue, flag that we want to run */ int launch_leftovers = pack_vars_self_grad->launch_leftovers; /*Packed enough tasks let's go*/ int launch = pack_vars_self_grad->launch; - /* Do we have enough stuff to run the GPU ? 
*/ if (launch || launch_leftovers) { /*Launch GPU tasks*/ int t_packed = pack_vars_self_grad->tasks_packed; - // signal_sleeping_runners(sched, t, t_packed); runner_doself1_launch_f4_g( r, sched, pack_vars_self_grad, ci, t, parts_aos_grad_f4_send, parts_aos_grad_f4_recv, d_parts_aos_grad_f4_send, @@ -1102,28 +1083,19 @@ void *runner_main2(void *data) { packed_self_f++; #ifdef GPUOFFLOAD_FORCE ticks tic_cpu_pack = getticks(); - - n_cells_f++; - maxcount = max(maxcount, ci->hydro.count); - if (ci->hydro.count > 1.5 * np_per_cell) { - n_w_prts_gtr_target_f++; - } packing_time_f += runner_doself1_pack_f4_f( r, sched, pack_vars_self_forc, ci, t, parts_aos_forc_f4_send, task_first_part_f4_f); - + //Record times for task analysis t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; - /* No pack tasks left in queue, flag that we want to run */ int launch_leftovers = pack_vars_self_forc->launch_leftovers; /*Packed enough tasks let's go*/ int launch = pack_vars_self_forc->launch; - /* Do we have enough stuff to run the GPU ? */ if (launch || launch_leftovers) { /*Launch GPU tasks*/ int t_packed = pack_vars_self_forc->tasks_packed; - // signal_sleeping_runners(sched, t, t_packed); runner_doself1_launch_f4_f( r, sched, pack_vars_self_forc, ci, t, parts_aos_forc_f4_send, parts_aos_forc_f4_recv, d_parts_aos_forc_f4_send, @@ -1201,8 +1173,6 @@ void *runner_main2(void *data) { case task_type_pair: if (t->subtype == task_subtype_density) { - /* Abouzied: To be commented out when the GPU pairs have been coded - * up */ cpu_pair++; #ifndef GPUOFFLOAD_DENSITY struct timespec t0, t1, dt; @@ -1313,12 +1283,6 @@ void *runner_main2(void *data) { packed_pair_g++; #ifdef GPUOFFLOAD_GRADIENT ticks tic_cpu_pack = getticks(); - n_cells_p_g++; - maxcount = max(maxcount, ci->hydro.count); - if (ci->hydro.count > 1.5 * np_per_cell) { - n_w_prts_gtr_target_p_g++; - // message("count %i target %i", ci->hydro.count, np_per_cell); - } packing_time_pair_g += runner_dopair1_pack_f4_g(r, sched, pack_vars_pair_grad, ci, cj, t, parts_aos_pair_f4_g_send, e, @@ -1347,27 +1311,15 @@ void *runner_main2(void *data) { packed_pair_f++; #ifdef GPUOFFLOAD_FORCE ticks tic_cpu_pack = getticks(); - + /*Pack data and increment counters checking if we should run on the GPU after packing this task*/ packing_time_pair_f += runner_dopair1_pack_f4_f(r, sched, pack_vars_pair_forc, ci, cj, t, parts_aos_pair_f4_f_send, e, fparti_fpartj_lparti_lpartj_forc); - n_cells_p_f++; - maxcount = max(maxcount, ci->hydro.count); - if (ci->hydro.count > 1.5 * np_per_cell) { - n_w_prts_gtr_target_p_f++; - // message("count %i target %i", ci->hydro.count, np_per_cell); - } - t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; - /* No pack tasks left in queue, flag that we want to run */ int launch_leftovers = pack_vars_pair_forc->launch_leftovers; /*Packed enough tasks let's go*/ int launch = pack_vars_pair_forc->launch; - // if ((sched->p_f_left[qid] < 1)){ - // launch_leftovers = 1; - // pack_vars_pair_forc->launch_leftovers = 1; - // } /* Do we have enough stuff to run the GPU ? 
*/ if (launch || launch_leftovers) { /*Launch GPU tasks*/ From 1dd7b16f421f2ae0f25aa1ac19cdf9945bfb2a97 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Tue, 4 Mar 2025 14:48:20 +0000 Subject: [PATCH 186/217] Removed DO_CORNERS def and un-necessary code from start of runner_main_clean.cu --- src/runner_main_clean.cu | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 02ee5e2c15..c9f697c661 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -23,7 +23,6 @@ #define GPUOFFLOAD_GRADIENT 1 // off-load hydro gradient to GPU #define GPUOFFLOAD_FORCE 1 // off-load hydro force to GPU -// #define DO_CORNERS 1 //do corner pair tasks on CPU // #define DUMP_TIMINGS 1 #include "../config.h" @@ -151,7 +150,7 @@ extern "C" { * @param data A pointer to this thread's data. **/ -/* CUDA Header */ +/* CUDA Header. Wrap in extern "C" to prevent C++ function name mangling */ #ifdef WITH_CUDA #ifdef __cplusplus extern "C" { @@ -168,17 +167,6 @@ extern "C" { #ifdef __cplusplus } #endif -// Convenience function for checking CUDA runtime API results -// can be wrapped around any runtime API call. No-op in release builds. -#define CUDA_DEBUG - -inline cudaError_t checkCuda(cudaError_t result) { - if (result != cudaSuccess) { - fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result)); - assert(result == cudaSuccess); - } - return result; -} void *runner_main2(void *data) { struct runner *r = (struct runner *)data; From 6ffe37d558cb42bb00ca3a960aaa05325829f566 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Wed, 5 Mar 2025 16:35:49 +0000 Subject: [PATCH 187/217] Removed un-necessary (no longer used) self launch functions --- src/runner_doiact_functions_hydro_gpu.h | 553 ------------------------ 1 file changed, 553 deletions(-) diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index a158b95cec..8ed9f3b72d 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -1214,192 +1214,6 @@ double runner_dopair1_pack_f4_f(struct runner *r, struct scheduler *s, return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; } -void runner_doself1_launch(struct runner *r, struct scheduler *s, - struct pack_vars_self *pack_vars, struct cell *ci, - struct task *t, struct part_aos *parts_aos, - struct part_aos *d_parts_aos, cudaStream_t *stream, - float d_a, float d_H, struct engine *e, - double *packing_time, double *gpu_time, - double *hmemcpy_time) { - - struct timespec t0, t1, t0hmemcpy, t1hmemcpy, tp0, tp1; // - clock_gettime(CLOCK_REALTIME, &t0); - - /* Identify the number of GPU bundles to run in ideal case*/ - int nBundles_temp = pack_vars->nBundles; - - /*How many tasks have we packed?*/ - const int tasks_packed = pack_vars->tasks_packed; - - /*How many tasks should be in a bundle?*/ - const int bundle_size = pack_vars->bundle_size; - - /* Special case for incomplete bundles (when having leftover tasks not enough - * to fill a bundle) */ - if (pack_vars->launch_leftovers) { - nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; - if (tasks_packed == 0) - error("zero tasks packed but somehow got into GPU loop"); - pack_vars->bundle_first_part[nBundles_temp] = - pack_vars->task_first_part[tasks_packed - 1]; - } - /* Identify the last particle for each bundle of tasks */ - for (int bid = 0; bid < nBundles_temp - 1; bid++) { - pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; - } 
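/* [Editor's note] The bundle bookkeeping above (kept unchanged in the
 * retained *_f4 launch functions) can be read as follows; a small worked
 * example, assuming the names used in this file:
 *
 *   // e.g. bundle_size = 4 and tasks_packed = 10 in the leftover case:
 *   nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size;  // = 3 (ceiling division)
 *
 * so the last bundle carries only 2 tasks. Each bundle bid then owns the
 * contiguous slice [bundle_first_part[bid], bundle_last_part[bid]) of the
 * thread-local host buffer, with the final bundle closed off at count_parts
 * just below; those slices are exactly what the per-stream cudaMemcpyAsync
 * calls move to and from the device. */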
- /* special treatment for the last bundle */ - if (nBundles_temp > 1) - pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; - else - pack_vars->bundle_last_part[0] = pack_vars->count_parts; - clock_gettime(CLOCK_REALTIME, &t0hmemcpy); - /*Copy arrays containing first and last part for each task to GPU*/ - cudaMemcpy(pack_vars->d_task_first_part, pack_vars->task_first_part, - tasks_packed * sizeof(int), cudaMemcpyHostToDevice); - cudaMemcpy(pack_vars->d_task_last_part, pack_vars->task_last_part, - tasks_packed * sizeof(int), cudaMemcpyHostToDevice); - - /*Copy cell shifts to device*/ - cudaMemcpy(pack_vars->d_cellx, pack_vars->cellx, - tasks_packed * sizeof(double), cudaMemcpyHostToDevice); - cudaMemcpy(pack_vars->d_celly, pack_vars->celly, - tasks_packed * sizeof(double), cudaMemcpyHostToDevice); - cudaMemcpy(pack_vars->d_cellz, pack_vars->cellz, - tasks_packed * sizeof(double), cudaMemcpyHostToDevice); - clock_gettime(CLOCK_REALTIME, &t1hmemcpy); - *hmemcpy_time += (t1hmemcpy.tv_sec - t0hmemcpy.tv_sec) + - (t1hmemcpy.tv_nsec - t0hmemcpy.tv_nsec) / 1000000000.0; - /* Launch the copies for each bundle and run the GPU kernel */ - /*We don't go into this loop if tasks_left_self == 1 as - nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ - int max_parts = 0; - for (int bid = 0; bid < nBundles_temp; bid++) { - - max_parts = 0; - int parts_in_bundle = 0; - for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { - if (tid < tasks_packed) { - /*Get an estimate for the max number of parts per cell in the bundle. - * Used for determining the number of GPU CUDA blocks*/ - int count = - pack_vars->task_last_part[tid] - pack_vars->task_first_part[tid]; - parts_in_bundle += count; - max_parts = max(max_parts, count); - } - } - - const int first_part_tmp = pack_vars->bundle_first_part[bid]; - const int bundle_n_parts = - pack_vars->bundle_last_part[bid] - first_part_tmp; - - cudaMemcpyAsync(&d_parts_aos[first_part_tmp], &parts_aos[first_part_tmp], - bundle_n_parts * sizeof(struct part_aos), - cudaMemcpyHostToDevice, stream[bid]); - - // #ifdef CUDA_DEBUG - // cudaError_t cu_error = cudaPeekAtLastError(); // cudaGetLastError(); - //// - // // - // Get error code if (cu_error != cudaSuccess) { fprintf( - // stderr, "CUDA error in density self host 2 device - // memcpy: %s cpuid id is: %i\n ", - // cudaGetErrorString(cu_error), r->cpuid); - // exit(0); - // } - // #endif - const int tasksperbundle = pack_vars->tasksperbundle; - int tasks_left = tasksperbundle; - if (bid == nBundles_temp - 1) { - tasks_left = tasks_packed - (nBundles_temp - 1) * tasksperbundle; - } - // Will launch a 2d grid of GPU thread blocks (number of tasks is - // the y dimension and max_parts is the x dimension - int numBlocks_y = tasks_left; - int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; - int bundle_first_task = pack_vars->bundle_first_task_list[bid]; - const char *loop_type = "density"; - // fprintf(stderr, "Launching kernel with %i tasks leftovers %i\n", - // tasks_packed, pack_vars->launch_leftovers); - // Launch the kernel - launch_density_aos( - d_parts_aos, pack_vars->d_task_first_part, pack_vars->d_task_last_part, - d_a, d_H, loop_type, stream[bid], BLOCK_SIZE, tasks_packed, - tasksperbundle, numBlocks_x, numBlocks_y, bundle_first_task, max_parts, - pack_vars->d_cellx, pack_vars->d_celly, pack_vars->d_cellz); - // #ifdef CUDA_DEBUG - // cu_error = cudaPeekAtLastError(); // Get error code - // if (cu_error != cudaSuccess) { - // fprintf(stderr, - // "CUDA error with 
self density kernel launch: %s - // cpuid id is: %i\n ", - // cudaGetErrorString(cu_error), r->cpuid); exit(0); - // } - // #endif - cudaMemcpyAsync(&parts_aos[first_part_tmp], &d_parts_aos[first_part_tmp], - bundle_n_parts * sizeof(struct part_aos), - cudaMemcpyDeviceToHost, stream[bid]); - - // #ifdef CUDA_DEBUG - // cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // - // // - // Get error code if (cu_error != cudaSuccess) { - // fprintf(stderr, "CUDA error with self density - // D2H memcpy: %s cpuid id is: %i\n ", - // cudaGetErrorString(cu_error), - // r->cpuid); error("Something's up with your cuda code"); - // } - // #endif - } /*End of looping over bundles to launch in streams*/ - /* Make sure all the kernels and copies back are finished */ - cudaDeviceSynchronize(); - - /*Time end of GPU work*/ - clock_gettime(CLOCK_REALTIME, &t1); - *gpu_time += - (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - /*Time unpacking*/ - clock_gettime(CLOCK_REALTIME, &tp0); - /* Now copy the data back from the CPU thread-local buffers to the cells */ - /* Pack length counter for use in unpacking */ - int pack_length_unpack = 0; - for (int tid = 0; tid < tasks_packed; tid++) { - - struct cell *cii = pack_vars->cell_list[tid]; - struct task *tii = pack_vars->task_list[tid]; - - // struct cell *cii = ci_list_self_dens[tid]; - // struct task *tii = task_list_self_dens[tid]; - - while (cell_locktree(cii)) { - ; /* spin until we acquire the lock */ - } - /* Do the copy */ - runner_doself1_gpu_unpack_neat_aos(r, cii, parts_aos, 0, - &pack_length_unpack, tid, - pack_vars->count_max_parts, e); - - /* Record things for debugging */ - cii->gpu_done++; - - /* Release the lock */ - cell_unlocktree(cii); - - /*schedule my dependencies (Only unpacks really)*/ - enqueue_dependencies(s, tii); - /*Signal sleeping runners*/ - // MATTHIEU signal_sleeping_runners(s, tii); - - tii->gpu_done = 1; - } - /* Zero counters for the next pack operations */ - pack_vars->count_parts = 0; - pack_vars->tasks_packed = 0; - /*Time end of unpacking*/ - clock_gettime(CLOCK_REALTIME, &tp1); - *packing_time += - (tp1.tv_sec - tp0.tv_sec) + (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; -} /*End of GPU work Self*/ - void runner_doself1_launch_f4( struct runner *r, struct scheduler *s, struct pack_vars_self *pack_vars, struct cell *ci, struct task *t, struct part_aos_f4_send *parts_send, @@ -1652,190 +1466,6 @@ void runner_doself1_launch_f4( } /*End of GPU work Self*/ -void runner_doself1_launch_g(struct runner *r, struct scheduler *s, - struct pack_vars_self *pack_vars, struct cell *ci, - struct task *t, struct part_aos_g *parts_aos, - struct part_aos_g *d_parts_aos, - cudaStream_t *stream, float d_a, float d_H, - struct engine *e, double *packing_time, - double *gpu_time) { - - struct timespec t0, t1; // - clock_gettime(CLOCK_REALTIME, &t0); - - /* Identify the number of GPU bundles to run in ideal case*/ - int nBundles_temp = pack_vars->nBundles; - - /*How many tasks have we packed?*/ - const int tasks_packed = pack_vars->tasks_packed; - - /*How many tasks should be in a bundle?*/ - const int bundle_size = pack_vars->bundle_size; - - /* Special case for incomplete bundles (when having leftover tasks not enough - * to fill a bundle) */ - if (pack_vars->launch_leftovers) { - nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; - if (tasks_packed == 0) - error("zero tasks packed but somehow got into GPU loop"); - pack_vars->bundle_first_part[nBundles_temp] = - pack_vars->task_first_part[tasks_packed - 1]; 
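/* [Editor's note] For reference, the launch_*_aos kernels invoked further
 * down in these (now removed) functions all use the same 2-D grid: one row
 * of thread blocks per task in the bundle, and enough BLOCK_SIZE-wide blocks
 * in x to cover the most populated cell,
 *
 *   numBlocks_y = tasks_left;                                 // tasks in this bundle
 *   numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE;  // ceil(max_parts / BLOCK_SIZE)
 *
 * which is why the per-bundle loop below tracks max_parts. Rows belonging to
 * smaller cells presumably leave their surplus threads idle inside the
 * kernel. */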
- } - /* Identify the last particle for each bundle of tasks */ - for (int bid = 0; bid < nBundles_temp - 1; bid++) { - pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; - } - /* special treatment for the last bundle */ - if (nBundles_temp > 1) - pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; - else - pack_vars->bundle_last_part[0] = pack_vars->count_parts; - /*Copy arrays containing first and last part for each task to GPU*/ - cudaMemcpy(pack_vars->d_task_first_part, pack_vars->task_first_part, - tasks_packed * sizeof(int), cudaMemcpyHostToDevice); - cudaMemcpy(pack_vars->d_task_last_part, pack_vars->task_last_part, - tasks_packed * sizeof(int), cudaMemcpyHostToDevice); - - /*Copy cell shifts to device*/ - cudaMemcpy(pack_vars->d_cellx, pack_vars->cellx, - tasks_packed * sizeof(double), cudaMemcpyHostToDevice); - cudaMemcpy(pack_vars->d_celly, pack_vars->celly, - tasks_packed * sizeof(double), cudaMemcpyHostToDevice); - cudaMemcpy(pack_vars->d_cellz, pack_vars->cellz, - tasks_packed * sizeof(double), cudaMemcpyHostToDevice); - - /* Launch the copies for each bundle and run the GPU kernel */ - /*We don't go into this loop if tasks_left_self == 1 as - nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ - int max_parts = 0; - for (int bid = 0; bid < nBundles_temp; bid++) { - - max_parts = 0; - int parts_in_bundle = 0; - for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { - if (tid < tasks_packed) { - /*Get an estimate for the max number of parts per cell in the bundle. - * Used for determining the number of GPU CUDA blocks*/ - int count = - pack_vars->task_last_part[tid] - pack_vars->task_first_part[tid]; - parts_in_bundle += count; - max_parts = max(max_parts, count); - } - } - - const int first_part_tmp = pack_vars->bundle_first_part[bid]; - const int bundle_n_parts = - pack_vars->bundle_last_part[bid] - first_part_tmp; - - cudaMemcpyAsync(&d_parts_aos[first_part_tmp], &parts_aos[first_part_tmp], - bundle_n_parts * sizeof(struct part_aos_g), - cudaMemcpyHostToDevice, stream[bid]); - -#ifdef CUDA_DEBUG - cudaError_t cu_error = - cudaPeekAtLastError(); // cudaGetLastError(); // - // Get error code - if (cu_error != cudaSuccess) { - fprintf(stderr, - "CUDA error in density self host 2 device memcpy: %s cpuid id " - "is: %i\n ", - cudaGetErrorString(cu_error), r->cpuid); - exit(0); - } -#endif - const int tasksperbundle = pack_vars->tasksperbundle; - int tasks_left = tasksperbundle; - if (bid == nBundles_temp - 1) { - tasks_left = tasks_packed - (nBundles_temp - 1) * tasksperbundle; - } - // Will launch a 2d grid of GPU thread blocks (number of tasks is - // the y dimension and max_parts is the x dimension - int numBlocks_y = tasks_left; - int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; - int bundle_first_task = pack_vars->bundle_first_task_list[bid]; - const char *loop_type = "density"; - // Launch the kernel - launch_gradient_aos( - d_parts_aos, pack_vars->d_task_first_part, pack_vars->d_task_last_part, - d_a, d_H, loop_type, stream[bid], BLOCK_SIZE, tasks_packed, - tasksperbundle, numBlocks_x, numBlocks_y, bundle_first_task, max_parts, - pack_vars->d_cellx, pack_vars->d_celly, pack_vars->d_cellz); -#ifdef CUDA_DEBUG - cu_error = cudaPeekAtLastError(); // Get error code - if (cu_error != cudaSuccess) { - fprintf( - stderr, - "CUDA error with self density kernel launch: %s cpuid id is: %i\n ", - cudaGetErrorString(cu_error), r->cpuid); - exit(0); - } -#endif - cudaMemcpyAsync(&parts_aos[first_part_tmp], 
&d_parts_aos[first_part_tmp], - bundle_n_parts * sizeof(struct part_aos_g), - cudaMemcpyDeviceToHost, stream[bid]); - -#ifdef CUDA_DEBUG - cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // - // Get error code - if (cu_error != cudaSuccess) { - fprintf(stderr, - "CUDA error with self density D2H memcpy: %s cpuid id is: %i\n ", - cudaGetErrorString(cu_error), r->cpuid); - error("Something's up with your cuda code"); - } -#endif - } /*End of looping over bundles to launch in streams*/ - - /* Make sure all the kernels and copies back are finished */ - cudaDeviceSynchronize(); - - /*Time end of GPU work*/ - clock_gettime(CLOCK_REALTIME, &t1); - *gpu_time += - (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - /*Time unpacking*/ - clock_gettime(CLOCK_REALTIME, &t0); - /* Now copy the data back from the CPU thread-local buffers to the cells */ - /* Pack length counter for use in unpacking */ - int pack_length_unpack = 0; - for (int tid = 0; tid < tasks_packed; tid++) { - - struct cell *cii = pack_vars->cell_list[tid]; - struct task *tii = pack_vars->task_list[tid]; - - // struct cell *cii = ci_list_self_dens[tid]; - // struct task *tii = task_list_self_dens[tid]; - - while (cell_locktree(cii)) { - ; /* spin until we acquire the lock */ - } - /* Do the copy */ - runner_doself1_gpu_unpack_neat_aos_g(r, cii, parts_aos, 0, - &pack_length_unpack, tid, - pack_vars->count_max_parts, e); - - /* Record things for debugging */ - cii->gpu_done_g++; - - /* Release the lock */ - cell_unlocktree(cii); - - /*schedule my dependencies (Only unpacks really)*/ - enqueue_dependencies(s, tii); - /*Signal sleeping runners*/ - // MATTHIEU signal_sleeping_runners(s, tii); - - tii->gpu_done = 1; - } - /* Zero counters for the next pack operations */ - pack_vars->count_parts = 0; - pack_vars->tasks_packed = 0; - /*Time end of unpacking*/ - clock_gettime(CLOCK_REALTIME, &t1); - *packing_time += - (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; -} /*End of GPU work Self Gradient*/ - void runner_doself1_launch_f4_g( struct runner *r, struct scheduler *s, struct pack_vars_self *pack_vars, struct cell *ci, struct task *t, struct part_aos_f4_g_send *parts_send, @@ -2054,189 +1684,6 @@ void runner_doself1_launch_f4_g( } /*End of GPU work Self Gradient*/ -void runner_doself1_launch_f(struct runner *r, struct scheduler *s, - struct pack_vars_self *pack_vars, struct cell *ci, - struct task *t, struct part_aos_f *parts_aos, - struct part_aos_f *d_parts_aos, - cudaStream_t *stream, float d_a, float d_H, - struct engine *e, double *packing_time, - double *gpu_time) { - - struct timespec t0, t1; // - clock_gettime(CLOCK_REALTIME, &t0); - - /* Identify the number of GPU bundles to run in ideal case*/ - int nBundles_temp = pack_vars->nBundles; - - /*How many tasks have we packed?*/ - const int tasks_packed = pack_vars->tasks_packed; - - /*How many tasks should be in a bundle?*/ - const int bundle_size = pack_vars->bundle_size; - - /* Special case for incomplete bundles (when having leftover tasks not enough - * to fill a bundle) */ - if (pack_vars->launch_leftovers) { - nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; - if (tasks_packed == 0) - error("zero tasks packed but somehow got into GPU loop"); - pack_vars->bundle_first_part[nBundles_temp] = - pack_vars->task_first_part[tasks_packed - 1]; - } - /* Identify the last particle for each bundle of tasks */ - for (int bid = 0; bid < nBundles_temp - 1; bid++) { - pack_vars->bundle_last_part[bid] = 
pack_vars->bundle_first_part[bid + 1]; - } - /* special treatment for the last bundle */ - if (nBundles_temp > 1) - pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; - else - pack_vars->bundle_last_part[0] = pack_vars->count_parts; - /*Copy arrays containing first and last part for each task to GPU*/ - cudaMemcpy(pack_vars->d_task_first_part, pack_vars->task_first_part, - tasks_packed * sizeof(int), cudaMemcpyHostToDevice); - cudaMemcpy(pack_vars->d_task_last_part, pack_vars->task_last_part, - tasks_packed * sizeof(int), cudaMemcpyHostToDevice); - - /*Copy cell shifts to device*/ - cudaMemcpy(pack_vars->d_cellx, pack_vars->cellx, - tasks_packed * sizeof(double), cudaMemcpyHostToDevice); - cudaMemcpy(pack_vars->d_celly, pack_vars->celly, - tasks_packed * sizeof(double), cudaMemcpyHostToDevice); - cudaMemcpy(pack_vars->d_cellz, pack_vars->cellz, - tasks_packed * sizeof(double), cudaMemcpyHostToDevice); - - /* Launch the copies for each bundle and run the GPU kernel */ - /*We don't go into this loop if tasks_left_self == 1 as - nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ - int max_parts = 0; - for (int bid = 0; bid < nBundles_temp; bid++) { - - max_parts = 0; - int parts_in_bundle = 0; - for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { - if (tid < tasks_packed) { - /*Get an estimate for the max number of parts per cell in the bundle. - * Used for determining the number of GPU CUDA blocks*/ - int count = - pack_vars->task_last_part[tid] - pack_vars->task_first_part[tid]; - parts_in_bundle += count; - max_parts = max(max_parts, count); - } - } - - const int first_part_tmp = pack_vars->bundle_first_part[bid]; - const int bundle_n_parts = - pack_vars->bundle_last_part[bid] - first_part_tmp; - - cudaMemcpyAsync(&d_parts_aos[first_part_tmp], &parts_aos[first_part_tmp], - bundle_n_parts * sizeof(struct part_aos_f), - cudaMemcpyHostToDevice, stream[bid]); - -#ifdef CUDA_DEBUG - cudaError_t cu_error = - cudaPeekAtLastError(); // cudaGetLastError(); // - // Get error code - if (cu_error != cudaSuccess) { - fprintf(stderr, - "CUDA error in density self host 2 device memcpy: %s cpuid id " - "is: %i\n ", - cudaGetErrorString(cu_error), r->cpuid); - exit(0); - } -#endif - const int tasksperbundle = pack_vars->tasksperbundle; - int tasks_left = tasksperbundle; - if (bid == nBundles_temp - 1) { - tasks_left = tasks_packed - (nBundles_temp - 1) * tasksperbundle; - } - // Will launch a 2d grid of GPU thread blocks (number of tasks is - // the y dimension and max_parts is the x dimension - int numBlocks_y = tasks_left; - int numBlocks_x = (max_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; - int bundle_first_task = pack_vars->bundle_first_task_list[bid]; - const char *loop_type = "density"; - // Launch the kernel - launch_force_aos( - d_parts_aos, pack_vars->d_task_first_part, pack_vars->d_task_last_part, - d_a, d_H, loop_type, stream[bid], BLOCK_SIZE, tasks_packed, - tasksperbundle, numBlocks_x, numBlocks_y, bundle_first_task, max_parts, - pack_vars->d_cellx, pack_vars->d_celly, pack_vars->d_cellz); -#ifdef CUDA_DEBUG - cu_error = cudaPeekAtLastError(); // Get error code - if (cu_error != cudaSuccess) { - fprintf(stderr, - "CUDA error with self force kernel launch: %s cpuid id is: %i\n ", - cudaGetErrorString(cu_error), r->cpuid); - exit(0); - } -#endif - cudaMemcpyAsync(&parts_aos[first_part_tmp], &d_parts_aos[first_part_tmp], - bundle_n_parts * sizeof(struct part_aos_f), - cudaMemcpyDeviceToHost, stream[bid]); - -#ifdef CUDA_DEBUG - cu_error = 
cudaPeekAtLastError(); // cudaGetLastError(); // - // Get error code - if (cu_error != cudaSuccess) { - fprintf(stderr, - "CUDA error with self firce D2H memcpy: %s cpuid id is: %i\n ", - cudaGetErrorString(cu_error), r->cpuid); - error("Something's up with your cuda code"); - } -#endif - } /*End of looping over bundles to launch in streams*/ - - /* Make sure all the kernels and copies back are finished */ - cudaDeviceSynchronize(); - - /*Time end of GPU work*/ - clock_gettime(CLOCK_REALTIME, &t1); - *gpu_time += - (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - /*Time unpacking*/ - clock_gettime(CLOCK_REALTIME, &t0); - /* Now copy the data back from the CPU thread-local buffers to the cells */ - /* Pack length counter for use in unpacking */ - int pack_length_unpack = 0; - for (int tid = 0; tid < tasks_packed; tid++) { - - struct cell *cii = pack_vars->cell_list[tid]; - struct task *tii = pack_vars->task_list[tid]; - - // struct cell *cii = ci_list_self_dens[tid]; - // struct task *tii = task_list_self_dens[tid]; - - while (cell_locktree(cii)) { - ; /* spin until we acquire the lock */ - } - /* Do the copy */ - runner_doself1_gpu_unpack_neat_aos_f(r, cii, parts_aos, 0, - &pack_length_unpack, tid, - pack_vars->count_max_parts, e); - - /* Record things for debugging */ - cii->gpu_done_f++; - - /* Release the lock */ - cell_unlocktree(cii); - - /*schedule my dependencies (Only unpacks really)*/ - enqueue_dependencies(s, tii); - /*Signal sleeping runners*/ - // MATTHIEU signal_sleeping_runners(s, tii); - - tii->gpu_done = 1; - } - /* Zero counters for the next pack operations */ - pack_vars->count_parts = 0; - pack_vars->tasks_packed = 0; - /*Time end of unpacking*/ - clock_gettime(CLOCK_REALTIME, &t1); - *packing_time += - (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; -} /*End of GPU work Self Gradient*/ - void runner_doself1_launch_f4_f( struct runner *r, struct scheduler *s, struct pack_vars_self *pack_vars, struct cell *ci, struct task *t, struct part_aos_f4_f_send *parts_send, From 5fb146d2f71f297e028ce1fdb0f80946f8ff8a98 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Wed, 5 Mar 2025 16:45:37 +0000 Subject: [PATCH 188/217] Removed all other versions of functions which launch GPU offload aside from those which we actually use. 
We now basically have 6 functions for self and pair dens, grad and forc loops --- src/runner_doiact_functions_hydro_gpu.h | 2050 ++--------------------- 1 file changed, 173 insertions(+), 1877 deletions(-) diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index 8ed9f3b72d..2667a6160d 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -1904,231 +1904,16 @@ void runner_doself1_launch_f4_f( t->total_cpu_unpack_ticks += total_cpu_unpack_ticks; } /*End of GPU work Self Gradient*/ -void runner_dopair1_launch(struct runner *r, struct scheduler *s, - struct pack_vars_pair *pack_vars, struct cell *ci, - struct task *t, struct part_aos *parts_aos, - struct part_aos *d_parts_aos, cudaStream_t *stream, - float d_a, float d_H, struct engine *e, - double *packing_time, double *gpu_time) { - - struct timespec t0, t1; // - clock_gettime(CLOCK_REALTIME, &t0); - - /* Identify the number of GPU bundles to run in ideal case*/ - int nBundles_temp = pack_vars->nBundles; - /*How many tasks have we packed?*/ - const int tasks_packed = pack_vars->tasks_packed; - - /*How many tasks should be in a bundle?*/ - const int bundle_size = pack_vars->bundle_size; - - /*tasks-packed needs decrementing before calculating packed_tmp as it was - * incremented in runner_dopair1_pack*/ - const int packed_tmp = 2 * (tasks_packed - 1); - - /* Special case for incomplete bundles (when having leftover tasks not enough - * to fill a bundle) */ - if (pack_vars->launch_leftovers) { - nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; - if (tasks_packed == 0) - error("zero pair tasks packed but somehow got into GPU loop"); - pack_vars->bundle_first_part[nBundles_temp] = - pack_vars->task_first_part[packed_tmp - 2]; - } - /* Identify the last particle for each bundle of tasks */ - for (int bid = 0; bid < nBundles_temp - 1; bid++) { - pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; - } - /* special treatment for the last bundle */ - if (nBundles_temp > 1) - pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; - else - pack_vars->bundle_last_part[0] = pack_vars->count_parts; - /*Copy arrays containing first and last part for each task to GPU*/ - cudaMemcpy(pack_vars->d_task_first_part, pack_vars->task_first_part, - 2 * tasks_packed * sizeof(int), cudaMemcpyHostToDevice); - cudaMemcpy(pack_vars->d_task_last_part, pack_vars->task_last_part, - 2 * tasks_packed * sizeof(int), cudaMemcpyHostToDevice); - - /*Copy cell shifts to device*/ - // cudaMemcpy(pack_vars->d_shiftx, pack_vars->shiftx, - // 2 * tasks_packed * sizeof(double), - // cudaMemcpyHostToDevice); cudaMemcpy(pack_vars->d_shifty, - // pack_vars->shifty, 2 * tasks_packed * sizeof(double), - // cudaMemcpyHostToDevice); cudaMemcpy(pack_vars->d_shiftz, - // pack_vars->shiftz, 2 * tasks_packed * sizeof(double), - // cudaMemcpyHostToDevice); - - /* Launch the copies for each bundle and run the GPU kernel */ - /*We don't go into this loop if tasks_left_self == 1 as - nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ - for (int bid = 0; bid < nBundles_temp; bid++) { - - int max_parts_i = 0; - int max_parts_j = 0; - int parts_in_bundle_ci = 0; - int parts_in_bundle_cj = 0; - for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { - if (tid < tasks_packed) { - /*Get an estimate for the max number of parts per cell in each bundle. 
- * Used for determining the number of GPU CUDA blocks*/ - const int tid_tmp = 2 * tid; - int count_i = pack_vars->task_last_part[tid_tmp] - - pack_vars->task_first_part[tid_tmp]; - parts_in_bundle_ci += count_i; - max_parts_i = max(max_parts_i, count_i); - int count_j = pack_vars->task_last_part[tid_tmp + 1] - - pack_vars->task_first_part[tid_tmp + 1]; - parts_in_bundle_cj += count_j; - max_parts_j = max(max_parts_j, count_j); - } - } - const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; - const int bundle_n_parts = - pack_vars->bundle_last_part[bid] - first_part_tmp_i; - cudaMemcpyAsync(&d_parts_aos[first_part_tmp_i], - &parts_aos[first_part_tmp_i], - bundle_n_parts * sizeof(struct part_aos), - cudaMemcpyHostToDevice, stream[bid]); - - // #ifdef CUDA_DEBUG - // cudaError_t cu_error = cudaPeekAtLastError(); // - // cudaGetLastError(); // - // // Get error code - // if (cu_error != cudaSuccess) { - // fprintf(stderr, - // "CUDA error with pair density H2D async memcpy ci: %s cpuid id - // is: %i\n ", cudaGetErrorString(cu_error), r->cpuid); - // error("Something's up with your cuda code"); - // } - // #endif - - const int tasksperbundle = pack_vars->tasksperbundle; - /* LAUNCH THE GPU KERNELS for ci & cj */ - int tid = 0; - int offset = bid * tasksperbundle; - int tasks_left = tasksperbundle; - if (bid == nBundles_temp - 1) { - tasks_left = tasks_packed - (nBundles_temp - 1) * tasksperbundle; - } - - // Setup 2d grid of GPU thread blocks for ci (number of tasks is - // the y dimension and max_parts is the x dimension - int numBlocks_y = tasks_left; - int bundle_first_task = pack_vars->bundle_first_task_list[bid]; - const char *loop_type = "density"; - - /* Launch the kernel for ci using data for ci and cj */ - runner_dopairci_branch_density_gpu_aos( - d_parts_aos, pack_vars->d_task_first_part, pack_vars->d_task_last_part, - d_a, d_H, loop_type, stream[bid], bid, BLOCK_SIZE, tasks_packed, - tasksperbundle, max_parts_i, max_parts_j, numBlocks_y, tid, offset, - bundle_first_task, pack_vars->d_shiftx, pack_vars->d_shifty, - pack_vars->d_shiftz); - - /* Launch the kernel for ci using data for ci and cj */ - runner_dopaircj_branch_density_gpu_aos( - d_parts_aos, pack_vars->d_task_first_part, pack_vars->d_task_last_part, - d_a, d_H, loop_type, stream[bid], bid, BLOCK_SIZE, tasks_packed, - tasksperbundle, max_parts_i, max_parts_j, numBlocks_y, tid, offset, - bundle_first_task, pack_vars->d_shiftx, pack_vars->d_shifty, - pack_vars->d_shiftz); - - // #ifdef CUDA_DEBUG - // cu_error = cudaPeekAtLastError(); // Get error code - // if (cu_error != cudaSuccess) { - // fprintf(stderr, - // "CUDA error with self density kernel launch: %s - // cpuid id is: %i\n ", - // cudaGetErrorString(cu_error), r->cpuid); exit(0); - // } - // #endif - - // Copy results back to CPU BUFFERS - cudaMemcpyAsync(&parts_aos[first_part_tmp_i], - &d_parts_aos[first_part_tmp_i], - bundle_n_parts * sizeof(struct part_aos), - cudaMemcpyDeviceToHost, stream[bid]); - - // #ifdef CUDA_DEBUG - // cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // - // // - // Get error code if (cu_error != cudaSuccess) { - // fprintf(stderr, "CUDA error with self density - // D2H memcpy: %s cpuid id is: %i\n ", - // cudaGetErrorString(cu_error), - // r->cpuid); error("Something's up with your cuda code"); - // } - // #endif - } /*End of looping over bundles to launch in streams*/ - - /* Make sure all the kernels and copies back are finished */ - cudaDeviceSynchronize(); - - /*Time end of GPU work*/ - clock_gettime(CLOCK_REALTIME, 
&t1); - *gpu_time += - (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - /*Time unpacking*/ - clock_gettime(CLOCK_REALTIME, &t0); - /* Now copy the data back from the CPU thread-local buffers to the cells */ - /* Pack length counter for use in unpacking */ - int pack_length_unpack = 0; - for (int tid = 0; tid < tasks_packed; tid++) { - - /*grab cell and task pointers*/ - struct cell *cii = pack_vars->ci_list[tid]; - struct cell *cjj = pack_vars->cj_list[tid]; - struct task *tii = pack_vars->task_list[tid]; - - /*Let's lock ci*/ - while (cell_locktree(cii)) { - ; /* spin until we acquire the lock */ - } - /*Let's lock cj*/ - while (cell_locktree(cjj)) { - ; /* spin until we acquire the lock */ - } - /* Do the copy */ - runner_do_ci_cj_gpu_unpack_neat_aos(r, cii, cjj, parts_aos, 0, - &pack_length_unpack, tid, - 2 * pack_vars->count_max_parts, e); - - /* Record things for debugging */ - cii->gpu_done_pair++; - cjj->gpu_done_pair++; - - tii->gpu_done = 1; - - /*schedule my dependencies (Only unpacks really)*/ - enqueue_dependencies(s, tii); - /*Signal sleeping runners*/ - // MATTHIEU signal_sleeping_runners(s, tii); - - /* Release the locks */ - cell_unlocktree(cii); - /* Release the locks */ - cell_unlocktree(cjj); - } - /* Zero counters for the next pack operations */ - pack_vars->count_parts = 0; - pack_vars->tasks_packed = 0; - /*Time end of unpacking*/ - clock_gettime(CLOCK_REALTIME, &t1); - *packing_time += - (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; -} /*End of GPU work*/ - -void runner_dopair1_launch_f4( +void runner_dopair1_launch_f4_one_memcpy( struct runner *r, struct scheduler *s, struct pack_vars_pair *pack_vars, struct task *t, struct part_aos_f4_send *parts_send, struct part_aos_f4_recv *parts_recv, struct part_aos_f4_send *d_parts_send, struct part_aos_f4_recv *d_parts_recv, cudaStream_t *stream, float d_a, float d_H, struct engine *e, double *packing_time, double *gpu_time, double *unpack_time, int4 *fparti_fpartj_lparti_lpartj_dens, - int4 *d_fparti_fpartj_lparti_lpartj_dens, cudaEvent_t *pair_end) { + cudaEvent_t *pair_end) { - struct timespec t0, t1; // + struct timespec t0, t1, tp0, tp1; // clock_gettime(CLOCK_REALTIME, &t0); /* Identify the number of GPU bundles to run in ideal case*/ @@ -2169,8 +1954,6 @@ void runner_dopair1_launch_f4( int max_parts_j = 0; int parts_in_bundle_ci = 0; int parts_in_bundle_cj = 0; - const int first_task = bid * pack_vars->bundle_size; - int last_task = (bid + 1) * bundle_size; for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { if (tid < tasks_packed) { /*Get an estimate for the max number of parts per cell in each bundle. 
@@ -2183,19 +1966,15 @@ void runner_dopair1_launch_f4( fparti_fpartj_lparti_lpartj_dens[tid].y; parts_in_bundle_cj += count_j; max_parts_j = max(max_parts_j, count_j); - - last_task = tid; + // if(count_i > 100 || count_j > 100) + // error("Sending data for excessive n parts %i %i", + // count_i, count_j); } } const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; const int bundle_n_parts = pack_vars->bundle_last_part[bid] - first_part_tmp_i; - cudaMemcpyAsync(&d_fparti_fpartj_lparti_lpartj_dens[first_task], - &fparti_fpartj_lparti_lpartj_dens[first_task], - (last_task + 1 - first_task) * sizeof(int4), - cudaMemcpyHostToDevice, stream[bid]); - cudaMemcpyAsync(&d_parts_send[first_part_tmp_i], &parts_send[first_part_tmp_i], bundle_n_parts * sizeof(struct part_aos_f4_send), @@ -2210,1526 +1989,40 @@ void runner_dopair1_launch_f4( "CUDA error with pair density H2D async memcpy ci: %s cpuid id " "is: %i\n ", cudaGetErrorString(cu_error), r->cpuid); - error("Something's up with your cuda code"); + error("Something's up with your cuda code first_part %i bundle size %i", + first_part_tmp_i, bundle_n_parts); } #endif - - const int tasksperbundle = pack_vars->tasksperbundle; /* LAUNCH THE GPU KERNELS for ci & cj */ - int tasks_left = tasksperbundle; - if (bid == nBundles_temp - 1) { - tasks_left = tasks_packed - (nBundles_temp - 1) * tasksperbundle; - } - // Setup 2d grid of GPU thread blocks for ci (number of tasks is // the y dimension and max_parts is the x dimension - int numBlocks_y = tasks_left; - int numBlocks_x = (max_parts_i + BLOCK_SIZE - 1) / BLOCK_SIZE; - int bundle_first_task = pack_vars->bundle_first_task_list[bid]; - - /* Launch the kernel for ci using data for ci and cj */ - runner_dopairci_branch_density_gpu_aos_f4( - d_parts_send, d_parts_recv, d_a, d_H, stream[bid], numBlocks_x, - numBlocks_y, bundle_first_task, d_fparti_fpartj_lparti_lpartj_dens); - - numBlocks_x = (max_parts_j + BLOCK_SIZE - 1) / BLOCK_SIZE; - + int numBlocks_y = 0; // tasks_left; + int numBlocks_x = (bundle_n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; + int bundle_part_0 = pack_vars->bundle_first_part[bid]; /* Launch the kernel for ci using data for ci and cj */ - runner_dopaircj_branch_density_gpu_aos_f4( + runner_dopair_branch_density_gpu_aos_f4( d_parts_send, d_parts_recv, d_a, d_H, stream[bid], numBlocks_x, - numBlocks_y, bundle_first_task, d_fparti_fpartj_lparti_lpartj_dens); + numBlocks_y, bundle_part_0, bundle_n_parts); #ifdef CUDA_DEBUG cu_error = cudaPeekAtLastError(); // Get error code if (cu_error != cudaSuccess) { fprintf( - stderr, - "CUDA error with pair density kernel launch: %s cpuid id is: %i\n " - "nbx %i nby %i max_parts_i %i max_parts_j %i\n", - cudaGetErrorString(cu_error), r->cpuid, numBlocks_x, numBlocks_y, - max_parts_i, max_parts_j); - exit(0); - } -#endif - - // Copy results back to CPU BUFFERS - cudaMemcpyAsync(&parts_recv[first_part_tmp_i], - &d_parts_recv[first_part_tmp_i], - bundle_n_parts * sizeof(struct part_aos_f4_recv), - cudaMemcpyDeviceToHost, stream[bid]); - -#ifdef CUDA_DEBUG - cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // - // Get error code - if (cu_error != cudaSuccess) { - fprintf(stderr, - "CUDA error with self density D2H memcpy: %s cpuid id is: %i\n ", - cudaGetErrorString(cu_error), r->cpuid); - error("Something's up with your cuda code"); - } -#endif - } /*End of looping over bundles to launch in streams*/ - - /* Make sure all the kernels and copies back are finished */ - cudaDeviceSynchronize(); - - /*Time end of GPU work*/ - 
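/* [Editor's note] Schematically, each launch function kept by this patch
 * drives one CUDA stream per bundle with the same three-step pipeline
 * (names as in the surrounding code; sketch only):
 *
 *   cudaMemcpyAsync(&d_parts_send[first], &parts_send[first],
 *                   n * sizeof(struct part_aos_f4_send),
 *                   cudaMemcpyHostToDevice, stream[bid]);
 *   runner_dopair_branch_density_gpu_aos_f4(d_parts_send, d_parts_recv, d_a, d_H,
 *                                           stream[bid], numBlocks_x, numBlocks_y,
 *                                           bundle_part_0, bundle_n_parts);
 *   cudaMemcpyAsync(&parts_recv[first], &d_parts_recv[first],
 *                   n * sizeof(struct part_aos_f4_recv),
 *                   cudaMemcpyDeviceToHost, stream[bid]);
 *
 * The older variants deleted here then block on cudaDeviceSynchronize()
 * before unpacking, whereas the retained *_one_memcpy versions record a
 * cudaEvent per bundle (cudaEventRecord(pair_end[bid], stream[bid])) and
 * wait on cudaEventSynchronize(pair_end[bid]) inside the unpack loop, so one
 * bundle can be unpacked on the CPU while later bundles are still running on
 * the GPU. */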
clock_gettime(CLOCK_REALTIME, &t1); - *gpu_time += - (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - /*Time unpacking*/ - clock_gettime(CLOCK_REALTIME, &t0); - /* Now copy the data back from the CPU thread-local buffers to the cells */ - /* Pack length counter for use in unpacking */ - int pack_length_unpack = 0; - for (int tid = 0; tid < tasks_packed; tid++) { - - /*grab cell and task pointers*/ - struct cell *cii = pack_vars->ci_list[tid]; - struct cell *cjj = pack_vars->cj_list[tid]; - struct task *tii = pack_vars->task_list[tid]; - - /*Let's lock ci*/ - while (cell_locktree(cii)) { - ; /* spin until we acquire the lock */ - } - /*Let's lock cj*/ - while (cell_locktree(cjj)) { - ; /* spin until we acquire the lock */ - } - /* Do the copy */ - runner_do_ci_cj_gpu_unpack_neat_aos_f4(r, cii, cjj, parts_recv, 0, - &pack_length_unpack, tid, - 2 * pack_vars->count_max_parts, e); - - /* Record things for debugging */ - cii->gpu_done_pair++; - cjj->gpu_done_pair++; - - tii->gpu_done = 1; - - /*schedule my dependencies (Only unpacks really)*/ - enqueue_dependencies(s, tii); - /*Signal sleeping runners*/ - // MATTHIEU signal_sleeping_runners(s, tii); - - /* Release the locks */ - cell_unlocktree(cii); - /* Release the locks */ - cell_unlocktree(cjj); - } - /* Zero counters for the next pack operations */ - pack_vars->count_parts = 0; - pack_vars->tasks_packed = 0; - /*Time end of unpacking*/ - clock_gettime(CLOCK_REALTIME, &t1); - *packing_time += - (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; -} /*End of GPU work*/ - -void runner_dopair1_launch_f4_one_memcpy( - struct runner *r, struct scheduler *s, struct pack_vars_pair *pack_vars, - struct task *t, struct part_aos_f4_send *parts_send, - struct part_aos_f4_recv *parts_recv, struct part_aos_f4_send *d_parts_send, - struct part_aos_f4_recv *d_parts_recv, cudaStream_t *stream, float d_a, - float d_H, struct engine *e, double *packing_time, double *gpu_time, - double *unpack_time, int4 *fparti_fpartj_lparti_lpartj_dens, - cudaEvent_t *pair_end) { - - struct timespec t0, t1, tp0, tp1; // - clock_gettime(CLOCK_REALTIME, &t0); - - /* Identify the number of GPU bundles to run in ideal case*/ - int nBundles_temp = pack_vars->nBundles; - /*How many tasks have we packed?*/ - const int tasks_packed = pack_vars->tasks_packed; - - /*How many tasks should be in a bundle?*/ - const int bundle_size = pack_vars->bundle_size; - - /* Special case for incomplete bundles (when having leftover tasks not enough - * to fill a bundle) */ - if (pack_vars->launch_leftovers) { - nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; - if (tasks_packed == 0) - error("zero pair tasks packed but somehow got into GPU loop"); - // pack_vars->bundle_first_part[nBundles_temp] = - // pack_vars->task_first_part[packed_tmp - 2]; - pack_vars->bundle_first_part[nBundles_temp] = - fparti_fpartj_lparti_lpartj_dens[tasks_packed - 1].x; - } - /* Identify the last particle for each bundle of tasks */ - for (int bid = 0; bid < nBundles_temp - 1; bid++) { - pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; - } - /* special treatment for the last bundle */ - if (nBundles_temp > 1) - pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; - else - pack_vars->bundle_last_part[0] = pack_vars->count_parts; - - /* Launch the copies for each bundle and run the GPU kernel */ - /*We don't go into this loop if tasks_left_self == 1 as - nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ - for (int bid = 0; 
bid < nBundles_temp; bid++) { - - int max_parts_i = 0; - int max_parts_j = 0; - int parts_in_bundle_ci = 0; - int parts_in_bundle_cj = 0; - for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { - if (tid < tasks_packed) { - /*Get an estimate for the max number of parts per cell in each bundle. - * Used for determining the number of GPU CUDA blocks*/ - int count_i = fparti_fpartj_lparti_lpartj_dens[tid].z - - fparti_fpartj_lparti_lpartj_dens[tid].x; - parts_in_bundle_ci += count_i; - max_parts_i = max(max_parts_i, count_i); - int count_j = fparti_fpartj_lparti_lpartj_dens[tid].w - - fparti_fpartj_lparti_lpartj_dens[tid].y; - parts_in_bundle_cj += count_j; - max_parts_j = max(max_parts_j, count_j); - // if(count_i > 100 || count_j > 100) - // error("Sending data for excessive n parts %i %i", - // count_i, count_j); - } - } - const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; - const int bundle_n_parts = - pack_vars->bundle_last_part[bid] - first_part_tmp_i; - - cudaMemcpyAsync(&d_parts_send[first_part_tmp_i], - &parts_send[first_part_tmp_i], - bundle_n_parts * sizeof(struct part_aos_f4_send), - cudaMemcpyHostToDevice, stream[bid]); - -#ifdef CUDA_DEBUG - cudaError_t cu_error = - cudaPeekAtLastError(); // cudaGetLastError(); // - // Get error code - if (cu_error != cudaSuccess) { - fprintf(stderr, - "CUDA error with pair density H2D async memcpy ci: %s cpuid id " - "is: %i\n ", - cudaGetErrorString(cu_error), r->cpuid); - error("Something's up with your cuda code first_part %i bundle size %i", - first_part_tmp_i, bundle_n_parts); - } -#endif - /* LAUNCH THE GPU KERNELS for ci & cj */ - // Setup 2d grid of GPU thread blocks for ci (number of tasks is - // the y dimension and max_parts is the x dimension - int numBlocks_y = 0; // tasks_left; - int numBlocks_x = (bundle_n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; - int bundle_part_0 = pack_vars->bundle_first_part[bid]; - /* Launch the kernel for ci using data for ci and cj */ - runner_dopair_branch_density_gpu_aos_f4( - d_parts_send, d_parts_recv, d_a, d_H, stream[bid], numBlocks_x, - numBlocks_y, bundle_part_0, bundle_n_parts); - -#ifdef CUDA_DEBUG - cu_error = cudaPeekAtLastError(); // Get error code - if (cu_error != cudaSuccess) { - fprintf( - stderr, - "CUDA error with pair density kernel launch: %s cpuid id is: %i\n " - "nbx %i nby %i max_parts_i %i max_parts_j %i\n", - cudaGetErrorString(cu_error), r->cpuid, numBlocks_x, numBlocks_y, - max_parts_i, max_parts_j); - error("Something's up with kernel launch."); - } -#endif - - // Copy results back to CPU BUFFERS - cudaMemcpyAsync(&parts_recv[first_part_tmp_i], - &d_parts_recv[first_part_tmp_i], - bundle_n_parts * sizeof(struct part_aos_f4_recv), - cudaMemcpyDeviceToHost, stream[bid]); - cudaEventRecord(pair_end[bid], stream[bid]); - -#ifdef CUDA_DEBUG - cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // - // Get error code - if (cu_error != cudaSuccess) { - fprintf(stderr, - "CUDA error with self density D2H memcpy: %s cpuid id is: %i\n ", - cudaGetErrorString(cu_error), r->cpuid); - error("Something's up with your cuda code"); - } -#endif - } /*End of looping over bundles to launch in streams*/ - - /* Make sure all the kernels and copies back are finished */ - // cudaDeviceSynchronize(); - - /*Time end of GPU work*/ - clock_gettime(CLOCK_REALTIME, &t1); - *gpu_time += - (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - - /* Now copy the data back from the CPU thread-local buffers to the cells */ - /* Pack length counter for use in 
unpacking */ - - int pack_length_unpack = 0; - ticks total_cpu_unpack_ticks = 0; - - for (int bid = 0; bid < nBundles_temp; bid++) { - /*Time unpacking*/ - clock_gettime(CLOCK_REALTIME, &t0); - - // cudaStreamSynchronize(stream[bid]); - cudaEventSynchronize(pair_end[bid]); - - clock_gettime(CLOCK_REALTIME, &t1); - *gpu_time += - (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - - //////////// - - /*Time unpacking*/ - // clock_gettime(CLOCK_REALTIME, &tp0); - - for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { - - if (tid < tasks_packed) { - clock_gettime(CLOCK_REALTIME, &tp0); - /*grab cell and task pointers*/ - struct cell *cii = pack_vars->ci_list[tid]; - struct cell *cjj = pack_vars->cj_list[tid]; - struct task *tii = pack_vars->task_list[tid]; - -// if(!pack_vars->task_locked){ -// /*Let's lock ci*/ -// while (cell_locktree(cii)) { -// ; /* spin until we acquire the lock */ -// } -// /*Let's lock cj*/ -// while (cell_locktree(cjj)) { -// ; /* spin until we acquire the lock */ -// } -// pack_vars->task_locked = 1; -// } - - const ticks tic = getticks(); - - /* Do the copy */ - runner_do_ci_cj_gpu_unpack_neat_aos_f4( - r, cii, cjj, parts_recv, 0, &pack_length_unpack, tid, - 2 * pack_vars->count_max_parts, e); - - const ticks toc = getticks(); - - total_cpu_unpack_ticks += toc - tic; - - /* Record things for debugging */ - cii->gpu_done_pair++; - cjj->gpu_done_pair++; - - if(pack_vars->task_locked){ - /* Release the locks */ - cell_unlocktree(cii); - /* Release the locks */ - cell_unlocktree(cjj); - pack_vars->task_locked = 0; - } - - /*Time end of unpacking*/ - clock_gettime(CLOCK_REALTIME, &tp1); - *unpack_time += (tp1.tv_sec - tp0.tv_sec) + - (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; - /*Signal sleeping runners*/ - // MATTHIEU signal_sleeping_runners(s, tii); - - tii->gpu_done = 1; - } - } - } - - - pthread_mutex_lock(&s->sleep_mutex); - atomic_sub(&s->waiting, pack_vars->top_tasks_packed); - pthread_cond_broadcast(&s->sleep_cond); - pthread_mutex_unlock(&s->sleep_mutex); - - /* Zero counters for the next pack operations */ - pack_vars->count_parts = 0; - pack_vars->tasks_packed = 0; - - // /*Time end of unpacking*/ - // clock_gettime(CLOCK_REALTIME, &t1); - // *packing_time += (t1.tv_sec - t0.tv_sec) + - // (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - - /* Write the timers back to the task */ - t->total_cpu_unpack_ticks += total_cpu_unpack_ticks; - -} /*End of GPU work*/ - -void runner_dopair1_launch_f4_mcpy_Ker_mcpy( - struct runner *r, struct scheduler *s, struct pack_vars_pair *pack_vars, - struct task *t, struct part_aos_f4_send *parts_send, - struct part_aos_f4_recv *parts_recv, struct part_aos_f4_send *d_parts_send, - struct part_aos_f4_recv *d_parts_recv, cudaStream_t *stream, float d_a, - float d_H, struct engine *e, double *packing_time, double *gpu_time, - double *unpack_time, int4 *fparti_fpartj_lparti_lpartj_dens, - cudaEvent_t *pair_end) { - - struct timespec t0, t1, tp0, tp1; // - clock_gettime(CLOCK_REALTIME, &t0); - - /* Identify the number of GPU bundles to run in ideal case*/ - int nBundles_temp = pack_vars->nBundles; - /*How many tasks have we packed?*/ - const int tasks_packed = pack_vars->tasks_packed; - - /*How many tasks should be in a bundle?*/ - const int bundle_size = pack_vars->bundle_size; - - /*tasks-packed needs decrementing before calculating packed_tmp as it was - * incremented in runner_dopair1_pack*/ - // const int packed_tmp = 2 * (tasks_packed - 1); - - /* Special case for incomplete bundles (when having 
leftover tasks not enough - * to fill a bundle) */ - if (pack_vars->launch_leftovers) { - nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; - if (tasks_packed == 0) - error("zero pair tasks packed but somehow got into GPU loop"); - // pack_vars->bundle_first_part[nBundles_temp] = - // pack_vars->task_first_part[packed_tmp - 2]; - pack_vars->bundle_first_part[nBundles_temp] = - fparti_fpartj_lparti_lpartj_dens[tasks_packed - 1].x; - } - /* Identify the last particle for each bundle of tasks */ - for (int bid = 0; bid < nBundles_temp - 1; bid++) { - pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; - } - /* special treatment for the last bundle */ - if (nBundles_temp > 1) - pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; - else - pack_vars->bundle_last_part[0] = pack_vars->count_parts; - - /* Launch the copies for each bundle and run the GPU kernel */ - /*We don't go into this loop if tasks_left_self == 1 as - nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ - // int max_parts = 0; - for (int bid = 0; bid < nBundles_temp; bid++) { - - int max_parts_i = 0; - int max_parts_j = 0; - int parts_in_bundle_ci = 0; - int parts_in_bundle_cj = 0; - for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { - if (tid < tasks_packed) { - /*Get an estimate for the max number of parts per cell in each bundle. - * Used for determining the number of GPU CUDA blocks*/ - int count_i = fparti_fpartj_lparti_lpartj_dens[tid].z - - fparti_fpartj_lparti_lpartj_dens[tid].x; - parts_in_bundle_ci += count_i; - max_parts_i = max(max_parts_i, count_i); - int count_j = fparti_fpartj_lparti_lpartj_dens[tid].w - - fparti_fpartj_lparti_lpartj_dens[tid].y; - parts_in_bundle_cj += count_j; - max_parts_j = max(max_parts_j, count_j); - } - } - const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; - const int bundle_n_parts = - pack_vars->bundle_last_part[bid] - first_part_tmp_i; - - cudaMemcpyAsync(&d_parts_send[first_part_tmp_i], - &parts_send[first_part_tmp_i], - bundle_n_parts * sizeof(struct part_aos_f4_send), - cudaMemcpyHostToDevice, stream[bid]); - -#ifdef CUDA_DEBUG - cudaError_t cu_error = - cudaPeekAtLastError(); // cudaGetLastError(); // - // Get error code - if (cu_error != cudaSuccess) { - fprintf(stderr, - "CUDA error with pair density H2D async memcpy ci: %s cpuid id " - "is: %i\n ", - cudaGetErrorString(cu_error), r->cpuid); - error("Something's up with your cuda code"); - } -#endif - } - for (int bid = 0; bid < nBundles_temp; bid++) { - - int max_parts_i = 0; - int max_parts_j = 0; - int parts_in_bundle_ci = 0; - int parts_in_bundle_cj = 0; - for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { - if (tid < tasks_packed) { - /*Get an estimate for the max number of parts per cell in each bundle. 
- * Used for determining the number of GPU CUDA blocks*/ - int count_i = fparti_fpartj_lparti_lpartj_dens[tid].z - - fparti_fpartj_lparti_lpartj_dens[tid].x; - parts_in_bundle_ci += count_i; - max_parts_i = max(max_parts_i, count_i); - int count_j = fparti_fpartj_lparti_lpartj_dens[tid].w - - fparti_fpartj_lparti_lpartj_dens[tid].y; - parts_in_bundle_cj += count_j; - max_parts_j = max(max_parts_j, count_j); - } - } - const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; - const int bundle_n_parts = - pack_vars->bundle_last_part[bid] - first_part_tmp_i; - ////////////////////////////////// - // const int tasksperbundle = pack_vars->tasksperbundle; - /* LAUNCH THE GPU KERNELS for ci & cj */ - // Setup 2d grid of GPU thread blocks for ci (number of tasks is - // the y dimension and max_parts is the x dimension - int numBlocks_y = 0; // tasks_left; - int numBlocks_x = (bundle_n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; - int bundle_part_0 = pack_vars->bundle_first_part[bid]; - // int bundle_first_task = pack_vars->bundle_first_task_list[bid]; - // fprintf(stderr, "bundle_part_0 %i bundle_first_task %i\n", - // bundle_part_0, bundle_first_task); - - /* Launch the kernel for ci using data for ci and cj */ - runner_dopair_branch_density_gpu_aos_f4( - d_parts_send, d_parts_recv, d_a, d_H, stream[bid], numBlocks_x, - numBlocks_y, bundle_part_0, bundle_n_parts); - -#ifdef CUDA_DEBUG - cudaError_t cu_error = cudaPeekAtLastError(); // Get error code - if (cu_error != cudaSuccess) { - error( - "CUDA error with pair density kernel launch: %s cpuid id is: %i\n " - "nbx %i nby %i max_parts_i %i max_parts_j %i\n", - cudaGetErrorString(cu_error), r->cpuid, numBlocks_x, numBlocks_y, - max_parts_i, max_parts_j); - } -#endif - } - - for (int bid = 0; bid < nBundles_temp; bid++) { - - int max_parts_i = 0; - int max_parts_j = 0; - int parts_in_bundle_ci = 0; - int parts_in_bundle_cj = 0; - for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { - if (tid < tasks_packed) { - /*Get an estimate for the max number of parts per cell in each bundle. 
- * Used for determining the number of GPU CUDA blocks*/ - int count_i = fparti_fpartj_lparti_lpartj_dens[tid].z - - fparti_fpartj_lparti_lpartj_dens[tid].x; - parts_in_bundle_ci += count_i; - max_parts_i = max(max_parts_i, count_i); - int count_j = fparti_fpartj_lparti_lpartj_dens[tid].w - - fparti_fpartj_lparti_lpartj_dens[tid].y; - parts_in_bundle_cj += count_j; - max_parts_j = max(max_parts_j, count_j); - } - } - const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; - const int bundle_n_parts = - pack_vars->bundle_last_part[bid] - first_part_tmp_i; - /////////////////////////////////////////////////////////////////// - // Copy results back to CPU BUFFERS - cudaMemcpyAsync(&parts_recv[first_part_tmp_i], - &d_parts_recv[first_part_tmp_i], - bundle_n_parts * sizeof(struct part_aos_f4_recv), - cudaMemcpyDeviceToHost, stream[bid]); - cudaEventRecord(pair_end[bid], stream[bid]); - -#ifdef CUDA_DEBUG - cudaError_t cu_error = - cudaPeekAtLastError(); // cudaGetLastError(); // - // Get error code - if (cu_error != cudaSuccess) { - fprintf(stderr, - "CUDA error with self density D2H memcpy: %s cpuid id is: %i\n ", - cudaGetErrorString(cu_error), r->cpuid); - error("Something's up with your cuda code"); - } -#endif - } /*End of looping over bundles to launch in streams*/ - - /* Make sure all the kernels and copies back are finished */ - // cudaDeviceSynchronize(); - - /*Time end of GPU work*/ - clock_gettime(CLOCK_REALTIME, &t1); - *gpu_time += - (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - /*Time unpacking*/ - // clock_gettime(CLOCK_REALTIME, &t0); - /* Now copy the data back from the CPU thread-local buffers to the cells */ - /* Pack length counter for use in unpacking */ - int pack_length_unpack = 0; - for (int bid = 0; bid < nBundles_temp; bid++) { - clock_gettime(CLOCK_REALTIME, &t0); - - // cudaStreamSynchronize(stream[bid]); - cudaEventSynchronize(pair_end[bid]); - - clock_gettime(CLOCK_REALTIME, &t1); - *gpu_time += - (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - - /*Time unpacking*/ - // clock_gettime(CLOCK_REALTIME, &tp0); - // int bundle_first_task = pack_vars->bundle_first_task_list[bid]; - - for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { - - if (tid < tasks_packed) { - - /*grab cell and task pointers*/ - struct cell *cii = pack_vars->ci_list[tid]; - struct cell *cjj = pack_vars->cj_list[tid]; - struct task *tii = pack_vars->task_list[tid]; - - /*Let's lock ci*/ - while (cell_locktree(cii)) { - ; /* spin until we acquire the lock */ - } - /*Let's lock cj*/ - while (cell_locktree(cjj)) { - ; /* spin until we acquire the lock */ - } - /* Do the copy */ - /*Time unpacking*/ - clock_gettime(CLOCK_REALTIME, &tp0); - runner_do_ci_cj_gpu_unpack_neat_aos_f4( - r, cii, cjj, parts_recv, 0, &pack_length_unpack, tid, - 2 * pack_vars->count_max_parts, e); - - /* Record things for debugging */ - cii->gpu_done_pair++; - cjj->gpu_done_pair++; - - tii->gpu_done = 1; - /*Time end of unpacking*/ - clock_gettime(CLOCK_REALTIME, &tp1); - *unpack_time += (tp1.tv_sec - tp0.tv_sec) + - (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; - /*schedule my dependencies (Only unpacks really)*/ - enqueue_dependencies(s, tii); - /*Signal sleeping runners*/ - // MATTHIEU signal_sleeping_runners(s, tii); - - /* Release the locks */ - cell_unlocktree(cii); - /* Release the locks */ - cell_unlocktree(cjj); - } - } - /*Time end of unpacking*/ - // clock_gettime(CLOCK_REALTIME, &tp1); - // *unpack_time += (tp1.tv_sec - tp0.tv_sec) + - // 
(tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; - // *packing_time += (tp1.tv_sec - tp0.tv_sec) + - // (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; - // *packing_time += (tp1.tv_sec - tp0.tv_sec) + - // (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; - } - /* Zero counters for the next pack operations */ - pack_vars->count_parts = 0; - pack_vars->tasks_packed = 0; - /*Time end of unpacking*/ - // clock_gettime(CLOCK_REALTIME, &t1); - // *packing_time += (t1.tv_sec - t0.tv_sec) + - // (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; -} /*End of GPU work*/ - -void runner_dopair1_launch_g(struct runner *r, struct scheduler *s, - struct pack_vars_pair *pack_vars, struct cell *ci, - struct task *t, struct part_aos_g *parts_aos, - struct part_aos_g *d_parts_aos, - cudaStream_t *stream, float d_a, float d_H, - struct engine *e, double *packing_time, - double *gpu_time) { - - struct timespec t0, t1; // - clock_gettime(CLOCK_REALTIME, &t0); - - /* Identify the number of GPU bundles to run in ideal case*/ - int nBundles_temp = pack_vars->nBundles; - /*How many tasks have we packed?*/ - const int tasks_packed = pack_vars->tasks_packed; - - /*How many tasks should be in a bundle?*/ - const int bundle_size = pack_vars->bundle_size; - - /*tasks-packed needs decrementing before calculating packed_tmp as it was - * incremented in runner_dopair1_pack*/ - const int packed_tmp = 2 * (tasks_packed - 1); - - /* Special case for incomplete bundles (when having leftover tasks not enough - * to fill a bundle) */ - if (pack_vars->launch_leftovers) { - nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; - if (tasks_packed == 0) - error("zero pair tasks packed but somehow got into GPU loop"); - pack_vars->bundle_first_part[nBundles_temp] = - pack_vars->task_first_part[packed_tmp - 2]; - } - /* Identify the last particle for each bundle of tasks */ - for (int bid = 0; bid < nBundles_temp - 1; bid++) { - pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; - } - /* special treatment for the last bundle */ - if (nBundles_temp > 1) - pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; - else - pack_vars->bundle_last_part[0] = pack_vars->count_parts; - /*Copy arrays containing first and last part for each task to GPU*/ - cudaMemcpy(pack_vars->d_task_first_part, pack_vars->task_first_part, - 2 * tasks_packed * sizeof(int), cudaMemcpyHostToDevice); - cudaMemcpy(pack_vars->d_task_last_part, pack_vars->task_last_part, - 2 * tasks_packed * sizeof(int), cudaMemcpyHostToDevice); - - /*Copy cell shifts to device*/ - cudaMemcpy(pack_vars->d_shiftx, pack_vars->shiftx, - 2 * tasks_packed * sizeof(double), cudaMemcpyHostToDevice); - cudaMemcpy(pack_vars->d_shifty, pack_vars->shifty, - 2 * tasks_packed * sizeof(double), cudaMemcpyHostToDevice); - cudaMemcpy(pack_vars->d_shiftz, pack_vars->shiftz, - 2 * tasks_packed * sizeof(double), cudaMemcpyHostToDevice); - - /* Launch the copies for each bundle and run the GPU kernel */ - /*We don't go into this loop if tasks_left_self == 1 as - nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ - for (int bid = 0; bid < nBundles_temp; bid++) { - - int max_parts_i = 0; - int max_parts_j = 0; - int parts_in_bundle_ci = 0; - int parts_in_bundle_cj = 0; - for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { - if (tid < tasks_packed) { - /*Get an estimate for the max number of parts per cell in each bundle. 
- * Used for determining the number of GPU CUDA blocks*/ - const int tid_tmp = 2 * tid; - int count_i = pack_vars->task_last_part[tid_tmp] - - pack_vars->task_first_part[tid_tmp]; - parts_in_bundle_ci += count_i; - max_parts_i = max(max_parts_i, count_i); - int count_j = pack_vars->task_last_part[tid_tmp + 1] - - pack_vars->task_first_part[tid_tmp + 1]; - parts_in_bundle_cj += count_j; - max_parts_j = max(max_parts_j, count_j); - } - } - const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; - const int bundle_n_parts = - pack_vars->bundle_last_part[bid] - first_part_tmp_i; - cudaMemcpyAsync(&d_parts_aos[first_part_tmp_i], - &parts_aos[first_part_tmp_i], - bundle_n_parts * sizeof(struct part_aos_g), - cudaMemcpyHostToDevice, stream[bid]); - -#ifdef CUDA_DEBUG - cudaError_t cu_error = - cudaPeekAtLastError(); // cudaGetLastError(); // - // Get error code - if (cu_error != cudaSuccess) { - fprintf(stderr, - "CUDA error with pair density H2D async memcpy ci: %s cpuid id " - "is: %i\n ", - cudaGetErrorString(cu_error), r->cpuid); - error("Something's up with your cuda code"); - } -#endif - - const int tasksperbundle = pack_vars->tasksperbundle; - /* LAUNCH THE GPU KERNELS for ci & cj */ - int tid = 0; - int offset = bid * tasksperbundle; - int tasks_left = tasksperbundle; - if (bid == nBundles_temp - 1) { - tasks_left = tasks_packed - (nBundles_temp - 1) * tasksperbundle; - } - - // Setup 2d grid of GPU thread blocks for ci (number of tasks is - // the y dimension and max_parts is the x dimension - int numBlocks_y = tasks_left; - int bundle_first_task = pack_vars->bundle_first_task_list[bid]; - const char *loop_type = "density"; - - /* Launch the kernel for ci using data for ci and cj */ - runner_dopairci_branch_density_gpu_aos_g( - d_parts_aos, pack_vars->d_task_first_part, pack_vars->d_task_last_part, - d_a, d_H, loop_type, stream[bid], bid, BLOCK_SIZE, tasks_packed, - tasksperbundle, max_parts_i, max_parts_j, numBlocks_y, tid, offset, - bundle_first_task, pack_vars->d_shiftx, pack_vars->d_shifty, - pack_vars->d_shiftz); - - /* Launch the kernel for ci using data for ci and cj */ - runner_dopaircj_branch_density_gpu_aos_g( - d_parts_aos, pack_vars->d_task_first_part, pack_vars->d_task_last_part, - d_a, d_H, loop_type, stream[bid], bid, BLOCK_SIZE, tasks_packed, - tasksperbundle, max_parts_i, max_parts_j, numBlocks_y, tid, offset, - bundle_first_task, pack_vars->d_shiftx, pack_vars->d_shifty, - pack_vars->d_shiftz); - -#ifdef CUDA_DEBUG - cu_error = cudaPeekAtLastError(); // Get error code - if (cu_error != cudaSuccess) { - fprintf( - stderr, - "CUDA error with self density kernel launch: %s cpuid id is: %i\n ", - cudaGetErrorString(cu_error), r->cpuid); - exit(0); - } -#endif - - // Copy results back to CPU BUFFERS - cudaMemcpyAsync(&parts_aos[first_part_tmp_i], - &d_parts_aos[first_part_tmp_i], - bundle_n_parts * sizeof(struct part_aos_g), - cudaMemcpyDeviceToHost, stream[bid]); - -#ifdef CUDA_DEBUG - cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // - // Get error code - if (cu_error != cudaSuccess) { - fprintf(stderr, - "CUDA error with self density D2H memcpy: %s cpuid id is: %i\n ", - cudaGetErrorString(cu_error), r->cpuid); - error("Something's up with your cuda code"); - } -#endif - } /*End of looping over bundles to launch in streams*/ - - /* Make sure all the kernels and copies back are finished */ - cudaDeviceSynchronize(); - - /*Time end of GPU work*/ - clock_gettime(CLOCK_REALTIME, &t1); - *gpu_time += - (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 
1000000000.0; - /*Time unpacking*/ - clock_gettime(CLOCK_REALTIME, &t0); - /* Now copy the data back from the CPU thread-local buffers to the cells */ - /* Pack length counter for use in unpacking */ - int pack_length_unpack = 0; - for (int tid = 0; tid < tasks_packed; tid++) { - - /*grab cell and task pointers*/ - struct cell *cii = pack_vars->ci_list[tid]; - struct cell *cjj = pack_vars->cj_list[tid]; - struct task *tii = pack_vars->task_list[tid]; - - /*Let's lock ci*/ - while (cell_locktree(cii)) { - ; /* spin until we acquire the lock */ - } - /*Let's lock cj*/ - while (cell_locktree(cjj)) { - ; /* spin until we acquire the lock */ - } - /* Do the copy */ - runner_do_ci_cj_gpu_unpack_neat_aos_g(r, cii, cjj, parts_aos, 0, - &pack_length_unpack, tid, - 2 * pack_vars->count_max_parts, e); - - /* Record things for debugging */ - cii->gpu_done_pair_g++; - cjj->gpu_done_pair_g++; - - tii->gpu_done = 1; - - /*schedule my dependencies (Only unpacks really)*/ - enqueue_dependencies(s, tii); - /*Signal sleeping runners*/ - // MATTHIEU signal_sleeping_runners(s, tii); - - /* Release the locks */ - cell_unlocktree(cii); - /* Release the locks */ - cell_unlocktree(cjj); - } - /* Zero counters for the next pack operations */ - pack_vars->count_parts = 0; - pack_vars->tasks_packed = 0; - /*Time end of unpacking*/ - clock_gettime(CLOCK_REALTIME, &t1); - *packing_time += - (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; -} /*End of GPU work*/ - -void runner_dopair1_launch_f4_g_one_memcpy( - struct runner *r, struct scheduler *s, struct pack_vars_pair *pack_vars, - struct task *t, struct part_aos_f4_g_send *parts_send, - struct part_aos_f4_g_recv *parts_recv, - struct part_aos_f4_g_send *d_parts_send, - struct part_aos_f4_g_recv *d_parts_recv, cudaStream_t *stream, float d_a, - float d_H, struct engine *e, double *packing_time, double *gpu_time, - double *unpack_time, int4 *fparti_fpartj_lparti_lpartj, - cudaEvent_t *pair_end) { - - struct timespec t0, t1, tp0, tp1; // - clock_gettime(CLOCK_REALTIME, &t0); - - /* Identify the number of GPU bundles to run in ideal case*/ - int nBundles_temp = pack_vars->nBundles; - /*How many tasks have we packed?*/ - const int tasks_packed = pack_vars->tasks_packed; - - /*How many tasks should be in a bundle?*/ - const int bundle_size = pack_vars->bundle_size; - - /*tasks-packed needs decrementing before calculating packed_tmp as it was - * incremented in runner_dopair1_pack*/ - // const int packed_tmp = 2 * (tasks_packed - 1); - - /* Special case for incomplete bundles (when having leftover tasks not enough - * to fill a bundle) */ - if (pack_vars->launch_leftovers) { - nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; - if (tasks_packed == 0) - error("zero pair tasks packed but somehow got into GPU loop"); - // pack_vars->bundle_first_part[nBundles_temp] = - // pack_vars->task_first_part[packed_tmp - 2]; - pack_vars->bundle_first_part[nBundles_temp] = - fparti_fpartj_lparti_lpartj[tasks_packed - 1].x; - } - /* Identify the last particle for each bundle of tasks */ - for (int bid = 0; bid < nBundles_temp - 1; bid++) { - pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; - } - /* special treatment for the last bundle */ - if (nBundles_temp > 1) - pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; - else - pack_vars->bundle_last_part[0] = pack_vars->count_parts; - - /* Launch the copies for each bundle and run the GPU kernel */ - /*We don't go into this loop if tasks_left_self == 1 as - 
nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ - // int max_parts = 0; - for (int bid = 0; bid < nBundles_temp; bid++) { - - int max_parts_i = 0; - int max_parts_j = 0; - int parts_in_bundle_ci = 0; - int parts_in_bundle_cj = 0; - for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { - if (tid < tasks_packed) { - /*Get an estimate for the max number of parts per cell in each bundle. - * Used for determining the number of GPU CUDA blocks*/ - int count_i = fparti_fpartj_lparti_lpartj[tid].z - - fparti_fpartj_lparti_lpartj[tid].x; - parts_in_bundle_ci += count_i; - max_parts_i = max(max_parts_i, count_i); - int count_j = fparti_fpartj_lparti_lpartj[tid].w - - fparti_fpartj_lparti_lpartj[tid].y; - parts_in_bundle_cj += count_j; - max_parts_j = max(max_parts_j, count_j); - } - } - const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; - const int bundle_n_parts = - pack_vars->bundle_last_part[bid] - first_part_tmp_i; - - cudaMemcpyAsync(&d_parts_send[first_part_tmp_i], - &parts_send[first_part_tmp_i], - bundle_n_parts * sizeof(struct part_aos_f4_g_send), - cudaMemcpyHostToDevice, stream[bid]); - -#ifdef CUDA_DEBUG - cudaError_t cu_error = - cudaPeekAtLastError(); // cudaGetLastError(); // - // Get error code - if (cu_error != cudaSuccess) { - fprintf(stderr, - "CUDA error with pair density H2D async memcpy ci: %s cpuid id " - "is: %i\n ", - cudaGetErrorString(cu_error), r->cpuid); - error("Something's up with your cuda code"); - } -#endif - - // const int tasksperbundle = pack_vars->tasksperbundle; - /* LAUNCH THE GPU KERNELS for ci & cj */ - // Setup 2d grid of GPU thread blocks for ci (number of tasks is - // the y dimension and max_parts is the x dimension - int numBlocks_y = 0; // tasks_left; - int numBlocks_x = (bundle_n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; - int bundle_part_0 = pack_vars->bundle_first_part[bid]; - // fprintf(stderr, "bundle_part_0 %i bundle_first_task %i\n", - // bundle_part_0, bundle_first_task); - - /* Launch the kernel for ci using data for ci and cj */ - runner_dopair_branch_gradient_gpu_aos_f4( - d_parts_send, d_parts_recv, d_a, d_H, stream[bid], numBlocks_x, - numBlocks_y, bundle_part_0, bundle_n_parts); - -#ifdef CUDA_DEBUG - cu_error = cudaPeekAtLastError(); // Get error code - if (cu_error != cudaSuccess) { - fprintf( - stderr, - "CUDA error with pair density kernel launch: %s cpuid id is: %i\n " - "nbx %i nby %i max_parts_i %i max_parts_j %i\n", - cudaGetErrorString(cu_error), r->cpuid, numBlocks_x, numBlocks_y, - max_parts_i, max_parts_j); - exit(0); - } -#endif - - // Copy results back to CPU BUFFERS - cudaMemcpyAsync(&parts_recv[first_part_tmp_i], - &d_parts_recv[first_part_tmp_i], - bundle_n_parts * sizeof(struct part_aos_f4_g_recv), - cudaMemcpyDeviceToHost, stream[bid]); - cudaEventRecord(pair_end[bid], stream[bid]); - -#ifdef CUDA_DEBUG - cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // - // Get error code - if (cu_error != cudaSuccess) { - fprintf(stderr, - "CUDA error with self density D2H memcpy: %s cpuid id is: %i\n ", - cudaGetErrorString(cu_error), r->cpuid); - error("Something's up with your cuda code"); - } -#endif - } /*End of looping over bundles to launch in streams*/ - - /* Make sure all the kernels and copies back are finished */ - // cudaDeviceSynchronize(); - - /*Time end of GPU work*/ - clock_gettime(CLOCK_REALTIME, &t1); - *gpu_time += - (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - /* Now copy the data back from the CPU thread-local buffers to the cells */ - /* Pack 
length counter for use in unpacking */ - int pack_length_unpack = 0; - - ticks total_cpu_unpack_ticks = 0.; - - for (int bid = 0; bid < nBundles_temp; bid++) { - /*Time unpacking*/ - clock_gettime(CLOCK_REALTIME, &t0); - - // cudaStreamSynchronize(stream[bid]); - cudaEventSynchronize(pair_end[bid]); - - clock_gettime(CLOCK_REALTIME, &t1); - *gpu_time += - (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - - /*Time unpacking*/ - // clock_gettime(CLOCK_REALTIME, &tp0); - // int bundle_first_task = pack_vars->bundle_first_task_list[bid]; - - for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { - - if (tid < tasks_packed) { - clock_gettime(CLOCK_REALTIME, &tp0); - /*grab cell and task pointers*/ - struct cell *cii = pack_vars->ci_list[tid]; - struct cell *cjj = pack_vars->cj_list[tid]; - struct task *tii = pack_vars->task_list[tid]; - /*Let's lock ci*/ - while (cell_locktree(cii)) { - ; /* spin until we acquire the lock */ - } - /*Let's lock cj*/ - while (cell_locktree(cjj)) { - ; /* spin until we acquire the lock */ - } - - const ticks tic = getticks(); - - /* Do the copy */ - runner_do_ci_cj_gpu_unpack_neat_aos_f4_g( - r, cii, cjj, parts_recv, 0, &pack_length_unpack, tid, - 2 * pack_vars->count_max_parts, e); - - const ticks toc = getticks(); - - total_cpu_unpack_ticks += toc - tic; - - /* Record things for debugging */ - cii->gpu_done_pair_g++; - cjj->gpu_done_pair_g++; - pthread_mutex_lock(&s->sleep_mutex); - atomic_dec(&s->waiting); - pthread_cond_broadcast(&s->sleep_cond); - pthread_mutex_unlock(&s->sleep_mutex); - /* Release the locks */ - cell_unlocktree(cii); - /* Release the locks */ - cell_unlocktree(cjj); - - /*Time end of unpacking*/ - clock_gettime(CLOCK_REALTIME, &tp1); - *unpack_time += (tp1.tv_sec - tp0.tv_sec) + - (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; - - /*schedule my dependencies (Only unpacks really)*/ - enqueue_dependencies(s, tii); - /*Signal sleeping runners*/ - // MATTHIEU signal_sleeping_runners(s, tii); - - tii->gpu_done = 1; - } - } - } - /* Zero counters for the next pack operations */ - pack_vars->count_parts = 0; - pack_vars->tasks_packed = 0; - - /* Write the timers back to the task */ - t->total_cpu_unpack_ticks += total_cpu_unpack_ticks; - // /*Time end of unpacking*/ - // clock_gettime(CLOCK_REALTIME, &t1); - // *packing_time += (t1.tv_sec - t0.tv_sec) + - // (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; -} /*End of GPU work*/ - -void runner_dopair1_launch_f4_g_mcpy_Ker_mcpy( - struct runner *r, struct scheduler *s, struct pack_vars_pair *pack_vars, - struct task *t, struct part_aos_f4_g_send *parts_send, - struct part_aos_f4_g_recv *parts_recv, - struct part_aos_f4_g_send *d_parts_send, - struct part_aos_f4_g_recv *d_parts_recv, cudaStream_t *stream, float d_a, - float d_H, struct engine *e, double *packing_time, double *gpu_time, - double *unpack_time, int4 *fparti_fpartj_lparti_lpartj, - cudaEvent_t *pair_end) { - - struct timespec t0, t1, tp0, tp1; // - clock_gettime(CLOCK_REALTIME, &t0); - - /* Identify the number of GPU bundles to run in ideal case*/ - int nBundles_temp = pack_vars->nBundles; - /*How many tasks have we packed?*/ - const int tasks_packed = pack_vars->tasks_packed; - - /*How many tasks should be in a bundle?*/ - const int bundle_size = pack_vars->bundle_size; - - /*tasks-packed needs decrementing before calculating packed_tmp as it was - * incremented in runner_dopair1_pack*/ - // const int packed_tmp = 2 * (tasks_packed - 1); - - /* Special case for incomplete bundles (when having leftover tasks 
not enough - * to fill a bundle) */ - if (pack_vars->launch_leftovers) { - nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; - if (tasks_packed == 0) - error("zero pair tasks packed but somehow got into GPU loop"); - // pack_vars->bundle_first_part[nBundles_temp] = - // pack_vars->task_first_part[packed_tmp - 2]; - pack_vars->bundle_first_part[nBundles_temp] = - fparti_fpartj_lparti_lpartj[tasks_packed - 1].x; - } - /* Identify the last particle for each bundle of tasks */ - for (int bid = 0; bid < nBundles_temp - 1; bid++) { - pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; - } - /* special treatment for the last bundle */ - if (nBundles_temp > 1) - pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; - else - pack_vars->bundle_last_part[0] = pack_vars->count_parts; - - /* Launch the copies for each bundle and run the GPU kernel */ - /*We don't go into this loop if tasks_left_self == 1 as - nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ - // int max_parts = 0; - for (int bid = 0; bid < nBundles_temp; bid++) { - - int max_parts_i = 0; - int max_parts_j = 0; - int parts_in_bundle_ci = 0; - int parts_in_bundle_cj = 0; - // const int first_task = bid * pack_vars->bundle_size; - // int last_task = (bid + 1) * bundle_size; - for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { - if (tid < tasks_packed) { - /*Get an estimate for the max number of parts per cell in each bundle. - * Used for determining the number of GPU CUDA blocks*/ - int count_i = fparti_fpartj_lparti_lpartj[tid].z - - fparti_fpartj_lparti_lpartj[tid].x; - parts_in_bundle_ci += count_i; - max_parts_i = max(max_parts_i, count_i); - int count_j = fparti_fpartj_lparti_lpartj[tid].w - - fparti_fpartj_lparti_lpartj[tid].y; - parts_in_bundle_cj += count_j; - max_parts_j = max(max_parts_j, count_j); - - // last_task = tid; - } - } - const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; - const int bundle_n_parts = - pack_vars->bundle_last_part[bid] - first_part_tmp_i; - - cudaMemcpyAsync(&d_parts_send[first_part_tmp_i], - &parts_send[first_part_tmp_i], - bundle_n_parts * sizeof(struct part_aos_f4_g_send), - cudaMemcpyHostToDevice, stream[bid]); - -#ifdef CUDA_DEBUG - cudaError_t cu_error = - cudaPeekAtLastError(); // cudaGetLastError(); // - // Get error code - if (cu_error != cudaSuccess) { - fprintf(stderr, - "CUDA error with pair density H2D async memcpy ci: %s cpuid id " - "is: %i\n ", - cudaGetErrorString(cu_error), r->cpuid); - error("Something's up with your cuda code"); - } -#endif - } - for (int bid = 0; bid < nBundles_temp; bid++) { - - int max_parts_i = 0; - int max_parts_j = 0; - int parts_in_bundle_ci = 0; - int parts_in_bundle_cj = 0; - // const int first_task = bid * pack_vars->bundle_size; - // int last_task = (bid + 1) * bundle_size; - for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { - if (tid < tasks_packed) { - /*Get an estimate for the max number of parts per cell in each bundle. 
- * Used for determining the number of GPU CUDA blocks*/ - int count_i = fparti_fpartj_lparti_lpartj[tid].z - - fparti_fpartj_lparti_lpartj[tid].x; - parts_in_bundle_ci += count_i; - max_parts_i = max(max_parts_i, count_i); - int count_j = fparti_fpartj_lparti_lpartj[tid].w - - fparti_fpartj_lparti_lpartj[tid].y; - parts_in_bundle_cj += count_j; - max_parts_j = max(max_parts_j, count_j); - - // last_task = tid; - } - } - const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; - const int bundle_n_parts = - pack_vars->bundle_last_part[bid] - first_part_tmp_i; - ////////////////////////////////// - // const int tasksperbundle = pack_vars->tasksperbundle; - /* LAUNCH THE GPU KERNELS for ci & cj */ - // Setup 2d grid of GPU thread blocks for ci (number of tasks is - // the y dimension and max_parts is the x dimension - int numBlocks_y = 0; // tasks_left; - int numBlocks_x = (bundle_n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; - int bundle_part_0 = pack_vars->bundle_first_part[bid]; - // int bundle_first_task = pack_vars->bundle_first_task_list[bid]; - // fprintf(stderr, "bundle_part_0 %i bundle_first_task %i\n", - // bundle_part_0, bundle_first_task); - - /* Launch the kernel for ci using data for ci and cj */ - runner_dopair_branch_gradient_gpu_aos_f4( - d_parts_send, d_parts_recv, d_a, d_H, stream[bid], numBlocks_x, - numBlocks_y, bundle_part_0, bundle_n_parts); - -#ifdef CUDA_DEBUG - cudaError_t cu_error = cudaPeekAtLastError(); // Get error code - if (cu_error != cudaSuccess) { - fprintf( - stderr, - "CUDA error with pair density kernel launch: %s cpuid id is: %i\n " - "nbx %i nby %i max_parts_i %i max_parts_j %i\n", - cudaGetErrorString(cu_error), r->cpuid, numBlocks_x, numBlocks_y, - max_parts_i, max_parts_j); - exit(0); - } -#endif - } - - for (int bid = 0; bid < nBundles_temp; bid++) { - - int max_parts_i = 0; - int max_parts_j = 0; - int parts_in_bundle_ci = 0; - int parts_in_bundle_cj = 0; - // const int first_task = bid * pack_vars->bundle_size; - // int last_task = (bid + 1) * bundle_size; - for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { - if (tid < tasks_packed) { - /*Get an estimate for the max number of parts per cell in each bundle. 
- * Used for determining the number of GPU CUDA blocks*/ - int count_i = fparti_fpartj_lparti_lpartj[tid].z - - fparti_fpartj_lparti_lpartj[tid].x; - parts_in_bundle_ci += count_i; - max_parts_i = max(max_parts_i, count_i); - int count_j = fparti_fpartj_lparti_lpartj[tid].w - - fparti_fpartj_lparti_lpartj[tid].y; - parts_in_bundle_cj += count_j; - max_parts_j = max(max_parts_j, count_j); - - // last_task = tid; - } - } - const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; - const int bundle_n_parts = - pack_vars->bundle_last_part[bid] - first_part_tmp_i; - /////////////////////////////////////////////////////////////////// - // Copy results back to CPU BUFFERS - cudaMemcpyAsync(&parts_recv[first_part_tmp_i], - &d_parts_recv[first_part_tmp_i], - bundle_n_parts * sizeof(struct part_aos_f4_g_recv), - cudaMemcpyDeviceToHost, stream[bid]); - cudaEventRecord(pair_end[bid], stream[bid]); - -#ifdef CUDA_DEBUG - cudaError_t cu_error = - cudaPeekAtLastError(); // cudaGetLastError(); // - // Get error code - if (cu_error != cudaSuccess) { - fprintf(stderr, - "CUDA error with self density D2H memcpy: %s cpuid id is: %i\n ", - cudaGetErrorString(cu_error), r->cpuid); - error("Something's up with your cuda code"); - } -#endif - } /*End of looping over bundles to launch in streams*/ - - /* Make sure all the kernels and copies back are finished */ - // cudaDeviceSynchronize(); - - /*Time end of GPU work*/ - clock_gettime(CLOCK_REALTIME, &t1); - *gpu_time += - (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - /*Time unpacking*/ - // clock_gettime(CLOCK_REALTIME, &t0); - /* Now copy the data back from the CPU thread-local buffers to the cells */ - /* Pack length counter for use in unpacking */ - int pack_length_unpack = 0; - for (int bid = 0; bid < nBundles_temp; bid++) { - clock_gettime(CLOCK_REALTIME, &t0); - - // cudaStreamSynchronize(stream[bid]); - cudaEventSynchronize(pair_end[bid]); - - clock_gettime(CLOCK_REALTIME, &t1); - *gpu_time += - (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - - /*Time unpacking*/ - // clock_gettime(CLOCK_REALTIME, &tp0); - // int bundle_first_task = pack_vars->bundle_first_task_list[bid]; - - for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { - - if (tid < tasks_packed) { - - /*grab cell and task pointers*/ - struct cell *cii = pack_vars->ci_list[tid]; - struct cell *cjj = pack_vars->cj_list[tid]; - struct task *tii = pack_vars->task_list[tid]; - - /*Let's lock ci*/ - while (cell_locktree(cii)) { - ; /* spin until we acquire the lock */ - } - /*Let's lock cj*/ - while (cell_locktree(cjj)) { - ; /* spin until we acquire the lock */ - } - /* Do the copy */ - /*Time unpacking*/ - clock_gettime(CLOCK_REALTIME, &tp0); - runner_do_ci_cj_gpu_unpack_neat_aos_f4_g( - r, cii, cjj, parts_recv, 0, &pack_length_unpack, tid, - 2 * pack_vars->count_max_parts, e); - - /* Record things for debugging */ - cii->gpu_done_pair_g++; - cjj->gpu_done_pair_g++; - - tii->gpu_done = 1; - /*Time end of unpacking*/ - clock_gettime(CLOCK_REALTIME, &tp1); - *unpack_time += (tp1.tv_sec - tp0.tv_sec) + - (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; - /*schedule my dependencies (Only unpacks really)*/ - enqueue_dependencies(s, tii); - /*Signal sleeping runners*/ - // MATTHIEU signal_sleeping_runners(s, tii); - - /* Release the locks */ - cell_unlocktree(cii); - /* Release the locks */ - cell_unlocktree(cjj); - } - } - /*Time end of unpacking*/ - // clock_gettime(CLOCK_REALTIME, &tp1); - // *unpack_time += (tp1.tv_sec - tp0.tv_sec) + 
- // (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; - // *packing_time += (tp1.tv_sec - tp0.tv_sec) + - // (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; - // *packing_time += (tp1.tv_sec - tp0.tv_sec) + - // (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; - } - /* Zero counters for the next pack operations */ - pack_vars->count_parts = 0; - pack_vars->tasks_packed = 0; - /*Time end of unpacking*/ - // clock_gettime(CLOCK_REALTIME, &t1); - // *packing_time += (t1.tv_sec - t0.tv_sec) + - // (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; -} /*End of GPU work*/ - -void runner_dopair1_launch_f(struct runner *r, struct scheduler *s, - struct pack_vars_pair *pack_vars, struct cell *ci, - struct task *t, struct part_aos_f *parts_aos, - struct part_aos_f *d_parts_aos, - cudaStream_t *stream, float d_a, float d_H, - struct engine *e, double *packing_time, - double *gpu_time) { - - struct timespec t0, t1; // - clock_gettime(CLOCK_REALTIME, &t0); - - /* Identify the number of GPU bundles to run in ideal case*/ - int nBundles_temp = pack_vars->nBundles; - /*How many tasks have we packed?*/ - const int tasks_packed = pack_vars->tasks_packed; - - /*How many tasks should be in a bundle?*/ - const int bundle_size = pack_vars->bundle_size; - - /*tasks-packed needs decrementing before calculating packed_tmp as it was - * incremented in runner_dopair1_pack*/ - const int packed_tmp = 2 * (tasks_packed - 1); - - /* Special case for incomplete bundles (when having leftover tasks not enough - * to fill a bundle) */ - if (pack_vars->launch_leftovers) { - nBundles_temp = (tasks_packed + bundle_size - 1) / bundle_size; - if (tasks_packed == 0) - error("zero pair tasks packed but somehow got into GPU loop"); - pack_vars->bundle_first_part[nBundles_temp] = - pack_vars->task_first_part[packed_tmp - 2]; - } - /* Identify the last particle for each bundle of tasks */ - for (int bid = 0; bid < nBundles_temp - 1; bid++) { - pack_vars->bundle_last_part[bid] = pack_vars->bundle_first_part[bid + 1]; - } - /* special treatment for the last bundle */ - if (nBundles_temp > 1) - pack_vars->bundle_last_part[nBundles_temp - 1] = pack_vars->count_parts; - else - pack_vars->bundle_last_part[0] = pack_vars->count_parts; - /*Copy arrays containing first and last part for each task to GPU*/ - cudaMemcpy(pack_vars->d_task_first_part, pack_vars->task_first_part, - 2 * tasks_packed * sizeof(int), cudaMemcpyHostToDevice); - cudaMemcpy(pack_vars->d_task_last_part, pack_vars->task_last_part, - 2 * tasks_packed * sizeof(int), cudaMemcpyHostToDevice); - - /*Copy cell shifts to device*/ - cudaMemcpy(pack_vars->d_shiftx, pack_vars->shiftx, - 2 * tasks_packed * sizeof(double), cudaMemcpyHostToDevice); - cudaMemcpy(pack_vars->d_shifty, pack_vars->shifty, - 2 * tasks_packed * sizeof(double), cudaMemcpyHostToDevice); - cudaMemcpy(pack_vars->d_shiftz, pack_vars->shiftz, - 2 * tasks_packed * sizeof(double), cudaMemcpyHostToDevice); - - /* Launch the copies for each bundle and run the GPU kernel */ - /*We don't go into this loop if tasks_left_self == 1 as - nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ - // int max_parts = 0; - for (int bid = 0; bid < nBundles_temp; bid++) { - - int max_parts_i = 0; - int max_parts_j = 0; - int parts_in_bundle_ci = 0; - int parts_in_bundle_cj = 0; - // const int first_task = bid * pack_vars->bundle_size; - for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { - if (tid < tasks_packed) { - /*Get an estimate for the max number of parts per cell in each bundle. 
- * Used for determining the number of GPU CUDA blocks*/ - const int tid_tmp = 2 * tid; - int count_i = pack_vars->task_last_part[tid_tmp] - - pack_vars->task_first_part[tid_tmp]; - parts_in_bundle_ci += count_i; - max_parts_i = max(max_parts_i, count_i); - int count_j = pack_vars->task_last_part[tid_tmp + 1] - - pack_vars->task_first_part[tid_tmp + 1]; - parts_in_bundle_cj += count_j; - max_parts_j = max(max_parts_j, count_j); - } - } - const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; - const int bundle_n_parts = - pack_vars->bundle_last_part[bid] - first_part_tmp_i; - cudaMemcpyAsync(&d_parts_aos[first_part_tmp_i], - &parts_aos[first_part_tmp_i], - bundle_n_parts * sizeof(struct part_aos_f), - cudaMemcpyHostToDevice, stream[bid]); - -#ifdef CUDA_DEBUG - cudaError_t cu_error = - cudaPeekAtLastError(); // cudaGetLastError(); // - // Get error code - if (cu_error != cudaSuccess) { - fprintf(stderr, - "CUDA error with pair density H2D async memcpy ci: %s cpuid id " - "is: %i\n ", - cudaGetErrorString(cu_error), r->cpuid); - error("Something's up with your cuda code"); - } -#endif - - const int tasksperbundle = pack_vars->tasksperbundle; - /* LAUNCH THE GPU KERNELS for ci & cj */ - int tid = 0; - int offset = bid * tasksperbundle; - int tasks_left = tasksperbundle; - if (bid == nBundles_temp - 1) { - tasks_left = tasks_packed - (nBundles_temp - 1) * tasksperbundle; - } - - // Setup 2d grid of GPU thread blocks for ci (number of tasks is - // the y dimension and max_parts is the x dimension - int numBlocks_y = tasks_left; - // int numBlocks_x = (max_parts_i + BLOCK_SIZE - 1) / BLOCK_SIZE; - int bundle_first_task = pack_vars->bundle_first_task_list[bid]; - const char *loop_type = "density"; - - /* Launch the kernel for ci using data for ci and cj */ - runner_dopairci_branch_density_gpu_aos_f( - d_parts_aos, pack_vars->d_task_first_part, pack_vars->d_task_last_part, - d_a, d_H, loop_type, stream[bid], bid, BLOCK_SIZE, tasks_packed, - tasksperbundle, max_parts_i, max_parts_j, numBlocks_y, tid, offset, - bundle_first_task, pack_vars->d_shiftx, pack_vars->d_shifty, - pack_vars->d_shiftz); - - /* Launch the kernel for ci using data for ci and cj */ - runner_dopaircj_branch_density_gpu_aos_f( - d_parts_aos, pack_vars->d_task_first_part, pack_vars->d_task_last_part, - d_a, d_H, loop_type, stream[bid], bid, BLOCK_SIZE, tasks_packed, - tasksperbundle, max_parts_i, max_parts_j, numBlocks_y, tid, offset, - bundle_first_task, pack_vars->d_shiftx, pack_vars->d_shifty, - pack_vars->d_shiftz); - -#ifdef CUDA_DEBUG - cu_error = cudaPeekAtLastError(); // Get error code - if (cu_error != cudaSuccess) { - fprintf( - stderr, - "CUDA error with self density kernel launch: %s cpuid id is: %i\n ", - cudaGetErrorString(cu_error), r->cpuid); - exit(0); + stderr, + "CUDA error with pair density kernel launch: %s cpuid id is: %i\n " + "nbx %i nby %i max_parts_i %i max_parts_j %i\n", + cudaGetErrorString(cu_error), r->cpuid, numBlocks_x, numBlocks_y, + max_parts_i, max_parts_j); + error("Something's up with kernel launch."); } #endif // Copy results back to CPU BUFFERS - cudaMemcpyAsync(&parts_aos[first_part_tmp_i], - &d_parts_aos[first_part_tmp_i], - bundle_n_parts * sizeof(struct part_aos_f), + cudaMemcpyAsync(&parts_recv[first_part_tmp_i], + &d_parts_recv[first_part_tmp_i], + bundle_n_parts * sizeof(struct part_aos_f4_recv), cudaMemcpyDeviceToHost, stream[bid]); + cudaEventRecord(pair_end[bid], stream[bid]); #ifdef CUDA_DEBUG cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // @@ -3744,68 
+2037,117 @@ void runner_dopair1_launch_f(struct runner *r, struct scheduler *s, } /*End of looping over bundles to launch in streams*/ /* Make sure all the kernels and copies back are finished */ - cudaDeviceSynchronize(); + // cudaDeviceSynchronize(); /*Time end of GPU work*/ clock_gettime(CLOCK_REALTIME, &t1); *gpu_time += (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - /*Time unpacking*/ - clock_gettime(CLOCK_REALTIME, &t0); + /* Now copy the data back from the CPU thread-local buffers to the cells */ /* Pack length counter for use in unpacking */ + int pack_length_unpack = 0; - for (int tid = 0; tid < tasks_packed; tid++) { + ticks total_cpu_unpack_ticks = 0; + + for (int bid = 0; bid < nBundles_temp; bid++) { + /*Time unpacking*/ + clock_gettime(CLOCK_REALTIME, &t0); - /*grab cell and task pointers*/ - struct cell *cii = pack_vars->ci_list[tid]; - struct cell *cjj = pack_vars->cj_list[tid]; - struct task *tii = pack_vars->task_list[tid]; + // cudaStreamSynchronize(stream[bid]); + cudaEventSynchronize(pair_end[bid]); - /*Let's lock ci*/ - while (cell_locktree(cii)) { - ; /* spin until we acquire the lock */ - } - /*Let's lock cj*/ - while (cell_locktree(cjj)) { - ; /* spin until we acquire the lock */ + clock_gettime(CLOCK_REALTIME, &t1); + *gpu_time += + (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + + //////////// + + /*Time unpacking*/ + // clock_gettime(CLOCK_REALTIME, &tp0); + + for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { + + if (tid < tasks_packed) { + clock_gettime(CLOCK_REALTIME, &tp0); + /*grab cell and task pointers*/ + struct cell *cii = pack_vars->ci_list[tid]; + struct cell *cjj = pack_vars->cj_list[tid]; + struct task *tii = pack_vars->task_list[tid]; + +// if(!pack_vars->task_locked){ +// /*Let's lock ci*/ +// while (cell_locktree(cii)) { +// ; /* spin until we acquire the lock */ +// } +// /*Let's lock cj*/ +// while (cell_locktree(cjj)) { +// ; /* spin until we acquire the lock */ +// } +// pack_vars->task_locked = 1; +// } + + const ticks tic = getticks(); + + /* Do the copy */ + runner_do_ci_cj_gpu_unpack_neat_aos_f4( + r, cii, cjj, parts_recv, 0, &pack_length_unpack, tid, + 2 * pack_vars->count_max_parts, e); + + const ticks toc = getticks(); + + total_cpu_unpack_ticks += toc - tic; + + /* Record things for debugging */ + cii->gpu_done_pair++; + cjj->gpu_done_pair++; + + if(pack_vars->task_locked){ + /* Release the locks */ + cell_unlocktree(cii); + /* Release the locks */ + cell_unlocktree(cjj); + pack_vars->task_locked = 0; + } + + /*Time end of unpacking*/ + clock_gettime(CLOCK_REALTIME, &tp1); + *unpack_time += (tp1.tv_sec - tp0.tv_sec) + + (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; + /*Signal sleeping runners*/ + // MATTHIEU signal_sleeping_runners(s, tii); + + tii->gpu_done = 1; + } } - /* Do the copy */ - runner_do_ci_cj_gpu_unpack_neat_aos_f(r, cii, cjj, parts_aos, 0, - &pack_length_unpack, tid, - 2 * pack_vars->count_max_parts, e); - - /* Record things for debugging */ - cii->gpu_done_pair_f++; - cjj->gpu_done_pair_f++; - - tii->gpu_done = 1; - - /*schedule my dependencies (Only unpacks really)*/ - enqueue_dependencies(s, tii); - /*Signal sleeping runners*/ - // MATTHIEU signal_sleeping_runners(s, tii); - - /* Release the locks */ - cell_unlocktree(cii); - /* Release the locks */ - cell_unlocktree(cjj); } + + + pthread_mutex_lock(&s->sleep_mutex); + atomic_sub(&s->waiting, pack_vars->top_tasks_packed); + pthread_cond_broadcast(&s->sleep_cond); + 
pthread_mutex_unlock(&s->sleep_mutex); + /* Zero counters for the next pack operations */ pack_vars->count_parts = 0; pack_vars->tasks_packed = 0; - /*Time end of unpacking*/ - clock_gettime(CLOCK_REALTIME, &t1); - *packing_time += - (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + + // /*Time end of unpacking*/ + // clock_gettime(CLOCK_REALTIME, &t1); + // *packing_time += (t1.tv_sec - t0.tv_sec) + + // (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; + + /* Write the timers back to the task */ + t->total_cpu_unpack_ticks += total_cpu_unpack_ticks; + } /*End of GPU work*/ -void runner_dopair1_launch_f4_f_one_memcpy( +void runner_dopair1_launch_f4_g_one_memcpy( struct runner *r, struct scheduler *s, struct pack_vars_pair *pack_vars, - struct task *t, struct part_aos_f4_f_send *parts_send, - struct part_aos_f4_f_recv *parts_recv, - struct part_aos_f4_f_send *d_parts_send, - struct part_aos_f4_f_recv *d_parts_recv, cudaStream_t *stream, float d_a, + struct task *t, struct part_aos_f4_g_send *parts_send, + struct part_aos_f4_g_recv *parts_recv, + struct part_aos_f4_g_send *d_parts_send, + struct part_aos_f4_g_recv *d_parts_recv, cudaStream_t *stream, float d_a, float d_H, struct engine *e, double *packing_time, double *gpu_time, double *unpack_time, int4 *fparti_fpartj_lparti_lpartj, cudaEvent_t *pair_end) { @@ -3856,8 +2198,6 @@ void runner_dopair1_launch_f4_f_one_memcpy( int max_parts_j = 0; int parts_in_bundle_ci = 0; int parts_in_bundle_cj = 0; - // const int first_task = bid * pack_vars->bundle_size; - // int last_task = (bid + 1) * bundle_size; for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { if (tid < tasks_packed) { /*Get an estimate for the max number of parts per cell in each bundle. @@ -3870,8 +2210,6 @@ void runner_dopair1_launch_f4_f_one_memcpy( fparti_fpartj_lparti_lpartj[tid].y; parts_in_bundle_cj += count_j; max_parts_j = max(max_parts_j, count_j); - - // last_task = tid; } } const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; @@ -3880,7 +2218,7 @@ void runner_dopair1_launch_f4_f_one_memcpy( cudaMemcpyAsync(&d_parts_send[first_part_tmp_i], &parts_send[first_part_tmp_i], - bundle_n_parts * sizeof(struct part_aos_f4_f_send), + bundle_n_parts * sizeof(struct part_aos_f4_g_send), cudaMemcpyHostToDevice, stream[bid]); #ifdef CUDA_DEBUG @@ -3898,27 +2236,18 @@ void runner_dopair1_launch_f4_f_one_memcpy( // const int tasksperbundle = pack_vars->tasksperbundle; /* LAUNCH THE GPU KERNELS for ci & cj */ - // int tid = 0; - // int offset = bid * tasksperbundle; - // int tasks_left = tasksperbundle; - // if (bid == nBundles_temp - 1) { - // tasks_left = - // tasks_packed - (nBundles_temp - 1) * tasksperbundle; - // } - // Setup 2d grid of GPU thread blocks for ci (number of tasks is // the y dimension and max_parts is the x dimension int numBlocks_y = 0; // tasks_left; int numBlocks_x = (bundle_n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; int bundle_part_0 = pack_vars->bundle_first_part[bid]; - // int bundle_first_task = pack_vars->bundle_first_task_list[bid]; // fprintf(stderr, "bundle_part_0 %i bundle_first_task %i\n", // bundle_part_0, bundle_first_task); /* Launch the kernel for ci using data for ci and cj */ - runner_dopair_branch_force_gpu_aos_f4(d_parts_send, d_parts_recv, d_a, d_H, - stream[bid], numBlocks_x, numBlocks_y, - bundle_part_0, bundle_n_parts); + runner_dopair_branch_gradient_gpu_aos_f4( + d_parts_send, d_parts_recv, d_a, d_H, stream[bid], numBlocks_x, + numBlocks_y, bundle_part_0, bundle_n_parts); #ifdef CUDA_DEBUG cu_error 
= cudaPeekAtLastError(); // Get error code @@ -3936,7 +2265,7 @@ void runner_dopair1_launch_f4_f_one_memcpy( // Copy results back to CPU BUFFERS cudaMemcpyAsync(&parts_recv[first_part_tmp_i], &d_parts_recv[first_part_tmp_i], - bundle_n_parts * sizeof(struct part_aos_f4_f_recv), + bundle_n_parts * sizeof(struct part_aos_f4_g_recv), cudaMemcpyDeviceToHost, stream[bid]); cudaEventRecord(pair_end[bid], stream[bid]); @@ -3962,7 +2291,9 @@ void runner_dopair1_launch_f4_f_one_memcpy( /* Now copy the data back from the CPU thread-local buffers to the cells */ /* Pack length counter for use in unpacking */ int pack_length_unpack = 0; + ticks total_cpu_unpack_ticks = 0.; + for (int bid = 0; bid < nBundles_temp; bid++) { /*Time unpacking*/ clock_gettime(CLOCK_REALTIME, &t0); @@ -3998,7 +2329,7 @@ void runner_dopair1_launch_f4_f_one_memcpy( const ticks tic = getticks(); /* Do the copy */ - runner_do_ci_cj_gpu_unpack_neat_aos_f4_f( + runner_do_ci_cj_gpu_unpack_neat_aos_f4_g( r, cii, cjj, parts_recv, 0, &pack_length_unpack, tid, 2 * pack_vars->count_max_parts, e); @@ -4007,15 +2338,15 @@ void runner_dopair1_launch_f4_f_one_memcpy( total_cpu_unpack_ticks += toc - tic; /* Record things for debugging */ - cii->gpu_done_pair_f++; - cjj->gpu_done_pair_f++; + cii->gpu_done_pair_g++; + cjj->gpu_done_pair_g++; pthread_mutex_lock(&s->sleep_mutex); atomic_dec(&s->waiting); pthread_cond_broadcast(&s->sleep_cond); pthread_mutex_unlock(&s->sleep_mutex); - // /* Release the locks */ + /* Release the locks */ cell_unlocktree(cii); - // /* Release the locks */ + /* Release the locks */ cell_unlocktree(cjj); /*Time end of unpacking*/ @@ -4044,7 +2375,7 @@ void runner_dopair1_launch_f4_f_one_memcpy( // (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; } /*End of GPU work*/ -void runner_dopair1_launch_f4_f_mcpy_Ker_mcpy( +void runner_dopair1_launch_f4_f_one_memcpy( struct runner *r, struct scheduler *s, struct pack_vars_pair *pack_vars, struct task *t, struct part_aos_f4_f_send *parts_send, struct part_aos_f4_f_recv *parts_recv, @@ -4065,6 +2396,10 @@ void runner_dopair1_launch_f4_f_mcpy_Ker_mcpy( /*How many tasks should be in a bundle?*/ const int bundle_size = pack_vars->bundle_size; + /*tasks-packed needs decrementing before calculating packed_tmp as it was + * incremented in runner_dopair1_pack*/ + // const int packed_tmp = 2 * (tasks_packed - 1); + /* Special case for incomplete bundles (when having leftover tasks not enough * to fill a bundle) */ if (pack_vars->launch_leftovers) { @@ -4089,6 +2424,7 @@ void runner_dopair1_launch_f4_f_mcpy_Ker_mcpy( /* Launch the copies for each bundle and run the GPU kernel */ /*We don't go into this loop if tasks_left_self == 1 as nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ + // int max_parts = 0; for (int bid = 0; bid < nBundles_temp; bid++) { int max_parts_i = 0; @@ -4134,42 +2470,23 @@ void runner_dopair1_launch_f4_f_mcpy_Ker_mcpy( error("Something's up with your cuda code"); } #endif - } - for (int bid = 0; bid < nBundles_temp; bid++) { - - int max_parts_i = 0; - int max_parts_j = 0; - int parts_in_bundle_ci = 0; - int parts_in_bundle_cj = 0; - // const int first_task = bid * pack_vars->bundle_size; - // int last_task = (bid + 1) * bundle_size; - for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { - if (tid < tasks_packed) { - /*Get an estimate for the max number of parts per cell in each bundle. 
- * Used for determining the number of GPU CUDA blocks*/ - int count_i = fparti_fpartj_lparti_lpartj[tid].z - - fparti_fpartj_lparti_lpartj[tid].x; - parts_in_bundle_ci += count_i; - max_parts_i = max(max_parts_i, count_i); - int count_j = fparti_fpartj_lparti_lpartj[tid].w - - fparti_fpartj_lparti_lpartj[tid].y; - parts_in_bundle_cj += count_j; - max_parts_j = max(max_parts_j, count_j); - // last_task = tid; - } - } - const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; - const int bundle_n_parts = - pack_vars->bundle_last_part[bid] - first_part_tmp_i; - ////////////////////////////////// + // const int tasksperbundle = pack_vars->tasksperbundle; /* LAUNCH THE GPU KERNELS for ci & cj */ + // int tid = 0; + // int offset = bid * tasksperbundle; + // int tasks_left = tasksperbundle; + // if (bid == nBundles_temp - 1) { + // tasks_left = + // tasks_packed - (nBundles_temp - 1) * tasksperbundle; + // } + // Setup 2d grid of GPU thread blocks for ci (number of tasks is // the y dimension and max_parts is the x dimension int numBlocks_y = 0; // tasks_left; int numBlocks_x = (bundle_n_parts + BLOCK_SIZE - 1) / BLOCK_SIZE; int bundle_part_0 = pack_vars->bundle_first_part[bid]; - // int bundle_first_task = pack_vars->bundle_first_task_list[bid]; + // int bundle_first_task = pack_vars->bundle_first_task_list[bid]; // fprintf(stderr, "bundle_part_0 %i bundle_first_task %i\n", // bundle_part_0, bundle_first_task); @@ -4179,7 +2496,7 @@ void runner_dopair1_launch_f4_f_mcpy_Ker_mcpy( bundle_part_0, bundle_n_parts); #ifdef CUDA_DEBUG - cudaError_t cu_error = cudaPeekAtLastError(); // Get error code + cu_error = cudaPeekAtLastError(); // Get error code if (cu_error != cudaSuccess) { fprintf( stderr, @@ -4190,32 +2507,7 @@ void runner_dopair1_launch_f4_f_mcpy_Ker_mcpy( exit(0); } #endif - } - - for (int bid = 0; bid < nBundles_temp; bid++) { - int max_parts_i = 0; - int max_parts_j = 0; - int parts_in_bundle_ci = 0; - int parts_in_bundle_cj = 0; - for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { - if (tid < tasks_packed) { - /*Get an estimate for the max number of parts per cell in each bundle. 
- * Used for determining the number of GPU CUDA blocks*/ - int count_i = fparti_fpartj_lparti_lpartj[tid].z - - fparti_fpartj_lparti_lpartj[tid].x; - parts_in_bundle_ci += count_i; - max_parts_i = max(max_parts_i, count_i); - int count_j = fparti_fpartj_lparti_lpartj[tid].w - - fparti_fpartj_lparti_lpartj[tid].y; - parts_in_bundle_cj += count_j; - max_parts_j = max(max_parts_j, count_j); - } - } - const int first_part_tmp_i = pack_vars->bundle_first_part[bid]; - const int bundle_n_parts = - pack_vars->bundle_last_part[bid] - first_part_tmp_i; - /////////////////////////////////////////////////////////////////// // Copy results back to CPU BUFFERS cudaMemcpyAsync(&parts_recv[first_part_tmp_i], &d_parts_recv[first_part_tmp_i], @@ -4224,9 +2516,8 @@ void runner_dopair1_launch_f4_f_mcpy_Ker_mcpy( cudaEventRecord(pair_end[bid], stream[bid]); #ifdef CUDA_DEBUG - cudaError_t cu_error = - cudaPeekAtLastError(); // cudaGetLastError(); // - // Get error code + cu_error = cudaPeekAtLastError(); // cudaGetLastError(); // + // Get error code if (cu_error != cudaSuccess) { fprintf(stderr, "CUDA error with self density D2H memcpy: %s cpuid id is: %i\n ", @@ -4243,12 +2534,12 @@ void runner_dopair1_launch_f4_f_mcpy_Ker_mcpy( clock_gettime(CLOCK_REALTIME, &t1); *gpu_time += (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - /*Time unpacking*/ - // clock_gettime(CLOCK_REALTIME, &t0); /* Now copy the data back from the CPU thread-local buffers to the cells */ /* Pack length counter for use in unpacking */ int pack_length_unpack = 0; + ticks total_cpu_unpack_ticks = 0.; for (int bid = 0; bid < nBundles_temp; bid++) { + /*Time unpacking*/ clock_gettime(CLOCK_REALTIME, &t0); // cudaStreamSynchronize(stream[bid]); @@ -4259,17 +2550,17 @@ void runner_dopair1_launch_f4_f_mcpy_Ker_mcpy( (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; /*Time unpacking*/ - // clock_gettime(CLOCK_REALTIME, &tp0); + // clock_gettime(CLOCK_REALTIME, &tp0); + // int bundle_first_task = pack_vars->bundle_first_task_list[bid]; for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { if (tid < tasks_packed) { - + clock_gettime(CLOCK_REALTIME, &tp0); /*grab cell and task pointers*/ struct cell *cii = pack_vars->ci_list[tid]; struct cell *cjj = pack_vars->cj_list[tid]; struct task *tii = pack_vars->task_list[tid]; - /*Let's lock ci*/ while (cell_locktree(cii)) { ; /* spin until we acquire the lock */ @@ -4278,46 +2569,51 @@ void runner_dopair1_launch_f4_f_mcpy_Ker_mcpy( while (cell_locktree(cjj)) { ; /* spin until we acquire the lock */ } + + const ticks tic = getticks(); + /* Do the copy */ - /*Time unpacking*/ - clock_gettime(CLOCK_REALTIME, &tp0); runner_do_ci_cj_gpu_unpack_neat_aos_f4_f( r, cii, cjj, parts_recv, 0, &pack_length_unpack, tid, 2 * pack_vars->count_max_parts, e); + const ticks toc = getticks(); + + total_cpu_unpack_ticks += toc - tic; + /* Record things for debugging */ cii->gpu_done_pair_f++; cjj->gpu_done_pair_f++; + pthread_mutex_lock(&s->sleep_mutex); + atomic_dec(&s->waiting); + pthread_cond_broadcast(&s->sleep_cond); + pthread_mutex_unlock(&s->sleep_mutex); + // /* Release the locks */ + cell_unlocktree(cii); + // /* Release the locks */ + cell_unlocktree(cjj); - tii->gpu_done = 1; /*Time end of unpacking*/ clock_gettime(CLOCK_REALTIME, &tp1); *unpack_time += (tp1.tv_sec - tp0.tv_sec) + (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; + /*schedule my dependencies (Only unpacks really)*/ enqueue_dependencies(s, tii); /*Signal sleeping runners*/ // MATTHIEU 
signal_sleeping_runners(s, tii); - /* Release the locks */ - cell_unlocktree(cii); - /* Release the locks */ - cell_unlocktree(cjj); + tii->gpu_done = 1; } } - /*Time end of unpacking*/ - // clock_gettime(CLOCK_REALTIME, &tp1); - // *unpack_time += (tp1.tv_sec - tp0.tv_sec) + - // (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; - // *packing_time += (tp1.tv_sec - tp0.tv_sec) + - // (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; - // *packing_time += (tp1.tv_sec - tp0.tv_sec) + - // (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; } /* Zero counters for the next pack operations */ pack_vars->count_parts = 0; pack_vars->tasks_packed = 0; - /*Time end of unpacking*/ + + /* Write the timers back to the task */ + t->total_cpu_unpack_ticks += total_cpu_unpack_ticks; + // /*Time end of unpacking*/ // clock_gettime(CLOCK_REALTIME, &t1); // *packing_time += (t1.tv_sec - t0.tv_sec) + // (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; From 8a86d352820cb984fcc9cf8e36bc0ef8c841f2ff Mon Sep 17 00:00:00 2001 From: Abouzied Date: Wed, 5 Mar 2025 16:48:30 +0000 Subject: [PATCH 189/217] Removed 3 un-used pack functions only leaving the functions we actually use --- src/runner_doiact_functions_hydro_gpu.h | 175 ------------------------ 1 file changed, 175 deletions(-) diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index 2667a6160d..9ada9f8544 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -107,63 +107,6 @@ struct pack_vars_pair_f4 { #include "runner_gpu_pack_functions.h" #include "task.h" #define CUDA_DEBUG -void runner_doself1_pack(struct runner *r, struct scheduler *s, - struct pack_vars_self *pack_vars, struct cell *ci, - struct task *t, struct part_aos *parts_aos, - int *packing_time) { - /* Timers for how long this all takes. - * t0 and t1 are from start to finish including GPU calcs - * tp0 and tp1 only time packing and unpacking*/ - struct timespec t0, t1; // - clock_gettime(CLOCK_REALTIME, &t0); - - int tasks_packed = pack_vars->tasks_packed; - pack_vars->cellx[tasks_packed] = ci->loc[0]; - pack_vars->celly[tasks_packed] = ci->loc[1]; - pack_vars->cellz[tasks_packed] = ci->loc[2]; - /*Get pointers to the list of tasks and cells packed*/ - pack_vars->task_list[tasks_packed] = t; - pack_vars->cell_list[tasks_packed] = ci; - // /* Identify row in particle arrays where this task starts*/ - pack_vars->task_first_part[tasks_packed] = pack_vars->count_parts; - int *count_parts_self = &pack_vars->count_parts; - /* This re-arranges the particle data from cell->hydro->parts into a - long array of part structs*/ - runner_doself1_gpu_pack_neat_aos( - r, ci, parts_aos, 0 /*timer. 
0 no timing, 1 for timing*/, - count_parts_self, tasks_packed, pack_vars->count_max_parts); - // // identify the row in the array where this task ends (row id of its - // last particle) - pack_vars->task_last_part[tasks_packed] = pack_vars->count_parts; - /* Identify first particle for each bundle of tasks */ - const int bundle_size = pack_vars->bundle_size; - if (tasks_packed % bundle_size == 0) { - int bid = tasks_packed / bundle_size; - pack_vars->bundle_first_part[bid] = - pack_vars->task_first_part[tasks_packed]; - pack_vars->bundle_first_task_list[bid] = tasks_packed; - } - /* Tell the cell it has been packed */ - ci->pack_done++; - /* Record that we have now done a packing (self) */ - int qid = r->qid; - atomic_dec(&(s->queues[qid].n_packs_self_left_d)); - t->done = 1; - /* Release the lock on the cell */ - task_unlock(t); - pack_vars->tasks_packed++; - pack_vars->launch = 0; - pack_vars->launch_leftovers = 0; - if ((s->queues[qid].n_packs_self_left_d == 0)) - pack_vars->launch_leftovers = 1; - if (pack_vars->tasks_packed == pack_vars->target_n_tasks) - pack_vars->launch = 1; - /*Add time to packing_time. Timer for end of GPU work after the if(launch || - * launch_leftovers statement)*/ - clock_gettime(CLOCK_REALTIME, &t1); - *packing_time += - (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; -} double runner_doself1_pack_f4(struct runner *r, struct scheduler *s, struct pack_vars_self *pack_vars, struct cell *ci, @@ -242,65 +185,6 @@ double runner_doself1_pack_f4(struct runner *r, struct scheduler *s, return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; } -void runner_doself1_pack_g(struct runner *r, struct scheduler *s, - struct pack_vars_self *pack_vars, struct cell *ci, - struct task *t, struct part_aos_g *parts_aos, - double *packing_time) { - - /* Timers for how long this all takes. - * t0 and t1 are from start to finish including GPU calcs - * tp0 and tp1 only time packing and unpacking*/ - struct timespec t0, t1; // - clock_gettime(CLOCK_REALTIME, &t0); - - int tasks_packed = pack_vars->tasks_packed; - pack_vars->cellx[tasks_packed] = ci->loc[0]; - pack_vars->celly[tasks_packed] = ci->loc[1]; - pack_vars->cellz[tasks_packed] = ci->loc[2]; - /*Get pointers to the list of tasks and cells packed*/ - pack_vars->task_list[tasks_packed] = t; - pack_vars->cell_list[tasks_packed] = ci; - // /* Identify row in particle arrays where this task starts*/ - pack_vars->task_first_part[tasks_packed] = pack_vars->count_parts; - int *count_parts_self = &pack_vars->count_parts; - /* This re-arranges the particle data from cell->hydro->parts into a - long array of part structs*/ - runner_doself1_gpu_pack_neat_aos_g( - r, ci, parts_aos, 0 /*timer. 
0 no timing, 1 for timing*/, - count_parts_self, tasks_packed, pack_vars->count_max_parts); - // // identify the row in the array where this task ends (row id of its - // last particle) - pack_vars->task_last_part[tasks_packed] = pack_vars->count_parts; - /* Identify first particle for each bundle of tasks */ - const int bundle_size = pack_vars->bundle_size; - if (tasks_packed % bundle_size == 0) { - int bid = tasks_packed / bundle_size; - pack_vars->bundle_first_part[bid] = - pack_vars->task_first_part[tasks_packed]; - pack_vars->bundle_first_task_list[bid] = tasks_packed; - } - /* Tell the cell it has been packed */ - ci->pack_done_g++; - /* Record that we have now done a packing (self) */ - int qid = r->qid; - atomic_dec(&(s->queues[qid].n_packs_self_left_g)); - t->done = 1; - /* Release the lock on the cell */ - task_unlock(t); - pack_vars->tasks_packed++; - pack_vars->launch = 0; - pack_vars->launch_leftovers = 0; - if ((s->queues[qid].n_packs_self_left_g == 0)) - pack_vars->launch_leftovers = 1; - if (pack_vars->tasks_packed == pack_vars->target_n_tasks) - pack_vars->launch = 1; - /*Add time to packing_time. Timer for end of GPU work after the if(launch || - * launch_leftovers statement)*/ - clock_gettime(CLOCK_REALTIME, &t1); - *packing_time += - (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; -} - double runner_doself1_pack_f4_g(struct runner *r, struct scheduler *s, struct pack_vars_self *pack_vars, struct cell *ci, struct task *t, @@ -377,65 +261,6 @@ double runner_doself1_pack_f4_g(struct runner *r, struct scheduler *s, return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; } -void runner_doself1_pack_f(struct runner *r, struct scheduler *s, - struct pack_vars_self *pack_vars, struct cell *ci, - struct task *t, struct part_aos_f *parts_aos, - double *packing_time) { - - /* Timers for how long this all takes. - * t0 and t1 are from start to finish including GPU calcs - * tp0 and tp1 only time packing and unpacking*/ - struct timespec t0, t1; // - clock_gettime(CLOCK_REALTIME, &t0); - - int tasks_packed = pack_vars->tasks_packed; - pack_vars->cellx[tasks_packed] = ci->loc[0]; - pack_vars->celly[tasks_packed] = ci->loc[1]; - pack_vars->cellz[tasks_packed] = ci->loc[2]; - /*Get pointers to the list of tasks and cells packed*/ - pack_vars->task_list[tasks_packed] = t; - pack_vars->cell_list[tasks_packed] = ci; - // /* Identify row in particle arrays where this task starts*/ - pack_vars->task_first_part[tasks_packed] = pack_vars->count_parts; - int *count_parts_self = &pack_vars->count_parts; - /* This re-arranges the particle data from cell->hydro->parts into a - long array of part structs*/ - runner_doself1_gpu_pack_neat_aos_f( - r, ci, parts_aos, 0 /*timer. 
0 no timing, 1 for timing*/, - count_parts_self, tasks_packed, pack_vars->count_max_parts); - // // identify the row in the array where this task ends (row id of its - // last particle) - pack_vars->task_last_part[tasks_packed] = pack_vars->count_parts; - /* Identify first particle for each bundle of tasks */ - const int bundle_size = pack_vars->bundle_size; - if (tasks_packed % bundle_size == 0) { - int bid = tasks_packed / bundle_size; - pack_vars->bundle_first_part[bid] = - pack_vars->task_first_part[tasks_packed]; - pack_vars->bundle_first_task_list[bid] = tasks_packed; - } - /* Tell the cell it has been packed */ - ci->pack_done_f++; - /* Record that we have now done a packing (self) */ - int qid = r->qid; - atomic_dec(&(s->queues[qid].n_packs_self_left_f)); - t->done = 1; - /* Release the lock on the cell */ - task_unlock(t); - pack_vars->tasks_packed++; - pack_vars->launch = 0; - pack_vars->launch_leftovers = 0; - if ((s->queues[qid].n_packs_self_left_f == 0)) - pack_vars->launch_leftovers = 1; - if (pack_vars->tasks_packed == pack_vars->target_n_tasks) - pack_vars->launch = 1; - /*Add time to packing_time. Timer for end of GPU work after the if(launch || - * launch_leftovers statement)*/ - clock_gettime(CLOCK_REALTIME, &t1); - *packing_time += - (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; -} - double runner_doself1_pack_f4_f(struct runner *r, struct scheduler *s, struct pack_vars_self *pack_vars, struct cell *ci, struct task *t, From d4e368ba41f975da263c62473157249e0d8b6523 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Wed, 5 Mar 2025 16:56:46 +0000 Subject: [PATCH 190/217] Removed ALL un-necessary functions now from runner_doiact...gpu.h --- src/runner_doiact_functions_hydro_gpu.h | 372 +++--------------------- 1 file changed, 36 insertions(+), 336 deletions(-) diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index 9ada9f8544..5382ae88ed 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -338,100 +338,6 @@ double runner_doself1_pack_f4_f(struct runner *r, struct scheduler *s, return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; } -void runner_dopair1_pack(struct runner *r, struct scheduler *s, - struct pack_vars_pair *pack_vars, struct cell *ci, - struct cell *cj, struct task *t, - struct part_aos *parts_aos, struct engine *e, - double *packing_time) { - - /* Timers for how long this all takes. - * t0 and t1 are from start to finish including GPU calcs - * tp0 and tp1 only time packing and unpacking*/ - struct timespec t0, t1; // - clock_gettime(CLOCK_REALTIME, &t0); - int tasks_packed = pack_vars->tasks_packed; - - double x_tmp = 0.0, y_tmp = 0.0, z_tmp = 0.0; - /*Get the shifts in case of periodics*/ - space_getsid_GPU(e->s, &ci, &cj, &x_tmp, &y_tmp, &z_tmp); - - /*Get pointers to the list of tasks and cells packed*/ - pack_vars->task_list[tasks_packed] = t; - pack_vars->ci_list[tasks_packed] = ci; - pack_vars->cj_list[tasks_packed] = cj; - - float3 shift_tmp = {x_tmp, y_tmp, z_tmp}; - - const int count_ci = ci->hydro.count; - const int count_cj = cj->hydro.count; - - /*Assign an id for this task*/ - const int tid = tasks_packed; - /*Indexing increment per task is 2 fot these arrays*/ - const int packed_tmp = tasks_packed * 2; - - /* Find first parts in task for ci and cj. Packed_tmp is index for cell i. 
- * packed_tmp+1 is index for cell j */ - pack_vars->task_first_part[packed_tmp] = pack_vars->count_parts; - pack_vars->task_first_part[packed_tmp + 1] = - pack_vars->count_parts + count_ci; - - int *count_parts = &pack_vars->count_parts; - // if(r->cpuid == 0)fprintf(stderr, "cpu %i before count %i\n", r->cpuid, - // pack_vars->count_parts); - /* This re-arranges the particle data from cell->hydro->parts into a - long array of part structs*/ - runner_do_ci_cj_gpu_pack_neat_aos( - r, ci, cj, parts_aos, 0 /*timer. 0 no timing, 1 for timing*/, count_parts, - tid, pack_vars->count_max_parts, count_ci, count_cj, shift_tmp); - // runner_doself1_gpu_pack_neat_aos(r, ci, parts_aos, 0/*timer. 0 no - // timing, 1 for timing*/, count_parts, tasks_packed, - // pack_vars->count_max_parts); //This may cause an issue. Be sure to test - // that - // pack_vars->count_parts is actually increment here - /* Find last parts in task for ci and cj. Packed_tmp is index for cell i. - * packed_tmp+1 is index for cell j */ - - // if(r->cpuid == 0)fprintf(stderr, "cpu %i after count %i pack_vars_count - // %i\n", r->cpuid, *count_parts, pack_vars->count_parts); - pack_vars->task_last_part[packed_tmp] = pack_vars->count_parts - count_cj; - pack_vars->task_last_part[packed_tmp + 1] = pack_vars->count_parts; - - /* Tell the cells they have been packed */ - ci->pack_done++; - cj->pack_done++; - - /* Identify first particle for each bundle of tasks */ - const int bundle_size = pack_vars->bundle_size; - if (tasks_packed % bundle_size == 0) { - int bid = tasks_packed / bundle_size; - pack_vars->bundle_first_part[bid] = pack_vars->task_first_part[packed_tmp]; - pack_vars->bundle_first_task_list[bid] = tasks_packed; - } - - /* Record that we have now done a packing (self) */ - int qid = r->qid; - atomic_dec(&(s->queues[qid].n_packs_pair_left_d)); - t->done = 1; - - pack_vars->tasks_packed++; - pack_vars->launch = 0; - pack_vars->launch_leftovers = 0; - if ((s->queues[qid].n_packs_pair_left_d == 0)) - pack_vars->launch_leftovers = 1; - if (pack_vars->tasks_packed == pack_vars->target_n_tasks) - pack_vars->launch = 1; - /*Add time to packing_time. Timer for end of GPU work after the if(launch || - * launch_leftovers statement)*/ - clock_gettime(CLOCK_REALTIME, &t1); - *packing_time += - (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; - /* Copies done. Release the lock ! */ - // task_unlock(t); - cell_unlocktree(ci); - cell_unlocktree(cj); -} - void runner_recurse_gpu(struct runner *r, struct scheduler *s, struct pack_vars_pair *restrict pack_vars, struct cell *ci, struct cell *cj, struct task *t, @@ -439,45 +345,46 @@ void runner_recurse_gpu(struct runner *r, struct scheduler *s, struct engine *e, int4 *fparti_fpartj_lparti_lpartj, int *n_leafs_found, struct cell ** cells_left, struct cell ** cells_right, int depth) { - /* Should we even bother? A. Nasar: For GPU code we need to be clever about this */ - if (!CELL_IS_ACTIVE(ci, e) && !CELL_IS_ACTIVE(cj, e)) return; - if (ci->hydro.count == 0 || cj->hydro.count == 0) return; - - /* Get the type of pair and flip ci/cj if needed. */ - double shift[3]; - const int sid = space_getsid_and_swap_cells(s, &ci, &cj, shift); - - /* Recurse? 
*/ - if (cell_can_recurse_in_pair_hydro_task(ci) && - cell_can_recurse_in_pair_hydro_task(cj)) { - struct cell_split_pair *csp = &cell_split_pairs[sid]; - for (int k = 0; k < csp->count; k++) { - const int pid = csp->pairs[k].pid; - const int pjd = csp->pairs[k].pjd; - /*Do we want to do anything before we recurse?*/ - - /*We probably want to record */ - if (ci->progeny[pid] != NULL && cj->progeny[pjd] != NULL){ - runner_recurse_gpu(r, s, pack_vars, ci->progeny[pid], cj->progeny[pjd], t, parts_send, e, fparti_fpartj_lparti_lpartj, - n_leafs_found, cells_left, cells_right, depth + 1); + + /* Should we even bother? A. Nasar: For GPU code we need to be clever about this */ + if (!CELL_IS_ACTIVE(ci, e) && !CELL_IS_ACTIVE(cj, e)) return; + if (ci->hydro.count == 0 || cj->hydro.count == 0) return; + + /* Get the type of pair and flip ci/cj if needed. */ + double shift[3]; + const int sid = space_getsid_and_swap_cells(s, &ci, &cj, shift); + + /* Recurse? */ + if (cell_can_recurse_in_pair_hydro_task(ci) && + cell_can_recurse_in_pair_hydro_task(cj)) { + struct cell_split_pair *csp = &cell_split_pairs[sid]; + for (int k = 0; k < csp->count; k++) { + const int pid = csp->pairs[k].pid; + const int pjd = csp->pairs[k].pjd; + /*Do we want to do anything before we recurse?*/ + + /*We probably want to record */ + if (ci->progeny[pid] != NULL && cj->progeny[pjd] != NULL){ + runner_recurse_gpu(r, s, pack_vars, ci->progeny[pid], cj->progeny[pjd], t, parts_send, e, fparti_fpartj_lparti_lpartj, + n_leafs_found, cells_left, cells_right, depth + 1); // message("recursing to depth %i", depth + 1); - } - } } - else if (CELL_IS_ACTIVE(ci, e) || CELL_IS_ACTIVE(cj, e)) { + } + } + else if (CELL_IS_ACTIVE(ci, e) || CELL_IS_ACTIVE(cj, e)) { // else { //A .Nasar: WE DEFO HAVE A LEAF - /* if any cell empty: skip */ - if(ci->hydro.count == 0 || cj->hydro.count == 0) return; - /*for all leafs to be sent add to cell list */ - cells_left[*n_leafs_found] = ci; - cells_right[*n_leafs_found] = cj; + /* if any cell empty: skip */ + if(ci->hydro.count == 0 || cj->hydro.count == 0) return; + /*for all leafs to be sent add to cell list */ + cells_left[*n_leafs_found] = ci; + cells_right[*n_leafs_found] = cj; // message("incrementing"); - *n_leafs_found = *n_leafs_found + 1; - if(*n_leafs_found >= 1024) - error("Created %i more than expected leaf cells. depth %i", *n_leafs_found, depth); - } + *n_leafs_found = *n_leafs_found + 1; + if(*n_leafs_found >= 1024) + error("Created %i more than expected leaf cells. depth %i", *n_leafs_found, depth); + } -} +}; double runner_dopair1_pack_f4(struct runner *r, struct scheduler *s, struct pack_vars_pair *restrict pack_vars, @@ -597,110 +504,7 @@ double runner_dopair1_pack_f4(struct runner *r, struct scheduler *s, * launch_leftovers statement)*/ clock_gettime(CLOCK_REALTIME, &t1); return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; -} - -void runner_dopair1_pack_g(struct runner *r, struct scheduler *s, - struct pack_vars_pair *pack_vars, struct cell *ci, - struct cell *cj, struct task *t, - struct part_aos_g *parts_aos, struct engine *e, - double *packing_time) { - - /* Timers for how long this all takes. 
- * t0 and t1 are from start to finish including GPU calcs - * tp0 and tp1 only time packing and unpacking*/ - struct timespec t0, t1; // - clock_gettime(CLOCK_REALTIME, &t0); - int tasks_packed = pack_vars->tasks_packed; - const int tid_tmp = 2 * tasks_packed; - /*shifts for ci*/ - pack_vars->shiftx[tid_tmp] = 0.0; - pack_vars->shifty[tid_tmp] = 0.0; - pack_vars->shiftz[tid_tmp] = 0.0; - /*shifts for cj. Stored using strided indexing (stride of two per task)*/ - pack_vars->shiftx[tid_tmp + 1] = 0.0; - pack_vars->shifty[tid_tmp + 1] = 0.0; - pack_vars->shiftz[tid_tmp + 1] = 0.0; - - double x_tmp = 0.0, y_tmp = 0.0, z_tmp = 0.0; - /*Get the shifts in case of periodics*/ - space_getsid_GPU(e->s, &ci, &cj, &x_tmp, &y_tmp, &z_tmp); - - /*Get pointers to the list of tasks and cells packed*/ - pack_vars->task_list[tasks_packed] = t; - pack_vars->ci_list[tasks_packed] = ci; - pack_vars->cj_list[tasks_packed] = cj; - - const double cjx = cj->loc[0]; - const double cjy = cj->loc[1]; - const double cjz = cj->loc[2]; - - /*Correct the shifts for cell i*/ - pack_vars->shiftx[tid_tmp] = x_tmp + cjx; - pack_vars->shifty[tid_tmp] = y_tmp + cjy; - pack_vars->shiftz[tid_tmp] = z_tmp + cjz; - /*Shift for cell j is it's position. Stored using strided indexing (stride of - * two per task)*/ - pack_vars->shiftx[tid_tmp + 1] = cjx; - pack_vars->shifty[tid_tmp + 1] = cjy; - pack_vars->shiftz[tid_tmp + 1] = cjz; - - const int count_ci = ci->hydro.count; - const int count_cj = cj->hydro.count; - - /*Assign an id for this task*/ - const int tid = tasks_packed; - /*Indexing increment per task is 2 fot these arrays*/ - const int packed_tmp = tasks_packed * 2; - - /* Find first parts in task for ci and cj. Packed_tmp is index for cell i. - * packed_tmp+1 is index for cell j */ - pack_vars->task_first_part[packed_tmp] = pack_vars->count_parts; - pack_vars->task_first_part[packed_tmp + 1] = - pack_vars->count_parts + count_ci; - - int *count_parts = &pack_vars->count_parts; - /* This re-arranges the particle data from cell->hydro->parts into a - long array of part structs*/ - runner_do_ci_cj_gpu_pack_neat_aos_g( - r, ci, cj, parts_aos, 0 /*timer. 0 no timing, 1 for timing*/, count_parts, - tid, pack_vars->count_max_parts, count_ci, count_cj); - /* Find last parts in task for ci and cj. Packed_tmp is index for cell i. - * packed_tmp+1 is index for cell j */ - - pack_vars->task_last_part[packed_tmp] = pack_vars->count_parts - count_cj; - pack_vars->task_last_part[packed_tmp + 1] = pack_vars->count_parts; - - /* Tell the cells they have been packed */ - ci->pack_done_g++; - cj->pack_done_g++; - - /* Identify first particle for each bundle of tasks */ - const int bundle_size = pack_vars->bundle_size; - if (tasks_packed % bundle_size == 0) { - int bid = tasks_packed / bundle_size; - pack_vars->bundle_first_part[bid] = pack_vars->task_first_part[packed_tmp]; - pack_vars->bundle_first_task_list[bid] = tasks_packed; - } - - /* Record that we have now done a packing (self) */ - int qid = r->qid; - atomic_dec(&(s->queues[qid].n_packs_pair_left_g)); - t->done = 1; - /* Copies done. Release the lock ! */ - task_unlock(t); - pack_vars->tasks_packed++; - pack_vars->launch = 0; - pack_vars->launch_leftovers = 0; - if ((s->queues[qid].n_packs_pair_left_g == 0)) - pack_vars->launch_leftovers = 1; - if (pack_vars->tasks_packed == pack_vars->target_n_tasks) - pack_vars->launch = 1; - /*Add time to packing_time. 
Timer for end of GPU work after the if(launch || - * launch_leftovers statement)*/ - clock_gettime(CLOCK_REALTIME, &t1); - *packing_time += - (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; -} +}; double runner_dopair1_pack_f4_g(struct runner *r, struct scheduler *s, struct pack_vars_pair *restrict pack_vars, @@ -819,110 +623,6 @@ double runner_dopair1_pack_f4_g(struct runner *r, struct scheduler *s, return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; } -void runner_dopair1_pack_f(struct runner *r, struct scheduler *s, - struct pack_vars_pair *pack_vars, struct cell *ci, - struct cell *cj, struct task *t, - struct part_aos_f *parts_aos, struct engine *e, - double *packing_time) { - - /* Timers for how long this all takes. - * t0 and t1 are from start to finish including GPU calcs - * tp0 and tp1 only time packing and unpacking*/ - struct timespec t0, t1; // - clock_gettime(CLOCK_REALTIME, &t0); - int tasks_packed = - pack_vars->tasks_packed; // Copy pasted this code again. Issue isn't here - const int tid_tmp = 2 * tasks_packed; - /*shifts for ci*/ - pack_vars->shiftx[tid_tmp] = 0.0; - pack_vars->shifty[tid_tmp] = 0.0; - pack_vars->shiftz[tid_tmp] = 0.0; - /*shifts for cj. Stored using strided indexing (stride of two per task)*/ - pack_vars->shiftx[tid_tmp + 1] = 0.0; - pack_vars->shifty[tid_tmp + 1] = 0.0; - pack_vars->shiftz[tid_tmp + 1] = 0.0; - - double x_tmp = 0.0, y_tmp = 0.0, z_tmp = 0.0; - /*Get the shifts in case of periodics*/ - space_getsid_GPU(e->s, &ci, &cj, &x_tmp, &y_tmp, &z_tmp); - - /*Get pointers to the list of tasks and cells packed*/ - pack_vars->task_list[tasks_packed] = t; - pack_vars->ci_list[tasks_packed] = ci; - pack_vars->cj_list[tasks_packed] = cj; - - const double cjx = cj->loc[0]; - const double cjy = cj->loc[1]; - const double cjz = cj->loc[2]; - - /*Correct the shifts for cell i*/ - pack_vars->shiftx[tid_tmp] = x_tmp + cjx; - pack_vars->shifty[tid_tmp] = y_tmp + cjy; - pack_vars->shiftz[tid_tmp] = z_tmp + cjz; - /*Shift for cell j is it's position. Stored using strided indexing (stride of - * two per task)*/ - pack_vars->shiftx[tid_tmp + 1] = cjx; - pack_vars->shifty[tid_tmp + 1] = cjy; - pack_vars->shiftz[tid_tmp + 1] = cjz; - - const int count_ci = ci->hydro.count; - const int count_cj = cj->hydro.count; - - /*Assign an id for this task*/ - const int tid = tasks_packed; - /*Indexing increment per task is 2 fot these arrays*/ - const int packed_tmp = tasks_packed * 2; - - /* Find first parts in task for ci and cj. Packed_tmp is index for cell i. - * packed_tmp+1 is index for cell j */ - pack_vars->task_first_part[packed_tmp] = pack_vars->count_parts; - pack_vars->task_first_part[packed_tmp + 1] = - pack_vars->count_parts + count_ci; - - int *count_parts = &pack_vars->count_parts; - /* This re-arranges the particle data from cell->hydro->parts into a - long array of part structs*/ - runner_do_ci_cj_gpu_pack_neat_aos_f( - r, ci, cj, parts_aos, 0 /*timer. 0 no timing, 1 for timing*/, count_parts, - tid, pack_vars->count_max_parts, count_ci, count_cj); - /* Find last parts in task for ci and cj. Packed_tmp is index for cell i. 
- * packed_tmp+1 is index for cell j */ - - pack_vars->task_last_part[packed_tmp] = pack_vars->count_parts - count_cj; - pack_vars->task_last_part[packed_tmp + 1] = pack_vars->count_parts; - - /* Tell the cells they have been packed */ - ci->pack_done_f++; - cj->pack_done_f++; - - /* Identify first particle for each bundle of tasks */ - const int bundle_size = pack_vars->bundle_size; - if (tasks_packed % bundle_size == 0) { - int bid = tasks_packed / bundle_size; - pack_vars->bundle_first_part[bid] = pack_vars->task_first_part[packed_tmp]; - pack_vars->bundle_first_task_list[bid] = tasks_packed; - } - - /* Record that we have now done a packing (self) */ - int qid = r->qid; - atomic_dec(&(s->queues[qid].n_packs_pair_left_f)); - t->done = 1; - /* Copies done. Release the lock ! */ - task_unlock(t); - pack_vars->tasks_packed++; - pack_vars->launch = 0; - pack_vars->launch_leftovers = 0; - if ((s->queues[qid].n_packs_pair_left_f == 0)) - pack_vars->launch_leftovers = 1; - if (pack_vars->tasks_packed == pack_vars->target_n_tasks) - pack_vars->launch = 1; - /*Add time to packing_time. Timer for end of GPU work after the if(launch || - * launch_leftovers statement)*/ - clock_gettime(CLOCK_REALTIME, &t1); - *packing_time += - (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; -} - double runner_dopair1_pack_f4_f(struct runner *r, struct scheduler *s, struct pack_vars_pair *restrict pack_vars, struct cell *ci, struct cell *cj, From 8dfaadcfcbf2c3fe0c0dd3dd3461333879653dbc Mon Sep 17 00:00:00 2001 From: Abouzied Date: Wed, 5 Mar 2025 17:24:19 +0000 Subject: [PATCH 191/217] Cleaned up runner_doself1_pack_f4() and added some comments for clarity --- src/runner_doiact_functions_hydro_gpu.h | 38 +++++++------------------ 1 file changed, 11 insertions(+), 27 deletions(-) diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index 5382ae88ed..e1e5ebb6ca 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -118,24 +118,14 @@ double runner_doself1_pack_f4(struct runner *r, struct scheduler *s, * tp0 and tp1 only time packing and unpacking*/ struct timespec t0, t1; // clock_gettime(CLOCK_REALTIME, &t0); - /* Record that we have now done a packing (self) */ + /* Find my queue for use later*/ int qid = r->qid; - // atomic_dec(&(s->queues[qid].n_packs_pair_left_f)); - // pthread_mutex_lock(&s->sleep_mutex); - // atomic_dec(&(s->s_d_left[qid])); - // pthread_cond_broadcast(&s->sleep_cond); - // pthread_mutex_unlock(&s->sleep_mutex); - + /*Place pointers to the task and cells packed in an array for use later + * when unpacking after the GPU offload*/ int tasks_packed = pack_vars->tasks_packed; - // pack_vars->cellx[tasks_packed] = ci->loc[0]; - // pack_vars->celly[tasks_packed] = ci->loc[1]; - // pack_vars->cellz[tasks_packed] = ci->loc[2]; - /*Get pointers to the list of tasks and cells packed*/ pack_vars->task_list[tasks_packed] = t; pack_vars->cell_list[tasks_packed] = ci; - // /* Identify row in particle arrays where this task starts*/ - // pack_vars->task_first_part[tasks_packed] = pack_vars->count_parts; - // d_task_first_part_self_dens_f4[tasks_packed].x = pack_vars->count_parts; + /* Identify row in particle arrays where this task starts*/ task_first_part_f4[tasks_packed].x = pack_vars->count_parts; int *count_parts_self = &pack_vars->count_parts; /* This re-arranges the particle data from cell->hydro->parts into a @@ -143,10 +133,8 @@ double runner_doself1_pack_f4(struct runner *r, struct scheduler 
*s, runner_doself1_gpu_pack_neat_aos_f4( r, ci, parts_send, 0 /*timer. 0 no timing, 1 for timing*/, count_parts_self, tasks_packed, pack_vars->count_max_parts); - // // identify the row in the array where this task ends (row id of its - // last particle) - // pack_vars->task_last_part[tasks_packed] = pack_vars->count_parts; - // d_task_first_part_self_dens_f4[tasks_packed].y = pack_vars->count_parts; + /* Identify the row in the array where this task ends (row id of its + last particle)*/ task_first_part_f4[tasks_packed].y = pack_vars->count_parts; /* Identify first particle for each bundle of tasks */ const int bundle_size = pack_vars->bundle_size; @@ -163,25 +151,21 @@ double runner_doself1_pack_f4(struct runner *r, struct scheduler *s, pack_vars->launch = 0; pack_vars->launch_leftovers = 0; + /*Get a lock to the queue so we can safely decrement counter and check for launch leftover condition*/ lock_lock(&s->queues[qid].lock); - s->queues[qid].n_packs_self_left_d--; - if (s->queues[qid].n_packs_self_left_d < 1) pack_vars->launch_leftovers = 1; - lock_unlock(&s->queues[qid].lock); - - // if ((s->s_d_left[qid] < 1)) pack_vars->launch_leftovers = 1; + /*Have we packed enough tasks to offload to GPU?*/ if (pack_vars->tasks_packed == pack_vars->target_n_tasks) pack_vars->launch = 1; - /*Add time to packing_time. Timer for end of GPU work after the if(launch || - * launch_leftovers statement)*/ + + /*Record the end of packing time*/ clock_gettime(CLOCK_REALTIME, &t1); /* Release the lock on the cell */ cell_unlocktree(ci); t->gpu_done = 1; - // cell_unlocktree(ci); - // // MATTHIEU signal_sleeping_runners(s, t); + /*Calculate time spen packing and return to runner_main*/ return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; } From d5d5ccd3a9393931e7536a6dc928a79efe85a746 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Wed, 5 Mar 2025 17:36:11 +0000 Subject: [PATCH 192/217] Cleaned up and commented up rest of self_pack functions. Should really be one function with if statement or some sort of variable over-loading to allow one function to work on 3 different data types. Something for another day! 
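A minimal sketch of what that single entry point could look like, assuming the three
packed send buffers keep separate struct types (the wrapper name, the enum, and the
gradient/force struct names part_aos_f4_g_send / part_aos_f4_f_send are assumed here;
only the three runner_doself1_gpu_pack_neat_aos_f4* calls and their argument order are
taken from this series):

  /* Sketch only: dispatch on the interaction phase so the surrounding
   * bookkeeping (task_first_part_f4, bundle_first_part, counters) is written
   * once instead of three times. */
  enum self_pack_phase { SELF_PACK_DENSITY, SELF_PACK_GRADIENT, SELF_PACK_FORCE };

  static void runner_doself1_gpu_pack_any(struct runner *r, struct cell *ci,
                                          void *parts_send,
                                          enum self_pack_phase phase,
                                          int *count_parts, int tasks_packed,
                                          int count_max_parts) {
    switch (phase) {
      case SELF_PACK_DENSITY:
        runner_doself1_gpu_pack_neat_aos_f4(
            r, ci, (struct part_aos_f4_send *)parts_send, /*timer=*/0,
            count_parts, tasks_packed, count_max_parts);
        break;
      case SELF_PACK_GRADIENT:
        runner_doself1_gpu_pack_neat_aos_f4_g(
            r, ci, (struct part_aos_f4_g_send *)parts_send, /*timer=*/0,
            count_parts, tasks_packed, count_max_parts);
        break;
      case SELF_PACK_FORCE:
        runner_doself1_gpu_pack_neat_aos_f4_f(
            r, ci, (struct part_aos_f4_f_send *)parts_send, /*timer=*/0,
            count_parts, tasks_packed, count_max_parts);
        break;
    }
  }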
--- src/runner_doiact_functions_hydro_gpu.h | 73 +++++++------------------ 1 file changed, 20 insertions(+), 53 deletions(-) diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index e1e5ebb6ca..a51e070815 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -165,7 +165,7 @@ double runner_doself1_pack_f4(struct runner *r, struct scheduler *s, /* Release the lock on the cell */ cell_unlocktree(ci); t->gpu_done = 1; - /*Calculate time spen packing and return to runner_main*/ + /*Calculate time spent packing and return to runner_main*/ return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; } @@ -180,22 +180,14 @@ double runner_doself1_pack_f4_g(struct runner *r, struct scheduler *s, * tp0 and tp1 only time packing and unpacking*/ struct timespec t0, t1; // clock_gettime(CLOCK_REALTIME, &t0); - /* Record that we have now done a packing (self) */ + /* Find my queue for use later*/ int qid = r->qid; - // atomic_dec(&(s->queues[qid].n_packs_pair_left_f)); - // pthread_mutex_lock(&s->sleep_mutex); - // atomic_dec(&(s->s_g_left[qid])); - // pthread_cond_broadcast(&s->sleep_cond); - // pthread_mutex_unlock(&s->sleep_mutex); + /*Place pointers to the task and cells packed in an array for use later + * when unpacking after the GPU offload*/ int tasks_packed = pack_vars->tasks_packed; - // pack_vars->cellx[tasks_packed] = ci->loc[0]; - // pack_vars->celly[tasks_packed] = ci->loc[1]; - // pack_vars->cellz[tasks_packed] = ci->loc[2]; - /*Get pointers to the list of tasks and cells packed*/ pack_vars->task_list[tasks_packed] = t; pack_vars->cell_list[tasks_packed] = ci; - // /* Identify row in particle arrays where this task starts*/ - // pack_vars->task_first_part[tasks_packed] = pack_vars->count_parts; + /* Identify row in particle arrays where this task starts*/ task_first_part_f4[tasks_packed].x = pack_vars->count_parts; int *count_parts_self = &pack_vars->count_parts; /* This re-arranges the particle data from cell->hydro->parts into a @@ -203,9 +195,8 @@ double runner_doself1_pack_f4_g(struct runner *r, struct scheduler *s, runner_doself1_gpu_pack_neat_aos_f4_g( r, ci, parts_send, 0 /*timer. 
0 no timing, 1 for timing*/, count_parts_self, tasks_packed, pack_vars->count_max_parts); - // // identify the row in the array where this task ends (row id of its - // last particle) - // pack_vars->task_last_part[tasks_packed] = pack_vars->count_parts; + /* identify the row in the array where this task ends (row id of its + last particle)*/ task_first_part_f4[tasks_packed].y = pack_vars->count_parts; /* Identify first particle for each bundle of tasks */ const int bundle_size = pack_vars->bundle_size; @@ -217,23 +208,16 @@ double runner_doself1_pack_f4_g(struct runner *r, struct scheduler *s, /* Tell the cell it has been packed */ ci->pack_done_g++; /* Record that we have now done a packing (self) */ - // int qid = r->qid; - // atomic_dec(&(s->queues[qid].n_packs_self_left_g)); t->done = 1; pack_vars->tasks_packed++; pack_vars->launch = 0; pack_vars->launch_leftovers = 0; - + /*Get a lock to the queue so we can safely decrement counter and check for launch leftover condition*/ lock_lock(&s->queues[qid].lock); - s->queues[qid].n_packs_self_left_g--; - if (s->queues[qid].n_packs_self_left_g < 1) pack_vars->launch_leftovers = 1; - lock_unlock(&s->queues[qid].lock); - // if ((s->queues[qid].n_packs_self_left_g < 1)) - // pack_vars->launch_leftovers = 1; if (pack_vars->tasks_packed == pack_vars->target_n_tasks) pack_vars->launch = 1; /*Add time to packing_time. Timer for end of GPU work after the if(launch || @@ -241,7 +225,7 @@ double runner_doself1_pack_f4_g(struct runner *r, struct scheduler *s, clock_gettime(CLOCK_REALTIME, &t1); /* Release the lock on the cell */ cell_unlocktree(ci); - // // MATTHIEU signal_sleeping_runners(s, t); + /*Calculate time spent packing and return to runner_main*/ return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; } @@ -256,22 +240,14 @@ double runner_doself1_pack_f4_f(struct runner *r, struct scheduler *s, * tp0 and tp1 only time packing and unpacking*/ struct timespec t0, t1; // clock_gettime(CLOCK_REALTIME, &t0); - /* Record that we have now done a packing (self) */ + /* Find my queue for use later*/ int qid = r->qid; - // atomic_dec(&(s->queues[qid].n_packs_pair_left_f)); - // pthread_mutex_lock(&s->sleep_mutex); - // atomic_dec(&(s->s_f_left[qid])); - // pthread_cond_broadcast(&s->sleep_cond); - // pthread_mutex_unlock(&s->sleep_mutex); + /*Place pointers to the task and cells packed in an array for use later + * when unpacking after the GPU offload*/ int tasks_packed = pack_vars->tasks_packed; - // pack_vars->cellx[tasks_packed] = ci->loc[0]; - // pack_vars->celly[tasks_packed] = ci->loc[1]; - // pack_vars->cellz[tasks_packed] = ci->loc[2]; - /*Get pointers to the list of tasks and cells packed*/ pack_vars->task_list[tasks_packed] = t; pack_vars->cell_list[tasks_packed] = ci; - // /* Identify row in particle arrays where this task starts*/ - // pack_vars->task_first_part[tasks_packed] = pack_vars->count_parts; + /* Identify row in particle arrays where this task starts*/ task_first_part_f4[tasks_packed].x = pack_vars->count_parts; int *count_parts_self = &pack_vars->count_parts; /* This re-arranges the particle data from cell->hydro->parts into a @@ -279,9 +255,8 @@ double runner_doself1_pack_f4_f(struct runner *r, struct scheduler *s, runner_doself1_gpu_pack_neat_aos_f4_f( r, ci, parts_send, 0 /*timer. 
0 no timing, 1 for timing*/, count_parts_self, tasks_packed, pack_vars->count_max_parts); - // // identify the row in the array where this task ends (row id of its - // last particle) - // pack_vars->task_last_part[tasks_packed] = pack_vars->count_parts; + /* Identify the row in the array where this task ends (row id of its + last particle) */ task_first_part_f4[tasks_packed].y = pack_vars->count_parts; /* Identify first particle for each bundle of tasks */ const int bundle_size = pack_vars->bundle_size; @@ -293,32 +268,24 @@ double runner_doself1_pack_f4_f(struct runner *r, struct scheduler *s, /* Tell the cell it has been packed */ ci->pack_done_f++; /* Record that we have now done a packing (self) */ - // int qid = r->qid; - // atomic_dec(&(s->queues[qid].n_packs_self_left_f)); t->done = 1; pack_vars->tasks_packed++; pack_vars->launch = 0; pack_vars->launch_leftovers = 0; - + /*Get a lock to the queue so we can safely decrement counter and check for launch leftover condition*/ lock_lock(&s->queues[qid].lock); - s->queues[qid].n_packs_self_left_f--; - if (s->queues[qid].n_packs_self_left_f < 1) pack_vars->launch_leftovers = 1; - lock_unlock(&s->queues[qid].lock); - - // if ((s->queues[qid].n_packs_self_left_f < 1)) - // pack_vars->launch_leftovers = 1; + /*Have we packed enough tasks to offload to GPU?*/ if (pack_vars->tasks_packed == pack_vars->target_n_tasks) pack_vars->launch = 1; - /*Add time to packing_time. Timer for end of GPU work after the if(launch || - * launch_leftovers statement)*/ + + /*Record the end of packing time*/ clock_gettime(CLOCK_REALTIME, &t1); /* Release the lock on the cell */ - // task_unlock(t); cell_unlocktree(ci); - // // MATTHIEU signal_sleeping_runners(s, t); + /*Calculate time spent packing and return to runner_main*/ return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; } From ae354de16a812788e8e2466453376872fddea51a Mon Sep 17 00:00:00 2001 From: Abouzied Date: Wed, 5 Mar 2025 17:45:46 +0000 Subject: [PATCH 193/217] Cleaned up some redundant code --- src/runner_doiact_functions_hydro_gpu.h | 33 +------------- src/runner_main_clean.cu | 58 ++++++++++++------------- 2 files changed, 29 insertions(+), 62 deletions(-) diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index a51e070815..a2bacf0c3d 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -343,7 +343,6 @@ double runner_dopair1_pack_f4(struct runner *r, struct scheduler *s, struct part_aos_f4_send *parts_send, struct engine *e, int4 *fparti_fpartj_lparti_lpartj) { - /* Timers for how long this all takes. * t0 and t1 are from start to finish including GPU calcs * tp0 and tp1 only time packing and unpacking*/ @@ -351,10 +350,6 @@ double runner_dopair1_pack_f4(struct runner *r, struct scheduler *s, clock_gettime(CLOCK_REALTIME, &t0); int tasks_packed = pack_vars->tasks_packed; int qid = r->qid; - // pthread_mutex_lock(&s->sleep_mutex); - // atomic_dec(&(s->p_d_left[qid])); - // pthread_cond_broadcast(&s->sleep_cond); - // pthread_mutex_unlock(&s->sleep_mutex); double x_tmp = 0.0, y_tmp = 0.0, z_tmp = 0.0; struct cell *citmp, *cjtmp; @@ -382,39 +377,21 @@ double runner_dopair1_pack_f4(struct runner *r, struct scheduler *s, /* Find first parts in task for ci and cj. Packed_tmp is index for cell i. 
* packed_tmp+1 is index for cell j */ - // pack_vars->task_first_part[packed_tmp] = pack_vars->count_parts; - // pack_vars->task_first_part[packed_tmp + 1] = pack_vars->count_parts + - // count_ci; - fparti_fpartj_lparti_lpartj[tasks_packed].x = pack_vars->count_parts; fparti_fpartj_lparti_lpartj[tasks_packed].y = pack_vars->count_parts + count_ci; int *count_parts = &pack_vars->count_parts; - // if(r->cpuid == 0)fprintf(stderr, "cpu %i before count %i\n", r->cpuid, - // pack_vars->count_parts); /* This re-arranges the particle data from cell->hydro->parts into a long array of part structs*/ runner_do_ci_cj_gpu_pack_neat_aos_f4( r, ci, cj, parts_send, 0 /*timer. 0 no timing, 1 for timing*/, count_parts, tid, pack_vars->count_max_parts, count_ci, count_cj, shift_tmp); - // runner_doself1_gpu_pack_neat_aos(r, ci, parts_aos, 0/*timer. 0 no - // timing, 1 for timing*/, count_parts, tasks_packed, - // pack_vars->count_max_parts); //This may cause an issue. Be sure to test - // that - // pack_vars->count_parts is actually increment here - /* Find last parts in task for ci and cj. Packed_tmp is index for cell i. - * packed_tmp+1 is index for cell j */ - - // if(r->cpuid == 0)fprintf(stderr, "cpu %i after count %i pack_vars_count - // %i\n", r->cpuid, *count_parts, pack_vars->count_parts); + /* Find last parts in task for ci and cj*/ fparti_fpartj_lparti_lpartj[tasks_packed].z = pack_vars->count_parts - count_cj; fparti_fpartj_lparti_lpartj[tasks_packed].w = pack_vars->count_parts; - // pack_vars->task_last_part[packed_tmp] = pack_vars->count_parts - - // count_cj; pack_vars->task_last_part[packed_tmp + 1] = - // pack_vars->count_parts; /* Tell the cells they have been packed */ ci->pack_done++; @@ -434,20 +411,12 @@ double runner_dopair1_pack_f4(struct runner *r, struct scheduler *s, pack_vars->tasks_packed++; pack_vars->launch = 0; pack_vars->launch_leftovers = 0; - /* Record that we have now done a packing (self) */ - // int qid = r->qid; - // atomic_dec(&(s->queues[qid].n_packs_pair_left)); - // if ((s->p_d_left < 1)) pack_vars->launch_leftovers = 1; //A. Nasar: Need to come back to this at some point! 
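  /* The lock/decrement/flag sequence just below recurs in every pack
   * function in this file; a possible refactor, sketched here only (the
   * helper name and signature are hypothetical, the lock calls and fields
   * are the ones used in this patch):
   *
   *   static void pack_update_launch_flags(struct scheduler *s, int qid,
   *                                        int *packs_left,
   *                                        struct pack_vars_pair *pv) {
   *     lock_lock(&s->queues[qid].lock);
   *     (*packs_left)--;  // e.g. s->queues[qid].n_packs_pair_left_d
   *     if (*packs_left < 1) pv->launch_leftovers = 1;  // queue drained: flush a partial pack
   *     lock_unlock(&s->queues[qid].lock);
   *     if (pv->tasks_packed == pv->target_n_tasks) pv->launch = 1;  // pack is full
   *   }
   */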
lock_lock(&s->queues[qid].lock); - s->queues[qid].n_packs_pair_left_d--; - if (s->queues[qid].n_packs_pair_left_d < 1) pack_vars->launch_leftovers = 1; - lock_unlock(&s->queues[qid].lock); - if (pack_vars->tasks_packed == pack_vars->target_n_tasks){ pack_vars->launch = 1; } diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index c9f697c661..2464f4b289 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -174,12 +174,12 @@ void *runner_main2(void *data) { struct scheduler *sched = &e->sched; struct space *space = e->s; - /*pack_vars contain data required for packing tasks destined for the GPU*/ + //////////Declare and allocate GPU launch control data structures///////// + /*pack_vars contain data required for self and pair packing tasks destined + * for the GPU*/ struct pack_vars_self *pack_vars_self_dens; struct pack_vars_self *pack_vars_self_forc; struct pack_vars_self *pack_vars_self_grad; - - /*pack_vars contain data required for packing tasks destined for the GPU*/ struct pack_vars_pair *pack_vars_pair_dens; struct pack_vars_pair *pack_vars_pair_forc; struct pack_vars_pair *pack_vars_pair_grad; @@ -197,53 +197,51 @@ void *runner_main2(void *data) { sizeof(struct pack_vars_pair *)); cudaMallocHost((void **)&pack_vars_pair_grad, sizeof(struct pack_vars_pair *)); - - int devId = 0; // find and print gpu device name + /////////////////////////////////////////////////////////////////////////// + /*Find and print GPU name(s)*/ + int devId = 0; //gpu device name struct cudaDeviceProp prop; int nDevices; int maxBlocksSM; int nSMs; + /*Get my rank*/ + int mpi_rank = 0; +#ifdef WITH_MPI + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); +#endif cudaGetDeviceCount(&nDevices); + //If running on MPI we set code to use one MPI rank per GPU + //This was found to work very well and simplifies writing slurm scipts + if (nDevices == 1) cudaSetDevice(devId); +#ifdef WITH_MPI + else { + cudaSetDevice(mpi_rank); + devId = mpi_rank; + } +#endif + //Now tell me some info about my device cudaGetDeviceProperties(&prop, devId); cudaDeviceGetAttribute(&maxBlocksSM, cudaDevAttrMaxBlocksPerMultiprocessor, devId); cudaDeviceGetAttribute(&nSMs, cudaDevAttrMultiProcessorCount, devId); int nPartsPerCell = space->nr_parts / space->tot_cells; - int mpi_rank = 0; -#ifdef WITH_MPI - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); -#endif + if (r->cpuid == 0 && mpi_rank == 0) { - fprintf(stderr, "%i devices available device id is %i\n", nDevices, devId); - fprintf(stderr, "Device : %s\n", prop.name); - fprintf(stderr, "nSMs %i max blocks per SM %i maxnBlocks per stream %i\n", + message("%i devices available device id is %i\n", nDevices, devId); + message("Device : %s\n", prop.name); + message("nSMs %i max blocks per SM %i maxnBlocks per stream %i\n", nSMs, maxBlocksSM, nSMs * maxBlocksSM); - fprintf(stderr, "Target nBlocks per kernel is %i\n", + message("Target nBlocks per kernel is %i\n", N_TASKS_BUNDLE_SELF * nPartsPerCell / BLOCK_SIZE); - fprintf(stderr, "Target nBlocks per stream is %i\n", + message("Target nBlocks per stream is %i\n", N_TASKS_PER_PACK_SELF * nPartsPerCell / BLOCK_SIZE); } - if (nDevices == 1) cudaSetDevice(devId); -#ifndef WITH_MPI - else { - cudaSetDevice(devId); - } -#endif -#ifdef WITH_MPI - else { - cudaSetDevice(mpi_rank); - fprintf(stderr, "%i devices available device id is %i\n", nDevices, - mpi_rank); - } -#endif - fprintf(stderr, "after dev select engine_rank %i rank %i\n", engine_rank, - mpi_rank); cudaError_t cu_error; size_t free_mem, total_mem; 
cudaMemGetInfo(&free_mem, &total_mem); - fprintf(stderr, "free mem %lu, total mem %lu\n", free_mem, total_mem); + message("free mem %lu, total mem %lu", free_mem, total_mem); // how many tasks do we want for each launch of GPU kernel const int target_n_tasks = sched->pack_size; const int target_n_tasks_pair = sched->pack_size_pair; From ea690411a017698b67d121aab91cfd1a8e83d1bf Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Thu, 6 Mar 2025 13:37:18 +0000 Subject: [PATCH 194/217] Changed input params in gresho.yml. Removed un-necessary arrays from runner_main and pack and launch functions --- .../HydroTests/GreshoVortex_3D/gresho.yml | 2 +- src/runner_doiact_functions_hydro_gpu.h | 2 +- src/runner_main_clean.cu | 34 ++----------------- 3 files changed, 5 insertions(+), 33 deletions(-) diff --git a/examples/HydroTests/GreshoVortex_3D/gresho.yml b/examples/HydroTests/GreshoVortex_3D/gresho.yml index ccdb08c3e0..15709ccaf3 100644 --- a/examples/HydroTests/GreshoVortex_3D/gresho.yml +++ b/examples/HydroTests/GreshoVortex_3D/gresho.yml @@ -40,4 +40,4 @@ SPH: InitialConditions: file_name: greshoVortex.hdf5 periodic: 1 - # replicate: 2 + replicate: 2 diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index a2bacf0c3d..a9f540540b 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -665,7 +665,7 @@ void runner_doself1_launch_f4( struct part_aos_f4_recv *parts_recv, struct part_aos_f4_send *d_parts_send, struct part_aos_f4_recv *d_parts_recv, cudaStream_t *stream, float d_a, float d_H, struct engine *e, double *packing_time, double *gpu_time, - double *unpack_time, int2 *d_task_first_part_self_dens_f4, int devId, + double *unpack_time, int devId, int2 *task_first_part_f4, int2 *d_task_first_part_f4, cudaEvent_t *self_end) { diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 2464f4b289..09173a760e 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -177,6 +177,7 @@ void *runner_main2(void *data) { //////////Declare and allocate GPU launch control data structures///////// /*pack_vars contain data required for self and pair packing tasks destined * for the GPU*/ + //A. N: Needed struct pack_vars_self *pack_vars_self_dens; struct pack_vars_self *pack_vars_self_forc; struct pack_vars_self *pack_vars_self_grad; @@ -255,7 +256,6 @@ void *runner_main2(void *data) { // different streams) const int bundle_size = N_TASKS_BUNDLE_SELF; const int bundle_size_pair = N_TASKS_BUNDLE_PAIR; - pack_vars_self_dens->bundle_size = bundle_size; pack_vars_pair_dens->bundle_size = bundle_size_pair; pack_vars_self_forc->bundle_size = bundle_size; @@ -267,16 +267,13 @@ void *runner_main2(void *data) { // work with) // Copy of the above residing on the GPU - int *d_task_first_part_self_dens, *d_task_last_part_self_dens; - int2 *task_first_part_self_dens_f4; + // A. 
N.: Needed int2 *task_first_part_f4; int2 *task_first_part_f4_f; int2 *task_first_part_f4_g; int2 *d_task_first_part_f4; int2 *d_task_first_part_f4_f; int2 *d_task_first_part_f4_g; - int *d_task_first_part_self_forc, *d_task_last_part_self_forc; - int *d_task_first_part_self_grad, *d_task_last_part_self_grad; int *d_task_first_parts_pair_dens, *d_task_last_parts_pair_dens; int4 *fparti_fpartj_lparti_lpartj_dens; @@ -286,8 +283,6 @@ void *runner_main2(void *data) { int *d_task_first_parts_pair_forc, *d_task_last_parts_pair_forc; int *d_task_first_parts_pair_grad, *d_task_last_parts_pair_grad; - cudaMallocManaged((void **)&task_first_part_self_dens_f4, - target_n_tasks * sizeof(int2), cudaMemAttachGlobal); cudaMallocHost((void **)&task_first_part_f4, target_n_tasks * sizeof(int2)); cudaMalloc((void **)&d_task_first_part_f4, target_n_tasks * sizeof(int2)); cudaMallocHost((void **)&task_first_part_f4_f, target_n_tasks * sizeof(int2)); @@ -405,29 +400,6 @@ void *runner_main2(void *data) { cudaMallocHost((void **)&pack_vars_pair_grad->bundle_first_task_list, 2 * nBundles * sizeof(int)); - // These I need to keep///////////////// - cudaMalloc((void **)&d_task_first_part_self_dens, - target_n_tasks * sizeof(int)); - cudaMalloc((void **)&d_task_first_part_self_forc, - target_n_tasks * sizeof(int)); - cudaMalloc((void **)&d_task_first_part_self_grad, - target_n_tasks * sizeof(int)); - - cudaMalloc((void **)&d_task_last_part_self_dens, - target_n_tasks * sizeof(int)); - cudaMalloc((void **)&d_task_last_part_self_forc, - target_n_tasks * sizeof(int)); - cudaMalloc((void **)&d_task_last_part_self_grad, - target_n_tasks * sizeof(int)); - // These I need to keep///////////////// - pack_vars_self_dens->d_task_first_part = d_task_first_part_self_dens; - pack_vars_self_dens->d_task_last_part = d_task_last_part_self_dens; - // These I need to keep///////////////// - pack_vars_self_forc->d_task_first_part = d_task_first_part_self_forc; - pack_vars_self_forc->d_task_last_part = d_task_last_part_self_forc; - // These I need to keep///////////////// - pack_vars_self_grad->d_task_first_part = d_task_first_part_self_grad; - pack_vars_self_grad->d_task_last_part = d_task_last_part_self_grad; // These I need to keep///////////////// cudaMalloc((void **)&d_task_first_parts_pair_dens, @@ -1035,7 +1007,7 @@ void *runner_main2(void *data) { r, sched, pack_vars_self_dens, ci, t, parts_aos_f4_send, parts_aos_f4_recv, d_parts_aos_f4_send, d_parts_aos_f4_recv, stream, d_a, d_H, e, &packing_time, &time_for_density_gpu, - &unpack_time_self, task_first_part_self_dens_f4, devId, + &unpack_time_self, devId, task_first_part_f4, d_task_first_part_f4, self_end); } /*End of GPU work Self*/ #endif From 7fe65f3dfd7455203abc79b1c320eb540b92efc1 Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Thu, 6 Mar 2025 13:56:15 +0000 Subject: [PATCH 195/217] Removed a few other now redundant arrays --- src/runner_main_clean.cu | 47 ++++++---------------------------------- 1 file changed, 7 insertions(+), 40 deletions(-) diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 09173a760e..4db4386de2 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -265,24 +265,14 @@ void *runner_main2(void *data) { // Keep track of first and last particles for each task (particle data is // arranged in long arrays containing particles from all the tasks we will // work with) - - // Copy of the above residing on the GPU - // A. N.: Needed + /* A. 
N.: Needed for offloading self tasks as we use these to sort through + * which parts need to interact with which */ int2 *task_first_part_f4; int2 *task_first_part_f4_f; int2 *task_first_part_f4_g; int2 *d_task_first_part_f4; int2 *d_task_first_part_f4_f; int2 *d_task_first_part_f4_g; - int *d_task_first_parts_pair_dens, *d_task_last_parts_pair_dens; - - int4 *fparti_fpartj_lparti_lpartj_dens; - int4 *fparti_fpartj_lparti_lpartj_forc, *d_fparti_fpartj_lparti_lpartj_forc; - int4 *fparti_fpartj_lparti_lpartj_grad, *d_fparti_fpartj_lparti_lpartj_grad; - - int *d_task_first_parts_pair_forc, *d_task_last_parts_pair_forc; - int *d_task_first_parts_pair_grad, *d_task_last_parts_pair_grad; - cudaMallocHost((void **)&task_first_part_f4, target_n_tasks * sizeof(int2)); cudaMalloc((void **)&d_task_first_part_f4, target_n_tasks * sizeof(int2)); cudaMallocHost((void **)&task_first_part_f4_f, target_n_tasks * sizeof(int2)); @@ -290,18 +280,17 @@ void *runner_main2(void *data) { cudaMallocHost((void **)&task_first_part_f4_g, target_n_tasks * sizeof(int2)); cudaMalloc((void **)&d_task_first_part_f4_g, target_n_tasks * sizeof(int2)); + /*A. N.: Needed but only for small part in launch functions. Might + be useful for recursion on the GPU so keep for now */ + int4 *fparti_fpartj_lparti_lpartj_dens; + int4 *fparti_fpartj_lparti_lpartj_forc; + int4 *fparti_fpartj_lparti_lpartj_grad; cudaMallocHost((void **)&fparti_fpartj_lparti_lpartj_dens, target_n_tasks * sizeof(int4)); - cudaMallocHost((void **)&fparti_fpartj_lparti_lpartj_forc, target_n_tasks * sizeof(int4)); - cudaMalloc((void **)&d_fparti_fpartj_lparti_lpartj_forc, - target_n_tasks * sizeof(int4)); - cudaMallocHost((void **)&fparti_fpartj_lparti_lpartj_grad, target_n_tasks * sizeof(int4)); - cudaMalloc((void **)&d_fparti_fpartj_lparti_lpartj_grad, - target_n_tasks * sizeof(int4)); // Arrays keeping track of the row numbers of the first and last particles // within each bundle. 
Required by the GPU code @@ -400,28 +389,6 @@ void *runner_main2(void *data) { cudaMallocHost((void **)&pack_vars_pair_grad->bundle_first_task_list, 2 * nBundles * sizeof(int)); - - // These I need to keep///////////////// - cudaMalloc((void **)&d_task_first_parts_pair_dens, - 2 * target_n_tasks * sizeof(int)); - cudaMalloc((void **)&d_task_first_parts_pair_forc, - 2 * target_n_tasks * sizeof(int)); - cudaMalloc((void **)&d_task_first_parts_pair_grad, - 2 * target_n_tasks * sizeof(int)); - - cudaMalloc((void **)&d_task_last_parts_pair_dens, - 2 * target_n_tasks * sizeof(int)); - cudaMalloc((void **)&d_task_last_parts_pair_forc, - 2 * target_n_tasks * sizeof(int)); - cudaMalloc((void **)&d_task_last_parts_pair_grad, - 2 * target_n_tasks * sizeof(int)); - // These I need to keep///////////////// - pack_vars_pair_dens->d_task_first_part = d_task_first_parts_pair_dens; - pack_vars_pair_dens->d_task_last_part = d_task_last_parts_pair_dens; - pack_vars_pair_forc->d_task_first_part = d_task_first_parts_pair_forc; - pack_vars_pair_forc->d_task_last_part = d_task_last_parts_pair_forc; - pack_vars_pair_grad->d_task_first_part = d_task_first_parts_pair_grad; - pack_vars_pair_grad->d_task_last_part = d_task_last_parts_pair_grad; // cell positions for self tasks REMEMBER to remove CPU copies as these are no // longer necessary double *d_dens_cell_x, *d_dens_cell_y, *d_dens_cell_z; From f4f1e3798bd6e7546c370a723ad40aaead826bc9 Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Thu, 6 Mar 2025 14:03:43 +0000 Subject: [PATCH 196/217] Removed more redundant memory allocations --- src/runner_main_clean.cu | 32 -------------------------------- 1 file changed, 32 deletions(-) diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 4db4386de2..f331594b2f 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -292,38 +292,6 @@ void *runner_main2(void *data) { cudaMallocHost((void **)&fparti_fpartj_lparti_lpartj_grad, target_n_tasks * sizeof(int4)); - // Arrays keeping track of the row numbers of the first and last particles - // within each bundle. 
Required by the GPU code - - cudaMallocHost((void **)&pack_vars_self_dens->task_first_part, - target_n_tasks * sizeof(int)); - cudaMallocHost((void **)&pack_vars_self_dens->task_last_part, - target_n_tasks * sizeof(int)); - - cudaMallocHost((void **)&pack_vars_pair_dens->task_first_part, - 2 * target_n_tasks * sizeof(int)); - cudaMallocHost((void **)&pack_vars_pair_dens->task_last_part, - 2 * target_n_tasks * sizeof(int)); - - cudaMallocHost((void **)&pack_vars_self_forc->task_first_part, - target_n_tasks * sizeof(int)); - cudaMallocHost((void **)&pack_vars_self_forc->task_last_part, - target_n_tasks * sizeof(int)); - - cudaMallocHost((void **)&pack_vars_pair_forc->task_first_part, - 2 * target_n_tasks * sizeof(int)); - cudaMallocHost((void **)&pack_vars_pair_forc->task_last_part, - 2 * target_n_tasks * sizeof(int)); - - cudaMallocHost((void **)&pack_vars_self_grad->task_first_part, - target_n_tasks * sizeof(int)); - cudaMallocHost((void **)&pack_vars_self_grad->task_last_part, - target_n_tasks * sizeof(int)); - - cudaMallocHost((void **)&pack_vars_pair_grad->task_first_part, - 2 * target_n_tasks * sizeof(int)); - cudaMallocHost((void **)&pack_vars_pair_grad->task_last_part, - 2 * target_n_tasks * sizeof(int)); /* nBundles is the number of task bundles each thread has ==> Used to loop through bundles */ From 67ed42dbbc140ee0b4114b495cb3fb37adec7b11 Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Thu, 6 Mar 2025 14:18:01 +0000 Subject: [PATCH 197/217] Removed now un-necessary arrays used to track cell positions and shifts --- src/runner_main_clean.cu | 119 +-------------------------------------- 1 file changed, 1 insertion(+), 118 deletions(-) diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index f331594b2f..c13d41cb0d 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -292,7 +292,6 @@ void *runner_main2(void *data) { cudaMallocHost((void **)&fparti_fpartj_lparti_lpartj_grad, target_n_tasks * sizeof(int4)); - /* nBundles is the number of task bundles each thread has ==> Used to loop through bundles */ int nBundles = (target_n_tasks + bundle_size - 1) / bundle_size; @@ -313,8 +312,7 @@ void *runner_main2(void *data) { pack_vars_pair_grad->nBundles = nBundles_pair; // first part and last part are the first and last particle ids (locally - // within this thread) - + // within this thread). A. 
Nasar: All these are used in GPU offload setup cudaMallocHost((void **)&pack_vars_self_dens->bundle_first_part, nBundles * sizeof(int)); cudaMallocHost((void **)&pack_vars_self_dens->bundle_last_part, @@ -357,121 +355,6 @@ void *runner_main2(void *data) { cudaMallocHost((void **)&pack_vars_pair_grad->bundle_first_task_list, 2 * nBundles * sizeof(int)); - // cell positions for self tasks REMEMBER to remove CPU copies as these are no - // longer necessary - double *d_dens_cell_x, *d_dens_cell_y, *d_dens_cell_z; - float3 *d_dens_f3_cell_x; - double *d_grad_cell_x, *d_grad_cell_y, *d_grad_cell_z; - double *d_forc_cell_x, *d_forc_cell_y, *d_forc_cell_z; - // Shifts for pair tasks REMEMBER to remove CPU copies as these are no longer - // necessary - double *d_dens_shift_x, *d_dens_shift_y, *d_dens_shift_z; - double *d_grad_shift_x, *d_grad_shift_y, *d_grad_shift_z; - double *d_forc_shift_x, *d_forc_shift_y, *d_forc_shift_z; - - // These I need to keep///////////////// - cudaMalloc((void **)&d_dens_cell_x, target_n_tasks * sizeof(double)); - cudaMalloc((void **)&d_dens_cell_y, target_n_tasks * sizeof(double)); - cudaMalloc((void **)&d_dens_cell_z, target_n_tasks * sizeof(double)); - - cudaMalloc((void **)&d_dens_f3_cell_x, target_n_tasks * sizeof(float3)); - - cudaMalloc((void **)&d_forc_cell_x, target_n_tasks * sizeof(double)); - cudaMalloc((void **)&d_forc_cell_y, target_n_tasks * sizeof(double)); - cudaMalloc((void **)&d_forc_cell_z, target_n_tasks * sizeof(double)); - - cudaMalloc((void **)&d_grad_cell_x, target_n_tasks * sizeof(double)); - cudaMalloc((void **)&d_grad_cell_y, target_n_tasks * sizeof(double)); - cudaMalloc((void **)&d_grad_cell_z, target_n_tasks * sizeof(double)); - - cudaMalloc((void **)&d_dens_shift_x, 2 * target_n_tasks * sizeof(double)); - cudaMalloc((void **)&d_dens_shift_y, 2 * target_n_tasks * sizeof(double)); - cudaMalloc((void **)&d_dens_shift_z, 2 * target_n_tasks * sizeof(double)); - - cudaMalloc((void **)&d_forc_shift_x, 2 * target_n_tasks * sizeof(double)); - cudaMalloc((void **)&d_forc_shift_y, 2 * target_n_tasks * sizeof(double)); - cudaMalloc((void **)&d_forc_shift_z, 2 * target_n_tasks * sizeof(double)); - - cudaMalloc((void **)&d_grad_shift_x, 2 * target_n_tasks * sizeof(double)); - cudaMalloc((void **)&d_grad_shift_y, 2 * target_n_tasks * sizeof(double)); - cudaMalloc((void **)&d_grad_shift_z, 2 * target_n_tasks * sizeof(double)); - // These I need to keep///////////////// - - cudaMallocHost((void **)&pack_vars_self_dens->cellx, - target_n_tasks * sizeof(double)); // Pinned allocation on host - cudaMallocHost((void **)&pack_vars_self_dens->celly, - target_n_tasks * sizeof(double)); // Pinned allocation on host - cudaMallocHost((void **)&pack_vars_self_dens->cellz, - target_n_tasks * sizeof(double)); // Pinned allocation on host - - pack_vars_self_dens->d_cellx = d_dens_cell_x; - pack_vars_self_dens->d_celly = d_dens_cell_y; - pack_vars_self_dens->d_cellz = d_dens_cell_z; - - cudaMallocHost( - (void **)&pack_vars_pair_dens->shiftx, - 2 * target_n_tasks * sizeof(double)); // Pinned allocation on host - cudaMallocHost( - (void **)&pack_vars_pair_dens->shifty, - 2 * target_n_tasks * sizeof(double)); // Pinned allocation on host - cudaMallocHost( - (void **)&pack_vars_pair_dens->shiftz, - 2 * target_n_tasks * sizeof(double)); // Pinned allocation on host - - pack_vars_pair_dens->d_shiftx = d_dens_shift_x; - pack_vars_pair_dens->d_shifty = d_dens_shift_y; - pack_vars_pair_dens->d_shiftz = d_dens_shift_z; - - cudaMallocHost((void **)&pack_vars_self_forc->cellx, - 
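
/* A minimal sketch, with illustrative names (h_buf, d_buf, n), of the pinned
 * staging pattern the cudaMallocHost allocations above rely on: page-locked
 * host memory is what allows cudaMemcpyAsync to overlap copies with kernel
 * execution, and it must be released with cudaFreeHost rather than free. */
#include <cuda_runtime.h>

static void staged_copy_sketch(int n, cudaStream_t s) {
  int *h_buf = NULL, *d_buf = NULL;
  cudaMallocHost((void **)&h_buf, n * sizeof(int)); /* pinned host buffer   */
  cudaMalloc((void **)&d_buf, n * sizeof(int));     /* device destination   */
  for (int i = 0; i < n; i++) h_buf[i] = i;         /* pack on the host     */
  cudaMemcpyAsync(d_buf, h_buf, n * sizeof(int), cudaMemcpyHostToDevice, s);
  cudaStreamSynchronize(s);                         /* wait for this stream */
  cudaFree(d_buf);
  cudaFreeHost(h_buf);
}
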
target_n_tasks * sizeof(double)); // Pinned allocation on host - cudaMallocHost((void **)&pack_vars_self_forc->celly, - target_n_tasks * sizeof(double)); // Pinned allocation on host - cudaMallocHost((void **)&pack_vars_self_forc->cellz, - target_n_tasks * sizeof(double)); // Pinned allocation on host - - pack_vars_self_forc->d_cellx = d_forc_cell_x; - pack_vars_self_forc->d_celly = d_forc_cell_y; - pack_vars_self_forc->d_cellz = d_forc_cell_z; - - cudaMallocHost( - (void **)&pack_vars_pair_forc->shiftx, - 2 * target_n_tasks * sizeof(double)); // Pinned allocation on host - cudaMallocHost( - (void **)&pack_vars_pair_forc->shifty, - 2 * target_n_tasks * sizeof(double)); // Pinned allocation on host - cudaMallocHost( - (void **)&pack_vars_pair_forc->shiftz, - 2 * target_n_tasks * sizeof(double)); // Pinned allocation on host - - pack_vars_pair_forc->d_shiftx = d_forc_shift_x; - pack_vars_pair_forc->d_shifty = d_forc_shift_y; - pack_vars_pair_forc->d_shiftz = d_forc_shift_z; - - cudaMallocHost((void **)&pack_vars_self_grad->cellx, - target_n_tasks * sizeof(double)); // Pinned allocation on host - cudaMallocHost((void **)&pack_vars_self_grad->celly, - target_n_tasks * sizeof(double)); // Pinned allocation on host - cudaMallocHost((void **)&pack_vars_self_grad->cellz, - target_n_tasks * sizeof(double)); // Pinned allocation on host - - pack_vars_self_grad->d_cellx = d_grad_cell_x; - pack_vars_self_grad->d_celly = d_grad_cell_y; - pack_vars_self_grad->d_cellz = d_grad_cell_z; - - cudaMallocHost( - (void **)&pack_vars_pair_grad->shiftx, - 2 * target_n_tasks * sizeof(double)); // Pinned allocation on host - cudaMallocHost( - (void **)&pack_vars_pair_grad->shifty, - 2 * target_n_tasks * sizeof(double)); // Pinned allocation on host - cudaMallocHost( - (void **)&pack_vars_pair_grad->shiftz, - 2 * target_n_tasks * sizeof(double)); // Pinned allocation on host - - pack_vars_pair_grad->d_shiftx = d_grad_shift_x; - pack_vars_pair_grad->d_shifty = d_grad_shift_y; - pack_vars_pair_grad->d_shiftz = d_grad_shift_z; - cudaStream_t stream[nBundles]; cudaStream_t stream_pairs[nBundles_pair]; From 5919e41aca3b5fee05697ba298fbbb954992641a Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Thu, 6 Mar 2025 14:38:57 +0000 Subject: [PATCH 198/217] Removed more redundant data structures. runner_main2() is now pretty much clean of un-necessary allocations. There are some counters that need to be remove eventually as they are just for debugging and timing but left in for now --- src/runner_main_clean.cu | 49 ++++++---------------------------------- 1 file changed, 7 insertions(+), 42 deletions(-) diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index c13d41cb0d..d5dd654b2e 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -313,6 +313,7 @@ void *runner_main2(void *data) { // first part and last part are the first and last particle ids (locally // within this thread). A. Nasar: All these are used in GPU offload setup + cudaMallocHost((void **)&pack_vars_self_dens->bundle_first_part, nBundles * sizeof(int)); cudaMallocHost((void **)&pack_vars_self_dens->bundle_last_part, @@ -355,24 +356,24 @@ void *runner_main2(void *data) { cudaMallocHost((void **)&pack_vars_pair_grad->bundle_first_task_list, 2 * nBundles * sizeof(int)); + /*Create streams so that we can off-load different batches of work in + * different streams and get some con-CURRENCY! 
Events used to maximise + * asynchrony further*/ + cudaStream_t stream[nBundles]; cudaStream_t stream_pairs[nBundles_pair]; cudaEvent_t self_end[nBundles]; for (int i = 0; i < nBundles; i++) cudaEventCreate(&self_end[i]); - cudaEvent_t self_end_g[nBundles]; for (int i = 0; i < nBundles; i++) cudaEventCreate(&self_end_g[i]); - cudaEvent_t self_end_f[nBundles]; for (int i = 0; i < nBundles; i++) cudaEventCreate(&self_end_f[i]); cudaEvent_t pair_end[nBundles]; for (int i = 0; i < nBundles; i++) cudaEventCreate(&pair_end[i]); - cudaEvent_t pair_end_g[nBundles]; for (int i = 0; i < nBundles; i++) cudaEventCreate(&pair_end_g[i]); - cudaEvent_t pair_end_f[nBundles]; for (int i = 0; i < nBundles; i++) cudaEventCreate(&pair_end_f[i]); @@ -389,7 +390,6 @@ void *runner_main2(void *data) { for (int i = 0; i < nBundles; ++i) cudaStreamCreateWithFlags(&stream[i], cudaStreamNonBlocking); - for (int i = 0; i < nBundles_pair; ++i) cudaStreamCreateWithFlags(&stream_pairs[i], cudaStreamNonBlocking); @@ -417,16 +417,11 @@ void *runner_main2(void *data) { /*A. Nasar: Increase parts per recursed task-level cell by buffer to ensure we allocate enough memory*/ int buff = ceil(0.5 * np_per_cell); - - int tot_self_tasks = space->nr_parts / np_per_cell; - /*A. Nasar: Multiplication by 2 is also to ensure we do not over-run * the allocated memory on buffers and GPU. This can happen if calculated h * is larger than cell width and splitting makes bigger than target cells*/ int count_max_parts_tmp = 64 * 8 * target_n_tasks * (np_per_cell + buff); - // message("np per cell %i, max_parts %i, n_tasks_GPU %i\n", np_per_cell, - // count_max_parts_tmp, target_n_tasks); pack_vars_self_dens->count_max_parts = count_max_parts_tmp; pack_vars_pair_dens->count_max_parts = count_max_parts_tmp; pack_vars_self_forc->count_max_parts = count_max_parts_tmp; @@ -434,60 +429,44 @@ void *runner_main2(void *data) { pack_vars_self_grad->count_max_parts = count_max_parts_tmp; pack_vars_pair_grad->count_max_parts = count_max_parts_tmp; - struct part_aos *parts_aos_dens; - struct part_aos_f4 *parts_aos_dens_f4; + /*Declare Buffer and GPU particle arrays*/ struct part_aos_f4_send *parts_aos_f4_send; struct part_aos_f4_recv *parts_aos_f4_recv; - struct part_aos_f *parts_aos_forc; - struct part_aos_f4_f *parts_aos_forc_f4; struct part_aos_f4_f_send *parts_aos_forc_f4_send; struct part_aos_f4_f_recv *parts_aos_forc_f4_recv; - struct part_aos_g *parts_aos_grad; - struct part_aos_f4_g *parts_aos_grad_f4; struct part_aos_f4_g_send *parts_aos_grad_f4_send; struct part_aos_f4_g_recv *parts_aos_grad_f4_recv; - struct part_aos *d_parts_aos_dens; - struct part_aos_f4 *d_parts_aos_dens_f4; struct part_aos_f4_send *d_parts_aos_f4_send; struct part_aos_f4_recv *d_parts_aos_f4_recv; - struct part_aos_f *d_parts_aos_forc; - struct part_aos_f4_f *d_parts_aos_forc_f4; struct part_aos_f4_f_send *d_parts_aos_forc_f4_send; struct part_aos_f4_f_recv *d_parts_aos_forc_f4_recv; - struct part_aos_g *d_parts_aos_grad; - struct part_aos_f4_g *d_parts_aos_grad_f4; struct part_aos_f4_g_send *d_parts_aos_grad_f4_send; struct part_aos_f4_g_recv *d_parts_aos_grad_f4_recv; - struct part_aos *parts_aos_pair_dens; struct part_aos_f4_send *parts_aos_pair_f4_send; struct part_aos_f4_recv *parts_aos_pair_f4_recv; - struct part_aos *d_parts_aos_pair_dens; struct part_aos_f4_send *d_parts_aos_pair_f4_send; struct part_aos_f4_recv *d_parts_aos_pair_f4_recv; - struct part_aos_f *parts_aos_pair_forc; struct part_aos_f4_f_send *parts_aos_pair_f4_f_send; struct part_aos_f4_f_recv 
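
/* Sketch of the per-bundle stream/event pattern created above: each bundle of
 * tasks gets its own non-blocking stream so its copies and kernels can overlap
 * with other bundles, and an event recorded at the end of a bundle lets the
 * host (or another stream) wait on just that bundle. Names are illustrative. */
#include <stdlib.h>
#include <cuda_runtime.h>

static void bundle_overlap_sketch(int n_bundles) {
  cudaStream_t *s = (cudaStream_t *)malloc(n_bundles * sizeof(cudaStream_t));
  cudaEvent_t *done = (cudaEvent_t *)malloc(n_bundles * sizeof(cudaEvent_t));
  for (int b = 0; b < n_bundles; b++) {
    cudaStreamCreateWithFlags(&s[b], cudaStreamNonBlocking);
    cudaEventCreate(&done[b]);
  }
  for (int b = 0; b < n_bundles; b++) {
    /* H2D copy, kernel launch and D2H copy for bundle b would be issued here,
     * all on stream s[b], e.g. cudaMemcpyAsync(..., s[b]). */
    cudaEventRecord(done[b], s[b]); /* mark bundle b as complete */
  }
  for (int b = 0; b < n_bundles; b++) cudaEventSynchronize(done[b]);
  for (int b = 0; b < n_bundles; b++) {
    cudaStreamDestroy(s[b]);
    cudaEventDestroy(done[b]);
  }
  free(s);
  free(done);
}
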
*parts_aos_pair_f4_f_recv; - struct part_aos_f *d_parts_aos_pair_forc; struct part_aos_f4_f_send *d_parts_aos_pair_f4_f_send; struct part_aos_f4_f_recv *d_parts_aos_pair_f4_f_recv; - struct part_aos_g *parts_aos_pair_grad; struct part_aos_f4_g_send *parts_aos_pair_f4_g_send; struct part_aos_f4_g_recv *parts_aos_pair_f4_g_recv; - struct part_aos_g *d_parts_aos_pair_grad; struct part_aos_f4_g_send *d_parts_aos_pair_f4_g_send; struct part_aos_f4_g_recv *d_parts_aos_pair_f4_g_recv; + /*Now allocate memory for Buffer and GPU particle arrays*/ cudaMalloc((void **)&d_parts_aos_f4_send, count_max_parts_tmp * sizeof(struct part_aos_f4_send)); cudaMalloc((void **)&d_parts_aos_f4_recv, @@ -533,15 +512,6 @@ void *runner_main2(void *data) { cudaMalloc((void **)&d_parts_aos_pair_f4_g_recv, 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_g_recv)); - ///////////Probably not needed - /// anymore//////////////////////////////////////////////////////////////// - cudaMalloc((void **)&d_parts_aos_pair_forc, - 2 * count_max_parts_tmp * sizeof(struct part_aos_f)); - cudaMalloc((void **)&d_parts_aos_pair_grad, - 2 * count_max_parts_tmp * sizeof(struct part_aos_g)); - ///////////Probably not needed - /// anymore//////////////////////////////////////////////////////////////// - cudaMallocHost((void **)&parts_aos_pair_f4_send, 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_send)); cudaMallocHost((void **)&parts_aos_pair_f4_recv, @@ -557,11 +527,6 @@ void *runner_main2(void *data) { cudaMallocHost((void **)&parts_aos_pair_f4_f_recv, 2 * count_max_parts_tmp * sizeof(struct part_aos_f4_f_recv)); - cudaMallocHost((void **)&parts_aos_pair_forc, - 2 * count_max_parts_tmp * sizeof(struct part_aos_f)); - cudaMallocHost((void **)&parts_aos_pair_grad, - 2 * count_max_parts_tmp * sizeof(struct part_aos_g)); - /*Declare some global variables*/ float d_a = e->cosmology->a; float d_H = e->cosmology->H; From 76e886a5aa5f0803a216ad4ac06b22a8c44ab0d9 Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Thu, 6 Mar 2025 15:16:51 +0000 Subject: [PATCH 199/217] Removed now redundant code in runner_gpu_pack_functions.c. Need to now move onto sorting out recursion through sub_pair tasks. --- src/runner_doiact_functions_hydro_gpu.h | 8 +- src/runner_gpu_pack_functions.c | 1364 +---------------------- src/runner_main_clean.cu | 5 +- 3 files changed, 12 insertions(+), 1365 deletions(-) diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index a9f540540b..afac4d3b11 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -295,7 +295,7 @@ void runner_recurse_gpu(struct runner *r, struct scheduler *s, struct part_aos_f4_send *parts_send, struct engine *e, int4 *fparti_fpartj_lparti_lpartj, int *n_leafs_found, - struct cell ** cells_left, struct cell ** cells_right, int depth) { + struct cell ** cells_left, struct cell ** cells_right, int depth, int n_expected_tasks) { /* Should we even bother? A. 
Nasar: For GPU code we need to be clever about this */ if (!CELL_IS_ACTIVE(ci, e) && !CELL_IS_ACTIVE(cj, e)) return; @@ -317,21 +317,19 @@ void runner_recurse_gpu(struct runner *r, struct scheduler *s, /*We probably want to record */ if (ci->progeny[pid] != NULL && cj->progeny[pjd] != NULL){ runner_recurse_gpu(r, s, pack_vars, ci->progeny[pid], cj->progeny[pjd], t, parts_send, e, fparti_fpartj_lparti_lpartj, - n_leafs_found, cells_left, cells_right, depth + 1); + n_leafs_found, cells_left, cells_right, depth + 1, n_expected_tasks); // message("recursing to depth %i", depth + 1); } } } else if (CELL_IS_ACTIVE(ci, e) || CELL_IS_ACTIVE(cj, e)) { -// else { //A .Nasar: WE DEFO HAVE A LEAF /* if any cell empty: skip */ if(ci->hydro.count == 0 || cj->hydro.count == 0) return; /*for all leafs to be sent add to cell list */ cells_left[*n_leafs_found] = ci; cells_right[*n_leafs_found] = cj; -// message("incrementing"); *n_leafs_found = *n_leafs_found + 1; - if(*n_leafs_found >= 1024) + if(*n_leafs_found >= n_expected_tasks) error("Created %i more than expected leaf cells. depth %i", *n_leafs_found, depth); } diff --git a/src/runner_gpu_pack_functions.c b/src/runner_gpu_pack_functions.c index c69e4202a3..5e9acac977 100644 --- a/src/runner_gpu_pack_functions.c +++ b/src/runner_gpu_pack_functions.c @@ -14,66 +14,6 @@ #include "timers.h" #include "runner_doiact_hydro.h" -// #ifdef WITHCUDA -// extern "C" { -// #endif - -void runner_doself1_gpu_pack_neat(struct runner *r, struct cell *c, - struct part_soa parts_soa_buffer, int timer, - int *pack_length, int tid, - int count_max_parts_tmp) { - - TIMER_TIC; - - /* Anything to do here? */ - if (c->hydro.count == 0) return; - - int count = c->hydro.count; - int local_pack_position = (*pack_length); - -#ifdef SWIFT_DEBUG_CHECKS - if (local_pack_position + count >= count_max_parts_tmp) { - fprintf(stderr, "Exceeded count_max_parts_tmp. Make arrays bigger!\n"); - exit(0); - } -#endif - - /* Pack the particle data into CPU-side buffers*/ - pack_neat(c, parts_soa_buffer, tid, local_pack_position, count); - /* Increment pack length accordingly */ - (*pack_length) += count; - - if (timer) TIMER_TOC(timer_doself_gpu_pack); -} - -void runner_doself1_gpu_pack_neat_aos(struct runner *r, struct cell *c, - struct part_aos *parts_aos_buffer, - int timer, int *pack_length, int tid, - int count_max_parts_tmp) { - - TIMER_TIC; - - /* Anything to do here? */ - if (c->hydro.count == 0) return; - - int count = c->hydro.count; - int local_pack_position = (*pack_length); - -#ifdef SWIFT_DEBUG_CHECKS - if (local_pack_position + count >= count_max_parts_tmp) { - fprintf(stderr, "Exceeded count_max_parts_tmp. Make arrays bigger!\n"); - error("0"); - } -#endif - - /* Pack the particle data into CPU-side buffers*/ - pack_neat_aos(c, parts_aos_buffer, tid, local_pack_position, count); - /* Increment pack length accordingly */ - (*pack_length) += count; - - if (timer) TIMER_TOC(timer_doself_gpu_pack); -} - void runner_doself1_gpu_pack_neat_aos_f4( struct runner *r, struct cell *__restrict__ c, struct part_aos_f4_send *__restrict__ parts_aos_buffer, int timer, @@ -106,34 +46,6 @@ void runner_doself1_gpu_pack_neat_aos_f4( if (timer) TIMER_TOC(timer_doself_gpu_pack); } -void runner_doself1_gpu_pack_neat_aos_g(struct runner *r, struct cell *c, - struct part_aos_g *parts_aos_buffer, - int timer, int *pack_length, int tid, - int count_max_parts_tmp) { - - TIMER_TIC; - - /* Anything to do here? 
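
/* The runner_recurse_gpu hunk above threads a caller-supplied capacity
 * (n_expected_tasks) through the recursion instead of the old hard-coded 1024.
 * A stripped-down sketch of that capacity-guard pattern; the toy_cell type and
 * all names here are illustrative, not SWIFT's actual structures. */
struct toy_cell {
  int split;
  int count;
  struct toy_cell *progeny[8];
};

static void collect_leaf_pairs_sketch(struct toy_cell *ci, struct toy_cell *cj,
                                      struct toy_cell **left,
                                      struct toy_cell **right, int *n_found,
                                      int capacity) {
  if (ci->split && cj->split) {
    /* Recurse into matching progeny until we reach leaf cells. */
    for (int k = 0; k < 8; k++)
      if (ci->progeny[k] != NULL && cj->progeny[k] != NULL)
        collect_leaf_pairs_sketch(ci->progeny[k], cj->progeny[k], left, right,
                                  n_found, capacity);
  } else {
    if (ci->count == 0 || cj->count == 0) return; /* nothing to interact */
    if (*n_found >= capacity) return; /* guard sized by the caller, not a
                                         magic number baked into the code */
    left[*n_found] = ci;
    right[*n_found] = cj;
    (*n_found)++;
  }
}
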
*/ - if (c->hydro.count == 0) return; - - int count = c->hydro.count; - int local_pack_position = (*pack_length); - -#ifdef SWIFT_DEBUG_CHECKS - if (local_pack_position + count >= count_max_parts_tmp) { - fprintf(stderr, "Exceeded count_max_parts_tmp. Make arrays bigger!\n"); - exit(0); - } -#endif - - /* Pack the particle data into CPU-side buffers*/ - pack_neat_aos_g(c, parts_aos_buffer, tid, local_pack_position, count); - /* Increment pack length accordingly */ - (*pack_length) += count; - - if (timer) TIMER_TOC(timer_doself_gpu_pack); -} - void runner_doself1_gpu_pack_neat_aos_f4_g( struct runner *r, struct cell *c, struct part_aos_f4_g_send *parts_aos_buffer, int timer, int *pack_length, @@ -162,34 +74,6 @@ void runner_doself1_gpu_pack_neat_aos_f4_g( if (timer) TIMER_TOC(timer_doself_gpu_pack); } -void runner_doself1_gpu_pack_neat_aos_f(struct runner *r, struct cell *c, - struct part_aos_f *parts_aos_buffer, - int timer, int *pack_length, int tid, - int count_max_parts_tmp) { - - TIMER_TIC; - - /* Anything to do here? */ - if (c->hydro.count == 0) return; - - int count = c->hydro.count; - int local_pack_position = (*pack_length); - -#ifdef SWIFT_DEBUG_CHECKS - if (local_pack_position + count >= count_max_parts_tmp) { - fprintf(stderr, "Exceeded count_max_parts_tmp. Make arrays bigger!\n"); - exit(0); - } -#endif - - /* Pack the particle data into CPU-side buffers*/ - pack_neat_aos_f(c, parts_aos_buffer, tid, local_pack_position, count); - /* Increment pack length accordingly */ - (*pack_length) += count; - - if (timer) TIMER_TOC(timer_doself_gpu_pack); -} - void runner_doself1_gpu_pack_neat_aos_f4_f( struct runner *r, struct cell *restrict c, struct part_aos_f4_f_send *restrict parts_aos_buffer, int timer, @@ -218,101 +102,6 @@ void runner_doself1_gpu_pack_neat_aos_f4_f( if (timer) TIMER_TOC(timer_doself_gpu_pack); } -void pack_neat(struct cell *c, struct part_soa parts_soa_buffer, int tid, - int local_pack_position, int count) { - - const struct part *ptmps; - ptmps = c->hydro.parts; - for (int i = 0; i < count; i++) { - int id_in_pack = i + local_pack_position; - const struct part p = ptmps[i]; - /*Data to be copied to GPU*/ - parts_soa_buffer.x_p[id_in_pack] = p.x[0]; - parts_soa_buffer.y_p[id_in_pack] = p.x[1]; - parts_soa_buffer.z_p[id_in_pack] = p.x[2]; - parts_soa_buffer.tid_p[id_in_pack] = tid; - parts_soa_buffer.ux[id_in_pack] = p.v[0]; - parts_soa_buffer.uy[id_in_pack] = p.v[1]; - parts_soa_buffer.uz[id_in_pack] = p.v[2]; - parts_soa_buffer.locx[id_in_pack] = c->loc[0]; - parts_soa_buffer.locy[id_in_pack] = c->loc[1]; - parts_soa_buffer.locz[id_in_pack] = c->loc[2]; - parts_soa_buffer.mass[id_in_pack] = p.mass; - parts_soa_buffer.h[id_in_pack] = p.h; - // parts_soa_buffer.time_bin[id_in_pack] = p.time_bin; - /*Initialise sums to zero before CPU/GPU copy*/ - parts_soa_buffer.rho[id_in_pack] = 0.f; // p.rho; - parts_soa_buffer.rho_dh[id_in_pack] = 0.f; // p.density.rho_dh; - parts_soa_buffer.wcount[id_in_pack] = 0.f; // p.density.wcount; - parts_soa_buffer.wcount_dh[id_in_pack] = 0.f; // p.density.wcount_dh; - parts_soa_buffer.div_v[id_in_pack] = 0.f; // p.viscosity.div_v; - parts_soa_buffer.rot_ux[id_in_pack] = 0.f; // p.density.rot_v[0]; - parts_soa_buffer.rot_uy[id_in_pack] = 0.f; // p.density.rot_v[1]; - parts_soa_buffer.rot_uz[id_in_pack] = 0.f; // p.density.rot_v[2]; - } -} - -void pack_neat_aos(struct cell *c, struct part_aos *parts_aos_buffer, int tid, - int local_pack_position, int count) { - - const struct part *ptmps; - ptmps = c->hydro.parts; - for (int i = 0; i < 
count; i++) { - int id_in_pack = i + local_pack_position; - const struct part p = ptmps[i]; - /*Data to be copied to GPU*/ - parts_aos_buffer[id_in_pack].x_p = p.x[0]; - parts_aos_buffer[id_in_pack].y_p = p.x[1]; - parts_aos_buffer[id_in_pack].z_p = p.x[2]; - parts_aos_buffer[id_in_pack].ux = p.v[0]; - parts_aos_buffer[id_in_pack].uy = p.v[1]; - parts_aos_buffer[id_in_pack].uz = p.v[2]; - parts_aos_buffer[id_in_pack].mass = p.mass; - parts_aos_buffer[id_in_pack].h = p.h; - parts_aos_buffer[id_in_pack].time_bin = 1000; // p.time_bin; - /*Initialise sums to zero before CPU/GPU copy*/ - parts_aos_buffer[id_in_pack].rho = 0.f; // p.rho; - parts_aos_buffer[id_in_pack].rho_dh = 0.f; // p.density.rho_dh; - parts_aos_buffer[id_in_pack].wcount = 0.f; // p.density.wcount; - parts_aos_buffer[id_in_pack].wcount_dh = 0.f; // p.density.wcount_dh; - parts_aos_buffer[id_in_pack].div_v = 0.f; // p.viscosity.div_v; - parts_aos_buffer[id_in_pack].rot_ux = 0.f; // p.density.rot_v[0]; - parts_aos_buffer[id_in_pack].rot_uy = 0.f; // p.density.rot_v[1]; - parts_aos_buffer[id_in_pack].rot_uz = 0.f; // p.density.rot_v[2]; - } -} - -void pack_neat_pair_aos(struct cell *c, struct part_aos *parts_aos_buffer, - int tid, int local_pack_position, int count, - float3 shift) { - - const struct part *ptmps; - ptmps = c->hydro.parts; - for (int i = 0; i < count; i++) { - int id_in_pack = i + local_pack_position; - const struct part p = ptmps[i]; - /*Data to be copied to GPU*/ - parts_aos_buffer[id_in_pack].x_p = p.x[0] - shift.x; - parts_aos_buffer[id_in_pack].y_p = p.x[1] - shift.y; - parts_aos_buffer[id_in_pack].z_p = p.x[2] - shift.z; - parts_aos_buffer[id_in_pack].ux = p.v[0]; - parts_aos_buffer[id_in_pack].uy = p.v[1]; - parts_aos_buffer[id_in_pack].uz = p.v[2]; - parts_aos_buffer[id_in_pack].mass = p.mass; - parts_aos_buffer[id_in_pack].h = p.h; - parts_aos_buffer[id_in_pack].time_bin = 1000; // p.time_bin; - /*Initialise sums to zero before CPU/GPU copy*/ - parts_aos_buffer[id_in_pack].rho = 0.f; // p.rho; - parts_aos_buffer[id_in_pack].rho_dh = 0.f; // p.density.rho_dh; - parts_aos_buffer[id_in_pack].wcount = 0.f; // p.density.wcount; - parts_aos_buffer[id_in_pack].wcount_dh = 0.f; // p.density.wcount_dh; - parts_aos_buffer[id_in_pack].div_v = 0.f; // p.viscosity.div_v; - parts_aos_buffer[id_in_pack].rot_ux = 0.f; // p.density.rot_v[0]; - parts_aos_buffer[id_in_pack].rot_uy = 0.f; // p.density.rot_v[1]; - parts_aos_buffer[id_in_pack].rot_uz = 0.f; // p.density.rot_v[2]; - } -} - extern inline void pack_neat_pair_aos_f4( struct cell *__restrict c, struct part_aos_f4_send *__restrict parts_aos_buffer, int tid, @@ -362,38 +151,6 @@ void pack_neat_aos_f4(struct cell *__restrict__ c, } } -void pack_neat_aos_g(struct cell *c, struct part_aos_g *parts_aos_buffer, - int tid, int local_pack_position, int count) { - - const struct part *ptmps; - ptmps = c->hydro.parts; - for (int i = 0; i < count; i++) { - int id_in_pack = i + local_pack_position; - const struct part p = ptmps[i]; - /*Data to be copied to GPU*/ - parts_aos_buffer[id_in_pack].x_p = p.x[0]; - parts_aos_buffer[id_in_pack].y_p = p.x[1]; - parts_aos_buffer[id_in_pack].z_p = p.x[2]; - parts_aos_buffer[id_in_pack].ux = p.v[0]; - parts_aos_buffer[id_in_pack].uy = p.v[1]; - parts_aos_buffer[id_in_pack].uz = p.v[2]; - parts_aos_buffer[id_in_pack].mass = p.mass; - parts_aos_buffer[id_in_pack].h = p.h; - parts_aos_buffer[id_in_pack].time_bin = 1000; - parts_aos_buffer[id_in_pack].rho = p.rho; - parts_aos_buffer[id_in_pack].visc_alpha = p.viscosity.alpha; - 
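
/* The retained *_aos_f4 pack path above writes each particle as a pair of
 * float4s so a single coalesced load pulls a whole record on the GPU. A hedged
 * sketch of that layout; the struct and field names here are illustrative and
 * not necessarily those defined in the SWIFT headers. */
#include <cuda_runtime.h>

struct send_f4_sketch {
  float4 x_h; /* position (shifted into the cell frame) and smoothing length */
  float4 v_m; /* velocity and mass */
};

static void pack_f4_sketch(const double x[3], const float v[3], float h,
                           float m, const double shift[3],
                           struct send_f4_sketch *out) {
  out->x_h = make_float4((float)(x[0] - shift[0]), (float)(x[1] - shift[1]),
                         (float)(x[2] - shift[2]), h);
  out->v_m = make_float4(v[0], v[1], v[2], m);
}
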
parts_aos_buffer[id_in_pack].alpha_visc_max_ngb = - p.force.alpha_visc_max_ngb; // p.density.wcount_dh; - parts_aos_buffer[id_in_pack].v_sig = - p.viscosity.v_sig; // p.viscosity.div_v; - parts_aos_buffer[id_in_pack].soundspeed = - p.force.soundspeed; // p.density.rot_v[0]; - parts_aos_buffer[id_in_pack].u = p.u; // p.density.rot_v[0]; - /*Initialise sums to zero before CPU/GPU copy*/ - parts_aos_buffer[id_in_pack].laplace_u = 0.f; // p.density.wcount; - } -} - void pack_neat_aos_f4_g(struct cell *c, struct part_aos_f4_g_send *parts_aos_buffer, int tid, int local_pack_position, int count) { @@ -449,43 +206,6 @@ extern inline void pack_neat_pair_aos_f4_g( } } -void pack_neat_aos_f(struct cell *c, struct part_aos_f *parts_aos, int tid, - int local_pack_position, int count) { - - const struct part *ptmps; - ptmps = c->hydro.parts; - for (int i = 0; i < count; i++) { - int id_in_pack = i + local_pack_position; - const struct part p = ptmps[i]; - /*Data to be copied to GPU*/ - parts_aos[id_in_pack].x_p = p.x[0]; - parts_aos[id_in_pack].y_p = p.x[1]; - parts_aos[id_in_pack].z_p = p.x[2]; - parts_aos[id_in_pack].ux = p.v[0]; - parts_aos[id_in_pack].uy = p.v[1]; - parts_aos[id_in_pack].uz = p.v[2]; - parts_aos[id_in_pack].mass = p.mass; - parts_aos[id_in_pack].h = p.h; - parts_aos[id_in_pack].time_bin = p.time_bin; - parts_aos[id_in_pack].min_ngb_time_bin = p.limiter_data.min_ngb_time_bin; - parts_aos[id_in_pack].rho = p.rho; - parts_aos[id_in_pack].pressure = p.force.pressure; - parts_aos[id_in_pack].soundspeed = p.force.soundspeed; - parts_aos[id_in_pack].f = p.force.f; - parts_aos[id_in_pack].balsara = p.force.balsara; - parts_aos[id_in_pack].alpha_visc = p.viscosity.alpha; - parts_aos[id_in_pack].a_hydrox = 0.0; - parts_aos[id_in_pack].a_hydroy = 0.0; - parts_aos[id_in_pack].a_hydroz = 0.0; - parts_aos[id_in_pack].alpha_diff = p.diffusion.alpha; - parts_aos[id_in_pack].u = p.u; - parts_aos[id_in_pack].u_dt = 0.0; - parts_aos[id_in_pack].h_dt = 0.0; - /*Initialise sums to zero before CPU/GPU copy*/ - parts_aos[id_in_pack].v_sig = p.viscosity.v_sig; - } -} - void pack_neat_aos_f4_f(const struct cell *restrict c, struct part_aos_f4_f_send *restrict parts_aos, int tid, int local_pack_position, int count) { @@ -570,69 +290,6 @@ extern inline void pack_neat_pair_aos_f4_f( } } -void runner_doself1_gpu_unpack_neat(struct runner *r, struct cell *c, - struct part_soa parts_soa_buffer, int timer, - int *pack_length, int tid, - int count_max_parts_tmp, struct engine *e) { - TIMER_TIC; - - /* Anything to do here? */ - if (c->hydro.count == 0) return; - if (!cell_is_active_hydro(c, e)) { - message("Inactive cell\n"); - return; - } - int count = c->hydro.count; - int local_pack_position = (*pack_length); - -#ifdef SWIFT_DEBUG_CHECKS - if (local_pack_position + count >= count_max_parts_tmp) { - fprintf(stderr, - "Exceeded count_max_parts_tmp. Make arrays bigger! pack_length is " - "%i pointer to pack_length is %i, local_pack_position is % i, " - "count is %i\n", - (*pack_length), pack_length, local_pack_position, count, e); - } -#endif - - /* Copy particle data from CPU buffers to cells */ - unpack_neat(c, parts_soa_buffer, tid, local_pack_position, count, e); - // Increment pack length accordingly - (*pack_length) += count; -} - -void runner_doself1_gpu_unpack_neat_aos(struct runner *r, struct cell *c, - struct part_aos *parts_aos_buffer, - int timer, int *pack_length, int tid, - int count_max_parts_tmp, - struct engine *e) { - TIMER_TIC; - - /* Anything to do here? 
*/ - if (c->hydro.count == 0) return; - if (!cell_is_active_hydro(c, e)) { - message("Inactive cell\n"); - return; - } - int count = c->hydro.count; - int local_pack_position = (*pack_length); - -#ifdef SWIFT_DEBUG_CHECKS - if (local_pack_position + count >= count_max_parts_tmp) { - fprintf(stderr, - "Exceeded count_max_parts_tmp. Make arrays bigger! pack_length is " - "%i pointer to pack_length is %i, local_pack_position is % i, " - "count is %i\n", - (*pack_length), pack_length, local_pack_position, count, e); - } -#endif - - /* Copy particle data from CPU buffers to cells */ - unpack_neat_aos(c, parts_aos_buffer, tid, local_pack_position, count, e); - // Increment pack length accordingly - (*pack_length) += count; -} - void runner_doself1_gpu_unpack_neat_aos_f4( struct runner *r, struct cell *c, struct part_aos_f4_recv *parts_aos_buffer, int timer, int *pack_length, int tid, int count_max_parts_tmp, @@ -664,38 +321,6 @@ void runner_doself1_gpu_unpack_neat_aos_f4( (*pack_length) += count; } -void runner_doself1_gpu_unpack_neat_aos_g(struct runner *r, struct cell *c, - struct part_aos_g *parts_aos_buffer, - int timer, int *pack_length, int tid, - int count_max_parts_tmp, - struct engine *e) { - TIMER_TIC; - - /* Anything to do here? */ - if (c->hydro.count == 0) return; - if (!cell_is_active_hydro(c, e)) { - message("Inactive cell\n"); - return; - } - int count = c->hydro.count; - int local_pack_position = (*pack_length); - -#ifdef SWIFT_DEBUG_CHECKS - if (local_pack_position + count >= count_max_parts_tmp) { - fprintf(stderr, - "Exceeded count_max_parts_tmp. Make arrays bigger! pack_length is " - "%i pointer to pack_length is %i, local_pack_position is % i, " - "count is %i\n", - (*pack_length), pack_length, local_pack_position, count, e); - } -#endif - - /* Copy particle data from CPU buffers to cells */ - unpack_neat_aos_g(c, parts_aos_buffer, tid, local_pack_position, count, e); - // Increment pack length accordingly - (*pack_length) += count; -} - void runner_doself1_gpu_unpack_neat_aos_f4_g( struct runner *r, struct cell *c, struct part_aos_f4_g_recv *parts_aos_buffer, int timer, int *pack_length, @@ -727,38 +352,6 @@ void runner_doself1_gpu_unpack_neat_aos_f4_g( (*pack_length) += count; } -void runner_doself1_gpu_unpack_neat_aos_f(struct runner *r, struct cell *c, - struct part_aos_f *parts_aos_buffer, - int timer, int *pack_length, int tid, - int count_max_parts_tmp, - struct engine *e) { - TIMER_TIC; - - /* Anything to do here? */ - if (c->hydro.count == 0) return; - if (!cell_is_active_hydro(c, e)) { - message("Inactive cell\n"); - return; - } - int count = c->hydro.count; - int local_pack_position = (*pack_length); - -#ifdef SWIFT_DEBUG_CHECKS - if (local_pack_position + count >= count_max_parts_tmp) { - fprintf(stderr, - "Exceeded count_max_parts_tmp. Make arrays bigger! 
pack_length is " - "%i pointer to pack_length is %i, local_pack_position is % i, " - "count is %i\n", - (*pack_length), pack_length, local_pack_position, count, e); - } -#endif - - /* Copy particle data from CPU buffers to cells */ - unpack_neat_aos_f(c, parts_aos_buffer, tid, local_pack_position, count, e); - // Increment pack length accordingly - (*pack_length) += count; -} - void runner_doself1_gpu_unpack_neat_aos_f4_f( struct runner *r, struct cell *c, struct part_aos_f4_f_recv *parts_aos_buffer, int timer, int *pack_length, @@ -790,161 +383,12 @@ void runner_doself1_gpu_unpack_neat_aos_f4_f( (*pack_length) += count; } -void unpack_neat(struct cell *c, struct part_soa parts_soa_buffer, int tid, - int local_pack_position, int count, struct engine *e) { - - // struct part *ptmps; - // ptmps=c->hydro.parts; - - // memcpy(&rho[0], &parts_soa_buffer.rho[local_pack_position], count * - // sizeof(float)); - // fprintf(stderr, "count %i\n", count); - // memcpy(rho, &parts_soa_buffer.rho[local_pack_position], count * - // sizeof(float)); memcpy(rho_dh, - // &parts_soa_buffer.rho_dh[local_pack_position], count * sizeof(float)); - // memcpy(wcount, &parts_soa_buffer.wcount[local_pack_position], count * - // sizeof(float)); memcpy(wcount_dh, - // &parts_soa_buffer.wcount_dh[local_pack_position], count * sizeof(float)); - // memcpy(div_v, &parts_soa_buffer.div_v[local_pack_position], count * - // sizeof(float)); memcpy(rot_ux, - // &parts_soa_buffer.rot_ux[local_pack_position], count * sizeof(float)); - // memcpy(rot_uy, &parts_soa_buffer.rot_uy[local_pack_position], count * - // sizeof(float)); memcpy(rot_uz, - // &parts_soa_buffer.rot_uz[local_pack_position], count * sizeof(float)); - float *rho = - &parts_soa_buffer - .rho[local_pack_position]; // = calloc(count, sizeof(float));// - float *rho_dh = - &parts_soa_buffer - .rho_dh[local_pack_position]; // = calloc(count, sizeof(float));// - float *wcount = - &parts_soa_buffer - .wcount[local_pack_position]; // = calloc(count, sizeof(float));// - float *wcount_dh = - &parts_soa_buffer.wcount_dh[local_pack_position]; // = calloc(count, - // sizeof(float));// - float *div_v = - &parts_soa_buffer - .div_v[local_pack_position]; // = calloc(count, sizeof(float));// - float *rot_ux = - &parts_soa_buffer - .rot_ux[local_pack_position]; // = calloc(count, sizeof(float));// - float *rot_uy = - &parts_soa_buffer - .rot_uy[local_pack_position]; // = calloc(count, sizeof(float));// - float *rot_uz = - &parts_soa_buffer - .rot_uz[local_pack_position]; // = calloc(count, sizeof(float));// - - // fprintf(stderr, "rho %f rho %f\n", rho[1], - // parts_soa_buffer.rho[local_pack_position+1]); - for (int i = 0; i < count; i++) { - // int id_in_pack = i + local_pack_position; - // struct part *part_cpu = &c->hydro.parts[i]; - struct part *pi = &c->hydro.parts[i]; - // if (part_is_inhibited(pi, e)) { - // fprintf(stderr, "inhibited part\n"); - // continue; - // } - // const int pi_active = part_is_active(pi, e); - // if (pi_active) { - pi->rho += rho[i]; - // c->hydro.parts[i].rho += parts_soa_buffer.rho[id_in_pack]; - pi->density.rho_dh += rho_dh[i]; - pi->density.wcount += wcount[i]; - pi->density.wcount_dh += wcount_dh[i]; - pi->viscosity.div_v += div_v[i]; - pi->density.rot_v[0] += rot_ux[i]; - pi->density.rot_v[1] += rot_uy[i]; - pi->density.rot_v[2] += rot_uz[i]; - - // c->hydro.parts[i].rho += rho[i]; - // c->hydro.parts[i].density.rho_dh += rho_dh[i]; - // c->hydro.parts[i].density.wcount += wcount[i]; - // c->hydro.parts[i].density.wcount_dh += wcount_dh[i]; - 
// c->hydro.parts[i].viscosity.div_v += div_v[i]; - // c->hydro.parts[i].density.rot_v[0] += rot_ux[i]; - // c->hydro.parts[i].density.rot_v[1] += rot_uy[i]; - // c->hydro.parts[i].density.rot_v[2] += rot_uz[i]; - - // c->hydro.parts[i].rho += parts_tmp->rho[i]; - // c->hydro.parts[i].density.rho_dh += parts_tmp->rho_dh[i]; - // c->hydro.parts[i].density.wcount += parts_tmp->wcount[i]; - // c->hydro.parts[i].density.wcount_dh += parts_tmp->wcount_dh[i]; - // c->hydro.parts[i].viscosity.div_v += parts_tmp->div_v[i]; - // c->hydro.parts[i].density.rot_v[0] += parts_tmp->rot_ux[i]; - // c->hydro.parts[i].density.rot_v[1] += parts_tmp->rot_uy[i]; - // c->hydro.parts[i].density.rot_v[2] += parts_tmp->rot_uz[i]; - - // part_cpu[i].rho += parts_soa_buffer.rho[i]; - // part_cpu[i].density.rho_dh += parts_soa_buffer.rho_dh[i]; - // part_cpu[i].density.wcount += parts_soa_buffer.wcount[i]; - // part_cpu[i].density.wcount_dh += parts_soa_buffer.wcount_dh[i]; - // part_cpu[i].viscosity.div_v += parts_soa_buffer.div_v[i]; - // part_cpu[i].density.rot_v[0] += parts_soa_buffer.rot_ux[i]; - // part_cpu[i].density.rot_v[1] += parts_soa_buffer.rot_uy[i]; - // part_cpu[i].density.rot_v[2] += parts_soa_buffer.rot_uz[i]; - // } - // else fprintf(stderr,"a part is not active\n"); - } - // c->hydro.parts=ptmps; -} -void unpack_neat_aos(struct cell *c, struct part_aos *parts_aos_buffer, int tid, - int local_pack_position, int count, struct engine *e) { - - // float *rho = &parts_aos_buffer[local_pack_position].rho;// = calloc(count, - // sizeof(float));// float *rho_dh = - // &parts_aos_buffer[local_pack_position].rho_dh;// = calloc(count, - // sizeof(float));// float *wcount = - // &parts_aos_buffer[local_pack_position].wcount;// = calloc(count, - // sizeof(float));// float *wcount_dh = - // &parts_aos_buffer[local_pack_position].wcount_dh;// = calloc(count, - // sizeof(float));// float *div_v = - // &parts_aos_buffer[local_pack_position].div_v;// = calloc(count, - // sizeof(float));// float *rot_ux = - // &parts_aos_buffer[local_pack_position].rot_ux;// = calloc(count, - // sizeof(float));// float *rot_uy = - // &parts_aos_buffer[local_pack_position].rot_uy;// = calloc(count, - // sizeof(float));// float *rot_uz = - // &parts_aos_buffer[local_pack_position].rot_uz;// = calloc(count, - // sizeof(float));// - struct part_aos *parts_tmp = &parts_aos_buffer[local_pack_position]; - for (int i = 0; i < count; i++) { - - struct part_aos p_tmp = parts_tmp[i]; - struct part *p = &c->hydro.parts[i]; - p->rho += p_tmp.rho; - p->density.rho_dh += p_tmp.rho_dh; - p->density.wcount += p_tmp.wcount; - p->density.wcount_dh += p_tmp.wcount_dh; - p->viscosity.div_v += p_tmp.div_v; - p->density.rot_v[0] += p_tmp.rot_ux; - p->density.rot_v[1] += p_tmp.rot_uy; - p->density.rot_v[2] += p_tmp.rot_uz; - } -} -#include -void unpack_neat_aos_f4(struct cell *c, - struct part_aos_f4_recv *parts_aos_buffer, int tid, - int local_pack_position, int count, struct engine *e) { - - // float *rho = &parts_aos_buffer[local_pack_position].rho;// = calloc(count, - // sizeof(float));// float *rho_dh = - // &parts_aos_buffer[local_pack_position].rho_dh;// = calloc(count, - // sizeof(float));// float *wcount = - // &parts_aos_buffer[local_pack_position].wcount;// = calloc(count, - // sizeof(float));// float *wcount_dh = - // &parts_aos_buffer[local_pack_position].wcount_dh;// = calloc(count, - // sizeof(float));// float *div_v = - // &parts_aos_buffer[local_pack_position].div_v;// = calloc(count, - // sizeof(float));// float *rot_ux = - // 
&parts_aos_buffer[local_pack_position].rot_ux;// = calloc(count, - // sizeof(float));// float *rot_uy = - // &parts_aos_buffer[local_pack_position].rot_uy;// = calloc(count, - // sizeof(float));// float *rot_uz = - // &parts_aos_buffer[local_pack_position].rot_uz;// = calloc(count, - // sizeof(float));// - struct part_aos_f4_recv *parts_tmp = &parts_aos_buffer[local_pack_position]; +#include +void unpack_neat_aos_f4(struct cell *c, + struct part_aos_f4_recv *parts_aos_buffer, int tid, + int local_pack_position, int count, struct engine *e) { + + struct part_aos_f4_recv *parts_tmp = &parts_aos_buffer[local_pack_position]; for (int i = 0; i < count; i++) { struct part_aos_f4_recv p_tmp = parts_tmp[i]; @@ -960,24 +404,6 @@ void unpack_neat_aos_f4(struct cell *c, p->density.rot_v[1] += rot_ux_div_v.y; p->density.rot_v[2] += rot_ux_div_v.z; p->viscosity.div_v += rot_ux_div_v.w; - // fprintf(stderr, "rho %f div_v %f\n", p_tmp.rho_dh_wcount.x, - // p_tmp.rot_ux_div_v.w); - } -} - -void unpack_neat_aos_g(struct cell *c, struct part_aos_g *parts_aos_buffer, - int tid, int local_pack_position, int count, - struct engine *e) { - - struct part_aos_g *parts_tmp = &parts_aos_buffer[local_pack_position]; - for (int i = 0; i < count; i++) { - struct part_aos_g p_tmp = parts_tmp[i]; - struct part *p = &c->hydro.parts[i]; - const float v_sig = p->viscosity.v_sig; - p->viscosity.v_sig = max(p_tmp.v_sig, v_sig); - p->diffusion.laplace_u += p_tmp.laplace_u; - const float max_ngb = p->force.alpha_visc_max_ngb; - p->force.alpha_visc_max_ngb = max(p_tmp.alpha_visc_max_ngb, max_ngb); } } @@ -999,42 +425,12 @@ void unpack_neat_aos_f4_g(struct cell *c, } } -void unpack_neat_aos_f(struct cell *c, struct part_aos_f *parts_aos_buffer, - int tid, int local_pack_position, int count, - struct engine *e) { - - struct part_aos_f *parts_tmp = &parts_aos_buffer[local_pack_position]; - for (int i = 0; i < count; i++) { - struct part_aos_f p_tmp = parts_tmp[i]; - struct part *p = &c->hydro.parts[i]; - p->a_hydro[0] += p_tmp.a_hydrox; - p->a_hydro[1] += p_tmp.a_hydroy; - p->a_hydro[2] += p_tmp.a_hydroz; - p->u_dt += p_tmp.u_dt; - p->force.h_dt += p_tmp.h_dt; - // p->limiter_data.min_ngb_time_bin = min(p_tmp.min_ngb_time_bin, - // p->limiter_data.min_ngb_time_bin); - p->limiter_data.min_ngb_time_bin = p_tmp.min_ngb_time_bin; - const float v_sig = p->viscosity.v_sig; - p->viscosity.v_sig = max(p_tmp.v_sig, v_sig); - // p->viscosity.v_sig = p_tmp.v_sig; - - // fprintf(stderr, "ax %f ay %f az %f\n", p_tmp.a_hydrox, - // p_tmp.a_hydroy, p_tmp.a_hydroz); - } -} - void unpack_neat_aos_f4_f(struct cell *restrict c, struct part_aos_f4_f_recv *restrict parts_aos_buffer, int tid, int local_pack_position, int count, struct engine *e) { - - // struct part_aos_f4_f_recv *restrict parts_tmp = - //&parts_aos_buffer[local_pack_position]; int pp = local_pack_position; for (int i = 0; i < count; i++) { - // struct part_aos_f4_f_recv p_tmp = parts_tmp[i]; - // struct part *restrict p = &c->hydro.parts[i]; if(!PART_IS_ACTIVE(&c->hydro.parts[i], e))continue; c->hydro.parts[i].a_hydro[0] += parts_aos_buffer[i + pp].a_hydro.x; c->hydro.parts[i].a_hydro[1] += parts_aos_buffer[i + pp].a_hydro.y; @@ -1057,124 +453,6 @@ void unpack_neat_aos_f4_f(struct cell *restrict c, } } -void unpack_neat_pair(struct runner *r, struct cell *c, - struct part_soa parts_soa_buffer, int tid, - int local_pack_position, int count, struct engine *e) { - - // struct part *ptmps; - // ptmps=c->hydro.parts; - for (int i = 0; i < count; i++) { - int id_in_pack = i + 
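
/* The retained f4 unpack above accumulates the GPU's partial sums back into
 * the particle fields, with four related quantities packed per float4 (for
 * instance the three curl components plus div_v in rot_ux_div_v). A hedged
 * sketch of that accumulate step; struct and field names are illustrative. */
#include <cuda_runtime.h>

struct recv_f4_sketch {
  float4 rho_terms; /* density-loop sums returned by the GPU */
  float4 rot_div;   /* curl components in .x/.y/.z, div_v in .w */
};

static void unpack_f4_sketch(const struct recv_f4_sketch *in, int count,
                             float *rho, float *div_v) {
  for (int i = 0; i < count; i++) {
    rho[i] += in[i].rho_terms.x; /* add, never overwrite, the partial sums */
    div_v[i] += in[i].rot_div.w;
  }
}
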
local_pack_position; - // struct part *pi = &c->hydro.parts[i]; - // if (part_is_inhibited(pi, e)) { - // fprintf(stderr, "inhibited part\n"); - // continue; - // } - // const int pi_active = part_is_active(pi, e); - // if (pi_active) { - c->hydro.parts[i].rho += parts_soa_buffer.rho[id_in_pack]; - c->hydro.parts[i].density.rho_dh += parts_soa_buffer.rho_dh[id_in_pack]; - c->hydro.parts[i].density.wcount += parts_soa_buffer.wcount[id_in_pack]; - c->hydro.parts[i].density.wcount_dh += - parts_soa_buffer.wcount_dh[id_in_pack]; - c->hydro.parts[i].viscosity.div_v += parts_soa_buffer.div_v[id_in_pack]; - c->hydro.parts[i].density.rot_v[0] += parts_soa_buffer.rot_ux[id_in_pack]; - c->hydro.parts[i].density.rot_v[1] += parts_soa_buffer.rot_uy[id_in_pack]; - c->hydro.parts[i].density.rot_v[2] += parts_soa_buffer.rot_uz[id_in_pack]; - // if(r->cpuid == 0)fprintf(stderr, "i %i rho %lf\n", i, - // parts_soa_buffer.rho[id_in_pack]); - // } - // else fprintf(stderr,"a part is not active\n"); - } - // c->hydro.parts=ptmps; -} - -void unpack_neat_pair_aos(struct runner *r, struct cell *c, - struct part_aos *parts_aos_buffer, int tid, - int local_pack_position, int count, - struct engine *e) { - - // float *rho = &parts_aos_buffer[local_pack_position].rho;// = calloc(count, - // sizeof(float));// float *rho_dh = - // &parts_aos_buffer[local_pack_position].rho_dh;// = calloc(count, - // sizeof(float));// float *wcount = - // &parts_aos_buffer[local_pack_position].wcount;// = calloc(count, - // sizeof(float));// float *wcount_dh = - // &parts_aos_buffer[local_pack_position].wcount_dh;// = calloc(count, - // sizeof(float));// float *div_v = - // &parts_aos_buffer[local_pack_position].div_v;// = calloc(count, - // sizeof(float));// float *rot_ux = - // &parts_aos_buffer[local_pack_position].rot_ux;// = calloc(count, - // sizeof(float));// float *rot_uy = - // &parts_aos_buffer[local_pack_position].rot_uy;// = calloc(count, - // sizeof(float));// float *rot_uz = - // &parts_aos_buffer[local_pack_position].rot_uz;// = calloc(count, - // sizeof(float));// - struct part_aos *parts_tmp = &parts_aos_buffer[local_pack_position]; - // struct part *ptmps; - // ptmps=c->hydro.parts; - // struct part *part_cpu = c->hydro.parts; - for (int i = 0; i < count; i++) { - // int id_in_pack = i + local_pack_position; - // struct part_aos part_gpu = parts_aos_buffer[id_in_pack]; - // struct part *pi = &c->hydro.parts[i]; - // if (part_is_inhibited(pi, e)) { - // fprintf(stderr, "inhibited part\n"); - // continue; - // } - // const int pi_active = part_is_active(pi, e); - // if (pi_active) { - // if(parts_aos_buffer[id_in_pack].time_bin == 1000)(*count1000)++ - // ;//fprintf(stderr, "timebin %i\n", - // parts_aos_buffer[id_in_pack].time_bin); else - // if(parts_aos_buffer[id_in_pack].time_bin == 20)(*count20)++ - // ;//fprintf(stderr, "timebin %i\n", - // parts_aos_buffer[id_in_pack].time_bin); else fprintf(stderr, "not 20 - // or 1000\n"); - // - struct part_aos p_tmp = parts_tmp[i]; - struct part *p = &c->hydro.parts[i]; - p->rho += p_tmp.rho; - p->density.rho_dh += p_tmp.rho_dh; - p->density.wcount += p_tmp.wcount; - p->density.wcount_dh += p_tmp.wcount_dh; - p->viscosity.div_v += p_tmp.div_v; - p->density.rot_v[0] += p_tmp.rot_ux; - p->density.rot_v[1] += p_tmp.rot_uy; - p->density.rot_v[2] += p_tmp.rot_uz; - - // c->hydro.parts[i].rho += parts_aos_buffer[id_in_pack].rho; - // c->hydro.parts[i].density.rho_dh += - // parts_aos_buffer[id_in_pack].rho_dh; - // c->hydro.parts[i].density.wcount += - // 
parts_aos_buffer[id_in_pack].wcount; - // c->hydro.parts[i].density.wcount_dh += - // parts_aos_buffer[id_in_pack].wcount_dh; - // c->hydro.parts[i].viscosity.div_v += - // parts_aos_buffer[id_in_pack].div_v; - // c->hydro.parts[i].density.rot_v[0] += - // parts_aos_buffer[id_in_pack].rot_ux; - // c->hydro.parts[i].density.rot_v[1] += - // parts_aos_buffer[id_in_pack].rot_uy; - // c->hydro.parts[i].density.rot_v[2] += - // parts_aos_buffer[id_in_pack].rot_uz; - - // part_cpu[i].rho += part_gpu.rho; - // part_cpu[i].density.rho_dh += part_gpu.rho_dh; - // part_cpu[i].density.wcount += part_gpu.wcount; - // part_cpu[i].density.wcount_dh += part_gpu.wcount_dh; - // part_cpu[i].viscosity.div_v += part_gpu.div_v; - // part_cpu[i].density.rot_v[0] += part_gpu.rot_ux; - // part_cpu[i].density.rot_v[1] += part_gpu.rot_uy; - // part_cpu[i].density.rot_v[2] += part_gpu.rot_uz; - // if(r->cpuid == 0)fprintf(stderr, "i %i rho %lf\n", i, - // parts_soa_buffer.rho[id_in_pack]); - // } - // else fprintf(stderr,"a part is not active\n"); - } - // c->hydro.parts=ptmps; -} - void unpack_neat_pair_aos_f4(struct runner *r, struct cell *restrict c, struct part_aos_f4_recv *restrict parts_aos_buffer, int tid, int local_pack_position, int count, @@ -1199,20 +477,6 @@ void unpack_neat_pair_aos_f4(struct runner *r, struct cell *restrict c, } } -void unpack_neat_pair_aos_g(struct runner *r, struct cell *c, - struct part_aos_g *parts_aos_buffer, int tid, - int local_pack_position, int count, - struct engine *e) { - struct part_aos_g *parts_tmp = &parts_aos_buffer[local_pack_position]; - for (int i = 0; i < count; i++) { - struct part_aos_g p_tmp = parts_tmp[i]; - struct part *p = &c->hydro.parts[i]; - p->viscosity.v_sig = p_tmp.v_sig; - p->diffusion.laplace_u += p_tmp.laplace_u; - p->force.alpha_visc_max_ngb = p_tmp.alpha_visc_max_ngb; - } -} - void unpack_neat_pair_aos_f4_g( struct runner *r, struct cell *restrict c, struct part_aos_f4_g_recv *restrict parts_aos_buffer, int tid, @@ -1244,26 +508,6 @@ void unpack_neat_pair_aos_f4_g( } } -void unpack_neat_pair_aos_f(struct runner *r, struct cell *c, - struct part_aos_f *parts_aos_buffer, int tid, - int local_pack_position, int count, - struct engine *e) { - struct part_aos_f *parts_tmp = &parts_aos_buffer[local_pack_position]; - for (int i = 0; i < count; i++) { - struct part_aos_f p_tmp = parts_tmp[i]; - struct part *p = &c->hydro.parts[i]; - p->a_hydro[0] += p_tmp.a_hydrox; - p->a_hydro[1] += p_tmp.a_hydroy; - p->a_hydro[2] += p_tmp.a_hydroz; - p->u_dt += p_tmp.u_dt; - p->force.h_dt += p_tmp.h_dt; - const float v_sig = p->viscosity.v_sig; - p->viscosity.v_sig = max(p_tmp.v_sig, v_sig); - p->limiter_data.min_ngb_time_bin = p_tmp.min_ngb_time_bin; - // p->viscosity.v_sig = p_tmp.v_sig; - } -} - void unpack_neat_pair_aos_f4_f( struct runner *r, struct cell *restrict c, struct part_aos_f4_f_recv *restrict parts_aos_buffer, int tid, @@ -1292,97 +536,6 @@ void unpack_neat_pair_aos_f4_f( } } -void runner_do_ci_cj_gpu_unpack_neat(struct runner *r, struct cell *ci, - struct cell *cj, - struct part_soa parts_soa_buffer, - int timer, int *pack_length, int tid, - int count_max_parts_tmp, - struct engine *e) { - TIMER_TIC; - - /* Anything to do here? 
*/ - // if (c->hydro.count == 0) - // return; - if (!cell_is_active_hydro(ci, e)) { - message("Inactive cell\n"); - return; - } - int count_ci = ci->hydro.count; - int count_cj = cj->hydro.count; - int local_pack_position = (*pack_length); - -#ifdef SWIFT_DEBUG_CHECKS - if (local_pack_position + count_ci + count_cj >= count_max_parts_tmp) { - fprintf(stderr, - "Exceeded count_max_parts_tmp. Make arrays bigger! pack_length is " - "%i pointer to pack_length is %i, local_pack_position is % i, " - "count is %i\n", - (*pack_length), pack_length, local_pack_position, count_ci, e); - } -#endif - - /* Pack the particle data into CPU-side buffers*/ - // if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j - // %i\n", local_pack_position, count_ci, count_cj); - unpack_neat_pair(r, ci, parts_soa_buffer, tid, local_pack_position, count_ci, - e); - local_pack_position += count_ci; - /* Pack the particle data into CPU-side buffers*/ - // if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j - // %i\n", local_pack_position, count_ci, count_cj); - unpack_neat_pair(r, cj, parts_soa_buffer, tid, local_pack_position, count_cj, - e); - /* Increment pack length accordingly */ - (*pack_length) += count_ci + count_cj; - - if (timer) TIMER_TOC(timer_doself_gpu_pack); - // if(r->cpuid == 0)exit(0); -} - -void runner_do_ci_cj_gpu_unpack_neat_aos(struct runner *r, struct cell *ci, - struct cell *cj, - struct part_aos *parts_aos_buffer, - int timer, int *pack_length, int tid, - int count_max_parts_tmp, - struct engine *e) { - - /* Anything to do here? */ - // if (c->hydro.count == 0) - // return; - if (!cell_is_active_hydro(ci, e)) { - message("Inactive cell\n"); - return; - } - int count_ci = ci->hydro.count; - int count_cj = cj->hydro.count; - int local_pack_position = (*pack_length); - -#ifdef SWIFT_DEBUG_CHECKS - if (local_pack_position + count_ci + count_cj >= count_max_parts_tmp) { - fprintf(stderr, - "Exceeded count_max_parts_tmp. Make arrays bigger! pack_length is " - "%i pointer to pack_length is %i, local_pack_position is % i, " - "count is %i\n", - (*pack_length), pack_length, local_pack_position, count_ci, e); - } -#endif - - /* Pack the particle data into CPU-side buffers*/ - // if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j - // %i\n", local_pack_position, count_ci, count_cj); - unpack_neat_pair_aos(r, ci, parts_aos_buffer, tid, local_pack_position, - count_ci, e); - local_pack_position += count_ci; - /* Pack the particle data into CPU-side buffers*/ - // if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j - // %i\n", local_pack_position, count_ci, count_cj); - unpack_neat_pair_aos(r, cj, parts_aos_buffer, tid, local_pack_position, - count_cj, e); - /* Increment pack length accordingly */ - (*pack_length) += count_ci + count_cj; - // if(r->cpuid == 0)exit(0); -} - void runner_do_ci_cj_gpu_unpack_neat_aos_f4( struct runner *r, struct cell *ci, struct cell *cj, struct part_aos_f4_recv *parts_aos_buffer, int timer, int *pack_length, @@ -1425,50 +578,6 @@ void runner_do_ci_cj_gpu_unpack_neat_aos_f4( // if(r->cpuid == 0)exit(0); } -void runner_do_ci_cj_gpu_unpack_neat_aos_g(struct runner *r, struct cell *ci, - struct cell *cj, - struct part_aos_g *parts_aos_buffer, - int timer, int *pack_length, int tid, - int count_max_parts_tmp, - struct engine *e) { - - /* Anything to do here? 
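
/* The retained pair unpack above walks one shared receive buffer in order:
 * ci's particles first, then cj's, advancing a running offset and finally
 * bumping the caller's pack_length by both counts. A hedged sketch of that
 * offset bookkeeping with illustrative names. */
static void unpack_pair_offsets_sketch(int count_ci, int count_cj,
                                       int *pack_length) {
  int offset = *pack_length; /* where this ci/cj pair starts in the buffer */
  /* unpack ci's count_ci particles starting at offset ... */
  offset += count_ci;        /* cj's data follows ci's contiguously */
  /* unpack cj's count_cj particles starting at offset ... */
  *pack_length += count_ci + count_cj;
}
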
*/ - // if (c->hydro.count == 0) - // return; - if (!cell_is_active_hydro(ci, e)) { - message("Inactive cell\n"); - return; - } - int count_ci = ci->hydro.count; - int count_cj = cj->hydro.count; - int local_pack_position = (*pack_length); - -#ifdef SWIFT_DEBUG_CHECKS - if (local_pack_position + count_ci + count_cj >= count_max_parts_tmp) { - fprintf(stderr, - "Exceeded count_max_parts_tmp. Make arrays bigger! pack_length is " - "%i pointer to pack_length is %i, local_pack_position is % i, " - "count is %i\n", - (*pack_length), pack_length, local_pack_position, count_ci, e); - } -#endif - - /* Pack the particle data into CPU-side buffers*/ - // if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j - // %i\n", local_pack_position, count_ci, count_cj); - unpack_neat_pair_aos_g(r, ci, parts_aos_buffer, tid, local_pack_position, - count_ci, e); - local_pack_position += count_ci; - /* Pack the particle data into CPU-side buffers*/ - // if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j - // %i\n", local_pack_position, count_ci, count_cj); - unpack_neat_pair_aos_g(r, cj, parts_aos_buffer, tid, local_pack_position, - count_cj, e); - /* Increment pack length accordingly */ - (*pack_length) += count_ci + count_cj; - // if(r->cpuid == 0)exit(0); -} - void runner_do_ci_cj_gpu_unpack_neat_aos_f4_g( struct runner *r, struct cell *ci, struct cell *cj, struct part_aos_f4_g_recv *parts_aos_buffer, int timer, int *pack_length, @@ -1511,56 +620,6 @@ void runner_do_ci_cj_gpu_unpack_neat_aos_f4_g( // if(r->cpuid == 0)exit(0); } -void runner_do_ci_cj_gpu_unpack_neat_aos_f(struct runner *r, struct cell *ci, - struct cell *cj, - struct part_aos_f *parts_aos_buffer, - int timer, int *pack_length, int tid, - int count_max_parts_tmp, - struct engine *e) { - - /* Anything to do here? */ - // if (c->hydro.count == 0) - // return; - if (!cell_is_active_hydro(ci, e)) { - message("Inactive cell\n"); - return; - } - int count_ci = ci->hydro.count; - int count_cj = cj->hydro.count; - int local_pack_position = (*pack_length); - -#ifdef SWIFT_DEBUG_CHECKS - if (local_pack_position + count_ci + count_cj >= count_max_parts_tmp) { - fprintf(stderr, - "Exceeded count_max_parts_tmp. Make arrays bigger! 
pack_length is " - "%i pointer to pack_length is %i, local_pack_position is % i, " - "count is %i\n", - (*pack_length), pack_length, local_pack_position, count_ci, e); - } -#endif - - /* Pack the particle data into CPU-side buffers*/ - // if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j - // %i\n", local_pack_position, count_ci, count_cj); - unpack_neat_pair_aos_f(r, ci, parts_aos_buffer, tid, local_pack_position, - count_ci, e); - local_pack_position += count_ci; - // for (int i = 0; i < count_ci; i++){ - // struct part *p = &ci->hydro.parts[i]; - // fprintf(stderr, "ax %f, ay %f, az %f, u_dt %f, h_dt %f\n", - // p->a_hydro[0], p->a_hydro[1], p->a_hydro[2], p->u_dt, p->force.h_dt); - // } - // p->viscosity.v_sig = p_tmp.v_sig; - /* Pack the particle data into CPU-side buffers*/ - // if(r->cpuid == 0)fprintf(stderr, "unpacking ci l_pos %i count_i %i count_j - // %i\n", local_pack_position, count_ci, count_cj); - unpack_neat_pair_aos_f(r, cj, parts_aos_buffer, tid, local_pack_position, - count_cj, e); - /* Increment pack length accordingly */ - (*pack_length) += count_ci + count_cj; - // if(r->cpuid == 0)exit(0); -} - void runner_do_ci_cj_gpu_unpack_neat_aos_f4_f( struct runner *r, struct cell *ci, struct cell *cj, struct part_aos_f4_f_recv *parts_aos_buffer, int timer, int *pack_length, @@ -1603,106 +662,6 @@ void runner_do_ci_cj_gpu_unpack_neat_aos_f4_f( // if(r->cpuid == 0)exit(0); } -void runner_dopair_gpu_pack_neat(struct runner *r, struct cell *c, - struct part_soa parts_soa_buffer, int timer, - int *pack_length, int tid, - int count_max_parts_tmp) { - - TIMER_TIC; - - /* Anything to do here? */ - if (c->hydro.count == 0) return; - - int count = c->hydro.count; - int local_pack_position = (*pack_length); - -#ifdef SWIFT_DEBUG_CHECKS - if (local_pack_position + count >= count_max_parts_tmp) { - fprintf(stderr, "Exceeded count_max_parts_tmp. Make arrays bigger!\n"); - exit(0); - } -#endif - - /* Pack the particle data into CPU-side buffers*/ - pack_neat(c, parts_soa_buffer, tid, local_pack_position, count); - /* Increment pack length accordingly */ - (*pack_length) += count; - - if (timer) TIMER_TOC(timer_doself_gpu_pack); -} - -void runner_do_ci_cj_gpu_pack_neat(struct runner *r, struct cell *ci, - struct cell *cj, - struct part_soa parts_soa_buffer, int timer, - int *pack_length, int tid, - int count_max_parts_tmp, int count_ci, - int count_cj) { - - TIMER_TIC; - - /* Anything to do here? */ - if (ci->hydro.count == 0) return; - - int local_pack_position = (*pack_length); - -#ifdef SWIFT_DEBUG_CHECKS - if (local_pack_position + count_ci + count_cj >= 2 * count_max_parts_tmp) { - fprintf(stderr, "Exceeded count_max_parts_tmp. Make arrays bigger!\n"); - exit(0); - } -#endif - - /* Pack the particle data into CPU-side buffers*/ - pack_neat(ci, parts_soa_buffer, tid, local_pack_position, count_ci); - local_pack_position += count_ci; - /* Pack the particle data into CPU-side buffers*/ - pack_neat(cj, parts_soa_buffer, tid, local_pack_position, count_cj); - /* Increment pack length accordingly */ - (*pack_length) += count_ci + count_cj; - - if (timer) TIMER_TOC(timer_doself_gpu_pack); -} - -void runner_do_ci_cj_gpu_pack_neat_aos(struct runner *r, struct cell *ci, - struct cell *cj, - struct part_aos *parts_aos_buffer, - int timer, int *pack_length, int tid, - int count_max_parts_tmp, int count_ci, - int count_cj, float3 shift_tmp) { - - TIMER_TIC; - - /* Anything to do here? 
*/ - if (ci->hydro.count == 0) return; - - int local_pack_position = (*pack_length); - -#ifdef SWIFT_DEBUG_CHECKS - if (local_pack_position + count_ci + count_cj >= 2 * count_max_parts_tmp) { - fprintf(stderr, - "Exceeded count_max_parts_tmp. Make arrays bigger! Pack pos %i" - "ci %i cj %i count_max %i\n", - local_pack_position, count_ci, count_cj, count_max_parts_tmp); - error(); - } -#endif - - /* Pack the particle data into CPU-side buffers*/ - float3 shift_i = {shift_tmp.x + cj->loc[0], shift_tmp.y + cj->loc[1], - shift_tmp.z + cj->loc[2]}; - pack_neat_pair_aos(ci, parts_aos_buffer, tid, local_pack_position, count_ci, - shift_i); - local_pack_position += count_ci; - /* Pack the particle data into CPU-side buffers*/ - float3 shift_j = {cj->loc[0], cj->loc[1], cj->loc[2]}; - pack_neat_pair_aos(cj, parts_aos_buffer, tid, local_pack_position, count_cj, - shift_j); - /* Increment pack length accordingly */ - (*pack_length) += count_ci + count_cj; - - if (timer) TIMER_TOC(timer_doself_gpu_pack); -} - void runner_do_ci_cj_gpu_pack_neat_aos_f4( struct runner *r, struct cell *restrict ci, struct cell *restrict cj, struct part_aos_f4_send *restrict parts_aos_buffer, int timer, @@ -1752,41 +711,6 @@ void runner_do_ci_cj_gpu_pack_neat_aos_f4( if (timer) TIMER_TOC(timer_doself_gpu_pack); } -void runner_do_ci_cj_gpu_pack_neat_aos_g(struct runner *r, struct cell *ci, - struct cell *cj, - struct part_aos_g *parts_aos_buffer, - int timer, int *pack_length, int tid, - int count_max_parts_tmp, int count_ci, - int count_cj) { - - TIMER_TIC; - - /* Anything to do here? */ - if (ci->hydro.count == 0) return; - - int local_pack_position = (*pack_length); - -#ifdef SWIFT_DEBUG_CHECKS - if (local_pack_position + count_ci + count_cj >= 2 * count_max_parts_tmp) { - fprintf(stderr, - "Exceeded count_max_parts_tmp. Make arrays bigger! Pack pos %i" - "ci %i cj %i count_max %i\n", - local_pack_position, count_ci, count_cj, count_max_parts_tmp); - error(); - } -#endif - - /* Pack the particle data into CPU-side buffers*/ - pack_neat_aos_g(ci, parts_aos_buffer, tid, local_pack_position, count_ci); - local_pack_position += count_ci; - /* Pack the particle data into CPU-side buffers*/ - pack_neat_aos_g(cj, parts_aos_buffer, tid, local_pack_position, count_cj); - /* Increment pack length accordingly */ - (*pack_length) += count_ci + count_cj; - - if (timer) TIMER_TOC(timer_doself_gpu_pack); -} - void runner_do_ci_cj_gpu_pack_neat_aos_f4_g( struct runner *r, struct cell *restrict ci, struct cell *restrict cj, struct part_aos_f4_g_send *restrict parts_aos_buffer, int timer, @@ -1836,41 +760,6 @@ void runner_do_ci_cj_gpu_pack_neat_aos_f4_g( if (timer) TIMER_TOC(timer_doself_gpu_pack); } -void runner_do_ci_cj_gpu_pack_neat_aos_f(struct runner *r, struct cell *ci, - struct cell *cj, - struct part_aos_f *parts_aos_buffer, - int timer, int *pack_length, int tid, - int count_max_parts_tmp, int count_ci, - int count_cj) { - - TIMER_TIC; - - /* Anything to do here? */ - if (ci->hydro.count == 0) return; - - int local_pack_position = (*pack_length); - -#ifdef SWIFT_DEBUG_CHECKS - if (local_pack_position + count_ci + count_cj >= 2 * count_max_parts_tmp) { - fprintf(stderr, - "Exceeded count_max_parts_tmp. Make arrays bigger! 
Pack pos %i" - "ci %i cj %i count_max %i\n", - local_pack_position, count_ci, count_cj, count_max_parts_tmp); - error(); - } -#endif - - /* Pack the particle data into CPU-side buffers*/ - pack_neat_aos_f(ci, parts_aos_buffer, tid, local_pack_position, count_ci); - local_pack_position += count_ci; - /* Pack the particle data into CPU-side buffers*/ - pack_neat_aos_f(cj, parts_aos_buffer, tid, local_pack_position, count_cj); - /* Increment pack length accordingly */ - (*pack_length) += count_ci + count_cj; - - if (timer) TIMER_TOC(timer_doself_gpu_pack); -} - void runner_do_ci_cj_gpu_pack_neat_aos_f4_f( struct runner *r, struct cell *restrict ci, struct cell *restrict cj, struct part_aos_f4_f_send *restrict parts_aos_buffer, int timer, @@ -1919,247 +808,6 @@ void runner_do_ci_cj_gpu_pack_neat_aos_f4_f( if (timer) TIMER_TOC(timer_doself_gpu_pack); } - -void runner_doself1_gpu_pack( - struct runner *r, struct cell *c, int timer, int *pack_length, double *x_p, - double *y_p, double *z_p, int tid, int *tid_p, long long *id, float *ux, - float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, - float *mass, float *h, float *u, float *u_dt, float *rho, float *SPH_sum, - float *locx, float *locy, float *locz, float *widthx, float *widthy, - float *widthz, float *h_max, int *count_p, float *wcount, float *wcount_dh, - float *rho_dh, float *rot_u, float *rot_v, float *rot_w, float *div_v, - float *div_v_previous_step, float *alpha_visc, float *v_sig, - float *laplace_u, float *alpha_diff, float *f, float *soundspeed, - float *h_dt, float *balsara, float *pressure, float *alpha_visc_max_ngb, - timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, - char *to_be_synchronized, int count_max_parts_tmp) { - - TIMER_TIC; - // fprintf(stderr,"Entered outer packing code!\n"); - - /* Anything to do here? */ - if (c->hydro.count == 0) return; - /* Recurse? */ - // if (c->split) { - //// fprintf(stderr,"Entered recursive packing code!\n"); - // for (int k = 0; k < 8; k++){ - // if (c->progeny[k] != NULL){ - // runner_doself1_gpu_pack(r, c, timer, pack_length, - // x_p, y_p, z_p, tid, tid_p, id, ux, uy, uz, a_hydrox, - // a_hydroy, a_hydroz, mass, h, u, u_dt, rho, SPH_sum, locx, - // locy, locz, widthx, widthy, widthz, h_max, count_p, wcount, - // wcount_dh, rho_dh, rot_u, rot_v, rot_w, div_v, - // div_v_previous_step, alpha_visc, v_sig, laplace_u, alpha_diff, f, - // soundspeed, h_dt, balsara, pressure, alpha_visc_max_ngb, time_bin, - // wakeup, min_ngb_time_bin, to_be_synchronized, - // count_max_parts_tmp, fgpuin); fprintf(stderr,"working on a split - // cell\n"); - // } - // } - // } - // else { - // fprintf(stderr,"Entered inner packing code!\n"); - int count = c->hydro.count; - int local_pack_position = (*pack_length); - if (local_pack_position + count >= count_max_parts_tmp) { - fprintf(stderr, "Exceeded count_max_parts_tmp. 
Make arrays bigger!\n"); - exit(0); - } - // Pack the particle data - pack(c, x_p, y_p, z_p, tid, tid_p, id, ux, uy, uz, a_hydrox, a_hydroy, - a_hydroz, mass, h, u, u_dt, rho, SPH_sum, locx, locy, locz, widthx, - widthy, widthz, h_max, count_p, wcount, wcount_dh, rho_dh, rot_u, rot_v, - rot_w, div_v, div_v_previous_step, alpha_visc, v_sig, laplace_u, - alpha_diff, f, soundspeed, h_dt, balsara, pressure, alpha_visc_max_ngb, - time_bin, wakeup, min_ngb_time_bin, to_be_synchronized, - local_pack_position, count); - // Increment pack length accordingly - (*pack_length) += count; - // } - if (timer) TIMER_TOC(timer_doself_gpu_pack); -} - -void pack(struct cell *c, double *x_p, double *y_p, double *z_p, int tid, - int *tid_p, long long *id, float *ux, float *uy, float *uz, - float *a_hydrox, float *a_hydroy, float *a_hydroz, float *mass, - float *h, float *u, float *u_dt, float *rho, float *SPH_sum, - float *locx, float *locy, float *locz, float *widthx, float *widthy, - float *widthz, float *h_max, int *count_p, float *wcount, - float *wcount_dh, float *rho_dh, float *rot_u, float *rot_v, - float *rot_w, float *div_v, float *div_v_previous_step, - float *alpha_visc, float *v_sig, float *laplace_u, float *alpha_diff, - float *f, float *soundspeed, float *h_dt, float *balsara, - float *pressure, float *alpha_visc_max_ngb, timebin_t *time_bin, - timebin_t *wakeup, timebin_t *min_ngb_time_bin, - char *to_be_synchronized, int local_pack_position, int count) { - - const struct part *ptmps; - ptmps = c->hydro.parts; - for (int i = 0; i < count; i++) { - int id_in_pack = i + local_pack_position; - x_p[id_in_pack] = ptmps[i].x[0]; - y_p[id_in_pack] = ptmps[i].x[1]; - z_p[id_in_pack] = ptmps[i].x[2]; - // id[id_in_pack]=ptmps[i].id; - // count_p[id_in_pack]=count; - tid_p[id_in_pack] = tid; - // h_max[id_in_pack]=c->hydro.h_max; - ux[id_in_pack] = ptmps[i].v[0]; - uy[id_in_pack] = ptmps[i].v[1]; - uz[id_in_pack] = ptmps[i].v[2]; - // a_hydrox[id_in_pack]=ptmps[i].a_hydro[0]; - // a_hydroy[id_in_pack]=ptmps[i].a_hydro[1]; - // a_hydroz[id_in_pack]=ptmps[i].a_hydro[2]; - locx[id_in_pack] = c->loc[0]; - locy[id_in_pack] = c->loc[1]; - locz[id_in_pack] = c->loc[2]; - mass[id_in_pack] = ptmps[i].mass; - h[id_in_pack] = ptmps[i].h; - // u[id_in_pack]=ptmps[i].u; - // u_dt[id_in_pack]=ptmps[i].u_dt; - ////////////////////////////////////////////////////// - rho[id_in_pack] = 0.f; // ptmps[i].rho; - ///////////////////////////////////////////////////// - // div_v_previous_step[id_in_pack]=ptmps[i].viscosity.div_v_previous_step; - // alpha_visc[id_in_pack]=ptmps[i].viscosity.alpha; - // v_sig[id_in_pack]=ptmps[i].viscosity.v_sig; - // laplace_u[id_in_pack]=ptmps[i].diffusion.laplace_u; - // alpha_diff[id_in_pack]=ptmps[i].diffusion.alpha; - // f[id_in_pack]=ptmps[i].force.f; - // soundspeed[id_in_pack]=ptmps[i].force.soundspeed; - // h_dt[id_in_pack]=ptmps[i].force.h_dt; - // balsara[id_in_pack]=ptmps[i].force.balsara; - // pressure[id_in_pack]=ptmps[i].force.pressure; - // time_bin[id_in_pack] = ptmps[i].time_bin; - // wakeup[id_in_pack]=ptmps[i].limiter_data.wakeup; - // min_ngb_time_bin[id_in_pack]=ptmps[i].limiter_data.min_ngb_time_bin; - // to_be_synchronized[id_in_pack]=ptmps[i].limiter_data.to_be_synchronized; - /////////////////////////////////////////////////////////////////// - wcount[id_in_pack] = 0.f; // ptmps[i].density.wcount; - wcount_dh[id_in_pack] = 0.f; // ptmps[i].density.wcount_dh; - rho_dh[id_in_pack] = 0.f; // ptmps[i].density.rho_dh; - div_v[id_in_pack] = 0.f; // ptmps[i].viscosity.div_v; - 
rot_u[id_in_pack] = 0.f; // ptmps[i].density.rot_v[0]; - rot_v[id_in_pack] = 0.f; // ptmps[i].density.rot_v[1]; - rot_w[id_in_pack] = 0.f; // ptmps[i].density.rot_v[2]; - /////////////////////////////////////////////////////////////////// - } -} - -void runner_doself1_gpu_unpack( - struct runner *r, struct cell *c, int timer, int *pack_length, double *x_p, - double *y_p, double *z_p, int tid, int *tid_p, long long *id, float *ux, - float *uy, float *uz, float *a_hydrox, float *a_hydroy, float *a_hydroz, - float *mass, float *h, float *u, float *u_dt, float *rho, float *SPH_sum, - float *locx, float *locy, float *locz, float *widthx, float *widthy, - float *widthz, float *h_max, int *count_p, float *wcount, float *wcount_dh, - float *rho_dh, float *rot_u, float *rot_v, float *rot_w, float *div_v, - float *div_v_previous_step, float *alpha_visc, float *v_sig, - float *laplace_u, float *alpha_diff, float *f, float *soundspeed, - float *h_dt, float *balsara, float *pressure, float *alpha_visc_max_ngb, - timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, - char *to_be_synchronized, int count_max_parts_tmp, struct engine *e) { - TIMER_TIC; - // fprintf(stderr, "got into pack function\n"); - /* Anything to do here? */ - if (c->hydro.count == 0) return; - if (!cell_is_active_hydro(c, e)) return; - /* Anything to do here? */ - /* Recurse? */ - // if (c->split) { - // fprintf(stderr,"working on a split cell\n"); - // for (int k = 0; k < 8; k++){ - // if (c->progeny[k] != NULL){ - // runner_doself1_gpu_unpack(r, c, timer, pack_length, - // x_p, y_p, z_p, tid, tid_p, id, ux, uy, uz, a_hydrox, - // a_hydroy, a_hydroz, mass, h, u, u_dt, rho, SPH_sum, locx, - // locy, locz, widthx, widthy, widthz, h_max, count_p, wcount, - // wcount_dh, rho_dh, rot_u, rot_v, rot_w, div_v, - // div_v_previous_step, alpha_visc, v_sig, laplace_u, alpha_diff, f, - // soundspeed, h_dt, balsara, pressure, alpha_visc_max_ngb, time_bin, - // wakeup, min_ngb_time_bin, to_be_synchronized, - // count_max_parts_tmp, fgpuin); fprintf(stderr,"working on a split - // cell\n"); - // } - // } - // } else { - int count = c->hydro.count; - int local_pack_position = (*pack_length); - if (local_pack_position + count >= count_max_parts_tmp) { - fprintf(stderr, - "Exceeded count_max_parts_tmp. Make arrays bigger! 
pack_length is " - "%i, local_pack_position is % i, " - "count is %i\n", - (*pack_length), local_pack_position, count); - // exit(0); - } - // Pack the particle data - unpack(c, x_p, y_p, z_p, tid, tid_p, id, ux, uy, uz, a_hydrox, a_hydroy, - a_hydroz, mass, h, u, u_dt, rho, SPH_sum, locx, locy, locz, widthx, - widthy, widthz, h_max, count_p, wcount, wcount_dh, rho_dh, rot_u, - rot_v, rot_w, div_v, div_v_previous_step, alpha_visc, v_sig, laplace_u, - alpha_diff, f, soundspeed, h_dt, balsara, pressure, alpha_visc_max_ngb, - time_bin, wakeup, min_ngb_time_bin, to_be_synchronized, - local_pack_position, count, e); - // for (int i = *pack_length; i < count+*pack_length; i++) { - // for (int i = 0; i < count; i++) { - // message("wcount is %f", c->hydro.parts[i].density.wcount); - // } - // Increment pack length accordingly - (*pack_length) += count; - // } - if (timer) TIMER_TOC(timer_doself_gpu_pack); -} - -void unpack(struct cell *c, double *x_p, double *y_p, double *z_p, int tid, - int *tid_p, long long *id, float *ux, float *uy, float *uz, - float *a_hydrox, float *a_hydroy, float *a_hydroz, float *mass, - float *h, float *u, float *u_dt, float *rho, float *SPH_sum, - float *locx, float *locy, float *locz, float *widthx, float *widthy, - float *widthz, float *h_max, int *count_p, float *wcount, - float *wcount_dh, float *rho_dh, float *rot_u, float *rot_v, - float *rot_w, float *div_v, float *div_v_previous_step, - float *alpha_visc, float *v_sig, float *laplace_u, - float *alpha_diff, float *f, float *soundspeed, float *h_dt, - float *balsara, float *pressure, float *alpha_visc_max_ngb, - timebin_t *time_bin, timebin_t *wakeup, timebin_t *min_ngb_time_bin, - char *to_be_synchronized, int local_pack_position, int count, - struct engine *e) { - - // struct part *ptmps; - // ptmps=c->hydro.parts; - for (int i = 0; i < count; i++) { - int id_in_pack = i + local_pack_position; - struct part *pi = &c->hydro.parts[i]; - if (part_is_inhibited(pi, e)) { - fprintf(stderr, "inhibited part\n"); - continue; - } - const int pi_active = part_is_active(pi, e); - if (!pi_active) - fprintf(stderr, "Inactive part\n"); - else if (pi_active) { - // c->hydro.parts[i].rho = rho[id_in_pack]; - // c->hydro.parts[i].viscosity.div_v = div_v[id_in_pack]; - // c->hydro.parts[i].density.rho_dh = rho_dh[id_in_pack]; - // c->hydro.parts[i].density.wcount = wcount[id_in_pack]; - // c->hydro.parts[i].density.wcount_dh = wcount_dh[id_in_pack]; - // c->hydro.parts[i].density.rot_v[0] = rot_u[id_in_pack]; - // c->hydro.parts[i].density.rot_v[1] = rot_v[id_in_pack]; - // c->hydro.parts[i].density.rot_v[2] = rot_w[id_in_pack]; - pi->rho += rho[id_in_pack]; - pi->viscosity.div_v += div_v[id_in_pack]; - pi->density.rho_dh += rho_dh[id_in_pack]; - pi->density.wcount += wcount[id_in_pack]; - pi->density.wcount_dh += wcount_dh[id_in_pack]; - pi->density.rot_v[0] += rot_u[id_in_pack]; - pi->density.rot_v[1] += rot_v[id_in_pack]; - pi->density.rot_v[2] += rot_w[id_in_pack]; - } - // else fprintf(stderr,"a part is not active\n"); - } - // c->hydro.parts=ptmps; -} // #ifdef WITHCUDA // } // #endif diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index d5dd654b2e..7b551f03ac 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -941,13 +941,14 @@ void *runner_main2(void *data) { /*Call recursion here. This will be a function in runner_doiact_functions_hydro_gpu.h. 
* We are recursing separately to find out how much work we have before offloading*/ //We need to allocate a list to put cell pointers into for each new task - int n_expected_tasks = 1024; + int n_expected_tasks = 1024; //A. Nasar: Need to come up with a good estimate for this int n_leafs_found = 0; int depth = 0; struct cell * cells_left[n_expected_tasks]; struct cell * cells_right[n_expected_tasks]; runner_recurse_gpu(r, sched, pack_vars_pair_dens, ci, cj, t, - parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens, &n_leafs_found, cells_left, cells_right, depth); + parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens, &n_leafs_found, + cells_left, cells_right, depth, n_expected_tasks); n_leafs_total += n_leafs_found; int cstart = 0, cend = n_leafs_found; From cfa5861b18b62a68732e6c050bb31b6991d1ebb1 Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Thu, 6 Mar 2025 15:35:55 +0000 Subject: [PATCH 200/217] Edited decrements of the recurse, pack, unpack loops in runner_main2 --- src/runner_main_clean.cu | 101 ++++++++++++++++++--------------------- 1 file changed, 46 insertions(+), 55 deletions(-) diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 7b551f03ac..1c8fe3a9b1 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -931,64 +931,55 @@ void *runner_main2(void *data) { packed_pair++; #ifdef GPUOFFLOAD_DENSITY - ticks tic_cpu_pack = getticks(); - n_cells_p_d++; - maxcount = max(maxcount, ci->hydro.count); - if (ci->hydro.count > 1.5 * np_per_cell) { - n_w_prts_gtr_target_p_d++; - } - /////////////////////W.I.P!!!//////////////////////////////////////////////////////// - /*Call recursion here. This will be a function in runner_doiact_functions_hydro_gpu.h. - * We are recursing separately to find out how much work we have before offloading*/ - //We need to allocate a list to put cell pointers into for each new task - int n_expected_tasks = 1024; //A. Nasar: Need to come up with a good estimate for this - int n_leafs_found = 0; - int depth = 0; - struct cell * cells_left[n_expected_tasks]; - struct cell * cells_right[n_expected_tasks]; - runner_recurse_gpu(r, sched, pack_vars_pair_dens, ci, cj, t, + ticks tic_cpu_pack = getticks(); + n_cells_p_d++; + maxcount = max(maxcount, ci->hydro.count); + if (ci->hydro.count > 1.5 * np_per_cell) { + n_w_prts_gtr_target_p_d++; + } + /////////////////////W.I.P!!!//////////////////////////////////////////////////////// + /*Call recursion here. This will be a function in runner_doiact_functions_hydro_gpu.h. + * We are recursing separately to find out how much work we have before offloading*/ + //We need to allocate a list to put cell pointers into for each new task + int n_expected_tasks = 1024; //A. 
Nasar: Need to come up with a good estimate for this + int n_leafs_found = 0; + int depth = 0; + struct cell * cells_left[n_expected_tasks]; + struct cell * cells_right[n_expected_tasks]; + runner_recurse_gpu(r, sched, pack_vars_pair_dens, ci, cj, t, parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens, &n_leafs_found, cells_left, cells_right, depth, n_expected_tasks); - n_leafs_total += n_leafs_found; - - int cstart = 0, cend = n_leafs_found; - - int cid = 0; - pack_vars_pair_dens->task_locked = 1; - int top_tasks_packed = pack_vars_pair_dens->top_tasks_packed; - pack_vars_pair_dens->top_tasks_packed++; - pack_vars_pair_dens->top_task_list[top_tasks_packed] = t; - int t_s, t_e; - t_s = 0; - while(cid < n_leafs_found){ - ////////////////////////////////////////////////////////////////////////////////// - /*Loop through n_daughters such that the pack_vars_pair_dens counters are updated*/ - for (cid = cstart; pack_vars_pair_dens->tasks_packed < pack_vars_pair_dens->target_n_tasks - && cid < n_leafs_found; cid++){ - + n_leafs_total += n_leafs_found; + + int cstart = 0, cend = n_leafs_found; + + int cid = 0; + pack_vars_pair_dens->task_locked = 1; + int top_tasks_packed = pack_vars_pair_dens->top_tasks_packed; + pack_vars_pair_dens->top_tasks_packed++; + pack_vars_pair_dens->top_task_list[top_tasks_packed] = t; + int t_s, t_e; + t_s = 0; + while(cid < n_leafs_found){ + ////////////////////////////////////////////////////////////////////////////////// + /*Loop through n_daughters such that the pack_vars_pair_dens counters are updated*/ + for (cid = cstart; pack_vars_pair_dens->tasks_packed < pack_vars_pair_dens->target_n_tasks + && cid < n_leafs_found; cid++){ packing_time_pair += runner_dopair1_pack_f4( r, sched, pack_vars_pair_dens, cells_left[cid], cells_right[cid], t, parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens); -// if (pack_vars_pair_dens->unfinished) -// break; -// message("Packing task %i in recursed tasks\n", cid); - } - /* Copies done. Release the lock ! */ - pack_vars_pair_dens->task_locked = 0; -// if(cid == n_leafs_found){ -// cell_unlocktree(ci); -// cell_unlocktree(cj); -// pack_vars_pair_dens->task_locked = 0; -// } - cstart = cid + 1; - t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; - /* Packed enough tasks or no pack tasks left in queue, flag that - * we want to run */ - int launch = pack_vars_pair_dens->launch; - int launch_leftovers = pack_vars_pair_dens->launch_leftovers; - - /* Do we have enough stuff to run the GPU ? */ - if (launch) n_full_p_d_bundles++; + } + /* Copies done. Release the lock ! */ + pack_vars_pair_dens->task_locked = 0; + cstart = cid + 1; + t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; + /* Packed enough tasks or no pack tasks left in queue, flag that + * we want to run */ + int launch = pack_vars_pair_dens->launch; + int launch_leftovers = pack_vars_pair_dens->launch_leftovers; + + /* Do we have enough stuff to run the GPU ? 
*/ + if (launch) n_full_p_d_bundles++; if (launch_leftovers) n_partial_p_d_bundles++; if (launch || launch_leftovers) { @@ -1004,13 +995,13 @@ void *runner_main2(void *data) { pair_end); for (int tid = 0; tid < pack_vars_pair_dens->top_tasks_packed -1; tid++){ /*schedule my dependencies (Only unpacks really)*/ - struct task *tii = pack_vars_pair_dens->top_task_list[tid]; + struct task *tii = pack_vars_pair_dens->top_task_list[tid]; enqueue_dependencies(sched, tii); } pack_vars_pair_dens->top_tasks_packed = 1; pack_vars_pair_dens->top_task_list[0] = t; } - /////////////////////////////////////////////////////////////////////// + /////////////////////////////////////////////////////////////////////// } cell_unlocktree(ci); cell_unlocktree(cj); From 922d526ffb0dc59497ad43b732f37d3cb0d3c79e Mon Sep 17 00:00:00 2001 From: Abouzied Nasar Date: Thu, 6 Mar 2025 16:19:31 +0000 Subject: [PATCH 201/217] Found one bug in how we were incrementing cstart. Code still complains about negative wait though when using recursion through pair tasks --- .../HydroTests/GreshoVortex_3D/gresho.yml | 2 +- src/runner_main_clean.cu | 47 +++++++++---------- 2 files changed, 24 insertions(+), 25 deletions(-) diff --git a/examples/HydroTests/GreshoVortex_3D/gresho.yml b/examples/HydroTests/GreshoVortex_3D/gresho.yml index 15709ccaf3..409b825441 100644 --- a/examples/HydroTests/GreshoVortex_3D/gresho.yml +++ b/examples/HydroTests/GreshoVortex_3D/gresho.yml @@ -7,7 +7,7 @@ InternalUnitSystem: UnitTemp_in_cgs: 1 # Kelvin Scheduler: - max_top_level_cells: 16 + max_top_level_cells: 32 tasks_per_cell: 200 # deadlock_waiting_time_s: 10 # cell_split_size: 100 diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 1c8fe3a9b1..6c461fed78 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -960,55 +960,54 @@ void *runner_main2(void *data) { pack_vars_pair_dens->top_task_list[top_tasks_packed] = t; int t_s, t_e; t_s = 0; + int n_t_tasks = pack_vars_pair_dens->target_n_tasks; while(cid < n_leafs_found){ ////////////////////////////////////////////////////////////////////////////////// /*Loop through n_daughters such that the pack_vars_pair_dens counters are updated*/ - for (cid = cstart; pack_vars_pair_dens->tasks_packed < pack_vars_pair_dens->target_n_tasks - && cid < n_leafs_found; cid++){ + for (cid = cstart; pack_vars_pair_dens->tasks_packed < n_t_tasks && cid < n_leafs_found; cid++){ packing_time_pair += runner_dopair1_pack_f4( r, sched, pack_vars_pair_dens, cells_left[cid], cells_right[cid], t, parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens); + if(pack_vars_pair_dens->count_parts > count_max_parts_tmp) + error("Packed more parts than possible"); } /* Copies done. Release the lock ! */ pack_vars_pair_dens->task_locked = 0; - cstart = cid + 1; + cstart = cid; t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; /* Packed enough tasks or no pack tasks left in queue, flag that * we want to run */ int launch = pack_vars_pair_dens->launch; int launch_leftovers = pack_vars_pair_dens->launch_leftovers; - /* Do we have enough stuff to run the GPU ? 
*/ if (launch) n_full_p_d_bundles++; - if (launch_leftovers) n_partial_p_d_bundles++; + if (launch_leftovers) n_partial_p_d_bundles++; - if (launch || launch_leftovers) { - /*Launch GPU tasks*/ - int t_packed = pack_vars_pair_dens->tasks_packed; - // signal_sleeping_runners(sched, t, t_packed); - runner_dopair1_launch_f4_one_memcpy( + if (launch || launch_leftovers) { + /*Launch GPU tasks*/ + int t_packed = pack_vars_pair_dens->tasks_packed; + runner_dopair1_launch_f4_one_memcpy( r, sched, pack_vars_pair_dens, t, parts_aos_pair_f4_send, parts_aos_pair_f4_recv, d_parts_aos_pair_f4_send, d_parts_aos_pair_f4_recv, stream_pairs, d_a, d_H, e, &packing_time_pair, &time_for_density_gpu_pair, &unpacking_time_pair, fparti_fpartj_lparti_lpartj_dens, pair_end); - for (int tid = 0; tid < pack_vars_pair_dens->top_tasks_packed -1; tid++){ - /*schedule my dependencies (Only unpacks really)*/ - struct task *tii = pack_vars_pair_dens->top_task_list[tid]; - enqueue_dependencies(sched, tii); - } - pack_vars_pair_dens->top_tasks_packed = 1; - pack_vars_pair_dens->top_task_list[0] = t; + for (int tid = 0; tid < pack_vars_pair_dens->top_tasks_packed -1; tid++){ + /*schedule my dependencies (Only unpacks really)*/ + struct task *tii = pack_vars_pair_dens->top_task_list[tid]; + enqueue_dependencies(sched, tii); } - /////////////////////////////////////////////////////////////////////// + pack_vars_pair_dens->top_tasks_packed = 1; + pack_vars_pair_dens->top_task_list[0] = t; } - cell_unlocktree(ci); - cell_unlocktree(cj); - pack_vars_pair_dens->task_locked = 0; - pack_vars_pair_dens->launch_leftovers = 0; - - /////////////////////W.I.P!!!//////////////////////////////////////////////////////// + /////////////////////////////////////////////////////////////////////// + } + cell_unlocktree(ci); + cell_unlocktree(cj); + pack_vars_pair_dens->task_locked = 0; + pack_vars_pair_dens->launch_leftovers = 0; + /////////////////////W.I.P!!!//////////////////////////////////////////////////////// #endif // GPUOFFLOAD_DENSITY } /* pair / pack */ From 2ed1204265fceaa786832606247959283ef49b32 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Fri, 7 Mar 2025 22:34:35 +0000 Subject: [PATCH 202/217] Another bug. I was decremeting s->waiting for all leaf tasks, should only have been done once for the top level task. 
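(Editorial illustration, not part of the original patch: the accounting rule stated above, that the scheduler's waiting counter must be decremented once per completed top-level pair task rather than once per packed leaf, shown as a minimal, self-contained C sketch. The sched_stub struct, mark_top_task_done() and the counts are invented stand-ins; only the lock, decrement, broadcast pattern mirrors what this patch does with sched->sleep_mutex, atomic_dec(&sched->waiting) and sched->sleep_cond.)

/* Toy model of the fix: many leaf pairs are packed and offloaded under one
 * top-level task, but the scheduler's waiting counter is decremented exactly
 * once, under the sleep mutex, followed by a broadcast so sleeping runners
 * re-check their queues. Decrementing per leaf is what drives the counter
 * negative. */
#include <pthread.h>
#include <stdio.h>

struct sched_stub {
  int waiting;                 /* top-level tasks not yet resolved */
  pthread_mutex_t sleep_mutex;
  pthread_cond_t sleep_cond;
};

/* Call exactly once per finished top-level pair task. */
static void mark_top_task_done(struct sched_stub *s) {
  pthread_mutex_lock(&s->sleep_mutex);
  s->waiting--;                /* the real code uses atomic_dec(&sched->waiting) */
  pthread_cond_broadcast(&s->sleep_cond);
  pthread_mutex_unlock(&s->sleep_mutex);
}

int main(void) {
  struct sched_stub s;
  s.waiting = 3;               /* pretend three top-level tasks are outstanding */
  pthread_mutex_init(&s.sleep_mutex, NULL);
  pthread_cond_init(&s.sleep_cond, NULL);

  const int n_leaves = 8;      /* leaf pairs packed under one top-level task */
  for (int leaf = 0; leaf < n_leaves; leaf++) {
    /* pack / offload / unpack each leaf: no decrement here */
  }
  mark_top_task_done(&s);      /* one decrement for the whole top-level task */

  printf("waiting = %d\n", s.waiting); /* 2; decrementing per leaf would give -5 */
  return 0;
}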
Moved decrementing from the GPU launch function into runner_main so that s->waiting only decremented once per top level task --- examples/HydroTests/GreshoVortex_3D/gresho.yml | 2 +- src/runner_doiact_functions_hydro_gpu.h | 6 ------ src/runner_main_clean.cu | 10 +++++----- 3 files changed, 6 insertions(+), 12 deletions(-) diff --git a/examples/HydroTests/GreshoVortex_3D/gresho.yml b/examples/HydroTests/GreshoVortex_3D/gresho.yml index 409b825441..c87ec26a18 100644 --- a/examples/HydroTests/GreshoVortex_3D/gresho.yml +++ b/examples/HydroTests/GreshoVortex_3D/gresho.yml @@ -40,4 +40,4 @@ SPH: InitialConditions: file_name: greshoVortex.hdf5 periodic: 1 - replicate: 2 + #replicate: 2 diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index afac4d3b11..28d1951485 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -1565,12 +1565,6 @@ void runner_dopair1_launch_f4_one_memcpy( } } - - pthread_mutex_lock(&s->sleep_mutex); - atomic_sub(&s->waiting, pack_vars->top_tasks_packed); - pthread_cond_broadcast(&s->sleep_cond); - pthread_mutex_unlock(&s->sleep_mutex); - /* Zero counters for the next pack operations */ pack_vars->count_parts = 0; pack_vars->tasks_packed = 0; diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 6c461fed78..fceaddba47 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -932,11 +932,7 @@ void *runner_main2(void *data) { #ifdef GPUOFFLOAD_DENSITY ticks tic_cpu_pack = getticks(); - n_cells_p_d++; - maxcount = max(maxcount, ci->hydro.count); - if (ci->hydro.count > 1.5 * np_per_cell) { - n_w_prts_gtr_target_p_d++; - } + /////////////////////W.I.P!!!//////////////////////////////////////////////////////// /*Call recursion here. This will be a function in runner_doiact_functions_hydro_gpu.h. * We are recursing separately to find out how much work we have before offloading*/ @@ -997,6 +993,10 @@ void *runner_main2(void *data) { /*schedule my dependencies (Only unpacks really)*/ struct task *tii = pack_vars_pair_dens->top_task_list[tid]; enqueue_dependencies(sched, tii); + pthread_mutex_lock(&sched->sleep_mutex); + atomic_dec(&sched->waiting); + pthread_cond_broadcast(&sched->sleep_cond); + pthread_mutex_unlock(&sched->sleep_mutex); } pack_vars_pair_dens->top_tasks_packed = 1; pack_vars_pair_dens->top_task_list[0] = t; From a3d24c51ce8af4bef79f9788472d7b9d7011667d Mon Sep 17 00:00:00 2001 From: Abouzied Date: Sat, 8 Mar 2025 03:56:24 +0000 Subject: [PATCH 203/217] I think the algorithm is there now. 
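(Editorial illustration, not part of the original patch: the pack, launch, unpack pipeline that this patch separates into distinct steps, reduced to a self-contained CUDA toy. The particle structs, density_kernel and all sizes below are invented for illustration; only the three-phase shape, packing into host buffers, a single asynchronous host-to-device copy plus kernel plus device-to-host copy per launch on one stream, and an unpack that runs only after the stream has been synchronised, reflects what runner_dopair1_pack_f4, runner_dopair1_launch_f4_one_memcpy and the new unpack step are doing in the surrounding patches.)

/* Toy pack -> launch -> unpack pipeline on one CUDA stream. */
#include <cuda_runtime.h>
#include <stdio.h>

struct part_send { float x, y, z, h; };
struct part_recv { float rho; };

__global__ void density_kernel(const part_send *in, part_recv *out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i].rho = in[i].h; /* placeholder for the real interaction */
}

int main(void) {
  const int n = 1024;                      /* particles packed from several leaf pairs */
  part_send *send_h, *send_d;
  part_recv *recv_h, *recv_d;
  cudaMallocHost((void **)&send_h, n * sizeof(part_send)); /* pinned host buffers */
  cudaMallocHost((void **)&recv_h, n * sizeof(part_recv));
  cudaMalloc((void **)&send_d, n * sizeof(part_send));
  cudaMalloc((void **)&recv_d, n * sizeof(part_recv));
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  /* 1) pack: copy particle data from the cells into send_h */
  for (int i = 0; i < n; i++) {
    send_h[i].x = send_h[i].y = send_h[i].z = 0.f;
    send_h[i].h = 1.f;
  }

  /* 2) launch: one H2D copy, the kernel and one D2H copy, queued on the stream */
  cudaMemcpyAsync(send_d, send_h, n * sizeof(part_send), cudaMemcpyHostToDevice, stream);
  density_kernel<<<(n + 255) / 256, 256, 0, stream>>>(send_d, recv_d, n);
  cudaMemcpyAsync(recv_h, recv_d, n * sizeof(part_recv), cudaMemcpyDeviceToHost, stream);

  /* 3) unpack: only after the stream has finished may the results be scattered
   *    back into the cells (the step the cell locks are meant to protect) */
  cudaStreamSynchronize(stream);
  printf("rho[0] = %f\n", (double)recv_h[0].rho);

  cudaFreeHost(send_h); cudaFreeHost(recv_h);
  cudaFree(send_d); cudaFree(recv_d);
  cudaStreamDestroy(stream);
  return 0;
}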
I split the GPU launch from the unpacking but I still need to sort out the counters for all eventualities --- src/runner_doiact_functions_hydro_gpu.h | 192 +++++++++++++++++------- src/runner_main_clean.cu | 96 ++++++++---- 2 files changed, 200 insertions(+), 88 deletions(-) diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index 28d1951485..8bd1a27eb9 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -41,6 +41,7 @@ struct pack_vars_pair { /*List of tasks and respective cells to be packed*/ struct task **task_list; struct task **top_task_list; + struct task ****leaf_task_list; struct cell **ci_list; struct cell **cj_list; /*List of cell shifts*/ @@ -340,7 +341,7 @@ double runner_dopair1_pack_f4(struct runner *r, struct scheduler *s, struct cell *ci, struct cell *cj, struct task *t, struct part_aos_f4_send *parts_send, struct engine *e, - int4 *fparti_fpartj_lparti_lpartj) { + int4 *fparti_fpartj_lparti_lpartj, int leaves_packed) { /* Timers for how long this all takes. * t0 and t1 are from start to finish including GPU calcs * tp0 and tp1 only time packing and unpacking*/ @@ -361,7 +362,8 @@ double runner_dopair1_pack_f4(struct runner *r, struct scheduler *s, space_getsid_GPU(e->s, &ci, &cj, &x_tmp, &y_tmp, &z_tmp); /*Get pointers to the list of tasks and cells packed*/ - pack_vars->task_list[tasks_packed] = t; +// pack_vars->task_list[tasks_packed] = t; + pack_vars->leaf_task_list[pack_vars->top_tasks_packed][leaves_packed] = t; pack_vars->ci_list[tasks_packed] = ci; pack_vars->cj_list[tasks_packed] = cj; @@ -403,7 +405,6 @@ double runner_dopair1_pack_f4(struct runner *r, struct scheduler *s, fparti_fpartj_lparti_lpartj[tasks_packed].x; pack_vars->bundle_first_task_list[bid] = tasks_packed; } - /* Record that we have now done a packing (self) */ t->done = 1; pack_vars->tasks_packed++; @@ -1509,60 +1510,60 @@ void runner_dopair1_launch_f4_one_memcpy( /*Time unpacking*/ // clock_gettime(CLOCK_REALTIME, &tp0); - for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { - - if (tid < tasks_packed) { - clock_gettime(CLOCK_REALTIME, &tp0); - /*grab cell and task pointers*/ - struct cell *cii = pack_vars->ci_list[tid]; - struct cell *cjj = pack_vars->cj_list[tid]; - struct task *tii = pack_vars->task_list[tid]; - -// if(!pack_vars->task_locked){ -// /*Let's lock ci*/ -// while (cell_locktree(cii)) { -// ; /* spin until we acquire the lock */ -// } -// /*Let's lock cj*/ -// while (cell_locktree(cjj)) { -// ; /* spin until we acquire the lock */ -// } -// pack_vars->task_locked = 1; -// } - - const ticks tic = getticks(); - - /* Do the copy */ - runner_do_ci_cj_gpu_unpack_neat_aos_f4( - r, cii, cjj, parts_recv, 0, &pack_length_unpack, tid, - 2 * pack_vars->count_max_parts, e); - - const ticks toc = getticks(); - - total_cpu_unpack_ticks += toc - tic; - - /* Record things for debugging */ - cii->gpu_done_pair++; - cjj->gpu_done_pair++; - - if(pack_vars->task_locked){ - /* Release the locks */ - cell_unlocktree(cii); - /* Release the locks */ - cell_unlocktree(cjj); - pack_vars->task_locked = 0; - } - - /*Time end of unpacking*/ - clock_gettime(CLOCK_REALTIME, &tp1); - *unpack_time += (tp1.tv_sec - tp0.tv_sec) + - (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; - /*Signal sleeping runners*/ - // MATTHIEU signal_sleeping_runners(s, tii); - - tii->gpu_done = 1; - } - } +// for (int tid = bid * bundle_size; tid < (bid + 1) * bundle_size; tid++) { +// +// if (tid < tasks_packed) { +// 
clock_gettime(CLOCK_REALTIME, &tp0); +// /*grab cell and task pointers*/ +// struct cell *cii = pack_vars->ci_list[tid]; +// struct cell *cjj = pack_vars->cj_list[tid]; +// struct task *tii = pack_vars->task_list[tid]; +// +//// if(!pack_vars->task_locked){ +//// /*Let's lock ci*/ +//// while (cell_locktree(cii)) { +//// ; /* spin until we acquire the lock */ +//// } +//// /*Let's lock cj*/ +//// while (cell_locktree(cjj)) { +//// ; /* spin until we acquire the lock */ +//// } +//// pack_vars->task_locked = 1; +//// } +// +// const ticks tic = getticks(); +// +// /* Do the copy */ +// runner_do_ci_cj_gpu_unpack_neat_aos_f4( +// r, cii, cjj, parts_recv, 0, &pack_length_unpack, tid, +// 2 * pack_vars->count_max_parts, e); +// +// const ticks toc = getticks(); +// +// total_cpu_unpack_ticks += toc - tic; +// +// /* Record things for debugging */ +// cii->gpu_done_pair++; +// cjj->gpu_done_pair++; +// +//// if(pack_vars->task_locked){ +//// /* Release the locks */ +//// cell_unlocktree(cii); +//// /* Release the locks */ +//// cell_unlocktree(cjj); +// pack_vars->task_locked = 0; +//// } +// +// /*Time end of unpacking*/ +// clock_gettime(CLOCK_REALTIME, &tp1); +// *unpack_time += (tp1.tv_sec - tp0.tv_sec) + +// (tp1.tv_nsec - tp0.tv_nsec) / 1000000000.0; +// /*Signal sleeping runners*/ +// // MATTHIEU signal_sleeping_runners(s, tii); +// +// tii->gpu_done = 1; +// } +// } } /* Zero counters for the next pack operations */ @@ -1579,6 +1580,83 @@ void runner_dopair1_launch_f4_one_memcpy( } /*End of GPU work*/ +void runner_dopair1_unpack_f4( + struct runner *r, struct scheduler *s, struct pack_vars_pair *pack_vars, + struct task *t, struct part_aos_f4_send *parts_send, + struct part_aos_f4_recv *parts_recv, struct part_aos_f4_send *d_parts_send, + struct part_aos_f4_recv *d_parts_recv, cudaStream_t *stream, float d_a, + float d_H, struct engine *e, double *packing_time, double *gpu_time, + double *unpack_time, int4 *fparti_fpartj_lparti_lpartj_dens, + cudaEvent_t *pair_end){ + int topid; + for (topid = 0; topid < pack_vars->top_tasks_packed - 1; topid++) { + //lock top level cell here + struct cell * cii = pack_vars->top_task_list[topid]->ci; + struct cell * cjj = pack_vars->top_task_list[topid]->cj; + while (cell_locktree(cii)) { + ; /* spin until we acquire the lock */ + } + /*Let's lock cj*/ + while (cell_locktree(cjj)) { + ; /* spin until we acquire the lock */ + } + } + if(pack_vars->task_locked = 0){ + struct cell * cii = pack_vars->top_task_list[topid]->ci; + struct cell * cjj = pack_vars->top_task_list[topid]->cj; + while (cell_locktree(cii)) { + ; /* spin until we acquire the lock */ + } + /*Let's lock cj*/ + while (cell_locktree(cjj)) { + ; /* spin until we acquire the lock */ + } + pack_vars->task_locked = 1; + } + int pack_length_unpack = 0; + ticks total_cpu_unpack_ticks = 0; + for(int tid = 0; tid < pack_vars->tasks_packed; tid++){ + /*grab cell and task pointers*/ + struct cell *cii = pack_vars->ci_list[tid]; + struct cell *cjj = pack_vars->cj_list[tid]; + const ticks tic = getticks(); + /* Do the copy */ + runner_do_ci_cj_gpu_unpack_neat_aos_f4( + r, cii, cjj, parts_recv, 0, &pack_length_unpack, tid, + 2 * pack_vars->count_max_parts, e); + + const ticks toc = getticks(); + total_cpu_unpack_ticks += toc - tic; + } + for (topid = 0; topid < pack_vars->top_tasks_packed - 1; topid++) { + //lock top level cell here + struct cell * cii = pack_vars->top_task_list[topid]->ci; + struct cell * cjj = pack_vars->top_task_list[topid]->cj; + /* Release the locks */ + cell_unlocktree(cii); + /* 
Release the locks */ + cell_unlocktree(cjj); + enqueue_dependencies(s, pack_vars->top_task_list[topid]); + pthread_mutex_lock(&s->sleep_mutex); + atomic_dec(&s->waiting); + pthread_cond_broadcast(&s->sleep_cond); + pthread_mutex_unlock(&s->sleep_mutex); + } + if(pack_vars->task_locked){ + struct cell * cii = pack_vars->top_task_list[topid]->ci; + struct cell * cjj = pack_vars->top_task_list[topid]->cj; + /* Release the locks */ + cell_unlocktree(cii); + /* Release the locks */ + cell_unlocktree(cjj); + enqueue_dependencies(s, pack_vars->top_task_list[topid]); + pthread_mutex_lock(&s->sleep_mutex); + atomic_dec(&s->waiting); + pthread_cond_broadcast(&s->sleep_cond); + pthread_mutex_unlock(&s->sleep_mutex); + } + +} void runner_dopair1_launch_f4_g_one_memcpy( struct runner *r, struct scheduler *s, struct pack_vars_pair *pack_vars, struct task *t, struct part_aos_f4_g_send *parts_send, diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index fceaddba47..4fe34b228e 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -542,6 +542,14 @@ void *runner_main2(void *data) { (struct task **)calloc(target_n_tasks, sizeof(struct task *)); pack_vars_pair_dens->top_task_list = (struct task **)calloc(target_n_tasks, sizeof(struct task *)); + int n_leaves_max = 4096; + /*Allocate target_n_tasks for top level tasks. This is a 2D array with length target_n_tasks and width n_leaves_max*/ + pack_vars_pair_dens->leaf_task_list = + (struct task **)calloc(target_n_tasks, sizeof(struct task *)); + /*Allocate memory for n_leaves_max task pointers per top level task*/ + for(int i = 0; i < target_n_tasks; i++) + pack_vars_pair_dens->leaf_task_list[i] = (struct task **)calloc(n_leaves_max, sizeof(struct task *)); + pack_vars_pair_dens->ci_list = (struct cell **)calloc(target_n_tasks, sizeof(struct cell *)); pack_vars_pair_dens->cj_list = @@ -937,7 +945,7 @@ void *runner_main2(void *data) { /*Call recursion here. This will be a function in runner_doiact_functions_hydro_gpu.h. * We are recursing separately to find out how much work we have before offloading*/ //We need to allocate a list to put cell pointers into for each new task - int n_expected_tasks = 1024; //A. Nasar: Need to come up with a good estimate for this + int n_expected_tasks = 4096; //A. Nasar: Need to come up with a good estimate for this int n_leafs_found = 0; int depth = 0; struct cell * cells_left[n_expected_tasks]; @@ -947,39 +955,52 @@ void *runner_main2(void *data) { cells_left, cells_right, depth, n_expected_tasks); n_leafs_total += n_leafs_found; - int cstart = 0, cend = n_leafs_found; - - int cid = 0; - pack_vars_pair_dens->task_locked = 1; - int top_tasks_packed = pack_vars_pair_dens->top_tasks_packed; + int cstart = 0, cid = 0; + pack_vars_pair_dens->top_task_list[pack_vars_pair_dens->top_tasks_packed] = t; pack_vars_pair_dens->top_tasks_packed++; - pack_vars_pair_dens->top_task_list[top_tasks_packed] = t; + pack_vars_pair_dens->task_locked = 1; int t_s, t_e; t_s = 0; int n_t_tasks = pack_vars_pair_dens->target_n_tasks; + t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; while(cid < n_leafs_found){ - ////////////////////////////////////////////////////////////////////////////////// + tic_cpu_pack = getticks(); + if(pack_vars_pair_dens->task_locked = 0){ + /*Is this lock necessary? Maybe not since we are only reading positions etc. 
+ Leave it in for now as I'm just getting it to work and need it to be locked + in case I unlock in the outside loop*/ + while (cell_locktree(ci)) { + ; /* spin until we acquire the lock */ + } + /*Let's lock cj*/ + while (cell_locktree(cj)) { + ; /* spin until we acquire the lock */ + } + pack_vars_pair_dens->task_locked = 1; + } /*Loop through n_daughters such that the pack_vars_pair_dens counters are updated*/ - for (cid = cstart; pack_vars_pair_dens->tasks_packed < n_t_tasks && cid < n_leafs_found; cid++){ - packing_time_pair += runner_dopair1_pack_f4( - r, sched, pack_vars_pair_dens, cells_left[cid], cells_right[cid], t, - parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens); - if(pack_vars_pair_dens->count_parts > count_max_parts_tmp) - error("Packed more parts than possible"); + while(cstart < n_leafs_found && pack_vars_pair_dens->tasks_packed < n_t_tasks){ + packing_time_pair += runner_dopair1_pack_f4( + r, sched, pack_vars_pair_dens, cells_left[cstart], cells_right[cstart], t, + parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens, cstart); + if(pack_vars_pair_dens->count_parts > count_max_parts_tmp) + error("Packed more parts than possible"); + cstart++; + } + if(pack_vars_pair_dens->task_locked){ + cell_unlocktree(ci); + cell_unlocktree(cj); + pack_vars_pair_dens->task_locked = 0; } + cid = cstart; /* Copies done. Release the lock ! */ - pack_vars_pair_dens->task_locked = 0; - cstart = cid; t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; /* Packed enough tasks or no pack tasks left in queue, flag that * we want to run */ int launch = pack_vars_pair_dens->launch; int launch_leftovers = pack_vars_pair_dens->launch_leftovers; /* Do we have enough stuff to run the GPU ? */ - if (launch) n_full_p_d_bundles++; - if (launch_leftovers) n_partial_p_d_bundles++; - - if (launch || launch_leftovers) { + if (launch || (launch_leftovers && cstart == n_leafs_found)) { /*Launch GPU tasks*/ int t_packed = pack_vars_pair_dens->tasks_packed; runner_dopair1_launch_f4_one_memcpy( @@ -989,22 +1010,35 @@ void *runner_main2(void *data) { &packing_time_pair, &time_for_density_gpu_pair, &unpacking_time_pair, fparti_fpartj_lparti_lpartj_dens, pair_end); - for (int tid = 0; tid < pack_vars_pair_dens->top_tasks_packed -1; tid++){ + int ntasks = 0; + runner_dopair1_unpack_f4( + r, sched, pack_vars_pair_dens, t, parts_aos_pair_f4_send, + parts_aos_pair_f4_recv, d_parts_aos_pair_f4_send, + d_parts_aos_pair_f4_recv, stream_pairs, d_a, d_H, e, + &packing_time_pair, &time_for_density_gpu_pair, + &unpacking_time_pair, fparti_fpartj_lparti_lpartj_dens, + pair_end); + if(cid == n_leafs_found) ntasks = pack_vars_pair_dens->top_tasks_packed; + else ntasks = pack_vars_pair_dens->top_tasks_packed - 1; +// for (int tid = 0; tid < ntasks; tid++){ /*schedule my dependencies (Only unpacks really)*/ - struct task *tii = pack_vars_pair_dens->top_task_list[tid]; - enqueue_dependencies(sched, tii); - pthread_mutex_lock(&sched->sleep_mutex); - atomic_dec(&sched->waiting); - pthread_cond_broadcast(&sched->sleep_cond); - pthread_mutex_unlock(&sched->sleep_mutex); + int ttid = pack_vars_pair_dens->top_tasks_packed - 1; + struct task *tii = pack_vars_pair_dens->top_task_list[ttid]; + +// } + if(cid == n_leafs_found){ + pack_vars_pair_dens->top_tasks_packed = 1; + pack_vars_pair_dens->top_task_list[0] = t; + } + else{ + pack_vars_pair_dens->top_tasks_packed = 0; + pack_vars_pair_dens->top_task_list[0] = NULL; } - pack_vars_pair_dens->top_tasks_packed = 1; - pack_vars_pair_dens->top_task_list[0] = t; } 
/////////////////////////////////////////////////////////////////////// } - cell_unlocktree(ci); - cell_unlocktree(cj); + cell_unlocktree(ci); + cell_unlocktree(cj); pack_vars_pair_dens->task_locked = 0; pack_vars_pair_dens->launch_leftovers = 0; /////////////////////W.I.P!!!//////////////////////////////////////////////////////// From ca350a4d1eaf4963ab086556c1b2643bde47ffae Mon Sep 17 00:00:00 2001 From: Abouzied Date: Sat, 8 Mar 2025 05:34:07 +0000 Subject: [PATCH 204/217] Still hanging but made some inroads into refining/correcting the recurse->pack->offload->re-pack_if_necessary->offload_again_if_necessary --- .../HydroTests/GreshoVortex_3D/gresho.yml | 2 +- examples/HydroTests/GreshoVortex_3D/makeIC.py | 2 +- src/runner_doiact_functions_hydro_gpu.h | 77 +++++++++--------- src/runner_main_clean.cu | 78 +++++++++++-------- 4 files changed, 87 insertions(+), 72 deletions(-) diff --git a/examples/HydroTests/GreshoVortex_3D/gresho.yml b/examples/HydroTests/GreshoVortex_3D/gresho.yml index c87ec26a18..58836598f5 100644 --- a/examples/HydroTests/GreshoVortex_3D/gresho.yml +++ b/examples/HydroTests/GreshoVortex_3D/gresho.yml @@ -7,7 +7,7 @@ InternalUnitSystem: UnitTemp_in_cgs: 1 # Kelvin Scheduler: - max_top_level_cells: 32 + max_top_level_cells: 16 tasks_per_cell: 200 # deadlock_waiting_time_s: 10 # cell_split_size: 100 diff --git a/examples/HydroTests/GreshoVortex_3D/makeIC.py b/examples/HydroTests/GreshoVortex_3D/makeIC.py index 19b38352eb..c611132715 100644 --- a/examples/HydroTests/GreshoVortex_3D/makeIC.py +++ b/examples/HydroTests/GreshoVortex_3D/makeIC.py @@ -28,7 +28,7 @@ rho0 = 1 # Gas density P0 = 0.0 # Constant additional pressure (should have no impact on the dynamics) fileOutputName = "greshoVortex.hdf5" -fileGlass = "glassCube_128.hdf5" +fileGlass = "glassCube_64.hdf5" # --------------------------------------------------- # Get position and smoothing lengths from the glass diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index 8bd1a27eb9..e28dd6e9d7 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -1589,30 +1589,33 @@ void runner_dopair1_unpack_f4( double *unpack_time, int4 *fparti_fpartj_lparti_lpartj_dens, cudaEvent_t *pair_end){ int topid; - for (topid = 0; topid < pack_vars->top_tasks_packed - 1; topid++) { - //lock top level cell here - struct cell * cii = pack_vars->top_task_list[topid]->ci; - struct cell * cjj = pack_vars->top_task_list[topid]->cj; - while (cell_locktree(cii)) { - ; /* spin until we acquire the lock */ - } - /*Let's lock cj*/ - while (cell_locktree(cjj)) { - ; /* spin until we acquire the lock */ - } - } - if(pack_vars->task_locked = 0){ - struct cell * cii = pack_vars->top_task_list[topid]->ci; - struct cell * cjj = pack_vars->top_task_list[topid]->cj; - while (cell_locktree(cii)) { - ; /* spin until we acquire the lock */ - } - /*Let's lock cj*/ - while (cell_locktree(cjj)) { - ; /* spin until we acquire the lock */ - } + if(pack_vars->task_locked == 0){ + for (topid = 0; topid < pack_vars->top_tasks_packed; topid++) { + //lock top level cell here + struct cell * cii = pack_vars->top_task_list[topid]->ci; + struct cell * cjj = pack_vars->top_task_list[topid]->cj; + while (cell_locktree(cii)) { + ; /* spin until we acquire the lock */ + } + /*Let's lock cj*/ + while (cell_locktree(cjj)) { + ; /* spin until we acquire the lock */ + } + } pack_vars->task_locked = 1; } +// if(pack_vars->task_locked == 0){ +// struct cell * cii = 
pack_vars->top_task_list[topid]->ci; +// struct cell * cjj = pack_vars->top_task_list[topid]->cj; +// while (cell_locktree(cii)) { +// ; /* spin until we acquire the lock */ +// } +// /*Let's lock cj*/ +// while (cell_locktree(cjj)) { +// ; /* spin until we acquire the lock */ +// } +// pack_vars->task_locked = 1; +// } int pack_length_unpack = 0; ticks total_cpu_unpack_ticks = 0; for(int tid = 0; tid < pack_vars->tasks_packed; tid++){ @@ -1628,7 +1631,7 @@ void runner_dopair1_unpack_f4( const ticks toc = getticks(); total_cpu_unpack_ticks += toc - tic; } - for (topid = 0; topid < pack_vars->top_tasks_packed - 1; topid++) { + for (topid = 0; topid < pack_vars->top_tasks_packed; topid++) { //lock top level cell here struct cell * cii = pack_vars->top_task_list[topid]->ci; struct cell * cjj = pack_vars->top_task_list[topid]->cj; @@ -1641,20 +1644,22 @@ void runner_dopair1_unpack_f4( atomic_dec(&s->waiting); pthread_cond_broadcast(&s->sleep_cond); pthread_mutex_unlock(&s->sleep_mutex); + pack_vars->task_locked = 0; } - if(pack_vars->task_locked){ - struct cell * cii = pack_vars->top_task_list[topid]->ci; - struct cell * cjj = pack_vars->top_task_list[topid]->cj; - /* Release the locks */ - cell_unlocktree(cii); - /* Release the locks */ - cell_unlocktree(cjj); - enqueue_dependencies(s, pack_vars->top_task_list[topid]); - pthread_mutex_lock(&s->sleep_mutex); - atomic_dec(&s->waiting); - pthread_cond_broadcast(&s->sleep_cond); - pthread_mutex_unlock(&s->sleep_mutex); - } +// if(pack_vars->task_locked){ +// struct cell * cii = pack_vars->top_task_list[topid]->ci; +// struct cell * cjj = pack_vars->top_task_list[topid]->cj; +// /* Release the locks */ +// cell_unlocktree(cii); +// /* Release the locks */ +// cell_unlocktree(cjj); +// enqueue_dependencies(s, pack_vars->top_task_list[topid]); +// pthread_mutex_lock(&s->sleep_mutex); +// atomic_dec(&s->waiting); +// pthread_cond_broadcast(&s->sleep_cond); +// pthread_mutex_unlock(&s->sleep_mutex); +// pack_vars->task_locked = 0; +// } } void runner_dopair1_launch_f4_g_one_memcpy( diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 4fe34b228e..bde6538fef 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -965,19 +965,19 @@ void *runner_main2(void *data) { t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; while(cid < n_leafs_found){ tic_cpu_pack = getticks(); - if(pack_vars_pair_dens->task_locked = 0){ - /*Is this lock necessary? Maybe not since we are only reading positions etc. - Leave it in for now as I'm just getting it to work and need it to be locked - in case I unlock in the outside loop*/ - while (cell_locktree(ci)) { - ; /* spin until we acquire the lock */ - } - /*Let's lock cj*/ - while (cell_locktree(cj)) { - ; /* spin until we acquire the lock */ - } - pack_vars_pair_dens->task_locked = 1; - } +// if(pack_vars_pair_dens->task_locked == 0){ +// /*Is this lock necessary? Maybe not since we are only reading positions etc. 
+// Leave it in for now as I'm just getting it to work and need it to be locked +// in case I unlock in the outside loop*/ +// while (cell_locktree(ci)) { +// ; /* spin until we acquire the lock */ +// } +// /*Let's lock cj*/ +// while (cell_locktree(cj)) { +// ; /* spin until we acquire the lock */ +// } +// pack_vars_pair_dens->task_locked = 1; +// } /*Loop through n_daughters such that the pack_vars_pair_dens counters are updated*/ while(cstart < n_leafs_found && pack_vars_pair_dens->tasks_packed < n_t_tasks){ packing_time_pair += runner_dopair1_pack_f4( @@ -987,11 +987,11 @@ void *runner_main2(void *data) { error("Packed more parts than possible"); cstart++; } - if(pack_vars_pair_dens->task_locked){ +// if(pack_vars_pair_dens->task_locked){ cell_unlocktree(ci); cell_unlocktree(cj); pack_vars_pair_dens->task_locked = 0; - } +// } cid = cstart; /* Copies done. Release the lock ! */ t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; @@ -1010,7 +1010,7 @@ void *runner_main2(void *data) { &packing_time_pair, &time_for_density_gpu_pair, &unpacking_time_pair, fparti_fpartj_lparti_lpartj_dens, pair_end); - int ntasks = 0; + int ntoptasks = 0; runner_dopair1_unpack_f4( r, sched, pack_vars_pair_dens, t, parts_aos_pair_f4_send, parts_aos_pair_f4_recv, d_parts_aos_pair_f4_send, @@ -1018,29 +1018,39 @@ void *runner_main2(void *data) { &packing_time_pair, &time_for_density_gpu_pair, &unpacking_time_pair, fparti_fpartj_lparti_lpartj_dens, pair_end); - if(cid == n_leafs_found) ntasks = pack_vars_pair_dens->top_tasks_packed; - else ntasks = pack_vars_pair_dens->top_tasks_packed - 1; -// for (int tid = 0; tid < ntasks; tid++){ - /*schedule my dependencies (Only unpacks really)*/ - int ttid = pack_vars_pair_dens->top_tasks_packed - 1; - struct task *tii = pack_vars_pair_dens->top_task_list[ttid]; - + if(cstart == n_leafs_found) + pack_vars_pair_dens->top_tasks_packed = 0; + else + pack_vars_pair_dens->top_tasks_packed = 1; + + pack_vars_pair_dens->tasks_packed = 0; +// else{ +// +// } +// if(cid == n_leafs_found) ntoptasks = pack_vars_pair_dens->top_tasks_packed; +// else ntoptasks = pack_vars_pair_dens->top_tasks_packed - 1; +//// for (int tid = 0; tid < ntasks; tid++){ +// /*schedule my dependencies (Only unpacks really)*/ +// int ttid = pack_vars_pair_dens->top_tasks_packed - 1; +// struct task *tii = pack_vars_pair_dens->top_task_list[ttid]; +// +//// } +// if(cid == n_leafs_found){ +// pack_vars_pair_dens->top_tasks_packed = 1; +// pack_vars_pair_dens->top_task_list[0] = t; +// } +// else{ +// pack_vars_pair_dens->tasks_packed = 0; +// pack_vars_pair_dens->top_task_list[0] = NULL; // } - if(cid == n_leafs_found){ - pack_vars_pair_dens->top_tasks_packed = 1; - pack_vars_pair_dens->top_task_list[0] = t; - } - else{ - pack_vars_pair_dens->top_tasks_packed = 0; - pack_vars_pair_dens->top_task_list[0] = NULL; - } } /////////////////////////////////////////////////////////////////////// } - cell_unlocktree(ci); - cell_unlocktree(cj); - pack_vars_pair_dens->task_locked = 0; +// cell_unlocktree(ci); +// cell_unlocktree(cj); +// pack_vars_pair_dens->task_locked = 0; pack_vars_pair_dens->launch_leftovers = 0; + pack_vars_pair_dens->launch = 0; /////////////////////W.I.P!!!//////////////////////////////////////////////////////// #endif // GPUOFFLOAD_DENSITY From 8db90f0eff93b681a5398a30c3c691ecf3b47083 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Sun, 9 Mar 2025 04:59:57 +0000 Subject: [PATCH 205/217] Probably too soon to say for sure but the main recursion is in place (probably). 
Need to figure out how to get the locking/unlocking business to work properly though as I am now unpacking to a few unlocked cells --- .../HydroTests/GreshoVortex_3D/gresho.yml | 2 +- src/runner_doiact_functions_hydro_gpu.h | 46 ++++++++++--------- src/runner_main_clean.cu | 21 +++++---- 3 files changed, 38 insertions(+), 31 deletions(-) diff --git a/examples/HydroTests/GreshoVortex_3D/gresho.yml b/examples/HydroTests/GreshoVortex_3D/gresho.yml index 58836598f5..15709ccaf3 100644 --- a/examples/HydroTests/GreshoVortex_3D/gresho.yml +++ b/examples/HydroTests/GreshoVortex_3D/gresho.yml @@ -40,4 +40,4 @@ SPH: InitialConditions: file_name: greshoVortex.hdf5 periodic: 1 - #replicate: 2 + replicate: 2 diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index e28dd6e9d7..7bd631a69e 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -1587,23 +1587,23 @@ void runner_dopair1_unpack_f4( struct part_aos_f4_recv *d_parts_recv, cudaStream_t *stream, float d_a, float d_H, struct engine *e, double *packing_time, double *gpu_time, double *unpack_time, int4 *fparti_fpartj_lparti_lpartj_dens, - cudaEvent_t *pair_end){ + cudaEvent_t *pair_end, int cstart, int n_leaves_found){ int topid; - if(pack_vars->task_locked == 0){ - for (topid = 0; topid < pack_vars->top_tasks_packed; topid++) { - //lock top level cell here - struct cell * cii = pack_vars->top_task_list[topid]->ci; - struct cell * cjj = pack_vars->top_task_list[topid]->cj; - while (cell_locktree(cii)) { - ; /* spin until we acquire the lock */ - } - /*Let's lock cj*/ - while (cell_locktree(cjj)) { - ; /* spin until we acquire the lock */ - } - } - pack_vars->task_locked = 1; - } +// if(pack_vars->task_locked == 0){ +// for (topid = 0; topid < pack_vars->top_tasks_packed; topid++) { +// //lock top level cell here +// struct cell * cii = pack_vars->top_task_list[topid]->ci; +// struct cell * cjj = pack_vars->top_task_list[topid]->cj; +// while (cell_locktree(cii)) { +// ; /* spin until we acquire the lock */ +// } +// /*Let's lock cj*/ +// while (cell_locktree(cjj)) { +// ; /* spin until we acquire the lock */ +// } +// } +// pack_vars->task_locked = 1; +// } // if(pack_vars->task_locked == 0){ // struct cell * cii = pack_vars->top_task_list[topid]->ci; // struct cell * cjj = pack_vars->top_task_list[topid]->cj; @@ -1633,12 +1633,14 @@ void runner_dopair1_unpack_f4( } for (topid = 0; topid < pack_vars->top_tasks_packed; topid++) { //lock top level cell here - struct cell * cii = pack_vars->top_task_list[topid]->ci; - struct cell * cjj = pack_vars->top_task_list[topid]->cj; - /* Release the locks */ - cell_unlocktree(cii); - /* Release the locks */ - cell_unlocktree(cjj); +// struct cell * cii = pack_vars->top_task_list[topid]->ci; +// struct cell * cjj = pack_vars->top_task_list[topid]->cj; +// /* Release the locks */ +// cell_unlocktree(cii); +// /* Release the locks */ +// cell_unlocktree(cjj); + if(topid == pack_vars->top_tasks_packed -1 && cstart != n_leaves_found) + break; enqueue_dependencies(s, pack_vars->top_task_list[topid]); pthread_mutex_lock(&s->sleep_mutex); atomic_dec(&s->waiting); diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index bde6538fef..317f0011d5 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -963,7 +963,7 @@ void *runner_main2(void *data) { t_s = 0; int n_t_tasks = pack_vars_pair_dens->target_n_tasks; t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; - while(cid < n_leafs_found){ + 
while(cstart < n_leafs_found){ tic_cpu_pack = getticks(); // if(pack_vars_pair_dens->task_locked == 0){ // /*Is this lock necessary? Maybe not since we are only reading positions etc. @@ -988,8 +988,8 @@ void *runner_main2(void *data) { cstart++; } // if(pack_vars_pair_dens->task_locked){ - cell_unlocktree(ci); - cell_unlocktree(cj); +// cell_unlocktree(ci); +// cell_unlocktree(cj); pack_vars_pair_dens->task_locked = 0; // } cid = cstart; @@ -1017,12 +1017,17 @@ void *runner_main2(void *data) { d_parts_aos_pair_f4_recv, stream_pairs, d_a, d_H, e, &packing_time_pair, &time_for_density_gpu_pair, &unpacking_time_pair, fparti_fpartj_lparti_lpartj_dens, - pair_end); + pair_end, cstart, n_leafs_found); + /*This ensure that if we still have leaves left we start at index 1. + Otherwise, reset the index since we will be grabbing a new task*/ if(cstart == n_leafs_found) pack_vars_pair_dens->top_tasks_packed = 0; - else + else{ pack_vars_pair_dens->top_tasks_packed = 1; - + pack_vars_pair_dens->top_task_list[0] = t; + } + /*This makes it such that the remaining leaf tasks are packed starting from a + fresh list since we are still in the while cstart < n_leafs_found loop*/ pack_vars_pair_dens->tasks_packed = 0; // else{ // @@ -1046,8 +1051,8 @@ void *runner_main2(void *data) { } /////////////////////////////////////////////////////////////////////// } -// cell_unlocktree(ci); -// cell_unlocktree(cj); + cell_unlocktree(ci); + cell_unlocktree(cj); // pack_vars_pair_dens->task_locked = 0; pack_vars_pair_dens->launch_leftovers = 0; pack_vars_pair_dens->launch = 0; From 409457dca1213bd0010325e79fe16ebfe514b092 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Sun, 9 Mar 2025 23:41:22 +0000 Subject: [PATCH 206/217] A few changes that don't work for locking/unlocking when recursing (gresho.ym edited to allow recursion). The code now hangs trying to lock a pair of cells. 
Commented out the lock/unlock mechanism to confirm that code does not give correct results without locking cells --- .../HydroTests/GreshoVortex_3D/gresho.yml | 4 +- src/runner_doiact_functions_hydro_gpu.h | 57 +++++++------------ src/runner_main_clean.cu | 43 +------------- 3 files changed, 26 insertions(+), 78 deletions(-) diff --git a/examples/HydroTests/GreshoVortex_3D/gresho.yml b/examples/HydroTests/GreshoVortex_3D/gresho.yml index 15709ccaf3..6c945e7473 100644 --- a/examples/HydroTests/GreshoVortex_3D/gresho.yml +++ b/examples/HydroTests/GreshoVortex_3D/gresho.yml @@ -7,7 +7,7 @@ InternalUnitSystem: UnitTemp_in_cgs: 1 # Kelvin Scheduler: - max_top_level_cells: 16 + max_top_level_cells: 8 tasks_per_cell: 200 # deadlock_waiting_time_s: 10 # cell_split_size: 100 @@ -40,4 +40,4 @@ SPH: InitialConditions: file_name: greshoVortex.hdf5 periodic: 1 - replicate: 2 + # replicate: 2 diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index 7bd631a69e..01a4233aee 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -1590,7 +1590,8 @@ void runner_dopair1_unpack_f4( cudaEvent_t *pair_end, int cstart, int n_leaves_found){ int topid; // if(pack_vars->task_locked == 0){ -// for (topid = 0; topid < pack_vars->top_tasks_packed; topid++) { +// if(pack_vars->top_tasks_packed > 1){ +// for (topid = 0; topid < pack_vars->top_tasks_packed - 1; topid++) { // //lock top level cell here // struct cell * cii = pack_vars->top_task_list[topid]->ci; // struct cell * cjj = pack_vars->top_task_list[topid]->cj; @@ -1602,19 +1603,6 @@ void runner_dopair1_unpack_f4( // ; /* spin until we acquire the lock */ // } // } -// pack_vars->task_locked = 1; -// } -// if(pack_vars->task_locked == 0){ -// struct cell * cii = pack_vars->top_task_list[topid]->ci; -// struct cell * cjj = pack_vars->top_task_list[topid]->cj; -// while (cell_locktree(cii)) { -// ; /* spin until we acquire the lock */ -// } -// /*Let's lock cj*/ -// while (cell_locktree(cjj)) { -// ; /* spin until we acquire the lock */ -// } -// pack_vars->task_locked = 1; // } int pack_length_unpack = 0; ticks total_cpu_unpack_ticks = 0; @@ -1631,16 +1619,28 @@ void runner_dopair1_unpack_f4( const ticks toc = getticks(); total_cpu_unpack_ticks += toc - tic; } - for (topid = 0; topid < pack_vars->top_tasks_packed; topid++) { - //lock top level cell here + +// if(pack_vars->top_tasks_packed > 1){ +// for (topid = 0; topid < pack_vars->top_tasks_packed; topid++) { +// /*The failed to unlock cell issue is related to this if statement. 
REVISE*/ +// if(topid == pack_vars->top_tasks_packed -1 && cstart != n_leaves_found) +// continue; // struct cell * cii = pack_vars->top_task_list[topid]->ci; // struct cell * cjj = pack_vars->top_task_list[topid]->cj; -// /* Release the locks */ -// cell_unlocktree(cii); -// /* Release the locks */ -// cell_unlocktree(cjj); +// /*For some reason the code fails if we get a leaf pair task +// *this if statement stops the code from trying to unlock same cells twice*/ +// if(n_leaves_found > 1){ +// /* Release the locks */ +// cell_unlocktree(cii); +// /* Release the locks */ +// cell_unlocktree(cjj); +// } +// } +// } + for (topid = 0; topid < pack_vars->top_tasks_packed; topid++) { + //lock top level cell here if(topid == pack_vars->top_tasks_packed -1 && cstart != n_leaves_found) - break; + continue; enqueue_dependencies(s, pack_vars->top_task_list[topid]); pthread_mutex_lock(&s->sleep_mutex); atomic_dec(&s->waiting); @@ -1648,21 +1648,6 @@ void runner_dopair1_unpack_f4( pthread_mutex_unlock(&s->sleep_mutex); pack_vars->task_locked = 0; } -// if(pack_vars->task_locked){ -// struct cell * cii = pack_vars->top_task_list[topid]->ci; -// struct cell * cjj = pack_vars->top_task_list[topid]->cj; -// /* Release the locks */ -// cell_unlocktree(cii); -// /* Release the locks */ -// cell_unlocktree(cjj); -// enqueue_dependencies(s, pack_vars->top_task_list[topid]); -// pthread_mutex_lock(&s->sleep_mutex); -// atomic_dec(&s->waiting); -// pthread_cond_broadcast(&s->sleep_cond); -// pthread_mutex_unlock(&s->sleep_mutex); -// pack_vars->task_locked = 0; -// } - } void runner_dopair1_launch_f4_g_one_memcpy( struct runner *r, struct scheduler *s, struct pack_vars_pair *pack_vars, diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 317f0011d5..025d2d1f99 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -965,19 +965,6 @@ void *runner_main2(void *data) { t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; while(cstart < n_leafs_found){ tic_cpu_pack = getticks(); -// if(pack_vars_pair_dens->task_locked == 0){ -// /*Is this lock necessary? Maybe not since we are only reading positions etc. -// Leave it in for now as I'm just getting it to work and need it to be locked -// in case I unlock in the outside loop*/ -// while (cell_locktree(ci)) { -// ; /* spin until we acquire the lock */ -// } -// /*Let's lock cj*/ -// while (cell_locktree(cj)) { -// ; /* spin until we acquire the lock */ -// } -// pack_vars_pair_dens->task_locked = 1; -// } /*Loop through n_daughters such that the pack_vars_pair_dens counters are updated*/ while(cstart < n_leafs_found && pack_vars_pair_dens->tasks_packed < n_t_tasks){ packing_time_pair += runner_dopair1_pack_f4( @@ -987,11 +974,7 @@ void *runner_main2(void *data) { error("Packed more parts than possible"); cstart++; } -// if(pack_vars_pair_dens->task_locked){ -// cell_unlocktree(ci); -// cell_unlocktree(cj); - pack_vars_pair_dens->task_locked = 0; -// } + pack_vars_pair_dens->task_locked = 0; cid = cstart; /* Copies done. Release the lock ! 
*/ t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; @@ -1029,31 +1012,11 @@ void *runner_main2(void *data) { /*This makes it such that the remaining leaf tasks are packed starting from a fresh list since we are still in the while cstart < n_leafs_found loop*/ pack_vars_pair_dens->tasks_packed = 0; -// else{ -// -// } -// if(cid == n_leafs_found) ntoptasks = pack_vars_pair_dens->top_tasks_packed; -// else ntoptasks = pack_vars_pair_dens->top_tasks_packed - 1; -//// for (int tid = 0; tid < ntasks; tid++){ -// /*schedule my dependencies (Only unpacks really)*/ -// int ttid = pack_vars_pair_dens->top_tasks_packed - 1; -// struct task *tii = pack_vars_pair_dens->top_task_list[ttid]; -// -//// } -// if(cid == n_leafs_found){ -// pack_vars_pair_dens->top_tasks_packed = 1; -// pack_vars_pair_dens->top_task_list[0] = t; -// } -// else{ -// pack_vars_pair_dens->tasks_packed = 0; -// pack_vars_pair_dens->top_task_list[0] = NULL; -// } } /////////////////////////////////////////////////////////////////////// } - cell_unlocktree(ci); - cell_unlocktree(cj); -// pack_vars_pair_dens->task_locked = 0; + cell_unlocktree(ci); + cell_unlocktree(cj); pack_vars_pair_dens->launch_leftovers = 0; pack_vars_pair_dens->launch = 0; /////////////////////W.I.P!!!//////////////////////////////////////////////////////// From 42e43320a3dc3585d4cfacc8c0e3b4b9a6dc8d5a Mon Sep 17 00:00:00 2001 From: Abouzied Date: Mon, 10 Mar 2025 06:07:11 +0000 Subject: [PATCH 207/217] SOmething is not right when I go into runner_dopair1_unpack_f4(). The pointers to cii_l and cjj_l are incorrect. Need to go through the algorithm again and double check I'm using the right indices and re-setting indices correctly --- src/runner_doiact_functions_hydro_gpu.h | 73 ++++++++++--------------- src/runner_gpu_pack_functions.c | 4 +- src/runner_main_clean.cu | 58 +++++++++++++++----- 3 files changed, 73 insertions(+), 62 deletions(-) diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index 01a4233aee..926d845c33 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -36,12 +36,17 @@ struct pack_vars_self { int tasksperbundle; } pack_vars_self; - +struct leaf_cell_list{ + struct cell **ci; + struct cell **cj; + int n_leaves; + int n_packed; +}; struct pack_vars_pair { /*List of tasks and respective cells to be packed*/ struct task **task_list; struct task **top_task_list; - struct task ****leaf_task_list; + struct leaf_cell_list * leaf_list; struct cell **ci_list; struct cell **cj_list; /*List of cell shifts*/ @@ -329,6 +334,10 @@ void runner_recurse_gpu(struct runner *r, struct scheduler *s, /*for all leafs to be sent add to cell list */ cells_left[*n_leafs_found] = ci; cells_right[*n_leafs_found] = cj; + /*Add leaf cells to list for each top_level task*/ + pack_vars->leaf_list[pack_vars->top_tasks_packed].ci[*n_leafs_found] = ci; + pack_vars->leaf_list[pack_vars->top_tasks_packed].cj[*n_leafs_found] = cj; + pack_vars->leaf_list[pack_vars->top_tasks_packed].n_leaves++; *n_leafs_found = *n_leafs_found + 1; if(*n_leafs_found >= n_expected_tasks) error("Created %i more than expected leaf cells. 
depth %i", *n_leafs_found, depth); @@ -363,7 +372,6 @@ double runner_dopair1_pack_f4(struct runner *r, struct scheduler *s, /*Get pointers to the list of tasks and cells packed*/ // pack_vars->task_list[tasks_packed] = t; - pack_vars->leaf_task_list[pack_vars->top_tasks_packed][leaves_packed] = t; pack_vars->ci_list[tasks_packed] = ci; pack_vars->cj_list[tasks_packed] = cj; @@ -410,6 +418,7 @@ double runner_dopair1_pack_f4(struct runner *r, struct scheduler *s, pack_vars->tasks_packed++; pack_vars->launch = 0; pack_vars->launch_leftovers = 0; + pack_vars->leaf_list[pack_vars->top_tasks_packed - 1].n_packed++; //A. Nasar: Need to come back to this at some point! lock_lock(&s->queues[qid].lock); @@ -1588,57 +1597,31 @@ void runner_dopair1_unpack_f4( float d_H, struct engine *e, double *packing_time, double *gpu_time, double *unpack_time, int4 *fparti_fpartj_lparti_lpartj_dens, cudaEvent_t *pair_end, int cstart, int n_leaves_found){ + int topid; -// if(pack_vars->task_locked == 0){ -// if(pack_vars->top_tasks_packed > 1){ -// for (topid = 0; topid < pack_vars->top_tasks_packed - 1; topid++) { -// //lock top level cell here -// struct cell * cii = pack_vars->top_task_list[topid]->ci; -// struct cell * cjj = pack_vars->top_task_list[topid]->cj; -// while (cell_locktree(cii)) { -// ; /* spin until we acquire the lock */ -// } -// /*Let's lock cj*/ -// while (cell_locktree(cjj)) { -// ; /* spin until we acquire the lock */ -// } -// } -// } int pack_length_unpack = 0; ticks total_cpu_unpack_ticks = 0; - for(int tid = 0; tid < pack_vars->tasks_packed; tid++){ - /*grab cell and task pointers*/ - struct cell *cii = pack_vars->ci_list[tid]; - struct cell *cjj = pack_vars->cj_list[tid]; + for (topid = 0; topid < pack_vars->top_tasks_packed; topid++) { + //lock top level cell here + struct cell * cii = pack_vars->top_task_list[topid]->ci; + struct cell * cjj = pack_vars->top_task_list[topid]->cj; const ticks tic = getticks(); /* Do the copy */ - runner_do_ci_cj_gpu_unpack_neat_aos_f4( - r, cii, cjj, parts_recv, 0, &pack_length_unpack, tid, + + int n_leaves_in_task = pack_vars->leaf_list[topid].n_packed; + for(int tid = 0; tid < n_leaves_in_task; tid++){ + //Get pointers to the leaf cells. SEEMS I'm NOT GETTING A CORRECT POINTER + struct cell * cii_l = pack_vars->leaf_list[topid].ci[tid]; + struct cell * cjj_l = pack_vars->leaf_list[topid].cj[tid]; + runner_do_ci_cj_gpu_unpack_neat_aos_f4( + r, cii_l, cjj_l, parts_recv, 0, &pack_length_unpack, tid, 2 * pack_vars->count_max_parts, e); + } const ticks toc = getticks(); total_cpu_unpack_ticks += toc - tic; - } - -// if(pack_vars->top_tasks_packed > 1){ -// for (topid = 0; topid < pack_vars->top_tasks_packed; topid++) { -// /*The failed to unlock cell issue is related to this if statement. 
REVISE*/ -// if(topid == pack_vars->top_tasks_packed -1 && cstart != n_leaves_found) -// continue; -// struct cell * cii = pack_vars->top_task_list[topid]->ci; -// struct cell * cjj = pack_vars->top_task_list[topid]->cj; -// /*For some reason the code fails if we get a leaf pair task -// *this if statement stops the code from trying to unlock same cells twice*/ -// if(n_leaves_found > 1){ -// /* Release the locks */ -// cell_unlocktree(cii); -// /* Release the locks */ -// cell_unlocktree(cjj); -// } -// } -// } - for (topid = 0; topid < pack_vars->top_tasks_packed; topid++) { - //lock top level cell here + /*For some reason the code fails if we get a leaf pair task + *this if statement stops the code from trying to unlock same cells twice*/ if(topid == pack_vars->top_tasks_packed -1 && cstart != n_leaves_found) continue; enqueue_dependencies(s, pack_vars->top_task_list[topid]); diff --git a/src/runner_gpu_pack_functions.c b/src/runner_gpu_pack_functions.c index 5e9acac977..af743e6172 100644 --- a/src/runner_gpu_pack_functions.c +++ b/src/runner_gpu_pack_functions.c @@ -542,8 +542,8 @@ void runner_do_ci_cj_gpu_unpack_neat_aos_f4( int tid, int count_max_parts_tmp, struct engine *e) { /* Anything to do here? */ - // if (c->hydro.count == 0) - // return; +// if (ci->hydro.count == 0 || cj->hydro.count == 0) +// return; if (!cell_is_active_hydro(ci, e) && !cell_is_active_hydro(cj, e)) { message("Inactive cell\n"); return; diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 025d2d1f99..67a805f9b4 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -544,11 +544,27 @@ void *runner_main2(void *data) { (struct task **)calloc(target_n_tasks, sizeof(struct task *)); int n_leaves_max = 4096; /*Allocate target_n_tasks for top level tasks. 
This is a 2D array with length target_n_tasks and width n_leaves_max*/ - pack_vars_pair_dens->leaf_task_list = - (struct task **)calloc(target_n_tasks, sizeof(struct task *)); + struct leaf_cell_list l_list[target_n_tasks]; + pack_vars_pair_dens->leaf_list = (struct leaf_cell_list *)calloc(target_n_tasks, sizeof(struct leaf_cell_list)); + for (int i = 0; i < target_n_tasks; i++){ + l_list[i].ci = (struct cell **)calloc(n_leaves_max, sizeof(struct cell *)); + l_list[i].cj = (struct cell **)calloc(n_leaves_max, sizeof(struct cell *)); + l_list[i].n_leaves = 0; + pack_vars_pair_dens->leaf_list[i].ci = (struct cell **)calloc(n_leaves_max, sizeof(struct cell *)); + pack_vars_pair_dens->leaf_list[i].cj = (struct cell **)calloc(n_leaves_max, sizeof(struct cell *)); + pack_vars_pair_dens->leaf_list[i].n_leaves = 0; + for (int j = 0; j < n_leaves_max; j++){ + pack_vars_pair_dens->leaf_list[i].ci[j] = l_list[i].ci[j]; + pack_vars_pair_dens->leaf_list[i].cj[j] = l_list[i].cj[j]; + + } + } +// pack_vars_pair_dens->leaf_list = l_list; +// pack_vars_pair_dens->leaf_list->ci = +// (struct cell **)calloc(n_leaves_max, sizeof(struct cell *)); +// pack_vars_pair_dens->leaf_list->cj = +// (struct cell **)calloc(n_leaves_max, sizeof(struct cell *)); /*Allocate memory for n_leaves_max task pointers per top level task*/ - for(int i = 0; i < target_n_tasks; i++) - pack_vars_pair_dens->leaf_task_list[i] = (struct task **)calloc(n_leaves_max, sizeof(struct task *)); pack_vars_pair_dens->ci_list = (struct cell **)calloc(target_n_tasks, sizeof(struct cell *)); @@ -654,6 +670,8 @@ void *runner_main2(void *data) { pack_vars_pair_grad->tasks_packed = 0; pack_vars_self_grad->count_parts = 0; pack_vars_pair_grad->count_parts = 0; + for(int i = 0; i < target_n_tasks; i++) + pack_vars_pair_dens->leaf_list[i].n_leaves = 0; int total_tasks_packed_this_time_pair = 0; double packing_time = 0.0; @@ -946,27 +964,29 @@ void *runner_main2(void *data) { * We are recursing separately to find out how much work we have before offloading*/ //We need to allocate a list to put cell pointers into for each new task int n_expected_tasks = 4096; //A. Nasar: Need to come up with a good estimate for this - int n_leafs_found = 0; + int n_leaves_found = 0; int depth = 0; struct cell * cells_left[n_expected_tasks]; struct cell * cells_right[n_expected_tasks]; runner_recurse_gpu(r, sched, pack_vars_pair_dens, ci, cj, t, - parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens, &n_leafs_found, + parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens, &n_leaves_found, cells_left, cells_right, depth, n_expected_tasks); - n_leafs_total += n_leafs_found; + n_leafs_total += n_leaves_found; int cstart = 0, cid = 0; pack_vars_pair_dens->top_task_list[pack_vars_pair_dens->top_tasks_packed] = t; pack_vars_pair_dens->top_tasks_packed++; pack_vars_pair_dens->task_locked = 1; + //This might be abit iffy setting it to zero here. What if we loop through a task twice for recursion but do not offload the second time? 
We could be unpacking to the wrong leaves + pack_vars_pair_dens->leaf_list[pack_vars_pair_dens->top_tasks_packed - 1].n_packed = 0; int t_s, t_e; t_s = 0; int n_t_tasks = pack_vars_pair_dens->target_n_tasks; t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; - while(cstart < n_leafs_found){ + while(cstart < n_leaves_found){ tic_cpu_pack = getticks(); /*Loop through n_daughters such that the pack_vars_pair_dens counters are updated*/ - while(cstart < n_leafs_found && pack_vars_pair_dens->tasks_packed < n_t_tasks){ + while(cstart < n_leaves_found && pack_vars_pair_dens->tasks_packed < n_t_tasks){ packing_time_pair += runner_dopair1_pack_f4( r, sched, pack_vars_pair_dens, cells_left[cstart], cells_right[cstart], t, parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens, cstart); @@ -974,8 +994,6 @@ void *runner_main2(void *data) { error("Packed more parts than possible"); cstart++; } - pack_vars_pair_dens->task_locked = 0; - cid = cstart; /* Copies done. Release the lock ! */ t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; /* Packed enough tasks or no pack tasks left in queue, flag that @@ -983,8 +1001,10 @@ void *runner_main2(void *data) { int launch = pack_vars_pair_dens->launch; int launch_leftovers = pack_vars_pair_dens->launch_leftovers; /* Do we have enough stuff to run the GPU ? */ - if (launch || (launch_leftovers && cstart == n_leafs_found)) { + if (launch || launch_leftovers) { /*Launch GPU tasks*/ +// cell_unlocktree(ci); +// cell_unlocktree(cj); int t_packed = pack_vars_pair_dens->tasks_packed; runner_dopair1_launch_f4_one_memcpy( r, sched, pack_vars_pair_dens, t, parts_aos_pair_f4_send, @@ -1000,17 +1020,25 @@ void *runner_main2(void *data) { d_parts_aos_pair_f4_recv, stream_pairs, d_a, d_H, e, &packing_time_pair, &time_for_density_gpu_pair, &unpacking_time_pair, fparti_fpartj_lparti_lpartj_dens, - pair_end, cstart, n_leafs_found); + pair_end, cstart, n_leaves_found); /*This ensure that if we still have leaves left we start at index 1. Otherwise, reset the index since we will be grabbing a new task*/ - if(cstart == n_leafs_found) + if(cstart == n_leaves_found){ pack_vars_pair_dens->top_tasks_packed = 0; + } else{ pack_vars_pair_dens->top_tasks_packed = 1; pack_vars_pair_dens->top_task_list[0] = t; +// pack_vars_pair_dens->leaf_list[0].n_packed = 0; + pack_vars_pair_dens->leaf_list[0].ci[0] = + pack_vars_pair_dens->leaf_list[pack_vars_pair_dens->top_tasks_packed - 1].ci[pack_vars_pair_dens->tasks_packed - 1]; + pack_vars_pair_dens->leaf_list[0].cj[0] = + pack_vars_pair_dens->leaf_list[pack_vars_pair_dens->top_tasks_packed - 1].cj[pack_vars_pair_dens->tasks_packed - 1]; } /*This makes it such that the remaining leaf tasks are packed starting from a - fresh list since we are still in the while cstart < n_leafs_found loop*/ + fresh list since we are still in the while cstart < n_leaves_found loop*/ + + pack_vars_pair_dens->leaf_list[0].n_packed = 0; pack_vars_pair_dens->tasks_packed = 0; } /////////////////////////////////////////////////////////////////////// From e3cf866008376f8318ba460c83a807402c6383c8 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Tue, 18 Mar 2025 05:00:44 +0000 Subject: [PATCH 208/217] Spotted a few bugs but not sorted the problem out with unpacking to the new top_task->ci/cj arrays. 
Code now crashes when trying to recurse and add entries to the top_task->ci/cj arrays --- src/runner_doiact_functions_hydro_gpu.h | 23 ++++++--- src/runner_main_clean.cu | 69 ++++++++++++++++++------- 2 files changed, 64 insertions(+), 28 deletions(-) diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index 926d845c33..36857102c5 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -331,14 +331,16 @@ void runner_recurse_gpu(struct runner *r, struct scheduler *s, else if (CELL_IS_ACTIVE(ci, e) || CELL_IS_ACTIVE(cj, e)) { /* if any cell empty: skip */ if(ci->hydro.count == 0 || cj->hydro.count == 0) return; + int leafs_found = *n_leafs_found; /*for all leafs to be sent add to cell list */ - cells_left[*n_leafs_found] = ci; - cells_right[*n_leafs_found] = cj; + cells_left[leafs_found] = ci; + cells_right[leafs_found] = cj; /*Add leaf cells to list for each top_level task*/ - pack_vars->leaf_list[pack_vars->top_tasks_packed].ci[*n_leafs_found] = ci; - pack_vars->leaf_list[pack_vars->top_tasks_packed].cj[*n_leafs_found] = cj; + pack_vars->leaf_list[pack_vars->top_tasks_packed].ci[leafs_found] = ci; + pack_vars->leaf_list[pack_vars->top_tasks_packed].cj[leafs_found] = cj; pack_vars->leaf_list[pack_vars->top_tasks_packed].n_leaves++; - *n_leafs_found = *n_leafs_found + 1; +// error("stop"); + *n_leafs_found = leafs_found + 1; if(*n_leafs_found >= n_expected_tasks) error("Created %i more than expected leaf cells. depth %i", *n_leafs_found, depth); } @@ -1601,10 +1603,10 @@ void runner_dopair1_unpack_f4( int topid; int pack_length_unpack = 0; ticks total_cpu_unpack_ticks = 0; - for (topid = 0; topid < pack_vars->top_tasks_packed; topid++) { + for (topid = 0; topid < pack_vars->top_tasks_packed - 1; topid++) { //lock top level cell here - struct cell * cii = pack_vars->top_task_list[topid]->ci; - struct cell * cjj = pack_vars->top_task_list[topid]->cj; +// struct cell * cii = pack_vars->top_task_list[topid]->ci; +// struct cell * cjj = pack_vars->top_task_list[topid]->cj; const ticks tic = getticks(); /* Do the copy */ @@ -1613,6 +1615,11 @@ void runner_dopair1_unpack_f4( //Get pointers to the leaf cells. 
SEEMS I'm NOT GETTING A CORRECT POINTER struct cell * cii_l = pack_vars->leaf_list[topid].ci[tid]; struct cell * cjj_l = pack_vars->leaf_list[topid].cj[tid]; + message("loc %f %f %f topid %i tid %i nleaves %i", pack_vars->leaf_list[topid].ci[tid]->loc[0] + , pack_vars->leaf_list[topid].ci[tid]->loc[1] + , pack_vars->leaf_list[topid].ci[tid]->loc[2] + , topid, tid, n_leaves_in_task); +// if(*cii_l == NULL || *cjj_l == NULL)error("stop"); runner_do_ci_cj_gpu_unpack_neat_aos_f4( r, cii_l, cjj_l, parts_recv, 0, &pack_length_unpack, tid, 2 * pack_vars->count_max_parts, e); diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 67a805f9b4..4879158b53 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -547,17 +547,17 @@ void *runner_main2(void *data) { struct leaf_cell_list l_list[target_n_tasks]; pack_vars_pair_dens->leaf_list = (struct leaf_cell_list *)calloc(target_n_tasks, sizeof(struct leaf_cell_list)); for (int i = 0; i < target_n_tasks; i++){ - l_list[i].ci = (struct cell **)calloc(n_leaves_max, sizeof(struct cell *)); - l_list[i].cj = (struct cell **)calloc(n_leaves_max, sizeof(struct cell *)); - l_list[i].n_leaves = 0; - pack_vars_pair_dens->leaf_list[i].ci = (struct cell **)calloc(n_leaves_max, sizeof(struct cell *)); - pack_vars_pair_dens->leaf_list[i].cj = (struct cell **)calloc(n_leaves_max, sizeof(struct cell *)); +// l_list[i].ci = (struct cell **)calloc(n_leaves_max, sizeof(struct cell *)); +// l_list[i].cj = (struct cell **)calloc(n_leaves_max, sizeof(struct cell *)); +// l_list[i].n_leaves = 0; + pack_vars_pair_dens->leaf_list[i].ci = malloc(n_leaves_max * sizeof(struct cell *)); + pack_vars_pair_dens->leaf_list[i].cj = malloc(n_leaves_max * sizeof(struct cell *)); pack_vars_pair_dens->leaf_list[i].n_leaves = 0; - for (int j = 0; j < n_leaves_max; j++){ - pack_vars_pair_dens->leaf_list[i].ci[j] = l_list[i].ci[j]; - pack_vars_pair_dens->leaf_list[i].cj[j] = l_list[i].cj[j]; - - } + pack_vars_pair_dens->leaf_list[i].n_packed = 0; +// for (int j = 0; j < n_leaves_max; j++){ +// pack_vars_pair_dens->leaf_list[i].ci[j] = l_list[i].ci[j]; +// pack_vars_pair_dens->leaf_list[i].cj[j] = l_list[i].cj[j]; +// } } // pack_vars_pair_dens->leaf_list = l_list; // pack_vars_pair_dens->leaf_list->ci = @@ -965,26 +965,46 @@ void *runner_main2(void *data) { //We need to allocate a list to put cell pointers into for each new task int n_expected_tasks = 4096; //A. 
Nasar: Need to come up with a good estimate for this int n_leaves_found = 0; + int top_tasks_packed = pack_vars_pair_dens->top_tasks_packed; int depth = 0; struct cell * cells_left[n_expected_tasks]; struct cell * cells_right[n_expected_tasks]; + pack_vars_pair_dens->leaf_list[top_tasks_packed].n_leaves = 0; runner_recurse_gpu(r, sched, pack_vars_pair_dens, ci, cj, t, parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens, &n_leaves_found, cells_left, cells_right, depth, n_expected_tasks); n_leafs_total += n_leaves_found; - +// if(n_leaves_found > 4 && r->cpuid == 0){ +// fprintf(stderr, "leaves found %i\n", n_leaves_found); +// for (int i = 0; i< n_leaves_found; i++){ +// int tt = pack_vars_pair_dens->top_tasks_packed; +// fprintf(stderr, "ci->loc %f %f %f\n", +// pack_vars_pair_dens->leaf_list[tt].ci[i]->loc[0], +// pack_vars_pair_dens->leaf_list[tt].ci[i]->loc[1], +// pack_vars_pair_dens->leaf_list[tt].ci[i]->loc[2]); +// +// fprintf(stderr, "cj->loc %f %f %f\n\n", +// pack_vars_pair_dens->leaf_list[tt].cj[i]->loc[0], +// pack_vars_pair_dens->leaf_list[tt].cj[i]->loc[1], +// pack_vars_pair_dens->leaf_list[tt].cj[i]->loc[2]); +// } +// error("stop"); +// } int cstart = 0, cid = 0; - pack_vars_pair_dens->top_task_list[pack_vars_pair_dens->top_tasks_packed] = t; + pack_vars_pair_dens->top_task_list[top_tasks_packed] = t; + //This might be abit iffy setting it to zero here. What if we loop through a task twice for recursion but do not offload the second time? We could be unpacking to the wrong leaves + pack_vars_pair_dens->leaf_list[top_tasks_packed].n_packed = 0; pack_vars_pair_dens->top_tasks_packed++; pack_vars_pair_dens->task_locked = 1; - //This might be abit iffy setting it to zero here. What if we loop through a task twice for recursion but do not offload the second time? 
We could be unpacking to the wrong leaves - pack_vars_pair_dens->leaf_list[pack_vars_pair_dens->top_tasks_packed - 1].n_packed = 0; int t_s, t_e; t_s = 0; int n_t_tasks = pack_vars_pair_dens->target_n_tasks; t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; while(cstart < n_leaves_found){ tic_cpu_pack = getticks(); + + pack_vars_pair_dens->launch_leftovers = 0; + pack_vars_pair_dens->launch = 0; /*Loop through n_daughters such that the pack_vars_pair_dens counters are updated*/ while(cstart < n_leaves_found && pack_vars_pair_dens->tasks_packed < n_t_tasks){ packing_time_pair += runner_dopair1_pack_f4( @@ -1025,21 +1045,30 @@ void *runner_main2(void *data) { Otherwise, reset the index since we will be grabbing a new task*/ if(cstart == n_leaves_found){ pack_vars_pair_dens->top_tasks_packed = 0; + pack_vars_pair_dens->tasks_packed = 0; + pack_vars_pair_dens->leaf_list[0].ci = NULL; + pack_vars_pair_dens->leaf_list[0].cj = NULL; + pack_vars_pair_dens->leaf_list[0].n_leaves = 0; + pack_vars_pair_dens->leaf_list[0].n_packed = 0; } else{ - pack_vars_pair_dens->top_tasks_packed = 1; - pack_vars_pair_dens->top_task_list[0] = t; + pack_vars_pair_dens->leaf_list[0].ci[0] = + pack_vars_pair_dens->leaf_list[pack_vars_pair_dens->top_tasks_packed - 1].ci[pack_vars_pair_dens->tasks_packed - 1]; + pack_vars_pair_dens->leaf_list[0].cj[0] = + pack_vars_pair_dens->leaf_list[pack_vars_pair_dens->top_tasks_packed - 1].cj[pack_vars_pair_dens->tasks_packed - 1]; + + pack_vars_pair_dens->tasks_packed = 0; + pack_vars_pair_dens->top_tasks_packed = 1; + pack_vars_pair_dens->top_task_list[0] = t; // pack_vars_pair_dens->leaf_list[0].n_packed = 0; - pack_vars_pair_dens->leaf_list[0].ci[0] = - pack_vars_pair_dens->leaf_list[pack_vars_pair_dens->top_tasks_packed - 1].ci[pack_vars_pair_dens->tasks_packed - 1]; - pack_vars_pair_dens->leaf_list[0].cj[0] = - pack_vars_pair_dens->leaf_list[pack_vars_pair_dens->top_tasks_packed - 1].cj[pack_vars_pair_dens->tasks_packed - 1]; } /*This makes it such that the remaining leaf tasks are packed starting from a fresh list since we are still in the while cstart < n_leaves_found loop*/ pack_vars_pair_dens->leaf_list[0].n_packed = 0; pack_vars_pair_dens->tasks_packed = 0; + pack_vars_pair_dens->launch_leftovers = 0; + pack_vars_pair_dens->launch = 0; } /////////////////////////////////////////////////////////////////////// } From 1625f17ec93cc019047bf1c1c7c566b306526323 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Sat, 22 Mar 2025 05:14:22 +0000 Subject: [PATCH 209/217] Changed setup so tracking cells is via weird cell_list_i arrays and still get same issue. 
It is not because of how I am allocating the arrays the issue is in how I am managing the counting and book keeping --- src/runner_doiact_functions_hydro_gpu.h | 29 ++++++++------- src/runner_main_clean.cu | 47 +++++++++++++++++-------- 2 files changed, 49 insertions(+), 27 deletions(-) diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index 36857102c5..3c6bf75740 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -36,7 +36,9 @@ struct pack_vars_self { int tasksperbundle; } pack_vars_self; + struct leaf_cell_list{ + struct cell_list; struct cell **ci; struct cell **cj; int n_leaves; @@ -301,7 +303,8 @@ void runner_recurse_gpu(struct runner *r, struct scheduler *s, struct part_aos_f4_send *parts_send, struct engine *e, int4 *fparti_fpartj_lparti_lpartj, int *n_leafs_found, - struct cell ** cells_left, struct cell ** cells_right, int depth, int n_expected_tasks) { + struct cell ** cells_left, struct cell ** cells_right, + int depth, int n_expected_tasks, struct cell ****cell_list_i, struct cell ****cell_list_j) { /* Should we even bother? A. Nasar: For GPU code we need to be clever about this */ if (!CELL_IS_ACTIVE(ci, e) && !CELL_IS_ACTIVE(cj, e)) return; @@ -323,7 +326,7 @@ void runner_recurse_gpu(struct runner *r, struct scheduler *s, /*We probably want to record */ if (ci->progeny[pid] != NULL && cj->progeny[pjd] != NULL){ runner_recurse_gpu(r, s, pack_vars, ci->progeny[pid], cj->progeny[pjd], t, parts_send, e, fparti_fpartj_lparti_lpartj, - n_leafs_found, cells_left, cells_right, depth + 1, n_expected_tasks); + n_leafs_found, cells_left, cells_right, depth + 1, n_expected_tasks, cell_list_i, cell_list_j); // message("recursing to depth %i", depth + 1); } } @@ -336,10 +339,11 @@ void runner_recurse_gpu(struct runner *r, struct scheduler *s, cells_left[leafs_found] = ci; cells_right[leafs_found] = cj; /*Add leaf cells to list for each top_level task*/ - pack_vars->leaf_list[pack_vars->top_tasks_packed].ci[leafs_found] = ci; - pack_vars->leaf_list[pack_vars->top_tasks_packed].cj[leafs_found] = cj; +// pack_vars->leaf_list[pack_vars->top_tasks_packed].ci[leafs_found] = ci; +// pack_vars->leaf_list[pack_vars->top_tasks_packed].cj[leafs_found] = cj; + cell_list_i[pack_vars->top_tasks_packed][leafs_found] = ci; + cell_list_j[pack_vars->top_tasks_packed][leafs_found] = cj; pack_vars->leaf_list[pack_vars->top_tasks_packed].n_leaves++; -// error("stop"); *n_leafs_found = leafs_found + 1; if(*n_leafs_found >= n_expected_tasks) error("Created %i more than expected leaf cells. depth %i", *n_leafs_found, depth); @@ -1598,7 +1602,7 @@ void runner_dopair1_unpack_f4( struct part_aos_f4_recv *d_parts_recv, cudaStream_t *stream, float d_a, float d_H, struct engine *e, double *packing_time, double *gpu_time, double *unpack_time, int4 *fparti_fpartj_lparti_lpartj_dens, - cudaEvent_t *pair_end, int cstart, int n_leaves_found){ + cudaEvent_t *pair_end, int cstart, int n_leaves_found, struct cell ****cell_list_i, struct cell ****cell_list_j){ int topid; int pack_length_unpack = 0; @@ -1613,12 +1617,13 @@ void runner_dopair1_unpack_f4( int n_leaves_in_task = pack_vars->leaf_list[topid].n_packed; for(int tid = 0; tid < n_leaves_in_task; tid++){ //Get pointers to the leaf cells. 
SEEMS I'm NOT GETTING A CORRECT POINTER - struct cell * cii_l = pack_vars->leaf_list[topid].ci[tid]; - struct cell * cjj_l = pack_vars->leaf_list[topid].cj[tid]; - message("loc %f %f %f topid %i tid %i nleaves %i", pack_vars->leaf_list[topid].ci[tid]->loc[0] - , pack_vars->leaf_list[topid].ci[tid]->loc[1] - , pack_vars->leaf_list[topid].ci[tid]->loc[2] - , topid, tid, n_leaves_in_task); +// error("Here"); + struct cell * cii_l = cell_list_i[topid][tid]; + struct cell * cjj_l = cell_list_j[topid][tid]; +// message("loc %f %f %f topid %i tid %i nleaves %i", l_list_p_d[topid].ci[tid]->loc[0] +// , l_list_p_d[topid].ci[tid]->loc[1] +// , l_list_p_d[topid].ci[tid]->loc[2] +// , topid, tid, n_leaves_in_task); // if(*cii_l == NULL || *cjj_l == NULL)error("stop"); runner_do_ci_cj_gpu_unpack_neat_aos_f4( r, cii_l, cjj_l, parts_recv, 0, &pack_length_unpack, tid, diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 4879158b53..4bf58cd806 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -544,21 +544,38 @@ void *runner_main2(void *data) { (struct task **)calloc(target_n_tasks, sizeof(struct task *)); int n_leaves_max = 4096; /*Allocate target_n_tasks for top level tasks. This is a 2D array with length target_n_tasks and width n_leaves_max*/ - struct leaf_cell_list l_list[target_n_tasks]; + struct leaf_cell_list l_list_p_d[target_n_tasks]; + struct cell **** cell_list_i = (struct cell ****)malloc(target_n_tasks * sizeof(struct cell ***)); + struct cell **** cell_list_j = (struct cell ****)malloc(target_n_tasks * sizeof(struct cell ***)); pack_vars_pair_dens->leaf_list = (struct leaf_cell_list *)calloc(target_n_tasks, sizeof(struct leaf_cell_list)); for (int i = 0; i < target_n_tasks; i++){ -// l_list[i].ci = (struct cell **)calloc(n_leaves_max, sizeof(struct cell *)); -// l_list[i].cj = (struct cell **)calloc(n_leaves_max, sizeof(struct cell *)); + cell_list_i[i] = (struct cell ***) malloc(n_leaves_max * sizeof(struct cell **)); + cell_list_j[i] = (struct cell ***) malloc(n_leaves_max * sizeof(struct cell **)); + for(int j =0; j < n_leaves_max; j++){ + cell_list_i[i][j] = (struct cell **) malloc(sizeof(struct cell *)); + cell_list_j[i][j] = (struct cell **) malloc(sizeof(struct cell *)); + } +// l_list_p_d[i].cell_list.ci = malloc(n_leaves_max * sizeof (struct cell *));//calloc(n_leaves_max, sizeof(struct cell *)); +// l_list_p_d[i].cell_list.ci = malloc(n_leaves_max * sizeof (struct cell *));//calloc(n_leaves_max, sizeof(struct cell *)); +// l_list_p_d[i].ci_list = malloc(n_leaves_max * sizeof (struct cell *));//calloc(n_leaves_max, sizeof(struct cell *)); +// l_list_p_d[i].cj_list = malloc(n_leaves_max * sizeof (struct cell *));//calloc(n_leaves_max, sizeof(struct cell *)); +// for(int j = 0; j < n_leaves_max; j++){ +// l_list_p_d[i].ci[j] = malloc(sizeof (struct cell *)); +// l_list_p_d[i].cj[j] = malloc(sizeof (struct cell *)); +// message("Alocating"); +// } // l_list[i].n_leaves = 0; - pack_vars_pair_dens->leaf_list[i].ci = malloc(n_leaves_max * sizeof(struct cell *)); - pack_vars_pair_dens->leaf_list[i].cj = malloc(n_leaves_max * sizeof(struct cell *)); - pack_vars_pair_dens->leaf_list[i].n_leaves = 0; - pack_vars_pair_dens->leaf_list[i].n_packed = 0; +// pack_vars_pair_dens->leaf_list[i].ci = malloc(n_leaves_max * sizeof(struct cell *)); +// pack_vars_pair_dens->leaf_list[i].cj = malloc(n_leaves_max * sizeof(struct cell *)); +// pack_vars_pair_dens->leaf_list[i].n_leaves = 0; +// pack_vars_pair_dens->leaf_list[i].n_packed = 0; // for (int j = 0; j < 
n_leaves_max; j++){ // pack_vars_pair_dens->leaf_list[i].ci[j] = l_list[i].ci[j]; // pack_vars_pair_dens->leaf_list[i].cj[j] = l_list[i].cj[j]; // } } + +// error("Alloced"); // pack_vars_pair_dens->leaf_list = l_list; // pack_vars_pair_dens->leaf_list->ci = // (struct cell **)calloc(n_leaves_max, sizeof(struct cell *)); @@ -972,7 +989,7 @@ void *runner_main2(void *data) { pack_vars_pair_dens->leaf_list[top_tasks_packed].n_leaves = 0; runner_recurse_gpu(r, sched, pack_vars_pair_dens, ci, cj, t, parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens, &n_leaves_found, - cells_left, cells_right, depth, n_expected_tasks); + cells_left, cells_right, depth, n_expected_tasks, cell_list_i, cell_list_j); n_leafs_total += n_leaves_found; // if(n_leaves_found > 4 && r->cpuid == 0){ // fprintf(stderr, "leaves found %i\n", n_leaves_found); @@ -1040,22 +1057,22 @@ void *runner_main2(void *data) { d_parts_aos_pair_f4_recv, stream_pairs, d_a, d_H, e, &packing_time_pair, &time_for_density_gpu_pair, &unpacking_time_pair, fparti_fpartj_lparti_lpartj_dens, - pair_end, cstart, n_leaves_found); + pair_end, cstart, n_leaves_found, cell_list_i, cell_list_j); /*This ensure that if we still have leaves left we start at index 1. Otherwise, reset the index since we will be grabbing a new task*/ if(cstart == n_leaves_found){ pack_vars_pair_dens->top_tasks_packed = 0; pack_vars_pair_dens->tasks_packed = 0; - pack_vars_pair_dens->leaf_list[0].ci = NULL; - pack_vars_pair_dens->leaf_list[0].cj = NULL; +// l_list_p_d[0].ci = NULL; +// l_list_p_d[0].cj = NULL; pack_vars_pair_dens->leaf_list[0].n_leaves = 0; pack_vars_pair_dens->leaf_list[0].n_packed = 0; } else{ - pack_vars_pair_dens->leaf_list[0].ci[0] = - pack_vars_pair_dens->leaf_list[pack_vars_pair_dens->top_tasks_packed - 1].ci[pack_vars_pair_dens->tasks_packed - 1]; - pack_vars_pair_dens->leaf_list[0].cj[0] = - pack_vars_pair_dens->leaf_list[pack_vars_pair_dens->top_tasks_packed - 1].cj[pack_vars_pair_dens->tasks_packed - 1]; + cell_list_i[0][0] = + cell_list_i[pack_vars_pair_dens->top_tasks_packed - 1][pack_vars_pair_dens->tasks_packed - 1]; + cell_list_j[0][0] = + cell_list_j[pack_vars_pair_dens->top_tasks_packed - 1][pack_vars_pair_dens->tasks_packed - 1]; pack_vars_pair_dens->tasks_packed = 0; pack_vars_pair_dens->top_tasks_packed = 1; From 8c7948251680868fc9067ddfb234286cb72fad4b Mon Sep 17 00:00:00 2001 From: Abouzied Date: Wed, 26 Mar 2025 06:13:05 +0000 Subject: [PATCH 210/217] Revert "Changed setup so tracking cells is via weird cell_list_i arrays and still get same issue. It is not because of how I am allocating the arrays the issue is in how I am managing the counting and book keeping" This reverts commit 1625f17ec93cc019047bf1c1c7c566b306526323. 
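For orientation, the bookkeeping that the patches in this stretch of the series keep reworking is a per-top-task leaf list: each offloaded pair task records the leaf-cell pairs found under it by the recursion, and the unpack step later walks the packed subset of that list. The fragment below is only a minimal illustrative sketch of that idea, not code from the tree: struct cell is reduced to a stub, the progeny pairing is simplified to matching indices (the real recursion pairs progeny according to the pair's orientation), and the *_sketch names are made up.

#include <stddef.h>

/* Stand-in for struct cell: only what the sketch needs. */
struct cell_stub {
  int split;                    /* non-zero if the cell has daughters */
  int hydro_count;              /* stands in for c->hydro.count */
  struct cell_stub *progeny[8]; /* daughters; entries may be NULL */
};

/* Per-top-task list of leaf pairs, mirroring the leaf_cell_list fields used above. */
struct leaf_list_sketch {
  struct cell_stub **ci; /* left leaf of each recorded pair */
  struct cell_stub **cj; /* right leaf of each recorded pair */
  int n_leaves;          /* pairs recorded by the recursion */
  int n_packed;          /* pairs packed for the current GPU launch */
};

/* Walk matching progeny of a top-level pair and append every non-empty leaf
 * pair to the list: the role runner_recurse_gpu() plays on the GPU path. */
static void record_leaf_pairs_sketch(struct cell_stub *ci, struct cell_stub *cj,
                                     struct leaf_list_sketch *list, int n_max) {
  if (ci->split && cj->split) {
    /* Both cells are split: recurse into the daughters (pairing simplified). */
    for (int k = 0; k < 8; k++)
      if (ci->progeny[k] != NULL && cj->progeny[k] != NULL)
        record_leaf_pairs_sketch(ci->progeny[k], cj->progeny[k], list, n_max);
    return;
  }
  if (ci->hydro_count == 0 || cj->hydro_count == 0) return; /* nothing to pack */
  if (list->n_leaves >= n_max) return; /* same guard as the error() call above */
  list->ci[list->n_leaves] = ci;
  list->cj[list->n_leaves] = cj;
  list->n_leaves++;
}

The packing loop then consumes these lists and bumps n_packed, and each GPU launch has to reset the counters so that any leftover leaves are packed into a fresh buffer; keeping that reset consistent is exactly what the remaining patches in this stretch are debugging.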
--- src/runner_doiact_functions_hydro_gpu.h | 29 +++++++-------- src/runner_main_clean.cu | 47 ++++++++----------------- 2 files changed, 27 insertions(+), 49 deletions(-) diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index 3c6bf75740..36857102c5 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -36,9 +36,7 @@ struct pack_vars_self { int tasksperbundle; } pack_vars_self; - struct leaf_cell_list{ - struct cell_list; struct cell **ci; struct cell **cj; int n_leaves; @@ -303,8 +301,7 @@ void runner_recurse_gpu(struct runner *r, struct scheduler *s, struct part_aos_f4_send *parts_send, struct engine *e, int4 *fparti_fpartj_lparti_lpartj, int *n_leafs_found, - struct cell ** cells_left, struct cell ** cells_right, - int depth, int n_expected_tasks, struct cell ****cell_list_i, struct cell ****cell_list_j) { + struct cell ** cells_left, struct cell ** cells_right, int depth, int n_expected_tasks) { /* Should we even bother? A. Nasar: For GPU code we need to be clever about this */ if (!CELL_IS_ACTIVE(ci, e) && !CELL_IS_ACTIVE(cj, e)) return; @@ -326,7 +323,7 @@ void runner_recurse_gpu(struct runner *r, struct scheduler *s, /*We probably want to record */ if (ci->progeny[pid] != NULL && cj->progeny[pjd] != NULL){ runner_recurse_gpu(r, s, pack_vars, ci->progeny[pid], cj->progeny[pjd], t, parts_send, e, fparti_fpartj_lparti_lpartj, - n_leafs_found, cells_left, cells_right, depth + 1, n_expected_tasks, cell_list_i, cell_list_j); + n_leafs_found, cells_left, cells_right, depth + 1, n_expected_tasks); // message("recursing to depth %i", depth + 1); } } @@ -339,11 +336,10 @@ void runner_recurse_gpu(struct runner *r, struct scheduler *s, cells_left[leafs_found] = ci; cells_right[leafs_found] = cj; /*Add leaf cells to list for each top_level task*/ -// pack_vars->leaf_list[pack_vars->top_tasks_packed].ci[leafs_found] = ci; -// pack_vars->leaf_list[pack_vars->top_tasks_packed].cj[leafs_found] = cj; - cell_list_i[pack_vars->top_tasks_packed][leafs_found] = ci; - cell_list_j[pack_vars->top_tasks_packed][leafs_found] = cj; + pack_vars->leaf_list[pack_vars->top_tasks_packed].ci[leafs_found] = ci; + pack_vars->leaf_list[pack_vars->top_tasks_packed].cj[leafs_found] = cj; pack_vars->leaf_list[pack_vars->top_tasks_packed].n_leaves++; +// error("stop"); *n_leafs_found = leafs_found + 1; if(*n_leafs_found >= n_expected_tasks) error("Created %i more than expected leaf cells. depth %i", *n_leafs_found, depth); @@ -1602,7 +1598,7 @@ void runner_dopair1_unpack_f4( struct part_aos_f4_recv *d_parts_recv, cudaStream_t *stream, float d_a, float d_H, struct engine *e, double *packing_time, double *gpu_time, double *unpack_time, int4 *fparti_fpartj_lparti_lpartj_dens, - cudaEvent_t *pair_end, int cstart, int n_leaves_found, struct cell ****cell_list_i, struct cell ****cell_list_j){ + cudaEvent_t *pair_end, int cstart, int n_leaves_found){ int topid; int pack_length_unpack = 0; @@ -1617,13 +1613,12 @@ void runner_dopair1_unpack_f4( int n_leaves_in_task = pack_vars->leaf_list[topid].n_packed; for(int tid = 0; tid < n_leaves_in_task; tid++){ //Get pointers to the leaf cells. 
SEEMS I'm NOT GETTING A CORRECT POINTER -// error("Here"); - struct cell * cii_l = cell_list_i[topid][tid]; - struct cell * cjj_l = cell_list_j[topid][tid]; -// message("loc %f %f %f topid %i tid %i nleaves %i", l_list_p_d[topid].ci[tid]->loc[0] -// , l_list_p_d[topid].ci[tid]->loc[1] -// , l_list_p_d[topid].ci[tid]->loc[2] -// , topid, tid, n_leaves_in_task); + struct cell * cii_l = pack_vars->leaf_list[topid].ci[tid]; + struct cell * cjj_l = pack_vars->leaf_list[topid].cj[tid]; + message("loc %f %f %f topid %i tid %i nleaves %i", pack_vars->leaf_list[topid].ci[tid]->loc[0] + , pack_vars->leaf_list[topid].ci[tid]->loc[1] + , pack_vars->leaf_list[topid].ci[tid]->loc[2] + , topid, tid, n_leaves_in_task); // if(*cii_l == NULL || *cjj_l == NULL)error("stop"); runner_do_ci_cj_gpu_unpack_neat_aos_f4( r, cii_l, cjj_l, parts_recv, 0, &pack_length_unpack, tid, diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 4bf58cd806..4879158b53 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -544,38 +544,21 @@ void *runner_main2(void *data) { (struct task **)calloc(target_n_tasks, sizeof(struct task *)); int n_leaves_max = 4096; /*Allocate target_n_tasks for top level tasks. This is a 2D array with length target_n_tasks and width n_leaves_max*/ - struct leaf_cell_list l_list_p_d[target_n_tasks]; - struct cell **** cell_list_i = (struct cell ****)malloc(target_n_tasks * sizeof(struct cell ***)); - struct cell **** cell_list_j = (struct cell ****)malloc(target_n_tasks * sizeof(struct cell ***)); + struct leaf_cell_list l_list[target_n_tasks]; pack_vars_pair_dens->leaf_list = (struct leaf_cell_list *)calloc(target_n_tasks, sizeof(struct leaf_cell_list)); for (int i = 0; i < target_n_tasks; i++){ - cell_list_i[i] = (struct cell ***) malloc(n_leaves_max * sizeof(struct cell **)); - cell_list_j[i] = (struct cell ***) malloc(n_leaves_max * sizeof(struct cell **)); - for(int j =0; j < n_leaves_max; j++){ - cell_list_i[i][j] = (struct cell **) malloc(sizeof(struct cell *)); - cell_list_j[i][j] = (struct cell **) malloc(sizeof(struct cell *)); - } -// l_list_p_d[i].cell_list.ci = malloc(n_leaves_max * sizeof (struct cell *));//calloc(n_leaves_max, sizeof(struct cell *)); -// l_list_p_d[i].cell_list.ci = malloc(n_leaves_max * sizeof (struct cell *));//calloc(n_leaves_max, sizeof(struct cell *)); -// l_list_p_d[i].ci_list = malloc(n_leaves_max * sizeof (struct cell *));//calloc(n_leaves_max, sizeof(struct cell *)); -// l_list_p_d[i].cj_list = malloc(n_leaves_max * sizeof (struct cell *));//calloc(n_leaves_max, sizeof(struct cell *)); -// for(int j = 0; j < n_leaves_max; j++){ -// l_list_p_d[i].ci[j] = malloc(sizeof (struct cell *)); -// l_list_p_d[i].cj[j] = malloc(sizeof (struct cell *)); -// message("Alocating"); -// } +// l_list[i].ci = (struct cell **)calloc(n_leaves_max, sizeof(struct cell *)); +// l_list[i].cj = (struct cell **)calloc(n_leaves_max, sizeof(struct cell *)); // l_list[i].n_leaves = 0; -// pack_vars_pair_dens->leaf_list[i].ci = malloc(n_leaves_max * sizeof(struct cell *)); -// pack_vars_pair_dens->leaf_list[i].cj = malloc(n_leaves_max * sizeof(struct cell *)); -// pack_vars_pair_dens->leaf_list[i].n_leaves = 0; -// pack_vars_pair_dens->leaf_list[i].n_packed = 0; + pack_vars_pair_dens->leaf_list[i].ci = malloc(n_leaves_max * sizeof(struct cell *)); + pack_vars_pair_dens->leaf_list[i].cj = malloc(n_leaves_max * sizeof(struct cell *)); + pack_vars_pair_dens->leaf_list[i].n_leaves = 0; + pack_vars_pair_dens->leaf_list[i].n_packed = 0; // for (int j = 0; j < 
n_leaves_max; j++){ // pack_vars_pair_dens->leaf_list[i].ci[j] = l_list[i].ci[j]; // pack_vars_pair_dens->leaf_list[i].cj[j] = l_list[i].cj[j]; // } } - -// error("Alloced"); // pack_vars_pair_dens->leaf_list = l_list; // pack_vars_pair_dens->leaf_list->ci = // (struct cell **)calloc(n_leaves_max, sizeof(struct cell *)); @@ -989,7 +972,7 @@ void *runner_main2(void *data) { pack_vars_pair_dens->leaf_list[top_tasks_packed].n_leaves = 0; runner_recurse_gpu(r, sched, pack_vars_pair_dens, ci, cj, t, parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens, &n_leaves_found, - cells_left, cells_right, depth, n_expected_tasks, cell_list_i, cell_list_j); + cells_left, cells_right, depth, n_expected_tasks); n_leafs_total += n_leaves_found; // if(n_leaves_found > 4 && r->cpuid == 0){ // fprintf(stderr, "leaves found %i\n", n_leaves_found); @@ -1057,22 +1040,22 @@ void *runner_main2(void *data) { d_parts_aos_pair_f4_recv, stream_pairs, d_a, d_H, e, &packing_time_pair, &time_for_density_gpu_pair, &unpacking_time_pair, fparti_fpartj_lparti_lpartj_dens, - pair_end, cstart, n_leaves_found, cell_list_i, cell_list_j); + pair_end, cstart, n_leaves_found); /*This ensure that if we still have leaves left we start at index 1. Otherwise, reset the index since we will be grabbing a new task*/ if(cstart == n_leaves_found){ pack_vars_pair_dens->top_tasks_packed = 0; pack_vars_pair_dens->tasks_packed = 0; -// l_list_p_d[0].ci = NULL; -// l_list_p_d[0].cj = NULL; + pack_vars_pair_dens->leaf_list[0].ci = NULL; + pack_vars_pair_dens->leaf_list[0].cj = NULL; pack_vars_pair_dens->leaf_list[0].n_leaves = 0; pack_vars_pair_dens->leaf_list[0].n_packed = 0; } else{ - cell_list_i[0][0] = - cell_list_i[pack_vars_pair_dens->top_tasks_packed - 1][pack_vars_pair_dens->tasks_packed - 1]; - cell_list_j[0][0] = - cell_list_j[pack_vars_pair_dens->top_tasks_packed - 1][pack_vars_pair_dens->tasks_packed - 1]; + pack_vars_pair_dens->leaf_list[0].ci[0] = + pack_vars_pair_dens->leaf_list[pack_vars_pair_dens->top_tasks_packed - 1].ci[pack_vars_pair_dens->tasks_packed - 1]; + pack_vars_pair_dens->leaf_list[0].cj[0] = + pack_vars_pair_dens->leaf_list[pack_vars_pair_dens->top_tasks_packed - 1].cj[pack_vars_pair_dens->tasks_packed - 1]; pack_vars_pair_dens->tasks_packed = 0; pack_vars_pair_dens->top_tasks_packed = 1; From af746b2c27e7dd84b37983a4d4201f116ddedda2 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Wed, 26 Mar 2025 06:32:28 +0000 Subject: [PATCH 211/217] Corrected comment and removed some commented out code --- src/runner_main_clean.cu | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 4879158b53..db5b6c4dec 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -974,25 +974,10 @@ void *runner_main2(void *data) { parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens, &n_leaves_found, cells_left, cells_right, depth, n_expected_tasks); n_leafs_total += n_leaves_found; -// if(n_leaves_found > 4 && r->cpuid == 0){ -// fprintf(stderr, "leaves found %i\n", n_leaves_found); -// for (int i = 0; i< n_leaves_found; i++){ -// int tt = pack_vars_pair_dens->top_tasks_packed; -// fprintf(stderr, "ci->loc %f %f %f\n", -// pack_vars_pair_dens->leaf_list[tt].ci[i]->loc[0], -// pack_vars_pair_dens->leaf_list[tt].ci[i]->loc[1], -// pack_vars_pair_dens->leaf_list[tt].ci[i]->loc[2]); -// -// fprintf(stderr, "cj->loc %f %f %f\n\n", -// pack_vars_pair_dens->leaf_list[tt].cj[i]->loc[0], -// pack_vars_pair_dens->leaf_list[tt].cj[i]->loc[1], 
-// pack_vars_pair_dens->leaf_list[tt].cj[i]->loc[2]); -// } -// error("stop"); -// } int cstart = 0, cid = 0; pack_vars_pair_dens->top_task_list[top_tasks_packed] = t; - //This might be abit iffy setting it to zero here. What if we loop through a task twice for recursion but do not offload the second time? We could be unpacking to the wrong leaves + + //This might be a bit iffy setting it to zero here. What if we loop through a task twice after recursion but do not offload the second time? We could be unpacking to the wrong leaves pack_vars_pair_dens->leaf_list[top_tasks_packed].n_packed = 0; pack_vars_pair_dens->top_tasks_packed++; pack_vars_pair_dens->task_locked = 1; From d7faac4790cb6f5fde6c24d880209a0759efa038 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Wed, 26 Mar 2025 07:03:30 +0000 Subject: [PATCH 212/217] Added some debug code to keep track of prev top level task. Removed a comment --- src/runner_doiact_functions_hydro_gpu.h | 2 -- src/runner_main_clean.cu | 6 ++++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index 36857102c5..99c2ec3bfd 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -1401,8 +1401,6 @@ void runner_dopair1_launch_f4_one_memcpy( pack_vars->bundle_last_part[0] = pack_vars->count_parts; /* Launch the copies for each bundle and run the GPU kernel */ - /*We don't go into this loop if tasks_left_self == 1 as - nBundles_temp will be zero DUHDUHDUHDUHHHHHH!!!!!*/ for (int bid = 0; bid < nBundles_temp; bid++) { int max_parts_i = 0; diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index db5b6c4dec..0d1f69d9ef 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -727,6 +727,7 @@ void *runner_main2(void *data) { /* Loop while there are tasks... */ tasks_done_gpu_inc = 0; ticks hang_time = getticks(); + struct task * ttop_prev; while (1) { // A. Nasar: Get qid for re-use later int qid = r->qid; @@ -743,6 +744,8 @@ void *runner_main2(void *data) { struct cell *ci = t->ci; struct cell *cj = t->cj; + struct task * ttop = t; + if (ci == NULL && (t->subtype != task_subtype_gpu_unpack_d && t->subtype != task_subtype_gpu_unpack_g && t->subtype != task_subtype_gpu_unpack_f)) error("This cannot be"); @@ -988,6 +991,8 @@ void *runner_main2(void *data) { while(cstart < n_leaves_found){ tic_cpu_pack = getticks(); +// if(pack_vars_pair_dens->top_task_list[0] == ttop_prev) +// error("Working on prev top level task"); pack_vars_pair_dens->launch_leftovers = 0; pack_vars_pair_dens->launch = 0; /*Loop through n_daughters such that the pack_vars_pair_dens counters are updated*/ @@ -1057,6 +1062,7 @@ void *runner_main2(void *data) { } /////////////////////////////////////////////////////////////////////// } + ttop_prev = t; cell_unlocktree(ci); cell_unlocktree(cj); pack_vars_pair_dens->launch_leftovers = 0; From dfd114c6bd19683abb4433125e16f1a8e1441e60 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Wed, 26 Mar 2025 07:11:08 +0000 Subject: [PATCH 213/217] Removed un-necessary commented out code. 
Modified a comment --- src/runner_doiact_functions_hydro_gpu.h | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index 99c2ec3bfd..77ca21c047 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -1601,16 +1601,14 @@ void runner_dopair1_unpack_f4( int topid; int pack_length_unpack = 0; ticks total_cpu_unpack_ticks = 0; + /*Loop over top level tasks*/ for (topid = 0; topid < pack_vars->top_tasks_packed - 1; topid++) { - //lock top level cell here -// struct cell * cii = pack_vars->top_task_list[topid]->ci; -// struct cell * cjj = pack_vars->top_task_list[topid]->cj; const ticks tic = getticks(); - /* Do the copy */ - + /* Loop through each daughter task */ int n_leaves_in_task = pack_vars->leaf_list[topid].n_packed; for(int tid = 0; tid < n_leaves_in_task; tid++){ - //Get pointers to the leaf cells. SEEMS I'm NOT GETTING A CORRECT POINTER + /*Get pointers to the leaf cells. SEEMS I'm NOT GETTING A CORRECT POINTER + *but likely due to incorrect book keeping*/ struct cell * cii_l = pack_vars->leaf_list[topid].ci[tid]; struct cell * cjj_l = pack_vars->leaf_list[topid].cj[tid]; message("loc %f %f %f topid %i tid %i nleaves %i", pack_vars->leaf_list[topid].ci[tid]->loc[0] From 0b9610b90a0f25645e0f96d258d64683254ca1c2 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Wed, 26 Mar 2025 07:22:38 +0000 Subject: [PATCH 214/217] Replaced long counter accesses with ints --- src/runner_main_clean.cu | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 0d1f69d9ef..33dc3cd865 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -1042,10 +1042,12 @@ void *runner_main2(void *data) { pack_vars_pair_dens->leaf_list[0].n_packed = 0; } else{ + int tt_packed = pack_vars_pair_dens->top_tasks_packed; + int t_packed = pack_vars_pair_dens->tasks_packed; pack_vars_pair_dens->leaf_list[0].ci[0] = - pack_vars_pair_dens->leaf_list[pack_vars_pair_dens->top_tasks_packed - 1].ci[pack_vars_pair_dens->tasks_packed - 1]; + pack_vars_pair_dens->leaf_list[tt_packed - 1].ci[t_packed - 1]; pack_vars_pair_dens->leaf_list[0].cj[0] = - pack_vars_pair_dens->leaf_list[pack_vars_pair_dens->top_tasks_packed - 1].cj[pack_vars_pair_dens->tasks_packed - 1]; + pack_vars_pair_dens->leaf_list[tt_packed - 1].cj[t_packed - 1]; pack_vars_pair_dens->tasks_packed = 0; pack_vars_pair_dens->top_tasks_packed = 1; From fac312b679d60ca07bc3b3471401646261170a29 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Sat, 29 Mar 2025 06:50:06 +0000 Subject: [PATCH 215/217] Work in progress. Changed the offload process so that we now use the same counter for recursion and for offload and unpacking. Seems a bit better but code still crashes trying toe write somewhere that doesn't exist in the counter arrays. 
The counter for n_leaves_found seems iffy as it has gone up to 30 with this configuration which seems way too much for what should be one level of recursion --- src/runner_doiact_functions_hydro_gpu.h | 21 ++++--- src/runner_main_clean.cu | 80 +++++++++++++++---------- 2 files changed, 59 insertions(+), 42 deletions(-) diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index 77ca21c047..7d0ead7b4b 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -40,6 +40,8 @@ struct leaf_cell_list{ struct cell **ci; struct cell **cj; int n_leaves; + int n_start; + int n_end; int n_packed; }; struct pack_vars_pair { @@ -301,7 +303,7 @@ void runner_recurse_gpu(struct runner *r, struct scheduler *s, struct part_aos_f4_send *parts_send, struct engine *e, int4 *fparti_fpartj_lparti_lpartj, int *n_leafs_found, - struct cell ** cells_left, struct cell ** cells_right, int depth, int n_expected_tasks) { + int depth, int n_expected_tasks) { /* Should we even bother? A. Nasar: For GPU code we need to be clever about this */ if (!CELL_IS_ACTIVE(ci, e) && !CELL_IS_ACTIVE(cj, e)) return; @@ -323,7 +325,7 @@ void runner_recurse_gpu(struct runner *r, struct scheduler *s, /*We probably want to record */ if (ci->progeny[pid] != NULL && cj->progeny[pjd] != NULL){ runner_recurse_gpu(r, s, pack_vars, ci->progeny[pid], cj->progeny[pjd], t, parts_send, e, fparti_fpartj_lparti_lpartj, - n_leafs_found, cells_left, cells_right, depth + 1, n_expected_tasks); + n_leafs_found, depth + 1, n_expected_tasks); // message("recursing to depth %i", depth + 1); } } @@ -333,8 +335,8 @@ void runner_recurse_gpu(struct runner *r, struct scheduler *s, if(ci->hydro.count == 0 || cj->hydro.count == 0) return; int leafs_found = *n_leafs_found; /*for all leafs to be sent add to cell list */ - cells_left[leafs_found] = ci; - cells_right[leafs_found] = cj; +// cells_left[leafs_found] = ci; +// cells_right[leafs_found] = cj; /*Add leaf cells to list for each top_level task*/ pack_vars->leaf_list[pack_vars->top_tasks_packed].ci[leafs_found] = ci; pack_vars->leaf_list[pack_vars->top_tasks_packed].cj[leafs_found] = cj; @@ -352,7 +354,7 @@ double runner_dopair1_pack_f4(struct runner *r, struct scheduler *s, struct cell *ci, struct cell *cj, struct task *t, struct part_aos_f4_send *parts_send, struct engine *e, - int4 *fparti_fpartj_lparti_lpartj, int leaves_packed) { + int4 *fparti_fpartj_lparti_lpartj) { /* Timers for how long this all takes. * t0 and t1 are from start to finish including GPU calcs * tp0 and tp1 only time packing and unpacking*/ @@ -1576,8 +1578,8 @@ void runner_dopair1_launch_f4_one_memcpy( } /* Zero counters for the next pack operations */ - pack_vars->count_parts = 0; - pack_vars->tasks_packed = 0; +// pack_vars->count_parts = 0; +// pack_vars->tasks_packed = 0; // /*Time end of unpacking*/ // clock_gettime(CLOCK_REALTIME, &t1); @@ -1606,7 +1608,8 @@ void runner_dopair1_unpack_f4( const ticks tic = getticks(); /* Loop through each daughter task */ int n_leaves_in_task = pack_vars->leaf_list[topid].n_packed; - for(int tid = 0; tid < n_leaves_in_task; tid++){ + int nstart = pack_vars->leaf_list[topid].n_start; + for(int tid = nstart; tid < n_leaves_in_task; tid++){ /*Get pointers to the leaf cells. 
SEEMS I'm NOT GETTING A CORRECT POINTER *but likely due to incorrect book keeping*/ struct cell * cii_l = pack_vars->leaf_list[topid].ci[tid]; @@ -1623,6 +1626,7 @@ void runner_dopair1_unpack_f4( const ticks toc = getticks(); total_cpu_unpack_ticks += toc - tic; + pack_vars->count_parts = 0; /*For some reason the code fails if we get a leaf pair task *this if statement stops the code from trying to unlock same cells twice*/ if(topid == pack_vars->top_tasks_packed -1 && cstart != n_leaves_found) @@ -1632,7 +1636,6 @@ void runner_dopair1_unpack_f4( atomic_dec(&s->waiting); pthread_cond_broadcast(&s->sleep_cond); pthread_mutex_unlock(&s->sleep_mutex); - pack_vars->task_locked = 0; } } void runner_dopair1_launch_f4_g_one_memcpy( diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index 33dc3cd865..f15e3484ce 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -970,24 +970,27 @@ void *runner_main2(void *data) { int n_leaves_found = 0; int top_tasks_packed = pack_vars_pair_dens->top_tasks_packed; int depth = 0; - struct cell * cells_left[n_expected_tasks]; - struct cell * cells_right[n_expected_tasks]; +// struct cell * cells_left[n_expected_tasks]; +// struct cell * cells_right[n_expected_tasks]; pack_vars_pair_dens->leaf_list[top_tasks_packed].n_leaves = 0; + pack_vars_pair_dens->leaf_list[top_tasks_packed].n_start = 0; + pack_vars_pair_dens->leaf_list[top_tasks_packed].n_packed = 0; runner_recurse_gpu(r, sched, pack_vars_pair_dens, ci, cj, t, - parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens, &n_leaves_found, - cells_left, cells_right, depth, n_expected_tasks); + parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens, &n_leaves_found, depth, n_expected_tasks); n_leafs_total += n_leaves_found; int cstart = 0, cid = 0; pack_vars_pair_dens->top_task_list[top_tasks_packed] = t; - //This might be a bit iffy setting it to zero here. What if we loop through a task twice after recursion but do not offload the second time? We could be unpacking to the wrong leaves - pack_vars_pair_dens->leaf_list[top_tasks_packed].n_packed = 0; pack_vars_pair_dens->top_tasks_packed++; pack_vars_pair_dens->task_locked = 1; int t_s, t_e; t_s = 0; int n_t_tasks = pack_vars_pair_dens->target_n_tasks; t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; + + pack_vars_pair_dens->leaf_list[0].n_start = 0; + int ntop_packed = pack_vars_pair_dens->top_tasks_packed; + while(cstart < n_leaves_found){ tic_cpu_pack = getticks(); @@ -997,9 +1000,20 @@ void *runner_main2(void *data) { pack_vars_pair_dens->launch = 0; /*Loop through n_daughters such that the pack_vars_pair_dens counters are updated*/ while(cstart < n_leaves_found && pack_vars_pair_dens->tasks_packed < n_t_tasks){ + // n_start is incremented in pack. However, for cases where we have launched + // but there are still some daughters left unpacked, we need to restart the + // count from zero for the packed arrays as the daughters we previously worked on are no longer necessary. + // Thus, the counter for cii and cjj should remain cstart but counter for packing/unpacking arrays + // should be n_start which is set to zero after launch. 
count_parts should also be zero after launch + struct cell * cii = pack_vars_pair_dens->leaf_list[ntop_packed - 1].ci[cstart]; + struct cell * cjj = pack_vars_pair_dens->leaf_list[ntop_packed - 1].cj[cstart]; packing_time_pair += runner_dopair1_pack_f4( + /////////////////////////////Are we sure we should use + /////////////////////////////cells_left/cells right and not + /////////////////////////////pack_vars_pair_dens->leaf_list[top_tasks_packed].ci & cj? - r, sched, pack_vars_pair_dens, cells_left[cstart], cells_right[cstart], t, - parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens, cstart); + r, sched, pack_vars_pair_dens, cii, cjj, t, + ///////////////////////////// HERE ////////////////////////////////////////// + parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens); if(pack_vars_pair_dens->count_parts > count_max_parts_tmp) error("Packed more parts than possible"); cstart++; @@ -1013,8 +1027,6 @@ void *runner_main2(void *data) { /* Do we have enough stuff to run the GPU ? */ if (launch || launch_leftovers) { /*Launch GPU tasks*/ -// cell_unlocktree(ci); -// cell_unlocktree(cj); int t_packed = pack_vars_pair_dens->tasks_packed; runner_dopair1_launch_f4_one_memcpy( r, sched, pack_vars_pair_dens, t, parts_aos_pair_f4_send, @@ -1023,7 +1035,7 @@ void *runner_main2(void *data) { &packing_time_pair, &time_for_density_gpu_pair, &unpacking_time_pair, fparti_fpartj_lparti_lpartj_dens, pair_end); - int ntoptasks = 0; + //A. Nasar: Unpack data and zero count_parts counter runner_dopair1_unpack_f4( r, sched, pack_vars_pair_dens, t, parts_aos_pair_f4_send, parts_aos_pair_f4_recv, d_parts_aos_pair_f4_send, @@ -1033,31 +1045,33 @@ void *runner_main2(void *data) { pair_end, cstart, n_leaves_found); /*This ensure that if we still have leaves left we start at index 1. Otherwise, reset the index since we will be grabbing a new task*/ + int n_packed = pack_vars_pair_dens->tasks_packed; + //A. Nasar: We've packed all daughters and have launched --> one way or the other if(cstart == n_leaves_found){ - pack_vars_pair_dens->top_tasks_packed = 0; - pack_vars_pair_dens->tasks_packed = 0; - pack_vars_pair_dens->leaf_list[0].ci = NULL; - pack_vars_pair_dens->leaf_list[0].cj = NULL; - pack_vars_pair_dens->leaf_list[0].n_leaves = 0; - pack_vars_pair_dens->leaf_list[0].n_packed = 0; + pack_vars_pair_dens->top_tasks_packed = 0; + for(int i = 0; i < ntop_packed; i++){ + pack_vars_pair_dens->leaf_list[i].n_leaves = 0; + pack_vars_pair_dens->leaf_list[i].n_packed = 0; + pack_vars_pair_dens->leaf_list[i].n_start = 0; + } } + // A. Nasar: We've launched but we have not packed all daughters. + // Need to set counters so we start from the last top-task packed + // and it's last packed daughter-task and start packing to the beginning of GPU arrays + // which is reset to zero (count_parts) in "....unpack_f4()" else{ - int tt_packed = pack_vars_pair_dens->top_tasks_packed; - int t_packed = pack_vars_pair_dens->tasks_packed; - pack_vars_pair_dens->leaf_list[0].ci[0] = - pack_vars_pair_dens->leaf_list[tt_packed - 1].ci[t_packed - 1]; - pack_vars_pair_dens->leaf_list[0].cj[0] = - pack_vars_pair_dens->leaf_list[tt_packed - 1].cj[t_packed - 1]; - - pack_vars_pair_dens->tasks_packed = 0; pack_vars_pair_dens->top_tasks_packed = 1; - pack_vars_pair_dens->top_task_list[0] = t; -// pack_vars_pair_dens->leaf_list[0].n_packed = 0; + pack_vars_pair_dens->top_task_list[0]= t; + // A. Nasar: We've launched so need to restart counting tasks + // from zero and need to reset tasks_packed to zero.
+ // However, the counter for + pack_vars_pair_dens->leaf_list[0].n_start = cstart; + //A. Nasar: We have packed all daughter tasks in this parent task + /*This makes it such that the remaining leaf tasks are packed starting from a + fresh list since we are still in the while cstart < n_leaves_found loop**/ } - /*This makes it such that the remaining leaf tasks are packed starting from a - fresh list since we are still in the while cstart < n_leaves_found loop*/ - - pack_vars_pair_dens->leaf_list[0].n_packed = 0; + // A. Nasar: These need to be reset to zero either way as our GPU array counters + // need to re-start from zero pack_vars_pair_dens->tasks_packed = 0; pack_vars_pair_dens->launch_leftovers = 0; pack_vars_pair_dens->launch = 0; @@ -1067,8 +1081,8 @@ void *runner_main2(void *data) { ttop_prev = t; cell_unlocktree(ci); cell_unlocktree(cj); - pack_vars_pair_dens->launch_leftovers = 0; - pack_vars_pair_dens->launch = 0; +// pack_vars_pair_dens->launch_leftovers = 0; +// pack_vars_pair_dens->launch = 0; /////////////////////W.I.P!!!//////////////////////////////////////////////////////// #endif // GPUOFFLOAD_DENSITY From bd6ca44f92dd233b0c77983b8804c2215d4126a4 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Sat, 29 Mar 2025 07:31:27 +0000 Subject: [PATCH 216/217] Seems like I've got something working now. The issue was that I was not resetting n_packed per top-level task. Tested on 1 thread and the code is running fine --- src/runner_doiact_functions_hydro_gpu.h | 12 ++++++------ src/runner_main_clean.cu | 21 +++++++++++++-------- 2 files changed, 19 insertions(+), 14 deletions(-) diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index 7d0ead7b4b..3ff1407621 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -1604,20 +1604,20 @@ void runner_dopair1_unpack_f4( int pack_length_unpack = 0; ticks total_cpu_unpack_ticks = 0; /*Loop over top level tasks*/ - for (topid = 0; topid < pack_vars->top_tasks_packed - 1; topid++) { + for (topid = 0; topid < pack_vars->top_tasks_packed; topid++) { const ticks tic = getticks(); /* Loop through each daughter task */ int n_leaves_in_task = pack_vars->leaf_list[topid].n_packed; int nstart = pack_vars->leaf_list[topid].n_start; - for(int tid = nstart; tid < n_leaves_in_task; tid++){ + for(int tid = nstart; tid < n_leaves_in_task + nstart; tid++){ /*Get pointers to the leaf cells.
SEEMS I'm NOT GETTING A CORRECT POINTER *but likely due to incorrect book keeping*/ struct cell * cii_l = pack_vars->leaf_list[topid].ci[tid]; struct cell * cjj_l = pack_vars->leaf_list[topid].cj[tid]; - message("loc %f %f %f topid %i tid %i nleaves %i", pack_vars->leaf_list[topid].ci[tid]->loc[0] - , pack_vars->leaf_list[topid].ci[tid]->loc[1] - , pack_vars->leaf_list[topid].ci[tid]->loc[2] - , topid, tid, n_leaves_in_task); +// message("loc %f %f %f topid %i tid %i nleaves %i", pack_vars->leaf_list[topid].ci[tid]->loc[0] +// , pack_vars->leaf_list[topid].ci[tid]->loc[1] +// , pack_vars->leaf_list[topid].ci[tid]->loc[2] +// , topid, tid, n_leaves_in_task); // if(*cii_l == NULL || *cjj_l == NULL)error("stop"); runner_do_ci_cj_gpu_unpack_neat_aos_f4( r, cii_l, cjj_l, parts_recv, 0, &pack_length_unpack, tid, diff --git a/src/runner_main_clean.cu b/src/runner_main_clean.cu index f15e3484ce..2376aafba7 100644 --- a/src/runner_main_clean.cu +++ b/src/runner_main_clean.cu @@ -970,15 +970,17 @@ void *runner_main2(void *data) { int n_leaves_found = 0; int top_tasks_packed = pack_vars_pair_dens->top_tasks_packed; int depth = 0; -// struct cell * cells_left[n_expected_tasks]; -// struct cell * cells_right[n_expected_tasks]; + pack_vars_pair_dens->leaf_list[top_tasks_packed].n_leaves = 0; pack_vars_pair_dens->leaf_list[top_tasks_packed].n_start = 0; pack_vars_pair_dens->leaf_list[top_tasks_packed].n_packed = 0; + runner_recurse_gpu(r, sched, pack_vars_pair_dens, ci, cj, t, parts_aos_pair_f4_send, e, fparti_fpartj_lparti_lpartj_dens, &n_leaves_found, depth, n_expected_tasks); + n_leafs_total += n_leaves_found; int cstart = 0, cid = 0; + pack_vars_pair_dens->top_task_list[top_tasks_packed] = t; pack_vars_pair_dens->top_tasks_packed++; @@ -988,7 +990,6 @@ void *runner_main2(void *data) { int n_t_tasks = pack_vars_pair_dens->target_n_tasks; t->total_cpu_pack_ticks += getticks() - tic_cpu_pack; - pack_vars_pair_dens->leaf_list[0].n_start = 0; int ntop_packed = pack_vars_pair_dens->top_tasks_packed; while(cstart < n_leaves_found){ @@ -1049,23 +1050,27 @@ void *runner_main2(void *data) { //A. Nasar: We've packed all daughters and have launched --> one way or the other if(cstart == n_leaves_found){ pack_vars_pair_dens->top_tasks_packed = 0; - for(int i = 0; i < ntop_packed; i++){ - pack_vars_pair_dens->leaf_list[i].n_leaves = 0; - pack_vars_pair_dens->leaf_list[i].n_packed = 0; - pack_vars_pair_dens->leaf_list[i].n_start = 0; - } +// for(int i = 0; i < ntop_packed; i++){ +// pack_vars_pair_dens->leaf_list[i].n_leaves = 0; +// pack_vars_pair_dens->leaf_list[i].n_packed = 0; +// pack_vars_pair_dens->leaf_list[i].n_start = 0; +// } } // A. Nasar: We've launched but we have not packed all daughters. // Need to set counters so we start from the last top-task packed // and it's last packed daughter-task and start packing to the beginning of GPU arrays // which is reset to zero (count_parts) in "....unpack_f4()" else{ + for(int i = 1; i < pack_vars_pair_dens->top_tasks_packed; i++) + pack_vars_pair_dens->leaf_list[i].n_start = 0; pack_vars_pair_dens->top_tasks_packed = 1; pack_vars_pair_dens->top_task_list[0]= t; // A. Nasar: We've launched so need to restart counting tasks // from zero and need to reset tasks_packed to zero. // However, the counter for pack_vars_pair_dens->leaf_list[0].n_start = cstart; + + pack_vars_pair_dens->leaf_list[0].n_packed = 0; //A. 
Nasar: We have packed all daughter tasks in this parent task /*This makes it such that the remaining leaf tasks are packed starting from a fresh list since we are still in the while cstart < n_leaves_found loop**/ From 7933a471373d7ddad5516b886dd05c28b6195b55 Mon Sep 17 00:00:00 2001 From: Abouzied Date: Sat, 29 Mar 2025 07:39:31 +0000 Subject: [PATCH 217/217] Spoke too soon. Results do not look right in serial and the code crashes when run on multiple threads. Also, since the problem gets worse in parallel there's probably an issue with locking/unlocking. DEBUG DEBUG DEBUG --- src/runner_doiact_functions_hydro_gpu.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/runner_doiact_functions_hydro_gpu.h b/src/runner_doiact_functions_hydro_gpu.h index 3ff1407621..a78ec6409c 100644 --- a/src/runner_doiact_functions_hydro_gpu.h +++ b/src/runner_doiact_functions_hydro_gpu.h @@ -1614,10 +1614,10 @@ void runner_dopair1_unpack_f4( *but likely due to incorrect book keeping*/ struct cell * cii_l = pack_vars->leaf_list[topid].ci[tid]; struct cell * cjj_l = pack_vars->leaf_list[topid].cj[tid]; -// message("loc %f %f %f topid %i tid %i nleaves %i", pack_vars->leaf_list[topid].ci[tid]->loc[0] -// , pack_vars->leaf_list[topid].ci[tid]->loc[1] -// , pack_vars->leaf_list[topid].ci[tid]->loc[2] -// , topid, tid, n_leaves_in_task); + message("loc %f %f %f topid %i tid %i nleaves %i", pack_vars->leaf_list[topid].ci[tid]->loc[0] , pack_vars->leaf_list[topid].ci[tid]->loc[1] , pack_vars->leaf_list[topid].ci[tid]->loc[2] , topid, tid, n_leaves_in_task); // if(*cii_l == NULL || *cjj_l == NULL)error("stop"); runner_do_ci_cj_gpu_unpack_neat_aos_f4( r, cii_l, cjj_l, parts_recv, 0, &pack_length_unpack, tid,
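For reference, the counter bookkeeping that patches 215 and 216 converge on can be illustrated in isolation. The short C program below is only a sketch of the intended pack/launch/unpack cycle, not SWIFT code: leaf_batch, N_LEAVES, TARGET and PARTS_PER_LEAF are invented stand-ins for the leaf_list entries, n_leaves_found, target_n_tasks and the per-cell particle counts used in the patches, and the real code additionally tracks multiple top-level tasks, cell locks and scheduler state that are omitted here.

#include <stdio.h>

#define N_LEAVES 10       /* stand-in for n_leaves_found returned by the recursion */
#define TARGET 4          /* stand-in for pack_vars->target_n_tasks */
#define PARTS_PER_LEAF 64 /* assumed particle count contributed by one leaf pair */

struct leaf_batch {
  int n_start;  /* first leaf-list slot belonging to the current batch */
  int n_packed; /* number of leaf pairs packed since the last launch */
};

int main(void) {
  struct leaf_batch batch;
  int cstart = 0;      /* progress through the full list of leaf pairs; never reset */
  int count_parts = 0; /* fill level of the GPU send arrays; reset every batch */

  while (cstart < N_LEAVES) {
    /* A new batch starts where the previous one stopped, while the GPU-array
     * offset restarts from zero (count_parts is zeroed after unpacking). */
    batch.n_start = cstart;
    batch.n_packed = 0;
    count_parts = 0;

    /* Pack leaf pairs until the batch is full or the list is exhausted. */
    while (cstart < N_LEAVES && batch.n_packed < TARGET) {
      printf("pack leaf %d at send-array offset %d\n", cstart, count_parts);
      count_parts += PARTS_PER_LEAF;
      batch.n_packed++;
      cstart++;
    }

    printf("launch batch of %d leaves (%d parts)\n", batch.n_packed, count_parts);

    /* Unpack exactly the slots [n_start, n_start + n_packed), mirroring the
     * loop bounds introduced in patch 216. */
    for (int tid = batch.n_start; tid < batch.n_start + batch.n_packed; tid++)
      printf("unpack leaf-list slot %d\n", tid);
  }
  return 0;
}

With N_LEAVES = 10 and TARGET = 4 this prints three pack/launch/unpack cycles of 4, 4 and 2 leaves; the point of the sketch is that cstart is never reset, while n_start, n_packed and count_parts are per-batch quantities that must be re-initialised after every launch.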