From 1e548265adddb0e172edaa5e8f6a03eb6badf232 Mon Sep 17 00:00:00 2001
From: ecker <mrq@ecker.tech>
Date: Mon, 15 Sep 2025 16:01:05 -0500
Subject: [PATCH] more optimizations by making node bounds an SoA instead and
 some other tweaks and fixes (even though it doesn't seem to amount to much
 tangible results because the scene explodes when I make props in
 mds_mcdonalds dynamic)

---
 engine/inc/uf/utils/math/physics/impl.h   |  36 +-
 engine/src/utils/math/physics/aabb.inl    |   6 +-
 engine/src/utils/math/physics/bvh.inl     | 390 ++++++++++++----------
 engine/src/utils/math/physics/helpers.inl |   8 +-
 engine/src/utils/math/physics/impl.cpp    |  22 +-
 engine/src/utils/math/physics/mesh.inl    |  15 +-
 engine/src/utils/math/physics/ray.inl     |   3 +-
 7 files changed, 260 insertions(+), 220 deletions(-)

diff --git a/engine/inc/uf/utils/math/physics/impl.h b/engine/inc/uf/utils/math/physics/impl.h
index 2bee82ca..bd6b305a 100644
--- a/engine/inc/uf/utils/math/physics/impl.h
+++ b/engine/inc/uf/utils/math/physics/impl.h
@@ -44,7 +44,8 @@ namespace pod {
 	};
 
 	struct BVH {
-		typedef std::pair<int32_t,int32_t> pair_t;
+		typedef uint32_t index_t;
+		typedef std::pair<index_t,index_t> pair_t;
 		
 		struct PairHash {
 			size_t operator()( const pair_t& p ) const noexcept {
@@ -62,21 +63,25 @@ namespace pod {
 		typedef uf::stl::unordered_set<pair_t, PairHash, PairEq> pairs_t;
 		
 		struct Node {
-			/*alignas(16)*/ pod::AABB bounds = {};
-			int32_t left = -1;
-			int32_t right = -1;
-			int32_t start = 0;
-			int32_t count = 0;
+			BVH::index_t left = 0; // left child node index; followed when getCount() == 0
+			BVH::index_t right = 0; // right child node index; followed when getCount() == 0
+			BVH::index_t start = 0; // offset of this node's first entry in BVH::indices
+			BVH::index_t flags = 0; // packed: bit 31 = asleep flag, bits 0-30 = entry count (0 for internal nodes)
 
-			bool asleep = false;
+			BVH::index_t getCount() const { return flags & 0x7FFFFFFF; } // low 31 bits of flags
+			bool isAsleep() const { return (flags & 0x80000000u) != 0; } // top bit of flags
+			void setCount(BVH::index_t c) { flags = (flags & 0x80000000u) | (c & 0x7FFFFFFF); } // keeps the asleep bit; c is truncated to 31 bits
+			void setAsleep(bool a) { flags = (flags & 0x7FFFFFFF) | (a ? 0x80000000u : 0); } // keeps the count bits
 		};
 		struct FlatNode {
-			/*alignas(16)*/ pod::AABB bounds = {};
-			int32_t start = -1;
-			int32_t count = -1;
-			int32_t skipIndex = -1;
+			BVH::index_t start = 0; // offset of this node's first entry in BVH::indices (set for leaves)
+			BVH::index_t skipIndex = 0; // flat index of the node that follows this node's subtree
+			BVH::index_t flags = 0; // packed: bit 31 = asleep flag, bits 0-30 = entry count (0 for internal nodes)
 
-			bool asleep = false;
+			BVH::index_t getCount() const { return flags & 0x7FFFFFFF; } // low 31 bits of flags
+			bool isAsleep() const { return (flags & 0x80000000u) != 0; } // top bit of flags
+			void setCount(BVH::index_t c) { flags = (flags & 0x80000000u) | (c & 0x7FFFFFFF); } // keeps the asleep bit; c is truncated to 31 bits
+			void setAsleep(bool a) { flags = (flags & 0x7FFFFFFF) | (a ? 0x80000000u : 0); } // keeps the count bits
 		};
 		struct UpdatePolicy {
 			enum class Decision {
@@ -87,13 +92,16 @@ namespace pod {
 			float displacementThreshold = 0.25f; // 25% of AABB size
 			float overlapThreshold = 2.0f;	   // 2x growth in root surface area
 			float dirtyRatioThreshold = 0.3f;	// 30% dirty bodies
-			int   maxFramesBeforeRebuild = 60;   // force rebuild every 60 frames
+			uint16_t maxFramesBeforeRebuild = 600;   // force rebuild every 600 frames
 		};
 
 		bool dirty = false;
-		uf::stl::vector<uint32_t> indices;
+		uf::stl::vector<pod::BVH::index_t> indices;
 		uf::stl::vector<pod::BVH::Node> nodes;
 		uf::stl::vector<pod::BVH::FlatNode> flattened;
+		
+		uf::stl::vector<pod::AABB> bounds;
+		uf::stl::vector<pod::AABB> flatBounds;
 	};
 
 	struct MeshBVH {
diff --git a/engine/src/utils/math/physics/aabb.inl b/engine/src/utils/math/physics/aabb.inl
index 2058b413..70866edd 100644
--- a/engine/src/utils/math/physics/aabb.inl
+++ b/engine/src/utils/math/physics/aabb.inl
@@ -103,10 +103,10 @@ namespace {
 				return ::computeSegmentAABB( p1, p2, body.collider.capsule.radius );
 			} break;
 			case pod::ShapeType::MESH: {
-				if ( body.collider.mesh.bvh && !body.collider.mesh.bvh->nodes.empty() )
+				if ( body.collider.mesh.bvh && !body.collider.mesh.bvh->bounds.empty() )
 					return {
-						transform.position + body.collider.mesh.bvh->nodes[0].bounds.min,
-						transform.position + body.collider.mesh.bvh->nodes[0].bounds.max,
+						transform.position + body.collider.mesh.bvh->bounds[0].min,
+						transform.position + body.collider.mesh.bvh->bounds[0].max,
 					};
 			} break;
 			default: {
diff --git a/engine/src/utils/math/physics/bvh.inl b/engine/src/utils/math/physics/bvh.inl
index 4ab32737..98140a53 100644
--- a/engine/src/utils/math/physics/bvh.inl
+++ b/engine/src/utils/math/physics/bvh.inl
@@ -1,8 +1,8 @@
 namespace {
-	int32_t flattenBVH( pod::BVH& bvh, int32_t nodeID );
+	pod::BVH::index_t flattenBVH( pod::BVH& bvh, pod::BVH::index_t nodeID );
 
-	void queryFlatBVH( const pod::BVH&, const pod::AABB& bounds, uf::stl::vector<int32_t>& out );
-	void queryFlatBVH( const pod::BVH&, const pod::Ray& ray, uf::stl::vector<int32_t>& out, float maxDist = FLT_MAX );
+	void queryFlatBVH( const pod::BVH&, const pod::AABB& bounds, uf::stl::vector<pod::BVH::index_t>& out );
+	void queryFlatBVH( const pod::BVH&, const pod::Ray& ray, uf::stl::vector<pod::BVH::index_t>& out, float maxDist = FLT_MAX );
 	
 	void queryFlatOverlaps( const pod::BVH& bvh, pod::BVH::pairs_t& outPairs );
 	void queryFlatOverlaps( const pod::BVH& bvhA, const pod::BVH& bvhB, pod::BVH::pairs_t& outPairs );
@@ -10,41 +10,40 @@ namespace {
 
 // BVH
 namespace {
-	int32_t buildBVHNode( pod::BVH& bvh, const uf::stl::vector<pod::AABB>& bounds, int32_t start, int32_t end, int32_t capacity = 2 ) {
+	pod::BVH::index_t buildBVHNode( pod::BVH& bvh, const uf::stl::vector<pod::AABB>& bounds, pod::BVH::index_t start, pod::BVH::index_t end, pod::BVH::index_t capacity = 2 ) {
 		pod::BVH::Node node{};
-		node.left  = -1;
-		node.right = -1;
+		node.left  = 0;
+		node.right = 0;
 		node.start = start;
-		node.count = 0;
-		node.bounds = bounds[bvh.indices[start]];
+		node.setCount(0);
 
-		// compute bounds of this node
-		for ( auto i = start + 1; i < end; ++i) node.bounds = ::mergeAabb( node.bounds, bounds[bvh.indices[i]] );
+		pod::AABB bound = bounds[bvh.indices[start]];
+		for ( auto i = start + 1; i < end; ++i) bound = ::mergeAabb( bound, bounds[bvh.indices[i]] );
 
-		int32_t count = end - start;
+		pod::BVH::index_t count = end - start;
 		if ( count <= capacity ) {
 			// leaf
 			node.start = start;
-			node.count = count;
-			int32_t index = (int32_t) bvh.nodes.size();
+			node.setCount(count);
+			pod::BVH::index_t index = (pod::BVH::index_t) bvh.nodes.size();
 			bvh.nodes.emplace_back(node);
+			bvh.bounds.emplace_back(bound);
 			return index;
 		}
 
 		// choose split axis by largest extent
-		auto extent = node.bounds.max - node.bounds.min;
+		auto extent = bound.max - bound.min;
 		auto axis = (extent.x > extent.y && extent.x > extent.z) ? 0 : (extent.y > extent.z ? 1 : 2);
 
 		// sort indices by centroid along axis
-		std::sort( bvh.indices.begin() + start, bvh.indices.begin() + end, [&](uint32_t a, uint32_t b) {
-			float ca = ::aabbCenter( bounds[a] )[axis];
-			float cb = ::aabbCenter( bounds[b] )[axis];
-			return ca < cb;
+		auto mid = ( start + end ) / 2;
+		std::nth_element(bvh.indices.begin() + start, bvh.indices.begin() + mid, bvh.indices.begin() + end, [&](uint32_t a, uint32_t b) {
+			return ::aabbCenter(bounds[a])[axis] < ::aabbCenter(bounds[b])[axis];
 		});
 
-		int32_t mid = ( start + end ) / 2;
-		int32_t index = (int32_t) bvh.nodes.size();
+		pod::BVH::index_t index = (pod::BVH::index_t) bvh.nodes.size();
 		bvh.nodes.emplace_back( node ); // insert now, gets filled later
+		bvh.bounds.emplace_back( bound );
 
 		node.left = ::buildBVHNode( bvh, bounds, start, mid, capacity );
 		node.right = ::buildBVHNode( bvh, bounds, mid, end, capacity );
@@ -52,57 +51,58 @@ namespace {
 		return index;
 	}
 
-	int32_t buildBVHNode_SAH( pod::BVH& bvh, const uf::stl::vector<pod::AABB>& bounds, int32_t start, int32_t end, int32_t capacity = 4 ) {
+	pod::BVH::index_t buildBVHNode_SAH( pod::BVH& bvh, const uf::stl::vector<pod::AABB>& bounds, pod::BVH::index_t start, pod::BVH::index_t end, pod::BVH::index_t capacity = 4 ) {
 		struct Bin {
 			pod::AABB bounds;
-			int32_t count = 0;
+			pod::BVH::index_t count = 0;
 		};
 
 		pod::BVH::Node node{};
-		node.left  = -1;
-		node.right = -1;
+		node.left  = 0;
+		node.right = 0;
 		node.start = start;
-		node.count = 0;
-		node.bounds = bounds[bvh.indices[start]];
+		node.setCount(0);
 
-		for ( auto i = start + 1; i < end; ++i ) node.bounds = ::mergeAabb( node.bounds, bounds[bvh.indices[i]] );
+		pod::AABB bound = bounds[bvh.indices[start]];
+		for ( auto i = start + 1; i < end; ++i) bound = ::mergeAabb( bound, bounds[bvh.indices[i]] );
 
-		int32_t count = end - start;
+		pod::BVH::index_t count = end - start;
 		if ( count <= capacity ) {
-			node.count = count;
-			int32_t index = (int32_t) bvh.nodes.size();
+			node.setCount(count);
+			pod::BVH::index_t index = (pod::BVH::index_t) bvh.nodes.size();
 			bvh.nodes.emplace_back(node);
+			bvh.bounds.emplace_back(bound);
 			return index;
 		}
 
 		constexpr auto numBins = 16;
 		static thread_local Bin bins[numBins];
-		for ( auto i = 0; i < numBins; i++ ) bins[i] = {};
+		for ( auto i = 0; i < numBins; i++ ) bins[i].count = 0;
 
-		auto extent = node.bounds.max - node.bounds.min;
+		auto extent = bound.max - bound.min;
 		auto bestAxis = -1, bestSplit = -1;
 		float bestCost = std::numeric_limits<float>::infinity();
 
 		for ( auto axis = 0; axis < 3; ++axis ) {
 			if ( extent[axis] < EPS(1e-6f) ) continue;
 
-			float minC = node.bounds.min[axis];
-			float maxC = node.bounds.max[axis];
+			for ( auto& bin : bins ) bin.count = 0; // bins are reused for every axis; reset so counts/bounds from a previous axis don't leak into this axis' SAH cost
+			float minC = bound.min[axis], maxC = bound.max[axis];
 			float scale = (float) numBins / (maxC - minC);
 
 			for ( auto i = start; i < end; ++i ) {
-				int32_t idx = bvh.indices[i];
+				pod::BVH::index_t idx = bvh.indices[i];
 				float c = ::aabbCenter( bounds[idx] )[axis];
-				int32_t binID = std::min(numBins - 1, (int32_t)((c - minC) * scale));
-				bins[binID].count++;
-				bins[binID].bounds = ::mergeAabb( bins[binID].bounds, bounds[idx] );
+				pod::BVH::index_t binID = std::min((pod::BVH::index_t)(numBins - 1), (pod::BVH::index_t)((c - minC) * scale));
+				bins[binID].bounds = bins[binID].count == 0 ? bounds[idx] : ::mergeAabb( bins[binID].bounds, bounds[idx] );
+				++bins[binID].count;
 			}
 
 			pod::AABB leftBounds[numBins], rightBounds[numBins];
-			int32_t leftCount[numBins] = {}, rightCount[numBins] = {};
+			pod::BVH::index_t leftCount[numBins] = {}, rightCount[numBins] = {};
 
 			pod::AABB acc;
-			int32_t cnt = 0;
+			pod::BVH::index_t cnt = 0;
 			for ( auto i = 0; i < numBins; i++ ) {
 				if ( bins[i].count > 0 ) acc = (cnt == 0) ? bins[i].bounds : ::mergeAabb( acc, bins[i].bounds );
 				cnt += bins[i].count;
@@ -110,6 +110,7 @@ namespace {
 				leftCount[i] = cnt;
 			}
 
+
 			acc = {};
 			cnt = 0;
 			for ( auto i = numBins - 1; i >= 0; i-- ) {
@@ -119,12 +120,18 @@ namespace {
 				rightCount[i] = cnt;
 			}
 
-			float parentArea = ::aabbSurfaceArea(node.bounds);
+			// precompute area
+			float leftArea[numBins], rightArea[numBins];
+			for ( auto i = 0; i < numBins; i++ ) leftArea[i] = ::aabbSurfaceArea( leftBounds[i] );
+			for ( auto i = 0; i < numBins; i++ ) rightArea[i] = ::aabbSurfaceArea( rightBounds[i] );
+			
+			float parentArea = ::aabbSurfaceArea(bound);
+
 			for ( auto i = 0; i < numBins - 1; i++ ) {
 				if ( leftCount[i] == 0 || rightCount[i + 1] == 0 ) continue;
 				float cost = 1.0f + (
-					( ::aabbSurfaceArea(leftBounds[i]) / parentArea ) * leftCount[i] +
-					( ::aabbSurfaceArea(rightBounds[i + 1]) / parentArea ) * rightCount[i + 1]
+					( leftArea[i] / parentArea ) * leftCount[i] +
+					( rightArea[i + 1] / parentArea ) * rightCount[i + 1]
 				);
 				if ( cost < bestCost ) {
 					bestCost = cost;
@@ -136,34 +143,37 @@ namespace {
 
 		// fallback: no valid split → make leaf
 		if ( bestAxis == -1 ) {
-			node.count = count;
-			int32_t index = (int32_t) bvh.nodes.size();
+			node.setCount(count); // node.count = count;
+			pod::BVH::index_t index = (pod::BVH::index_t) bvh.nodes.size();
 			bvh.nodes.emplace_back(node);
+			bvh.bounds.emplace_back(bound);
 			return index;
 		}
 
-		float minC = node.bounds.min[bestAxis];
-		float maxC = node.bounds.max[bestAxis];
+		float minC = bound.min[bestAxis];
+		float maxC = bound.max[bestAxis];
 		float scale = (float) numBins / (maxC - minC);
 
-		auto midIt = std::partition( bvh.indices.begin() + start, bvh.indices.begin() + end, [&](int32_t idx) {
-			float c = ::aabbCenter( bounds[idx])[bestAxis ];
-			int32_t binID = std::min(numBins - 1, (int32_t)((c - minC) * scale));
+		auto midIt = std::partition( bvh.indices.begin() + start, bvh.indices.begin() + end, [&](pod::BVH::index_t idx) {
+			float c = ::aabbCenter( bounds[idx] )[bestAxis ];
+			pod::BVH::index_t binID = std::min((pod::BVH::index_t)(numBins - 1), (pod::BVH::index_t)((c - minC) * scale));
 			return binID <= bestSplit;
 		});
 
-		int32_t mid = (int32_t) ( midIt - bvh.indices.begin() );
+		pod::BVH::index_t mid = (pod::BVH::index_t) ( midIt - bvh.indices.begin() );
 
 		// if partition failed (all left or all right), force leaf
 		if ( mid == start || mid == end ) {
-			node.count = count;
-			int32_t index = (int32_t) bvh.nodes.size();
+			node.setCount(count); // node.count = count;
+			pod::BVH::index_t index = (pod::BVH::index_t) bvh.nodes.size();
 			bvh.nodes.emplace_back(node);
+			bvh.bounds.emplace_back(bound);
 			return index;
 		}
 
-		int32_t index = (int32_t) bvh.nodes.size();
+		pod::BVH::index_t index = (pod::BVH::index_t) bvh.nodes.size();
 		bvh.nodes.emplace_back(node);
+		bvh.bounds.emplace_back(bound);
 
 		node.left  = ::buildBVHNode_SAH( bvh, bounds, start, mid, capacity );
 		node.right = ::buildBVHNode_SAH( bvh, bounds, mid, end, capacity );
@@ -171,11 +181,12 @@ namespace {
 		return index;
 	}
 
-	void buildBroadphaseBVH( pod::BVH& bvh, const uf::stl::vector<pod::PhysicsBody*>& bodies, int32_t capacity = 2, bool filters = false, bool filterType = false ) {
+	void buildBroadphaseBVH( pod::BVH& bvh, const uf::stl::vector<pod::PhysicsBody*>& bodies, pod::BVH::index_t capacity = 2, bool filters = false, bool filterType = false ) {
 		if ( bodies.empty() ) return;
 
 		bvh.indices.clear();
 		bvh.nodes.clear();
+		bvh.bounds.clear();
 		bvh.indices.reserve(bodies.size());
 
 		// stores bounds
@@ -201,7 +212,7 @@ namespace {
 		bvh.dirty = false;
 	}
 
-	void buildMeshBVH( pod::BVH& bvh, const uf::Mesh& mesh, int32_t capacity = 4 ) {
+	void buildMeshBVH( pod::BVH& bvh, const uf::Mesh& mesh, pod::BVH::index_t capacity = 4 ) {
 		uint32_t triangles = mesh.index.count / 3;
 
 		bvh.indices.clear();
@@ -245,7 +256,7 @@ namespace {
 }
 
 namespace {
-	pod::BVH::UpdatePolicy::Decision decideBVHUpdate( const pod::BVH& bvh, uf::stl::vector<pod::PhysicsBody*>& bodies, const pod::BVH::UpdatePolicy& policy, size_t frameCounter ) {
+	pod::BVH::UpdatePolicy::Decision decideBVHUpdate( pod::BVH& bvh, uf::stl::vector<pod::PhysicsBody*>& bodies, const pod::BVH::UpdatePolicy& policy, size_t frameCounter ) {
 		// BVH is not built
 		if ( bvh.indices.empty() || bvh.nodes.empty() ) {
 			return pod::BVH::UpdatePolicy::Decision::REBUILD;
@@ -253,7 +264,7 @@ namespace {
 		if ( bodies.empty() ) return pod::BVH::UpdatePolicy::Decision::NONE;
 
 		uint32_t dirtyCount = 0;
-		float oldRootArea = ::aabbSurfaceArea( bvh.nodes[0].bounds );
+		float oldRootArea = ::aabbSurfaceArea( bvh.bounds[0] );
 
 		// update/check each body
 		for ( auto idx : bvh.indices ) {
@@ -273,14 +284,20 @@ namespace {
 
 			if ( displacement > policy.displacementThreshold * size ) ++dirtyCount;
 		}
+		// update nodes: refresh every leaf's bounds from the current body bounds (internal nodes are not touched here)
+		for ( size_t i = 0; i < bvh.nodes.size(); ++i ) {
+			const auto& node = bvh.nodes[i];
+			if ( node.getCount() == 0 ) continue; // internal node
+			auto& bound = bvh.bounds[i];
+			bound = bodies[bvh.indices[node.start]]->bounds;
+			for ( pod::BVH::index_t j = 1; j < node.getCount(); ++j ) bound = ::mergeAabb( bound, bodies[bvh.indices[node.start + j]]->bounds ); // j, not i: avoid shadowing the node index
+		}
 
 		float dirtyRatio = (float) dirtyCount / (float) bodies.size();
 
 		// compute new root bounds
 		pod::AABB newRoot = bodies[bvh.indices[0]]->bounds;
-		for ( auto i = 1; i < bvh.indices.size(); ++i ) {
-			newRoot = ::mergeAabb(newRoot, bodies[bvh.indices[i]]->bounds);
-		}
+		for ( auto i = 1; i < bvh.indices.size(); ++i ) newRoot = ::mergeAabb(newRoot, bodies[bvh.indices[i]]->bounds);
 
 		float newRootArea = ::aabbSurfaceArea( newRoot );
 		// BVH is too out of date, rebuild it
@@ -298,25 +315,30 @@ namespace {
 		if ( bvh.nodes.empty() ) return;
 
 		// update leaf bounds
-		#pragma omp parallel for
+		uf::stl::vector<pod::BVH::index_t> leaves;
+		leaves.reserve(::reserveCount);
 		for ( auto i = 0; i < bvh.nodes.size(); i++ ) {
-			auto& node = bvh.nodes[i];
-			if ( node.count > 0 ) {
-				// leaf node: recompute bounds from bodies
-				node.bounds = bounds[bvh.indices[node.start]];
+			if ( bvh.nodes[i].getCount() == 0 ) continue;
+			leaves.emplace_back(i);
+		}
 
-				for ( auto j = 1; j < node.count; j++ ) {
-					node.bounds = ::mergeAabb(node.bounds, bounds[bvh.indices[node.start + j]] );
-				}
-			}
+		// recompute bounds from bodies
+		for ( auto i = 0; i < leaves.size(); i++ ) {
+			auto nodeID = leaves[i];
+			auto& node = bvh.nodes[nodeID];
+			auto& bound = bvh.bounds[nodeID];
+			bound = bounds[bvh.indices[node.start]];
+			for ( auto j = 1; j < node.getCount(); j++ )
+				bound = ::mergeAabb(bound, bounds[bvh.indices[node.start + j]]);
 		}
 
 		// update internal nodes bottom-up
-		for ( int32_t i = (int32_t) bvh.nodes.size() - 1; i >= 0; i-- ) {
+		for ( int64_t i = (int64_t) bvh.nodes.size() - 1; i >= 0; i-- ) { // signed index: with an unsigned i the test i >= 0 never fails and i wraps past 0
 			auto& node = bvh.nodes[i];
+			auto& bound = bvh.bounds[i];
 			// internal node
-			if ( node.count == 0 ) {
-				node.bounds = ::mergeAabb(bvh.nodes[node.left].bounds, bvh.nodes[node.right].bounds);
+			if ( node.getCount() == 0 ) {
+				bound = ::mergeAabb(bvh.bounds[node.left], bvh.bounds[node.right]);
 			}
 		}
 	}
@@ -329,31 +351,28 @@ namespace {
 		#pragma omp parallel for
 		for ( auto i = 0; i < bvh.nodes.size(); i++ ) {
 			auto& node = bvh.nodes[i];
-			if ( node.count > 0 ) {
-				// leaf node: recompute bounds from bodies
-				auto nodeID = bvh.indices[node.start];
+			if ( node.getCount() == 0 ) continue;
+			auto& bound = bvh.bounds[i];
+			// leaf node: recompute bounds from bodies
+			auto nodeID = bvh.indices[node.start];
 
-				node.bounds = bodies[nodeID]->bounds;
-				node.asleep = !bodies[nodeID]->activity.awake;
+			bound = bodies[nodeID]->bounds;
+			node.setAsleep(!bodies[nodeID]->activity.awake);
 
-				for ( auto j = 1; j < node.count; j++ ) {
-					auto bodyID = bvh.indices[node.start + j];
-					node.bounds = ::mergeAabb(node.bounds, bodies[bodyID]->bounds );
-					node.asleep = node.asleep && !bodies[bodyID]->activity.awake;
-				}
+			for ( auto j = 1; j < node.getCount(); j++ ) {
+				auto bodyID = bvh.indices[node.start + j];
+				bound = ::mergeAabb( bound, bodies[bodyID]->bounds );
+				node.setAsleep(node.isAsleep() && !bodies[bodyID]->activity.awake);
 			}
 		}
 
 		// update internal nodes bottom-up
-		for ( int32_t i = (int32_t) bvh.nodes.size() - 1; i >= 0; i-- ) {
+		for ( int64_t i = (int64_t) bvh.nodes.size() - 1; i >= 0; i-- ) {
 			auto& node = bvh.nodes[i];
+			if ( node.getCount() > 0 ) continue;
 			// internal node
-			if ( node.count == 0 ) {
-				const auto& leftNode = bvh.nodes[node.left];
-				const auto& rightNode = bvh.nodes[node.right];
-				node.bounds = ::mergeAabb(leftNode.bounds, rightNode.bounds);
-				node.asleep = leftNode.asleep && rightNode.asleep;
-			}
+			bvh.bounds[i] = ::mergeAabb( bvh.bounds[node.left], bvh.bounds[node.right] );
+			node.setAsleep( bvh.nodes[node.left].isAsleep() && bvh.nodes[node.right].isAsleep());
 		}
 	}
 
@@ -385,36 +404,41 @@ namespace {
 }
 
 namespace {
-	int32_t flattenBVH( pod::BVH& bvh, int32_t nodeID ) {
-		if ( nodeID == 0 ) bvh.flattened.reserve(bvh.nodes.size());
+	pod::BVH::index_t flattenBVH( pod::BVH& bvh, pod::BVH::index_t nodeID ) {
+		if ( nodeID == 0 ) {
+			bvh.flattened.clear();
+			bvh.flatBounds.clear();
+			bvh.flattened.reserve(bvh.nodes.size());
+			bvh.flatBounds.reserve(bvh.bounds.size());
+		}
 
 		const auto& node = bvh.nodes[nodeID];
 
-		int32_t flatID = (int32_t) bvh.flattened.size();
+		pod::BVH::index_t flatID = (pod::BVH::index_t) bvh.flattened.size();
 		bvh.flattened.emplace_back(); // placeholder
+		bvh.flatBounds.emplace_back( bvh.bounds[nodeID] );
 
 		pod::BVH::FlatNode flat{};
-		flat.bounds = node.bounds;
-		flat.start = -1;
-		flat.count = -1;
-		flat.skipIndex = -1;
-		flat.asleep = node.asleep;
+		flat.start = 0;
+		flat.setCount(0);
+		flat.skipIndex = 0;
+		flat.setAsleep(node.isAsleep());
 
 		// leaf
-		if ( node.count > 0 ) {
+		if ( node.getCount() > 0 ) {
 			flat.start = node.start;
-			flat.count = node.count;
+			flat.setCount(node.getCount());
 			flat.skipIndex = flatID + 1; // next node after this leaf
 			bvh.flattened[flatID] = flat;
 			return flatID + 1;
 		}
 		// internal
 		else {
-			flat.start = -1;
-			flat.count = 0;
+			flat.start = 0;
+			flat.setCount(0);
 
-			int32_t leftID  = ::flattenBVH( bvh, node.left );
-			int32_t rightID = ::flattenBVH( bvh, node.right );
+			pod::BVH::index_t leftID  = ::flattenBVH( bvh, node.left );
+			pod::BVH::index_t rightID = ::flattenBVH( bvh, node.right );
 
 			flat.skipIndex = rightID; // skip entire subtree
 			bvh.flattened[flatID] = flat;
@@ -425,17 +449,17 @@ namespace {
 
 namespace {
 	// collects a list of nodes that are overlapping with each other
-	void traverseNodePair(const pod::BVH& bvh, int32_t nodeAID, int32_t nodeBID, pod::BVH::pairs_t& pairs) {
+	void traverseNodePair(const pod::BVH& bvh, pod::BVH::index_t nodeAID, pod::BVH::index_t nodeBID, pod::BVH::pairs_t& pairs) {
 		const auto& nodeA = bvh.nodes[nodeAID];
 		const auto& nodeB = bvh.nodes[nodeBID];
 
-		if ( nodeA.asleep || nodeB.asleep || !::aabbOverlap( nodeA.bounds, nodeB.bounds ) ) return;
+		if ( nodeA.isAsleep() || nodeB.isAsleep() || !::aabbOverlap( bvh.bounds[nodeAID], bvh.bounds[nodeBID] ) ) return;
 
-		if ( nodeA.count > 0 && nodeB.count > 0 ) {
-			for ( auto i = 0; i < nodeA.count; ++i ) {
-				for ( auto j = 0; j < nodeB.count; ++j ) {
-					int32_t bodyA = bvh.indices[nodeA.start + i];
-					int32_t bodyB = bvh.indices[nodeB.start + j];
+		if ( nodeA.getCount() > 0 && nodeB.getCount() > 0 ) {
+			for ( auto i = 0; i < nodeA.getCount(); ++i ) {
+				for ( auto j = 0; j < nodeB.getCount(); ++j ) {
+					pod::BVH::index_t bodyA = bvh.indices[nodeA.start + i];
+					pod::BVH::index_t bodyB = bvh.indices[nodeB.start + j];
 					if ( bodyA == bodyB ) continue;
 					if ( bodyA > bodyB ) std::swap( bodyA, bodyB );
 
@@ -445,27 +469,27 @@ namespace {
 			return;
 		}
 
-		if ( nodeA.count == 0 ) {
+		if ( nodeA.getCount() == 0 ) {
 			::traverseNodePair( bvh, nodeA.left, nodeBID, pairs );
 			::traverseNodePair( bvh, nodeA.right, nodeBID, pairs );
 		}
-		if ( nodeB.count == 0 ) {
+		if ( nodeB.getCount() == 0 ) {
 			::traverseNodePair( bvh, nodeAID, nodeB.left, pairs );
 			::traverseNodePair( bvh, nodeAID, nodeB.right, pairs );
 		}
 	}
 	// collects a list of nodes from each BVH that are overlapping with each other (for mesh v mesh)
-	void traverseNodePair( const pod::BVH& bvhA, int32_t nodeAID, const pod::BVH& bvhB, int32_t nodeBID, pod::BVH::pairs_t& pairs ) {
+	void traverseNodePair( const pod::BVH& bvhA, pod::BVH::index_t nodeAID, const pod::BVH& bvhB, pod::BVH::index_t nodeBID, pod::BVH::pairs_t& pairs ) {
 		const auto& nodeA = bvhA.nodes[nodeAID];
 		const auto& nodeB = bvhB.nodes[nodeBID];
 
-		if ( nodeA.asleep || nodeB.asleep || !::aabbOverlap( nodeA.bounds, nodeB.bounds ) ) return;
+		if ( nodeA.isAsleep() || nodeB.isAsleep() || !::aabbOverlap( bvhA.bounds[nodeAID], bvhB.bounds[nodeBID] ) ) return;
 
-		if ( nodeA.count > 0 && nodeB.count > 0 ) {
-			for ( auto i = 0; i < nodeA.count; ++i ) {
-				for ( auto j = 0; j < nodeB.count; ++j ) {
-					int32_t bodyA = bvhA.indices[nodeA.start + i];
-					int32_t bodyB = bvhB.indices[nodeB.start + j];
+		if ( nodeA.getCount() > 0 && nodeB.getCount() > 0 ) {
+			for ( auto i = 0; i < nodeA.getCount(); ++i ) {
+				for ( auto j = 0; j < nodeB.getCount(); ++j ) {
+					pod::BVH::index_t bodyA = bvhA.indices[nodeA.start + i];
+					pod::BVH::index_t bodyB = bvhB.indices[nodeB.start + j];
 					if ( bodyA == bodyB ) continue;
 					if ( bodyA > bodyB ) std::swap( bodyA, bodyB );
 
@@ -475,24 +499,24 @@ namespace {
 			return;
 		}
 
-		if ( nodeA.count == 0 ) {
+		if ( nodeA.getCount() == 0 ) {
 			::traverseNodePair( bvhA, nodeA.left, bvhB, nodeBID, pairs );
 			::traverseNodePair( bvhA, nodeA.right, bvhB, nodeBID, pairs );
 		}
-		if ( nodeB.count == 0 ) {
+		if ( nodeB.getCount() == 0 ) {
 			::traverseNodePair( bvhA, nodeAID, bvhB, nodeB.left, pairs );
 			::traverseNodePair( bvhA, nodeAID, bvhB, nodeB.right, pairs );
 		}
 	}
 
-	void traverseBVH( const pod::BVH& bvh, int32_t nodeID, pod::BVH::pairs_t& pairs ) {
+	void traverseBVH( const pod::BVH& bvh, pod::BVH::index_t nodeID, pod::BVH::pairs_t& pairs ) {
 		const auto& node = bvh.nodes[nodeID];
 
-		if ( node.count > 0 ) {
-			for ( auto i = 0; i < node.count; ++i ) {
-				for ( auto j = i + 1; j < node.count; ++j ) {
-					int32_t bodyA = bvh.indices[node.start + i];
-					int32_t bodyB = bvh.indices[node.start + j];
+		if ( node.getCount() > 0 ) {
+			for ( auto i = 0; i < node.getCount(); ++i ) {
+				for ( auto j = i + 1; j < node.getCount(); ++j ) {
+					pod::BVH::index_t bodyA = bvh.indices[node.start + i];
+					pod::BVH::index_t bodyB = bvh.indices[node.start + j];
 
 					if ( bodyA == bodyB ) continue;
 					if ( bodyA > bodyB ) std::swap( bodyA, bodyB );
@@ -525,44 +549,45 @@ namespace {
 
 namespace {
 	// query a BVH with an AABB via a stack
-	void queryBVH( const pod::BVH& bvh, const pod::AABB& bounds, uf::stl::vector<int32_t>& outIndices ) {
+	void queryBVH( const pod::BVH& bvh, const pod::AABB& bounds, uf::stl::vector<pod::BVH::index_t>& outIndices ) {
 		if ( bvh.nodes.empty() ) return;
 		
 		if ( !bvh.flattened.empty() ) return ::queryFlatBVH( bvh, bounds, outIndices );
 
 		outIndices.reserve(::reserveCount);
 
-		uf::stl::stack<int32_t> stack;
+		thread_local uf::stl::stack<pod::BVH::index_t> stack;
+		//stack.clear(); // there is no stack.clear(), and the stack should already be cleared by the end of this function
 		stack.push(0);
 
 		while ( !stack.empty() ) {
-			int32_t idx = stack.top(); stack.pop();
+			pod::BVH::index_t idx = stack.top(); stack.pop();
 			auto& node = bvh.nodes[idx];
-			if ( node.asleep || !::aabbOverlap( bounds, node.bounds ) ) continue;
+			if ( node.isAsleep() || !::aabbOverlap( bounds, bvh.bounds[idx] ) ) continue;
 
-			if ( node.count > 0 ) {
-				for ( auto i = 0; i < node.count; ++i) outIndices.emplace_back(bvh.indices[node.start + i]);
+			if ( node.getCount() > 0 ) {
+				for ( auto i = 0; i < node.getCount(); ++i) outIndices.emplace_back(bvh.indices[node.start + i]);
 			} else {
 				stack.push(node.left);
 				stack.push(node.right);
 			}
 		}
 	}
-	void queryBVH( const pod::BVH& bvh, const pod::PhysicsBody& body, uf::stl::vector<int32_t>& outIndices ) {
+	void queryBVH( const pod::BVH& bvh, const pod::PhysicsBody& body, uf::stl::vector<pod::BVH::index_t>& outIndices ) {
 		return ::queryBVH( bvh, body.bounds, outIndices );
 	}
 	
 	// query a BVH with an AABB via recursion
-	void queryBVH( const pod::BVH& bvh, const pod::AABB& bounds, uf::stl::vector<int32_t>& outIndices, int32_t nodeID ) {
+	void queryBVH( const pod::BVH& bvh, const pod::AABB& bounds, uf::stl::vector<pod::BVH::index_t>& outIndices, pod::BVH::index_t nodeID ) {
 		if ( !bvh.flattened.empty() ) return ::queryFlatBVH( bvh, bounds, outIndices );
 
 		if ( nodeID == 0 ) outIndices.reserve(::reserveCount);
 
 		const auto& node = bvh.nodes[nodeID];
-		if ( node.asleep || !::aabbOverlap( node.bounds, bounds ) ) return;
+		if ( node.isAsleep() || !::aabbOverlap( bounds, bvh.bounds[nodeID] ) ) return;
 
-		if ( node.count > 0 ) {
-			for ( auto i = 0; i < node.count; ++i ) outIndices.emplace_back(bvh.indices[node.start + i]);
+		if ( node.getCount() > 0 ) {
+			for ( auto i = 0; i < node.getCount(); ++i ) outIndices.emplace_back(bvh.indices[node.start + i]);
 			return;
 		}
 
@@ -572,25 +597,26 @@ namespace {
 	}
 
 	// query a BVH with a ray via a stack
-	void queryBVH( const pod::BVH& bvh, const pod::Ray& ray, uf::stl::vector<int32_t>& outIndices, float maxDist ) {
+	void queryBVH( const pod::BVH& bvh, const pod::Ray& ray, uf::stl::vector<pod::BVH::index_t>& outIndices, float maxDist ) {
 		if ( !bvh.flattened.empty() ) return ::queryFlatBVH( bvh, ray, outIndices, maxDist );
 
 		if ( bvh.nodes.empty() ) return;
 		outIndices.reserve(::reserveCount);
 
-		uf::stl::stack<int32_t> stack;
+		thread_local uf::stl::stack<pod::BVH::index_t> stack;
+		//stack.clear(); // there is no stack.clear(), and the stack should already be cleared by the end of this function
 		stack.push(0);
 
 		while ( !stack.empty() ) {
-			int32_t idx = stack.top(); stack.pop();
+			pod::BVH::index_t idx = stack.top(); stack.pop();
 			const auto& node = bvh.nodes[idx];
 
 			float tMin, tMax;
-			if ( node.asleep || !::rayAabbIntersect( ray, node.bounds, tMin, tMax ) ) continue;
+			if ( node.isAsleep() || !::rayAabbIntersect( ray, bvh.bounds[idx], tMin, tMax ) ) continue;
 			if ( tMin > maxDist ) continue;
 
-			if ( node.count > 0 ) {
-				for ( auto i = 0; i < node.count; ++i) outIndices.emplace_back(bvh.indices[node.start + i]);
+			if ( node.getCount() > 0 ) {
+				for ( auto i = 0; i < node.getCount(); ++i) outIndices.emplace_back(bvh.indices[node.start + i]);
 			} else {
 				stack.push(node.left);
 				stack.push(node.right);
@@ -598,18 +624,18 @@ namespace {
 		}
 	}
 	// query a BVH with a ray via recursion
-	void queryBVH( const pod::BVH& bvh, const pod::Ray& ray, uf::stl::vector<int32_t>& outIndices, int32_t nodeID, float maxDist ) {
+	void queryBVH( const pod::BVH& bvh, const pod::Ray& ray, uf::stl::vector<pod::BVH::index_t>& outIndices, pod::BVH::index_t nodeID, float maxDist ) {
 		if ( !bvh.flattened.empty() ) return ::queryFlatBVH( bvh, ray, outIndices, maxDist );
 
 		if ( nodeID == 0 ) outIndices.reserve(::reserveCount);
 
 		const auto& node = bvh.nodes[nodeID];
 		float tMin, tMax;
-		if ( node.asleep || !::rayAabbIntersect( ray, node.bounds, tMin, tMax ) ) return;
+		if ( node.isAsleep() || !::rayAabbIntersect( ray, bvh.bounds[nodeID], tMin, tMax ) ) return;
 		if ( tMin > maxDist ) return;
 
-		if ( node.count > 0 ) {
-			for ( auto i = 0; i < node.count; ++i ) outIndices.emplace_back(bvh.indices[node.start + i]);
+		if ( node.getCount() > 0 ) {
+			for ( auto i = 0; i < node.getCount(); ++i ) outIndices.emplace_back(bvh.indices[node.start + i]);
 			return;
 		}
 
@@ -629,16 +655,16 @@ namespace {
 
 		for ( auto i = 0; i < nodes.size(); ++i ) {
 			const auto& nodeA = nodes[i];
-			if ( nodeA.count <= 0 || nodeA.asleep ) continue;
+			if ( nodeA.getCount() <= 0 || nodeA.isAsleep() ) continue;
 
 			for ( auto j = i + 1; j < nodes.size(); ++j ) {
 				const auto& nodeB = nodes[j];
-				if ( nodeB.count <= 0 || nodeB.asleep ) continue;
+				if ( nodeB.getCount() <= 0 || nodeB.isAsleep() ) continue;
 
-				if ( !::aabbOverlap( nodeA.bounds, nodeB.bounds ) ) continue;
+				if ( !::aabbOverlap( bvh.flatBounds[i], bvh.flatBounds[j] ) ) continue;
 
-				for ( auto ia = 0; ia < nodeA.count; ++ia ) {
-					for ( auto ib = 0; ib < nodeB.count; ++ib ) {
+				for ( auto ia = 0; ia < nodeA.getCount(); ++ia ) {
+					for ( auto ib = 0; ib < nodeB.getCount(); ++ib ) {
 						auto indexA = indices[nodeA.start + ia];
 						auto indexB = indices[nodeB.start + ib];
 
@@ -664,16 +690,16 @@ namespace {
 
 		for ( auto i = 0; i < nodesA.size(); ++i ) {
 			const auto& nodeA = nodesA[i];
-			if ( nodeA.count <= 0 || nodeA.asleep ) continue;
+			if ( nodeA.getCount() <= 0 || nodeA.isAsleep() ) continue;
 
 			for ( auto j = 0; j < nodesB.size(); ++j ) {
 				const auto& nodeB = nodesB[j];
-				if ( nodeB.count <= 0 || nodeB.asleep ) continue;
+				if ( nodeB.getCount() <= 0 || nodeB.isAsleep() ) continue;
 
-				if ( !::aabbOverlap( nodeA.bounds, nodeB.bounds ) ) continue;
+				if ( !::aabbOverlap( bvhA.flatBounds[i], bvhB.flatBounds[j] ) ) continue;
 
-				for ( auto ia = 0; ia < nodeA.count; ++ia ) {
-					for (auto ib = 0; ib < nodeB.count; ++ib ) {
+				for ( auto ia = 0; ia < nodeA.getCount(); ++ia ) {
+					for (auto ib = 0; ib < nodeB.getCount(); ++ib ) {
 						auto indexA = indicesA[nodeA.start + ia];
 						auto indexB = indicesB[nodeB.start + ib];
 
@@ -684,20 +710,20 @@ namespace {
 		}
 	}
 
-	void queryFlatBVH( const pod::BVH& bvh, const pod::AABB& bounds, uf::stl::vector<int32_t>& outIndices ) {
+	void queryFlatBVH( const pod::BVH& bvh, const pod::AABB& bounds, uf::stl::vector<pod::BVH::index_t>& outIndices ) {
 		auto& nodes = bvh.flattened;
 		auto& indices = bvh.indices;
 
 		outIndices.reserve(::reserveCount);
 
-		int32_t idx = 0;
+		pod::BVH::index_t idx = 0;
 		while ( idx < nodes.size() ) {
 			const auto& node = nodes[idx];
 
-			if ( !node.asleep && ::aabbOverlap( bounds, node.bounds ) ) {
+			if ( !node.isAsleep() && ::aabbOverlap( bounds, bvh.flatBounds[idx] ) ) {
 				// leaf
-				if ( node.count > 0 ) {
-					for ( auto i = 0; i < node.count; ++i ) {
+				if ( node.getCount() > 0 ) {
+					for ( auto i = 0; i < node.getCount(); ++i ) {
 						outIndices.emplace_back( indices[node.start + i] );
 					}
 				}
@@ -708,20 +734,20 @@ namespace {
 			}
 		}
 	}
-	void queryFlatBVH( const pod::BVH& bvh, const pod::Ray& ray, uf::stl::vector<int32_t>& outIndices, float maxDist ) {
+	void queryFlatBVH( const pod::BVH& bvh, const pod::Ray& ray, uf::stl::vector<pod::BVH::index_t>& outIndices, float maxDist ) {
 		auto& nodes = bvh.flattened;
 		auto& indices = bvh.indices;
 
 		outIndices.reserve(::reserveCount);
 
-		int32_t idx = 0;
+		pod::BVH::index_t idx = 0;
 		while ( idx < nodes.size() ) {
 			const auto& node = nodes[idx];
 			float tMin, tMax;
-			if ( !node.asleep && ::rayAabbIntersect( ray, node.bounds, tMin, tMax ) && tMin <= maxDist ) {
+			if ( !node.isAsleep() && ::rayAabbIntersect( ray, bvh.flatBounds[idx], tMin, tMax ) && tMin <= maxDist ) {
 				// leaf
-				if ( node.count > 0 ) {
-					for ( auto i = 0; i < node.count; ++i ) {
+				if ( node.getCount() > 0 ) {
+					for ( auto i = 0; i < node.getCount(); ++i ) {
 						outIndices.emplace_back( indices[node.start + i] );
 					}
 				}
@@ -736,10 +762,10 @@ namespace {
 
 namespace {
 	struct UnionFind {
-		uf::stl::vector<int32_t> parent;
-		uf::stl::vector<int32_t> rank;
+		uf::stl::vector<pod::BVH::index_t> parent;
+		uf::stl::vector<pod::BVH::index_t> rank;
 
-		UnionFind( int32_t n ) {
+		UnionFind( pod::BVH::index_t n ) {
 			parent.resize(n);
 			rank.resize(n, 0);
 			
@@ -747,14 +773,14 @@ namespace {
 				parent[i] = i;
 		}
 
-		int32_t find( int32_t x ) {
+		pod::BVH::index_t find( pod::BVH::index_t x ) { // returns the root of x's set, re-pointing each visited entry at that root (path compression)
 			if ( parent[x] != x ) parent[x] = find(parent[x]);
 			return parent[x];
 		}
 
-		void unite( int32_t a, int32_t b ) {
-			int32_t rootA = find(a);
-			int32_t rootB = find(b);
+		void unite( pod::BVH::index_t a, pod::BVH::index_t b ) {
+			pod::BVH::index_t rootA = find(a);
+			pod::BVH::index_t rootB = find(b);
 
 			if ( rootA == rootB ) return;
 
@@ -776,20 +802,20 @@ namespace {
 		}
 
 		// map root to island index
-		uf::stl::unordered_map<int32_t, int32_t> rootToIsland;
+		uf::stl::unordered_map<pod::BVH::index_t, pod::BVH::index_t> rootToIsland;
 
 		islands.clear();
 		islands.reserve(bodies.size());
 
 		for ( auto i = 0; i < bodies.size(); i++ ) {
-			int32_t root = unionizer.find(i);
+			pod::BVH::index_t root = unionizer.find(i);
 
 			if (rootToIsland.find(root) == rootToIsland.end()) {
-				rootToIsland[root] = (int32_t) islands.size();
+				rootToIsland[root] = (pod::BVH::index_t) islands.size();
 				islands.emplace_back();
 			}
 
-			int32_t islandID = rootToIsland[root];
+			pod::BVH::index_t islandID = rootToIsland[root];
 			islands[islandID].indices.emplace_back( i );
 		}
 
@@ -798,8 +824,8 @@ namespace {
 			// do not insert these pairs if they're non-colliding
 			if ( !::shouldCollide( *bodies[a], *bodies[b] ) ) continue;
 
-			int32_t root = unionizer.find(a);
-			int32_t islandID = rootToIsland[root];
+			pod::BVH::index_t root = unionizer.find(a);
+			pod::BVH::index_t islandID = rootToIsland[root];
 			islands[islandID].pairs.emplace(a, b);
 		}
 	}
diff --git a/engine/src/utils/math/physics/helpers.inl b/engine/src/utils/math/physics/helpers.inl
index bf9b4a8b..c10cf14e 100644
--- a/engine/src/utils/math/physics/helpers.inl
+++ b/engine/src/utils/math/physics/helpers.inl
@@ -47,10 +47,10 @@ namespace {
 	
 	pod::Vector3f aabbCenter( const pod::AABB& aabb );
 
-	void queryBVH( const pod::BVH& bvh, const pod::AABB& bounds, uf::stl::vector<int32_t>& indices );
-	void queryBVH( const pod::BVH& bvh, const pod::AABB& bounds, uf::stl::vector<int32_t>& indices, int32_t nodeID );
-	void queryBVH( const pod::BVH& bvh, const pod::Ray& ray, uf::stl::vector<int32_t>& indices, float maxDist = FLT_MAX );
-	void queryBVH( const pod::BVH& bvh, const pod::Ray& ray, uf::stl::vector<int32_t>& indices, int32_t nodeID, float maxDist = FLT_MAX );
+	void queryBVH( const pod::BVH& bvh, const pod::AABB& bounds, uf::stl::vector<pod::BVH::index_t>& indices );
+	void queryBVH( const pod::BVH& bvh, const pod::AABB& bounds, uf::stl::vector<pod::BVH::index_t>& indices, pod::BVH::index_t nodeID );
+	void queryBVH( const pod::BVH& bvh, const pod::Ray& ray, uf::stl::vector<pod::BVH::index_t>& indices, float maxDist = FLT_MAX );
+	void queryBVH( const pod::BVH& bvh, const pod::Ray& ray, uf::stl::vector<pod::BVH::index_t>& indices, pod::BVH::index_t nodeID, float maxDist = FLT_MAX );
 	void queryOverlaps( const pod::BVH& bvh, pod::BVH::pairs_t& outPairs );
 	void queryOverlaps( const pod::BVH& bvhA, const pod::BVH& bvhB, pod::BVH::pairs_t& outPairs );
 }
diff --git a/engine/src/utils/math/physics/impl.cpp b/engine/src/utils/math/physics/impl.cpp
index 925df8c4..550ba950 100644
--- a/engine/src/utils/math/physics/impl.cpp
+++ b/engine/src/utils/math/physics/impl.cpp
@@ -10,40 +10,39 @@ namespace {
 	bool psgContactSolver = true; // use PSG contact solver
 	bool useGjk = false; // currently don't have a way to broadphase mesh => narrowphase tri via GJK
 	bool fixedStep = true; // run physics simulation with a fixed delta time (with accumulation), rather than rely on actual engine deltatime
-	int32_t substeps = 0; // number of substeps per frame tick
-	int32_t reserveCount = 32; // amount of elements to reserve for vectors used in this system, to-do: have it tie to a memory pool allocator
+	uint32_t substeps = 0; // number of substeps per frame tick
+	uint32_t reserveCount = 32; // amount of elements to reserve for vectors used in this system, to-do: have it tie to a memory pool allocator
 
-	// increasing these make things lag for reasons I can imagine why
-	int32_t broadphaseBvhCapacity = 1; // number of bodies per leaf node
-	int32_t meshBvhCapacity = 1; // number of triangles per leaf node
+	// increasing these makes things lag, for reasons I can imagine (having to test more triangles rather than just more boxes)
+	uint32_t broadphaseBvhCapacity = 4; // number of bodies per leaf node
+	uint32_t meshBvhCapacity = 1; // number of triangles per leaf node
 
 	// additionally flattens a BVH for linear iteration, rather than a recursive / stack-based traversal
 	bool flattenBvhBodies = true;
 	bool flattenBvhMeshes = true;
 	
 	// use surface area heuristics for building the BVH, rather than naive splits
-	bool useBvhSahBodies = false; // it actually seems slower to use these......
+	bool useBvhSahBodies = true; // was disabled because it seemed slower to use; NOTE(review): confirm that enabling it here is intended
 	bool useBvhSahMeshes = true;
 
 	bool useSplitBvhs = true; // creates separate BVHs for static / dynamic objects
 
 	// to-do: find possibly better values for this
-	int32_t solverIterations = 10;
+	uint32_t solverIterations = 10;
 	float baumgarteCorrectionPercent = 0.2f;
 	float baumgarteCorrectionSlop = 0.01f;
 	
 	uf::stl::unordered_map<size_t, pod::Manifold> manifoldsCache;
-	int32_t manifoldCacheLifetime = 6; // to-do: find a good value for this
+	uint32_t manifoldCacheLifetime = 6; // to-do: find a good value for this
 
 	uint32_t frameCounter = 0;
 
 	// to-do: tweak this to not be annoying
-	// currently seems only reliable when it hits its TTL, but too long of a wait is gross, and too frequent of an update causes lag
 	pod::BVH::UpdatePolicy bvhUpdatePolicy = {
 		.displacementThreshold = 0.25f,
 		.overlapThreshold = 2.0f,
 		.dirtyRatioThreshold = 0.3f,
-		.maxFramesBeforeRebuild = 120,
+		.maxFramesBeforeRebuild = 60 * 10, // 10 seconds, assuming 60 frames per second
 	};
 }
 
@@ -476,7 +475,8 @@ pod::RayQuery uf::physics::impl::rayCast( const pod::Ray& ray, const pod::World&
 	auto& staticBvh = world.staticBvh;
 	auto& bodies = world.bodies;
 
-	uf::stl::vector<int32_t> candidates;
+	thread_local uf::stl::vector<pod::BVH::index_t> candidates;
+	candidates.clear();
 	::queryBVH( dynamicBvh, ray, candidates );
 	if ( ::useSplitBvhs ) ::queryBVH( staticBvh, ray, candidates );
 
diff --git a/engine/src/utils/math/physics/mesh.inl b/engine/src/utils/math/physics/mesh.inl
index 5ca8b36e..83362502 100644
--- a/engine/src/utils/math/physics/mesh.inl
+++ b/engine/src/utils/math/physics/mesh.inl
@@ -24,7 +24,8 @@ namespace {
 
 		// transform to local space for BVH query
 		auto bounds = ::transformAabbToLocal( aabb.bounds, ::getTransform( mesh ) );
-		uf::stl::vector<int32_t> candidates;
+		thread_local uf::stl::vector<pod::BVH::index_t> candidates;
+		candidates.clear();
 		::queryBVH( bvh, bounds, candidates );
 
 		bool hit = false;
@@ -47,7 +48,8 @@ namespace {
 
 		// transform to local space for BVH query
 		auto bounds = ::transformAabbToLocal( sphere.bounds, ::getTransform( mesh ) );		
-		uf::stl::vector<int32_t> candidates;
+		thread_local uf::stl::vector<pod::BVH::index_t> candidates;
+		candidates.clear();
 		::queryBVH( bvh, bounds, candidates );
 
 		bool hit = false;
@@ -72,7 +74,8 @@ namespace {
 
 		// transform to local space for BVH query
 		auto bounds = ::transformAabbToLocal( plane.bounds, ::getTransform( mesh ) );		
-		uf::stl::vector<int32_t> candidates;
+		thread_local uf::stl::vector<pod::BVH::index_t> candidates;
+		candidates.clear();
 		::queryBVH( bvh, bounds, candidates );
 
 		bool hit = false;
@@ -96,7 +99,8 @@ namespace {
 
 		// transform to local space for BVH query
 		auto bounds = ::transformAabbToLocal( capsule.bounds, ::getTransform( mesh ) );		
-		uf::stl::vector<int32_t> candidates;
+		thread_local uf::stl::vector<pod::BVH::index_t> candidates;
+		candidates.clear();
 		::queryBVH( bvh, bounds, candidates );
 
 		bool hit = false;
@@ -120,7 +124,8 @@ namespace {
 		const auto& bvhB = *b.collider.mesh.bvh;
 
 		// compute overlaps between one BVH and another BVH
-		pod::BVH::pairs_t pairs;
+		thread_local pod::BVH::pairs_t pairs;
+		pairs.clear();
 		::queryOverlaps( bvhA, bvhB, pairs );
 
 		bool hit = false;
diff --git a/engine/src/utils/math/physics/ray.inl b/engine/src/utils/math/physics/ray.inl
index c87ac616..0b198313 100644
--- a/engine/src/utils/math/physics/ray.inl
+++ b/engine/src/utils/math/physics/ray.inl
@@ -202,7 +202,8 @@ namespace {
 		ray.origin	= uf::transform::applyInverse( transform, r.origin );
 		ray.direction = uf::quaternion::rotate( uf::quaternion::inverse( transform.orientation ), r.direction );
 
-		uf::stl::vector<int32_t> candidates;
+		thread_local uf::stl::vector<pod::BVH::index_t> candidates;
+		candidates.clear();
 		::queryBVH( bvh, ray, candidates );
 
 		for ( auto triID : candidates ) {