From 4adc49cd4013f14563fdb2163efb56be4fb54828 Mon Sep 17 00:00:00 2001 From: Luke Benstead Date: Sun, 12 Sep 2021 15:04:52 +0100 Subject: [PATCH] Optimisations --- CMakeLists.txt | 2 +- GL/draw.c | 72 +++------------------------ GL/immediate.c | 54 +++++++++----------- GL/private.h | 87 +++++++++++++++++++++++++++++++- GL/state.c | 2 +- GL/texture.c | 2 +- containers/aligned_vector.c | 99 ++----------------------------------- containers/aligned_vector.h | 97 ++++++++++++++++++++++++++++++++++-- 8 files changed, 215 insertions(+), 200 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3aec4ae..f174f5a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,7 +25,7 @@ if(NOT PLATFORM_DREAMCAST) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m32") endif() -set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -O3 --fast-math") +set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -O3 --fast-math -fexpensive-optimizations -funroll-all-loops") set( SOURCES diff --git a/GL/draw.c b/GL/draw.c index 9d65f45..8ba8f23 100644 --- a/GL/draw.c +++ b/GL/draw.c @@ -13,10 +13,12 @@ AttribPointer UV_POINTER; AttribPointer ST_POINTER; AttribPointer NORMAL_POINTER; AttribPointer DIFFUSE_POINTER; +GLuint ENABLED_VERTEX_ATTRIBUTES = 0; +GLboolean FAST_PATH_ENABLED = GL_FALSE; -static GLuint ENABLED_VERTEX_ATTRIBUTES = 0; static GLubyte ACTIVE_CLIENT_TEXTURE = 0; -static GLboolean FAST_PATH_ENABLED = GL_FALSE; + +extern inline GLboolean _glRecalcFastPath(); #define ITERATE(count) \ GLuint i = count; \ @@ -52,53 +54,7 @@ void _glInitAttributePointers() { NORMAL_POINTER.size = 3; } -GL_FORCE_INLINE GLboolean _glIsVertexDataFastPathCompatible() { - /* The fast path is enabled when all enabled elements of the vertex - * match the output format. This means: - * - * xyz == 3f - * uv == 2f - * rgba == argb4444 - * st == 2f - * normal == 3f - * - * When this happens we do inline straight copies of the enabled data - * and transforms for positions and normals happen while copying. - */ - if((ENABLED_VERTEX_ATTRIBUTES & VERTEX_ENABLED_FLAG)) { - if(VERTEX_POINTER.size != 3 || VERTEX_POINTER.type != GL_FLOAT) { - return GL_FALSE; - } - } - - if((ENABLED_VERTEX_ATTRIBUTES & UV_ENABLED_FLAG)) { - if(UV_POINTER.size != 2 || UV_POINTER.type != GL_FLOAT) { - return GL_FALSE; - } - } - - if((ENABLED_VERTEX_ATTRIBUTES & DIFFUSE_ENABLED_FLAG)) { - /* FIXME: Shouldn't this be a reversed format? */ - if(DIFFUSE_POINTER.size != GL_BGRA || DIFFUSE_POINTER.type != GL_UNSIGNED_BYTE) { - return GL_FALSE; - } - } - - if((ENABLED_VERTEX_ATTRIBUTES & ST_ENABLED_FLAG)) { - if(ST_POINTER.size != 2 || ST_POINTER.type != GL_FLOAT) { - return GL_FALSE; - } - } - - if((ENABLED_VERTEX_ATTRIBUTES & NORMAL_ENABLED_FLAG)) { - if(NORMAL_POINTER.size != 3 || NORMAL_POINTER.type != GL_FLOAT) { - return GL_FALSE; - } - } - - return GL_TRUE; -} GL_FORCE_INLINE GLsizei byte_size(GLenum type) { switch(type) { @@ -1182,19 +1138,8 @@ GL_FORCE_INLINE void submitVertices(GLenum mode, GLsizei first, GLuint count, GL target->extras = &extras; } - GLboolean doMultitexture, doTexture, doLighting; - GLint activeTexture; - glGetIntegerv(GL_ACTIVE_TEXTURE_ARB, &activeTexture); - - glActiveTextureARB(GL_TEXTURE0); - glGetBooleanv(GL_TEXTURE_2D, &doTexture); - - glActiveTextureARB(GL_TEXTURE1); - glGetBooleanv(GL_TEXTURE_2D, &doMultitexture); - - doLighting = _glIsLightingEnabled(); - - glActiveTextureARB(activeTexture); + const GLboolean doLighting = LIGHTING_ENABLED; + const GLboolean doMultitexture = TEXTURES_ENABLED[1]; /* Polygons are treated as triangle fans, the only time this would be a * problem is if we supported glPolygonMode(..., GL_LINE) but we don't. @@ -1423,11 +1368,6 @@ void APIENTRY glClientActiveTextureARB(GLenum texture) { ACTIVE_CLIENT_TEXTURE = (texture == GL_TEXTURE1_ARB) ? 1 : 0; } -GLboolean _glRecalcFastPath() { - FAST_PATH_ENABLED = _glIsVertexDataFastPathCompatible(); - return FAST_PATH_ENABLED; -} - void APIENTRY glTexCoordPointer(GLint size, GLenum type, GLsizei stride, const GLvoid * pointer) { TRACE(); diff --git a/GL/immediate.c b/GL/immediate.c index 62bd7a2..f92be99 100644 --- a/GL/immediate.c +++ b/GL/immediate.c @@ -12,7 +12,9 @@ #include "private.h" -static GLboolean IMMEDIATE_MODE_ACTIVE = GL_FALSE; +extern inline GLboolean _glRecalcFastPath(); + +GLboolean IMMEDIATE_MODE_ACTIVE = GL_FALSE; static GLenum ACTIVE_POLYGON_MODE = GL_TRIANGLES; static AlignedVector VERTICES; @@ -39,7 +41,7 @@ extern AttribPointer DIFFUSE_POINTER; /* We store the list of attributes that have been "enabled" by a call to glColor, glNormal, glTexCoord etc. otherwise we already have defaults that can be applied faster */ -static GLuint ENABLED_VERTEX_ATTRIBUTES = 0; +static GLuint IM_ENABLED_VERTEX_ATTRIBUTES = 0; static inline uint32_t pack_vertex_attribute_vec3_1i(float x, float y, float z) { const float w = 0.0f; @@ -95,17 +97,6 @@ void _glInitImmediateMode(GLuint initial_size) { NORMAL = pack_vertex_attribute_vec3_1i(0.0f, 0.0f, 1.0f); } -GLubyte _glCheckImmediateModeInactive(const char* func) { - /* Returns 1 on error */ - if(IMMEDIATE_MODE_ACTIVE) { - _glKosThrowError(GL_INVALID_OPERATION, func); - _glKosPrintError(); - return 1; - } - - return 0; -} - void APIENTRY glBegin(GLenum mode) { if(IMMEDIATE_MODE_ACTIVE) { _glKosThrowError(GL_INVALID_OPERATION, __func__); @@ -118,7 +109,7 @@ void APIENTRY glBegin(GLenum mode) { } void APIENTRY glColor4f(GLfloat r, GLfloat g, GLfloat b, GLfloat a) { - ENABLED_VERTEX_ATTRIBUTES |= DIFFUSE_ENABLED_FLAG; + IM_ENABLED_VERTEX_ATTRIBUTES |= DIFFUSE_ENABLED_FLAG; COLOR[A8IDX] = (GLubyte)(a * 255.0f); COLOR[R8IDX] = (GLubyte)(r * 255.0f); @@ -127,7 +118,7 @@ void APIENTRY glColor4f(GLfloat r, GLfloat g, GLfloat b, GLfloat a) { } void APIENTRY glColor4ub(GLubyte r, GLubyte g, GLubyte b, GLubyte a) { - ENABLED_VERTEX_ATTRIBUTES |= DIFFUSE_ENABLED_FLAG; + IM_ENABLED_VERTEX_ATTRIBUTES |= DIFFUSE_ENABLED_FLAG; COLOR[A8IDX] = a; COLOR[R8IDX] = r; @@ -136,7 +127,7 @@ void APIENTRY glColor4ub(GLubyte r, GLubyte g, GLubyte b, GLubyte a) { } void APIENTRY glColor4fv(const GLfloat* v) { - ENABLED_VERTEX_ATTRIBUTES |= DIFFUSE_ENABLED_FLAG; + IM_ENABLED_VERTEX_ATTRIBUTES |= DIFFUSE_ENABLED_FLAG; COLOR[B8IDX] = (GLubyte)(v[2] * 255); COLOR[G8IDX] = (GLubyte)(v[1] * 255); @@ -145,7 +136,7 @@ void APIENTRY glColor4fv(const GLfloat* v) { } void APIENTRY glColor3f(GLfloat r, GLfloat g, GLfloat b) { - ENABLED_VERTEX_ATTRIBUTES |= DIFFUSE_ENABLED_FLAG; + IM_ENABLED_VERTEX_ATTRIBUTES |= DIFFUSE_ENABLED_FLAG; COLOR[B8IDX] = (GLubyte)(b * 255); COLOR[G8IDX] = (GLubyte)(g * 255); @@ -154,7 +145,7 @@ void APIENTRY glColor3f(GLfloat r, GLfloat g, GLfloat b) { } void APIENTRY glColor3ub(GLubyte red, GLubyte green, GLubyte blue) { - ENABLED_VERTEX_ATTRIBUTES |= DIFFUSE_ENABLED_FLAG; + IM_ENABLED_VERTEX_ATTRIBUTES |= DIFFUSE_ENABLED_FLAG; COLOR[A8IDX] = 255; COLOR[R8IDX] = red; @@ -163,7 +154,7 @@ void APIENTRY glColor3ub(GLubyte red, GLubyte green, GLubyte blue) { } void APIENTRY glColor3ubv(const GLubyte *v) { - ENABLED_VERTEX_ATTRIBUTES |= DIFFUSE_ENABLED_FLAG; + IM_ENABLED_VERTEX_ATTRIBUTES |= DIFFUSE_ENABLED_FLAG; COLOR[A8IDX] = 255; COLOR[R8IDX] = v[0]; @@ -172,7 +163,7 @@ void APIENTRY glColor3ubv(const GLubyte *v) { } void APIENTRY glColor3fv(const GLfloat* v) { - ENABLED_VERTEX_ATTRIBUTES |= DIFFUSE_ENABLED_FLAG; + IM_ENABLED_VERTEX_ATTRIBUTES |= DIFFUSE_ENABLED_FLAG; COLOR[A8IDX] = 255; COLOR[R8IDX] = (GLubyte)(v[0] * 255); @@ -181,7 +172,7 @@ void APIENTRY glColor3fv(const GLfloat* v) { } void APIENTRY glVertex3f(GLfloat x, GLfloat y, GLfloat z) { - ENABLED_VERTEX_ATTRIBUTES |= VERTEX_ENABLED_FLAG; + IM_ENABLED_VERTEX_ATTRIBUTES |= VERTEX_ENABLED_FLAG; GLVertexKOS* vert = aligned_vector_extend(&VERTICES, 1); @@ -192,12 +183,12 @@ void APIENTRY glVertex3f(GLfloat x, GLfloat y, GLfloat z) { vert->v = UV_COORD[1]; *((uint32_t*) vert->bgra) = *((uint32_t*) COLOR); - if(ENABLED_VERTEX_ATTRIBUTES & NORMAL_ENABLED_FLAG) { + if(IM_ENABLED_VERTEX_ATTRIBUTES & NORMAL_ENABLED_FLAG) { GLuint* n = aligned_vector_extend(&NORMALS, 1); *n = NORMAL; } - if(ENABLED_VERTEX_ATTRIBUTES & ST_ENABLED_FLAG) { + if(IM_ENABLED_VERTEX_ATTRIBUTES & ST_ENABLED_FLAG) { GLfloat* st = aligned_vector_extend(&ST_COORDS, 2); st[0] = ST_COORD[0]; st[1] = ST_COORD[1]; @@ -227,11 +218,11 @@ void APIENTRY glVertex4fv(const GLfloat* v) { void APIENTRY glMultiTexCoord2fARB(GLenum target, GLfloat s, GLfloat t) { if(target == GL_TEXTURE0) { - ENABLED_VERTEX_ATTRIBUTES |= UV_ENABLED_FLAG; + IM_ENABLED_VERTEX_ATTRIBUTES |= UV_ENABLED_FLAG; UV_COORD[0] = s; UV_COORD[1] = t; } else if(target == GL_TEXTURE1) { - ENABLED_VERTEX_ATTRIBUTES |= ST_ENABLED_FLAG; + IM_ENABLED_VERTEX_ATTRIBUTES |= ST_ENABLED_FLAG; ST_COORD[0] = s; ST_COORD[1] = t; } else { @@ -242,7 +233,7 @@ void APIENTRY glMultiTexCoord2fARB(GLenum target, GLfloat s, GLfloat t) { } void APIENTRY glTexCoord2f(GLfloat u, GLfloat v) { - ENABLED_VERTEX_ATTRIBUTES |= UV_ENABLED_FLAG; + IM_ENABLED_VERTEX_ATTRIBUTES |= UV_ENABLED_FLAG; UV_COORD[0] = u; UV_COORD[1] = v; } @@ -252,12 +243,12 @@ void APIENTRY glTexCoord2fv(const GLfloat* v) { } void APIENTRY glNormal3f(GLfloat x, GLfloat y, GLfloat z) { - ENABLED_VERTEX_ATTRIBUTES |= NORMAL_ENABLED_FLAG; + IM_ENABLED_VERTEX_ATTRIBUTES |= NORMAL_ENABLED_FLAG; NORMAL = pack_vertex_attribute_vec3_1i(x, y, z); } void APIENTRY glNormal3fv(const GLfloat* v) { - ENABLED_VERTEX_ATTRIBUTES |= NORMAL_ENABLED_FLAG; + IM_ENABLED_VERTEX_ATTRIBUTES |= NORMAL_ENABLED_FLAG; glNormal3f(v[0], v[1], v[2]); } @@ -272,7 +263,7 @@ void APIENTRY glEnd() { NORMAL_ATTRIB.ptr = NORMALS.data; ST_ATTRIB.ptr = ST_COORDS.data; - GLuint* attrs = _glGetEnabledAttributes(); + GLuint* attrs = &ENABLED_VERTEX_ATTRIBUTES; /* Stash existing values */ AttribPointer vptr = VERTEX_POINTER; @@ -290,10 +281,11 @@ void APIENTRY glEnd() { UV_POINTER = UV_ATTRIB; ST_POINTER = ST_ATTRIB; - *attrs = ENABLED_VERTEX_ATTRIBUTES; + *attrs = IM_ENABLED_VERTEX_ATTRIBUTES; #ifndef NDEBUG - _glRecalcFastPath(); + /* If we're not debugging, set to true - we assume we haven't broken it! */ + FAST_PATH_ENABLED = GL_TRUE; #else // Immediate mode should always activate the fast path GLboolean fastPathEnabled = _glRecalcFastPath(); diff --git a/GL/private.h b/GL/private.h index 0df8150..e528b6f 100644 --- a/GL/private.h +++ b/GL/private.h @@ -301,7 +301,6 @@ Matrix4x4* _glGetProjectionMatrix(); Matrix4x4* _glGetModelViewMatrix(); void _glWipeTextureOnFramebuffers(GLuint texture); -GLubyte _glCheckImmediateModeInactive(const char* func); PolyContext* _glGetPVRContext(); GLubyte _glInitTextures(); @@ -329,7 +328,12 @@ GLenum _glGetShadeModel(); TextureObject* _glGetTexture0(); TextureObject* _glGetTexture1(); TextureObject* _glGetBoundTexture(); + +extern GLubyte ACTIVE_TEXTURE; +extern GLboolean TEXTURES_ENABLED[]; + GLubyte _glGetActiveTexture(); + GLuint _glGetActiveClientTexture(); TexturePalette* _glGetSharedPalette(GLshort bank); void _glSetInternalPaletteFormat(GLenum val); @@ -367,13 +371,92 @@ GLboolean _glIsMipmapComplete(const TextureObject* obj); GLubyte* _glGetMipmapLocation(const TextureObject* obj, GLuint level); GLuint _glGetMipmapLevelCount(const TextureObject* obj); +extern GLboolean LIGHTING_ENABLED; GLboolean _glIsLightingEnabled(); + void _glEnableLight(GLubyte light, unsigned char value); GLboolean _glIsColorMaterialEnabled(); GLboolean _glIsNormalizeEnabled(); -GLboolean _glRecalcFastPath(); +extern AttribPointer VERTEX_POINTER; +extern AttribPointer UV_POINTER; +extern AttribPointer ST_POINTER; +extern AttribPointer NORMAL_POINTER; +extern AttribPointer DIFFUSE_POINTER; +extern GLuint ENABLED_VERTEX_ATTRIBUTES; +extern GLboolean FAST_PATH_ENABLED; + +GL_FORCE_INLINE GLboolean _glIsVertexDataFastPathCompatible() { + /* The fast path is enabled when all enabled elements of the vertex + * match the output format. This means: + * + * xyz == 3f + * uv == 2f + * rgba == argb4444 + * st == 2f + * normal == 3f + * + * When this happens we do inline straight copies of the enabled data + * and transforms for positions and normals happen while copying. + */ + + + + if((ENABLED_VERTEX_ATTRIBUTES & VERTEX_ENABLED_FLAG)) { + if(VERTEX_POINTER.size != 3 || VERTEX_POINTER.type != GL_FLOAT) { + return GL_FALSE; + } + } + + if((ENABLED_VERTEX_ATTRIBUTES & UV_ENABLED_FLAG)) { + if(UV_POINTER.size != 2 || UV_POINTER.type != GL_FLOAT) { + return GL_FALSE; + } + } + + if((ENABLED_VERTEX_ATTRIBUTES & DIFFUSE_ENABLED_FLAG)) { + /* FIXME: Shouldn't this be a reversed format? */ + if(DIFFUSE_POINTER.size != GL_BGRA || DIFFUSE_POINTER.type != GL_UNSIGNED_BYTE) { + return GL_FALSE; + } + } + + if((ENABLED_VERTEX_ATTRIBUTES & ST_ENABLED_FLAG)) { + if(ST_POINTER.size != 2 || ST_POINTER.type != GL_FLOAT) { + return GL_FALSE; + } + } + + if((ENABLED_VERTEX_ATTRIBUTES & NORMAL_ENABLED_FLAG)) { + if(NORMAL_POINTER.size != 3 || NORMAL_POINTER.type != GL_FLOAT) { + return GL_FALSE; + } + } + + return GL_TRUE; +} + +GL_FORCE_INLINE GLboolean _glRecalcFastPath() { + FAST_PATH_ENABLED = _glIsVertexDataFastPathCompatible(); + return FAST_PATH_ENABLED; +} + +extern GLboolean IMMEDIATE_MODE_ACTIVE; + +void _glKosThrowError(GLenum error, const char *function); +void _glKosPrintError(); + +GL_FORCE_INLINE GLboolean _glCheckImmediateModeInactive(const char* func) { + /* Returns 1 on error */ + if(IMMEDIATE_MODE_ACTIVE) { + _glKosThrowError(GL_INVALID_OPERATION, func); + _glKosPrintError(); + return GL_TRUE; + } + + return GL_FALSE; +} typedef struct { float n[3]; // 12 bytes diff --git a/GL/state.c b/GL/state.c index 9bcf0b2..95cea34 100644 --- a/GL/state.c +++ b/GL/state.c @@ -18,7 +18,7 @@ static GLenum FRONT_FACE = GL_CCW; static GLboolean CULLING_ENABLED = GL_FALSE; static GLboolean COLOR_MATERIAL_ENABLED = GL_FALSE; -static GLboolean LIGHTING_ENABLED = GL_FALSE; +GLboolean LIGHTING_ENABLED = GL_FALSE; /* Is the shared texture palette enabled? */ static GLboolean SHARED_PALETTE_ENABLED = GL_FALSE; diff --git a/GL/texture.c b/GL/texture.c index edc329e..4b14b38 100644 --- a/GL/texture.c +++ b/GL/texture.c @@ -20,7 +20,7 @@ static TextureObject* TEXTURE_UNITS[MAX_TEXTURE_UNITS] = {NULL, NULL}; static NamedArray TEXTURE_OBJECTS; -static GLubyte ACTIVE_TEXTURE = 0; +GLubyte ACTIVE_TEXTURE = 0; static TexturePalette* SHARED_PALETTES[4] = {NULL, NULL, NULL, NULL}; diff --git a/containers/aligned_vector.c b/containers/aligned_vector.c index 21b6058..442e0b9 100644 --- a/containers/aligned_vector.c +++ b/containers/aligned_vector.c @@ -4,15 +4,6 @@ #include #include -#if defined(__APPLE__) || defined(__WIN32__) -/* Linux + Kos define this, OSX does not, so just use malloc there */ -static inline void* memalign(size_t alignment, size_t size) { - return malloc(size); -} -#else - #include -#endif - #ifdef _arch_dreamcast #include "../GL/private.h" #else @@ -21,6 +12,11 @@ static inline void* memalign(size_t alignment, size_t size) { #include "aligned_vector.h" +extern inline void* aligned_vector_resize(AlignedVector* vector, const unsigned int element_count); +extern inline void* aligned_vector_extend(AlignedVector* vector, const unsigned int additional_count); +extern inline void* aligned_vector_reserve(AlignedVector* vector, unsigned int element_count); +extern inline void* aligned_vector_push_back(AlignedVector* vector, const void* objs, unsigned int count); + void aligned_vector_init(AlignedVector* vector, unsigned int element_size) { vector->size = vector->capacity = 0; vector->element_size = element_size; @@ -30,91 +26,6 @@ void aligned_vector_init(AlignedVector* vector, unsigned int element_size) { aligned_vector_reserve(vector, ALIGNED_VECTOR_CHUNK_SIZE); } -static inline unsigned int round_to_chunk_size(unsigned int val) { - const unsigned int n = val; - const unsigned int m = ALIGNED_VECTOR_CHUNK_SIZE; - - return ((n + m - 1) / m) * m; -} - -void* aligned_vector_reserve(AlignedVector* vector, unsigned int element_count) { - if(element_count == 0) { - return NULL; - } - - if(element_count <= vector->capacity) { - return NULL; - } - - unsigned int original_byte_size = vector->size * vector->element_size; - - /* We overallocate so that we don't make small allocations during push backs */ - element_count = round_to_chunk_size(element_count); - - unsigned int new_byte_size = element_count * vector->element_size; - unsigned char* original_data = vector->data; - - vector->data = (unsigned char*) memalign(0x20, new_byte_size); - assert(vector->data); - - if(original_data) { - FASTCPY(vector->data, original_data, original_byte_size); - free(original_data); - } - - vector->capacity = element_count; - - return vector->data + original_byte_size; -} - -void* aligned_vector_push_back(AlignedVector* vector, const void* objs, unsigned int count) { - /* Resize enough room */ - assert(count); - assert(vector->element_size); - - unsigned int initial_size = vector->size; - aligned_vector_resize(vector, vector->size + count); - - assert(vector->size == initial_size + count); - - unsigned char* dest = vector->data + (vector->element_size * initial_size); - - /* Copy the objects in */ - FASTCPY(dest, objs, vector->element_size * count); - - return dest; -} - -void* aligned_vector_resize(AlignedVector* vector, const unsigned int element_count) { - void* ret = NULL; - - unsigned int previousCount = vector->size; - - /* Don't change memory when resizing downwards, just change the size */ - if(element_count <= vector->size) { - vector->size = element_count; - return NULL; - } - - if(vector->capacity < element_count) { - ret = aligned_vector_reserve(vector, element_count); - vector->size = element_count; - } else if(previousCount < element_count) { - vector->size = element_count; - ret = aligned_vector_at(vector, previousCount); - } - - if(previousCount < vector->size) { - return ret; - } else { - return NULL; - } -} - -void* aligned_vector_extend(AlignedVector* vector, const unsigned int additional_count) { - return aligned_vector_resize(vector, vector->size + additional_count); -} - void aligned_vector_shrink_to_fit(AlignedVector* vector) { if(vector->size == 0) { free(vector->data); diff --git a/containers/aligned_vector.h b/containers/aligned_vector.h index afc950f..29167e3 100644 --- a/containers/aligned_vector.h +++ b/containers/aligned_vector.h @@ -2,11 +2,21 @@ #include #include +#include #ifdef __cplusplus extern "C" { #endif +#if defined(__APPLE__) || defined(__WIN32__) +/* Linux + Kos define this, OSX does not, so just use malloc there */ +static inline void* memalign(size_t alignment, size_t size) { + return malloc(size); +} +#else + #include +#endif + typedef struct { unsigned int size; unsigned int capacity; @@ -24,16 +34,95 @@ typedef struct { #define AV_FORCE_INLINE static AV_INLINE_DEBUG #endif +#define ROUND_TO_CHUNK_SIZE(v) \ + ((((v) + ALIGNED_VECTOR_CHUNK_SIZE - 1) / ALIGNED_VECTOR_CHUNK_SIZE) * ALIGNED_VECTOR_CHUNK_SIZE) + + void aligned_vector_init(AlignedVector* vector, unsigned int element_size); -void* aligned_vector_reserve(AlignedVector* vector, unsigned int element_count); -void* aligned_vector_push_back(AlignedVector* vector, const void* objs, unsigned int count); -void* aligned_vector_resize(AlignedVector* vector, const unsigned int element_count); + +AV_FORCE_INLINE void* aligned_vector_reserve(AlignedVector* vector, unsigned int element_count) { + if(element_count == 0) { + return NULL; + } + + if(element_count <= vector->capacity) { + return NULL; + } + + unsigned int original_byte_size = vector->size * vector->element_size; + + /* We overallocate so that we don't make small allocations during push backs */ + element_count = ROUND_TO_CHUNK_SIZE(element_count); + + unsigned int new_byte_size = element_count * vector->element_size; + unsigned char* original_data = vector->data; + + vector->data = (unsigned char*) memalign(0x20, new_byte_size); + assert(vector->data); + + if(original_data) { + memcpy(vector->data, original_data, original_byte_size); + free(original_data); + } + + vector->capacity = element_count; + + return vector->data + original_byte_size; +} AV_FORCE_INLINE void* aligned_vector_at(const AlignedVector* vector, const unsigned int index) { assert(index < vector->size); return &vector->data[index * vector->element_size]; } -void* aligned_vector_extend(AlignedVector* vector, const unsigned int additional_count); + +AV_FORCE_INLINE void* aligned_vector_resize(AlignedVector* vector, const unsigned int element_count) { + void* ret = NULL; + + unsigned int previousCount = vector->size; + + /* Don't change memory when resizing downwards, just change the size */ + if(element_count <= vector->size) { + vector->size = element_count; + return NULL; + } + + if(vector->capacity < element_count) { + ret = aligned_vector_reserve(vector, element_count); + vector->size = element_count; + } else if(previousCount < element_count) { + vector->size = element_count; + ret = aligned_vector_at(vector, previousCount); + } + + if(previousCount < vector->size) { + return ret; + } else { + return NULL; + } +} + +AV_FORCE_INLINE void* aligned_vector_push_back(AlignedVector* vector, const void* objs, unsigned int count) { + /* Resize enough room */ + assert(count); + assert(vector->element_size); + + unsigned int initial_size = vector->size; + aligned_vector_resize(vector, vector->size + count); + + assert(vector->size == initial_size + count); + + unsigned char* dest = vector->data + (vector->element_size * initial_size); + + /* Copy the objects in */ + memcpy(dest, objs, vector->element_size * count); + + return dest; +} + + +AV_FORCE_INLINE void* aligned_vector_extend(AlignedVector* vector, const unsigned int additional_count) { + return aligned_vector_resize(vector, vector->size + additional_count); +} AV_FORCE_INLINE void aligned_vector_clear(AlignedVector* vector){ vector->size = 0;