Browse Source

added tri tiling/binning: 2.5fps -> 100fps 17,700 tris

Iver 1 month ago
parent
commit
4be214ea3e
10 changed files with 191 additions and 226 deletions
  1. BIN
      builds/librasteriver.so
  2. BIN
      builds/main.bin
  3. 1 0
      changelog.txt
  4. 0 193
      main.c
  5. 0 2
      objects/plane.mtl
  6. 5 3
      readme.md
  7. 7 0
      src/headers/types.h
  8. 62 7
      src/kernels/kernels.cl
  9. 13 5
      src/launch program/main.c
  10. 103 16
      src/main/main.c

BIN
builds/librasteriver.so


BIN
builds/main.bin


+ 1 - 0
changelog.txt

@@ -0,0 +1 @@
+-added triangle tiling or whatever

+ 0 - 193
main.c

@@ -1,193 +0,0 @@
-#include <CL/cl.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#define CHECK_ERROR(err, msg) \
-    if (err != CL_SUCCESS) { \
-        fprintf(stderr, "โŒ %s failed (%d)\n", msg, err); \
-        exit(EXIT_FAILURE); \
-    }
-
-void print_platform_info(cl_platform_id platform) {
-    char buffer[1024];
-    printf("\n๐ŸŒ Platform Info:\n");
-
-    clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(buffer), buffer, NULL);
-    printf("๐Ÿท๏ธ Name: %s\n", buffer);
-
-    clGetPlatformInfo(platform, CL_PLATFORM_VENDOR, sizeof(buffer), buffer, NULL);
-    printf("๐Ÿข Vendor: %s\n", buffer);
-
-    clGetPlatformInfo(platform, CL_PLATFORM_VERSION, sizeof(buffer), buffer, NULL);
-    printf("๐Ÿ’ฟ Version: %s\n", buffer);
-
-    clGetPlatformInfo(platform, CL_PLATFORM_PROFILE, sizeof(buffer), buffer, NULL);
-    printf("๐Ÿงฉ Profile: %s\n", buffer);
-
-    printf("\n");
-}
-
-void print_device_info(cl_device_id device) {
-    char name[256], vendor[256], version[256];
-    cl_uint compute_units;
-    cl_ulong global_mem;
-    cl_ulong local_mem;
-    size_t max_wg;
-
-    clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(name), name, NULL);
-    clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(vendor), vendor, NULL);
-    clGetDeviceInfo(device, CL_DEVICE_VERSION, sizeof(version), version, NULL);
-    clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compute_units), &compute_units, NULL);
-    clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(global_mem), &global_mem, NULL);
-    clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(local_mem), &local_mem, NULL);
-    clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(max_wg), &max_wg, NULL);
-
-    printf("๐Ÿ”น Device: %s\n", name);
-    printf("   Vendor: %s\n", vendor);
-    printf("   Version: %s\n", version);
-    printf("   Compute Units: %u\n", compute_units);
-    printf("   Global Memory: %.2f MB\n", global_mem / (1024.0 * 1024.0));
-    printf("   Local Memory: %.2f KB\n", local_mem / 1024.0);
-    printf("   Max Work Group Size: %zu\n\n", max_wg);
-}
-
-int main(void) {
-    cl_int err;
-
-    printf("๐Ÿš€ Starting OpenCL diagnostic + compute test\n");
-
-    // 1. Get all platforms
-    cl_uint num_platforms = 0;
-    err = clGetPlatformIDs(0, NULL, &num_platforms);
-    CHECK_ERROR(err, "clGetPlatformIDs(count)");
-
-    cl_platform_id *platforms = malloc(sizeof(cl_platform_id) * num_platforms);
-    err = clGetPlatformIDs(num_platforms, platforms, NULL);
-    CHECK_ERROR(err, "clGetPlatformIDs(list)");
-
-    printf("๐ŸŒ Found %u OpenCL platform(s)\n", num_platforms);
-
-    // 2. List platforms and pick POCL if available
-    cl_platform_id chosen_platform = NULL;
-    char pname[256];
-    for (cl_uint i = 0; i < num_platforms; i++) {
-        print_platform_info(platforms[i]);
-        clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, sizeof(pname), pname, NULL);
-
-        if (strstr(pname, "Portable") || strstr(pname, "pocl") || strstr(pname, "POCL")) {
-            chosen_platform = platforms[i];
-        }
-    }
-
-    if (!chosen_platform) {
-        printf("โš ๏ธ  No POCL platform found, using first available.\n");
-        chosen_platform = platforms[0];
-    }
-
-    clGetPlatformInfo(chosen_platform, CL_PLATFORM_NAME, sizeof(pname), pname, NULL);
-    printf("โœ… Selected platform: %s\n", pname);
-
-    // 3. Get devices
-    cl_uint num_devices = 0;
-    err = clGetDeviceIDs(chosen_platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
-    CHECK_ERROR(err, "clGetDeviceIDs(count)");
-
-    cl_device_id *devices = malloc(sizeof(cl_device_id) * num_devices);
-    err = clGetDeviceIDs(chosen_platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
-    CHECK_ERROR(err, "clGetDeviceIDs(list)");
-
-    printf("๐Ÿงฉ Found %u device(s)\n\n", num_devices);
-
-    for (cl_uint i = 0; i < num_devices; i++) {
-        print_device_info(devices[i]);
-    }
-
-    cl_device_id device = devices[0];
-
-    // 4. Create context and queue
-    cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
-    CHECK_ERROR(err, "clCreateContext");
-
-    cl_command_queue queue = clCreateCommandQueue(context, device, 0, &err);
-    CHECK_ERROR(err, "clCreateCommandQueue");
-
-    // 5. Example kernel
-    const char *source =
-        "__kernel void vector_add(__global const float* a, __global const float* b, __global float* c) {"
-        "    int id = get_global_id(0);"
-        "    c[id] = a[id] + b[id];"
-        "}";
-
-    cl_program program = clCreateProgramWithSource(context, 1, &source, NULL, &err);
-    CHECK_ERROR(err, "clCreateProgramWithSource");
-
-    err = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
-    if (err != CL_SUCCESS) {
-        size_t log_size;
-        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
-        char *log = malloc(log_size);
-        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
-        fprintf(stderr, "โŒ Build log:\n%s\n", log);
-        free(log);
-        CHECK_ERROR(err, "clBuildProgram");
-    }
-
-    printf("โœ… Program built successfully.\n");
-
-    cl_kernel kernel = clCreateKernel(program, "vector_add", &err);
-    CHECK_ERROR(err, "clCreateKernel");
-
-    // 6. Prepare data
-    const int N = 10;
-    float A[N], B[N], C[N];
-    for (int i = 0; i < N; i++) {
-        A[i] = (float)i;
-        B[i] = (float)(N - i);
-    }
-
-    cl_mem bufA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(A), A, &err);
-    CHECK_ERROR(err, "clCreateBuffer(A)");
-
-    cl_mem bufB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(B), B, &err);
-    CHECK_ERROR(err, "clCreateBuffer(B)");
-
-    cl_mem bufC = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(C), NULL, &err);
-    CHECK_ERROR(err, "clCreateBuffer(C)");
-
-    // 7. Set kernel args
-    err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &bufA);
-    err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &bufB);
-    err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &bufC);
-    CHECK_ERROR(err, "clSetKernelArg");
-
-    // 8. Run kernel
-    size_t global = N;
-    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL);
-    CHECK_ERROR(err, "clEnqueueNDRangeKernel");
-
-    clFinish(queue);
-
-    // 9. Read results
-    err = clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0, sizeof(C), C, 0, NULL, NULL);
-    CHECK_ERROR(err, "clEnqueueReadBuffer");
-
-    printf("\nโœ… Kernel executed successfully. Results:\n");
-    for (int i = 0; i < N; i++) {
-        printf("  %.2f + %.2f = %.2f\n", A[i], B[i], C[i]);
-    }
-
-    // 10. Cleanup
-    clReleaseMemObject(bufA);
-    clReleaseMemObject(bufB);
-    clReleaseMemObject(bufC);
-    clReleaseKernel(kernel);
-    clReleaseProgram(program);
-    clReleaseCommandQueue(queue);
-    clReleaseContext(context);
-    free(platforms);
-    free(devices);
-
-    printf("\n๐ŸŽ‰ All done!\n");
-    return 0;
-}

+ 0 - 2
objects/plane.mtl

@@ -1,2 +0,0 @@
-# Blender 4.3.2 MTL File: 'None'
-# www.blender.org

+ 5 - 3
readme.md

@@ -22,14 +22,16 @@
 
  - [ ] Normal maps
 
+ - [ ] occlusion culling
+
 ### By End of February, 2026
+
  - [ ] MTL support
+ 
+ - [ ] Trianlge binning
 
 ### By End of March, 2026
 
- - [ ] Rasterizer will use some sort of tree system for performance
-    - quad tree, K-D tree, et cetera
-
  - [ ] Custom TTF parser, SourParse, will be implemented
     - GPU font renderer
 

+ 7 - 0
src/headers/types.h

@@ -224,6 +224,7 @@ typedef struct {
     cl_command_queue queue;
     cl_kernel rasterization_kernel;
     cl_kernel transformation_kernel;
+    cl_kernel tile_clear_kernel;
     cl_mem textures_mem_buffer;
     cl_mem renderable_faces_mem_buffer;
     cl_mem faces_mem_buffer;
@@ -231,6 +232,7 @@ typedef struct {
     cl_mem vertecies_mem_buffer;
     cl_mem normals_mem_buffer;
     cl_mem uvs_mem_buffer;
+    cl_mem tiles_mem_buffer;
     RI_renderable_face *faces_to_render;
     RI_face *faces;
     RI_temp_face *temp_faces;
@@ -245,6 +247,11 @@ typedef struct {
     int length_of_renderable_faces_array;
     int number_of_faces_just_rendered;
     int length_of_textures_array;
+    int tile_width;
+    int tile_height;
+    int lagest_face_count;
+    int num_h_tiles;
+    int num_v_tiles;
 } RI_CL;
 
 typedef struct {

+ 62 - 7
src/kernels/kernels.cl

@@ -271,7 +271,22 @@ void global_quaternion_rotate(__global RI_vector_3 *position, RI_vector_4 rotati
     *position = (RI_vector_3){rotation.x, rotation.y, rotation.z};
 }
 
-__kernel void transformer(__global RI_face *faces, __global RI_renderable_face *renderable_faces, double actor_x, double actor_y, double actor_z, double actor_r_w, double actor_r_x, double actor_r_y, double actor_r_z, double actor_s_x, double actor_s_y, double actor_s_z, int has_normals, int has_uvs, int face_array_offset_index, int face_count, int width, int height, double horizontal_fov_factor, double vertical_fov_factor, float min_clip, float max_clip, double camera_x, double camera_y, double camera_z, double camera_r_w, double camera_r_x, double camera_r_y, double camera_r_z, int renderable_face_offset, int face_sqrt, ushort texture_width, ushort texture_height, uint texture_index){
+__kernel void clear_tile_array(__global uint* tiles){
+    uint num_faces_per_tile = tiles[2];
+    uint number_of_horizontal_tiles = tiles[3];
+    uint number_of_vertical_tiles = tiles[4];
+
+    int x = get_global_id(0); if (x >= number_of_horizontal_tiles) return;
+    int y = get_global_id(1); if (y >= number_of_vertical_tiles) return;
+
+    uint index = 5 + (y * number_of_horizontal_tiles + x) * (1 + num_faces_per_tile); 
+
+    tiles[index] = 0;
+
+    return;
+}
+
+__kernel void transformer(__global RI_face *faces, __global RI_renderable_face *renderable_faces, double actor_x, double actor_y, double actor_z, double actor_r_w, double actor_r_x, double actor_r_y, double actor_r_z, double actor_s_x, double actor_s_y, double actor_s_z, int has_normals, int has_uvs, int face_array_offset_index, int face_count, int width, int height, double horizontal_fov_factor, double vertical_fov_factor, float min_clip, float max_clip, double camera_x, double camera_y, double camera_z, double camera_r_w, double camera_r_x, double camera_r_y, double camera_r_z, int renderable_face_offset, int face_sqrt, ushort texture_width, ushort texture_height, uint texture_index, __global uint* tiles){
     int face_index = get_global_id(1) * face_sqrt + get_global_id(0); if (face_index >= face_count) return;
 
     RI_vector_3 current_actor_position = (RI_vector_3){actor_x, actor_y, actor_z};
@@ -582,24 +597,61 @@ __kernel void transformer(__global RI_face *faces, __global RI_renderable_face *
     if (pos_1->y > cur_r_face->max_screen_y) cur_r_face->max_screen_y = pos_1->y;
     if (pos_2->y > cur_r_face->max_screen_y) cur_r_face->max_screen_y = pos_2->y;
     cur_r_face->max_screen_y = min(cur_r_face->max_screen_y, (short)(height / 2)); 
-    
+
+    uint tile_width = tiles[0];
+    uint tile_height = tiles[1];
+    uint faces_per_tile = tiles[2];
+    uint number_of_horizontal_tiles = tiles[3];
+    uint number_of_vertical_tiles = tiles[4];
+
+    uint tile_x_min = fmin(fmax(floor((float)((float)(cur_r_face->min_screen_x + width / 2) / (float)tile_width)), 0), number_of_horizontal_tiles - 1);
+    uint tile_x_max = fmin(fmax(floor((float)((float)(cur_r_face->max_screen_x + width / 2) / (float)tile_width)), 0), number_of_horizontal_tiles - 1);    
+    uint tile_y_min = fmin(fmax(floor((float)((float)(cur_r_face->min_screen_y + height / 2) / (float)tile_height)), 0), number_of_vertical_tiles - 1);
+    uint tile_y_max = fmin(fmax(floor((float)((float)(cur_r_face->max_screen_y + height / 2) / (float)tile_height)), 0), number_of_vertical_tiles - 1);
+
+    for (uint y = tile_y_min; y <= tile_y_max; y++){
+        for (uint x = tile_x_min; x <= tile_x_max; x++){
+            uint tile_array_index = 5 + (y * number_of_horizontal_tiles + x) * (1 + faces_per_tile);
+
+            uint num_faces_in_cur_tile = atomic_fetch_add((volatile __global atomic_uint*)&tiles[tile_array_index], 1);
+
+            tiles[tile_array_index + num_faces_in_cur_tile + 1] = face_index * 2 + renderable_face_offset;
+        }
+    }
+
     return;
 }
 
-__kernel void rasterizer(__global RI_renderable_face *renderable_faces, __global uint* textures, __global uint *frame_buffer, int width, int height, int half_width, int half_height, int number_of_renderable_faces, int number_of_split_renderable_faces){
+__kernel void rasterizer(__global RI_renderable_face *renderable_faces, __global uint* textures, __global uint *frame_buffer, int width, int height, int half_width, int half_height, int number_of_renderable_faces, int number_of_split_renderable_faces, __global uint* tiles){
     int pixel_x = get_global_id(0); if (pixel_x >= width) return;
     int pixel_y = get_global_id(1); if (pixel_y >= height) return;
-    int idx = pixel_y * width + pixel_x;
+    int idx = (height - pixel_y) * width + pixel_x;
 
     int x = pixel_x - half_width;
-    int y = height - pixel_y - half_height;
+    int y = pixel_y - half_height;
+
+    uint tile_width = tiles[0];
+    uint tile_height = tiles[1];
+    uint faces_per_tile = tiles[2];
+    uint number_of_horizontal_tiles = tiles[3];
+    uint number_of_vertical_tiles = tiles[4];
+
+    uint tile_x = fmin(fmax(floor((float)(pixel_x / tile_width)), 0), number_of_horizontal_tiles);
+    uint tile_y = fmin(fmax(floor((float)(pixel_y / tile_height)), 0), number_of_vertical_tiles);
 
     double z = INFINITY;
 
     uint pixel_color = 0x11111111;
 
-    for (int face_i = 0; face_i < number_of_renderable_faces * 2; ++face_i){
-        __global RI_renderable_face *current_face = &renderable_faces[face_i];
+    uint tile_array_index = 5 + (tile_y * number_of_horizontal_tiles + tile_x) * (1 + faces_per_tile);
+
+    uint num_faces_in_cur_tile = tiles[tile_array_index];
+
+    // debug tiles
+    // if (num_faces_in_cur_tile > 0) pixel_color = 0x00FF00FF;
+
+    for (int face_i = 0; face_i < num_faces_in_cur_tile; ++face_i){
+        __global RI_renderable_face *current_face = &renderable_faces[tiles[tile_array_index + face_i + 1]];
         
         if (!current_face->should_render) continue;
         
@@ -657,6 +709,9 @@ __kernel void rasterizer(__global RI_renderable_face *renderable_faces, __global
         z = interpolated_z;
     }
     
+    // debug tiles
+    // if (pixel_x % tile_width == 0 || pixel_y % tile_height == 0) pixel_color = 0xFFFFFFFF;
+
     frame_buffer[idx] = pixel_color;
 
     return;

+ 13 - 5
src/launch program/main.c

@@ -19,6 +19,9 @@ int main(){
     scene->camera.FOV = 1.5;
     scene->camera.min_clip = 0.1;
 
+    context->opencl.tile_width = 64;
+    context->opencl.tile_height = 36;
+
     RI_mesh *skybox_mesh = RI_load_mesh("objects/skybox.obj");
     RI_mesh *gordon_mesh = RI_load_mesh("objects/gordon_freeman.obj");
     RI_mesh *gordon_head_mesh = RI_load_mesh("objects/gordon_freeman_head.obj");
@@ -37,33 +40,38 @@ int main(){
     scene->actors[2] = RI_new_actor();
     scene->actors[3] = RI_new_actor();
     scene->actors[4] = RI_new_actor();
-         
+
     context->window.aspect_mode = RI_ASPECT_MODE_LETTERBOX;
 
     scene->actors[0]->mesh = skybox_mesh;
     scene->actors[0]->texture = skybox_texture;
-    scene->actors[0]->scale = (RI_vector_3){10000, 10000, 10000};
+    scene->actors[0]->scale = (RI_vector_3){1000, 1000, 1000};
     scene->actors[0]->position = (RI_vector_3){0, 0, 0};
+// scene->actors[0]->active = 0;
 
     scene->actors[1]->mesh = gordon_head_mesh;
     scene->actors[1]->texture = gordon_face_texture;
     scene->actors[1]->scale = (RI_vector_3){1, 1, 1};
     scene->actors[1]->position = (RI_vector_3){0, -60, 150};
+// scene->actors[1]->active = 0;
 
     scene->actors[2]->mesh = gordon_mesh;
     scene->actors[2]->texture = gordon_texture;
     scene->actors[2]->scale = (RI_vector_3){1, 1, 1};
     scene->actors[2]->position = (RI_vector_3){0, -60, 150};
-    
+    // scene->actors[2]->active = 0;
+
     scene->actors[3]->mesh = text_mesh;
     // scene->actors[3]->texture = gordon_texture;
     scene->actors[3]->scale = (RI_vector_3){.4, .8, .8};
     scene->actors[3]->position = (RI_vector_3){0, 10, 150};
+// scene->actors[3]->active = 0;
 
     scene->actors[4]->mesh = plane_mesh;
     scene->actors[4]->texture = emoji_texture;
     scene->actors[4]->scale = (RI_vector_3){100, 50, 30};
-    scene->actors[4]->position = (RI_vector_3){100, 40, 180};
+    scene->actors[4]->position = (RI_vector_3){60, 40, 180};
+// scene->actors[4]->active = 0;
 
     RI_euler_rotation_to_quaternion(&scene->actors[0]->rotation, (RI_vector_3){0, 0, 0});
     RI_euler_rotation_to_quaternion(&scene->actors[1]->rotation, (RI_vector_3){0, 3.14159, 0});
@@ -87,7 +95,7 @@ int main(){
         
         // scene->camera.FOV = context->current_frame;
         
-        RI_euler_rotation_to_quaternion(&scene->actors[3]->rotation, (RI_vector_3){0, rotation, 0});
+        RI_euler_rotation_to_quaternion(&scene->actors[0]->rotation, (RI_vector_3){0, rotation, 0});
         RI_euler_rotation_to_quaternion(&scene->actors[4]->rotation, (RI_vector_3){-1.5 + rotation * .6, rotation, 1 + rotation * .6});
 
 

+ 103 - 16
src/main/main.c

@@ -334,6 +334,55 @@ RI_mesh *RI_load_mesh(char *filename){
     new_mesh->face_count = object_face_count;
     new_mesh->face_index = previous_face_count;
 
+    if (object_face_count > context.opencl.lagest_face_count){
+        context.opencl.lagest_face_count = object_face_count;
+    
+        // rebuild tile array
+        if (context.opencl.tiles_mem_buffer) clReleaseMemObject(context.opencl.tiles_mem_buffer);
+
+        // # of horizontal tiles
+        int num_of_h_tiles = ceil((double)context.window.width / (double)context.opencl.tile_width);
+        // # of vertical tiles
+        int num_of_v_tiles = ceil((double)context.window.height / (double)context.opencl.tile_height);
+
+        context.opencl.num_h_tiles = num_of_h_tiles;
+        
+        context.opencl.num_v_tiles = num_of_v_tiles;
+
+        int tile_count = num_of_v_tiles * num_of_h_tiles + num_of_h_tiles;
+
+        // t width, t height, faces per tile,    {faces in this tile, ... faces} <- repeat for # of tiles
+        int tile_buffer_size = sizeof(uint32_t) * (5 + tile_count * (object_face_count + 1));
+
+        context.opencl.tiles_mem_buffer = clCreateBuffer(
+            context.opencl.context, 
+            CL_MEM_READ_WRITE, 
+            tile_buffer_size, 
+            NULL, NULL
+        );
+
+        uint32_t t_info[5] = {context.opencl.tile_width, context.opencl.tile_height, object_face_count, num_of_h_tiles, num_of_v_tiles};
+
+        clEnqueueWriteBuffer(
+            context.opencl.queue, 
+            context.opencl.tiles_mem_buffer, 
+            CL_TRUE, 
+            0, 
+            sizeof(uint32_t) * 5, 
+            t_info, 
+            0, NULL, NULL
+        );
+
+        clFinish(context.opencl.queue);
+
+        clSetKernelArg(context.opencl.tile_clear_kernel, 0, sizeof(cl_mem), &context.opencl.tiles_mem_buffer);
+
+        clSetKernelArg(context.opencl.rasterization_kernel, 9, sizeof(cl_mem), &context.opencl.tiles_mem_buffer);
+
+        // 34: uint32_t tiles_mem_buffer
+        clSetKernelArg(context.opencl.transformation_kernel, 34, sizeof(cl_mem), &context.opencl.tiles_mem_buffer);
+    }
+
     debug(
         "[Mesh Loader] Loaded mesh \"%s\"! %d faces, %d verticies, %d normals, %d uvs", 
         RI_DEBUG_MESH_LOADER_LOADED_MESH,
@@ -366,8 +415,7 @@ RI_mesh *RI_load_mesh(char *filename){
             0, NULL, NULL
         );
 
-    clFinish(context.opencl.queue);
-
+        clFinish(context.opencl.queue);
 
         clSetKernelArg(
             context.opencl.transformation_kernel, 
@@ -439,12 +487,45 @@ void RI_render(RI_scene *scene){
     clock_t start_time, end_time;
     
     start_time = clock();
-    
+  
+    int local_group_size_x = 16;
+    int local_group_size_y = 16;
+
     debug("---FRAME START-------------------------------------------\n", 
         RI_DEBUG_FRAME_START_END_MARKERS
     );
 
-    context.defaults.default_texture = RI_load_image("textures/missing_texture.bmp");
+    int local_c_size_x = (int)fmin(context.opencl.num_h_tiles, local_group_size_x);
+    int local_c_size_y = (int)fmin(context.opencl.num_v_tiles, local_group_size_y);
+
+    const size_t c_global_work_size[2] = {
+            local_c_size_x * ceil(context.opencl.num_h_tiles / (float)local_c_size_x), 
+            local_c_size_y * ceil(context.opencl.num_v_tiles / (float)local_c_size_y)
+        };
+
+    const size_t c_local_work_size[2] = {
+        (int)fmin(context.opencl.num_h_tiles, local_group_size_x), 
+        (int)fmin(context.opencl.num_v_tiles, local_group_size_y)
+    };
+
+    clerror = clEnqueueNDRangeKernel(
+        context.opencl.queue, 
+        context.opencl.tile_clear_kernel, 
+        2, 
+        NULL, 
+        c_global_work_size, 
+        c_local_work_size, 
+        0, NULL, &clevent
+    );
+
+    if (clerror != CL_SUCCESS) 
+        debug("error enqueing tile clear kernel (%d)", 
+            RI_DEBUG_TRANSFORMER_ERROR, 
+            clerror
+        );
+    
+    clFinish(context.opencl.queue);
+
 
     // transformer 
     
@@ -480,10 +561,7 @@ void RI_render(RI_scene *scene){
     clSetKernelArg(context.opencl.transformation_kernel, 27, sizeof(double), &scene->camera.rotation.y);
     // 28, double camera_r_z
     clSetKernelArg(context.opencl.transformation_kernel, 28, sizeof(double), &scene->camera.rotation.z);
-
-
-    int local_group_size_x = 16;
-    int local_group_size_y = 16;
+    
 
     // count faces
     scene->face_count = 0;
@@ -561,14 +639,14 @@ void RI_render(RI_scene *scene){
     for (int actor_index = 0; actor_index < scene->length_of_actors_array; ++actor_index){
         RI_actor *actor = scene->actors[actor_index];
         
+        if (scene->actors[actor_index]->mesh->face_count <= 0 || !scene->actors[actor_index]->active) continue;
+        
         debug("actor index: %d face count: %d", 
             RI_DEBUG_TRANSFORMER_CURRENT_ACTOR, 
             actor_index, 
             actor->mesh->face_count
         );
 
-        if (scene->actors[actor_index]->mesh->face_count <= 0) continue;
-        
         int face_sqrt = ceil(sqrt(scene->actors[actor_index]->mesh->face_count));
 
         int local_t_size = (int)fmin(face_sqrt, local_group_size_x);
@@ -667,7 +745,7 @@ void RI_render(RI_scene *scene){
         );
 
         if (clerror != CL_SUCCESS) 
-            debug("error enqueing kernel (%d)", 
+            debug("error enqueing transformation kernel (%d)", 
                 RI_DEBUG_TRANSFORMER_ERROR, 
                 clerror
             );
@@ -839,7 +917,7 @@ void RI_tick(){
 
 RI_context *RI_get_context(){
     context.sdl = (RI_SDL){NULL, NULL, NULL, NULL, -1};
-    context.opencl = (RI_CL){NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 0, 0, 0, 0, 1, 0, 0};
+    context.opencl = (RI_CL){NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 0, 0, 0, 0, 1, 0, 0, 16, 16, 0, 0, 0};
     context.window = (RI_window){800, 800, 400, 400, "RasterIver Window", RI_ASPECT_MODE_LETTERBOX};
     
     context.debug_flags = RI_DEBUG_ERRORS;
@@ -1007,6 +1085,13 @@ int RI_init(){
         return 1;
     }
 
+    context.opencl.tile_clear_kernel = clCreateKernel(rasterization_program, "clear_tile_array", &clerror);
+
+    if (clerror != CL_SUCCESS){
+        debug("couldn't create tile array clearer kernel", RI_DEBUG_OPENCL_ERROR);
+        return 1;
+    }
+
     // rasterizer
 
     context.opencl.renderable_faces_mem_buffer = clCreateBuffer(
@@ -1055,8 +1140,7 @@ int RI_init(){
     clSetKernelArg(context.opencl.rasterization_kernel, 6, sizeof(int), &context.window.half_height);
     clSetKernelArg(context.opencl.rasterization_kernel, 7, sizeof(int), &context.current_renderable_face_index);
     clSetKernelArg(context.opencl.rasterization_kernel, 8, sizeof(int), &context.current_split_renderable_face_index);
-
-    // transformer
+    clSetKernelArg(context.opencl.rasterization_kernel, 9, sizeof(cl_mem), &context.opencl.tiles_mem_buffer);
 
     // transformer
 
@@ -1152,7 +1236,10 @@ int RI_init(){
     // clSetKernelArg(context.opencl.transformation_kernel, 32, sizeof(uint16_t), &texture_height);
     // // 33: uint32_t texture_index
     // clSetKernelArg(context.opencl.transformation_kernel, 33, sizeof(uint32_t), &texture_index);
-
+    
+    // // 34: uint32_t tiles mem buffer
+    // clSetKernelArg(context.opencl.transformation_kernel, 34, sizeof(cl_mem), &context.opencl.tiles_mem_buffer);
+    
     context.defaults.default_actor = RI_malloc(sizeof(RI_actor));
 
     context.defaults.default_actor->mesh = RI_load_mesh("objects/error_object.obj");
@@ -1161,7 +1248,7 @@ int RI_init(){
     context.defaults.default_actor->position = (RI_vector_3){0, 0, 0};
     context.defaults.default_actor->rotation = (RI_vector_4){1, 0, 0, 0};
     context.defaults.default_actor->scale = (RI_vector_3){1, 1, 1};
-    context.defaults.default_actor->texture = RI_load_image("textures/missing_texture.bmp");
+    context.defaults.default_actor->texture = RI_load_image("textures/missing_texture.bmp");                                                                                                                                                   
 
     return 0;
 }