#include "CudaVersionCheck.h"
#include "LpvStructs.h"
#include "CommonStructs.h"
#include "LpvUtils.h"

#include "cutil.h"
#include <cstdio>
#include "../Defines.h"
#include "cutil_math.h"
#include <math_constants.h>

/**
 * Injects one reflective-shadow-map (RSM) sample per thread into the geometry volume.
 *
 * Each thread derives an RSM texel coordinate from its 3D thread index scaled by
 * rsm.stride, reads the normal and depth for that texel, reconstructs the sample
 * position, maps it into volume cell coordinates and atomically accumulates a
 * clamped-cosine SH lobe (oriented along the normal) into that cell.
 *
 * Launch layout: a 3D launch is expected; x/y/z thread indices select the sample.
 * Threads whose scaled texel coordinate falls outside rsm.size do nothing.
 * The textures rsm_normal and rsm_depth must be bound before launching.
 *
 * @param target       volume of SH coefficients, indexed via makeVolumeIndex(sz, cell); accumulated atomically
 * @param sz           volume dimensions in cells
 * @param rsm          RSM description (size, sampleSize, stride)
 * @param ix           per-cell byte buffer; the cell that receives a sample is set to 0
 * @param transform    transform applied to the light-space position to obtain world space
 * @param boundingBox  world-space bounds of the volume; dereferenced on the device
 * @param factor       scale applied to every injected contribution
 */
__global__ void deviceInjectGeometry(float4* target, int3 sz, RSMComponents rsm, char* ix, Transform transform, BoundingBox* boundingBox, float factor) {
	int idx = blockIdx.x * blockDim.x + threadIdx.x;
	int idy = blockIdx.y * blockDim.y + threadIdx.y;
	int idz = blockIdx.z * blockDim.z + threadIdx.z;
	
	// unscaledId addresses the sample; id addresses the RSM texel that sample reads.
	int3 unscaledId = make_int3(idx, idy, idz);
	int3 id = unscaledId * rsm.stride;
	
	// Threads mapped past the end of the RSM have nothing to inject.
	if (id.x >= rsm.size.x || id.y >= rsm.size.y || id.z >= rsm.size.z) {
		return;
	}
	
	// Extract and decode normal from RSM (stored value * 2 - 1, then renormalised)
	float3 normal = normalize(make_float3(tex3D(rsm_normal, id.x, id.y, id.z)) * 2.0f - 1.0f);
	
	// Extract and decode depth from RSM into a position
	float3 texCoord = make_float3(id) / make_float3(rsm.size - 1);
	float4 posLightSpace = rsmDepthExtractLightSpacePosition(rsm_depth, rsm.size, texCoord);
	
	// Transform texel position into world space
	float4 posWorldSpace = transform4x4(posLightSpace, transform);
	
	// Transform world space position into volume space (units of cells)
	float3 gridSize = (boundingBox->max - boundingBox->min) / make_float3(sz);
	float3 pos = (make_float3(posWorldSpace)/posWorldSpace.w - boundingBox->min) / gridSize;
	
	float4 sh = constructSHClampedCosineLobeAroundDirection(normal);
	
	// For the geometry volume we want to offset the position by half a grid position.
	float3 shifted = pos - 0.5f * normal/* + make_float3(0.5f)*/;
	
	// Floor rather than truncate: a float-to-int cast rounds toward zero, which would
	// map coordinates in (-1, 0) onto cell 0 and let samples lying just outside the
	// lower faces of the volume be injected into its boundary cells.
	int3 gi = make_int3((int)floorf(shifted.x), (int)floorf(shifted.y), (int)floorf(shifted.z));
	
	if(isInside(sz, gi)) {
		
		float w = calculatePixelWeight(unscaledId, rsm.sampleSize);
		// Several samples may land in the same cell, hence the atomic accumulation.
		componentwiseAtomicFloat4Add(target[makeVolumeIndex(sz, gi)], w * factor * sh);
		// NOTE(review): the meaning of 0 in ix is defined by whoever consumes this buffer — confirm.
		ix[makeVolumeIndex(sz, gi)] = 0;
	}
}

/**
 * Host entry point for geometry injection: binds the RSM textures, launches
 * deviceInjectGeometry and unbinds the textures again.
 *
 * Preconditions visible from this function:
 *  - rsmSampleSize is used as a divisor and must be non-zero.
 *  - The grid is sized with truncating division, so rsmSampleSize should be a
 *    multiple of BLOCKSIZE (and at least BLOCKSIZE); otherwise trailing samples
 *    are not covered or the grid is empty.
 *  - Each block has BLOCKSIZE * BLOCKSIZE * 6 threads, which must not exceed the
 *    device's per-block thread limit. A violation is now reported by the launch
 *    check instead of silently skipping the injection.
 *
 * @param target         geometry volume, passed to the kernel as float4*
 * @param gvSize         volume size, expanded with make_int3(gvSize)
 * @param rsms           three cudaArray handles bound to rsm_depth, rsm_color and rsm_normal, in that order
 * @param rsmSize        RSM width and height in texels; the third extent is fixed to 6
 * @param rsmSampleSize  number of samples taken per axis in x and y
 * @param ix             per-cell byte buffer written by the kernel
 * @param transform      pointer to a Transform; dereferenced here on the host and passed by value
 * @param boundingBox    pointer to a BoundingBox; dereferenced by the kernel on the device
 * @param factor         scale applied to every injected contribution
 */
void cudaInjectGeometry(void* target, int gvSize, void* rsms[3], int rsmSize, int rsmSampleSize, void* ix, void* transform, void* boundingBox, float factor) {
	
	cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();

	CudaSafeCall(cudaBindTextureToArray(rsm_depth, reinterpret_cast<cudaArray*>(rsms[0]), desc));
	CudaSafeCall(cudaBindTextureToArray(rsm_color, reinterpret_cast<cudaArray*>(rsms[1]), desc));
	CudaSafeCall(cudaBindTextureToArray(rsm_normal, reinterpret_cast<cudaArray*>(rsms[2]), desc));
	
	RSMComponents rsm;
	rsm.size = make_int3(rsmSize, rsmSize, 6);
	rsm.sampleSize = make_int3(rsmSampleSize, rsmSampleSize, 6);
	rsm.stride = make_int3(rsmSize / rsmSampleSize, rsmSize / rsmSampleSize, 1);
	
	// All 6 z-slices are handled inside one block; x/y are tiled across the grid.
	dim3 dimBlock(BLOCKSIZE, BLOCKSIZE, rsm.size.z);
	dim3 dimGrid(rsmSampleSize/BLOCKSIZE, rsmSampleSize/BLOCKSIZE, 1);
	
	deviceInjectGeometry<<<dimGrid, dimBlock>>>((float4*)target, make_int3(gvSize), rsm, (char*)ix, *(Transform*)transform, (BoundingBox*)boundingBox, factor);
	
	// A kernel launch returns no status itself; configuration errors (oversized
	// block, empty grid) only become visible through cudaGetLastError().
	CudaSafeCall(cudaGetLastError());
	
	// Unbinding is checked the same way as binding so a failure is not lost.
	CudaSafeCall(cudaUnbindTexture(rsm_depth));
	CudaSafeCall(cudaUnbindTexture(rsm_color));
	CudaSafeCall(cudaUnbindTexture(rsm_normal));
}
