/*
 * Copyright (c) 2018-2020
 * Jianjia Ma
 * majianjia@live.com
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Change Logs:
 * Date           Author       Notes
 * 2019-07-23     Jianjia Ma   The first version
 */

#include <stdint.h>
#include <string.h>
#include <stdbool.h>

#include "nnom.h"
#include "nnom_local.h"
#include "nnom_layers.h"
#include "layers/nnom_concat.h"

nnom_layer_t *concat_s(const nnom_concat_config_t *config)
{
	nnom_layer_t *layer = Concat(config->axis);
	if (layer)
		layer->config = (void *)config;
	return layer;
}

// concat method
// concat requires more than one input module. aux inputs will be allocated in model.merge()
nnom_layer_t *Concat(int8_t axis)
{
	nnom_concat_layer_t *layer;
	nnom_layer_io_t *in, *out;
	size_t mem_size;

	// apply a block of memory for all the sub handles.
	mem_size = sizeof(nnom_concat_layer_t) + sizeof(nnom_layer_io_t) * 2;
	layer = nnom_mem(mem_size);
	if (layer == NULL)
		return NULL;

	// distribute the memory to the sub handles.
	in = (void *)((uint8_t *)layer + sizeof(nnom_concat_layer_t));
	out = (void *)((uint8_t *)in + sizeof(nnom_layer_io_t));

	// set type in layer parent
	layer->super.type = NNOM_CONCAT;
	layer->super.run = concat_run;
	layer->super.build = concat_build;
	// set buf state
	in->type = NNOM_TENSOR_BUF_TEMP;
	out->type = NNOM_TENSOR_BUF_TEMP;
	// put in & out on the layer.
	layer->super.in = io_init(layer, in);
	layer->super.out = io_init(layer, out);
	// axis
	layer->axis = axis;
	return (nnom_layer_t *)layer;
}

nnom_status_t concat_build(nnom_layer_t *layer)
{
	nnom_concat_layer_t *cl = (nnom_concat_layer_t *)layer;
	nnom_layer_io_t *in;
	uint32_t in_num = 0;
	int32_t num_dim;

	// for each input module, copy the shape from the output of the last layer
	in = layer->in;
	while (in != NULL)
	{
		// get the last layer's output as the input shape
		in->tensor = in->hook.io->tensor;
		in = in->aux;
		in_num++;
	}

	// allocate a new tensor for the output, keep the same number of dimensions
	layer->out->tensor = new_tensor(NNOM_QTYPE_PER_TENSOR, layer->in->tensor->num_dim, tensor_get_num_channel(layer->in->tensor));
	tensor_cpy_attr(layer->out->tensor, layer->in->tensor);

	// convert the axis.
	if (cl->axis < 0)
		cl->axis = (layer->in->tensor->num_dim + cl->axis);
	else if (cl->axis > 0)
		cl->axis = cl->axis - 1; // Keras axis starts from 1; here we use 0, 1, 2 (check?)

	// work out the concatenated axis
	num_dim = layer->in->tensor->num_dim;
	for (uint32_t i = 0; i < num_dim; i++)
	{
		// the concat axis: sum the same axis of all inputs
		if (i == cl->axis)
		{
			layer->out->tensor->dim[i] = 0;
			in = layer->in;
			while (in != NULL)
			{
				layer->out->tensor->dim[i] += in->tensor->dim[i];
				in = in->aux;
			}
			continue;
		}

		// check the others: all other axes must have the same shape
		in = layer->in;
		while (in != NULL && in->aux != NULL)
		{
			if (in->tensor->dim[i] != in->aux->tensor->dim[i])
				return NN_ARGUMENT_ERROR;
			in = in->aux;
		}

		// now set the other axes
		layer->out->tensor->dim[i] = layer->in->tensor->dim[i];
	}

	return NN_SUCCESS;
}

#ifdef NNOM_USING_CHW
// axis index converter between HWC and CHW
static inline int chw_i(int hwc, int num_dim)
{
	num_dim = num_dim - 1;
	hwc = hwc + 1;
	if (hwc > num_dim)
		hwc = 0;
	return hwc;
}
static inline int hwc_i(int chw, int num_dim)
{
	num_dim = num_dim - 1;
	chw = chw - 1;
	if (chw < 0)
		chw = num_dim;
	return chw;
}
#endif
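/* For illustration, assuming a 3-dimensional HWC tensor (num_dim = 3), the
 * converters above map axis indices as:
 *   chw_i(0,3) = 1, chw_i(1,3) = 2, chw_i(2,3) = 0   // H,W,C positions in C,H,W order
 *   hwc_i(0,3) = 2, hwc_i(1,3) = 0, hwc_i(2,3) = 1   // C,H,W positions back to H,W,C order
 * i.e. the channel axis (HWC index 2) becomes index 0 in CHW, and hwc_i() undoes chw_i().
 * concat_run() below uses them to walk the dim[] array (stored in HWC order) in CHW order. */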
nnom_status_t concat_run(nnom_layer_t *layer)
{
	// by default, the concat layer has multiple (>=2) inputs and 1 output.
	nnom_concat_layer_t *cl = (nnom_concat_layer_t *)layer;
	nnom_layer_io_t *in;
	uint32_t dwidth = layer->in->tensor->bitwidth / 8; // data width in bytes

#ifdef NNOM_USING_CHW
	// Concatenate for CHW format
	uint8_t *pin;
	uint8_t *pout = layer->out->tensor->p_data;
	uint32_t block_size;
	uint32_t n_block;
	uint8_t num_dim = layer->in->tensor->num_dim;

	// calculate the number of blocks to concat: the product of the shapes before the concat axis
	n_block = 1;
	for (int i = 0; i < chw_i(cl->axis, num_dim); i++)
	{
		n_block *= layer->in->tensor->dim[hwc_i(i, num_dim)];
	}

	// concat all input layers
	for (int i = 0; i < n_block; i++)
	{
		in = layer->in;
		while (in != NULL)
		{
			// the block size of concat data in this layer
			block_size = dwidth;
			for (int j = num_dim - 1; j >= chw_i(cl->axis, num_dim); j--)
				block_size *= in->tensor->dim[hwc_i(j, num_dim)];
			// concat
			pin = (uint8_t *)in->tensor->p_data + i * block_size;
			nnom_memcpy(pout, pin, block_size);
			pout += block_size;
			in = in->aux;
		}
	}

#else // end of CHW concat

	// Concatenate for HWC
	uint8_t *pin;
	uint8_t *pout = layer->out->tensor->p_data;
	uint32_t block_size;
	uint32_t n_block;
	uint8_t num_dim = layer->in->tensor->num_dim;

	// calculate the number of blocks to concat (the product of the shapes before the concat axis)
	n_block = 1;
	for (int i = 0; i < cl->axis; i++)
		n_block *= layer->in->tensor->dim[i];

	// concat all input layers
	for (int i = 0; i < n_block; i++)
	{
		in = layer->in;
		while (in != NULL)
		{
			// the block size of concat data in this layer
			block_size = dwidth;
			for (int j = cl->axis; j < num_dim; j++)
				block_size *= in->tensor->dim[j];
			// concat
			pin = (uint8_t *)in->tensor->p_data + i * block_size;
			nnom_memcpy(pout, pin, block_size);
			pout += block_size;
			in = in->aux;
		}
	}
#endif
	return NN_SUCCESS;
}
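/* Worked example of the copy pattern above (HWC build, hypothetical shapes):
 * concatenating two inputs of shape (4, 4, 8) and (4, 4, 16) along the channel
 * axis (cl->axis = 2, dwidth = 1 for 8-bit data) gives n_block = 4 * 4 = 16, and
 * for each of the 16 (h, w) positions the loop copies an 8-byte block from
 * input 1 followed by a 16-byte block from input 2, yielding an output of
 * shape (4, 4, 24).
 *
 * Usage sketch when assembling a model (branch1/branch2 are hypothetical layer
 * handles created earlier in the graph; the merge helper referenced in the
 * comments above attaches the extra aux inputs):
 *   x = model.merge(Concat(-1), branch1, branch2); // concat along the last (channel) axis
 */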