// Job_SignsPads/STM32/Code/STM32F405/nnom_src/backends/nnom_local.c

/*
* Copyright (c) 2018-2020
* Jianjia Ma
* majianjia@live.com
*
* SPDX-License-Identifier: Apache-2.0
*
* Notice:
* Code in this file includes derivative works from CMSIS
* Please check the LICENSE file for details.
*
* Change Logs:
* Date Author Notes
* 2019-02-05 Jianjia Ma The first version
* 2019-03-19 Jianjia Ma Local C implementation partly from CMSIS-NN
* 2019-06-19 Jianjia Ma Implement CHW functions
*/
#include "nnom.h"
#include "nnom_local.h"
// modified from CMSIS-NN test_ref
void local_avepool_q7_HWC(const q7_t *Im_in, // input image
const uint16_t dim_im_in_x, // input image dimension x or W
const uint16_t dim_im_in_y, // input image dimension y or H
const uint16_t ch_im_in, // number of input image channels
const uint16_t dim_kernel_x, // window kernel size
const uint16_t dim_kernel_y, // window kernel size
const uint16_t padding_x, // padding sizes
const uint16_t padding_y, // padding sizes
const uint16_t stride_x, // stride
const uint16_t stride_y, // stride
const uint16_t dim_im_out_x, // output image dimension x or W
const uint16_t dim_im_out_y, // output image dimension y or H
const uint16_t output_shift, // output right shift
q7_t *bufferA, // a buffer for local storage, NULL by now
q7_t *Im_out)
{
int16_t i_ch_in, i_x, i_y;
int16_t k_x, k_y;
for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
{
for (i_y = 0; i_y < dim_im_out_y; i_y++)
{
for (i_x = 0; i_x < dim_im_out_x; i_x++)
{
int sum = 0;
int count = 0;
for (k_y = i_y * stride_y - padding_y; k_y < i_y * stride_y - padding_y + dim_kernel_y; k_y++)
{
for (k_x = i_x * stride_x - padding_x; k_x < i_x * stride_x - padding_x + dim_kernel_x; k_x++)
{
if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in_y && k_x < dim_im_in_x)
{
sum += Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in_x)];
count++;
}
}
}
Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out_x)] = sum / (count>>output_shift);
}
}
}
}
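// Usage sketch (kept out of the build): average pooling of a hypothetical 4x4,
// single-channel HWC input with a 2x2 window, stride 2, no padding and no output
// shift. The names `in` and `out` below are illustrative only.
#if 0
static void example_avepool_q7_hwc(void)
{
    q7_t in[4 * 4 * 1] = {
         1,  2,  3,  4,
         5,  6,  7,  8,
         9, 10, 11, 12,
        13, 14, 15, 16};
    q7_t out[2 * 2 * 1];
    local_avepool_q7_HWC(in, 4, 4, 1, 2, 2, 0, 0, 2, 2, 2, 2, 0, NULL, out);
    // each output is the truncated mean of one 2x2 window: out = {3, 5, 11, 13}
}
#endif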
void local_avepool_q7_CHW(const q7_t *Im_in, // input image
const uint16_t dim_im_in_x, // input image dimension x or W
const uint16_t dim_im_in_y, // input image dimension y or H
const uint16_t ch_im_in, // number of input image channels
const uint16_t dim_kernel_x, // window kernel size
const uint16_t dim_kernel_y, // window kernel size
const uint16_t padding_x, // padding sizes
const uint16_t padding_y, // padding sizes
const uint16_t stride_x, // stride
const uint16_t stride_y, // stride
const uint16_t dim_im_out_x, // output image dimension x or W
const uint16_t dim_im_out_y, // output image dimension y or H
const uint16_t output_shift, // output right shift
q7_t *bufferA, // a buffer for local storage, NULL by now
q7_t *Im_out)
{
int16_t i_ch_in, i_x, i_y;
int16_t k_x, k_y;
int32_t ch_offset;
for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
{
ch_offset = i_ch_in*dim_im_in_x*dim_im_in_y;
for (i_y = 0; i_y < dim_im_out_y; i_y++)
{
for (i_x = 0; i_x < dim_im_out_x; i_x++)
{
int sum = 0;
int count = 0;
for (k_y = i_y * stride_y - padding_y; k_y < i_y * stride_y - padding_y + dim_kernel_y; k_y++)
{
for (k_x = i_x * stride_x - padding_x; k_x < i_x * stride_x - padding_x + dim_kernel_x; k_x++)
{
if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in_y && k_x < dim_im_in_x)
{
sum += Im_in[ch_offset + (k_x + k_y * dim_im_in_x)];
count++;
}
}
}
Im_out[i_ch_in*dim_im_out_x*dim_im_out_y + (i_x + i_y * dim_im_out_x)] = sum / (count>>output_shift);
}
}
}
}
// modified from CMSIS-NN test_ref
void local_maxpool_q7_HWC(const q7_t *Im_in, // input image
const uint16_t dim_im_in_x, // input image dimension x or W
const uint16_t dim_im_in_y, // input image dimension y or H
const uint16_t ch_im_in, // number of input image channels
const uint16_t dim_kernel_x, // window kernel size
const uint16_t dim_kernel_y, // window kernel size
const uint16_t padding_x, // padding sizes
const uint16_t padding_y, // padding sizes
const uint16_t stride_x, // stride
const uint16_t stride_y, // stride
const uint16_t dim_im_out_x, // output image dimension x or W
const uint16_t dim_im_out_y, // output image dimension y or H
q7_t *bufferA, // a buffer for local storage, NULL by now
q7_t *Im_out)
{
int16_t i_ch_in, i_x, i_y;
int16_t k_x, k_y;
for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
{
for (i_y = 0; i_y < dim_im_out_y; i_y++)
{
for (i_x = 0; i_x < dim_im_out_x; i_x++)
{
int max = -129;
for (k_y = i_y * stride_y - padding_y; k_y < i_y * stride_y - padding_y + dim_kernel_y; k_y++)
{
for (k_x = i_x * stride_x - padding_x; k_x < i_x * stride_x - padding_x + dim_kernel_x; k_x++)
{
if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in_y && k_x < dim_im_in_x)
{
if (Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in_x)] > max)
{
max = Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in_x)];
}
}
}
}
Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out_x)] = max;
}
}
}
}
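// Usage sketch (kept out of the build): max pooling of the same hypothetical 4x4,
// single-channel HWC input with a 2x2 window and stride 2.
#if 0
static void example_maxpool_q7_hwc(void)
{
    q7_t in[4 * 4 * 1] = {
         1,  2,  3,  4,
         5,  6,  7,  8,
         9, 10, 11, 12,
        13, 14, 15, 16};
    q7_t out[2 * 2 * 1];
    local_maxpool_q7_HWC(in, 4, 4, 1, 2, 2, 0, 0, 2, 2, 2, 2, NULL, out);
    // out = {6, 8, 14, 16}, the maximum of each 2x2 window
}
#endif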
void local_maxpool_q7_CHW(const q7_t *Im_in, // input image
const uint16_t dim_im_in_x, // input image dimension x or W
const uint16_t dim_im_in_y, // input image dimension y or H
const uint16_t ch_im_in, // number of input image channels
const uint16_t dim_kernel_x, // window kernel size
const uint16_t dim_kernel_y, // window kernel size
const uint16_t padding_x, // padding sizes
const uint16_t padding_y, // padding sizes
const uint16_t stride_x, // stride
const uint16_t stride_y, // stride
const uint16_t dim_im_out_x, // output image dimension x or W
const uint16_t dim_im_out_y, // output image dimension y or H
q7_t *bufferA, // a buffer for local storage, NULL by now
q7_t *Im_out)
{
int16_t i_ch_in, i_x, i_y;
int16_t k_x, k_y;
int32_t ch_offset;
for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
{
ch_offset = i_ch_in * dim_im_out_x * dim_im_out_y;
for (i_y = 0; i_y < dim_im_out_y; i_y++)
{
for (i_x = 0; i_x < dim_im_out_x; i_x++)
{
int max = -129;
for (k_y = i_y * stride_y - padding_y; k_y < i_y * stride_y - padding_y + dim_kernel_y; k_y++)
{
for (k_x = i_x * stride_x - padding_x; k_x < i_x * stride_x - padding_x + dim_kernel_x; k_x++)
{
if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in_y && k_x < dim_im_in_x)
{
if (Im_in[i_ch_in * dim_im_in_x * dim_im_in_y + (k_x + k_y * dim_im_in_x)] > max)
{
max = Im_in[i_ch_in * dim_im_in_x * dim_im_in_y + (k_x + k_y * dim_im_in_x)];
}
}
}
}
Im_out[ch_offset+(i_x + i_y * dim_im_out_x)] = max;
}
}
}
}
// temporary for the thesis
// shift according to the maximum
void local_sumpool_q7_HWC(const q7_t *Im_in, // input image
const uint16_t dim_im_in_x, // input image dimension x or W
const uint16_t dim_im_in_y, // input image dimension y or H
const uint16_t ch_im_in, // number of input image channels
const uint16_t dim_kernel_x, // window kernel size
const uint16_t dim_kernel_y, // window kernel size
const uint16_t padding_x, // padding sizes
const uint16_t padding_y, // padding sizes
const uint16_t stride_x, // stride
const uint16_t stride_y, // stride
const uint16_t dim_im_out_x, // output image dimension x or W
const uint16_t dim_im_out_y, // output image dimension y or H
q7_t *bufferA, // a buffer for local storage, size = 4*output_size
q7_t *Im_out)
{
int16_t i_ch_in, i_x, i_y;
int16_t k_x, k_y;
int32_t *buf = (int32_t *)bufferA;
// stage2
// int32_t max_abs = 0;
// int32_t output_shift;
// size_t output_size = dim_im_out_x * dim_im_out_x * ch_im_in;
// save in 32bit
for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
{
for (i_y = 0; i_y < dim_im_out_y; i_y++)
{
for (i_x = 0; i_x < dim_im_out_x; i_x++)
{
int sum = 0;
for (k_y = i_y * stride_y - padding_y; k_y < i_y * stride_y - padding_y + dim_kernel_y; k_y++)
{
for (k_x = i_x * stride_x - padding_x; k_x < i_x * stride_x - padding_x + dim_kernel_x; k_x++)
{
if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in_y && k_x < dim_im_in_x)
{
sum += Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in_x)];
}
}
}
// 32bit
buf[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out_x)] = sum;
}
}
}
// // find max amount results
// for (int i = 0; i < output_size; i++)
// {
// int32_t val = buf[i];
// if (val < 0)
// val = -val;
// if (val > max_abs)
// max_abs = val;
// }
// // find best shift to cover the max
// for (output_shift = 0;; output_shift++)
// {
// if (127 * (1 + output_shift) >= max_abs)
// break;
// }
// // shift the results
// for (int i = 0; i < output_size; i++)
// {
// Im_out[i] = buf[i] >> output_shift;
// }
//return output_shift;
}
// temporary for the thesis
// shift according to the maximum
void local_sumpool_q7_CHW(const q7_t *Im_in, // input image
const uint16_t dim_im_in_x, // input image dimension x or W
const uint16_t dim_im_in_y, // input image dimension y or H
const uint16_t ch_im_in, // number of input image channels
const uint16_t dim_kernel_x, // window kernel size
const uint16_t dim_kernel_y, // window kernel size
const uint16_t padding_x, // padding sizes
const uint16_t padding_y, // padding sizes
const uint16_t stride_x, // stride
const uint16_t stride_y, // stride
const uint16_t dim_im_out_x, // output image dimension x or W
const uint16_t dim_im_out_y, // output image dimension y or H
q7_t *bufferA, // a buffer for local storage, size = 4*output_size
q7_t *Im_out)
{
int16_t i_ch_in, i_x, i_y;
int16_t k_x, k_y;
int32_t *buf = (int32_t *)bufferA;
int32_t i_ch_offset, o_ch_offset;
// stage2
// int32_t max_abs = 0;
// int32_t output_shift;
// size_t output_size = dim_im_out_x * dim_im_out_x * ch_im_in;
// save in 32bit
for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
{
i_ch_offset = i_ch_in*dim_im_in_x*dim_im_in_y;
o_ch_offset = i_ch_in*dim_im_out_x*dim_im_out_y;
for (i_y = 0; i_y < dim_im_out_y; i_y++)
{
for (i_x = 0; i_x < dim_im_out_x; i_x++)
{
int sum = 0;
for (k_y = i_y * stride_y - padding_y; k_y < i_y * stride_y - padding_y + dim_kernel_y; k_y++)
{
for (k_x = i_x * stride_x - padding_x; k_x < i_x * stride_x - padding_x + dim_kernel_x; k_x++)
{
if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in_y && k_x < dim_im_in_x)
{
sum += Im_in[i_ch_offset + (k_x + k_y * dim_im_in_x)];
}
}
}
// 32bit
buf[o_ch_offset + (i_x + i_y * dim_im_out_x)] = sum;
}
}
}
// // find max amount results
// for (int i = 0; i < output_size; i++)
// {
// int32_t val = buf[i];
// if (val < 0)
// val = -val;
// if (val > max_abs)
// max_abs = val;
// }
// // find best shift to cover the max
// for (output_shift = 0;; output_shift++)
// {
// if (127 * (1 + output_shift) >= max_abs)
// break;
// }
// // shift the results
// for (int i = 0; i < output_size; i++)
// {
// Im_out[i] = buf[i] >> output_shift;
// }
//return output_shift;
}
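// Buffer-sizing sketch (kept out of the build) for the sum-pooling functions above:
// bufferA must provide one 32-bit accumulator per output element (4 * output_size
// bytes). With the q7 rescaling stage commented out, the results stay in bufferA as
// int32_t sums and Im_out is left untouched. Names and shapes are illustrative only.
#if 0
static void example_sumpool_q7_hwc(void)
{
    q7_t in[4 * 4 * 1] = {0};
    int32_t sums[2 * 2 * 1];              // one accumulator per output element
    q7_t out[2 * 2 * 1];                  // unused while the rescale stage is disabled
    local_sumpool_q7_HWC(in, 4, 4, 1, 2, 2, 0, 0, 2, 2, 2, 2, (q7_t *)sums, out);
    int32_t first_window_sum = sums[0];   // sum of the top-left 2x2 window
    (void)first_window_sum;
    (void)out;
}
#endif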
// customised up sample pooling
void local_up_sampling_q7_HWC(const q7_t *Im_in, // input image
const uint16_t dim_im_in_x, // input image dimension x or W
const uint16_t dim_im_in_y, // input image dimension y or H
const uint16_t ch_im_in, // number of input image channels
const uint16_t dim_kernel_x, // window kernel size
const uint16_t dim_kernel_y, // window kernel size
const uint16_t dim_im_out_x, // output image dimension x or W
const uint16_t dim_im_out_y, // output image dimension y or H
q7_t *bufferA, // a buffer for local storage, NULL by now
q7_t *Im_out)
{
int16_t i_x, i_y;
// for loop for each pixel in input image.
for (i_y = 0; i_y < dim_im_in_y; i_y++)
{
for (i_x = 0; i_x < dim_im_in_x; i_x++)
{
// copy all the channels together.
const q7_t *p_in = Im_in + (i_y * dim_im_in_x + i_x ) * ch_im_in;
q7_t *pout = Im_out + (i_y * dim_im_in_x * dim_kernel_x * dim_kernel_y + i_x * dim_kernel_y) * ch_im_in;
// copy along x axis
for(int i = 0; i<dim_kernel_x; i++)
nnom_memcpy(pout + i * ch_im_in, p_in, ch_im_in);
// duplicate the copied x data into y axis.
for(int i = 1; i<dim_kernel_y; i++)
nnom_memcpy(pout + i * ch_im_in * dim_im_in_x * dim_kernel_x, pout, ch_im_in * dim_kernel_x);
}
}
}
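// Usage sketch (kept out of the build): nearest-neighbour up-sampling of a hypothetical
// 2x2 single-channel HWC input by a factor of 2 in each direction; every input pixel is
// replicated into a 2x2 block of the output.
#if 0
static void example_up_sampling_q7_hwc(void)
{
    q7_t in[2 * 2 * 1] = {1, 2, 3, 4};
    q7_t out[4 * 4 * 1];
    local_up_sampling_q7_HWC(in, 2, 2, 1, 2, 2, 4, 4, NULL, out);
    // out rows: {1,1,2,2}, {1,1,2,2}, {3,3,4,4}, {3,3,4,4}
}
#endif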
void local_up_sampling_q7_CHW(const q7_t *Im_in, // input image
const uint16_t dim_im_in_x, // input image dimension x or W
const uint16_t dim_im_in_y, // input image dimension y or H
const uint16_t ch_im_in, // number of input image channels
const uint16_t dim_kernel_x, // window kernel size
const uint16_t dim_kernel_y, // window kernel size
const uint16_t dim_im_out_x, // output image dimension x or W
const uint16_t dim_im_out_y, // output image dimension y or H
q7_t *bufferA, // a buffer for local storage, NULL by now
q7_t *Im_out)
{
int16_t i_x, i_y, ch;
// for loop for channel
for(ch=0; ch<ch_im_in; ch++)
{
// for loop for each pixel in input image.
for (i_y = 0; i_y < dim_im_in_y; i_y++)
{
for (i_x = 0; i_x < dim_im_in_x; i_x++)
{
const q7_t *p_in = Im_in + ch * dim_im_in_x * dim_im_in_y + (i_y * dim_im_in_x + i_x);
q7_t *pout = Im_out + ch * dim_im_out_x * dim_im_out_y + (i_y * dim_im_in_x * dim_kernel_x * dim_kernel_y + i_x * dim_kernel_y);
// copy along x axis
for(int i = 0; i<dim_kernel_x; i++)
*(pout + i) = *p_in;
// duplicate the copied x data into y axis.
for(int i = 1; i<dim_kernel_y; i++)
nnom_memcpy(pout + i * dim_im_in_x * dim_kernel_x, pout, dim_kernel_x);
}
}
}
}
void local_convolve_HWC_q7_nonsquare(const q7_t *Im_in, // input image
const uint16_t dim_im_in_x, // input image dimension x
const uint16_t dim_im_in_y, // input image dimension y
const uint16_t ch_im_in, // number of input image channels
const q7_t *wt, // kernel weights
const uint16_t ch_im_out, // number of filters, i.e., output image channels
const uint16_t dim_kernel_x, // filter kernel size x
const uint16_t dim_kernel_y, // filter kernel size y
const uint16_t padding_x, // padding sizes x
const uint16_t padding_y, // padding sizes y
const uint16_t stride_x, // stride x
const uint16_t stride_y, // stride y
const uint16_t dilation_x, // dilation x
const uint16_t dilation_y, // dilation y
const q7_t *bias, // bias
const nnom_qformat_param_t *bias_shift, // bias shifts
const nnom_qformat_param_t *out_shift, // output shift
const nnom_qtype_t q_type, // per channel or per tensor
q7_t *Im_out, // output image
const uint16_t dim_im_out_x, // output image dimension x
const uint16_t dim_im_out_y, // output image dimension y
q15_t *bufferA, //buffer space for input
q7_t *bufferB //buffer space for output
)
{
int i, j, k, l, m, n;
int conv_out;
int in_row, in_col;
int in_pix_loc, wt_loc;
int shift_idx, shift_steps;
if(q_type == NNOM_QTYPE_PER_AXIS)
shift_steps = 1;
else
shift_steps = 0;
for (i = 0, shift_idx = 0; i < ch_im_out; i++, shift_idx += shift_steps)
{
for (j = 0; j < dim_im_out_y; j++)
{
int32_t base_idx_y = stride_y * j - padding_y;
for (k = 0; k < dim_im_out_x; k++)
{
int32_t base_idx_x = stride_x * k - padding_x;
int32_t ker_y_start = MAX(0, -(base_idx_y-(dilation_y-1))/dilation_y);
int32_t ker_x_start = MAX(0, -(base_idx_x-(dilation_x-1))/dilation_x);
int32_t ker_y_end = MIN(dim_kernel_y, (dim_im_in_y - base_idx_y + (dilation_y-1))/dilation_y);
int32_t ker_x_end = MIN(dim_kernel_x, (dim_im_in_x - base_idx_x + (dilation_x-1))/dilation_x);
if(bias)
conv_out = ((q31_t)(bias[i]) << bias_shift[shift_idx]) + NNOM_ROUND(out_shift[shift_idx]);
else
conv_out = (q31_t) NNOM_ROUND(out_shift[shift_idx]);
for (m = ker_y_start; m < ker_y_end; m++)
{
for (n = ker_x_start; n < ker_x_end; n++)
{
in_row = stride_y * j + m * dilation_y - padding_y;
in_col = stride_x * k + n * dilation_x - padding_x;
// pre-calculate the pixel location and weight location to improve the performance.
in_pix_loc = (in_row * dim_im_in_x + in_col) * ch_im_in;
wt_loc = i * ch_im_in * dim_kernel_y * dim_kernel_x + (m * dim_kernel_x + n) * ch_im_in;
for (l = 0; l < ch_im_in; l++)
{
conv_out += Im_in[in_pix_loc + l] * wt[wt_loc + l];
}
}
}
Im_out[i + (j * dim_im_out_x + k) * ch_im_out] = (q7_t)__NNOM_SSAT((conv_out >> out_shift[shift_idx]), 8);
}
}
}
}
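// Usage sketch (kept out of the build): a 1x1 convolution over a hypothetical 2x2
// single-channel Q0.7 image with one Q0.7 filter of value 0.5 (64), per-tensor
// quantisation (NNOM_QTYPE_PER_TENSOR from nnom.h is assumed), no bias, and
// out_shift = in_dec + weight_dec - out_dec = 7.
#if 0
static void example_convolve_hwc_q7(void)
{
    q7_t in[2 * 2 * 1] = {20, 40, 60, 80};
    const q7_t wt[1] = {64};                              // 0.5 in Q0.7
    const nnom_qformat_param_t bias_shift[1] = {0};       // unused with a NULL bias
    const nnom_qformat_param_t out_shift[1] = {7};
    q7_t out[2 * 2 * 1];
    local_convolve_HWC_q7_nonsquare(in, 2, 2, 1, wt, 1, 1, 1, 0, 0, 1, 1, 1, 1,
                                    NULL, bias_shift, out_shift, NNOM_QTYPE_PER_TENSOR,
                                    out, 2, 2, NULL, NULL);
    // out is roughly in/2: {10, 20, 30, 40}
}
#endif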
void local_convolve_CHW_q7_nonsquare(const q7_t *Im_in, // input image
const uint16_t dim_im_in_x, // input image dimension x
const uint16_t dim_im_in_y, // input image dimension y
const uint16_t ch_im_in, // number of input image channels
const q7_t *wt, // kernel weights
const uint16_t ch_im_out, // number of filters, i.e., output image channels
const uint16_t dim_kernel_x, // filter kernel size x
const uint16_t dim_kernel_y, // filter kernel size y
const uint16_t padding_x, // padding sizes x
const uint16_t padding_y, // padding sizes y
const uint16_t stride_x, // stride x
const uint16_t stride_y, // stride y
const uint16_t dilation_x, // dilation x
const uint16_t dilation_y, // dilation y
const q7_t *bias, // bias
const nnom_qformat_param_t *bias_shift, // bias shifts
const nnom_qformat_param_t *out_shift, // output shift
const nnom_qtype_t q_type, // per channel or per tensor
q7_t *Im_out, // output image
const uint16_t dim_im_out_x, // output image dimension x
const uint16_t dim_im_out_y, // output image dimension y
q15_t *bufferA, //buffer space for input
q7_t *bufferB //buffer space for output
)
{
int i, j, k, l, m, n;
long conv_out;
int in_row, in_col;
int shift_idx, shift_steps;
if(q_type == NNOM_QTYPE_PER_AXIS)
shift_steps = 1;
else
shift_steps = 0;
for(i = 0, shift_idx = 0; i < ch_im_out; i++, shift_idx += shift_steps)
{
for (j = 0; j < dim_im_out_y; j++)
{
for (k = 0; k < dim_im_out_x; k++)
{
if(bias)
conv_out = ((q31_t)(bias[i]) << bias_shift[shift_idx]) + NNOM_ROUND(out_shift[shift_idx]);
else
conv_out = (q31_t) NNOM_ROUND(out_shift[shift_idx]);
for (m = 0; m < dim_kernel_y; m++)
{
for (n = 0; n < dim_kernel_x; n++)
{
// if-for implementation
in_row = stride_y * j + m * dilation_y - padding_y;
in_col = stride_x * k + n * dilation_x - padding_x;
if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y && in_col < dim_im_in_x)
{
for (l = 0; l < ch_im_in; l++)
{
conv_out += Im_in[(in_row * dim_im_in_x + in_col) + l * dim_im_in_x * dim_im_in_y] *
wt[(m * dim_kernel_x + n) * ch_im_in * ch_im_out + l * ch_im_out + i];
}
}
}
}
Im_out[i * dim_im_out_x * dim_im_out_y + (j * dim_im_out_x + k)] = (q7_t)__NNOM_SSAT((conv_out >> out_shift[shift_idx]), 8);
}
}
}
}
#define FALSE 0
#define TRUE 1
static int alg_deconv2d_calculate_position(
int pos,
int stride,
int padding,
int dim_kernel,
int dim_in,
int* in_start,
int* kernel_start,
int* kernel_end)
{
int is_zero = FALSE;
int of, adj;
is_zero = FALSE;
*in_start = pos/stride;
of = pos%stride;
*kernel_start = padding - of;
if(*kernel_start >= 0) {
adj = MIN(*in_start, *kernel_start/stride);
*kernel_start -= adj*stride;
*in_start -= adj;
} else {
adj = -*kernel_start + dim_kernel;
if(adj<=stride) {
is_zero = TRUE;
} else {
adj = MIN(dim_in-1-*in_start, adj/stride);
*kernel_start += adj*stride;
*in_start += adj;
}
}
of = dim_kernel - 1 - *kernel_start;
adj = MIN(dim_in-1-*in_start, of/stride);
*kernel_end = *kernel_start + adj*stride;
return is_zero;
}
void local_conv_trans_HWC_q7_nonsquare(const int8_t * Im_in,
const uint16_t dim_im_in_x, // input image dimension x
const uint16_t dim_im_in_y, // input image dimension y
const uint16_t ch_im_in, // number of input image channels
const q7_t *wt, // kernel weights
const uint16_t ch_im_out, // number of filters, i.e., output image channels
const uint16_t dim_kernel_x, // filter kernel size x
const uint16_t dim_kernel_y, // filter kernel size y
const uint16_t padding_x, // padding sizes x
const uint16_t padding_y, // padding sizes y
const uint16_t stride_x, // stride x
const uint16_t stride_y, // stride y
const uint16_t dilation_x, // dilation x
const uint16_t dilation_y, // dilation y
const q7_t *bias, // bias
const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, // output image
const uint16_t dim_im_out_x, // output image dimension x
const uint16_t dim_im_out_y, // output image dimension y
q15_t *bufferA, //buffer space for input
q7_t *bufferB //buffer space for output
)
// {
// int ox, oy, oc, ky, kx, kc, ix, iy;
// int conv_out;
// int in_pix_loc, wt_loc;
// (void)dilation_y;
// (void)dilation_x;
// // padding and stride are applied to output
// for (oc = 0; oc < ch_im_out; oc++)
// {
// for (oy = 0; oy < dim_im_out_y; oy++)
// {
// for (ox = 0; ox < dim_im_out_x; ox++)
// {
// conv_out = ((q31_t)(bias[oc]) << bias_shift) + NNOM_ROUND(out_shift);
// for (ky = 0; ky < dim_kernel_y; ky++)
// {
// for (kx = 0; kx < dim_kernel_x; kx++)
// {
// // input y, input x location
// iy = oy / stride_y + ky - padding_y;
// ix = ox / stride_x + kx - padding_x;
// if(ix >= 0 && iy >= 0 && ix < dim_im_in_y && iy< dim_im_in_y)
// {
// in_pix_loc = (iy * dim_im_in_x + ix) * ch_im_in;
// wt_loc = oc * ch_im_in * dim_kernel_y * dim_kernel_x + (ky * dim_kernel_x + kx) * ch_im_in;
// for (kc = 0; kc < ch_im_in; kc++)
// {
// conv_out += Im_in[in_pix_loc + kc] * wt[wt_loc + kc];
// }
// }
// }
// }
// Im_out[oc + (oy * dim_im_out_x + ox) * ch_im_out] = (q7_t) __NNOM_SSAT((conv_out >> out_shift), 8);
// }
// }
// }
// }
{
int i, j, k, l, m, n;
int conv_out;
int in_row, in_col;
int kernel_start_x,kernel_end_x;
int kernel_start_y,kernel_end_y;
int in_row_start, in_col_start;
int is_zero;
for (i = 0; i < ch_im_out; i++) {
for (j = 0; j < dim_im_out_y; j++) {
is_zero = alg_deconv2d_calculate_position(j, stride_y, padding_y, dim_kernel_y,
dim_im_in_y, &in_row_start, &kernel_start_y, &kernel_end_y);
if(is_zero) {
conv_out = ((q31_t)(bias[i]) << bias_shift) + NNOM_ROUND(out_shift);
conv_out = (q7_t) __NNOM_SSAT((conv_out >> out_shift), 8);
for (k = 0; k < dim_im_out_x; k++) {
Im_out[i + (j * dim_im_out_x + k) * ch_im_out] = (q7_t) conv_out;
}
continue;
}
for (k = 0; k < dim_im_out_x; k++) {
conv_out = ((q31_t)(bias[i]) << bias_shift) + NNOM_ROUND(out_shift);
is_zero = alg_deconv2d_calculate_position(k, stride_x, padding_x, dim_kernel_x,
dim_im_in_x, &in_col_start, &kernel_start_x, &kernel_end_x);
if(is_zero) {
Im_out[i + (j * dim_im_out_x + k) * ch_im_out] = conv_out;
continue;
}
for (m = kernel_start_y, in_row = in_row_start; m <= kernel_end_y; m+=stride_y, in_row++) {
for (n = kernel_start_x, in_col = in_col_start; n <= kernel_end_x; n+=stride_x, in_col++) {
if ((in_row >= 0) && (in_col >= 0) &&
(in_row < dim_im_in_y) && (in_col < dim_im_in_x)) {
for (l = 0; l < ch_im_in; l++) {
conv_out += Im_in[(in_row * dim_im_in_x + in_col) * ch_im_in + l] *
wt[i * ch_im_in * dim_kernel_y * dim_kernel_x + (m * dim_kernel_x + n) * ch_im_in + l];
}
}
}
}
Im_out[i + (j * dim_im_out_x + k) * ch_im_out] = (q7_t) __NNOM_SSAT((conv_out >> out_shift), 8);
}
}
}
}
void local_depthwise_separable_conv_HWC_q7_nonsquare(const q7_t *Im_in,// input image
const uint16_t dim_im_in_x, // input image dimension x
const uint16_t dim_im_in_y, // input image dimension y
const uint16_t ch_im_in, // number of input image channels
const q7_t *wt, // kernel weights
const uint16_t ch_im_out, // number of filters, i.e., output image channels
const uint16_t dim_kernel_x, // filter kernel size x
const uint16_t dim_kernel_y, // filter kernel size y
const uint16_t padding_x, // padding sizes x
const uint16_t padding_y, // padding sizes y
const uint16_t stride_x, // stride x
const uint16_t stride_y, // stride y
const uint16_t dilation_x, // dilation x
const uint16_t dilation_y, // dilation y
const q7_t *bias, // bias
const nnom_qformat_param_t *bias_shift, // bias shifts
const nnom_qformat_param_t *out_shift, // output shift
const nnom_qtype_t q_type, // per channel or per tensor
q7_t *Im_out, // output image
const uint16_t dim_im_out_x, // output image dimension x
const uint16_t dim_im_out_y, // output image dimension y
q15_t *bufferA, //buffer space for input
q7_t *bufferB //buffer space for output
)
{
int i_out_y, i_out_x, i_ch_out, i_ch_in, i_ch_mult;
int i_ker_y, i_ker_x;
int i_out = 0;
int shift_idx, shift_steps;
int ch_mult = ch_im_out / ch_im_in;
q31_t conv_out;
for (i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++)
{
const int32_t base_idx_y = stride_y * i_out_y - padding_y;
for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++)
{
const int32_t base_idx_x = stride_x * i_out_x - padding_x;
for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
{
for(i_ch_mult = 0; i_ch_mult < ch_mult; i_ch_mult++)
{
i_ch_out = i_ch_mult + i_ch_in * ch_mult;
int32_t ker_y_start = MAX(0, -(base_idx_y-(dilation_y-1))/dilation_y);
int32_t ker_x_start = MAX(0, -(base_idx_x-(dilation_x-1))/dilation_x);
int32_t ker_y_end = MIN(dim_kernel_y, (dim_im_in_y - base_idx_y + (dilation_y-1))/dilation_y);
int32_t ker_x_end = MIN(dim_kernel_x, (dim_im_in_x - base_idx_x + (dilation_x-1))/dilation_x);
shift_idx = q_type == NNOM_QTYPE_PER_AXIS ? i_ch_out : 0;
if (bias)
conv_out = ((q31_t)(bias[i_ch_out]) << bias_shift[shift_idx]) + NNOM_ROUND(out_shift[shift_idx]);
else
conv_out = (q31_t)NNOM_ROUND(out_shift[shift_idx]);
for (i_ker_y = ker_y_start; i_ker_y < ker_y_end; i_ker_y++)
{
const int32_t idx_y = base_idx_y + i_ker_y * dilation_y;
for (i_ker_x = ker_x_start; i_ker_x < ker_x_end; i_ker_x++)
{
const int32_t idx_x = base_idx_x + i_ker_x * dilation_x;
int32_t in_pix_loc = (idx_y * dim_im_in_x + idx_x) * ch_im_in + i_ch_in;
int32_t wt_loc = (i_ker_y * dim_kernel_x + i_ker_x) * (ch_im_in * ch_mult) + i_ch_out;
conv_out += Im_in[in_pix_loc] * wt[wt_loc];
}
}
Im_out[i_out++] = (q7_t)__NNOM_SSAT((conv_out >> out_shift[shift_idx]), 8);
}
}
}
}
}
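// Usage sketch (kept out of the build): depthwise convolution with a 1x1 kernel,
// 2 channels and depth multiplier 1, so each channel is simply scaled by its own
// Q0.7 weight (out_shift = 7). Per-tensor quantisation and the names below are
// illustrative assumptions.
#if 0
static void example_depthwise_conv_hwc_q7(void)
{
    q7_t in[2 * 1 * 2] = {40, 80, 20, 60};                // HWC: (x0,c0) (x0,c1) (x1,c0) (x1,c1)
    const q7_t wt[2] = {64, 32};                          // 0.5 for channel 0, 0.25 for channel 1
    const nnom_qformat_param_t bias_shift[1] = {0};
    const nnom_qformat_param_t out_shift[1] = {7};
    q7_t out[2 * 1 * 2];
    local_depthwise_separable_conv_HWC_q7_nonsquare(in, 2, 1, 2, wt, 2, 1, 1, 0, 0, 1, 1, 1, 1,
                                                    NULL, bias_shift, out_shift, NNOM_QTYPE_PER_TENSOR,
                                                    out, 2, 1, NULL, NULL);
    // out is roughly {20, 20, 10, 15}
}
#endif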
void local_depthwise_separable_conv_CHW_q7_nonsquare(const q7_t *Im_in,// input image
const uint16_t dim_im_in_x, // input image dimension x
const uint16_t dim_im_in_y, // input image dimension y
const uint16_t ch_im_in, // number of input image channels
const q7_t *wt, // kernel weights
const uint16_t ch_im_out, // number of filters, i.e., output image channels
const uint16_t dim_kernel_x, // filter kernel size x
const uint16_t dim_kernel_y, // filter kernel size y
const uint16_t padding_x, // padding sizes x
const uint16_t padding_y, // padding sizes y
const uint16_t stride_x, // stride x
const uint16_t stride_y, // stride y
const uint16_t dilation_x, // dilation x
const uint16_t dilation_y, // dilation y
const q7_t *bias, // bias
const nnom_qformat_param_t *bias_shift, // bias shifts
const nnom_qformat_param_t *out_shift, // output shift
const nnom_qtype_t q_type, // per channel or per tensor
q7_t *Im_out, // output image
const uint16_t dim_im_out_x, // output image dimension x
const uint16_t dim_im_out_y, // output image dimension y
q15_t *bufferA, //buffer space for input
q7_t *bufferB //buffer space for output
)
{
int i_out_y, i_out_x, i_ch_out, i_ch_in, i_ch_mult;
int i_ker_y, i_ker_x;
int i_out = 0;
int shift_idx, shift_steps;
int ch_mult = ch_im_out / ch_im_in;
q31_t conv_out;
for (i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++)
{
const int32_t base_idx_y = stride_y * i_out_y - padding_y;
for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++)
{
const int32_t base_idx_x = stride_x * i_out_x - padding_x;
for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
{
for (i_ch_mult = 0; i_ch_mult < ch_mult; i_ch_mult++)
{
i_ch_out = i_ch_mult + i_ch_in * ch_mult;
int32_t ker_y_start = MAX(0, -(base_idx_y-(dilation_y-1))/dilation_y);
int32_t ker_x_start = MAX(0, -(base_idx_x-(dilation_x-1))/dilation_x);
int32_t ker_y_end = MIN(dim_kernel_y, (dim_im_in_y - base_idx_y + (dilation_y-1))/dilation_y);
int32_t ker_x_end = MIN(dim_kernel_x, (dim_im_in_x - base_idx_x + (dilation_x-1))/dilation_x);
shift_idx = q_type == NNOM_QTYPE_PER_AXIS ? i_ch_out : 0;
if (bias)
conv_out = ((q31_t)(bias[i_ch_out]) << bias_shift[shift_idx]) + NNOM_ROUND(out_shift[shift_idx]);
else
conv_out = (q31_t)NNOM_ROUND(out_shift[shift_idx]);
for (i_ker_y = ker_y_start; i_ker_y < ker_y_end; i_ker_y++)
{
const int32_t idx_y = base_idx_y + i_ker_y * dilation_y;
for (i_ker_x = ker_x_start; i_ker_x < ker_x_end; i_ker_x++)
{
const int32_t idx_x = base_idx_x + i_ker_x * dilation_x;
int32_t in_pix_loc = (idx_y * dim_im_in_x + idx_x) + i_ch_in * dim_im_in_x * dim_im_in_y;
int32_t wt_loc = (i_ker_y * dim_kernel_x + i_ker_x) * ch_im_out + i_ch_out;
conv_out += Im_in[in_pix_loc] * wt[wt_loc];
}
}
Im_out[i_ch_out * dim_im_out_x * dim_im_out_y + (i_out_y * dim_im_out_x + i_out_x)] = (q7_t)__NNOM_SSAT((conv_out >> out_shift[shift_idx]), 8);
}
}
}
}
}
void local_zero_padding_HWC_q7(const q7_t *Im_in, // input image
const uint16_t dim_im_in_x, // input image dimension x
const uint16_t dim_im_in_y, // input image dimension y
const uint16_t ch_im_in, // number of input image channels
const uint16_t padding_top, // padding sizes y
const uint16_t padding_bottom, // padding sizes y
const uint16_t padding_left, // padding sizes x
const uint16_t padding_right, // padding sizes x
q7_t *Im_out, // output image
const uint16_t dim_im_out_x, // output image dimension x
const uint16_t dim_im_out_y) // output image dimension y
{
int i, size;
q7_t * p_out = Im_out;
// top rows
size = dim_im_out_x*ch_im_in*padding_top;
nnom_memset(p_out, 0, size);
p_out += size;
// middle
for(i=0; i<dim_im_in_y; i++)
{
// left - set to 0
size = ch_im_in * padding_left;
nnom_memset(p_out, 0, size);
p_out += size;
// data - copy a row
size = dim_im_in_x * ch_im_in;
nnom_memcpy(p_out, Im_in + i*size, size);
p_out += size;
// right - set to 0
size = ch_im_in * padding_right;
nnom_memset(p_out, 0, size);
p_out += size;
}
// bottom rows
nnom_memset(p_out, 0, dim_im_out_x*ch_im_in*padding_bottom);
}
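// Usage sketch (kept out of the build): padding a hypothetical 2x2 single-channel
// image by one pixel on every side produces a 4x4 image with the data centred.
#if 0
static void example_zero_padding_hwc_q7(void)
{
    q7_t in[2 * 2 * 1] = {1, 2, 3, 4};
    q7_t out[4 * 4 * 1];
    local_zero_padding_HWC_q7(in, 2, 2, 1, 1, 1, 1, 1, out, 4, 4);
    // out rows: {0,0,0,0}, {0,1,2,0}, {0,3,4,0}, {0,0,0,0}
}
#endif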
void local_zero_padding_CHW_q7(const q7_t *Im_in, // input image
const uint16_t dim_im_in_x, // input image dimension x
const uint16_t dim_im_in_y, // input image dimension y
const uint16_t ch_im_in, // number of input image channels
const uint16_t padding_top, // padding sizes y
const uint16_t padding_bottom, // padding sizes y
const uint16_t padding_left, // padding sizes x
const uint16_t padding_right, // padding sizes x
q7_t *Im_out, // output image
const uint16_t dim_im_out_x, // output image dimension x
const uint16_t dim_im_out_y) // output image dimension y
{
int i, size, ch_offset;
q7_t * p_out = Im_out;
for(int ch=0; ch < ch_im_in; ch++)
{
p_out = Im_out + ch * dim_im_out_x * dim_im_out_y;
// top rows
size = dim_im_out_x*padding_top;
nnom_memset(p_out, 0, size);
p_out += size;
// middle
ch_offset = ch*dim_im_in_x*dim_im_in_y;
for(i=0; i<dim_im_in_y; i++)
{
// left - set to 0
nnom_memset(p_out, 0, padding_left);
p_out += padding_left;
// data - copy a row
nnom_memcpy(p_out, Im_in + i*dim_im_in_x + ch_offset, dim_im_in_x);
p_out += dim_im_in_x;
// right - set to 0
nnom_memset(p_out, 0, padding_right);
p_out += padding_right;
}
// bottom
nnom_memset(p_out, 0, dim_im_out_x*padding_bottom);
}
}
void local_cropping_HWC_q7(const q7_t *Im_in, // input image
const uint16_t dim_im_in_x, // input image dimension x
const uint16_t dim_im_in_y, // input image dimension y
const uint16_t ch_im_in, // number of input image channels
const uint16_t padding_top, // padding sizes y
const uint16_t padding_bottom, // padding sizes y
const uint16_t padding_left, // padding sizes x
const uint16_t padding_right, // padding sizes x
q7_t *Im_out, // output image
const uint16_t dim_im_out_x, // output image dimension x
const uint16_t dim_im_out_y) // output image dimension y
{
int i, row_size;
const q7_t * p_in = Im_in;
// top rows to ignore
p_in += dim_im_in_x*ch_im_in*padding_top;
// middle
row_size = dim_im_out_x * ch_im_in;
for(i=0; i<dim_im_out_y; i++)
{
// left to ignore
p_in += ch_im_in * padding_left;
// data - copy a row
nnom_memcpy(Im_out + i*row_size, p_in, row_size);
p_in += row_size;
// right to ignore
p_in += ch_im_in * padding_right;
}
}
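// Usage sketch (kept out of the build): cropping one pixel from every side of a
// hypothetical 3x3 single-channel image leaves only the centre pixel.
#if 0
static void example_cropping_hwc_q7(void)
{
    q7_t in[3 * 3 * 1] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
    q7_t out[1 * 1 * 1];
    local_cropping_HWC_q7(in, 3, 3, 1, 1, 1, 1, 1, out, 1, 1);
    // out[0] == 5
}
#endif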
void local_cropping_CHW_q7(const q7_t *Im_in, // input image
const uint16_t dim_im_in_x, // input image dimension x
const uint16_t dim_im_in_y, // input image dimension y
const uint16_t ch_im_in, // number of input image channels
const uint16_t padding_top, // padding sizes y
const uint16_t padding_bottom, // padding sizes y
const uint16_t padding_left, // padding sizes x
const uint16_t padding_right, // padding sizes x
q7_t *Im_out, // output image
const uint16_t dim_im_out_x, // output image dimension x
const uint16_t dim_im_out_y) // output image dimension y
{
int i, ch, ch_offset;
const q7_t * p_in;
for(ch=0; ch < ch_im_in; ch++)
{
p_in = Im_in + dim_im_in_x * dim_im_in_y * ch; // ch offset to input image
p_in += dim_im_in_x*padding_top; // top to ignore
ch_offset = ch*dim_im_out_x*dim_im_out_y;
for(i=0; i<dim_im_out_y; i++)
{
// data - middle of a row
nnom_memcpy(Im_out + i*dim_im_out_x + ch_offset, p_in+padding_left, dim_im_out_x);
p_in += dim_im_in_x; // middle and right padding
}
}
}
void local_dot_q7_opt(const q7_t *pV, // pointer to vector
const q7_t *pM, // pointer to matrix
const uint16_t dim_vec, // length of the vector
const uint16_t num_of_rows, // numCol of A
const uint16_t out_shift, // amount of right-shift for output
q7_t *pOut) // output operand)
{
uint16_t rowCnt = num_of_rows >> 2;
const q7_t *pB = pM;
const q7_t *pA;
q7_t *pO = pOut;
while (rowCnt)
{
pA = pV;
q31_t sum = (q31_t) NNOM_ROUND(out_shift);
q31_t sum2 = (q31_t) NNOM_ROUND(out_shift);
q31_t sum3 = (q31_t) NNOM_ROUND(out_shift);
q31_t sum4 = (q31_t) NNOM_ROUND(out_shift);
uint16_t colCnt = dim_vec >> 2;
while (colCnt)
{
q7_t inA1 = *pA++;
q7_t inA3 = *pA++;
q7_t inA2 = *pA++;
q7_t inA4 = *pA++;
q7_t inB1 = *pB++;
q7_t inB3 = *pB++;
q7_t inB2 = *pB++;
q7_t inB4 = *pB++;
sum += inA1 * inB1 + inA2 * inB2;
sum2 += inA1 * inB3 + inA2 * inB4;
inB1 = *pB++;
inB3 = *pB++;
inB2 = *pB++;
inB4 = *pB++;
sum3 += inA1 * inB1 + inA2 * inB2;
sum4 += inA1 * inB3 + inA2 * inB4;
inB1 = *pB++;
inB3 = *pB++;
inB2 = *pB++;
inB4 = *pB++;
sum += inA3 * inB1 + inA4 * inB2;
sum2 += inA3 * inB3 + inA4 * inB4;
inB1 = *pB++;
inB3 = *pB++;
inB2 = *pB++;
inB4 = *pB++;
sum3 += inA3 * inB1 + inA4 * inB2;
sum4 += inA3 * inB3 + inA4 * inB4;
colCnt--;
}
colCnt = dim_vec & 0x3;
while (colCnt)
{
q7_t inA = *pA++;
q7_t inB = *pB++;
sum += inA * inB;
inB = *pB++;
sum2 += inA * inB;
inB = *pB++;
sum3 += inA * inB;
inB = *pB++;
sum4 += inA * inB;
colCnt--;
}
*pO++ = (q7_t)__NNOM_SSAT((sum >> out_shift), 8);
*pO++ = (q7_t)__NNOM_SSAT((sum2 >> out_shift), 8);
*pO++ = (q7_t)__NNOM_SSAT((sum3 >> out_shift), 8);
*pO++ = (q7_t)__NNOM_SSAT((sum4 >> out_shift), 8);
rowCnt--;
}
rowCnt = num_of_rows & 0x3;
while (rowCnt)
{
int ip_out = (q31_t) NNOM_ROUND (out_shift);
pA = pV;
for (int j = 0; j < dim_vec; j++)
{
q7_t inA = *pA++;
q7_t inB = *pB++;
ip_out += inA * inB;
}
*pO++ = (q7_t)__NNOM_SSAT((ip_out >> out_shift), 8);
rowCnt--;
}
}
void local_dot_q7(const q7_t *pV, // pointer to vector
const q7_t *pM, // pointer to matrix
const uint16_t dim_vec, // length of the vector
const uint16_t num_of_rows, // numCol of A
const uint16_t out_shift, // amount of right-shift for output
q7_t *pOut) // output operand)
{
for (int i = 0; i < num_of_rows; i++)
{
int ip_out = (q31_t) NNOM_ROUND(out_shift);
for (int j = 0; j < dim_vec; j++)
{
ip_out += pV[j] * pM[i * dim_vec + j];
}
pOut[i] = (q7_t)__NNOM_SSAT((ip_out >> out_shift), 8);
}
}
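// Usage sketch (kept out of the build): a plain matrix-vector product with hypothetical
// values. pM is stored row-major (pM[row * dim_vec + col]) and num_of_rows is the
// output length; out_shift = 0 keeps the raw accumulations.
#if 0
static void example_dot_q7(void)
{
    q7_t v[4] = {1, 2, 3, 4};
    q7_t m[2 * 4] = {1, 1, 1, 1,
                     2, 0, 2, 0};
    q7_t y[2];
    local_dot_q7(v, m, 4, 2, 0, y);
    // y = {10, 8}
}
#endif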
void local_fully_connected_q7_opt(const q7_t *pV, // pointer to vector
const q7_t *pM, // pointer to matrix
const uint16_t dim_vec, // length of the vector
const uint16_t num_of_rows, // numCol of A
const uint16_t bias_shift, // amount of left-shift for bias
const uint16_t out_shift, // amount of right-shift for output
const q7_t *bias, q7_t *pOut, // output operand
q15_t *vec_buffer)
{
uint16_t rowCnt = num_of_rows >> 2;
const q7_t *pB = pM;
const q7_t *pA;
q7_t *pO = pOut;
const q7_t *pBias = bias;
while (rowCnt)
{
pA = pV;
q31_t sum;
q31_t sum2;
q31_t sum3;
q31_t sum4;
uint16_t colCnt = dim_vec >> 2;
if(bias)
{
sum = ((q31_t)(*pBias++) << bias_shift) + NNOM_ROUND(out_shift);
sum2 = ((q31_t)(*pBias++) << bias_shift) + NNOM_ROUND(out_shift);
sum3 = ((q31_t)(*pBias++) << bias_shift) + NNOM_ROUND(out_shift);
sum4 = ((q31_t)(*pBias++) << bias_shift) + NNOM_ROUND(out_shift);
}
else
{
sum = (q31_t) NNOM_ROUND(out_shift);
sum2 = (q31_t) NNOM_ROUND(out_shift);
sum3 = (q31_t) NNOM_ROUND(out_shift);
sum4 = (q31_t) NNOM_ROUND(out_shift);
}
while (colCnt)
{
q7_t inA1 = *pA++;
q7_t inA3 = *pA++;
q7_t inA2 = *pA++;
q7_t inA4 = *pA++;
q7_t inB1 = *pB++;
q7_t inB3 = *pB++;
q7_t inB2 = *pB++;
q7_t inB4 = *pB++;
sum += inA1 * inB1 + inA2 * inB2;
sum2 += inA1 * inB3 + inA2 * inB4;
inB1 = *pB++;
inB3 = *pB++;
inB2 = *pB++;
inB4 = *pB++;
sum3 += inA1 * inB1 + inA2 * inB2;
sum4 += inA1 * inB3 + inA2 * inB4;
inB1 = *pB++;
inB3 = *pB++;
inB2 = *pB++;
inB4 = *pB++;
sum += inA3 * inB1 + inA4 * inB2;
sum2 += inA3 * inB3 + inA4 * inB4;
inB1 = *pB++;
inB3 = *pB++;
inB2 = *pB++;
inB4 = *pB++;
sum3 += inA3 * inB1 + inA4 * inB2;
sum4 += inA3 * inB3 + inA4 * inB4;
colCnt--;
}
colCnt = dim_vec & 0x3;
while (colCnt)
{
q7_t inA = *pA++;
q7_t inB = *pB++;
sum += inA * inB;
inB = *pB++;
sum2 += inA * inB;
inB = *pB++;
sum3 += inA * inB;
inB = *pB++;
sum4 += inA * inB;
colCnt--;
}
*pO++ = (q7_t)__NNOM_SSAT((sum >> out_shift), 8);
*pO++ = (q7_t)__NNOM_SSAT((sum2 >> out_shift), 8);
*pO++ = (q7_t)__NNOM_SSAT((sum3 >> out_shift), 8);
*pO++ = (q7_t)__NNOM_SSAT((sum4 >> out_shift), 8);
rowCnt--;
}
rowCnt = num_of_rows & 0x3;
while (rowCnt)
{
int ip_out;
if(bias)
ip_out=((q31_t)(*bias++) << bias_shift) + NNOM_ROUND(out_shift);
else
ip_out=(q31_t)NNOM_ROUND(out_shift);
pA = pV;
for (int j = 0; j < dim_vec; j++)
{
q7_t inA = *pA++;
q7_t inB = *pB++;
ip_out += inA * inB;
}
*pO++ = (q7_t)__NNOM_SSAT((ip_out >> out_shift), 8);
rowCnt--;
}
}
void local_fully_connected_q7(const q7_t *pV, // pointer to vector
const q7_t *pM, // pointer to matrix
const uint16_t dim_vec, // length of the vector
const uint16_t num_of_rows, // numCol of A
const uint16_t bias_shift, // amount of left-shift for bias
const uint16_t out_shift, // amount of right-shift for output
const q7_t *bias, q7_t *pOut, // output operand
q15_t *vec_buffer)
{
if(bias)
{
for (int i = 0; i < num_of_rows; i++)
{
int ip_out = ((q31_t)(*bias++) << bias_shift) + NNOM_ROUND(out_shift);
for (int j = 0; j < dim_vec; j++)
{
ip_out += pV[j] * pM[i * dim_vec + j];
}
pOut[i] = (q7_t)__NNOM_SSAT((ip_out >> out_shift), 8);
}
}
else
{
for (int i = 0; i < num_of_rows; i++)
{
int ip_out = (q31_t)NNOM_ROUND(out_shift);
for (int j = 0; j < dim_vec; j++)
{
ip_out += pV[j] * pM[i * dim_vec + j];
}
pOut[i] = (q7_t)__NNOM_SSAT((ip_out >> out_shift), 8);
}
}
}
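// Usage sketch (kept out of the build): a 3-input, 2-output fully connected layer with
// hypothetical Q0.7 activations, weights and bias. out_shift = 7 folds the Q0.14
// accumulator back to Q0.7, and bias_shift = 7 lifts the Q0.7 bias up to the
// accumulator's Q format.
#if 0
static void example_fully_connected_q7(void)
{
    q7_t v[3] = {10, 20, 30};
    q7_t m[2 * 3] = { 64, 64, 64,
                     -64,  0, 64};
    q7_t b[2] = {1, 0};
    q7_t y[2];
    local_fully_connected_q7(v, m, 3, 2, 7, 7, b, y, NULL);
    // y is roughly {31, 10}
}
#endif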
void local_softmax_q7(const q7_t *vec_in, const uint32_t dim_vec, q7_t *p_out)
{
q31_t sum;
int32_t i;
uint8_t shift;
q15_t base;
base = -257;
/* We first search for the maximum */
for (i = 0; i < dim_vec; i++)
{
if (vec_in[i] > base)
{
base = vec_in[i];
}
}
/*
* So the base is set to max-8, meaning
* that we ignore really small values.
* anyway, they will be 0 after shrinking to q7_t.
*/
base = base - 8;
sum = 0;
for (i = 0; i < dim_vec; i++)
{
if (vec_in[i] > base)
{
shift = (uint8_t)__NNOM_USAT(vec_in[i] - base, 5);
sum += 0x1 << shift;
}
}
/* This is effectively (0x1 << 20) / sum */
int output_base = 0x100000 / sum;
/*
* Final confidence will be output_base >> ( 13 - (vec_in[i] - base) )
* so 128 (0x1<<7) -> 100% confidence when sum = 0x1 << 8, output_base = 0x1 << 12
* and vec_in[i]-base = 8
*/
for (i = 0; i < dim_vec; i++)
{
if (vec_in[i] > base)
{
/* Here minimum value of 13+base-vec_in[i] will be 5 */
shift = (uint8_t)__NNOM_USAT(13 + base - vec_in[i], 5);
p_out[i] = (q7_t)__NNOM_SSAT((output_base >> shift), 8);
}
else
{
p_out[i] = 0;
}
}
}
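// Worked sketch (kept out of the build) of the power-of-two softmax above, with
// hypothetical inputs {20, 17, 10}: base = 20 - 8 = 12, sum = (1<<8) + (1<<5) = 288,
// output_base = 0x100000 / 288 = 3640, so p_out[0] = 3640 >> 5 = 113,
// p_out[1] = 3640 >> 8 = 14 and p_out[2] = 0; the q7 outputs add up to roughly 127.
#if 0
static void example_softmax_q7(void)
{
    const q7_t logits[3] = {20, 17, 10};
    q7_t probs[3];
    local_softmax_q7(logits, 3, probs);
    // probs = {113, 14, 0}
}
#endif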
// hard sigmoid,
// y=0 if x < -2.5
// y=1 if x > 2.5
// otherwise y = 0.2 * x + 0.5 (implemented as y = 0.203125 * x + 0.5, since mult = 26/128)
void local_hard_sigmoid_q7(q7_t *data, uint32_t size, int16_t dec_bit)
{
int16_t limit = 2.5f * (1 << dec_bit)-1;
int16_t offset = 64; // 0.5 * 128
int16_t mult = 26; // 0.2 * 128
// int bit >= 0
for(int i=0; i<size; i++)
{
if(data[i] <= -limit)
data[i] = 0;
else if(data[i] >= limit)
data[i] = 127;
else
{
data[i] = ((int16_t)(data[i] * mult) >> dec_bit) + offset;
}
}
}
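// Worked sketch (kept out of the build), assuming dec_bit = 5 (inputs in Q2.5):
// limit = 2.5 * 32 - 1 = 79; x = 1.0 (32) maps to ((32 * 26) >> 5) + 64 = 90,
// i.e. about 0.70 in Q0.7, matching 0.2 * 1.0 + 0.5.
#if 0
static void example_hard_sigmoid_q7(void)
{
    q7_t x[3] = {32, -32, 96};            // 1.0, -1.0, 3.0 in Q2.5
    local_hard_sigmoid_q7(x, 3, 5);
    // x becomes roughly {90, 38, 127}
}
#endif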
// hard tanh
// y=-1 if x < -1
// y=1 if x > 1
// otherwise y = x
void local_hard_tanh_q7(q7_t *data, uint32_t size, int16_t dec_bit)
{
int16_t int_bit = 7 - dec_bit;
int16_t limit = 1 << dec_bit;
if(dec_bit == 7)
return;
// int bit < 0
if(int_bit < 0)
for(int i=0; i<size; i++)
{
if(data[i] <= -limit)
data[i] = -128;
else if(data[i] >= limit)
data[i] = 127;
else
{
data[i] = data[i] >> (-int_bit);
}
}
else
// int bit >= 0
for(int i=0; i<size; i++)
{
if(data[i] <= -limit)
data[i] = -128;
else if(data[i] >= limit)
data[i] = 127;
else
{
data[i] = data[i] << int_bit;
}
}
}
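// Worked sketch (kept out of the build), assuming dec_bit = 5: values are rescaled
// from Q2.5 to Q0.7 (left shift by 7 - 5 = 2) and clamped at +/-1.0.
#if 0
static void example_hard_tanh_q7(void)
{
    q7_t x[3] = {16, 40, -40};            // 0.5, 1.25, -1.25 in Q2.5
    local_hard_tanh_q7(x, 3, 5);
    // x becomes {64, 127, -128}
}
#endif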
void local_sigmoid_q7(q7_t *data, uint32_t size, int16_t int_width)
{
uint32_t i = size;
q7_t *pIn = data;
q7_t *pOut = data;
q7_t in;
q7_t out;
uint16_t shift_size = 3 - int_width;
// saturation if int bit too large
if(int_width > 3)
{
while (i)
{
if(*pIn++ > 0)
*pOut++ = 127;
else
*pOut++ = 0;
i--;
}
}
// otherwise search table
else
{
while (i)
{
in = *pIn++;
out = nnom_sigmoid_table_q7[(uint8_t)(in >> shift_size)];
*pOut++ = out;
i--;
}
}
}
void local_tanh_q7(q7_t *data, uint32_t size, int16_t int_width)
{
uint32_t i = size;
q7_t *pIn = data;
q7_t *pOut = data;
q7_t in;
q7_t out;
uint16_t shift_size = 3 - int_width;
// saturation if int bit too large
if(int_width > 3)
{
while (i)
{
in = *pIn++;
if(in > 0)
*pOut++ = 127;
else if ( in == 0)
*pOut++ = 0;
else
*pOut++ = -128;
i--;
}
}
// otherwise search table
else
{
while (i)
{
in = *pIn++;
out = nnom_tanh_table_q7[(uint8_t)(in >> shift_size)];
*pOut++ = out;
i--;
}
}
}
void local_relu_q7(q7_t *data, uint32_t size)
{
uint32_t i;
for (i = 0; i < size; i++)
{
if (data[i] < 0)
data[i] = 0;
}
}
// alpha in q7 format with dec_bit=7
void local_leaky_relu_q7(q7_t *data, q7_t alpha, uint32_t size)
{
uint32_t i;
for (i = 0; i < size; i++)
{
if (data[i] < 0)
{
data[i] = data[i] * alpha / 128;
}
}
}
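// Worked sketch (kept out of the build): with alpha = 32 (0.25 in Q0.7), a negative
// input of -64 (-0.5) becomes -64 * 32 / 128 = -16 (-0.125); non-negative values
// pass through unchanged.
#if 0
static void example_leaky_relu_q7(void)
{
    q7_t x[2] = {-64, 50};
    local_leaky_relu_q7(x, 32, 2);
    // x becomes {-16, 50}
}
#endif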
// alpha in q7 format with dec_bit=7
// max and threshold have the same Q format as the activation
void local_adv_relu_q7(q7_t *data, q7_t negative_slope, q7_t max, q7_t threshold, uint32_t size)
{
uint32_t i;
for (i = 0; i < size; i++)
{
// `f(x) = max_value` for `x >= max_value`,
// `f(x) = x` for `threshold <= x < max_value`,
// `f(x) = alpha * (x - threshold)` otherwise.
if(data[i] > max)
data[i] = max;
if (data[i] < threshold)
data[i] = (data[i] - threshold) * negative_slope / 128;
}
}
// matrix ops
void local_mult_q7(q7_t *pSrcA,
q7_t *pSrcB,
q7_t *pDst,
const uint16_t out_shift,
uint32_t blockSize)
{
uint32_t i;
for (i = 0; i < blockSize; i++)
{
q31_t product = pSrcA[i] * pSrcB[i];
pDst[i] = (q7_t) __NNOM_SSAT(((product + NNOM_ROUND(out_shift)) >> out_shift), 8);
}
}
void local_add_q7(q7_t *pSrcA,
q7_t *pSrcB,
q7_t *pDst,
const uint16_t out_shift,
uint32_t blockSize)
{
uint32_t i;
for (i = 0; i < blockSize; i++)
{
q31_t sum = pSrcA[i] + pSrcB[i];
pDst[i] = (q7_t) __NNOM_SSAT(((sum + NNOM_ROUND(out_shift)) >> out_shift), 8);
}
}
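// Usage sketch (kept out of the build): element-wise add with out_shift = 1, which
// rescales the sum into an output format with one fewer fractional bit so that
// 60 + 70 = 130 still fits in q7 (it becomes 65).
#if 0
static void example_add_q7(void)
{
    q7_t a[2] = {60, -10};
    q7_t b[2] = {70, 30};
    q7_t y[2];
    local_add_q7(a, b, y, 1, 2);
    // y is roughly {65, 10}
}
#endif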
void local_sub_q7(q7_t *pSrcA,
q7_t *pSrcB,
q7_t *pDst,
const uint16_t out_shift,
uint32_t blockSize)
{
uint32_t i;
for (i = 0; i < blockSize; i++)
{
q31_t sub = pSrcA[i] - pSrcB[i];
pDst[i] = (q7_t) __NNOM_SSAT(((sub + NNOM_ROUND(out_shift)) >> out_shift), 8);
}
}
void local_multiple_add_q7( q7_t *p_dst,
const int16_t out_shift,
uint32_t block_size,
uint32_t num_block,
q7_t **p_src)
{
uint32_t i, blk;
q31_t sum;
for (i = 0; i < block_size; i++)
{
sum = 0;
for(blk=0; blk < num_block; blk++)
sum += p_src[blk][i];
p_dst[i] = (q7_t) __NNOM_SSAT(((sum + NNOM_ROUND(out_shift)) >> out_shift), 8);
}
}
void local_multiple_mult_q7( q7_t *p_dst,
const int16_t out_shift,
uint32_t block_size,
uint32_t num_block,
q7_t **p_src)
{
uint32_t i, blk;
q31_t product;
for (i = 0; i < block_size; i++)
{
product = 1;
for(blk=0; blk < num_block; blk++)
product *= p_src[blk][i];
p_dst[i] = (q7_t) __NNOM_SSAT(((product + NNOM_ROUND(out_shift)) >> out_shift), 8);
}
}
void local_multiple_sub_q7( q7_t *p_dst,
const int16_t out_shift,
uint32_t block_size,
uint32_t num_block,
q7_t **p_src)
{
uint32_t i, blk;
q31_t sub;
for (i = 0; i < block_size; i++)
{
sub = p_src[0][i];
for(blk=1; blk < num_block; blk++)
sub -= p_src[blk][i];
p_dst[i] = (q7_t) __NNOM_SSAT(((sub + NNOM_ROUND(out_shift)) >> out_shift), 8);
}
}
void local_q7_to_q15_no_shift(const q7_t *src, q15_t *des, uint32_t size)
{
// simple manual loop unrolling
uint32_t count = size/8;
while (count-- > 0)
{
*des++ = (q15_t)*src++;
*des++ = (q15_t)*src++;
*des++ = (q15_t)*src++;
*des++ = (q15_t)*src++;
*des++ = (q15_t)*src++;
*des++ = (q15_t)*src++;
*des++ = (q15_t)*src++;
*des++ = (q15_t)*src++;
}
count = size%8;
while(count-- > 0)
*des++ = (q15_t)*src++;
}
void local_q7_to_q15(const q7_t *src, q15_t *des, uint32_t size)
{
// simple manual loop unrolling
uint32_t count = size/8;
while (count-- > 0)
{
*des++ = (q15_t)*src++<<8;
*des++ = (q15_t)*src++<<8;
*des++ = (q15_t)*src++<<8;
*des++ = (q15_t)*src++<<8;
*des++ = (q15_t)*src++<<8;
*des++ = (q15_t)*src++<<8;
*des++ = (q15_t)*src++<<8;
*des++ = (q15_t)*src++<<8;
}
count = size%8;
while(count-- > 0)
*des++ = (q15_t)*src++<<8;
}
// right shift q15 to q7
void local_q15_to_q7(const q15_t *src, q7_t *des, uint32_t shift, uint32_t size)
{
while(size-- >0)
{
*des = *src >> shift;
des++;
src++;
}
}