Context Navigation

arm_nnfunctions.h

Last change on this file was 1, checked in by AlexLir, 3 years ago

File size: 48.5 KB

Rev	Line
[1]	1	/*
	2	* Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
	3	*
	4	* SPDX-License-Identifier: Apache-2.0
	5	*
	6	* Licensed under the Apache License, Version 2.0 (the License); you may
	7	* not use this file except in compliance with the License.
	8	* You may obtain a copy of the License at
	9	*
	10	* www.apache.org/licenses/LICENSE-2.0
	11	*
	12	* Unless required by applicable law or agreed to in writing, software
	13	* distributed under the License is distributed on an AS IS BASIS, WITHOUT
	14	* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	15	* See the License for the specific language governing permissions and
	16	* limitations under the License.
	17	*/
	18
	19	/* ----------------------------------------------------------------------
	20	* Project: CMSIS NN Library
	21	* Title: arm_nnfunctions.h
	22	* Description: Public header file for CMSIS NN Library
	23	*
	24	* $Date: 13. July 2018
	25	* $Revision: V.1.0.0
	26	*
	27	* Target Processor: Cortex-M cores
	28	* -------------------------------------------------------------------- */
	29
	30	/**
	31	\mainpage CMSIS NN Software Library
	32	*
	33	* Introduction
	34	* ------------
	35	*
	36	* This user manual describes the CMSIS NN software library,
	37	* a collection of efficient neural network kernels developed to maximize the
	38	* performance and minimize the memory footprint of neural networks on Cortex-M processor cores.
	39	*
	40	* The library is divided into a number of functions each covering a specific category:
	41	* - Neural Network Convolution Functions
	42	* - Neural Network Activation Functions
	43	* - Fully-connected Layer Functions
	44	* - Neural Network Pooling Functions
	45	* - Softmax Functions
	46	* - Neural Network Support Functions
	47	*
	48	* The library has separate functions for operating on different weight and activation data
	49	* types including 8-bit integers (q7_t) and 16-bit integers (q15_t). The description of the
	50	* kernels is included in the function description. The implementation details are also
	51	* described in this paper [1].
	52	*
	53	* Block Diagram
	54	* --------
	55	* \image html CMSIS-NN-OVERVIEW.PNG
	56	*
	57	* Examples
	58	* --------
	59	*
	60	* The library ships with a number of examples which demonstrate how to use the library functions.
	61	*
	62	* Pre-processor Macros
	63	* ------------
	64	*
	65	* Each library project has different pre-processor macros.
	66	*
	67	* - ARM_MATH_DSP:
	68	*
	69	* Define macro ARM_MATH_DSP if the silicon supports DSP instructions.
	70	*
	71	* - ARM_MATH_BIG_ENDIAN:
	72	*
	73	* Define macro ARM_MATH_BIG_ENDIAN to build the library for big endian targets. By default library builds for little endian targets.
	74	*
	75	* - ARM_NN_TRUNCATE:
	76	*
	77	* Define macro ARM_NN_TRUNCATE to use floor instead of round-to-the-nearest-int for the computation.
	78	*
	79	* Copyright Notice
	80	* ------------
	81	*
	82	* Copyright (C) 2010-2018 Arm Limited. All rights reserved.
	83	*
	84	* [1] CMSIS-NN: Efficient Neural Network Kernels for Arm Cortex-M CPUs https://arxiv.org/abs/1801.06601
	85	*/
	86
	87	/**
	88	* @defgroup groupNN Neural Network Functions
	89	* These functions perform basic operations for neural network layers.
	90	*/
	91
	92	#ifndef _ARM_NNFUNCTIONS_H
	93	#define _ARM_NNFUNCTIONS_H
	94
	95	#include "arm_nnsupportfunctions.h"
	96	#include "arm_nn_tables.h"
	97
	98	#define USE_INTRINSIC
	99
	100	//#define ARM_NN_TRUNCATE /* This config the rounding model to floor or round to the nearest int */
	101
	102	#ifdef __cplusplus
	103	extern "C"
	104	{
	105	#endif
	106
	107	/**
	108	* @defgroup NNConv Neural Network Convolution Functions
	109	*
	110	* Perform convolution layer
	111	*
	112	* The convolution is implemented in 2 steps: im2col and GEMM
	113	*
	114	* im2col is a process of converting each patch of image data into
	115	* a column. After im2col, the convolution is computed as matrix-matrix
	116	* multiplication.
	117	*
	118	* To reduce the memory footprint, the im2col is performed partially.
	119	* In each iteration, only a few columns (i.e., patches) are generated and
	120	* computed with GEMM kernels similar to CMSIS-DSP arm_mat_mult functions.
	121	*
	122	*/
	123
	124	/**
	125	* @brief Basic Q7 convolution function
	126	* @param[in] Im_in pointer to input tensor
	127	* @param[in] dim_im_in input tensor dimension
	128	* @param[in] ch_im_in number of input tensor channels
	129	* @param[in] wt pointer to kernel weights
	130	* @param[in] ch_im_out number of filters, i.e., output tensor channels
	131	* @param[in] dim_kernel filter kernel size
	132	* @param[in] padding padding sizes
	133	* @param[in] stride convolution stride
	134	* @param[in] bias pointer to bias
	135	* @param[in] bias_shift amount of left-shift for bias
	136	* @param[in] out_shift amount of right-shift for output
	137	* @param[in,out] Im_out pointer to output tensor
	138	* @param[in] dim_im_out output tensor dimension
	139	* @param[in,out] bufferA pointer to buffer space for input
	140	* @param[in,out] bufferB pointer to buffer space for output
	141	* @return The function returns <code>ARM_MATH_SUCCESS</code>
	142	*
	143	*/
	144
	145	arm_status arm_convolve_HWC_q7_basic(const q7_t * Im_in,
	146	const uint16_t dim_im_in,
	147	const uint16_t ch_im_in,
	148	const q7_t * wt,
	149	const uint16_t ch_im_out,
	150	const uint16_t dim_kernel,
	151	const uint16_t padding,
	152	const uint16_t stride,
	153	const q7_t * bias,
	154	const uint16_t bias_shift,
	155	const uint16_t out_shift,
	156	q7_t * Im_out,
	157	const uint16_t dim_im_out,
	158	q15_t * bufferA,
	159	q7_t * bufferB);
	160
	161	/**
	162	* @brief Basic Q7 convolution function (non-square shape)
	163	* @param[in] Im_in pointer to input tensor
	164	* @param[in] dim_im_in_x input tensor dimension x
	165	* @param[in] dim_im_in_y input tensor dimension y
	166	* @param[in] ch_im_in number of input tensor channels
	167	* @param[in] wt pointer to kernel weights
	168	* @param[in] ch_im_out number of filters, i.e., output tensor channels
	169	* @param[in] dim_kernel_x filter kernel size x
	170	* @param[in] dim_kernel_y filter kernel size y
	171	* @param[in] padding_x padding size x
	172	* @param[in] padding_y padding size y
	173	* @param[in] stride_x convolution stride x
	174	* @param[in] stride_y convolution stride y
	175	* @param[in] bias pointer to bias
	176	* @param[in] bias_shift amount of left-shift for bias
	177	* @param[in] out_shift amount of right-shift for output
	178	* @param[in,out] Im_out pointer to output tensor
	179	* @param[in] dim_im_out_x output tensor dimension x
	180	* @param[in] dim_im_out_y output tensor dimension y
	181	* @param[in,out] bufferA pointer to buffer space for input
	182	* @param[in,out] bufferB pointer to buffer space for output
	183	* @return The function returns <code>ARM_MATH_SUCCESS</code>
	184	*/
	185
	186	arm_status arm_convolve_HWC_q7_basic_nonsquare(const q7_t * Im_in,
	187	const uint16_t dim_im_in_x,
	188	const uint16_t dim_im_in_y,
	189	const uint16_t ch_im_in,
	190	const q7_t * wt,
	191	const uint16_t ch_im_out,
	192	const uint16_t dim_kernel_x,
	193	const uint16_t dim_kernel_y,
	194	const uint16_t padding_x,
	195	const uint16_t padding_y,
	196	const uint16_t stride_x,
	197	const uint16_t stride_y,
	198	const q7_t * bias,
	199	const uint16_t bias_shift,
	200	const uint16_t out_shift,
	201	q7_t * Im_out,
	202	const uint16_t dim_im_out_x,
	203	const uint16_t dim_im_out_y,
	204	q15_t * bufferA,
	205	q7_t * bufferB);
	206
	207	/**
	208	* @brief Basic Q15 convolution function
	209	* @param[in] Im_in pointer to input tensor
	210	* @param[in] dim_im_in input tensor dimension
	211	* @param[in] ch_im_in number of input tensor channels
	212	* @param[in] wt pointer to kernel weights
	213	* @param[in] ch_im_out number of filters, i.e., output tensor channels
	214	* @param[in] dim_kernel filter kernel size
	215	* @param[in] padding padding sizes
	216	* @param[in] stride convolution stride
	217	* @param[in] bias pointer to bias
	218	* @param[in] bias_shift amount of left-shift for bias
	219	* @param[in] out_shift amount of right-shift for output
	220	* @param[in,out] Im_out pointer to output tensor
	221	* @param[in] dim_im_out output tensor dimension
	222	* @param[in,out] bufferA pointer to buffer space for input
	223	* @param[in,out] bufferB pointer to buffer space for output
	224	* @return The function returns <code>ARM_MATH_SUCCESS</code>
	225	*
	226	*/
	227
	228	arm_status arm_convolve_HWC_q15_basic(const q15_t * Im_in,
	229	const uint16_t dim_im_in,
	230	const uint16_t ch_im_in,
	231	const q15_t * wt,
	232	const uint16_t ch_im_out,
	233	const uint16_t dim_kernel,
	234	const uint16_t padding,
	235	const uint16_t stride,
	236	const q15_t * bias,
	237	const uint16_t bias_shift,
	238	const uint16_t out_shift,
	239	q15_t * Im_out,
	240	const uint16_t dim_im_out,
	241	q15_t * bufferA,
	242	q7_t * bufferB);
	243
	244	/**
	245	* @brief Fast Q7 convolution function
	246	* @param[in] Im_in pointer to input tensor
	247	* @param[in] dim_im_in input tensor dimension
	248	* @param[in] ch_im_in number of input tensor channels
	249	* @param[in] wt pointer to kernel weights
	250	* @param[in] ch_im_out number of filters, i.e., output tensor channels
	251	* @param[in] dim_kernel filter kernel size
	252	* @param[in] padding padding sizes
	253	* @param[in] stride convolution stride
	254	* @param[in] bias pointer to bias
	255	* @param[in] bias_shift amount of left-shift for bias
	256	* @param[in] out_shift amount of right-shift for output
	257	* @param[in,out] Im_out pointer to output tensor
	258	* @param[in] dim_im_out output tensor dimension
	259	* @param[in,out] bufferA pointer to buffer space for input
	260	* @param[in,out] bufferB pointer to buffer space for output
	261	* @return The function returns either
	262	* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
	263	*
	264	* This function is the version with full list of optimization tricks, but with
	265	* some constraints:
	266	* ch_im_in is multiple of 4
	267	* ch_im_out is multiple of 2
	268	*/
	269
	270	arm_status arm_convolve_HWC_q7_fast(const q7_t * Im_in,
	271	const uint16_t dim_im_in,
	272	const uint16_t ch_im_in,
	273	const q7_t * wt,
	274	const uint16_t ch_im_out,
	275	const uint16_t dim_kernel,
	276	const uint16_t padding,
	277	const uint16_t stride,
	278	const q7_t * bias,
	279	const uint16_t bias_shift,
	280	const uint16_t out_shift,
	281	q7_t * Im_out,
	282	const uint16_t dim_im_out,
	283	q15_t * bufferA,
	284	q7_t * bufferB);
	285
	286	/**
	287	* @brief Fast Q7 convolution function (non-square shape)
	288	* @param[in] Im_in pointer to input tensor
	289	* @param[in] dim_im_in_x input tensor dimension x
	290	* @param[in] dim_im_in_y input tensor dimension y
	291	* @param[in] ch_im_in number of input tensor channels
	292	* @param[in] wt pointer to kernel weights
	293	* @param[in] ch_im_out number of filters, i.e., output tensor channels
	294	* @param[in] dim_kernel_x filter kernel size x
	295	* @param[in] dim_kernel_y filter kernel size y
	296	* @param[in] padding_x padding size x
	297	* @param[in] padding_y padding size y
	298	* @param[in] stride_x convolution stride x
	299	* @param[in] stride_y convolution stride y
	300	* @param[in] bias pointer to bias
	301	* @param[in] bias_shift amount of left-shift for bias
	302	* @param[in] out_shift amount of right-shift for output
	303	* @param[in,out] Im_out pointer to output tensor
	304	* @param[in] dim_im_out_x output tensor dimension x
	305	* @param[in] dim_im_out_y output tensor dimension y
	306	* @param[in,out] bufferA pointer to buffer space for input
	307	* @param[in,out] bufferB pointer to buffer space for output
	308	* @return The function returns either
	309	* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
	310	*
	311	* This function is the version with full list of optimization tricks, but with
	312	* some constraints:
	313	* ch_im_in is multiple of 4
	314	* ch_im_out is multiple of 2
	315	*/
	316
	317	arm_status arm_convolve_HWC_q7_fast_nonsquare(const q7_t * Im_in,
	318	const uint16_t dim_im_in_x,
	319	const uint16_t dim_im_in_y,
	320	const uint16_t ch_im_in,
	321	const q7_t * wt,
	322	const uint16_t ch_im_out,
	323	const uint16_t dim_kernel_x,
	324	const uint16_t dim_kernel_y,
	325	const uint16_t padding_x,
	326	const uint16_t padding_y,
	327	const uint16_t stride_x,
	328	const uint16_t stride_y,
	329	const q7_t * bias,
	330	const uint16_t bias_shift,
	331	const uint16_t out_shift,
	332	q7_t * Im_out,
	333	const uint16_t dim_im_out_x,
	334	const uint16_t dim_im_out_y,
	335	q15_t * bufferA,
	336	q7_t * bufferB);
	337
	338	/**
	339	* @brief Fast Q7 version of 1x1 convolution (non-square shape)
	340	* @param[in] Im_in pointer to input tensor
	341	* @param[in] dim_im_in_x input tensor dimension x
	342	* @param[in] dim_im_in_y input tensor dimension y
	343	* @param[in] ch_im_in number of input tensor channels
	344	* @param[in] wt pointer to kernel weights
	345	* @param[in] ch_im_out number of filters, i.e., output tensor channels
	346	* @param[in] dim_kernel_x filter kernel size x
	347	* @param[in] dim_kernel_y filter kernel size y
	348	* @param[in] padding_x padding size x
	349	* @param[in] padding_y padding size y
	350	* @param[in] stride_x convolution stride x
	351	* @param[in] stride_y convolution stride y
	352	* @param[in] bias pointer to bias
	353	* @param[in] bias_shift amount of left-shift for bias
	354	* @param[in] out_shift amount of right-shift for output
	355	* @param[in,out] Im_out pointer to output tensor
	356	* @param[in] dim_im_out_x output tensor dimension x
	357	* @param[in] dim_im_out_y output tensor dimension y
	358	* @param[in,out] bufferA pointer to buffer space for input
	359	* @param[in,out] bufferB pointer to buffer space for output
	360	* @return The function returns either
	361	* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
	362	*
	363	* This function implements convolution with 1x1 kernel size (i.e., dim_kernel_x=1
	364	* and dim_kernel_y=1). It can be used for
	365	* second half of MobileNets after depthwise separable convolution.
	366	*
	367	* This function is the version with full list of optimization tricks, but with
	368	* some constraints:
	369	* ch_im_in is multiple of 4
	370	* ch_im_out is multiple of 2
	371	*/
	372	arm_status arm_convolve_1x1_HWC_q7_fast_nonsquare(const q7_t * Im_in,
	373	const uint16_t dim_im_in_x,
	374	const uint16_t dim_im_in_y,
	375	const uint16_t ch_im_in,
	376	const q7_t * wt,
	377	const uint16_t ch_im_out,
	378	const uint16_t dim_kernel_x,
	379	const uint16_t dim_kernel_y,
	380	const uint16_t padding_x,
	381	const uint16_t padding_y,
	382	const uint16_t stride_x,
	383	const uint16_t stride_y,
	384	const q7_t * bias,
	385	const uint16_t bias_shift,
	386	const uint16_t out_shift,
	387	q7_t * Im_out,
	388	const uint16_t dim_im_out_x,
	389	const uint16_t dim_im_out_y,
	390	q15_t * bufferA,
	391	q7_t * bufferB);
	392
	393	/**
	394	* @brief Q7 version of convolution for RGB image
	395	* @param[in] Im_in pointer to input tensor
	396	* @param[in] dim_im_in input tensor dimension
	397	* @param[in] ch_im_in number of input tensor channels
	398	* @param[in] wt pointer to kernel weights
	399	* @param[in] ch_im_out number of filters, i.e., output tensor channels
	400	* @param[in] dim_kernel filter kernel size
	401	* @param[in] padding padding sizes
	402	* @param[in] stride convolution stride
	403	* @param[in] bias pointer to bias
	404	* @param[in] bias_shift amount of left-shift for bias
	405	* @param[in] out_shift amount of right-shift for output
	406	* @param[in,out] Im_out pointer to output tensor
	407	* @param[in] dim_im_out output tensor dimension
	408	* @param[in,out] bufferA pointer to buffer space for input
	409	* @param[in,out] bufferB pointer to buffer space for output
	410	* @return The function returns either
	411	* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
	412	*
	413	* This kernel is written exclusively for convolution with ch_im_in
	414	* equals 3. This applies on the first layer of CNNs which has input
	415	* image with RGB format.
	416	*/
	417
	418	arm_status arm_convolve_HWC_q7_RGB(const q7_t * Im_in,
	419	const uint16_t dim_im_in,
	420	const uint16_t ch_im_in,
	421	const q7_t * wt,
	422	const uint16_t ch_im_out,
	423	const uint16_t dim_kernel,
	424	const uint16_t padding,
	425	const uint16_t stride,
	426	const q7_t * bias,
	427	const uint16_t bias_shift,
	428	const uint16_t out_shift,
	429	q7_t * Im_out,
	430	const uint16_t dim_im_out,
	431	q15_t * bufferA,
	432	q7_t * bufferB);
	433
	434	/**
	435	* @brief Fast Q15 convolution function
	436	* @param[in] Im_in pointer to input tensor
	437	* @param[in] dim_im_in input tensor dimension
	438	* @param[in] ch_im_in number of input tensor channels
	439	* @param[in] wt pointer to kernel weights
	440	* @param[in] ch_im_out number of filters, i.e., output tensor channels
	441	* @param[in] dim_kernel filter kernel size
	442	* @param[in] padding padding sizes
	443	* @param[in] stride convolution stride
	444	* @param[in] bias pointer to bias
	445	* @param[in] bias_shift amount of left-shift for bias
	446	* @param[in] out_shift amount of right-shift for output
	447	* @param[in,out] Im_out pointer to output tensor
	448	* @param[in] dim_im_out output tensor dimension
	449	* @param[in,out] bufferA pointer to buffer space for input
	450	* @param[in,out] bufferB pointer to buffer space for output
	451	* @return The function returns either
	452	* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
	453	*
	454	* This function is the version with full list of optimization tricks, but with
	455	* some constraints:
	456	* ch_im_in is multiple of 2
	457	* ch_im_out is multiple of 2
	458	*/
	459
	460	arm_status arm_convolve_HWC_q15_fast(const q15_t * Im_in,
	461	const uint16_t dim_im_in,
	462	const uint16_t ch_im_in,
	463	const q15_t * wt,
	464	const uint16_t ch_im_out,
	465	const uint16_t dim_kernel,
	466	const uint16_t padding,
	467	const uint16_t stride,
	468	const q15_t * bias,
	469	const uint16_t bias_shift,
	470	const uint16_t out_shift,
	471	q15_t * Im_out,
	472	const uint16_t dim_im_out,
	473	q15_t * bufferA,
	474	q7_t * bufferB);
	475
	476	/**
	477	* @brief Fast Q15 convolution function (non-square shape)
	478	* @param[in] Im_in pointer to input tensor
	479	* @param[in] dim_im_in_x input tensor dimension x
	480	* @param[in] dim_im_in_y input tensor dimension y
	481	* @param[in] ch_im_in number of input tensor channels
	482	* @param[in] wt pointer to kernel weights
	483	* @param[in] ch_im_out number of filters, i.e., output tensor channels
	484	* @param[in] dim_kernel_x filter kernel size x
	485	* @param[in] dim_kernel_y filter kernel size y
	486	* @param[in] padding_x padding size x
	487	* @param[in] padding_y padding size y
	488	* @param[in] stride_x convolution stride x
	489	* @param[in] stride_y convolution stride y
	490	* @param[in] bias pointer to bias
	491	* @param[in] bias_shift amount of left-shift for bias
	492	* @param[in] out_shift amount of right-shift for output
	493	* @param[in,out] Im_out pointer to output tensor
	494	* @param[in] dim_im_out_x output tensor dimension x
	495	* @param[in] dim_im_out_y output tensor dimension y
	496	* @param[in,out] bufferA pointer to buffer space for input
	497	* @param[in,out] bufferB pointer to buffer space for output
	498	* @return The function returns either
	499	* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
	500	*
	501	* @details
	502	*
	503	* <b>Buffer size:</b>
	504	*
	505	* bufferA size: 2*ch_im_in*dim_kernel*dim_kernel
	506	*
	507	* bufferB size: 0
	508	*
	509	* <b>Input dimension constraints:</b>
	510	*
	511	* ch_im_in is multiple of 2
	512	*
	513	* ch_im_out is multiple of 2
	514	*
	515	*/
	516
	517	arm_status
	518	arm_convolve_HWC_q15_fast_nonsquare(const q15_t * Im_in,
	519	const uint16_t dim_im_in_x,
	520	const uint16_t dim_im_in_y,
	521	const uint16_t ch_im_in,
	522	const q15_t * wt,
	523	const uint16_t ch_im_out,
	524	const uint16_t dim_kernel_x,
	525	const uint16_t dim_kernel_y,
	526	const uint16_t padding_x,
	527	const uint16_t padding_y,
	528	const uint16_t stride_x,
	529	const uint16_t stride_y,
	530	const q15_t * bias,
	531	const uint16_t bias_shift,
	532	const uint16_t out_shift,
	533	q15_t * Im_out,
	534	const uint16_t dim_im_out_x,
	535	const uint16_t dim_im_out_y,
	536	q15_t * bufferA,
	537	q7_t * bufferB);
	538
	539	/**
	540	* @brief Q7 depthwise separable convolution function
	541	* @param[in] Im_in pointer to input tensor
	542	* @param[in] dim_im_in input tensor dimension
	543	* @param[in] ch_im_in number of input tensor channels
	544	* @param[in] wt pointer to kernel weights
	545	* @param[in] ch_im_out number of filters, i.e., output tensor channels
	546	* @param[in] dim_kernel filter kernel size
	547	* @param[in] padding padding sizes
	548	* @param[in] stride convolution stride
	549	* @param[in] bias pointer to bias
	550	* @param[in] bias_shift amount of left-shift for bias
	551	* @param[in] out_shift amount of right-shift for output
	552	* @param[in,out] Im_out pointer to output tensor
	553	* @param[in] dim_im_out output tensor dimension
	554	* @param[in,out] bufferA pointer to buffer space for input
	555	* @param[in,out] bufferB pointer to buffer space for output
	556	* @return The function returns either
	557	* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
	558	*
	559	* This function is the version with full list of optimization tricks, but with
	560	* some constraints:
	561	* ch_im_in is multiple of 2
	562	* ch_im_out is multiple of 2
	563	*/
	564
	565	arm_status arm_depthwise_separable_conv_HWC_q7(const q7_t * Im_in,
	566	const uint16_t dim_im_in,
	567	const uint16_t ch_im_in,
	568	const q7_t * wt,
	569	const uint16_t ch_im_out,
	570	const uint16_t dim_kernel,
	571	const uint16_t padding,
	572	const uint16_t stride,
	573	const q7_t * bias,
	574	const uint16_t bias_shift,
	575	const uint16_t out_shift,
	576	q7_t * Im_out,
	577	const uint16_t dim_im_out,
	578	q15_t * bufferA,
	579	q7_t * bufferB);
	580
	581	/**
	582	* @brief Q7 depthwise separable convolution function (non-square shape)
	583	* @param[in] Im_in pointer to input tensor
	584	* @param[in] dim_im_in_x input tensor dimension x
	585	* @param[in] dim_im_in_y input tensor dimension y
	586	* @param[in] ch_im_in number of input tensor channels
	587	* @param[in] wt pointer to kernel weights
	588	* @param[in] ch_im_out number of filters, i.e., output tensor channels
	589	* @param[in] dim_kernel_x filter kernel size x
	590	* @param[in] dim_kernel_y filter kernel size y
	591	* @param[in] padding_x padding sizes x
	592	* @param[in] padding_y padding sizes y
	593	* @param[in] stride_x convolution stride x
	594	* @param[in] stride_y convolution stride y
	595	* @param[in] bias pointer to bias
	596	* @param[in] bias_shift amount of left-shift for bias
	597	* @param[in] out_shift amount of right-shift for output
	598	* @param[in,out] Im_out pointer to output tensor
	599	* @param[in] dim_im_out_x output tensor dimension x
	600	* @param[in] dim_im_out_y output tensor dimension y
	601	* @param[in,out] bufferA pointer to buffer space for input
	602	* @param[in,out] bufferB pointer to buffer space for output
	603	* @return The function returns either
	604	* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
	605	*
	606	* This function is the version with full list of optimization tricks, but with
	607	* some constraints:
	608	* ch_im_in is multiple of 2
	609	* ch_im_out is multiple of 2
	610	*/
	611	arm_status arm_depthwise_separable_conv_HWC_q7_nonsquare(const q7_t * Im_in,
	612	const uint16_t dim_im_in_x,
	613	const uint16_t dim_im_in_y,
	614	const uint16_t ch_im_in,
	615	const q7_t * wt,
	616	const uint16_t ch_im_out,
	617	const uint16_t dim_kernel_x,
	618	const uint16_t dim_kernel_y,
	619	const uint16_t padding_x,
	620	const uint16_t padding_y,
	621	const uint16_t stride_x,
	622	const uint16_t stride_y,
	623	const q7_t * bias,
	624	const uint16_t bias_shift,
	625	const uint16_t out_shift,
	626	q7_t * Im_out,
	627	const uint16_t dim_im_out_x,
	628	const uint16_t dim_im_out_y,
	629	q15_t * bufferA,
	630	q7_t * bufferB);
	631
	632
	633	/**
	634	* @defgroup FC Fully-connected Layer Functions
	635	*
	636	* Perform fully-connected layer
	637	*
	638	* Fully-connected layer is basically a matrix-vector multiplication
	639	* with bias. The matrix is the weights and the input/output vectors
	640	* are the activation values. Supported {weight, activation} precisions
	641	* include {8-bit, 8-bit}, {16-bit, 16-bit}, and {8-bit, 16-bit}.
	642	*
	643	* Here we have two types of kernel functions. The basic function
	644	* implements the function using regular GEMV approach. The opt functions
	645	* operate with weights in interleaved formats.
	646	*
	647	*/
	648
	649	/**
	650	* @brief Q7 basic fully-connected layer function
	651	* @param[in] pV pointer to input vector
	652	* @param[in] pM pointer to matrix weights
	653	* @param[in] dim_vec length of the vector
	654	* @param[in] num_of_rows number of rows in weight matrix
	655	* @param[in] bias_shift amount of left-shift for bias
	656	* @param[in] out_shift amount of right-shift for output
	657	* @param[in] bias pointer to bias
	658	* @param[in,out] pOut pointer to output vector
	659	* @param[in,out] vec_buffer pointer to buffer space for input
	660	* @return The function returns <code>ARM_MATH_SUCCESS</code>
	661	*
	662	*/
	663
	664	arm_status arm_fully_connected_q7(const q7_t * pV,
	665	const q7_t * pM,
	666	const uint16_t dim_vec,
	667	const uint16_t num_of_rows,
	668	const uint16_t bias_shift,
	669	const uint16_t out_shift,
	670	const q7_t * bias,
	671	q7_t * pOut,
	672	q15_t * vec_buffer);
	673
	674	/**
	675	* @brief Q7 opt fully-connected layer function
	676	* @param[in] pV pointer to input vector
	677	* @param[in] pM pointer to matrix weights
	678	* @param[in] dim_vec length of the vector
	679	* @param[in] num_of_rows number of rows in weight matrix
	680	* @param[in] bias_shift amount of left-shift for bias
	681	* @param[in] out_shift amount of right-shift for output
	682	* @param[in] bias pointer to bias
	683	* @param[in,out] pOut pointer to output vector
	684	* @param[in,out] vec_buffer pointer to buffer space for input
	685	* @return The function returns <code>ARM_MATH_SUCCESS</code>
	686	*
	687	*/
	688
	689	arm_status arm_fully_connected_q7_opt(const q7_t * pV,
	690	const q7_t * pM,
	691	const uint16_t dim_vec,
	692	const uint16_t num_of_rows,
	693	const uint16_t bias_shift,
	694	const uint16_t out_shift,
	695	const q7_t * bias,
	696	q7_t * pOut,
	697	q15_t * vec_buffer);
	698
	699	/**
	700	* @brief Q15 basic fully-connected layer function
	701	* @param[in] pV pointer to input vector
	702	* @param[in] pM pointer to matrix weights
	703	* @param[in] dim_vec length of the vector
	704	* @param[in] num_of_rows number of rows in weight matrix
	705	* @param[in] bias_shift amount of left-shift for bias
	706	* @param[in] out_shift amount of right-shift for output
	707	* @param[in] bias pointer to bias
	708	* @param[in,out] pOut pointer to output vector
	709	* @param[in,out] vec_buffer pointer to buffer space for input
	710	* @return The function returns <code>ARM_MATH_SUCCESS</code>
	711	*
	712	*/
	713
	714	arm_status arm_fully_connected_q15(const q15_t * pV,
	715	const q15_t * pM,
	716	const uint16_t dim_vec,
	717	const uint16_t num_of_rows,
	718	const uint16_t bias_shift,
	719	const uint16_t out_shift,
	720	const q15_t * bias,
	721	q15_t * pOut,
	722	q15_t * vec_buffer);
	723
	724	/**
	725	* @brief Q15 opt fully-connected layer function
	726	* @param[in] pV pointer to input vector
	727	* @param[in] pM pointer to matrix weights
	728	* @param[in] dim_vec length of the vector
	729	* @param[in] num_of_rows number of rows in weight matrix
	730	* @param[in] bias_shift amount of left-shift for bias
	731	* @param[in] out_shift amount of right-shift for output
	732	* @param[in] bias pointer to bias
	733	* @param[in,out] pOut pointer to output vector
	734	* @param[in,out] vec_buffer pointer to buffer space for input
	735	* @return The function returns <code>ARM_MATH_SUCCESS</code>
	736	*
	737	*/
	738
	739	arm_status arm_fully_connected_q15_opt(const q15_t * pV,
	740	const q15_t * pM,
	741	const uint16_t dim_vec,
	742	const uint16_t num_of_rows,
	743	const uint16_t bias_shift,
	744	const uint16_t out_shift,
	745	const q15_t * bias,
	746	q15_t * pOut,
	747	q15_t * vec_buffer);
	748
	749	/**
	750	* @brief Mixed Q15-Q7 fully-connected layer function
	751	* @param[in] pV pointer to input vector
	752	* @param[in] pM pointer to matrix weights
	753	* @param[in] dim_vec length of the vector
	754	* @param[in] num_of_rows number of rows in weight matrix
	755	* @param[in] bias_shift amount of left-shift for bias
	756	* @param[in] out_shift amount of right-shift for output
	757	* @param[in] bias pointer to bias
	758	* @param[in,out] pOut pointer to output vector
	759	* @param[in,out] vec_buffer pointer to buffer space for input
	760	* @return The function returns <code>ARM_MATH_SUCCESS</code>
	761	*
	762	*/
	763
	764	arm_status arm_fully_connected_mat_q7_vec_q15(const q15_t * pV,
	765	const q7_t * pM,
	766	const uint16_t dim_vec,
	767	const uint16_t num_of_rows,
	768	const uint16_t bias_shift,
	769	const uint16_t out_shift,
	770	const q7_t * bias,
	771	q15_t * pOut,
	772	q15_t * vec_buffer);
	773
	774	/**
	775	* @brief Mixed Q15-Q7 opt fully-connected layer function
	776	* @param[in] pV pointer to input vector
	777	* @param[in] pM pointer to matrix weights
	778	* @param[in] dim_vec length of the vector
	779	* @param[in] num_of_rows number of rows in weight matrix
	780	* @param[in] bias_shift amount of left-shift for bias
	781	* @param[in] out_shift amount of right-shift for output
	782	* @param[in] bias pointer to bias
	783	* @param[in,out] pOut pointer to output vector
	784	* @param[in,out] vec_buffer pointer to buffer space for input
	785	* @return The function returns <code>ARM_MATH_SUCCESS</code>
	786	*
	787	*/
	788
	789	arm_status arm_fully_connected_mat_q7_vec_q15_opt(const q15_t * pV,
	790	const q7_t * pM,
	791	const uint16_t dim_vec,
	792	const uint16_t num_of_rows,
	793	const uint16_t bias_shift,
	794	const uint16_t out_shift,
	795	const q7_t * bias,
	796	q15_t * pOut,
	797	q15_t * vec_buffer);
	798
	799	/**
	800	* @brief Matrix-Multiplication Kernels for Convolution
	801	*
	802	* These functions are used within convolution layer functions for
	803	* matrix multiplication.
	804	*
	805	* The implementation is similar to CMSIS-DSP arm_mat_mult functions
	806	* with one Q7 and one Q15 operands. The Q15 operand is the im2col
	807	* output which is always with 2 columns.
	808	*
	809	*/
	810
	811	/**
	812	* @brief Matrix-multiplication function for convolution
	813	* @param[in] pA pointer to operand A (Q7 matrix)
	814	* @param[in] pInBuffer pointer to operand B (Q15), always consists of 2 vectors
	815	* @param[in] ch_im_out number of rows of A
	816	* @param[in] numCol_A number of columns of A
	817	* @param[in] bias_shift amount of left-shift for bias
	818	* @param[in] out_shift amount of right-shift for output
	819	* @param[in] bias pointer to bias
	820	* @param[in,out] pOut pointer to output
	821	* @return The function returns the incremented output pointer
	822	*/
	823
	824	q7_t *arm_nn_mat_mult_kernel_q7_q15(const q7_t * pA,
	825	const q15_t * pInBuffer,
	826	const uint16_t ch_im_out,
	827	const uint16_t numCol_A,
	828	const uint16_t bias_shift,
	829	const uint16_t out_shift,
	830	const q7_t * bias,
	831	q7_t * pOut);
	832
	833	/**
	834	* @brief Matrix-multiplication function for convolution with reordered columns
	835	* @param[in] pA pointer to operand A (Q7 matrix)
	836	* @param[in] pInBuffer pointer to operand B (Q15), always consists of 2 vectors
	837	* @param[in] ch_im_out number of rows of A
	838	* @param[in] numCol_A number of columns of A
	839	* @param[in] bias_shift amount of left-shift for bias
	840	* @param[in] out_shift amount of right-shift for output
	841	* @param[in] bias pointer to bias
	842	* @param[in,out] pOut pointer to output
	843	* @return The function returns the incremented output pointer
	844	*/
	845
	846	q7_t *arm_nn_mat_mult_kernel_q7_q15_reordered(const q7_t * pA,
	847	const q15_t * pInBuffer,
	848	const uint16_t ch_im_out,
	849	const uint16_t numCol_A,
	850	const uint16_t bias_shift,
	851	const uint16_t out_shift,
	852	const q7_t * bias,
	853	q7_t * pOut);
	854
	855	#ifdef __cplusplus
	856	}
	857	#endif
	858
	859	/*
	860	* Other functions
	861	* These layers are typically not timing critical
	862	* Basic implementation is supported here
	863	*/
	864
	865	#ifdef __cplusplus
	866	extern "C"
	867	{
	868	#endif
	869
	870	/**
	871	* @defgroup Acti Neural Network Activation Functions
	872	*
	873	* Perform activation layers, including ReLU (Rectified Linear Unit),
	874	* sigmoid and tanh
	875	*
	876	*/
	877
	878	/**
	879	* @brief Q7 ReLU (Rectified Linear Unit) activation function
	880	* @param[in,out] data pointer to the input data (marked in,out; presumably updated in place - confirm against the implementation)
	881	* @param[in] size number of elements
	882	* @return none.
	883	*/
	884
	885	void arm_relu_q7(q7_t * data, uint16_t size);
	886
	887	/**
	888	* @brief Q15 ReLU (Rectified Linear Unit) activation function
	889	* @param[in,out] data pointer to the input data (marked in,out; presumably updated in place - confirm against the implementation)
	890	* @param[in] size number of elements
	891	* @return none.
	892	*/
	893
	894	void arm_relu_q15(q15_t * data, uint16_t size);
	895
	896	/**
	897	* @brief Q7 neural network activation function using direct table look-up
	898	* @param[in,out] data pointer to input
	899	* @param[in] size number of elements
	900	* @param[in] int_width bit-width of the integer part, assumed to be smaller than 3
	901	* @param[in] type type of activation function (an arm_nn_activation_type value)
	902	* @return none.
	903	*/
	904
	905	void arm_nn_activations_direct_q7(q7_t * data, uint16_t size, uint16_t int_width,
	906	arm_nn_activation_type type);
	907
	908	/**
	909	* @brief Q15 neural network activation function using direct table look-up
	910	* @param[in,out] data pointer to input
	911	* @param[in] size number of elements
	912	* @param[in] int_width bit-width of the integer part, assumed to be smaller than 3
	913	* @param[in] type type of activation function (an arm_nn_activation_type value)
	914	* @return none.
	915	*/
	916
	917	void arm_nn_activations_direct_q15(q15_t * data, uint16_t size, uint16_t int_width,
	918	arm_nn_activation_type type);
	919
	920	/**
	921	* @defgroup Pooling Neural Network Pooling Functions
	922	*
	923	* Perform pooling functions, including max pooling and average pooling
	924	*
	925	*/
	926
	927	/**
	928	* @brief Q7 max pooling function
	929	* @param[in] Im_in pointer to input tensor (NOTE(review): declared non-const - confirm whether the input is modified)
	930	* @param[in] dim_im_in input tensor dimension
	931	* @param[in] ch_im_in number of input tensor channels
	932	* @param[in] dim_kernel filter kernel size
	933	* @param[in] padding padding sizes
	934	* @param[in] stride convolution stride
	935	* @param[in] dim_im_out output tensor dimension
	936	* @param[in,out] bufferA pointer to buffer space for input
	937	* @param[in,out] Im_out pointer to output tensor
	938	* @return none.
	939	*
	940	*/
	941
	942	void arm_maxpool_q7_HWC(q7_t * Im_in,
	943	const uint16_t dim_im_in,
	944	const uint16_t ch_im_in,
	945	const uint16_t dim_kernel,
	946	const uint16_t padding,
	947	const uint16_t stride,
	948	const uint16_t dim_im_out,
	949	q7_t * bufferA,
	950	q7_t * Im_out);
	951
	952	/**
	953	* @brief Q7 average pooling function
	954	* @param[in] Im_in pointer to input tensor (NOTE(review): declared non-const - confirm whether the input is modified)
	955	* @param[in] dim_im_in input tensor dimension
	956	* @param[in] ch_im_in number of input tensor channels
	957	* @param[in] dim_kernel filter kernel size
	958	* @param[in] padding padding sizes
	959	* @param[in] stride convolution stride
	960	* @param[in] dim_im_out output tensor dimension
	961	* @param[in,out] bufferA pointer to buffer space for input
	962	* @param[in,out] Im_out pointer to output tensor
	963	* @return none.
	964	*
	965	*/
	966
	967	void arm_avepool_q7_HWC(q7_t * Im_in,
	968	const uint16_t dim_im_in,
	969	const uint16_t ch_im_in,
	970	const uint16_t dim_kernel,
	971	const uint16_t padding,
	972	const uint16_t stride,
	973	const uint16_t dim_im_out,
	974	q7_t * bufferA,
	975	q7_t * Im_out);
	976
	977	/**
	978	* @defgroup Softmax Softmax Functions
	979	*
	980	* EXP(2) based softmax function
	981	*
	982	*/
	983
	984	/**
	985	* @brief Q7 softmax function
	986	* @param[in] vec_in pointer to input vector
	987	* @param[in] dim_vec input vector dimension
	988	* @param[out] p_out pointer to output vector
	989	* @return none.
	990	*
	991	*/
	992
	993	void arm_softmax_q7(const q7_t * vec_in, const uint16_t dim_vec, q7_t * p_out);
	994
	995	/**
	996	* @brief Q15 softmax function
	997	* @param[in] vec_in pointer to input vector
	998	* @param[in] dim_vec input vector dimension
	999	* @param[out] p_out pointer to output vector
	1000	* @return none.
	1001	*
	1002	*/
	1003
	1004	void arm_softmax_q15(const q15_t * vec_in, const uint16_t dim_vec, q15_t * p_out);
	1005
	1006	#ifdef __cplusplus
	1007	}
	1008	#endif
	1009
	1010	#endif

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: S-port/trunk/Drivers/CMSIS/NN/Include/arm_nnfunctions.h

Download in other formats: