Context Navigation

arm_nnfunctions.h

Last change on this file was 1, checked in by AlexLir, 3 years ago

File size: 48.5 KB

Line
1	/*
2	* Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
3	*
4	* SPDX-License-Identifier: Apache-2.0
5	*
6	* Licensed under the Apache License, Version 2.0 (the License); you may
7	* not use this file except in compliance with the License.
8	* You may obtain a copy of the License at
9	*
10	* www.apache.org/licenses/LICENSE-2.0
11	*
12	* Unless required by applicable law or agreed to in writing, software
13	* distributed under the License is distributed on an AS IS BASIS, WITHOUT
14	* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15	* See the License for the specific language governing permissions and
16	* limitations under the License.
17	*/
18
19	/* ----------------------------------------------------------------------
20	* Project: CMSIS NN Library
21	* Title: arm_nnfunctions.h
22	* Description: Public header file for CMSIS NN Library
23	*
24	* $Date: 13. July 2018
25	* $Revision: V.1.0.0
26	*
27	* Target Processor: Cortex-M cores
28	* -------------------------------------------------------------------- */
29
30	/**
31	\mainpage CMSIS NN Software Library
32	*
33	* Introduction
34	* ------------
35	*
36	* This user manual describes the CMSIS NN software library,
37	* a collection of efficient neural network kernels developed to maximize the
38	* performance and minimize the memory footprint of neural networks on Cortex-M processor cores.
39	*
40	* The library is divided into a number of functions each covering a specific category:
41	* - Neural Network Convolution Functions
42	* - Neural Network Activation Functions
43	* - Fully-connected Layer Functions
44	* - Neural Network Pooling Functions
45	* - Softmax Functions
46	* - Neural Network Support Functions
47	*
48	* The library has separate functions for operating on different weight and activation data
49	* types including 8-bit integers (q7_t) and 16-bit integers (q15_t). The description of the
50	* kernels is included in the function description. The implementation details are also
51	* described in this paper [1].
52	*
53	* Block Diagram
54	* --------
55	* \image html CMSIS-NN-OVERVIEW.PNG
56	*
57	* Examples
58	* --------
59	*
60	* The library ships with a number of examples which demonstrate how to use the library functions.
61	*
62	* Pre-processor Macros
63	* ------------
64	*
65	* Each library project has different pre-processor macros.
66	*
67	* - ARM_MATH_DSP:
68	*
69	* Define macro ARM_MATH_DSP if the silicon supports DSP instructions.
70	*
71	* - ARM_MATH_BIG_ENDIAN:
72	*
73	* Define macro ARM_MATH_BIG_ENDIAN to build the library for big endian targets. By default the library builds for little endian targets.
74	*
75	* - ARM_NN_TRUNCATE:
76	*
77	* Define macro ARM_NN_TRUNCATE to use floor instead of round-to-the-nearest-int for the computation.
78	*
79	* Copyright Notice
80	* ------------
81	*
82	* Copyright (C) 2010-2018 Arm Limited. All rights reserved.
83	*
84	* [1] CMSIS-NN: Efficient Neural Network Kernels for Arm Cortex-M CPUs https://arxiv.org/abs/1801.06601
85	*/
86
87	/**
88	* @defgroup groupNN Neural Network Functions
89	* These functions perform basic operations for neural network layers.
90	*/
91
92	#ifndef _ARM_NNFUNCTIONS_H
93	#define _ARM_NNFUNCTIONS_H
94
95	#include "arm_nnsupportfunctions.h"
96	#include "arm_nn_tables.h"
97
98	#define USE_INTRINSIC
99
100	//#define ARM_NN_TRUNCATE /* This configures the rounding mode: floor instead of round to the nearest int */
101
102	#ifdef __cplusplus
103	extern "C"
104	{
105	#endif
106
107	/**
108	* @defgroup NNConv Neural Network Convolution Functions
109	*
110	* Perform convolution layer
111	*
112	* The convolution is implemented in 2 steps: im2col and GEMM
113	*
114	* im2col is a process of converting each patch of image data into
115	* a column. After im2col, the convolution is computed as matrix-matrix
116	* multiplication.
117	*
118	* To reduce the memory footprint, the im2col is performed partially.
119	* In each iteration, only a few columns (i.e., patches) are generated and
120	* computed with GEMM kernels similar to CMSIS-DSP arm_mat_mult functions.
121	*
122	*/
123
124	/**
125	* @brief Basic Q7 convolution function
126	* @param[in] Im_in pointer to input tensor
127	* @param[in] dim_im_in input tensor dimension
128	* @param[in] ch_im_in number of input tensor channels
129	* @param[in] wt pointer to kernel weights
130	* @param[in] ch_im_out number of filters, i.e., output tensor channels
131	* @param[in] dim_kernel filter kernel size
132	* @param[in] padding padding sizes
133	* @param[in] stride convolution stride
134	* @param[in] bias pointer to bias
135	* @param[in] bias_shift amount of left-shift for bias
136	* @param[in] out_shift amount of right-shift for output
137	* @param[in,out] Im_out pointer to output tensor
138	* @param[in] dim_im_out output tensor dimension
139	* @param[in,out] bufferA pointer to buffer space for input
140	* @param[in,out] bufferB pointer to buffer space for output
141	* @return The function returns <code>ARM_MATH_SUCCESS</code>
142	*
143	*/
144
145	arm_status arm_convolve_HWC_q7_basic(const q7_t * Im_in,
146	const uint16_t dim_im_in,
147	const uint16_t ch_im_in,
148	const q7_t * wt,
149	const uint16_t ch_im_out,
150	const uint16_t dim_kernel,
151	const uint16_t padding,
152	const uint16_t stride,
153	const q7_t * bias,
154	const uint16_t bias_shift,
155	const uint16_t out_shift,
156	q7_t * Im_out,
157	const uint16_t dim_im_out,
158	q15_t * bufferA,
159	q7_t * bufferB);
160
161	/**
162	* @brief Basic Q7 convolution function (non-square shape)
163	* @param[in] Im_in pointer to input tensor
164	* @param[in] dim_im_in_x input tensor dimension x
165	* @param[in] dim_im_in_y input tensor dimension y
166	* @param[in] ch_im_in number of input tensor channels
167	* @param[in] wt pointer to kernel weights
168	* @param[in] ch_im_out number of filters, i.e., output tensor channels
169	* @param[in] dim_kernel_x filter kernel size x
170	* @param[in] dim_kernel_y filter kernel size y
171	* @param[in] padding_x padding size x
172	* @param[in] padding_y padding size y
173	* @param[in] stride_x convolution stride x
174	* @param[in] stride_y convolution stride y
175	* @param[in] bias pointer to bias
176	* @param[in] bias_shift amount of left-shift for bias
177	* @param[in] out_shift amount of right-shift for output
178	* @param[in,out] Im_out pointer to output tensor
179	* @param[in] dim_im_out_x output tensor dimension x
180	* @param[in] dim_im_out_y output tensor dimension y
181	* @param[in,out] bufferA pointer to buffer space for input
182	* @param[in,out] bufferB pointer to buffer space for output
183	* @return The function returns <code>ARM_MATH_SUCCESS</code>
184	*/
185
186	arm_status arm_convolve_HWC_q7_basic_nonsquare(const q7_t * Im_in,
187	const uint16_t dim_im_in_x,
188	const uint16_t dim_im_in_y,
189	const uint16_t ch_im_in,
190	const q7_t * wt,
191	const uint16_t ch_im_out,
192	const uint16_t dim_kernel_x,
193	const uint16_t dim_kernel_y,
194	const uint16_t padding_x,
195	const uint16_t padding_y,
196	const uint16_t stride_x,
197	const uint16_t stride_y,
198	const q7_t * bias,
199	const uint16_t bias_shift,
200	const uint16_t out_shift,
201	q7_t * Im_out,
202	const uint16_t dim_im_out_x,
203	const uint16_t dim_im_out_y,
204	q15_t * bufferA,
205	q7_t * bufferB);
206
207	/**
208	* @brief Basic Q15 convolution function
209	* @param[in] Im_in pointer to input tensor
210	* @param[in] dim_im_in input tensor dimension
211	* @param[in] ch_im_in number of input tensor channels
212	* @param[in] wt pointer to kernel weights
213	* @param[in] ch_im_out number of filters, i.e., output tensor channels
214	* @param[in] dim_kernel filter kernel size
215	* @param[in] padding padding sizes
216	* @param[in] stride convolution stride
217	* @param[in] bias pointer to bias
218	* @param[in] bias_shift amount of left-shift for bias
219	* @param[in] out_shift amount of right-shift for output
220	* @param[in,out] Im_out pointer to output tensor
221	* @param[in] dim_im_out output tensor dimension
222	* @param[in,out] bufferA pointer to buffer space for input
223	* @param[in,out] bufferB pointer to buffer space for output
224	* @return The function returns <code>ARM_MATH_SUCCESS</code>
225	*
226	*/
227
228	arm_status arm_convolve_HWC_q15_basic(const q15_t * Im_in,
229	const uint16_t dim_im_in,
230	const uint16_t ch_im_in,
231	const q15_t * wt,
232	const uint16_t ch_im_out,
233	const uint16_t dim_kernel,
234	const uint16_t padding,
235	const uint16_t stride,
236	const q15_t * bias,
237	const uint16_t bias_shift,
238	const uint16_t out_shift,
239	q15_t * Im_out,
240	const uint16_t dim_im_out,
241	q15_t * bufferA,
242	q7_t * bufferB);
243
244	/**
245	* @brief Fast Q7 convolution function
246	* @param[in] Im_in pointer to input tensor
247	* @param[in] dim_im_in input tensor dimension
248	* @param[in] ch_im_in number of input tensor channels
249	* @param[in] wt pointer to kernel weights
250	* @param[in] ch_im_out number of filters, i.e., output tensor channels
251	* @param[in] dim_kernel filter kernel size
252	* @param[in] padding padding sizes
253	* @param[in] stride convolution stride
254	* @param[in] bias pointer to bias
255	* @param[in] bias_shift amount of left-shift for bias
256	* @param[in] out_shift amount of right-shift for output
257	* @param[in,out] Im_out pointer to output tensor
258	* @param[in] dim_im_out output tensor dimension
259	* @param[in,out] bufferA pointer to buffer space for input
260	* @param[in,out] bufferB pointer to buffer space for output
261	* @return The function returns either
262	* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
263	*
264	* This function is the version with full list of optimization tricks, but with
265	* some constraints:
266	* ch_im_in is multiple of 4
267	* ch_im_out is multiple of 2
268	*/
269
270	arm_status arm_convolve_HWC_q7_fast(const q7_t * Im_in,
271	const uint16_t dim_im_in,
272	const uint16_t ch_im_in,
273	const q7_t * wt,
274	const uint16_t ch_im_out,
275	const uint16_t dim_kernel,
276	const uint16_t padding,
277	const uint16_t stride,
278	const q7_t * bias,
279	const uint16_t bias_shift,
280	const uint16_t out_shift,
281	q7_t * Im_out,
282	const uint16_t dim_im_out,
283	q15_t * bufferA,
284	q7_t * bufferB);
285
286	/**
287	* @brief Fast Q7 convolution function (non-square shape)
288	* @param[in] Im_in pointer to input tensor
289	* @param[in] dim_im_in_x input tensor dimension x
290	* @param[in] dim_im_in_y input tensor dimension y
291	* @param[in] ch_im_in number of input tensor channels
292	* @param[in] wt pointer to kernel weights
293	* @param[in] ch_im_out number of filters, i.e., output tensor channels
294	* @param[in] dim_kernel_x filter kernel size x
295	* @param[in] dim_kernel_y filter kernel size y
296	* @param[in] padding_x padding size x
297	* @param[in] padding_y padding size y
298	* @param[in] stride_x convolution stride x
299	* @param[in] stride_y convolution stride y
300	* @param[in] bias pointer to bias
301	* @param[in] bias_shift amount of left-shift for bias
302	* @param[in] out_shift amount of right-shift for output
303	* @param[in,out] Im_out pointer to output tensor
304	* @param[in] dim_im_out_x output tensor dimension x
305	* @param[in] dim_im_out_y output tensor dimension y
306	* @param[in,out] bufferA pointer to buffer space for input
307	* @param[in,out] bufferB pointer to buffer space for output
308	* @return The function returns either
309	* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
310	*
311	* This function is the version with full list of optimization tricks, but with
312	* some constraints:
313	* ch_im_in is multiple of 4
314	* ch_im_out is multiple of 2
315	*/
316
317	arm_status arm_convolve_HWC_q7_fast_nonsquare(const q7_t * Im_in,
318	const uint16_t dim_im_in_x,
319	const uint16_t dim_im_in_y,
320	const uint16_t ch_im_in,
321	const q7_t * wt,
322	const uint16_t ch_im_out,
323	const uint16_t dim_kernel_x,
324	const uint16_t dim_kernel_y,
325	const uint16_t padding_x,
326	const uint16_t padding_y,
327	const uint16_t stride_x,
328	const uint16_t stride_y,
329	const q7_t * bias,
330	const uint16_t bias_shift,
331	const uint16_t out_shift,
332	q7_t * Im_out,
333	const uint16_t dim_im_out_x,
334	const uint16_t dim_im_out_y,
335	q15_t * bufferA,
336	q7_t * bufferB);
337
338	/**
339	* @brief Fast Q7 version of 1x1 convolution (non-square shape)
340	* @param[in] Im_in pointer to input tensor
341	* @param[in] dim_im_in_x input tensor dimension x
342	* @param[in] dim_im_in_y input tensor dimension y
343	* @param[in] ch_im_in number of input tensor channels
344	* @param[in] wt pointer to kernel weights
345	* @param[in] ch_im_out number of filters, i.e., output tensor channels
346	* @param[in] dim_kernel_x filter kernel size x
347	* @param[in] dim_kernel_y filter kernel size y
348	* @param[in] padding_x padding size x
349	* @param[in] padding_y padding size y
350	* @param[in] stride_x convolution stride x
351	* @param[in] stride_y convolution stride y
352	* @param[in] bias pointer to bias
353	* @param[in] bias_shift amount of left-shift for bias
354	* @param[in] out_shift amount of right-shift for output
355	* @param[in,out] Im_out pointer to output tensor
356	* @param[in] dim_im_out_x output tensor dimension x
357	* @param[in] dim_im_out_y output tensor dimension y
358	* @param[in,out] bufferA pointer to buffer space for input
359	* @param[in,out] bufferB pointer to buffer space for output
360	* @return The function returns either
361	* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
362	*
363	* This function implements convolution with 1x1 kernel size (i.e., dim_kernel_x=1
364	* and dim_kernel_y=1). It can be used for
365	* second half of MobileNets after depthwise separable convolution.
366	*
367	* This function is the version with full list of optimization tricks, but with
368	* some constraints:
369	* ch_im_in is multiple of 4
370	* ch_im_out is multiple of 2
371	*/
372	arm_status arm_convolve_1x1_HWC_q7_fast_nonsquare(const q7_t * Im_in,
373	const uint16_t dim_im_in_x,
374	const uint16_t dim_im_in_y,
375	const uint16_t ch_im_in,
376	const q7_t * wt,
377	const uint16_t ch_im_out,
378	const uint16_t dim_kernel_x,
379	const uint16_t dim_kernel_y,
380	const uint16_t padding_x,
381	const uint16_t padding_y,
382	const uint16_t stride_x,
383	const uint16_t stride_y,
384	const q7_t * bias,
385	const uint16_t bias_shift,
386	const uint16_t out_shift,
387	q7_t * Im_out,
388	const uint16_t dim_im_out_x,
389	const uint16_t dim_im_out_y,
390	q15_t * bufferA,
391	q7_t * bufferB);
392
393	/**
394	* @brief Q7 version of convolution for RGB image
395	* @param[in] Im_in pointer to input tensor
396	* @param[in] dim_im_in input tensor dimension
397	* @param[in] ch_im_in number of input tensor channels
398	* @param[in] wt pointer to kernel weights
399	* @param[in] ch_im_out number of filters, i.e., output tensor channels
400	* @param[in] dim_kernel filter kernel size
401	* @param[in] padding padding sizes
402	* @param[in] stride convolution stride
403	* @param[in] bias pointer to bias
404	* @param[in] bias_shift amount of left-shift for bias
405	* @param[in] out_shift amount of right-shift for output
406	* @param[in,out] Im_out pointer to output tensor
407	* @param[in] dim_im_out output tensor dimension
408	* @param[in,out] bufferA pointer to buffer space for input
409	* @param[in,out] bufferB pointer to buffer space for output
410	* @return The function returns either
411	* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
412	*
413	* This kernel is written exclusively for convolution with ch_im_in
414	* equal to 3. This applies to the first layer of CNNs which has input
415	* image with RGB format.
416	*/
417
418	arm_status arm_convolve_HWC_q7_RGB(const q7_t * Im_in,
419	const uint16_t dim_im_in,
420	const uint16_t ch_im_in,
421	const q7_t * wt,
422	const uint16_t ch_im_out,
423	const uint16_t dim_kernel,
424	const uint16_t padding,
425	const uint16_t stride,
426	const q7_t * bias,
427	const uint16_t bias_shift,
428	const uint16_t out_shift,
429	q7_t * Im_out,
430	const uint16_t dim_im_out,
431	q15_t * bufferA,
432	q7_t * bufferB);
433
434	/**
435	* @brief Fast Q15 convolution function
436	* @param[in] Im_in pointer to input tensor
437	* @param[in] dim_im_in input tensor dimension
438	* @param[in] ch_im_in number of input tensor channels
439	* @param[in] wt pointer to kernel weights
440	* @param[in] ch_im_out number of filters, i.e., output tensor channels
441	* @param[in] dim_kernel filter kernel size
442	* @param[in] padding padding sizes
443	* @param[in] stride convolution stride
444	* @param[in] bias pointer to bias
445	* @param[in] bias_shift amount of left-shift for bias
446	* @param[in] out_shift amount of right-shift for output
447	* @param[in,out] Im_out pointer to output tensor
448	* @param[in] dim_im_out output tensor dimension
449	* @param[in,out] bufferA pointer to buffer space for input
450	* @param[in,out] bufferB pointer to buffer space for output
451	* @return The function returns either
452	* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
453	*
454	* This function is the version with full list of optimization tricks, but with
455	* some constraints:
456	* ch_im_in is multiple of 2
457	* ch_im_out is multiple of 2
458	*/
459
460	arm_status arm_convolve_HWC_q15_fast(const q15_t * Im_in,
461	const uint16_t dim_im_in,
462	const uint16_t ch_im_in,
463	const q15_t * wt,
464	const uint16_t ch_im_out,
465	const uint16_t dim_kernel,
466	const uint16_t padding,
467	const uint16_t stride,
468	const q15_t * bias,
469	const uint16_t bias_shift,
470	const uint16_t out_shift,
471	q15_t * Im_out,
472	const uint16_t dim_im_out,
473	q15_t * bufferA,
474	q7_t * bufferB);
475
476	/**
477	* @brief Fast Q15 convolution function (non-square shape)
478	* @param[in] Im_in pointer to input tensor
479	* @param[in] dim_im_in_x input tensor dimension x
480	* @param[in] dim_im_in_y input tensor dimension y
481	* @param[in] ch_im_in number of input tensor channels
482	* @param[in] wt pointer to kernel weights
483	* @param[in] ch_im_out number of filters, i.e., output tensor channels
484	* @param[in] dim_kernel_x filter kernel size x
485	* @param[in] dim_kernel_y filter kernel size y
486	* @param[in] padding_x padding size x
487	* @param[in] padding_y padding size y
488	* @param[in] stride_x convolution stride x
489	* @param[in] stride_y convolution stride y
490	* @param[in] bias pointer to bias
491	* @param[in] bias_shift amount of left-shift for bias
492	* @param[in] out_shift amount of right-shift for output
493	* @param[in,out] Im_out pointer to output tensor
494	* @param[in] dim_im_out_x output tensor dimension x
495	* @param[in] dim_im_out_y output tensor dimension y
496	* @param[in,out] bufferA pointer to buffer space for input
497	* @param[in,out] bufferB pointer to buffer space for output
498	* @return The function returns either
499	* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
500	*
501	* @details
502	*
503	* <b>Buffer size:</b>
504	*
505	* bufferA size: 2*ch_im_in*dim_kernel_x*dim_kernel_y
506	*
507	* bufferB size: 0
508	*
509	* <b>Input dimension constraints:</b>
510	*
511	* ch_im_in is multiple of 2
512	*
513	* ch_im_out is multiple of 2
514	*
515	*/
516
517	arm_status
518	arm_convolve_HWC_q15_fast_nonsquare(const q15_t * Im_in,
519	const uint16_t dim_im_in_x,
520	const uint16_t dim_im_in_y,
521	const uint16_t ch_im_in,
522	const q15_t * wt,
523	const uint16_t ch_im_out,
524	const uint16_t dim_kernel_x,
525	const uint16_t dim_kernel_y,
526	const uint16_t padding_x,
527	const uint16_t padding_y,
528	const uint16_t stride_x,
529	const uint16_t stride_y,
530	const q15_t * bias,
531	const uint16_t bias_shift,
532	const uint16_t out_shift,
533	q15_t * Im_out,
534	const uint16_t dim_im_out_x,
535	const uint16_t dim_im_out_y,
536	q15_t * bufferA,
537	q7_t * bufferB);
538
539	/**
540	* @brief Q7 depthwise separable convolution function
541	* @param[in] Im_in pointer to input tensor
542	* @param[in] dim_im_in input tensor dimension
543	* @param[in] ch_im_in number of input tensor channels
544	* @param[in] wt pointer to kernel weights
545	* @param[in] ch_im_out number of filters, i.e., output tensor channels
546	* @param[in] dim_kernel filter kernel size
547	* @param[in] padding padding sizes
548	* @param[in] stride convolution stride
549	* @param[in] bias pointer to bias
550	* @param[in] bias_shift amount of left-shift for bias
551	* @param[in] out_shift amount of right-shift for output
552	* @param[in,out] Im_out pointer to output tensor
553	* @param[in] dim_im_out output tensor dimension
554	* @param[in,out] bufferA pointer to buffer space for input
555	* @param[in,out] bufferB pointer to buffer space for output
556	* @return The function returns either
557	* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
558	*
559	* This function is the version with full list of optimization tricks, but with
560	* some constraints:
561	* ch_im_in is multiple of 2
562	* ch_im_out is multiple of 2
563	*/
564
565	arm_status arm_depthwise_separable_conv_HWC_q7(const q7_t * Im_in,
566	const uint16_t dim_im_in,
567	const uint16_t ch_im_in,
568	const q7_t * wt,
569	const uint16_t ch_im_out,
570	const uint16_t dim_kernel,
571	const uint16_t padding,
572	const uint16_t stride,
573	const q7_t * bias,
574	const uint16_t bias_shift,
575	const uint16_t out_shift,
576	q7_t * Im_out,
577	const uint16_t dim_im_out,
578	q15_t * bufferA,
579	q7_t * bufferB);
580
581	/**
582	* @brief Q7 depthwise separable convolution function (non-square shape)
583	* @param[in] Im_in pointer to input tensor
584	* @param[in] dim_im_in_x input tensor dimension x
585	* @param[in] dim_im_in_y input tensor dimension y
586	* @param[in] ch_im_in number of input tensor channels
587	* @param[in] wt pointer to kernel weights
588	* @param[in] ch_im_out number of filters, i.e., output tensor channels
589	* @param[in] dim_kernel_x filter kernel size x
590	* @param[in] dim_kernel_y filter kernel size y
591	* @param[in] padding_x padding sizes x
592	* @param[in] padding_y padding sizes y
593	* @param[in] stride_x convolution stride x
594	* @param[in] stride_y convolution stride y
595	* @param[in] bias pointer to bias
596	* @param[in] bias_shift amount of left-shift for bias
597	* @param[in] out_shift amount of right-shift for output
598	* @param[in,out] Im_out pointer to output tensor
599	* @param[in] dim_im_out_x output tensor dimension x
600	* @param[in] dim_im_out_y output tensor dimension y
601	* @param[in,out] bufferA pointer to buffer space for input
602	* @param[in,out] bufferB pointer to buffer space for output
603	* @return The function returns either
604	* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
605	*
606	* This function is the version with full list of optimization tricks, but with
607	* some constraints:
608	* ch_im_in is multiple of 2
609	* ch_im_out is multiple of 2
610	*/
611	arm_status arm_depthwise_separable_conv_HWC_q7_nonsquare(const q7_t * Im_in,
612	const uint16_t dim_im_in_x,
613	const uint16_t dim_im_in_y,
614	const uint16_t ch_im_in,
615	const q7_t * wt,
616	const uint16_t ch_im_out,
617	const uint16_t dim_kernel_x,
618	const uint16_t dim_kernel_y,
619	const uint16_t padding_x,
620	const uint16_t padding_y,
621	const uint16_t stride_x,
622	const uint16_t stride_y,
623	const q7_t * bias,
624	const uint16_t bias_shift,
625	const uint16_t out_shift,
626	q7_t * Im_out,
627	const uint16_t dim_im_out_x,
628	const uint16_t dim_im_out_y,
629	q15_t * bufferA,
630	q7_t * bufferB);
631
632
633	/**
634	* @defgroup FC Fully-connected Layer Functions
635	*
636	* Perform fully-connected layer
637	*
638	* Fully-connected layer is basically a matrix-vector multiplication
639	* with bias. The matrix is the weights and the input/output vectors
640	* are the activation values. Supported {weight, activation} precisions
641	* include {8-bit, 8-bit}, {16-bit, 16-bit}, and {8-bit, 16-bit}.
642	*
643	* Here we have two types of kernel functions. The basic function
644	* implements the function using regular GEMV approach. The opt functions
645	* operate with weights in interleaved formats.
646	*
647	*/
648
649	/**
650	* @brief Q7 basic fully-connected layer function
651	* @param[in] pV pointer to input vector
652	* @param[in] pM pointer to matrix weights
653	* @param[in] dim_vec length of the vector
654	* @param[in] num_of_rows number of rows in weight matrix
655	* @param[in] bias_shift amount of left-shift for bias
656	* @param[in] out_shift amount of right-shift for output
657	* @param[in] bias pointer to bias
658	* @param[in,out] pOut pointer to output vector
659	* @param[in,out] vec_buffer pointer to buffer space for input
660	* @return The function returns <code>ARM_MATH_SUCCESS</code>
661	*
662	*/
663
664	arm_status arm_fully_connected_q7(const q7_t * pV,
665	const q7_t * pM,
666	const uint16_t dim_vec,
667	const uint16_t num_of_rows,
668	const uint16_t bias_shift,
669	const uint16_t out_shift,
670	const q7_t * bias,
671	q7_t * pOut,
672	q15_t * vec_buffer);
673
674	/**
675	* @brief Q7 opt fully-connected layer function
676	* @param[in] pV pointer to input vector
677	* @param[in] pM pointer to matrix weights
678	* @param[in] dim_vec length of the vector
679	* @param[in] num_of_rows number of rows in weight matrix
680	* @param[in] bias_shift amount of left-shift for bias
681	* @param[in] out_shift amount of right-shift for output
682	* @param[in] bias pointer to bias
683	* @param[in,out] pOut pointer to output vector
684	* @param[in,out] vec_buffer pointer to buffer space for input
685	* @return The function returns <code>ARM_MATH_SUCCESS</code>
686	*
687	*/
688
689	arm_status arm_fully_connected_q7_opt(const q7_t * pV,
690	const q7_t * pM,
691	const uint16_t dim_vec,
692	const uint16_t num_of_rows,
693	const uint16_t bias_shift,
694	const uint16_t out_shift,
695	const q7_t * bias,
696	q7_t * pOut,
697	q15_t * vec_buffer);
698
699	/**
700	* @brief Q15 basic fully-connected layer function
701	* @param[in] pV pointer to input vector
702	* @param[in] pM pointer to matrix weights
703	* @param[in] dim_vec length of the vector
704	* @param[in] num_of_rows number of rows in weight matrix
705	* @param[in] bias_shift amount of left-shift for bias
706	* @param[in] out_shift amount of right-shift for output
707	* @param[in] bias pointer to bias
708	* @param[in,out] pOut pointer to output vector
709	* @param[in,out] vec_buffer pointer to buffer space for input
710	* @return The function returns <code>ARM_MATH_SUCCESS</code>
711	*
712	*/
713
714	arm_status arm_fully_connected_q15(const q15_t * pV,
715	const q15_t * pM,
716	const uint16_t dim_vec,
717	const uint16_t num_of_rows,
718	const uint16_t bias_shift,
719	const uint16_t out_shift,
720	const q15_t * bias,
721	q15_t * pOut,
722	q15_t * vec_buffer);
723
724	/**
725	* @brief Q15 opt fully-connected layer function
726	* @param[in] pV pointer to input vector
727	* @param[in] pM pointer to matrix weights
728	* @param[in] dim_vec length of the vector
729	* @param[in] num_of_rows number of rows in weight matrix
730	* @param[in] bias_shift amount of left-shift for bias
731	* @param[in] out_shift amount of right-shift for output
732	* @param[in] bias pointer to bias
733	* @param[in,out] pOut pointer to output vector
734	* @param[in,out] vec_buffer pointer to buffer space for input
735	* @return The function returns <code>ARM_MATH_SUCCESS</code>
736	*
737	*/
738
739	arm_status arm_fully_connected_q15_opt(const q15_t * pV,
740	const q15_t * pM,
741	const uint16_t dim_vec,
742	const uint16_t num_of_rows,
743	const uint16_t bias_shift,
744	const uint16_t out_shift,
745	const q15_t * bias,
746	q15_t * pOut,
747	q15_t * vec_buffer);
748
749	/**
750	* @brief Mixed Q15-Q7 fully-connected layer function
751	* @param[in] pV pointer to input vector
752	* @param[in] pM pointer to matrix weights
753	* @param[in] dim_vec length of the vector
754	* @param[in] num_of_rows number of rows in weight matrix
755	* @param[in] bias_shift amount of left-shift for bias
756	* @param[in] out_shift amount of right-shift for output
757	* @param[in] bias pointer to bias
758	* @param[in,out] pOut pointer to output vector
759	* @param[in,out] vec_buffer pointer to buffer space for input
760	* @return The function returns <code>ARM_MATH_SUCCESS</code>
761	*
762	*/
763
764	arm_status arm_fully_connected_mat_q7_vec_q15(const q15_t * pV,
765	const q7_t * pM,
766	const uint16_t dim_vec,
767	const uint16_t num_of_rows,
768	const uint16_t bias_shift,
769	const uint16_t out_shift,
770	const q7_t * bias,
771	q15_t * pOut,
772	q15_t * vec_buffer);
773
774	/**
775	* @brief Mixed Q15-Q7 opt fully-connected layer function
776	* @param[in] pV pointer to input vector
777	* @param[in] pM pointer to matrix weights
778	* @param[in] dim_vec length of the vector
779	* @param[in] num_of_rows number of rows in weight matrix
780	* @param[in] bias_shift amount of left-shift for bias
781	* @param[in] out_shift amount of right-shift for output
782	* @param[in] bias pointer to bias
783	* @param[in,out] pOut pointer to output vector
784	* @param[in,out] vec_buffer pointer to buffer space for input
785	* @return The function returns <code>ARM_MATH_SUCCESS</code>
786	*
787	*/
788
789	arm_status arm_fully_connected_mat_q7_vec_q15_opt(const q15_t * pV,
790	const q7_t * pM,
791	const uint16_t dim_vec,
792	const uint16_t num_of_rows,
793	const uint16_t bias_shift,
794	const uint16_t out_shift,
795	const q7_t * bias,
796	q15_t * pOut,
797	q15_t * vec_buffer);
798
799	/**
800	* @brief Matrix-Multiplication Kernels for Convolution
801	*
802	* These functions are used within convolution layer functions for
803	* matrix multiplication.
804	*
805	* The implementation is similar to CMSIS-DSP arm_mat_mult functions
806	* with one Q7 and one Q15 operands. The Q15 operand is the im2col
807	* output which is always with 2 columns.
808	*
809	*/
810
811	/**
812	* @brief Matrix-multiplication function for convolution
813	* @param[in] pA pointer to operand A
814	* @param[in] pInBuffer pointer to operand B, always consists of 2 vectors
815	* @param[in] ch_im_out numRow of A
816	* @param[in] numCol_A numCol of A
817	* @param[in] bias_shift amount of left-shift for bias
818	* @param[in] out_shift amount of right-shift for output
819	* @param[in] bias pointer to the bias
820	* @param[in,out] pOut pointer to output
821	* @return The function returns the incremented output pointer
822	*/
823
824	q7_t *arm_nn_mat_mult_kernel_q7_q15(const q7_t * pA,
825	const q15_t * pInBuffer,
826	const uint16_t ch_im_out,
827	const uint16_t numCol_A,
828	const uint16_t bias_shift,
829	const uint16_t out_shift,
830	const q7_t * bias,
831	q7_t * pOut);
832
833	/**
834	* @brief Matrix-multiplication function for convolution with reordered columns
835	* @param[in] pA pointer to operand A
836	* @param[in] pInBuffer pointer to operand B, always consists of 2 vectors
837	* @param[in] ch_im_out numRow of A
838	* @param[in] numCol_A numCol of A
839	* @param[in] bias_shift amount of left-shift for bias
840	* @param[in] out_shift amount of right-shift for output
841	* @param[in] bias pointer to the bias
842	* @param[in,out] pOut pointer to output
843	* @return The function returns the incremented output pointer
844	*/
845
846	q7_t *arm_nn_mat_mult_kernel_q7_q15_reordered(const q7_t * pA,
847	const q15_t * pInBuffer,
848	const uint16_t ch_im_out,
849	const uint16_t numCol_A,
850	const uint16_t bias_shift,
851	const uint16_t out_shift,
852	const q7_t * bias,
853	q7_t * pOut);
854
855	#ifdef __cplusplus
856	}
857	#endif
858
859	/*
860	* Other functions.
861	* These layers are typically not timing-critical.
862	* A basic implementation is provided here.
863	*/
864
865	#ifdef __cplusplus
866	extern "C"
867	{
868	#endif
869
870	/**
871	* @defgroup Acti Neural Network Activation Functions
872	*
873	* Perform activation layers, including ReLU (Rectified Linear Unit),
874	* sigmoid and tanh
875	*
876	*/
877
878	/**
879	* @brief Q7 RELU function
880	* @param[in,out] data pointer to input
881	* @param[in] size number of elements
882	* @return none.
883	*/
884
885	void arm_relu_q7(q7_t * data, uint16_t size);
886
887	/**
888	* @brief Q15 RELU function
889	* @param[in,out] data pointer to input
890	* @param[in] size number of elements
891	* @return none.
892	*/
893
894	void arm_relu_q15(q15_t * data, uint16_t size);
895
896	/**
897	* @brief Q7 neural network activation function using direct table look-up
898	* @param[in,out] data pointer to input
899	* @param[in] size number of elements
900	* @param[in] int_width bit-width of the integer part, assume to be smaller than 3
901	* @param[in] type type of activation functions
902	* @return none.
903	*/
904
905	void arm_nn_activations_direct_q7(q7_t * data, uint16_t size, uint16_t int_width,
906	arm_nn_activation_type type);
907
908	/**
909	* @brief Q15 neural network activation function using direct table look-up
910	* @param[in,out] data pointer to input
911	* @param[in] size number of elements
912	* @param[in] int_width bit-width of the integer part, assume to be smaller than 3
913	* @param[in] type type of activation functions
914	* @return none.
915	*/
916
917	void arm_nn_activations_direct_q15(q15_t * data, uint16_t size, uint16_t int_width,
918	arm_nn_activation_type type);
919
920	/**
921	* @defgroup Pooling Neural Network Pooling Functions
922	*
923	* Perform pooling functions, including max pooling and average pooling
924	*
925	*/
926
927	/**
928	* @brief Q7 max pooling function
929	* @param[in] Im_in pointer to input tensor
930	* @param[in] dim_im_in input tensor dimention
931	* @param[in] ch_im_in number of input tensor channels
932	* @param[in] dim_kernel filter kernel size
933	* @param[in] padding padding sizes
934	* @param[in] stride convolution stride
935	* @param[in] dim_im_out output tensor dimension
936	* @param[in,out] bufferA pointer to buffer space for input
937	* @param[in,out] Im_out pointer to output tensor
938	* @return none.
939	*
940	*/
941
942	void arm_maxpool_q7_HWC(q7_t * Im_in,
943	const uint16_t dim_im_in,
944	const uint16_t ch_im_in,
945	const uint16_t dim_kernel,
946	const uint16_t padding,
947	const uint16_t stride,
948	const uint16_t dim_im_out,
949	q7_t * bufferA,
950	q7_t * Im_out);
951
952	/**
953	* @brief Q7 average pooling function
954	* @param[in] Im_in pointer to input tensor
955	* @param[in] dim_im_in input tensor dimention
956	* @param[in] ch_im_in number of input tensor channels
957	* @param[in] dim_kernel filter kernel size
958	* @param[in] padding padding sizes
959	* @param[in] stride convolution stride
960	* @param[in] dim_im_out output tensor dimension
961	* @param[in,out] bufferA pointer to buffer space for input
962	* @param[in,out] Im_out pointer to output tensor
963	* @return none.
964	*
965	*/
966
967	void arm_avepool_q7_HWC(q7_t * Im_in,
968	const uint16_t dim_im_in,
969	const uint16_t ch_im_in,
970	const uint16_t dim_kernel,
971	const uint16_t padding,
972	const uint16_t stride,
973	const uint16_t dim_im_out,
974	q7_t * bufferA,
975	q7_t * Im_out);
976
977	/**
978	* @defgroup Softmax Softmax Functions
979	*
980	* EXP(2) based softmax function
981	*
982	*/
983
984	/**
985	* @brief Q7 softmax function
986	* @param[in] vec_in pointer to input vector
987	* @param[in] dim_vec input vector dimention
988	* @param[out] p_out pointer to output vector
989	* @return none.
990	*
991	*/
992
993	void arm_softmax_q7(const q7_t * vec_in, const uint16_t dim_vec, q7_t * p_out);
994
995	/**
996	* @brief Q15 softmax function
997	* @param[in] vec_in pointer to input vector
998	* @param[in] dim_vec input vector dimention
999	* @param[out] p_out pointer to output vector
1000	* @return none.
1001	*
1002	*/
1003
1004	void arm_softmax_q15(const q15_t * vec_in, const uint16_t dim_vec, q15_t * p_out);
1005
1006	#ifdef __cplusplus
1007	}
1008	#endif
1009
1010	#endif

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: S-port/trunk/Drivers/CMSIS/NN/Include/arm_nnfunctions.h

Download in other formats: