source: S-port/trunk/Drivers/CMSIS/NN/Include/arm_nnfunctions.h

Last change on this file was 1, checked in by AlexLir, 3 years ago
File size: 48.5 KB
Line 
1/*
2 * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
3 *
4 * SPDX-License-Identifier: Apache-2.0
5 *
6 * Licensed under the Apache License, Version 2.0 (the License); you may
7 * not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
9 *
10 * www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
14 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 */
18
19/* ----------------------------------------------------------------------
20 * Project: CMSIS NN Library
21 * Title: arm_nnfunctions.h
22 * Description: Public header file for CMSIS NN Library
23 *
24 * $Date: 13. July 2018
25 * $Revision: V.1.0.0
26 *
27 * Target Processor: Cortex-M cores
28 * -------------------------------------------------------------------- */
29
30/**
31 \mainpage CMSIS NN Software Library
32 *
33 * Introduction
34 * ------------
35 *
36 * This user manual describes the CMSIS NN software library,
37 * a collection of efficient neural network kernels developed to maximize the
38 * performance and minimize the memory footprint of neural networks on Cortex-M processor cores.
39 *
40 * The library is divided into a number of functions each covering a specific category:
41 * - Neural Network Convolution Functions
42 * - Neural Network Activation Functions
43 * - Fully-connected Layer Functions
44 * - Neural Network Pooling Functions
45 * - Softmax Functions
46 * - Neural Network Support Functions
47 *
48 * The library has separate functions for operating on different weight and activation data
49 * types including 8-bit integers (q7_t) and 16-bit integers (q15_t). The description of the
50 * kernels is included in the function description. The implementation details are also
51 * described in this paper [1].
52 *
53 * Block Diagram
54 * --------
55 * \image html CMSIS-NN-OVERVIEW.PNG
56 *
57 * Examples
58 * --------
59 *
60 * The library ships with a number of examples which demonstrate how to use the library functions.
61 *
62 * Pre-processor Macros
63 * ------------
64 *
65 * Each library project has different pre-processor macros.
66 *
67 * - ARM_MATH_DSP:
68 *
69 * Define macro ARM_MATH_DSP if the silicon supports DSP instructions.
70 *
71 * - ARM_MATH_BIG_ENDIAN:
72 *
73 * Define macro ARM_MATH_BIG_ENDIAN to build the library for big endian targets. By default library builds for little endian targets.
74 *
75 * - ARM_NN_TRUNCATE:
76 *
77 * Define macro ARM_NN_TRUNCATE to use floor instead of round-to-the-nearest-int for the computation.
78 *
79 * Copyright Notice
80 * ------------
81 *
82 * Copyright (C) 2010-2018 Arm Limited. All rights reserved.
83 *
84 * [1] CMSIS-NN: Efficient Neural Network Kernels for Arm Cortex-M CPUs https://arxiv.org/abs/1801.06601
85 */
86
87/**
88 * @defgroup groupNN Neural Network Functions
89 * These functions perform basic operations for neural network layers.
90 */
91
92#ifndef _ARM_NNFUNCTIONS_H
93#define _ARM_NNFUNCTIONS_H
94
95#include "arm_nnsupportfunctions.h"
96#include "arm_nn_tables.h"
97
98#define USE_INTRINSIC
99
100//#define ARM_NN_TRUNCATE /* This configures the rounding mode: floor or round to the nearest int */
101
102#ifdef __cplusplus
103extern "C"
104{
105#endif
106
107/**
108 * @defgroup NNConv Neural Network Convolution Functions
109 *
110 * Perform convolution layer
111 *
112 * The convolution is implemented in 2 steps: im2col and GEMM
113 *
114 * im2col is a process of converting each patch of image data into
115 * a column. After im2col, the convolution is computed as matrix-matrix
116 * multiplication.
117 *
118 * To reduce the memory footprint, the im2col is performed partially.
119 * In each iteration, only a few columns (i.e., patches) are generated and
120 * computed with GEMM kernels similar to CMSIS-DSP arm_mat_mult functions.
121 *
122 */
123
124 /**
125 * @brief Basic Q7 convolution function
126 * @param[in] Im_in pointer to input tensor
127 * @param[in] dim_im_in input tensor dimension
128 * @param[in] ch_im_in number of input tensor channels
129 * @param[in] wt pointer to kernel weights
130 * @param[in] ch_im_out number of filters, i.e., output tensor channels
131 * @param[in] dim_kernel filter kernel size
132 * @param[in] padding padding sizes
133 * @param[in] stride convolution stride
134 * @param[in] bias pointer to bias
135 * @param[in] bias_shift amount of left-shift for bias
136 * @param[in] out_shift amount of right-shift for output
137 * @param[in,out] Im_out pointer to output tensor
138 * @param[in] dim_im_out output tensor dimension
139 * @param[in,out] bufferA pointer to buffer space for input
140 * @param[in,out] bufferB pointer to buffer space for output
141 * @return The function returns <code>ARM_MATH_SUCCESS</code>
142 *
143 */
144
145 arm_status arm_convolve_HWC_q7_basic(const q7_t * Im_in,
146 const uint16_t dim_im_in,
147 const uint16_t ch_im_in,
148 const q7_t * wt,
149 const uint16_t ch_im_out,
150 const uint16_t dim_kernel,
151 const uint16_t padding,
152 const uint16_t stride,
153 const q7_t * bias,
154 const uint16_t bias_shift,
155 const uint16_t out_shift,
156 q7_t * Im_out,
157 const uint16_t dim_im_out,
158 q15_t * bufferA,
159 q7_t * bufferB);
160
161 /**
162 * @brief Basic Q7 convolution function (non-square shape)
163 * @param[in] Im_in pointer to input tensor
164 * @param[in] dim_im_in_x input tensor dimension x
165 * @param[in] dim_im_in_y input tensor dimension y
166 * @param[in] ch_im_in number of input tensor channels
167 * @param[in] wt pointer to kernel weights
168 * @param[in] ch_im_out number of filters, i.e., output tensor channels
169 * @param[in] dim_kernel_x filter kernel size x
170 * @param[in] dim_kernel_y filter kernel size y
171 * @param[in] padding_x padding size x
172 * @param[in] padding_y padding size y
173 * @param[in] stride_x convolution stride x
174 * @param[in] stride_y convolution stride y
175 * @param[in] bias pointer to bias
176 * @param[in] bias_shift amount of left-shift for bias
177 * @param[in] out_shift amount of right-shift for output
178 * @param[in,out] Im_out pointer to output tensor
179 * @param[in] dim_im_out_x output tensor dimension x
180 * @param[in] dim_im_out_y output tensor dimension y
181 * @param[in,out] bufferA pointer to buffer space for input
182 * @param[in,out] bufferB pointer to buffer space for output
183 * @return The function returns <code>ARM_MATH_SUCCESS</code>
184 */
185
186 arm_status arm_convolve_HWC_q7_basic_nonsquare(const q7_t * Im_in,
187 const uint16_t dim_im_in_x,
188 const uint16_t dim_im_in_y,
189 const uint16_t ch_im_in,
190 const q7_t * wt,
191 const uint16_t ch_im_out,
192 const uint16_t dim_kernel_x,
193 const uint16_t dim_kernel_y,
194 const uint16_t padding_x,
195 const uint16_t padding_y,
196 const uint16_t stride_x,
197 const uint16_t stride_y,
198 const q7_t * bias,
199 const uint16_t bias_shift,
200 const uint16_t out_shift,
201 q7_t * Im_out,
202 const uint16_t dim_im_out_x,
203 const uint16_t dim_im_out_y,
204 q15_t * bufferA,
205 q7_t * bufferB);
206
207 /**
208 * @brief Basic Q15 convolution function
209 * @param[in] Im_in pointer to input tensor
210 * @param[in] dim_im_in input tensor dimension
211 * @param[in] ch_im_in number of input tensor channels
212 * @param[in] wt pointer to kernel weights
213 * @param[in] ch_im_out number of filters, i.e., output tensor channels
214 * @param[in] dim_kernel filter kernel size
215 * @param[in] padding padding sizes
216 * @param[in] stride convolution stride
217 * @param[in] bias pointer to bias
218 * @param[in] bias_shift amount of left-shift for bias
219 * @param[in] out_shift amount of right-shift for output
220 * @param[in,out] Im_out pointer to output tensor
221 * @param[in] dim_im_out output tensor dimension
222 * @param[in,out] bufferA pointer to buffer space for input
223 * @param[in,out] bufferB pointer to buffer space for output
224 * @return The function returns <code>ARM_MATH_SUCCESS</code>
225 *
226 */
227
228 arm_status arm_convolve_HWC_q15_basic(const q15_t * Im_in,
229 const uint16_t dim_im_in,
230 const uint16_t ch_im_in,
231 const q15_t * wt,
232 const uint16_t ch_im_out,
233 const uint16_t dim_kernel,
234 const uint16_t padding,
235 const uint16_t stride,
236 const q15_t * bias,
237 const uint16_t bias_shift,
238 const uint16_t out_shift,
239 q15_t * Im_out,
240 const uint16_t dim_im_out,
241 q15_t * bufferA,
242 q7_t * bufferB);
243
244 /**
245 * @brief Fast Q7 convolution function
246 * @param[in] Im_in pointer to input tensor
247 * @param[in] dim_im_in input tensor dimension
248 * @param[in] ch_im_in number of input tensor channels
249 * @param[in] wt pointer to kernel weights
250 * @param[in] ch_im_out number of filters, i.e., output tensor channels
251 * @param[in] dim_kernel filter kernel size
252 * @param[in] padding padding sizes
253 * @param[in] stride convolution stride
254 * @param[in] bias pointer to bias
255 * @param[in] bias_shift amount of left-shift for bias
256 * @param[in] out_shift amount of right-shift for output
257 * @param[in,out] Im_out pointer to output tensor
258 * @param[in] dim_im_out output tensor dimension
259 * @param[in,out] bufferA pointer to buffer space for input
260 * @param[in,out] bufferB pointer to buffer space for output
261 * @return The function returns either
262 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
263 *
264 * This function is the version with full list of optimization tricks, but with
265 * some constraints:
266 * ch_im_in is multiple of 4
267 * ch_im_out is multiple of 2
268 */
269
270 arm_status arm_convolve_HWC_q7_fast(const q7_t * Im_in,
271 const uint16_t dim_im_in,
272 const uint16_t ch_im_in,
273 const q7_t * wt,
274 const uint16_t ch_im_out,
275 const uint16_t dim_kernel,
276 const uint16_t padding,
277 const uint16_t stride,
278 const q7_t * bias,
279 const uint16_t bias_shift,
280 const uint16_t out_shift,
281 q7_t * Im_out,
282 const uint16_t dim_im_out,
283 q15_t * bufferA,
284 q7_t * bufferB);
285
286 /**
287 * @brief Fast Q7 convolution function (non-square shape)
288 * @param[in] Im_in pointer to input tensor
289 * @param[in] dim_im_in_x input tensor dimension x
290 * @param[in] dim_im_in_y input tensor dimension y
291 * @param[in] ch_im_in number of input tensor channels
292 * @param[in] wt pointer to kernel weights
293 * @param[in] ch_im_out number of filters, i.e., output tensor channels
294 * @param[in] dim_kernel_x filter kernel size x
295 * @param[in] dim_kernel_y filter kernel size y
296 * @param[in] padding_x padding size x
297 * @param[in] padding_y padding size y
298 * @param[in] stride_x convolution stride x
299 * @param[in] stride_y convolution stride y
300 * @param[in] bias pointer to bias
301 * @param[in] bias_shift amount of left-shift for bias
302 * @param[in] out_shift amount of right-shift for output
303 * @param[in,out] Im_out pointer to output tensor
304 * @param[in] dim_im_out_x output tensor dimension x
305 * @param[in] dim_im_out_y output tensor dimension y
306 * @param[in,out] bufferA pointer to buffer space for input
307 * @param[in,out] bufferB pointer to buffer space for output
308 * @return The function returns either
309 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
310 *
311 * This function is the version with full list of optimization tricks, but with
312 * some constraints:
313 * ch_im_in is multiple of 4
314 * ch_im_out is multiple of 2
315 */
316
317 arm_status arm_convolve_HWC_q7_fast_nonsquare(const q7_t * Im_in,
318 const uint16_t dim_im_in_x,
319 const uint16_t dim_im_in_y,
320 const uint16_t ch_im_in,
321 const q7_t * wt,
322 const uint16_t ch_im_out,
323 const uint16_t dim_kernel_x,
324 const uint16_t dim_kernel_y,
325 const uint16_t padding_x,
326 const uint16_t padding_y,
327 const uint16_t stride_x,
328 const uint16_t stride_y,
329 const q7_t * bias,
330 const uint16_t bias_shift,
331 const uint16_t out_shift,
332 q7_t * Im_out,
333 const uint16_t dim_im_out_x,
334 const uint16_t dim_im_out_y,
335 q15_t * bufferA,
336 q7_t * bufferB);
337
338 /**
339 * @brief Fast Q7 version of 1x1 convolution (non-square shape)
340 * @param[in] Im_in pointer to input tensor
341 * @param[in] dim_im_in_x input tensor dimension x
342 * @param[in] dim_im_in_y input tensor dimension y
343 * @param[in] ch_im_in number of input tensor channels
344 * @param[in] wt pointer to kernel weights
345 * @param[in] ch_im_out number of filters, i.e., output tensor channels
346 * @param[in] dim_kernel_x filter kernel size x
347 * @param[in] dim_kernel_y filter kernel size y
348 * @param[in] padding_x padding size x
349 * @param[in] padding_y padding size y
350 * @param[in] stride_x convolution stride x
351 * @param[in] stride_y convolution stride y
352 * @param[in] bias pointer to bias
353 * @param[in] bias_shift amount of left-shift for bias
354 * @param[in] out_shift amount of right-shift for output
355 * @param[in,out] Im_out pointer to output tensor
356 * @param[in] dim_im_out_x output tensor dimension x
357 * @param[in] dim_im_out_y output tensor dimension y
358 * @param[in,out] bufferA pointer to buffer space for input
359 * @param[in,out] bufferB pointer to buffer space for output
360 * @return The function returns either
361 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
362 *
363 * This function implements convolution with 1x1 kernel size (i.e., dim_kernel_x=1
364 * and dim_kernel_y=1). It can be used for
365 * second half of MobileNets after depthwise separable convolution.
366 *
367 * This function is the version with full list of optimization tricks, but with
368 * some constraints:
369 * ch_im_in is multiple of 4
370 * ch_im_out is multiple of 2
371 */
372 arm_status arm_convolve_1x1_HWC_q7_fast_nonsquare(const q7_t * Im_in,
373 const uint16_t dim_im_in_x,
374 const uint16_t dim_im_in_y,
375 const uint16_t ch_im_in,
376 const q7_t * wt,
377 const uint16_t ch_im_out,
378 const uint16_t dim_kernel_x,
379 const uint16_t dim_kernel_y,
380 const uint16_t padding_x,
381 const uint16_t padding_y,
382 const uint16_t stride_x,
383 const uint16_t stride_y,
384 const q7_t * bias,
385 const uint16_t bias_shift,
386 const uint16_t out_shift,
387 q7_t * Im_out,
388 const uint16_t dim_im_out_x,
389 const uint16_t dim_im_out_y,
390 q15_t * bufferA,
391 q7_t * bufferB);
392
393 /**
394 * @brief Q7 version of convolution for RGB image
395 * @param[in] Im_in pointer to input tensor
396 * @param[in] dim_im_in input tensor dimension
397 * @param[in] ch_im_in number of input tensor channels
398 * @param[in] wt pointer to kernel weights
399 * @param[in] ch_im_out number of filters, i.e., output tensor channels
400 * @param[in] dim_kernel filter kernel size
401 * @param[in] padding padding sizes
402 * @param[in] stride convolution stride
403 * @param[in] bias pointer to bias
404 * @param[in] bias_shift amount of left-shift for bias
405 * @param[in] out_shift amount of right-shift for output
406 * @param[in,out] Im_out pointer to output tensor
407 * @param[in] dim_im_out output tensor dimension
408 * @param[in,out] bufferA pointer to buffer space for input
409 * @param[in,out] bufferB pointer to buffer space for output
410 * @return The function returns either
411 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
412 *
413 * This kernel is written exclusively for convolution with ch_im_in
414 * equals 3. This applies on the first layer of CNNs which has input
415 * image with RGB format.
416 */
417
418 arm_status arm_convolve_HWC_q7_RGB(const q7_t * Im_in,
419 const uint16_t dim_im_in,
420 const uint16_t ch_im_in,
421 const q7_t * wt,
422 const uint16_t ch_im_out,
423 const uint16_t dim_kernel,
424 const uint16_t padding,
425 const uint16_t stride,
426 const q7_t * bias,
427 const uint16_t bias_shift,
428 const uint16_t out_shift,
429 q7_t * Im_out,
430 const uint16_t dim_im_out,
431 q15_t * bufferA,
432 q7_t * bufferB);
433
434 /**
435 * @brief Fast Q15 convolution function
436 * @param[in] Im_in pointer to input tensor
437 * @param[in] dim_im_in input tensor dimension
438 * @param[in] ch_im_in number of input tensor channels
439 * @param[in] wt pointer to kernel weights
440 * @param[in] ch_im_out number of filters, i.e., output tensor channels
441 * @param[in] dim_kernel filter kernel size
442 * @param[in] padding padding sizes
443 * @param[in] stride convolution stride
444 * @param[in] bias pointer to bias
445 * @param[in] bias_shift amount of left-shift for bias
446 * @param[in] out_shift amount of right-shift for output
447 * @param[in,out] Im_out pointer to output tensor
448 * @param[in] dim_im_out output tensor dimension
449 * @param[in,out] bufferA pointer to buffer space for input
450 * @param[in,out] bufferB pointer to buffer space for output
451 * @return The function returns either
452 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
453 *
454 * This function is the version with full list of optimization tricks, but with
455 * some constraints:
456 * ch_im_in is multiple of 2
457 * ch_im_out is multiple of 2
458 */
459
460 arm_status arm_convolve_HWC_q15_fast(const q15_t * Im_in,
461 const uint16_t dim_im_in,
462 const uint16_t ch_im_in,
463 const q15_t * wt,
464 const uint16_t ch_im_out,
465 const uint16_t dim_kernel,
466 const uint16_t padding,
467 const uint16_t stride,
468 const q15_t * bias,
469 const uint16_t bias_shift,
470 const uint16_t out_shift,
471 q15_t * Im_out,
472 const uint16_t dim_im_out,
473 q15_t * bufferA,
474 q7_t * bufferB);
475
476 /**
477 * @brief Fast Q15 convolution function (non-square shape)
478 * @param[in] Im_in pointer to input tensor
479 * @param[in] dim_im_in_x input tensor dimension x
480 * @param[in] dim_im_in_y input tensor dimension y
481 * @param[in] ch_im_in number of input tensor channels
482 * @param[in] wt pointer to kernel weights
483 * @param[in] ch_im_out number of filters, i.e., output tensor channels
484 * @param[in] dim_kernel_x filter kernel size x
485 * @param[in] dim_kernel_y filter kernel size y
486 * @param[in] padding_x padding size x
487 * @param[in] padding_y padding size y
488 * @param[in] stride_x convolution stride x
489 * @param[in] stride_y convolution stride y
490 * @param[in] bias pointer to bias
491 * @param[in] bias_shift amount of left-shift for bias
492 * @param[in] out_shift amount of right-shift for output
493 * @param[in,out] Im_out pointer to output tensor
494 * @param[in] dim_im_out_x output tensor dimension x
495 * @param[in] dim_im_out_y output tensor dimension y
496 * @param[in,out] bufferA pointer to buffer space for input
497 * @param[in,out] bufferB pointer to buffer space for output
498 * @return The function returns either
499 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
500 *
501 * @details
502 *
503 * <b>Buffer size:</b>
504 *
505 * bufferA size: 2*ch_im_in*dim_kernel_x*dim_kernel_y
506 *
507 * bufferB size: 0
508 *
509 * <b>Input dimension constraints:</b>
510 *
511 * ch_im_in is multiple of 2
512 *
513 * ch_im_out is multiple of 2
514 *
515 */
516
517 arm_status
518 arm_convolve_HWC_q15_fast_nonsquare(const q15_t * Im_in,
519 const uint16_t dim_im_in_x,
520 const uint16_t dim_im_in_y,
521 const uint16_t ch_im_in,
522 const q15_t * wt,
523 const uint16_t ch_im_out,
524 const uint16_t dim_kernel_x,
525 const uint16_t dim_kernel_y,
526 const uint16_t padding_x,
527 const uint16_t padding_y,
528 const uint16_t stride_x,
529 const uint16_t stride_y,
530 const q15_t * bias,
531 const uint16_t bias_shift,
532 const uint16_t out_shift,
533 q15_t * Im_out,
534 const uint16_t dim_im_out_x,
535 const uint16_t dim_im_out_y,
536 q15_t * bufferA,
537 q7_t * bufferB);
538
539 /**
540 * @brief Q7 depthwise separable convolution function
541 * @param[in] Im_in pointer to input tensor
542 * @param[in] dim_im_in input tensor dimension
543 * @param[in] ch_im_in number of input tensor channels
544 * @param[in] wt pointer to kernel weights
545 * @param[in] ch_im_out number of filters, i.e., output tensor channels
546 * @param[in] dim_kernel filter kernel size
547 * @param[in] padding padding sizes
548 * @param[in] stride convolution stride
549 * @param[in] bias pointer to bias
550 * @param[in] bias_shift amount of left-shift for bias
551 * @param[in] out_shift amount of right-shift for output
552 * @param[in,out] Im_out pointer to output tensor
553 * @param[in] dim_im_out output tensor dimension
554 * @param[in,out] bufferA pointer to buffer space for input
555 * @param[in,out] bufferB pointer to buffer space for output
556 * @return The function returns either
557 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
558 *
559 * This function is the version with full list of optimization tricks, but with
560 * some constraints:
561 * ch_im_in is multiple of 2
562 * ch_im_out is multiple of 2
563 */
564
565 arm_status arm_depthwise_separable_conv_HWC_q7(const q7_t * Im_in,
566 const uint16_t dim_im_in,
567 const uint16_t ch_im_in,
568 const q7_t * wt,
569 const uint16_t ch_im_out,
570 const uint16_t dim_kernel,
571 const uint16_t padding,
572 const uint16_t stride,
573 const q7_t * bias,
574 const uint16_t bias_shift,
575 const uint16_t out_shift,
576 q7_t * Im_out,
577 const uint16_t dim_im_out,
578 q15_t * bufferA,
579 q7_t * bufferB);
580
581 /**
582 * @brief Q7 depthwise separable convolution function (non-square shape)
583 * @param[in] Im_in pointer to input tensor
584 * @param[in] dim_im_in_x input tensor dimension x
585 * @param[in] dim_im_in_y input tensor dimension y
586 * @param[in] ch_im_in number of input tensor channels
587 * @param[in] wt pointer to kernel weights
588 * @param[in] ch_im_out number of filters, i.e., output tensor channels
589 * @param[in] dim_kernel_x filter kernel size x
590 * @param[in] dim_kernel_y filter kernel size y
591 * @param[in] padding_x padding sizes x
592 * @param[in] padding_y padding sizes y
593 * @param[in] stride_x convolution stride x
594 * @param[in] stride_y convolution stride y
595 * @param[in] bias pointer to bias
596 * @param[in] bias_shift amount of left-shift for bias
597 * @param[in] out_shift amount of right-shift for output
598 * @param[in,out] Im_out pointer to output tensor
599 * @param[in] dim_im_out_x output tensor dimension x
600 * @param[in] dim_im_out_y output tensor dimension y
601 * @param[in,out] bufferA pointer to buffer space for input
602 * @param[in,out] bufferB pointer to buffer space for output
603 * @return The function returns either
604 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
605 *
606 * This function is the version with full list of optimization tricks, but with
607 * some constraints:
608 * ch_im_in is multiple of 2
609 * ch_im_out is multiple of 2
610 */
611 arm_status arm_depthwise_separable_conv_HWC_q7_nonsquare(const q7_t * Im_in,
612 const uint16_t dim_im_in_x,
613 const uint16_t dim_im_in_y,
614 const uint16_t ch_im_in,
615 const q7_t * wt,
616 const uint16_t ch_im_out,
617 const uint16_t dim_kernel_x,
618 const uint16_t dim_kernel_y,
619 const uint16_t padding_x,
620 const uint16_t padding_y,
621 const uint16_t stride_x,
622 const uint16_t stride_y,
623 const q7_t * bias,
624 const uint16_t bias_shift,
625 const uint16_t out_shift,
626 q7_t * Im_out,
627 const uint16_t dim_im_out_x,
628 const uint16_t dim_im_out_y,
629 q15_t * bufferA,
630 q7_t * bufferB);
631
632
633/**
634 * @defgroup FC Fully-connected Layer Functions
635 *
636 * Perform fully-connected layer
637 *
638 * Fully-connected layer is basically a matrix-vector multiplication
639 * with bias. The matrix is the weights and the input/output vectors
640 * are the activation values. Supported {weight, activation} precisions
641 * include {8-bit, 8-bit}, {16-bit, 16-bit}, and {8-bit, 16-bit}.
642 *
643 * Here we have two types of kernel functions. The basic function
644 * implements the function using regular GEMV approach. The opt functions
645 * operate with weights in interleaved formats.
646 *
647 */
648
649 /**
650 * @brief Q7 basic fully-connected layer function
651 * @param[in] pV pointer to input vector
652 * @param[in] pM pointer to matrix weights
653 * @param[in] dim_vec length of the vector
654 * @param[in] num_of_rows number of rows in weight matrix
655 * @param[in] bias_shift amount of left-shift for bias
656 * @param[in] out_shift amount of right-shift for output
657 * @param[in] bias pointer to bias
658 * @param[in,out] pOut pointer to output vector
659 * @param[in,out] vec_buffer pointer to buffer space for input
660 * @return The function returns <code>ARM_MATH_SUCCESS</code>
661 *
662 */
663
664 arm_status arm_fully_connected_q7(const q7_t * pV,
665 const q7_t * pM,
666 const uint16_t dim_vec,
667 const uint16_t num_of_rows,
668 const uint16_t bias_shift,
669 const uint16_t out_shift,
670 const q7_t * bias,
671 q7_t * pOut,
672 q15_t * vec_buffer);
673
674 /**
675 * @brief Q7 opt fully-connected layer function
676 * @param[in] pV pointer to input vector
677 * @param[in] pM pointer to matrix weights
678 * @param[in] dim_vec length of the vector
679 * @param[in] num_of_rows number of rows in weight matrix
680 * @param[in] bias_shift amount of left-shift for bias
681 * @param[in] out_shift amount of right-shift for output
682 * @param[in] bias pointer to bias
683 * @param[in,out] pOut pointer to output vector
684 * @param[in,out] vec_buffer pointer to buffer space for input
685 * @return The function returns <code>ARM_MATH_SUCCESS</code>
686 *
687 */
688
689 arm_status arm_fully_connected_q7_opt(const q7_t * pV,
690 const q7_t * pM,
691 const uint16_t dim_vec,
692 const uint16_t num_of_rows,
693 const uint16_t bias_shift,
694 const uint16_t out_shift,
695 const q7_t * bias,
696 q7_t * pOut,
697 q15_t * vec_buffer);
698
699 /**
700 * @brief Q15 basic fully-connected layer function
701 * @param[in] pV pointer to input vector
702 * @param[in] pM pointer to matrix weights
703 * @param[in] dim_vec length of the vector
704 * @param[in] num_of_rows number of rows in weight matrix
705 * @param[in] bias_shift amount of left-shift for bias
706 * @param[in] out_shift amount of right-shift for output
707 * @param[in] bias pointer to bias
708 * @param[in,out] pOut pointer to output vector
709 * @param[in,out] vec_buffer pointer to buffer space for input
710 * @return The function returns <code>ARM_MATH_SUCCESS</code>
711 *
712 */
713
714 arm_status arm_fully_connected_q15(const q15_t * pV,
715 const q15_t * pM,
716 const uint16_t dim_vec,
717 const uint16_t num_of_rows,
718 const uint16_t bias_shift,
719 const uint16_t out_shift,
720 const q15_t * bias,
721 q15_t * pOut,
722 q15_t * vec_buffer);
723
724 /**
725 * @brief Q15 opt fully-connected layer function
726 * @param[in] pV pointer to input vector
727 * @param[in] pM pointer to matrix weights
728 * @param[in] dim_vec length of the vector
729 * @param[in] num_of_rows number of rows in weight matrix
730 * @param[in] bias_shift amount of left-shift for bias
731 * @param[in] out_shift amount of right-shift for output
732 * @param[in] bias pointer to bias
733 * @param[in,out] pOut pointer to output vector
734 * @param[in,out] vec_buffer pointer to buffer space for input
735 * @return The function returns <code>ARM_MATH_SUCCESS</code>
736 *
737 */
738
739 arm_status arm_fully_connected_q15_opt(const q15_t * pV,
740 const q15_t * pM,
741 const uint16_t dim_vec,
742 const uint16_t num_of_rows,
743 const uint16_t bias_shift,
744 const uint16_t out_shift,
745 const q15_t * bias,
746 q15_t * pOut,
747 q15_t * vec_buffer);
748
749 /**
750 * @brief Mixed Q15-Q7 fully-connected layer function
751 * @param[in] pV pointer to input vector
752 * @param[in] pM pointer to matrix weights
753 * @param[in] dim_vec length of the vector
754 * @param[in] num_of_rows number of rows in weight matrix
755 * @param[in] bias_shift amount of left-shift for bias
756 * @param[in] out_shift amount of right-shift for output
757 * @param[in] bias pointer to bias
758 * @param[in,out] pOut pointer to output vector
759 * @param[in,out] vec_buffer pointer to buffer space for input
760 * @return The function returns <code>ARM_MATH_SUCCESS</code>
761 *
762 */
763
764 arm_status arm_fully_connected_mat_q7_vec_q15(const q15_t * pV,
765 const q7_t * pM,
766 const uint16_t dim_vec,
767 const uint16_t num_of_rows,
768 const uint16_t bias_shift,
769 const uint16_t out_shift,
770 const q7_t * bias,
771 q15_t * pOut,
772 q15_t * vec_buffer);
773
774 /**
775 * @brief Mixed Q15-Q7 opt fully-connected layer function
776 * @param[in] pV pointer to input vector
777 * @param[in] pM pointer to matrix weights
778 * @param[in] dim_vec length of the vector
779 * @param[in] num_of_rows number of rows in weight matrix
780 * @param[in] bias_shift amount of left-shift for bias
781 * @param[in] out_shift amount of right-shift for output
782 * @param[in] bias pointer to bias
783 * @param[in,out] pOut pointer to output vector
784 * @param[in,out] vec_buffer pointer to buffer space for input
785 * @return The function returns <code>ARM_MATH_SUCCESS</code>
786 *
787 */
788
789 arm_status arm_fully_connected_mat_q7_vec_q15_opt(const q15_t * pV,
790 const q7_t * pM,
791 const uint16_t dim_vec,
792 const uint16_t num_of_rows,
793 const uint16_t bias_shift,
794 const uint16_t out_shift,
795 const q7_t * bias,
796 q15_t * pOut,
797 q15_t * vec_buffer);
798
799/**
800 * @brief Matrix-Multiplication Kernels for Convolution
801 *
802 * These functions are used within convolution layer functions for
803 * matrix multiplication.
804 *
805 * The implementation is similar to CMSIS-DSP arm_mat_mult functions
806 * with one Q7 and one Q15 operands. The Q15 operand is the im2col
807 * output which is always with 2 columns.
808 *
809 */
810
811 /**
812 * @brief Matrix-multiplication function for convolution
813 * @param[in] pA pointer to operand A
814 * @param[in] pInBuffer pointer to operand B, always consists of 2 vectors
815 * @param[in] ch_im_out numRow of A
816 * @param[in] numCol_A numCol of A
817 * @param[in] bias_shift amount of left-shift for bias
818 * @param[in] out_shift amount of right-shift for output
819 * @param[in] bias the bias
820 * @param[in,out] pOut pointer to output
821 * @return The function returns the incremented output pointer
822 */
823
824 q7_t *arm_nn_mat_mult_kernel_q7_q15(const q7_t * pA,
825 const q15_t * pInBuffer,
826 const uint16_t ch_im_out,
827 const uint16_t numCol_A,
828 const uint16_t bias_shift,
829 const uint16_t out_shift,
830 const q7_t * bias,
831 q7_t * pOut);
832
833 /**
834 * @brief Matrix-multiplication function for convolution with reordered columns
835 * @param[in] pA pointer to operand A
836 * @param[in] pInBuffer pointer to operand B, always consists of 2 vectors
837 * @param[in] ch_im_out numRow of A
838 * @param[in] numCol_A numCol of A
839 * @param[in] bias_shift amount of left-shift for bias
840 * @param[in] out_shift amount of right-shift for output
841 * @param[in] bias the bias
842 * @param[in,out] pOut pointer to output
843 * @return The function returns the incremented output pointer
844 */
845
846 q7_t *arm_nn_mat_mult_kernel_q7_q15_reordered(const q7_t * pA,
847 const q15_t * pInBuffer,
848 const uint16_t ch_im_out,
849 const uint16_t numCol_A,
850 const uint16_t bias_shift,
851 const uint16_t out_shift,
852 const q7_t * bias,
853 q7_t * pOut);
854
855#ifdef __cplusplus
856}
857#endif
858
859/*
860 * Other functions
861 * These layers are typically not timing critical.
862 * Basic implementations are provided here.
863 */
864
865#ifdef __cplusplus
866extern "C"
867{
868#endif
869
870/**
871 * @defgroup Acti Neural Network Activation Functions
872 *
873 * Perform activation layers, including ReLU (Rectified Linear Unit),
874 * sigmoid and tanh
875 *
876 */
877
878 /**
879 * @brief Q7 RELU function
880 * @param[in,out] data pointer to the q7_t buffer used as both input and output
881 * @param[in] size number of elements
882 * @return none.
883 */
884
885 void arm_relu_q7(q7_t * data, uint16_t size);
886
887 /**
888 * @brief Q15 RELU function
889 * @param[in,out] data pointer to the q15_t buffer used as both input and output
890 * @param[in] size number of elements
891 * @return none.
892 */
893
894 void arm_relu_q15(q15_t * data, uint16_t size);
895
896 /**
897 * @brief Q7 neural network activation function using direct table look-up
898 * @param[in,out] data pointer to the q7_t buffer used as both input and output
899 * @param[in] size number of elements
900 * @param[in] int_width bit-width of the integer part, assumed to be smaller than 3
901 * @param[in] type type of activation function
902 * @return none.
903 */
904
905 void arm_nn_activations_direct_q7(q7_t * data, uint16_t size, uint16_t int_width,
906 arm_nn_activation_type type);
907
908 /**
909 * @brief Q15 neural network activation function using direct table look-up
910 * @param[in,out] data pointer to the q15_t buffer used as both input and output
911 * @param[in] size number of elements
912 * @param[in] int_width bit-width of the integer part, assumed to be smaller than 3
913 * @param[in] type type of activation function
914 * @return none.
915 */
916
917 void arm_nn_activations_direct_q15(q15_t * data, uint16_t size, uint16_t int_width,
918 arm_nn_activation_type type);
919
920/**
921 * @defgroup Pooling Neural Network Pooling Functions
922 *
923 * Perform pooling functions, including max pooling and average pooling
924 *
925 */
926
927 /**
928 * @brief Q7 max pooling function
929 * @param[in] Im_in pointer to input tensor (NOTE(review): declared non-const although tagged [in]; confirm whether the input is modified)
930 * @param[in] dim_im_in input tensor dimension
931 * @param[in] ch_im_in number of input tensor channels
932 * @param[in] dim_kernel filter kernel size
933 * @param[in] padding padding sizes
934 * @param[in] stride convolution stride (NOTE(review): presumably the pooling stride; confirm)
935 * @param[in] dim_im_out output tensor dimension
936 * @param[in,out] bufferA pointer to buffer space for input
937 * @param[in,out] Im_out pointer to output tensor
938 * @return none.
939 *
940 */
941
942 void arm_maxpool_q7_HWC(q7_t * Im_in,
943 const uint16_t dim_im_in,
944 const uint16_t ch_im_in,
945 const uint16_t dim_kernel,
946 const uint16_t padding,
947 const uint16_t stride,
948 const uint16_t dim_im_out,
949 q7_t * bufferA,
950 q7_t * Im_out);
951
952 /**
953 * @brief Q7 average pooling function
954 * @param[in] Im_in pointer to input tensor (NOTE(review): declared non-const although tagged [in]; confirm whether the input is modified)
955 * @param[in] dim_im_in input tensor dimension
956 * @param[in] ch_im_in number of input tensor channels
957 * @param[in] dim_kernel filter kernel size
958 * @param[in] padding padding sizes
959 * @param[in] stride convolution stride (NOTE(review): presumably the pooling stride; confirm)
960 * @param[in] dim_im_out output tensor dimension
961 * @param[in,out] bufferA pointer to buffer space for input
962 * @param[in,out] Im_out pointer to output tensor
963 * @return none.
964 *
965 */
966
967 void arm_avepool_q7_HWC(q7_t * Im_in,
968 const uint16_t dim_im_in,
969 const uint16_t ch_im_in,
970 const uint16_t dim_kernel,
971 const uint16_t padding,
972 const uint16_t stride,
973 const uint16_t dim_im_out,
974 q7_t * bufferA,
975 q7_t * Im_out);
976
977/**
978 * @defgroup Softmax Softmax Functions
979 *
980 * EXP(2) based softmax function
981 *
982 */
983
984 /**
985 * @brief Q7 softmax function
986 * @param[in] vec_in pointer to input vector
987 * @param[in] dim_vec input vector dimension
988 * @param[out] p_out pointer to output vector
989 * @return none.
990 *
991 */
992
993 void arm_softmax_q7(const q7_t * vec_in, const uint16_t dim_vec, q7_t * p_out);
994
995 /**
996 * @brief Q15 softmax function
997 * @param[in] vec_in pointer to input vector
998 * @param[in] dim_vec input vector dimension
999 * @param[out] p_out pointer to output vector
1000 * @return none.
1001 *
1002 */
1003
1004 void arm_softmax_q15(const q15_t * vec_in, const uint16_t dim_vec, q15_t * p_out);
1005
1006#ifdef __cplusplus
1007}
1008#endif
1009
1010#endif
Note: See TracBrowser for help on using the repository browser.