Skip to content

Commit 2b7411f

Browse files
chunseokleechunseoklee
authored and committed
[onert-micro] Dequantize f16 to f32
Signed-off-by: chunseoklee <[email protected]>
1 parent c93d12c commit 2b7411f

File tree

3 files changed

+143
-4
lines changed

3 files changed

+143
-4
lines changed

onert-micro/onert-micro/include/pal/common/PALDequantize.h

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,29 @@
1414
* limitations under the License.
1515
*/
1616

17+
/*
18+
The MIT License (MIT)
19+
20+
Copyright (c) 2017 Facebook Inc.
21+
Copyright (c) 2017 Georgia Institute of Technology
22+
Copyright 2019 Google LLC
23+
24+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
25+
associated documentation files (the "Software"), to deal in the Software without restriction,
26+
including without limitation the rights to use, copy, modify, merge, publish, distribute,
27+
sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
28+
furnished to do so, subject to the following conditions:
29+
30+
The above copyright notice and this permission notice shall be included in all copies or
31+
substantial portions of the Software.
32+
33+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
34+
NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
35+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES
36+
OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
37+
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
38+
*/
39+
1740
#ifndef ONERT_MICRO_EXECUTE_PAL_DEQUANTIZE_COMMON_H
1841
#define ONERT_MICRO_EXECUTE_PAL_DEQUANTIZE_COMMON_H
1942

@@ -23,6 +46,101 @@
2346
#include "PALUtils.h"

#include <bit>
#include <cmath>
#include <cstdint>
#include <cstring>
50+
51+
namespace
{

/// Notice that this code is adapted from FP16 (https://github.com/Maratyszcza/FP16), which is
/// distributed under the MIT License.

/**
 * @brief Convert a 16-bit floating-point number in IEEE half-precision format, in bit
 *        representation, to a 32-bit floating-point number in IEEE single-precision format, in
 *        bit representation.
 *
 * @param h  Bit pattern of the half-precision value (1 sign, 5 exponent, 10 mantissa bits).
 * @return   Bit pattern of the single-precision value that represents the same number.
 *
 * @note The implementation doesn't use any floating-point operations.
 * @note Declared inline so that translation units which include this header without calling the
 *       function do not get an unused-function diagnostic.
 */
inline uint32_t fp16_ieee_to_fp32_bits(uint16_t h)
{
  /*
   * Extend the half-precision floating-point number to 32 bits and shift to the upper part of the
   * 32-bit word:
   *      +---+-----+------------+-------------------+
   *      | S |EEEEE|MM MMMM MMMM|0000 0000 0000 0000|
   *      +---+-----+------------+-------------------+
   * Bits  31  26-30    16-25            0-15
   *
   * S - sign bit, E - bits of the biased exponent, M - bits of the mantissa, 0 - zero bits.
   */
  const uint32_t w = static_cast<uint32_t>(h) << 16;

  /*
   * Extract the sign of the input number into the high bit of the 32-bit word:
   *
   *      +---+----------------------------------+
   *      | S |0000000 00000000 00000000 00000000|
   *      +---+----------------------------------+
   * Bits  31                 0-30
   */
  const uint32_t sign = w & UINT32_C(0x80000000);

  /*
   * Extract mantissa and biased exponent of the input number into the bits 0-30 of the 32-bit
   * word:
   *
   *      +---+-----+------------+-------------------+
   *      | 0 |EEEEE|MM MMMM MMMM|0000 0000 0000 0000|
   *      +---+-----+------------+-------------------+
   * Bits  31  26-30    16-25            0-15
   */
  const uint32_t nonsign = w & UINT32_C(0x7FFFFFFF);

  /*
   * Renorm shift is the number of bits to shift mantissa left to make the half-precision number
   * normalized. If the initial number is normalized, some of its high 6 bits (sign == 0 and 5-bit
   * exponent) equal one. In this case renorm_shift == 0. If the number is denormalized,
   * renorm_shift > 0. Note that if we shift denormalized nonsign by renorm_shift, the unit bit of
   * mantissa will shift into exponent, turning the biased exponent into 1, and making mantissa
   * normalized (i.e. without leading 1).
   *
   * Counting leading zeros of 0 is undefined for __builtin_clz and leaves the output of
   * _BitScanReverse unspecified, so the zero input is handled explicitly: 32 leading zeros.
   * The value computed for zero is discarded by zero_mask below; the guard only keeps the
   * computation well defined.
   */
  uint32_t leading_zeros = 32;
  if (nonsign != 0)
  {
#ifdef _MSC_VER
    unsigned long nonsign_bsr;
    _BitScanReverse(&nonsign_bsr, static_cast<unsigned long>(nonsign));
    leading_zeros = static_cast<uint32_t>(nonsign_bsr) ^ 31;
#else
    leading_zeros = static_cast<uint32_t>(__builtin_clz(nonsign));
#endif
  }
  const uint32_t renorm_shift = leading_zeros > 5 ? leading_zeros - 5 : 0;

  /*
   * Iff the half-precision number has an all-ones biased exponent (0x1F), the addition overflows
   * it into bit 31, and the subsequent arithmetic shift turns the high 9 bits into 1. Thus
   *   inf_nan_mask ==
   *                   0x7F800000 if the half-precision number was NaN or infinity
   *                   0x00000000 otherwise
   *
   * (Relies on right shift of a negative signed value being arithmetic, as the original code
   * does.)
   */
  const uint32_t inf_nan_mask =
    static_cast<uint32_t>(static_cast<int32_t>(nonsign + UINT32_C(0x04000000)) >> 8) &
    UINT32_C(0x7F800000);

  /*
   * Iff nonsign is 0, nonsign - 1 wraps around to 0xFFFFFFFF, turning bit 31 into 1. Otherwise,
   * bit 31 remains 0. The signed shift right by 31 broadcasts bit 31 into all bits of the
   * zero_mask. Thus
   *   zero_mask ==
   *                0xFFFFFFFF if the half-precision number was zero (+0.0h or -0.0h)
   *                0x00000000 otherwise
   */
  const uint32_t zero_mask = static_cast<uint32_t>(static_cast<int32_t>(nonsign - 1) >> 31);

  /*
   * 1. Shift nonsign left by renorm_shift to normalize it (if the input was denormal).
   * 2. Shift nonsign right by 3 so the exponent (5 bits originally) becomes an 8-bit field and
   *    10-bit mantissa shifts into the 10 high bits of the 23-bit mantissa of IEEE
   *    single-precision number.
   * 3. Add 0x70 to the exponent (starting at bit 23) to compensate the difference in exponent
   *    bias (0x7F for single-precision number less 0xF for half-precision number).
   * 4. Subtract renorm_shift from the exponent (starting at bit 23) to account for
   *    renormalization. As renorm_shift is less than 0x70, this can be combined with step 3.
   * 5. Binary OR with inf_nan_mask to turn the exponent into 0xFF if the input was NaN or
   *    infinity.
   * 6. Binary ANDNOT with zero_mask to turn the mantissa and exponent into zero if the input was
   *    zero.
   * 7. Combine with the sign of the input number.
   */
  const uint32_t shifted = (nonsign << renorm_shift) >> 3;
  const uint32_t rebias = (UINT32_C(0x70) - renorm_shift) << 23;
  return sign | (((shifted + rebias) | inf_nan_mask) & ~zero_mask);
}

} // namespace
26144

27145
namespace onert_micro
28146
{
@@ -46,6 +164,20 @@ OMStatus Dequantize(const core::QuantizationParams op_params, const uint32_t fla
46164
}
47165
return Ok;
48166
}
167+
168+
/**
 * @brief Expand a buffer of half-precision values, given as raw 16-bit patterns, into floats.
 *
 * Each input element is converted with fp16_ieee_to_fp32_bits and the resulting 32-bit pattern
 * is stored as the corresponding float output element.
 *
 * @param flat_size    Number of elements to convert; both buffers are indexed from 0 to
 *                     flat_size - 1.
 * @param input_data   Source buffer of half-precision bit patterns.
 * @param output_data  Destination buffer receiving the converted float values.
 * @return Ok in all cases.
 */
OMStatus DequantizeF16toF32(const uint32_t flat_size, const uint16_t *input_data,
                            float *output_data)
{
  static_assert(sizeof(float) == sizeof(uint32_t),
                "float must be 32 bits wide to receive a single-precision bit pattern");

  for (uint32_t i = 0; i < flat_size; ++i)
  {
    const uint32_t f32_in_bits = fp16_ieee_to_fp32_bits(input_data[i]);
    // Reinterpret the bit pattern as a float through memcpy, which avoids aliasing violations.
    std::memcpy(&output_data[i], &f32_in_bits, sizeof(float));
  }
  return Ok;
}
180+
49181
} // namespace pal
50182
} // namespace execute
51183
} // namespace onert_micro

onert-micro/onert-micro/src/execute/kernels/Dequantize.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,13 @@ OMStatus execute_kernel_CircleDequantize(const OMExecuteArgs &execute_args)
5656
switch (input->type())
5757
{
5858
#ifndef DIS_FLOAT
59+
case circle::TensorType_FLOAT16:
60+
{
61+
status = pal::DequantizeF16toF32(core::OMRuntimeShape(input).flatSize(),
62+
core::utils::castInputData<uint16_t>(input_data),
63+
core::utils::castOutputData<float>(output_data));
64+
}
65+
break;
5966
case circle::TensorType_INT8:
6067
{
6168
assert(input->quantization() != nullptr);

onert-micro/onert-micro/src/import/kernels/Dequantize.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -67,10 +67,10 @@ OMStatus configure_kernel_CircleDequantize(const OMConfigureArgs &config_args)
6767

6868
// Check input quantization params
6969
const auto *input_quantization = input->quantization();
70-
status = utils::checkCondition(input->type() != circle::TensorType_FLOAT32 or
71-
input_quantization != nullptr and
72-
input_quantization->scale() != nullptr and
73-
input_quantization->scale()->size() == 1);
70+
status = utils::checkCondition(
71+
input->type() != circle::TensorType_FLOAT32 or input->type() != circle::TensorType_FLOAT16 or
72+
input_quantization != nullptr and input_quantization->scale() != nullptr and
73+
input_quantization->scale()->size() == 1);
7474
if (status != Ok)
7575
return status;
7676

0 commit comments

Comments
 (0)