
quantized_mlp_inference.py

import numpy as np

# Quantization routines

def quantize(data: np.ndarray, scale: float, zero_point: int, bit_width: int):
    # Affine mapping onto the integer grid: q = round(x / scale + zero_point).
    q_data = zero_point + data / scale

    min_qval, max_qval = -2.0 ** (bit_width - 1), 2.0 ** (bit_width - 1) - 1.0
    # Rescaled values below the minimum quantized value `min_qval` are clipped to
    # that minimum; analogously, values above `max_qval` are clipped to the maximum.
    q_data_clipped = np.clip(q_data, min_qval, max_qval)
    q_data_boxed = np.array(np.rint(q_data_clipped), dtype=np.int64)

    return q_data_boxed
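
# Quick sanity check with illustrative values (not taken from the model below):
# for 8 bits the representable range is [-128, 127]; with scale=0.1 and
# zero_point=-128, 1.0 maps to round(-128 + 1.0 / 0.1) = -118, while 30.0 would
# land at 172 and is clipped to 127.
assert quantize(np.array([1.0, 30.0], dtype=np.float32), 0.1, -128, bit_width=8).tolist() == [-118, 127]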

def dequantize(arr: np.ndarray, scale: float, zero_point: int | np.ndarray):
    # Invert the affine mapping: x ≈ (q - zero_point) * scale.
    return ((arr - zero_point) * scale).astype(np.float32)
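
# Round trip with the same illustrative parameters: dequantization recovers the
# input up to at most half a quantization step (scale / 2).
assert np.allclose(dequantize(quantize(np.array([1.0], dtype=np.float32), 0.1, -128, bit_width=8), 0.1, -128),
                   1.0, atol=0.05)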

def quant_parameters(min_val: np.float32, max_val: np.float32, bit_width: int, asymmetric: bool):
    min_qval, max_qval = -2.0 ** (bit_width - 1), 2.0 ** (bit_width - 1) - 1.0

    if asymmetric:
        # Map [min_val, max_val] onto the full integer range; rounding the zero
        # point keeps the real value 0.0 exactly representable.
        scale = (max_val - min_val) / (max_qval - min_qval)
        zero_point0 = min_qval - min_val / scale
        zero_point = np.rint(zero_point0).astype(np.int64)
    else:
        # Symmetric grid around zero, sized by the largest magnitude (note the
        # abs: using the raw signed values would undersize the grid whenever
        # |min_val| > max_val).
        scale = (2 * max(abs(max_val), abs(min_val))) / (max_qval - min_qval)
        zero_point = np.array(0, np.int64)

    return scale, zero_point
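
# Worked example with illustrative bounds: for min_val=-1.0, max_val=3.0 at
# 8 bits, asymmetric mode gives scale = 4/255 and
# zero_point = round(-128 + 1 / (4/255)) = -64, so the whole [-1, 3] range maps
# onto [-128, 127]; symmetric mode would instead pin the zero point to 0 and
# cover [-3, 3].
_scale, _zp = quant_parameters(np.float32(-1.0), np.float32(3.0), bit_width=8, asymmetric=True)
assert np.isclose(_scale, 4 / 255) and _zp == -64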

def q_matmul(arr_a: np.ndarray, scale_a: float, zero_point_a: int,
             arr_b: np.ndarray, scale_b: float, zero_point_b: int):
    # Accumulate in int64 so the 8-bit products cannot overflow.
    q_matmul_result = np.matmul(arr_a.astype(np.int64), arr_b)
    scale = scale_a * scale_b
    zero_points = (arr_a.sum(axis=-1, keepdims=True) * zero_point_b
                  + arr_b.sum(axis=-2, keepdims=True) * zero_point_a
                  - zero_point_a * zero_point_b * arr_a.shape[-1])
    return q_matmul_result, scale, zero_points
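
# Why the offset term works: with real values a = scale_a * (qa - za) and
# b = scale_b * (qb - zb), the product expands to
#   a @ b = scale_a * scale_b * (qa @ qb - zb * rowsum(qa) - za * colsum(qb) + za * zb * K),
# where K is the contraction length, so dequantize() applied to the returned
# triple recovers a @ b up to the error already incurred when quantizing the
# inputs. A sketch on random data (illustrative shapes and tolerance):
_rng = np.random.default_rng(0)
_a = _rng.uniform(-1.0, 1.0, (2, 3)).astype(np.float32)
_b = _rng.uniform(-1.0, 1.0, (3, 4)).astype(np.float32)
_sa, _za = quant_parameters(_a.min(), _a.max(), bit_width=8, asymmetric=True)
_sb, _zb = quant_parameters(_b.min(), _b.max(), bit_width=8, asymmetric=True)
_y, _s, _z = q_matmul(quantize(_a, _sa, _za, bit_width=8), _sa, _za,
                      quantize(_b, _sb, _zb, bit_width=8), _sb, _zb)
assert np.allclose(dequantize(_y, _s, _z), _a @ _b, atol=0.1)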

def requantize(arr: np.ndarray, arr_scale: float, arr_zero_points: np.ndarray,
               res_scale: float, res_zero_point: int, bit_width: int):
    min_qval, max_qval = -2.0 ** (bit_width - 1), 2.0 ** (bit_width - 1) - 1.0
    # Re-express the accumulator values on the target output grid.
    dequant = dequantize(arr, arr_scale, arr_zero_points)
    qdata = np.clip(np.rint(res_zero_point + dequant / res_scale), min_qval, max_qval).astype(np.int64)
    return qdata
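
# requantize() bridges two grids: the wide accumulator coming out of q_matmul
# (scale arr_scale, per-entry zero points) is dequantized and re-encoded on the
# narrow output grid (res_scale, res_zero_point), so the next layer keeps
# operating on 8-bit integers.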

# Trained MLP for the circles dataset

# # Every row of `inp` contains the x and y position of a point
inp = np.array([[0., 0.], [1., 1.]], dtype=np.float32)

fc1_weight = np.array([[ 4.4037123, -2.9683902, -4.4077654,  2.3313837,  0.05330967],
                       [-1.0420023,  3.5323772, -1.5059234,  4.3279686, -4.243471  ]], dtype=np.float32)
fc1_bias = np.array([-2.0229015, -2.446563, -2.7381809, -2.715235, -1.9951458], dtype=np.float32)
fc2_weight = np.array([[ 2.7341564, -2.7900338],
                       [ 3.049221 , -3.114146 ],
                       [ 2.761332 , -2.8246257],
                       [ 3.0681298, -3.1184993],
                       [ 2.8039508, -2.8363247]], dtype=np.float32)
fc2_bias = np.array([-4.372014,  4.457383], dtype=np.float32)

fc1_out = np.matmul(inp, fc1_weight) + fc1_bias  # dense layer #1
fc1_act = fc1_out.copy()
fc1_act[fc1_out < 0] = 0.0  # activation #1 (relu)
fc2_out = np.matmul(fc1_act, fc2_weight) + fc2_bias  # dense layer #2
fc2_act = 1.0 / (1.0 + np.exp(-fc2_out))  # activation #2 (sigmoid)
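
# The masked copy above is equivalent to np.maximum(fc1_out, 0.0); a quick
# consistency check:
assert np.array_equal(fc1_act, np.maximum(fc1_out, 0.0))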

# Quantize MLP

# # Input
inp_scale, inp_zero_point = quant_parameters(inp.min(), inp.max(), bit_width=8, asymmetric=True)
inp_q = quantize(inp, inp_scale, inp_zero_point, bit_width=8)

# # FC layer 1
fc1_weight_scale, fc1_weight_zero_point = quant_parameters(
    fc1_weight.min(), fc1_weight.max(), bit_width=8, asymmetric=False)
fc1_weight_q = quantize(fc1_weight, fc1_weight_scale, fc1_weight_zero_point, bit_width=8)
fc1_out_scale, fc1_out_zero_point = quant_parameters(
    fc1_out.min(), fc1_out.max(), bit_width=8, asymmetric=True)
# The bias is added directly to the q_matmul accumulator, so it is quantized
# with the accumulator's scale (input scale * weight scale), zero point 0, and
# a wide 32-bit grid.
fc1_bias_q = quantize(fc1_bias, inp_scale * fc1_weight_scale, 0, bit_width=32)

# # FC layer 2
fc2_weight_scale, fc2_weight_zero_point = quant_parameters(
    fc2_weight.min(), fc2_weight.max(), bit_width=8, asymmetric=False)
fc2_weight_q = quantize(fc2_weight, fc2_weight_scale, fc2_weight_zero_point, bit_width=8)
fc2_out_scale, fc2_out_zero_point = quant_parameters(
    fc2_out.min(), fc2_out.max(), bit_width=8, asymmetric=True)
fc2_bias_q = quantize(fc2_bias, fc1_out_scale * fc2_weight_scale, 0, bit_width=32)
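
# Sanity sketch: dequantizing the quantized weights should reproduce the float
# weights up to half a quantization step.
assert np.max(np.abs(dequantize(fc1_weight_q, fc1_weight_scale, fc1_weight_zero_point)
                     - fc1_weight)) <= fc1_weight_scale / 2 + 1e-4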

# Run inference using quantized MLP

# # FC layer 1
fc1_y, fc1_y_scale, fc1_y_zero_points = q_matmul(inp_q, inp_scale, inp_zero_point,
                                                 fc1_weight_q, fc1_weight_scale, fc1_weight_zero_point)
fc1_out_q = requantize(fc1_y + fc1_bias_q, fc1_y_scale, fc1_y_zero_points,
                       fc1_out_scale, fc1_out_zero_point, bit_width=8)

# # ReLU activation
# In the quantized domain the real value 0.0 sits at `fc1_out_zero_point`, so
# clamping at the zero point implements max(x, 0).
fc1_act_q = fc1_out_q.copy()
fc1_act_q[fc1_out_q < fc1_out_zero_point] = fc1_out_zero_point

# # FC layer 2
fc2_y, fc2_y_scale, fc2_y_zero_points = q_matmul(fc1_act_q, fc1_out_scale, fc1_out_zero_point,
                                                 fc2_weight_q, fc2_weight_scale, fc2_weight_zero_point)
fc2_out_q = requantize(fc2_y + fc2_bias_q, fc2_y_scale, fc2_y_zero_points,
                       fc2_out_scale, fc2_out_zero_point, bit_width=8)

# # Dequantize output of the second FC layer
fc2_out_deq = dequantize(fc2_out_q, fc2_out_scale, fc2_out_zero_point)

# # Sigmoid activation on the dequantized output of the second FC layer
# The nonlinearity is evaluated in floating point rather than on the integer grid.
fc2_act_deq = 1.0 / (1.0 + np.exp(-fc2_out_deq))

with np.printoptions(precision=4, suppress=True):
    print("fc2_act:\n", np.array2string(fc2_act))
    print("fc2_act_deq:\n", np.array2string(fc2_act_deq))
    print("quantized inference error:\n", np.abs(fc2_act_deq - fc2_act))