// Auto-generated file. Do not edit!
//   Template: src/math/f32-tanh-scalar-expm1plus.c.in
//   Generator: tools/xngen
//
// Copyright 2023 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <math.h>

#include <xnnpack/common.h>
#include <xnnpack/math.h>
#include <xnnpack/math-stubs.h>


// Scalar reference evaluation of tanh for a buffer of floats.
//
//   n      - size of the input in BYTES; must be a multiple of sizeof(float).
//   input  - source elements, read sequentially.
//   output - destination elements, written sequentially (one per input).
//
// Method: with z = |x|, evaluate
//   y = expm1(2z) / (expm1(2z) + 2)
// and return copysign(y, x). expm1(2z) is built from a two-constant
// (Cody-Waite) range reduction, a degree-6 polynomial and a power-of-two
// scale assembled directly in the exponent bits.
void xnn_math_f32_tanh__fma_expm1plus_rr2_p6h4ts_div(
    size_t n,
    const float* input,
    float* output)
{
  assert(n % sizeof(float) == 0);

  // Smallest z at which the result is already saturated at 1.0f (~9.010913).
  const float sat_cutoff = 0x1.205968p+3f;
  // log2(e), used to turn z into units of log(2).
  const float log2e = 0x1.715476p+0f;
  // Rounding constant: ulp(magic_bias) == 0.5 and magic_bias === 63.5 mod 2**21.
  // Adding it rounds to one fractional bit and pre-loads the exponent bias.
  const float magic_bias = 0x1.8000FEp+22f;
  // -log(2) split into a high and a low part for the two-step reduction.
  const float neg_ln2_hi = -0x1.62E430p-1f;
  const float neg_ln2_lo = 0x1.05C610p-29f;
  // Polynomial coefficients for
  //   exp(2t) - 1 ~ 2 * (t + t * (t * (c2 + t * (c3 + t * (c4 + t * (c5 + t * c6))))))
  // valid for t in [-log(2)/4, log(2)/4].
  const float c6 = 0x1.6B7338p-5f;
  const float c5 = 0x1.12278Ep-3f;
  const float c4 = 0x1.555716p-2f;
  const float c3 = 0x1.5554B0p-1f;
  const float c2 = 0x1.FFFFFEp-1f;
  const float one = 1.0f;
  const float two = 2.0f;

  while (n != 0) {
    const float x = *input;
    input += 1;

    // tanh is odd, so work on the magnitude and restore the sign at the end.
    const float z_abs = fabsf(x);

    // Clamp the magnitude at the saturation cutoff: beyond it the quotient
    // below already evaluates to 1.0f, and clamping keeps the scale s finite.
    // NOTE(review): NaN is expected to pass through math_pmin_f32 unchanged
    // (per the template's original remark) - confirm against xnnpack/math.h.
    const float z = math_pmin_f32(z_abs, sat_cutoff);

    // biased = z * log2(e) + magic_bias. The addition rounds z / log(2) to a
    // multiple of 0.5; this is only valid while |z / log(2)| <= 2**21, which
    // the clamp above guarantees for non-NaN inputs.
    const float biased = fmaf(z, log2e, magic_bias);

    // Shift the low bits of the biased value into the exponent field to
    // obtain s = 2**(2k), where k = round(z / log(2), 1). For 0 <= z <= 9.010913
    // this gives 0 <= k <= 13.
    const float s = uint32_as_float(float_as_uint32(biased) << 23);

    // Remove the bias to recover k as an ordinary float.
    const float k = biased - magic_bias;

    // Reduced argument t = z - k * log(2), accumulated in two fused steps.
    const float t_hi = fmaf(k, neg_ln2_hi, z);
    const float t = fmaf(k, neg_ln2_lo, t_hi);

    // Horner evaluation of q = c2 + t * (c3 + t * (c4 + t * (c5 + t * c6))),
    // followed by p = t * q, so that exp(2t) - 1 ~ 2 * (t + t * p).
    const float h5 = fmaf(c6, t, c5);
    const float h4 = fmaf(h5, t, c4);
    const float h3 = fmaf(h4, t, c3);
    const float h2 = fmaf(h3, t, c2);
    const float p = h2 * t;

    // Rebuild exp(2z) - 1 without forming exp(2z) explicitly:
    //   s * (2 * (t + t * p) + 1) - 1 = (s - 1) + 2 * ((t * s) + (t * s) * p)
    const float ts = t * s;
    const float s_minus_one = s - one;
    const float ts_poly = fmaf(p, ts, ts);
    const float em1 = fmaf(ts_poly, two, s_minus_one);

    // exp(2z) + 1 == expm1(2z) + 2 is the denominator of the tanh quotient.
    const float ep1 = em1 + two;

    // y = expm1(2z) / (expm1(2z) + 2), then give it the sign of x.
    const float y = em1 / ep1;

    *output = copysignf(y, x);
    output += 1;

    n -= sizeof(float);
  }
}
