/* dspfns.h
 *
 * Copyright 2001,2014 ARM Limited. All rights reserved.
 *
 * RCS $Revision: 185525 $
 * Checkin $Date: 2014-05-29 12:44:48 +0100 (Thu, 29 May 2014) $
 * Revising $Author: ransin01 $
 */

/* ----------------------------------------------------------------------
 * This header file provides a set of DSP-type primitive
 * operations, such as 16-bit and 32-bit saturating arithmetic. The
 * operations it provides are similar to the ones used by the ITU
 * for publishing specifications of DSP algorithms.
 */

#ifndef ARMDSP_DSPFNS_H
#define ARMDSP_DSPFNS_H
#define __ARMCLIB_VERSION 5050106

#ifdef __cplusplus
#define __STDC_LIMIT_MACROS 1
#define __STDC_FORMAT_MACROS 1
#define __STDC_CONSTANT_MACROS 1
#endif /* __cplusplus */
#include <stdint.h>
#include <assert.h>

/* ETSI-style names for the signed 16-bit and 32-bit limits, mapped onto the
 * <stdint.h> constants. The whole group is skipped if MAX_16 is already
 * defined by the including code. */
#ifndef MAX_16
#define MAX_16 INT16_MAX
#define MIN_16 INT16_MIN
#define MAX_32 INT32_MAX
#define MIN_32 INT32_MIN
#endif /* MAX_16 etc. */

/* Disabled (#if 0): these checks would reject targets that lack
 * __TARGET_FEATURE_DSPMUL, and Thumb targets with __TARGET_ARCH_THUMB < 4.
 * With the checks compiled out, such targets fall through to the software
 * paths further down in this header. */
#if 0
#ifndef __TARGET_FEATURE_DSPMUL
#error ETSI intrinsics not currently emulated on this platform
#endif /* __TARGET_FEATURE_DSPMUL */

#if defined(__thumb) && (__TARGET_ARCH_THUMB < 4)
#error ETSI intrinsics not available on Thumb-1
#endif /* Thumb but not Thumb-2 */
#endif

/* __ARM_INTRINSIC is the declaration prefix used by every function defined in
 * this header. It expands to `static __forceinline` when compiling C with
 * __GNUC__ or _USE_STATIC_INLINE defined, and to plain `__forceinline` in all
 * other branches (C++, C99-or-later, and the final fallback are identical). */
#ifdef __cplusplus
#define __ARM_INTRINSIC __forceinline
#elif defined __GNUC__ || defined _USE_STATIC_INLINE
#define __ARM_INTRINSIC static __forceinline
#elif (defined(__STDC_VERSION__) && 199901L <= __STDC_VERSION__)
#define __ARM_INTRINSIC __forceinline
#else
#define __ARM_INTRINSIC __forceinline
#endif

/* Define this to 1 if you do not need add() etc. to set the saturation flag.
 * When non-zero, the software paths in this header skip their writes to
 * Overflow, and add()/sub() may use __qadd16/__qsub16 on targets that
 * provide them. Defaults to 0. */
#ifndef __ARM_DSP_IGNORE_OVERFLOW
#define __ARM_DSP_IGNORE_OVERFLOW 0
#endif

/* Define this to 1 if you believe all shift counts are in the range
 * [-255,255]. When non-zero, the shift functions skip clamping of the shift
 * count before shifting right. Defaults to 0. */
#ifndef __ARM_DSP_SMALL_SHIFTS
#define __ARM_DSP_SMALL_SHIFTS 0
#endif

#ifdef __cplusplus
extern "C" {
#endif

#ifdef __TARGET_FEATURE_DSPMUL

#pragma recognize_itu_functions /* enable vectorization of ITU functions */

/* _ARM_PSR overlays a bitfield view (N, Z, C, V, Q flags plus a 27-bit
 * remainder named _dnm) on a single unsigned word. Two layouts are provided:
 * the field order is reversed when __ARM_BIG_ENDIAN or __BIG_ENDIAN is
 * defined.
 * NOTE(review): the intent appears to be that the flag fields land on the
 * top bits of the status register in both layouts; this depends on the
 * compiler's bitfield allocation order - confirm against the toolchain. */
#if !defined(__ARM_BIG_ENDIAN) && !defined(__BIG_ENDIAN)

typedef union {
  struct {
    int _dnm:27;
    int Q:1;
    int V:1;
    int C:1;
    int Z:1;
    int N:1;
  } b;
  unsigned int word;
} _ARM_PSR;

#else /* defined(__ARM_BIG_ENDIAN) || defined(__BIG_ENDIAN) */

typedef union {
  struct {
    int N:1;
    int Z:1;
    int C:1;
    int V:1;
    int Q:1;
    int _dnm:27;
  } b;
  unsigned int word;
} _ARM_PSR;

#endif /* defined(__ARM_BIG_ENDIAN) || defined(__BIG_ENDIAN) */

/* Named register variable bound to "apsr"; `Overflow` reads and writes its
 * Q bitfield directly. */
register _ARM_PSR _apsr_for_q __asm("apsr");
#define Overflow _apsr_for_q.b.Q

#else

__ARM_INTRINSIC int *_arm_global_overflow(void) {
    /* Function-local static that backs the software overflow flag; callers
     * read and write it through the returned pointer. */
    static int overflow_flag;
    int *flag_ptr = &overflow_flag;

    return flag_ptr;
}

#define Overflow (*_arm_global_overflow())

/*
 * Software emulation of a saturating 32-bit signed add.
 *
 * Returns x + y, clamped to INT32_MAX on positive overflow and INT32_MIN on
 * negative overflow. On saturation, Overflow is set to 1 unless
 * __ARM_DSP_IGNORE_OVERFLOW is non-zero.
 */
__ARM_INTRINSIC int32_t __qadd(int32_t x, int32_t y)
{
    int32_t r;
#if __TARGET_ARCH_ARM > 0
    int ov = 0;
    __asm {
        adds r, x, y
        movvs ov, #1
    }
    if (ov) {
#if !__ARM_DSP_IGNORE_OVERFLOW
        Overflow = 1;
#endif
        /* The sign of y tells which direction the overflow went. */
        r = y < 0 ? INT32_MIN : INT32_MAX;
    }
#else
    /* Inline assembler not available. Detect overflow *before* adding:
     * evaluating x + y on overflowing operands is undefined behaviour for
     * signed integers, so an after-the-fact comparison cannot be relied on. */
    if (y > 0 && x > INT32_MAX - y) {
#if !__ARM_DSP_IGNORE_OVERFLOW
        Overflow = 1;
#endif
        return INT32_MAX;
    }
    if (y < 0 && x < INT32_MIN - y) {
#if !__ARM_DSP_IGNORE_OVERFLOW
        Overflow = 1;
#endif
        return INT32_MIN;
    }
    r = x + y;
#endif
    return r;
}

/*
 * Software emulation of a saturating 32-bit signed subtract.
 *
 * Returns x - y, clamped to INT32_MAX on positive overflow and INT32_MIN on
 * negative overflow. On saturation, Overflow is set to 1 unless
 * __ARM_DSP_IGNORE_OVERFLOW is non-zero.
 */
__ARM_INTRINSIC int32_t __qsub(int32_t x, int32_t y)
{
    int32_t r;
#if __TARGET_ARCH_ARM > 0
    int ov = 0;
    __asm {
        subs r, x, y
        movvs ov, #1
    }
    if (ov) {
#if !__ARM_DSP_IGNORE_OVERFLOW
        Overflow = 1;
#endif
        /* Subtracting a non-negative y can only overflow downwards. */
        r = y >= 0 ? INT32_MIN : INT32_MAX;
    }
#else
    /* Inline assembler not available. Detect overflow *before* subtracting:
     * evaluating x - y on overflowing operands is undefined behaviour for
     * signed integers, so an after-the-fact comparison cannot be relied on. */
    if (y > 0 && x < INT32_MIN + y) {
#if !__ARM_DSP_IGNORE_OVERFLOW
        Overflow = 1;
#endif
        return INT32_MIN;
    }
    if (y < 0 && x > INT32_MAX + y) {
#if !__ARM_DSP_IGNORE_OVERFLOW
        Overflow = 1;
#endif
        return INT32_MAX;
    }
    r = x - y;
#endif
    return r;
}

__ARM_INTRINSIC int32_t __qdbl(int32_t x)
{
    /* Saturating doubling, expressed as a saturating add of x to itself. */
    int32_t doubled = __qadd(x, x);

    return doubled;
}

#endif

__ARM_INTRINSIC int *_arm_global_carry(void) {
    /* Function-local static that backs the `Carry` macro. */
    static int carry_flag;
    int *flag_ptr = &carry_flag;

    return flag_ptr;
}

#define Carry (*_arm_global_carry())

/*
 * Convert a 32-bit signed integer into a 16-bit signed integer by
 * saturation.
 */
__ARM_INTRINSIC int16_t saturate(int32_t x)
{
#if (defined(__thumb) && (__TARGET_ARCH_THUMB >= 4)) || (__TARGET_ARCH_ARM >= 6)
    /* Signed saturate to 16 bits via the compiler intrinsic. */
    return (int16_t)__ssat(x, 16);
#else
    /* ARM v5E has no SSAT instruction */
    if (x > INT16_MAX || x < INT16_MIN) {
        /* Saturate and set Overflow: doubling INT32_MIN / INT32_MAX with
         * __qdbl always saturates, and the top half of the saturated value
         * is the 16-bit limit of the same sign. The limit is selected
         * explicitly rather than computed as INT32_MAX - (x >> 31), which
         * overflows a signed int when x is negative. */
        x = __qdbl((x < 0) ? INT32_MIN : INT32_MAX) >> 16;
    }
    return (int16_t) x;
#endif
}

/*
 * Add two 16-bit signed integers with saturation.
 */
__ARM_INTRINSIC int16_t add(int16_t x, int16_t y)
{
#if __ARM_DSP_IGNORE_OVERFLOW && ((defined(__thumb) && (__TARGET_ARCH_THUMB >= 4)) || (__TARGET_ARCH_ARM >= 6))
    return (int16_t)__qadd16(x, y);
#else
    /* Move both operands into the top halfword so that the 32-bit
     * saturating add saturates at the 16-bit limits, then move back down. */
    int32_t hi_x = x << 16;
    int32_t hi_y = y << 16;
    int32_t hi_sum = __qadd(hi_x, hi_y);

    return (int16_t)(hi_sum >> 16);
#endif
}

/*
 * Subtract one 16-bit signed integer from another with saturation.
 */
__ARM_INTRINSIC int16_t sub(int16_t x, int16_t y)
{
#if __ARM_DSP_IGNORE_OVERFLOW && ((defined(__thumb) && (__TARGET_ARCH_THUMB >= 4)) || (__TARGET_ARCH_ARM >= 6))
    return (int16_t)__qsub16(x, y);
#else
    /* Move both operands into the top halfword so that the 32-bit
     * saturating subtract saturates at the 16-bit limits, then move back. */
    int32_t hi_x = x << 16;
    int32_t hi_y = y << 16;
    int32_t hi_diff = __qsub(hi_x, hi_y);

    return (int16_t)(hi_diff >> 16);
#endif
}

/*
 * Absolute value of a 16-bit signed integer. Saturating, so
 * abs(-0x8000) becomes +0x7FFF.
 */
__ARM_INTRINSIC int16_t abs_s(int16_t x)
{
#if (defined(__thumb) && (__TARGET_ARCH_THUMB >= 4)) || (__TARGET_ARCH_ARM >= 6)
    /* Negative inputs are negated with the 16-bit saturating subtract. */
    if (x < 0)
        return (int16_t)__qsub16(0, x);
    return x;
#else
    if (x >= 0)
        return x;
    /* -INT16_MIN is not representable, so clamp it. */
    if (x == INT16_MIN)
        return INT16_MAX;
    return (int16_t) -x;
#endif
}

/*
 * Shift a 16-bit signed integer left (or right, if the shift count
 * is negative). Saturate on overflow.
 */
/*
 * x:     value to shift.
 * shift: left-shift count; a negative count shifts right by -shift.
 * Returns the shifted value; left shifts are saturated to 16 bits through
 * saturate().
 */
__ARM_INTRINSIC int16_t shl(int16_t x, int16_t shift)
{
    /* Zero stays zero for every shift count. Handling it first avoids
     * evaluating x >> (-shift) with a positive `shift`, i.e. a negative
     * shift count, which the original did for x == 0. */
    if (x == 0)
        return 0;
    if (shift <= 0) {
#if !__ARM_DSP_SMALL_SHIFTS
        /* A 16-bit value shifted right by 15 or more is already all sign
         * bits, so clamping here keeps the count below the operand width
         * without changing the result. */
        if (shift < -15) shift = -15;
#endif /* __ARM_DSP_SMALL_SHIFTS */
        return (int16_t) (x >> (-shift));
    }
    /* Any non-zero 16-bit value shifted left by 16 is outside the 16-bit
     * range, so larger counts are equivalent to 16. */
    if (shift > 15)
        shift = 16;
    return saturate(x << shift);
}

/*
 * Shift a 16-bit signed integer right (or left, if the shift count
 * is negative). Saturate on overflow.
 */
/*
 * x:     value to shift.
 * shift: right-shift count; a negative count shifts left by -shift.
 * Returns the shifted value; left shifts are saturated to 16 bits through
 * saturate().
 */
__ARM_INTRINSIC int16_t shr(int16_t x, int16_t shift)
{
    /* Zero stays zero for every shift count. Handling it first avoids
     * evaluating x >> shift with a negative `shift`, which the original did
     * for x == 0. */
    if (x == 0)
        return 0;
    if (shift >= 0) {
#if !__ARM_DSP_SMALL_SHIFTS
        /* A 16-bit value shifted right by 15 or more is already all sign
         * bits, so clamping here keeps the count below the operand width
         * without changing the result. */
        if (shift > 15) shift = 15;
#endif /* __ARM_DSP_SMALL_SHIFTS */
        return (int16_t) (x >> shift);
    }
    /* Any non-zero 16-bit value shifted left by 16 is outside the 16-bit
     * range, so larger counts are equivalent to 16. */
    if (shift < -15)
        shift = -16;
    return saturate(x << (-shift));
}

/*
 * Multiply two 16-bit signed integers, shift the result right by
 * 15 and saturate it. (Saturation is only necessary if both inputs
 * were -0x8000, in which case the result "should" be 0x8000 and is
 * saturated to 0x7FFF.)
 */
__ARM_INTRINSIC int16_t mult(int16_t x, int16_t y)
{
    /* Double the 32-bit product with saturation, then keep the top half. */
    int32_t product = x * y;
    int32_t doubled = __qdbl(product);

    return (int16_t)(doubled >> 16);
}

/*
 * Multiply two 16-bit signed integers to give a 32-bit signed
 * integer. Shift left by one, and saturate the result. (Saturation
 * is only necessary if both inputs were -0x8000, in which case the
 * result "should" be 0x40000000 << 1 = +0x80000000, and is
 * saturated to +0x7FFFFFFF.)
 */
__ARM_INTRINSIC int32_t L_mult(int16_t x, int16_t y)
{
    /* 32-bit product, doubled with saturation. */
    int32_t product = x * y;

    return __qdbl(product);
}

/*
 * Negate a 16-bit signed integer, with saturation. (Saturation is
 * only necessary when the input is -0x8000.)
 */
__ARM_INTRINSIC int16_t negate(int16_t x)
{
#if (defined(__thumb) && (__TARGET_ARCH_THUMB >= 4)) || (__TARGET_ARCH_ARM >= 6)
    /* 16-bit saturating subtract from zero. */
    return (int16_t)__qsub16(0, x);
#else
    /* -INT16_MIN is not representable in 16 bits, so clamp that one case. */
    return (x == INT16_MIN) ? INT16_MAX : (int16_t) -x;
#endif
}

/*
 * Return the top 16 bits of a 32-bit signed integer.
 */
__ARM_INTRINSIC int16_t extract_h(int32_t x)
{
    /* Bring the upper halfword down, then narrow. */
    int32_t upper = x >> 16;

    return (int16_t) upper;
}

/*
 * Return the bottom 16 bits of a 32-bit signed integer, with no
 * saturation, just coerced into a two's complement 16 bit
 * representation.
 */
__ARM_INTRINSIC int16_t extract_l(int32_t x)
{
    /* Plain narrowing conversion; no saturation is applied. */
    int16_t lower = (int16_t) x;

    return lower;
}

/*
 * Divide a 32-bit signed integer by 2^16, rounding to the nearest
 * integer (round up on a tie). Equivalent to adding 0x8000 with
 * saturation, then shifting right by 16.
 */
__ARM_INTRINSIC int16_t round(int32_t x)
{
    /* Add half of 2^16 with saturation, then take the top halfword. */
    int32_t biased = __qadd(x, 0x8000);

    return extract_h(biased);
}

/*
 * Multiply two 16-bit signed integers together to give a 32-bit
 * signed integer, shift left by one with saturation, and add to
 * another 32-bit integer with saturation.
 * 
 * Note the intermediate saturation operation in the definition:
 * 
 *    L_mac(-1, -0x8000, -0x8000)
 * 
 * will give 0x7FFFFFFE and not 0x7FFFFFFF:
 *    the unshifted product is:   0x40000000
 *    shift left with saturation: 0x7FFFFFFF
 *    add to -1 with saturation:  0x7FFFFFFE
 */
__ARM_INTRINSIC int32_t L_mac(int32_t accumulator, int16_t x, int16_t y)
{
    /* Saturate the doubled product first, then saturate the accumulation. */
    int32_t doubled_product = __qdbl(x*y);

    return __qadd(accumulator, doubled_product);
}

/*
 * Multiply two 16-bit signed integers together to give a 32-bit
 * signed integer, shift left by one with saturation, and subtract
 * from another 32-bit integer with saturation.
 * 
 * Note the intermediate saturation operation in the definition:
 * 
 *    L_msu(1, -0x8000, -0x8000)
 * 
 * will give 0x80000002 and not 0x80000001:
 *    the unshifted product is:         0x40000000
 *    shift left with saturation:       0x7FFFFFFF
 *    subtract from 1 with saturation:  0x80000002
 */
__ARM_INTRINSIC int32_t L_msu(int32_t accumulator, int16_t x, int16_t y)
{
    /* Saturate the doubled product first, then saturate the subtraction. */
    int32_t doubled_product = __qdbl(x*y);

    return __qsub(accumulator, doubled_product);
}

/*
 * Add two 32-bit signed integers with saturation.
 */
__ARM_INTRINSIC int32_t L_add(int32_t x, int32_t y)
{
    /* Thin wrapper over the saturating add primitive. */
    int32_t sum = __qadd(x, y);

    return sum;
}

/*
 * Subtract one 32-bit signed integer from another with saturation.
 */
__ARM_INTRINSIC int32_t L_sub(int32_t x, int32_t y)
{
    /* Thin wrapper over the saturating subtract primitive. */
    int32_t difference = __qsub(x, y);

    return difference;
}

/*
 * Add together the Carry variable and two 32-bit signed integers,
 * without saturation.
 * Note: the reference implementation has INT32_MIN + -1 + (Carry=1)
 * set the cumulative overflow flag.  This does not match intuition,
 * or the natural behavior of ARM's ADCS instruction.
 */
/*
 * Returns x + y + Carry, wrapped to 32 bits (no saturation).
 * Side effects: Carry is replaced by the carry out of the 32-bit addition;
 * Overflow is set to 1 on signed overflow unless __ARM_DSP_IGNORE_OVERFLOW
 * is non-zero (it is never cleared here).
 */
__ARM_INTRINSIC int32_t L_add_c(int32_t x, int32_t y)
{
    int32_t result;
#if __TARGET_ARCH_ARM > 0
    int32_t flags;
    __asm {
        movs flags, Carry, lsr #1
        adcs result, x, y;
        mrs flags, CPSR;
    }
#if !__ARM_DSP_IGNORE_OVERFLOW
    if (flags & 0x10000000) Overflow = 1;  /* V -> Q */
#endif
    Carry = (flags & 0x20000000) != 0;
#else
    /* Inline assembler not available.
     * The sum is formed in unsigned arithmetic so that wrap-around is well
     * defined; a signed x + y + Carry would be undefined on overflow. */
    result = (int32_t)((uint32_t)x + (uint32_t)y + (uint32_t)Carry);
    /* Carry out of bit 31: both operand top bits set, or one set and the
     * result top bit clear. */
    Carry = (uint32_t)((x & y) | ((x | y) & ~result)) >> 31;
#if !__ARM_DSP_IGNORE_OVERFLOW
    /* Signed overflow: the result's sign differs from both operands. */
    if (((result ^ x) & (result ^ y) & 0x80000000) != 0) Overflow = 1;
#endif
#endif
    return result;
}

/*
 * Subtract one 32-bit signed integer, together with the logical
 * negation of the Carry variable, from another 32-bit signed integer,
 * without saturation.
 * N.b. the computation matches that of the ETSI reference function
 * (in basicop2.c).  The comment above the ETSI reference function says
 * that L_sub_c(a,b) = a-b-C, but that does not match their code.
 */
/*
 * Returns x + ~y + Carry, wrapped to 32 bits (no saturation); i.e. x - y
 * when Carry is 1 and x - y - 1 when Carry is 0.
 * Side effects: Carry is replaced by the carry out of that 32-bit addition;
 * Overflow is set to 1 on signed overflow unless __ARM_DSP_IGNORE_OVERFLOW
 * is non-zero (it is never cleared here).
 */
__ARM_INTRINSIC int32_t L_sub_c(int32_t x, int32_t y)
{
    int32_t result;
#if __TARGET_ARCH_ARM > 0
    int32_t flags;
    __asm {
        movs flags, Carry, lsr #1
        sbcs result, x, y;
        mrs flags, CPSR;
    }
#if !__ARM_DSP_IGNORE_OVERFLOW
    if (flags & 0x10000000) Overflow = 1;  /* V -> Q */
#endif
    Carry = (flags & 0x20000000) != 0;
#else
    /* Inline assembler not available.
     * The sum is formed in unsigned arithmetic so that wrap-around is well
     * defined; a signed x + ~y + Carry would be undefined on overflow. */
    result = (int32_t)((uint32_t)x + ~(uint32_t)y + (uint32_t)Carry);
    /* Carry out of bit 31 of x + ~y + Carry. */
    Carry = ((uint32_t)((x & ~y) | ((x | ~y) & ~result)) >> 31);
#if !__ARM_DSP_IGNORE_OVERFLOW
    /* Signed overflow of a subtraction: the operands have different signs
     * and the result's sign differs from x. Comparing the result against y
     * instead would flag ordinary cases such as 1 - (-1) and miss real
     * overflow such as INT32_MAX - (-1). */
    if (((x ^ y) & (x ^ result) & 0x80000000) != 0) Overflow = 1;
#endif
#endif
    return result;
}

/*
 * Multiply two 16-bit signed integers together to give a 32-bit
 * signed integer, shift left by one _with_ saturation, and add
 * with carry to another 32-bit integer _without_ saturation.
 */
__ARM_INTRINSIC int32_t L_macNs(int32_t accumulator, int16_t x, int16_t y)
{
    /* Saturating doubled product, then a non-saturating add with carry. */
    int32_t doubled_product = L_mult(x, y);

    return L_add_c(accumulator, doubled_product);
}

/*
 * Multiply two 16-bit signed integers together to give a 32-bit
 * signed integer, shift left by one _with_ saturation, and
 * subtract with carry from another 32-bit integer _without_
 * saturation.
 */
__ARM_INTRINSIC int32_t L_msuNs(int32_t accumulator, int16_t x, int16_t y)
{
    /* Saturating doubled product, then a non-saturating subtract with
     * carry. */
    int32_t doubled_product = L_mult(x, y);

    return L_sub_c(accumulator, doubled_product);
}

/*
 * Negate a 32-bit signed integer, with saturation. (Saturation is
 * only necessary when the input is -0x80000000.)
 */
__ARM_INTRINSIC int32_t L_negate(int32_t x)
{
    /* Saturating subtract from zero. */
    int32_t negated = __qsub(0, x);

    return negated;
}

/*
 * Multiply two 16-bit signed integers, shift the result right by
 * 15 with rounding, and saturate it. (Saturation is only necessary
 * if both inputs were -0x8000, in which case the result "should"
 * be 0x8000 and is saturated to 0x7FFF.)
 */
__ARM_INTRINSIC int16_t mult_r(int16_t x, int16_t y)
{
    /* Add the rounding constant to the raw product, double with saturation,
     * then keep the top halfword. */
    int32_t biased = x*y + 0x4000;
    int32_t doubled = __qdbl(biased);

    return (int16_t)(doubled >> 16);
}

/*
 * Return the number of bits of left shift needed to arrange for a
 * 16-bit signed integer to have value >= 0x4000 or <= -0x4001.
 * 
 * Returns 0 if x is zero (following C reference implementation).
 */
__ARM_INTRINSIC int16_t norm_s(int16_t x)
{
    /* XOR x with a copy of itself shifted left by 17, count the leading
     * zeros of the result, and mask the count to 0..15. */
    int32_t shifted = (int32_t)x << 17;
    int32_t mixed = x ^ shifted;

    return __clz(mixed) & 15;
}

/*
 * Return the number of bits of left shift needed to arrange for a
 * 32-bit signed integer to have value >= 0x40000000 (if +ve)
 * or <= -0x40000001 (if -ve).
 * 
 * Returns 0 if x is zero (following C reference implementation).
 */
__ARM_INTRINSIC int16_t norm_l(int32_t x)
{
    /* XOR x with itself shifted left by one, count the leading zeros of the
     * result, and mask the count to 0..31. */
    int32_t mixed = x ^ (x << 1);

    return __clz(mixed) & 31;
}

/*
 * Shift a 32-bit signed integer left (or right, if the shift count
 * is negative). Saturate on overflow.
 */
/*
 * x:     value to shift.
 * shift: left-shift count; a negative count shifts right by -shift.
 * Returns the shifted value. A left shift that would overflow yields
 * INT32_MIN or INT32_MAX (by the sign of x) through __qdbl.
 */
__ARM_INTRINSIC int32_t L_shl(int32_t x, int16_t shift)
{
    /* Zero stays zero for every shift count. Handling it first avoids
     * evaluating x << shift with counts of 32 or more, which the original
     * did for x == 0. */
    if (x == 0)
        return 0;
    if (shift <= 0) {
#if !__ARM_DSP_SMALL_SHIFTS
        /* A 32-bit value shifted right by 31 is already all sign bits, so
         * clamping here keeps the count below the operand width without
         * changing the result. */
        if (shift < -31) shift = -31;
#endif /* __ARM_DSP_SMALL_SHIFTS */
        return x >> (-shift);
    }
    /* The shift is applied directly only while it does not exceed norm_l(x). */
    if (shift <= norm_l(x))
        return x << shift;
    /* Doubling a 32-bit limit always saturates; returns the limit with the
     * sign of x. */
    return __qdbl((x < 0) ? INT32_MIN : INT32_MAX);
}

/*
 * Shift a 32-bit signed integer right (or left, if the shift count
 * is negative). Saturate on overflow.
 */
/*
 * x:     value to shift.
 * shift: right-shift count; a negative count shifts left by -shift.
 * Returns the shifted value. A left shift that would overflow yields
 * INT32_MIN or INT32_MAX (by the sign of x) through __qdbl.
 */
__ARM_INTRINSIC int32_t L_shr(int32_t x, int16_t shift)
{
    /* Zero stays zero for every shift count. Handling it first avoids
     * evaluating x << (-shift) with counts of 32 or more, which the
     * original did for x == 0. */
    if (x == 0)
        return 0;
    if (shift >= 0) {
#if !__ARM_DSP_SMALL_SHIFTS
        /* A 32-bit value shifted right by 31 is already all sign bits, so
         * clamping here keeps the count below the operand width without
         * changing the result. */
        if (shift > 31) shift = 31;
#endif /* __ARM_DSP_SMALL_SHIFTS */
        return x >> shift;
    }
    /* The shift is applied directly only while it does not exceed norm_l(x). */
    if ((-shift) <= norm_l(x))
        return x << (-shift);
    /* Doubling a 32-bit limit always saturates; returns the limit with the
     * sign of x. */
    return __qdbl((x < 0) ? INT32_MIN : INT32_MAX);
}

/*
 * Shift a 16-bit signed integer right, with rounding. Shift left
 * with saturation if the shift count is negative.
 */
__ARM_INTRINSIC int16_t shr_r(int16_t x, int16_t shift)
{
    int32_t pre_round;

    /* Nothing to do for a zero value or a zero shift. */
    if (x == 0 || shift == 0)
        return (int16_t)x;

    /* Negative count: saturating left shift, with the count capped at 16. */
    if (shift < 0) {
        if (shift < -15)
            shift = -16;
        return saturate(x << (-shift));
    }

#if !__ARM_DSP_SMALL_SHIFTS
    if (shift > 32) shift = 32;
#endif /* __ARM_DSP_SMALL_SHIFTS */
    /* Shift by one bit less than requested, add one, then drop the last bit. */
    pre_round = x >> (shift-1);
    return (int16_t) ((pre_round + 1) >> 1);
}

/*
 * Multiply two 16-bit signed integers together to give a 32-bit
 * signed integer, shift left by one with saturation, and add to
 * another 32-bit integer with saturation (like L_mac). Then shift
 * the result right by 16 bits with rounding (like round).
 */
__ARM_INTRINSIC int16_t mac_r(int32_t accumulator, int16_t x, int16_t y)
{
    /* Saturating multiply-accumulate, then round to the top halfword. */
    int32_t accumulated = L_mac(accumulator, x, y);

    return round(accumulated);
}

/*
 * Multiply two 16-bit signed integers together to give a 32-bit
 * signed integer, shift left by one with saturation, and subtract
 * from another 32-bit integer with saturation (like L_msu). Then
 * shift the result right by 16 bits with rounding (like round).
 */
__ARM_INTRINSIC int16_t msu_r(int32_t accumulator, int16_t x, int16_t y)
{
    /* Saturating multiply-subtract, then round to the top halfword. */
    int32_t accumulated = L_msu(accumulator, x, y);

    return round(accumulated);
}

/*
 * Shift a 16-bit signed integer left by 16 bits to generate a
 * 32-bit signed integer. The bottom 16 bits are zeroed.
 */
__ARM_INTRINSIC int32_t L_deposit_h(int16_t x)
{
    /* Widen first, then move the value into the upper halfword. */
    int32_t widened = (int32_t)x;

    return widened << 16;
}

/*
 * Sign-extend a 16-bit signed integer by 16 bits to generate a
 * 32-bit signed integer.
 */
__ARM_INTRINSIC int32_t L_deposit_l(int16_t x)
{
    /* Widening conversion; the sign is preserved. */
    int32_t widened = (int32_t)x;

    return widened;
}

/*
 * Shift a 32-bit signed integer right, with rounding. Shift left
 * with saturation if the shift count is negative.
 */
__ARM_INTRINSIC int32_t L_shr_r(int32_t x, int16_t shift)
{
    int32_t pre_round;

    /* Nothing to do for a zero value or a zero shift. */
    if (x == 0 || shift == 0)
        return x;

    /* Negative count: left shift, saturating when the count exceeds
     * norm_l(x). x is known to be non-zero here. */
    if (shift < 0) {
        if (-shift <= norm_l(x))
            return x << (-shift);
        return __qdbl((x < 0) ? INT32_MIN : INT32_MAX);
    }

    /* Shift by one bit less than requested so the last bit can be used for
     * rounding. */
#if !__ARM_DSP_SMALL_SHIFTS
    pre_round = (shift > 32) ? 0 : x >> (shift-1);
#else
    pre_round = x >> (shift-1);
#endif /* __ARM_DSP_SMALL_SHIFTS */
    return (pre_round >> 1) + (pre_round & 1);
}

/*
 * Absolute value of a 32-bit signed integer. Saturating, so
 * abs(-0x80000000) becomes +0x7FFFFFFF.
 */
__ARM_INTRINSIC int32_t L_abs(int32_t x)
{
    /* Negative inputs are negated with the saturating subtract. */
    return (x < 0) ? __qsub(0, x) : x;
}

/*
 * Return a saturated value appropriate to the most recent carry-
 * affecting operation (L_add_c, L_macNs, L_sub_c, L_msuNs).
 * 
 * In other words: return the argument if the Q flag is clear.
 * Otherwise, return -0x80000000 or +0x7FFFFFFF depending on
 * whether the Carry flag is set or clear (respectively).
 */
__ARM_INTRINSIC int32_t L_sat(int32_t x)
{
    /* No pending overflow: pass the argument through untouched. */
    if (!Overflow)
        return x;

    /* Overflow pending: clear it, pick INT32_MAX (Carry == 0) or
     * INT32_MAX + 1 wrapped to INT32_MIN (Carry == 1), then clear Carry. */
    Overflow = 0;
    x = (int32_t)((uint32_t)INT32_MAX + Carry);
    Carry = 0;
    return x;
}

/*
 * Divide one 16-bit signed integer by another, and produce a
 * 15-bit fixed point fractional result (by multiplying the true
 * mathematical result by 0x8000). The divisor (denominator) is
 * assumed to be non-zero and also assumed to be greater or equal
 * to the dividend (numerator). Hence the (unscaled) result is
 * necessarily within the range [0,1].
 * 
 * Both operands are assumed to be positive.
 * 
 * After division, the result is saturated to fit into a 16-bit
 * signed integer. (The only way this can happen is if the operands
 * are equal, so that the result would be 1, i.e. +0x8000 in 15-bit
 * fixed point.)
 */
__ARM_INTRINSIC int16_t div_s(int16_t x, int16_t y)
{
    int32_t scaled;

    /* Preconditions: positive divisor, non-negative dividend no larger
     * than the divisor. */
    assert(y > 0);
    assert(x >= 0);
    assert(x <= y);

    /* Scale the dividend by 0x8000 before dividing. */
    scaled = 0x8000 * x;
    scaled = scaled / y;

    /* Clamp a quotient above INT16_MAX (e.g. 0x8000 when x == y). */
    return (scaled > INT16_MAX) ? INT16_MAX : (int16_t)scaled;
}

#ifdef __cplusplus
}
#endif

#endif /* ARMDSP_DSPFNS_H */